bioinformatics - R code to work on genotype data -

i have data called mydf.

i need match letters (dna letters) in column ref , alt colnames(x) ("a","t","g","c") , corresponding numerical values pasted "ref,alt".

however, there lines have "snp:+[0-9]" , "flat$" in type column.

now "flat$" lines want to:

sum alt values many "snp:+[0-9]" of corresponding "start" ids, including flat line itself, if alt letters unique (please see script enclosed in curly brackets 1 flat line)
paste alt value again "ref,alt" (ref value same both "snp:+[0-9]" , "flat$" same start id)
get output shown in result.

i have done 1 flat line,but need making function flatcase same flat lines.

how can make function flatcase?

code

normalcase <- function(x, ns) {       ref.idx <- which(ns == "ref")       ref.allele <- x[ref.idx]       ref.count <- x[which(ns == ref.allele)]        alt.idx <- which(ns == "alt")       alt.allele <- x[alt.idx]       alt.count <- x[which(ns == alt.allele)]        paste(ref.count, alt.count, sep=",")     }        flatcase??{       g<-x[,"start"]=="chr16:2530921"& grepl("snp:+[0-9]",x[,"type"])      myt<-x[g,]      x[g,"alt"]      unique(x[g,"alt"])      c<-unique(x[g,"alt"])      flat<-myt[grepl("flat$",myt[,"type"]),]      c<-unique(x[g,"alt"])     alt.count<- sum(as.numeric(flat[c]))     }      calculatead <- function(x, mat, ns) {       if (grepl("flat$", x[which(ns == 'type')])) {         flatcase(x, mat, ns)       } else {         normalcase(x, ns)       }     }       bamad <- function(x) {        new.x <- cbind(x, apply(x, 1, calculatead, x, colnames(x)))       colnames(new.x)[ncol(new.x)] <- "bam.ad"       new.x           }

the function have tried flatcase is:

flatcase <- function(x, mat, ns) {   id.idx <- which(ns == 'start')   type.idx <- which(ns == 'type')   ref.idx <- which(ns == 'ref')   alt.idx <- which(ns == 'alt')     id <- x[id.idx]   #m <- mat[mat[, id.idx] == id & mat[, type.idx] == "snp", ]   #m <- mat[mat[, id.idx] == id & mat[, type.idx] == "snp", ]   m<-mat[grepl(id,mat[, id.idx]) & grepl("snp:+[0-9]",mat[, type.idx]),]   #flat<-mat[grepl("flat$",mat[, type.idx]),]   ref.allele <- x[ref.idx]   ref.count<-x[which(ns == ref.allele)]     alt.count <- sum(apply(m, 1, function(x) as.numeric(x[which(ns == x[alt.idx])])))   paste(ref.count, alt.count, sep=",")  }

mydf

x <- as.matrix(read.csv(text="start,a,t,g,c,ref,alt,type  chr20:5363934,95,29,14,59,c,t,snp chr5:8529759,24,1,28,41,g,c,snp chr14:9620689,65,49,41,96,t,g,snp chr18:547375,94,1,51,67,g,c,snp chr8:5952145,27,80,25,96,t,t,snp chr14:8694382,68,94,26,30,a,a,snp chr16:2530921,49,15,79,72,a,t,snp:2530921 chr16:2530921,49,15,79,72,a,g,snp:2530921 chr16:2530921,49,15,79,72,a,t,snp:2530921flat chr16:2533924,42,13,19,52,g,t,snp:2533924flat chr16:2543344,4,13,13,42,g,t,snp:2543344flat chr16:2543344,4,23,13,42,g,a,snp:2543344 chr14:4214117,73,49,18,77,g,a,snp chr4:7799768,36,28,1,16,c,a,snp chr3:9141263,27,41,93,90,a,a,snp", stringsasfactors=false))

result:

       start              t    g    c    ref alt type              bam.ad    [1,] "chr20:5363934" "95" "29" "14" "59" "c" "t" "snp"             "59,29"   [2,] "chr5:8529759"  "24" " 1" "28" "41" "g" "c" "snp"             "28,41"   [3,] "chr14:9620689" "65" "49" "41" "96" "t" "g" "snp"             "49,41"   [4,] "chr18:547375"  "94" " 1" "51" "67" "g" "c" "snp"             "51,67"   [5,] "chr8:5952145"  "27" "80" "25" "96" "t" "t" "snp"             "80,80"   [6,] "chr14:8694382" "68" "94" "26" "30" "a" "a" "snp"             "68,68"   [7,] "chr16:2530921" "49" "15" "79" "72" "a" "t" "snp:2530921"     "49,15"   [8,] "chr16:2530921" "49" "15" "79" "72" "a" "g" "snp:2530921"     "49,79"   [9,] "chr16:2530921" "49" "15" "79" "72" "a" "t" "snp:2530921flat" "49,94" [10,] "chr16:2533924" "42" "13" "19" "52" "g" "t" "snp:2533924flat" "19,13"   [11,] "chr16:2543344" "42" "13" "13" "42" "g" "t" "snp:2543344flat" "13,55"  [12,] "chr16:2543344" "42" "23" "13" "42" "g" "a" "snp:2543344"     "13,42"  [13,] "chr14:4214117" "73" "49" "18" "77" "g" "a" "snp"             "18,73"  [14,] "chr4:7799768"  "36" "28" " 1" "16" "c" "a" "snp"             "16,36"  [15,] "chr3:9141263"  "27" "41" "93" "90" "a" "a" "snp"             "27,27"

here's way all, vectorised.

first, note ref same regardless of type. can using ref coordinate matrix, e.g. row 1 has ref c, if coordinates (1, "c") ref value row.

# refs same regardless of type rownames(x) <- 1:nrow(x) ref <- x[cbind(1:nrow(x), x[, 'ref'])]

have @ cbind(1:nrow(x), x[, 'ref']): list of coordinates (row number, ref) , use ref number.

then same alt:

alt <- x[cbind(1:nrow(x), x[, 'alt'])]

however have make sure if type 'flat', add other alts 'flat' row's alt (only unique ones, say).

first, work out rows flat:

which.flat <- grep('flat$', x[, 'type'])

next, each flat row, alt of other rows same 'start' (that's x[, 'start'] == x[i, 'start'] bit), , exclude rows have duplicate alt (that's x[, 'alt'] != x[i, 'alt'] bit). here i index of current flat line. add them flat line's alt. sapply vectorises each flat line.

# add other alts alt of 'flat' line. alt[which.flat] <- as.numeric(alt[which.flat]) + sapply(which.flat,     function (i) {         sum(as.numeric(alt[ x[, 'start'] == x[i, 'start'] &              x[, 'alt'] != x[i, 'alt'] ]))     })

now paste together:

x <- cbind(x, bam.ad=paste(ref, alt, sep=','))

the result same yours except row 10 believe have made mistake - there's 1 row "chr16:2533924" , alt "t" (value 13), bam.ad "19,13" (you have "19,42" if alt "a", isn't).

if must stick function form in question (quite slow & inefficient!), it's same have done (hence why can without apply call , skip loop entirely):

flatcase <- function(x, mat, ns) { # alt of flat row alt <- as.numeric(x[x['alt']])

# other rows same 'start' , different 'alt' xx <- mat[mat[, 'start'] == x['start'] & mat[, 'alt'] != x['alt'], ,drop=f] if (nrow(xx) > 0) {   # grab alts done before   rownames(xx) <- 1:nrow(xx)   alt <- alt + sum(as.numeric(xx[cbind(1:nrow(xx), xx[, 'alt'])]))  }  ref <- x[x['ref']] return(paste(ref, alt, sep=',')) }

however mentioned before, if vectorise entire code have above reduces few lines, , faster:

newbamad <- function (x) {     # version above     rownames(x) <- 1:nrow(x)     ref <- x[cbind(1:nrow(x), x[, 'ref'])]     alt <- x[cbind(1:nrow(x), x[, 'alt'])]     which.flat <- grep('flat$', x[, 'type'])     alt[which.flat] <- as.numeric(alt[which.flat]) + sapply(which.flat,         function (i) {             sum(as.numeric(alt[ x[, 'start'] == x[i, 'start'] &                  x[, 'alt'] != x[i, 'alt'] ]))         })     cbind(x, bam.ad=paste(ref, alt, sep=',')) }  library(rbenchmark) benchmark(   bamad=bamad(x),   newbamad=newbamad(x) ) #       test replications elapsed relative user.self sys.self user.child sys.child # 1    bamad          100   0.082    3.905     0.072    0.004          0         0 # 2 newbamad          100   0.021    1.000     0.020    0.000          0         0

the vectorised version 4x faster.

Brazille

Search This Blog

bioinformatics - R code to work on genotype data -

Comments

Post a Comment