i have data called mydf.
i need match letters (dna letters) in column ref , alt colnames(x) ("a","t","g","c") , corresponding numerical values pasted "ref,alt".
however, there lines have "snp:+[0-9]" , "flat$" in type column.
now "flat$" lines want to:
- sum
altvalues many"snp:+[0-9]"of corresponding"start"ids, including flat line itself, ifaltletters unique (please see script enclosed in curly brackets 1 flat line) - paste
altvalue again"ref,alt"(refvalue same both"snp:+[0-9]","flat$"same start id) - get output shown in result.
i have done 1 flat line,but need making function flatcase same flat lines.
how can make function flatcase?
code
normalcase <- function(x, ns) { ref.idx <- which(ns == "ref") ref.allele <- x[ref.idx] ref.count <- x[which(ns == ref.allele)] alt.idx <- which(ns == "alt") alt.allele <- x[alt.idx] alt.count <- x[which(ns == alt.allele)] paste(ref.count, alt.count, sep=",") } flatcase??{ g<-x[,"start"]=="chr16:2530921"& grepl("snp:+[0-9]",x[,"type"]) myt<-x[g,] x[g,"alt"] unique(x[g,"alt"]) c<-unique(x[g,"alt"]) flat<-myt[grepl("flat$",myt[,"type"]),] c<-unique(x[g,"alt"]) alt.count<- sum(as.numeric(flat[c])) } calculatead <- function(x, mat, ns) { if (grepl("flat$", x[which(ns == 'type')])) { flatcase(x, mat, ns) } else { normalcase(x, ns) } } bamad <- function(x) { new.x <- cbind(x, apply(x, 1, calculatead, x, colnames(x))) colnames(new.x)[ncol(new.x)] <- "bam.ad" new.x } the function have tried flatcase is:
flatcase <- function(x, mat, ns) { id.idx <- which(ns == 'start') type.idx <- which(ns == 'type') ref.idx <- which(ns == 'ref') alt.idx <- which(ns == 'alt') id <- x[id.idx] #m <- mat[mat[, id.idx] == id & mat[, type.idx] == "snp", ] #m <- mat[mat[, id.idx] == id & mat[, type.idx] == "snp", ] m<-mat[grepl(id,mat[, id.idx]) & grepl("snp:+[0-9]",mat[, type.idx]),] #flat<-mat[grepl("flat$",mat[, type.idx]),] ref.allele <- x[ref.idx] ref.count<-x[which(ns == ref.allele)] alt.count <- sum(apply(m, 1, function(x) as.numeric(x[which(ns == x[alt.idx])]))) paste(ref.count, alt.count, sep=",") } mydf
x <- as.matrix(read.csv(text="start,a,t,g,c,ref,alt,type chr20:5363934,95,29,14,59,c,t,snp chr5:8529759,24,1,28,41,g,c,snp chr14:9620689,65,49,41,96,t,g,snp chr18:547375,94,1,51,67,g,c,snp chr8:5952145,27,80,25,96,t,t,snp chr14:8694382,68,94,26,30,a,a,snp chr16:2530921,49,15,79,72,a,t,snp:2530921 chr16:2530921,49,15,79,72,a,g,snp:2530921 chr16:2530921,49,15,79,72,a,t,snp:2530921flat chr16:2533924,42,13,19,52,g,t,snp:2533924flat chr16:2543344,4,13,13,42,g,t,snp:2543344flat chr16:2543344,4,23,13,42,g,a,snp:2543344 chr14:4214117,73,49,18,77,g,a,snp chr4:7799768,36,28,1,16,c,a,snp chr3:9141263,27,41,93,90,a,a,snp", stringsasfactors=false)) result:
start t g c ref alt type bam.ad [1,] "chr20:5363934" "95" "29" "14" "59" "c" "t" "snp" "59,29" [2,] "chr5:8529759" "24" " 1" "28" "41" "g" "c" "snp" "28,41" [3,] "chr14:9620689" "65" "49" "41" "96" "t" "g" "snp" "49,41" [4,] "chr18:547375" "94" " 1" "51" "67" "g" "c" "snp" "51,67" [5,] "chr8:5952145" "27" "80" "25" "96" "t" "t" "snp" "80,80" [6,] "chr14:8694382" "68" "94" "26" "30" "a" "a" "snp" "68,68" [7,] "chr16:2530921" "49" "15" "79" "72" "a" "t" "snp:2530921" "49,15" [8,] "chr16:2530921" "49" "15" "79" "72" "a" "g" "snp:2530921" "49,79" [9,] "chr16:2530921" "49" "15" "79" "72" "a" "t" "snp:2530921flat" "49,94" [10,] "chr16:2533924" "42" "13" "19" "52" "g" "t" "snp:2533924flat" "19,13" [11,] "chr16:2543344" "42" "13" "13" "42" "g" "t" "snp:2543344flat" "13,55" [12,] "chr16:2543344" "42" "23" "13" "42" "g" "a" "snp:2543344" "13,42" [13,] "chr14:4214117" "73" "49" "18" "77" "g" "a" "snp" "18,73" [14,] "chr4:7799768" "36" "28" " 1" "16" "c" "a" "snp" "16,36" [15,] "chr3:9141263" "27" "41" "93" "90" "a" "a" "snp" "27,27"
here's way all, vectorised.
first, note ref same regardless of type. can using ref coordinate matrix, e.g. row 1 has ref c, if coordinates (1, "c") ref value row.
# refs same regardless of type rownames(x) <- 1:nrow(x) ref <- x[cbind(1:nrow(x), x[, 'ref'])] have @ cbind(1:nrow(x), x[, 'ref']): list of coordinates (row number, ref) , use ref number.
then same alt:
alt <- x[cbind(1:nrow(x), x[, 'alt'])] however have make sure if type 'flat', add other alts 'flat' row's alt (only unique ones, say).
first, work out rows flat:
which.flat <- grep('flat$', x[, 'type']) next, each flat row, alt of other rows same 'start' (that's x[, 'start'] == x[i, 'start'] bit), , exclude rows have duplicate alt (that's x[, 'alt'] != x[i, 'alt'] bit). here i index of current flat line. add them flat line's alt. sapply vectorises each flat line.
# add other alts alt of 'flat' line. alt[which.flat] <- as.numeric(alt[which.flat]) + sapply(which.flat, function (i) { sum(as.numeric(alt[ x[, 'start'] == x[i, 'start'] & x[, 'alt'] != x[i, 'alt'] ])) }) now paste together:
x <- cbind(x, bam.ad=paste(ref, alt, sep=',')) the result same yours except row 10 believe have made mistake - there's 1 row "chr16:2533924" , alt "t" (value 13), bam.ad "19,13" (you have "19,42" if alt "a", isn't).
if must stick function form in question (quite slow & inefficient!), it's same have done (hence why can without apply call , skip loop entirely):
flatcase <- function(x, mat, ns) { # alt of flat row alt <- as.numeric(x[x['alt']])
# other rows same 'start' , different 'alt' xx <- mat[mat[, 'start'] == x['start'] & mat[, 'alt'] != x['alt'], ,drop=f] if (nrow(xx) > 0) { # grab alts done before rownames(xx) <- 1:nrow(xx) alt <- alt + sum(as.numeric(xx[cbind(1:nrow(xx), xx[, 'alt'])])) } ref <- x[x['ref']] return(paste(ref, alt, sep=',')) } however mentioned before, if vectorise entire code have above reduces few lines, , faster:
newbamad <- function (x) { # version above rownames(x) <- 1:nrow(x) ref <- x[cbind(1:nrow(x), x[, 'ref'])] alt <- x[cbind(1:nrow(x), x[, 'alt'])] which.flat <- grep('flat$', x[, 'type']) alt[which.flat] <- as.numeric(alt[which.flat]) + sapply(which.flat, function (i) { sum(as.numeric(alt[ x[, 'start'] == x[i, 'start'] & x[, 'alt'] != x[i, 'alt'] ])) }) cbind(x, bam.ad=paste(ref, alt, sep=',')) } library(rbenchmark) benchmark( bamad=bamad(x), newbamad=newbamad(x) ) # test replications elapsed relative user.self sys.self user.child sys.child # 1 bamad 100 0.082 3.905 0.072 0.004 0 0 # 2 newbamad 100 0.021 1.000 0.020 0.000 0 0 the vectorised version 4x faster.
Comments
Post a Comment