Skip to content

Commit e5d2ecd

Browse files
author
Boris Grinshpun
committed
Sorting filter for Farber data across all tissues
1 parent 8a89d26 commit e5d2ecd

File tree

1 file changed

+108
-0
lines changed

1 file changed

+108
-0
lines changed

allofilter_farber.R

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
###### Boris Grinshpun 01/30/16 ######
2+
3+
# presets
4+
rm(list=ls())
5+
options(stringsAsFactors = FALSE)
6+
setwd('~/DropboxCGC/FarberSamples/productive') # path to data files
7+
outputpath="~/DropboxCGC/FarberSamples/productive/filtered/" # where filtered results are saved
8+
9+
### DATA LOADING ###
10+
11+
#read file
12+
args <- commandArgs(TRUE)
13+
donor=args[1]
14+
files=Sys.glob(paste(donor,'*.tsv',sep=''))
15+
data=lapply(files, function(f) read.table(f,header=T))
16+
data=lapply(data,function(x) {rownames(x)=x$nucleotide;x;})
17+
mynames=sapply(files, function(f) strsplit(f,split='.',fixed=T)[[1]][1])
18+
19+
data.simplified=lapply(seq(data), function(i){x=data.frame(data[[i]]$nucleotide,data[[i]]$count); colnames(x)=c("nucleotide",mynames[i]);x;})
20+
dd=Reduce(function(...) merge(..., all=T,by='nucleotide'), data.simplified)
21+
dd[is.na(dd)]=0
22+
23+
24+
sorting_filter<-function(X,cutoff=2){
25+
output=data.frame(nucleotide=X$nucleotide)
26+
27+
Xfq=X[,2:length(X)]
28+
Xfq=sweep(Xfq,2,colSums(Xfq),'/') # normalize to frequencies
29+
ratios=t(apply(Xfq, 1, function(x) x/max(x))) # normalize so that highest frequency in each row is set to 1
30+
indices=data.frame(0+(ratios>=1/cutoff)) # index on whether or not the clone is below the frequency cutoff (0 if yes, 1 if no)
31+
# indices$flag=rep(0,dim(indices)[1]) # initialize flags
32+
33+
CD4vals=indices[,grep('CD4',names(indices),ignore.case=T,value=T)]
34+
CD8vals=indices[,grep('CD8',names(indices),ignore.case=T,value=T)]
35+
CD4ratios=data.frame(ratios[,grep('CD4',colnames(ratios),ignore.case=T,value=T)])
36+
CD8ratios=data.frame(ratios[,grep('CD8',colnames(ratios),ignore.case=T,value=T)])
37+
38+
CD4vals$total=rowSums(CD4vals)
39+
CD8vals$total=rowSums(CD8vals)
40+
CD4ratios$total=rowSums(0+(CD4ratios>0))
41+
CD8ratios$total=rowSums(0+(CD8ratios>0))
42+
43+
indices$CD4present=0+(CD4vals$total>0)
44+
indices$CD8present=0+(CD8vals$total>0)
45+
46+
# resolve ambiguity to the best possible first by indices, then by ratios across all samples.
47+
ambiguous=which(indices$CD4present>0 & indices$CD8present>0)
48+
indices$CD4present[ambiguous[CD4vals$total[ambiguous]/CD8vals$total[ambiguous]<=0.5]]=0
49+
indices$CD8present[ambiguous[CD4vals$total[ambiguous]/CD8vals$total[ambiguous]>=2]]=0
50+
51+
ambiguous=which(indices$CD4present>0 & indices$CD8present>0)
52+
indices$CD4present[ambiguous[CD4ratios$total[ambiguous]/CD8ratios$total[ambiguous]<=0.5]]=0
53+
indices$CD8present[ambiguous[CD4ratios$total[ambiguous]/CD8ratios$total[ambiguous]>=2]]=0
54+
55+
# indices$flag=indices$CD4present+indices$CD8present
56+
57+
# indices[indices$flag==2,]
58+
59+
#CD4cols=grep('CD4',names(indices),ignore.case=T,value=T)
60+
#CD4cols=CD4cols[grep('present',CD4cols,invert=T)]
61+
#CD8cols=grep('CD8',names(indices),ignore.case=T,value=T)
62+
#CD8cols=CD8cols[grep('present',CD8cols,invert=T)]
63+
#indices[CD4cols[,indices$CD4present==1]=1]
64+
65+
#indices[indices$flag==2,]=0
66+
#indices[indices$CD4present==1,CD4cols]=1
67+
#indices[indices$CD4present==1,CD8cols]=0
68+
#indices[indices$CD8present==1,CD4cols]=0
69+
#indices[indices$CD8present==1,CD8cols]=1
70+
71+
#output[,2:dim(output)[2]]=output[,2:dim(output)[2]]*indices[,1:(dim(indices)[2]-3)]
72+
73+
ambiguous=which(indices$CD4present>0 & indices$CD8present>0)
74+
indices$CD4present[ambiguous]=0
75+
indices$CD8present[ambiguous]=0
76+
77+
output$CD4inds=indices$CD4present
78+
output$CD8inds=indices$CD8present
79+
80+
return(output)
81+
82+
}
83+
84+
inds=sorting_filter(dd)
85+
rownames(inds)<-inds$nucleotide
86+
data.f=data
87+
88+
# filter
89+
for(i in seq(data)){
90+
n=mynames[i]
91+
d=data[[i]]
92+
seq=intersect(rownames(d),rownames(inds))
93+
d=d[seq,]
94+
currentinds=inds[seq,]
95+
if(length(grep("CD4",n))){
96+
d$count= d$count*currentinds$CD4inds
97+
} else if(length(grep("CD8",n))){
98+
d$count= d$count*currentinds$CD8inds
99+
} else{print('PROBLEM')}
100+
data.f[[i]]=d
101+
}
102+
103+
# remove 0s
104+
data.ff=lapply(data.f, function(x) x[x$count>0,])
105+
106+
for(i in seq(data.ff)){
107+
write.table(data.ff[[i]],file=paste(outputpath,'/',mynames[i],'.filtered',sep=''),quote=F,sep='\t',row.names = F)
108+
}

0 commit comments

Comments
 (0)