Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions apps/RepSeq/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
FROM rocker/r-ver:4.0.0

ARG WHEN

RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev libssl-dev libssh2-1-dev libxml2-dev zlib1g-dev && \
R -e "options(repos = \
list(CRAN = 'http://mran.revolutionanalytics.com/snapshot/${WHEN}')); \
install.packages(c('devtools','argparse', 'testthat', 'roxygen2', 'BiocManager', 'data.table','pbapply','pheatmap','naturalsort', 'scales', 'magick','Rcpp', 'vegan', 'ggplot2'))"
RUN R -e "BiocManager:: install('DESeq2')"



COPY ./common/process_mixcr_to_tm.R /home/analysis/process_mixcr_to_r.R
COPY ./common/diversity.R /home/analysis/diversity.R
COPY ./common/cdr3length.R /home/analysis/cdr3length.R

11 changes: 11 additions & 0 deletions apps/RepSeq/common/cdr3length.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#' a small function to compute CDR3 length and frequence by length
#' @param datatab a dataframe of clonotype
lengthFreq<-function(datatab){
#compute cdr3dna length
lengthdf<-datatab@assayData[,.(length=nchar(CDR3dna)),by=c("VpJ", "count","lib")]
# compute frequency and format dataframe
lengthDfFormated<-lengthdf %>% group_by(lib) %>% dplyr::count( length) %>%group_by(
lib)%>%mutate(n=n/sum(n))%>% tidyr::pivot_wider(names_from = length, values_from = n) %>% select(
lib,order(colnames(.)))%>%mutate_all(~replace(., is.na(.), 0))
return(lengthDfFormated)
}
142 changes: 142 additions & 0 deletions apps/RepSeq/common/diversity.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@

#' compute lots of diversity indices
#' modified from repseq package
#' @param x: Repseqexperiment S4 object
#' @param level: level for computation of diversity
#' @return dataframe of diversity
basicIndicesA <- function(x, level=c("VpJ", "V", "J", "VJ", "CDR3aa")) {
# function to create all diversity

if (missing(x)) stop("x is missing.")
if (!is.RepSeqExperiment(x)) stop("an object of class is expected.")
levelChoice <- match.arg(level)
# summary function
pastek <- function(y) list(shannon=diversityA(y, index="shannon"), simpson=diversityA(y, index="simpson"), berger_parker=diversityA(y, index="berger_parker"),
invsimpson=diversityA(y, index="invsimpson"),pielou=diversityA(y, index="pielou"),richness=diversityA(y, index="richness"), gini=gini(y),
chao1=chao1(y)$chao.est, chao1.se=chao1(y)$chao.se, ichao=iChao(y), chaowor=Chaowor(y))
out <- assay(x)[, .(count=sum(count)), by=c("lib", levelChoice)][, pastek(count), by="lib"]
return(out)
}

#' modified function of repseq package to add other diversity indices
#' compute: simpson, shannon, pielou, richness, gini, berger_parker, invsimpson
#' parameters:
#' @param x: a dataframe of count by sample ( lib)
#' @param index: diversity index to compute
#' @param norm: boolean to normalise some diversity
#' @param base: exponential 1
#' @return the diversity indice needed for a repertoir
diversityA <- function(x, index=c("shannon", "simpson", "invsimpson", "berger_parker", "pielou", "richness"), norm=TRUE, base=exp(1)) {


if (missing(x)) stop("data set x is required.")
x <- x/sum(x)
id <- match.arg(index)
if(id=="berger_parker"){
# berger parker

H <- -log(max(x))
}
if (id == "shannon"|| id=="pielou") {
x <- -x * log(x, base)

}else if(id=="berger_parker"){

}
else {
x <- x * x
}
H <- sum(x, na.rm = TRUE)
if (id=="pielou"){
norm=FALSE
H <-H/log(length(x),base)
}
if (norm) H <- H/log(sum(x>0), base)
if (id == "simpson") {
H <- 1 - H
} else if (id == "invsimpson") {
H <- 1/H
}
if (id=="berger_parker"){
H <- -log(max(x))
}

if (id=="richness"){
#richness
H<-length(x)
}
return(H)
}

#' a function from the repseq package not modified
#' @param x an object of class RepSeqExperiment
#' @return gini diversity indice
gini <- function(x) {
x <- sort(x)
n <- length(x)
out <- 1/n * (n + 1 - 2 * sum((n + 1 - 1:n) * x)/sum(x))
out
}

# compute Chao1 indices
#' a function from the repseq package not modified
# function computes Chao indices
#' @param x an object of class RepSeqExperiment
#' @return a list of chao diversity index
chao1 <- function(x) {
f1 <- sum(x == 1)
f2 <- sum(x == 2)
S <- sum(x > 0)
#if (f2 > 0) est <- S + f1^2/(2*f2)
#if (f2 == 0) est <- S + f1*(f1 - 1)/2
est <- S + f1*(f1 - 1)/(2 * (f2 + 1))
r <- f1/f2
chao.var <- f2 * ((r^4)/4 + r^3 + (r^2)/2)
chao.se <- sqrt(chao.var)
#return(est)
return(list(chao.est=est, chao.se=chao.se))
}

#.d50 <- function(x) {
#
#
#}

#' Improved Chao1
#' a function from the repseq package not modified
#' function computes the improve version of Chao1
#' @param x a vector of count.
#' @return improved Chao1 value (ref) Chao and Lin Chao, A. and Lin, C.-W. (2012) Nonparametric lower bounds for species richness and shared species richness under sampling without replacement. Biometrics,68, 912–921.
#' @export
# @example
iChao <- function(x) {
f1 <- sum(x == 1)
f2 <- sum(x == 2)
f3 <- sum(x == 3)
f4 <- sum(x == 4)
if (f4 == 0) f4 <- 1
n <- sum(x)
p1 <- (n-3)/n
p2 <- (n-3)/(n-1)
est <- chao1(x)$chao.est + p1*f3/(4*f4) * max(f1 - p2*f2*f3/(2*f4), 0)
return(est)
}

#' Adjusted Chao1 for sampling without replacement
#'
#' function compute the correct
#' @param x a vector of counts
#' @return a value of Chao1
#' @export
# @example
Chaowor <- function(x) {
f1 <- sum(x == 1)
f2 <- sum(x == 2)
S <- sum(x > 0)
n <- sum(x)
q <- n/S
dn1 <- n*2*f2/(n-1)
dn2 <- q*f1/(q-1)
est <- S + (f1^2)/(dn1 + dn2)
return(est)
}
154 changes: 154 additions & 0 deletions apps/RepSeq/common/process_mixcr_to_tm.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
library(RepSeq)
#library(docstring)
library("argparse")
library('data.table')
source("diversity.R")
source("cdr3length.R")
library(plyr)
library(dplyr)
library(tidyr)
library(tibble)

main<-function(){
#main function
# example from vignette
parser <- ArgumentParser(description='Process some integers')
parser$add_argument('directory', type="character",help='file directory')
parser$add_argument('--output_dir',default="./results/", type="character",help='output directory')
parser$add_argument('--level', type="character",default="VpJ", dest='level', help='V J VpJ')

# parser$add_argument('--alpha', type="integer", nargs='+',default=c(0, 0.25, 0.5, 1.00001, 2, 4, 8, 16, 32, 64,Inf) ,help='an integer for the accumulator')

args=parser$parse_args()
try( dir.create(args$output_dir))

#list file in input folder
file_list <- list.files(args$directory, full.name = TRUE, pattern = "*.tsv")
# teff_file_list <- list.files(args$directory, full.name = TRUE, pattern = ".*teff.*.txt")
process_file(file_list,chain="A",cellType="teff", paste(args$output_dir,"res/",sep=""), args$level,args$directory)
process_file(file_list,chain="B",cellType="teff",paste(args$output_dir,"res/",sep=""), args$level,args$directory)

# process_file(teff_file_list,chain="A",cellType="teff", paste(args$output_dir,sep=""), args$level,args$directory)
# process_file(teff_file_list,chain="B",cellType="treg",paste(args$output_dir,sep=""), args$level,args$directory)
}

#' process mixcr file to create sample info and diversity info
#' @param inputFolder folder where mixcr file are stored
#' @param chain alpha or beta chain for TCR (A, B)
#' @param output_dir output directory
#' @param level level for processing V, J, VpJ, CDR3aa
#' @param inputDir input directory
process_file<-function(inputFolder,cellType, chain, output_dir, level,inputDir){

datatab <- readClonotypeSet(inputFolder, cores=2L, aligner="MiXCR", chain=chain, sampleinfo=NULL, keep.ambiguous=FALSE, keep.unproductive=FALSE, aa.th=8)
if(!file.exists(paste(output_dir,"cdr3lenght",chain,".csv", sep=""))){
# write sample info in a result file
# lengthFreqDf<-lengthFreq(datatab)
# lengthFreqDf$lib<-toupper(sub("()(-.*)","",lengthFreqDf$lib))
# setnames(lengthFreqDf,"lib","id")
# write.csv(lengthFreqDf,paste(output_dir,"cdr3lenght",cellType,chain,".csv", sep=""),row.names=FALSE)

}else{
stop(paste("file",output_dir,"cdr3lenght",cellType,chain,".csv"," already exist", sep=""))
}
# # write to a file
if(!file.exists(paste(output_dir,"sample_info",cellType,chain,".csv", sep=""))){
# write sample info in a result file
print(output_dir)
file_name <-paste(output_dir,"sample_info",cellType,chain,".csv", sep="")
write.csv(datatab@sampleData,file_name)

}else{
stop(paste("file",output_dir,"sample_info",cellType,chain,".csv"," already exist", sep=""))
}
if(!file.exists(paste(output_dir,"diversity_info",cellType,chain,".csv", sep=""))){
#diversity info from modified function of repseq package

basicIndices<-basicIndicesA(datatab, level)
basicIndices$lib<-toupper(sub("()(-.*)","",basicIndices$lib))
setnames(basicIndices,"lib","id")
# rownames(basicIndices)<-basicIndices$id

write.csv(basicIndices,paste(output_dir,"diversity_info",cellType,chain,".csv", sep=""),row.names=FALSE)
length<-datatab@assayData[,.(length=nchar(CDR3dna)),by="CDR3dna"][, mean(length), by="CDR3dna"]
}else{
stop(paste("file",output_dir,"diversity_info",cellType,chain,".csv"," already exist", sep=""))

}
# # write to a file
if(!file.exists(paste(output_dir,"renyi_Profile",cellType,chain,".csv", sep=""))){
res<-renyiProfiles(datatab, level=level)
res<-t(res)
d<-as.data.frame(rownames(res))
colnames(res) <- res[1,]
res<-cbind(d,res)
res<-res[-1,]
setnames(res, "rownames(res)", "id")
row.names(res) <- NULL

res$id <- toupper(sub("()(-.*)","", res$id))
# rownames(res)<-res$id
write.csv(res,paste(output_dir,"renyi_Profile",cellType,chain,".csv", sep=""),row.names=FALSE)
# writeReadme(output_dir,inputDir,level,datatab@History$history)
}else{
stop(paste("file",output_dir,"renyi_Profile",cellType,chain,".csv"," already exist", sep=""))

}

writeReadme(output_dir,inputDir,level,datatab@History)
rm(datatab)
}


#' write a read me for the file
#' @param output_dir output directory
#' @param inputDir input directory
#' @param level level for processing V, J, VpJ, CDR3aa
#' @param RepseqExperimentHistory a dataframe with the history of a repseq experiment
writeReadme<-function(output_dir, input_dir,level, RepseqExperimentHistory){
info_file<-paste(output_dir,"info.txt", sep="")
file.create(info_file)

write("\n",info_file,append =TRUE)

write("----------- commit where the data was generated :-------",info_file,append =TRUE)
try(write(system("git show --oneline -s", intern = TRUE),info_file,append =TRUE))
write("----------parameters:---------",info_file,append =TRUE)
write(paste("input directory :",input_dir,sep=" "),info_file,append =TRUE)
write(paste("analysis level :",level,sep=" "),info_file,append =TRUE)
write(paste("output directory :",output_dir,sep=" "),info_file,append =TRUE)
write("\n",info_file,append =TRUE)
print("------RepseqExperiment history:--------")
write("----------RepseqExperiment history:---------",info_file,append =TRUE)

#write(RepseqExperimentHistory,info_file,append =TRUE)

sessionInfo<-sessionInfo()
write(sessionInfo$R.version$version.string,info_file,append =TRUE)
write("\n",info_file,append =TRUE)

write("--------used library---------",info_file,append =TRUE)

sapply(sessionInfo$otherPkgs,function(x)lib_write(x,info_file))

}

#' write information in readme file
#' @param package_info information from used package
#' @param file_name file name
lib_write<-function(package_info, file_name){

write(package_info$Package,file_name,append =TRUE)
write(package_info$Title,file_name,append =TRUE)
write(package_info$Version,file_name,append =TRUE)

}

meanLength<-function(repseqExp){
if (missing(repseqExp)) stop("x is missing.")
if (!is.RepSeqExperiment(repseqExp)) stop("an object of class RepSeqExperiment is expected.")
}


main()

Loading