ireceptor-plus · schristley · Sep 3, 2020 · Sep 3, 2020 · Sep 3, 2020 · Sep 4, 2020
diff --git a/apps/RepSeq/Dockerfile b/apps/RepSeq/Dockerfile
@@ -0,0 +1,17 @@
+FROM rocker/r-ver:4.0.0
+
+ARG WHEN
+
+RUN apt-get update && \
+  apt-get install -y libcurl4-openssl-dev libssl-dev libssh2-1-dev libxml2-dev zlib1g-dev && \
+  R -e "options(repos = \
+  list(CRAN = 'http://mran.revolutionanalytics.com/snapshot/${WHEN}')); \
+  install.packages(c('devtools','argparse', 'testthat', 'roxygen2', 'BiocManager', 'data.table','pbapply','pheatmap','naturalsort', 'scales', 'magick','Rcpp', 'vegan', 'ggplot2'))"
+RUN R -e "BiocManager:: install('DESeq2')"
+
+
+
+COPY ./common/process_mixcr_to_tm.R /home/analysis/process_mixcr_to_r.R
+COPY ./common/diversity.R /home/analysis/diversity.R
+COPY ./common/cdr3length.R /home/analysis/cdr3length.R
+
diff --git a/apps/RepSeq/common/cdr3length.R b/apps/RepSeq/common/cdr3length.R
@@ -0,0 +1,11 @@
+#' a small function to compute CDR3 length and frequence by length
+#' @param datatab a dataframe of clonotype
+lengthFreq<-function(datatab){
+  #compute cdr3dna length
+  lengthdf<-datatab@assayData[,.(length=nchar(CDR3dna)),by=c("VpJ", "count","lib")]
+  # compute frequency and format dataframe
+  lengthDfFormated<-lengthdf %>% group_by(lib) %>% dplyr::count( length) %>%group_by(
+    lib)%>%mutate(n=n/sum(n))%>% tidyr::pivot_wider(names_from = length, values_from = n) %>% select(
+    lib,order(colnames(.)))%>%mutate_all(~replace(., is.na(.), 0))
+  return(lengthDfFormated)
+  }
diff --git a/apps/RepSeq/common/diversity.R b/apps/RepSeq/common/diversity.R
@@ -0,0 +1,142 @@
+
+#' compute lots of diversity indices
+#' modified from repseq package
+#' @param x: Repseqexperiment S4 object
+#' @param level: level for computation of diversity
+#' @return dataframe of diversity
+basicIndicesA <- function(x, level=c("VpJ", "V", "J", "VJ", "CDR3aa")) {
+  # function to create all diversity
+
+  if (missing(x)) stop("x is missing.")
+  if (!is.RepSeqExperiment(x)) stop("an object of class  is expected.")
+  levelChoice <- match.arg(level)
+  # summary function
+  pastek <- function(y) list(shannon=diversityA(y, index="shannon"), simpson=diversityA(y, index="simpson"), berger_parker=diversityA(y, index="berger_parker"),
+                             invsimpson=diversityA(y, index="invsimpson"),pielou=diversityA(y, index="pielou"),richness=diversityA(y, index="richness"), gini=gini(y), 
+                             chao1=chao1(y)$chao.est, chao1.se=chao1(y)$chao.se, ichao=iChao(y), chaowor=Chaowor(y))   
+  out <- assay(x)[, .(count=sum(count)), by=c("lib", levelChoice)][, pastek(count), by="lib"]
+  return(out)
+}
+
+#'  modified function of repseq package to add other diversity indices
+#' compute: simpson, shannon, pielou, richness, gini, berger_parker, invsimpson
+#' parameters:
+#' @param x: a dataframe of count by sample ( lib)
+#' @param index: diversity index to compute   
+#' @param norm: boolean to normalise some diversity 
+#' @param base: exponential 1 
+#' @return the diversity indice needed for a repertoir
+diversityA <- function(x, index=c("shannon", "simpson", "invsimpson", "berger_parker", "pielou", "richness"), norm=TRUE, base=exp(1)) {
+
+
+  if (missing(x)) stop("data set x is required.")
+  x <- x/sum(x)
+  id <- match.arg(index)
+  if(id=="berger_parker"){
+    # berger parker
+
+    H <- -log(max(x))
+  }
+  if (id == "shannon"|| id=="pielou") {
+    x <- -x * log(x, base)
+
+  }else if(id=="berger_parker"){
+
+  } 
+  else {
+    x <- x * x
+  }
+  H <- sum(x, na.rm = TRUE)
+  if (id=="pielou"){
+    norm=FALSE
+    H <-H/log(length(x),base)
+  }
+  if (norm) H <- H/log(sum(x>0), base)
+  if (id == "simpson") {
+    H <- 1 - H
+  } else if (id == "invsimpson") {
+    H <- 1/H
+  }
+  if (id=="berger_parker"){
+    H <- -log(max(x))
+  }
+
+  if (id=="richness"){
+    #richness
+    H<-length(x)
+  }
+  return(H)
+}
+
+#' a function from the repseq package not modified
+#' @param x an object of class RepSeqExperiment
+#' @return gini diversity indice
+gini <- function(x) {
+  x <- sort(x)
+  n <- length(x)
+  out <- 1/n * (n + 1 - 2 * sum((n + 1 - 1:n) * x)/sum(x))
+  out
+}
+
+# compute Chao1 indices
+#' a function from the repseq package not modified
+# function computes Chao indices
+#' @param x an object of class RepSeqExperiment
+#' @return a list of chao diversity index
+chao1 <- function(x) {
+  f1 <- sum(x == 1)
+  f2 <- sum(x == 2)
+  S <- sum(x > 0)
+  #if (f2 > 0) est <- S + f1^2/(2*f2)
+  #if (f2 == 0) est <- S + f1*(f1 - 1)/2
+  est <- S + f1*(f1 - 1)/(2 * (f2 + 1))
+  r <- f1/f2
+  chao.var <- f2 * ((r^4)/4 + r^3 + (r^2)/2)
+  chao.se <- sqrt(chao.var)
+  #return(est)
+  return(list(chao.est=est, chao.se=chao.se))
+}
+
+#.d50 <- function(x) {
+#    
+#
+#}
+
+#' Improved Chao1
+#' a function from the repseq package not modified
+#' function computes the improve version of Chao1
+#' @param x a vector of count.
+#' @return improved Chao1 value (ref) Chao and Lin Chao, A. and Lin, C.-W. (2012) Nonparametric lower bounds for species richness and shared species richness under sampling without replacement. Biometrics,68, 912–921. 
+#' @export
+# @example
+iChao <- function(x) {
+  f1 <- sum(x == 1)
+  f2 <- sum(x == 2)
+  f3 <- sum(x == 3)
+  f4 <- sum(x == 4)
+  if (f4 == 0) f4 <- 1
+  n <- sum(x)
+  p1 <- (n-3)/n
+  p2 <- (n-3)/(n-1)
+  est <- chao1(x)$chao.est + p1*f3/(4*f4) * max(f1 - p2*f2*f3/(2*f4), 0)
+  return(est)
+}
+
+#' Adjusted Chao1 for sampling without replacement 
+#'
+#' function compute the correct 
+#' @param x a vector of counts
+#' @return a value of Chao1
+#' @export
+# @example
+Chaowor <- function(x) {
+  f1 <- sum(x == 1)
+  f2 <- sum(x == 2)
+  S <- sum(x > 0)
+  n <- sum(x)
+  q <- n/S
+  dn1 <- n*2*f2/(n-1)
+  dn2 <- q*f1/(q-1)
+  est <- S + (f1^2)/(dn1 + dn2)
+  return(est)
+}
diff --git a/apps/RepSeq/common/process_mixcr_to_tm.R b/apps/RepSeq/common/process_mixcr_to_tm.R
@@ -0,0 +1,154 @@
+library(RepSeq)
+#library(docstring)
+library("argparse")
+library('data.table')
+source("diversity.R")
+source("cdr3length.R")
+library(plyr)
+library(dplyr)
+library(tidyr)
+library(tibble)
+
+main<-function(){
+  #main function 
+  # example from vignette
+  parser <- ArgumentParser(description='Process some integers')
+  parser$add_argument('directory', type="character",help='file directory')
+  parser$add_argument('--output_dir',default="./results/", type="character",help='output directory')
+  parser$add_argument('--level', type="character",default="VpJ", dest='level', help='V J VpJ')
+
+  # parser$add_argument('--alpha', type="integer", nargs='+',default=c(0, 0.25, 0.5, 1.00001, 2, 4, 8, 16, 32, 64,Inf) ,help='an integer for the accumulator')
+
+  args=parser$parse_args()
+  try( dir.create(args$output_dir))
+
+  #list file in input folder
+  file_list <- list.files(args$directory, full.name = TRUE, pattern = "*.tsv")
+  # teff_file_list <- list.files(args$directory, full.name = TRUE, pattern = ".*teff.*.txt")
+  process_file(file_list,chain="A",cellType="teff", paste(args$output_dir,"res/",sep=""), args$level,args$directory)
+  process_file(file_list,chain="B",cellType="teff",paste(args$output_dir,"res/",sep=""), args$level,args$directory)
+
+  # process_file(teff_file_list,chain="A",cellType="teff", paste(args$output_dir,sep=""), args$level,args$directory)
+  # process_file(teff_file_list,chain="B",cellType="treg",paste(args$output_dir,sep=""), args$level,args$directory)
+}
+
+#' process mixcr file to create sample info and diversity info
+#' @param inputFolder folder where mixcr file are stored
+#' @param chain alpha or beta chain for TCR (A, B)
+#' @param output_dir output directory 
+#' @param level level for processing V, J, VpJ, CDR3aa
+#' @param inputDir input directory
+process_file<-function(inputFolder,cellType, chain, output_dir, level,inputDir){
+
+  datatab <- readClonotypeSet(inputFolder, cores=2L, aligner="MiXCR", chain=chain, sampleinfo=NULL, keep.ambiguous=FALSE, keep.unproductive=FALSE, aa.th=8)
+  if(!file.exists(paste(output_dir,"cdr3lenght",chain,".csv", sep=""))){
+    # write sample info in a result file
+    # lengthFreqDf<-lengthFreq(datatab)
+    # lengthFreqDf$lib<-toupper(sub("()(-.*)","",lengthFreqDf$lib))
+    # setnames(lengthFreqDf,"lib","id")
+    # write.csv(lengthFreqDf,paste(output_dir,"cdr3lenght",cellType,chain,".csv", sep=""),row.names=FALSE)
+
+  }else{
+    stop(paste("file",output_dir,"cdr3lenght",cellType,chain,".csv"," already exist", sep=""))
+  }
+  # # write to a file
+  if(!file.exists(paste(output_dir,"sample_info",cellType,chain,".csv", sep=""))){
+    # write sample info in a result file
+    print(output_dir)
+    file_name <-paste(output_dir,"sample_info",cellType,chain,".csv", sep="")
+    write.csv(datatab@sampleData,file_name)
+
+  }else{
+    stop(paste("file",output_dir,"sample_info",cellType,chain,".csv"," already exist", sep=""))
+  }
+  if(!file.exists(paste(output_dir,"diversity_info",cellType,chain,".csv", sep=""))){
+    #diversity info from modified function of repseq package
+
+    basicIndices<-basicIndicesA(datatab, level)
+    basicIndices$lib<-toupper(sub("()(-.*)","",basicIndices$lib))
+    setnames(basicIndices,"lib","id")
+    # rownames(basicIndices)<-basicIndices$id 
+
+  write.csv(basicIndices,paste(output_dir,"diversity_info",cellType,chain,".csv", sep=""),row.names=FALSE)
+  length<-datatab@assayData[,.(length=nchar(CDR3dna)),by="CDR3dna"][, mean(length), by="CDR3dna"]
+  }else{
+    stop(paste("file",output_dir,"diversity_info",cellType,chain,".csv"," already exist", sep=""))
+
+  }
+  # # write to a file
+  if(!file.exists(paste(output_dir,"renyi_Profile",cellType,chain,".csv", sep=""))){
+    res<-renyiProfiles(datatab, level=level)
+    res<-t(res)
+    d<-as.data.frame(rownames(res))
+    colnames(res) <- res[1,]
+    res<-cbind(d,res)
+    res<-res[-1,]
+    setnames(res, "rownames(res)", "id")
+    row.names(res) <- NULL
+
+    res$id <- toupper(sub("()(-.*)","", res$id))
+    # rownames(res)<-res$id
+  write.csv(res,paste(output_dir,"renyi_Profile",cellType,chain,".csv", sep=""),row.names=FALSE)
+#  writeReadme(output_dir,inputDir,level,datatab@History$history)
+  }else{
+    stop(paste("file",output_dir,"renyi_Profile",cellType,chain,".csv"," already exist", sep=""))
+
+  }
+
+  writeReadme(output_dir,inputDir,level,datatab@History)
+  rm(datatab)
+}
+
+
+#' write a read me for the file
+#' @param output_dir output directory 
+#' @param inputDir input directory
+#' @param level level for processing V, J, VpJ, CDR3aa
+#' @param RepseqExperimentHistory a dataframe with the history of a repseq experiment
+writeReadme<-function(output_dir, input_dir,level, RepseqExperimentHistory){
+  info_file<-paste(output_dir,"info.txt", sep="")
+  file.create(info_file)
+
+  write("\n",info_file,append =TRUE)
+
+  write("----------- commit where the data was generated :-------",info_file,append =TRUE)
+  try(write(system("git show --oneline -s", intern = TRUE),info_file,append =TRUE))
+  write("----------parameters:---------",info_file,append =TRUE)
+  write(paste("input directory :",input_dir,sep=" "),info_file,append =TRUE)
+  write(paste("analysis level :",level,sep=" "),info_file,append =TRUE)
+  write(paste("output directory :",output_dir,sep=" "),info_file,append =TRUE)
+  write("\n",info_file,append =TRUE)
+  print("------RepseqExperiment history:--------")
+  write("----------RepseqExperiment history:---------",info_file,append =TRUE)
+
+  #write(RepseqExperimentHistory,info_file,append =TRUE)
+
+  sessionInfo<-sessionInfo()
+  write(sessionInfo$R.version$version.string,info_file,append =TRUE)
+  write("\n",info_file,append =TRUE)
+
+  write("--------used library---------",info_file,append =TRUE)
+
+  sapply(sessionInfo$otherPkgs,function(x)lib_write(x,info_file))
+
+}
+
+#' write information in readme file
+#' @param package_info information from used package
+#' @param file_name file name
+lib_write<-function(package_info, file_name){
+
+  write(package_info$Package,file_name,append =TRUE)
+  write(package_info$Title,file_name,append =TRUE)
+  write(package_info$Version,file_name,append =TRUE)
+
+}
+
+meanLength<-function(repseqExp){
+  if (missing(repseqExp)) stop("x is missing.")
+  if (!is.RepSeqExperiment(repseqExp)) stop("an object of class RepSeqExperiment is expected.")
+}
+
+
+main()
+