From 7b30dadd5664017c24132234f39ee9cf4e4d38b5 Mon Sep 17 00:00:00 2001 From: azufre451 Date: Tue, 16 Jun 2020 03:21:48 +0200 Subject: [PATCH] update version --- bt2_against_metagenomes/map_bt2.sh | 16 ++++++--- bt2_against_metagenomes/map_bt2_svr.sh | 38 ++++++++++++++++++++ bt2_against_metagenomes/map_bt2_svrcibio.sh | 39 +++++++++++++++++++++ bt2_against_metagenomes/map_launch.sh | 10 +++--- step3_clustering/prevalence_ht.py | 27 ++++++++++++-- 5 files changed, 117 insertions(+), 13 deletions(-) create mode 100755 bt2_against_metagenomes/map_bt2_svr.sh create mode 100755 bt2_against_metagenomes/map_bt2_svrcibio.sh diff --git a/bt2_against_metagenomes/map_bt2.sh b/bt2_against_metagenomes/map_bt2.sh index 7a055c2..7f25fa9 100755 --- a/bt2_against_metagenomes/map_bt2.sh +++ b/bt2_against_metagenomes/map_bt2.sh @@ -11,11 +11,19 @@ #INDEX=/shares/CIBIO-Storage/CM/scratch/users/moreno.zolfo/virome_data/high_enrichment_vs_all_contigs_LT3/out_P_vsearch/step4_clusters_greedy/step4_greedy; INDEX=/shares/CIBIO-Storage/CM/scratch/users/moreno.zolfo/virome_data/high_enrichment_vs_all_contigs_LT5/out_P_vsearch/step4_clusters/rep_fnas_bt2; #inlist=$(ls ${prefix}/${dataset}/reads/${sample}/*${extension} | xargs echo | sed 's/ /,/g') -rm /tmp/samtools.*; +if [ -d /mnt/localscratch ]; then + tpf=/mnt/localscratch; +elif [ -d /shares/CIBIO-Storage/CM/news/users/moreno.zolfo/tmp ]; then + tpf=/shares/CIBIO-Storage/CM/news/users/moreno.zolfo/tmp; +elif [ -d /shares/CIBIO-Storage/CM/scratch/users/moreno.zolfo/tmp ]; then + tpf=/shares/CIBIO-Storage/CM/scratch/users/moreno.zolfo/tmp; +else + tpf=/shares/CIBIO-Storage/CM/tmp/mzolfo/tmp_data/; +fi; -if [ -d /mnt/localscratch ]; then tpf=/mnt/localscratch; else tpf=/shares/CIBIO-Storage/CM/tmp/mzolfo/tmp_data/; fi; tmp_folder=${tpf}/${dataset}__${sample}/; + mkdir -p ${tmp_folder}; @@ -23,6 +31,4 @@ echo "OK" ${tmp_folder} $uncompress_cmd ${prefix}/${dataset}/reads/${sample}/*${extension} | /shares/CIBIO-Storage/CM/news/users/moreno.zolfo/mytools/fastq_len_filter.py --min_len 75 | bowtie2 -U - -p ${ncores} -x ${INDEX} -a --no-unal -S - | samtools view -bS - | samtools sort -@${ncores} -T ${tmp_folder} > ${tmp_folder}/vdbm__${dataset}__${sample}.bam; echo "MOVE" mv ${tmp_folder}/vdbm__${dataset}__${sample}.bam ${outDir}/; -echo "DONE" - - +echo "DONE" \ No newline at end of file diff --git a/bt2_against_metagenomes/map_bt2_svr.sh b/bt2_against_metagenomes/map_bt2_svr.sh new file mode 100755 index 0000000..cb2f42a --- /dev/null +++ b/bt2_against_metagenomes/map_bt2_svr.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +#PBS -l place=free +#PBS -V +prefix=$1 +outDir=$2 +dataset=$3 +sample=$4 +uncompress_cmd=$5 +ncores=4 +extension='.fastq.bz2'; + +INDEX=/shares/CIBIO-Storage/CM/scratch/users/moreno.zolfo/virome_data/high_enrichment_vs_all_contigs_LT5/out_P_vsearch/step4_clusters/rep_fnas_bt2; + +mkdir -p ${outDir}/${dataset}/ + +if [ ! -f ${outDir}/${dataset}/vdbm__${dataset}__${sample}.wk ] & [ ! -f ${outDir}/${dataset}/vdbm__${dataset}__${sample}.bam ] ; then + touch ${outDir}/${dataset}/vdbm__${dataset}__${sample}.wk + + if [ -d /local-storage/mz/ ]; then + tpf=/local-storage/mz/; + tmp_folder=${tpf}/${dataset}__${sample}/; + mkdir -p ${tmp_folder}; + + echo "OK" ${tmp_folder} + $uncompress_cmd ${prefix}/${dataset}/reads/${sample}/*${extension} | /shares/CIBIO-Storage/CM/news/users/moreno.zolfo/mytools/fastq_len_filter.py --min_len 75 | bowtie2 -U - -p ${ncores} -x ${INDEX} -a --no-unal -S - | samtools view -bS - | samtools sort -@${ncores} -T ${tmp_folder} > ${tmp_folder}/vdbm__${dataset}__${sample}.bam; + echo "MOVE" ${dataset} ${sample} + mv ${tmp_folder}/vdbm__${dataset}__${sample}.bam ${outDir}/${dataset}/; + echo "DONE" ${dataset} ${sample} + else + echo "Problem with TMP folder! " ${tmp_folder}; + fi; + +else + echo ${outDir}/${dataset}/vdbm__${dataset}__${sample}.wk "Exists!" +fi; + + diff --git a/bt2_against_metagenomes/map_bt2_svrcibio.sh b/bt2_against_metagenomes/map_bt2_svrcibio.sh new file mode 100755 index 0000000..3d9d7a5 --- /dev/null +++ b/bt2_against_metagenomes/map_bt2_svrcibio.sh @@ -0,0 +1,39 @@ +#!/bin/bash + +#PBS -l place=free +#PBS -V +prefix=$1 +outDir=$2 +dataset=$3 +sample=$4 +uncompress_cmd=$5 + +ncores=4 +extension='.fastq.bz2'; + +INDEX=/shares/CIBIO-Storage/CM/scratch/users/moreno.zolfo/virome_data/high_enrichment_vs_all_contigs_LT5/out_P_vsearch/step4_clusters/rep_fnas_bt2; + +mkdir -p ${outDir}/${dataset}/ + +if [ ! -f ${outDir}/${dataset}/vdbm__${dataset}__${sample}.wk ] & [ ! -f ${outDir}/${dataset}/vdbm__${dataset}__${sample}.bam ] ; then + touch ${outDir}/${dataset}/vdbm__${dataset}__${sample}.wk + + if [ -d /shares/CIBIO-Storage/CM/news/users/moreno.zolfo/tmp ]; then + tpf=/shares/CIBIO-Storage/CM/news/users/moreno.zolfo/tmp; + tmp_folder=${tpf}/${dataset}__${sample}/; + mkdir -p ${tmp_folder}; + + echo "OK" ${tmp_folder} + $uncompress_cmd ${prefix}/${dataset}/reads/${sample}/*${extension} | /shares/CIBIO-Storage/CM/news/users/moreno.zolfo/mytools/fastq_len_filter.py --min_len 75 | bowtie2 -U - -p ${ncores} -x ${INDEX} -a --no-unal -S - | samtools view -bS - | samtools sort -@${ncores} -T ${tmp_folder} > ${tmp_folder}/vdbm__${dataset}__${sample}.bam; + echo "MOVE" ${dataset} ${sample} + mv ${tmp_folder}/vdbm__${dataset}__${sample}.bam ${outDir}/${dataset}/; + echo "DONE" ${dataset} ${sample} + else + echo "Problem with TMP folder! " ${tmp_folder}; + fi; + +else + echo ${outDir}/${dataset}/vdbm__${dataset}__${sample}.wk "Exists!" +fi; + + diff --git a/bt2_against_metagenomes/map_launch.sh b/bt2_against_metagenomes/map_launch.sh index d57d759..6457fc3 100755 --- a/bt2_against_metagenomes/map_launch.sh +++ b/bt2_against_metagenomes/map_launch.sh @@ -106,8 +106,6 @@ CM_sardegna CM_guinea2 CM_ghana2' - - for et in $etp; do for utp in $(find ${prefix}/${et}/reads -maxdepth 1 -mindepth 1 -type d); do e+=($utp); @@ -133,25 +131,25 @@ while true; do if [ ! -f ${odir}/${dataset}/vdbm__${dataset}__${sample}.bam ] && [ ! -f ${odir}/${dataset}/vdbm__${dataset}__${sample}.wk ]; then if [ $b_short -lt 30 ]; then #echo "L" ${odir}/${dataset}/vdbm__${dataset}__${sample}.wk - qsub -q short_cpuQ -v prefix=\"${base}\",outDir=\"${odirE}\",dataset=\"${dataset}\",sample=\"${sample}\",uncompress_cmd=\"${extraction_cmd}\",extension=\"${extension}\",ncores=\"8\" -N VDBM_${dataset}_${sample} -l select=1:ncpus=8 ${curDir}/map_bt2.sh; + qsub -q short_cpuQ -v prefix=\"${base}\",outDir=\"${odirE}\",dataset=\"${dataset}\",sample=\"${sample}\",uncompress_cmd=\"${extraction_cmd}\",extension=\"${extension}\",ncores=\"16\" -N VDBM_${dataset}_${sample} -l select=1:ncpus=16 ${curDir}/map_bt2.sh; touch ${odir}/${dataset}/vdbm__${dataset}__${sample}.wk b_short=$((b_short+1)); bt=$((bt+1)); elif [ $b_common -lt 50 ]; then #echo "L" ${odir}/${dataset}/vdbm__${dataset}__${sample}.wk - qsub -q common_cpuQ -v prefix=\"${base}\",outDir=\"${odirE}\",dataset=\"${dataset}\",sample=\"${sample}\",uncompress_cmd=\"${extraction_cmd}\",extension=\"${extension}\",ncores=\"4\" -N VDBM_${dataset}_${sample} -l select=1:ncpus=4 ${curDir}/map_bt2.sh; + qsub -q common_cpuQ -v prefix=\"${base}\",outDir=\"${odirE}\",dataset=\"${dataset}\",sample=\"${sample}\",uncompress_cmd=\"${extraction_cmd}\",extension=\"${extension}\",ncores=\"8\" -N VDBM_${dataset}_${sample} -l select=1:ncpus=8 ${curDir}/map_bt2.sh; touch ${odir}/${dataset}/vdbm__${dataset}__${sample}.wk b_common=$((b_common+1)); bt=$((bt+1)); elif [ $b_cibio -lt 50 ]; then #echo "L" ${odir}/${dataset}/vdbm__${dataset}__${sample}.wk - qsub -q CIBIO_cpuQ -v prefix=\"${base}\",outDir=\"${odirE}\",dataset=\"${dataset}\",sample=\"${sample}\",uncompress_cmd=\"${extraction_cmd}\",extension=\"${extension}\",ncores=\"4\" -N VDBM_${dataset}_${sample} -l select=1:ncpus=4 ${curDir}/map_bt2.sh; + qsub -q CIBIO_cpuQ -v prefix=\"${base}\",outDir=\"${odirE}\",dataset=\"${dataset}\",sample=\"${sample}\",uncompress_cmd=\"${extraction_cmd}\",extension=\"${extension}\",ncores=\"8\" -N VDBM_${dataset}_${sample} -l select=1:ncpus=8 ${curDir}/map_bt2.sh; touch ${odir}/${dataset}/vdbm__${dataset}__${sample}.wk b_cibio=$((b_cibio+1)); bt=$((bt+1)); elif [ $b_cibiocm -lt 50 ]; then #echo "L" ${odir}/${dataset}/vdbm__${dataset}__${sample}.wk - qsub -q CIBIOCM_cpuQ -v prefix=\"${base}\",outDir=\"${odirE}\",dataset=\"${dataset}\",sample=\"${sample}\",uncompress_cmd=\"${extraction_cmd}\",extension=\"${extension}\",ncores=\"4\" -N VDBM_${dataset}_${sample} -l select=1:ncpus=4 ${curDir}/map_bt2.sh; + qsub -q CIBIOCM_cpuQ -v prefix=\"${base}\",outDir=\"${odirE}\",dataset=\"${dataset}\",sample=\"${sample}\",uncompress_cmd=\"${extraction_cmd}\",extension=\"${extension}\",ncores=\"8\" -N VDBM_${dataset}_${sample} -l select=1:ncpus=8 ${curDir}/map_bt2.sh; touch ${odir}/${dataset}/vdbm__${dataset}__${sample}.wk b_cibiocm=$((b_cibiocm+1)); bt=$((bt+1)); diff --git a/step3_clustering/prevalence_ht.py b/step3_clustering/prevalence_ht.py index 6fb0f08..df5100a 100755 --- a/step3_clustering/prevalence_ht.py +++ b/step3_clustering/prevalence_ht.py @@ -15,6 +15,7 @@ from Bio.Alphabet import IUPAC + parser = argparse.ArgumentParser() parser.add_argument('folder',help="folder with fasta files") parser.add_argument('output',help="outputCSV") @@ -48,7 +49,7 @@ #hardcoded replacements dataset = dataset.replace('CM_caritro','FerrettiP_2018').replace('ZeeviD_2015_A','ZeeviD_2015').replace('ZeeviD_2015_B','ZeeviD_2015').replace('RosarioK_2018_DNA','RosarioK_2018').replace('VLP_LyM_2016','LyM_2016').replace('VLP_Minot_2011','MinotS_2011').replace('VLP_Minot_2013','MinotS_2013').replace('VLP_NormanJ_2015','NormanJ_2015').replace('VLP_ReyesA_2015','ReyesA_2015').replace('LawrenceA_2015','DavidLA_2015').replace('VLP_LimE_2015_SIA','LimE_2015').replace('VLP_LimE_2015_MDA','LimE_2015').replace('LimE_2015_MDA','LimE_2015').replace('LimE_2015_SIA','LimE_2015') - dm.append({'clusterL1':L1Cluster,'clusterL2':L1Cluster,'group':group,'dataset':dataset,'sample':sample,'node':node}) + dm.append({'clusterL1':L1Cluster,'clusterL2':L2Cluster,'group':group,'dataset':dataset,'sample':sample,'node':node}) i+=1 @@ -62,5 +63,27 @@ #et1=pd.pivot_table(a,index='clusterL1',columns=['group','dataset'],values='sample',aggfunc=lambda x: len(x.unique()) ) et2=pd.pivot_table(merged,index='clusterL1',columns=['group','dataset','nsamples','dataset_short_source'],values='sample',aggfunc=lambda x: len(x.unique()) ) -#et1.fillna(0).to_csv('a.csv',sep='\t') + et2.fillna(0).to_csv(args.output,sep='\t') + +#following here there are the dataset slicing for by-dataset counts (for boxplot) + + +et3=merged.groupby(['group','dataset','dataset_short_source','sample'])['clusterL1'].agg(lambda x: len(x.unique())).fillna(0) +et3.to_csv(args.output+'_group_by_L1_clusters.csv',sep='\t') + + +et4=merged.groupby(['group','dataset','dataset_short_source','sample'])['clusterL2'].agg(lambda x: len(x.unique())).fillna(0) +et4.to_csv(args.output+'_group_by_L2_clusters.csv',sep='\t') + +## and crAssphage counts: + + +et5=merged[merged['clusterL1']=='vsearch_c72'].groupby(['group','dataset','dataset_short_source'])['sample'].agg(lambda x: len(x.unique())).fillna(0) +et5.to_csv(args.output+'_crassphage.csv',sep='\t') + +#et3=pd.pivot_table(merged,index='clusterL1',columns=['group','dataset','sample','nsamples','dataset_short_source'],values='node',aggfunc= ) + + + +