Skip to content

Commit

Permalink
update version
Browse files Browse the repository at this point in the history
  • Loading branch information
azufre451 committed Jun 16, 2020
1 parent a41cdcf commit 7b30dad
Show file tree
Hide file tree
Showing 5 changed files with 117 additions and 13 deletions.
16 changes: 11 additions & 5 deletions bt2_against_metagenomes/map_bt2.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,18 +11,24 @@
#INDEX=/shares/CIBIO-Storage/CM/scratch/users/moreno.zolfo/virome_data/high_enrichment_vs_all_contigs_LT3/out_P_vsearch/step4_clusters_greedy/step4_greedy;
INDEX=/shares/CIBIO-Storage/CM/scratch/users/moreno.zolfo/virome_data/high_enrichment_vs_all_contigs_LT5/out_P_vsearch/step4_clusters/rep_fnas_bt2;
#inlist=$(ls ${prefix}/${dataset}/reads/${sample}/*${extension} | xargs echo | sed 's/ /,/g')
rm /tmp/samtools.*;

if [ -d /mnt/localscratch ]; then
tpf=/mnt/localscratch;
elif [ -d /shares/CIBIO-Storage/CM/news/users/moreno.zolfo/tmp ]; then
tpf=/shares/CIBIO-Storage/CM/news/users/moreno.zolfo/tmp;
elif [ -d /shares/CIBIO-Storage/CM/scratch/users/moreno.zolfo/tmp ]; then
tpf=/shares/CIBIO-Storage/CM/scratch/users/moreno.zolfo/tmp;
else
tpf=/shares/CIBIO-Storage/CM/tmp/mzolfo/tmp_data/;
fi;

if [ -d /mnt/localscratch ]; then tpf=/mnt/localscratch; else tpf=/shares/CIBIO-Storage/CM/tmp/mzolfo/tmp_data/; fi;
tmp_folder=${tpf}/${dataset}__${sample}/;

mkdir -p ${tmp_folder};


echo "OK" ${tmp_folder}
$uncompress_cmd ${prefix}/${dataset}/reads/${sample}/*${extension} | /shares/CIBIO-Storage/CM/news/users/moreno.zolfo/mytools/fastq_len_filter.py --min_len 75 | bowtie2 -U - -p ${ncores} -x ${INDEX} -a --no-unal -S - | samtools view -bS - | samtools sort -@${ncores} -T ${tmp_folder} > ${tmp_folder}/vdbm__${dataset}__${sample}.bam;
echo "MOVE"
mv ${tmp_folder}/vdbm__${dataset}__${sample}.bam ${outDir}/;
echo "DONE"


echo "DONE"
38 changes: 38 additions & 0 deletions bt2_against_metagenomes/map_bt2_svr.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#!/bin/bash

#PBS -l place=free
#PBS -V
#
# Map one metagenomic sample against the viral-cluster bowtie2 index and
# deposit a sorted BAM in ${outDir}/${dataset}/. Intended for nodes with a
# /local-storage/mz/ scratch area.
#
# Arguments:
#   $1 prefix          base folder containing ${dataset}/reads/${sample}/
#   $2 outDir          destination folder for the per-dataset BAMs
#   $3 dataset         dataset name
#   $4 sample          sample name
#   $5 uncompress_cmd  command that streams the reads to stdout (e.g. bzcat)
prefix=$1
outDir=$2
dataset=$3
sample=$4
uncompress_cmd=$5
ncores=4
extension='.fastq.bz2';

INDEX=/shares/CIBIO-Storage/CM/scratch/users/moreno.zolfo/virome_data/high_enrichment_vs_all_contigs_LT5/out_P_vsearch/step4_clusters/rep_fnas_bt2;

mkdir -p "${outDir}/${dataset}/"

# Skip samples that are finished (.bam) or already claimed (.wk lock file).
# BUGFIX: the original used a single '&' between the two tests, which ran the
# first [ ... ] as a background job and made the .wk lock check ineffective;
# '&&' is the intended conjunction.
if [ ! -f "${outDir}/${dataset}/vdbm__${dataset}__${sample}.wk" ] && [ ! -f "${outDir}/${dataset}/vdbm__${dataset}__${sample}.bam" ] ; then
	touch "${outDir}/${dataset}/vdbm__${dataset}__${sample}.wk"

	if [ -d /local-storage/mz/ ]; then
		tpf=/local-storage/mz/;
		tmp_folder=${tpf}/${dataset}__${sample}/;
		mkdir -p "${tmp_folder}";

		echo "OK" "${tmp_folder}"
		# Drop reads <75nt, report all alignments (-a), keep only mapped
		# reads (--no-unal), then sort the BAM on the node-local scratch.
		$uncompress_cmd ${prefix}/${dataset}/reads/${sample}/*${extension} | /shares/CIBIO-Storage/CM/news/users/moreno.zolfo/mytools/fastq_len_filter.py --min_len 75 | bowtie2 -U - -p ${ncores} -x ${INDEX} -a --no-unal -S - | samtools view -bS - | samtools sort -@${ncores} -T ${tmp_folder} > "${tmp_folder}/vdbm__${dataset}__${sample}.bam";
		echo "MOVE" "${dataset}" "${sample}"
		mv "${tmp_folder}/vdbm__${dataset}__${sample}.bam" "${outDir}/${dataset}/";
		echo "DONE" "${dataset}" "${sample}"
	else
		# BUGFIX: ${tmp_folder} was never assigned on this branch in the
		# original; report the directory that was actually missing, on stderr.
		echo "Problem with TMP folder! /local-storage/mz/ does not exist" >&2;
	fi;

else
	echo "${outDir}/${dataset}/vdbm__${dataset}__${sample}.wk" "Exists!"
fi;

39 changes: 39 additions & 0 deletions bt2_against_metagenomes/map_bt2_svrcibio.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#!/bin/bash

#PBS -l place=free
#PBS -V
#
# Map one metagenomic sample against the viral-cluster bowtie2 index and
# deposit a sorted BAM in ${outDir}/${dataset}/. Variant of map_bt2_svr.sh for
# hosts that can use the CIBIO-Storage tmp area instead of node-local scratch.
#
# Arguments:
#   $1 prefix          base folder containing ${dataset}/reads/${sample}/
#   $2 outDir          destination folder for the per-dataset BAMs
#   $3 dataset         dataset name
#   $4 sample          sample name
#   $5 uncompress_cmd  command that streams the reads to stdout (e.g. bzcat)
prefix=$1
outDir=$2
dataset=$3
sample=$4
uncompress_cmd=$5

ncores=4
extension='.fastq.bz2';

INDEX=/shares/CIBIO-Storage/CM/scratch/users/moreno.zolfo/virome_data/high_enrichment_vs_all_contigs_LT5/out_P_vsearch/step4_clusters/rep_fnas_bt2;

mkdir -p "${outDir}/${dataset}/"

# Skip samples that are finished (.bam) or already claimed (.wk lock file).
# BUGFIX: the original used a single '&' between the two tests, which ran the
# first [ ... ] as a background job and made the .wk lock check ineffective;
# '&&' is the intended conjunction.
if [ ! -f "${outDir}/${dataset}/vdbm__${dataset}__${sample}.wk" ] && [ ! -f "${outDir}/${dataset}/vdbm__${dataset}__${sample}.bam" ] ; then
	touch "${outDir}/${dataset}/vdbm__${dataset}__${sample}.wk"

	if [ -d /shares/CIBIO-Storage/CM/news/users/moreno.zolfo/tmp ]; then
		tpf=/shares/CIBIO-Storage/CM/news/users/moreno.zolfo/tmp;
		tmp_folder=${tpf}/${dataset}__${sample}/;
		mkdir -p "${tmp_folder}";

		echo "OK" "${tmp_folder}"
		# Drop reads <75nt, report all alignments (-a), keep only mapped
		# reads (--no-unal), then sort the BAM in the shared tmp folder.
		$uncompress_cmd ${prefix}/${dataset}/reads/${sample}/*${extension} | /shares/CIBIO-Storage/CM/news/users/moreno.zolfo/mytools/fastq_len_filter.py --min_len 75 | bowtie2 -U - -p ${ncores} -x ${INDEX} -a --no-unal -S - | samtools view -bS - | samtools sort -@${ncores} -T ${tmp_folder} > "${tmp_folder}/vdbm__${dataset}__${sample}.bam";
		echo "MOVE" "${dataset}" "${sample}"
		mv "${tmp_folder}/vdbm__${dataset}__${sample}.bam" "${outDir}/${dataset}/";
		echo "DONE" "${dataset}" "${sample}"
	else
		# BUGFIX: ${tmp_folder} was never assigned on this branch in the
		# original; report the directory that was actually missing, on stderr.
		echo "Problem with TMP folder! /shares/CIBIO-Storage/CM/news/users/moreno.zolfo/tmp does not exist" >&2;
	fi;

else
	echo "${outDir}/${dataset}/vdbm__${dataset}__${sample}.wk" "Exists!"
fi;


10 changes: 4 additions & 6 deletions bt2_against_metagenomes/map_launch.sh
Original file line number Diff line number Diff line change
Expand Up @@ -106,8 +106,6 @@ CM_sardegna
CM_guinea2
CM_ghana2'



for et in $etp; do
for utp in $(find ${prefix}/${et}/reads -maxdepth 1 -mindepth 1 -type d); do
e+=($utp);
Expand All @@ -133,25 +131,25 @@ while true; do
if [ ! -f ${odir}/${dataset}/vdbm__${dataset}__${sample}.bam ] && [ ! -f ${odir}/${dataset}/vdbm__${dataset}__${sample}.wk ]; then
if [ $b_short -lt 30 ]; then
#echo "L" ${odir}/${dataset}/vdbm__${dataset}__${sample}.wk
qsub -q short_cpuQ -v prefix=\"${base}\",outDir=\"${odirE}\",dataset=\"${dataset}\",sample=\"${sample}\",uncompress_cmd=\"${extraction_cmd}\",extension=\"${extension}\",ncores=\"8\" -N VDBM_${dataset}_${sample} -l select=1:ncpus=8 ${curDir}/map_bt2.sh;
qsub -q short_cpuQ -v prefix=\"${base}\",outDir=\"${odirE}\",dataset=\"${dataset}\",sample=\"${sample}\",uncompress_cmd=\"${extraction_cmd}\",extension=\"${extension}\",ncores=\"16\" -N VDBM_${dataset}_${sample} -l select=1:ncpus=16 ${curDir}/map_bt2.sh;
touch ${odir}/${dataset}/vdbm__${dataset}__${sample}.wk
b_short=$((b_short+1));
bt=$((bt+1));
elif [ $b_common -lt 50 ]; then
#echo "L" ${odir}/${dataset}/vdbm__${dataset}__${sample}.wk
qsub -q common_cpuQ -v prefix=\"${base}\",outDir=\"${odirE}\",dataset=\"${dataset}\",sample=\"${sample}\",uncompress_cmd=\"${extraction_cmd}\",extension=\"${extension}\",ncores=\"4\" -N VDBM_${dataset}_${sample} -l select=1:ncpus=4 ${curDir}/map_bt2.sh;
qsub -q common_cpuQ -v prefix=\"${base}\",outDir=\"${odirE}\",dataset=\"${dataset}\",sample=\"${sample}\",uncompress_cmd=\"${extraction_cmd}\",extension=\"${extension}\",ncores=\"8\" -N VDBM_${dataset}_${sample} -l select=1:ncpus=8 ${curDir}/map_bt2.sh;
touch ${odir}/${dataset}/vdbm__${dataset}__${sample}.wk
b_common=$((b_common+1));
bt=$((bt+1));
elif [ $b_cibio -lt 50 ]; then
#echo "L" ${odir}/${dataset}/vdbm__${dataset}__${sample}.wk
qsub -q CIBIO_cpuQ -v prefix=\"${base}\",outDir=\"${odirE}\",dataset=\"${dataset}\",sample=\"${sample}\",uncompress_cmd=\"${extraction_cmd}\",extension=\"${extension}\",ncores=\"4\" -N VDBM_${dataset}_${sample} -l select=1:ncpus=4 ${curDir}/map_bt2.sh;
qsub -q CIBIO_cpuQ -v prefix=\"${base}\",outDir=\"${odirE}\",dataset=\"${dataset}\",sample=\"${sample}\",uncompress_cmd=\"${extraction_cmd}\",extension=\"${extension}\",ncores=\"8\" -N VDBM_${dataset}_${sample} -l select=1:ncpus=8 ${curDir}/map_bt2.sh;
touch ${odir}/${dataset}/vdbm__${dataset}__${sample}.wk
b_cibio=$((b_cibio+1));
bt=$((bt+1));
elif [ $b_cibiocm -lt 50 ]; then
#echo "L" ${odir}/${dataset}/vdbm__${dataset}__${sample}.wk
qsub -q CIBIOCM_cpuQ -v prefix=\"${base}\",outDir=\"${odirE}\",dataset=\"${dataset}\",sample=\"${sample}\",uncompress_cmd=\"${extraction_cmd}\",extension=\"${extension}\",ncores=\"4\" -N VDBM_${dataset}_${sample} -l select=1:ncpus=4 ${curDir}/map_bt2.sh;
qsub -q CIBIOCM_cpuQ -v prefix=\"${base}\",outDir=\"${odirE}\",dataset=\"${dataset}\",sample=\"${sample}\",uncompress_cmd=\"${extraction_cmd}\",extension=\"${extension}\",ncores=\"8\" -N VDBM_${dataset}_${sample} -l select=1:ncpus=8 ${curDir}/map_bt2.sh;
touch ${odir}/${dataset}/vdbm__${dataset}__${sample}.wk
b_cibiocm=$((b_cibiocm+1));
bt=$((bt+1));
Expand Down
27 changes: 25 additions & 2 deletions step3_clustering/prevalence_ht.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from Bio.Alphabet import IUPAC



parser = argparse.ArgumentParser()
parser.add_argument('folder',help="folder with fasta files")
parser.add_argument('output',help="outputCSV")
Expand Down Expand Up @@ -48,7 +49,7 @@
#hardcoded replacements
dataset = dataset.replace('CM_caritro','FerrettiP_2018').replace('ZeeviD_2015_A','ZeeviD_2015').replace('ZeeviD_2015_B','ZeeviD_2015').replace('RosarioK_2018_DNA','RosarioK_2018').replace('VLP_LyM_2016','LyM_2016').replace('VLP_Minot_2011','MinotS_2011').replace('VLP_Minot_2013','MinotS_2013').replace('VLP_NormanJ_2015','NormanJ_2015').replace('VLP_ReyesA_2015','ReyesA_2015').replace('LawrenceA_2015','DavidLA_2015').replace('VLP_LimE_2015_SIA','LimE_2015').replace('VLP_LimE_2015_MDA','LimE_2015').replace('LimE_2015_MDA','LimE_2015').replace('LimE_2015_SIA','LimE_2015')

dm.append({'clusterL1':L1Cluster,'clusterL2':L1Cluster,'group':group,'dataset':dataset,'sample':sample,'node':node})
dm.append({'clusterL1':L1Cluster,'clusterL2':L2Cluster,'group':group,'dataset':dataset,'sample':sample,'node':node})
i+=1


Expand All @@ -62,5 +63,27 @@
#et1=pd.pivot_table(a,index='clusterL1',columns=['group','dataset'],values='sample',aggfunc=lambda x: len(x.unique()) )
et2=pd.pivot_table(merged,index='clusterL1',columns=['group','dataset','nsamples','dataset_short_source'],values='sample',aggfunc=lambda x: len(x.unique()) )

#et1.fillna(0).to_csv('a.csv',sep='\t')

et2.fillna(0).to_csv(args.output,sep='\t')

#following here there are the dataset slicing for by-dataset counts (for boxplot)


et3=merged.groupby(['group','dataset','dataset_short_source','sample'])['clusterL1'].agg(lambda x: len(x.unique())).fillna(0)
et3.to_csv(args.output+'_group_by_L1_clusters.csv',sep='\t')


et4=merged.groupby(['group','dataset','dataset_short_source','sample'])['clusterL2'].agg(lambda x: len(x.unique())).fillna(0)
et4.to_csv(args.output+'_group_by_L2_clusters.csv',sep='\t')

## and crAssphage counts:


et5=merged[merged['clusterL1']=='vsearch_c72'].groupby(['group','dataset','dataset_short_source'])['sample'].agg(lambda x: len(x.unique())).fillna(0)
et5.to_csv(args.output+'_crassphage.csv',sep='\t')

#et3=pd.pivot_table(merged,index='clusterL1',columns=['group','dataset','sample','nsamples','dataset_short_source'],values='node',aggfunc= )




0 comments on commit 7b30dad

Please sign in to comment.