Skip to content

Commit

Permalink
v176
Browse files Browse the repository at this point in the history
  • Loading branch information
azufre451 committed May 20, 2020
1 parent eb4969f commit db2119c
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 10 deletions.
22 changes: 14 additions & 8 deletions step3_clustering/cluster.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,16 @@
#prog=$1
#flavour=$2

VDB_MAIN_PATH="/shares/CIBIO-Storage/CM/scratch/users/moreno.zolfo/virome_data/viromedb/clustering/";
VDB_MAIN_PATH="/shares/CIBIO-Storage/CM/scratch/users/moreno.zolfo/virome_data/viromedb/step3_clustering/";
REFSEQ91=/shares/CIBIO-Storage/CM/news/users/moreno.zolfo/mzolfo_virome/indexes/REFSEQ_r91/refseq_91.fasta


BASE=$1 #/shares/CIBIO-Storage/CM/scratch/users/moreno.zolfo/virome_data/high_enrichment_vs_all_contigs/
CONTIG_TABLE=$2 #/shares/CIBIO-Storage/CM/scratch/users/moreno.zolfo/virome_data/high_enrichment/vdb8_results//out_toplen_filtered.csv
BASE=$1 #/shares/CIBIO-Storage/CM/scratch/users/moreno.zolfo/virome_data/high_enrichment_vs_all_contigs_LT3/
CONTIG_TABLE=$2 #/shares/CIBIO-Storage/CM/scratch/users/moreno.zolfo/virome_data/high_enrichment/vdb8_results/with_refseq/out_toplen_filtered_largethresholds.csv

prog='vsearch'
flavour='P'
ncores=8;
ncores=32;

#SEEKER_FOLDER=/shares/CIBIO-Storage/CM/scratch/users/moreno.zolfo/virome_data/high_enrichment_vs_all_contigs/seeker_analysis;

Expand Down Expand Up @@ -77,11 +77,11 @@ if [ ! -d ${odir}/step1/centroids/ ]; then
mkdir -p ${odir}/step1/centroids/
${VDB_MAIN_PATH}/process_cluster_uc_files.py --label ${prog} ${odir}/step1/clusters90.uc ${odir}/step1/nr90.fasta ${odir}/step1/centroids/

ls ${odir}/step1/centroids/*.fasta | parallel -j ${ncores} --env ODIR 'fie={}; fiebn=$(basename $fie) ;echo "Mash against "${fiebn//.fasta/}; mash dist -d 0.1 -v 0.05 ${ODIR}/sequences_Q_R.msh ${fie} > ${ODIR}/step1/centroids/${fiebn//.fasta/.mash};'
ls ${odir}/step1/centroids/*.fasta | parallel -j ${ncores} --env ODIR 'fie={}; fiebn=$(basename $fie) ;echo "Mash against "${fiebn//.fasta/}; mash dist -d 0.1 -v 0.05 ${ODIR}/sequences_Q_R_low.msh ${fie} > ${ODIR}/step1/centroids/${fiebn//.fasta/.mash};'
else
echo " -> Mash output already found. Moving on"
fi;


if [ ! -d ${odir}/step2_clusters/ ]; then
mkdir -p ${odir}/step2_clusters/;
Expand All @@ -92,18 +92,21 @@ else
echo " -> step2_clusters folder already found. Moving on"
fi;


echo "Step 3 - Reclustering "
#if [ ! -d ${odir}/step3_clusters/ ]; then

mkdir -p ${odir}/step3_clusters/fnas/


export ODIR=${odir};
ls ${odir}/step2_clusters/*_full_cluster.fasta | parallel -j ${ncores} --env ODIR 'st3_cluster={}; st3_cluster_basename_f=$(basename $st3_cluster); st3_cluster_basename=${st3_cluster_basename_f//_full_cluster/}; if [ ! -f ${ODIR}/step3_clusters/${st3_cluster_basename}_clusters90.uc ]; then echo " | Clustering ${st3_cluster}"; /shares/CIBIO-Storage/CM/mir/tools/vsearch-2.13.6/bin/vsearch --cluster_fast ${st3_cluster} --threads 10 --id 0.7 --strand both --uc ${ODIR}/step3_clusters/${st3_cluster_basename}_clusters90.uc --maxseqlength 200000; fi;'
ls ${odir}/step2_clusters/*_full_cluster.fasta | parallel -j ${ncores} --env ODIR 'st3_cluster={}; st3_cluster_basename_f=$(basename $st3_cluster); st3_cluster_basename=${st3_cluster_basename_f//_full_cluster/}; if [ ! -f ${ODIR}/step3_clusters/${st3_cluster_basename}_clusters90.uc ]; then echo " | Clustering ${st3_cluster}"; /shares/CIBIO-Storage/CM/mir/tools/vsearch-2.13.6/bin/vsearch --cluster_fast ${st3_cluster} --threads 16 --id 0.7 --strand both --uc ${ODIR}/step3_clusters/${st3_cluster_basename}_clusters90.uc --maxseqlength 200000; fi;'

#else
# echo " -> step3_clusters folder already found. Moving on"
#fi;





Expand All @@ -121,9 +124,12 @@ mkdir -p ${odir}/step4_clusters/fnas

if [ ! -f ${odir}/step4_clusters/united_clusters.csv ]; then
echo "Step 4 - Putting clusters in their final form"
${VDB_MAIN_PATH}/unify.py --original_filtered_contigs ${CONTIG_TABLE} --cluster_pipeline_folder ${odir} --cluster_pipeline_folder ${odir}/step3_clusters/ --refseq_file ${REFSEQ91} --percentile 50 --output_folder ${odir}/step4_clusters/ --strict
${VDB_MAIN_PATH}/unify.py --original_filtered_contigs ${CONTIG_TABLE} --cluster_pipeline_folder ${odir} --refseq_file ${REFSEQ91} --percentile 50 --output_folder ${odir}/step4_clusters/ --strict
fi;


exit;

echo "Step 4 - TrimAl"
ls ${odir}/step4_clusters/fnas/*.fna | parallel -j ${ncores} 'i={}; echo $i $(cat ${i//.fna/.aln} | grep ">" | wc -l) seqs; mafft --thread 2 $i > ${i//.fna/.aln}; trimal -gt 0.7 -cons 70 -in ${i//.fna/.aln} -out ${i//.fna/.trim};';

Expand Down
35 changes: 33 additions & 2 deletions step3_clustering/process_cluster_uc_files_step2.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

parser.add_argument('--input_folder', help='foo help')
parser.add_argument('--output')
parser.add_argument('--debug',action='store_true')
parser.add_argument('--allcontigs',help="for_reclustering",nargs="+")
args = parser.parse_args()

Expand All @@ -21,8 +22,10 @@
contigs_seq_trace[allcontigs_element] = SeqIO.to_dict(SeqIO.parse(allcontigs_element,'fasta'))

contigBase = {}
ccc=[]
ddd=[]
for filer in glob.glob(args.input_folder+'/*.mash'):
print("Opening ",filer)
#print("Opening ",filer)
for line in open(filer):

pc,clust,dist,p,sketch = line.strip().split()
Expand All @@ -31,6 +34,18 @@
contigBase[pc] = [(clusterID,float(dist))]
else:
contigBase[pc].append((clusterID,float(dist)))
ccc.append(clusterID)

if(clusterID == 'vsearch_c118'):
print("ZZZ",clusterID)

if args.debug:
#print("contigs tracked:", len(contigBase.keys()))

ot=open('tmp1.log','w')
ot.write("\n".join(set(ccc)))
ot.close()



clusters={}
Expand All @@ -43,12 +58,20 @@
else:
clusters[closestCluster].append(k)

if(k == 'vsearch_c118'):
print("BBB",k)



##add back the original clusters

#if args.debug:
# print("clusters tracked:", len(clusters.keys()))

for k2,v2 in clusters.items():

if(v2 == 'vsearch_c118'):
print("AAA",v2)
sys.exit(0)

v3 = list( originalClusters[originalClusters['clusterID'] == k2]['contig'])

Expand Down Expand Up @@ -76,7 +99,15 @@


SeqIO.write(contigsInThisCluster,args.output+'/'+k2+'_full_cluster.fasta','fasta')
ddd.append(k2)

if args.debug:
#print("contigs tracked:", len(contigBase.keys()))

ot=open('tmp2.log','w')
ot.write("\n".join(set(ddd)))
ot.close()



print (sum([len(x) for x in clusters.values()]))
Expand Down

0 comments on commit db2119c

Please sign in to comment.