add more details and steps

jianshu93 · jianshu93 · commit 196e0639a01f · 2025-09-10T22:30:30.000-07:00
diff --git a/data/templates/1.hifiasm-meta_new.sbatch b/data/templates/1.hifiasm-meta_new.sbatch
@@ -11,7 +11,11 @@
 conda activate {{conda-environment}}
 
 cd {{output}}
-step=${SLURM_ARRAY_TASK_ID}
-input=$(head -n $step {{output}}/file_list.txt | tail -n 1)
-fn=`basename ${input}`
-hifiasm_meta -t 60 -o {{output}}/step-1/${fn} ${input}
+step=${SLURM_ARRAY_TASK_ID} ##1000_2, 1000_1
+
+### sample_list.txt should be somewhere else but not in the output folder? (absolute_path is presumably a placeholder -- TODO confirm)
+### row number $step is selected; column 1 is used as the sample name, column 2 as the reads file given to hifiasm_meta
+line=$(head -n "$step" absolute_path/sample_list.txt | tail -n 1)
+sample_name=$(printf '%s\n' "$line" | awk '{print $1}')
+filename=$(printf '%s\n' "$line" | awk '{print $2}')
+hifiasm_meta -t 60 -o "{{output}}/step-1/${sample_name}" "${filename}"
diff --git a/data/templates/10.minimap2_mapping b/data/templates/10.minimap2_mapping
@@ -0,0 +1,19 @@
+#!/bin/bash
+#SBATCH -J {{job_name}}
+#SBATCH -N {{node_count}}
+#SBATCH -n {{nprocs}}
+#SBATCH --time {{wall_time_limit}}
+#SBATCH --mem {{mem_in_gb}}G
+#SBATCH -o {{output}}/step-3/logs/%x-%A.out
+#SBATCH -e {{output}}/step-3/logs/%x-%A.err
+### NOTE(review): logs are written under step-3/logs although this is step 10 -- confirm this is intended
+conda activate {{conda-environment}}
+
+### $step selects the sample_list.txt row; taken from the array index as in the step-1 template (assumes an array job -- TODO confirm)
+step=${SLURM_ARRAY_TASK_ID:?this template must be submitted as a SLURM array job}
+cd {{output}}/step-9 || exit 1
+line=$(head -n "$step" absolute_path/sample_list.txt | tail -n 1)
+sample_name=$(printf '%s\n' "$line" | awk '{print $1}')
+filename=$(printf '%s\n' "$line" | awk '{print $2}')
+### we do not want any secondary alignments, so we set --secondary=no, woltka will process anything in the input SAM
+minimap2 -x map-hifi -t 12 -a --secondary=no --MD --eqx -o "{{output}}/step-10/${sample_name}.sam" "{{output}}/step-9/all_genome_linearized.fna" "${filename}"
diff --git a/data/templates/2.get-circular-genomes.sh b/data/templates/2.get-circular-genomes.sh
@@ -18,10 +18,11 @@ done | parallel -j {{nprocs}}
 cd split
 find . -type f -size -512k -exec rm -f {} +
 ### extract fasta id for all the genomes in the split folder
+### Note: if there are no circular genomes, no *.fa files will be generated and circular_id.txt will be empty
 for f in *.fa; do
     k=${f##*/}
     n=${f%.*}
     grep -E "^>" $f >> circular_id.txt
 done
 sed -i 's/>//' circular_id.txt
-seqkit -v -f circular_id.txt all_contigs.fa > noLCG.fa
+seqkit grep -v -f circular_id.txt all_contigs.fa > noLCG.fa
diff --git a/data/templates/7.checkm_batch.sbatch b/data/templates/7.checkm_batch.sbatch
@@ -18,3 +18,5 @@ rm -rf checkm_out
 checkm lineage_wf ./ ./checkm_out -x fa -t 8 --tab_table -f ../${name}_checkm_table.txt --pplacer_threads 1
 cd ../
 cp ${name}_checkm_table.txt step-7
+
+
diff --git a/data/templates/8.dereplication_MAGs.sbatch b/data/templates/8.dereplication_MAGs.sbatch
@@ -0,0 +1,34 @@
+#!/bin/bash
+#SBATCH -J {{job_name}}
+#SBATCH -N {{node_count}}
+#SBATCH -n {{nprocs}}
+#SBATCH --time {{wall_time_limit}}
+#SBATCH --mem {{mem_in_gb}}G
+#SBATCH -o {{output}}/step-8/logs/%x-%A.out
+#SBATCH -e {{output}}/step-8/logs/%x-%A.err
+
+#####!!!!!!!! important !!!!!!!!!!!!!!!
+### this job must be submitted with an afterok dependency (sbatch) on all the step-7 jobs (all samples must be finished)
+### because it needs all the checkm tables to be ready
+### e.g., sbatch --dependency=afterok:jobid1:jobid2
+### NOTE(review): no conda environment is activated in this template, unlike the step 9/10 templates -- confirm galah is on PATH
+
+#### combine all the checkm tables
+cd {{output}}/step-7 || exit 1
+
+### any sample should work for the header, we just need the header line, so it is taken from the first table found
+### (no sample_list.txt is needed inside step-7; truncating checkm_all.txt avoids leftovers from an earlier failed run)
+for F in *_checkm_table.txt; do
+	head -n 1 "$F"
+	break
+done > checkm_all.txt
+
+### append the body (everything after the header line) of every table
+for F in *_checkm_table.txt; do
+	tail -n +2 "$F"
+done >> checkm_all.txt
+
+#### the number of non-header lines in checkm_all.txt equals to the number of genomes/fasta files in the --genome-fasta-directory folder
+#### all galah paths are given in full because the working directory here is step-7, not the pipeline output folder
+galah cluster --checkm-tab-table "{{output}}/step-7/checkm_all.txt" --genome-fasta-directory "{{output}}/step-6/" -x fa --min-completeness 50 --max-contamination 10 --quality-formula dRep -t 32 --cluster-method fastani --output-representative-fasta-directory-copy "{{output}}/step-8/drep_representative" --output-cluster-definition "{{output}}/step-8/drep_cluster99.5_member_fastani.txt" --ani 99.5 --precluster-method finch
+#### output folder step-8/drep_representative should be used for step 9 (genome linearization)
diff --git a/data/templates/9.linearize_genome.sbatch b/data/templates/9.linearize_genome.sbatch
@@ -0,0 +1,15 @@
+#!/bin/bash
+#SBATCH -J {{job_name}}
+#SBATCH -N {{node_count}}
+#SBATCH -n {{nprocs}}
+#SBATCH --time {{wall_time_limit}}
+#SBATCH --mem {{mem_in_gb}}G
+#SBATCH -o {{output}}/step-3/logs/%x-%A.out
+#SBATCH -e {{output}}/step-3/logs/%x-%A.err
+### NOTE(review): logs are written under step-3/logs although this is step 9 -- confirm this is intended
+
+conda activate {{conda-environment}}
+### NOTE(review): 9.linearize_genome.py is resolved relative to step-8 after the cd below -- confirm the script is placed there
+#### this step must be done after dereplication in step 8 (it reads step-8/drep_representative)
+cd {{output}}/step-8 || exit 1
+python 9.linearize_genome.py --input_fasta "{{output}}/step-8/drep_representative" --output_fasta "{{output}}/step-9/all_genome_linearized.fna"
diff --git a/data/templates/9.lingenome_withmd.py b/data/templates/9.lingenome_withmd.py