Skip to content

Commit 196e063

Browse files
committed
add more details and steps
1 parent 2c280b7 commit 196e063

7 files changed

+80
-5
lines changed

data/templates/1.hifiasm-meta_new.sbatch

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,11 @@
1111
conda activate {{conda-environment}}
1212

1313
cd {{output}}
14-
step=${SLURM_ARRAY_TASK_ID}
15-
input=$(head -n $step {{output}}/file_list.txt | tail -n 1)
16-
fn=`basename ${input}`
17-
hifiasm_meta -t 60 -o {{output}}/step-1/${fn} ${input}
14+
step=${SLURM_ARRAY_TASK_ID} ##1000_2, 1000_1
15+
16+
### sample_list.txt should be somewhere else but not in the output folder?
17+
line=$(head -n $step absolute_path/sample_list.txt | tail -n 1)
18+
19+
sample_name=`echo $line | awk '{print $1}'`
20+
### second whitespace-separated column of the sample-list line; passed to hifiasm_meta as its input below
filename=$(echo "$line" | awk '{print $2}')
21+
hifiasm_meta -t 60 -o {{output}}/step-1/${sample_name} ${filename}

data/templates/10.minimap2_mapping

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
#!/bin/bash
2+
#SBATCH -J {{job_name}}
3+
#SBATCH -N {{node_count}}
4+
#SBATCH -n {{nprocs}}
5+
#SBATCH --time {{wall_time_limit}}
6+
#SBATCH --mem {{mem_in_gb}}G
7+
#SBATCH -o {{output}}/step-3/logs/%x-%A.out
8+
#SBATCH -e {{output}}/step-3/logs/%x-%A.err
9+
10+
11+
12+
conda activate {{conda-environment}}
13+
14+
cd {{output}}/step-9
15+
### select this task's line from the sample list, same lookup as 1.hifiasm-meta_new.sbatch
### NOTE(review): assumes the job is submitted as a SLURM array (sbatch --array) so that
### SLURM_ARRAY_TASK_ID is set -- TODO confirm
step=${SLURM_ARRAY_TASK_ID}
line=$(head -n "$step" absolute_path/sample_list.txt | tail -n 1)
16+
sample_name=`echo $line | awk '{print $1}'`
17+
### second whitespace-separated column of the sample-list line; passed to minimap2 as the last argument below
filename=$(echo "$line" | awk '{print $2}')
18+
### we do not want any secondary alignments, so we set --secondary=no; woltka will process anything in the input SAM
19+
minimap2 -x map-hifi -t 12 -a --secondary=no --MD --eqx -o {{output}}/step-10/${sample_name}.sam {{output}}/step-9/all_genome_linearized.fna ${filename}

data/templates/2.get-circular-genomes.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,11 @@ done | parallel -j {{nprocs}}
1818
cd split
1919
find . -type f -size -512k -exec rm -f {} +
2020
### extract fasta id for all the genomes in the split folder
21+
### Note: if there are no circular genomes, no *.fa files will be generated. The circular_id.txt will be empty
2122
for f in *.fa; do
2223
k=${f##*/}
2324
n=${f%.*}
2425
grep -E "^>" $f >> circular_id.txt
2526
done
2627
sed -i 's/>//' circular_id.txt
27-
seqkit -v -f circular_id.txt all_contigs.fa > noLCG.fa
28+
seqkit grep -v -f circular_id.txt all_contigs.fa > noLCG.fa

data/templates/7.checkm_batch.sbatch

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,3 +18,5 @@ rm -rf checkm_out
1818
checkm lineage_wf ./ ./checkm_out -x fa -t 8 --tab_table -f ../${name}_checkm_table.txt --pplacer_threads 1
1919
cd ../
2020
cp ${name}_checkm_table.txt step-7
21+
22+
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
#!/bin/bash
2+
#SBATCH -J {{job_name}}
3+
#SBATCH -N {{node_count}}
4+
#SBATCH -n {{nprocs}}
5+
#SBATCH --time {{wall_time_limit}}
6+
#SBATCH --mem {{mem_in_gb}}G
7+
#SBATCH -o {{output}}/step-8/logs/%x-%A.out
8+
#SBATCH -e {{output}}/step-8/logs/%x-%A.err
9+
10+
#### combine all the checkm tables
11+
cd {{output}}/step-7
12+
13+
14+
#####!!!!!!!! important !!!!!!!!!!!!!!!
15+
### this job must be submitted dependently via afterok (sbatch) on all the step-7 jobs (all samples must be finished)
16+
### because it needs all the checkm tables to be ready
17+
### e.g., sbatch --dependency=afterok:jobid1:jobid2
18+
19+
### first line of the sample list; the working directory here is {{output}}/step-7, so the
### list is read from the same absolute_path placeholder the other templates use
line=$(head -n 1 absolute_path/sample_list.txt)
20+
sample_name=`echo $line | awk '{print $1}'`
21+
### any sample should work, we just need the header line
22+
head -n 1 ${sample_name}_checkm_table.txt > checkm_header.txt
23+
24+
for F in *_checkm_table.txt; do
25+
tail -n +2 $F >> checkm_body.txt
26+
done
27+
28+
cat checkm_header.txt checkm_body.txt > checkm_all.txt
29+
rm checkm_header.txt checkm_body.txt
30+
31+
#### the number of non-header lines in checkm_all.txt equals the number of genomes/fasta files in the --genome-fasta-directory folder
32+
#### paths are anchored at {{output}} because the working directory at this point is {{output}}/step-7
galah cluster --checkm-tab-table {{output}}/step-7/checkm_all.txt --genome-fasta-directory {{output}}/step-6/ -x fa --min-completeness 50 --max-contamination 10 --quality-formula dRep -t 32 --cluster-method fastani --output-representative-fasta-directory-copy {{output}}/step-8/drep_representative --output-cluster-definition {{output}}/step-8/drep_cluster99.5_member_fastani.txt --ani 99.5 --precluster-method finch
33+
34+
#### output folder step-8/drep_representative should be used for step 9.lingenomes_withmd.py
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
#!/bin/bash
2+
#SBATCH -J {{job_name}}
3+
#SBATCH -N {{node_count}}
4+
#SBATCH -n {{nprocs}}
5+
#SBATCH --time {{wall_time_limit}}
6+
#SBATCH --mem {{mem_in_gb}}G
7+
#SBATCH -o {{output}}/step-3/logs/%x-%A.out
8+
#SBATCH -e {{output}}/step-3/logs/%x-%A.err
9+
10+
11+
conda activate {{conda-environment}}
12+
13+
#### this step must be done after dereplication in step 8
14+
cd {{output}}/step-8
15+
python 9.linearize_genome.py --input_fasta {{output}}/step-8/drep_representative --output_fasta {{output}}/step-9/all_genome_linearized.fna
File renamed without changes.

0 commit comments

Comments
 (0)