emo-bon · hariszaf · May 10, 2023 · Apr 28, 2023 · Apr 28, 2023 · Apr 28, 2023
diff --git a/.gitignore b/.gitignore
@@ -117,6 +117,7 @@ venv.bak/
 # Ignore real-world test samples
 test_input/SRR*
 test_input/DB*
+test_input/ERR*
 
 # Ignore dev output 
 TEST_*/

diff --git a/config.yml b/config.yml
@@ -3,18 +3,18 @@ qc_and_merge_step: true
 taxonomic_inventory: true
 cgc_step: true
 reads_functional_annotation: true
-assemble: false
+assemble: true
 
 # Global
 threads: 40
 
 # As a rule of thumb keep that as floor(threads/8) where threads the previous parameter
-interproscan_threads: 4
+interproscan_threads: 5
 
 # fastp parameters
 detect_adapter_for_pe: false
 overrepresentation_analysis: false
-min_length_required: 100
+min_length_required: 108
 force_polyg_tail_trimming: 
 base_correction: false
 qualified_phred_quality: 
@@ -25,8 +25,8 @@ cut_right: false
 correction: false
 
 # Assembly
-memory: 550
-min-contig-len: 200
+memory: 0.9
+min-contig-len: 500
 
 # Combined Gene Caller // the size is in MB
 cgc_chunk_size: 200
@@ -35,9 +35,9 @@ cgc_chunk_size: 200
 # diamond_maxTargetSeqs: 1
 
 # Functional annotation
-protein_chunk_size_IPS: 2000000
-protein_chunk_size_eggnog: 100000
-protein_chunk_size_hmm: 50000
+protein_chunk_size_IPS: 1000000 # 20000000
+protein_chunk_size_eggnog: 4000000
+protein_chunk_size_hmm: 4000000
 
 # -----------------
 # Run wf partially
@@ -60,7 +60,7 @@ protein_chunk_size_hmm: 50000
 processed_reads: {
   class: File, 
   format: "edam:format_1929",
-  path:  workflows/pseudo_files/pseudo.merged.fasta
+  path:  results/ERR599171.merged.fasta
 }
 
 # Mandatory for running the taxonomy inventory step
@@ -74,20 +74,20 @@ input_for_motus: {
 # If produced previously from metaGOflow, will have a suffix like: .cmsearch.all.tblout.deoverlapped 
 maskfile: {
   class: File, 
-  path:  workflows/pseudo_files/pseudo.merged.cmsearch.all.tblout.deoverlapped
+  path:  results/ERR599171.merged.cmsearch.all.tblout.deoverlapped
 }
 
 # Mandatory for the functional annotation step 
 # Give the number of the sequences included in the predicted_faa_from_previous_run file 
 # You may get this by running:
 # grep -c ">" <*..merged_CDS.faa>
-count_faa_from_previous_run: 
+count_faa_from_previous_run: 18934897
 
 # Mandatory for the functional annotation step
 predicted_faa_from_previous_run: {
   class: File, 
   format: "edam:format_1929",
-  path:  workflows/pseudo_files/pseudo.merged_CDS.faa
+  path:  results/ERR599171.merged_CDS.faa
 }
 
 # Mandatory for running the assembly step 

diff --git a/run_wf.sh b/run_wf.sh
@@ -28,11 +28,12 @@ Script arguments.
   -n                  Name of run and prefix to output files.
   -d                  Path to run directory.
   -s                  Run workflow using Singularity (docker is the by default container technology) ('true' or 'false')
+  -b                  Keep tmp folder. 
 "
 }
 
 # [TODO] Consider adding a -t argument to run using toil.
-while getopts :y:f:r:e:u:k:c:d:m:n:l:sph option; do
+while getopts :y:f:r:e:u:k:c:d:m:n:l:bsph option; do
   case "${option}" in
   y) YML=${OPTARG} ;;
   f)
@@ -51,6 +52,7 @@ while getopts :y:f:r:e:u:k:c:d:m:n:l:sph option; do
   m) MEMORY=${OPTARG} ;;
   n) NAME=${OPTARG} ;;
   l) LIMIT_QUEUE=${OPTARG} ;;
+  b) KEEP_TMP="--keep-tmp" ;;
   s) SINGULARITY="--singularity" ;;
   p) PRIVATE_DATA="-p" ;;
   h)
@@ -132,6 +134,8 @@ mkdir -p "${OUT_DIR_FINAL}" "${TMPDIR}"
 
 export EXTENDED_CONFIG_YAML_TMP=${RUN_DIR}/"${NAME}"_temp.yml
 export EXTENDED_CONFIG_YAML=${RUN_DIR}/"${NAME}".yml
+export FUNCTIONAL_ANNOTATION=${OUT_DIR}/results/functional-annotation/
+
 
 # Get study id in case of ENA fetch tool
 if [[ $ENA_RUN_ID != "" ]];
@@ -183,52 +187,24 @@ cp config.yml ${RUN_DIR}/
 
 # ----------------------------- running pipeline ----------------------------- #
 
-# IMPORTANT! 
-# To work with slurm, add "--batchSystem slurm", "--disableChaining" and "--disableCaching" in the TOIL_PARMS object
-TOIL_PARAMS+=(
-  --singularity
-  --preserve-entire-environment
-  --batchSystem slurm
-  --disableChaining
-  --disableCaching
-  --logFile "${LOG_DIR}/${NAME}.log"
-  --jobStore "${JOB_TOIL_FOLDER}/${NAME}"
-  --outdir "${OUT_DIR_FINAL}"
-  --maxCores 20
-  --defaultMemory "${MEMORY}"
-  --defaultCores "${NUM_CORES}"
-  --retryCount 2
-  --logDebug
-  "$CWL"
-  "$EXTENDED_CONFIG_YAML"
-)
-
-# Toil parameters documentation  - just for your information
-# --disableChaining                Disables  chaining  of jobs (chaining uses one job's resource allocation for its successor job if possible).
-# --preserve-entire-environment    Need to propagate the env vars for Singularity, etc., into the HPC jobs
-# --disableProgress                Disables the progress bar shown when standard error is a terminal.
-# --retryCount                     Number of times to retry a failing job before giving up and labeling job failed. default=1
-# --disableCaching                 Disables caching in the file store. This flag must be set to use a batch  system that does not support caching such as Grid Engine, Parasol, LSF, or Slurm.
-
-# COMMENT IN TO RUN THE TOIL VERSION and MUTE the cwltool case in line 222.
-# echo "toil-cwl-runner" "${TOIL_PARAMS[@]}"
-# toil-cwl-runner "${TOIL_PARAMS[@]}"
-
-# --------------------------------------------
-
 # Run the metaGOflow workflow using cwltool
 cwltool --parallel ${SINGULARITY} --outdir ${OUT_DIR_FINAL} ${CWL} ${EXTENDED_CONFIG_YAML}
 
-# --------------------------------------------
 
-# Edit output structure 
-rm -rf ${TMPDIR}
-export FUNCTIONAL_ANNOTATION=${OUT_DIR}/results/functional-annotation/
+# -----------------------  edit output structure   --------------------------- #
 
-if [ -z "$FUNCTIONAL_ANNOTATION" ]; then
+# KEEP_TMP set (-b): move tmp into ${CWD}; otherwise delete it. Paths are quoted.
+if [[ -n "${KEEP_TMP}" ]]; then
+  echo "Keep temporary output directory."
+  mv -- "${TMPDIR}" "${CWD}"
+else
+  rm -rf -- "${TMPDIR}"
+fi
 
-  cd ${OUT_DIR}/results/functional-annotation/
 
+if [ -z "$FUNCTIONAL_ANNOTATION" ]; then
+
+  cd ${FUNCTIONAL_ANNOTATION}
   count=`ls -1 *.chunks 2>/dev/null | wc -l`
   if [ $count != 0 ]
   then 
@@ -252,21 +228,35 @@ fi
 cd ${CWD}
 
 
-# --------------------------------------------
+# -----------------------  build RO-crate   --------------------------- #
 
-# Build RO-crate
 if [ -z "$ENA_RUN_ID" ]; then
   ENA_RUN_ID="None"
 else
   rm -r ${OUT_DIR}/raw_data_from_ENA
-  # mv ${OUT_DIR}/raw_data_from_ENA .
-  # mv raw_data_from_ENA ${ENA_RUN_ID}
 fi
 
+# Init the RO-Crate
 rocrate init -c ${RUN_DIR}
 
-python utils/edit-ro-crate.py ${OUT_DIR} ${EXTENDED_CONFIG_YAML} ${ENA_RUN_ID} ${METAGOFLOW_VERSION}
+# Edit the RO-Crate
+# Turn the -b flag into the "True"/"False" string that edit-ro-crate.py receives.
+if [[ -z "${KEEP_TMP}" ]]; then
+  export KEEP_TMP="False"
+else
+  export KEEP_TMP="True"
+fi
+
+python utils/edit-ro-crate.py ${OUT_DIR} ${EXTENDED_CONFIG_YAML} ${ENA_RUN_ID} ${METAGOFLOW_VERSION} ${KEEP_TMP}
+
+
+# Bring back the temporary folder if it was kept: KEEP_TMP holds "True"/"False"
+# by now, and only "True" means the folder was moved to ${CWD} and must return.
+if [[ "${KEEP_TMP}" == "True" ]]; then
+  echo "Keep temporary output directory."
+  mv -- "${CWD}/tmp" "${TMPDIR}"
+fi
+
+echo "metaGOflow has been completed."
 
-# --------------------------------------------
 
-rm -r ${OUT_DIR}
diff --git a/slurm_run.sh b/slurm_run.sh
@@ -8,8 +8,8 @@
 #SBATCH --mail-user=haris.zafr@gmail.com
 #SBATCH --mail-type=ALL
 #SBATCH --requeue
-#SBATCH --job-name="rocr-tax"
-#SBATCH --output=rocrates_tax.output
+#SBATCH --job-name="tara4IPS"
+#SBATCH --output=tara4cpusIPS.output
 
 # Deactivate conda if already there
 conda deactivate
@@ -20,7 +20,7 @@ module load singularity/3.7.1
 
 
 # Run the wf with mini dataset
-./run_wf.sh -f test_input/wgs-paired-SRR1620013_1.fastq.gz -r test_input/wgs-paired-SRR1620013_2.fastq.gz -n mini_dataset -d MINI_DATASET -s
+# ./run_wf.sh -f test_input/wgs-paired-SRR1620013_1.fastq.gz -r test_input/wgs-paired-SRR1620013_2.fastq.gz -n mini_dataset -d MINI_DATASET -s 
 
 # Run the wf with short dataset
 # ./run_wf.sh -f test_input/test_1_fwd_HWLTKDRXY_600000.fastq.gz -r test_input/test_2_rev_HWLTKDRXY_600000.fastq.gz -n dev_dataset -d DEV_DATASET -s
@@ -33,7 +33,8 @@ module load singularity/3.7.1
 # ./run_wf.sh -f test_input/DBB_AABVOSDA_1_1_HMNJKDSX3.UDI256_clean.fastq.gz -r test_input/DBB_AABVOSDA_1_2_HMNJKDSX3.UDI256_clean.fastq.gz -n DBB_dataset -d water_column_dbb -s
 
 # To run an ENA run
-# ./run_wf.sh -e ERR855786 -d TEST_SIMPLIFIED_PFAM -n ERR855786 -s
+./run_wf.sh -e ERR599171 -d TARA_OCEANS_SAMPLE -n ERR599171 -s -b
+#./run_wf.sh -f test_input/ERR599171_1.fastq.gz -r test_input/ERR599171_2.fastq.gz -n ERR599171  -d TARA_OCEANS_SAMPLE_3steps -s
 
 
 # Disable the module