Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ venv.bak/
# Ignore real-world test samples
test_input/SRR*
test_input/DB*
test_input/ERR*

# Ignore dev output
TEST_*/
Expand Down
24 changes: 12 additions & 12 deletions config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,18 @@ qc_and_merge_step: true
taxonomic_inventory: true
cgc_step: true
reads_functional_annotation: true
assemble: false
assemble: true

# Global
threads: 40

# As a rule of thumb, keep this at floor(threads/8), where "threads" is the previous parameter
interproscan_threads: 4
interproscan_threads: 5

# fastp parameters
detect_adapter_for_pe: false
overrepresentation_analysis: false
min_length_required: 100
min_length_required: 108
force_polyg_tail_trimming:
base_correction: false
qualified_phred_quality:
Expand All @@ -25,8 +25,8 @@ cut_right: false
correction: false

# Assembly
memory: 550
min-contig-len: 200
memory: 0.9
min-contig-len: 500

# Combined Gene Caller // the size is in MB
cgc_chunk_size: 200
Expand All @@ -35,9 +35,9 @@ cgc_chunk_size: 200
# diamond_maxTargetSeqs: 1

# Functional annotation
protein_chunk_size_IPS: 2000000
protein_chunk_size_eggnog: 100000
protein_chunk_size_hmm: 50000
protein_chunk_size_IPS: 1000000 # 20000000
protein_chunk_size_eggnog: 4000000
protein_chunk_size_hmm: 4000000

# -----------------
# Run wf partially
Expand All @@ -60,7 +60,7 @@ protein_chunk_size_hmm: 50000
processed_reads: {
class: File,
format: "edam:format_1929",
path: workflows/pseudo_files/pseudo.merged.fasta
path: results/ERR599171.merged.fasta
}

# Mandatory for running the taxonomy inventory step
Expand All @@ -74,20 +74,20 @@ input_for_motus: {
# If it was produced by a previous metaGOflow run, it will have a suffix like: .cmsearch.all.tblout.deoverlapped
maskfile: {
class: File,
path: workflows/pseudo_files/pseudo.merged.cmsearch.all.tblout.deoverlapped
path: results/ERR599171.merged.cmsearch.all.tblout.deoverlapped
}

# Mandatory for the functional annotation step
# Give the number of the sequences included in the predicted_faa_from_previous_run file
# You may get this by running:
# grep -c ">" <*..merged_CDS.faa>
count_faa_from_previous_run:
count_faa_from_previous_run: 18934897

# Mandatory for the functional annotation step
predicted_faa_from_previous_run: {
class: File,
format: "edam:format_1929",
path: workflows/pseudo_files/pseudo.merged_CDS.faa
path: results/ERR599171.merged_CDS.faa
}

# Mandatory for running the assembly step
Expand Down
84 changes: 37 additions & 47 deletions run_wf.sh
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,12 @@ Script arguments.
-n Name of run and prefix to output files.
-d Path to run directory.
-s Run workflow using Singularity (docker is the by default container technology) ('true' or 'false')
-b Keep tmp folder.
"
}

# [TODO] Consider adding a -t argument to run using toil.
while getopts :y:f:r:e:u:k:c:d:m:n:l:sph option; do
while getopts :y:f:r:e:u:k:c:d:m:n:l:bsph option; do
case "${option}" in
y) YML=${OPTARG} ;;
f)
Expand All @@ -51,6 +52,7 @@ while getopts :y:f:r:e:u:k:c:d:m:n:l:sph option; do
m) MEMORY=${OPTARG} ;;
n) NAME=${OPTARG} ;;
l) LIMIT_QUEUE=${OPTARG} ;;
b) KEEP_TMP="--keep-tmp" ;;
s) SINGULARITY="--singularity" ;;
p) PRIVATE_DATA="-p" ;;
h)
Expand Down Expand Up @@ -132,6 +134,8 @@ mkdir -p "${OUT_DIR_FINAL}" "${TMPDIR}"

export EXTENDED_CONFIG_YAML_TMP=${RUN_DIR}/"${NAME}"_temp.yml
export EXTENDED_CONFIG_YAML=${RUN_DIR}/"${NAME}".yml
export FUNCTIONAL_ANNOTATION=${OUT_DIR}/results/functional-annotation/


# Get study id in case of ENA fetch tool
if [[ $ENA_RUN_ID != "" ]];
Expand Down Expand Up @@ -183,52 +187,24 @@ cp config.yml ${RUN_DIR}/

# ----------------------------- running pipeline ----------------------------- #

# IMPORTANT!
# To work with Slurm, add "--batchSystem slurm", "--disableChaining" and "--disableCaching" to the TOIL_PARAMS array
TOIL_PARAMS+=(
--singularity
--preserve-entire-environment
--batchSystem slurm
--disableChaining
--disableCaching
--logFile "${LOG_DIR}/${NAME}.log"
--jobStore "${JOB_TOIL_FOLDER}/${NAME}"
--outdir "${OUT_DIR_FINAL}"
--maxCores 20
--defaultMemory "${MEMORY}"
--defaultCores "${NUM_CORES}"
--retryCount 2
--logDebug
"$CWL"
"$EXTENDED_CONFIG_YAML"
)

# Toil parameters documentation - just for your information
# --disableChaining Disables chaining of jobs (chaining uses one job's resource allocation for its successor job if possible).
# --preserve-entire-environment Need to propagate the env vars for Singularity, etc., into the HPC jobs
# --disableProgress Disables the progress bar shown when standard error is a terminal.
# --retryCount Number of times to retry a failing job before giving up and labeling job failed. default=1
# --disableCaching Disables caching in the file store. This flag must be set to use a batch system that does not support caching such as Grid Engine, Parasol, LSF, or Slurm.

# COMMENT IN TO RUN THE TOIL VERSION and MUTE the cwltool case in line 222.
# echo "toil-cwl-runner" "${TOIL_PARAMS[@]}"
# toil-cwl-runner "${TOIL_PARAMS[@]}"

# --------------------------------------------

# Run the metaGOflow workflow using cwltool
cwltool --parallel ${SINGULARITY} --outdir ${OUT_DIR_FINAL} ${CWL} ${EXTENDED_CONFIG_YAML}

# --------------------------------------------

# Edit output structure
rm -rf ${TMPDIR}
export FUNCTIONAL_ANNOTATION=${OUT_DIR}/results/functional-annotation/
# ----------------------- edit output structure --------------------------- #

if [ -z "$FUNCTIONAL_ANNOTATION" ]; then
# Keep or discard the temporary working directory.
# KEEP_TMP is non-empty only when the -b flag was given (the getopts loop
# sets it to "--keep-tmp"); otherwise the directory is deleted.
if [[ -n "${KEEP_TMP}" ]]; then
  echo "Keep temporary output directory."
  # Park the directory under CWD for now; the script moves it back from
  # ${CWD}/tmp later. NOTE(review): that later step assumes the basename of
  # TMPDIR is "tmp" - confirm against where TMPDIR is defined.
  mv -- "${TMPDIR}" "${CWD}"
else
  # Quoted so that a run directory containing spaces or glob characters
  # cannot be word-split into several rm operands.
  rm -rf -- "${TMPDIR}"
fi

cd ${OUT_DIR}/results/functional-annotation/

if [ -z "$FUNCTIONAL_ANNOTATION" ]; then

cd ${FUNCTIONAL_ANNOTATION}
count=`ls -1 *.chunks 2>/dev/null | wc -l`
if [ $count != 0 ]
then
Expand All @@ -252,21 +228,35 @@ fi
cd ${CWD}


# --------------------------------------------
# ----------------------- build RO-crate --------------------------- #

# Build RO-crate
if [ -z "$ENA_RUN_ID" ]; then
ENA_RUN_ID="None"
else
rm -r ${OUT_DIR}/raw_data_from_ENA
# mv ${OUT_DIR}/raw_data_from_ENA .
# mv raw_data_from_ENA ${ENA_RUN_ID}
fi

# Init the RO-Crate
rocrate init -c ${RUN_DIR}

python utils/edit-ro-crate.py ${OUT_DIR} ${EXTENDED_CONFIG_YAML} ${ENA_RUN_ID} ${METAGOFLOW_VERSION}
# Edit the RO-Crate
# Normalise KEEP_TMP to the "True"/"False" string that is passed to
# edit-ro-crate.py as its last argument. Before this point KEEP_TMP is
# either empty or "--keep-tmp" (set by the -b flag).
if [[ -n "${KEEP_TMP}" ]]; then
  export KEEP_TMP="True"
else
  export KEEP_TMP="False"
fi

python utils/edit-ro-crate.py ${OUT_DIR} ${EXTENDED_CONFIG_YAML} ${ENA_RUN_ID} ${METAGOFLOW_VERSION} ${KEEP_TMP}


# Bring back temporary folder if kept.
# The folder exists under CWD only when KEEP_TMP is "True"; in the "False"
# case it was already removed, so there is nothing to move. The original test
# (!= "True") was inverted: it tried to move a deleted folder and never
# restored a kept one.
# NOTE(review): the source path assumes the basename of TMPDIR is "tmp" -
# confirm against where TMPDIR is defined.
if [[ "${KEEP_TMP}" == "True" ]]; then
  echo "Keep temporary output directory."
  mv -- "${CWD}/tmp" "${TMPDIR}"
fi

echo "metaGOflow has been completed."

# --------------------------------------------

rm -r ${OUT_DIR}
9 changes: 5 additions & 4 deletions slurm_run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
#SBATCH --mail-user=haris.zafr@gmail.com
#SBATCH --mail-type=ALL
#SBATCH --requeue
#SBATCH --job-name="rocr-tax"
#SBATCH --output=rocrates_tax.output
#SBATCH --job-name="tara4IPS"
#SBATCH --output=tara4cpusIPS.output

# Deactivate conda if already there
conda deactivate
Expand All @@ -20,7 +20,7 @@ module load singularity/3.7.1


# Run the wf with mini dataset
./run_wf.sh -f test_input/wgs-paired-SRR1620013_1.fastq.gz -r test_input/wgs-paired-SRR1620013_2.fastq.gz -n mini_dataset -d MINI_DATASET -s
# ./run_wf.sh -f test_input/wgs-paired-SRR1620013_1.fastq.gz -r test_input/wgs-paired-SRR1620013_2.fastq.gz -n mini_dataset -d MINI_DATASET -s

# Run the wf with short dataset
# ./run_wf.sh -f test_input/test_1_fwd_HWLTKDRXY_600000.fastq.gz -r test_input/test_2_rev_HWLTKDRXY_600000.fastq.gz -n dev_dataset -d DEV_DATASET -s
Expand All @@ -33,7 +33,8 @@ module load singularity/3.7.1
# ./run_wf.sh -f test_input/DBB_AABVOSDA_1_1_HMNJKDSX3.UDI256_clean.fastq.gz -r test_input/DBB_AABVOSDA_1_2_HMNJKDSX3.UDI256_clean.fastq.gz -n DBB_dataset -d water_column_dbb -s

# To run an ENA run
# ./run_wf.sh -e ERR855786 -d TEST_SIMPLIFIED_PFAM -n ERR855786 -s
./run_wf.sh -e ERR599171 -d TARA_OCEANS_SAMPLE -n ERR599171 -s -b
#./run_wf.sh -f test_input/ERR599171_1.fastq.gz -r test_input/ERR599171_2.fastq.gz -n ERR599171 -d TARA_OCEANS_SAMPLE_3steps -s


# Disable the module
Expand Down
Loading