From 4eaea4d8abed8fabd3e37f172688d23f64e7b284 Mon Sep 17 00:00:00 2001 From: Ksenia Krasheninnikova Date: Thu, 1 Aug 2024 16:37:08 +0100 Subject: [PATCH 01/16] Initiate scaffolding of hap1/hap2 contigs --- conf/modules.config | 264 +++++++++++++++++++++++++-- modules.json | 190 ++++++++++++++----- modules/nf-core/hifiasm/hifiasm.diff | 21 ++- modules/nf-core/hifiasm/main.nf | 2 + subworkflows/local/raw_assembly.nf | 12 +- workflows/genomeassembly.nf | 83 +++++++-- 6 files changed, 482 insertions(+), 90 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index acc23c3c..7f7c69aa 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -132,8 +132,8 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } - withName: '.*RAW_ASSEMBLY:GFA_TO_FASTA_PRI_HIC' { - ext.prefix = { "${meta.id}.asm.hic.p_ctg" } + withName: '.*RAW_ASSEMBLY:GFA_TO_FASTA_HAP1_HIC' { + ext.prefix = { "${meta.id}.asm.hic.hap1" } publishDir = [ path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}" }, mode: params.publish_dir_mode, @@ -141,8 +141,8 @@ process { ] } - withName: '.*RAW_ASSEMBLY:GFA_TO_FASTA_ALT_HIC' { - ext.prefix = { "${meta.id}.asm.hic.a_ctg" } + withName: '.*RAW_ASSEMBLY:GFA_TO_FASTA_HAP2_HIC' { + ext.prefix = { "${meta.id}.asm.hic.hap2" } publishDir = [ path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}" }, mode: params.publish_dir_mode, @@ -158,7 +158,7 @@ process { ] } withName: '.*GENOME_STATISTICS_RAW_HIC:GFASTATS_PRI' { - ext.prefix = { "${meta.id}.asm.hic.p_ctg" } + ext.prefix = { "${meta.id}.asm.hic.hap1" } publishDir = [ path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}" }, mode: params.publish_dir_mode, @@ -166,8 +166,8 @@ process { ] } - withName: '.*GENOME_STATISTICS_RAW_HIC:GFASTATS_HAP' { - ext.prefix = { "${meta.id}.asm.hic.a_ctg" } + withName: '.*GENOME_STATISTICS_RAW_HIC:GFASTATS_ALT' { + ext.prefix = { "${meta.id}.asm.hic.hap2" } publishDir = [ path: { 
"${params.outdir}/${meta.id}.${params.hifiasmhic}" }, mode: params.publish_dir_mode, @@ -189,7 +189,7 @@ process { withName: '.*GENOME_STATISTICS_RAW_HIC:MERQURYFK_MERQURYFK' { publishDir = [ - path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/${meta.id}.p_ctg.ccs.merquryk" }, + path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/${meta.id}.hap1.ccs.merquryk" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -499,7 +499,7 @@ process { ] } - withName: '.*HIC_MAPPING:SAMTOOLS_MERGE_HIC_MAPPING' { + withName: '.*HIC_MAPPING.*:SAMTOOLS_MERGE_HIC_MAPPING' { ext.prefix = { "${meta.id}_merged" } } @@ -530,7 +530,7 @@ process { } - withName: '.*HIC_MAPPING:CONVERT_STATS:SAMTOOLS_VIEW' { + withName: '.*HIC_MAPPING.*:CONVERT_STATS:SAMTOOLS_VIEW' { ext.args = "--output-fmt cram" } @@ -559,7 +559,7 @@ process { } // Set up of the scffolding pipeline - withName: 'YAHS' { + withName: '.*SCAFFOLDING:YAHS' { ext.prefix = 'out' publishDir = [ path: { "${params.outdir}/${meta.id}.${params.hifiasm}/scaffolding/yahs/out.break.yahs" }, @@ -568,7 +568,7 @@ process { ] } - withName: 'COOLER_CLOAD' { + withName: '.*SCAFFOLDING:COOLER_CLOAD' { // Positions in the input file are zero-based; // chrom1 field number (one-based) is 2; // pos1 field number (one-based) is 3; @@ -582,7 +582,7 @@ process { ] } - withName: 'PRETEXTSNAPSHOT' { + withName: '.*SCAFFOLDING:PRETEXTSNAPSHOT' { // Make one plot containing all sequences ext.args = '--sequences \"=full\"' publishDir = [ @@ -592,7 +592,7 @@ process { ] } - withName: 'JUICER_TOOLS_PRE' { + withName: '.*SCAFFOLDING:JUICER_TOOLS_PRE' { ext.juicer_tools_jar = 'juicer_tools.1.9.9_jcuda.0.8.jar' ext.juicer_jvm_params = '-Xms1g -Xmx6g' publishDir = [ @@ -602,7 +602,7 @@ process { ] } - withName: 'JUICER_PRE' { + withName: '.*SCAFFOLDING:JUICER_PRE' { ext.args2 = "LC_ALL=C sort -k2,2d -k6,6d -S50G | awk '\$3>=0 && \$7>=0'" publishDir = [ path: { 
"${params.outdir}/${meta.id}.${params.hifiasm}/scaffolding/yahs/out.break.yahs" }, @@ -640,8 +640,242 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } + + // Scaffolding hap1/hap2 + + // hap1 scaffolding + + withName: '.*HIC_MAPPING_HAP1:SAMTOOLS_MARKDUP_HIC_MAPPING' { + ext.prefix = { "${meta.id}_mkdup" } + publishDir = [ + path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_hap1" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*HIC_MAPPING_HAP1:BAMTOBED_SORT' { + publishDir = [ + path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_hap1" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + + withName: '.*HIC_MAPPING_HAP1:CONVERT_STATS:SAMTOOLS_STATS' { + publishDir = [ + path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_hap1" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*HIC_MAPPING_HAP1:CONVERT_STATS:SAMTOOLS_FLAGSTAT' { + publishDir = [ + path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_hap1" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*HIC_MAPPING_HAP1:CONVERT_STATS:SAMTOOLS_IDXSTATS' { + publishDir = [ + path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_hap1" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*SCAFFOLDING_HAP1:YAHS' { + ext.prefix = 'out' + publishDir = [ + path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_hap1/yahs/out.break.yahs" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + + } + + withName: '.*SCAFFOLDING_HAP1:COOLER_CLOAD' { + // Positions in the input file are zero-based; + // chrom1 field number (one-based) is 2; + // pos1 field number (one-based) is 3; + // chrom2 field number (one-based) is 6; + // pos2 field number (one-based) is 7 + ext.args = 'pairs -0 -c1 2 -p1 3 -c2 6 -p2 7' + publishDir = [ + path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_hap1/yahs/out.break.yahs" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*SCAFFOLDING_HAP1:PRETEXTSNAPSHOT' { + // Make one plot containing all sequences + ext.args = '--sequences \"=full\"' + publishDir = [ + path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_hap1/yahs/out.break.yahs" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*SCAFFOLDING_HAP1:JUICER_TOOLS_PRE' { + ext.juicer_tools_jar = 'juicer_tools.1.9.9_jcuda.0.8.jar' + ext.juicer_jvm_params = '-Xms1g -Xmx6g' + publishDir = [ + path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_hap1/yahs/out.break.yahs" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*SCAFFOLDING_HAP1:JUICER_PRE' { + ext.args2 = "LC_ALL=C sort -k2,2d -k6,6d -S50G | awk '\$3>=0 && \$7>=0'" + publishDir = [ + path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_hap1/yahs/out.break.yahs" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } + + + // End of hap1 scaffolding + + // hap2 scaffolding + + withName: '.*HIC_MAPPING_HAP2:SAMTOOLS_MARKDUP_HIC_MAPPING' { + ext.prefix = { "${meta.id}_mkdup" } + publishDir = [ + path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_hap2" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*HIC_MAPPING_HAP2:BAMTOBED_SORT' { + publishDir = [ + path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_hap2" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + + withName: '.*HIC_MAPPING_HAP2:CONVERT_STATS:SAMTOOLS_STATS' { + publishDir = [ + path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_hap2" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*HIC_MAPPING_HAP2:CONVERT_STATS:SAMTOOLS_FLAGSTAT' { + publishDir = [ + path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_hap2" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*HIC_MAPPING_HAP2:CONVERT_STATS:SAMTOOLS_IDXSTATS' { + publishDir = [ + path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_hap2" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*SCAFFOLDING_HAP2:YAHS' { + ext.prefix = 'out' + publishDir = [ + path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_hap2/yahs/out.break.yahs" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + + } + + withName: '.*SCAFFOLDING_HAP2:COOLER_CLOAD' { + // Positions in the input file are zero-based; + // chrom1 field number (one-based) is 2; + // pos1 field number (one-based) is 3; + // chrom2 field number (one-based) is 6; + // pos2 field number (one-based) is 7 + ext.args = 'pairs -0 -c1 2 -p1 3 -c2 6 -p2 7' + publishDir = [ + path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_hap2/yahs/out.break.yahs" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*SCAFFOLDING_HAP2:PRETEXTSNAPSHOT' { + // Make one plot containing all sequences + ext.args = '--sequences \"=full\"' + publishDir = [ + path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_hap2/yahs/out.break.yahs" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*SCAFFOLDING_HAP2:JUICER_TOOLS_PRE' { + ext.juicer_tools_jar = 'juicer_tools.1.9.9_jcuda.0.8.jar' + ext.juicer_jvm_params = '-Xms1g -Xmx6g' + publishDir = [ + path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_hap2/yahs/out.break.yahs" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*SCAFFOLDING_HAP2:JUICER_PRE' { + ext.args2 = "LC_ALL=C sort -k2,2d -k6,6d -S50G | awk '\$3>=0 && \$7>=0'" + publishDir = [ + path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_hap2/yahs/out.break.yahs" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } + + // End of hap2 scaffolding + + withName: '.*GENOME_STATISTICS_SCAFFOLDS_HAPS:GFASTATS_PRI' { + ext.prefix = { "${meta.id}_scaffolds_final" } + publishDir = [ + path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding/yahs/out.break.yahs" }, + mode: params.publish_dir_mode, + pattern: '*assembly_summary' + ] + } + + withName: '.*GENOME_STATISTICS_SCAFFOLDS_HAPS:BUSCO' { + publishDir = [ + path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding/yahs/out.break.yahs/out_scaffolds_final.${meta.lineage}.busco" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.endsWith('busco.log') ? filename : + filename.endsWith('full_table.tsv') ? filename : + filename.endsWith('missing_busco_list.tsv') ? filename : + filename.startsWith('short_summary') ? filename : + filename.endsWith('busco.batch_summary.txt') ? filename : + null } + ] + } + + withName: '.*GENOME_STATISTICS_SCAFFOLDS_HAPS:MERQURYFK_MERQURYFK' { + publishDir = [ + path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding/yahs/out.break.yahs/out_scaffolds_final.ccs.merquryk" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } + + // End of Scaffolding hap1/hap2 // End of Set up of the scaffolding pipeline + //Set up of assembly stats subworkflow withName: 'BUSCO' { ext.args = "--mode genome" diff --git a/modules.json b/modules.json index 1fd59692..9bf9d01d 100644 --- a/modules.json +++ b/modules.json @@ -8,250 +8,344 @@ "bcftools/concat": { "branch": "master", "git_sha": "582ff1755bdd205c65e2ba4c31e0a008dae299ec", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "bcftools/consensus": { "branch": "master", "git_sha": "fa12afdf5874c1d11e4a20efe81c97935e8eea24", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "bcftools/index": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "bcftools/norm": { "branch": "master", "git_sha": "0435e4eebc94e53721c194b2d5d06f455a79e407", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "bcftools/sort": { "branch": "master", "git_sha": "4a21e4cca35e72ec059abd67f790e0b192ce5d81", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "bcftools/view": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "bedtools/bamtobed": { "branch": "master", "git_sha": "9e51255c4f8ec69fb6ccf68593392835f14fecb8", - "installed_by": ["modules"], + "installed_by": [ + "modules" + ], "patch": "modules/nf-core/bedtools/bamtobed/bedtools-bamtobed.diff" }, "busco": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "bwamem2/index": { "branch": "master", "git_sha": "bfed129da5134b4439b1821c917972570d44d39c", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "cat/cat": { "branch": "master", "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", - "installed_by": ["modules"] + 
"installed_by": [ + "modules" + ] }, "cooler/cload": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "cooler/zoomify": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "custom/dumpsoftwareversions": { "branch": "master", "git_sha": "76cc4938c1f6ea5c7d83fed1eeffc146787f9543", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "fastk/fastk": { "branch": "master", "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "fastk/histex": { "branch": "master", "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "freebayes": { "branch": "master", "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", - "installed_by": ["modules"], + "installed_by": [ + "modules" + ], "patch": "modules/nf-core/freebayes/freebayes.diff" }, "gatk4/mergevcfs": { "branch": "master", "git_sha": "643756685546fa61f5c8fba439af746c090b9180", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "genescopefk": { "branch": "master", "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "gfastats": { "branch": "master", "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", - "installed_by": ["modules"], + "installed_by": [ + "modules" + ], "patch": "modules/nf-core/gfastats/gfastats.diff" }, "gunzip": { "branch": "master", "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "hifiasm": { "branch": "master", "git_sha": "aecb06fcdb995ff3e3df7c7a1fd119367d6d1996", - "installed_by": ["modules"], + "installed_by": [ + "modules" + ], "patch": "modules/nf-core/hifiasm/hifiasm.diff" }, "merquryfk/merquryfk": { 
"branch": "master", "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "minimap2/align": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["modules"], + "installed_by": [ + "modules" + ], "patch": "modules/nf-core/minimap2/align/minimap2-align.diff" }, "minimap2/index": { "branch": "master", "git_sha": "72e277acfd9e61a9f1368eafb4a9e83f5bcaa9f5", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "mitohifi/findmitoreference": { "branch": "master", "git_sha": "f52220e84bfc16a8616a5bb3d6f5bc67d601bdce", - "installed_by": ["modules"], + "installed_by": [ + "modules" + ], "patch": "modules/nf-core/mitohifi/findmitoreference/mitohifi-findmitoreference.diff" }, "mitohifi/mitohifi": { "branch": "master", "git_sha": "c607e74f7aa72eb7cb7cc0a1454f97d3907e8d84", - "installed_by": ["modules"], + "installed_by": [ + "modules" + ], "patch": "modules/nf-core/mitohifi/mitohifi/mitohifi-mitohifi.diff" }, "oatk": { "branch": "master", "git_sha": "d6a146325058eb9a18da2e898a2376e1d1093052", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "pretextmap": { "branch": "master", "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "pretextsnapshot": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["modules"], + "installed_by": [ + "modules" + ], "patch": "modules/nf-core/pretextsnapshot/pretextsnapshot.diff" }, "purgedups/calcuts": { "branch": "master", "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", - "installed_by": ["modules"], + "installed_by": [ + "modules" + ], "patch": "modules/nf-core/purgedups/calcuts/purgedups-calcuts.diff" }, "purgedups/getseqs": { "branch": "master", "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, 
"purgedups/pbcstat": { "branch": "master", "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "purgedups/purgedups": { "branch": "master", "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "purgedups/splitfa": { "branch": "master", "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "samtools/collate": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "samtools/faidx": { "branch": "master", "git_sha": "fd742419940e01ba1c5ecb172c3e32ec840662fe", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "samtools/fixmate": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "samtools/flagstat": { "branch": "master", "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "samtools/idxstats": { "branch": "master", "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "samtools/index": { "branch": "master", "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "samtools/markdup": { "branch": "master", "git_sha": "9e51255c4f8ec69fb6ccf68593392835f14fecb8", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "samtools/merge": { "branch": "master", "git_sha": "0460d316170f75f323111b4a2c0a2989f0c32013", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "samtools/sort": { "branch": "master", "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, 
"samtools/stats": { "branch": "master", "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "samtools/view": { "branch": "master", "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "seqtk/subseq": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["modules"], + "installed_by": [ + "modules" + ], "patch": "modules/nf-core/seqtk/subseq/seqtk-subseq.diff" }, "yahs": { "branch": "master", "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] } } } } } -} +} \ No newline at end of file diff --git a/modules/nf-core/hifiasm/hifiasm.diff b/modules/nf-core/hifiasm/hifiasm.diff index c9103cee..0594f11d 100644 --- a/modules/nf-core/hifiasm/hifiasm.diff +++ b/modules/nf-core/hifiasm/hifiasm.diff @@ -13,7 +13,7 @@ Changes in module 'nf-core/hifiasm' + exit 1, "This version of HIFIASM module does not support Conda. Please use Docker / Singularity / Podman instead." 
+ } + -+ container "wave.seqera.io/wt/73ac3caec075/wave/build:hifiasm-0.19.8_samtools-1.20--1f6824530f0d0ad5" ++ container "quay.io/sanger-tol/hifiasm_samtools:0.01" input: tuple val(meta), path(reads) @@ -25,12 +25,14 @@ Changes in module 'nf-core/hifiasm' output: tuple val(meta), path("*.r_utg.gfa") , emit: raw_unitigs -@@ -23,8 +25,10 @@ +@@ -23,8 +25,12 @@ tuple val(meta), path("*.p_utg.gfa") , emit: processed_unitigs, optional: true tuple val(meta), path("*.asm.p_ctg.gfa") , emit: primary_contigs , optional: true tuple val(meta), path("*.asm.a_ctg.gfa") , emit: alternate_contigs, optional: true - tuple val(meta), path("*.hap1.p_ctg.gfa") , emit: paternal_contigs , optional: true - tuple val(meta), path("*.hap2.p_ctg.gfa") , emit: maternal_contigs , optional: true ++ tuple val(meta), path("*.asm.hic.hap1.p_ctg.gfa") , emit: hap1_contigs , optional: true ++ tuple val(meta), path("*.asm.hic.hap2.p_ctg.gfa") , emit: hap2_contigs , optional: true + tuple val(meta), path("*.asm.hic.p_ctg.gfa") , emit: hic_primary_contigs , optional: true + tuple val(meta), path("*.asm.hic.a_ctg.gfa") , emit: hic_alternate_contigs , optional: true + tuple val(meta), path("*.asm.hic.hap1.p_ctg.gfa") , emit: paternal_contigs , optional: true @@ -38,7 +40,7 @@ Changes in module 'nf-core/hifiasm' tuple val(meta), path("*.log") , emit: log path "versions.yml" , emit: versions -@@ -34,6 +38,8 @@ +@@ -34,6 +40,8 @@ script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" @@ -47,7 +49,7 @@ Changes in module 'nf-core/hifiasm' if ((paternal_kmer_dump) && (maternal_kmer_dump) && (hic_read1) && (hic_read2)) { error "Hifiasm Trio-binning and Hi-C integrated should not be used at the same time" } else if ((paternal_kmer_dump) && !(maternal_kmer_dump)) { -@@ -67,8 +73,8 @@ +@@ -67,8 +75,8 @@ $args \\ -o ${prefix}.asm \\ -t $task.cpus \\ @@ -59,4 +61,15 @@ Changes in module 'nf-core/hifiasm' 2> >( tee ${prefix}.stderr.log >&2 ) +--- modules/nf-core/hifiasm/environment.yml 
++++ /dev/null +@@ -1,7 +0,0 @@ +-name: hifiasm +-channels: +- - conda-forge +- - bioconda +- - defaults +-dependencies: +- - bioconda::hifiasm=0.19.8 + ************************************************************ diff --git a/modules/nf-core/hifiasm/main.nf b/modules/nf-core/hifiasm/main.nf index 7854a89e..c7b87418 100644 --- a/modules/nf-core/hifiasm/main.nf +++ b/modules/nf-core/hifiasm/main.nf @@ -25,6 +25,8 @@ process HIFIASM { tuple val(meta), path("*.p_utg.gfa") , emit: processed_unitigs, optional: true tuple val(meta), path("*.asm.p_ctg.gfa") , emit: primary_contigs , optional: true tuple val(meta), path("*.asm.a_ctg.gfa") , emit: alternate_contigs, optional: true + tuple val(meta), path("*.asm.hic.hap1.p_ctg.gfa") , emit: hap1_contigs , optional: true + tuple val(meta), path("*.asm.hic.hap2.p_ctg.gfa") , emit: hap2_contigs , optional: true tuple val(meta), path("*.asm.hic.p_ctg.gfa") , emit: hic_primary_contigs , optional: true tuple val(meta), path("*.asm.hic.a_ctg.gfa") , emit: hic_alternate_contigs , optional: true tuple val(meta), path("*.asm.hic.hap1.p_ctg.gfa") , emit: paternal_contigs , optional: true diff --git a/subworkflows/local/raw_assembly.nf b/subworkflows/local/raw_assembly.nf index 37aeb349..703efbf4 100644 --- a/subworkflows/local/raw_assembly.nf +++ b/subworkflows/local/raw_assembly.nf @@ -3,8 +3,8 @@ include { HIFIASM as HIFIASM_HIC } from '../../modules/nf-core/hi include { GFA_TO_FASTA as GFA_TO_FASTA_PRI } from '../../modules/local/gfa_to_fasta' include { GFA_TO_FASTA as GFA_TO_FASTA_ALT } from '../../modules/local/gfa_to_fasta' -include { GFA_TO_FASTA as GFA_TO_FASTA_PRI_HIC } from '../../modules/local/gfa_to_fasta' -include { GFA_TO_FASTA as GFA_TO_FASTA_ALT_HIC } from '../../modules/local/gfa_to_fasta' +include { GFA_TO_FASTA as GFA_TO_FASTA_HAP1_HIC } from '../../modules/local/gfa_to_fasta' +include { GFA_TO_FASTA as GFA_TO_FASTA_HAP2_HIC } from '../../modules/local/gfa_to_fasta' workflow RAW_ASSEMBLY { take: @@ -44,19 +44,19 @@ 
workflow RAW_ASSEMBLY { // // MODULE: CONVERT HIFIASM-HIC PRIMARY CONTIGS TO FASTA // - GFA_TO_FASTA_PRI_HIC( HIFIASM_HIC.out.hic_primary_contigs ) + GFA_TO_FASTA_HAP1_HIC( HIFIASM_HIC.out.hap1_contigs ) // // MODULE: CONVERT HIFIASM-HIC ALT CONTIGS TO FASTA // - GFA_TO_FASTA_ALT_HIC( HIFIASM_HIC.out.hic_alternate_contigs ) + GFA_TO_FASTA_HAP2_HIC( HIFIASM_HIC.out.hap2_contigs ) } emit: primary_contigs = GFA_TO_FASTA_PRI.out.fasta alternate_contigs = GFA_TO_FASTA_ALT.out.fasta - primary_hic_contigs = hifiasm_hic_on ? GFA_TO_FASTA_PRI_HIC.out.fasta : null - alternate_hic_contigs = hifiasm_hic_on ? GFA_TO_FASTA_ALT_HIC.out.fasta : null + hap1_hic_contigs = hifiasm_hic_on ? GFA_TO_FASTA_HAP1_HIC.out.fasta : null + hap2_hic_contigs = hifiasm_hic_on ? GFA_TO_FASTA_HAP2_HIC.out.fasta : null versions = ch_versions } diff --git a/workflows/genomeassembly.nf b/workflows/genomeassembly.nf index a59ec08c..6f7458d6 100644 --- a/workflows/genomeassembly.nf +++ b/workflows/genomeassembly.nf @@ -35,21 +35,26 @@ if (params.organelles_on) { organelles_on = params.organelles_on } else { organe // // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules // -include { PREPARE_INPUT } from '../subworkflows/local/prepare_input' -include { RAW_ASSEMBLY } from '../subworkflows/local/raw_assembly' -include { ORGANELLES } from '../subworkflows/local/organelles' -include { GENOMESCOPE_MODEL } from '../subworkflows/local/genomescope_model' -include { PURGE_DUPS } from '../subworkflows/local/purge_dups' -include { POLISHING } from '../subworkflows/local/polishing' -include { SCAFFOLDING } from '../subworkflows/local/scaffolding' -include { KEEP_SEQNAMES as KEEP_SEQNAMES_PRIMARY } from '../modules/local/keep_seqnames' -include { KEEP_SEQNAMES as KEEP_SEQNAMES_HAPLOTIGS } from '../modules/local/keep_seqnames' -include { HIC_MAPPING } from '../subworkflows/local/hic_mapping' -include { GENOME_STATISTICS as GENOME_STATISTICS_RAW } from '../subworkflows/local/genome_statistics' -include { 
GENOME_STATISTICS as GENOME_STATISTICS_RAW_HIC } from '../subworkflows/local/genome_statistics' -include { GENOME_STATISTICS as GENOME_STATISTICS_PURGED } from '../subworkflows/local/genome_statistics' -include { GENOME_STATISTICS as GENOME_STATISTICS_POLISHED } from '../subworkflows/local/genome_statistics' -include { GENOME_STATISTICS as GENOME_STATISTICS_SCAFFOLDS } from '../subworkflows/local/genome_statistics' +include { PREPARE_INPUT } from '../subworkflows/local/prepare_input' +include { RAW_ASSEMBLY } from '../subworkflows/local/raw_assembly' +include { ORGANELLES } from '../subworkflows/local/organelles' +include { GENOMESCOPE_MODEL } from '../subworkflows/local/genomescope_model' +include { PURGE_DUPS } from '../subworkflows/local/purge_dups' +include { POLISHING } from '../subworkflows/local/polishing' +include { SCAFFOLDING } from '../subworkflows/local/scaffolding' +include { SCAFFOLDING as SCAFFOLDING_HAP1 } from '../subworkflows/local/scaffolding' +include { SCAFFOLDING as SCAFFOLDING_HAP2 } from '../subworkflows/local/scaffolding' +include { KEEP_SEQNAMES as KEEP_SEQNAMES_PRIMARY } from '../modules/local/keep_seqnames' +include { KEEP_SEQNAMES as KEEP_SEQNAMES_HAPLOTIGS } from '../modules/local/keep_seqnames' +include { HIC_MAPPING } from '../subworkflows/local/hic_mapping' +include { HIC_MAPPING as HIC_MAPPING_HAP1 } from '../subworkflows/local/hic_mapping' +include { HIC_MAPPING as HIC_MAPPING_HAP2 } from '../subworkflows/local/hic_mapping' +include { GENOME_STATISTICS as GENOME_STATISTICS_RAW } from '../subworkflows/local/genome_statistics' +include { GENOME_STATISTICS as GENOME_STATISTICS_RAW_HIC } from '../subworkflows/local/genome_statistics' +include { GENOME_STATISTICS as GENOME_STATISTICS_PURGED } from '../subworkflows/local/genome_statistics' +include { GENOME_STATISTICS as GENOME_STATISTICS_POLISHED } from '../subworkflows/local/genome_statistics' +include { GENOME_STATISTICS as GENOME_STATISTICS_SCAFFOLDS } from 
'../subworkflows/local/genome_statistics' +include { GENOME_STATISTICS as GENOME_STATISTICS_SCAFFOLDS_HAPS } from '../subworkflows/local/genome_statistics' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -164,8 +169,8 @@ workflow GENOMEASSEMBLY { // // SUBWORKFLOW: CALCULATE RAW ASSEMBLY STATISTICS FOR THE HIFIASN IN HIC MODE // - GENOME_STATISTICS_RAW_HIC( RAW_ASSEMBLY.out.primary_hic_contigs - .join(RAW_ASSEMBLY.out.alternate_hic_contigs), + GENOME_STATISTICS_RAW_HIC( RAW_ASSEMBLY.out.hap1_hic_contigs + .join(RAW_ASSEMBLY.out.hap2_hic_contigs), PREPARE_INPUT.out.busco, GENOMESCOPE_MODEL.out.hist, GENOMESCOPE_MODEL.out.ktab @@ -352,6 +357,50 @@ workflow GENOMEASSEMBLY { GENOMESCOPE_MODEL.out.ktab ) + + if ( hifiasm_hic_on ) { + // + // SUBWORKFLOW: MAP HIC DATA TO THE HAP1 CONTIGS + // + HIC_MAPPING_HAP1 ( RAW_ASSEMBLY.out.hap1_hic_contigs, crams_ch, hic_aligner_ch ) + ch_versions = ch_versions.mix(HIC_MAPPING_HAP1.out.versions) + + // + // SUBWORKFLOW: SCAFFOLD HAP1 + // + SCAFFOLDING_HAP1( HIC_MAPPING_HAP1.out.bed, RAW_ASSEMBLY.out.hap1_hic_contigs, cool_bin ) + ch_versions = ch_versions.mix(SCAFFOLDING_HAP1.out.versions) + + // + // SUBWORKFLOW: MAP HIC DATA TO THE HAP2 CONTIGS + // + HIC_MAPPING_HAP2 ( RAW_ASSEMBLY.out.hap2_hic_contigs, crams_ch, hic_aligner_ch ) + ch_versions = ch_versions.mix(HIC_MAPPING_HAP2.out.versions) + + // + // SUBWORKFLOW: SCAFFOLD HAP2 + // + SCAFFOLDING_HAP2( HIC_MAPPING_HAP2.out.bed, RAW_ASSEMBLY.out.hap2_hic_contigs, cool_bin ) + ch_versions = ch_versions.mix(SCAFFOLDING_HAP2.out.versions) + + // + // LOGIC: CREATE A CHANNEL FOR THE FULL HAP1/HAP2 ASSEMBLY + // + SCAFFOLDING_HAP1.out.fasta.combine(SCAFFOLDING_HAP2.out.fasta) + .map{meta_s, fasta_s, meta_h, fasta_h -> [ meta_h, fasta_s, fasta_h ]} + .set{ stats_haps_input_ch } + + // + // SUBWORKFLOW: CALCULATE ASSEMBLY STATISTICS FOR HAP1/HAP2 ASSEMBLY + // + GENOME_STATISTICS_SCAFFOLDS_HAPS( stats_haps_input_ch, + 
PREPARE_INPUT.out.busco, + GENOMESCOPE_MODEL.out.hist, + GENOMESCOPE_MODEL.out.ktab + ) + + } + // // MODULE: Collate versions.yml file // From f2e0856448fa6ae508bd154b9771a25c254efc9d Mon Sep 17 00:00:00 2001 From: Ksenia Krasheninnikova Date: Thu, 1 Aug 2024 16:45:32 +0100 Subject: [PATCH 02/16] Fix modules --- conf/modules.config | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 7f7c69aa..0010a96c 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -642,7 +642,7 @@ process { } // Scaffolding hap1/hap2 - + if (params.hifiasm_hic_on) { // hap1 scaffolding withName: '.*HIC_MAPPING_HAP1:SAMTOOLS_MARKDUP_HIC_MAPPING' { @@ -839,7 +839,7 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } - + } // End of hap2 scaffolding withName: '.*GENOME_STATISTICS_SCAFFOLDS_HAPS:GFASTATS_PRI' { From 0a18c542bae82c79883824fd57e8cacc2bf1cb50 Mon Sep 17 00:00:00 2001 From: Ksenia Krasheninnikova Date: Mon, 5 Aug 2024 15:05:04 +0100 Subject: [PATCH 03/16] Fix naming prefix --- conf/modules.config | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 0010a96c..9ae4499c 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -845,7 +845,16 @@ process { withName: '.*GENOME_STATISTICS_SCAFFOLDS_HAPS:GFASTATS_PRI' { ext.prefix = { "${meta.id}_scaffolds_final" } publishDir = [ - path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding/yahs/out.break.yahs" }, + path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_hap1/yahs/out.break.yahs" }, + mode: params.publish_dir_mode, + pattern: '*assembly_summary' + ] + } + + withName: '.*GENOME_STATISTICS_SCAFFOLDS_HAPS:GFASTATS_ALT' { + ext.prefix = { "${meta.id}_scaffolds_final" } + publishDir = [ + path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_hap2/yahs/out.break.yahs" }, mode: 
params.publish_dir_mode, pattern: '*assembly_summary' ] @@ -853,7 +862,7 @@ process { withName: '.*GENOME_STATISTICS_SCAFFOLDS_HAPS:BUSCO' { publishDir = [ - path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding/yahs/out.break.yahs/out_scaffolds_final.${meta.lineage}.busco" }, + path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_hap1/yahs/out.break.yahs/out_scaffolds_final.${meta.lineage}.busco" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.endsWith('busco.log') ? filename : filename.endsWith('full_table.tsv') ? filename : @@ -866,7 +875,7 @@ process { withName: '.*GENOME_STATISTICS_SCAFFOLDS_HAPS:MERQURYFK_MERQURYFK' { publishDir = [ - path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding/yahs/out.break.yahs/out_scaffolds_final.ccs.merquryk" }, + path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_hap1/yahs/out.break.yahs/out_scaffolds_final.ccs.merquryk" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } ] From 830fa9ccf49df1660a1a3a53a80354522a36f35b Mon Sep 17 00:00:00 2001 From: Ksenia Krasheninnikova Date: Mon, 5 Aug 2024 15:12:08 +0100 Subject: [PATCH 04/16] Fix naming error --- conf/modules.config | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index 9ae4499c..682b478d 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -166,7 +166,7 @@ process { ] } - withName: '.*GENOME_STATISTICS_RAW_HIC:GFASTATS_ALT' { + withName: '.*GENOME_STATISTICS_RAW_HIC:GFASTATS_HAP' { ext.prefix = { "${meta.id}.asm.hic.hap2" } publishDir = [ path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}" }, @@ -174,6 +174,7 @@ process { pattern: '*assembly_summary' ] } + withName: '.*GENOME_STATISTICS_RAW_HIC:BUSCO' { publishDir = [ path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/${meta.id}.p_ctg.${meta.lineage}.busco" }, From 9a2a39a960e7aa33afe9f69d026aa3f3e4efcb0c Mon Sep 17 00:00:00 2001 From: Ksenia Krasheninnikova Date: Mon, 5 Aug 2024 16:23:46 +0100 Subject: [PATCH 05/16] Fix meta after hic mapping --- subworkflows/local/hic_bwamem2.nf | 4 ++-- subworkflows/local/hic_minimap2.nf | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/subworkflows/local/hic_bwamem2.nf b/subworkflows/local/hic_bwamem2.nf index f053ef89..8c917fdf 100755 --- a/subworkflows/local/hic_bwamem2.nf +++ b/subworkflows/local/hic_bwamem2.nf @@ -68,7 +68,7 @@ workflow HIC_BWAMEM2 { .map { file -> tuple ( [ - id: file[0].toString().split('/')[-1].split('_')[0] + '_' + file[0].toString().split('/')[-1].split('_')[1] + id: file[0].toString().split('/')[-1].split('_')[0] ], file ) @@ -78,4 +78,4 @@ workflow HIC_BWAMEM2 { emit: mappedbams = collected_files_for_merge versions = ch_versions.ifEmpty(null) -} \ No newline at end of file +} diff --git a/subworkflows/local/hic_minimap2.nf b/subworkflows/local/hic_minimap2.nf index 3eff6e43..355b0843 100755 --- a/subworkflows/local/hic_minimap2.nf +++ 
b/subworkflows/local/hic_minimap2.nf @@ -77,7 +77,7 @@ workflow HIC_MINIMAP2 { .map { file -> tuple ( [ - id: file[0].toString().split('/')[-1].split('_')[0] + '_' + file[0].toString().split('/')[-1].split('_')[1] + id: file[0].toString().split('/')[-1].split('_')[0] ], file ) From 92c35da40c0766852dd87f6cc3abfbc8f3444fb8 Mon Sep 17 00:00:00 2001 From: Ksenia Krasheninnikova Date: Mon, 5 Aug 2024 16:24:41 +0100 Subject: [PATCH 06/16] Fix collision in scaffolding hap1/hap2 naming --- conf/modules.config | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 682b478d..fca2cd93 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -689,7 +689,7 @@ process { } withName: '.*SCAFFOLDING_HAP1:YAHS' { - ext.prefix = 'out' + ext.prefix = 'hap1' publishDir = [ path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_hap1/yahs/out.break.yahs" }, mode: params.publish_dir_mode, @@ -789,7 +789,7 @@ process { } withName: '.*SCAFFOLDING_HAP2:YAHS' { - ext.prefix = 'out' + ext.prefix = 'hap2' publishDir = [ path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_hap2/yahs/out.break.yahs" }, mode: params.publish_dir_mode, @@ -852,7 +852,7 @@ process { ] } - withName: '.*GENOME_STATISTICS_SCAFFOLDS_HAPS:GFASTATS_ALT' { + withName: '.*GENOME_STATISTICS_SCAFFOLDS_HAPS:GFASTATS_HAP' { ext.prefix = { "${meta.id}_scaffolds_final" } publishDir = [ path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_hap2/yahs/out.break.yahs" }, From f23a356a3d8c29094c0ecf8f0b79460119ea3c92 Mon Sep 17 00:00:00 2001 From: Ksenia Krasheninnikova Date: Mon, 5 Aug 2024 16:55:20 +0100 Subject: [PATCH 07/16] Run BUSCO on alts for hap1/hap2 --- conf/modules.config | 38 +++++++++++++++++++++---- subworkflows/local/genome_statistics.nf | 21 ++++++++++++-- workflows/genomeassembly.nf | 24 ++++++++++++---- 3 files changed, 68 insertions(+), 15 deletions(-) diff --git a/conf/modules.config 
b/conf/modules.config index fca2cd93..964aebd0 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -102,7 +102,7 @@ process { ] } - withName: '.*GENOME_STATISTICS_RAW:BUSCO' { + withName: '.*GENOME_STATISTICS_RAW:BUSCO_PRI' { publishDir = [ path: { "${params.outdir}/${meta.id}.${params.hifiasm}/${meta.id}.p_ctg.${meta.lineage}.busco" }, mode: params.publish_dir_mode, @@ -175,9 +175,22 @@ process { ] } - withName: '.*GENOME_STATISTICS_RAW_HIC:BUSCO' { + withName: '.*GENOME_STATISTICS_RAW_HIC:BUSCO_PRI' { publishDir = [ - path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/${meta.id}.p_ctg.${meta.lineage}.busco" }, + path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/${meta.id}.hap1.${meta.lineage}.busco" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.endsWith('busco.log') ? filename : + filename.endsWith('full_table.tsv') ? filename : + filename.endsWith('missing_busco_list.tsv') ? filename : + filename.startsWith('short_summary') ? filename : + filename.endsWith('busco.batch_summary.txt') ? filename : + null } + ] + } + + withName: '.*GENOME_STATISTICS_RAW_HIC:BUSCO_HAP' { + publishDir = [ + path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/${meta.id}.hap2.${meta.lineage}.busco" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.endsWith('busco.log') ? filename : filename.endsWith('full_table.tsv') ? 
filename : @@ -333,7 +346,7 @@ process { ] } - withName: '.*GENOME_STATISTICS_PURGED:BUSCO' { + withName: '.*GENOME_STATISTICS_PURGED:BUSCO_PRI' { publishDir = [ path: { "${params.outdir}/${meta.id}.${params.hifiasm}/purging/${meta.id}.purged.${meta.lineage}.busco" }, mode: params.publish_dir_mode, @@ -621,7 +634,7 @@ process { ] } - withName: '.*GENOME_STATISTICS_SCAFFOLDS:BUSCO' { + withName: '.*GENOME_STATISTICS_SCAFFOLDS:BUSCO_PRI' { publishDir = [ path: { "${params.outdir}/${meta.id}.${params.hifiasm}/scaffolding/yahs/out.break.yahs/out_scaffolds_final.${meta.lineage}.busco" }, mode: params.publish_dir_mode, @@ -861,7 +874,7 @@ process { ] } - withName: '.*GENOME_STATISTICS_SCAFFOLDS_HAPS:BUSCO' { + withName: '.*GENOME_STATISTICS_SCAFFOLDS_HAPS:BUSCO_PRI' { publishDir = [ path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_hap1/yahs/out.break.yahs/out_scaffolds_final.${meta.lineage}.busco" }, mode: params.publish_dir_mode, @@ -874,6 +887,19 @@ process { ] } + withName: '.*GENOME_STATISTICS_SCAFFOLDS_HAPS:BUSCO_HAP' { + publishDir = [ + path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_hap2/yahs/out.break.yahs/out_scaffolds_final.${meta.lineage}.busco" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.endsWith('busco.log') ? filename : + filename.endsWith('full_table.tsv') ? filename : + filename.endsWith('missing_busco_list.tsv') ? filename : + filename.startsWith('short_summary') ? filename : + filename.endsWith('busco.batch_summary.txt') ? 
filename : + null } + ] + } + withName: '.*GENOME_STATISTICS_SCAFFOLDS_HAPS:MERQURYFK_MERQURYFK' { publishDir = [ path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_hap1/yahs/out.break.yahs/out_scaffolds_final.ccs.merquryk" }, diff --git a/subworkflows/local/genome_statistics.nf b/subworkflows/local/genome_statistics.nf index 37524e45..d4cfcb6b 100644 --- a/subworkflows/local/genome_statistics.nf +++ b/subworkflows/local/genome_statistics.nf @@ -9,7 +9,8 @@ include { GFASTATS as GFASTATS_PRI } from '../../modules/nf-core/gfastats/main' include { GFASTATS as GFASTATS_HAP } from '../../modules/nf-core/gfastats/main' -include { BUSCO } from '../../modules/nf-core/busco/main' +include { BUSCO as BUSCO_PRI } from '../../modules/nf-core/busco/main' +include { BUSCO as BUSCO_HAP } from '../../modules/nf-core/busco/main' include { MERQURYFK_MERQURYFK } from '../../modules/nf-core/merquryfk/merquryfk/main' workflow GENOME_STATISTICS { @@ -18,6 +19,7 @@ workflow GENOME_STATISTICS { lineage // channel: [ meta, /path/to/buscoDB, lineage ] hist // channel: [meta, fastk_hist files] ktab // channel: [meta, fastk_ktab files] + busco_alt // channel: true/false main: ch_versions = Channel.empty() @@ -48,14 +50,27 @@ workflow GENOME_STATISTICS { // // MODULE: RUN BUSCO ON PRIMARY ASSEMBLY // - BUSCO ( primary_ch.join(lineage) + BUSCO_PRI ( primary_ch.join(lineage) .map{ meta, primary, lineage_db, lineage_name -> [[id:meta.id, lineage:lineage_name], primary]}, lineage.map{ meta, lineage_db, lineage_name -> lineage_name } , lineage.map{ meta, lineage_db, ch_lineage -> lineage_db }, [] ) - ch_versions = ch_versions.mix(BUSCO.out.versions.first()) + ch_versions = ch_versions.mix(BUSCO_PRI.out.versions.first()) + // + // MODULE: run BUSCO for haplotigs + // USED FOR HAP1/HAP2 ASSEMBLIES + // + if ( busco_alt ) { + BUSCO_HAP ( haplotigs_ch.join(lineage) + .map{ meta, haps, lineage_db, lineage_name -> + [[id:meta.id, lineage:lineage_name], haps]}, + lineage.map{ meta, 
lineage_db, lineage_name -> lineage_name } , + lineage.map{ meta, lineage_db, ch_lineage -> lineage_db }, + [] ) + } + // // LOGIC: JOIN ASSEMBLY AND KMER DATABASE INPUT // diff --git a/workflows/genomeassembly.nf b/workflows/genomeassembly.nf index 6f7458d6..b53f50b5 100644 --- a/workflows/genomeassembly.nf +++ b/workflows/genomeassembly.nf @@ -121,13 +121,20 @@ workflow GENOMEASSEMBLY { // RAW_ASSEMBLY.out.alternate_contigs.set{ haplotigs_ch } + // + // LOGIC: DECLARE CONSTANTS TO TOGGLE BUSCO FOR ALTS + // + set_busco_alts = true + unset_busco_alts = false + // // SUBWORKFLOW: CALCULATE STATISTICS FOR THE RAW ASSEMBLY // GENOME_STATISTICS_RAW( primary_contigs_ch.join(haplotigs_ch), PREPARE_INPUT.out.busco, GENOMESCOPE_MODEL.out.hist, - GENOMESCOPE_MODEL.out.ktab + GENOMESCOPE_MODEL.out.ktab, + unset_busco_alts ) ch_versions = ch_versions.mix(GENOME_STATISTICS_RAW.out.versions) @@ -173,7 +180,8 @@ workflow GENOMEASSEMBLY { .join(RAW_ASSEMBLY.out.hap2_hic_contigs), PREPARE_INPUT.out.busco, GENOMESCOPE_MODEL.out.hist, - GENOMESCOPE_MODEL.out.ktab + GENOMESCOPE_MODEL.out.ktab, + set_busco_alts ) } @@ -215,7 +223,8 @@ workflow GENOMEASSEMBLY { GENOME_STATISTICS_PURGED( primary_contigs_ch.join(haplotigs_ch), PREPARE_INPUT.out.busco, GENOMESCOPE_MODEL.out.hist, - GENOMESCOPE_MODEL.out.ktab + GENOMESCOPE_MODEL.out.ktab, + unset_busco_alts ) // @@ -315,7 +324,8 @@ workflow GENOMEASSEMBLY { GENOME_STATISTICS_POLISHED( polished_asm_stats_input_ch, PREPARE_INPUT.out.busco, GENOMESCOPE_MODEL.out.hist, - GENOMESCOPE_MODEL.out.ktab + GENOMESCOPE_MODEL.out.ktab, + unset_busco_alts ) } @@ -354,7 +364,8 @@ workflow GENOMEASSEMBLY { GENOME_STATISTICS_SCAFFOLDS( stats_input_ch, PREPARE_INPUT.out.busco, GENOMESCOPE_MODEL.out.hist, - GENOMESCOPE_MODEL.out.ktab + GENOMESCOPE_MODEL.out.ktab, + unset_busco_alts ) @@ -396,7 +407,8 @@ workflow GENOMEASSEMBLY { GENOME_STATISTICS_SCAFFOLDS_HAPS( stats_haps_input_ch, PREPARE_INPUT.out.busco, GENOMESCOPE_MODEL.out.hist, - 
GENOMESCOPE_MODEL.out.ktab + GENOMESCOPE_MODEL.out.ktab, + set_busco_alts ) } From 59dd0d5e634f7a327fc46368e8b32bc3698e3e29 Mon Sep 17 00:00:00 2001 From: Ksenia Krasheninnikova Date: Tue, 6 Aug 2024 13:46:49 +0100 Subject: [PATCH 08/16] Move busco constants outside of main block --- workflows/genomeassembly.nf | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/workflows/genomeassembly.nf b/workflows/genomeassembly.nf index b53f50b5..14fe3c7a 100644 --- a/workflows/genomeassembly.nf +++ b/workflows/genomeassembly.nf @@ -20,6 +20,11 @@ if (params.cool_bin) { cool_bin = params.cool_bin } else { cool_bin = 1000; } if (params.polishing_on) { polishing_on = params.polishing_on } else { polishing_on = false; } if (params.hifiasm_hic_on) { hifiasm_hic_on = params.hifiasm_hic_on } else { hifiasm_hic_on = false; } if (params.organelles_on) { organelles_on = params.organelles_on } else { organelles_on = false; } + +// Declare constants to toggle BUSCO for alts +set_busco_alts = true +unset_busco_alts = false + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONFIG FILES @@ -121,12 +126,6 @@ workflow GENOMEASSEMBLY { // RAW_ASSEMBLY.out.alternate_contigs.set{ haplotigs_ch } - // - // LOGIC: DECLARE CONSTANTS TO TOGGLE BUSCO FOR ALTS - // - set_busco_alts = true - unset_busco_alts = false - // // SUBWORKFLOW: CALCULATE STATISTICS FOR THE RAW ASSEMBLY // From b0a96715114911245d866d13e6c363b4c0f94d70 Mon Sep 17 00:00:00 2001 From: Ksenia Krasheninnikova Date: Wed, 14 Aug 2024 15:31:39 +0100 Subject: [PATCH 09/16] Fix warning and do linting checks --- conf/modules.config | 4 +- modules.json | 190 ++++++++--------------------- subworkflows/local/hic_bwamem2.nf | 17 ++- subworkflows/local/hic_minimap2.nf | 12 +- 4 files changed, 64 insertions(+), 159 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 964aebd0..b1a51e99 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -655,7 
+655,7 @@ process { ] } - // Scaffolding hap1/hap2 + // Scaffolding hap1/hap2 if (params.hifiasm_hic_on) { // hap1 scaffolding @@ -758,7 +758,7 @@ process { // End of hap1 scaffolding // hap2 scaffolding - + withName: '.*HIC_MAPPING_HAP2:SAMTOOLS_MARKDUP_HIC_MAPPING' { ext.prefix = { "${meta.id}_mkdup" } publishDir = [ diff --git a/modules.json b/modules.json index 9bf9d01d..1fd59692 100644 --- a/modules.json +++ b/modules.json @@ -8,344 +8,250 @@ "bcftools/concat": { "branch": "master", "git_sha": "582ff1755bdd205c65e2ba4c31e0a008dae299ec", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "bcftools/consensus": { "branch": "master", "git_sha": "fa12afdf5874c1d11e4a20efe81c97935e8eea24", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "bcftools/index": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "bcftools/norm": { "branch": "master", "git_sha": "0435e4eebc94e53721c194b2d5d06f455a79e407", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "bcftools/sort": { "branch": "master", "git_sha": "4a21e4cca35e72ec059abd67f790e0b192ce5d81", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "bcftools/view": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "bedtools/bamtobed": { "branch": "master", "git_sha": "9e51255c4f8ec69fb6ccf68593392835f14fecb8", - "installed_by": [ - "modules" - ], + "installed_by": ["modules"], "patch": "modules/nf-core/bedtools/bamtobed/bedtools-bamtobed.diff" }, "busco": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "bwamem2/index": { "branch": "master", "git_sha": "bfed129da5134b4439b1821c917972570d44d39c", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, 
"cat/cat": { "branch": "master", "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "cooler/cload": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "cooler/zoomify": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "custom/dumpsoftwareversions": { "branch": "master", "git_sha": "76cc4938c1f6ea5c7d83fed1eeffc146787f9543", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "fastk/fastk": { "branch": "master", "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "fastk/histex": { "branch": "master", "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "freebayes": { "branch": "master", "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", - "installed_by": [ - "modules" - ], + "installed_by": ["modules"], "patch": "modules/nf-core/freebayes/freebayes.diff" }, "gatk4/mergevcfs": { "branch": "master", "git_sha": "643756685546fa61f5c8fba439af746c090b9180", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "genescopefk": { "branch": "master", "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "gfastats": { "branch": "master", "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", - "installed_by": [ - "modules" - ], + "installed_by": ["modules"], "patch": "modules/nf-core/gfastats/gfastats.diff" }, "gunzip": { "branch": "master", "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "hifiasm": { "branch": "master", "git_sha": "aecb06fcdb995ff3e3df7c7a1fd119367d6d1996", - "installed_by": [ - 
"modules" - ], + "installed_by": ["modules"], "patch": "modules/nf-core/hifiasm/hifiasm.diff" }, "merquryfk/merquryfk": { "branch": "master", "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "minimap2/align": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": [ - "modules" - ], + "installed_by": ["modules"], "patch": "modules/nf-core/minimap2/align/minimap2-align.diff" }, "minimap2/index": { "branch": "master", "git_sha": "72e277acfd9e61a9f1368eafb4a9e83f5bcaa9f5", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "mitohifi/findmitoreference": { "branch": "master", "git_sha": "f52220e84bfc16a8616a5bb3d6f5bc67d601bdce", - "installed_by": [ - "modules" - ], + "installed_by": ["modules"], "patch": "modules/nf-core/mitohifi/findmitoreference/mitohifi-findmitoreference.diff" }, "mitohifi/mitohifi": { "branch": "master", "git_sha": "c607e74f7aa72eb7cb7cc0a1454f97d3907e8d84", - "installed_by": [ - "modules" - ], + "installed_by": ["modules"], "patch": "modules/nf-core/mitohifi/mitohifi/mitohifi-mitohifi.diff" }, "oatk": { "branch": "master", "git_sha": "d6a146325058eb9a18da2e898a2376e1d1093052", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "pretextmap": { "branch": "master", "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "pretextsnapshot": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": [ - "modules" - ], + "installed_by": ["modules"], "patch": "modules/nf-core/pretextsnapshot/pretextsnapshot.diff" }, "purgedups/calcuts": { "branch": "master", "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", - "installed_by": [ - "modules" - ], + "installed_by": ["modules"], "patch": "modules/nf-core/purgedups/calcuts/purgedups-calcuts.diff" }, "purgedups/getseqs": { "branch": "master", "git_sha": 
"0f8a77ff00e65eaeebc509b8156eaa983192474b", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "purgedups/pbcstat": { "branch": "master", "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "purgedups/purgedups": { "branch": "master", "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "purgedups/splitfa": { "branch": "master", "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "samtools/collate": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "samtools/faidx": { "branch": "master", "git_sha": "fd742419940e01ba1c5ecb172c3e32ec840662fe", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "samtools/fixmate": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "samtools/flagstat": { "branch": "master", "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "samtools/idxstats": { "branch": "master", "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "samtools/index": { "branch": "master", "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "samtools/markdup": { "branch": "master", "git_sha": "9e51255c4f8ec69fb6ccf68593392835f14fecb8", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "samtools/merge": { "branch": "master", "git_sha": "0460d316170f75f323111b4a2c0a2989f0c32013", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "samtools/sort": { "branch": "master", "git_sha": 
"0f8a77ff00e65eaeebc509b8156eaa983192474b", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "samtools/stats": { "branch": "master", "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "samtools/view": { "branch": "master", "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "seqtk/subseq": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": [ - "modules" - ], + "installed_by": ["modules"], "patch": "modules/nf-core/seqtk/subseq/seqtk-subseq.diff" }, "yahs": { "branch": "master", "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] } } } } } -} \ No newline at end of file +} diff --git a/subworkflows/local/hic_bwamem2.nf b/subworkflows/local/hic_bwamem2.nf index 8c917fdf..7d4f07ea 100755 --- a/subworkflows/local/hic_bwamem2.nf +++ b/subworkflows/local/hic_bwamem2.nf @@ -35,18 +35,17 @@ workflow HIC_BWAMEM2 { id: cram_id.id ], file(cram_info[0]), - cram_info[1], - cram_info[2], - cram_info[3], - cram_info[4], - cram_info[5], - cram_info[6], - bwa_path.toString() + '/' + ref_dir.toString().split('/')[-1], - ref_dir + cram_info[1], // crai path + cram_info[2], // chunk starting position + cram_info[3], // chunk end position + cram_info[4], // basename + cram_info[5], // the number of chunk + cram_info[6], // rgline + bwa_path.toString() + '/' + ref_dir.toString().split('/')[-1] ) } .set { ch_filtering_input } - ch_filtering_input.view() + // // MODULE: map hic reads by 10,000 container per time using bwamem2 // diff --git a/subworkflows/local/hic_minimap2.nf b/subworkflows/local/hic_minimap2.nf index 355b0843..f9b97f71 100755 --- a/subworkflows/local/hic_minimap2.nf +++ b/subworkflows/local/hic_minimap2.nf @@ -43,12 +43,12 @@ workflow HIC_MINIMAP2 { id: cram_id.id ], file(cram_info[0]), - 
cram_info[1], - cram_info[2], - cram_info[3], - cram_info[4], - cram_info[5], - cram_info[6], + cram_info[1], // crai path + cram_info[2], // chunk starting position + cram_info[3], // chunk end position + cram_info[4], // basename + cram_info[5], // the number of chunk + cram_info[6], // rgline mmi_path.toString(), ref_dir ) From fc3c7f90dace5c4e433e79b174258f6fde90ab8c Mon Sep 17 00:00:00 2001 From: Ksenia Krasheninnikova Date: Thu, 15 Aug 2024 16:12:11 +0100 Subject: [PATCH 10/16] Parametrize hap1/hap2 within meta for scaffolding --- conf/modules.config | 145 +++++------------------------- subworkflows/local/hic_mapping.nf | 9 +- subworkflows/local/scaffolding.nf | 12 ++- workflows/genomeassembly.nf | 15 ++-- 4 files changed, 45 insertions(+), 136 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index b1a51e99..21cb4af8 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -657,61 +657,60 @@ process { // Scaffolding hap1/hap2 if (params.hifiasm_hic_on) { - // hap1 scaffolding - withName: '.*HIC_MAPPING_HAP1:SAMTOOLS_MARKDUP_HIC_MAPPING' { + withName: '.*HIC_MAPPING_HAP.*:SAMTOOLS_MARKDUP_HIC_MAPPING' { ext.prefix = { "${meta.id}_mkdup" } publishDir = [ - path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_hap1" }, + path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_${meta.hap_id}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } - withName: '.*HIC_MAPPING_HAP1:BAMTOBED_SORT' { + withName: '.*HIC_MAPPING_HAP.*:BAMTOBED_SORT' { publishDir = [ - path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_hap1" }, + path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_${meta.hap_id}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } ] } - withName: '.*HIC_MAPPING_HAP1:CONVERT_STATS:SAMTOOLS_STATS' { + withName: '.*HIC_MAPPING_HAP.*:CONVERT_STATS:SAMTOOLS_STATS' { publishDir = [ - path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_hap1" }, + path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_${meta.hap_id}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } - withName: '.*HIC_MAPPING_HAP1:CONVERT_STATS:SAMTOOLS_FLAGSTAT' { + withName: '.*HIC_MAPPING_HAP.*:CONVERT_STATS:SAMTOOLS_FLAGSTAT' { publishDir = [ - path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_hap1" }, + path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_${meta.hap_id}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } - withName: '.*HIC_MAPPING_HAP1:CONVERT_STATS:SAMTOOLS_IDXSTATS' { + withName: '.*HIC_MAPPING_HAP.*:CONVERT_STATS:SAMTOOLS_IDXSTATS' { publishDir = [ - path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_hap1" }, + path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_${meta.hap_id}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } - withName: '.*SCAFFOLDING_HAP1:YAHS' { - ext.prefix = 'hap1' + withName: '.*SCAFFOLDING_HAP.*:YAHS' { + ext.prefix = { "${meta.hap_id}" } publishDir = [ - path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_hap1/yahs/out.break.yahs" }, + path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_${meta.hap_id}/yahs/out.break.yahs" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } ] } - withName: '.*SCAFFOLDING_HAP1:COOLER_CLOAD' { + withName: '.*SCAFFOLDING_HAP.*:COOLER_CLOAD' { // Positions in the input file are zero-based; // chrom1 field number (one-based) is 2; // pos1 field number (one-based) is 3; @@ -719,142 +718,43 @@ process { // pos2 field number (one-based) is 7 ext.args = 'pairs -0 -c1 2 -p1 3 -c2 6 -p2 7' publishDir = [ - path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_hap1/yahs/out.break.yahs" }, + path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_${meta.hap_id}/yahs/out.break.yahs" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } - withName: '.*SCAFFOLDING_HAP1:PRETEXTSNAPSHOT' { + withName: '.*SCAFFOLDING_HAP.*:PRETEXTSNAPSHOT' { // Make one plot containing all sequences ext.args = '--sequences \"=full\"' publishDir = [ - path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_hap1/yahs/out.break.yahs" }, + path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_${meta.hap_id}/yahs/out.break.yahs" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } - withName: '.*SCAFFOLDING_HAP1:JUICER_TOOLS_PRE' { + withName: '.*SCAFFOLDING_HAP.*:JUICER_TOOLS_PRE' { ext.juicer_tools_jar = 'juicer_tools.1.9.9_jcuda.0.8.jar' ext.juicer_jvm_params = '-Xms1g -Xmx6g' publishDir = [ - path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_hap1/yahs/out.break.yahs" }, + path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_${meta.hap_id}/yahs/out.break.yahs" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } ] } - withName: '.*SCAFFOLDING_HAP1:JUICER_PRE' { + withName: '.*SCAFFOLDING_HAP.*:JUICER_PRE' { ext.args2 = "LC_ALL=C sort -k2,2d -k6,6d -S50G | awk '\$3>=0 && \$7>=0'" publishDir = [ - path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_hap1/yahs/out.break.yahs" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - - - // End of hap1 scaffolding - - // hap2 scaffolding - - withName: '.*HIC_MAPPING_HAP2:SAMTOOLS_MARKDUP_HIC_MAPPING' { - ext.prefix = { "${meta.id}_mkdup" } - publishDir = [ - path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_hap2" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - - withName: '.*HIC_MAPPING_HAP2:BAMTOBED_SORT' { - publishDir = [ - path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_hap2" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - - - withName: '.*HIC_MAPPING_HAP2:CONVERT_STATS:SAMTOOLS_STATS' { - publishDir = [ - path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_hap2" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - - withName: '.*HIC_MAPPING_HAP2:CONVERT_STATS:SAMTOOLS_FLAGSTAT' { - publishDir = [ - path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_hap2" }, + path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_${meta.hap_id}/yahs/out.break.yahs" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } - - withName: '.*HIC_MAPPING_HAP2:CONVERT_STATS:SAMTOOLS_IDXSTATS' { - publishDir = [ - path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_hap2" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } - ] } - withName: '.*SCAFFOLDING_HAP2:YAHS' { - ext.prefix = 'hap2' - publishDir = [ - path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_hap2/yahs/out.break.yahs" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - - } - - withName: '.*SCAFFOLDING_HAP2:COOLER_CLOAD' { - // Positions in the input file are zero-based; - // chrom1 field number (one-based) is 2; - // pos1 field number (one-based) is 3; - // chrom2 field number (one-based) is 6; - // pos2 field number (one-based) is 7 - ext.args = 'pairs -0 -c1 2 -p1 3 -c2 6 -p2 7' - publishDir = [ - path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_hap2/yahs/out.break.yahs" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - - withName: '.*SCAFFOLDING_HAP2:PRETEXTSNAPSHOT' { - // Make one plot containing all sequences - ext.args = '--sequences \"=full\"' - publishDir = [ - path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_hap2/yahs/out.break.yahs" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - - withName: '.*SCAFFOLDING_HAP2:JUICER_TOOLS_PRE' { - ext.juicer_tools_jar = 'juicer_tools.1.9.9_jcuda.0.8.jar' - ext.juicer_jvm_params = '-Xms1g -Xmx6g' - publishDir = [ - path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_hap2/yahs/out.break.yahs" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - - withName: '.*SCAFFOLDING_HAP2:JUICER_PRE' { - ext.args2 = "LC_ALL=C sort -k2,2d -k6,6d -S50G | awk '\$3>=0 && \$7>=0'" - publishDir = [ - path: { "${params.outdir}/${meta.id}.${params.hifiasmhic}/scaffolding_hap2/yahs/out.break.yahs" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } - ] - } - } - // End of hap2 scaffolding + // End of hap1/hap2 scaffolding withName: '.*GENOME_STATISTICS_SCAFFOLDS_HAPS:GFASTATS_PRI' { ext.prefix = { "${meta.id}_scaffolds_final" } @@ -911,7 +811,6 @@ process { // End of Scaffolding hap1/hap2 // End of Set up of the scaffolding pipeline - //Set up of assembly stats subworkflow withName: 'BUSCO' { ext.args = "--mode genome" diff --git a/subworkflows/local/hic_mapping.nf b/subworkflows/local/hic_mapping.nf index c8786bad..a4d340c4 100644 --- a/subworkflows/local/hic_mapping.nf +++ b/subworkflows/local/hic_mapping.nf @@ -26,6 +26,7 @@ workflow HIC_MAPPING { reference_tuple // Channel [ val(meta), path(file) ] hic_reads_path // Channel [ val(meta), path(directory) ] hic_aligner_ch // Channel [ val(meta), val(hic_aligner)] + hap_id // Value hap_id main: ch_versions = Channel.empty() @@ -67,7 +68,8 @@ workflow HIC_MAPPING { bwamem2 : it[0].aligner == "bwamem2" } .set{ch_aligner} - + + // // SUBWORKFLOW: mapping hic reads using minimap2 // @@ -78,7 +80,7 @@ workflow HIC_MAPPING { ch_versions = ch_versions.mix( HIC_MINIMAP2.out.versions ) mappedbams = HIC_MINIMAP2.out.mappedbams - // + // // SUBWORKFLOW: mapping hic reads using bwamem2 // HIC_BWAMEM2 ( @@ -88,6 +90,9 @@ workflow HIC_MAPPING { ch_versions = ch_versions.mix( HIC_BWAMEM2.out.versions ) mappedbams = mappedbams.mix(HIC_BWAMEM2.out.mappedbams) + mappedbams.map{meta, bams -> [[id: meta.id, hap_id:hap_id], bams]} + .set { mappedbams } + // // LOGIC: GENERATE INDEX OF REFERENCE // diff --git a/subworkflows/local/scaffolding.nf b/subworkflows/local/scaffolding.nf index b15b4439..10f32266 100644 --- a/subworkflows/local/scaffolding.nf +++ b/subworkflows/local/scaffolding.nf @@ -16,6 +16,7 @@ workflow SCAFFOLDING { bed_in // tuple(meta, bed) fasta_in // tuple(meta, fasta) cool_bin // val: cooler cload parameter + hap_id // val: hap1/hap2/empty main: ch_versions = Channel.empty() @@ -37,9 +38,14 @@ workflow SCAFFOLDING { .set{ scaf_ref_fai } // + // 
LOGIC: MIX IN THE HAPLOTYPE ID TO CONTROL THE OUTPUT SUFFIX + // + bed_in.map{ meta, bed -> [[id:meta.id, hap_id:hap_id],bed] } + .set{ bed_in_hap } + // // MODULE: PERFORM SCAAFFOLDING WITH YAHS // - YAHS( bed_in, scaf_ref, scaf_ref_fai ) + YAHS( bed_in_hap , scaf_ref, scaf_ref_fai ) ch_versions = ch_versions.mix(YAHS.out.versions) // @@ -59,7 +65,7 @@ workflow SCAFFOLDING { YAHS.out.binary.join(YAHS.out.scaffolds_agp) .combine(scaf_ref) .combine(scaf_ref_fai) - .map{meta, binary, agp, fa, fai -> [meta, binary, agp, fai]} + .map{meta, binary, agp, fa, fai -> [[id:meta.id, hap_id:hap_id], binary, agp, fai]} .set{ch_merge} // @@ -71,7 +77,7 @@ workflow SCAFFOLDING { // // LOGIC: BIN CONTACT PAIRS // - JUICER_PRE.out.pairs.join(bed_in) + JUICER_PRE.out.pairs.join(bed_in_hap) .combine(Channel.of(cool_bin)) .set{ch_juicer} diff --git a/workflows/genomeassembly.nf b/workflows/genomeassembly.nf index 14fe3c7a..2e5e3899 100644 --- a/workflows/genomeassembly.nf +++ b/workflows/genomeassembly.nf @@ -340,13 +340,13 @@ workflow GENOMEASSEMBLY { // // SUBWORKFLOW: MAP HIC DATA TO THE PRIMARY ASSEMBLY // - HIC_MAPPING ( primary_contigs_ch,crams_ch,hic_aligner_ch ) + HIC_MAPPING ( primary_contigs_ch,crams_ch,hic_aligner_ch, "") ch_versions = ch_versions.mix(HIC_MAPPING.out.versions) // // SUBWORKFLOW: SCAFFOLD THE PRIMARY ASSEMBLY // - SCAFFOLDING( HIC_MAPPING.out.bed, primary_contigs_ch, cool_bin ) + SCAFFOLDING( HIC_MAPPING.out.bed, primary_contigs_ch, cool_bin, "") ch_versions = ch_versions.mix(SCAFFOLDING.out.versions) // @@ -367,37 +367,36 @@ workflow GENOMEASSEMBLY { unset_busco_alts ) - if ( hifiasm_hic_on ) { // // SUBWORKFLOW: MAP HIC DATA TO THE HAP1 CONTIGS // - HIC_MAPPING_HAP1 ( RAW_ASSEMBLY.out.hap1_hic_contigs, crams_ch, hic_aligner_ch ) + HIC_MAPPING_HAP1 ( RAW_ASSEMBLY.out.hap1_hic_contigs, crams_ch, hic_aligner_ch, 'hap1' ) ch_versions = ch_versions.mix(HIC_MAPPING_HAP1.out.versions) // // SUBWORKFLOW: SCAFFOLD HAP1 // - SCAFFOLDING_HAP1( 
HIC_MAPPING_HAP1.out.bed, RAW_ASSEMBLY.out.hap1_hic_contigs, cool_bin ) + SCAFFOLDING_HAP1( HIC_MAPPING_HAP1.out.bed, RAW_ASSEMBLY.out.hap1_hic_contigs, cool_bin, 'hap1' ) ch_versions = ch_versions.mix(SCAFFOLDING_HAP1.out.versions) // // SUBWORKFLOW: MAP HIC DATA TO THE HAP2 CONTIGS // - HIC_MAPPING_HAP2 ( RAW_ASSEMBLY.out.hap2_hic_contigs, crams_ch, hic_aligner_ch ) + HIC_MAPPING_HAP2 ( RAW_ASSEMBLY.out.hap2_hic_contigs, crams_ch, hic_aligner_ch, 'hap2' ) ch_versions = ch_versions.mix(HIC_MAPPING_HAP2.out.versions) // // SUBWORKFLOW: SCAFFOLD HAP2 // - SCAFFOLDING_HAP2( HIC_MAPPING_HAP2.out.bed, RAW_ASSEMBLY.out.hap2_hic_contigs, cool_bin ) + SCAFFOLDING_HAP2( HIC_MAPPING_HAP2.out.bed, RAW_ASSEMBLY.out.hap2_hic_contigs, cool_bin, 'hap2' ) ch_versions = ch_versions.mix(SCAFFOLDING_HAP2.out.versions) // // LOGIC: CREATE A CHANNEL FOR THE FULL HAP1/HAP2 ASSEMBLY // SCAFFOLDING_HAP1.out.fasta.combine(SCAFFOLDING_HAP2.out.fasta) - .map{meta_s, fasta_s, meta_h, fasta_h -> [ meta_h, fasta_s, fasta_h ]} + .map{meta_s, fasta_s, meta_h, fasta_h -> [ [id:meta_h.id], fasta_s, fasta_h ]} .set{ stats_haps_input_ch } // From 3559a525d556e7af05771392520e8a30dd5edb06 Mon Sep 17 00:00:00 2001 From: Ksenia Krasheninnikova Date: Thu, 15 Aug 2024 16:29:15 +0100 Subject: [PATCH 11/16] Merge conflict --- subworkflows/local/hic_mapping.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/hic_mapping.nf b/subworkflows/local/hic_mapping.nf index a4d340c4..05104a78 100644 --- a/subworkflows/local/hic_mapping.nf +++ b/subworkflows/local/hic_mapping.nf @@ -43,7 +43,7 @@ workflow HIC_MAPPING { reference_tuple .join( hic_reads_path ) .map { meta, ref, hic_reads_path -> - tuple([ id: meta.id, single_end: true], hic_reads_path, hic_reads_path.collect { p -> p.resolveSibling(p.name + ".crai") } ) } + tuple([ id: meta.id, hap_id: hap_id, single_end: true], hic_reads_path, hic_reads_path.collect { p -> p.resolveSibling(p.name + ".crai") } ) } .set { 
get_reads_input } // From 18958615b35683e10f55fd0abb02e8a66b140fc7 Mon Sep 17 00:00:00 2001 From: Ksenia Krasheninnikova Date: Thu, 29 Aug 2024 16:06:31 +0100 Subject: [PATCH 12/16] Environment file for hifiasm module --- modules/nf-core/hifiasm/environment.yml | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 modules/nf-core/hifiasm/environment.yml diff --git a/modules/nf-core/hifiasm/environment.yml b/modules/nf-core/hifiasm/environment.yml new file mode 100644 index 00000000..3aa0fd5b --- /dev/null +++ b/modules/nf-core/hifiasm/environment.yml @@ -0,0 +1,8 @@ +name: hifiasm +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::hifiasm=0.19.8 + - bioconda::samtools=1.20 From eaa8851208b8bec85d2d3f9e7bfee89772b4d3ab Mon Sep 17 00:00:00 2001 From: Ksenia Date: Thu, 5 Sep 2024 13:28:52 +0100 Subject: [PATCH 13/16] Update usage.md Update test yaml --- docs/usage.md | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index 83b56492..c1210912 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -59,20 +59,27 @@ Example is based on [test.yaml](../assets/test.yaml). 
dataset: id: baUndUnlc1 illumina_10X: - reads: /lustre/scratch123/tol/resources/nextflow/test-data/Undibacterium_unclassified/genomic_data/baUndUnlc1/10x/ + reads: + - https://tolit.cog.sanger.ac.uk/test-data/Undibacterium_unclassified/genomic_data/baUndUnlc1/10x/baUndUnlc1_S12_L002_R1_001.fastq.gz + - https://tolit.cog.sanger.ac.uk/test-data/Undibacterium_unclassified/genomic_data/baUndUnlc1/10x/baUndUnlc1_S12_L002_R2_001.fastq.gz + - https://tolit.cog.sanger.ac.uk/test-data/Undibacterium_unclassified/genomic_data/baUndUnlc1/10x/baUndUnlc1_S12_L002_I1_001.fastq.gz pacbio: reads: - - reads: /lustre/scratch123/tol/resources/nextflow/test-data/Undibacterium_unclassified/genomic_data/baUndUnlc1/pacbio/fasta/HiFi.reads.fasta + - reads: https://tolit.cog.sanger.ac.uk/test-data/Undibacterium_unclassified/genomic_data/baUndUnlc1/pacbio/fasta/HiFi.reads.fasta HiC: reads: - - reads: /lustre/scratch123/tol/resources/nextflow/test-data/Undibacterium_unclassified/genomic_data/baUndUnlc1/hic-arima2/41741_2#7.sub.cram + - reads: https://tolit.cog.sanger.ac.uk/test-data/Undibacterium_unclassified/genomic_data/baUndUnlc1/hic-arima2/41741_2%237.sub.cram hic_motif: GATC,GANTC,CTNAG,TTAA +hic_aligner: bwamem2 busco: lineage: bacteria_odb10 mito: species: Caradrina clavipalpis min_length: 15000 code: 5 + fam: https://github.com/c-zhou/OatkDB/raw/main/v20230921/insecta_mito.fam +plastid: + fam: https://github.com/c-zhou/OatkDB/raw/main/v20230921/acrogymnospermae_pltd.fam ``` From 42fee8e6f38368674b915652f3e37df9ca89b0f6 Mon Sep 17 00:00:00 2001 From: Ksenia Date: Thu, 5 Sep 2024 15:27:52 +0100 Subject: [PATCH 14/16] Update output.md --- docs/output.md | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/docs/output.md b/docs/output.md index 5a92b0b9..1da8d5c0 100644 --- a/docs/output.md +++ b/docs/output.md @@ -4,7 +4,14 @@ This document describes the output produced by the genomeassembly pipeline. 
-The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory. +The standard assembly pipeline contains running hifiasm on the HiFi reads, purging the primary contigs with purge_dups, and scaffolding them up with YaHS. +Optionally, if Illumina 10X data is provided, the purged contigs and haplotigs can be polished. + +In case of a diploid genome when HiFi and HiC data is coming from the same individual addtionally hifiasm can be run in HiC mode to produce a phased assembly. In that case the produced haplotypes are not purged but scaffolded up directly with YaHS. + +Optionally, the organelles assembly can be triggered. The mitochondrion and (if relevant) plastid sequences are produced using MitoHiFi and OATK. + +The directories listed below will be created in the --outdir directory after the pipeline has finished. All paths are relative to the top-level --outdir directory. ## Subworkflows @@ -43,13 +50,16 @@ This subworkflow generates a KMER database and coverage model used in [PURGE_DUP - primary assembly in GFA and FASTA format; for more details refer to [hifiasm output](https://hifiasm.readthedocs.io/en/latest/interpreting-output.html) - .\*hifiasm.\*/.*a_ctg.[g]fa - haplotigs in GFA and FASTA format; for more details refer to [hifiasm output](https://hifiasm.readthedocs.io/en/latest/interpreting-output.html) + - .\*hifiasm-hic.\*/.*hap1.p_ctg.[g]fa + - fully phased hap1 if hifiasm is run in HiC mode; for more details refer to [hifiasm output](https://hifiasm.readthedocs.io/en/latest/interpreting-output.html) + - .\*hifiasm-hic.\*/.*hap2.p_ctg.[g]fa + - fully phased hap2 if hifiasm is run in HiC mode; for more details refer to [hifiasm output](https://hifiasm.readthedocs.io/en/latest/interpreting-output.html) - .\*hifiasm.\*/.*bin - internal binary hifiasm files; for more details refer [here](https://hifiasm.readthedocs.io/en/latest/faq.html#id12) This subworkflow 
generates a raw assembly(-ies). First, hifiasm is run on the input HiFi reads then raw contigs are converted from GFA into FASTA format, this assembly is due to purging, polishing (optional) and scaffolding further down the pipeline. -In case hifiasm HiC mode is switched on, it is performed as an extra step with results stored in hifiasm-hic folder.

![Raw assembly subworkflow](images/v1/raw_assembly.png) @@ -68,6 +78,7 @@ In case hifiasm HiC mode is switched on, it is performed as an extra step with r Retained haplotype is identified in primary assembly. The alternate contigs are updated correspondingly. The subworkflow relies on kmer coverage model to identify coverage thresholds. For more details see [purge_dups](https://github.com/dfguan/purge_dups) +The two haplotype assemblies produced by hifiasm in HiC mode are not purged.

@@ -98,9 +109,9 @@ This subworkflow uses read mapping of the Illumina 10X short read data to fix sh
Output files - - \*.hifiasm..\*/scaffolding/.*_merged_sorted.bed + - \*.hifiasm.\*/scaffolding[_hap1/_hap2/^$]/.*_merged_sorted.bed - bed file obtained from merged mkdup bam - - \*.hifiasm..\*/scaffolding/.*mkdup.bam + - \*.hifiasm.\*/scaffolding[_hap1/_hap2/^$]/.*mkdup.bam - final read mapping bam with mapped reads
@@ -113,11 +124,11 @@ This subworkflow implements alignment of the Illumina HiC short reads to the pri
Output files - - \*.hifiasm..\*/scaffolding/.*.stats + - \*.hifiasm.\*/scaffolding[_hap1/_hap2/^$]/.*.stats - output of samtools stats - - \*.hifiasm..\*/scaffolding/.*.idxstats + - \*.hifiasm.\*/scaffolding[_hap1/_hap2/^$]/.*.idxstats - output of samtools idxstats - - \*.hifiasm..\*/scaffolding/.*.flagstat + - \*.hifiasm.\*/scaffolding[_hap1/_hap2/^$]/.*.flagstat - output of samtools flagstat
@@ -128,17 +139,17 @@ This subworkflow produces statistcs for a bam file containing read mapping. It i
Output files - - \*.hifiasm..\*/scaffolding/yahs/out.break.yahs/out_scaffolds_final.fa + - \*.hifiasm.\*/scaffolding[_hap1/_hap2/^$]/yahs/out.break.yahs/out_scaffolds_final.fa - scaffolds in FASTA format - - \*.hifiasm..\*/scaffolding/yahs/out.break.yahs/out_scaffolds_final.agp + - \*.hifiasm.\*/scaffolding[_hap1/_hap2/^$]/yahs/out.break.yahs/out_scaffolds_final.agp - coordinates of contigs relative to scaffolds - - \*.hifiasm..\*/scaffolding/yahs/out.break.yahs/alignments_sorted.txt + - \*.hifiasm.\*/scaffolding[_hap1/_hap2/^$]/yahs/out.break.yahs/alignments_sorted.txt - Alignments for Juicer in text format - - \*.hifiasm..\*/scaffolding/yahs/out.break.yahs/yahs_scaffolds.hic + - \*.hifiasm.\*/scaffolding[_hap1/_hap2/^$]/yahs/out.break.yahs/yahs_scaffolds.hic - Juicer HiC map - - \*.hifiasm..\*/scaffolding/yahs/out.break.yahs/*cool + - \*.hifiasm.\*/scaffolding[_hap1/_hap2/^$]/yahs/out.break.yahs/*cool - HiC map for cooler - - \*.hifiasm..\*/scaffolding/yahs/out.break.yahs/*.FullMap.png + - \*.hifiasm.\*/scaffolding[_hap1/_hap2/^$]/yahs/out.break.yahs/*.FullMap.png - Pretext snapshot
From 951a321dfcdd39fa183d1af7dba5ef0abf65f3ec Mon Sep 17 00:00:00 2001 From: Ksenia Krasheninnikova Date: Thu, 5 Sep 2024 15:33:32 +0100 Subject: [PATCH 15/16] Prettier fix --- docs/output.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/output.md b/docs/output.md index 1da8d5c0..f753555e 100644 --- a/docs/output.md +++ b/docs/output.md @@ -9,7 +9,7 @@ Optionally, if Illumina 10X data is provided, the purged contigs and haplotigs c In case of a diploid genome when HiFi and HiC data is coming from the same individual addtionally hifiasm can be run in HiC mode to produce a phased assembly. In that case the produced haplotypes are not purged but scaffolded up directly with YaHS. -Optionally, the organelles assembly can be triggered. The mitochondrion and (if relevant) plastid sequences are produced using MitoHiFi and OATK. +Optionally, the organelles assembly can be triggered. The mitochondrion and (if relevant) plastid sequences are produced using MitoHiFi and OATK. The directories listed below will be created in the --outdir directory after the pipeline has finished. All paths are relative to the top-level --outdir directory. From cbc319e62049f381c296f31d559a0c21f651b84f Mon Sep 17 00:00:00 2001 From: Ksenia Date: Fri, 6 Sep 2024 13:14:37 +0100 Subject: [PATCH 16/16] Update output.md --- docs/output.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/output.md b/docs/output.md index f753555e..fedc59bd 100644 --- a/docs/output.md +++ b/docs/output.md @@ -7,7 +7,7 @@ This document describes the output produced by the genomeassembly pipeline. The standard assembly pipeline contains running hifiasm on the HiFi reads, purging the primary contigs with purge_dups, and scaffolding them up with YaHS. Optionally, if Illumina 10X data is provided, the purged contigs and haplotigs can be polished. 
-In case of a diploid genome when HiFi and HiC data is coming from the same individual addtionally hifiasm can be run in HiC mode to produce a phased assembly. In that case the produced haplotypes are not purged but scaffolded up directly with YaHS. +In case of a diploid genome where the HiFi and HiC data come from the same individual, an additional hifiasm run in HiC mode produces two balanced, fully phased haplotypes. These haplotypes are not purged but are scaffolded directly with YaHS. Optionally, the organelles assembly can be triggered. The mitochondrion and (if relevant) plastid sequences are produced using MitoHiFi and OATK.