Skip to content

Commit

Permalink
Update benchmark_load workflow
Browse files Browse the repository at this point in the history
  • Loading branch information
robomics committed Feb 8, 2024
1 parent ce8b4bd commit 2d8da61
Show file tree
Hide file tree
Showing 2 changed files with 194 additions and 25 deletions.
12 changes: 8 additions & 4 deletions benchmarks/configs/benchmark_load.config
Original file line number Diff line number Diff line change
Expand Up @@ -13,17 +13,21 @@ params {

replicates = 3
resolutions = [10, 100, 500, 1000, 5000, 10000, 25000, 50000, 100000]
cpu_cores = [1, 16]
}

process {
container = 'ghcr.io/paulsengroup/2023-hictk-paper/hictk-bench:2.0.0'
// withName:hictk_load {
// stageInMode = 'copy'
// }
// stageInMode = 'copy'
withName:preprocess_pairs_gz {
container = 'ghcr.io/paulsengroup/2023-hictk-paper/hic-tools:3.30.00'
}
withName:cooler_cload {
// stageInMode = 'copy'
container = 'ghcr.io/paulsengroup/2023-hictk-paper/cooler-bench:1.0.0'
}
withName:hictools_pre {
container = 'ghcr.io/paulsengroup/2023-hictk-paper/hic-tools:3.30.00'
}
withName:summarize {
container = 'ghcr.io/paulsengroup/2023-hictk-paper/plotting:1.0.0'
}
Expand Down
207 changes: 186 additions & 21 deletions benchmarks/workflows/benchmark_load.nf
Original file line number Diff line number Diff line change
Expand Up @@ -8,38 +8,71 @@ nextflow.enable.dsl=2
workflow {
Channel.fromPath(params.pairs_file, checkIfExists: true).set { pairs }
Channel.of(params.resolutions).flatten().set { resolutions }
Channel.of(params.cpu_cores).flatten().set { cpus }
task_ids = Channel.of((1..params.replicates).toList()).flatten()

preprocess_pairs(
preprocess_pairs_zst(
pairs
)

preprocess_pairs.out.pairs
.set { pairs_filtered }
preprocess_pairs_gz(
pairs
)

preprocess_pairs_zst.out.pairs
.set { pairs_filtered_zst }
preprocess_pairs_gz.out.pairs
.set { pairs_filtered_gz }

task_ids.combine(pairs_filtered_zst)
.combine(resolutions)
.set { tasks_cooler }

task_ids.combine(pairs_filtered_zst)
.combine(resolutions)
.set { tasks_hictk_cool }

task_ids.combine(pairs_filtered)
task_ids.combine(pairs_filtered_zst)
.combine(resolutions)
.set { tasks }
.combine(cpus)
.set { tasks_hictk_hic }

hictk_load(
tasks,
task_ids.combine(pairs_filtered_gz)
.combine(resolutions)
.combine(cpus)
.set { tasks_hictools }

hictk_load_cool(
tasks_hictk_cool,
file(params.chrom_sizes)
)

hictk_load_hic(
tasks_hictk_hic,
file(params.chrom_sizes)
)

cooler_cload(
tasks,
tasks_cooler,
file(params.chrom_sizes)
)

hictools_pre(
tasks_hictools,
file(params.chrom_sizes)
)

summarize(
hictk_load.out.tsv
hictk_load_cool.out.tsv
.mix(hictk_load_hic.out.tsv)
.mix(cooler_cload.out.tsv)
.map { it[4] }
.mix(hictools_pre.out.tsv)
.map { it[5] }
.collect()
)
}

process preprocess_pairs {
process preprocess_pairs_zst {
label 'process_high'

input:
Expand All @@ -59,8 +92,28 @@ process preprocess_pairs {
'''
}

process hictk_load {
publishDir "${params.outdir}/hictk/", mode: 'copy'
// Re-compress a gzip-compressed pairs file, keeping only the lines that match
// the grep filter below. The result is emitted on the `pairs` output channel
// as "<simpleName>_filtered.pairs.gz".
process preprocess_pairs_gz {
// Resource label; the actual CPU/memory values are defined outside this file.
label 'process_high'

input:
// Read with `zcat`, so the input is expected to be gzip-compressed.
path pairs

output:
path "*.pairs.gz", emit: pairs

shell:
// Output file name, derived from the input name with all extensions stripped.
outname="${pairs.simpleName}_filtered.pairs.gz"
// Pipeline: decompress -> keep lines matching the (unanchored) PCRE
//   chr[\dXY]+ <ws> \d+ <ws> chr[\dXY]+ <ws>
// -> re-compress with pigz at level 9 using `task.cpus` threads.
// Backslashes are doubled because Groovy processes escapes in ''' strings.
// NOTE(review): with `pipefail`, grep exiting 1 (no line matched) fails the
// whole pipeline — presumably the desired behaviour for an empty result.
'''
set -o pipefail
zcat '!{pairs}' |
grep -P 'chr[\\dXY]+\\s\\d+\\schr[\\dXY]+\\s' |
pigz -p '!{task.cpus}' -9 > '!{outname}'
'''
}

process hictk_load_cool {
publishDir "${params.outdir}/hictk/cool", mode: 'copy'

cpus 1
memory 36.GB
Expand All @@ -77,17 +130,18 @@ process hictk_load {
output:
tuple val(id),
val("hictk"),
val("pairs"),
val("cooler"),
val(resolution),
val(task.cpus),
path("*.tsv"), emit: tsv

shell:
outname="${id}__${pairs.simpleName}__hictk__pairs__${resolution}.tsv"
outname="${id}__${pairs.simpleName}__hictk__cool__${resolution}__${task.cpus}.tsv"
'''
set -o pipefail
printf 'tool\\tformat\\tresolution\\ttime\\tmemory\\tsize\\n' > '!{outname}'
printf 'hictk\\tpairs\\t!{resolution}\\t' >> '!{outname}'
printf 'tool\\tformat\\tresolution\\tcpus\\ttime\\tmemory\\tsize\\n' > '!{outname}'
printf 'hictk\\tcool\\t!{resolution}\\t!{task.cpus}\\t' >> '!{outname}'
mkdir tmp/
export TMPDIR="$PWD/tmp"
Expand All @@ -110,6 +164,60 @@ process hictk_load {
'''
}

// Benchmark `hictk load` writing a .hic file ("out.hic") from a
// zstd-compressed pairs file at a single resolution.
// Produces a two-line TSV (header + one record) with the columns
//   tool, format, resolution, cpus, time, memory, size
// where time/memory come from `time -f '%e\t%M'` and size from `du -b out.hic`.
process hictk_load_hic {
publishDir "${params.outdir}/hictk/hic", mode: 'copy'

// NOTE(review): the CPU request is fixed to 1 here, while the `cpus` input
// value below is only written to the TSV and the output file name; it is not
// passed to `hictk load`. Confirm whether a thread option should be forwarded.
cpus 1
memory 36.GB
label 'process_long'

tag "${pairs.fileName}_${resolution}_${id}"

input:
// (replicate id, pairs file, bin size, CPU count to record)
tuple val(id),
path(pairs),
val(resolution),
val(cpus)
path chrom_sizes

output:
// The TSV is the 6th tuple element (index 5).
tuple val(id),
val("hictk"),
val("hic"),
val(resolution),
val(cpus),
path("*.tsv"), emit: tsv

shell:
// One TSV per (replicate, input, resolution, cpus) combination.
outname="${id}__${pairs.simpleName}__hictk__hic__${resolution}__${cpus}.tsv"
// Script outline:
//  1. write the header and the first four fields of the record (no newline);
//  2. point TMPDIR at a task-local tmp/ directory;
//  3. run `hictk load` under `command time`, appending (-a) "%e\t%M" to the TSV;
//     the pairs are streamed on stdin from `zstdcat`;
//  4. strip the newline written by time, then append the size of out.hic in bytes.
'''
set -o pipefail
printf 'tool\\tformat\\tresolution\\tcpus\\ttime\\tmemory\\tsize\\n' > '!{outname}'
printf 'hictk\\thic\\t!{resolution}\\t!{cpus}\\t' >> '!{outname}'
mkdir tmp/
export TMPDIR="$PWD/tmp"
command time -f '%e\\t%M' \\
-o '!{outname}' \\
-a \\
hictk load '!{chrom_sizes}' \\
'!{resolution}' \\
'out.hic' \\
--format 4dn \\
--assume-unsorted \\
--batch-size 50000000 \\
--verbosity=1 \\
< <(zstdcat '!{pairs}')
truncate -s -1 '!{outname}' # Remove newline
printf '\\t%d\\n' "$(du -b out.hic | cut -f 1)" >> '!{outname}'
'''
}


process cooler_cload {
publishDir "${params.outdir}/cooler/", mode: 'copy'

Expand All @@ -128,17 +236,18 @@ process cooler_cload {
output:
tuple val(id),
val("hictk"),
val("pairs"),
val("cooler"),
val(resolution),
val(task.cpus),
path("*.tsv"), emit: tsv

shell:
outname="${id}__${pairs.simpleName}__cooler__pairs__${resolution}.tsv"
outname="${id}__${pairs.simpleName}__cooler__cooler__${resolution}__${task.cpus}.tsv"
'''
set -o pipefail
printf 'tool\\tformat\\tresolution\\ttime\\tmemory\\tsize\\n' > '!{outname}'
printf 'cooler\\tpairs\\t!{resolution}\\t' >> '!{outname}'
printf 'tool\\tformat\\tresolution\\tcpus\\ttime\\tmemory\\tsize\\n' > '!{outname}'
printf 'cooler\\tcool\\t!{resolution}\\t!{task.cpus}\\t' >> '!{outname}'
command time -f '%e\\t%M' \\
-o '!{outname}' \\
Expand All @@ -159,6 +268,62 @@ process cooler_cload {
'''
}


// Benchmark HiCTools "pre" (through the run_hic_tools_pre.sh wrapper) writing
// a .hic file ("out.hic") from a pairs file at a single resolution.
// Produces a two-line TSV (header + one record) with the columns
//   tool, format, resolution, cpus, time, memory, size
// where time/memory come from `time -f '%e\t%M'` and size from `du -b out.hic`.
//
// NOTE(review): `chrom_sizes` is staged but not referenced by the script, and
// no `cpus` directive is set even though `cpus` is forwarded to the wrapper —
// confirm both are intentional.
process hictools_pre {
    publishDir "${params.outdir}/hictools/", mode: 'copy'

    memory 650.GB
    label 'process_very_long'

    tag "${pairs.simpleName}"

    input:
        // (replicate id, pairs file, bin size, CPU count forwarded to the wrapper)
        tuple val(id),
              path(pairs),
              val(resolution),
              val(cpus)
        path chrom_sizes

    output:
        // The TSV is the 6th tuple element (index 5).
        tuple val(id),
              val("hictools"),
              val("hic"),
              val(resolution),
              val(cpus),
              path("*.tsv"), emit: tsv

    shell:
        // One TSV per (replicate, input, resolution, cpus) combination.
        outname="${id}__${pairs.simpleName}__hictools__hic__${resolution}__${cpus}.tsv"
        // Task memory in whole GiB; passed to the wrapper as "<N>G".
        memory_gb=task.memory.toGiga()
        // Script outline:
        //  1. write the header and the first four fields of the record (no newline);
        //  2. point TMPDIR at a task-local tmp/ directory;
        //  3. run the wrapper under `command time`, appending (-a) "%e\t%M" to the TSV;
        //  4. strip the newline written by time, then append the size of out.hic in bytes.
        // A stray, incomplete `java -jar -Xmx` command that used to follow the
        // timed run was removed: it could only exit with an error.
        '''
        set -o pipefail

        printf 'tool\\tformat\\tresolution\\tcpus\\ttime\\tmemory\\tsize\\n' > '!{outname}'
        printf 'hictools\\thic\\t!{resolution}\\t!{cpus}\\t' >> '!{outname}'

        mkdir tmp/
        export TMPDIR="$PWD/tmp"

        command time -f '%e\\t%M' \\
                     -o '!{outname}' \\
                     -a \\
            run_hic_tools_pre.sh \\
                '!{pairs}' \\
                out.hic \\
                "$TMPDIR" \\
                "$HICTOOLS_JAR" \\
                '!{resolution}' \\
                '!{cpus}' \\
                !{memory_gb}G

        truncate -s -1 '!{outname}'  # Remove newline
        printf '\\t%d\\n' "$(du -b out.hic | cut -f 1)" >> '!{outname}'
        '''
}

process summarize {
publishDir "${params.outdir}/", mode: 'copy'

Expand Down

0 comments on commit 2d8da61

Please sign in to comment.