Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement hictk fix-mcool #68

Merged
merged 5 commits into from
Sep 30, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
Initial implementation of hictk fix-mcool
  • Loading branch information
robomics committed Sep 29, 2023
commit e03bab3e9ca6ce3c0e2f96ffb0ebacff071a0bca
2 changes: 2 additions & 0 deletions src/hictk/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ target_sources(
${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_balance.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_convert.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_dump.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_fix_mcool.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_load.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_merge.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_validate.cpp
Expand All @@ -28,6 +29,7 @@ target_sources(
${CMAKE_CURRENT_SOURCE_DIR}/convert/cool_to_hic.cpp
${CMAKE_CURRENT_SOURCE_DIR}/convert/hic_to_cool.cpp
${CMAKE_CURRENT_SOURCE_DIR}/dump/dump.cpp
${CMAKE_CURRENT_SOURCE_DIR}/fix_mcool/fix_mcool.cpp
${CMAKE_CURRENT_SOURCE_DIR}/load/load.cpp
${CMAKE_CURRENT_SOURCE_DIR}/merge/merge.cpp
${CMAKE_CURRENT_SOURCE_DIR}/validate/validate.cpp
Expand Down
2 changes: 1 addition & 1 deletion src/hictk/balance/balance.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ static void write_weights_cooler(std::string_view uri, const BalanceConfig& c,
const std::vector<double>& scale) {
const auto& [file, grp] = cooler::parse_cooler_uri(uri);
const auto path = fmt::format(FMT_STRING("{}/bins/{}"), grp, c.name);
SPDLOG_INFO(FMT_STRING("Writing weights to {}::{}..."), uri, path);
SPDLOG_INFO(FMT_STRING("Writing weights to {}::{}..."), file, path);

const HighFive::File clr(file, HighFive::File::ReadWrite);

Expand Down
11 changes: 11 additions & 0 deletions src/hictk/cli/cli.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ auto Cli::parse_arguments() -> Config {
_subcommand = subcommand::convert;
} else if (_cli.get_subcommand("dump")->parsed()) {
_subcommand = subcommand::dump;
} else if (_cli.get_subcommand("fix-mcool")->parsed()) {
_subcommand = subcommand::fix_mcool;
} else if (_cli.get_subcommand("load")->parsed()) {
_subcommand = subcommand::load;
} else if (_cli.get_subcommand("merge")->parsed()) {
Expand Down Expand Up @@ -77,6 +79,8 @@ std::string_view Cli::subcommand_to_str(subcommand s) noexcept {
return "convert";
case dump:
return "dump";
case fix_mcool:
return "fix-mcool";
case load:
return "load";
case merge:
Expand All @@ -100,6 +104,7 @@ void Cli::make_cli() {
make_balance_subcommand();
make_convert_subcommand();
make_dump_subcommand();
make_fix_mcool_subcommand();
make_load_subcommand();
make_merge_subcommand();
make_validate_subcommand();
Expand All @@ -117,6 +122,9 @@ void Cli::validate_args() const {
case dump:
validate_dump_subcommand();
break;
case fix_mcool:
validate_fix_mcool_subcommand();
break;
case load:
validate_load_subcommand();
break;
Expand Down Expand Up @@ -144,6 +152,9 @@ void Cli::transform_args() {
case dump:
transform_args_dump_subcommand();
break;
case fix_mcool:
transform_args_fix_mcool_subcommand();
break;
case load:
transform_args_load_subcommand();
break;
Expand Down
8 changes: 4 additions & 4 deletions src/hictk/cli/cli_convert.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,10 +91,10 @@ void Cli::make_convert_subcommand() {
->check(CLI::Range(1, 4))
->capture_default_str();
sc.add_option(
"-p,--processes",
c.processes,
"Maximum number of parallel processes to spawn.\n"
"When converting from hic to cool, only two processes will be used.")
"-t,--threads",
c.threads,
"Maximum number of parallel threads to spawn.\n"
"When converting from hic to cool, only two threads will be used.")
->check(CLI::Range(std::uint32_t(2), std::thread::hardware_concurrency()))
->capture_default_str();
sc.add_option(
Expand Down
143 changes: 143 additions & 0 deletions src/hictk/cli/cli_fix_mcool.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
// Copyright (C) 2023 Roberto Rossini <roberros@uio.no>
//
// SPDX-License-Identifier: MIT

#include <fmt/format.h>
#include <fmt/std.h>

#include <CLI/CLI.hpp>
#include <cassert>
#include <cstdint>
#include <string>
#include <thread>

#include "hictk/tools/cli.hpp"
#include "hictk/tools/config.hpp"

namespace hictk::tools {

/// Register the `fix-mcool` subcommand and bind its CLI options to a FixMcoolConfig.
///
/// Options are bound by reference to the fields of the FixMcoolConfig stored in `_config`,
/// which is why `_config` is set to a default-constructed FixMcoolConfig before any option
/// is added. The preparse callback asserts that `_config` holds its first alternative and
/// then stores a fresh FixMcoolConfig in it.
void Cli::make_fix_mcool_subcommand() {
  auto& sc = *_cli.add_subcommand("fix-mcool", "Fix corrupted .mcool files.")
                  ->fallthrough()
                  ->preparse_callback([this]([[maybe_unused]] std::size_t i) {
                    assert(_config.index() == 0);
                    _config = FixMcoolConfig{};
                  });

  _config = FixMcoolConfig{};
  auto& c = std::get<FixMcoolConfig>(_config);

  // std::thread::hardware_concurrency() returns 0 when the number of hardware threads
  // cannot be determined. Using 0 as the upper bound would yield the empty range [1, 0],
  // causing every value passed to --threads to be rejected. Fall back to 1 in that case.
  const auto hw_threads = std::thread::hardware_concurrency();
  const auto max_threads = hw_threads != 0 ? hw_threads : 1U;

  // clang-format off
  sc.add_option(
      "input",
      c.path_to_input,
      "Path to a corrupted .mcool file.")
      ->check(IsValidMultiresCoolerFile)
      ->required();
  sc.add_option(
      "output",
      c.path_to_output,
      "Path where to store the restored .mcool.")
      ->required();
  sc.add_option(
      "--tmpdir",
      c.tmp_dir,
      "Path to a folder where to store temporary data.")
      ->capture_default_str();
  sc.add_flag(
      "--skip-balancing",
      c.skip_balancing,
      "Do not recompute or copy balancing weights.");
  sc.add_flag(
      "--check-base-resolution",
      c.check_base_resolution,
      "Check whether the base resolution is corrupted.");
  sc.add_flag(
      "--in-memory",
      c.in_memory,
      "Store all interactions in memory while balancing (greatly improves performance).")
      ->capture_default_str();
  sc.add_option(
      "--chunk-size",
      c.chunk_size,
      "Number of interactions to process at once during balancing.\n"
      "Ignored when using --in-memory.")
      ->check(CLI::PositiveNumber)
      ->capture_default_str();
  sc.add_option(
      "-v,--verbosity",
      c.verbosity,
      "Set verbosity of output to the console.")
      ->check(CLI::Range(1, 4))
      ->capture_default_str();
  sc.add_option(
      "-t,--threads",
      c.threads,
      "Maximum number of parallel threads to spawn (only applies to balancing stage).")
      ->check(CLI::Range(std::uint32_t(1), max_threads))
      ->capture_default_str();
  sc.add_option(
      "-l,--compression-level",
      c.zstd_compression_lvl,
      "Compression level used to compress temporary files using ZSTD (only applies to balancing stage).")
      ->check(CLI::Range(0, 19))
      ->capture_default_str();
  sc.add_flag(
      "-f,--force",
      c.force,
      "Overwrite existing files (if any).")
      ->capture_default_str();
  // clang-format on
}

/// Validate the arguments parsed for the `fix-mcool` subcommand.
///
/// Checks performed:
///  - the output path must not already exist unless --force was given (error);
///  - when --skip-balancing is set, options that were explicitly provided on the command
///    line but are documented as ignored in that mode are reported (warning only).
///
/// Warnings are logged through SPDLOG_WARN. Errors are collected and reported together.
///
/// @throws std::runtime_error listing every validation error, if at least one was found.
void Cli::validate_fix_mcool_subcommand() const {
  const auto& c = std::get<FixMcoolConfig>(_config);
  std::vector<std::string> errors{};
  std::vector<std::string> warnings{};

  if (!c.force && std::filesystem::exists(c.path_to_output)) {
    errors.emplace_back(fmt::format(
        FMT_STRING("Refusing to overwrite file {}. Pass --force to overwrite."), c.path_to_output));
  }

  if (c.skip_balancing) {
    const auto* sc = _cli.get_subcommand("fix-mcool");
    // An option is "not empty" when it was explicitly passed on the command line.
    // The order of this list determines the order in which warnings are emitted.
    for (const char* opt_name :
         {"--tmpdir", "--in-memory", "--compression-level", "--chunk-size", "--threads"}) {
      if (!sc->get_option(opt_name)->empty()) {
        warnings.emplace_back(fmt::format(
            FMT_STRING("option {} is ignored when --skip-balancing is provided."), opt_name));
      }
    }
  }

  for (const auto& w : warnings) {
    SPDLOG_WARN(FMT_STRING("{}"), w);
  }

  if (!errors.empty()) {
    throw std::runtime_error(fmt::format(
        FMT_STRING(
            "The following error(s) were encountered while validating CLI arguments:\n - {}"),
        fmt::join(errors, "\n - ")));
  }
}

void Cli::transform_args_fix_mcool_subcommand() {
auto& c = std::get<FixMcoolConfig>(_config);

// in spdlog, high numbers correspond to low log levels
assert(c.verbosity > 0 && c.verbosity < 5);
c.verbosity = static_cast<std::uint8_t>(spdlog::level::critical) - c.verbosity;
}

} // namespace hictk::tools
20 changes: 10 additions & 10 deletions src/hictk/convert/cool_to_hic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,29 +89,29 @@ static std::size_t dump_pixels_plain(const cooler::File& clr, const std::filesys
template <typename Pipe>
[[nodiscard]] static std::unique_ptr<boost::process::child> run_pigz(
Pipe& pipe, const std::filesystem::path& dest, std::uint8_t compression_lvl,
std::size_t processes) {
std::size_t threads) {
assert(compression_lvl != 0);
assert(processes != 0);
assert(threads != 0);
// clang-format off
return std::make_unique<boost::process::child>(
find_pigz().string(),
fmt::format(FMT_STRING("-{}"), compression_lvl),
"--processes", fmt::to_string(processes),
"--processes", fmt::to_string(threads),
boost::process::std_in < pipe,
boost::process::std_out > dest.string()
);
// clang-format on
}

static std::size_t dump_pixels_pigz(const cooler::File& clr, const std::filesystem::path& dest,
std::uint8_t compression_lvl, std::size_t processes,
std::uint8_t compression_lvl, std::size_t threads,
std::size_t update_frequency = 10'000'000) {
assert(compression_lvl != 0);
assert(processes > 1);
assert(threads > 1);

boost::asio::io_context ioc;
boost::process::async_pipe pipe{ioc};
const auto pigz = run_pigz(pipe, dest, compression_lvl, processes - 1);
const auto pigz = run_pigz(pipe, dest, compression_lvl, threads - 1);

auto t0 = std::chrono::steady_clock::now();
std::string buffer;
Expand Down Expand Up @@ -178,15 +178,15 @@ static std::size_t dump_pixels_pigz(const cooler::File& clr, const std::filesyst
}

static void dump_pixels(const cooler::File& clr, const std::filesystem::path& dest,
std::uint8_t compression_lvl, std::size_t processes) {
std::uint8_t compression_lvl, std::size_t threads) {
const auto t0 = std::chrono::steady_clock::now();

SPDLOG_INFO(FMT_STRING("writing pixels to file {}..."), dest);

std::size_t pixels_processed{};
if (dest.extension() == ".gz") {
assert(compression_lvl != 0);
pixels_processed = dump_pixels_pigz(clr, dest, compression_lvl, processes);
pixels_processed = dump_pixels_pigz(clr, dest, compression_lvl, threads);
} else {
pixels_processed = dump_pixels_plain(clr, dest);
}
Expand Down Expand Up @@ -277,12 +277,12 @@ void cool_to_hic(const ConvertConfig& c) {

const cooler::File clr(uri);
dump_chrom_sizes(clr, chrom_sizes);
dump_pixels(clr, pixels, c.gzip_compression_lvl, c.processes);
dump_pixels(clr, pixels, c.gzip_compression_lvl, c.threads);
}

auto t1 = std::chrono::steady_clock::now();
SPDLOG_INFO(FMT_STRING("running juicer_tools pre..."));
process = run_juicer_tools_pre(c, chrom_sizes, pixels, c.processes);
process = run_juicer_tools_pre(c, chrom_sizes, pixels, c.threads);
process->wait();
if (process->exit_code() != 0) {
throw std::runtime_error(fmt::format(FMT_STRING("juicer_tools pre failed with exit code {}"),
Expand Down
Loading