Skip to content

Commit

Permalink
Merge pull request #74 from paulsengroup/fix/hictk_convert
Browse files Browse the repository at this point in the history
[bugfix] hictk convert (cool2hic)
  • Loading branch information
robomics authored Oct 15, 2023
2 parents 0488b47 + fb6abad commit 479cf68
Show file tree
Hide file tree
Showing 4 changed files with 77 additions and 35 deletions.
1 change: 1 addition & 0 deletions docs/cli_reference.rst
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ hictk convert
--normalization-methods TEXT [ALL] ...
Name of one or more normalization methods to be copied.
By default, vectors for all known normalization methods are copied.
Pass NONE to avoid copying normalization vectors.
--fail-if-norm-not-found Fail if any of the requested normalization vectors are missing.
-g,--genome TEXT Genome assembly name. By default this is copied from the .hic file metadata.
--juicer-tools-memory UINT:SIZE [b, kb(=1000b), kib(=1024b), ...]:POSITIVE [32GB]
Expand Down
6 changes: 5 additions & 1 deletion src/hictk/cli/cli_convert.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,8 @@ void Cli::make_convert_subcommand() {
"--normalization-methods",
c.normalization_methods,
"Name of one or more normalization methods to be copied.\n"
"By default, vectors for all known normalization methods are copied.\n")
"By default, vectors for all known normalization methods are copied.\n"
"Pass NONE to avoid copying normalization vectors.")
->default_str("ALL");
sc.add_flag(
"--fail-if-norm-not-found",
Expand Down Expand Up @@ -224,6 +225,9 @@ void Cli::transform_args_convert_subcommand() {
if (c.tmp_dir.empty()) {
c.tmp_dir = c.path_to_output.parent_path();
}

c.tmp_dir /= c.path_to_output.filename();
c.tmp_dir.replace_extension(".tmp");
}

} // namespace hictk::tools
101 changes: 70 additions & 31 deletions src/hictk/convert/cool_to_hic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -198,60 +198,97 @@ static void dump_pixels(const cooler::File& clr, const std::filesystem::path& de
pixels_processed, clr.chromosomes().size(), dest, delta);
}

[[nodiscard]] static std::shared_ptr<const balancing::Weights> try_read_weights(
const cooler::File& clr, const balancing::Method& method) {
try {
return clr.read_weights(method);
} catch (const std::exception& e) {
return clr.read_weights(method, balancing::Weights::Type::DIVISIVE);
}
}

// NOLINTNEXTLINE(readability-function-cognitive-complexity)
static bool dump_weights(std::uint32_t resolution, std::string_view cooler_uri,
const std::filesystem::path& weight_file) {
const std::filesystem::path& weight_file,
std::vector<balancing::Method> normalizations, bool fail_if_norm_missing) {
if (normalizations.size() == 1 && normalizations.front() == balancing::Method::NONE()) {
return false;
}

if (normalizations.empty()) {
normalizations = cooler::File(cooler_uri).avail_normalizations();
if (normalizations.empty()) {
return false;
}
}

SPDLOG_INFO(FMT_STRING("[{}] writing balancing weights to file {}..."), resolution, weight_file);
const cooler::File clr(cooler_uri);
assert(clr.bin_size() == resolution);

if (!clr.has_normalization("weight")) {
SPDLOG_WARN(FMT_STRING("[{}] unable to read weights from \"{}\"..."), resolution, cooler_uri);
return false;
}

const std::unique_ptr<FILE> f(std::fopen(weight_file.string().c_str(), "ae"));
if (!bool(f)) {
throw fmt::system_error(errno, FMT_STRING("cannot open file {}"), weight_file);
}

const auto weights = (*clr.read_weights("weight"))();
std::ptrdiff_t i0 = 0;

for (const auto& chrom : clr.chromosomes()) {
// TODO add GW/INTRA/INTER prefix as appropriate
fmt::print(f.get(), FMT_STRING("vector\tICE\t{}\t{}\tBP\n"), chrom.name(), resolution);

const auto num_bins = (chrom.size() + resolution - 1) / resolution;
const auto i1 = i0 + static_cast<std::ptrdiff_t>(num_bins);
std::for_each(weights.begin() + i0, weights.begin() + i1, [&](const double w) {
std::isnan(w) ? fmt::print(f.get(), FMT_COMPILE(".\n"))
: fmt::print(f.get(), FMT_COMPILE("{}\n"), 1.0 / w);
if (!bool(f)) { // NOLINT
throw fmt::system_error(
errno, FMT_STRING("an error occurred while writing weights to file {}"), weight_file);
}
});
for (const auto& norm : normalizations) {
if (!clr.has_normalization(norm) && !fail_if_norm_missing) {
SPDLOG_WARN(FMT_STRING("[{}] unable to read weights from \"{}\"..."), resolution, cooler_uri);
continue;
}

i0 = i1;
const auto weights = try_read_weights(clr, norm);
const auto weight_name = norm == "weight" ? "ICE" : norm.to_string();
const auto weight_is_divisive = weights->type() == balancing::Weights::Type::INFER ||
weights->type() == balancing::Weights::Type::UNKNOWN ||
weights->type() == balancing::Weights::Type::DIVISIVE;
auto weight_vector = (*weights)();
if (weight_is_divisive) {
std::transform(weight_vector.begin(), weight_vector.end(), weight_vector.begin(),
[](const double w) { return 1.0 / w; });
}

std::ptrdiff_t i0 = 0;
for (const auto& chrom : clr.chromosomes()) {
// TODO add GW/INTRA/INTER prefix as appropriate
fmt::print(f.get(), FMT_STRING("vector\t{}\t{}\t{}\tBP\n"), weight_name, chrom.name(),
resolution);

const auto num_bins = (chrom.size() + resolution - 1) / resolution;
const auto i1 = i0 + static_cast<std::ptrdiff_t>(num_bins);
std::for_each(weight_vector.begin() + i0, weight_vector.begin() + i1, [&](double w) {
!std::isfinite(w) ? fmt::print(f.get(), FMT_COMPILE(".\n"))
: fmt::print(f.get(), FMT_COMPILE("{}\n"), w);
if (!bool(f)) { // NOLINT
throw fmt::system_error(
errno, FMT_STRING("an error occurred while writing weights to file {}"), weight_file);
}
});

i0 = i1;
}
SPDLOG_INFO(FMT_STRING("[{}] wrote \"{}\" weights to file {}..."), resolution, weight_name,
weight_file);
}
SPDLOG_INFO(FMT_STRING("[{}] wrote {} weights to file {}..."), resolution, weights.size(),
weight_file);
return true;

return std::ftell(f.get()) != 0;
}

static bool dump_weights(const ConvertConfig& c, const std::filesystem::path& weight_file) {
bool cooler_has_weights = false;
for (const auto& res : c.resolutions) {
cooler_has_weights |= dump_weights(
res, fmt::format(FMT_STRING("{}::/resolutions/{}"), c.path_to_input.string(), res),
weight_file);
weight_file, c.normalization_methods, c.fail_if_normalization_method_is_not_avaliable);
}

return cooler_has_weights;
}

void cool_to_hic(const ConvertConfig& c) {
static const internal::TmpDir tmpdir{};
std::ignore = find_java();

const internal::TmpDir tmpdir{c.tmp_dir};

const auto chrom_sizes = tmpdir() / "reference.chrom.sizes";
const auto pixels = [&]() {
Expand All @@ -263,7 +300,8 @@ void cool_to_hic(const ConvertConfig& c) {
const auto weights = tmpdir() / "weights.txt";

if (c.force && std::filesystem::exists(c.path_to_output)) {
std::filesystem::remove(c.path_to_output);
[[maybe_unused]] std::error_code ec{};
std::filesystem::remove(c.path_to_output, ec);
}

std::unique_ptr<boost::process::child> process{};
Expand Down Expand Up @@ -300,7 +338,8 @@ void cool_to_hic(const ConvertConfig& c) {

const auto weight_file_has_data =
c.input_format == "cool"
? dump_weights(c.resolutions.front(), c.path_to_input.string(), weights)
? dump_weights(c.resolutions.front(), c.path_to_input.string(), weights,
c.normalization_methods, c.fail_if_normalization_method_is_not_avaliable)
: dump_weights(c, weights);

if (weight_file_has_data) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -162,9 +162,7 @@ inline MultiResFile::operator bool() const noexcept { return !!_root_grp; }

inline std::string MultiResFile::path() const { return (*_root_grp)().getFile().getName(); }

inline auto MultiResFile::chromosomes() const noexcept -> const Reference& {
return _chroms;
}
inline auto MultiResFile::chromosomes() const noexcept -> const Reference& { return _chroms; }

[[nodiscard]] inline std::uint32_t MultiResFile::compute_base_resolution(
const std::vector<std::uint32_t>& resolutions, std::uint32_t target_res) {
Expand Down

0 comments on commit 479cf68

Please sign in to comment.