Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement hictk metadata command #204

Merged
merged 13 commits into from
Aug 16, 2024
Prev Previous commit
Next Next commit
Support reading metadata recursively
  • Loading branch information
robomics committed Aug 16, 2024
commit 00b5eaede9d427bd348a0e6c310975b812d5f85e
4 changes: 4 additions & 0 deletions src/hictk/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,10 @@ target_sources(
${CMAKE_CURRENT_SOURCE_DIR}/fix_mcool/fix_mcool.cpp
${CMAKE_CURRENT_SOURCE_DIR}/load/load.cpp
${CMAKE_CURRENT_SOURCE_DIR}/merge/merge.cpp
${CMAKE_CURRENT_SOURCE_DIR}/metadata/cool.hpp
${CMAKE_CURRENT_SOURCE_DIR}/metadata/hic.hpp
${CMAKE_CURRENT_SOURCE_DIR}/metadata/mcool.hpp
${CMAKE_CURRENT_SOURCE_DIR}/metadata/scool.hpp
${CMAKE_CURRENT_SOURCE_DIR}/metadata/metadata.cpp
${CMAKE_CURRENT_SOURCE_DIR}/rename_chromosomes/rename_chromosomes.cpp
${CMAKE_CURRENT_SOURCE_DIR}/validate/validate.cpp
Expand Down
6 changes: 6 additions & 0 deletions src/hictk/cli/cli_metadata.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,12 @@ void Cli::make_metadata_subcommand() {
c.include_file_path,
"Output the given input path using attribute \"uri\"")
->capture_default_str();
sc.add_flag(
"--recursive",
c.recursive,
"Print metadata for each resolution or cell contained in a\n"
"multi-resolution or single-cell file.")
->capture_default_str();
// clang-format on

_config = std::monostate{};
Expand Down
1 change: 1 addition & 0 deletions src/hictk/include/hictk/tools/config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,7 @@ struct MetadataConfig {
std::string input_format{};
std::string output_format{"json"};
bool include_file_path{false};
bool recursive{false};

std::uint8_t verbosity{2};
};
Expand Down
204 changes: 204 additions & 0 deletions src/hictk/metadata/common.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,204 @@
// Copyright (C) 2024 Roberto Rossini <roberros@uio.no>
//
// SPDX-License-Identifier: MIT

#pragma once

#include <fmt/format.h>

#include <cassert>
#include <cstdint>
#include <iostream>
#include <limits>
#include <nlohmann/json.hpp>
#include <optional>
#include <sstream>
#include <string>
#include <string_view>
#include <toml++/toml.hpp>
#include <type_traits>
#include <utility>
#include <variant>
#include <vector>

#include "hictk/common.hpp"
#include "hictk/version.hpp"

namespace hictk::tools {
/// Output formats that print_attributes() can render metadata to.
enum class MetadataOutputFormat : std::uint8_t { json, toml, yaml };

/// Translate a format name into the corresponding MetadataOutputFormat.
///
/// "json" and "toml" are matched explicitly. Every other name is asserted to be "yaml";
/// when assertions are compiled out the yaml enumerator is returned regardless.
[[nodiscard]] inline MetadataOutputFormat parse_output_format(std::string_view format) {
  using Fmt = MetadataOutputFormat;

  if (format == "json") {
    return Fmt::json;
  }

  if (format != "toml") {
    assert(format == "yaml");
    return Fmt::yaml;
  }

  return Fmt::toml;
}

/// Insert a string attribute into the given table.
/// Nothing is inserted when the key is empty.
inline void emplace_if_valid(std::string_view key, const std::string& value, toml::table& buff) {
  if (key.empty()) {
    return;
  }

  buff.insert(key, value);
}

/// Insert an array attribute into the given table.
/// Nothing is inserted when the key is empty.
inline void emplace_if_valid(std::string_view key, const toml::array& values, toml::table& buff) {
  if (key.empty()) {
    return;
  }

  buff.insert(key, values);
}

/// Insert an integral attribute into the given table.
///
/// Values not exceeding the largest std::int64_t are stored as std::int64_t.
/// Larger values are stored as their decimal string representation instead.
/// Nothing is inserted when the key is empty.
template <typename T, typename std::enable_if_t<std::is_integral_v<T>>* = nullptr>
inline void emplace_if_valid(std::string_view key, const T& value, toml::table& buff) {
  if (key.empty()) {
    return;
  }

  constexpr auto max_int64 = std::numeric_limits<std::int64_t>::max();
  const bool fits_in_int64 = value <= max_int64;

  if (!fits_in_int64) {
    // Delegate to the std::string overload
    emplace_if_valid(key, fmt::to_string(value), buff);
    return;
  }

  buff.insert(key, static_cast<std::int64_t>(value));
}

/// Insert a floating-point attribute into the given table as a double.
/// Nothing is inserted when the key is empty.
template <typename T, typename std::enable_if_t<std::is_floating_point_v<T>>* = nullptr>
inline void emplace_if_valid(std::string_view key, const T& value, toml::table& buff) {
  if (key.empty()) {
    return;
  }

  buff.insert(key, conditional_static_cast<double>(value));
}

/// Insert the alternative currently held by a variant into the given table.
/// The call is forwarded to the emplace_if_valid() overload matching the active alternative.
/// Nothing is inserted when the key is empty.
template <typename... T>
inline void emplace_if_valid(std::string_view key, const std::variant<T...>& value,
                             toml::table& buff) {
  if (key.empty()) {
    return;
  }

  const auto insert_alternative = [&](const auto& alternative) {
    emplace_if_valid(key, alternative, buff);
  };
  std::visit(insert_alternative, value);
}

/// Insert the value held by an optional into the given table.
/// Nothing is inserted when the key is empty or when the optional holds no value.
template <typename T>
inline void emplace_if_valid(std::string_view key, const std::optional<T>& value,
                             toml::table& buff) {
  if (key.empty() || !value.has_value()) {
    return;
  }

  emplace_if_valid(key, *value, buff);
}

/// Replace every top-level entry whose value is the string "null" with a JSON null.
///
/// The names of the affected entries are collected in a first pass and the values are
/// overwritten in a second pass. The (possibly modified) copy of the input is returned.
[[nodiscard]] inline nlohmann::json reformat_nulls(nlohmann::json attributes) {
  std::vector<std::string> keys_to_reset{};

  for (const auto& item : attributes.items()) {
    const bool is_null_placeholder = item.value() == "null";
    if (is_null_placeholder) {
      keys_to_reset.emplace_back(item.key());
    }
  }

  for (const auto& key : keys_to_reset) {
    attributes[key] = nullptr;
  }

  return attributes;
}

/// Convert a TOML table to a JSON document.
///
/// The table is serialized with toml::json_formatter and the resulting text is parsed with
/// nlohmann::json; "null" strings are then turned into JSON nulls by reformat_nulls().
/// When the table has a string-valued "metadata" entry, an attempt is made to parse that
/// string as JSON: on success the parsed document replaces the string, while on any exception
/// the entry is left as produced by the first conversion.
[[nodiscard]] inline nlohmann::json toml_to_json(const toml::table& t) {
  std::stringstream serialized;
  serialized << toml::json_formatter(t);

  auto json = reformat_nulls(nlohmann::json::parse(serialized.str()));

  const auto metadata_it = t.find("metadata");
  const bool has_metadata_string = metadata_it != t.end() && metadata_it->second.is_string();
  if (!has_metadata_string) {
    return json;
  }

  try {
    const auto& metadata_str = metadata_it->second.ref<std::string>();
    json["metadata"] = reformat_nulls(nlohmann::json::parse(metadata_str));
    // Best-effort: a metadata string that is not valid JSON is kept as a plain string
    // NOLINTNEXTLINE
  } catch (...) {
  }

  return json;
}

/// Render the given attributes as a JSON string indented by 4 spaces.
///
/// Top-level attributes are converted with toml_to_json(). Each nested table is converted
/// the same way and stored in the resulting document under its associated name.
[[nodiscard]] inline std::string format_to_json(
    const toml::table& attributes,
    const std::vector<std::pair<std::string, toml::table>>& nested_attributes) {
  auto document = toml_to_json(attributes);

  for (const auto& entry : nested_attributes) {
    document[entry.first] = toml_to_json(entry.second);
  }

  constexpr int indent = 4;
  return document.dump(indent);
}

/// Format a section title so that it can be used as a single key inside a TOML table header.
///
/// - Titles consisting only of bare-key characters (A-Z, a-z, 0-9, '_' and '-') are returned
///   unchanged.
/// - Every other title (including the empty string) is quoted, so that e.g. a '.' is not
///   interpreted as a separator between nested tables and spaces do not break the header.
/// - Literal (single-quoted) strings are preferred. TOML literal strings have no escape
///   mechanism, thus titles containing a single quote or a control character are emitted as
///   basic (double-quoted) strings, with '"', '\' and control characters escaped.
///
/// @param s section title to be sanitized
/// @return the title formatted as a valid TOML key
[[nodiscard]] inline std::string sanitize_toml_section_title(std::string s) {
  const auto is_bare_key_char = [](unsigned char c) {
    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') ||
           c == '_' || c == '-';
  };
  // Control characters other than tab cannot appear verbatim in TOML strings
  const auto is_forbidden_control_char = [](unsigned char c) {
    return (c < 0x20 && c != '\t') || c == 0x7F;
  };

  bool is_bare_key = !s.empty();
  bool fits_literal_string = true;
  for (const char ch : s) {
    const auto c = static_cast<unsigned char>(ch);
    is_bare_key = is_bare_key && is_bare_key_char(c);
    fits_literal_string = fits_literal_string && c != '\'' && !is_forbidden_control_char(c);
  }

  if (is_bare_key) {
    return s;
  }

  if (fits_literal_string) {
    s.insert(s.begin(), 1, '\'');
    s.insert(s.end(), 1, '\'');
    return s;
  }

  // Fall back to a basic string, which supports escape sequences
  constexpr char hex_digits[] = "0123456789ABCDEF";
  std::string quoted;
  quoted.reserve(s.size() + 2);
  quoted.push_back('"');
  for (const char ch : s) {
    const auto c = static_cast<unsigned char>(ch);
    if (c == '"' || c == '\\') {
      quoted.push_back('\\');
      quoted.push_back(ch);
    } else if (is_forbidden_control_char(c)) {
      quoted.append("\\u00");
      quoted.push_back(hex_digits[c >> 4U]);
      quoted.push_back(hex_digits[c & 0x0FU]);
    } else {
      quoted.push_back(ch);
    }
  }
  quoted.push_back('"');

  return quoted;
}

/// Render the given attributes as a TOML document.
///
/// The output starts with a comment reporting the hictk version, followed by the top-level
/// attributes. Each nested table is then written under a section header whose title is
/// processed by sanitize_toml_section_title().
[[nodiscard]] inline std::string format_to_toml(
    const toml::table& attributes,
    const std::vector<std::pair<std::string, toml::table>>& nested_attributes) {
  std::stringstream out;

  out << fmt::format(FMT_STRING("# Metadata generated by {}\n"),
                     hictk::config::version::str_long());
  out << attributes << '\n';

  for (const auto& section : nested_attributes) {
    out << fmt::format(FMT_STRING("\n[{}]\n"), sanitize_toml_section_title(section.first));
    out << section.second << '\n';
  }

  return out.str();
}

/// Render the given attributes as a YAML document.
///
/// When nested tables are given, the TOML text produced by format_to_toml() is parsed back
/// into a single table, which is then rendered by calling this function again without nested
/// attributes. Otherwise the output consists of a header line reporting the hictk version
/// followed by the table formatted with toml::yaml_formatter.
[[nodiscard]] inline std::string format_to_yaml(
    const toml::table& attributes,
    const std::vector<std::pair<std::string, toml::table>>& nested_attributes) {
  if (!nested_attributes.empty()) {
    const auto merged_attributes = toml::parse(format_to_toml(attributes, nested_attributes));
    return format_to_yaml(merged_attributes, {});
  }

  std::stringstream out;
  out << fmt::format(FMT_STRING("--- # Metadata generated by {}\n"),
                     hictk::config::version::str_long());
  out << toml::yaml_formatter(attributes) << '\n';

  return out.str();
}

/// Print the given attributes to stdout using the requested output format.
///
/// The attributes are rendered with format_to_json(), format_to_toml() or format_to_yaml()
/// depending on the format, and the result is printed followed by a newline.
/// An empty string is rendered when the format matches none of the known enumerators.
inline void print_attributes(
    const toml::table& top_lvl_attributes,
    const std::vector<std::pair<std::string, toml::table>>& nested_attributes,
    MetadataOutputFormat format) {
  const auto render = [&]() -> std::string {
    switch (format) {
      case MetadataOutputFormat::json:
        return format_to_json(top_lvl_attributes, nested_attributes);
      case MetadataOutputFormat::toml:
        return format_to_toml(top_lvl_attributes, nested_attributes);
      case MetadataOutputFormat::yaml:
        return format_to_yaml(top_lvl_attributes, nested_attributes);
    }
    return {};
  };

  const std::string rendered = render();
  fmt::print(FMT_STRING("{}\n"), rendered);
}

} // namespace hictk::tools
53 changes: 53 additions & 0 deletions src/hictk/metadata/cool.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
// Copyright (C) 2024 Roberto Rossini <roberros@uio.no>
//
// SPDX-License-Identifier: MIT

#pragma once

#include <string>
#include <toml++/toml.hpp>

#include "./common.hpp"
#include "hictk/cooler/cooler.hpp"

namespace hictk::tools {

/// Copy the attributes of a Cooler file into a TOML table.
///
/// Each attribute is stored under a dash-separated key (e.g. "bin-size") through
/// emplace_if_valid(). When uri is not empty, it is stored under the "uri" key.
[[nodiscard]] inline toml::table normalize_attribute_map(const cooler::Attributes& map,
                                                         const std::string& uri) {
  toml::table table;

  const auto put = [&table](std::string_view key, const auto& value) {
    emplace_if_valid(key, value, table);
  };

  if (!uri.empty()) {
    put("uri", uri);
  }

  put("bin-size", map.bin_size);
  put("bin-type", map.bin_type);
  put("format", map.format);
  put("format-version", map.format_version);
  put("storage-mode", map.storage_mode);

  put("creation-date", map.creation_date);
  put("generated-by", map.generated_by);
  put("assembly", map.assembly);
  put("metadata", map.metadata);

  put("format-url", map.format_url);
  put("nbins", map.nbins);
  put("nchroms", map.nchroms);
  put("nnz", map.nnz);
  put("sum", map.sum);
  put("cis", map.cis);

  return table;
}

/// Print the attributes of the Cooler file at the given path using the requested format.
///
/// The file path is included in the output (under the "uri" key) only when include_file_path
/// is true. Always returns 0.
[[nodiscard]] inline int print_cool_metadata(const std::filesystem::path& p,
                                             MetadataOutputFormat format, bool include_file_path) {
  const std::string uri = include_file_path ? p.string() : "";

  const auto attributes = normalize_attribute_map(cooler::File(p.string()).attributes(), uri);
  print_attributes(attributes, {}, format);

  return 0;
}

} // namespace hictk::tools
Loading
Loading