Skip to content

Commit 67f3013

Browse files
committed
apply review suggestions
1 parent f5bd897 commit 67f3013

File tree

5 files changed

+67
-71
lines changed

5 files changed

+67
-71
lines changed

components/core/src/clp_s/indexer/CommandLineArguments.cpp

Lines changed: 19 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) {
2828
"db-config-file",
2929
po::value<std::string>(&metadata_db_config_file_path)->value_name("FILE")
3030
->default_value(metadata_db_config_file_path),
31-
"Table metadata DB YAML config"
31+
"Path to the YAML DB config file for metadata storage"
3232
);
3333
// clang-format on
3434

@@ -37,16 +37,22 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) {
3737
visible_options.add(general_options);
3838
visible_options.add(output_options);
3939

40-
// Define hidden positional options (not shown in Boost's program options help message)
40+
std::string archive_path;
4141
po::options_description positional_options;
4242
// clang-format off
43-
positional_options.add_options()
44-
("archive-dir", po::value<std::string>(&m_archive_dir))
45-
("archive-id", po::value<std::string>(&m_archive_id));
43+
positional_options.add_options()(
44+
"table-name",
45+
po::value<std::string>(&m_table_name),
46+
"Name of the table where fields from the archive will be indexed and stored"
47+
)(
48+
"archive-path",
49+
po::value<std::string>(&archive_path),
50+
"Path to an archive"
51+
);
4652
// clang-format on
4753
po::positional_options_description positional_options_description;
48-
positional_options_description.add("archive-dir", 1);
49-
positional_options_description.add("archive-id", 1);
54+
positional_options_description.add("table-name", 1);
55+
positional_options_description.add("archive-path", 1);
5056

5157
// Aggregate all options
5258
po::options_description all_options;
@@ -79,12 +85,13 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) {
7985
}
8086

8187
// Validate required parameters
82-
if (m_archive_dir.empty()) {
83-
throw std::invalid_argument("ARCHIVE_DIR not specified or empty.");
88+
if (m_table_name.empty()) {
89+
throw std::invalid_argument("Table name not specified or empty.");
8490
}
85-
if (m_archive_id.empty()) {
86-
throw std::invalid_argument("ARCHIVE_ID not specified or empty.");
91+
if (archive_path.empty()) {
92+
throw std::invalid_argument("Archive path not specified or empty.");
8793
}
94+
m_archive_path = get_path_object_for_raw_path(archive_path);
8895
if (false == metadata_db_config_file_path.empty()) {
8996
clp::GlobalMetadataDBConfig metadata_db_config;
9097
try {
@@ -116,7 +123,7 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) {
116123
}
117124

118125
void CommandLineArguments::print_basic_usage() const {
119-
std::cerr << "Usage: " << get_program_name() << " [OPTIONS] ARCHIVE_DIR ARCHIVE_ID"
126+
std::cerr << "Usage: " << get_program_name() << " [OPTIONS] TABLE_NAME ARCHIVE_PATH"
120127
<< std::endl;
121128
}
122129
} // namespace clp_s::indexer

components/core/src/clp_s/indexer/CommandLineArguments.hpp

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#include <string>
66

77
#include "../../clp/GlobalMetadataDBConfig.hpp"
8+
#include "../InputConfig.hpp"
89

910
namespace clp_s::indexer {
1011
/**
@@ -27,9 +28,9 @@ class CommandLineArguments {
2728

2829
std::string const& get_program_name() const { return m_program_name; }
2930

30-
std::string const& get_archive_dir() const { return m_archive_dir; }
31+
std::string const& get_table_name() const { return m_table_name; }
3132

32-
std::string const& get_archive_id() const { return m_archive_id; }
33+
Path const& get_archive_path() const { return m_archive_path; }
3334

3435
std::optional<clp::GlobalMetadataDBConfig> const& get_db_config() const {
3536
return m_metadata_db_config;
@@ -41,8 +42,8 @@ class CommandLineArguments {
4142

4243
// Variables
4344
std::string m_program_name;
44-
std::string m_archive_dir;
45-
std::string m_archive_id;
45+
std::string m_table_name;
46+
Path m_archive_path;
4647

4748
std::optional<clp::GlobalMetadataDBConfig> m_metadata_db_config;
4849
};

components/core/src/clp_s/indexer/IndexManager.cpp

Lines changed: 20 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -8,15 +8,18 @@
88
namespace clp_s::indexer {
99
IndexManager::IndexManager(std::optional<clp::GlobalMetadataDBConfig> const& db_config) {
1010
if (db_config.has_value()) {
11-
m_table_metadata_db = std::make_unique<MySQLIndexStorage>(
11+
m_mysql_index_storage = std::make_unique<MySQLIndexStorage>(
1212
db_config->get_metadata_db_host(),
1313
db_config->get_metadata_db_port(),
1414
db_config->get_metadata_db_username(),
1515
db_config->get_metadata_db_password(),
1616
db_config->get_metadata_db_name(),
1717
db_config->get_metadata_table_prefix()
1818
);
19-
m_table_metadata_db->open();
19+
m_mysql_index_storage->open();
20+
m_field_update_callback = [this](std::string& field_name, NodeType field_type) {
21+
m_mysql_index_storage->add_field(field_name, field_type);
22+
};
2023
m_output_type = OutputType::Database;
2124
} else {
2225
throw OperationFailed(ErrorCodeBadParam, __FILENAME__, __LINE__);
@@ -25,32 +28,17 @@ IndexManager::IndexManager(std::optional<clp::GlobalMetadataDBConfig> const& db_
2528

2629
IndexManager::~IndexManager() {
2730
if (m_output_type == OutputType::Database) {
28-
m_table_metadata_db->close();
31+
m_mysql_index_storage->close();
2932
}
3033
}
3134

32-
void IndexManager::update_metadata(std::string const& archive_dir, std::string const& archive_id) {
33-
m_table_metadata_db->init(archive_dir);
34-
35-
auto archive_path = std::filesystem::path(archive_dir) / archive_id;
36-
std::error_code ec;
37-
if (false == std::filesystem::exists(archive_path, ec) || ec) {
38-
throw OperationFailed(ErrorCodeBadParam, __FILENAME__, __LINE__);
39-
}
35+
void IndexManager::update_metadata(std::string const& table_name, Path const& archive_path) {
36+
m_mysql_index_storage->init(table_name);
4037

4138
ArchiveReader archive_reader;
42-
archive_reader.open(
43-
clp_s::Path{.source = clp_s::InputSource::Filesystem, .path = archive_path.string()},
44-
NetworkAuthOption{}
45-
);
39+
archive_reader.open(archive_path, NetworkAuthOption{});
4640

47-
auto schema_tree = archive_reader.get_schema_tree();
48-
auto field_pairs = traverse_schema_tree(schema_tree);
49-
if (OutputType::Database == m_output_type) {
50-
for (auto& [name, type] : field_pairs) {
51-
m_table_metadata_db->add_field(name, type);
52-
}
53-
}
41+
traverse_schema_tree_and_update_metadata(archive_reader.get_schema_tree());
5442
}
5543

5644
std::string IndexManager::escape_key_name(std::string_view const key_name) {
@@ -100,49 +88,42 @@ std::string IndexManager::escape_key_name(std::string_view const key_name) {
10088
return escaped_key_name;
10189
}
10290

103-
std::vector<std::pair<std::string, clp_s::NodeType>> IndexManager::traverse_schema_tree(
91+
void IndexManager::traverse_schema_tree_and_update_metadata(
10492
std::shared_ptr<SchemaTree> const& schema_tree
10593
) {
106-
std::vector<std::pair<std::string, clp_s::NodeType>> fields;
10794
if (nullptr == schema_tree) {
108-
return fields;
95+
return;
10996
}
11097

11198
std::string path_buffer;
11299
// Stack of pairs of node_id and path_length
113100
std::stack<std::pair<int32_t, uint64_t>> s;
114-
for (auto const& node : schema_tree->get_nodes()) {
115-
if (constants::cRootNodeId == node.get_parent_id()
116-
&& clp_s::NodeType::Metadata != node.get_type())
117-
{
118-
s.emplace(node.get_id(), 0);
119-
break;
120-
}
121-
}
101+
s.emplace(schema_tree->get_object_subtree_node_id(), 0);
122102

123103
while (false == s.empty()) {
124104
auto [node_id, path_length] = s.top();
125105
s.pop();
126106

127107
auto const& node = schema_tree->get_node(node_id);
128-
auto const& children_ids = node.get_children_ids();
129108
auto node_type = node.get_type();
109+
// TODO: Add support for structured arrays
110+
if (NodeType::StructuredArray == node_type) {
111+
continue;
112+
}
113+
auto const& children_ids = node.get_children_ids();
130114
path_buffer.resize(path_length);
131115
if (false == path_buffer.empty()) {
132116
path_buffer += ".";
133117
}
134118
path_buffer += escape_key_name(node.get_key_name());
135-
if (children_ids.empty() && clp_s::NodeType::Object != node_type
136-
&& clp_s::NodeType::Unknown != node_type)
119+
if (children_ids.empty() && NodeType::Object != node_type && NodeType::Unknown != node_type)
137120
{
138-
fields.emplace_back(path_buffer, node_type);
121+
m_field_update_callback(path_buffer, node_type);
139122
}
140123

141124
for (auto child_id : children_ids) {
142125
s.emplace(child_id, path_buffer.size());
143126
}
144127
}
145-
146-
return fields;
147128
}
148129
} // namespace clp_s::indexer

components/core/src/clp_s/indexer/IndexManager.hpp

Lines changed: 21 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,26 @@
11
#ifndef CLP_S_INDEXER_INDEXMANAGER_HPP
22
#define CLP_S_INDEXER_INDEXMANAGER_HPP
33

4+
#include <functional>
5+
#include <optional>
6+
47
#include "../../clp/GlobalMetadataDBConfig.hpp"
58
#include "../ArchiveReader.hpp"
69
#include "MySQLIndexStorage.hpp"
710

811
namespace clp_s::indexer {
912
/**
10-
* Class used to updates field names (e.g., JSON full paths) and data types for a specified archive
11-
* directory. It currently stores the results in a database. An archive directory consists of
12-
* multiple archives on the same topic, which can be queried using SQL. The directory name serves as
13-
* the table name, and its schema (field names and data types) is used by the SQL engine to resolve
14-
* column metadata.
13+
* This class updates field names (e.g., JSON full paths) and data types for a specified archive.
14+
* It traverses the schema tree from root to leaf, concatenating escaped key names using the
15+
* delimiter `.` at each level.
16+
*
17+
* Currently, only leaf nodes with primitive types are indexed. Object nodes and subtrees of
18+
* structured arrays are not indexed. The results are stored in a database to enable efficient
19+
* querying.
20+
*
21+
* Multiple archives related to the same topic can form a table that can be queried using a SQL
22+
* query engine. When indexing, a table name must be specified. This table is then used by the SQL
23+
* engine to resolve column metadata.
1524
*/
1625
class IndexManager {
1726
public:
@@ -36,10 +45,10 @@ class IndexManager {
3645
// Methods
3746
/**
3847
* Updates the metadata for a given archive
39-
* @param archive_dir used as the table name
40-
* @param archive_id
48+
* @param table_name
49+
* @param archive_path
4150
*/
42-
void update_metadata(std::string const& archive_dir, std::string const& archive_id);
51+
void update_metadata(std::string const& table_name, Path const& archive_path);
4352

4453
private:
4554
/**
@@ -50,15 +59,13 @@ class IndexManager {
5059
static std::string escape_key_name(std::string_view const key_name);
5160

5261
/**
53-
* Traverses the schema tree and returns a list of path names and their types
62+
* Traverses the schema tree and updates the metadata
5463
* @param schema_tree
55-
* @return a list of path names and their types
5664
*/
57-
std::vector<std::pair<std::string, clp_s::NodeType>> traverse_schema_tree(
58-
std::shared_ptr<SchemaTree> const& schema_tree
59-
);
65+
void traverse_schema_tree_and_update_metadata(std::shared_ptr<SchemaTree> const& schema_tree);
6066

61-
std::shared_ptr<MySQLIndexStorage> m_table_metadata_db;
67+
std::function<void(std::string&, NodeType)> m_field_update_callback;
68+
std::shared_ptr<MySQLIndexStorage> m_mysql_index_storage;
6269
OutputType m_output_type{OutputType::Database};
6370
};
6471
} // namespace clp_s::indexer

components/core/src/clp_s/indexer/indexer.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,8 @@ int main(int argc, char const* argv[]) {
3434
try {
3535
clp_s::indexer::IndexManager index_manager(command_line_arguments.get_db_config());
3636
index_manager.update_metadata(
37-
command_line_arguments.get_archive_dir(),
38-
command_line_arguments.get_archive_id()
37+
command_line_arguments.get_table_name(),
38+
command_line_arguments.get_archive_path()
3939
);
4040
} catch (std::exception& e) {
4141
SPDLOG_ERROR("Failed to update metadata: {}", e.what());

0 commit comments

Comments
 (0)