y-scope · davidlion · Aug 28, 2025 · Aug 28, 2025 · Aug 28, 2025 · Sep 17, 2025
@@ -16,4 +16,4 @@ float:\-{0,1}[0-9]+\.[0-9]+
 // Dictionary variables
 hex:[a-fA-F]+
 hasNumber:.*\d.*
-equals:.*=.*[a-zA-Z0-9].*
+equals:.*=(?<var>.*[a-zA-Z0-9].*)
@@ -8,6 +8,7 @@
 #include <iostream>
 #include <memory>
 #include <set>
+#include <string>
 
 #include <boost/algorithm/string.hpp>
 #include <boost/lexical_cast.hpp>
@@ -188,6 +189,18 @@ load_lexer_from_file(std::string const& schema_file_path, log_surgeon::lexers::B
     for (std::unique_ptr<log_surgeon::ParserAST> const& parser_ast : schema_ast->m_schema_vars) {
         auto* rule = dynamic_cast<log_surgeon::SchemaVarAST*>(parser_ast.get());
 
+        // Currently, we only support at most a single capture group in each variable. If a capture
+        // group is present its match will be treated as the variable rather than the full match.
+        auto const num_captures = rule->m_regex_ptr->get_subtree_positive_captures().size();
+        if (1 < num_captures) {
+            throw std::runtime_error(
+                    schema_file_path + ":" + std::to_string(rule->m_line_num + 1)
+                    + ": error: the schema rule '" + rule->m_name
+                    + "' has a regex pattern containing > 1 capture groups (found "
+                    + std::to_string(num_captures) + ").\n"
+            );
+        }
+
-        // Currently, we only support at most a single capture group in each variable. If a capture
-        // group is present its match will be treated as the variable rather than the full match.
-        auto const num_captures = rule->m_regex_ptr->get_subtree_positive_captures().size();
-        if (1 < num_captures) {
-            throw std::runtime_error(
-                    schema_file_path + ":" + std::to_string(rule->m_line_num + 1)
-                    + ": error: the schema rule '" + rule->m_name
-                    + "' has a regex pattern containing > 1 capture groups (found "
-                    + std::to_string(num_captures) + ").\n"
-            );
-        }
+        if ("timestamp" == rule->m_name) {
+            continue;
+        }
+
+        // Currently, we only support at most a single capture group in each variable. If a capture
+        // group is present its match will be treated as the variable rather than the full match.
+        auto const num_captures = rule->m_regex_ptr->get_subtree_positive_captures().size();
+        if (1 < num_captures) {
+            throw std::runtime_error(
+                    schema_file_path + ":" + std::to_string(rule->m_line_num + 1)
+                     ": error: the schema rule '" + rule->m_name
+                     "' has a regex pattern containing more than one capture group (found "
+                     std::to_string(num_captures) + ").\n"
+            );
+        }
-        // Currently, we only support at most a single capture group in each variable. If a capture
-        // group is present its match will be treated as the variable rather than the full match.
-        auto const num_captures = rule->m_regex_ptr->get_subtree_positive_captures().size();
-        if (1 < num_captures) {
-            throw std::runtime_error(
-                    schema_file_path + ":" + std::to_string(rule->m_line_num + 1)
-                    + ": error: the schema rule '" + rule->m_name
-                    + "' has a regex pattern containing > 1 capture groups (found "
-                    + std::to_string(num_captures) + ").\n"
-            );
-        }
+        if ("timestamp" == rule->m_name) {
+            continue;
+        }
+
+        // Currently, we only support at most a single capture group in each variable. If a capture
+        // group is present its match will be treated as the variable rather than the full match.
+        auto const num_captures = rule->m_regex_ptr->get_subtree_positive_captures().size();
+        if (1 < num_captures) {
+            throw std::runtime_error(
+                    schema_file_path + ":" + std::to_string(rule->m_line_num + 1)
+                     ": error: the schema rule '" + rule->m_name
+                     "' has a regex pattern containing more than one capture group (found "
+                     std::to_string(num_captures) + ").\n"
+            );
+        }
         if ("timestamp" == rule->m_name) {
             continue;
         }

@@ -2,14 +2,17 @@
 
 #include <sys/stat.h>
 
+#include <cstdint>
 #include <filesystem>
-#include <fstream>
-#include <iostream>
+#include <stdexcept>
+#include <string>
+#include <vector>
 
 #include <boost/asio.hpp>
 #include <boost/uuid/uuid.hpp>
 #include <boost/uuid/uuid_generators.hpp>
 #include <boost/uuid/uuid_io.hpp>
+#include <log_surgeon/Constants.hpp>
 #include <log_surgeon/LogEvent.hpp>
 #include <log_surgeon/LogParser.hpp>
 #include <nlohmann/json.hpp>
@@ -23,11 +26,7 @@
 
 using clp::ir::eight_byte_encoded_variable_t;
 using clp::ir::four_byte_encoded_variable_t;
-using log_surgeon::LogEventView;
-using std::list;
-using std::make_unique;
 using std::string;
-using std::unordered_set;
 using std::vector;
 
 namespace clp::streaming_archive::writer {
@@ -315,13 +314,13 @@ Archive::write_msg(epochtime_t timestamp, string const& message, size_t num_unco
     update_segment_indices(logtype_id, var_ids);
 }
 
-void Archive::write_msg_using_schema(LogEventView const& log_view) {
+void Archive::write_msg_using_schema(log_surgeon::LogEventView const& log_view) {
     epochtime_t timestamp = 0;
     TimestampPattern* timestamp_pattern = nullptr;
     auto const& log_output_buffer = log_view.get_log_output_buffer();
     if (log_output_buffer->has_timestamp()) {
-        size_t start;
-        size_t end;
+        size_t start{};
+        size_t end{};
         timestamp_pattern = (TimestampPattern*)TimestampPattern::search_known_ts_patterns(
                 log_output_buffer->get_mutable_token(0).to_string(),
                 timestamp,
@@ -360,7 +359,7 @@ void Archive::write_msg_using_schema(LogEventView const& log_view) {
     if (timestamp_pattern == nullptr) {
         start_pos = log_output_buffer->get_token(1).m_start_pos;
     }
-    uint32_t end_pos = log_output_buffer->get_token(log_output_buffer->pos() - 1).m_end_pos;
+    uint32_t const end_pos = log_output_buffer->get_token(log_output_buffer->pos() - 1).m_end_pos;
     if (start_pos <= end_pos) {
         num_uncompressed_bytes = end_pos - start_pos;
     } else {
@@ -369,7 +368,7 @@ void Archive::write_msg_using_schema(LogEventView const& log_view) {
     }
     for (uint32_t i = 1; i < log_output_buffer->pos(); i++) {
         log_surgeon::Token& token = log_output_buffer->get_mutable_token(i);
-        int token_type = token.m_type_ids_ptr->at(0);
+        auto const token_type{token.m_type_ids_ptr->at(0)};
         if (log_output_buffer->has_delimiters() && (timestamp_pattern != nullptr || i > 1)
             && token_type != static_cast<int>(log_surgeon::SymbolId::TokenUncaughtString)
             && token_type != static_cast<int>(log_surgeon::SymbolId::TokenNewline))
@@ -388,13 +387,13 @@ void Archive::write_msg_using_schema(LogEventView const& log_view) {
                 break;
             }
             case static_cast<int>(log_surgeon::SymbolId::TokenInt): {
-                encoded_variable_t encoded_var;
+                encoded_variable_t encoded_var{};
                 if (!EncodedVariableInterpreter::convert_string_to_representable_integer_var(
                             token.to_string(),
                             encoded_var
                     ))
                 {
-                    variable_dictionary_id_t id;
+                    variable_dictionary_id_t id{};
                     m_var_dict.add_entry(token.to_string(), id);
                     encoded_var = EncodedVariableInterpreter::encode_var_dict_id(id);
                     m_logtype_dict_entry.add_dictionary_var();
@@ -405,13 +404,13 @@ void Archive::write_msg_using_schema(LogEventView const& log_view) {
                 break;
             }
             case static_cast<int>(log_surgeon::SymbolId::TokenFloat): {
-                encoded_variable_t encoded_var;
+                encoded_variable_t encoded_var{};
                 if (!EncodedVariableInterpreter::convert_string_to_representable_float_var(
                             token.to_string(),
                             encoded_var
                     ))
                 {
-                    variable_dictionary_id_t id;
+                    variable_dictionary_id_t id{};
                     m_var_dict.add_entry(token.to_string(), id);
                     encoded_var = EncodedVariableInterpreter::encode_var_dict_id(id);
                     m_logtype_dict_entry.add_dictionary_var();
@@ -422,21 +421,63 @@ void Archive::write_msg_using_schema(LogEventView const& log_view) {
                 break;
             }
             default: {
-                // Variable string looks like a dictionary variable, so encode it as so
-                encoded_variable_t encoded_var;
-                variable_dictionary_id_t id;
-                m_var_dict.add_entry(token.to_string(), id);
-                encoded_var = EncodedVariableInterpreter::encode_var_dict_id(id);
-                m_var_ids.push_back(id);
+                // If there are no capture groups the entire variable token is stored as a variable.
+                // If the variable token contains capture groups, we break the token up by storing
+                // each capture as a variable and any substrings surrounding the capture as part of
+                // the logtype. If a capture has repetition we store all instances as a single
+                // variable.
+
+                auto const& lexer{log_view.get_log_parser().m_lexer};
+                auto capture_ids{lexer.get_capture_ids_from_rule_id(token_type)};
+                if (false == capture_ids.has_value()) {
+                    variable_dictionary_id_t id{};
+                    m_var_dict.add_entry(token.to_string(), id);
+                    m_var_ids.push_back(id);
+                    m_encoded_vars.push_back(EncodedVariableInterpreter::encode_var_dict_id(id));
+                    m_logtype_dict_entry.add_dictionary_var();
+
+                    break;
+                }
 
+                auto const register_ids{
+                        lexer.get_reg_ids_from_capture_id(capture_ids.value().at(0))
+                };
+                if (false == register_ids.has_value()) {
+                    throw(std::runtime_error(
+                            "No register IDs found for variable's capture group. Full token: "
+                            + token.to_string()
+                    ));
+                }
+                auto const [start_reg_id, end_reg_id]{register_ids.value()};
+                auto const capture_start{token.get_reversed_reg_positions(start_reg_id).back()};
+                auto const capture_end{token.get_reversed_reg_positions(end_reg_id).front()};
+                auto token_view{token};
+                auto const token_end_pos{token_view.m_end_pos};
+
+                token_view.m_end_pos = capture_start;
+                auto const text_before_capture{token_view.to_string_view()};
+                m_logtype_dict_entry
+                        .add_constant(text_before_capture, 0, text_before_capture.size());
+
+                token_view.m_start_pos = capture_start;
+                token_view.m_end_pos = capture_end;
+                variable_dictionary_id_t id{};
+                m_var_dict.add_entry(token_view.to_string_view(), id);
+                m_var_ids.push_back(id);
+                m_encoded_vars.push_back(EncodedVariableInterpreter::encode_var_dict_id(id));
                 m_logtype_dict_entry.add_dictionary_var();
-                m_encoded_vars.push_back(encoded_var);
+
+                token_view.m_start_pos = capture_end;
+                token_view.m_end_pos = token_end_pos;
+                auto const text_after_capture{token_view.to_string_view()};
+                m_logtype_dict_entry.add_constant(text_after_capture, 0, text_after_capture.size());
+
                 break;
             }
         }
     }
-    if (!m_logtype_dict_entry.get_value().empty()) {
-        logtype_dictionary_id_t logtype_id;
+    if (false == m_logtype_dict_entry.get_value().empty()) {
+        logtype_dictionary_id_t logtype_id{};
         m_logtype_dict.add_entry(m_logtype_dict_entry, logtype_id);
         m_file->write_encoded_msg(
                 timestamp,

@@ -202,3 +202,26 @@ TEST_CASE("Test lexer", "[Search]") {
         token = opt_token.value();
     }
 }
+
+TEST_CASE("Test schema with single capture group", "[load_lexer]") {
+    std::string const schema_path{"../tests/test_schema_files/single_capture_group.txt"};
+    ByteLexer lexer;
+    load_lexer_from_file(schema_path, lexer);
+
+    auto const rule_id{lexer.m_symbol_id.at("capture")};
+    auto const capture_ids{lexer.get_capture_ids_from_rule_id(rule_id)};
+    REQUIRE(capture_ids.has_value());
+    REQUIRE(1 == capture_ids->size());
+    REQUIRE("group" == lexer.m_id_symbol.at(capture_ids->at(0)));
+}
+
+TEST_CASE("Test error for schema rule with multiple capture groups", "[load_lexer]") {
+    std::string const schema_path{"../tests/test_schema_files/multiple_capture_groups.txt"};
+    ByteLexer lexer;
+    REQUIRE_THROWS_WITH(
+            load_lexer_from_file(schema_path, lexer),
+            schema_path
+                    + ":3: error: the schema rule 'multicapture' has a regex pattern containing > "
+                      "1 capture groups (found 2).\n"
+    );
+}
@@ -0,0 +1,3 @@
+delimiters: \r\n
+
+multicapture:text(?<group0>var0)text(?<group1>var1)text
@@ -0,0 +1,3 @@
+delimiters: \r\n
+
+capture:text(?<group>var)text
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		delimiters: \r\n

		multicapture:text(?<group0>var0)text(?<group1>var1)text
Copy link Contributor SharafMohamed Sep 2, 2025 • edited Loading Choose a reason for hiding this comment The reason will be displayed to describe this comment to others. Learn more. Technically lexers can exist with the schema as is, so I'm not entirely sure about this comment. But, if any `LogParser` object is created with this schema it will error as it lacks delimiters. So this schema file is malformed according to Log Surgeon.
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		delimiters: \r\n

		capture:text(?<group>var)text
Copy link Contributor SharafMohamed Sep 2, 2025 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others. Learn more. Same as above.