From f05264e88cf69735c0fbb2b173b72821df856b35 Mon Sep 17 00:00:00 2001 From: Devin Gibson Date: Mon, 12 Aug 2024 10:39:36 -0400 Subject: [PATCH] clp-s: Report exactly where parsing error occurs when parsing JSON (fixes #514). (#503) --- components/core/src/clp_s/JsonParser.cpp | 42 ++++++++++++++++++------ components/core/src/clp_s/JsonParser.hpp | 1 + 2 files changed, 33 insertions(+), 10 deletions(-) diff --git a/components/core/src/clp_s/JsonParser.cpp b/components/core/src/clp_s/JsonParser.cpp index 26ec3d7b4..a68062958 100644 --- a/components/core/src/clp_s/JsonParser.cpp +++ b/components/core/src/clp_s/JsonParser.cpp @@ -428,7 +428,7 @@ bool JsonParser::parse() { if (simdjson::error_code::SUCCESS != json_file_iterator.get_error()) { SPDLOG_ERROR( - "Encountered error - {} - while trying to parse {}", + "Encountered error - {} - while trying to parse {} after parsing 0 bytes", simdjson::error_message(json_file_iterator.get_error()), file_path ); @@ -439,7 +439,8 @@ bool JsonParser::parse() { simdjson::ondemand::document_stream::iterator json_it; m_num_messages = 0; - size_t last_num_bytes_consumed = 0; + size_t bytes_consumed_up_to_prev_archive = 0; + size_t bytes_consumed_up_to_prev_record = 0; while (json_file_iterator.get_json(json_it)) { m_current_schema.clear(); @@ -450,11 +451,31 @@ bool JsonParser::parse() { // that this isn't a valid JSON document but they get set in different situations so we // need to check both here. if (is_scalar_result.error() || true == is_scalar_result.value()) { - SPDLOG_ERROR("Encountered non-json-object while trying to parse {}", file_path); + SPDLOG_ERROR( + "Encountered non-json-object while trying to parse {} after parsing {} " + "bytes", + file_path, + bytes_consumed_up_to_prev_record + ); + m_archive_writer->close(); + return false; + } + + // Some errors from simdjson are latent until trying to access invalid JSON fields. + // Instead of checking for an error every time we access a JSON field in parse_line we + // just catch simdjson_error here instead. + try { + parse_line(ref.value(), -1, ""); + } catch (simdjson::simdjson_error& error) { + SPDLOG_ERROR( + "Encountered error - {} - while trying to parse {} after parsing {} bytes", + error.what(), + file_path, + bytes_consumed_up_to_prev_record + ); m_archive_writer->close(); return false; } - parse_line(ref.value(), -1, ""); m_num_messages++; int32_t current_schema_id = m_archive_writer->add_schema(m_current_schema); @@ -462,12 +483,12 @@ bool JsonParser::parse() { m_archive_writer ->append_message(current_schema_id, m_current_schema, m_current_parsed_message); + bytes_consumed_up_to_prev_record = json_file_iterator.get_num_bytes_consumed(); if (m_archive_writer->get_data_size() >= m_target_encoded_size) { - size_t num_bytes_read = json_file_iterator.get_num_bytes_consumed(); m_archive_writer->increment_uncompressed_size( - num_bytes_read - last_num_bytes_consumed + bytes_consumed_up_to_prev_record - bytes_consumed_up_to_prev_archive ); - last_num_bytes_consumed = num_bytes_read; + bytes_consumed_up_to_prev_archive = bytes_consumed_up_to_prev_record; split_archive(); } @@ -475,14 +496,15 @@ bool JsonParser::parse() { } m_archive_writer->increment_uncompressed_size( - json_file_iterator.get_num_bytes_read() - last_num_bytes_consumed + json_file_iterator.get_num_bytes_read() - bytes_consumed_up_to_prev_archive ); if (simdjson::error_code::SUCCESS != json_file_iterator.get_error()) { SPDLOG_ERROR( - "Encountered error - {} - while trying to parse {}", + "Encountered error - {} - while trying to parse {} after parsing {} bytes", simdjson::error_message(json_file_iterator.get_error()), - file_path + file_path, + bytes_consumed_up_to_prev_record ); m_archive_writer->close(); return false; diff --git a/components/core/src/clp_s/JsonParser.hpp b/components/core/src/clp_s/JsonParser.hpp index f65f4c4b1..84aa27fef 100644 --- a/components/core/src/clp_s/JsonParser.hpp +++ b/components/core/src/clp_s/JsonParser.hpp @@ -70,6 +70,7 @@ class JsonParser { * @param line the JSON line * @param parent_node_id the parent node id * @param key the key of the node + * @throw simdjson::simdjson_error when encountering invalid fields while parsing line */ void parse_line(ondemand::value line, int32_t parent_node_id, std::string const& key);