Add support for validating and escaping UTF-8 strings. (#453)

LinZhihao-723 · kirkrodrigues · web-flow · commit 249816b5cb04 · 2024-06-27T00:40:34.000-04:00
Co-authored-by: kirkrodrigues <2454684+kirkrodrigues@users.noreply.github.com>
diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt
@@ -326,6 +326,8 @@ set(SOURCE_FILES_unitTest
         src/clp/ffi/search/Subquery.hpp
         src/clp/ffi/search/WildcardToken.cpp
         src/clp/ffi/search/WildcardToken.hpp
+        src/clp/ffi/utils.cpp
+        src/clp/ffi/utils.hpp
         src/clp/FileDescriptor.cpp
         src/clp/FileDescriptor.hpp
         src/clp/FileReader.cpp
@@ -438,6 +440,8 @@ set(SOURCE_FILES_unitTest
         src/clp/TraceableException.hpp
         src/clp/time_types.hpp
         src/clp/type_utils.hpp
+        src/clp/utf8_utils.cpp
+        src/clp/utf8_utils.hpp
         src/clp/Utils.cpp
         src/clp/Utils.hpp
         src/clp/VariableDictionaryEntry.cpp
@@ -472,6 +476,7 @@ set(SOURCE_FILES_unitTest
         tests/test-StreamingCompression.cpp
         tests/test-string_utils.cpp
         tests/test-TimestampPattern.cpp
+        tests/test-utf8_utils.cpp
         tests/test-Utils.cpp
         )
 add_executable(unitTest ${SOURCE_FILES_unitTest} ${SOURCE_FILES_clp_s_unitTest})
diff --git a/components/core/src/clp/ffi/utils.cpp b/components/core/src/clp/ffi/utils.cpp
@@ -0,0 +1,89 @@
+#include "utils.hpp"
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <optional>
+#include <string>
+#include <string_view>
+#include <tuple>
+
+#include "../utf8_utils.hpp"
+
+using std::string;
+using std::string_view;
+
+namespace clp::ffi {
+auto validate_and_escape_utf8_string(string_view raw) -> std::optional<string> {
+    // Every escape sequence is longer than the byte it replaces, so reserve 1.5x the input size
+    // up front as headroom.
+    string escaped;
+    escaped.reserve(raw.size() + (raw.size() / 2));
+
+    if (validate_and_append_escaped_utf8_string(raw, escaped)) {
+        return escaped;
+    }
+
+    // `raw` isn't valid UTF-8; whatever was appended before the failure is discarded.
+    return std::nullopt;
+}
+
+auto validate_and_append_escaped_utf8_string(std::string_view src, std::string& dst) -> bool {
+    // Points at the first byte of `src` that hasn't been appended to `dst` yet. Bytes that need no
+    // escaping are copied in runs: a run is flushed whenever a byte that must be escaped is found
+    // and, finally, when validation succeeds.
+    auto unflushed_begin{src.cbegin()};
+
+    auto escape_handler = [&dst, &unflushed_begin](string_view::const_iterator it) -> void {
+        constexpr uint8_t cLargestControlCharacter{0x1F};
+        // Room for a six-character "\u00bb" sequence plus the '\0' appended by `snprintf`
+        constexpr size_t cControlCharacterBufSize{7};
+        std::array<char, cControlCharacterBufSize> unicode_escape_buf{};
+
+        string_view replacement;
+        switch (*it) {
+            case '"':
+                replacement = "\\\"";
+                break;
+            case '\\':
+                replacement = "\\\\";
+                break;
+            case '\b':
+                replacement = "\\b";
+                break;
+            case '\f':
+                replacement = "\\f";
+                break;
+            case '\n':
+                replacement = "\\n";
+                break;
+            case '\r':
+                replacement = "\\r";
+                break;
+            case '\t':
+                replacement = "\\t";
+                break;
+            default: {
+                auto const byte{static_cast<uint8_t>(*it)};
+                if (byte > cLargestControlCharacter) {
+                    // Nothing to escape; the byte stays part of the pending run.
+                    return;
+                }
+                // Remaining control characters have no short form, so use "\u00XX"
+                std::ignore = snprintf(
+                        unicode_escape_buf.data(),
+                        unicode_escape_buf.size(),
+                        "\\u00%02x",
+                        byte
+                );
+                replacement = {unicode_escape_buf.data(), unicode_escape_buf.size() - 1};
+                break;
+            }
+        }
+
+        // Flush the pending run, emit the escape sequence, and resume after the escaped byte
+        dst.append(unflushed_begin, it);
+        dst += replacement;
+        unflushed_begin = it + 1;
+    };
+
+    bool const is_valid{validate_utf8_string(src, escape_handler)};
+    if (is_valid) {
+        // Flush the trailing run (a no-op if the last byte was escaped or `src` is empty)
+        dst.append(unflushed_begin, src.cend());
+    }
+    return is_valid;
+}
+}  // namespace clp::ffi
diff --git a/components/core/src/clp/ffi/utils.hpp b/components/core/src/clp/ffi/utils.hpp
@@ -0,0 +1,31 @@
+#ifndef CLP_FFI_UTILS_HPP
+#define CLP_FFI_UTILS_HPP
+
+#include <optional>
+#include <string>
+#include <string_view>
+
+namespace clp::ffi {
+/**
+ * Validates whether the given string is UTF-8 encoded, and escapes any characters to make the
+ * string compatible with the JSON specification.
+ *
+ * The characters escaped are `"`, `\`, and the control characters 0x00 to 0x1F. `\b`, `\t`, `\n`,
+ * `\f`, and `\r` use their two-character short forms; the other control characters are written as
+ * `\u00XX` (lowercase hex). All other bytes are copied unchanged.
+ * @param raw The raw string to escape.
+ * @return The escaped string on success.
+ * @return std::nullopt if the string contains any non-UTF-8-encoded byte sequences.
+ */
+[[nodiscard]] auto validate_and_escape_utf8_string(std::string_view raw
+) -> std::optional<std::string>;
+
+/**
+ * Validates whether `src` is UTF-8 encoded, and appends `src` to `dst` while escaping any
+ * characters to make the appended string compatible with the JSON specification.
+ *
+ * The characters escaped are `"`, `\`, and the control characters 0x00 to 0x1F. `\b`, `\t`, `\n`,
+ * `\f`, and `\r` use their two-character short forms; the other control characters are written as
+ * `\u00XX` (lowercase hex). All other bytes are copied unchanged.
+ * @param src The string to validate and escape.
+ * @param dst Returns `dst` with an escaped version of `src` appended. Existing content of `dst` is
+ * never removed or rewritten.
+ * @return Whether `src` is a valid UTF-8-encoded string. NOTE: Even if `src` is not UTF-8 encoded,
+ * `dst` may be modified (a partially escaped prefix of `src` may already have been appended).
+ */
+[[nodiscard]] auto
+validate_and_append_escaped_utf8_string(std::string_view src, std::string& dst) -> bool;
+}  // namespace clp::ffi
+
+#endif  // CLP_FFI_UTILS_HPP
diff --git a/components/core/src/clp/utf8_utils.cpp b/components/core/src/clp/utf8_utils.cpp
@@ -0,0 +1,55 @@
+#include "utf8_utils.hpp"
+
+#include <cstddef>
+#include <cstdint>
+#include <string_view>
+
+namespace clp {
+auto is_utf8_encoded(std::string_view str) -> bool {
+    // Pure validation: nothing needs escaping, so the handler ignores every ASCII character it's
+    // handed.
+    return validate_utf8_string(str, [](std::string_view::const_iterator /*it*/) -> void {});
+}
+
+namespace utf8_utils_internal {
+auto parse_and_validate_lead_byte(
+        uint8_t byte,
+        size_t& num_continuation_bytes,
+        uint32_t& code_point,
+        uint32_t& code_point_lower_bound,
+        uint32_t& code_point_upper_bound
+) -> bool {
+    // The three header patterns are mutually exclusive, so each one is tested on its own. The
+    // bits of `byte` outside the matching mask are the lead byte's contribution to the code point.
+
+    // 0b110x_xxxx: two-byte character
+    if (cTwoByteUtf8CharHeader == (cTwoByteUtf8CharHeaderMask & byte)) {
+        num_continuation_bytes = 1;
+        code_point = (byte & ~cTwoByteUtf8CharHeaderMask);
+        code_point_lower_bound = cTwoByteUtf8CharCodePointLowerBound;
+        code_point_upper_bound = cTwoByteUtf8CharCodePointUpperBound;
+        return true;
+    }
+
+    // 0b1110_xxxx: three-byte character
+    if (cThreeByteUtf8CharHeader == (cThreeByteUtf8CharHeaderMask & byte)) {
+        num_continuation_bytes = 2;
+        code_point = (byte & ~cThreeByteUtf8CharHeaderMask);
+        code_point_lower_bound = cThreeByteUtf8CharCodePointLowerBound;
+        code_point_upper_bound = cThreeByteUtf8CharCodePointUpperBound;
+        return true;
+    }
+
+    // 0b1111_0xxx: four-byte character
+    if (cFourByteUtf8CharHeader == (cFourByteUtf8CharHeaderMask & byte)) {
+        num_continuation_bytes = 3;
+        code_point = (byte & ~cFourByteUtf8CharHeaderMask);
+        code_point_lower_bound = cFourByteUtf8CharCodePointLowerBound;
+        code_point_upper_bound = cFourByteUtf8CharCodePointUpperBound;
+        return true;
+    }
+
+    // Not a lead byte of a multi-byte character; the output parameters are left untouched.
+    return false;
+}
+
+auto is_ascii_char(uint8_t byte) -> bool {
+    // ASCII is exactly the code point range that UTF-8 encodes in a single byte
+    return byte <= cOneByteUtf8CharCodePointUpperBound;
+}
+
+auto is_valid_utf8_continuation_byte(uint8_t byte) -> bool {
+    // Only the bits selected by the mask identify a continuation byte; the rest are payload
+    auto const header_bits{cUtf8ContinuationByteMask & byte};
+    return cUtf8ContinuationByteHeader == header_bits;
+}
+
+auto parse_continuation_byte(uint32_t code_point, uint8_t continuation_byte) -> uint32_t {
+    // Make room for the new bits, then drop them into the vacated low-order positions. The
+    // payload never overlaps the shifted value, so OR-ing is equivalent to adding.
+    uint32_t const shifted_code_point{code_point << cUtf8NumContinuationByteCodePointBits};
+    uint32_t const payload_bits{continuation_byte & cUtf8ContinuationByteCodePointMask};
+    return shifted_code_point | payload_bits;
+}
+}  // namespace utf8_utils_internal
+}  // namespace clp
diff --git a/components/core/src/clp/utf8_utils.hpp b/components/core/src/clp/utf8_utils.hpp
@@ -0,0 +1,144 @@
+#ifndef CLP_UTF8_UTILS_HPP
+#define CLP_UTF8_UTILS_HPP
+
+#include <cstddef>
+#include <cstdint>
+#include <string_view>
+#include <type_traits>
+
+namespace clp {
+// Constants
+// Lead byte signature
+// A byte is the lead byte of an N-byte character when `(byte & mask) == header`. The bits outside
+// the mask hold the lead byte's share of the character's code point.
+constexpr uint8_t cTwoByteUtf8CharHeaderMask{0xE0};  // 0b111x_xxxx
+constexpr uint8_t cTwoByteUtf8CharHeader{0xC0};  // 0b110x_xxxx
+constexpr uint8_t cThreeByteUtf8CharHeaderMask{0xF0};  // 0b1111_xxxx
+constexpr uint8_t cThreeByteUtf8CharHeader{0xE0};  // 0b1110_xxxx
+constexpr uint8_t cFourByteUtf8CharHeaderMask{0xF8};  // 0b1111_1xxx
+constexpr uint8_t cFourByteUtf8CharHeader{0xF0};  // 0b1111_0xxx
+
+// Code point ranges (inclusive)
+// A decoded multi-byte character is only accepted if its code point lies within the range for
+// its encoded length. This rejects overlong encodings and code points beyond U+10FFFF.
+constexpr uint32_t cOneByteUtf8CharCodePointLowerBound{0};
+constexpr uint32_t cOneByteUtf8CharCodePointUpperBound{0x7F};
+constexpr uint32_t cTwoByteUtf8CharCodePointLowerBound{0x80};
+constexpr uint32_t cTwoByteUtf8CharCodePointUpperBound{0x7FF};
+constexpr uint32_t cThreeByteUtf8CharCodePointLowerBound{0x800};
+constexpr uint32_t cThreeByteUtf8CharCodePointUpperBound{0xFFFF};
+constexpr uint32_t cFourByteUtf8CharCodePointLowerBound{0x1'0000};
+constexpr uint32_t cFourByteUtf8CharCodePointUpperBound{0x10'FFFF};
+
+// Continuation byte
+// A byte is a continuation byte when `(byte & mask) == header`, i.e., it looks like 0b10xx_xxxx.
+// Its low `cUtf8NumContinuationByteCodePointBits` bits (selected by the code point mask) are
+// appended to the code point being decoded.
+constexpr uint32_t cUtf8ContinuationByteMask{0xC0};
+constexpr uint32_t cUtf8ContinuationByteHeader{0x80};
+constexpr uint32_t cUtf8ContinuationByteCodePointMask{0x3F};
+constexpr uint8_t cUtf8NumContinuationByteCodePointBits{6};
+
+/**
+ * Validates whether the given string is UTF-8 encoded, optionally escaping ASCII characters using
+ * the given handler.
+ * @tparam EscapeHandler Method to optionally escape any ASCII character in the string. It must be
+ * invocable with a `std::string_view::const_iterator`; its return value (if any) is ignored.
+ * @param src The string to validate.
+ * @param escape_handler Invoked, in order, with an iterator to each ASCII character that starts a
+ * new character in `src`. Validation stops at the first invalid byte, so the handler is never
+ * invoked for characters after it.
+ * @return Whether the input is a valid UTF-8 encoded string. NOTE: Even if this returns false,
+ * the handler may already have been invoked for ASCII characters preceding the invalid byte.
+ */
+template <typename EscapeHandler>
+requires std::is_invocable_v<EscapeHandler, std::string_view::const_iterator>
+[[nodiscard]] auto validate_utf8_string(std::string_view src, EscapeHandler escape_handler) -> bool;
+
+/**
+ * Convenience wrapper around `validate_utf8_string` that uses a handler which does nothing.
+ * @param str The string to validate.
+ * @return Whether the input is a valid UTF-8 encoded string.
+ */
+[[nodiscard]] auto is_utf8_encoded(std::string_view str) -> bool;
+
+namespace utf8_utils_internal {
+/**
+ * Validates whether the given byte is a valid lead byte for a multi-byte UTF-8 character, parses
+ * the byte, and returns the parsed properties as well as associated properties.
+ *
+ * NOTE: The output parameters are only written when the byte is a valid lead byte; otherwise they
+ * are left unchanged.
+ * @param byte Byte to validate.
+ * @param num_continuation_bytes Returns the number of continuation bytes expected.
+ * @param code_point Returns the code point bits parsed from the lead byte.
+ * @param code_point_lower_bound Returns the lower bound of the code point range for the UTF-8
+ * character.
+ * @param code_point_upper_bound Returns the upper bound of the code point range for the UTF-8
+ * character.
+ * @return Whether the input byte is a valid lead byte for a multi-byte UTF-8 character.
+ */
+[[nodiscard]] auto parse_and_validate_lead_byte(
+        uint8_t byte,
+        size_t& num_continuation_bytes,
+        uint32_t& code_point,
+        uint32_t& code_point_lower_bound,
+        uint32_t& code_point_upper_bound
+) -> bool;
+
+/**
+ * @param byte
+ * @return Whether the given byte is a valid ASCII character (i.e., no larger than
+ * `cOneByteUtf8CharCodePointUpperBound`).
+ */
+[[nodiscard]] auto is_ascii_char(uint8_t byte) -> bool;
+
+/**
+ * @param byte
+ * @return Whether the input byte is a valid UTF-8 continuation byte.
+ */
+[[nodiscard]] auto is_valid_utf8_continuation_byte(uint8_t byte) -> bool;
+
+/**
+ * Parses the code-point bits from the given continuation byte and combines them with the given
+ * code point.
+ *
+ * NOTE: The continuation byte's header bits are not validated here; they are simply masked off.
+ * @param code_point The code point bits decoded so far.
+ * @param continuation_byte
+ * @return The updated code point: `code_point` shifted left to make room for the continuation
+ * byte's code-point bits, with those bits placed in the low-order positions.
+ */
+[[nodiscard]] auto
+parse_continuation_byte(uint32_t code_point, uint8_t continuation_byte) -> uint32_t;
+}  // namespace utf8_utils_internal
+
+template <typename EscapeHandler>
+requires std::is_invocable_v<EscapeHandler, std::string_view::const_iterator>
+auto validate_utf8_string(std::string_view src, EscapeHandler escape_handler) -> bool {
+    // Code points reserved for UTF-16 surrogates (inclusive range). They fall within the
+    // three-byte code point range but aren't Unicode scalar values, so a byte sequence that
+    // decodes to one of them is not well-formed UTF-8 and must be rejected explicitly.
+    constexpr uint32_t cSurrogateCodePointLowerBound{0xD800};
+    constexpr uint32_t cSurrogateCodePointUpperBound{0xDFFF};
+
+    // State of the multi-byte character that's currently being decoded (if any)
+    size_t num_continuation_bytes_to_validate{0};
+    uint32_t code_point{};
+    uint32_t code_point_lower_bound{};
+    uint32_t code_point_upper_bound{};
+
+    // NOLINTNEXTLINE(readability-qualified-auto)
+    for (auto it{src.cbegin()}; it != src.cend(); ++it) {
+        auto const byte{static_cast<uint8_t>(*it)};
+        if (0 == num_continuation_bytes_to_validate) {
+            // Start of a new character: either a single ASCII byte (which the handler may want to
+            // escape) or the lead byte of a multi-byte character.
+            if (utf8_utils_internal::is_ascii_char(byte)) {
+                escape_handler(it);
+            } else if (false
+                       == utf8_utils_internal::parse_and_validate_lead_byte(
+                               byte,
+                               num_continuation_bytes_to_validate,
+                               code_point,
+                               code_point_lower_bound,
+                               code_point_upper_bound
+                       ))
+            {
+                return false;
+            }
+            continue;
+        }
+
+        // Inside a multi-byte character: the byte must be a continuation byte
+        if (false == utf8_utils_internal::is_valid_utf8_continuation_byte(byte)) {
+            return false;
+        }
+        code_point = utf8_utils_internal::parse_continuation_byte(code_point, byte);
+        --num_continuation_bytes_to_validate;
+        if (0 != num_continuation_bytes_to_validate) {
+            continue;
+        }
+
+        // The character is fully decoded. Reject overlong encodings and code points that are too
+        // large for the encoded length (including anything above U+10FFFF).
+        if (code_point < code_point_lower_bound || code_point_upper_bound < code_point) {
+            return false;
+        }
+        // Reject encoded UTF-16 surrogates
+        if (cSurrogateCodePointLowerBound <= code_point
+            && code_point <= cSurrogateCodePointUpperBound)
+        {
+            return false;
+        }
+    }
+
+    if (0 != num_continuation_bytes_to_validate) {
+        // Incomplete UTF-8 character
+        return false;
+    }
+
+    return true;
+}
+}  // namespace clp
+
+#endif  // CLP_UTF8_UTILS_HPP
diff --git a/components/core/tests/test-utf8_utils.cpp b/components/core/tests/test-utf8_utils.cpp