|
| 1 | +#ifndef CLP_UTF8_UTILS_HPP |
| 2 | +#define CLP_UTF8_UTILS_HPP |
| 3 | + |
#include <cstddef>
#include <cstdint>
#include <string_view>
#include <type_traits>
| 7 | + |
| 8 | +namespace clp { |
// Constants
// NOTE: These are `inline` so that every translation unit including this header shares a single
// definition of each constant.

// Lead byte signatures: each mask selects the marker bits of a lead byte, and the matching header
// is the pattern those bits are expected to have for a character of that length.
inline constexpr uint8_t cTwoByteUtf8CharHeaderMask{0xE0};  // Selects 0b111x_xxxx
inline constexpr uint8_t cTwoByteUtf8CharHeader{0xC0};  // Expects 0b110x_xxxx
inline constexpr uint8_t cThreeByteUtf8CharHeaderMask{0xF0};  // Selects 0b1111_xxxx
inline constexpr uint8_t cThreeByteUtf8CharHeader{0xE0};  // Expects 0b1110_xxxx
inline constexpr uint8_t cFourByteUtf8CharHeaderMask{0xF8};  // Selects 0b1111_1xxx
inline constexpr uint8_t cFourByteUtf8CharHeader{0xF0};  // Expects 0b1111_0xxx

// Code point ranges (inclusive) for characters encoded with one, two, three, and four bytes
inline constexpr uint32_t cOneByteUtf8CharCodePointLowerBound{0};
inline constexpr uint32_t cOneByteUtf8CharCodePointUpperBound{0x7F};
inline constexpr uint32_t cTwoByteUtf8CharCodePointLowerBound{0x80};
inline constexpr uint32_t cTwoByteUtf8CharCodePointUpperBound{0x7FF};
inline constexpr uint32_t cThreeByteUtf8CharCodePointLowerBound{0x800};
inline constexpr uint32_t cThreeByteUtf8CharCodePointUpperBound{0xFFFF};
inline constexpr uint32_t cFourByteUtf8CharCodePointLowerBound{0x1'0000};
inline constexpr uint32_t cFourByteUtf8CharCodePointUpperBound{0x10'FFFF};

// Continuation byte: marker-bit mask, expected marker pattern, mask of the code point payload bits,
// and the number of payload bits carried by each continuation byte.
inline constexpr uint32_t cUtf8ContinuationByteMask{0xC0};  // Selects 0b11xx_xxxx
inline constexpr uint32_t cUtf8ContinuationByteHeader{0x80};  // Expects 0b10xx_xxxx
inline constexpr uint32_t cUtf8ContinuationByteCodePointMask{0x3F};  // Selects 0bxx11_1111
inline constexpr uint8_t cUtf8NumContinuationByteCodePointBits{6};
| 33 | + |
| 34 | +/** |
| 35 | + * Validates whether the given string is UTF-8 encoded, optionally escaping ASCII characters using |
| 36 | + * the given handler. |
| 37 | + * @tparam EscapeHandler Method to optionally escape any ASCII character in the string. |
| 38 | + * @param src |
| 39 | + * @param escape_handler |
| 40 | + * @return Whether the input is a valid UTF-8 encoded string. |
| 41 | + */ |
| 42 | +template <typename EscapeHandler> |
| 43 | +requires std::is_invocable_v<EscapeHandler, std::string_view::const_iterator> |
| 44 | +[[nodiscard]] auto validate_utf8_string(std::string_view src, EscapeHandler escape_handler) -> bool; |
| 45 | + |
/**
 * Checks whether the given string is a valid UTF-8 encoded string.
 * @param str The string to check.
 * @return Whether the input is a valid UTF-8 encoded string.
 */
[[nodiscard]] auto is_utf8_encoded(std::string_view str) -> bool;
| 51 | + |
namespace utf8_utils_internal {
/**
 * Validates whether the given byte is a valid lead byte for a multi-byte UTF-8 character, parses
 * the byte, and returns the code point bits it carries along with the properties of the character
 * it introduces.
 *
 * NOTE: This declaration doesn't specify the values of the output parameters when the byte is
 * rejected, so callers should only rely on them when this function returns true.
 * @param byte Byte to validate.
 * @param num_continuation_bytes Returns the number of continuation bytes expected.
 * @param code_point Returns the code point bits parsed from the lead byte.
 * @param code_point_lower_bound Returns the lower bound of the code point range for the UTF-8
 * character.
 * @param code_point_upper_bound Returns the upper bound of the code point range for the UTF-8
 * character.
 * @return Whether the input byte is a valid lead byte for a multi-byte UTF-8 character.
 */
[[nodiscard]] auto parse_and_validate_lead_byte(
        uint8_t byte,
        size_t& num_continuation_bytes,
        uint32_t& code_point,
        uint32_t& code_point_lower_bound,
        uint32_t& code_point_upper_bound
) -> bool;

/**
 * @param byte Byte to check.
 * @return Whether the given byte is a valid ASCII character.
 */
[[nodiscard]] auto is_ascii_char(uint8_t byte) -> bool;

/**
 * @param byte Byte to check.
 * @return Whether the input byte is a valid UTF-8 continuation byte.
 */
[[nodiscard]] auto is_valid_utf8_continuation_byte(uint8_t byte) -> bool;

/**
 * Parses the code-point bits from the given continuation byte and combines them with the given
 * code point.
 * @param code_point The code point bits accumulated so far for the current character.
 * @param continuation_byte The continuation byte to parse.
 * @return The updated code point.
 */
[[nodiscard]] auto
parse_continuation_byte(uint32_t code_point, uint8_t continuation_byte) -> uint32_t;
}  // namespace utf8_utils_internal
| 95 | + |
| 96 | +template <typename EscapeHandler> |
| 97 | +requires std::is_invocable_v<EscapeHandler, std::string_view::const_iterator> |
| 98 | +auto validate_utf8_string(std::string_view src, EscapeHandler escape_handler) -> bool { |
| 99 | + size_t num_continuation_bytes_to_validate{0}; |
| 100 | + uint32_t code_point{}; |
| 101 | + uint32_t code_point_lower_bound{}; |
| 102 | + uint32_t code_point_upper_bound{}; |
| 103 | + |
| 104 | + // NOLINTNEXTLINE(readability-qualified-auto) |
| 105 | + for (auto it{src.cbegin()}; it != src.cend(); ++it) { |
| 106 | + auto const byte{static_cast<uint8_t>(*it)}; |
| 107 | + if (0 == num_continuation_bytes_to_validate) { |
| 108 | + if (utf8_utils_internal::is_ascii_char(byte)) { |
| 109 | + escape_handler(it); |
| 110 | + } else if (false |
| 111 | + == utf8_utils_internal::parse_and_validate_lead_byte( |
| 112 | + byte, |
| 113 | + num_continuation_bytes_to_validate, |
| 114 | + code_point, |
| 115 | + code_point_lower_bound, |
| 116 | + code_point_upper_bound |
| 117 | + )) |
| 118 | + { |
| 119 | + return false; |
| 120 | + } |
| 121 | + } else { |
| 122 | + if (false == utf8_utils_internal::is_valid_utf8_continuation_byte(byte)) { |
| 123 | + return false; |
| 124 | + } |
| 125 | + code_point = utf8_utils_internal::parse_continuation_byte(code_point, byte); |
| 126 | + --num_continuation_bytes_to_validate; |
| 127 | + if (0 == num_continuation_bytes_to_validate |
| 128 | + && (code_point < code_point_lower_bound || code_point_upper_bound < code_point)) |
| 129 | + { |
| 130 | + return false; |
| 131 | + } |
| 132 | + } |
| 133 | + } |
| 134 | + |
| 135 | + if (0 != num_continuation_bytes_to_validate) { |
| 136 | + // Incomplete UTF-8 character |
| 137 | + return false; |
| 138 | + } |
| 139 | + |
| 140 | + return true; |
| 141 | +} |
| 142 | +} // namespace clp |
| 143 | + |
| 144 | +#endif // CLP_UTF8_UTILS_HPP |
0 commit comments