Skip to content

Commit 249816b

Browse files
Add support for validating and escaping UTF-8 strings. (#453)
Co-authored-by: kirkrodrigues <2454684+kirkrodrigues@users.noreply.github.com>
1 parent 8a2c0a8 commit 249816b

File tree

6 files changed

+533
-0
lines changed

6 files changed

+533
-0
lines changed

components/core/CMakeLists.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -326,6 +326,8 @@ set(SOURCE_FILES_unitTest
326326
src/clp/ffi/search/Subquery.hpp
327327
src/clp/ffi/search/WildcardToken.cpp
328328
src/clp/ffi/search/WildcardToken.hpp
329+
src/clp/ffi/utils.cpp
330+
src/clp/ffi/utils.hpp
329331
src/clp/FileDescriptor.cpp
330332
src/clp/FileDescriptor.hpp
331333
src/clp/FileReader.cpp
@@ -438,6 +440,8 @@ set(SOURCE_FILES_unitTest
438440
src/clp/TraceableException.hpp
439441
src/clp/time_types.hpp
440442
src/clp/type_utils.hpp
443+
src/clp/utf8_utils.cpp
444+
src/clp/utf8_utils.hpp
441445
src/clp/Utils.cpp
442446
src/clp/Utils.hpp
443447
src/clp/VariableDictionaryEntry.cpp
@@ -472,6 +476,7 @@ set(SOURCE_FILES_unitTest
472476
tests/test-StreamingCompression.cpp
473477
tests/test-string_utils.cpp
474478
tests/test-TimestampPattern.cpp
479+
tests/test-utf8_utils.cpp
475480
tests/test-Utils.cpp
476481
)
477482
add_executable(unitTest ${SOURCE_FILES_unitTest} ${SOURCE_FILES_clp_s_unitTest})

components/core/src/clp/ffi/utils.cpp

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
#include "utils.hpp"
2+
3+
#include <array>
4+
#include <cstddef>
5+
#include <cstdint>
6+
#include <cstdio>
7+
#include <optional>
8+
#include <string>
9+
#include <string_view>
10+
#include <tuple>
11+
12+
#include "../utf8_utils.hpp"
13+
14+
using std::string;
15+
using std::string_view;
16+
17+
namespace clp::ffi {
18+
auto validate_and_escape_utf8_string(string_view raw) -> std::optional<string> {
    // Build the result in a plain string and only wrap it in an optional once validation succeeds.
    string escaped_str;

    // Reserve 1.5x the input size up front; presumably this leaves headroom for escape sequences
    // so that typical inputs don't trigger a reallocation while appending.
    escaped_str.reserve(raw.size() + (raw.size() / 2));

    if (validate_and_append_escaped_utf8_string(raw, escaped_str)) {
        return escaped_str;
    }

    // `raw` wasn't valid UTF-8; whatever was partially written to `escaped_str` is discarded.
    return std::nullopt;
}
27+
28+
auto validate_and_append_escaped_utf8_string(std::string_view src, std::string& dst) -> bool {
29+
string_view::const_iterator next_char_to_copy_it{src.cbegin()};
30+
31+
auto escape_handler = [&](string_view::const_iterator it) -> void {
32+
// Allocate 6 + 1 size buffer to format control characters as "\u00bb", with the last byte
33+
// used by `snprintf` to append '\0'
34+
constexpr size_t cControlCharacterBufSize{7};
35+
std::array<char, cControlCharacterBufSize> buf{};
36+
std::string_view escaped_char;
37+
bool escape_required{true};
38+
switch (*it) {
39+
case '\b':
40+
escaped_char = "\\b";
41+
break;
42+
case '\t':
43+
escaped_char = "\\t";
44+
break;
45+
case '\n':
46+
escaped_char = "\\n";
47+
break;
48+
case '\f':
49+
escaped_char = "\\f";
50+
break;
51+
case '\r':
52+
escaped_char = "\\r";
53+
break;
54+
case '\\':
55+
escaped_char = "\\\\";
56+
break;
57+
case '"':
58+
escaped_char = "\\\"";
59+
break;
60+
default: {
61+
constexpr uint8_t cLargestControlCharacter{0x1F};
62+
auto const byte{static_cast<uint8_t>(*it)};
63+
if (cLargestControlCharacter >= byte) {
64+
std::ignore = snprintf(buf.data(), buf.size(), "\\u00%02x", byte);
65+
escaped_char = {buf.data(), buf.size() - 1};
66+
} else {
67+
escape_required = false;
68+
}
69+
break;
70+
}
71+
}
72+
if (escape_required) {
73+
dst.append(next_char_to_copy_it, it);
74+
dst += escaped_char;
75+
next_char_to_copy_it = it + 1;
76+
}
77+
};
78+
79+
if (false == validate_utf8_string(src, escape_handler)) {
80+
return false;
81+
}
82+
83+
if (src.cend() != next_char_to_copy_it) {
84+
dst.append(next_char_to_copy_it, src.cend());
85+
}
86+
87+
return true;
88+
}
89+
} // namespace clp::ffi

components/core/src/clp/ffi/utils.hpp

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
#ifndef CLP_FFI_UTILS_HPP
2+
#define CLP_FFI_UTILS_HPP
3+
4+
#include <optional>
5+
#include <string>
6+
#include <string_view>
7+
8+
namespace clp::ffi {
9+
/**
10+
* Validates whether the given string is UTF-8 encoded, and escapes any characters to make the
11+
* string compatible with the JSON specification.
12+
* @param raw The raw string to escape.
13+
* @return The escaped string on success.
14+
* @return std::nullopt if the string contains any non-UTF-8-encoded byte sequences.
15+
*/
16+
[[nodiscard]] auto validate_and_escape_utf8_string(std::string_view raw
17+
) -> std::optional<std::string>;
18+
19+
/**
20+
* Validates whether `src` is UTF-8 encoded, and appends `src` to `dst` while escaping any
21+
* characters to make the appended string compatible with the JSON specification.
22+
* @param src The string to validate and escape.
23+
* @param dst Returns `dst` with an escaped version of `src` appended.
24+
* @return Whether `src` is a valid UTF-8-encoded string. NOTE: Even if `src` is not UTF-8 encoded,
25+
* `dst` may be modified.
26+
*/
27+
[[nodiscard]] auto
28+
validate_and_append_escaped_utf8_string(std::string_view src, std::string& dst) -> bool;
29+
} // namespace clp::ffi
30+
31+
#endif // CLP_FFI_UTILS_HPP
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
#include "utf8_utils.hpp"
2+
3+
#include <cstddef>
4+
#include <cstdint>
5+
#include <string_view>
6+
7+
namespace clp {
8+
auto is_utf8_encoded(std::string_view str) -> bool {
9+
auto escape_handler = []([[maybe_unused]] std::string_view::const_iterator it) -> void {};
10+
return validate_utf8_string(str, escape_handler);
11+
}
12+
13+
namespace utf8_utils_internal {
14+
auto parse_and_validate_lead_byte(
15+
uint8_t byte,
16+
size_t& num_continuation_bytes,
17+
uint32_t& code_point,
18+
uint32_t& code_point_lower_bound,
19+
uint32_t& code_point_upper_bound
20+
) -> bool {
21+
if ((byte & cFourByteUtf8CharHeaderMask) == cFourByteUtf8CharHeader) {
22+
num_continuation_bytes = 3;
23+
code_point = (~cFourByteUtf8CharHeaderMask & byte);
24+
code_point_lower_bound = cFourByteUtf8CharCodePointLowerBound;
25+
code_point_upper_bound = cFourByteUtf8CharCodePointUpperBound;
26+
} else if ((byte & cThreeByteUtf8CharHeaderMask) == cThreeByteUtf8CharHeader) {
27+
num_continuation_bytes = 2;
28+
code_point = (~cThreeByteUtf8CharHeaderMask & byte);
29+
code_point_lower_bound = cThreeByteUtf8CharCodePointLowerBound;
30+
code_point_upper_bound = cThreeByteUtf8CharCodePointUpperBound;
31+
} else if ((byte & cTwoByteUtf8CharHeaderMask) == cTwoByteUtf8CharHeader) {
32+
num_continuation_bytes = 1;
33+
code_point = (~cTwoByteUtf8CharHeaderMask & byte);
34+
code_point_lower_bound = cTwoByteUtf8CharCodePointLowerBound;
35+
code_point_upper_bound = cTwoByteUtf8CharCodePointUpperBound;
36+
} else {
37+
return false;
38+
}
39+
return true;
40+
}
41+
42+
auto is_ascii_char(uint8_t byte) -> bool {
43+
return cOneByteUtf8CharCodePointUpperBound >= byte;
44+
}
45+
46+
auto is_valid_utf8_continuation_byte(uint8_t byte) -> bool {
47+
return (byte & cUtf8ContinuationByteMask) == cUtf8ContinuationByteHeader;
48+
}
49+
50+
auto parse_continuation_byte(uint32_t code_point, uint8_t continuation_byte) -> uint32_t {
51+
return (code_point << cUtf8NumContinuationByteCodePointBits)
52+
+ (continuation_byte & cUtf8ContinuationByteCodePointMask);
53+
}
54+
} // namespace utf8_utils_internal
55+
} // namespace clp
Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
#ifndef CLP_UTF8_UTILS_HPP
2+
#define CLP_UTF8_UTILS_HPP
3+
4+
#include <cstddef>
#include <cstdint>
#include <string_view>
#include <type_traits>
7+
8+
namespace clp {
9+
// Constants
10+
// Lead byte signature
11+
constexpr uint8_t cTwoByteUtf8CharHeaderMask{0xE0}; // 0b111x_xxxx
12+
constexpr uint8_t cTwoByteUtf8CharHeader{0xC0}; // 0b110x_xxxx
13+
constexpr uint8_t cThreeByteUtf8CharHeaderMask{0xF0}; // 0b1111_xxxx
14+
constexpr uint8_t cThreeByteUtf8CharHeader{0xE0}; // 0b1110_xxxx
15+
constexpr uint8_t cFourByteUtf8CharHeaderMask{0xF8}; // 0b1111_1xxx
16+
constexpr uint8_t cFourByteUtf8CharHeader{0xF0}; // 0b1111_0xxx
17+
18+
// Code point ranges (inclusive)
19+
constexpr uint32_t cOneByteUtf8CharCodePointLowerBound{0};
20+
constexpr uint32_t cOneByteUtf8CharCodePointUpperBound{0x7F};
21+
constexpr uint32_t cTwoByteUtf8CharCodePointLowerBound{0x80};
22+
constexpr uint32_t cTwoByteUtf8CharCodePointUpperBound{0x7FF};
23+
constexpr uint32_t cThreeByteUtf8CharCodePointLowerBound{0x800};
24+
constexpr uint32_t cThreeByteUtf8CharCodePointUpperBound{0xFFFF};
25+
constexpr uint32_t cFourByteUtf8CharCodePointLowerBound{0x1'0000};
26+
constexpr uint32_t cFourByteUtf8CharCodePointUpperBound{0x10'FFFF};
27+
28+
// Continuation byte
29+
constexpr uint32_t cUtf8ContinuationByteMask{0xC0};
30+
constexpr uint32_t cUtf8ContinuationByteHeader{0x80};
31+
constexpr uint32_t cUtf8ContinuationByteCodePointMask{0x3F};
32+
constexpr uint8_t cUtf8NumContinuationByteCodePointBits{6};
33+
34+
/**
35+
* Validates whether the given string is UTF-8 encoded, optionally escaping ASCII characters using
36+
* the given handler.
37+
* @tparam EscapeHandler Method to optionally escape any ASCII character in the string.
38+
* @param src
39+
* @param escape_handler
40+
* @return Whether the input is a valid UTF-8 encoded string.
41+
*/
42+
template <typename EscapeHandler>
43+
requires std::is_invocable_v<EscapeHandler, std::string_view::const_iterator>
44+
[[nodiscard]] auto validate_utf8_string(std::string_view src, EscapeHandler escape_handler) -> bool;
45+
46+
/**
47+
* @param str
48+
* @return Whether the input is a valid UTF-8 encoded string.
49+
*/
50+
[[nodiscard]] auto is_utf8_encoded(std::string_view str) -> bool;
51+
52+
namespace utf8_utils_internal {
53+
/**
54+
* Validates whether the given byte is a valid lead byte for a multi-byte UTF-8 character, parses
55+
* the byte, and returns the parsed properties as well as associated properties.
56+
* @param byte Byte to validate.
57+
* @param num_continuation_bytes Returns the number of continuation bytes expected.
58+
* @param code_point Returns the code point bits parsed from the lead byte.
59+
* @param code_point_lower_bound Returns the lower bound of the code point range for the UTF-8
60+
* character.
61+
* @param code_point_upper_bound Returns the upper bound of the code point range for the UTF-8
62+
* character.
63+
* @return Whether the input byte is a valid lead byte for a multi-byte UTF-8 character.
64+
*/
65+
[[nodiscard]] auto parse_and_validate_lead_byte(
66+
uint8_t byte,
67+
size_t& num_continuation_bytes,
68+
uint32_t& code_point,
69+
uint32_t& code_point_lower_bound,
70+
uint32_t& code_point_upper_bound
71+
) -> bool;
72+
73+
/**
74+
* @param byte
75+
* @return Whether the given byte is a valid ASCII character.
76+
*/
77+
[[nodiscard]] auto is_ascii_char(uint8_t byte) -> bool;
78+
79+
/**
80+
* @param byte
81+
* @return Whether the input byte is a valid UTF-8 continuation byte.
82+
*/
83+
[[nodiscard]] auto is_valid_utf8_continuation_byte(uint8_t byte) -> bool;
84+
85+
/**
86+
* Parses the code-point bits from the given continuation byte and combines them with the given
87+
* code point.
88+
* @param code_point
89+
* @param continuation_byte
90+
* @return The updated code point.
91+
*/
92+
[[nodiscard]] auto
93+
parse_continuation_byte(uint32_t code_point, uint8_t continuation_byte) -> uint32_t;
94+
} // namespace utf8_utils_internal
95+
96+
template <typename EscapeHandler>
97+
requires std::is_invocable_v<EscapeHandler, std::string_view::const_iterator>
98+
auto validate_utf8_string(std::string_view src, EscapeHandler escape_handler) -> bool {
99+
size_t num_continuation_bytes_to_validate{0};
100+
uint32_t code_point{};
101+
uint32_t code_point_lower_bound{};
102+
uint32_t code_point_upper_bound{};
103+
104+
// NOLINTNEXTLINE(readability-qualified-auto)
105+
for (auto it{src.cbegin()}; it != src.cend(); ++it) {
106+
auto const byte{static_cast<uint8_t>(*it)};
107+
if (0 == num_continuation_bytes_to_validate) {
108+
if (utf8_utils_internal::is_ascii_char(byte)) {
109+
escape_handler(it);
110+
} else if (false
111+
== utf8_utils_internal::parse_and_validate_lead_byte(
112+
byte,
113+
num_continuation_bytes_to_validate,
114+
code_point,
115+
code_point_lower_bound,
116+
code_point_upper_bound
117+
))
118+
{
119+
return false;
120+
}
121+
} else {
122+
if (false == utf8_utils_internal::is_valid_utf8_continuation_byte(byte)) {
123+
return false;
124+
}
125+
code_point = utf8_utils_internal::parse_continuation_byte(code_point, byte);
126+
--num_continuation_bytes_to_validate;
127+
if (0 == num_continuation_bytes_to_validate
128+
&& (code_point < code_point_lower_bound || code_point_upper_bound < code_point))
129+
{
130+
return false;
131+
}
132+
}
133+
}
134+
135+
if (0 != num_continuation_bytes_to_validate) {
136+
// Incomplete UTF-8 character
137+
return false;
138+
}
139+
140+
return true;
141+
}
142+
} // namespace clp
143+
144+
#endif // CLP_UTF8_UTILS_HPP

0 commit comments

Comments
 (0)