Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Unicode escaping #22

Merged
merged 7 commits into from
Jan 11, 2015
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
Quick and dirty implementation for basic multilingual plane in the un…
…icode escape mechanism
  • Loading branch information
Teemperor committed Jan 10, 2015
commit 222aacc213c0b34d275a119df8e7cabb44993af2
73 changes: 73 additions & 0 deletions src/json.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2073,6 +2073,9 @@ std::string json::parser::parseString()
result += '\n';
} else if (currentChar == 'r') {
result += '\r';
} else if (currentChar == 'u') {
pos_++;
result += parseUnicodeEscape();
} else {
error("expected one of \\,/,b,f,n,r,t behind backslash.");
}
Expand Down Expand Up @@ -2118,6 +2121,76 @@ std::string json::parser::parseString()
error("expected '\"'");
}

std::string json::parser::unicodeToUTF8(unsigned int codepoint) {

// it's just a ASCII compatible codepoint,
// so we just interpret the point as a character
if (codepoint <= 0x7f) {
return std::string(1, static_cast<char>(codepoint));
}
else if (codepoint <= 0x7ff)
{
std::string result(2, static_cast<char>(0xc0 | ((codepoint >> 6) & 0x1f)));
result[1] = static_cast<char>(0x80 | (codepoint & 0x3f));
return result;
}
else if (codepoint <= 0xffff)
{
std::string result(3, static_cast<char>(0xe0 | ((codepoint >> 12) & 0x0f)));
result[1] = static_cast<char>(0x80 | ((codepoint >> 6) & 0x3f));
result[2] = static_cast<char>(0x80 | (codepoint & 0x3f));
return result;
}
else if (codepoint <= 0x1fffff)
{
std::string result(4, static_cast<char>(0xf0 | ((codepoint >> 18) & 0x07)));
result[1] = static_cast<char>(0x80 | ((codepoint >> 12) & 0x3f));
result[2] = static_cast<char>(0x80 | ((codepoint >> 6) & 0x3f));
result[3] = static_cast<char>(0x80 | (codepoint & 0x3f));
return result;
} else {
std::string errorMessage = "Invalid codepoint: ";
errorMessage += codepoint;
error(errorMessage);
}
}

/*!
Parses the four hexadecimal digits of a JSON unicode escape sequence (\uXXXX)
and converts the code point they denote to UTF-8.

@return the UTF-8 encoding of the escaped code point

@pre pos_ is the index of the first of the four hexadecimal digits, that is,
     the character directly following the \p u of the escape sequence.

@post pos_ is the index of the last hexadecimal digit of the sequence.
*/
std::string json::parser::parseUnicodeEscape() {
    const auto firstDigit = pos_;

    // all four digits have to lie inside the buffer
    if (firstDigit + 3 >= buffer_.size()) {
        error("Got end of input while parsing unicode escape sequence \\uXXXX");
    }

    std::string digits(4, ' ');
    while (pos_ < firstDigit + 4) {
        const char c = buffer_[pos_];
        const bool isDecimal = (c >= '0' && c <= '9');
        const bool isLowerHex = (c >= 'a' && c <= 'f');
        const bool isUpperHex = (c >= 'A' && c <= 'F');

        if (!(isDecimal || isLowerHex || isUpperHex)) {
            // error() does not return, so pos_ stays on the offending character
            error("Found non-hexadecimal character in unicode escape sequence!");
        }

        // valid hexadecimal digit: remember it for the conversion below
        digits[pos_ - firstDigit] = c;
        pos_++;
    }

    // step back onto the last digit of the escape sequence
    pos_--;

    // the cast is safe as 4 hex characters can't represent more than 16 bits
    const unsigned long value = std::stoul(digits, nullptr, 16);
    return unicodeToUTF8(static_cast<unsigned int>(value));
}



/*!
This function is called in case a \p "t" is read in the main parse function
@ref parse. In the standard, the \p "true" token is the only candidate, so the
Expand Down
4 changes: 4 additions & 0 deletions src/json.h
Original file line number Diff line number Diff line change
Expand Up @@ -418,6 +418,10 @@ class json
inline void error(const std::string&) __attribute__((noreturn));
/// parse a quoted string
inline std::string parseString();
/// transforms a unicode codepoint to its UTF-8 representation
inline std::string unicodeToUTF8(unsigned int codepoint);
/// parses a unicode escape sequence
inline std::string parseUnicodeEscape();
/// parse a Boolean "true"
inline void parseTrue();
/// parse a Boolean "false"
Expand Down
4 changes: 4 additions & 0 deletions test/json_unit.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1652,6 +1652,10 @@ TEST_CASE("Parser")
CHECK(json::parse("\"a\\nz\"") == json("a\nz"));
CHECK(json::parse("\"\\n\"") == json("\n"));

// escape unicode characters
CHECK(json::parse("\"\\u002F\"") == json("/"));
CHECK(json::parse("\"\\u00E4\"") == json(u8"\u00E4"));

// escaping senseless stuff
CHECK_THROWS_AS(json::parse("\"\\z\""), std::invalid_argument);
CHECK_THROWS_AS(json::parse("\"\\ \""), std::invalid_argument);
Expand Down