Skip to content

Support supplementary planes (U+010000 to U+10FFFF) #11

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 47 additions & 0 deletions ciere/json/parser/grammar.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,62 @@ namespace ciere { namespace json { namespace parser
namespace qi = boost::spirit::qi;
namespace ascii = boost::spirit::ascii;

typedef boost::uint32_t uchar; // a unicode code point

namespace detail
{
// Function object that appends one \uXXXX code point to a string as UTF-8.
// The grammar constructor wraps an instance in boost::phoenix::function so it
// can be called lazily from a semantic action.
//
// The object owns no state of its own: prev_code_point refers to storage
// supplied by the creator. operator() uses that storage to remember a high
// surrogate until the matching low surrogate arrives (0 means "nothing
// pending").
struct push_utf8
{
    // Return-type metafunction: every call signature yields void.
    template <typename Sig>
    struct result { typedef void type; };

    // explicit, so that a bare uchar* can never convert silently into a
    // functor. The pointer is stored, not copied from: the pointee must
    // outlive this object and every copy made of it.
    explicit push_utf8(uchar* code_point): prev_code_point(code_point) {}

    // utf8       - string that receives the encoded bytes
    // code_point - value of the \uXXXX escape just parsed
    // Throws parse_error on an unpaired surrogate.
    void operator()(std::string& utf8, uchar code_point) const;

    // Borrowed pointer to the pending high surrogate. operator() is const
    // and writes only through the pointer, never to the pointer itself.
    mutable uchar* prev_code_point;
};

// Function object that appends the character denoted by a single-letter
// escape (the letter following the backslash) to a string. The grammar
// constructor wraps an instance in boost::phoenix::function.
//
// prev_code_point refers to caller-supplied storage holding a pending high
// surrogate (0 when none); operator() rejects the input with parse_error
// when one is still pending.
struct push_esc
{
    // Return-type metafunction: every call signature yields void.
    template <typename Sig>
    struct result { typedef void type; };

    // explicit, so that a bare uchar* can never convert silently into a
    // functor. The pointer is stored, not copied from: the pointee must
    // outlive this object and every copy made of it.
    explicit push_esc(uchar* code_point): prev_code_point(code_point) {}

    // utf8 - string that receives the unescaped character
    // c    - the escape letter ('"', '\\', '/', 'b', 'f', 'n', 'r' or 't')
    void operator()(std::string& utf8, uchar c) const;

    // Borrowed pointer to the pending high surrogate.
    mutable uchar* prev_code_point;
};

// Function object that appends one literal (unescaped) character to a
// string. The grammar constructor wraps an instance in
// boost::phoenix::function.
//
// prev_code_point refers to caller-supplied storage holding a pending high
// surrogate (0 when none); operator() rejects the input with parse_error
// when one is still pending.
struct push_char
{
    // Return-type metafunction: every call signature yields void.
    template <typename Sig>
    struct result { typedef void type; };

    // explicit, so that a bare uchar* can never convert silently into a
    // functor. The pointer is stored, not copied from: the pointee must
    // outlive this object and every copy made of it.
    explicit push_char(uchar* code_point): prev_code_point(code_point) {}

    // utf8 - string that receives the character
    // c    - the character matched by the grammar
    void operator()(std::string& utf8, uchar c) const;

    // Borrowed pointer to the pending high surrogate.
    mutable uchar* prev_code_point;
};

// Function object run once the closing quote of a string has been matched.
// It takes no arguments and only inspects the shared surrogate state:
// operator() throws parse_error when a high surrogate is still pending
// (*prev_code_point != 0). The grammar constructor wraps an instance in
// boost::phoenix::function.
struct check
{
    // Return-type metafunction: every call signature yields void.
    template <typename Sig>
    struct result { typedef void type; };

    // explicit, so that a bare uchar* can never convert silently into a
    // functor. The pointer is stored, not copied from: the pointee must
    // outlive this object and every copy made of it.
    explicit check(uchar* code_point): prev_code_point(code_point) {}

    void operator()(void) const;

    // Borrowed pointer to the pending high surrogate.
    mutable uchar* prev_code_point;
};

// Qi grammar for a double-quoted string; its synthesized attribute is the
// decoded std::string. The start rule is double_quoted (see the constructor
// in grammar_def.hpp).
template <typename Iterator>
struct unicode_string : qi::grammar<Iterator, std::string()>
{
// Builds the rules and binds the four functors below to &code_point.
unicode_string();
// Both escape rules take the string being built as an inherited attribute
// (std::string&) and append to it instead of synthesizing a value.
qi::rule<Iterator, void(std::string&)> escape;
qi::rule<Iterator, void(std::string&)> char_esc;
// Start rule: opening quote, string body, closing quote.
qi::rule<Iterator, std::string()> double_quoted;
// Pending high surrogate from a \uXXXX escape; initialised to 0, which
// means "none pending".
uchar code_point;
// Functor instances sharing code_point through a pointer. They are copied
// into boost::phoenix::function wrappers in the constructor.
// NOTE(review): the functors hold the address of this object's code_point,
// so a copied grammar would still refer to the original's state --
// presumably the grammar is never copied; confirm.
push_utf8 push_utf8_;
push_esc push_esc_;
push_char push_char_;
check check_;
};
}

Expand Down
98 changes: 64 additions & 34 deletions ciere/json/parser/grammar_def.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,49 +28,75 @@ namespace ciere { namespace json { namespace parser
namespace ascii = boost::spirit::ascii;
namespace phoenix = boost::phoenix;

typedef boost::uint32_t uchar; // a unicode code point

namespace detail
{
struct push_utf8
void push_utf8::operator()(std::string& utf8, uchar code_point) const
{
template <typename Sig>
struct result { typedef void type; };

void operator()(std::string& utf8, uchar code_point) const
{
typedef std::back_insert_iterator<std::string> insert_iter;
insert_iter out_iter(utf8);
boost::utf8_output_iterator<insert_iter> utf8_iter(out_iter);
*utf8_iter++ = code_point;
// Surrogate handling for \uXXXX escapes. *prev_code_point is non-zero only
// while a high surrogate (0xd800-0xdbff) is waiting for its low half.
if (*prev_code_point > 0) {
// A high surrogate is pending, so this escape must be a low surrogate
// (0xdc00-0xdfff).
if (code_point >= 0xdc00 and code_point <= 0xdfff) {
// Combine the pair: the high surrogate supplies the upper 10 bits and the
// low surrogate the lower 10 bits of the offset above 0x10000.
code_point = 0x10000 +
((*prev_code_point - 0xd800) << 10) +
(code_point - 0xdc00);
*prev_code_point = 0;
} else {
// High surrogate not followed by a low surrogate: clear the pending
// state, then reject the input.
*prev_code_point = 0;
throw parse_error();
}
} else {
if (code_point >= 0xd800 and code_point <= 0xdbff) {
// High surrogate: remember it and emit nothing until the low surrogate
// arrives.
*prev_code_point = code_point;
return;
} else if (code_point >= 0xdc00 and code_point <= 0xdfff) {
// Low surrogate with no preceding high surrogate: reject the input.
throw parse_error();
}
}
};
typedef std::back_insert_iterator<std::string> insert_iter;
insert_iter out_iter(utf8);
boost::utf8_output_iterator<insert_iter> utf8_iter(out_iter);
*utf8_iter++ = code_point;
}

struct push_esc
void push_esc::operator()(std::string& utf8, uchar c) const
{
template <typename Sig>
struct result { typedef void type; };

void operator()(std::string& utf8, uchar c) const
if (*prev_code_point > 0)
throw parse_error();
switch (c)
{
switch (c)
{
case '"': utf8 += '"'; break;
case '\\': utf8 += '\\'; break;
case '/': utf8 += '/'; break;
case 'b': utf8 += '\b'; break;
case 'f': utf8 += '\f'; break;
case 'n': utf8 += '\n'; break;
case 'r': utf8 += '\r'; break;
case 't': utf8 += '\t'; break;
}
case '"': utf8 += '"'; break;
case '\\': utf8 += '\\'; break;
case '/': utf8 += '/'; break;
case 'b': utf8 += '\b'; break;
case 'f': utf8 += '\f'; break;
case 'n': utf8 += '\n'; break;
case 'r': utf8 += '\r'; break;
case 't': utf8 += '\t'; break;
}
};
}

// Appends one literal (unescaped) string character to utf8.
//
// utf8 - string being built
// c    - the character matched by the grammar, widened to uchar
//
// Throws parse_error when a high surrogate is still pending
// (*prev_code_point != 0), i.e. the preceding \uXXXX escape was not followed
// by its low surrogate. The pending value is cleared before throwing, as
// push_utf8::operator() does, so that stale state cannot make the next
// string parsed with the same grammar object fail.
void push_char::operator()(std::string& utf8, uchar c) const
{
    if (*prev_code_point > 0) {
        *prev_code_point = 0;
        throw parse_error();
    }
    // Narrow back to char explicitly; std::string has no uchar overload.
    utf8 += static_cast<char>(c);
}

// End-of-string check: throws parse_error when a high surrogate is still
// waiting for its low surrogate (*prev_code_point != 0).
//
// The pending value is cleared before throwing, as push_utf8::operator()
// does, so that stale state cannot make the next string parsed with the same
// grammar object fail.
void check::operator()(void) const
{
    if (*prev_code_point > 0) {
        *prev_code_point = 0;
        throw parse_error();
    }
}

template< typename Iterator >
unicode_string<Iterator>::unicode_string()
: unicode_string::base_type(double_quoted)
: unicode_string::base_type(double_quoted), code_point(0),
push_utf8_(&code_point),
push_esc_(&code_point),
push_char_(&code_point),
check_(&code_point)
{
qi::char_type char_;
qi::_val_type _val;
Expand All @@ -80,13 +106,16 @@ namespace ciere { namespace json { namespace parser
qi::repeat_type repeat;
qi::hex_type hex;
qi::standard::cntrl_type cntrl;
qi::eps_type eps;

using boost::spirit::qi::uint_parser;
using boost::phoenix::function;

uint_parser<uchar, 16, 4, 4> hex4;
function<detail::push_utf8> push_utf8;
function<detail::push_esc> push_esc;
function<detail::push_utf8> push_utf8(push_utf8_);
function<detail::push_esc> push_esc(push_esc_);
function<detail::push_char> push_char(push_char_);
function<detail::check> check(check_);

escape =
('u' > hex4) [push_utf8(_r1, _1)]
Expand All @@ -104,9 +133,10 @@ namespace ciere { namespace json { namespace parser
double_quoted =
'"'
> *( char_esc(_val)
| (char_ - '"' - '\\' - cntrl) [_val += _1]
| (char_ - '"' - '\\' - cntrl) [push_char(_val, _1)]
)
> '"'
> eps[check()]
;

BOOST_SPIRIT_DEBUG_NODE(escape);
Expand Down
7 changes: 7 additions & 0 deletions libs/json/test/construct.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,13 @@ BOOST_AUTO_TEST_CASE(string)
BOOST_CHECK_EQUAL(
json::construct("\"\\u26030\""), std::string("\xe2\x98\x83\x30"));

// U+064321 = D950 DF21 (UTF-16) = F1 A4 8C A1 (UTF-8)
BOOST_CHECK_EQUAL(
json::construct("\"\\ud950\\udf21\""), std::string("\xf1\xa4\x8c\xa1"));
BOOST_CHECK_THROW(json::construct("\"\\ud950\""), json::parse_error);
BOOST_CHECK_THROW(json::construct("\"\\ud9500\""), json::parse_error);
BOOST_CHECK_THROW(json::construct("\"\\udf21\""), json::parse_error);

json::value v;
BOOST_CHECK_EQUAL(json::construct("[8,42.5,true] \"foo\"", v ), true);
BOOST_CHECK_EQUAL(json::construct("[8,42.5,true] \"foo\"", v, false), true);
Expand Down