Skip to content

Commit

Permalink
Extend sax parser to optionally accept position information for parse…
Browse files Browse the repository at this point in the history
…d tokens
  • Loading branch information
barcode authored and raphael-grimm committed Dec 19, 2022
1 parent 7f72eed commit ab6bb9a
Show file tree
Hide file tree
Showing 7 changed files with 2,992 additions and 102 deletions.
230 changes: 200 additions & 30 deletions include/nlohmann/detail/input/binary_reader.hpp

Large diffs are not rendered by default.

14 changes: 11 additions & 3 deletions include/nlohmann/detail/input/lexer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1506,13 +1506,13 @@ class lexer : public lexer_base<BasicJsonType>
while (current == ' ' || current == '\t' || current == '\n' || current == '\r');
}

token_type scan()
bool scan_start()
{
// initially, skip the BOM
if (position.chars_read_total == 0 && !skip_bom())
{
error_message = "invalid BOM; must be 0xEF 0xBB 0xBF if given";
return token_type::parse_error;
return false;
}

// read next character and ignore whitespace
Expand All @@ -1523,13 +1523,17 @@ class lexer : public lexer_base<BasicJsonType>
{
if (!scan_comment())
{
return token_type::parse_error;
return false;
}

// skip following whitespace
skip_whitespace();
}
return true;
}

token_type scan_end()
{
switch (current)
{
// structural characters
Expand Down Expand Up @@ -1593,6 +1597,10 @@ class lexer : public lexer_base<BasicJsonType>
return token_type::parse_error;
}
}
// Scan one token: run the preparation phase (scan_start) and, only if it
// succeeds, the token recognition phase (scan_end).
token_type scan()
{
    if (scan_start())
    {
        return scan_end();
    }

    // preparation failed -> report a parse error without calling scan_end
    return token_type::parse_error;
}

private:
/// input adapter
Expand Down
45 changes: 27 additions & 18 deletions include/nlohmann/detail/input/parser.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,6 @@ class parser
, m_lexer(std::move(adapter), skip_comments)
, allow_exceptions(allow_exceptions_)
{
// read first token
get_token();
}

/*!
Expand All @@ -98,7 +96,7 @@ class parser
sax_parse_internal(&sdp);

// in strict mode, input must be completely read
if (strict && (get_token() != token_type::end_of_input))
if (strict && (get_token(&sdp) != token_type::end_of_input))
{
sdp.parse_error(m_lexer.get_position(),
m_lexer.get_token_string(),
Expand Down Expand Up @@ -126,7 +124,7 @@ class parser
sax_parse_internal(&sdp);

// in strict mode, input must be completely read
if (strict && (get_token() != token_type::end_of_input))
if (strict && (get_token(&sdp) != token_type::end_of_input))
{
sdp.parse_error(m_lexer.get_position(),
m_lexer.get_token_string(),
Expand Down Expand Up @@ -164,7 +162,7 @@ class parser
const bool result = sax_parse_internal(sax);

// strict mode: next byte must be EOF
if (result && strict && (get_token() != token_type::end_of_input))
if (result && strict && (get_token(sax) != token_type::end_of_input))
{
return sax->parse_error(m_lexer.get_position(),
m_lexer.get_token_string(),
Expand All @@ -185,6 +183,8 @@ class parser
// value to avoid a goto (see comment where set to true)
bool skip_to_state_evaluation = false;

// read first token
get_token(sax);
while (true)
{
if (!skip_to_state_evaluation)
Expand All @@ -200,7 +200,7 @@ class parser
}

// closing } -> we are done
if (get_token() == token_type::end_object)
if (get_token(sax) == token_type::end_object)
{
if (JSON_HEDLEY_UNLIKELY(!sax->end_object()))
{
Expand All @@ -222,7 +222,7 @@ class parser
}

// parse separator (:)
if (JSON_HEDLEY_UNLIKELY(get_token() != token_type::name_separator))
if (JSON_HEDLEY_UNLIKELY(get_token(sax) != token_type::name_separator))
{
return sax->parse_error(m_lexer.get_position(),
m_lexer.get_token_string(),
Expand All @@ -233,7 +233,7 @@ class parser
states.push_back(false);

// parse values
get_token();
get_token(sax);
continue;
}

Expand All @@ -245,7 +245,7 @@ class parser
}

// closing ] -> we are done
if (get_token() == token_type::end_array)
if (get_token(sax) == token_type::end_array)
{
if (JSON_HEDLEY_UNLIKELY(!sax->end_array()))
{
Expand Down Expand Up @@ -372,10 +372,10 @@ class parser
if (states.back()) // array
{
// comma -> next value
if (get_token() == token_type::value_separator)
if (get_token(sax) == token_type::value_separator)
{
// parse a new value
get_token();
get_token(sax);
continue;
}

Expand Down Expand Up @@ -405,10 +405,10 @@ class parser
// states.back() is false -> object

// comma -> next value
if (get_token() == token_type::value_separator)
if (get_token(sax) == token_type::value_separator)
{
// parse key
if (JSON_HEDLEY_UNLIKELY(get_token() != token_type::value_string))
if (JSON_HEDLEY_UNLIKELY(get_token(sax) != token_type::value_string))
{
return sax->parse_error(m_lexer.get_position(),
m_lexer.get_token_string(),
Expand All @@ -421,15 +421,15 @@ class parser
}

// parse separator (:)
if (JSON_HEDLEY_UNLIKELY(get_token() != token_type::name_separator))
if (JSON_HEDLEY_UNLIKELY(get_token(sax) != token_type::name_separator))
{
return sax->parse_error(m_lexer.get_position(),
m_lexer.get_token_string(),
parse_error::create(101, m_lexer.get_position(), exception_message(token_type::name_separator, "object separator"), nullptr));
}

// parse values
get_token();
get_token(sax);
continue;
}

Expand Down Expand Up @@ -457,10 +457,19 @@ class parser
}
}

/// get next token from lexer
token_type get_token()
/// get next token from lexer and pass position info to sax (if it is accepted)
template<class SAX>
token_type get_token(SAX* sax)
{
return last_token = m_lexer.scan();
if (!m_lexer.scan_start())
{
last_token = token_type::parse_error;
return token_type::parse_error;
}
detail::sax_call_next_token_start_pos(sax, m_lexer);
last_token = m_lexer.scan_end();
detail::sax_call_next_token_end_pos(sax, m_lexer);
return last_token;
}

std::string exception_message(const token_type expected, const std::string& context)
Expand Down
145 changes: 145 additions & 0 deletions include/nlohmann/detail/meta/is_sax.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,151 @@
NLOHMANN_JSON_NAMESPACE_BEGIN
namespace detail
{
// Helper struct that forwards its arguments to sax->next_token_start.
// (The functionality is wrapped in a type so it can easily be passed around
// as a template argument.)
struct sax_call_next_token_start_pos_direct
{
    // Calls sax->next_token_start(ts...) and hands back its result.
    //
    // sax: handler that receives the call
    // ts:  arguments, perfectly forwarded to next_token_start
    //
    // The trailing return type makes this overload drop out of overload
    // resolution (SFINAE) when SAX has no next_token_start callable with
    // these arguments.
    template<typename SAX, typename... Ts>
    static auto call(SAX* sax, Ts&& ... ts)
    -> decltype(sax->next_token_start(std::forward<Ts>(ts)...))
    {
        // Returning the expression is valid for void as well and prevents
        // flowing off the end of the function when the method returns a value.
        return sax->next_token_start(std::forward<Ts>(ts)...);
    }
};
// Helper struct that forwards its arguments to sax->next_token_end.
// (The functionality is wrapped in a type so it can easily be passed around
// as a template argument.)
struct sax_call_next_token_end_pos_direct
{
    // Calls sax->next_token_end(ts...) and hands back its result.
    //
    // sax: handler that receives the call
    // ts:  arguments, perfectly forwarded to next_token_end
    //
    // The trailing return type makes this overload drop out of overload
    // resolution (SFINAE) when SAX has no next_token_end callable with
    // these arguments.
    template<typename SAX, typename... Ts>
    static auto call(SAX* sax, Ts&& ... ts)
    -> decltype(sax->next_token_end(std::forward<Ts>(ts)...))
    {
        // Returning the expression is valid for void as well and prevents
        // flowing off the end of the function when the method returns a value.
        return sax->next_token_end(std::forward<Ts>(ts)...);
    }
};

// Dispatches a token position notification (next_token_start / next_token_end)
// to a SAX handler and drops the call if the handler does not provide a
// matching method.
//
// Template parameters:
//  - DirectCaller: one of sax_call_next_token_{start,end}_pos_direct; it
//    determines which handler method is called
//  - SAX:          the handler type
//  - LexOrPos:     type of the argument given to call(); std::size_t is
//    treated as a position, every other type is treated as a lexer
//
// Dispatch performed by the call() overloads:
//  - position given, handler accepts a position   -> forward the position
//  - lexer given, handler accepts the lexer       -> forward the lexer
//  - lexer given, handler only accepts a position -> forward a position taken
//    from lex.get_position().chars_read_total
//  - handler accepts neither                      -> do nothing
//
// In every overload SaxT defaults to SAX and is pinned to it via is_same; it
// makes the enable_if condition depend on a template parameter of call()
// itself so that a false condition removes the overload instead of causing a
// hard error.
template <typename DirectCaller, typename SAX, typename LexOrPos>
struct sax_call_function
{
    // true if the parameter is a position (std::size_t) and not a lexer
    static constexpr bool no_lexer = std::is_same<LexOrPos, std::size_t>::value;

    // result type of DirectCaller::call for the given handler / argument types
    template<typename SAX2, typename...Ts2>
    using call_t = decltype(DirectCaller::call(std::declval<SAX2*>(), std::declval<Ts2>()...));

    // the sax parser supports calls with a position
    static constexpr bool detected_call_with_pos =
        is_detected_exact<void, call_t, SAX, std::size_t>::value;

    // the sax parser supports calls with a lexer
    // (probed with `const LexOrPos&`, i.e. exactly the lvalue that call()
    // forwards below, so detection and the actual call cannot disagree on
    // which overload of the handler method is selected)
    static constexpr bool detected_call_with_lex =
        !no_lexer &&
        is_detected_exact<void, call_t, SAX, const LexOrPos&>::value;

    // there either has to be a version accepting a lexer or a position
    static constexpr bool valid = detected_call_with_pos || detected_call_with_lex;

    // called with pos and pos method is supported -> pass data on
    template<typename SaxT = SAX>
    static typename std::enable_if <
    sax_call_function<DirectCaller, SaxT, LexOrPos>::valid &&
    std::is_same<SaxT, SAX>::value &&
    sax_call_function<DirectCaller, SaxT, LexOrPos>::detected_call_with_pos
    >::type
    call(SaxT* sax, std::size_t pos)
    {
        DirectCaller::call(sax, pos);
    }

    // the sax parser has no version of the method -> drop call
    template<typename SaxT = SAX>
    static typename std::enable_if <
    std::is_same<SaxT, SAX>::value &&
    !sax_call_function<DirectCaller, SaxT, LexOrPos>::valid
    >::type
    call(SaxT* /*unused*/, const LexOrPos& /*unused*/) {}

    // called with lex and lex method is supported -> pass data on
    template<typename SaxT = SAX>
    static typename std::enable_if <
    sax_call_function<DirectCaller, SaxT, LexOrPos>::valid &&
    std::is_same<SaxT, SAX>::value &&
    !sax_call_function<DirectCaller, SaxT, LexOrPos>::no_lexer &&
    sax_call_function<DirectCaller, SaxT, LexOrPos>::detected_call_with_lex
    >::type
    call(SaxT* sax, const LexOrPos& lex)
    {
        DirectCaller::call(sax, lex);
    }

    // called with lex and only pos method is supported -> call with position from lexer
    // the start pos in the lexer is last read char -> chars_read_total-1
    // (only enabled when DirectCaller is the "start" caller)
    template<typename SaxT = SAX>
    static typename std::enable_if <
    sax_call_function<DirectCaller, SaxT, LexOrPos>::valid &&
    std::is_same<SaxT, SAX>::value &&
    !sax_call_function<DirectCaller, SaxT, LexOrPos>::no_lexer &&
    !sax_call_function<DirectCaller, SaxT, LexOrPos>::detected_call_with_lex &&
    std::is_same<DirectCaller, sax_call_next_token_start_pos_direct>::value
    >::type
    call(SaxT* sax, const LexOrPos& lex)
    {
        DirectCaller::call(sax, lex.get_position().chars_read_total - 1);
    }

    // called with lex and only pos method is supported -> call with position from lexer
    // the one past end pos in the lexer is the current index -> chars_read_total
    // (only enabled when DirectCaller is the "end" caller)
    template<typename SaxT = SAX>
    static typename std::enable_if <
    sax_call_function<DirectCaller, SaxT, LexOrPos>::valid &&
    std::is_same<SaxT, SAX>::value &&
    !sax_call_function<DirectCaller, SaxT, LexOrPos>::no_lexer &&
    !sax_call_function<DirectCaller, SaxT, LexOrPos>::detected_call_with_lex &&
    std::is_same<DirectCaller, sax_call_next_token_end_pos_direct>::value
    >::type
    call(SaxT* sax, const LexOrPos& lex)
    {
        DirectCaller::call(sax, lex.get_position().chars_read_total);
    }
};

//set the element start pos of a sax parser by calling any version of sax->next_token_start (if available)
template<class SAX, class LexOrPos>
void sax_call_next_token_start_pos(SAX* sax, const LexOrPos& lexOrPos)
{
using call_t = sax_call_function<sax_call_next_token_start_pos_direct, SAX, LexOrPos>;
call_t::call(sax, lexOrPos);
}
//set the element end pos of a sax parser by calling any version of sax->next_token_end (if available)
template<class SAX, class LexOrPos>
void sax_call_next_token_end_pos(SAX* sax, const LexOrPos& lexOrPos)
{
using call_t = sax_call_function<sax_call_next_token_end_pos_direct, SAX, LexOrPos>;
call_t::call(sax, lexOrPos);
}
// Reports both the start and the end position of an element to a sax parser.
// The start is taken from lexOrPos1 and reported first, the end is taken from
// lexOrPos2; each is passed to any available version of
// sax->next_token_start / sax->next_token_end.
template<typename SAX, typename LexOrPos1, typename LexOrPos2>
void sax_call_next_token_start_end_pos(SAX* sax, const LexOrPos1& lexOrPos1, const LexOrPos2& lexOrPos2)
{
    // start notification
    sax_call_next_token_start_pos(sax, lexOrPos1);

    // end notification
    sax_call_next_token_end_pos(sax, lexOrPos2);
}
// Reports both the start and the end position of an element to a sax parser,
// using the same lexer / position as the source for both notifications
// (start first, then end).
template<typename SAX, typename LexOrPos>
void sax_call_next_token_start_end_pos(SAX* sax, const LexOrPos& lexOrPos)
{
    // same source for both ends -> reuse the two-source overload
    sax_call_next_token_start_end_pos(sax, lexOrPos, lexOrPos);
}



template<typename T>
using null_function_t = decltype(std::declval<T&>().null());
Expand Down
Loading

0 comments on commit ab6bb9a

Please sign in to comment.