Add quoted_string, to automate probably the most commonly-written par…

…ser of all time -- the quoted string.
tzlaine · Mar 11, 2024 · 24288a0 · 24288a0
1 parent 824a208
commit 24288a0
Show file tree

Hide file tree

Showing 10 changed files with 782 additions and 49 deletions.
diff --git a/doc/parser.qbk b/doc/parser.qbk
@@ -41,6 +41,7 @@
 [import ../example/user_error_handler.cpp]
 [import ../test/parser.cpp]
 [import ../test/parser_rule.cpp]
+[import ../test/parser_quoted_string.cpp]
 
 [import ../include/boost/parser/concepts.hpp]
 [import ../include/boost/parser/error_handling_fwd.hpp]
@@ -218,6 +219,8 @@
 [def _lower_               [globalref boost::parser::lower `lower`]]
 [def _upper_               [globalref boost::parser::upper `upper`]]
 
+[def _quot_str_            [globalref boost::parser::quoted_string `quoted_string`]]
+
 [def _RES_                 ['[^RESOLVE]]`()`]
 [def _RES_np_              ['[^RESOLVE]]]
 [def _ATTR_                ['[^ATTR]]`()`]

diff --git a/doc/tables.qbk b/doc/tables.qbk
@@ -38,6 +38,8 @@ itself be used as a parser; it must be called.  In the table below:
 
 * `p`, `p1`, `p2`, ... are parsers.
 
+* `symbols` is a _symbols_t_ object, where `T` is `char` or `char32_t`.
+
 [note The definition of `parsable_range_like` is:
 
 [parsable_range_like_concept]
@@ -326,6 +328,31 @@ the input they match unless otherwise stated in the table below.]
      [ _symbols_ is an associative container of key, value pairs.  Each key is a _std_str_ and each value has type `T`.  In the Unicode parsing path, the strings are considered to be UTF-8 encoded; in the non-Unicode path, no encoding is assumed.  _symbols_ matches the longest prefix `pre` of the input that is equal to one of the keys `k`.  If the length `len` of `pre` is zero, and there is no zero-length key, it does not match the input.  If `len` is positive, the generated attribute is the value associated with `k`.]
      [ `T` ]
      [ Unlike the other entries in this table, _symbols_ is a type, not an object. ]]
+
+    [[ _quot_str_ ]
+     [ Matches `'"'`, followed by zero or more characters, followed by `'"'`. ]
+     [ _std_str_ ]
+     [ The result does not include the quotes.  A quote within the string can be written by escaping it with a backslash.  A backslash within the string can be written by writing two consecutive backslashes.  Any other use of a backslash will fail the parse.  Skipping is disabled while parsing the entire string, as if using _lexeme_. ]]
+
+    [[ `_quot_str_(c)` ]
+     [ Matches `c`, followed by zero or more characters, followed by `c`. ]
+     [ _std_str_ ]
+     [ The result does not include the `c` quotes.  A `c` within the string can be written by escaping it with a backslash.  A backslash within the string can be written by writing two consecutive backslashes.  Any other use of a backslash will fail the parse.  Skipping is disabled while parsing the entire string, as if using _lexeme_. ]]
+
+    [[ `_quot_str_(r)` ]
+     [ Matches some character `Q` in `r`, followed by zero or more characters, followed by `Q`. ]
+     [ _std_str_ ]
+     [ The result does not include the `Q` quotes.  A `Q` within the string can be written by escaping it with a backslash.  A backslash within the string can be written by writing two consecutive backslashes.  Any other use of a backslash will fail the parse.  Skipping is disabled while parsing the entire string, as if using _lexeme_. ]]
+
+    [[ `_quot_str_(c, symbols)` ]
+     [ Matches `c`, followed by zero or more characters, followed by `c`. ]
+     [ _std_str_ ]
+     [ The result does not include the `c` quotes.  A `c` within the string can be written by escaping it with a backslash.  A backslash within the string can be written by writing two consecutive backslashes.  A backslash followed by a successful match using `symbols` will be interpreted as the corresponding value produced by `symbols`.  Any other use of a backslash will fail the parse.  Skipping is disabled while parsing the entire string, as if using _lexeme_. ]]
+
+    [[ `_quot_str_(r, symbols)` ]
+     [ Matches some character `Q` in `r`, followed by zero or more characters, followed by `Q`. ]
+     [ _std_str_ ]
+     [ The result does not include the `Q` quotes.  A `Q` within the string can be written by escaping it with a backslash.  A backslash within the string can be written by writing two consecutive backslashes.  A backslash followed by a successful match using `symbols` will be interpreted as the corresponding value produced by `symbols`.  Any other use of a backslash will fail the parse.  Skipping is disabled while parsing the entire string, as if using _lexeme_. ]]
 ]
 
 [important All the character parsers, like _ch_, _cp_ and _cu_ produce either

diff --git a/doc/tutorial.qbk b/doc/tutorial.qbk
@@ -738,6 +738,106 @@ no sense.
 
 [endsect]
 
+[section Alternative Parsers]
+
+Frequently, you need to parse something that might have one of several forms.
+`operator|` is overloaded to form alternative parsers.  For example:
+
+    namespace bp = boost::parser;
+    auto const parser_1 = bp::int_ | bp::eps;
+
+`parser_1` matches an integer, or if that fails, it matches /epsilon/, the
+empty string.  This is equivalent to writing:
+
+    namespace bp = boost::parser;
+    auto const parser_2 = -bp::int_;
+
+However, neither `parser_1` nor `parser_2` is equivalent to writing this:
+
+    namespace bp = boost::parser;
+    auto const parser_3 = bp::eps | bp::int_; // Does not do what you think.
+
+The reason is that alternative parsers try each of their subparsers, one at a
+time, and stop on the first one that matches.  /Epsilon/ matches anything,
+since it is zero length and consumes no input.  It even matches the end of
+input.  This means that `parser_3` is equivalent to _e_ by itself.
+
+[note For this reason, writing `_e_ | p` for any parser `p` is considered a bug.
+Debug builds will assert when `_e_ | p` is encountered. ]
+
+[warning This kind of error is very common when _e_ is involved, and also very
+easy to detect.  However, it is possible to write `P1 | P2`, where `P1` is a
+prefix of `P2`, such as `int_ | int_ >> int_`, or `repeat(4)[hex_digit] |
+repeat(8)[hex_digit]`.  This is almost certainly an error, but is impossible
+to detect in the general case _emdash_ remember that _rs_ can be separately
+compiled, and consider a pair of rules whose associated `_def` parsers are
+`int_` and `int_ >> int_`, respectively.]
+
+[endsect]
+
+[section Parsing Quoted Strings]
+
+It is very common to need to parse quoted strings.  Quoted strings are
+slightly tricky, though, when using a skipper (and you should be using a
+skipper 99% of the time).  You don't want to allow arbitrary whitespace in the
+middle of your strings, and you also don't want to remove all whitespace from
+your strings.  Both of these things will happen with the typical skipper,
+_ws_.
+
+So, here is how most people would write a quoted string parser:
+
+    namespace bp = boost::parser;
+    const auto string = bp::lexeme['"' >> *(bp::char_ - '"') > '"'];
+
+Some things to note:
+
+* the result is a string;
+
+* the quotes are not included in the result;
+
+* there is an expectation point before the close-quote;
+
+* the use of _lexeme_ disables skipping in the parser, and it must be written
+  around the quotes, not around the `operator*` expression; and
+
+* there's no way to write a quote in the middle of the string.
+
+This is a very common pattern.  I have written a quoted string parser like
+this dozens of times.  The parser above is the quick-and-dirty version.  A
+more robust version would be able to handle escaped quotes within the string,
+and then would immediately also need to support escaped escape characters.
+
+_Parser_ provides _quot_str_ to use in place of this very common pattern.  It
+supports escaping of the quote character and of the escape character itself,
+using backslash as the escape character.
+
+[quoted_string_example_1_2]
+
+As common as this use case is, there are very similar use cases that it does
+not cover.  So, _quot_str_ has some options.  If you call it with a single
+character, it returns a _quot_str_ that uses that single character as the
+quote-character.
+
+[quoted_string_example_3]
+
+You can also supply a range of characters.  One of the characters from the
+range must quote the whole string; mismatches are not allowed.  Think of how
+Python allows you to quote a string with either `'"'` or `'\''`, but the same
+character must be used on both sides.
+
+[quoted_string_example_4]
+
+Another common thing to do in a quoted string parser is to recognize escape
+sequences.  If you have simple escape sequences that do not require any real
+parsing, like say the simple escape sequences from C++, you can provide a
+_symbols_ object as well.  The template parameter `T` to _symbols_t_ must be
+`char` or `char32_t`.  You don't need to include the escaped backslash or the
+escaped quote character, since those always work.
+
+[quoted_string_example_5]
+
+[endsect]
+
 [section Parsing In Detail]
 
 Now that you've seen some examples, let's see how parsing works in a bit more
@@ -1052,43 +1152,6 @@ for more information.]
 
 [endsect]
 
-[section Alternative Parsers]
-
-Frequently, you need to parse something that might have one of several forms.
-`operator|` is overloaded to form alternative parsers.  For example:
-
-    namespace bp = boost::parser;
-    auto const parser_1 = bp::int_ | bp::eps;
-
-`parser_1` matches an integer, or if that fails, it matches /epsilon/, the
-empty string.  This is equivalent to writing:
-
-    namespace bp = boost::parser;
-    auto const parser_2 = -bp::int_;
-
-However, neither `parser_1` nor `parser_2` is equivalent to writing this:
-
-    namespace bp = boost::parser;
-    auto const parser_3 = bp::eps | bp::int_; // Does not do what you think.
-
-The reason is that alternative parsers try each of their subparsers, one at a
-time, and stop on the first one that matches.  /Epsilon/ matches anything,
-since it is zero length and consumes no input.  It even matches the end of
-input.  This means that `parser_3` is equivalent to _e_ by itself.
-
-[note For this reason, writing `_e_ | p` for any parser p is considered a bug.
-Debug builds will assert when `_e_ | p` is encountered. ]
-
-[warning This kind of error is very common when _e_ is involved, and also very
-easy to detect.  However, it is possible to write `P1 >> P2`, where `P1` is a
-prefix of `P2`, such as `int_ | int >> int_`, or `repeat(4)[hex_digit] |
-repeat(8)[hex_digit]`.  This is almost certainly an error, but is impossible
-to detect in the general case _emdash_ remember that _rs_ can be separately
-compiled, and consider a pair of rules whose associated `_def` parsers are
-`int_` and `int_ >> int_`, respectively.]
-
-[endsect]
-
 [section The Parsers And Their Uses]
 
 _Parser_ comes with all the parsers most parsing tasks will ever need.  Each

diff --git a/include/boost/parser/detail/printing.hpp b/include/boost/parser/detail/printing.hpp
@@ -278,6 +278,13 @@ namespace boost { namespace parser { namespace detail {
         std::ostream & os,
         int components = 0);
 
+    template<typename Context, typename Quotes, typename Escapes>
+    void print_parser( // Declares the print_parser overload taking a quoted_string_parser.
+        Context const & context,
+        quoted_string_parser<Quotes, Escapes> const & parser,
+        std::ostream & os,
+        int components = 0);
+
     template<typename Context, bool NewlinesOnly, bool NoNewlines>
     void print_parser(
         Context const & context,

diff --git a/include/boost/parser/detail/printing_impl.hpp b/include/boost/parser/detail/printing_impl.hpp
@@ -482,8 +482,7 @@ namespace boost { namespace parser { namespace detail {
     template<
         typename Context,
         typename ResolvedExpected,
-        bool Integral = std::is_integral<ResolvedExpected>{},
-        int SizeofExpected = sizeof(ResolvedExpected)>
+        bool Integral = std::is_integral<ResolvedExpected>{}>
     struct print_expected_char_impl
     {
         static void call(
@@ -495,13 +494,17 @@ namespace boost { namespace parser { namespace detail {
         }
     };
 
-    template<typename Context, typename Expected>
-    struct print_expected_char_impl<Context, Expected, true, 4>
+    template<typename Context>
+    struct print_expected_char_impl<Context, char32_t, true>
     {
         static void
-        call(Context const & context, std::ostream & os, Expected expected)
+        call(Context const & context, std::ostream & os, char32_t expected)
         {
-            std::array<char32_t, 1> cps = {{(char32_t)expected}};
+            if (expected == '\'') {
+                os << "'\\''";
+                return;
+            }
+            std::array<char32_t, 1> cps = {{expected}};
             auto const r = cps | text::as_utf8;
             os << "'";
             for (auto c : r) {
@@ -689,6 +692,27 @@ namespace boost { namespace parser { namespace detail {
         os << "\"";
     }
 
+    template<typename Context, typename Quotes, typename Escapes>
+    void print_parser( // Writes a textual "quoted_string(...)" description of `parser` to `os`.
+        Context const & context, // Only forwarded to print_expected_char_impl in the first branch.
+        quoted_string_parser<Quotes, Escapes> const & parser,
+        std::ostream & os,
+        int components) // Not read anywhere in this body.
+    {
+        os << "quoted_string(";
+        if constexpr (is_nope_v<Quotes>) { // is_nope_v<Quotes> holds (presumably no quote range was given -- confirm): print `ch_`.
+            detail::print_expected_char_impl<Context, char32_t>::call(
+                context, os, parser.ch_);
+        } else { // Otherwise print each element of `chs_`, viewed through text::as_utf8, between double quotes.
+            os << '"';
+            for (auto c : parser.chs_ | text::as_utf8) {
+                detail::print_char(os, c);
+            }
+            os << '"';
+        }
+        os << ')'; // Note: nothing related to `Escapes` is written by this overload.
+    }
+
     template<typename Context, bool NewlinesOnly, bool NoNewlines>
     void print_parser(
         Context const & context,