Skip to content

Commit a815148

Browse files
authored
[Serving][Grammar] Porting the json schema converter from python to C++ (#2112)
[Serve][Grammar] Porting the json schema converter from python to C++

This PR ports the JSON schema converter from Python to C++. It defines the interface:

```
std::string JSONSchemaToEBNF(
    std::string schema,
    std::optional<int> indent = std::nullopt,
    std::optional<std::pair<std::string, std::string>> separators = std::nullopt,
    bool strict_mode = true);
```

and uses it in BNFGrammar::FromSchema. This helps in cases where Python cannot be deployed.
1 parent 7f7c01f commit a815148

File tree

12 files changed

+1205
-847
lines changed

12 files changed

+1205
-847
lines changed

cpp/serve/grammar/grammar.cc

Lines changed: 25 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
#include "grammar_parser.h"
99
#include "grammar_serializer.h"
1010
#include "grammar_simplifier.h"
11+
#include "json_schema_converter.h"
1112

1213
namespace mlc {
1314
namespace llm {
@@ -20,7 +21,7 @@ std::ostream& operator<<(std::ostream& os, const BNFGrammar& grammar) {
2021
return os;
2122
}
2223

23-
BNFGrammar BNFGrammar::FromEBNFString(const String& ebnf_string, const String& main_rule,
24+
BNFGrammar BNFGrammar::FromEBNFString(const std::string& ebnf_string, const std::string& main_rule,
2425
bool normalize, bool simplify) {
2526
auto grammar = EBNFParser::Parse(ebnf_string, main_rule);
2627
if (normalize) {
@@ -34,41 +35,39 @@ TVM_REGISTER_GLOBAL("mlc.serve.BNFGrammarFromEBNFString")
3435
return BNFGrammar::FromEBNFString(ebnf_string, main_rule, normalize, simplify);
3536
});
3637

37-
BNFGrammar BNFGrammar::FromJSON(const String& json_string) {
38+
BNFGrammar BNFGrammar::FromJSON(const std::string& json_string) {
3839
return BNFJSONParser::Parse(json_string);
3940
}
4041

4142
TVM_REGISTER_GLOBAL("mlc.serve.BNFGrammarFromJSON").set_body_typed([](String json_string) {
4243
return BNFGrammar::FromJSON(json_string);
4344
});
4445

45-
BNFGrammar BNFGrammar::FromSchema(const String& schema, int indent,
46-
Optional<Array<String>> separators, bool strict_mode) {
47-
static const PackedFunc* json_schema_to_ebnf = Registry::Get("mlc.serve.json_schema_to_ebnf");
48-
CHECK(json_schema_to_ebnf != nullptr) << "mlc.serve.json_schema_to_ebnf is not registered.";
49-
50-
String ebnf_string;
51-
52-
// Convert the indent parameter to NullOpt for sending it to the PackedFunc.
53-
if (indent == -1) {
54-
// The conversion from TVMRetValue to String is ambiguous, so we call the conversion function
55-
// explicitly
56-
ebnf_string =
57-
((*json_schema_to_ebnf)(schema, Optional<ObjectRef>(NullOpt), separators, strict_mode)
58-
.
59-
operator String());
46+
BNFGrammar BNFGrammar::FromSchema(const std::string& schema, std::optional<int> indent,
47+
std::optional<std::pair<std::string, std::string>> separators,
48+
bool strict_mode) {
49+
return FromEBNFString(JSONSchemaToEBNF(schema, indent, separators, strict_mode));
50+
}
51+
52+
TVM_REGISTER_GLOBAL("mlc.serve.BNFGrammarFromSchema").set_body([](TVMArgs args, TVMRetValue* rv) {
53+
std::optional<int> indent;
54+
if (args[1].type_code() != kTVMNullptr) {
55+
indent = args[1];
6056
} else {
61-
ebnf_string = (*json_schema_to_ebnf)(schema, indent, separators, strict_mode).operator String();
62-
;
57+
indent = std::nullopt;
6358
}
64-
return FromEBNFString(ebnf_string);
65-
}
6659

67-
TVM_REGISTER_GLOBAL("mlc.serve.BNFGrammarFromSchema")
68-
.set_body_typed([](const String& schema, int indent, Optional<Array<String>> separators,
69-
bool strict_mode) {
70-
return BNFGrammar::FromSchema(schema, indent, separators, strict_mode);
71-
});
60+
std::optional<std::pair<std::string, std::string>> separators;
61+
if (args[2].type_code() != kTVMNullptr) {
62+
Array<String> separators_arr = args[2];
63+
CHECK(separators_arr.size() == 2);
64+
separators = std::make_pair(separators_arr[0], separators_arr[1]);
65+
} else {
66+
separators = std::nullopt;
67+
}
68+
69+
*rv = BNFGrammar::FromSchema(args[0], indent, separators, args[3]);
70+
});
7271

7372
const std::string kJSONGrammarString = R"(
7473
main ::= (

cpp/serve/grammar/grammar.h

Lines changed: 17 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#include <tvm/runtime/registry.h>
1212

1313
#include <cstdint>
14+
#include <optional>
1415
#include <string>
1516
#include <vector>
1617

@@ -183,33 +184,38 @@ class BNFGrammar : public ObjectRef {
183184
* \param simplify Whether to simplify the grammar to make matching more efficient. Default: true.
184185
* Not implemented yet.
185186
*/
186-
static BNFGrammar FromEBNFString(const String& ebnf_string, const String& main_rule = "main",
187-
bool normalize = true, bool simplify = true);
187+
static BNFGrammar FromEBNFString(const std::string& ebnf_string,
188+
const std::string& main_rule = "main", bool normalize = true,
189+
bool simplify = true);
188190

189191
/*!
190192
* \brief Construct a BNF grammar from the dumped JSON string.
191193
* \param json_string The JSON-formatted string. This string should have the same format as
192194
* the result of BNFGrammarJSONSerializer::ToString.
193195
*/
194-
static BNFGrammar FromJSON(const String& json_string);
196+
static BNFGrammar FromJSON(const std::string& json_string);
195197

196198
/*!
197199
* \brief Construct a BNF grammar from the json schema string. The schema string should be in the
198200
* format of the schema of a JSON file. We will parse the schema and generate a BNF grammar.
199201
* \param schema The schema string.
200-
* \param indent The number of spaces for indentation. If -1, the output will be in one line.
201-
* Default: -1.
202+
* \param indent The number of spaces for indentation. If set to std::nullopt, the output will be
203+
* in one line. Default: std::nullopt.
202204
* \param separators Two separators used in the schema: comma and colon. Examples: {",", ":"},
203-
* {", ", ": "}. If NullOpt, the default separators will be used: {",", ": "} when the indent
204-
* is not -1, and {", ", ": "} otherwise. Default: NullOpt.
205+
* {", ", ": "}. If std::nullopt, the default separators will be used: {",", ": "} when the
206+
* indent is not std::nullopt, and {", ", ": "} otherwise. This follows the convention in Python's
207+
* json.dumps(). Default: std::nullopt.
205208
* \param strict_mode Whether to use strict mode. In strict mode, the generated grammar will not
206-
* allow unevaluatedProperties and unevaluatedItems, i.e. these will be set to false by default.
209+
* allow properties and items that are not specified in the schema. This is equivalent to
210+
* setting unevaluatedProperties and unevaluatedItems to false.
211+
*
207212
* This helps the LLM generate accurate output in grammar-guided generation with JSON
208213
* schema. Default: true.
209214
*/
210-
static BNFGrammar FromSchema(const String& schema, int indent = -1,
211-
Optional<Array<String>> separators = NullOpt,
212-
bool strict_mode = true);
215+
static BNFGrammar FromSchema(
216+
const std::string& schema, std::optional<int> indent = std::nullopt,
217+
std::optional<std::pair<std::string, std::string>> separators = std::nullopt,
218+
bool strict_mode = true);
213219

214220
/*!
215221
* \brief Get the grammar of standard JSON format. We have built-in support for JSON.

cpp/serve/grammar/grammar_parser.cc

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ namespace serve {
1616
class EBNFParserImpl {
1717
public:
1818
/*! \brief The logic of parsing the grammar string. */
19-
BNFGrammar DoParse(String ebnf_string, String main_rule);
19+
BNFGrammar DoParse(std::string ebnf_string, std::string main_rule);
2020

2121
private:
2222
using Rule = BNFGrammarNode::Rule;
@@ -192,7 +192,7 @@ int32_t EBNFParserImpl::ParseString() {
192192
std::vector<int32_t> character_classes;
193193
while (Peek() && Peek() != '\"') {
194194
if (Peek() == '\r' || Peek() == '\n') {
195-
ThrowParseError("String should not contain newline");
195+
ThrowParseError("There should be no newline character in a string literal");
196196
}
197197
auto [codepoint, len] = Utf8OrEscapeToCodepoint(cur_);
198198
if (codepoint == static_cast<TCodepoint>(CharHandlingError::kInvalidUtf8)) {
@@ -391,7 +391,7 @@ void EBNFParserImpl::ResetStringIterator(const char* cur) {
391391
in_parentheses_ = false;
392392
}
393393

394-
BNFGrammar EBNFParserImpl::DoParse(String ebnf_string, String main_rule) {
394+
BNFGrammar EBNFParserImpl::DoParse(std::string ebnf_string, std::string main_rule) {
395395
ResetStringIterator(ebnf_string.c_str());
396396
BuildRuleNameToId();
397397

@@ -412,12 +412,12 @@ BNFGrammar EBNFParserImpl::DoParse(String ebnf_string, String main_rule) {
412412
return builder_.Get(main_rule);
413413
}
414414

415-
BNFGrammar EBNFParser::Parse(String ebnf_string, String main_rule) {
415+
BNFGrammar EBNFParser::Parse(std::string ebnf_string, std::string main_rule) {
416416
EBNFParserImpl parser;
417417
return parser.DoParse(ebnf_string, main_rule);
418418
}
419419

420-
BNFGrammar BNFJSONParser::Parse(String json_string) {
420+
BNFGrammar BNFJSONParser::Parse(std::string json_string) {
421421
auto node = make_object<BNFGrammarNode>();
422422
auto grammar_json = json::ParseToJsonObject(json_string);
423423
auto rules_json = json::Lookup<picojson::array>(grammar_json, "rules");

cpp/serve/grammar/grammar_parser.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ class EBNFParser {
3737
* \param main_rule The name of the main rule. Default is "main".
3838
* \return The parsed grammar.
3939
*/
40-
static BNFGrammar Parse(String ebnf_string, String main_rule = "main");
40+
static BNFGrammar Parse(std::string ebnf_string, std::string main_rule = "main");
4141

4242
/*!
4343
* \brief The exception thrown when parsing fails.
@@ -58,7 +58,7 @@ class BNFJSONParser {
5858
* \param json_string The JSON string.
5959
* \return The parsed BNF grammar.
6060
*/
61-
static BNFGrammar Parse(String json_string);
61+
static BNFGrammar Parse(std::string json_string);
6262
};
6363

6464
} // namespace serve

cpp/serve/grammar/grammar_serializer.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ std::string BNFGrammarPrinter::PrintCharacterClassStar(const RuleExpr& rule_expr
107107
return PrintRuleExpr(rule_expr[0]) + "*";
108108
}
109109

110-
String BNFGrammarPrinter::ToString() {
110+
std::string BNFGrammarPrinter::ToString() {
111111
std::string result;
112112
auto num_rules = grammar_->NumRules();
113113
for (auto i = 0; i < num_rules; ++i) {
@@ -120,7 +120,7 @@ TVM_REGISTER_GLOBAL("mlc.serve.BNFGrammarToString").set_body_typed([](const BNFG
120120
return BNFGrammarPrinter(grammar).ToString();
121121
});
122122

123-
String BNFGrammarJSONSerializer::ToString() {
123+
std::string BNFGrammarJSONSerializer::ToString() {
124124
picojson::object grammar_json;
125125

126126
picojson::array rules_json;

cpp/serve/grammar/grammar_serializer.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ class BNFGrammarSerializer {
2727
explicit BNFGrammarSerializer(const BNFGrammar& grammar) : grammar_(grammar) {}
2828

2929
/*! \brief Serialize the grammar to string. */
30-
virtual String ToString() = 0;
30+
virtual std::string ToString() = 0;
3131

3232
protected:
3333
const BNFGrammar& grammar_;
@@ -50,7 +50,7 @@ class BNFGrammarPrinter : public BNFGrammarSerializer {
5050
explicit BNFGrammarPrinter(const BNFGrammar& grammar) : BNFGrammarSerializer(grammar) {}
5151

5252
/*! \brief Print the complete grammar. */
53-
String ToString() final;
53+
std::string ToString() final;
5454

5555
/*! \brief Print a rule. */
5656
std::string PrintRule(const Rule& rule);
@@ -102,7 +102,7 @@ class BNFGrammarJSONSerializer : public BNFGrammarSerializer {
102102
* \brief Dump the raw representation of the AST to a JSON file.
103103
* \param prettify Whether to format the JSON string. If false, all whitespaces will be removed.
104104
*/
105-
String ToString() final;
105+
std::string ToString() final;
106106

107107
private:
108108
bool prettify_;

0 commit comments

Comments
 (0)