Skip to content

Commit

Permalink
add regex and json constraints
Browse files Browse the repository at this point in the history
  • Loading branch information
mmoskal committed Oct 22, 2024
1 parent 3955f38 commit 3cbb71b
Show file tree
Hide file tree
Showing 3 changed files with 110 additions and 17 deletions.
14 changes: 14 additions & 0 deletions parser/llguidance.h
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,20 @@ void llg_constraint_init_set_defaults(struct LlgConstraintInit *init,
struct LlgConstraint *llg_new_constraint(const struct LlgConstraintInit *init,
const char *grammar_json);

/**
 * Create a new constraint from a given regular expression.
 *
 * `regex` is read as a NUL-terminated C string and must be valid UTF-8.
 *
 * Always returns a non-null value. Failures (for example invalid UTF-8 in
 * `regex`, or a null tokenizer in `init`) are recorded on the returned
 * object instead of being signalled through the return value.
 * Call llg_get_error() on the result to check for errors.
 */
struct LlgConstraint *llg_new_constraint_regex(const struct LlgConstraintInit *init,
const char *regex);

/**
 * Create a new constraint from a given JSON schema.
 *
 * `json_schema` is read as a NUL-terminated C string; it must be valid UTF-8
 * and must parse as JSON.
 *
 * Always returns a non-null value. Failures (for example invalid UTF-8 or
 * invalid JSON in `json_schema`, a schema that cannot be compiled, or a null
 * tokenizer in `init`) are recorded on the returned object instead of being
 * signalled through the return value.
 * Call llg_get_error() on the result to check for errors.
 */
struct LlgConstraint *llg_new_constraint_json(const struct LlgConstraintInit *init,
const char *json_schema);

/**
* Get the error message from the constraint or null if there is no error.
* After it returns a non-null value, it will always return it until the constraint is freed
Expand Down
27 changes: 27 additions & 0 deletions parser/src/api.rs
Original file line number Diff line number Diff line change
Expand Up @@ -351,3 +351,30 @@ impl Default for ParserLimits {
}
}
}

impl TopLevelGrammar {
    /// Wrap a single regular expression in a complete top-level grammar.
    ///
    /// The result holds exactly one `GrammarWithLexer` whose only node is a
    /// `Node::Lexeme`. `rx` becomes the sole entry of `rx_nodes`, and the
    /// lexeme refers to it as `RegexId(0)` (presumably an index into
    /// `rx_nodes` -- confirm against the `RegexId` definition).
    /// Every optional setting is left unset and the lexer is greedy.
    pub fn from_regex(rx: RegexNode) -> Self {
        // The one and only grammar node: a lexeme bound to regex id 0.
        let lexeme = Node::Lexeme {
            rx: RegexSpec::RegexId(RegexId(0)),
            contextual: None,
            temperature: None,
            props: NodeProps::default(),
            json_string: None,
            json_allowed_escapes: None,
            json_raw: None,
        };

        // `rx` is the only regex registered with this grammar.
        let with_lexer = GrammarWithLexer {
            nodes: vec![lexeme],
            greedy_lexer: true,
            greedy_skip_rx: None,
            contextual: None,
            rx_nodes: vec![rx],
            no_forcing: false,
            allow_initial_skip: false,
            allow_invalid_utf8: false,
        };

        Self {
            grammars: vec![with_lexer],
            max_tokens: None,
            test_trace: false,
        }
    }
}
86 changes: 69 additions & 17 deletions parser/src/ffi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@ use anyhow::{bail, Result};
use toktrie::{InferenceCapabilities, TokEnv, TokRxInfo, TokTrie, TokenizerEnv};

use crate::{
api::{ParserLimits, TopLevelGrammar},
CommitResult, Constraint, Logger, TokenParser,
api::{ParserLimits, RegexNode, TopLevelGrammar},
CommitResult, Constraint, JsonCompileOptions, Logger, TokenParser,
};

struct CTokenizerInner {
Expand Down Expand Up @@ -175,6 +175,17 @@ pub struct LlgConstraint {
last_commit_result: CommitResult,
}

impl Default for LlgConstraint {
    /// An empty constraint handle: no underlying `Constraint`, no recorded
    /// error, and a default `CommitResult`.
    fn default() -> Self {
        Self {
            constraint: None,
            local_error: None,
            // The log buffer starts out as a single NUL byte rather than an
            // empty string (presumably so it can be handed to C as an empty,
            // NUL-terminated string -- confirm against the log accessor).
            last_logs: String::from("\x00"),
            last_commit_result: CommitResult::default(),
        }
    }
}

#[repr(C)]
pub struct LlgMaskResult {
/// One bit per vocab token
Expand Down Expand Up @@ -213,16 +224,38 @@ impl LlgCommitResult {
}
}

fn new_constraint(init: &LlgConstraintInit, grammar_json: *const c_char) -> Result<Constraint> {
if init.tokenizer.is_null() {
bail!("Tokenizer is null");
}
fn new_constraint_regex(init: &LlgConstraintInit, regex: *const c_char) -> Result<Constraint> {
let regex = unsafe { CStr::from_ptr(regex) }
.to_str()
.map_err(|_| anyhow::anyhow!("Invalid UTF-8 in regex"))?;
let grammar = TopLevelGrammar::from_regex(RegexNode::Regex(regex.to_string()));
new_constraint_core(init, grammar)
}

fn new_constraint_json(init: &LlgConstraintInit, json_schema: *const c_char) -> Result<Constraint> {
let json_schema = unsafe { CStr::from_ptr(json_schema) }
.to_str()
.map_err(|_| anyhow::anyhow!("Invalid UTF-8 in json_schema"))?;
let json_schema = serde_json::from_str(json_schema)
.map_err(|e| anyhow::anyhow!("Invalid JSON in json_schema: {e}"))?;
let opts = JsonCompileOptions { compact: false };
let grammar = opts.json_to_llg_no_validate(&json_schema)?;
new_constraint_core(init, grammar)
}

fn new_constraint(init: &LlgConstraintInit, grammar_json: *const c_char) -> Result<Constraint> {
let grammar_json = unsafe { CStr::from_ptr(grammar_json) }
.to_str()
.map_err(|_| anyhow::anyhow!("Invalid UTF-8 in grammar_json"))?;
let grammar: TopLevelGrammar = serde_json::from_str(grammar_json)
.map_err(|e| anyhow::anyhow!("Invalid JSON in grammar_json: {e}"))?;
new_constraint_core(init, grammar)
}

fn new_constraint_core(init: &LlgConstraintInit, grammar: TopLevelGrammar) -> Result<Constraint> {
if init.tokenizer.is_null() {
bail!("Tokenizer is null");
}

let tok_env = unsafe { (&*init.tokenizer).to_env() };
let tok_parser = TokenParser::from_llguidance_json(
Expand Down Expand Up @@ -283,26 +316,45 @@ pub extern "C" fn llg_constraint_init_set_defaults(
};
}

/// Turn the outcome of constraint construction into a heap-allocated handle.
///
/// On success the constraint is stored in the handle; on failure the error's
/// message is recorded with `set_error`. In both cases ownership of the
/// allocation is released through `Box::into_raw`, so the returned pointer is
/// never null.
fn return_constraint(c: Result<Constraint>) -> *mut LlgConstraint {
    let mut handle = Box::new(LlgConstraint::default());

    match c {
        Err(err) => handle.set_error(&err.to_string()),
        Ok(built) => handle.constraint = Some(built),
    }

    Box::into_raw(handle)
}

/// Create a new constraint from a grammar JSON string.
///
/// `grammar_json` is read as a NUL-terminated C string containing the
/// serialized grammar.
///
/// Always returns a non-null value. Any failure while building the constraint
/// is recorded on the returned object rather than signalled through the
/// return value. Call llg_get_error() on the result to check for errors.
#[no_mangle]
pub extern "C" fn llg_new_constraint(
    init: &LlgConstraintInit,
    grammar_json: *const c_char,
) -> *mut LlgConstraint {
    // `return_constraint` creates the handle itself, so no local
    // `LlgConstraint` is constructed here.
    return_constraint(new_constraint(init, grammar_json))
}

match new_constraint(init, grammar_json) {
Ok(constraint) => res.constraint = Some(constraint),
Err(e) => res.set_error(&e.to_string()),
};
/// Create a new constraint from a given regular expression.
///
/// `regex` is read as a NUL-terminated C string.
///
/// Always returns a non-null value. Any failure while building the constraint
/// is recorded on the returned object rather than signalled through the
/// return value. Call llg_get_error() on the result to check for errors.
#[no_mangle]
pub extern "C" fn llg_new_constraint_regex(
    init: &LlgConstraintInit,
    regex: *const c_char,
) -> *mut LlgConstraint {
    let outcome = new_constraint_regex(init, regex);
    return_constraint(outcome)
}

Box::into_raw(Box::new(res))
/// Create a new constraint from a given JSON schema.
///
/// `json_schema` is read as a NUL-terminated C string.
///
/// Always returns a non-null value. Any failure while building the constraint
/// is recorded on the returned object rather than signalled through the
/// return value. Call llg_get_error() on the result to check for errors.
#[no_mangle]
pub extern "C" fn llg_new_constraint_json(
    init: &LlgConstraintInit,
    json_schema: *const c_char,
) -> *mut LlgConstraint {
    let outcome = new_constraint_json(init, json_schema);
    return_constraint(outcome)
}

/// Get the error message from the constraint or null if there is no error.
Expand Down

0 comments on commit 3cbb71b

Please sign in to comment.