Skip to content

Commit 3103ee0

Browse files
committed
feat(cbork): First cddl parser code
1 parent ecc2d19 commit 3103ee0

File tree

13 files changed

+1020
-0
lines changed

13 files changed

+1020
-0
lines changed

hermes/crates/cbork/Cargo.toml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
[workspace]
2+
3+
members = [
4+
"cddl-parser",
5+
]
6+
7+
[workspace.dependencies]
8+
derive_more = "0.99.17"

hermes/crates/cbork/README.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# cbork
2+
3+
CBOR Kit
4+
5+
We need to support the parsing of CDDL in the following priority sequence.
6+
Each needs to be complete before extending with the subsequent specification extension.
7+
We do not need to handle choosing which extensions are enabled.
8+
9+
1. CDDL Spec: <https://www.rfc-editor.org/rfc/rfc8610>
10+
2. Errata to include: <https://www.ietf.org/archive/id/draft-ietf-cbor-update-8610-grammar-01.html>
11+
3. Extensions: <https://www.rfc-editor.org/rfc/rfc9165>
12+
4. Modules: <https://cbor-wg.github.io/cddl-modules/draft-ietf-cbor-cddl-modules.html> and <https://github.com/cabo/cddlc>
13+
14+
There are semantic rules about well-formed CDDL files that are not enforced by the grammar.
15+
The full parser will also need to validate those rules.
16+
The primary rule is that the very first definition in the file is the base type.
17+
18+
We should also be able to detect if there are orphaned definitions in the CDDL file.
19+
20+
There may be other checks we need to perform on the parsed AST for validity.
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
[package]
2+
name = "cddl-parser"
3+
version = "0.1.0"
4+
edition = "2021"
5+
6+
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
7+
8+
[dependencies]
9+
derive_more = { workspace = true }
10+
11+
pest = { version = "2.7.2", features = ["std", "pretty-print", "memchr", "const_prec_climber"] }
12+
pest_derive = { version = "2.7.2", features = ["grammar-extras"] }
13+
Lines changed: 187 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,187 @@
1+
//! CDDL Grammar adapted from RFC8610 Appendix B
2+
//! https://www.rfc-editor.org/rfc/rfc8610#appendix-B
3+
4+
5+
cddl = {
6+
SOI
7+
~ S ~ rule+
8+
~ EOI
9+
}
10+
11+
rule = {
12+
( typename ~ assignt ~ type)
13+
| ( groupname ~ assigng ~ grpent)
14+
}
15+
16+
typename = ${ id ~ genericparm? }
17+
groupname = ${ id ~ genericparm? }
18+
19+
assignt = { "=" | "/=" }
20+
assigng = { "=" | "//=" }
21+
22+
genericparm = { "<" ~ id ~ ( "," ~ id )* ~ ">" }
23+
genericarg = { "<" ~ type1 ~ ( "," ~ type1)* ~ ">" }
24+
25+
type = { type1 ~ ( S ~ "/" ~ type1)* }
26+
27+
type1 = { type2 ~ ( S ~ ( rangeop | ctlop ) ~ type2)? }
28+
29+
typename_arg = ${ typename ~ genericarg? }
30+
groupname_arg = ${ groupname ~ genericarg? }
31+
32+
tag6 = ${ "#" ~ "6" ~ ("." ~ uint)? ~ "(" ~ S ~ type ~ S ~ ")" }
33+
tag_generic = ${ "#" ~ ASCII_DIGIT ~ ("." ~ uint)? }
34+
35+
type2 = {
36+
value
37+
| typename_arg
38+
| ( "(" ~ type ~ ")" )
39+
| ( "{" ~ group ~ "}" )
40+
| ( "[" ~ group ~ "]" )
41+
| ( "~" ~ typename_arg )
42+
| ( "&" ~ "(" ~ group ~ ")" )
43+
| ( "&" ~ groupname_arg )
44+
| tag6
45+
| tag_generic
46+
| "#"
47+
}
48+
49+
rangeop = { "..." | ".." }
50+
ctlop = ${ "." ~ id }
51+
52+
group = { grpchoice ~ ( S ~ "//" ~ grpchoice)* }
53+
54+
grpchoice = { ( grpent ~ ","? )* }
55+
56+
grpent = ${
57+
( (occur ~ S)? ~ (memberkey ~ S)? ~ type )
58+
| ( (occur ~ S)? ~ groupname ~ genericarg? )
59+
| ( (occur ~ S)? ~ "(" ~ S ~ group ~ S ~ ")" )
60+
}
61+
62+
memberkey = {
63+
( type1 ~ "^"? ~ "=>" )
64+
| ( bareword ~ ":" )
65+
| ( value ~ ":" )
66+
}
67+
68+
bareword = { id }
69+
70+
occur = {
71+
( uint? ~ "*" ~ uint? )
72+
| "+"
73+
| "?"
74+
}
75+
76+
// -----------------------------------------------------------------------------
77+
// Literal Values
78+
79+
/// All Literal Values
80+
value = { number | text | bytes }
81+
82+
/// Literal Numbers - A float if it has fraction or exponent; int otherwise
83+
number = { hexfloat | intfloat }
84+
85+
/// Hex floats of the form -0x123.abc0p+12
86+
hexfloat = ${ "-"? ~ "0x" ~ ASCII_HEX_DIGIT+ ~ ("." ~ ASCII_HEX_DIGIT+)? ~ "p" ~ exponent }
87+
88+
/// Ints or Int floats
89+
intfloat = ${ int ~ ("." ~ fraction)? ~ ("e" ~ exponent)? }
90+
91+
/// Fractional part of a number
92+
fraction = ${ ASCII_DIGIT+ }
93+
94+
/// Exponent for a number
95+
/// RFC 8610 Appendix B defines `exponent = ["+"/"-"] 1*DIGIT`: the sign is optional,
/// so unsigned exponents such as `1e10` or `0x1p4` must be accepted.
exponent = ${ ("+" | "-")? ~ ASCII_DIGIT+ }
96+
97+
/// All integers, signed and unsigned
98+
int = ${ "-"? ~ uint }
99+
100+
101+
/// Unsigned Integers
102+
uint = ${
103+
( ASCII_NONZERO_DIGIT ~ ASCII_DIGIT* )
104+
| ( "0x" ~ ASCII_HEX_DIGIT+ )
105+
| ( "0b" ~ ASCII_BIN_DIGIT+ )
106+
| "0"
107+
}
108+
109+
/// Literal Text
110+
text = ${ "\"" ~ SCHAR* ~ "\"" }
111+
112+
/// Literal Bytes - Note CDDL Spec incorrectly defines b64''.
113+
bytes = ${ bytes_hex | bytes_b64 | bytes_text }
114+
bytes_hex = ${ "h" ~ "'" ~ HEX_PAIR* ~ "'" }
115+
bytes_b64 = ${ "b64" ~ "'" ~ URL_BASE64 ~ "'" }
116+
bytes_text = ${ "'" ~ BCHAR* ~ "'" }
117+
118+
// -----------------------------------------------------------------------------
119+
// Simple multiple character sequences
120+
121+
/// identifier, called the `name` in the CDDL spec.
122+
id = ${
123+
group_socket |
124+
type_socket |
125+
name
126+
}
127+
128+
/// Special form of a name that represents a Group Socket.
129+
group_socket = ${ "$$" ~ ( ( "-" | "." )* ~ NAME_END )* }
130+
/// Special form of a name that represents a Type Socket.
131+
type_socket = ${ "$" ~ ( ( "-" | "." )* ~ NAME_END )* }
132+
/// General form of a name.
133+
name = ${ NAME_START ~ ( ( "-" | "." )* ~ NAME_END )* }
134+
135+
/// A pair of hex digits. (Must always have even numbers of hex digits.)
136+
HEX_PAIR = _{ S ~ ASCII_HEX_DIGIT ~ S ~ ASCII_HEX_DIGIT ~ S }
137+
138+
/// Whitespace is allowed and is ignored.
139+
/// This token will keep the whitespace, so it will need to be handled when converted to binary.
140+
URL_BASE64 = _{ S ~ ( URL_BASE64_ALPHA ~ S)* ~ URL_BASE64_PAD? }
141+
142+
143+
// -----------------------------------------------------------------------------
144+
// Characters, Whitespace and Comments
145+
146+
S = _{ WHITESPACE* }
147+
WHITESPACE = _{ " " | "\t" | NEWLINE }
148+
COMMENT = _{ ";" ~ (PCHAR | "\t")* ~ NEWLINE }
149+
150+
// URL Base64 character set.
151+
URL_BASE64_ALPHA = _{ ASCII_ALPHA | ASCII_DIGIT | "-" | "_" }
152+
// Optional Padding that goes at the end of Base64.
153+
URL_BASE64_PAD = _{ "~" }
154+
155+
// Identifier Name Character sets.
156+
157+
/// A name can start with an alphabetic character (including "@", "_", "$")
158+
/// The body of the name can consist of any of the characters from the set
159+
/// {"A" to "Z", "a" to "z", "0" to "9", "_", "-", "@", ".", "$"}
160+
// NAME_BODY = _{ NAME_END | "-" | "." } -- Unused Rule
161+
NAME_START = _{ ASCII_ALPHA | "@" | "_" | "$" }
162+
/// A name can end with an alphabetic character (including "@", "_", "$") or a digit.
163+
NAME_END = _{ NAME_START | ASCII_DIGIT }
164+
165+
/// All Visible characters.
166+
PCHAR = _{ ASCII_VISIBLE | UNICODE_CHAR }
167+
168+
/// The set of characters valid for a text string.
169+
SCHAR = _{ SCHAR_ASCII_VISIBLE | UNICODE_CHAR | SESC }
170+
171+
/// The set of characters valid for a byte string.
172+
BCHAR = _{ BCHAR_ASCII_VISIBLE | UNICODE_CHAR | SESC | NEWLINE }
173+
174+
/// Escaping code to allow invalid characters to be used in text or byte strings.
175+
SESC = ${ "\\" ~ (ASCII_VISIBLE | UNICODE_CHAR) }
176+
177+
/// All visible ASCII characters.
178+
ASCII_VISIBLE = _{ ' '..'~' }
179+
180+
/// Ascii subset valid for text strings.
181+
SCHAR_ASCII_VISIBLE = _{ ' '..'!' | '#'..'[' | ']'..'~' }
182+
183+
/// Ascii subset valid for byte text strings.
184+
BCHAR_ASCII_VISIBLE = _{ ' '..'&' | '('..'[' | ']'..'~' }
185+
186+
/// Valid non ascii unicode Characters
187+
UNICODE_CHAR = _{ '\u{80}'..'\u{10FFFD}' }
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
2+
// -----------------------------------------------------------------------------
3+
// Test Expressions ONLY TO Be USED by Unit Tests.
4+
// Extends `cddl.pest` with rules needed to properly check sub-rules.
5+
6+
/// Test Expression for the S Rule.
7+
S_TEST = ${ SOI ~ S ~ EOI }
8+
9+
/// Test Expression for the COMMENT Rule.
10+
COMMENT_TEST = { SOI ~ COMMENT* ~ EOI }
11+
12+
/// Test expression for the URL_BASE64 Rule.
13+
URL_BASE64_TEST = { SOI ~ URL_BASE64 ~ EOI }
14+
15+
/// Test expression to the id Rule.
16+
id_TEST = ${ SOI ~ id ~ EOI}
17+
18+
/// Test expression to the bytes Rule.
19+
bytes_TEST = ${ SOI ~ bytes ~ EOI}
20+
21+
/// Test expression to the text Rule.
22+
text_TEST = ${ SOI ~ text ~ EOI}
23+
24+
/// Test expression to the uint Rule.
25+
uint_TEST = ${ SOI ~ uint ~ EOI}
26+
27+
/// Test expression to the int Rule.
28+
int_TEST = ${ SOI ~ int ~ EOI}
29+
30+
/// Test expression to the intfloat Rule.
31+
intfloat_TEST = ${ SOI ~ intfloat ~ EOI}
32+
33+
/// Test expression to the hexfloat Rule.
34+
hexfloat_TEST = ${ SOI ~ hexfloat ~ EOI}
35+
36+
/// Test expression to the number Rule.
37+
number_TEST = ${ SOI ~ number ~ EOI}
38+
39+
/// Test expression to the value Rule.
40+
value_TEST = ${ SOI ~ value ~ EOI}
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
; CDDL Standard Postlude as defined by Appendix D of RFC8610
2+
; https://www.rfc-editor.org/rfc/rfc8610#appendix-D
3+
4+
any = #
5+
6+
uint = #0
7+
nint = #1
8+
int = uint / nint
9+
10+
bstr = #2
11+
bytes = bstr
12+
tstr = #3
13+
text = tstr
14+
15+
tdate = #6.0(tstr)
16+
time = #6.1(number)
17+
number = int / float
18+
biguint = #6.2(bstr)
19+
bignint = #6.3(bstr)
20+
bigint = biguint / bignint
21+
integer = int / bigint
22+
unsigned = uint / biguint
23+
decfrac = #6.4([e10: int, m: integer])
24+
bigfloat = #6.5([e2: int, m: integer])
25+
eb64url = #6.21(any)
26+
eb64legacy = #6.22(any)
27+
eb16 = #6.23(any)
28+
encoded-cbor = #6.24(bstr)
29+
uri = #6.32(tstr)
30+
b64url = #6.33(tstr)
31+
b64legacy = #6.34(tstr)
32+
regexp = #6.35(tstr)
33+
mime-message = #6.36(tstr)
34+
cbor-any = #6.55799(any)
35+
36+
float16 = #7.25
37+
float32 = #7.26
38+
float64 = #7.27
39+
float16-32 = float16 / float32
40+
float32-64 = float32 / float64
41+
float = float16-32 / float64
42+
43+
false = #7.20
44+
true = #7.21
45+
bool = false / true
46+
nil = #7.22
47+
null = nil
48+
undefined = #7.23
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
use std::fmt::Debug;
2+
3+
pub use pest::Parser;
4+
use pest_derive::Parser;
5+
6+
extern crate derive_more;
7+
use derive_more::{Display, From};
8+
9+
// Parser with DEBUG rules. These rules are only used in tests.
10+
#[derive(Parser)]
11+
#[grammar = "grammar/cddl.pest"]
12+
#[grammar = "grammar/cddl_test.pest"] // Ideally this would only be used in tests.
13+
/// Pest parser for CDDL, generated by `#[derive(Parser)]` from the grammar files
/// named in the `#[grammar = "..."]` attributes on this struct.
pub struct CDDLParser;
14+
15+
#[derive(Display, Debug, From)]
16+
/// Newtype wrapper around the pest parse error for this grammar's `Rule` type.
/// `Display`, `Debug` and `From<pest::error::Error<Rule>>` are all derived.
pub struct CDDLError(pest::error::Error<Rule>);
17+
18+
// CDDL Standard Postlude - read from an external file
19+
/// Text of `grammar/postlude.cddl`, embedded at compile time via `include_str!`.
const POSTLUDE: &str = include_str!("grammar/postlude.cddl");
20+
21+
pub fn parse_cddl(input: &str) -> Result<(), Box<CDDLError>> {
22+
let result = CDDLParser::parse(Rule::cddl, input);
23+
24+
match result {
25+
Ok(c) => println!("{c:?}"),
26+
Err(e) => {
27+
println!("{e:?}");
28+
println!("{e}");
29+
return Err(Box::new(CDDLError::from(e)));
30+
}
31+
}
32+
33+
Ok(())
34+
}
35+
36+
#[cfg(test)]
mod tests {
    use crate::{parse_cddl, POSTLUDE};

    /// Runs the parser over the bundled standard postlude and prints the outcome.
    // NOTE(review): nothing is asserted here, so a parse failure is only printed and
    // the test still passes. Consider asserting the result is `Ok` if the postlude is
    // expected to parse.
    #[test]
    fn it_works() {
        match parse_cddl(POSTLUDE) {
            Err(failure) => {
                println!("{failure:?}");
                println!("{failure}");
            }
            Ok(unit) => println!("{unit:?}"),
        }
    }
}

0 commit comments

Comments
 (0)