Skip to content

Commit e72710e

Browse files
committed
feat: add support for IRI-reference and absolute-IRI
Signed-off-by: Jan Kowalleck <jan.kowalleck@gmail.com>
1 parent 3f78689 commit e72710e

File tree

4 files changed

+100
-4
lines changed

4 files changed

+100
-4
lines changed

README.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,18 +71,25 @@ if is_valid_syntax(term='iri', value='http://github.com'):
7171

7272
if not is_valid_syntax(term='iri', value='bob'):
7373
print("✗ Invalid IRI syntax")
74+
75+
if is_valid_syntax(term='iri_reference', value='bob'):
76+
print("✓ Valid IRI-reference syntax")
7477
```
7578

7679
### Alternatively, use term-specific helpers to validate RFC 3987 syntax.
7780

7881
```python
7982
from rfc3987_syntax import is_valid_syntax_iri
83+
from rfc3987_syntax import is_valid_syntax_iri_reference
8084

8185
if is_valid_syntax_iri('http://github.com'):
8286
print("✓ Valid IRI syntax")
8387

8488
if not is_valid_syntax_iri('bob'):
8589
print("✗ Invalid IRI syntax")
90+
91+
if is_valid_syntax_iri_reference('bob'):
92+
print("✓ Valid IRI-reference syntax")
8693
```
8794

8895
### Get the Lark parse tree for a syntax validation (useful for additional semantic validation)
@@ -114,8 +121,12 @@ This grammar was derived from:
114121
| Rule/Component | Source | Notes |
115122
|----------------------|------------|-------|
116123
| `iri` | RFC 3987 | Top-level IRI rule |
124+
| `iri_reference` | RFC 3987 | Top-level IRI Reference rule |
125+
| `absolute_iri` | RFC 3987 | Top-level Absolute IRI rule |
117126
| `scheme` | RFC 3986 | Referenced by RFC 3987 §2.2 |
118127
| `ihier_part` | RFC 3987 | IRI-specific hierarchy |
128+
| `irelative_ref` | RFC 3987 | IRI-specific relative ref |
129+
| `irelative_part` | RFC 3987 | IRI-specific relative part |
119130
| `iauthority` | RFC 3986 | Standard URI authority |
120131
| `ipath_abempty` | RFC 3986 | Path format variant |
121132
| `ipath_absolute` | RFC 3986 | Absolute path |

src/rfc3987_syntax/syntax_helpers.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,11 @@
88
RFC3987_SYNTAX_GRAMMAR_PATH: Path = Path(__file__).parent / "syntax_rfc3987.lark"
99
RFC3987_SYNTAX_TERMS: list[str] = [
1010
"iri",
11+
"iri_reference",
12+
"absolute_iri",
1113
"scheme",
14+
"irelative_ref",
15+
"irelative_part",
1216
"ihier_part",
1317
"iauthority",
1418
"iuserinfo",
@@ -43,7 +47,7 @@
4347

4448
grammar: str = load_grammar(RFC3987_SYNTAX_GRAMMAR_PATH)
4549

46-
syntax_parser = Lark(grammar, start=["iri"], parser=RFC3987_SYNTAX_PARSER_TYPE)
50+
syntax_parser = Lark(grammar, start=["iri", "iri_reference", "absolute_iri"], parser=RFC3987_SYNTAX_PARSER_TYPE)
4751

4852

4953
def parse(term: str, value: str) -> ParseTree:
@@ -73,6 +77,14 @@ def syntax_validator(text):
7377

7478
is_valid_syntax_iri = make_syntax_validator("iri")
7579

80+
is_valid_syntax_iri_reference = make_syntax_validator("iri_reference")
81+
82+
is_valid_syntax_absolute_iri = make_syntax_validator("absolute_iri")
83+
84+
is_valid_syntax_irelative_ref = make_syntax_validator("irelative_ref")
85+
86+
is_valid_syntax_irelative_part = make_syntax_validator("irelative_part")
87+
7688
is_valid_syntax_ihier_part = make_syntax_validator("ihier_part")
7789

7890
is_valid_syntax_iauthority = make_syntax_validator("iauthority")

src/rfc3987_syntax/syntax_rfc3987.lark

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,24 @@
11

22
iri: scheme ":" ihier_part ("?" iquery)? ("#" ifragment)?
33

4+
iri_reference: iri | irelative_ref
5+
6+
absolute_iri: scheme ":" ihier_part ("?" iquery)?
7+
48
scheme: alpha (alpha | digit | "+" | "-" | ".")*
59

610
ihier_part: "//" iauthority ipath_abempty
711
| ipath_absolute
812
| ipath_rootless
913
| ipath_empty
1014

15+
irelative_ref: irelative_part ("?" iquery)? ("#" ifragment)?
16+
17+
irelative_part: "//" iauthority ipath_abempty
18+
| ipath_absolute
19+
| ipath_noscheme
20+
| ipath_empty
21+
1122
iauthority: (iuserinfo "@")? ihost (":" port)?
1223

1324
iuserinfo: (iunreserved | pct_encoded | sub_delims | ":")*

tests/valid_syntax.json

Lines changed: 65 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,23 @@
2424
"ihier_part": [],
2525
"iprivate": [],
2626
"iquery": [],
27-
"absolute_iri": [],
27+
"absolute_iri": [
28+
{
29+
"value": "ftp://пример.испытание.example.com/файл.txt",
30+
"expect_lark": true,
31+
"reason": ""
32+
},
33+
{
34+
"value": "https://www.example.com/über",
35+
"expect_lark": true,
36+
"reason": ""
37+
},
38+
{
39+
"value": "https://exämple.example.org/pageセクション1",
40+
"expect_lark": true,
41+
"reason": ""
42+
}
43+
],
2844
"gen_delims": [],
2945
"ifragment": [],
3046
"isegment_nz_nc": [],
@@ -40,11 +56,57 @@
4056
"reason": "Regex may match due to naive prefix pattern, but triple slash `http:///` is invalid due to empty authority.",
4157
"conforms_with_rfc3987_semantics": false,
4258
"semantics_notes": "Can occur when ireg_name is empty. To prevent this, change the ireg_name production rule to: ireg_name: (iunreserved | pct_encoded | sub_delims)+"
59+
},
60+
{
61+
"value": "mailto:用户@例子.公司@example.com",
62+
"expect_lark": true,
63+
"reason": ""
64+
},
65+
{
66+
"value": "ftp://пример.испытание.example.com/файл.txt",
67+
"expect_lark": true,
68+
"reason": ""
69+
},
70+
{
71+
"value": "https://www.example.com/über",
72+
"expect_lark": true,
73+
"reason": ""
74+
},
75+
{
76+
"value": "https://exämple.example.org/page#セクション1",
77+
"expect_lark": true,
78+
"reason": ""
79+
}
80+
],
81+
"iri_reference": [
82+
{
83+
"value": "/documents/नियम.pdf",
84+
"expect_lark": true,
85+
"reason": ""
86+
},
87+
{
88+
"value": "../../фото/лошадь.jpg",
89+
"expect_lark": true,
90+
"reason": ""
91+
},
92+
{
93+
"value": "index.html#संपर्क",
94+
"expect_lark": true,
95+
"reason": ""
96+
},
97+
{
98+
"value": "doc/guide.html#überblick",
99+
"expect_lark": true,
100+
"reason": ""
101+
},
102+
{
103+
"value": "пример/тест#часть2",
104+
"expect_lark": true,
105+
"reason": ""
43106
}
44107
],
45-
"iri_reference": [],
46108
"reserved": [],
47109
"scheme": [],
48110
"pct_encoded": [],
49111
"ipv4address": []
50-
}
112+
}

0 commit comments

Comments
 (0)