Skip to content

Commit b3d493b

Browse files
committed
feat(engine-js): improve js engine by replacing hard-coded recursive reference
1 parent 4f7e5d1 commit b3d493b

File tree

8 files changed

+131
-24
lines changed

8 files changed

+131
-24
lines changed

docs/references/engine-js-compat.md

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,9 @@
1111
| | Count |
1212
| :-------------- | --------------------------------: |
1313
| Total Languages | 213 |
14-
| Fully Supported | [164](#fully-supported-languages) |
15-
| Mismatched | [20](#mismatched-languages) |
16-
| Unsupported | [29](#unsupported-languages) |
14+
| Fully Supported | [171](#fully-supported-languages) |
15+
| Mismatched | [24](#mismatched-languages) |
16+
| Unsupported | [18](#unsupported-languages) |
1717

1818
## Fully Supported Languages
1919

@@ -29,6 +29,7 @@ Languages that works with the JavaScript RegExp engine, and will produce the sam
2929
| applescript | ✅ OK | 152 | - | |
3030
| ara | ✅ OK | 54 | - | |
3131
| asm | ✅ OK | 297 | - | |
32+
| astro | ✅ OK | 1090 | - | |
3233
| awk | ✅ OK | 36 | - | |
3334
| ballerina | ✅ OK | 230 | - | |
3435
| bat | ✅ OK | 58 | - | |
@@ -67,6 +68,7 @@ Languages that works with the JavaScript RegExp engine, and will produce the sam
6768
| fluent | ✅ OK | 23 | - | |
6869
| fortran-fixed-form | ✅ OK | 332 | - | |
6970
| fortran-free-form | ✅ OK | 328 | - | |
71+
| fsharp | ✅ OK | 239 | - | |
7072
| fsl | ✅ OK | 30 | - | |
7173
| gdresource | ✅ OK | 157 | - | |
7274
| gdscript | ✅ OK | 93 | - | |
@@ -117,6 +119,7 @@ Languages that works with the JavaScript RegExp engine, and will produce the sam
117119
| move | ✅ OK | 120 | - | |
118120
| narrat | ✅ OK | 34 | - | |
119121
| nextflow | ✅ OK | 17 | - | |
122+
| nim | ✅ OK | 1126 | - | |
120123
| nix | ✅ OK | 80 | - | |
121124
| nushell | ✅ OK | 81 | - | |
122125
| objective-c | ✅ OK | 223 | - | |
@@ -143,6 +146,7 @@ Languages that works with the JavaScript RegExp engine, and will produce the sam
143146
| riscv | ✅ OK | 36 | - | |
144147
| rust | ✅ OK | 89 | - | |
145148
| sas | ✅ OK | 101 | - | |
149+
| sass | ✅ OK | 69 | - | |
146150
| scala | ✅ OK | 112 | - | |
147151
| scheme | ✅ OK | 34 | - | |
148152
| scss | ✅ OK | 234 | - | |
@@ -154,6 +158,7 @@ Languages that works with the JavaScript RegExp engine, and will produce the sam
154158
| sql | ✅ OK | 67 | - | |
155159
| ssh-config | ✅ OK | 12 | - | |
156160
| stylus | ✅ OK | 107 | - | |
161+
| svelte | ✅ OK | 1491 | - | |
157162
| system-verilog | ✅ OK | 102 | - | |
158163
| systemd | ✅ OK | 32 | - | |
159164
| tasl | ✅ OK | 23 | - | |
@@ -176,6 +181,8 @@ Languages that works with the JavaScript RegExp engine, and will produce the sam
176181
| verilog | ✅ OK | 33 | - | |
177182
| vhdl | ✅ OK | 82 | - | |
178183
| viml | ✅ OK | 72 | - | |
184+
| vue | ✅ OK | 1597 | - | |
185+
| vue-html | ✅ OK | 1620 | - | |
179186
| vyper | ✅ OK | 238 | - | |
180187
| wasm | ✅ OK | 78 | - | |
181188
| wenyan | ✅ OK | 18 | - | |
@@ -200,12 +207,16 @@ Languages that does not throw with the JavaScript RegExp engine, but will produc
200207
| elixir | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=elixir) | 708 | - | 179 |
201208
| erlang | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=erlang) | 147 | - | 470 |
202209
| glsl | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=glsl) | 186 | - | 306 |
210+
| haml | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=haml) | 1612 | - | 48 |
203211
| kusto | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=kusto) | 60 | - | 40 |
212+
| markdown | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=markdown) | 118 | - | 648 |
213+
| mdc | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=mdc) | 784 | - | 407 |
204214
| mermaid | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=mermaid) | 129 | - | 38 |
205215
| nginx | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=nginx) | 378 | - | 4 |
206216
| objective-cpp | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=objective-cpp) | 309 | - | 172 |
207217
| php | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=php) | 1131 | - | 605 |
208218
| po | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=po) | 23 | - | 336 |
219+
| pug | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=pug) | 1013 | - | 164 |
209220
| ruby | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=ruby) | 1307 | - | 1 |
210221
| shellscript | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=shellscript) | 148 | - | 56 |
211222
| smalltalk | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=smalltalk) | 35 | - | 40 |
@@ -220,18 +231,9 @@ Languages that throws with the JavaScript RegExp engine (contains syntaxes that
220231
| Language | Highlight Match | Patterns Parsable | Patterns Failed | Diff |
221232
| ---------- | :------------------------------------------------------------------------- | ----------------: | --------------: | ---: |
222233
| ada | ✅ OK | 201 | 1 | |
223-
| astro | ✅ OK | 1088 | 2 | |
224-
| sass | ✅ OK | 67 | 2 | |
225-
| fsharp | ✅ OK | 232 | 7 | |
226-
| nim | ✅ OK | 1119 | 7 | |
227-
| svelte | ✅ OK | 1482 | 9 | |
228-
| vue | ✅ OK | 1588 | 9 | |
229-
| vue-html | ✅ OK | 1611 | 9 | |
230-
| asciidoc | ✅ OK | 4388 | 93 | |
231-
| wikitext | ✅ OK | 5208 | 95 | |
234+
| wikitext | ✅ OK | 5217 | 86 | |
235+
| asciidoc | ✅ OK | 4390 | 91 | |
232236
| blade | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=blade) | 1124 | 2 | |
233-
| pug | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=pug) | 1011 | 2 | 164 |
234-
| haml | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=haml) | 1603 | 9 | 48 |
235237
| rst | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=rst) | 1835 | 22 | 62 |
236238
| latex | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=latex) | 2451 | 48 | 25 |
237239
| powershell | ❌ Error | 87 | 1 | |
@@ -240,8 +242,6 @@ Languages that throws with the JavaScript RegExp engine (contains syntaxes that
240242
| swift | ❌ Error | 325 | 4 | 18 |
241243
| kotlin | ❌ Error | 52 | 6 | 2986 |
242244
| purescript | ❌ Error | 67 | 6 | 1488 |
243-
| markdown | ❌ Error | 111 | 7 | 584 |
244-
| mdc | ❌ Error | 777 | 7 | 377 |
245245
| apex | ❌ Error | 173 | 14 | 242 |
246246
| haskell | ❌ Error | 136 | 21 | 12 |
247247
| cpp | ❌ Error | 490 | 22 | 25 |
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
import fs from 'node:fs/promises'
2+
import { expandRecursiveBackReference } from './utils'
3+
4+
/**
 * Fields shared by every replacement rule.
 */
interface ReplacementBase {
  /** Pattern fragment to search for; emitted as the first element of the generated pair. */
  regex: string
}

/**
 * Rule whose replacement is computed by expanding a self-referencing
 * (`\g<name>`) named group into nested literal copies of itself.
 */
interface ReplacementRecursiveBackReference extends ReplacementBase {
  type: 'recursive-back-reference'
  /** Name of the group that `regex` defines and refers back to. */
  groupName: string
  /** Pattern substituted for the references that remain after expansion. */
  fallback: string
  /** Number of expansion rounds; the generator falls back to 2 when omitted. */
  recursive?: number
}

/**
 * Rule whose replacement text is written out verbatim.
 */
interface ReplacementStatic extends ReplacementBase {
  type: 'static'
  /** Text emitted as the second element of the generated pair. */
  replacement: string
}

/** Any rule understood by the generator, discriminated by `type`. */
type Replacement = ReplacementRecursiveBackReference | ReplacementStatic
19+
20+
/**
 * Rules consumed by the generator below.
 *
 * The JavaScript regex engine does not support subroutine-style recursive
 * references (`\g<name>`). For each rule here a few levels of recursion are
 * expanded into literal nested groups to approximate the original behaviour;
 * the approximation is incomplete by design.
 */
const replacements: Replacement[] = [
  // `square`: a non-bracket character, a backslash escape, or a bracketed run of itself.
  {
    groupName: 'square',
    type: 'recursive-back-reference',
    fallback: '(?:[^\\[\\]\\\\])',
    regex: '(?<square>[^\\[\\]\\\\]|\\\\.|\\[\\g<square>*+\\])',
  },
  // `url`: a run without whitespace/parentheses, or a parenthesised run of itself.
  {
    groupName: 'url',
    type: 'recursive-back-reference',
    fallback: '[^\\s\\(\\)]',
    regex: '(?<url>(?>[^\\s()]+)|\\(\\g<url>*\\))',
  },
]
36+
37+
/**
 * Resolve every rule into the `[search, replacement]` pair that is written to
 * the generated module. The callback is annotated with the tuple type so the
 * `as [string, string][]` assertion in the emitted file is backed by a real
 * compile-time check here.
 */
const result = replacements.map((r): [string, string] => {
  switch (r.type) {
    case 'recursive-back-reference':
      // Two expansion rounds are used when the rule does not specify a depth.
      return [r.regex, expandRecursiveBackReference(r.regex, r.groupName, r.fallback, r.recursive ?? 2)]
    case 'static':
      return [r.regex, r.replacement]
    default: {
      // Compile-time exhaustiveness check: adding a new rule type without a
      // matching case makes this assignment fail to type-check.
      const unhandled: never = r
      throw new Error(`Unknown replacement type: ${(unhandled as Replacement).type}`)
    }
  }
})

// Write the generated module next to the engine sources. The promise is
// handled explicitly so a failed write is reported and yields a non-zero
// exit code instead of being left as a floating promise.
fs.writeFile(
  new URL('../src/replacements.ts', import.meta.url),
  `// Generated by script\n\nexport const replacements = ${JSON.stringify(result, null, 2)} as [string, string][]\n`,
  'utf-8',
).catch((error: unknown) => {
  console.error('Failed to write src/replacements.ts:', error)
  process.exitCode = 1
})
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
/**
 * Expand a self-referencing named group into nested non-capturing groups.
 *
 * Every `\g<name>` reference in `regex` is replaced by a copy of the whole
 * pattern (with its `(?<name>` opener turned into `(?:`), `recursive` times
 * over. References still present afterwards are replaced by `fallback`, and
 * the remaining `(?<name>` opener is turned into `(?:`.
 *
 * @param regex - Pattern containing the `(?<name>…)` group and its `\g<name>` references.
 * @param name - Name of the group; matched literally.
 * @param fallback - Pattern inserted for references left after the last round.
 * @param recursive - Number of expansion rounds (default 2).
 * @returns The expanded pattern, free of `\g<name>` and `(?<name>`.
 */
export function expandRecursiveBackReference(
  regex: string,
  name: string,
  fallback: string,
  recursive = 2,
): string {
  // Escape the name so characters such as `$` are matched literally instead of
  // being read as regex syntax.
  const escapedName = name.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
  const refMarker = new RegExp(`\\\\g<${escapedName}>`, 'g')
  const groupMarker = new RegExp(`\\(\\?<${escapedName}>`, 'g')
  const normalized = regex.replace(groupMarker, '(?:')

  let out = regex
  for (let i = 0; i < recursive; i++) {
    // A replacer function is used so `$&`, `$'`, `$$`, … inside the inserted
    // pattern are copied verbatim rather than treated as substitution patterns.
    out = out.replace(refMarker, () => normalized)
  }

  return out
    .replace(refMarker, () => fallback)
    .replace(groupMarker, '(?:')
}

packages/engine-javascript/src/index.ts

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import type {
44
RegexEngineString,
55
} from '@shikijs/types'
66
import { onigurumaToRegexp } from 'oniguruma-to-js'
7+
import { replacements } from './replacements'
78

89
export interface JavaScriptRegexEngineOptions {
910
/**
@@ -77,7 +78,13 @@ export class JavaScriptScanner implements PatternScanner {
7778
throw cached
7879
}
7980
try {
80-
const regex = regexConstructor(p)
81+
let pattern = p
82+
if (simulation) {
83+
for (const [from, to] of replacements) {
84+
pattern = pattern.replaceAll(from, to)
85+
}
86+
}
87+
const regex = regexConstructor(pattern)
8188
cache?.set(p, regex)
8289
return regex
8390
}
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
// Generated by script
2+
3+
// Each exported entry is a `[search, replacement]` pair: the first string is a
// pattern fragment that uses a recursive `\g<name>` reference, the second is
// the same fragment with the recursion unrolled into nested non-capturing groups.

const squareSource = '(?<square>[^\\[\\]\\\\]|\\\\.|\\[\\g<square>*+\\])'
const squareExpanded = '(?:[^\\[\\]\\\\]|\\\\.|\\[(?:[^\\[\\]\\\\]|\\\\.|\\[(?:[^\\[\\]\\\\]|\\\\.|\\[(?:[^\\[\\]\\\\])*+\\])*+\\])*+\\])'

const urlSource = '(?<url>(?>[^\\s()]+)|\\(\\g<url>*\\))'
const urlExpanded = '(?:(?>[^\\s()]+)|\\((?:(?>[^\\s()]+)|\\((?:(?>[^\\s()]+)|\\([^\\s\\(\\)]*\\))*\\))*\\))'

export const replacements: [string, string][] = [
  [squareSource, squareExpanded],
  [urlSource, urlExpanded],
]
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
import { describe, expect, it } from 'vitest'
2+
import { expandRecursiveBackReference } from '../scripts/utils'
3+
4+
describe('expandRecursiveBackReference', () => {
  it('case 1', () => {
    // Self-referencing `square` group; every call below uses the same inputs
    // and varies only the number of expansion rounds.
    const regex = '(?<square>[^\\[\\]\\\\]|\\\\.|\\[\\g<square>*\\])'
    const name = 'square'
    const fallback = '(?:[^\\[\\]\\\\])'
    const expandTo = (rounds: number) => expandRecursiveBackReference(regex, name, fallback, rounds)

    // No rounds: the reference is replaced directly by the fallback.
    expect(expandTo(0))
      .toMatchInlineSnapshot(`"(?:[^\\[\\]\\\\]|\\\\.|\\[(?:[^\\[\\]\\\\])*\\])"`)

    // One round: one nested copy of the group, then the fallback.
    expect(expandTo(1))
      .toMatchInlineSnapshot(`"(?:[^\\[\\]\\\\]|\\\\.|\\[(?:[^\\[\\]\\\\]|\\\\.|\\[(?:[^\\[\\]\\\\])*\\])*\\])"`)

    // Two rounds: two nested copies of the group, then the fallback.
    expect(expandTo(2))
      .toMatchInlineSnapshot(`"(?:[^\\[\\]\\\\]|\\\\.|\\[(?:[^\\[\\]\\\\]|\\\\.|\\[(?:[^\\[\\]\\\\]|\\\\.|\\[(?:[^\\[\\]\\\\])*\\])*\\])*\\])"`)
  })
})

pnpm-lock.yaml

Lines changed: 6 additions & 6 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pnpm-workspace.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ catalog:
5353
minimist: ^1.2.8
5454
monaco-editor-core: ^0.51.0
5555
ofetch: ^1.3.4
56-
oniguruma-to-js: 0.4.0
56+
oniguruma-to-js: 0.4.3
5757
picocolors: ^1.1.0
5858
pinia: ^2.2.2
5959
pnpm: ^9.10.0

0 commit comments

Comments
 (0)