Skip to content

Commit 8ca3d89

Browse files
leebyron and andimarek authored
RFC: Support full Unicode in lexer (#3117)
Depends on #3115

Implements RFC at graphql/graphql-spec#849.

* Replaces `isSourceCharacter` with `isUnicodeScalarValue`
* Adds `isSupplementaryCodePoint`, used in String, BlockStrings, and Comments to ensure correct lexing of JavaScript's UTF-16 source.
* Updates `printCodePointAt` to correctly print supplementary code points.
* Adds variable-width Unicode escape sequences
* Adds explicit support for legacy JSON-style fixed-width Unicode escape sequence surrogate pairs.
* Adds `printString` to no longer rely on `JSON.stringify`. Borrows some implementation details from Node.js internals for string printing.

Implements:

> When producing a {StringValue}, implementations should use escape sequences to
> represent non-printable control characters (U+0000 to U+001F and U+007F to
> U+009F). Other escape sequences are not necessary, however an implementation may
> use escape sequences to represent any other range of code points.

Closes #2449

Co-authored-by: Andreas Marek <andimarek@fastmail.fm>
1 parent 4493ca3 commit 8ca3d89

File tree

7 files changed

+531
-50
lines changed

7 files changed

+531
-50
lines changed

cspell.yml

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,9 @@ overrides:
2020
- filename: '**/docs/APIReference-*.md'
2121
ignoreRegExpList: ['/href="[^"]*"/']
2222

23+
ignoreRegExpList:
24+
- u\{[0-9a-f]{1,8}\}
25+
2326
words:
2427
- graphiql
2528
- sublinks

src/language/__tests__/lexer-test.ts

Lines changed: 224 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -28,13 +28,6 @@ function expectSyntaxError(text: string) {
2828
}
2929

3030
describe('Lexer', () => {
31-
it('disallows uncommon control characters', () => {
32-
expectSyntaxError('\u0007').to.deep.equal({
33-
message: 'Syntax Error: Invalid character: U+0007.',
34-
locations: [{ line: 1, column: 1 }],
35-
});
36-
});
37-
3831
it('ignores BOM header', () => {
3932
expect(lexOne('\uFEFF foo')).to.contain({
4033
kind: TokenKind.NAME,
@@ -269,12 +262,98 @@ describe('Lexer', () => {
269262
value: 'slashes \\ /',
270263
});
271264

265+
expect(lexOne('"unescaped unicode outside BMP \u{1f600}"')).to.contain({
266+
kind: TokenKind.STRING,
267+
start: 0,
268+
end: 34,
269+
value: 'unescaped unicode outside BMP \u{1f600}',
270+
});
271+
272+
expect(
273+
lexOne('"unescaped maximal unicode outside BMP \u{10ffff}"'),
274+
).to.contain({
275+
kind: TokenKind.STRING,
276+
start: 0,
277+
end: 42,
278+
value: 'unescaped maximal unicode outside BMP \u{10ffff}',
279+
});
280+
272281
expect(lexOne('"unicode \\u1234\\u5678\\u90AB\\uCDEF"')).to.contain({
273282
kind: TokenKind.STRING,
274283
start: 0,
275284
end: 34,
276285
value: 'unicode \u1234\u5678\u90AB\uCDEF',
277286
});
287+
288+
expect(lexOne('"unicode \\u{1234}\\u{5678}\\u{90AB}\\u{CDEF}"')).to.contain(
289+
{
290+
kind: TokenKind.STRING,
291+
start: 0,
292+
end: 42,
293+
value: 'unicode \u1234\u5678\u90AB\uCDEF',
294+
},
295+
);
296+
297+
expect(
298+
lexOne('"string with unicode escape outside BMP \\u{1F600}"'),
299+
).to.contain({
300+
kind: TokenKind.STRING,
301+
start: 0,
302+
end: 50,
303+
value: 'string with unicode escape outside BMP \u{1f600}',
304+
});
305+
306+
expect(lexOne('"string with minimal unicode escape \\u{0}"')).to.contain({
307+
kind: TokenKind.STRING,
308+
start: 0,
309+
end: 42,
310+
value: 'string with minimal unicode escape \u{0}',
311+
});
312+
313+
expect(
314+
lexOne('"string with maximal unicode escape \\u{10FFFF}"'),
315+
).to.contain({
316+
kind: TokenKind.STRING,
317+
start: 0,
318+
end: 47,
319+
value: 'string with maximal unicode escape \u{10FFFF}',
320+
});
321+
322+
expect(
323+
lexOne('"string with maximal minimal unicode escape \\u{00000000}"'),
324+
).to.contain({
325+
kind: TokenKind.STRING,
326+
start: 0,
327+
end: 57,
328+
value: 'string with maximal minimal unicode escape \u{0}',
329+
});
330+
331+
expect(
332+
lexOne('"string with unicode surrogate pair escape \\uD83D\\uDE00"'),
333+
).to.contain({
334+
kind: TokenKind.STRING,
335+
start: 0,
336+
end: 56,
337+
value: 'string with unicode surrogate pair escape \u{1f600}',
338+
});
339+
340+
expect(
341+
lexOne('"string with minimal surrogate pair escape \\uD800\\uDC00"'),
342+
).to.contain({
343+
kind: TokenKind.STRING,
344+
start: 0,
345+
end: 56,
346+
value: 'string with minimal surrogate pair escape \u{10000}',
347+
});
348+
349+
expect(
350+
lexOne('"string with maximal surrogate pair escape \\uDBFF\\uDFFF"'),
351+
).to.contain({
352+
kind: TokenKind.STRING,
353+
start: 0,
354+
end: 56,
355+
value: 'string with maximal surrogate pair escape \u{10FFFF}',
356+
});
278357
});
279358

280359
it('lex reports useful string errors', () => {
@@ -304,16 +383,19 @@ describe('Lexer', () => {
304383
locations: [{ line: 1, column: 1 }],
305384
});
306385

307-
expectSyntaxError('"contains unescaped \u0007 control char"').to.deep.equal(
308-
{
309-
message: 'Syntax Error: Invalid character within String: U+0007.',
310-
locations: [{ line: 1, column: 21 }],
311-
},
312-
);
386+
expectSyntaxError('"bad surrogate \uDEAD"').to.deep.equal({
387+
message: 'Syntax Error: Invalid character within String: U+DEAD.',
388+
locations: [{ line: 1, column: 16 }],
389+
});
390+
391+
expectSyntaxError('"bad high surrogate pair \uDEAD\uDEAD"').to.deep.equal({
392+
message: 'Syntax Error: Invalid character within String: U+DEAD.',
393+
locations: [{ line: 1, column: 26 }],
394+
});
313395

314-
expectSyntaxError('"null-byte is not \u0000 end of file"').to.deep.equal({
315-
message: 'Syntax Error: Invalid character within String: U+0000.',
316-
locations: [{ line: 1, column: 19 }],
396+
expectSyntaxError('"bad low surrogate pair \uD800\uD800"').to.deep.equal({
397+
message: 'Syntax Error: Invalid character within String: U+D800.',
398+
locations: [{ line: 1, column: 25 }],
317399
});
318400

319401
expectSyntaxError('"multi\nline"').to.deep.equal({
@@ -360,6 +442,93 @@ describe('Lexer', () => {
360442
message: 'Syntax Error: Invalid Unicode escape sequence: "\\uXXXF".',
361443
locations: [{ line: 1, column: 6 }],
362444
});
445+
446+
expectSyntaxError('"bad \\u{} esc"').to.deep.equal({
447+
message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{}".',
448+
locations: [{ line: 1, column: 6 }],
449+
});
450+
451+
expectSyntaxError('"bad \\u{FXXX} esc"').to.deep.equal({
452+
message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{FX".',
453+
locations: [{ line: 1, column: 6 }],
454+
});
455+
456+
expectSyntaxError('"bad \\u{FFFF esc"').to.deep.equal({
457+
message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{FFFF ".',
458+
locations: [{ line: 1, column: 6 }],
459+
});
460+
461+
expectSyntaxError('"bad \\u{FFFF"').to.deep.equal({
462+
message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{FFFF"".',
463+
locations: [{ line: 1, column: 6 }],
464+
});
465+
466+
expectSyntaxError('"too high \\u{110000} esc"').to.deep.equal({
467+
message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{110000}".',
468+
locations: [{ line: 1, column: 11 }],
469+
});
470+
471+
expectSyntaxError('"way too high \\u{12345678} esc"').to.deep.equal({
472+
message:
473+
'Syntax Error: Invalid Unicode escape sequence: "\\u{12345678}".',
474+
locations: [{ line: 1, column: 15 }],
475+
});
476+
477+
expectSyntaxError('"too long \\u{000000000} esc"').to.deep.equal({
478+
message:
479+
'Syntax Error: Invalid Unicode escape sequence: "\\u{000000000".',
480+
locations: [{ line: 1, column: 11 }],
481+
});
482+
483+
expectSyntaxError('"bad surrogate \\uDEAD esc"').to.deep.equal({
484+
message: 'Syntax Error: Invalid Unicode escape sequence: "\\uDEAD".',
485+
locations: [{ line: 1, column: 16 }],
486+
});
487+
488+
expectSyntaxError('"bad surrogate \\u{DEAD} esc"').to.deep.equal({
489+
message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{DEAD}".',
490+
locations: [{ line: 1, column: 16 }],
491+
});
492+
493+
expectSyntaxError(
494+
'"cannot use braces for surrogate pair \\u{D83D}\\u{DE00} esc"',
495+
).to.deep.equal({
496+
message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{D83D}".',
497+
locations: [{ line: 1, column: 39 }],
498+
});
499+
500+
expectSyntaxError(
501+
'"bad high surrogate pair \\uDEAD\\uDEAD esc"',
502+
).to.deep.equal({
503+
message: 'Syntax Error: Invalid Unicode escape sequence: "\\uDEAD".',
504+
locations: [{ line: 1, column: 26 }],
505+
});
506+
507+
expectSyntaxError(
508+
'"bad low surrogate pair \\uD800\\uD800 esc"',
509+
).to.deep.equal({
510+
message: 'Syntax Error: Invalid Unicode escape sequence: "\\uD800".',
511+
locations: [{ line: 1, column: 25 }],
512+
});
513+
514+
expectSyntaxError(
515+
'"cannot escape half a pair \uD83D\\uDE00 esc"',
516+
).to.deep.equal({
517+
message: 'Syntax Error: Invalid character within String: U+D83D.',
518+
locations: [{ line: 1, column: 28 }],
519+
});
520+
521+
expectSyntaxError(
522+
'"cannot escape half a pair \\uD83D\uDE00 esc"',
523+
).to.deep.equal({
524+
message: 'Syntax Error: Invalid Unicode escape sequence: "\\uD83D".',
525+
locations: [{ line: 1, column: 28 }],
526+
});
527+
528+
expectSyntaxError('"bad \\uD83D\\not an escape"').to.deep.equal({
529+
message: 'Syntax Error: Invalid Unicode escape sequence: "\\uD83D".',
530+
locations: [{ line: 1, column: 6 }],
531+
});
363532
});
364533

365534
it('lexes block strings', () => {
@@ -419,6 +588,13 @@ describe('Lexer', () => {
419588
value: 'unescaped \\n\\r\\b\\t\\f\\u1234',
420589
});
421590

591+
expect(lexOne('"""unescaped unicode outside BMP \u{1f600}"""')).to.contain({
592+
kind: TokenKind.BLOCK_STRING,
593+
start: 0,
594+
end: 38,
595+
value: 'unescaped unicode outside BMP \u{1f600}',
596+
});
597+
422598
expect(lexOne('"""slashes \\\\ \\/"""')).to.contain({
423599
kind: TokenKind.BLOCK_STRING,
424600
start: 0,
@@ -491,18 +667,9 @@ describe('Lexer', () => {
491667
locations: [{ line: 1, column: 16 }],
492668
});
493669

494-
expectSyntaxError(
495-
'"""contains unescaped \u0007 control char"""',
496-
).to.deep.equal({
497-
message: 'Syntax Error: Invalid character within String: U+0007.',
498-
locations: [{ line: 1, column: 23 }],
499-
});
500-
501-
expectSyntaxError(
502-
'"""null-byte is not \u0000 end of file"""',
503-
).to.deep.equal({
504-
message: 'Syntax Error: Invalid character within String: U+0000.',
505-
locations: [{ line: 1, column: 21 }],
670+
expectSyntaxError('"""contains invalid surrogate \uDEAD"""').to.deep.equal({
671+
message: 'Syntax Error: Invalid character within String: U+DEAD.',
672+
locations: [{ line: 1, column: 31 }],
506673
});
507674
});
508675

@@ -842,6 +1009,16 @@ describe('Lexer', () => {
8421009
locations: [{ line: 1, column: 1 }],
8431010
});
8441011

1012+
expectSyntaxError('\x00').to.deep.equal({
1013+
message: 'Syntax Error: Unexpected character: U+0000.',
1014+
locations: [{ line: 1, column: 1 }],
1015+
});
1016+
1017+
expectSyntaxError('\b').to.deep.equal({
1018+
message: 'Syntax Error: Unexpected character: U+0008.',
1019+
locations: [{ line: 1, column: 1 }],
1020+
});
1021+
8451022
expectSyntaxError('\u00AA').to.deep.equal({
8461023
message: 'Syntax Error: Unexpected character: U+00AA.',
8471024
locations: [{ line: 1, column: 1 }],
@@ -856,6 +1033,16 @@ describe('Lexer', () => {
8561033
message: 'Syntax Error: Unexpected character: U+203B.',
8571034
locations: [{ line: 1, column: 1 }],
8581035
});
1036+
1037+
expectSyntaxError('\u{1f600}').to.deep.equal({
1038+
message: 'Syntax Error: Unexpected character: U+1F600.',
1039+
locations: [{ line: 1, column: 1 }],
1040+
});
1041+
1042+
expectSyntaxError('\uDEAD').to.deep.equal({
1043+
message: 'Syntax Error: Invalid character: U+DEAD.',
1044+
locations: [{ line: 1, column: 1 }],
1045+
});
8591046
});
8601047

8611048
it('lex reports useful information for dashes in names', () => {
@@ -936,9 +1123,15 @@ describe('Lexer', () => {
9361123
end: 9,
9371124
value: ' Comment',
9381125
});
939-
expectSyntaxError('# \u0007').to.deep.equal({
940-
message: 'Syntax Error: Invalid character: U+0007.',
941-
locations: [{ line: 1, column: 3 }],
1126+
expect(lexOne('# Comment \u{1f600}').prev).to.contain({
1127+
kind: TokenKind.COMMENT,
1128+
start: 0,
1129+
end: 12,
1130+
value: ' Comment \u{1f600}',
1131+
});
1132+
expectSyntaxError('# Invalid surrogate \uDEAD').to.deep.equal({
1133+
message: 'Syntax Error: Invalid character: U+DEAD.',
1134+
locations: [{ line: 1, column: 21 }],
9421135
});
9431136
});
9441137
});

0 commit comments

Comments
 (0)