Skip to content

Commit 231c9d7

Browse files
leebyron and andimarek committed
RFC: Support full Unicode in lexer
Depends on #3115

Implements RFC at graphql/graphql-spec#849.

* Replaces `isSourceCharacter` with `isUnicodeScalarValue`
* Adds `isSupplementaryCodePoint`, used in String, BlockStrings, and Comments to ensure correct lexing of JavaScript's UTF-16 source.
* Updates `printCodePointAt` to correctly print supplementary code points.
* Adds variable-width Unicode escape sequences
* Adds explicit support for legacy JSON-style fixed-width Unicode escape sequence surrogate pairs.
* Adds `printString` to no longer rely on `JSON.stringify`. Borrows some implementation details from Node.js internals for string printing. Implements:

  > When producing a {StringValue}, implementations should use escape sequences to
  > represent non-printable control characters (U+0000 to U+001F and U+007F to
  > U+009F). Other escape sequences are not necessary, however an implementation may
  > use escape sequences to represent any other range of code points.

Closes #2449

Co-authored-by: Andreas Marek <andimarek@fastmail.fm>
1 parent b6864ac commit 231c9d7

File tree

6 files changed

+528
-48
lines changed

6 files changed

+528
-48
lines changed

src/language/__tests__/lexer-test.ts

Lines changed: 224 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -28,13 +28,6 @@ function expectSyntaxError(text: string) {
2828
}
2929

3030
describe('Lexer', () => {
31-
it('disallows uncommon control characters', () => {
32-
expectSyntaxError('\u0007').to.deep.equal({
33-
message: 'Syntax Error: Invalid character: U+0007.',
34-
locations: [{ line: 1, column: 1 }],
35-
});
36-
});
37-
3831
it('ignores BOM header', () => {
3932
expect(lexOne('\uFEFF foo')).to.contain({
4033
kind: TokenKind.NAME,
@@ -264,12 +257,98 @@ describe('Lexer', () => {
264257
value: 'slashes \\ /',
265258
});
266259

260+
expect(lexOne('"unescaped unicode outside BMP \u{1f600}"')).to.contain({
261+
kind: TokenKind.STRING,
262+
start: 0,
263+
end: 34,
264+
value: 'unescaped unicode outside BMP \u{1f600}',
265+
});
266+
267+
expect(
268+
lexOne('"unescaped maximal unicode outside BMP \u{10ffff}"'),
269+
).to.contain({
270+
kind: TokenKind.STRING,
271+
start: 0,
272+
end: 42,
273+
value: 'unescaped maximal unicode outside BMP \u{10ffff}',
274+
});
275+
267276
expect(lexOne('"unicode \\u1234\\u5678\\u90AB\\uCDEF"')).to.contain({
268277
kind: TokenKind.STRING,
269278
start: 0,
270279
end: 34,
271280
value: 'unicode \u1234\u5678\u90AB\uCDEF',
272281
});
282+
283+
expect(lexOne('"unicode \\u{1234}\\u{5678}\\u{90AB}\\u{CDEF}"')).to.contain(
284+
{
285+
kind: TokenKind.STRING,
286+
start: 0,
287+
end: 42,
288+
value: 'unicode \u1234\u5678\u90AB\uCDEF',
289+
},
290+
);
291+
292+
expect(
293+
lexOne('"string with unicode escape outside BMP \\u{1F600}"'),
294+
).to.contain({
295+
kind: TokenKind.STRING,
296+
start: 0,
297+
end: 50,
298+
value: 'string with unicode escape outside BMP \u{1f600}',
299+
});
300+
301+
expect(lexOne('"string with minimal unicode escape \\u{0}"')).to.contain({
302+
kind: TokenKind.STRING,
303+
start: 0,
304+
end: 42,
305+
value: 'string with minimal unicode escape \u{0}',
306+
});
307+
308+
expect(
309+
lexOne('"string with maximal unicode escape \\u{10FFFF}"'),
310+
).to.contain({
311+
kind: TokenKind.STRING,
312+
start: 0,
313+
end: 47,
314+
value: 'string with maximal unicode escape \u{10FFFF}',
315+
});
316+
317+
expect(
318+
lexOne('"string with maximal minimal unicode escape \\u{00000000}"'),
319+
).to.contain({
320+
kind: TokenKind.STRING,
321+
start: 0,
322+
end: 57,
323+
value: 'string with maximal minimal unicode escape \u{0}',
324+
});
325+
326+
expect(
327+
lexOne('"string with unicode surrogate pair escape \\uD83D\\uDE00"'),
328+
).to.contain({
329+
kind: TokenKind.STRING,
330+
start: 0,
331+
end: 56,
332+
value: 'string with unicode surrogate pair escape \u{1f600}',
333+
});
334+
335+
expect(
336+
lexOne('"string with minimal surrogate pair escape \\uD800\\uDC00"'),
337+
).to.contain({
338+
kind: TokenKind.STRING,
339+
start: 0,
340+
end: 56,
341+
value: 'string with minimal surrogate pair escape \u{10000}',
342+
});
343+
344+
expect(
345+
lexOne('"string with maximal surrogate pair escape \\uDBFF\\uDFFF"'),
346+
).to.contain({
347+
kind: TokenKind.STRING,
348+
start: 0,
349+
end: 56,
350+
value: 'string with maximal surrogate pair escape \u{10FFFF}',
351+
});
273352
});
274353

275354
it('lex reports useful string errors', () => {
@@ -299,16 +378,19 @@ describe('Lexer', () => {
299378
locations: [{ line: 1, column: 1 }],
300379
});
301380

302-
expectSyntaxError('"contains unescaped \u0007 control char"').to.deep.equal(
303-
{
304-
message: 'Syntax Error: Invalid character within String: U+0007.',
305-
locations: [{ line: 1, column: 21 }],
306-
},
307-
);
381+
expectSyntaxError('"bad surrogate \uDEAD"').to.deep.equal({
382+
message: 'Syntax Error: Invalid character within String: U+DEAD.',
383+
locations: [{ line: 1, column: 16 }],
384+
});
385+
386+
expectSyntaxError('"bad high surrogate pair \uDEAD\uDEAD"').to.deep.equal({
387+
message: 'Syntax Error: Invalid character within String: U+DEAD.',
388+
locations: [{ line: 1, column: 26 }],
389+
});
308390

309-
expectSyntaxError('"null-byte is not \u0000 end of file"').to.deep.equal({
310-
message: 'Syntax Error: Invalid character within String: U+0000.',
311-
locations: [{ line: 1, column: 19 }],
391+
expectSyntaxError('"bad low surrogate pair \uD800\uD800"').to.deep.equal({
392+
message: 'Syntax Error: Invalid character within String: U+D800.',
393+
locations: [{ line: 1, column: 25 }],
312394
});
313395

314396
expectSyntaxError('"multi\nline"').to.deep.equal({
@@ -355,6 +437,93 @@ describe('Lexer', () => {
355437
message: 'Syntax Error: Invalid Unicode escape sequence: "\\uXXXF".',
356438
locations: [{ line: 1, column: 6 }],
357439
});
440+
441+
expectSyntaxError('"bad \\u{} esc"').to.deep.equal({
442+
message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{}".',
443+
locations: [{ line: 1, column: 6 }],
444+
});
445+
446+
expectSyntaxError('"bad \\u{FXXX} esc"').to.deep.equal({
447+
message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{FX".',
448+
locations: [{ line: 1, column: 6 }],
449+
});
450+
451+
expectSyntaxError('"bad \\u{FFFF esc"').to.deep.equal({
452+
message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{FFFF ".',
453+
locations: [{ line: 1, column: 6 }],
454+
});
455+
456+
expectSyntaxError('"bad \\u{FFFF"').to.deep.equal({
457+
message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{FFFF"".',
458+
locations: [{ line: 1, column: 6 }],
459+
});
460+
461+
expectSyntaxError('"too high \\u{110000} esc"').to.deep.equal({
462+
message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{110000}".',
463+
locations: [{ line: 1, column: 11 }],
464+
});
465+
466+
expectSyntaxError('"way too high \\u{12345678} esc"').to.deep.equal({
467+
message:
468+
'Syntax Error: Invalid Unicode escape sequence: "\\u{12345678}".',
469+
locations: [{ line: 1, column: 15 }],
470+
});
471+
472+
expectSyntaxError('"too long \\u{000000000} esc"').to.deep.equal({
473+
message:
474+
'Syntax Error: Invalid Unicode escape sequence: "\\u{000000000".',
475+
locations: [{ line: 1, column: 11 }],
476+
});
477+
478+
expectSyntaxError('"bad surrogate \\uDEAD esc"').to.deep.equal({
479+
message: 'Syntax Error: Invalid Unicode escape sequence: "\\uDEAD".',
480+
locations: [{ line: 1, column: 16 }],
481+
});
482+
483+
expectSyntaxError('"bad surrogate \\u{DEAD} esc"').to.deep.equal({
484+
message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{DEAD}".',
485+
locations: [{ line: 1, column: 16 }],
486+
});
487+
488+
expectSyntaxError(
489+
'"cannot use braces for surrogate pair \\u{D83D}\\u{DE00} esc"',
490+
).to.deep.equal({
491+
message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{D83D}".',
492+
locations: [{ line: 1, column: 39 }],
493+
});
494+
495+
expectSyntaxError(
496+
'"bad high surrogate pair \\uDEAD\\uDEAD esc"',
497+
).to.deep.equal({
498+
message: 'Syntax Error: Invalid Unicode escape sequence: "\\uDEAD".',
499+
locations: [{ line: 1, column: 26 }],
500+
});
501+
502+
expectSyntaxError(
503+
'"bad low surrogate pair \\uD800\\uD800 esc"',
504+
).to.deep.equal({
505+
message: 'Syntax Error: Invalid Unicode escape sequence: "\\uD800".',
506+
locations: [{ line: 1, column: 25 }],
507+
});
508+
509+
expectSyntaxError(
510+
'"cannot escape half a pair \uD83D\\uDE00 esc"',
511+
).to.deep.equal({
512+
message: 'Syntax Error: Invalid character within String: U+D83D.',
513+
locations: [{ line: 1, column: 28 }],
514+
});
515+
516+
expectSyntaxError(
517+
'"cannot escape half a pair \\uD83D\uDE00 esc"',
518+
).to.deep.equal({
519+
message: 'Syntax Error: Invalid Unicode escape sequence: "\\uD83D".',
520+
locations: [{ line: 1, column: 28 }],
521+
});
522+
523+
expectSyntaxError('"bad \\uD83D\\not an escape"').to.deep.equal({
524+
message: 'Syntax Error: Invalid Unicode escape sequence: "\\uD83D".',
525+
locations: [{ line: 1, column: 6 }],
526+
});
358527
});
359528

360529
it('lexes block strings', () => {
@@ -414,6 +583,13 @@ describe('Lexer', () => {
414583
value: 'unescaped \\n\\r\\b\\t\\f\\u1234',
415584
});
416585

586+
expect(lexOne('"""unescaped unicode outside BMP \u{1f600}"""')).to.contain({
587+
kind: TokenKind.BLOCK_STRING,
588+
start: 0,
589+
end: 38,
590+
value: 'unescaped unicode outside BMP \u{1f600}',
591+
});
592+
417593
expect(lexOne('"""slashes \\\\ \\/"""')).to.contain({
418594
kind: TokenKind.BLOCK_STRING,
419595
start: 0,
@@ -486,18 +662,9 @@ describe('Lexer', () => {
486662
locations: [{ line: 1, column: 16 }],
487663
});
488664

489-
expectSyntaxError(
490-
'"""contains unescaped \u0007 control char"""',
491-
).to.deep.equal({
492-
message: 'Syntax Error: Invalid character within String: U+0007.',
493-
locations: [{ line: 1, column: 23 }],
494-
});
495-
496-
expectSyntaxError(
497-
'"""null-byte is not \u0000 end of file"""',
498-
).to.deep.equal({
499-
message: 'Syntax Error: Invalid character within String: U+0000.',
500-
locations: [{ line: 1, column: 21 }],
665+
expectSyntaxError('"""contains invalid surrogate \uDEAD"""').to.deep.equal({
666+
message: 'Syntax Error: Invalid character within String: U+DEAD.',
667+
locations: [{ line: 1, column: 31 }],
501668
});
502669
});
503670

@@ -837,6 +1004,16 @@ describe('Lexer', () => {
8371004
locations: [{ line: 1, column: 1 }],
8381005
});
8391006

1007+
expectSyntaxError('\x00').to.deep.equal({
1008+
message: 'Syntax Error: Unexpected character: U+0000.',
1009+
locations: [{ line: 1, column: 1 }],
1010+
});
1011+
1012+
expectSyntaxError('\b').to.deep.equal({
1013+
message: 'Syntax Error: Unexpected character: U+0008.',
1014+
locations: [{ line: 1, column: 1 }],
1015+
});
1016+
8401017
expectSyntaxError('\u00AA').to.deep.equal({
8411018
message: 'Syntax Error: Unexpected character: U+00AA.',
8421019
locations: [{ line: 1, column: 1 }],
@@ -851,6 +1028,16 @@ describe('Lexer', () => {
8511028
message: 'Syntax Error: Unexpected character: U+203B.',
8521029
locations: [{ line: 1, column: 1 }],
8531030
});
1031+
1032+
expectSyntaxError('\u{1f600}').to.deep.equal({
1033+
message: 'Syntax Error: Unexpected character: U+1F600.',
1034+
locations: [{ line: 1, column: 1 }],
1035+
});
1036+
1037+
expectSyntaxError('\uDEAD').to.deep.equal({
1038+
message: 'Syntax Error: Invalid character: U+DEAD.',
1039+
locations: [{ line: 1, column: 1 }],
1040+
});
8541041
});
8551042

8561043
it('lex reports useful information for dashes in names', () => {
@@ -931,9 +1118,15 @@ describe('Lexer', () => {
9311118
end: 9,
9321119
value: ' Comment',
9331120
});
934-
expectSyntaxError('# \u0007').to.deep.equal({
935-
message: 'Syntax Error: Invalid character: U+0007.',
936-
locations: [{ line: 1, column: 3 }],
1121+
expect(lexOne('# Comment \u{1f600}').prev).to.contain({
1122+
kind: TokenKind.COMMENT,
1123+
start: 0,
1124+
end: 12,
1125+
value: ' Comment \u{1f600}',
1126+
});
1127+
expectSyntaxError('# Invalid surrogate \uDEAD').to.deep.equal({
1128+
message: 'Syntax Error: Invalid character: U+DEAD.',
1129+
locations: [{ line: 1, column: 21 }],
9371130
});
9381131
});
9391132
});

0 commit comments

Comments
 (0)