Skip to content

Commit

Permalink
regexp-generator: Implement downstream changes
Browse files Browse the repository at this point in the history
The optimizations from commit e558b29 were never incorporated into the
upstream test generator. This does so now.

As far as I can tell, the changes to the Unicode ranges are purely
cosmetic. Some are formatted as 6-digit hex numbers instead of 4-digit.
Others move the low-surrogates range 0xDC00-0xDFFF to the beginning of the
array, but the union of the ranges is still the same.
  • Loading branch information
ptomato committed Nov 12, 2024
1 parent 8793268 commit 07ddc3b
Show file tree
Hide file tree
Showing 10 changed files with 96 additions and 32 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ includes: [regExpUtils.js]
const str = buildString({
loneCodePoints: [],
ranges: [
[0x0030, 0x0039],
[0x000030, 0x000039],
],
});

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ includes: [regExpUtils.js]
const str = buildString({
loneCodePoints: [],
ranges: [
[0x0030, 0x0039],
[0x000030, 0x000039],
],
});

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ includes: [regExpUtils.js]
const str = buildString({
loneCodePoints: [],
ranges: [
[0x0030, 0x0039],
[0x000030, 0x000039],
],
});

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ includes: [regExpUtils.js]
const str = buildString({
loneCodePoints: [],
ranges: [
[0x0030, 0x0039],
[0x000030, 0x000039],
],
});

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,10 @@ includes: [regExpUtils.js]
const str = buildString({
loneCodePoints: [],
ranges: [
[0x00DC00, 0x00DFFF],
[0x000000, 0x00002F],
[0x00003A, 0x10FFFF],
[0x00003A, 0x00DBFF],
[0x00E000, 0x10FFFF],
],
});

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,13 @@ includes: [regExpUtils.js]
---*/

const str = buildString({
loneCodePoints: [],
ranges: [
[0x000000, 0x00002F],
[0x00003A, 0x10FFFF],
],
loneCodePoints: [],
ranges: [
[0x00DC00, 0x00DFFF],
[0x000000, 0x00002F],
[0x00003A, 0x00DBFF],
[0x00E000, 0x10FFFF],
],
});

const re = /\D+/ug;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,13 @@ includes: [regExpUtils.js]
---*/

const str = buildString({
loneCodePoints: [],
ranges: [
[0x000000, 0x00002F],
[0x00003A, 0x00FFFF],
],
loneCodePoints: [],
ranges: [
[0x00DC00, 0x00DFFF],
[0x000000, 0x00002F],
[0x00003A, 0x00DBFF],
[0x00E000, 0x00FFFF],
],
});

const re = /\D+/g;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,13 @@ includes: [regExpUtils.js]
---*/

const str = buildString({
loneCodePoints: [],
ranges: [
[0x000000, 0x00002F],
[0x00003A, 0x00FFFF],
],
loneCodePoints: [],
ranges: [
[0x00DC00, 0x00DFFF],
[0x000000, 0x00002F],
[0x00003A, 0x00DBFF],
[0x00E000, 0x00FFFF],
],
});

const re = /\D/g;
Expand Down
78 changes: 67 additions & 11 deletions tools/regexp-generator/index.mjs
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import filenamify from 'filenamify';
import fs from 'node:fs';
import jsesc from 'jsesc';
import regenerate from 'regenerate';
import rewritePattern from 'regexpu-core';
import ESCAPE_SETS from 'regexpu-core/data/character-class-escape-sets.js';
import slugify from 'slugify';

import header from './header.mjs';
Expand All @@ -15,30 +16,85 @@ const patterns = {
'non-digit class escape': '\\D',
};

// Pretty-printing code adapted from unicode-property-escapes-tests.
// https://github.com/mathiasbynens/unicode-property-escapes-tests/blob/60f2dbec2b2a840ee67aa04dbd3449bb90fd2999/regenerate.js

/**
 * Formats a code point as a `0x`-prefixed, upper-case hexadecimal literal
 * that is exactly six digits wide (for example, 0x30 becomes "0x000030").
 *
 * @param {number} codePoint - Value to format.
 * @returns {string} The hexadecimal literal text.
 */
function toHex(codePoint) {
  const digits = codePoint.toString(16).toUpperCase();
  // Left-pad short values with zeros, then keep only the last six characters
  // so the output width is fixed.
  const sixDigits = digits.padStart(6, '0').slice(-6);
  return `0x${sixDigits}`;
}

/**
 * Splits a set's flat range data into lone code points and inclusive ranges.
 *
 * `reg.data` is read two entries at a time as `(start, endExclusive)` pairs.
 * A pair covering a single code point is reported as a lone code point;
 * any other pair is reported as an inclusive `[start, end]` range.
 *
 * @param {{ data: number[] }} reg - Object exposing the flat pair list.
 * @returns {[number[], Array<[number, number]>]} A two-element array:
 *   the lone code points, then the inclusive ranges, both in input order.
 */
function toTestData(reg) {
  const { data } = reg;
  const loneCodePoints = [];
  const ranges = [];
  // Step through the data one `(start, endExclusive)` pair at a time.
  for (let index = 0; index < data.length; index += 2) {
    const start = data[index];
    const end = data[index + 1] - 1; // Note: the `- 1` makes `end` inclusive.
    if (start === end) {
      loneCodePoints.push(start);
    } else {
      ranges.push([start, end]);
    }
  }
  return [loneCodePoints, ranges];
}

/**
 * Renders lone code points and ranges as the source text of an object
 * literal of the form `{ loneCodePoints: [...], ranges: [...] }`.
 *
 * Every number is written with `toHex`. An empty list is written as `[]`,
 * a single lone code point stays on one line, and every other list is
 * written with one entry per line and a trailing comma.
 *
 * @param {[number[], Array<[number, number]>]} testData - Lone code points
 *   followed by inclusive `[start, end]` ranges.
 * @returns {string} The formatted object-literal text.
 */
function prettyPrint([ loneCodePoints, ranges ]) {
  const indent = ' ';
  const nested = `${indent}${indent}`;

  // One entry per line at the nested indent, each followed by a comma.
  const formatMultiline = (entries) =>
    `[\n${nested}${entries.join(`,\n${nested}`)},\n${indent}]`;

  const hexCodePoints = loneCodePoints.map((codePoint) => toHex(codePoint));
  const hexRanges = ranges.map(
    ([start, end]) => `[${toHex(start)}, ${toHex(end)}]`
  );

  let loneCodePointsOutput;
  if (hexCodePoints.length === 0) {
    loneCodePointsOutput = '[]';
  } else if (hexCodePoints.length === 1) {
    loneCodePointsOutput = `[${hexCodePoints[0]}]`;
  } else {
    loneCodePointsOutput = formatMultiline(hexCodePoints);
  }

  const rangesOutput = hexRanges.length === 0 ? '[]' : formatMultiline(hexRanges);

  return [
    '{',
    `${indent}loneCodePoints: ${loneCodePointsOutput},`,
    `${indent}ranges: ${rangesOutput},`,
    '}',
  ].join('\n');
}

// Set covering the low-surrogate code points 0xDC00 through 0xDFFF.
// buildString() intersects each escape set with it so that any low
// surrogates can be pulled out and listed ahead of the remaining code points.
const LOW_SURROGATES = regenerate().addRange(0xDC00, 0xDFFF);

/**
 * Produces the pretty-printed `{ loneCodePoints, ranges }` text for the
 * code points matched by a character class escape.
 *
 * The escape set is looked up in `ESCAPE_SETS` (the `UNICODE` table when
 * `flags` contains `u`, otherwise `REGULAR`). If the set contains any low
 * surrogates, they are emitted first, followed by the rest of the set.
 *
 * @param {string} escapeChar - Escape letter used as the lookup key.
 * @param {string} flags - RegExp flags; only the presence of `u` is checked.
 * @returns {string} Object-literal source text produced by `prettyPrint`.
 */
function buildString(escapeChar, flags) {
  const setName = flags.includes('u') ? 'UNICODE' : 'REGULAR';
  const escapeData = ESCAPE_SETS[setName].get(escapeChar);

  // Work on clones so the shared escape set is never modified.
  const lowSurrogates = escapeData.clone().intersection(LOW_SURROGATES);
  if (lowSurrogates.data.length === 0) {
    // No low surrogates in the set: print it unchanged.
    return prettyPrint(toTestData(escapeData));
  }

  const rest = escapeData.clone().remove(LOW_SURROGATES);
  const [ lowLoneCodePoints, lowRanges ] = toTestData(lowSurrogates);
  const [ restLoneCodePoints, restRanges ] = toTestData(rest);

  // Low-surrogate entries come first, then everything else.
  return prettyPrint([
    [ ...lowLoneCodePoints, ...restLoneCodePoints ],
    [ ...lowRanges, ...restRanges ],
  ]);
}

function buildContent(desc, pattern, range, max, flags, skip180e) {
let string = buildString(pattern[1], flags);
let method;
let features = [];

let content = header(`Compare range for ${desc} ${pattern} with flags ${flags}`);

content += `
const str = buildString({ loneCodePoints: [], ranges: [[0, ${
jsesc(max, { numbers: 'hexadecimal' })
}]] });
const str = buildString(${string});
const re = /${pattern}/${flags};
const matchingRange = /${range}/${flags};
const errors = [];
function matching(str) {
return str.replace(re, '') === str.replace(matchingRange, '');
}
if (!matching(str)) {
if (!re.test(str)) {
// Error, let's find out where
for (const char of str) {
if (!matching(char)) {
if (!re.test(char)) {
errors.push('0x' + char.codePointAt(0).toString(16));
}
}
Expand Down
2 changes: 1 addition & 1 deletion tools/regexp-generator/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@
"license": "MIT",
"devDependencies": {
"filenamify": "^6.0.0",
"jsesc": "^3.0.2",
"mkdirp": "^3.0.1",
"regenerate": "^1.4.2",
"regexpu-core": "^6.1.1",
"rimraf": "^6.0.1",
"slugify": "^1.6.6"
Expand Down

0 comments on commit 07ddc3b

Please sign in to comment.