Skip to content

Commit bb4fb22

Browse files
enh(python) Add support for unicode identifiers (#3280)
* Add support for diacritics in Python identifiers
* Add performance testing scripts
* Rebuild hljs before each perf test

Co-authored-by: Josh Goebel <me@joshgoebel.com>
1 parent 1fcf615 commit bb4fb22

File tree

12 files changed

+1882
-17
lines changed

12 files changed

+1882
-17
lines changed

CHANGES.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,25 @@
11
## Version 11.3.0 (most likely)
22

3+
Parser:
4+
5+
- add first rough performance testing script (#3280) [Austin Schick][]
6+
7+
Grammars:
8+
9+
- enh(python) add support for Unicode identifiers (#3280) [Austin Schick][]
310
- enh(css/less/stylus/scss) improve consistency of function dispatch (#3301) [Josh Goebel][]
411
- enh(css/less/stylus/scss) detect block comments more fully (#3301) [Josh Goebel][]
512
- fix(cpp) switch is a keyword (#3312) [Josh Goebel][]
613
- fix(cpp) fix `xor_eq` keyword highlighting. [Denis Kovalchuk][]
714
- enh(c,cpp) highlight type modifiers as type (#3316) [Josh Goebel][]
815
- enh(css/less/stylus/scss) add support for CSS Grid properties [monochromer][]
916

17+
[Austin Schick]: https://github.com/austin-schick
1018
[Josh Goebel]: https://github.com/joshgoebel
1119
[Denis Kovalchuk]: https://github.com/deniskovalchuk
1220
[monochromer]: https://github.com/monochromer
1321

22+
1423
## Version 11.2.0
1524

1625
Build:
@@ -41,6 +50,7 @@ New Languages:
4150
[Bradley Mackey]: https://github.com/bradleymackey
4251
[Dereavy]: https://github.com/dereavy
4352

53+
4454
## Version 11.1.0
4555

4656
Grammars:

docs/mode-reference.rst

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,12 +42,22 @@ name
4242
The canonical name of this language, e.g. "JavaScript".
4343

4444

45+
unicodeRegex
46+
^^^^^^^^^^^^
47+
48+
- **type**: boolean
49+
50+
Expresses whether the grammar in question uses Unicode (``u`` flag) regular expressions.
51+
(defaults to false)
52+
53+
4554
case_insensitive
4655
^^^^^^^^^^^^^^^^
4756

4857
- **type**: boolean
4958

5059
Case insensitivity of language keywords and regexps. Used only on the top-level mode.
60+
(defaults to false)
5161

5262

5363
aliases
@@ -92,6 +102,7 @@ disableAutodetect
92102
- **type**: boolean
93103

94104
Disables autodetection for this language.
105+
(defaults to false, meaning auto-detect is enabled)
95106

96107

97108
compilerExtensions

src/languages/python.js

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,10 @@ Website: https://www.python.org
55
Category: common
66
*/
77

8-
import { UNDERSCORE_IDENT_RE } from '../lib/modes.js';
98
import * as regex from '../lib/regex.js';
109

1110
export default function(hljs) {
11+
const IDENT_RE = /[\p{XID_Start}_]\p{XID_Continue}*/u;
1212
const RESERVED_WORDS = [
1313
'and',
1414
'as',
@@ -358,6 +358,7 @@ export default function(hljs) {
358358
'gyp',
359359
'ipython'
360360
],
361+
unicodeRegex: true,
361362
keywords: KEYWORDS,
362363
illegal: /(<\/|->|\?)|=>/,
363364
contains: [
@@ -379,7 +380,7 @@ export default function(hljs) {
379380
{
380381
match: [
381382
/def/, /\s+/,
382-
UNDERSCORE_IDENT_RE
383+
IDENT_RE,
383384
],
384385
scope: {
385386
1: "keyword",
@@ -392,14 +393,14 @@ export default function(hljs) {
392393
{
393394
match: [
394395
/class/, /\s+/,
395-
UNDERSCORE_IDENT_RE, /\s*/,
396-
/\(\s*/, UNDERSCORE_IDENT_RE,/\s*\)/
396+
IDENT_RE, /\s*/,
397+
/\(\s*/, IDENT_RE,/\s*\)/
397398
],
398399
},
399400
{
400401
match: [
401402
/class/, /\s+/,
402-
UNDERSCORE_IDENT_RE
403+
IDENT_RE
403404
],
404405
}
405406
],

src/lib/mode_compiler.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ export function compileLanguage(language) {
3333
function langRe(value, global) {
3434
return new RegExp(
3535
regex.source(value),
36-
'm' + (language.case_insensitive ? 'i' : '') + (global ? 'g' : '')
36+
'm' + (language.case_insensitive ? 'i' : '') + (language.unicodeRegex ? 'u' : '') + (global ? 'g' : '')
3737
);
3838
}
3939

test/markup/index.js

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -46,16 +46,23 @@ describe('highlight() markup', async() => {
4646
const markupPath = utility.buildPath('markup');
4747

4848
if (!process.env.ONLY_EXTRA) {
49-
const languages = await fs.readdir(markupPath);
49+
let languages = null;
50+
if (process.env.ONLY_LANGUAGES) {
51+
languages = process.env.ONLY_LANGUAGES.split(" ");
52+
} else {
53+
languages = await fs.readdir(markupPath);
54+
}
5055
languages.forEach(testLanguage);
5156
}
5257

53-
const thirdPartyPackages = await getThirdPartyPackages();
54-
thirdPartyPackages.forEach(
55-
(pkg) => pkg.names.forEach(
56-
(name, idx) => testLanguage(name, { testDir: pkg.markupTestPaths[idx] })
57-
)
58-
);
58+
if (!process.env.ONLY_LANGUAGES) {
59+
const thirdPartyPackages = await getThirdPartyPackages();
60+
thirdPartyPackages.forEach(
61+
(pkg) => pkg.names.forEach(
62+
(name, idx) => testLanguage(name, { testDir: pkg.markupTestPaths[idx] })
63+
)
64+
);
65+
}
5966
});
6067

6168
it("adding dynamic tests...", async function() {}); // this is required to work
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
<span class="hljs-keyword">def</span> <span class="hljs-title function_">fóö</span>():
2+
<span class="hljs-keyword">pass</span>
3+
4+
<span class="hljs-keyword">def</span> <span class="hljs-title function_">bär</span>():
5+
<span class="hljs-keyword">pass</span>
6+
7+
<span class="hljs-keyword">def</span> <span class="hljs-title function_">FOÖ</span>():
8+
<span class="hljs-keyword">pass</span>
9+
10+
<span class="hljs-keyword">def</span> <span class="hljs-title function_">ÿay</span>():
11+
<span class="hljs-keyword">pass</span>
12+
13+
<span class="hljs-keyword">class</span> <span class="hljs-title class_">fóö</span>():
14+
<span class="hljs-keyword">pass</span>
15+
16+
<span class="hljs-keyword">class</span> <span class="hljs-title class_">bär</span>():
17+
<span class="hljs-keyword">pass</span>
18+
19+
<span class="hljs-keyword">class</span> <span class="hljs-title class_">FOÖ</span>():
20+
<span class="hljs-keyword">pass</span>
21+
22+
<span class="hljs-keyword">class</span> <span class="hljs-title class_">ÿay</span>():
23+
<span class="hljs-keyword">pass</span>
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
def fóö():
2+
pass
3+
4+
def bär():
5+
pass
6+
7+
def FOÖ():
8+
pass
9+
10+
def ÿay():
11+
pass
12+
13+
class fóö():
14+
pass
15+
16+
class bär():
17+
pass
18+
19+
class FOÖ():
20+
pass
21+
22+
class ÿay():
23+
pass

test/regex/lib/util.js

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,9 @@ const { RegExpParser } = require('regexpp');
88
* @typedef {{ pattern: Pattern, flags: Flags }} LiteralAST
99
*/
1010

11-
const parser = new RegExpParser({ strict: false, ecmaVersion: 6 });
11+
const parser = new RegExpParser({ strict: false, ecmaVersion: 2018 });
12+
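// ecmaVersion 2018 (ECMAScript 9) is needed so the parser accepts Unicode
// property escapes such as \p{XID_Start}, which the Python grammar now uses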
13+
1214
/** @type {Map<string, LiteralAST>} */
1315
const astCache = new Map();
1416

tools/checkAutoDetect.js

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -58,12 +58,16 @@ function testAutoDetection(language, index, languages) {
5858
});
5959
}
6060

61-
const languages = hljs.listLanguages()
62-
.filter(hljs.autoDetection);
61+
let languages = null;
62+
if (process.env.ONLY_LANGUAGES) {
63+
languages = process.env.ONLY_LANGUAGES.split(" ");
64+
} else {
65+
languages = hljs.listLanguages().filter(hljs.autoDetection);
66+
}
6367

6468
console.log('Checking auto-highlighting for ' + colors.grey(languages.length) + ' languages...');
6569
languages.forEach((lang, index) => {
66-
if (index%60===0) { console.log("") }
70+
if (index % 60 === 0) { console.log(""); }
6771
testAutoDetection(lang)
6872
process.stdout.write(".");
6973
});

tools/perf.js

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
#!/usr/bin/env node
2+
const execSync = require('child_process').execSync;
3+
const fs = require('fs');
4+
const { performance } = require('perf_hooks');
5+
6+
/**
 * Rebuilds highlight.js by running `npm run build` synchronously from the
 * current working directory, so the perf tests measure the current sources.
 */
const build = () => {
  console.log(`Starting perf tests, building hljs ... `);
  // The child process inherits this process's environment unchanged.
  const buildOptions = { cwd: '.', env: process.env };
  // build node.js version of library with CJS and ESM libraries
  execSync('npm run build', buildOptions);
};
16+
17+
/**
 * Runs `func` once and reports how long it took.
 *
 * Writes " running <name>..." before the call and " done! [<seconds>s elapsed]"
 * (two decimal places) after it returns.
 *
 * @param {string} name - label printed for this measurement
 * @param {() => void} func - synchronous workload to time
 */
const timeTest = (name, func) => {
  process.stdout.write(` running ${name}...`);
  const startedAt = performance.now();
  func();
  const elapsedMs = performance.now() - startedAt;
  const elapsedSeconds = (elapsedMs / 1000).toFixed(2);
  console.log(` done! [${elapsedSeconds}s elapsed]`);
};
24+
25+
/**
 * Runs the mocha markup test suite 50 times, restricted to one language.
 *
 * @param {string} lang - language name handed to the test run through the
 *   ONLY_LANGUAGES environment variable
 */
const oneLanguageMarkupTests = (lang) => {
  // Build a fresh environment object instead of assigning into `process.env`:
  // mutating `process.env` would leak ONLY_LANGUAGES into every child process
  // spawned later by this script (including the "global" runs).
  const env = Object.assign({}, process.env, { ONLY_LANGUAGES: lang });
  for (let i = 0; i < 50; i++) {
    execSync('npx mocha ./test/markup', { cwd: '.', env });
  }
};
36+
37+
/**
 * Runs tools/checkAutoDetect.js 50 times, restricted to one language.
 *
 * @param {string} lang - language name handed to the script through the
 *   ONLY_LANGUAGES environment variable
 */
const oneLanguageCheckAutoDetect = (lang) => {
  // Build a fresh environment object instead of assigning into `process.env`:
  // mutating `process.env` would leak ONLY_LANGUAGES into every child process
  // spawned later by this script.
  const env = Object.assign({}, process.env, { ONLY_LANGUAGES: lang });
  for (let i = 0; i < 50; i++) {
    execSync('node ./tools/checkAutoDetect.js', { env });
  }
};
47+
48+
/**
 * Runs tools/checkAutoDetect.js five times in a row with the inherited
 * environment.
 */
const globalCheckAutoDetect = () => {
  let remainingRuns = 5;
  while (remainingRuns > 0) {
    execSync('node ./tools/checkAutoDetect.js');
    remainingRuns -= 1;
  }
};
53+
54+
/**
 * Highlights the sample file for `lang` 2000 times using the built library.
 *
 * @param {string} lang - language name; selects both the sample file
 *   (./tools/sample_files/<lang>.txt) and the grammar used for highlighting
 */
const highlightFile = (lang) => {
  const samplePath = `./tools/sample_files/${lang}.txt`;
  const source = fs.readFileSync(samplePath, { encoding: 'utf8' });
  // Loaded after the sample is read, inside the function rather than at
  // module top level.
  const hljs = require('../build');
  const options = { language: lang };
  let remaining = 2000;
  while (remaining > 0) {
    // A new options object per call, as in the original loop.
    hljs.highlight(source, Object.assign({}, options));
    remaining -= 1;
  }
};
61+
62+
/**
 * Entry point: rebuilds the library, then times each perf scenario in order.
 *
 * @param {string} lang - language used for the single-language scenarios
 */
const main = (lang) => {
  build();

  // Scenarios run sequentially, in the order listed.
  const scenarios = [
    [`global checkAutoDetect`, globalCheckAutoDetect],
    [`${lang}-only markup tests`, () => oneLanguageMarkupTests(lang)],
    [`${lang}-only checkAutoDetect`, () => oneLanguageCheckAutoDetect(lang)],
    [`highlight large file`, () => highlightFile(lang)]
  ];

  for (const [label, run] of scenarios) {
    timeTest(label, run);
  }
};
69+
70+
main('python');

0 commit comments

Comments
 (0)