Skip to content

Commit d0e2492

Browse files
jasnell authored and Fishrock123 committed
net: use icu's punycode implementation
ICU has a punycode implementation built in. Use it instead of the javascript implementation because it's much faster. PR-URL: #7355 Reviewed-By: Trevor Norris <trev.norris@gmail.com> Reviewed-By: Ben Noordhuis <info@bnoordhuis.nl>
1 parent 12b1993 commit d0e2492

File tree

6 files changed

+291
-12
lines changed

6 files changed

+291
-12
lines changed

benchmark/net/punycode.js

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
'use strict';
2+
3+
const common = require('../common.js');
4+
const icu = process.binding('icu');
5+
const punycode = require('punycode');
6+
7+
const bench = common.createBenchmark(main, {
8+
method: ['punycode', 'icu'],
9+
n: [1024],
10+
val: [
11+
'افغانستا.icom.museum',
12+
'الجزائر.icom.museum',
13+
'österreich.icom.museum',
14+
'বাংলাদেশ.icom.museum',
15+
'беларусь.icom.museum',
16+
'belgië.icom.museum',
17+
'българия.icom.museum',
18+
'تشادر.icom.museum',
19+
'中国.icom.museum',
20+
'القمر.icom.museum',
21+
'κυπρος.icom.museum',
22+
'českárepublika.icom.museum',
23+
'مصر.icom.museum',
24+
'ελλάδα.icom.museum',
25+
'magyarország.icom.museum',
26+
'ísland.icom.museum',
27+
'भारत.icom.museum',
28+
'ايران.icom.museum',
29+
'éire.icom.museum',
30+
'איקו״ם.ישראל.museum',
31+
'日本.icom.museum',
32+
'الأردن.icom.museum'
33+
]
34+
});
35+
36+
// Benchmark payload for the JavaScript implementation: encodes `val` with
// punycode.toASCII() and decodes the result again with punycode.toUnicode().
// The decoded value is discarded; only the work matters.
function usingPunycode(val) {
  const encoded = punycode.toASCII(val);
  punycode.toUnicode(encoded);
}
39+
40+
// Benchmark payload for the ICU binding: encodes `val` with icu.toASCII()
// and decodes the result again with icu.toUnicode(). The decoded value is
// discarded; only the work matters.
function usingICU(val) {
  const encoded = icu.toASCII(val);
  icu.toUnicode(encoded);
}
43+
44+
// Times `n` round trips of `val` through the JavaScript punycode module.
// usingPunycode is handed to common.v8ForceOptimization() before the timed
// section starts.
function runPunycode(n, val) {
  common.v8ForceOptimization(usingPunycode, val);
  var done = 0;
  bench.start();
  while (done < n) {
    usingPunycode(val);
    done++;
  }
  bench.end(n);
}
52+
53+
// Times `n` round trips of `val` through the ICU binding.
// usingICU is handed to common.v8ForceOptimization() before the timed
// section starts.
function runICU(n, val) {
  common.v8ForceOptimization(usingICU, val);
  var done = 0;
  bench.start();
  while (done < n) {
    usingICU(val);
    done++;
  }
  bench.end(n);
}
61+
62+
// Benchmark entry point. Reads the iteration count (`conf.n`, coerced to a
// number), the input string (`conf.val`) and the implementation selector
// (`conf.method`), then runs the matching benchmark. Any method other than
// 'punycode' or 'icu' throws.
function main(conf) {
  const iterations = +conf.n;
  const input = conf.val;
  const method = conf.method;

  if (method === 'punycode') {
    runPunycode(iterations, input);
    return;
  }
  if (method === 'icu') {
    runICU(iterations, input);
    return;
  }
  throw new Error('Unexpected method');
}

lib/url.js

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,14 @@
11
'use strict';
22

3-
const punycode = require('punycode');
3+
// Picks the punycode provider: the built-in 'icu' binding when
// process.binding() can load it, otherwise the JavaScript 'punycode' module.
function importPunycode() {
  let provider;
  try {
    provider = process.binding('icu');
  } catch (err) {
    // The binding could not be loaded; fall back to the JS implementation.
    provider = require('punycode');
  }
  return provider;
}
10+
11+
const { toASCII } = importPunycode();
412

513
exports.parse = urlParse;
614
exports.resolve = urlResolve;
@@ -309,7 +317,7 @@ Url.prototype.parse = function(url, parseQueryString, slashesDenoteHost) {
309317
// It only converts parts of the domain name that
310318
// have non-ASCII characters, i.e. it doesn't matter if
311319
// you call it with a domain that already is ASCII-only.
312-
this.hostname = punycode.toASCII(this.hostname);
320+
this.hostname = toASCII(this.hostname);
313321
}
314322

315323
var p = this.port ? ':' + this.port : '';

src/node_i18n.cc

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,16 @@
2323

2424
#if defined(NODE_HAVE_I18N_SUPPORT)
2525

26+
#include "node.h"
27+
#include "env.h"
28+
#include "env-inl.h"
29+
#include "util.h"
30+
#include "util-inl.h"
31+
#include "v8.h"
32+
2633
#include <unicode/putil.h>
2734
#include <unicode/udata.h>
35+
#include <unicode/uidna.h>
2836

2937
#ifdef NODE_HAVE_SMALL_ICU
3038
/* if this is defined, we have a 'secondary' entry point.
@@ -43,6 +51,13 @@ extern "C" const char U_DATA_API SMALL_ICUDATA_ENTRY_POINT[];
4351

4452
namespace node {
4553

54+
using v8::Context;
55+
using v8::FunctionCallbackInfo;
56+
using v8::Local;
57+
using v8::Object;
58+
using v8::String;
59+
using v8::Value;
60+
4661
bool flag_icu_data_dir = false;
4762

4863
namespace i18n {
@@ -64,7 +79,124 @@ bool InitializeICUDirectory(const char* icu_data_path) {
6479
}
6580
}
6681

82+
// Converts the UTF-8 domain name `input` (`length` bytes) to its Unicode form
// via ICU's UTS #46 API, writing the UTF-8 result into `buf`.
//
// Returns the length reported by ICU, or -1 if opening the UTS #46 instance
// or the conversion itself reports a failing UErrorCode.
//
// NOTE(review): the UIDNAInfo filled in by ICU is never inspected here, so
// only UErrorCode failures are surfaced to the caller.
static int32_t ToUnicode(MaybeStackBuffer<char>* buf,
                         const char* input,
                         size_t length) {
  UErrorCode err = U_ZERO_ERROR;
  const uint32_t flags = UIDNA_DEFAULT | UIDNA_NONTRANSITIONAL_TO_UNICODE;
  UIDNA* idna = uidna_openUTS46(flags, &err);
  if (U_FAILURE(err))
    return -1;

  UIDNAInfo details = UIDNA_INFO_INITIALIZER;
  int32_t written = uidna_nameToUnicodeUTF8(idna,
                                            input, length,
                                            **buf, buf->length(),
                                            &details,
                                            &err);

  // First attempt did not fit: grow the buffer to the size ICU asked for and
  // run the conversion once more with a clean status.
  if (err == U_BUFFER_OVERFLOW_ERROR) {
    err = U_ZERO_ERROR;
    buf->AllocateSufficientStorage(written);
    written = uidna_nameToUnicodeUTF8(idna,
                                      input, length,
                                      **buf, buf->length(),
                                      &details,
                                      &err);
  }

  uidna_close(idna);
  return U_FAILURE(err) ? -1 : written;
}
115+
116+
// Converts the UTF-8 domain name `input` (`length` bytes) to its ASCII
// (punycode) form via ICU's UTS #46 API, writing the result into `buf`.
//
// Returns the length reported by ICU, or -1 if opening the UTS #46 instance
// or the conversion itself reports a failing UErrorCode.
//
// NOTE(review): the UIDNAInfo filled in by ICU is never inspected here, so
// only UErrorCode failures are surfaced to the caller.
static int32_t ToASCII(MaybeStackBuffer<char>* buf,
                       const char* input,
                       size_t length) {
  UErrorCode err = U_ZERO_ERROR;
  const uint32_t flags = UIDNA_DEFAULT | UIDNA_NONTRANSITIONAL_TO_ASCII;
  UIDNA* idna = uidna_openUTS46(flags, &err);
  if (U_FAILURE(err))
    return -1;

  UIDNAInfo details = UIDNA_INFO_INITIALIZER;
  int32_t written = uidna_nameToASCII_UTF8(idna,
                                           input, length,
                                           **buf, buf->length(),
                                           &details,
                                           &err);

  // First attempt did not fit: grow the buffer to the size ICU asked for and
  // run the conversion once more with a clean status.
  if (err == U_BUFFER_OVERFLOW_ERROR) {
    err = U_ZERO_ERROR;
    buf->AllocateSufficientStorage(written);
    written = uidna_nameToASCII_UTF8(idna,
                                     input, length,
                                     **buf, buf->length(),
                                     &details,
                                     &err);
  }

  uidna_close(idna);
  return U_FAILURE(err) ? -1 : written;
}
149+
150+
static void ToUnicode(const FunctionCallbackInfo<Value>& args) {
151+
Environment* env = Environment::GetCurrent(args);
152+
CHECK_GE(args.Length(), 1);
153+
CHECK(args[0]->IsString());
154+
Utf8Value val(env->isolate(), args[0]);
155+
MaybeStackBuffer<char> buf;
156+
int32_t len = ToUnicode(&buf, *val, val.length());
157+
158+
if (len < 0) {
159+
return env->ThrowError("Cannot convert name to Unicode");
160+
}
161+
162+
args.GetReturnValue().Set(
163+
String::NewFromUtf8(env->isolate(),
164+
*buf,
165+
v8::NewStringType::kNormal,
166+
len).ToLocalChecked());
167+
}
168+
169+
static void ToASCII(const FunctionCallbackInfo<Value>& args) {
170+
Environment* env = Environment::GetCurrent(args);
171+
CHECK_GE(args.Length(), 1);
172+
CHECK(args[0]->IsString());
173+
Utf8Value val(env->isolate(), args[0]);
174+
MaybeStackBuffer<char> buf;
175+
int32_t len = ToASCII(&buf, *val, val.length());
176+
177+
if (len < 0) {
178+
return env->ThrowError("Cannot convert name to ASCII");
179+
}
180+
181+
args.GetReturnValue().Set(
182+
String::NewFromUtf8(env->isolate(),
183+
*buf,
184+
v8::NewStringType::kNormal,
185+
len).ToLocalChecked());
186+
}
187+
188+
// Module initializer for the `icu` binding: installs the `toUnicode` and
// `toASCII` methods on `target`. `unused` and `priv` are not read.
void Init(Local<Object> target,
          Local<Value> unused,
          Local<Context> context,
          void* priv) {
  Environment* const environment = Environment::GetCurrent(context);

  environment->SetMethod(target, "toUnicode", ToUnicode);
  environment->SetMethod(target, "toASCII", ToASCII);
}
196+
67197
} // namespace i18n
68198
} // namespace node
69199

200+
NODE_MODULE_CONTEXT_AWARE_BUILTIN(icu, node::i18n::Init)
201+
70202
#endif // NODE_HAVE_I18N_SUPPORT

test/parallel/test-icu-punycode.js

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
'use strict';
2+
3+
const common = require('../common');
4+
const icu = getPunycode();
5+
const assert = require('assert');
6+
7+
// Returns the built-in 'icu' binding, or undefined when process.binding()
// throws while loading it.
function getPunycode() {
  let binding;
  try {
    binding = process.binding('icu');
  } catch (err) {
    // Binding could not be loaded: `binding` stays undefined.
  }
  return binding;
}
14+
15+
if (!icu) {
16+
common.skip('icu punycode tests because ICU is not present.');
17+
return;
18+
}
19+
20+
// Credit for list: http://www.i18nguy.com/markup/idna-examples.html
21+
const tests = [
22+
'افغانستا.icom.museum',
23+
'الجزائر.icom.museum',
24+
'österreich.icom.museum',
25+
'বাংলাদেশ.icom.museum',
26+
'беларусь.icom.museum',
27+
'belgië.icom.museum',
28+
'българия.icom.museum',
29+
'تشادر.icom.museum',
30+
'中国.icom.museum',
31+
'القمر.icom.museum',
32+
'κυπρος.icom.museum',
33+
'českárepublika.icom.museum',
34+
'مصر.icom.museum',
35+
'ελλάδα.icom.museum',
36+
'magyarország.icom.museum',
37+
'ísland.icom.museum',
38+
'भारत.icom.museum',
39+
'ايران.icom.museum',
40+
'éire.icom.museum',
41+
'איקו״ם.ישראל.museum',
42+
'日本.icom.museum',
43+
'الأردن.icom.museum',
44+
'қазақстан.icom.museum',
45+
'한국.icom.museum',
46+
'кыргызстан.icom.museum',
47+
'ລາວ.icom.museum',
48+
'لبنان.icom.museum',
49+
'македонија.icom.museum',
50+
'موريتانيا.icom.museum',
51+
'méxico.icom.museum',
52+
'монголулс.icom.museum',
53+
'المغرب.icom.museum',
54+
'नेपाल.icom.museum',
55+
'عمان.icom.museum',
56+
'قطر.icom.museum',
57+
'românia.icom.museum',
58+
'россия.иком.museum',
59+
'србијаицрнагора.иком.museum',
60+
'இலங்கை.icom.museum',
61+
'españa.icom.museum',
62+
'ไทย.icom.museum',
63+
'تونس.icom.museum',
64+
'türkiye.icom.museum',
65+
'украина.icom.museum',
66+
'việtnam.icom.museum'
67+
];
68+
69+
// Testing the roundtrip
70+
// Every name must survive toASCII() followed by toUnicode() unchanged.
// assert.strictEqual takes (actual, expected): the computed round-trip value
// goes first so a failure report labels the two sides correctly.
tests.forEach((name) => {
  const roundTripped = icu.toUnicode(icu.toASCII(name));
  assert.strictEqual(roundTripped, name);
});

tools/icu/icu-generic.gyp

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -37,8 +37,7 @@
3737
'defines': [
3838
# ICU cannot swap the initial data without this.
3939
# http://bugs.icu-project.org/trac/ticket/11046
40-
'UCONFIG_NO_LEGACY_CONVERSION=1',
41-
'UCONFIG_NO_IDNA=1',
40+
'UCONFIG_NO_LEGACY_CONVERSION=1'
4241
],
4342
}],
4443
],
@@ -428,9 +427,6 @@
428427
#'<(icu_path)/source/common/ubidi_props_data.h',
429428
# and the callers
430429
'<(icu_path)/source/common/ushape.cpp',
431-
'<(icu_path)/source/common/usprep.cpp',
432-
'<(icu_path)/source/common/uts46.cpp',
433-
'<(icu_path)/source/common/uidna.cpp',
434430
]}],
435431
[ 'icu_ver_major == 57', { 'sources!': [
436432
# work around http://bugs.icu-project.org/trac/ticket/12451
@@ -447,9 +443,6 @@
447443
#'<(icu_path)/source/common/ubidi_props_data.h',
448444
# and the callers
449445
'<(icu_path)/source/common/ushape.cpp',
450-
'<(icu_path)/source/common/usprep.cpp',
451-
'<(icu_path)/source/common/uts46.cpp',
452-
'<(icu_path)/source/common/uidna.cpp',
453446
]}],
454447
[ 'OS == "solaris"', { 'defines': [
455448
'_XOPEN_SOURCE_EXTENDED=0',

tools/icu/icu_small.json

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
"region": "none",
2525
"zone": "locales",
2626
"converters": "none",
27-
"stringprep": "none",
27+
"stringprep": "locales",
2828
"translit": "none",
2929
"brkfiles": "none",
3030
"brkdict": "none",
@@ -34,7 +34,6 @@
3434
"remove": [
3535
"cnvalias.icu",
3636
"postalCodeData.res",
37-
"uts46.nrm",
3837
"genderList.res",
3938
"brkitr/root.res",
4039
"unames.icu"

0 commit comments

Comments
 (0)