Skip to content

Commit

Permalink
Add a few more confusable map entries
Browse files Browse the repository at this point in the history
1. Map Malayalam U+0D1F to 's'.
2. Map 'small-cap-like' Cyrillic letters to "look-alike" Latin lowercase
letters.

The characters in the new confusable map entries are replaced by their Latin
"look-alike" characters before the skeleton is calculated and compared with
top domain names.

Bug: 784761,773930
Test: components_unittests --gtest_filter=*IDNToUni*
Change-Id: Ib26664e21ac5eb290e4a2993b01cbf0edaade0ee
Reviewed-on: https://chromium-review.googlesource.com/805214
Reviewed-by: Peter Kasting <pkasting@chromium.org>
Commit-Queue: Jungshik Shin <jshin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#521648}
  • Loading branch information
jungshik authored and Commit Bot committed Dec 5, 2017
1 parent cc432e1 commit b3f0207
Show file tree
Hide file tree
Showing 6 changed files with 46 additions and 12 deletions.
32 changes: 24 additions & 8 deletions components/url_formatter/idn_spoof_checker.cc
Original file line number Diff line number Diff line change
Expand Up @@ -133,15 +133,31 @@ IDNSpoofChecker::IDNSpoofChecker() {

// Used for diacritics-removal before the skeleton calculation. Add
// "ł > l; ø > o; đ > d" that are not handled by "NFD; Nonspacing mark
// removal; NFC". On top of that, supplement the Unicode confusable list by
// replacing {U+043A (к), U+0138(ĸ), U+03BA(κ)}, U+04CF (ӏ) and U+043F(п) by
// 'k', 'l' and 'n', respectively.
// removal; NFC".
// TODO(jshin): Revisit "ł > l; ø > o" mapping.
UParseError parse_error;
transliterator_.reset(icu::Transliterator::createFromRules(
diacritic_remover_.reset(icu::Transliterator::createFromRules(
UNICODE_STRING_SIMPLE("DropAcc"),
icu::UnicodeString("::NFD; ::[:Nonspacing Mark:] Remove; ::NFC;"
" ł > l; ø > o; đ > d; ӏ > l; [кĸκ] > k; п > n;"),
" ł > l; ø > o; đ > d;"),
UTRANS_FORWARD, parse_error, status));

// Supplement the Unicode confusable list with the following mappings.
// - U+04CF (ӏ) => l
// - {U+043A (к), U+0138(ĸ), U+03BA(κ)} => k
// - U+043F(п) => n
// - {U+0185 (ƅ), U+044C (ь)} => b
// - U+0432 (в) => b
// - U+043C (м) => m
// - U+043D (н) => h
// - U+0442 (т) => t
// - {U+0448 (ш), U+0449 (щ)} => w
// - U+0D1F (ട) => s
extra_confusable_mapper_.reset(icu::Transliterator::createFromRules(
UNICODE_STRING_SIMPLE("ExtraConf"),
icu::UnicodeString(
"ӏ > l; [кĸκ] > k; п > n; [ƅь] > b; в > b; м > m; н > h; "
"т > t; [шщ] > w; ട > s;"),
UTRANS_FORWARD, parse_error, status));
DCHECK(U_SUCCESS(status))
<< "Spoofchecker initalization failed due to an error: "
Expand Down Expand Up @@ -270,7 +286,8 @@ bool IDNSpoofChecker::SimilarToTopDomains(base::StringPiece16 hostname) {
// attached to non-LGC characters are already blocked.
if (lgc_letters_n_ascii_.span(ustr_host, 0, USET_SPAN_CONTAINED) ==
ustr_host.length())
transliterator_.get()->transliterate(ustr_host);
diacritic_remover_.get()->transliterate(ustr_host);
extra_confusable_mapper_.get()->transliterate(ustr_host);

UErrorCode status = U_ZERO_ERROR;
icu::UnicodeString ustr_skeleton;
Expand All @@ -279,8 +296,7 @@ bool IDNSpoofChecker::SimilarToTopDomains(base::StringPiece16 hostname) {
if (U_FAILURE(status))
return false;
std::string skeleton;
ustr_skeleton.toUTF8String(skeleton);
return LookupMatchInTopDomains(skeleton);
return LookupMatchInTopDomains(ustr_skeleton.toUTF8String(skeleton));
}

bool IDNSpoofChecker::IsMadeOfLatinAlikeCyrillic(
Expand Down
3 changes: 2 additions & 1 deletion components/url_formatter/idn_spoof_checker.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,8 @@ class IDNSpoofChecker {
icu::UnicodeSet cyrillic_letters_;
icu::UnicodeSet cyrillic_letters_latin_alike_;
icu::UnicodeSet lgc_letters_n_ascii_;
std::unique_ptr<icu::Transliterator> transliterator_;
std::unique_ptr<icu::Transliterator> diacritic_remover_;
std::unique_ptr<icu::Transliterator> extra_confusable_mapper_;

IDNSpoofChecker(const IDNSpoofChecker&) = delete;
void operator=(const IDNSpoofChecker&) = delete;
Expand Down
3 changes: 3 additions & 0 deletions components/url_formatter/top_domains/alexa_domains.list
Original file line number Diff line number Diff line change
Expand Up @@ -9176,3 +9176,6 @@ stripe.com
digklmo68.com
digklmo68.co.uk
islkpx123.com
os345.com
woder.com
wmhtb.com
3 changes: 3 additions & 0 deletions components/url_formatter/top_domains/alexa_skeletons.gperf
Original file line number Diff line number Diff line change
Expand Up @@ -9185,4 +9185,7 @@ stripe.corn, 1
digklrno68.corn, 1
digklrno68.co.uk, 1
islkpxl23.corn, 1
os345.corn, 1
woder.corn, 1
wrnhtb.corn, 1
%%
Original file line number Diff line number Diff line change
Expand Up @@ -53,3 +53,4 @@
# Add a few made-up domains for testing.
outfile.write("# for testing\ndigklmo68.com\ndigklmo68.co.uk\n")
outfile.write("islkpx123.com\n")
outfile.write("os345.com\nwoder.com\nwmhtb.com\n")
16 changes: 13 additions & 3 deletions components/url_formatter/url_formatter_unittest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -406,6 +406,19 @@ const IDNTestCase idn_cases[] = {
L"123.com",
false},

// wmhtb.com
{"xn--l1acpvx.com", L"\x0448\x043c\x043d\x0442\x044c.com", false},
// щмнть.com
{"xn--l1acpzs.com", L"\x0449\x043c\x043d\x0442\x044c.com", false},
// шмнтв.com
{"xn--b1atdu1a.com", L"\x0448\x043c\x043d\x0442\x0432.com", false},
// ഠട345.com
{"xn--345-jtke.com", L"\x0d20\x0d1f" L"345.com", false},

// At one point the skeleton of 'w' was 'vv'; ensure that
// it's treated as 'w'.
{"xn--wder-qqa.com", L"w\x00f3" L"der.com", false},

// Mixed digits: the first two will also fail mixed script test
// Latin + ASCII digit + Deva digit
{"xn--asc1deva-j0q.co.in", L"asc1deva\x0967.co.in", false},
Expand Down Expand Up @@ -696,9 +709,6 @@ const IDNTestCase idn_cases[] = {
true},
// Can start with a RTL and end with AN
{"xn--mgbjq0r.eg", L"\x062c\x0627\x0631\x0662.eg", true},
// At one point the skeleton of 'w' was 'vv', ensure that
// that it's treated as 'w'.
{"xn--wnderlist-58a.com", L"w\x00fanderlist.com", false},

// Extremely rare Latin letters
// Latin Ext B - Pinyin: ǔnion.com
Expand Down

0 comments on commit b3f0207

Please sign in to comment.