Skip to content

Commit

Permalink
Revert "Implement non-transitional IDNA processing behind a flag"
Browse files Browse the repository at this point in the history
This reverts commit 60447d2.

Reason for revert: Thread safety

Original change's description:
> Implement non-transitional IDNA processing behind a flag
>
> This CL adds a flag called UseIDNA2008NonTransitional that enables
> non-transitional IDNA processing for URLs. When enabled, hostnames
> that contain deviation characters will no longer be mapped and will
> be usable as is. These deviation characters are:
> - U+00DF LATIN SMALL LETTER SHARP S
> - U+03C2 GREEK SMALL LETTER FINAL SIGMA
> - U+200D ZERO WIDTH JOINER
> - U+200C ZERO WIDTH NON-JOINER
>
> See https://unicode.org/reports/tr46/#Deviations for details.
>
> As an example, http://faß.de is presently mapped to http://fass.de
> with IDNA 2008 transitional processing. When the flag is enabled,
> ß will no longer be mapped to ss and the URL will remain unchanged.
>
> Bug: 694157
> Change-Id: I37df9aea5520374d1fce829b0d6ee22d9bdf32ec
> Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/4018995
> Reviewed-by: Emily Stark <estark@chromium.org>
> Reviewed-by: Charlie Harrison <csharrison@chromium.org>
> Commit-Queue: Mustafa Emre Acer <meacer@chromium.org>
> Reviewed-by: Kent Tamura <tkent@chromium.org>
> Cr-Commit-Position: refs/heads/main@{#1070571}

Bug: 694157
Change-Id: I3c509230dc8fad7c1e696007d4ac8f918964b995
No-Presubmit: true
No-Tree-Checks: true
No-Try: true
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/4024340
Reviewed-by: Kent Tamura <tkent@chromium.org>
Reviewed-by: Charlie Harrison <csharrison@chromium.org>
Reviewed-by: Emily Stark <estark@chromium.org>
Commit-Queue: Mustafa Emre Acer <meacer@chromium.org>
Cr-Commit-Position: refs/heads/main@{#1071085}
  • Loading branch information
meacer authored and Chromium LUCI CQ committed Nov 14, 2022
1 parent b483999 commit e544837
Show file tree
Hide file tree
Showing 25 changed files with 169 additions and 6,351 deletions.
5 changes: 1 addition & 4 deletions components/url_formatter/spoof_checks/idn_spoof_checker.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
#include "third_party/icu/source/i18n/unicode/regex.h"
#include "third_party/icu/source/i18n/unicode/translit.h"
#include "third_party/icu/source/i18n/unicode/uspoof.h"
#include "url/url_features.h"

namespace url_formatter {

Expand Down Expand Up @@ -376,10 +375,8 @@ IDNSpoofChecker::Result IDNSpoofChecker::SafeToDisplayAsUnicode(
// chosen. On the other hand, 'fu<sharp-s>' typed or copy and pasted
// as Unicode would be canonicalized to 'fuss' by GURL and is displayed as
// such. See http://crbug.com/595263 .
if (!url::IsUsingIDNA2008NonTransitional() &&
deviation_characters_.containsSome(label_string)) {
if (deviation_characters_.containsSome(label_string))
return Result::kDeviationCharacters;
}

// Disallow Icelandic confusables for domains outside Iceland's ccTLD (.is).
if (label_string.length() > 1 && top_level_domain != "is" &&
Expand Down
117 changes: 40 additions & 77 deletions components/url_formatter/spoof_checks/idn_spoof_checker_unittest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,11 @@
#include "base/strings/string_piece.h"
#include "base/strings/stringprintf.h"
#include "base/strings/utf_string_conversions.h"
#include "base/test/scoped_feature_list.h"
#include "build/build_config.h"
#include "components/url_formatter/spoof_checks/skeleton_generator.h"
#include "components/url_formatter/url_formatter.h"
#include "testing/gtest/include/gtest/gtest.h"
#include "url/gurl.h"
#include "url/url_features.h"
#include "url/url_idna_icu.h"

namespace url_formatter {

Expand Down Expand Up @@ -822,6 +819,19 @@ const IDNTestCase kIdnCases[] = {
{"xn--foog-opf.com", u"foog\u05b4.com", kUnsafe}, // Latin + Hebrew NSM
{"xn--shb5495f.com", u"\uac00\u0650.com", kUnsafe}, // Hang + Arabic NSM

// 4 Deviation characters between IDNA 2003 and IDNA 2008
// When entered in Unicode, the first two are mapped to 'ss' and Greek sigma
// and the latter two are mapped away. However, the punycode form should
// remain in punycode.
// U+00DF(sharp-s)
{"xn--fu-hia.de", u"fu\u00df.de", kUnsafe},
// U+03C2(final-sigma)
{"xn--mxac2c.gr", u"\u03b1\u03b2\u03c2.gr", kUnsafe},
// U+200C(ZWNJ)
{"xn--h2by8byc123p.in", u"\u0924\u094d\u200c\u0930\u093f.in", kUnsafe},
// U+200C(ZWJ)
{"xn--11b6iy14e.in", u"\u0915\u094d\u200d.in", kUnsafe},

// Math Monospace Small A. When entered in Unicode, it's canonicalized to
// 'a'. The punycode form should remain in punycode.
{"xn--bc-9x80a.xyz", u"\U0001d68abc.xyz", kInvalid},
Expand Down Expand Up @@ -1108,19 +1118,8 @@ bool IsPunycode(const std::u16string& s) {

} // namespace

class IDNSpoofCheckerTest : public ::testing::Test,
public ::testing::WithParamInterface<bool> {
class IDNSpoofCheckerTest : public ::testing::Test {
protected:
IDNSpoofCheckerTest() {
if (GetParam()) {
scoped_feature_list_.InitAndEnableFeature(
url::kUseIDNA2008NonTransitional);
} else {
scoped_feature_list_.InitAndDisableFeature(
url::kUseIDNA2008NonTransitional);
}
}

void SetUp() override {
IDNSpoofChecker::HuffmanTrieParams trie_params{
test::kTopDomainsHuffmanTree, sizeof(test::kTopDomainsHuffmanTree),
Expand All @@ -1129,89 +1128,53 @@ class IDNSpoofCheckerTest : public ::testing::Test,
IDNSpoofChecker::SetTrieParamsForTesting(trie_params);
}

void TearDown() override {
IDNSpoofChecker::RestoreTrieParamsForTesting();
url::ResetUIDNAForTesting();
}
void TearDown() override { IDNSpoofChecker::RestoreTrieParamsForTesting(); }
};

// Test that a domain entered as punycode is decoded to unicode if safe,
// otherwise is left in punycode.
//
// TODO(crbug.com/1036523): This should also check if a domain entered as
// unicode is properly decoded or not-decoded. This is important in cases where
// certain unicode characters are canonicalized to other characters.
// E.g. Mathematical Monospace Small A (U+1D68A) is canonicalized to "a" when
// used in a domain name.
TEST_F(IDNSpoofCheckerTest, IDNToUnicode) {
for (size_t i = 0; i < std::size(kIdnCases); i++) {
SCOPED_TRACE(
base::StringPrintf("input #%zu: \"%s\"", i, kIdnCases[i].input));

void RunIDNToUnicodeTest(const IDNTestCase& test) {
// Sanity check to ensure that the unicode output matches the input. Bypass
// all spoof checks by doing an unsafe conversion.
const IDNConversionResult unsafe_result =
UnsafeIDNToUnicodeWithDetails(test.input);
UnsafeIDNToUnicodeWithDetails(kIdnCases[i].input);

// Ignore inputs that can't be converted even with unsafe conversion because
// they contain certain characters not allowed in IDNs. E.g. U+24DF (Latin
// CIRCLED LATIN SMALL LETTER P) in hostname causes the conversion to fail
// before reaching spoof checks.
if (test.expected_result != kInvalid) {
if (kIdnCases[i].expected_result != kInvalid) {
// Also ignore domains that need to remain partially in punycode, such
// as ѕсоре-рау.한국 where scope-pay is a Cyrillic whole-script
// confusable but 한국 is safe. This would require adding yet another
// field to the the test struct.
if (!IsPunycode(test.unicode_output)) {
EXPECT_EQ(unsafe_result.result, test.unicode_output);
if (!IsPunycode(kIdnCases[i].unicode_output)) {
EXPECT_EQ(unsafe_result.result, kIdnCases[i].unicode_output);
}
} else {
// Invalid punycode should not be converted.
EXPECT_EQ(unsafe_result.result, base::ASCIIToUTF16(test.input));
EXPECT_EQ(unsafe_result.result, base::ASCIIToUTF16(kIdnCases[i].input));
}

const std::u16string output(IDNToUnicode(test.input));
const std::u16string expected(test.expected_result == kSafe
? test.unicode_output
: base::ASCIIToUTF16(test.input));
const std::u16string output(IDNToUnicode(kIdnCases[i].input));
const std::u16string expected(kIdnCases[i].expected_result == kSafe
? kIdnCases[i].unicode_output
: base::ASCIIToUTF16(kIdnCases[i].input));
EXPECT_EQ(expected, output);
}

private:
base::test::ScopedFeatureList scoped_feature_list_;
};

INSTANTIATE_TEST_SUITE_P(All, IDNSpoofCheckerTest, ::testing::Bool());

// Test that a domain entered as punycode is decoded to unicode if safe,
// otherwise is left in punycode.
//
// TODO(crbug.com/1036523): This should also check if a domain entered as
// unicode is properly decoded or not-decoded. This is important in cases where
// certain unicode characters are canonicalized to other characters.
// E.g. Mathematical Monospace Small A (U+1D68A) is canonicalized to "a" when
// used in a domain name.
TEST_P(IDNSpoofCheckerTest, IDNToUnicode) {
for (const auto& test : kIdnCases) {
RunIDNToUnicodeTest(test);
}
}

// Same as IDNToUnicode but only tests hostnames with deviation characters.
TEST_P(IDNSpoofCheckerTest, IDNToUnicodeDeviationCharacters) {
const Result kExpectedSafety = GetParam() ? kSafe : kUnsafe;

// Tests for 4 Deviation characters between IDNA 2003 and IDNA 2008. When
// entered in Unicode:
// - In Transitional mode, the first two are mapped to 'ss' and Greek sigma
// and the latter two are mapped away. However, the punycode form should
// remain in punycode.
// - In Non-Transitional mode, none of the characters should be mapped and
// the hostnames should be considered safe.
const IDNTestCase kTestCases[] = {
// U+00DF(sharp-s)
{"xn--fu-hia.de", u"fu\u00df.de", kExpectedSafety},
// U+03C2(final-sigma)
{"xn--mxac2c.gr", u"\u03b1\u03b2\u03c2.gr", kExpectedSafety},
// U+200C(ZWNJ)
{"xn--h2by8byc123p.in", u"\u0924\u094d\u200c\u0930\u093f.in",
kExpectedSafety},
// U+200C(ZWJ)
{"xn--11b6iy14e.in", u"\u0915\u094d\u200d.in", kExpectedSafety},
};
for (const auto& test : kTestCases) {
RunIDNToUnicodeTest(test);
}
}

TEST_P(IDNSpoofCheckerTest, GetSimilarTopDomain) {
TEST_F(IDNSpoofCheckerTest, GetSimilarTopDomain) {
struct TestCase {
const char16_t* const hostname;
const char* const expected_top_domain;
Expand Down Expand Up @@ -1244,7 +1207,7 @@ TEST_P(IDNSpoofCheckerTest, GetSimilarTopDomain) {
}
}

TEST_P(IDNSpoofCheckerTest, LookupSkeletonInTopDomains) {
TEST_F(IDNSpoofCheckerTest, LookupSkeletonInTopDomains) {
{
TopDomainEntry entry =
IDNSpoofChecker().LookupSkeletonInTopDomains("d4OOO.corn");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,10 +54,7 @@ const char kEmailPattern[] =

// RFC5321 says the maximum total length of a domain name is 255 octets.
const int32_t kMaximumDomainNameLength = 255;

// Use the same option as in url/url_canon_icu.cc
// TODO(crbug.com/694157): Change the options if UseIDNA2008NonTransitional flag
// is enabled.
const int32_t kIdnaConversionOption = UIDNA_CHECK_BIDI;

} // namespace
Expand Down
7 changes: 0 additions & 7 deletions third_party/blink/web_tests/VirtualTestSuites
Original file line number Diff line number Diff line change
Expand Up @@ -1258,13 +1258,6 @@
"--enable-blink-features=DocumentPictureInPictureAPI"]
},

{
"prefix": "idna-2008",
"platforms": ["Linux"],
"bases": [ "external/wpt/url", "fast/url"],
"args": ["--enable-features=UseIDNA2008NonTransitional"]
},

"Tests in the virtual/origin-agent-cluster-default suite. These tests",
"require isolation behaviour as in the actual browser and are therefore",
"incompatible with auto-wpt-origin-isolation. Thus they are only run in",
Expand Down
50 changes: 0 additions & 50 deletions third_party/blink/web_tests/virtual/idna-2008/README.md

This file was deleted.

Loading

0 comments on commit e544837

Please sign in to comment.