Skip to content

Commit

Permalink
Bug 910211 - Guess the fallback encoding from the top-level domain wh…
Browse files Browse the repository at this point in the history
…en feasible. r=emk.
  • Loading branch information
hsivonen committed Feb 6, 2014
1 parent d0e81da commit 7eef0de
Show file tree
Hide file tree
Showing 15 changed files with 432 additions and 13 deletions.
7 changes: 7 additions & 0 deletions build/pgo/server-locations.txt
Original file line number Diff line number Diff line change
Expand Up @@ -205,3 +205,10 @@ https://www2.w3c-test.org:443
https://xn--n8j6ds53lwwkrqhv28a.w3c-test.org:443
https://xn--lve-6lad.w3c-test.org:443
http://test.w3.org:80

# Hosts for testing TLD-based fallback encoding
http://example.tw:80 privileged
http://example.cn:80 privileged
http://example.co.jp:80 privileged
http://example.fi:80 privileged

61 changes: 61 additions & 0 deletions content/html/document/src/nsHTMLDocument.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -435,6 +435,66 @@ nsHTMLDocument::TryParentCharset(nsIDocShell* aDocShell,
}
}

/**
 * Attempts to pick a fallback encoding from the top-level domain of the
 * document URI's ASCII host.
 *
 * On success aCharsetSource is raised to kCharsetFromTopLevelDomain and
 * aCharset receives the encoding chosen by
 * FallbackEncoding::FromTopLevelDomain(). In every other case both
 * arguments are left untouched.
 *
 * @param aCharsetSource in/out: the current charset source
 * @param aCharset       in/out: the current charset
 */
void
nsHTMLDocument::TryTLD(int32_t& aCharsetSource, nsACString& aCharset)
{
  // Bail out when the recorded source is already at or above the TLD level,
  // when TLD guessing is switched off, or when there is no URI to inspect.
  if (aCharsetSource >= kCharsetFromTopLevelDomain ||
      !FallbackEncoding::sGuessFallbackFromTopLevelDomain ||
      !mDocumentURI) {
    return;
  }

  nsAutoCString asciiHost;
  mDocumentURI->GetAsciiHost(asciiHost);
  if (asciiHost.IsEmpty()) {
    return;
  }

  // A DNS-absolute host ends with a single dot; drop exactly one of them.
  if (asciiHost.Last() == '.') {
    asciiHost.SetLength(asciiHost.Length() - 1);
  }
  // Give up if nothing remains, or if the host is weird enough to still end
  // with a dot after the trim above.
  if (asciiHost.IsEmpty() || asciiHost.Last() == '.') {
    return;
  }

  const int32_t lastDot = asciiHost.RFindChar('.');
  if (lastDot == kNotFound) {
    // Dotless host: intranet name, Gecko-internal URL or an IPv6 address.
    return;
  }

  // The host does not end with a dot, so at least one character follows
  // lastDot and the final label below is never empty.
  const int32_t labelStart = lastDot + 1;
  nsAutoCString tld;
  ToLowerCase(Substring(asciiHost, labelStart,
                        asciiHost.Length() - labelStart),
              tld);

  // Generic TLDs and country TLDs that need more research are excluded.
  if (!FallbackEncoding::IsParticipatingTopLevelDomain(tld)) {
    return;
  }

  // A final label made up solely of ASCII digits means the host is an IPv4
  // address rather than a domain name.
  bool onlyDigits = true;
  for (size_t pos = 0; onlyDigits && pos < tld.Length(); ++pos) {
    const char ch = tld.CharAt(pos);
    onlyDigits = (ch >= '0' && ch <= '9');
  }
  if (onlyDigits) {
    return;
  }

  aCharsetSource = kCharsetFromTopLevelDomain;
  FallbackEncoding::FromTopLevelDomain(tld, aCharset);
}

void
nsHTMLDocument::TryFallback(int32_t& aCharsetSource, nsACString& aCharset)
{
Expand Down Expand Up @@ -661,6 +721,7 @@ nsHTMLDocument::StartDocumentLoad(const char* aCommand,
TryCacheCharset(cachingChan, charsetSource, charset);
}

TryTLD(charsetSource, charset);
TryFallback(charsetSource, charset);

if (wyciwygChannel) {
Expand Down
1 change: 1 addition & 0 deletions content/html/document/src/nsHTMLDocument.h
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,7 @@ class nsHTMLDocument : public nsDocument,
nsACString& aCharset);
void TryParentCharset(nsIDocShell* aDocShell,
int32_t& charsetSource, nsACString& aCharset);
void TryTLD(int32_t& aCharsetSource, nsACString& aCharset);
static void TryFallback(int32_t& aCharsetSource, nsACString& aCharset);

// Override so we can munge the charset on our wyciwyg channel as needed.
Expand Down
4 changes: 4 additions & 0 deletions docshell/base/nsDocShell.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1994,6 +1994,10 @@ nsDocShell::GatherCharsetMenuTelemetry()

int32_t charsetSource = doc->GetDocumentCharacterSetSource();
switch (charsetSource) {
case kCharsetFromTopLevelDomain:
// Unlabeled doc on a domain that we map to a fallback encoding
Telemetry::Accumulate(Telemetry::CHARSET_OVERRIDE_SITUATION, 7);
break;
case kCharsetFromFallback:
case kCharsetFromDocTypeDefault:
case kCharsetFromCache:
Expand Down
32 changes: 32 additions & 0 deletions dom/encoding/FallbackEncoding.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,16 @@ static const char* localesFallbacks[][3] = {
#include "localesfallbacks.properties.h"
};

// Lookup table from top-level domain to fallback encoding label. The rows
// come from a header generated out of domainsfallbacks.properties, and the
// table is searched by FallbackEncoding::FromTopLevelDomain().
static const char* domainsFallbacks[][3] = {
#include "domainsfallbacks.properties.h"
};

// Table of top-level domains that are excluded from TLD-based guessing. The
// rows come from a header generated out of
// nonparticipatingdomains.properties. It is searched by
// FallbackEncoding::IsParticipatingTopLevelDomain(), which treats a
// successful lookup as "does not participate".
static const char* nonParticipatingDomains[][3] = {
#include "nonparticipatingdomains.properties.h"
};

FallbackEncoding* FallbackEncoding::sInstance = nullptr;
bool FallbackEncoding::sGuessFallbackFromTopLevelDomain = true;

FallbackEncoding::FallbackEncoding()
{
Expand Down Expand Up @@ -121,6 +130,8 @@ FallbackEncoding::Initialize()
Preferences::RegisterCallback(FallbackEncoding::PrefChanged,
"general.useragent.locale",
nullptr);
Preferences::AddBoolVarCache(&sGuessFallbackFromTopLevelDomain,
"intl.charset.fallback.tld");
}

void
Expand All @@ -132,5 +143,26 @@ FallbackEncoding::Shutdown()
FallbackEncoding::sInstance = nullptr;
}

bool
FallbackEncoding::IsParticipatingTopLevelDomain(const nsACString& aTLD)
{
nsAutoCString dummy;
return NS_FAILED(nsUConvPropertySearch::SearchPropertyValue(
nonParticipatingDomains,
ArrayLength(nonParticipatingDomains),
aTLD,
dummy));
}

void
FallbackEncoding::FromTopLevelDomain(const nsACString& aTLD,
nsACString& aFallback)
{
if (NS_FAILED(nsUConvPropertySearch::SearchPropertyValue(
domainsFallbacks, ArrayLength(domainsFallbacks), aTLD, aFallback))) {
aFallback.AssignLiteral("windows-1252");
}
}

} // namespace dom
} // namespace mozilla
22 changes: 22 additions & 0 deletions dom/encoding/FallbackEncoding.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,11 @@ class FallbackEncoding
{
public:

/**
* Whether FromTopLevelDomain() should be used.
*/
static bool sGuessFallbackFromTopLevelDomain;

/**
* Gets the locale-dependent fallback encoding for legacy HTML and plain
* text content.
Expand All @@ -22,6 +27,23 @@ class FallbackEncoding
*/
static void FromLocale(nsACString& aFallback);

/**
* Checks if it is appropriate to call FromTopLevelDomain() for a given TLD.
*
* @param aTLD the top-level domain (in Punycode)
* @return true if OK to call FromTopLevelDomain()
*/
static bool IsParticipatingTopLevelDomain(const nsACString& aTLD);

/**
* Gets a top-level domain-dependent fallback encoding for legacy HTML
* and plain text content.
*
* @param aTLD the top-level domain (in Punycode)
* @param aFallback the outparam for the fallback encoding
*/
static void FromTopLevelDomain(const nsACString& aTLD, nsACString& aFallback);

// public API ends here!

/**
Expand Down
4 changes: 4 additions & 0 deletions dom/encoding/Makefile.in
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,7 @@ labelsencodings.properties.h: $(PROPS2ARRAYS) labelsencodings.properties
$(PYTHON) $^ $@
localesfallbacks.properties.h: $(PROPS2ARRAYS) localesfallbacks.properties
$(PYTHON) $^ $@
domainsfallbacks.properties.h: $(PROPS2ARRAYS) domainsfallbacks.properties
$(PYTHON) $^ $@
nonparticipatingdomains.properties.h: $(PROPS2ARRAYS) nonparticipatingdomains.properties
$(PYTHON) $^ $@
167 changes: 167 additions & 0 deletions dom/encoding/domainsfallbacks.properties
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

# This file contains educated guesses about which top-level domains are
# likely to host legacy content that assumes a non-windows-1252 encoding.
# Punycode TLDs are included on the theory that legacy content might appear
# behind those relatively new TLDs if DNS just points to a legacy server.
#
# Domains for which a confident-enough educated guess is missing are
# listed in nonparticipatingdomains.properties. Domains that are listed
# neither there nor here get windows-1252 as the associated fallback.
#
# The list below includes Arabic-script TLDs not on IANA list but on the
# ICANN list:
# http://www.icann.org/en/resources/idn/fast-track/string-evaluation-completion
# Otherwise, the list includes non-windows-1252-affiliated country TLDs from
# https://data.iana.org/TLD/tlds-alpha-by-domain.txt
#
# The guesses are assigned as follows:
# * If the country has a dominant country-affiliated language and that language
# is part of the languages to fallbacks mapping, use the encoding for that
# language from that mapping.
# * Use windows-1256 for countries that have a dominant Arabic-script
# language or all of whose languages are Arabic-script languages.
# * Use windows-1251 likewise but for Cyrillic script.

ae=windows-1256
xn--mgbaam7a8h=windows-1256

af=windows-1256

bg=windows-1251

bh=windows-1256

by=windows-1251

cn=gbk
xn--fiqs8s=gbk
# Assume that Traditional Chinese TLD is meant to work if URL input happens to
# be in the traditional mode. Expect content to be simplified anyway.
xn--fiqz9s=gbk

cz=windows-1250

dz=windows-1256
xn--lgbbat1ad8j=windows-1256

ee=windows-1257

eg=windows-1256
xn--wgbh1c=windows-1256

gr=ISO-8859-7

hk=Big5-HKSCS
xn--j6w193g=Big5-HKSCS

hr=windows-1250

hu=ISO-8859-2

iq=windows-1256

ir=windows-1256
xn--mgba3a4f16a=windows-1256

jo=windows-1256
xn--mgbayh7gpa=windows-1256

jp=Shift_JIS

kg=windows-1251

kp=EUC-KR

kr=EUC-KR
xn--3e0b707e=EUC-KR

kw=windows-1256

kz=windows-1251
xn--80ao21a=windows-1251

lb=windows-1256

lt=windows-1257

lv=windows-1257

ma=windows-1256
xn--mgbc0a9azcg=windows-1256

mk=windows-1251

mn=windows-1251
xn--l1acc=windows-1251

mo=Big5

# my
xn--mgbx4cd0ab=windows-1256

om=windows-1256
xn--mgb9awbf=windows-1256

#pk
xn--mgbai9azgqp6j=windows-1256

pl=ISO-8859-2

ps=windows-1256
xn--ygbi2ammx=windows-1256

qa=windows-1256
xn--wgbl6a=windows-1256

rs=windows-1251
xn--90a3ac=windows-1251

ru=windows-1251
xn--p1ai=windows-1251

sa=windows-1256
xn--mgberp4a5d4ar=windows-1256

sd=windows-1256
xn--mgbpl2fh=windows-1256

sg=gbk
xn--yfro4i67o=gbk

si=ISO-8859-2

sk=windows-1250

su=windows-1251

sy=windows-1256
xn--mgbtf8fl=windows-1256

th=windows-874
xn--o3cw4h=windows-874

tj=windows-1251

tn=windows-1256
xn--pgbs0dh=windows-1256

tr=windows-1254

tw=Big5
# Assume that the Simplified Chinese TLD is meant to work when URL input
# happens in the simplified mode. Assume content is traditional anyway.
xn--kprw13d=Big5
xn--kpry57d=Big5

ua=windows-1251
xn--j1amh=windows-1251

uz=windows-1251

vn=windows-1258

ye=windows-1256
xn--mgb2ddes=windows-1256
2 changes: 2 additions & 0 deletions dom/encoding/moz.build
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ LOCAL_INCLUDES += [
]

GENERATED_FILES += [
'domainsfallbacks.properties.h',
'labelsencodings.properties.h',
'localesfallbacks.properties.h',
'nonparticipatingdomains.properties.h',
]
Loading

0 comments on commit 7eef0de

Please sign in to comment.