Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve decoding of HTML entities #5064

Merged
merged 9 commits into from
Oct 4, 2017
60 changes: 53 additions & 7 deletions spec/std/html_spec.cr
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ describe "HTML" do
str.should eq("safe_string")
end

it "unescapes dangerous characters from a string" do
it "unescapes html special characters" do
str = HTML.unescape("< & >")

str.should eq("< & >")
Expand All @@ -42,9 +42,9 @@ describe "HTML" do
end

it "unescapes with invalid entities" do
str = HTML.unescape("&&lt;&amp&gt;&quot&abcdefghijklmn")
str = HTML.unescape("&&lt;&amp&gt;&quot&abcdefghijklmn &ThisIsNotAnEntitiy;")

str.should eq("&<&>\"&abcdefghijklmn")
str.should eq("&<&>\"&abcdefghijklmn &ThisIsNotAnEntitiy;")
end

it "unescapes hex encoded chars" do
Expand All @@ -53,18 +53,33 @@ describe "HTML" do
str.should eq("3 + 2 = 5")
end

it "unescapes decimal encoded chars" do
  # Zero-padded decimal references; the second one has no trailing semicolon.
  HTML.unescape("3 &#00043; 2 &#00061 5").should eq("3 + 2 = 5")
end

it "unescapes &nbsp;" do
  # The named reference becomes U+00A0; the ordinary trailing space is kept.
  HTML.unescape("nbsp&nbsp;space ").should eq("nbsp\u{0000A0}space ")
end

it "unescapes Char::MAX_CODEPOINT" do
it "does not unescape Char::MAX_CODEPOINT" do
# Char::MAX_CODEPOINT is actually a noncharacter and is not replaced
str = HTML.unescape("limit &#x10FFFF;")
str.should eq("limit 􏿿")
str.should eq("limit &#x10FFFF;")

str = HTML.unescape("limit &#1114111;")
str.should eq("limit 􏿿")
str.should eq("limit &#1114111;")
end

it "replaces codepoints above Char::MAX_CODEPOINT with the replacement character" do
  # 0x110000 (decimal 1114112) lies just past the highest codepoint. Both the
  # hexadecimal and the decimal spelling are expected to yield U+FFFD rather
  # than being left untouched.
  HTML.unescape("limit &#x110000;").should eq("limit \uFFFD")
  HTML.unescape("limit &#1114112;").should eq("limit \uFFFD")
end

it "unescapes &NotSquareSuperset;" do
Expand All @@ -73,9 +88,40 @@ describe "HTML" do
str.should eq(" ⊐̸ ")
end

it "unescapes &ampd" do
it "unescapes entities without trailing semicolon" do
str = HTML.unescape("&amphello")
str.should eq("&hello")
end

it "unescapes named character reference with numerical characters" do
  # Entity names may contain digits (here: "frac34").
  HTML.unescape("&frac34;").should eq("\u00BE")
end

it "does not unescape unicode control characters except space characters" do
  # Control characters below 0x20 and 0x7F are left exactly as written.
  untouched = "&#x0001;-&#x001F; &#x000D; &#x007F;"
  HTML.unescape(untouched).should eq(untouched)

  # References in 0x80..0x9F are translated to their Windows-1252 equivalents.
  HTML.unescape("&#x0080;-&#x009F;").should eq("\u20AC-\u0178")

  # A reference to codepoint 0 becomes the replacement character.
  HTML.unescape("&#x000;").should eq("\uFFFD")
end

it "unescapes space characters" do
  # The input encodes two spaces (hex 0x20 and decimal 32) followed by tab,
  # line feed and form feed, so the expected string starts with two spaces.
  HTML.unescape("&#x0020;&#32;&#x0009;&#x000A;&#x000C;").should eq("  \t\n\f")
end

it "does not unescape noncharacter codepoints" do
  # noncharacters http://www.unicode.org/faq/private_use.html
  # Every reference uses the hexadecimal `&#x...;` form so that each one is
  # actually parsed as a numeric reference before being rejected.
  noncharacters = "&#xFDD0;-&#xFDEF; &#xFFFE; &#xFFFF; &#x1FFFE; &#x1FFFF; &#x2FFFE; &#x10FFFF;"
  HTML.unescape(noncharacters).should eq(noncharacters)
end

it "replaces unicode surrogate codepoints with the replacement character" do
  # Both ends of the surrogate range (0xD800 and 0xDFFF) must decode to U+FFFD.
  HTML.unescape("&#xD800;-&#xDFFF;").should eq("\uFFFD-\uFFFD")
end
end
end
89 changes: 76 additions & 13 deletions src/html.cr
Original file line number Diff line number Diff line change
Expand Up @@ -36,21 +36,61 @@ module HTML
end
end

# These replacements permit compatibility with old numeric entities that
# assumed Windows-1252 encoding.
# https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
#
# The table is positional: the entry at index `codepoint - 0x80` is the
# character a numeric reference to `codepoint` (0x80..0x9F) is replaced with.
# Some entries (e.g. 0x81 -> U+0081) map a codepoint to itself.
private CHARACTER_REPLACEMENTS = {
'\u20AC', # 0x80 (first entry)
'\u0081', # 0x81
'\u201A', # 0x82
'\u0192', # 0x83
'\u201E', # 0x84
'\u2026', # 0x85
'\u2020', # 0x86
'\u2021', # 0x87
'\u02C6', # 0x88
'\u2030', # 0x89
'\u0160', # 0x8A
'\u2039', # 0x8B
'\u0152', # 0x8C
'\u008D', # 0x8D
'\u017D', # 0x8E
'\u008F', # 0x8F
'\u0090', # 0x90
'\u2018', # 0x91
'\u2019', # 0x92
'\u201C', # 0x93
'\u201D', # 0x94
'\u2022', # 0x95
'\u2013', # 0x96
'\u2014', # 0x97
'\u02DC', # 0x98
'\u2122', # 0x99
'\u0161', # 0x9A
'\u203A', # 0x9B
'\u0153', # 0x9C
'\u009D', # 0x9D
'\u017E', # 0x9E
'\u0178', # 0x9F (last entry)
# 0x00->'\uFFFD' is handled programmatically.
# 0x0D->'\u000D' is a no-op.
}

# Returns a string where named and numeric character references
# (e.g. &gt;, &#62;, &#x3e;) in *string* are replaced with the corresponding
# unicode characters.
# unicode characters. This method decodes all HTML5 entities including those
# without a trailing semicolon (such as `&copy`).
#
# ```
# HTML.unescape("Crystal &amp; You") # => "Crystal & You"
# ```
def self.unescape(string : String) : String
string.gsub(/&(?:([a-zA-Z]{2,32};?)|\#([0-9]+);?|\#[xX]([0-9A-Fa-f]+);?)/) do |string, match|
string.gsub(/&(?:([a-zA-Z0-9]{2,32};?)|\#([0-9]+);?|\#[xX]([0-9A-Fa-f]+);?)/) do |string, match|
if code = match[1]?
# Try to find the code
value = named_entity(code)
if value
value
elsif !code.ends_with?(';')

unless value || code.ends_with?(';')
# If we can't find it and it doesn't end with ';',
# we need to find each prefix of it.
# We start from the largest prefix.
Expand All @@ -67,19 +107,17 @@ module HTML
break
end
end

# We either found the code or not,
# in which case we need to return the original string
value || string
end

# We either found the code or not,
# in which case we need to return the original string
value || string
elsif code = match[2]?
# Find by decimal code
n = code.to_i
n <= Char::MAX_CODEPOINT ? n.unsafe_chr : string
decode_codepoint(code.to_i) || string
elsif code = match[3]?
# Find by hexadecimal code
n = code.to_i(16)
n <= Char::MAX_CODEPOINT ? n.unsafe_chr : string
decode_codepoint(code.to_i(16)) || string
else
string
end
Expand All @@ -89,4 +127,29 @@ module HTML
private def self.named_entity(code)
  # Looks *code* up in HTML::SINGLE_CHAR_ENTITIES first and falls back to
  # HTML::DOUBLE_CHAR_ENTITIES; the result is nil when neither table has it.
  single = HTML::SINGLE_CHAR_ENTITIES[code]?
  return single if single

  HTML::DOUBLE_CHAR_ENTITIES[code]?
end

# see https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
private def self.decode_codepoint(codepoint)
  # Translates the numeric value of a character reference into its replacement.
  #
  # Returns a String for 0x80..0x9F, a Char for every other decodable value,
  # and nil for codepoints that are intentionally left undecoded.

  # Legacy Windows-1252 values: translate through the lookup table, which is
  # indexed relative to 0x80.
  if (0x80..0x9F).includes?(codepoint)
    return CHARACTER_REPLACEMENTS[codepoint - 0x80].to_s
  end

  # Codepoint 0, anything above the highest codepoint, and unicode surrogates
  # are invalid and become the replacement character.
  invalid = codepoint == 0 ||
            codepoint > Char::MAX_CODEPOINT ||
            (0xD800..0xDFFF).includes?(codepoint)
  return '\uFFFD' if invalid

  # Disallowed codepoints are not replaced at all:
  #   * 0x007F
  #   * the unicode noncharacters 0xFDD0..0xFDEF
  #   * the last two codepoints of each plane (also noncharacters)
  #   * control characters below 0x0020, except tab, line feed and form feed
  disallowed = codepoint == 0x007F ||
               (0xFDD0..0xFDEF).includes?(codepoint) ||
               (codepoint & 0xFFFF) >= 0xFFFE ||
               (codepoint < 0x0020 && !{0x0009, 0x000A, 0x000C}.includes?(codepoint))
  return nil if disallowed

  codepoint.unsafe_chr
end
end