-
-
Notifications
You must be signed in to change notification settings - Fork 102
Extracting EncodedString specs from #134 #151
Changes from all commits
19e967a
e3231f4
d159096
012e558
0b61339
494e3e7
d7c1885
9a13068
9eeedb9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,7 +2,21 @@ module RSpec | |
module Support | ||
# @private | ||
class EncodedString | ||
# Reduce allocations by storing constants. | ||
UTF_8 = "UTF-8" | ||
US_ASCII = 'US-ASCII' | ||
# Ruby's default replacement string is: | ||
# for Unicode encoding forms: U+FFFD ("\xEF\xBF\xBD") | ||
MRI_UNICODE_UNKOWN_CHARACTER = "\xEF\xBF\xBD" | ||
# else: '?' 63.chr ("\x3F") | ||
REPLACE = "?" | ||
ENCODE_UNCONVERTABLE_BYTES = { | ||
:invalid => :replace, | ||
:undef => :replace | ||
} | ||
ENCODE_NO_CONVERTER = { | ||
:invalid => :replace, | ||
} | ||
|
||
def initialize(string, encoding=nil) | ||
@encoding = encoding | ||
|
@@ -33,17 +47,55 @@ def to_s | |
|
||
private | ||
|
||
# Encoding Exceptions: | ||
# | ||
# Raised by Encoding and String methods: | ||
# Encoding::UndefinedConversionError: | ||
# when a transcoding operation fails | ||
# if the String contains characters invalid for the target encoding | ||
# e.g. "\x80".encode('UTF-8','ASCII-8BIT') | ||
# vs "\x80".encode('UTF-8','ASCII-8BIT', undef: :replace, replace: '<undef>') | ||
# # => '<undef>' | ||
# Encoding::CompatibilityError | ||
# when Enconding.compatbile?(str1, str2) is false | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe should be Encoding. :-) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Prol'y |
||
# e.g. utf_16le_emoji_string.split("\n") | ||
# e.g. valid_unicode_string.encode(utf8_encoding) << ascii_string | ||
# Encoding::InvalidByteSequenceError: | ||
# when the string being transcoded contains a byte invalid for | ||
# either the source or target encoding | ||
# e.g. "\x80".encode('UTF-8','US-ASCII') | ||
# vs "\x80".encode('UTF-8','US-ASCII', invalid: :replace, replace: '<byte>') | ||
# # => '<byte>' | ||
# ArgumentError | ||
# when operating on a string with invalid bytes | ||
# e.g."\xEF".split("\n") | ||
# TypeError | ||
# when a symbol is passed as an encoding | ||
# Encoding.find(:"utf-8") | ||
# when calling force_encoding on an object | ||
# that doesn't respond to #to_str | ||
# | ||
# Raised by transcoding methods: | ||
# Encoding::ConverterNotFoundError: | ||
# when a named encoding does not correspond with a known converter | ||
# e.g. 'abc'.force_encoding('UTF-8').encode('foo') | ||
# or a converter path cannot be found | ||
# e.g. "\x80".force_encoding('ASCII-8BIT').encode('Emacs-Mule') | ||
# | ||
# Raised by byte <-> char conversions | ||
# RangeError: out of char range | ||
# e.g. the UTF-16LE emoji: 128169.chr | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this too much detail in the comments? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The detail is helpful, actually. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nice docs. There's not enough about ruby encodings around the place so this is great. |
||
def matching_encoding(string) | ||
string.encode(@encoding) | ||
rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError | ||
normalize_missing(string.encode(@encoding, :invalid => :replace, :undef => :replace)) | ||
normalize_missing(string.encode(@encoding, ENCODE_UNCONVERTABLE_BYTES)) | ||
rescue Encoding::ConverterNotFoundError | ||
normalize_missing(string.force_encoding(@encoding).encode(:invalid => :replace)) | ||
normalize_missing(string.force_encoding(@encoding).encode(ENCODE_NO_CONVERTER)) | ||
end | ||
|
||
def normalize_missing(string) | ||
if @encoding.to_s == "UTF-8" | ||
string.gsub(MRI_UNICODE_UNKOWN_CHARACTER.force_encoding(@encoding), "?") | ||
if @encoding.to_s == UTF_8 | ||
string.gsub(MRI_UNICODE_UNKOWN_CHARACTER.force_encoding(@encoding), REPLACE) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There're no functional changes in this file. Just more tests. |
||
else | ||
string | ||
end | ||
|
@@ -61,7 +113,7 @@ def matching_encoding(string) | |
end | ||
|
||
def detect_source_encoding(_string) | ||
'US-ASCII' | ||
US_ASCII | ||
end | ||
end | ||
end | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,51 +1,162 @@ | ||
# encoding: utf-8 | ||
require 'spec_helper' | ||
require 'rspec/support/encoded_string' | ||
|
||
# Special matcher for comparing encoded strings so that | ||
# we don't run any expectation failures through the Differ, | ||
# which also relies on EncodedString. Instead, confirm the | ||
# strings have the same encoding and same bytes. | ||
RSpec::Matchers.define :be_identical_string do |expected| | ||
|
||
if String.method_defined?(:encoding) | ||
match do | ||
actual.encoding == expected.encoding && | ||
actual.bytes.to_a == expected.bytes.to_a | ||
end | ||
|
||
failure_message do | ||
"expected #{actual.inspect} (#{actual.encoding.name}) to be identical to "\ | ||
"#{expected.inspect} (#{expected.encoding.name})" | ||
end | ||
else | ||
match do |actual| | ||
actual.split(//) == expected.split(//) | ||
end | ||
end | ||
end | ||
RSpec::Matchers.alias_matcher :a_string_identical_to, :be_identical_string | ||
|
||
module RSpec::Support | ||
describe EncodedString do | ||
let(:target_encoding) { 'UTF-8' } | ||
let(:utf8_encoding) { 'UTF-8' } | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this just clarifies the language |
||
|
||
delegated_methods = String.instance_methods.map(&:to_s) & %w[eql? lines == encoding empty?] | ||
delegated_methods.each do |delegated_method| | ||
it "responds to #{delegated_method}" do | ||
encoded_string = EncodedString.new("abc", target_encoding) | ||
encoded_string = EncodedString.new("abc", utf8_encoding) | ||
expect(encoded_string).to respond_to(delegated_method) | ||
end | ||
end | ||
|
||
if String.method_defined?(:encoding) | ||
|
||
describe '#source_encoding' do | ||
it 'knows the original encoding of the string' do | ||
str = EncodedString.new("abc".encode('ASCII-8BIT'), "UTF-8") | ||
expect( str.source_encoding.to_s ).to eq('ASCII-8BIT') | ||
expect(str.source_encoding.to_s).to eq('ASCII-8BIT') | ||
end | ||
end | ||
|
||
let(:ascii_arrow_symbol) { "\xAE" } | ||
describe '#to_s' do | ||
context 'when encoding a string with invalid bytes in the target encoding' do | ||
# see https://github.com/jruby/jruby/blob/c1be61a501/test/mri/ruby/test_transcode.rb#L13 | ||
let(:source_encoding) { Encoding.find('US-ASCII') } | ||
let(:target_encoding) { Encoding.find('UTF-8') } | ||
let(:string) { "I have a bad byté\x80".force_encoding(source_encoding) } | ||
|
||
it 'normally raises an EncodedString::InvalidByteSequenceError' do | ||
expect { | ||
string.encode(target_encoding) | ||
}.to raise_error(Encoding::InvalidByteSequenceError) | ||
end | ||
|
||
it 'replaces invalid byte sequences with the REPLACE string' do | ||
resulting_string = build_encoded_string(string, target_encoding).to_s | ||
replacement = EncodedString::REPLACE * 3 | ||
expected_string = "I have a bad byt#{replacement}".force_encoding(target_encoding) | ||
expect(resulting_string).to be_identical_string(expected_string) | ||
end | ||
end | ||
|
||
context 'when no converter is known for an encoding' do | ||
# see https://github.com/rubyspec/rubyspec/blob/91ce9f6549/core/string/shared/encode.rb#L12 | ||
let(:source_encoding) { Encoding.find('ASCII-8BIT') } | ||
let(:no_converter_encoding) { Encoding::Emacs_Mule } | ||
let(:string) { "\x80".force_encoding(source_encoding) } | ||
|
||
it 'normally raises an Encoding::ConverterNotFoundError' do | ||
expect { | ||
string.encode(no_converter_encoding) | ||
}.to raise_error(Encoding::ConverterNotFoundError) | ||
end | ||
|
||
# In MRI 2.1 'invalid: :replace' changed to also replace an invalid byte sequence | ||
# see https://github.com/ruby/ruby/blob/v2_1_0/NEWS#L176 | ||
# https://www.ruby-forum.com/topic/6861247 | ||
# https://twitter.com/nalsh/status/553413844685438976 | ||
# For example, given: | ||
# "\x80".force_encoding("Emacs-Mule").encode(:invalid => :replace).bytes.to_a | ||
# On MRI 2.1 or above: 63 # '?' | ||
# else : 128 # "\x80" | ||
if RUBY_VERSION < '2.1' | ||
it 'does nothing' do | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why does it do nothing on 1.9 and 2.0? I didn't think 2.1 had new encoding features...shouldn't it act the same on those rubies? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't know why. Try it in irb. I can't explain it. Maybe it's a bug in 2.0? "\x80".force_encoding("ASCII-8BIT").force_encoding("Emacs-Mule").encode(:invalid => :replace).bytes.to_a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Huh, weird. You mind leaving a comment that provides this snippet to demonstrate the different 2.0 vs 2.1+ behavior? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. LANG=en_US.UTF-8 LC_ALL=en_US.UTF-8 \
rvm ruby-1.9.2-p330,ruby-1.9.3-p551,ruby-2.0.0-p598,ruby-2.1.5,ruby-2.2.0,jruby-1.7.18,rbx-2.2.2 do \
ruby -e 'p [RUBY_VERSION, RUBY_ENGINE, Encoding.default_external, __ENCODING__] + "\x80".force_encoding("ASCII-8BIT").force_encoding("Emacs-Mule").encode(:invalid => :replace).bytes.to_a'
for version in 1.9 2.0; do \
export JRUBY_OPTS="-Xcompat.version=${version}" ; \
LANG=en_US.UTF-8 LC_ALL=en_US.UTF-8 \
rvm jruby-1.7.18 do \
ruby -e 'p [RUBY_VERSION, RUBY_ENGINE, Encoding.default_external, __ENCODING__] + "\x80".force_encoding("ASCII-8BIT").force_encoding("Emacs-Mule").encode(:invalid => :replace).bytes.to_a' ; done
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. fwiw I just asked about this on the ruby forum https://www.ruby-forum.com/topic/6861247 and got a reply, and also on twitter https://twitter.com/nalsh/status/553413844685438976 links to Ruby 2.1 release news
|
||
resulting_string = build_encoded_string(string, no_converter_encoding).to_s | ||
expected_string = "\x80".force_encoding(no_converter_encoding) | ||
expect(resulting_string).to be_identical_string(expected_string) | ||
end | ||
else | ||
it 'forces the encoding and replaces invalid characters with the REPLACE string' do | ||
resulting_string = build_encoded_string(string, no_converter_encoding).to_s | ||
expected_string = EncodedString::REPLACE.force_encoding(no_converter_encoding) | ||
expect(resulting_string).to be_identical_string(expected_string) | ||
end | ||
end | ||
end | ||
|
||
# see https://github.com/ruby/ruby/blob/34fbf57aaa/transcode.c#L4289 | ||
# ISO-8859-1 -> UTF-8 -> EUC-JP | ||
# "\xa0" NO-BREAK SPACE, which is available in UTF-8 but not in EUC-JP | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If I understand your question, it's why talk about UTF-8 when we're talking about ISO-8859-1 -> EUC-JP? If that's your question, it's because when Ruby determines a conversion path it passes through UTF-8 p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP")
#=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
# [#<Encoding:UTF-8>, #<Encoding:EUC-JP>]] and see https://github.com/ruby/ruby/blob/34fbf57aaa/transcode.c#L4242-L4250 and https://github.com/ruby/ruby/blob/34fbf57aaa/transcode.c#L3917-L3973 (and maybe https://github.com/ruby/ruby/blob/34fbf57aaa/transcode.c#L4289-L4294 etc) And then probably also look at the weirdness that is converter not found: https://github.com/rubyspec/rubyspec/blob/archive/core/string/shared/encode.rb#L96-L101 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Wow, I had no idea just how weird and complex ruby encoding behavior is! |
||
context 'when there is an undefined conversion to the target encoding' do | ||
let(:source_encoding) { Encoding.find('ISO-8859-1') } | ||
let(:incompatible_encoding) { Encoding.find('EUC-JP') } | ||
let(:string) { "\xa0 hi I am not going to work".force_encoding(source_encoding) } | ||
|
||
it 'normally raises an Encoding::UndefinedConversionError' do | ||
expect { | ||
string.encode(incompatible_encoding) | ||
}.to raise_error(Encoding::UndefinedConversionError) | ||
end | ||
|
||
it 'replaces all undefines conversions with the REPLACE string' do | ||
resulting_string = build_encoded_string(string, incompatible_encoding).to_s | ||
replacement = EncodedString::REPLACE | ||
expected_string = "#{replacement} hi I am not going to work".force_encoding('EUC-JP') | ||
expect(resulting_string).to be_identical_string(expected_string) | ||
end | ||
end | ||
end | ||
|
||
let(:ascii_arrow_symbol) { "\xAE" } | ||
let(:utf_8_euro_symbol) { "\xE2\x82\xAC" } | ||
|
||
describe '#<<' do | ||
context 'with strings that can be converted to the target encoding' do | ||
it 'encodes and appends the string' do | ||
valid_ascii_string = "abc".force_encoding("ASCII-8BIT") | ||
valid_unicode_string = utf_8_euro_symbol.force_encoding('UTF-8') | ||
let(:valid_ascii_string) { "abcde".force_encoding("ASCII-8BIT") } | ||
let(:valid_unicode_string) { utf_8_euro_symbol.force_encoding('UTF-8') } | ||
|
||
resulting_string = build_encoded_string(valid_unicode_string, target_encoding) << valid_ascii_string | ||
expect(resulting_string).to eq "#{utf_8_euro_symbol}abc".force_encoding('UTF-8') | ||
it 'encodes and appends the string' do | ||
resulting_string = build_encoded_string(valid_unicode_string, utf8_encoding) << valid_ascii_string | ||
expected_string = "#{utf_8_euro_symbol}abcde".force_encoding('UTF-8') | ||
expect(resulting_string).to be_identical_string(expected_string) | ||
end | ||
end | ||
|
||
context 'with a string that cannot be converted to the target encoding' do | ||
it 'replaces undefined characters with either a ? or a unicode ?' do | ||
ascii_string = ascii_arrow_symbol.force_encoding("ASCII-8BIT") | ||
valid_unicode_string = utf_8_euro_symbol.force_encoding('UTF-8') | ||
|
||
resulting_string = build_encoded_string(valid_unicode_string, target_encoding) << ascii_string | ||
expected_bytes = utf_8_euro_symbol.each_byte.to_a + ["?".unpack("c").first] | ||
actual_bytes = resulting_string.each_byte.to_a | ||
|
||
expect(actual_bytes).to eq(expected_bytes) | ||
context 'when appending a string with an incompatible character encoding' do | ||
let(:ascii_string) { ascii_arrow_symbol.force_encoding("ASCII-8BIT") } | ||
let(:valid_unicode_string) { utf_8_euro_symbol.force_encoding('UTF-8') } | ||
|
||
it "normally raises an Encoding::CompatibilityError" do | ||
expect { | ||
valid_unicode_string.encode(utf8_encoding) << ascii_string | ||
}.to raise_error(Encoding::CompatibilityError) | ||
end | ||
|
||
it 'replaces unconvertable characters with the REPLACE string' do | ||
resulting_string = build_encoded_string(valid_unicode_string, utf8_encoding) << ascii_string | ||
expected_string = "#{utf_8_euro_symbol}#{EncodedString::REPLACE}" | ||
expect(resulting_string).to be_identical_string(expected_string) | ||
end | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This spec confuses me...the doc string says it replaces the characters with their hex value but the example code doesn't seem to demonstrate that... There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh, my bad. s/hex/REPLACE. (it was a changed behavior among the churn). the arrow becomes a '?' There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Gotcha. I trust you'll update the doc string then :). One thing I don't quite understand is the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. per http://stackoverflow.com/questions/21289181/char174-returning-the-value-of-char0174-why which links to http://www.theasciicode.com.ar/extended-ascii-code/angle-quotes-guillemets-right-pointing-double-angle-french-quotation-marks-ascii-code-174.html
e.g. CP1252 or Windows-1251 or something like that |
||
end | ||
end | ||
|
||
|
@@ -54,31 +165,67 @@ module RSpec::Support | |
ascii_string = 'abc'.force_encoding("ASCII-8BIT") | ||
other_ascii_string = '123'.force_encoding("ASCII-8BIT") | ||
|
||
resulting_string = build_encoded_string(ascii_string, target_encoding) << other_ascii_string | ||
expect(resulting_string.encoding.to_s).to eq 'UTF-8' | ||
resulting_string = build_encoded_string(ascii_string, utf8_encoding) << other_ascii_string | ||
expected_string = 'abc123'.force_encoding(utf8_encoding) | ||
expect(resulting_string).to be_identical_string(expected_string) | ||
end | ||
end | ||
end | ||
|
||
describe '#split' do | ||
it 'splits the string based on the delimiter accounting for encoding' do | ||
wrapped_string = "aaaaaaaaaaa#{ascii_arrow_symbol}aaaaa".force_encoding("ASCII-8BIT") | ||
context 'when there is an undefined conversion to the target encoding' do | ||
let(:wrapped_string_template) { "abaaaaaaaaaa%saaaaa" } | ||
let(:wrapped_string) { sprintf(wrapped_string_template, ascii_arrow_symbol).force_encoding("ASCII-8BIT") } | ||
|
||
it 'normally raises an Encoding::UndefinedConversionError' do | ||
expect { | ||
wrapped_string.encode(utf8_encoding) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This confuses me -- I thought that in this context we were specifying the behavior of |
||
}.to raise_error(Encoding::UndefinedConversionError) | ||
end | ||
|
||
it 'splits the string based on the delimiter accounting for encoding' do | ||
delimiter = "b".force_encoding(utf8_encoding) | ||
resulting_string = build_encoded_string(wrapped_string, utf8_encoding). | ||
split(delimiter) | ||
exp1, exp2 = sprintf(wrapped_string_template, EncodedString::REPLACE).force_encoding(utf8_encoding).split(delimiter) | ||
expect(resulting_string).to match [ | ||
a_string_identical_to(exp1), | ||
a_string_identical_to(exp2) | ||
] | ||
end | ||
end | ||
|
||
expect { | ||
build_encoded_string(wrapped_string, target_encoding).split(utf_8_euro_symbol.force_encoding("UTF-8")) | ||
}.not_to raise_error | ||
# see https://github.com/rspec/rspec-expectations/blob/f8a1232/spec/rspec/expectations/fail_with_spec.rb#L50 | ||
# https://github.com/rspec/rspec-expectations/issues/201 | ||
# https://github.com/rspec/rspec-expectations/pull/220 | ||
context 'with a string that cannot be converted to the target encoding' do | ||
let(:binary_poop) {'💩' } # [128169] "\u{1F4A9}" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Lame that chrome doesn't render this character :(. (I see it render in my editor fine, though). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
let(:non_ascii_compatible_string) { "This is a pile of poo: #{binary_poop}, yuck".encode("UTF-16LE") } | ||
|
||
it 'normally raises an Encoding::CompatibilityError' do | ||
expect { | ||
non_ascii_compatible_string.split("\n") | ||
}.to raise_error(Encoding::CompatibilityError) | ||
end | ||
|
||
it 'makes no changes to the resulting string' do | ||
resulting_array = build_encoded_string(non_ascii_compatible_string).split("\n") | ||
expect(resulting_array).to match [ | ||
a_string_identical_to(non_ascii_compatible_string) | ||
] | ||
end | ||
end | ||
end | ||
|
||
def build_encoded_string(string, target_encoding) | ||
def build_encoded_string(string, target_encoding = string.encoding) | ||
EncodedString.new(string, target_encoding) | ||
end | ||
else | ||
|
||
describe '#source_encoding' do | ||
it 'defaults to US-ASCII' do | ||
str = EncodedString.new("abc", "UTF-8") | ||
expect( str.source_encoding ).to eq('US-ASCII') | ||
expect(str.source_encoding).to eq('US-ASCII') | ||
end | ||
end | ||
end | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this is technically incorrect. as written, it's when
nil
. I'll revise if everything else is ok.I was confusing enc.ascii_compatible?
with rb_enc_check
where
enc_compatible
is `Encoding.compatible?(str1,str2)There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, please fix this. (Good catch!). Also, s/compatbile/compatible/