Skip to content

Commit e327630

Browse files
committed
Fix invalid byte sequence on EncodedString#split
From: - rspec/rspec-core#1760 - via rspec#134
1 parent b666ad5 commit e327630

File tree

2 files changed

+72
-30
lines changed

2 files changed

+72
-30
lines changed

lib/rspec/support/encoded_string.rb

Lines changed: 51 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -3,16 +3,32 @@ module Support
33
# @private
44
class EncodedString
55
# Reduce allocations by storing constants.
6-
UTF_8 = "UTF-8"
7-
US_ASCII = 'US-ASCII'
8-
# else: '?' 63.chr ("\x3F")
6+
UTF_8 = "UTF-8"
7+
US_ASCII = "US-ASCII"
8+
#
9+
# In MRI 2.1 'invalid: :replace' changed to also replace an invalid byte sequence
10+
# see https://github.com/ruby/ruby/blob/v2_1_0/NEWS#L176
11+
# https://www.ruby-forum.com/topic/6861247
12+
# https://twitter.com/nalsh/status/553413844685438976
13+
#
14+
# For example, given:
15+
# "\x80".force_encoding("Emacs-Mule").encode(:invalid => :replace).bytes.to_a
16+
#
17+
# On MRI 2.1 or above: 63 # '?'
18+
# else : 128 # "\x80"
19+
#
20+
# Ruby's default replacement string is:
21+
# U+FFFD ("\xEF\xBF\xBD"), for Unicode encoding forms, else
22+
# ? ("\x3F")
923
REPLACE = "?"
1024
ENCODE_UNCONVERTABLE_BYTES = {
1125
:invalid => :replace,
12-
:undef => :replace
26+
:undef => :replace,
27+
:replace => REPLACE
1328
}
1429
ENCODE_NO_CONVERTER = {
1530
:invalid => :replace,
31+
:replace => REPLACE
1632
}
1733

1834
def initialize(string, encoding=nil)
@@ -64,13 +80,13 @@ def to_s
6480
# vs "\x80".encode('UTF-8','US-ASCII', invalid: :replace, replace: '<byte>')
6581
# # => '<byte>'
6682
# ArgumentError
67-
# when operating on a string with invalid bytes
68-
# e.g."\xEF".split("\n")
83+
# when operating on a string with invalid bytes
84+
# e.g."\x80".split("\n")
6985
# TypeError
70-
# when a symbol is passed as an encoding
71-
# Encoding.find(:"utf-8")
72-
# when calling force_encoding on an object
73-
# that doesn't respond to #to_str
86+
# when a symbol is passed as an encoding
87+
# Encoding.find(:"UTF-8")
88+
# when calling force_encoding on an object
89+
# that doesn't respond to #to_str
7490
#
7591
# Raised by transcoding methods:
7692
# Encoding::ConverterNotFoundError:
@@ -80,25 +96,38 @@ def to_s
8096
# e.g. "\x80".force_encoding('ASCII-8BIT').encode('Emacs-Mule')
8197
#
8298
# Raised by byte <-> char conversions
83-
# RangeError: out of char range
84-
# e.g. the UTF-16LE emoji: 128169.chr
99+
# RangeError: out of char range
100+
# e.g. the UTF-16LE emoji: 128169.chr
85101
def matching_encoding(string)
102+
string = remove_invalid_bytes(string)
86103
string.encode(@encoding)
87104
rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
88-
normalize_missing(string.encode(@encoding, ENCODE_UNCONVERTABLE_BYTES))
105+
string.encode(@encoding, ENCODE_UNCONVERTABLE_BYTES)
89106
rescue Encoding::ConverterNotFoundError
90-
normalize_missing(string.dup.force_encoding(@encoding).encode(ENCODE_NO_CONVERTER))
107+
string.dup.force_encoding(@encoding).encode(ENCODE_NO_CONVERTER)
91108
end
92109

93-
# Ruby's default replacement string is:
94-
# for Unicode encoding forms: U+FFFD ("\xEF\xBF\xBD")
95-
MRI_UNICODE_UNKOWN_CHARACTER = "\xEF\xBF\xBD".force_encoding(UTF_8)
96-
97-
def normalize_missing(string)
98-
if @encoding.to_s == UTF_8
99-
string.gsub(MRI_UNICODE_UNKOWN_CHARACTER, REPLACE)
100-
else
110+
# Work around bad bytes with a double conversion
111+
# Prevents raising ArgumentError
112+
#
113+
# Emulates Ruby 2.1 String#scrub
114+
# see https://github.com/hsbt/string-scrub
115+
# https://github.com/ruby/ruby/blob/eeb05e8c11/doc/NEWS-2.1.0#L120-L123
116+
# https://speakerdeck.com/samsaffron/why-ruby-2-dot-1-excites-me?slide=48
117+
#
118+
# Force UTF-8 encoding,
119+
# Converting it to a higher higher character set (UTF-16) and then
120+
# back (to UTF-8) ensures that you will strip away invalid or undefined byte sequences,
121+
# Restore original encoding
122+
def remove_invalid_bytes(string)
123+
if string.valid_encoding?
101124
string
125+
else
126+
string.dup.
127+
force_encoding(UTF_8)
128+
encode(Encoding::UTF_16, UTF_8, ENCODE_NO_CONVERTER).
129+
encode(UTF_8, Encoding::UTF_16).
130+
force_encoding(string.encoding)
102131
end
103132
end
104133

spec/rspec/support/encoded_string_spec.rb

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -80,14 +80,8 @@ module RSpec::Support
8080
}.to raise_error(Encoding::ConverterNotFoundError)
8181
end
8282

83-
# In MRI 2.1 'invalid: :replace' changed to also replace an invalid byte sequence
84-
# see https://github.com/ruby/ruby/blob/v2_1_0/NEWS#L176
85-
# https://www.ruby-forum.com/topic/6861247
86-
# https://twitter.com/nalsh/status/553413844685438976
87-
# For example, given:
88-
# "\x80".force_encoding("Emacs-Mule").encode(:invalid => :replace).bytes.to_a
89-
# On MRI 2.1 or above: 63 # '?'
90-
# else : 128 # "\x80"
83+
# See comment above ENCODE_UNCONVERTABLE_BYTES in encoded_string.rb
84+
# for why the behavior differs by (MRI) Ruby version.
9185
if RUBY_VERSION < '2.1'
9286
it 'does nothing' do
9387
resulting_string = build_encoded_string(string, no_converter_encoding).to_s
@@ -220,6 +214,25 @@ module RSpec::Support
220214
]
221215
end
222216
end
217+
218+
context 'when the string has an invalid byte sequence' do
219+
let(:message_with_invalid_byte_sequence) { "\xEF \255 \xAD I have bad bytes".force_encoding(utf8_encoding) }
220+
221+
it 'normally raises an ArgumentError' do
222+
expect(message_with_invalid_byte_sequence).not_to be_valid_encoding
223+
expect {
224+
message_with_invalid_byte_sequence.split("\n")
225+
}.to raise_error(ArgumentError)
226+
end
227+
228+
it 'replaces invalid bytes with the REPLACE string' do
229+
resulting_array = build_encoded_string(message_with_invalid_byte_sequence, utf8_encoding).split("\n")
230+
expected_string = "? ? ? I have bad bytes"
231+
expect(resulting_array).to match [
232+
a_string_identical_to(expected_string)
233+
]
234+
end
235+
end
223236
end
224237

225238
def build_encoded_string(string, target_encoding = string.encoding)

0 commit comments

Comments
 (0)