Fix invalid byte sequence on EncodedString#split

bf4 · bf4 · commit e3276306caf3 · 2015-02-07T20:19:13.000-06:00
From: - rspec/rspec-core#1760 - via rspec#134
diff --git a/lib/rspec/support/encoded_string.rb b/lib/rspec/support/encoded_string.rb
@@ -3,16 +3,32 @@ module Support
     # @private
     class EncodedString
       # Reduce allocations by storing constants.
-      UTF_8 = "UTF-8"
-      US_ASCII = 'US-ASCII'
-      #  else: '?' 63.chr ("\x3F")
+      UTF_8    = "UTF-8"
+      US_ASCII = "US-ASCII"
+      #
+      # In MRI 2.1 'invalid: :replace' changed to also replace an invalid byte sequence
+      # see https://github.com/ruby/ruby/blob/v2_1_0/NEWS#L176
+      # https://www.ruby-forum.com/topic/6861247
+      # https://twitter.com/nalsh/status/553413844685438976
+      #
+      # For example, given:
+      #  "\x80".force_encoding("Emacs-Mule").encode(:invalid => :replace).bytes.to_a
+      #
+      # On MRI 2.1 or above: 63  # '?'
+      # else               : 128 # "\x80"
+      #
+      # Ruby's default replacement string is:
+      #   U+FFFD ("\xEF\xBF\xBD"), for Unicode encoding forms, else
+      #   ?      ("\x3F")
       REPLACE = "?"
       ENCODE_UNCONVERTABLE_BYTES =  {
         :invalid => :replace,
-        :undef   => :replace
+        :undef   => :replace,
+        :replace => REPLACE
       }
       ENCODE_NO_CONVERTER = {
         :invalid => :replace,
+        :replace => REPLACE
       }
 
       def initialize(string, encoding=nil)
@@ -64,13 +80,13 @@ def to_s
         #     vs "\x80".encode('UTF-8','US-ASCII', invalid: :replace, replace: '<byte>')
         #     # => '<byte>'
         #   ArgumentError
-        #    when operating on a string with invalid bytes
-        #     e.g."\xEF".split("\n")
+        #     when operating on a string with invalid bytes
+        #     e.g. "\x80".split("\n")
         #   TypeError
-        #    when a symbol is passed as an encoding
-        #    Encoding.find(:"utf-8")
-        #    when calling force_encoding on an object
-        #    that doesn't respond to #to_str
+        #     when a symbol is passed as an encoding
+        #     Encoding.find(:"UTF-8")
+        #     when calling force_encoding on an object
+        #     that doesn't respond to #to_str
         #
         # Raised by transcoding methods:
         #   Encoding::ConverterNotFoundError:
@@ -80,25 +96,38 @@ def to_s
         #     e.g. "\x80".force_encoding('ASCII-8BIT').encode('Emacs-Mule')
         #
         # Raised by byte <-> char conversions
-        #  RangeError: out of char range
-        #   e.g. the UTF-16LE emoji: 128169.chr
+        #   RangeError: out of char range
+        #    e.g. the UTF-16LE emoji: 128169.chr
         def matching_encoding(string)
+          string = remove_invalid_bytes(string)
           string.encode(@encoding)
         rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
-          normalize_missing(string.encode(@encoding, ENCODE_UNCONVERTABLE_BYTES))
+          string.encode(@encoding, ENCODE_UNCONVERTABLE_BYTES)
         rescue Encoding::ConverterNotFoundError
-          normalize_missing(string.dup.force_encoding(@encoding).encode(ENCODE_NO_CONVERTER))
+          string.dup.force_encoding(@encoding).encode(ENCODE_NO_CONVERTER)
         end
 
-        # Ruby's default replacement string is:
-        # for Unicode encoding forms: U+FFFD ("\xEF\xBF\xBD")
-        MRI_UNICODE_UNKOWN_CHARACTER = "\xEF\xBF\xBD".force_encoding(UTF_8)
-
-        def normalize_missing(string)
-          if @encoding.to_s == UTF_8
-            string.gsub(MRI_UNICODE_UNKOWN_CHARACTER, REPLACE)
-          else
+        # Work around bad bytes with a double conversion
+        # Prevents raising ArgumentError
+        #
+        # Emulates Ruby 2.1 String#scrub
+        # see https://github.com/hsbt/string-scrub
+        # https://github.com/ruby/ruby/blob/eeb05e8c11/doc/NEWS-2.1.0#L120-L123
+        # https://speakerdeck.com/samsaffron/why-ruby-2-dot-1-excites-me?slide=48
+        #
+        # Force UTF-8 encoding,
+        # Converting it to a higher character set (UTF-16) and then
+        # back (to UTF-8) replaces invalid byte sequences with REPLACE,
+        # Restore original encoding
+        def remove_invalid_bytes(string)
+          if string.valid_encoding?
             string
+          else
+            string.dup.
+              force_encoding(UTF_8).
+              encode(Encoding::UTF_16, UTF_8, ENCODE_NO_CONVERTER).
+              encode(UTF_8, Encoding::UTF_16).
+              force_encoding(string.encoding)
           end
         end
 
diff --git a/spec/rspec/support/encoded_string_spec.rb b/spec/rspec/support/encoded_string_spec.rb
@@ -80,14 +80,8 @@ module RSpec::Support
             }.to raise_error(Encoding::ConverterNotFoundError)
           end
 
-          # In MRI 2.1 'invalid: :replace' changed to also replace an invalid byte sequence
-          # see https://github.com/ruby/ruby/blob/v2_1_0/NEWS#L176
-          # https://www.ruby-forum.com/topic/6861247
-          # https://twitter.com/nalsh/status/553413844685438976
-          # For example, given:
-          #  "\x80".force_encoding("Emacs-Mule").encode(:invalid => :replace).bytes.to_a
-          # On MRI 2.1 or above: 63 # '?'
-          # else               : 128 # "\x80"
+          # See comment above ENCODE_UNCONVERTABLE_BYTES in encoded_string.rb
+          # for why the behavior differs by (MRI) Ruby version.
           if RUBY_VERSION < '2.1'
             it 'does nothing' do
               resulting_string = build_encoded_string(string, no_converter_encoding).to_s
@@ -220,6 +214,25 @@ module RSpec::Support
             ]
           end
         end
+
+        context 'when the string has an invalid byte sequence' do
+          let(:message_with_invalid_byte_sequence) { "\xEF \255 \xAD I have bad bytes".force_encoding(utf8_encoding) }
+
+          it 'normally raises an ArgumentError' do
+            expect(message_with_invalid_byte_sequence).not_to be_valid_encoding
+            expect {
+              message_with_invalid_byte_sequence.split("\n")
+            }.to raise_error(ArgumentError)
+          end
+
+          it 'replaces invalid bytes with the REPLACE string' do
+            resulting_array = build_encoded_string(message_with_invalid_byte_sequence, utf8_encoding).split("\n")
+            expected_string = "? ? ? I have bad bytes"
+            expect(resulting_array).to match [
+              a_string_identical_to(expected_string)
+            ]
+          end
+        end
       end
 
       def build_encoded_string(string, target_encoding = string.encoding)