rspec · myronmarston · Jan 23, 2015 · Jan 7, 2015 · Jan 7, 2015 · Jan 7, 2015
diff --git a/lib/rspec/support/encoded_string.rb b/lib/rspec/support/encoded_string.rb
@@ -2,7 +2,21 @@ module RSpec
   module Support
     # @private
     class EncodedString
+      # Reduce allocations by storing constants.
+      UTF_8 = "UTF-8"
+      US_ASCII = 'US-ASCII'
+      # Ruby's default replacement string is:
+      # for Unicode encoding forms: U+FFFD ("\xEF\xBF\xBD")
       MRI_UNICODE_UNKOWN_CHARACTER = "\xEF\xBF\xBD"
+      # for all other encodings: '?' (63.chr, "\x3F")
+      REPLACE = "?"
+      ENCODE_UNCONVERTABLE_BYTES =  {
+        :invalid => :replace,
+        :undef   => :replace
+      }
+      ENCODE_NO_CONVERTER = {
+        :invalid => :replace,
+      }
 
       def initialize(string, encoding=nil)
         @encoding = encoding
@@ -33,17 +47,55 @@ def to_s
 
         private
 
+        # Encoding Exceptions:
+        #
+        # Raised by Encoding and String methods:
+        #   Encoding::UndefinedConversionError:
+        #     when a transcoding operation fails
+        #     if the String contains characters invalid for the target encoding
+        #     e.g. "\x80".encode('UTF-8','ASCII-8BIT')
+        #     vs "\x80".encode('UTF-8','ASCII-8BIT', undef: :replace, replace: '<undef>')
+        #     # => '<undef>'
+        #   Encoding::CompatibilityError
+        #    when Encoding.compatible?(str1, str2) returns nil
+        #     e.g. utf_16le_emoji_string.split("\n")
+        #     e.g. valid_unicode_string.encode(utf8_encoding) << ascii_string
+        #   Encoding::InvalidByteSequenceError:
+        #     when the string being transcoded contains a byte invalid for
+        #     either the source or target encoding
+        #     e.g. "\x80".encode('UTF-8','US-ASCII')
+        #     vs "\x80".encode('UTF-8','US-ASCII', invalid: :replace, replace: '<byte>')
+        #     # => '<byte>'
+        #   ArgumentError
+        #    when operating on a string with invalid bytes
+        #     e.g. "\xEF".split("\n")
+        #   TypeError
+        #    when a symbol is passed as an encoding
+        #     e.g. Encoding.find(:"utf-8")
+        #    when calling force_encoding with an argument
+        #    that doesn't respond to #to_str
+        #
+        # Raised by transcoding methods:
+        #   Encoding::ConverterNotFoundError:
+        #     when a named encoding does not correspond with a known converter
+        #     e.g. 'abc'.force_encoding('UTF-8').encode('foo')
+        #     or a converter path cannot be found
+        #     e.g. "\x80".force_encoding('ASCII-8BIT').encode('Emacs-Mule')
+        #
+        # Raised by byte <-> char conversions
+        #  RangeError: out of char range
+        #   e.g. the UTF-16LE emoji: 128169.chr
         def matching_encoding(string)
           string.encode(@encoding)
         rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
-          normalize_missing(string.encode(@encoding, :invalid => :replace, :undef => :replace))
+          normalize_missing(string.encode(@encoding, ENCODE_UNCONVERTABLE_BYTES))
         rescue Encoding::ConverterNotFoundError
-          normalize_missing(string.force_encoding(@encoding).encode(:invalid => :replace))
+          normalize_missing(string.force_encoding(@encoding).encode(ENCODE_NO_CONVERTER))
         end
 
         def normalize_missing(string)
-          if @encoding.to_s == "UTF-8"
-            string.gsub(MRI_UNICODE_UNKOWN_CHARACTER.force_encoding(@encoding), "?")
+          if @encoding.to_s == UTF_8
+            string.gsub(MRI_UNICODE_UNKOWN_CHARACTER.force_encoding(@encoding), REPLACE)
           else
             string
           end
@@ -61,7 +113,7 @@ def matching_encoding(string)
         end
 
         def detect_source_encoding(_string)
-          'US-ASCII'
+          US_ASCII
         end
       end
     end

diff --git a/spec/rspec/support/encoded_string_spec.rb b/spec/rspec/support/encoded_string_spec.rb
@@ -1,51 +1,162 @@
+# encoding: utf-8
 require 'spec_helper'
 require 'rspec/support/encoded_string'
 
+# Special matcher for comparing encoded strings so that
+# we don't run any expectation failures through the Differ,
+# which also relies on EncodedString. Instead, confirm the
+# strings have the same encoding and same bytes.
+RSpec::Matchers.define :be_identical_string do |expected|
+
+  if String.method_defined?(:encoding)
+    match do
+      actual.encoding == expected.encoding &&
+        actual.bytes.to_a == expected.bytes.to_a
+    end
+
+    failure_message do
+      "expected #{actual.inspect} (#{actual.encoding.name}) to be identical to "\
+        "#{expected.inspect} (#{expected.encoding.name})"
+    end
+  else
+    match do |actual|
+      actual.split(//) == expected.split(//)
+    end
+  end
+end
+RSpec::Matchers.alias_matcher :a_string_identical_to, :be_identical_string
+
 module RSpec::Support
   describe EncodedString do
-    let(:target_encoding) { 'UTF-8' }
+    let(:utf8_encoding) { 'UTF-8' }
 
     delegated_methods = String.instance_methods.map(&:to_s) & %w[eql? lines == encoding empty?]
     delegated_methods.each do |delegated_method|
       it "responds to #{delegated_method}" do
-        encoded_string = EncodedString.new("abc", target_encoding)
+        encoded_string = EncodedString.new("abc", utf8_encoding)
         expect(encoded_string).to respond_to(delegated_method)
       end
     end
 
     if String.method_defined?(:encoding)
+
       describe '#source_encoding' do
         it 'knows the original encoding of the string' do
           str = EncodedString.new("abc".encode('ASCII-8BIT'), "UTF-8")
-          expect( str.source_encoding.to_s ).to eq('ASCII-8BIT')
+          expect(str.source_encoding.to_s).to eq('ASCII-8BIT')
         end
       end
 
-      let(:ascii_arrow_symbol) { "\xAE" }
+      describe '#to_s' do
+        context 'when encoding a string with invalid bytes in the target encoding' do
+          # see https://github.com/jruby/jruby/blob/c1be61a501/test/mri/ruby/test_transcode.rb#L13
+          let(:source_encoding) { Encoding.find('US-ASCII') }
+          let(:target_encoding) { Encoding.find('UTF-8') }
+          let(:string) { "I have a bad byté\x80".force_encoding(source_encoding) }
+
+          it 'normally raises an EncodedString::InvalidByteSequenceError' do
+            expect {
+              string.encode(target_encoding)
+            }.to raise_error(Encoding::InvalidByteSequenceError)
+          end
+
+          it 'replaces invalid byte sequences with the REPLACE string' do
+            resulting_string = build_encoded_string(string, target_encoding).to_s
+            replacement = EncodedString::REPLACE * 3
+            expected_string = "I have a bad byt#{replacement}".force_encoding(target_encoding)
+            expect(resulting_string).to be_identical_string(expected_string)
+          end
+        end
+
+        context 'when no converter is known for an encoding' do
+          # see https://github.com/rubyspec/rubyspec/blob/91ce9f6549/core/string/shared/encode.rb#L12
+          let(:source_encoding) { Encoding.find('ASCII-8BIT') }
+          let(:no_converter_encoding) { Encoding::Emacs_Mule }
+          let(:string) { "\x80".force_encoding(source_encoding) }
+
+          it 'normally raises an Encoding::ConverterNotFoundError' do
+            expect {
+              string.encode(no_converter_encoding)
+            }.to raise_error(Encoding::ConverterNotFoundError)
+          end
+
+          # In MRI 2.1 'invalid: :replace' changed to also replace an invalid byte sequence
+          # see https://github.com/ruby/ruby/blob/v2_1_0/NEWS#L176
+          # https://www.ruby-forum.com/topic/6861247
+          # https://twitter.com/nalsh/status/553413844685438976
+          # For example, given:
+          #  "\x80".force_encoding("Emacs-Mule").encode(:invalid => :replace).bytes.to_a
+          # On MRI 2.1 or above: 63 # '?'
+          # else               : 128 # "\x80"
+          if RUBY_VERSION < '2.1'
+            it 'does nothing' do
+              resulting_string = build_encoded_string(string, no_converter_encoding).to_s
+              expected_string  = "\x80".force_encoding(no_converter_encoding)
+              expect(resulting_string).to be_identical_string(expected_string)
+            end
+          else
+            it 'forces the encoding and replaces invalid characters with the REPLACE string' do
+              resulting_string = build_encoded_string(string, no_converter_encoding).to_s
+              expected_string  = EncodedString::REPLACE.force_encoding(no_converter_encoding)
+              expect(resulting_string).to be_identical_string(expected_string)
+            end
+          end
+        end
 
+        # see https://github.com/ruby/ruby/blob/34fbf57aaa/transcode.c#L4289
+        # ISO-8859-1 -> UTF-8 -> EUC-JP
+        # "\xa0" NO-BREAK SPACE, which is available in UTF-8 but not in EUC-JP
+        context 'when there is an undefined conversion to the target encoding' do
+          let(:source_encoding) { Encoding.find('ISO-8859-1') }
+          let(:incompatible_encoding) { Encoding.find('EUC-JP') }
+          let(:string) { "\xa0 hi I am not going to work".force_encoding(source_encoding) }
+
+          it 'normally raises an Encoding::UndefinedConversionError' do
+            expect {
+              string.encode(incompatible_encoding)
+            }.to raise_error(Encoding::UndefinedConversionError)
+          end
+
+          it 'replaces all undefines conversions with the REPLACE string' do
+            resulting_string = build_encoded_string(string, incompatible_encoding).to_s
+            replacement = EncodedString::REPLACE
+            expected_string = "#{replacement} hi I am not going to work".force_encoding('EUC-JP')
+            expect(resulting_string).to be_identical_string(expected_string)
+          end
+        end
+      end
+
+      let(:ascii_arrow_symbol) { "\xAE" }
       let(:utf_8_euro_symbol) { "\xE2\x82\xAC" }
 
       describe '#<<' do
         context 'with strings that can be converted to the target encoding' do
-          it 'encodes and appends the string' do
-            valid_ascii_string = "abc".force_encoding("ASCII-8BIT")
-            valid_unicode_string = utf_8_euro_symbol.force_encoding('UTF-8')
+          let(:valid_ascii_string) { "abcde".force_encoding("ASCII-8BIT") }
+          let(:valid_unicode_string) { utf_8_euro_symbol.force_encoding('UTF-8') }
 
-            resulting_string = build_encoded_string(valid_unicode_string, target_encoding) << valid_ascii_string
-            expect(resulting_string).to eq "#{utf_8_euro_symbol}abc".force_encoding('UTF-8')
+          it 'encodes and appends the string' do
+            resulting_string = build_encoded_string(valid_unicode_string, utf8_encoding) << valid_ascii_string
+            expected_string = "#{utf_8_euro_symbol}abcde".force_encoding('UTF-8')
+            expect(resulting_string).to be_identical_string(expected_string)
           end
         end
 
         context 'with a string that cannot be converted to the target encoding' do
-          it 'replaces undefined characters with either a ? or a unicode ?' do
-            ascii_string = ascii_arrow_symbol.force_encoding("ASCII-8BIT")
-            valid_unicode_string = utf_8_euro_symbol.force_encoding('UTF-8')
-
-            resulting_string = build_encoded_string(valid_unicode_string, target_encoding) << ascii_string
-            expected_bytes = utf_8_euro_symbol.each_byte.to_a + ["?".unpack("c").first]
-            actual_bytes = resulting_string.each_byte.to_a
-
-            expect(actual_bytes).to eq(expected_bytes)
+          context 'when appending a string with an incompatible character encoding' do
+            let(:ascii_string) { ascii_arrow_symbol.force_encoding("ASCII-8BIT") }
+            let(:valid_unicode_string) { utf_8_euro_symbol.force_encoding('UTF-8') }
+
+            it "normally raises an Encoding::CompatibilityError" do
+              expect {
+                valid_unicode_string.encode(utf8_encoding) << ascii_string
+              }.to raise_error(Encoding::CompatibilityError)
+            end
+
+            it 'replaces unconvertable characters with the REPLACE string' do
+              resulting_string = build_encoded_string(valid_unicode_string, utf8_encoding) << ascii_string
+              expected_string = "#{utf_8_euro_symbol}#{EncodedString::REPLACE}"
+              expect(resulting_string).to be_identical_string(expected_string)
+            end
           end
         end
 
@@ -54,31 +165,67 @@ module RSpec::Support
             ascii_string = 'abc'.force_encoding("ASCII-8BIT")
             other_ascii_string = '123'.force_encoding("ASCII-8BIT")
 
-            resulting_string = build_encoded_string(ascii_string, target_encoding) << other_ascii_string
-            expect(resulting_string.encoding.to_s).to eq 'UTF-8'
+            resulting_string = build_encoded_string(ascii_string, utf8_encoding) << other_ascii_string
+            expected_string = 'abc123'.force_encoding(utf8_encoding)
+            expect(resulting_string).to be_identical_string(expected_string)
           end
         end
       end
 
       describe '#split' do
-        it 'splits the string based on the delimiter accounting for encoding' do
-          wrapped_string = "aaaaaaaaaaa#{ascii_arrow_symbol}aaaaa".force_encoding("ASCII-8BIT")
+        context 'when there is an undefined conversion to the target encoding' do
+          let(:wrapped_string_template) { "abaaaaaaaaaa%saaaaa" }
+          let(:wrapped_string) { sprintf(wrapped_string_template, ascii_arrow_symbol).force_encoding("ASCII-8BIT") }
+
+          it 'normally raises an Encoding::UndefinedConversionError' do
+            expect {
+              wrapped_string.encode(utf8_encoding)
+            }.to raise_error(Encoding::UndefinedConversionError)
+          end
+
+          it 'splits the string based on the delimiter accounting for encoding' do
+            delimiter = "b".force_encoding(utf8_encoding)
+            resulting_string = build_encoded_string(wrapped_string, utf8_encoding).
+              split(delimiter)
+            exp1, exp2 = sprintf(wrapped_string_template, EncodedString::REPLACE).force_encoding(utf8_encoding).split(delimiter)
+            expect(resulting_string).to match [
+              a_string_identical_to(exp1),
+              a_string_identical_to(exp2)
+            ]
+          end
+        end
 
-          expect {
-            build_encoded_string(wrapped_string, target_encoding).split(utf_8_euro_symbol.force_encoding("UTF-8"))
-          }.not_to raise_error
+        # see https://github.com/rspec/rspec-expectations/blob/f8a1232/spec/rspec/expectations/fail_with_spec.rb#L50
+        #     https://github.com/rspec/rspec-expectations/issues/201
+        #     https://github.com/rspec/rspec-expectations/pull/220
+        context 'with a string that cannot be converted to the target encoding' do
+          let(:binary_poop) {'💩' } # [128169] "\u{1F4A9}"
+          let(:non_ascii_compatible_string) { "This is a pile of poo: #{binary_poop}, yuck".encode("UTF-16LE") }
+
+          it 'normally raises an Encoding::CompatibilityError' do
+            expect {
+              non_ascii_compatible_string.split("\n")
+            }.to raise_error(Encoding::CompatibilityError)
+          end
+
+          it 'makes no changes to the resulting string' do
+            resulting_array = build_encoded_string(non_ascii_compatible_string).split("\n")
+            expect(resulting_array).to match [
+              a_string_identical_to(non_ascii_compatible_string)
+            ]
+          end
         end
       end
 
-      def build_encoded_string(string, target_encoding)
+      def build_encoded_string(string, target_encoding = string.encoding)
         EncodedString.new(string, target_encoding)
       end
     else
 
       describe '#source_encoding' do
         it 'defaults to US-ASCII' do
           str = EncodedString.new("abc", "UTF-8")
-          expect( str.source_encoding ).to eq('US-ASCII')
+          expect(str.source_encoding).to eq('US-ASCII')
         end
       end
     end