@@ -3,16 +3,32 @@ module Support
3
3
# @private
4
4
class EncodedString
5
5
# Reduce allocations by storing constants.
# Each constant is frozen so these shared objects cannot be mutated in place.
UTF_8 = "UTF-8".freeze
US_ASCII = "US-ASCII".freeze
#
# In MRI 2.1 'invalid: :replace' changed to also replace an invalid byte sequence
# see https://github.com/ruby/ruby/blob/v2_1_0/NEWS#L176
#     https://www.ruby-forum.com/topic/6861247
#     https://twitter.com/nalsh/status/553413844685438976
#
# For example, given:
#   "\x80".force_encoding("Emacs-Mule").encode(:invalid => :replace).bytes.to_a
#
# On MRI 2.1 or above: 63  # '?'
# else               : 128 # "\x80"
#
# Ruby's default replacement string is:
#   U+FFFD ("\xEF\xBF\xBD"), for Unicode encoding forms, else
#   ?      ("\x3F")
#
# REPLACE is passed explicitly as the :replace option below so the same
# replacement character is used whatever the target encoding is.
REPLACE = "?".freeze
# String#encode options: substitute REPLACE for invalid byte sequences and
# for characters that are undefined in the target encoding.
ENCODE_UNCONVERTABLE_BYTES = {
  :invalid => :replace,
  :undef   => :replace,
  :replace => REPLACE
}.freeze
# String#encode options: substitute REPLACE for invalid byte sequences only.
ENCODE_NO_CONVERTER = {
  :invalid => :replace,
  :replace => REPLACE
}.freeze
17
33
18
34
def initialize ( string , encoding = nil )
@@ -64,13 +80,13 @@ def to_s
64
80
# vs "\x80".encode('UTF-8','US-ASCII', invalid: :replace, replace: '<byte>')
65
81
# # => '<byte>'
66
82
# ArgumentError
67
- # when operating on a string with invalid bytes
68
- # e.g."\xEF ".split("\n")
83
+ # when operating on a string with invalid bytes
84
+ # e.g."\x80 ".split("\n")
69
85
# TypeError
70
- # when a symbol is passed as an encoding
71
- # Encoding.find(:"utf -8")
72
- # when calling force_encoding on an object
73
- # that doesn't respond to #to_str
86
+ # when a symbol is passed as an encoding
87
+ # Encoding.find(:"UTF -8")
88
+ # when calling force_encoding on an object
89
+ # that doesn't respond to #to_str
74
90
#
75
91
# Raised by transcoding methods:
76
92
# Encoding::ConverterNotFoundError:
@@ -80,25 +96,38 @@ def to_s
80
96
# e.g. "\x80".force_encoding('ASCII-8BIT').encode('Emacs-Mule')
81
97
#
82
98
# Raised by byte <-> char conversions
83
- # RangeError: out of char range
84
- # e.g. the UTF-16LE emoji: 128169.chr
99
+ # RangeError: out of char range
100
+ # e.g. the UTF-16LE emoji: 128169.chr
85
101
# Returns +string+ transcoded to @encoding without raising on bad input.
#
# The string is first passed through remove_invalid_bytes and then encoded.
# If that raises because a character is undefined in @encoding or a byte
# sequence is invalid, the string is re-encoded with REPLACE substituted
# for the offending parts. If no converter exists between the two
# encodings, a copy is relabelled as @encoding and re-encoded with invalid
# sequences replaced by REPLACE.
def matching_encoding(string)
  string = remove_invalid_bytes(string)
  string.encode(@encoding)
rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
  # The options are written inline rather than passed as a Hash constant:
  # a positional Hash is no longer treated as keyword options on Ruby 3
  # (and warns on 2.7), which would make this fallback raise as well.
  string.encode(@encoding, :invalid => :replace, :undef => :replace, :replace => REPLACE)
rescue Encoding::ConverterNotFoundError
  # Inline options for the same reason as above.
  string.dup.force_encoding(@encoding).encode(:invalid => :replace, :replace => REPLACE)
end
92
109
93
- # Ruby's default replacement string is:
94
- # for Unicode encoding forms: U+FFFD ("\xEF\xBF\xBD")
95
- MRI_UNICODE_UNKOWN_CHARACTER = "\xEF \xBF \xBD " . force_encoding ( UTF_8 )
96
-
97
- def normalize_missing ( string )
98
- if @encoding . to_s == UTF_8
99
- string . gsub ( MRI_UNICODE_UNKOWN_CHARACTER , REPLACE )
100
- else
110
# Replaces every invalid byte sequence in +string+ with REPLACE.
# Prevents ArgumentError being raised by later operations on the string.
#
# Uses String#scrub where the running Ruby provides it (added in Ruby 2.1)
# see https://github.com/hsbt/string-scrub
#     https://github.com/ruby/ruby/blob/eeb05e8c11/doc/NEWS-2.1.0#L120-L123
#     https://speakerdeck.com/samsaffron/why-ruby-2-dot-1-excites-me?slide=48
#
# Where String#scrub is missing, the string is rebuilt character by
# character, swapping each character that is not valid in its encoding
# for REPLACE.
#
# Validity is judged in the string's own encoding. Round-tripping the bytes
# through UTF-8/UTF-16 and relabelling them afterwards does not guarantee
# that, e.g. the bytes "\xC3\xA9" tagged US-ASCII would come back unchanged.
#
# @param string [String] the string to clean
# @return [String] +string+ itself when it is already valid, otherwise a
#   new String; the argument is never modified
def remove_invalid_bytes(string)
  return string if string.valid_encoding?
  return string.scrub(REPLACE) if string.respond_to?(:scrub)

  string.chars.map { |char| char.valid_encoding? ? char : REPLACE }.join
end
104
133
0 commit comments