@@ -4,7 +4,6 @@ module Support
4
4
class EncodedString
5
5
# Ruby's default replacement string is U+FFFD ("\xEF\xBF\xBD") for Unicode encoding forms;
6
6
# for all other encodings it is '?' ("\x3F")
7
- MRI_UNICODE_UNKOWN_CHARACTER = "\xEF \xBF \xBD "
8
7
REPLACE = "\x3F "
9
8
10
9
def initialize ( string , encoding = nil )
@@ -36,6 +35,24 @@ def to_s
36
35
37
36
private
38
37
38
+ ENCODING_STRATEGY = {
39
+ :bad_bytes => {
40
+ :invalid => :replace ,
41
+ # :undef => :nil,
42
+ :replace => REPLACE
43
+ } ,
44
+ :cannot_convert => {
45
+ # :invalid => :nil,
46
+ :undef => :replace ,
47
+ :replace => REPLACE
48
+ } ,
49
+ :no_converter => {
50
+ :invalid => :replace ,
51
+ # :undef => :nil,
52
+ :replace => REPLACE
53
+ }
54
+ }
55
+
39
56
# Raised by Encoding and String methods:
40
57
# Encoding::UndefinedConversionError:
41
58
# when a transcoding operation fails
@@ -51,20 +68,19 @@ def to_s
51
68
# Encoding::CompatibilityError
52
69
#
53
70
def matching_encoding ( string )
54
- string . encode ( @encoding )
55
- rescue Encoding ::UndefinedConversionError , Encoding ::InvalidByteSequenceError
56
- normalize_missing ( string . encode ( @encoding , :invalid => :replace , :undef => :replace ) )
71
+ # Converting it to a higher character set (UTF-16) and then back (to @encoding)
72
+ # replaces invalid byte sequences with REPLACE instead of raising
73
+ # => no need to rescue Encoding::InvalidByteSequenceError, ArgumentError
74
+ string . encode ( ::Encoding ::UTF_16LE , ENCODING_STRATEGY [ :bad_bytes ] ) .
75
+ encode ( @encoding )
76
+ rescue Encoding ::UndefinedConversionError , Encoding ::CompatibilityError
77
+ string . encode ( @encoding , ENCODING_STRATEGY [ :cannot_convert ] )
78
+ # Begin: Needed for 1.9.2
57
79
rescue Encoding ::ConverterNotFoundError
58
- normalize_missing ( string . force_encoding ( @encoding ) . encode ( :invalid => :replace ) )
80
+ string . force_encoding ( @encoding ) . encode ( ENCODING_STRATEGY [ :no_converter ] )
59
81
end
82
+ # End: Needed for 1.9.2
60
83
61
- def normalize_missing ( string )
62
- if @encoding . to_s == "UTF-8"
63
- string . gsub ( MRI_UNICODE_UNKOWN_CHARACTER . force_encoding ( @encoding ) , REPLACE )
64
- else
65
- string
66
- end
67
- end
68
84
69
85
def detect_source_encoding ( string )
70
86
string . encoding
0 commit comments