Skip to content

Commit 8e15ff5

Browse files
kjgjeremy
authored and committed
Transcoding replaces invalid chars with "�" instead of discarding them
Make it clear that the source string had invalid characters instead of silently dropping them. "�" (U+FFFD) is the Unicode replacement character. This is only supported on Ruby 1.9+; Ruby 1.8 continues to discard unrecognized and invalid characters.
1 parent 646a234 commit 8e15ff5

File tree

4 files changed

+18
-14
lines changed

4 files changed

+18
-14
lines changed

CHANGELOG.rdoc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ Features:
44
* #853 - `Mail::Message#set_sort_order` overrides the default message part sort order. (rafbm)
55
* #650 - UTF-7 charset support. (johngrimes)
66
* #1065 - Require STARTTLS using :enable_starttls. (bk2204)
7+
* #1002 - Transcoding replaces invalid chars with "�" instead of discarding them. (kjg)
78

89
Performance:
910
* #1059 - Switch from mime-types to mini_mime for a much smaller memory footprint. (SamSaffron)

lib/mail/version_specific/ruby_1_9.rb

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,9 @@ def Ruby19.get_constant(klass, string)
8888
end
8989

9090
def Ruby19.transcode_charset(str, from_encoding, to_encoding = Encoding::UTF_8)
91-
charset_encoder.encode(str.dup, from_encoding).encode(to_encoding, :undef => :replace, :invalid => :replace, :replace => '')
91+
to_encoding = Encoding.find(to_encoding)
92+
replacement_char = to_encoding == Encoding::UTF_8 ? '�' : '?'
93+
charset_encoder.encode(str.dup, from_encoding).encode(to_encoding, :undef => :replace, :invalid => :replace, :replace => replacement_char)
9294
end
9395

9496
# From Ruby stdlib Net::IMAP
@@ -125,8 +127,7 @@ def Ruby19.b_value_decode(str)
125127
str = Ruby19.decode_base64(match[2])
126128
str = charset_encoder.encode(str, charset)
127129
end
128-
decoded = str.encode(Encoding::UTF_8, :undef => :replace, :invalid => :replace, :replace => "")
129-
decoded.valid_encoding? ? decoded : decoded.encode(Encoding::UTF_16LE, :invalid => :replace, :replace => "").encode(Encoding::UTF_8)
130+
transcode_to_scrubbed_utf8(str)
130131
rescue Encoding::UndefinedConversionError, ArgumentError, Encoding::ConverterNotFoundError
131132
warn "Encoding conversion failed #{$!}"
132133
str.dup.force_encoding(Encoding::UTF_8)
@@ -150,8 +151,7 @@ def Ruby19.q_value_decode(str)
150151
# jruby/jruby#829 which subtly changes String#encode semantics.
151152
str.force_encoding(Encoding::UTF_8) if str.encoding == Encoding::ASCII_8BIT
152153
end
153-
decoded = str.encode(Encoding::UTF_8, :invalid => :replace, :replace => "")
154-
decoded.valid_encoding? ? decoded : decoded.encode(Encoding::UTF_16LE, :invalid => :replace, :replace => "").encode(Encoding::UTF_8)
154+
transcode_to_scrubbed_utf8(str)
155155
rescue Encoding::UndefinedConversionError, ArgumentError, Encoding::ConverterNotFoundError
156156
warn "Encoding conversion failed #{$!}"
157157
str.dup.force_encoding(Encoding::UTF_8)
@@ -253,6 +253,11 @@ def convert_to_encoding(encoding)
253253
end
254254
end
255255
end
256+
257+
def transcode_to_scrubbed_utf8(str)
258+
decoded = str.encode(Encoding::UTF_8, :undef => :replace, :invalid => :replace, :replace => "�")
259+
decoded.valid_encoding? ? decoded : decoded.encode(Encoding::UTF_16LE, :invalid => :replace, :replace => "�").encode(Encoding::UTF_8)
260+
end
256261
end
257262
end
258263
end

spec/mail/encoding_spec.rb

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -186,20 +186,18 @@
186186
expect(mail.parts[0].content_type).to eq "text/html; charset=ISO-8859-1"
187187
end
188188

189-
it "should skip invalid characters" do
189+
it "should replace invalid characters" do
190190
m = Mail.new
191191
m['Subject'] = Mail::SubjectField.new("=?utf-8?Q?Hello_=96_World?=")
192-
if RUBY_VERSION > '1.9'
193-
expect { expect(m.subject).to be_valid_encoding }.not_to raise_error
194-
else
195-
expect(m.subject).to eq "Hello World"
196-
end
192+
replace = '�' if RUBY_VERSION > '1.9'
193+
expect(m.subject).to eq "Hello #{replace} World"
197194
end
198195

199-
it "should skip characters of unknown and invalid encoding" do
196+
it "should replace characters of unknown and invalid encoding" do
200197
m = Mail.new
201198
m['Subject'] = Mail::SubjectField.new("Hello=?UNKNOWN?B?4g==?=")
202-
expect(m.subject).to eq "Hello"
199+
replace = '�' if RUBY_VERSION > '1.9'
200+
expect(m.subject).to eq "Hello#{replace}"
203201
end
204202

205203
if RUBY_VERSION > '1.9'

spec/mail/encodings_spec.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -316,7 +316,7 @@
316316

317317
it "should treat unrecognized charsets as binary" do
318318
if RUBY_VERSION >= "1.9"
319-
expect(Mail::Encodings.value_decode("=?ISO-FOOO?Q?Morten_R=F8verdatt=E9r?=")).to eq "Morten Rverdattr"
319+
expect(Mail::Encodings.value_decode("=?ISO-FOOO?Q?Morten_R=F8verdatt=E9r?=")).to eq "Morten R�verdatt�r"
320320
end
321321
end
322322
end

0 commit comments

Comments
 (0)