From 615a00623584adaff2cd17490dab4302b8ac8590 Mon Sep 17 00:00:00 2001 From: M66B Date: Sat, 10 Oct 2020 08:57:03 +0200 Subject: [PATCH] Using juniversalchardet --- ATTRIBUTION.md | 1 + app/build.gradle | 5 ++++ app/src/main/assets/ATTRIBUTION.md | 1 + .../java/eu/faircode/email/CharsetHelper.java | 26 +++++++++++++++++++ .../java/eu/faircode/email/MessageHelper.java | 24 ++++++++++------- 5 files changed, 48 insertions(+), 9 deletions(-) diff --git a/ATTRIBUTION.md b/ATTRIBUTION.md index a4f9b249ec..e425ec1383 100644 --- a/ATTRIBUTION.md +++ b/ATTRIBUTION.md @@ -27,3 +27,4 @@ FairEmail uses: * [GPX file type icon](https://www.flaticon.com/free-icon/gpx-file-format-variant_29258) made by [Freepik](https://www.flaticon.com/authors/freepik) from [Flaticon](https://www.flaticon.com). * [Disconnect's tracker protection lists](https://github.com/disconnectme/disconnect-tracking-protection). Copyright 2010-2020 Disconnect, Inc. [Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International license](https://github.com/disconnectme/disconnect-tracking-protection/blob/master/LICENSE). * [Over-Scroll Support For Android's RecyclerView, ListView, GridView, ScrollView ...](https://github.com/EverythingMe/overscroll-decor). Copyright (c) 2015, DoAT Media Ltd. [BSD-2-Clause License](https://github.com/EverythingMe/overscroll-decor/blob/master/LICENSE) +* [juniversalchardet](https://github.com/albfernandez/juniversalchardet). Copyright (C) 2001 the Initial Developer. All Rights Reserved. [GNU General Public License Version 2](https://github.com/albfernandez/juniversalchardet#license). 
diff --git a/app/build.gradle b/app/build.gradle index 37c298c9a9..f582e52d51 100644 --- a/app/build.gradle +++ b/app/build.gradle @@ -235,6 +235,7 @@ dependencies { def overscroll_version = "1.1.0" def appauth_version = "0.7.1" def jcharset_version = "2.1" + def jchardet_version = "2.3.2" // https://developer.android.com/jetpack/androidx/releases/ @@ -395,4 +396,8 @@ dependencies { // http://www.freeutils.net/source/jcharset/ // https://mvnrepository.com/artifact/net.freeutils/jcharset implementation "net.freeutils:jcharset:$jcharset_version" + + // https://github.com/albfernandez/juniversalchardet + // https://mvnrepository.com/artifact/com.github.albfernandez/juniversalchardet + implementation "com.github.albfernandez:juniversalchardet:$jchardet_version" } diff --git a/app/src/main/assets/ATTRIBUTION.md b/app/src/main/assets/ATTRIBUTION.md index a4f9b249ec..e425ec1383 100644 --- a/app/src/main/assets/ATTRIBUTION.md +++ b/app/src/main/assets/ATTRIBUTION.md @@ -27,3 +27,4 @@ FairEmail uses: * [GPX file type icon](https://www.flaticon.com/free-icon/gpx-file-format-variant_29258) made by [Freepik](https://www.flaticon.com/authors/freepik) from [Flaticon](https://www.flaticon.com). * [Disconnect's tracker protection lists](https://github.com/disconnectme/disconnect-tracking-protection). Copyright 2010-2020 Disconnect, Inc. [Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International license](https://github.com/disconnectme/disconnect-tracking-protection/blob/master/LICENSE). * [Over-Scroll Support For Android's RecyclerView, ListView, GridView, ScrollView ...](https://github.com/EverythingMe/overscroll-decor). Copyright (c) 2015, DoAT Media Ltd. [BSD-2-Clause License](https://github.com/EverythingMe/overscroll-decor/blob/master/LICENSE) +* [juniversalchardet](https://github.com/albfernandez/juniversalchardet). Copyright (C) 2001 the Initial Developer. All Rights Reserved. 
[GNU General Public License Version 2](https://github.com/albfernandez/juniversalchardet#license).
diff --git a/app/src/main/java/eu/faircode/email/CharsetHelper.java b/app/src/main/java/eu/faircode/email/CharsetHelper.java
index cba87dba16..69eb0e3ac2 100644
--- a/app/src/main/java/eu/faircode/email/CharsetHelper.java
+++ b/app/src/main/java/eu/faircode/email/CharsetHelper.java
@@ -19,11 +19,15 @@
     Copyright 2018-2020 by Marcel Bokhorst (M66B)
 */
 
+import org.mozilla.universalchardet.UniversalDetector;
+
 import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
 import java.nio.charset.UnsupportedCharsetException;
 
 class CharsetHelper {
+    private static final int SAMPLE_SIZE = 2 * 1024;
+
     static boolean isUTF8(String text) {
         // Get extended ASCII characters
         byte[] octets = text.getBytes(StandardCharsets.ISO_8859_1);
@@ -115,4 +119,26 @@ else if (c == '$')
 
         return false;
     }
+
+    /**
+     * Detects the charset of the octets obtained by encoding the text as ISO-8859-1.
+     * A detector is created for each call, so calls never share detector state.
+     * @param text the text to examine, at most the first SAMPLE_SIZE octets are used
+     * @return the detected charset, or null if none was detected or detection failed
+     */
+    static Charset detect(String text) {
+        try {
+            byte[] sample = text.getBytes(StandardCharsets.ISO_8859_1);
+
+            UniversalDetector detector = new UniversalDetector();
+            detector.handleData(sample, 0, Math.min(SAMPLE_SIZE, sample.length));
+            detector.dataEnd();
+
+            String detected = detector.getDetectedCharset();
+            return (detected == null ? null : Charset.forName(detected));
+        } catch (Throwable ex) {
+            Log.w(ex);
+            return null;
+        }
+    }
 }
diff --git a/app/src/main/java/eu/faircode/email/MessageHelper.java b/app/src/main/java/eu/faircode/email/MessageHelper.java
index fae44ad1b8..5a8ec9a4f0 100644
--- a/app/src/main/java/eu/faircode/email/MessageHelper.java
+++ b/app/src/main/java/eu/faircode/email/MessageHelper.java
@@ -1729,22 +1729,28 @@ else if (content instanceof InputStream)
                 if (UnknownCharsetProvider.charsetForMime(charset) == null)
                     warnings.add(context.getString(R.string.title_no_charset, charset));
 
+                if ((TextUtils.isEmpty(charset) || charset.equalsIgnoreCase(StandardCharsets.US_ASCII.name())))
+                    charset = null;
+ if (part.isMimeType("text/plain")) { - if (TextUtils.isEmpty(charset) && CharsetHelper.isISO2022JP(result)) - result = new String(result.getBytes(StandardCharsets.ISO_8859_1), "ISO-2022-JP"); - else if ((TextUtils.isEmpty(charset) || charset.equalsIgnoreCase(StandardCharsets.US_ASCII.name())) && - CharsetHelper.isUTF8(result)) { - Log.i("Charset plain=UTF8"); - result = new String(result.getBytes(StandardCharsets.ISO_8859_1), StandardCharsets.UTF_8); + if (charset == null) { + Charset detected = CharsetHelper.detect(result); + if (detected == null) { + if (CharsetHelper.isUTF8(result)) { + Log.i("Charset plain=UTF8"); + result = new String(result.getBytes(StandardCharsets.ISO_8859_1), StandardCharsets.UTF_8); + } + } else { + Log.i("Charset plain=" + detected.name()); + result = new String(result.getBytes(StandardCharsets.ISO_8859_1), detected); + } } if ("flowed".equalsIgnoreCase(ct.getParameter("format"))) result = HtmlHelper.flow(result); result = "
" + HtmlHelper.formatPre(result) + "
"; } else if (part.isMimeType("text/html")) { - if (TextUtils.isEmpty(charset) && CharsetHelper.isISO2022JP(result)) - result = new String(result.getBytes(StandardCharsets.ISO_8859_1), "ISO-2022-JP"); - else if (TextUtils.isEmpty(charset)) { + if (charset == null) { // // String excerpt = result.substring(0, Math.min(MAX_META_EXCERPT, result.length()));