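fix(java): Fix incorrect results of UTF-16 to UTF-8 conversion for Latin-1 but non-ASCII characters (#1914)

## What does this PR do?

Fix incorrect results of UTF-16 to UTF-8 conversion for characters that are Latin-1 but not ASCII. The "ascii only" fast path previously tested four chars at a time against the non-Latin mask, so chars in the range 0x80-0xFF were copied as single bytes instead of being encoded as two-byte UTF-8 sequences; the fast path now uses a dedicated non-ASCII mask.

## Related issues

## Does this PR introduce any user-facing change?

- [ ] Does this PR introduce any public API change?
- [ ] Does this PR introduce any binary protocol compatibility change?

## Benchmark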
apache · Oct 27, 2024 · e087481 · e087481
1 parent 57a9eae
commit e087481
Show file tree

Hide file tree

Showing 4 changed files with 49 additions and 31 deletions.
diff --git a/java/fury-core/src/main/java/org/apache/fury/serializer/StringSerializer.java b/java/fury-core/src/main/java/org/apache/fury/serializer/StringSerializer.java
@@ -20,6 +20,7 @@
 package org.apache.fury.serializer;
 
 import static org.apache.fury.type.TypeUtils.STRING_TYPE;
+import static org.apache.fury.util.StringUtils.MULTI_CHARS_NON_ASCII_MASK;
 import static org.apache.fury.util.StringUtils.MULTI_CHARS_NON_LATIN_MASK;
 
 import java.lang.invoke.CallSite;
@@ -387,7 +388,6 @@ public String readUTF8String(MemoryBuffer buffer) {
   }
 
   public char[] readCharsLatin1(MemoryBuffer buffer, int numBytes) {
-    //    int utf8AsciiBytes = buffer.readInt32();
     buffer.checkReadableBytes(numBytes);
     byte[] srcArray = buffer.getHeapMemory();
     char[] chars = new char[numBytes];
@@ -775,33 +775,48 @@ private static byte bestCoder(char[] chars) {
     int vectorizedLen = sampleNum >> 2;
     int vectorizedChars = vectorizedLen << 2;
     int endOffset = Platform.CHAR_ARRAY_OFFSET + (vectorizedChars << 1);
-    int count = 0;
+    int asciiCount = 0;
+    int latin1Count = 0;
     for (int offset = Platform.CHAR_ARRAY_OFFSET, charOffset = 0;
         offset < endOffset;
         offset += 8, charOffset += 4) {
       long multiChars = Platform.getLong(chars, offset);
-      if ((multiChars & MULTI_CHARS_NON_LATIN_MASK) == 0) {
-        count += 4;
+      if ((multiChars & MULTI_CHARS_NON_ASCII_MASK) == 0) {
+        latin1Count += 4;
+        asciiCount += 4;
+      } else if ((multiChars & MULTI_CHARS_NON_LATIN_MASK) == 0) {
+        latin1Count += 4;
+        for (int i = 0; i < 4; ++i) {
+          if (chars[charOffset + i] < 0x80) {
+            asciiCount++;
+          }
+        }
       } else {
         for (int i = 0; i < 4; ++i) {
           if (chars[charOffset + i] < 0x80) {
-            count++;
+            latin1Count++;
+            asciiCount++;
+          } else if (chars[charOffset + i] <= 0xFF) {
+            latin1Count++;
           }
         }
       }
     }
 
     for (int i = vectorizedChars; i < sampleNum; i++) {
       if (chars[i] < 0x80) {
-        count++;
+        latin1Count++;
+        asciiCount++;
+      } else if (chars[i] <= 0xFF) {
+        latin1Count++;
       }
     }
 
-    // ascii number > 50%, choose UTF-8
-    if (count >= sampleNum * 0.5) {
-      if (count == numChars || (count == sampleNum && StringUtils.isLatin(chars, sampleNum))) {
-        return LATIN1;
-      }
+    if (latin1Count == numChars
+        || (latin1Count == sampleNum && StringUtils.isLatin(chars, sampleNum))) {
+      return LATIN1;
+    } else if (asciiCount >= sampleNum * 0.5) {
+      // ascii number > 50%, choose UTF-8
       return UTF8;
     } else {
       return UTF16;
@@ -815,30 +830,28 @@ private static byte bestCoder(byte[] bytes) {
     int vectorizedLen = sampleNum >> 3;
     int vectorizedBytes = vectorizedLen << 3;
     int endOffset = Platform.BYTE_ARRAY_OFFSET + vectorizedBytes;
-    int count = 0;
+    int asciiCount = 0;
     for (int offset = Platform.BYTE_ARRAY_OFFSET, bytesOffset = 0;
         offset < endOffset;
         offset += 8, bytesOffset += 8) {
       long multiChars = Platform.getLong(bytes, offset);
-      if ((multiChars & MULTI_CHARS_NON_LATIN_MASK) == 0) {
-        count += 4;
+      if ((multiChars & MULTI_CHARS_NON_ASCII_MASK) == 0) {
+        asciiCount += 4;
       } else {
-        for (int i = Platform.IS_LITTLE_ENDIAN ? 1 : 0; i < 8; i += 2) {
-          if (bytes[bytesOffset + i] == 0) {
-            count++;
+        for (int i = 0; i < 8; i += 2) {
+          if (Platform.getChar(bytes, offset + i) < 0x80) {
+            asciiCount++;
           }
         }
       }
     }
-    for (int i = Platform.IS_LITTLE_ENDIAN ? vectorizedBytes + 1 : vectorizedBytes;
-        i < sampleNum;
-        ++i) {
-      if (bytes[i] == 0) {
-        count++;
+    for (int i = vectorizedBytes; vectorizedBytes < sampleNum; vectorizedBytes += 2) {
+      if (Platform.getChar(bytes, Platform.BYTE_ARRAY_OFFSET + i) < 0x80) {
+        asciiCount++;
       }
     }
     // ascii number > 50%, choose UTF-8
-    if (count >= sampleNum * 0.5) {
+    if (asciiCount >= sampleNum * 0.5) {
       return UTF8;
     } else {
       return UTF16;

diff --git a/java/fury-core/src/main/java/org/apache/fury/util/StringEncodingUtils.java b/java/fury-core/src/main/java/org/apache/fury/util/StringEncodingUtils.java
@@ -19,7 +19,7 @@
 
 package org.apache.fury.util;
 
-import static org.apache.fury.util.StringUtils.MULTI_CHARS_NON_LATIN_MASK;
+import static org.apache.fury.util.StringUtils.MULTI_CHARS_NON_ASCII_MASK;
 
 import org.apache.fury.memory.Platform;
 
@@ -29,20 +29,20 @@ public class StringEncodingUtils {
   /** A fast convert algorithm to convert an utf16 char array into an utf8 byte array. */
   public static int convertUTF16ToUTF8(char[] src, byte[] dst, int dp) {
     int numChars = src.length;
-    for (int charOffset = 0; charOffset < numChars; ) {
+    for (int charOffset = 0, arrayOffset = Platform.CHAR_ARRAY_OFFSET; charOffset < numChars; ) {
       if (charOffset + 4 <= numChars
-          && (Platform.getLong(src, Platform.CHAR_ARRAY_OFFSET + charOffset * 2L)
-                  & MULTI_CHARS_NON_LATIN_MASK)
-              == 0) {
+          && (Platform.getLong(src, arrayOffset) & MULTI_CHARS_NON_ASCII_MASK) == 0) {
         // ascii only
         dst[dp] = (byte) src[charOffset];
         dst[dp + 1] = (byte) src[charOffset + 1];
         dst[dp + 2] = (byte) src[charOffset + 2];
         dst[dp + 3] = (byte) src[charOffset + 3];
         dp += 4;
         charOffset += 4;
+        arrayOffset += 8;
       } else {
         char c = src[charOffset++];
+        arrayOffset += 2;
         if (c < 0x80) {
           dst[dp++] = (byte) c;
         } else if (c < 0x800) {
@@ -53,6 +53,7 @@ public static int convertUTF16ToUTF8(char[] src, byte[] dst, int dp) {
           utf8ToChar2(src, charOffset, c, dst, dp);
           dp += 4;
           charOffset++;
+          arrayOffset += 2;
         } else {
           dst[dp] = (byte) (0xe0 | ((c >> 12)));
           dst[dp + 1] = (byte) (0x80 | ((c >> 6) & 0x3f));
@@ -70,7 +71,7 @@ public static int convertUTF16ToUTF8(byte[] src, byte[] dst, int dp) {
     for (int offset = 0; offset < numBytes; ) {
       if (offset + 8 <= numBytes
           && (Platform.getLong(src, Platform.BYTE_ARRAY_OFFSET + offset)
-                  & MULTI_CHARS_NON_LATIN_MASK)
+                  & MULTI_CHARS_NON_ASCII_MASK)
               == 0) {
         // ascii only
         if (Platform.IS_LITTLE_ENDIAN) {

diff --git a/java/fury-core/src/main/java/org/apache/fury/util/StringUtils.java b/java/fury-core/src/main/java/org/apache/fury/util/StringUtils.java
@@ -28,6 +28,8 @@ public class StringUtils {
   // A long mask used to clear all-higher bits of char in a super-word way.
   public static final long MULTI_CHARS_NON_LATIN_MASK;
 
+  public static final long MULTI_CHARS_NON_ASCII_MASK;
+
   private static final char[] BASE16_CHARS2 = {
     '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'
   };
@@ -37,10 +39,12 @@ public class StringUtils {
       // latin chars will be 0xXX,0x00;0xXX,0x00 in byte order;
       // Using 0x00,0xff(0xff00) to clear latin bits.
       MULTI_CHARS_NON_LATIN_MASK = 0xff00ff00ff00ff00L;
+      MULTI_CHARS_NON_ASCII_MASK = 0xff80ff80ff80ff80L;
     } else {
       // latin chars will be 0x00,0xXX;0x00,0xXX in byte order;
       // Using 0x00,0xff(0x00ff) to clear latin bits.
       MULTI_CHARS_NON_LATIN_MASK = 0x00ff00ff00ff00ffL;
+      MULTI_CHARS_NON_ASCII_MASK = 0x80ff80ff80ff80ffL;
     }
   }
 

diff --git a/java/fury-core/src/test/java/org/apache/fury/util/StringEncodingUtilsTest.java b/java/fury-core/src/test/java/org/apache/fury/util/StringEncodingUtilsTest.java
@@ -28,7 +28,7 @@
 public class StringEncodingUtilsTest extends FuryTestBase {
   @Test
   public void testUTF8ToUTF16() {
-    String input = "你好, Fury";
+    String input = "jbmbmner8 jhk hj \n \t üäßß@µ你好";
     byte[] utf8 = input.getBytes(StandardCharsets.UTF_8);
     char[] utf16Chars = new char[utf8.length * 2];
     int readLen = StringEncodingUtils.convertUTF8ToUTF16(utf8, 0, utf8.length, utf16Chars);
@@ -43,7 +43,7 @@ public void testUTF8ToUTF16() {
 
   @Test
   public void testUTF16ToUTF8() {
-    String input = "你好, Fury";
+    String input = "jbmbmner8 jhk hj \n \t üäßß@µ你好";
     char[] utf16 = new char[input.length()];
     byte[] utf8 = new byte[input.length() * 3];
     input.getChars(0, input.length(), utf16, 0);