Skip to content

Commit

Permalink
fix(java): Fix incorrect results of utf16 to utf8 conversion for lati…
Browse files Browse the repository at this point in the history
…n1 but not ascii characters (#1914)

<!--
**Thanks for contributing to Fury.**

**If this is your first time opening a PR on Fury, you can refer to
[CONTRIBUTING.md](https://github.com/apache/fury/blob/main/CONTRIBUTING.md).**

Contribution Checklist

- The **Apache Fury (incubating)** community has restrictions on the
naming of PR titles. You can also find instructions in
[CONTRIBUTING.md](https://github.com/apache/fury/blob/main/CONTRIBUTING.md).

- Fury has a strong focus on performance. If the PR you submit will have
an impact on performance, please benchmark it first and provide the
benchmark result here.
-->

## What does this PR do?

<!-- Describe the purpose of this PR. -->

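Fix incorrect results of UTF-16 to UTF-8 conversion for characters that are
in Latin-1 but not in ASCII (code points 0x80–0xFF). The four-chars-at-a-time
fast path tested the input against the non-Latin-1 mask and then copied each
char as a single byte, which is only valid for ASCII; it now tests against a
non-ASCII mask, so Latin-1 characters fall through to the multi-byte path.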

## Related issues

<!--
Is there any related issue? Please attach here.

- #xxxx0
- #xxxx1
- #xxxx2
-->

## Does this PR introduce any user-facing change?

<!--
If any user-facing interface changes, please [open an
issue](https://github.com/apache/fury/issues/new/choose) describing the
need to do so and update the document if necessary.
-->

- [ ] Does this PR introduce any public API change?
- [ ] Does this PR introduce any binary protocol compatibility change?

## Benchmark

<!--
When the PR has an impact on performance (if you don't know whether the
PR will have an impact on performance, you can submit the PR first, and
if it does have an impact on performance, the code reviewer will explain
it), be sure to attach the benchmark data here.
-->
  • Loading branch information
HuangXingBo authored Oct 27, 2024
1 parent 57a9eae commit e087481
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 31 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
package org.apache.fury.serializer;

import static org.apache.fury.type.TypeUtils.STRING_TYPE;
import static org.apache.fury.util.StringUtils.MULTI_CHARS_NON_ASCII_MASK;
import static org.apache.fury.util.StringUtils.MULTI_CHARS_NON_LATIN_MASK;

import java.lang.invoke.CallSite;
Expand Down Expand Up @@ -387,7 +388,6 @@ public String readUTF8String(MemoryBuffer buffer) {
}

public char[] readCharsLatin1(MemoryBuffer buffer, int numBytes) {
// int utf8AsciiBytes = buffer.readInt32();
buffer.checkReadableBytes(numBytes);
byte[] srcArray = buffer.getHeapMemory();
char[] chars = new char[numBytes];
Expand Down Expand Up @@ -775,33 +775,48 @@ private static byte bestCoder(char[] chars) {
int vectorizedLen = sampleNum >> 2;
int vectorizedChars = vectorizedLen << 2;
int endOffset = Platform.CHAR_ARRAY_OFFSET + (vectorizedChars << 1);
int count = 0;
int asciiCount = 0;
int latin1Count = 0;
for (int offset = Platform.CHAR_ARRAY_OFFSET, charOffset = 0;
offset < endOffset;
offset += 8, charOffset += 4) {
long multiChars = Platform.getLong(chars, offset);
if ((multiChars & MULTI_CHARS_NON_LATIN_MASK) == 0) {
count += 4;
if ((multiChars & MULTI_CHARS_NON_ASCII_MASK) == 0) {
latin1Count += 4;
asciiCount += 4;
} else if ((multiChars & MULTI_CHARS_NON_LATIN_MASK) == 0) {
latin1Count += 4;
for (int i = 0; i < 4; ++i) {
if (chars[charOffset + i] < 0x80) {
asciiCount++;
}
}
} else {
for (int i = 0; i < 4; ++i) {
if (chars[charOffset + i] < 0x80) {
count++;
latin1Count++;
asciiCount++;
} else if (chars[charOffset + i] <= 0xFF) {
latin1Count++;
}
}
}
}

for (int i = vectorizedChars; i < sampleNum; i++) {
if (chars[i] < 0x80) {
count++;
latin1Count++;
asciiCount++;
} else if (chars[i] <= 0xFF) {
latin1Count++;
}
}

// ascii number > 50%, choose UTF-8
if (count >= sampleNum * 0.5) {
if (count == numChars || (count == sampleNum && StringUtils.isLatin(chars, sampleNum))) {
return LATIN1;
}
if (latin1Count == numChars
|| (latin1Count == sampleNum && StringUtils.isLatin(chars, sampleNum))) {
return LATIN1;
} else if (asciiCount >= sampleNum * 0.5) {
// ascii number > 50%, choose UTF-8
return UTF8;
} else {
return UTF16;
Expand All @@ -815,30 +830,28 @@ private static byte bestCoder(byte[] bytes) {
int vectorizedLen = sampleNum >> 3;
int vectorizedBytes = vectorizedLen << 3;
int endOffset = Platform.BYTE_ARRAY_OFFSET + vectorizedBytes;
int count = 0;
int asciiCount = 0;
for (int offset = Platform.BYTE_ARRAY_OFFSET, bytesOffset = 0;
offset < endOffset;
offset += 8, bytesOffset += 8) {
long multiChars = Platform.getLong(bytes, offset);
if ((multiChars & MULTI_CHARS_NON_LATIN_MASK) == 0) {
count += 4;
if ((multiChars & MULTI_CHARS_NON_ASCII_MASK) == 0) {
asciiCount += 4;
} else {
for (int i = Platform.IS_LITTLE_ENDIAN ? 1 : 0; i < 8; i += 2) {
if (bytes[bytesOffset + i] == 0) {
count++;
for (int i = 0; i < 8; i += 2) {
if (Platform.getChar(bytes, offset + i) < 0x80) {
asciiCount++;
}
}
}
}
for (int i = Platform.IS_LITTLE_ENDIAN ? vectorizedBytes + 1 : vectorizedBytes;
i < sampleNum;
++i) {
if (bytes[i] == 0) {
count++;
for (int i = vectorizedBytes; vectorizedBytes < sampleNum; vectorizedBytes += 2) {
if (Platform.getChar(bytes, Platform.BYTE_ARRAY_OFFSET + i) < 0x80) {
asciiCount++;
}
}
// ascii number > 50%, choose UTF-8
if (count >= sampleNum * 0.5) {
if (asciiCount >= sampleNum * 0.5) {
return UTF8;
} else {
return UTF16;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

package org.apache.fury.util;

import static org.apache.fury.util.StringUtils.MULTI_CHARS_NON_LATIN_MASK;
import static org.apache.fury.util.StringUtils.MULTI_CHARS_NON_ASCII_MASK;

import org.apache.fury.memory.Platform;

Expand All @@ -29,20 +29,20 @@ public class StringEncodingUtils {
/** A fast convert algorithm to convert an utf16 char array into an utf8 byte array. */
public static int convertUTF16ToUTF8(char[] src, byte[] dst, int dp) {
int numChars = src.length;
for (int charOffset = 0; charOffset < numChars; ) {
for (int charOffset = 0, arrayOffset = Platform.CHAR_ARRAY_OFFSET; charOffset < numChars; ) {
if (charOffset + 4 <= numChars
&& (Platform.getLong(src, Platform.CHAR_ARRAY_OFFSET + charOffset * 2L)
& MULTI_CHARS_NON_LATIN_MASK)
== 0) {
&& (Platform.getLong(src, arrayOffset) & MULTI_CHARS_NON_ASCII_MASK) == 0) {
// ascii only
dst[dp] = (byte) src[charOffset];
dst[dp + 1] = (byte) src[charOffset + 1];
dst[dp + 2] = (byte) src[charOffset + 2];
dst[dp + 3] = (byte) src[charOffset + 3];
dp += 4;
charOffset += 4;
arrayOffset += 8;
} else {
char c = src[charOffset++];
arrayOffset += 2;
if (c < 0x80) {
dst[dp++] = (byte) c;
} else if (c < 0x800) {
Expand All @@ -53,6 +53,7 @@ public static int convertUTF16ToUTF8(char[] src, byte[] dst, int dp) {
utf8ToChar2(src, charOffset, c, dst, dp);
dp += 4;
charOffset++;
arrayOffset += 2;
} else {
dst[dp] = (byte) (0xe0 | ((c >> 12)));
dst[dp + 1] = (byte) (0x80 | ((c >> 6) & 0x3f));
Expand All @@ -70,7 +71,7 @@ public static int convertUTF16ToUTF8(byte[] src, byte[] dst, int dp) {
for (int offset = 0; offset < numBytes; ) {
if (offset + 8 <= numBytes
&& (Platform.getLong(src, Platform.BYTE_ARRAY_OFFSET + offset)
& MULTI_CHARS_NON_LATIN_MASK)
& MULTI_CHARS_NON_ASCII_MASK)
== 0) {
// ascii only
if (Platform.IS_LITTLE_ENDIAN) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ public class StringUtils {
// A long mask used to clear all-higher bits of char in a super-word way.
public static final long MULTI_CHARS_NON_LATIN_MASK;

public static final long MULTI_CHARS_NON_ASCII_MASK;

private static final char[] BASE16_CHARS2 = {
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'
};
Expand All @@ -37,10 +39,12 @@ public class StringUtils {
// latin chars will be 0xXX,0x00;0xXX,0x00 in byte order;
// Using 0x00,0xff(0xff00) to clear latin bits.
MULTI_CHARS_NON_LATIN_MASK = 0xff00ff00ff00ff00L;
MULTI_CHARS_NON_ASCII_MASK = 0xff80ff80ff80ff80L;
} else {
// latin chars will be 0x00,0xXX;0x00,0xXX in byte order;
// Using 0x00,0xff(0x00ff) to clear latin bits.
MULTI_CHARS_NON_LATIN_MASK = 0x00ff00ff00ff00ffL;
MULTI_CHARS_NON_ASCII_MASK = 0x80ff80ff80ff80ffL;
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
public class StringEncodingUtilsTest extends FuryTestBase {
@Test
public void testUTF8ToUTF16() {
String input = "你好, Fury";
String input = "jbmbmner8 jhk hj \n \t üäßß@µ你好";
byte[] utf8 = input.getBytes(StandardCharsets.UTF_8);
char[] utf16Chars = new char[utf8.length * 2];
int readLen = StringEncodingUtils.convertUTF8ToUTF16(utf8, 0, utf8.length, utf16Chars);
Expand All @@ -43,7 +43,7 @@ public void testUTF8ToUTF16() {

@Test
public void testUTF16ToUTF8() {
String input = "你好, Fury";
String input = "jbmbmner8 jhk hj \n \t üäßß@µ你好";
char[] utf16 = new char[input.length()];
byte[] utf8 = new byte[input.length() * 3];
input.getChars(0, input.length(), utf16, 0);
Expand Down

0 comments on commit e087481

Please sign in to comment.