Skip to content

Commit c385ac5

Browse files
committed
[GR-55296] TruffleStrings: add support for non-native endian UTF-16 and UTF-32 without JCodings.
PullRequest: graal/19471
2 parents a3b1bc3 + 4d3a03a commit c385ac5

File tree

19 files changed

+1025
-232
lines changed

19 files changed

+1025
-232
lines changed

truffle/CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ This changelog summarizes major changes between Truffle versions relevant to lan
2222
`Value.fromByteBasedString(...)` `Value.fromNativeString(...)`. A `Value.StringEncoding` must be provided.
2323
* GR-55296 Added support to convert any string to a `byte[]` with a given `Value.StringEncoding` using `Value.asStringBytes(...)`.
2424
* GR-40323 Deprecated `Shape.Builder.layout(Class)` for removal and added replacement API [`Shape.Builder.layout(Class, MethodHandles.Lookup)`](https://www.graalvm.org/truffle/javadoc/com/oracle/truffle/api/object/Shape.Builder.html#layout(java.lang.Class,java.lang.MethodHandles.Lookup)). Replace usages with the new method, additionally providing a `Lookup` that has full privilege access to the layout class or the module in which it is declared, as obtained by `MethodHandles.lookup()`. See javadoc for the updated usage.
25+
* GR-55296 Added support for non-native-endian UTF-16 and UTF-32 in TruffleString without a dependency on the JCodings library.
2526

2627

2728
* GR-54760 `RootNode.translateStackTraceElement()` is now always consulted for polyglot and debugger stack traces. Stack traces now use the source section, the executable name, and the name of the declared meta-object to build `StackTraceElement` instances.

truffle/src/com.oracle.truffle.api.strings.test/src/com/oracle/truffle/api/strings/test/Encodings.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -246,10 +246,10 @@ public static void main(String[] args) {
246246

247247
if (e == TruffleString.Encoding.UTF_16BE) {
248248
encodedBroken = asBytes(new int[]{Character.MIN_LOW_SURROGATE}, 1, ByteOrder.BIG_ENDIAN);
249-
codepointsBroken = new int[]{0xfffd};
249+
codepointsBroken = new int[]{Character.MIN_LOW_SURROGATE};
250250
} else if (e == TruffleString.Encoding.UTF_32BE) {
251251
encodedBroken = asBytes(new int[]{Character.MIN_LOW_SURROGATE}, 2, ByteOrder.BIG_ENDIAN);
252-
codepointsBroken = new int[]{0xfffd};
252+
codepointsBroken = new int[]{Character.MIN_LOW_SURROGATE};
253253
}
254254

255255
testData[e.ordinal()] = new TestData(
@@ -454,7 +454,7 @@ static TestData dataUTF32BE() {
454454
return new TestData(
455455
new int[]{0x000000, 0x00d7ff, 0x00e000, 0x10ffff}, new int[]{0x0, 0x4, 0x8, 0xc}, null, null, null, new byte[]{(byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0x00,
456456
(byte) 0x00, (byte) 0xd7, (byte) 0xff, (byte) 0x00, (byte) 0x00, (byte) 0xe0, (byte) 0x00, (byte) 0x00, (byte) 0x10, (byte) 0xff, (byte) 0xff},
457-
new byte[]{(byte) 0x00, (byte) 0x00, (byte) 0xdc, (byte) 0x00}, new int[]{0xfffd});
457+
new byte[]{(byte) 0x00, (byte) 0x00, (byte) 0xdc, (byte) 0x00}, new int[]{0xdc00});
458458
}
459459

460460
static TestData dataUTF16LE() {
@@ -471,7 +471,7 @@ static TestData dataUTF16BE() {
471471
assert TruffleString.Encoding.UTF_16BE.ordinal() == 3;
472472
return new TestData(new int[]{0x000000, 0x00d7ff, 0x00e000, 0x10ffff}, new int[]{0x0, 0x2, 0x4, 0x6}, null, null, null,
473473
new byte[]{(byte) 0x00, (byte) 0x00, (byte) 0xd7, (byte) 0xff, (byte) 0xe0, (byte) 0x00, (byte) 0xdb, (byte) 0xff, (byte) 0xdf, (byte) 0xff},
474-
new byte[]{(byte) 0xdc, (byte) 0x00}, new int[]{0xfffd});
474+
new byte[]{(byte) 0xdc, (byte) 0x00}, new int[]{0xdc00});
475475
}
476476

477477
static TestData dataISO88591() {

truffle/src/com.oracle.truffle.api.strings.test/src/com/oracle/truffle/api/strings/test/TStringJCodingsDisabledTest.java

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,10 @@
4141

4242
package com.oracle.truffle.api.strings.test;
4343

44+
import static com.oracle.truffle.api.strings.test.TStringTestBase.forAllStrings;
45+
4446
import java.util.Arrays;
47+
import java.util.EnumSet;
4548

4649
import org.graalvm.polyglot.Context;
4750
import org.junit.AfterClass;
@@ -60,6 +63,7 @@
6063
import com.oracle.truffle.api.nodes.Node;
6164
import com.oracle.truffle.api.strings.MutableTruffleString;
6265
import com.oracle.truffle.api.strings.TruffleString;
66+
import com.oracle.truffle.api.strings.test.ops.TStringSwitchEncodingTest;
6367

6468
@RunWith(Parameterized.class)
6569
public class TStringJCodingsDisabledTest {
@@ -118,7 +122,7 @@ public static NoJCodingsDummyLanguageContext get(Node node) {
118122

119123
@BeforeClass
120124
public static void setUp() {
121-
context = Context.newBuilder(TStringTestNoJCodingsDummyLanguage.ID).build();
125+
context = Context.newBuilder(TStringTestNoJCodingsDummyLanguage.ID).allowAllAccess(true).build();
122126
context.enter();
123127
Assert.assertNotNull(TruffleString.Encoding.UTF_7.getEmpty());
124128
boolean jcodingsEnabled;
@@ -139,12 +143,20 @@ public static void tearDown() {
139143

140144
@Parameter public TruffleString.FromByteArrayNode node;
141145
@Parameter(1) public MutableTruffleString.FromByteArrayNode nodeMutable;
146+
@Parameter(2) public TruffleString.SwitchEncodingNode switchEncodingNode;
147+
@Parameter(3) public MutableTruffleString.SwitchEncodingNode switchEncodingNodeMutable;
142148

143149
@Parameters(name = "{0}, {1}")
144150
public static Iterable<Object[]> data() {
145151
return Arrays.asList(
146-
new Object[]{TruffleString.FromByteArrayNode.create(), MutableTruffleString.FromByteArrayNode.create()},
147-
new Object[]{TruffleString.FromByteArrayNode.getUncached(), MutableTruffleString.FromByteArrayNode.getUncached()});
152+
new Object[]{TruffleString.FromByteArrayNode.create(),
153+
MutableTruffleString.FromByteArrayNode.create(),
154+
TruffleString.SwitchEncodingNode.create(),
155+
MutableTruffleString.SwitchEncodingNode.create()},
156+
new Object[]{TruffleString.FromByteArrayNode.getUncached(),
157+
MutableTruffleString.FromByteArrayNode.getUncached(),
158+
TruffleString.SwitchEncodingNode.getUncached(),
159+
MutableTruffleString.SwitchEncodingNode.getUncached()});
148160
}
149161

150162
@Test(expected = AssertionError.class)
@@ -166,4 +178,21 @@ public void testMutableTruffleStringCopy() {
166178
public void testMutableTruffleStringDirect() {
167179
nodeMutable.execute(new byte[1], 0, 1, TruffleString.Encoding.Big5, false);
168180
}
181+
182+
// Exercises switchEncodingNode and switchEncodingNodeMutable between every pair of the encodings
// listed below, which include both byte orders of UTF-16 and UTF-32.
// NOTE(review): given the enclosing class name, these are presumably the encodings expected to be
// transcodable with JCodings disabled - confirm against TruffleString's supported-encoding list.
@Test
183+
public void testSwitchEncodingSupported() throws Exception {
184+
EnumSet<TruffleString.Encoding> encodings = EnumSet.of(
185+
TruffleString.Encoding.US_ASCII,
186+
TruffleString.Encoding.ISO_8859_1,
187+
TruffleString.Encoding.UTF_8,
188+
TruffleString.Encoding.UTF_16LE,
189+
TruffleString.Encoding.UTF_32LE,
190+
TruffleString.Encoding.UTF_16BE,
191+
TruffleString.Encoding.UTF_32BE);
192+
// forAllStrings is statically imported from TStringTestBase; every string it hands to the callback
// is switched to each target encoding in the set (including its own source encoding).
forAllStrings(encodings.toArray(TruffleString.Encoding[]::new), true, (a, array, codeRange, isValid, encoding, codepoints, byteIndices) -> {
193+
for (TruffleString.Encoding targetEncoding : encodings) {
194+
TStringSwitchEncodingTest.checkSwitchEncoding(a, codeRange, isValid, encoding, codepoints, targetEncoding, switchEncodingNode, switchEncodingNodeMutable);
195+
}
196+
});
197+
}
169198
}

truffle/src/com.oracle.truffle.api.strings.test/src/com/oracle/truffle/api/strings/test/TStringUTF16Tests.java

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141

4242
package com.oracle.truffle.api.strings.test;
4343

44+
import java.nio.ByteOrder;
4445
import java.util.Arrays;
4546

4647
import org.junit.Assert;
@@ -146,4 +147,28 @@ public void testToJavaString() {
146147
TruffleString a = TruffleString.fromCharArrayUTF16Uncached(new char[]{'a', 'b', 'c'});
147148
Assert.assertEquals("abc", a.toJavaStringUncached());
148149
}
150+
151+
// Returns the UTF-16 encoding whose byte order is the opposite of the platform's native order:
// UTF_16BE on little-endian hosts, UTF_16LE otherwise.
private static TruffleString.Encoding getForeignEndian() {
152+
return ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN ? TruffleString.Encoding.UTF_16BE : TruffleString.Encoding.UTF_16LE;
153+
}
154+
155+
// Serializes every char of s as two bytes, in the byte order opposite to the platform's native
// order. The result has s.length() * 2 bytes.
private byte[] getByteSwappedArray(String s) {
156+
byte[] array = new byte[s.length() << 1];
157+
for (int i = 0; i < s.length(); i++) {
158+
char c = s.charAt(i);
159+
// Bytes are always stored high byte first below (big-endian layout). On a big-endian host the
// char is byte-reversed first, so the stored bytes end up in little-endian layout instead.
if (ByteOrder.nativeOrder() == ByteOrder.BIG_ENDIAN) {
160+
c = Character.reverseBytes(c);
161+
}
162+
array[i << 1] = (byte) (c >> 8);
163+
array[(i << 1) + 1] = (byte) c;
164+
}
165+
return array;
166+
}
167+
168+
// Builds a TruffleString from non-native-endian UTF-16 bytes for 'a' followed by the unpaired low
// surrogate U+DC00, then checks that it reports two code points and converts back to the same
// Java string.
@Test
169+
public void testForeignEndian() {
170+
TruffleString a = TruffleString.fromByteArrayUncached(getByteSwappedArray("a\udc00"), getForeignEndian());
171+
Assert.assertEquals(2, a.codePointLengthUncached(getForeignEndian()));
172+
Assert.assertEquals("a\udc00", a.toJavaStringUncached());
173+
}
149174
}

truffle/src/com.oracle.truffle.api.strings.test/src/com/oracle/truffle/api/strings/test/TStringUTF32Tests.java

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@
5050
import com.oracle.truffle.api.strings.TruffleString;
5151
import com.oracle.truffle.api.strings.TruffleStringBuilder;
5252

53+
import java.nio.ByteOrder;
54+
5355
public class TStringUTF32Tests extends TStringTestBase {
5456

5557
@Test
@@ -80,4 +82,32 @@ public void testBroken3() {
8082
Assert.assertEquals(0xD801, ts4.codePointAtIndexUncached(0, TruffleString.Encoding.UTF_32));
8183
Assert.assertEquals(0xDC00, ts4.codePointAtIndexUncached(1, TruffleString.Encoding.UTF_32));
8284
}
85+
86+
// Returns the UTF-32 encoding whose byte order is the opposite of the platform's native order:
// UTF_32BE on little-endian hosts, UTF_32LE otherwise.
private static TruffleString.Encoding getForeignEndian() {
87+
return ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN ? TruffleString.Encoding.UTF_32BE : TruffleString.Encoding.UTF_32LE;
88+
}
89+
90+
private byte[] getByteSwappedArray(String s) {
91+
byte[] array = new byte[s.length() << 2];
92+
int i = 0;
93+
for (int cp : s.codePoints().toArray()) {
94+
int c = cp; // checkstyle
95+
if (ByteOrder.nativeOrder() == ByteOrder.BIG_ENDIAN) {
96+
c = Integer.reverseBytes(c);
97+
}
98+
array[i << 2] = (byte) (c >> 24);
99+
array[(i << 2) + 1] = (byte) (c >> 16);
100+
array[(i << 2) + 2] = (byte) (c >> 8);
101+
array[(i << 2) + 3] = (byte) c;
102+
i++;
103+
}
104+
return array;
105+
}
106+
107+
// Builds a TruffleString from non-native-endian UTF-32 bytes for 'a', the surrogate code point
// U+DC00 and 'b', then checks that it reports three code points and converts back to the same
// Java string.
@Test
108+
public void testForeignEndian() {
109+
TruffleString a = TruffleString.fromByteArrayUncached(getByteSwappedArray("a\udc00b"), getForeignEndian());
110+
Assert.assertEquals(3, a.codePointLengthUncached(getForeignEndian()));
111+
Assert.assertEquals("a\udc00b", a.toJavaStringUncached());
112+
}
83113
}

truffle/src/com.oracle.truffle.api.strings.test/src/com/oracle/truffle/api/strings/test/ops/TStringForceEncodingTest.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,8 @@ public static Iterable<Object[]> data() {
7272
public void testAll() throws Exception {
7373
forAllStrings(true, (a, array, codeRange, isValid, encoding, codepoints, byteIndices) -> {
7474
for (TruffleString.Encoding targetEncoding : TruffleString.Encoding.values()) {
75-
if (targetEncoding == TruffleString.Encoding.UTF_32 && (array.length & 3) != 0 || targetEncoding == TruffleString.Encoding.UTF_16 && (array.length & 1) != 0) {
75+
if ((targetEncoding == TruffleString.Encoding.UTF_32LE || targetEncoding == TruffleString.Encoding.UTF_32BE) && (array.length & 3) != 0 ||
76+
(targetEncoding == TruffleString.Encoding.UTF_16LE || targetEncoding == TruffleString.Encoding.UTF_16BE) && (array.length & 1) != 0) {
7677
expectIllegalArgumentException(() -> node.execute(a, encoding, targetEncoding));
7778
expectIllegalArgumentException(() -> nodeMutable.execute(a, encoding, targetEncoding));
7879
} else {

truffle/src/com.oracle.truffle.api.strings.test/src/com/oracle/truffle/api/strings/test/ops/TStringSwitchEncodingTest.java

Lines changed: 44 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -127,49 +127,56 @@ public void testAll() throws Exception {
127127
return;
128128
}
129129
for (TruffleString.Encoding targetEncoding : reducedEncodingSet) {
130-
boolean bothUTF = isUTF(encoding) && isUTF(targetEncoding);
131-
for (TranscodingErrorHandler errorHandler : bothUTF ? new TranscodingErrorHandler[]{TranscodingErrorHandler.DEFAULT,
132-
TranscodingErrorHandler.DEFAULT_KEEP_SURROGATES_IN_UTF8}
133-
: new TranscodingErrorHandler[]{TranscodingErrorHandler.DEFAULT}) {
134-
TruffleString b = node.execute(a, targetEncoding, errorHandler);
135-
MutableTruffleString bMutable = nodeMutable.execute(a, targetEncoding, errorHandler);
136-
if (a instanceof TruffleString && (encoding == targetEncoding || !isDebugStrictEncodingChecks() &&
137-
codeRange == TruffleString.CodeRange.ASCII && isAsciiCompatible(targetEncoding) &&
138-
a.getStringCompactionLevelUncached(encoding).getLog2() <= getNaturalStride(targetEncoding))) {
139-
Assert.assertSame(a, b);
140-
}
141-
if (a instanceof MutableTruffleString && encoding == targetEncoding) {
142-
Assert.assertSame(a, bMutable);
143-
}
144-
if (bothUTF) {
145-
for (AbstractTruffleString target : new AbstractTruffleString[]{b, bMutable}) {
146-
if (encoding == targetEncoding || isValid) {
147-
assertCodePointsEqual(target, targetEncoding, codepoints);
148-
} else {
149-
TruffleStringIterator it = target.createCodePointIteratorUncached(targetEncoding);
150-
for (int codepoint : codepoints) {
151-
int expected = codepoint;
152-
if (codepoint > Character.MAX_CODE_POINT) {
153-
expected = 0xfffd;
154-
} else if (targetEncoding == TruffleString.Encoding.UTF_8 && codepoint <= 0xffff &&
155-
Character.isSurrogate((char) codepoint)) {
156-
if (errorHandler == TranscodingErrorHandler.DEFAULT) {
157-
expected = 0xfffd;
158-
} else if (errorHandler == TranscodingErrorHandler.DEFAULT_KEEP_SURROGATES_IN_UTF8) {
159-
expected = 0xfffd;
160-
Assert.assertEquals(codepoint, TStringTestUtil.utf8DecodeValid(b, it.getByteIndex()));
161-
} else {
162-
Assert.fail();
163-
}
164-
}
165-
Assert.assertEquals(expected, it.nextUncached());
130+
checkSwitchEncoding(a, codeRange, isValid, encoding, codepoints, targetEncoding, node, nodeMutable);
131+
}
132+
});
133+
}
134+
135+
public static void checkSwitchEncoding(AbstractTruffleString a, TruffleString.CodeRange codeRange, boolean isValid, TruffleString.Encoding encoding, int[] codepoints,
136+
TruffleString.Encoding targetEncoding,
137+
TruffleString.SwitchEncodingNode node,
138+
MutableTruffleString.SwitchEncodingNode nodeMutable) {
139+
boolean bothUTF = isUTF(encoding) && isUTF(targetEncoding);
140+
for (TranscodingErrorHandler errorHandler : bothUTF ? new TranscodingErrorHandler[]{TranscodingErrorHandler.DEFAULT,
141+
TranscodingErrorHandler.DEFAULT_KEEP_SURROGATES_IN_UTF8}
142+
: new TranscodingErrorHandler[]{TranscodingErrorHandler.DEFAULT}) {
143+
TruffleString b = node.execute(a, targetEncoding, errorHandler);
144+
MutableTruffleString bMutable = nodeMutable.execute(a, targetEncoding, errorHandler);
145+
if (a instanceof TruffleString &&
146+
(encoding == targetEncoding || !isDebugStrictEncodingChecks() && codeRange == TruffleString.CodeRange.ASCII && isAsciiCompatible(targetEncoding)) &&
147+
a.getStringCompactionLevelUncached(encoding).getLog2() <= getNaturalStride(targetEncoding)) {
148+
Assert.assertSame(a, b);
149+
}
150+
if (a instanceof MutableTruffleString && encoding == targetEncoding) {
151+
Assert.assertSame(a, bMutable);
152+
}
153+
if (bothUTF) {
154+
for (AbstractTruffleString target : new AbstractTruffleString[]{b, bMutable}) {
155+
if (encoding == targetEncoding || isValid) {
156+
assertCodePointsEqual(target, targetEncoding, codepoints);
157+
} else {
158+
TruffleStringIterator it = target.createCodePointIteratorUncached(targetEncoding);
159+
for (int codepoint : codepoints) {
160+
int expected = codepoint;
161+
if (codepoint > Character.MAX_CODE_POINT) {
162+
expected = 0xfffd;
163+
} else if (targetEncoding == TruffleString.Encoding.UTF_8 && codepoint <= 0xffff &&
164+
Character.isSurrogate((char) codepoint)) {
165+
if (errorHandler == TranscodingErrorHandler.DEFAULT) {
166+
expected = 0xfffd;
167+
} else if (errorHandler == TranscodingErrorHandler.DEFAULT_KEEP_SURROGATES_IN_UTF8) {
168+
expected = 0xfffd;
169+
Assert.assertEquals(codepoint, TStringTestUtil.utf8DecodeValid(b, it.getByteIndex()));
170+
} else {
171+
Assert.fail();
166172
}
167173
}
174+
Assert.assertEquals(expected, it.nextUncached());
168175
}
169176
}
170177
}
171178
}
172-
});
179+
}
173180
}
174181

175182
@Test

truffle/src/com.oracle.truffle.api.strings/src/com/oracle/truffle/api/strings/AbstractTruffleString.java

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,9 @@
5050
import static com.oracle.truffle.api.strings.TStringGuards.isLatin1;
5151
import static com.oracle.truffle.api.strings.TStringGuards.isSupportedEncoding;
5252
import static com.oracle.truffle.api.strings.TStringGuards.isUTF16;
53+
import static com.oracle.truffle.api.strings.TStringGuards.isUTF16FE;
5354
import static com.oracle.truffle.api.strings.TStringGuards.isUTF32;
55+
import static com.oracle.truffle.api.strings.TStringGuards.isUTF32FE;
5456
import static com.oracle.truffle.api.strings.TStringGuards.isUTF8;
5557
import static com.oracle.truffle.api.strings.TStringGuards.isValidFixedWidth;
5658
import static com.oracle.truffle.api.strings.TStringGuards.isValidMultiByte;
@@ -134,7 +136,7 @@ public abstract sealed class AbstractTruffleString permits TruffleString, Mutabl
134136
assert isByte(stride);
135137
assert isByte(flags);
136138
assert validateCodeRange(encoding, codeRange);
137-
assert isSupportedEncoding(encoding) || length == 0 || JCodings.ENABLED;
139+
assert isSupportedEncoding(encoding) || isUTF16FE(encoding) || isUTF32FE(encoding) || length == 0 || JCodings.ENABLED;
138140
this.data = data;
139141
this.encoding = encoding.id;
140142
this.offset = offset;
@@ -550,9 +552,9 @@ static void boundsCheckRegionI(int fromIndex, int regionLength, int arrayLength)
550552
}
551553

552554
static void checkByteLength(int byteLength, Encoding encoding) {
553-
if (isUTF16(encoding)) {
555+
if (isUTF16(encoding) || isUTF16FE(encoding)) {
554556
TruffleString.checkByteLengthUTF16(byteLength);
555-
} else if (isUTF32(encoding)) {
557+
} else if (isUTF32(encoding) || isUTF32FE(encoding)) {
556558
TruffleString.checkByteLengthUTF32(byteLength);
557559
}
558560
}

0 commit comments

Comments
 (0)