Skip to content

Commit dd6d5b5

Browse files
authored
Parsing Unicode escape sequences (simdjson#21)
1 parent 8c4c689 commit dd6d5b5

File tree

12 files changed

+677
-259
lines changed

12 files changed

+677
-259
lines changed

README.md

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,6 @@ A Java version of [simdjson](https://github.com/simdjson/simdjson) - a JSON pars
88
based on the paper [Parsing Gigabytes of JSON per Second](https://arxiv.org/abs/1902.08318)
99
by Geoff Langdale and Daniel Lemire.
1010

11-
This implementation is still missing several features available in simdjson. For example:
12-
13-
* Support for Unicode characters
14-
* UTF-8 validation
15-
* Support for 512-bit vectors
16-
1711
## Code Sample
1812

1913
```java
@@ -73,8 +67,8 @@ This section presents a performance comparison of different JSON parsers availab
7367
the [twitter.json](src/jmh/resources/twitter.json) dataset, and its goal was to measure the throughput (ops/s) of parsing
7468
and finding all unique users with a default profile.
7569

76-
**Note that simdjson-java is still missing several features (mentioned in the introduction), so the following results
77-
may not reflect its real performance.**
70+
**Note that simdjson-java is still missing several features (see [GitHub Issues](https://github.com/simdjson/simdjson-java/issues)),
71+
so the following results may not reflect its real performance.**
7872

7973
Environment:
8074
* CPU: Intel(R) Core(TM) i5-4590 CPU @ 3.30GHz

build.gradle

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,16 +38,18 @@ java {
3838

3939
ext {
4040
junitVersion = '5.9.1'
41+
jsoniterScalaVersion = '2.23.2'
4142
}
4243

4344
dependencies {
4445
jmhImplementation group: 'com.fasterxml.jackson.core', name: 'jackson-databind', version: '2.15.2'
4546
jmhImplementation group: 'com.alibaba.fastjson2', name: 'fastjson2', version: '2.0.35'
4647
jmhImplementation group: 'com.jsoniter', name: 'jsoniter', version: '0.9.23'
47-
jmhImplementation group: 'com.github.plokhotnyuk.jsoniter-scala', name: 'jsoniter-scala-core_2.13', version: '2.23.2'
48-
compileOnly group: 'com.github.plokhotnyuk.jsoniter-scala', name: 'jsoniter-scala-macros_2.13', version: '2.23.2'
48+
jmhImplementation group: 'com.github.plokhotnyuk.jsoniter-scala', name: 'jsoniter-scala-core_2.13', version: jsoniterScalaVersion
49+
compileOnly group: 'com.github.plokhotnyuk.jsoniter-scala', name: 'jsoniter-scala-macros_2.13', version: jsoniterScalaVersion
4950

5051
testImplementation group: 'org.assertj', name: 'assertj-core', version: '3.24.2'
52+
testImplementation group: 'org.apache.commons', name: 'commons-text', version: '1.10.0'
5153
testImplementation group: 'org.junit.jupiter', name: 'junit-jupiter-api', version: junitVersion
5254
testImplementation group: 'org.junit.jupiter', name: 'junit-jupiter-params', version: junitVersion
5355
testRuntimeOnly group: 'org.junit.jupiter', name: 'junit-jupiter-engine', version: junitVersion
@@ -74,6 +76,9 @@ test {
7476
'--add-modules', 'jdk.incubator.vector',
7577
'-Xmx2g'
7678
]
79+
testLogging {
80+
events 'PASSED', 'SKIPPED', 'FAILED', 'STANDARD_OUT', 'STANDARD_ERROR'
81+
}
7782
}
7883

7984
tasks.withType(JmhBytecodeGeneratorTask).configureEach {

src/jmh/java/org/simdjson/ParseBenchmark.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
@OutputTimeUnit(TimeUnit.SECONDS)
2222
public class ParseBenchmark {
2323

24-
@Param({"/twitter.json" /*, "/gsoc-2018.json - unicode is not supported yet"*/, "/github_events.json"})
24+
@Param({"/twitter.json", "/gsoc-2018.json", "/github_events.json"})
2525
String fileName;
2626

2727
private final SimdJsonParser simdJsonParser = new SimdJsonParser();

src/main/java/org/simdjson/CharacterUtils.java

Lines changed: 252 additions & 0 deletions
Large diffs are not rendered by default.

src/main/java/org/simdjson/JsonCharUtils.java

Lines changed: 0 additions & 49 deletions
This file was deleted.

src/main/java/org/simdjson/NumberParser.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
import static java.lang.Long.remainderUnsigned;
1010
import static java.lang.Math.abs;
1111
import static java.lang.Math.unsignedMultiplyHigh;
12-
import static org.simdjson.JsonCharUtils.isStructuralOrWhitespace;
12+
import static org.simdjson.CharacterUtils.isStructuralOrWhitespace;
1313
import static org.simdjson.NumberParserTables.NUMBER_OF_ADDITIONAL_DIGITS_AFTER_LEFT_SHIFT;
1414
import static org.simdjson.NumberParserTables.POWERS_OF_FIVE;
1515
import static org.simdjson.NumberParserTables.POWER_OF_FIVE_DIGITS;
Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
package org.simdjson;
2+
3+
import jdk.incubator.vector.ByteVector;
4+
5+
import static org.simdjson.CharacterUtils.escape;
6+
import static org.simdjson.CharacterUtils.hexToInt;
7+
import static org.simdjson.Tape.STRING;
8+
9+
class StringParser {
10+
11+
private static final byte BACKSLASH = '\\';
12+
private static final byte QUOTE = '"';
13+
private static final int BYTES_PROCESSED = StructuralIndexer.SPECIES.vectorByteSize();
14+
private static final int MIN_HIGH_SURROGATE = 0xD800;
15+
private static final int MAX_HIGH_SURROGATE = 0xDBFF;
16+
private static final int MIN_LOW_SURROGATE = 0xDC00;
17+
private static final int MAX_LOW_SURROGATE = 0xDFFF;
18+
19+
private final Tape tape;
20+
private final byte[] stringBuffer;
21+
22+
private int stringBufferIdx;
23+
24+
StringParser(Tape tape, byte[] stringBuffer) {
25+
this.tape = tape;
26+
this.stringBuffer = stringBuffer;
27+
}
28+
29+
void parseString(byte[] buffer, int idx) {
30+
tape.append(stringBufferIdx, STRING);
31+
int src = idx + 1;
32+
int dst = stringBufferIdx + Integer.BYTES;
33+
while (true) {
34+
ByteVector srcVec = ByteVector.fromArray(StructuralIndexer.SPECIES, buffer, src);
35+
srcVec.intoArray(stringBuffer, dst);
36+
long backslashBits = srcVec.eq(BACKSLASH).toLong();
37+
long quoteBits = srcVec.eq(QUOTE).toLong();
38+
39+
if (hasQuoteFirst(backslashBits, quoteBits)) {
40+
dst += Long.numberOfTrailingZeros(quoteBits);
41+
break;
42+
}
43+
if (hasBackslash(backslashBits, quoteBits)) {
44+
int backslashDist = Long.numberOfTrailingZeros(backslashBits);
45+
byte escapeChar = buffer[src + backslashDist + 1];
46+
if (escapeChar == 'u') {
47+
src += backslashDist;
48+
dst += backslashDist;
49+
int codePoint = hexToInt(buffer, src + 2);
50+
src += 6;
51+
if (codePoint >= MIN_HIGH_SURROGATE && codePoint <= MAX_HIGH_SURROGATE) {
52+
codePoint = parseLowSurrogate(buffer, src, codePoint);
53+
src += 6;
54+
} else if (codePoint >= MIN_LOW_SURROGATE && codePoint <= MAX_LOW_SURROGATE) {
55+
throw new JsonParsingException("Invalid code point. The range U+DC00–U+DFFF is reserved for low surrogate.");
56+
}
57+
dst += storeCodePointInStringBuffer(codePoint, dst);
58+
} else {
59+
stringBuffer[dst + backslashDist] = escape(escapeChar);
60+
src += backslashDist + 2;
61+
dst += backslashDist + 1;
62+
}
63+
} else {
64+
src += BYTES_PROCESSED;
65+
dst += BYTES_PROCESSED;
66+
}
67+
}
68+
int len = dst - stringBufferIdx - Integer.BYTES;
69+
IntegerUtils.toBytes(len, stringBuffer, stringBufferIdx);
70+
stringBufferIdx = dst;
71+
}
72+
73+
private int parseLowSurrogate(byte[] buffer, int src, int codePoint) {
74+
if ((buffer[src] << 8 | buffer[src + 1]) != ('\\' << 8 | 'u')) {
75+
throw new JsonParsingException("Low surrogate should start with '\\u'");
76+
} else {
77+
int codePoint2 = hexToInt(buffer, src + 2);
78+
int lowBit = codePoint2 - MIN_LOW_SURROGATE;
79+
if (lowBit >> 10 == 0) {
80+
return (((codePoint - MIN_HIGH_SURROGATE) << 10) | lowBit) + 0x10000;
81+
} else {
82+
throw new JsonParsingException("Invalid code point. Low surrogate should be in the range U+DC00–U+DFFF.");
83+
}
84+
}
85+
}
86+
87+
private int storeCodePointInStringBuffer(int codePoint, int dst) {
88+
if (codePoint < 0) {
89+
throw new JsonParsingException("Invalid unicode escape sequence.");
90+
}
91+
if (codePoint <= 0x7F) {
92+
stringBuffer[dst] = (byte) codePoint;
93+
return 1;
94+
}
95+
if (codePoint <= 0x7FF) {
96+
stringBuffer[dst] = (byte) ((codePoint >> 6) + 192);
97+
stringBuffer[dst + 1] = (byte) ((codePoint & 63) + 128);
98+
return 2;
99+
}
100+
if (codePoint <= 0xFFFF) {
101+
stringBuffer[dst] = (byte) ((codePoint >> 12) + 224);
102+
stringBuffer[dst + 1] = (byte) (((codePoint >> 6) & 63) + 128);
103+
stringBuffer[dst + 2] = (byte) ((codePoint & 63) + 128);
104+
return 3;
105+
}
106+
if (codePoint <= 0x10FFFF) {
107+
stringBuffer[dst] = (byte) ((codePoint >> 18) + 240);
108+
stringBuffer[dst + 1] = (byte) (((codePoint >> 12) & 63) + 128);
109+
stringBuffer[dst + 2] = (byte) (((codePoint >> 6) & 63) + 128);
110+
stringBuffer[dst + 3] = (byte) ((codePoint & 63) + 128);
111+
return 4;
112+
}
113+
throw new IllegalStateException("Code point is greater than 0x110000.");
114+
}
115+
116+
private boolean hasQuoteFirst(long backslashBits, long quoteBits) {
117+
return ((backslashBits - 1) & quoteBits) != 0;
118+
}
119+
120+
private boolean hasBackslash(long backslashBits, long quoteBits) {
121+
return ((quoteBits - 1) & backslashBits) != 0;
122+
}
123+
124+
void reset() {
125+
stringBufferIdx = 0;
126+
}
127+
}

src/main/java/org/simdjson/TapeBuilder.java

Lines changed: 5 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -1,55 +1,27 @@
11
package org.simdjson;
22

3-
import jdk.incubator.vector.ByteVector;
4-
53
import java.util.Arrays;
64

7-
import static org.simdjson.JsonCharUtils.isStructuralOrWhitespace;
5+
import static org.simdjson.CharacterUtils.isStructuralOrWhitespace;
86
import static org.simdjson.Tape.END_ARRAY;
97
import static org.simdjson.Tape.END_OBJECT;
108
import static org.simdjson.Tape.FALSE_VALUE;
119
import static org.simdjson.Tape.NULL_VALUE;
1210
import static org.simdjson.Tape.ROOT;
1311
import static org.simdjson.Tape.START_ARRAY;
1412
import static org.simdjson.Tape.START_OBJECT;
15-
import static org.simdjson.Tape.STRING;
1613
import static org.simdjson.Tape.TRUE_VALUE;
1714

1815
class TapeBuilder {
1916

2017
private static final byte SPACE = 0x20;
21-
private static final byte BACKSLASH = '\\';
22-
private static final byte QUOTE = '"';
23-
private static final int BYTES_PROCESSED = StructuralIndexer.SPECIES.vectorByteSize();
24-
private static final byte[] ESCAPE_MAP = new byte[]{
25-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x0.
26-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
27-
0, 0, 0x22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x2f,
28-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
29-
30-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x4.
31-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x5c, 0, 0, 0, // 0x5.
32-
0, 0, 0x08, 0, 0, 0, 0x0c, 0, 0, 0, 0, 0, 0, 0, 0x0a, 0, // 0x6.
33-
0, 0, 0x0d, 0, 0x09, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x7.
34-
35-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
36-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
37-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
38-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
39-
40-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
41-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
42-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
43-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
44-
};
4518

4619
private final Tape tape;
4720
private final byte[] stringBuffer;
4821
private final OpenContainer[] openContainers;
4922
private final int padding;
5023
private final NumberParser numberParser;
51-
52-
private int stringBufferIdx;
24+
private final StringParser stringParser;
5325

5426
TapeBuilder(int capacity, int depth, int padding) {
5527
this.tape = new Tape(capacity);
@@ -60,6 +32,7 @@ class TapeBuilder {
6032
}
6133
this.stringBuffer = new byte[capacity];
6234
this.numberParser = new NumberParser(tape);
35+
this.stringParser = new StringParser(tape, stringBuffer);
6336
}
6437

6538
void visitDocumentStart() {
@@ -193,56 +166,7 @@ void visitKey(byte[] buffer, int idx) {
193166
}
194167

195168
private void visitString(byte[] buffer, int idx) {
196-
tape.append(stringBufferIdx, STRING);
197-
int src = idx + 1;
198-
int dst = stringBufferIdx + Integer.BYTES;
199-
while (true) {
200-
ByteVector srcVec = ByteVector.fromArray(StructuralIndexer.SPECIES, buffer, src);
201-
srcVec.intoArray(stringBuffer, dst);
202-
long backslashBits = srcVec.eq(BACKSLASH).toLong();
203-
long quoteBits = srcVec.eq(QUOTE).toLong();
204-
205-
if (hasQuoteFirst(backslashBits, quoteBits)) {
206-
dst += Long.numberOfTrailingZeros(quoteBits);
207-
break;
208-
}
209-
if (hasBackslash(backslashBits, quoteBits)) {
210-
int backslashDist = Long.numberOfTrailingZeros(backslashBits);
211-
byte escapeChar = buffer[src + backslashDist + 1];
212-
if (escapeChar == 'u') {
213-
throw new UnsupportedOperationException("Support for unicode characters is not implemented yet.");
214-
} else {
215-
stringBuffer[dst + backslashDist] = escape(escapeChar);
216-
src += backslashDist + 2;
217-
dst += backslashDist + 1;
218-
}
219-
} else {
220-
src += BYTES_PROCESSED;
221-
dst += BYTES_PROCESSED;
222-
}
223-
}
224-
int len = dst - stringBufferIdx - Integer.BYTES;
225-
IntegerUtils.toBytes(len, stringBuffer, stringBufferIdx);
226-
stringBufferIdx = dst;
227-
}
228-
229-
private byte escape(byte escapeChar) {
230-
if (escapeChar < 0) {
231-
throw new JsonParsingException("Escaped unexpected character: " + ((char) escapeChar));
232-
}
233-
byte escapeResult = ESCAPE_MAP[escapeChar];
234-
if (escapeResult == 0) {
235-
throw new JsonParsingException("Escaped unexpected character: " + ((char) escapeChar));
236-
}
237-
return escapeResult;
238-
}
239-
240-
private boolean hasQuoteFirst(long backslashBits, long quoteBits) {
241-
return ((backslashBits - 1) & quoteBits) != 0;
242-
}
243-
244-
private boolean hasBackslash(long backslashBits, long quoteBits) {
245-
return ((quoteBits - 1) & backslashBits) != 0;
169+
stringParser.parseString(buffer, idx);
246170
}
247171

248172
private void visitNumber(byte[] buffer, int idx) {
@@ -278,7 +202,7 @@ private void emptyContainer(char start, char end) {
278202

279203
void reset() {
280204
tape.reset();
281-
stringBufferIdx = 0;
205+
stringParser.reset();
282206
}
283207

284208
JsonValue createJsonValue(byte[] buffer) {

0 commit comments

Comments
 (0)