Skip to content

Commit

Permalink
first version
Browse files Browse the repository at this point in the history
  • Loading branch information
piotrrzysko committed Jul 17, 2023
1 parent bc88d27 commit ebcb106
Show file tree
Hide file tree
Showing 33 changed files with 57,696 additions and 9 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
.idea
.gradle
build
profilers
68 changes: 67 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1 +1,67 @@
# simdjson-java
# simdjson-java

A Java version of [simdjson](https://github.com/simdjson/simdjson) - a JSON parser using SIMD instructions,
based on the paper [Parsing Gigabytes of JSON per Second](https://arxiv.org/abs/1902.08318)
by Geoff Langdale and Daniel Lemire.

This implementation is still missing several features available in simdjson. For example:

* Support for Unicode characters
* UTF-8 validation
* Full support for parsing floats
* Support for 512-bit vectors

## Code Sample

```java
byte[] json = loadTwitterJson();

SimdJsonParser parser = new SimdJsonParser();
JsonValue jsonValue = parser.parse(json, json.length);
Iterator<JsonValue> tweets = jsonValue.get("statuses").arrayIterator();
while (tweets.hasNext()) {
JsonValue tweet = tweets.next();
JsonValue user = tweet.get("user");
if (user.get("default_profile").asBoolean()) {
System.out.println(user.get("screen_name").asString());
}
}
```

## Benchmarks

To run the JMH benchmarks, execute the following command:

```./gradlew jmh```

## Tests

To run the tests, execute the following command:

```./gradlew test```

## Performance

This section presents a performance comparison of different JSON parsers available as Java libraries. The benchmark used
the [twitter.json](src/jmh/resources/twitter.json) dataset, and its goal was to measure the throughput (ops/s) of parsing
and finding all unique users with a default profile.

**Note that simdjson-java is still missing several features (mentioned in the introduction), so the following results
may not reflect its real performance.**

Environment:
* CPU: Intel(R) Core(TM) i5-4590 CPU @ 3.30GHz
* OS: Ubuntu 23.04, kernel 6.2.0-23-generic
* Java: OpenJDK 64-Bit Server VM Temurin-20.0.1+9

Library | Version | Throughput (ops/s)
---------------------------------------------------|---------|--------------------
simdjson-java | - | 1450.951
simdjson-java (padded) | - | 1505.227
[jackson](https://github.com/FasterXML/jackson) | 2.15.2 | 504.562
[fastjson2](https://github.com/alibaba/fastjson) | 2.0.35 | 590.743
[jsoniter](https://github.com/json-iterator/java) | 0.9.23 | 384.664

To reproduce the benchmark results, execute the following command:

```./gradlew jmh -Pjmh.includes='.*ParseAndSelectBenchmark.*'```
60 changes: 59 additions & 1 deletion build.gradle
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
import me.champeau.jmh.JmhBytecodeGeneratorTask
import org.gradle.internal.os.OperatingSystem

// Build plugins: 'java' for compiling the sources and 'me.champeau.jmh' for the
// JMH benchmark integration configured in the `jmh { ... }` block of this script.
plugins {
id 'java'
id 'me.champeau.jmh' version '0.7.1'
}

group = 'com.github.piotrrzysko'
version = '1.0-SNAPSHOT'
version = '0.0.1-SNAPSHOT'

repositories {
mavenCentral()
Expand All @@ -21,6 +24,10 @@ ext {
}

dependencies {
jmhImplementation group: 'com.fasterxml.jackson.core', name: 'jackson-databind', version: '2.15.2'
jmhImplementation group: 'com.alibaba.fastjson2', name: 'fastjson2', version: '2.0.35'
jmhImplementation group: 'com.jsoniter', name: 'jsoniter', version: '0.9.23'

testImplementation group: 'org.assertj', name: 'assertj-core', version: '3.24.2'
testImplementation group: 'org.junit.jupiter', name: 'junit-jupiter-api', version: junitVersion
testImplementation group: 'org.junit.jupiter', name: 'junit-jupiter-params', version: junitVersion
Expand All @@ -29,4 +36,55 @@ dependencies {

// Configures the `test` task: run the tests on the JUnit Platform and append
// extra flags to the JVM that executes them.
test {
useJUnitPlatform()
jvmArgs += [
// The incubating Vector API module has to be added explicitly at runtime.
'--add-modules', 'jdk.incubator.vector',
// Maximum heap size of the test JVM.
'-Xmx2g'
]
}

// Every JMH bytecode generator task gets the Vector API module flag as its JVM arguments.
// NOTE(review): presumably required because the generator inspects benchmark classes that
// reference jdk.incubator.vector - confirm against the JMH Gradle plugin documentation.
tasks.withType(JmhBytecodeGeneratorTask).configureEach {
jvmArgs.set(["--add-modules=jdk.incubator.vector"])
}

// Adds the incubating Vector API module to the compiler arguments of every JavaCompile task
// in this project, so that sources importing jdk.incubator.vector can be compiled.
tasks.withType(JavaCompile).configureEach {
options.compilerArgs.add("--add-modules=jdk.incubator.vector")
}

// Appends the Vector API module flag to the test source compilation.
// NOTE(review): compileTestJava is itself a JavaCompile task, so it already receives the same
// flag from the tasks.withType(JavaCompile) configuration in this script; this block looks
// redundant - confirm before removing.
compileTestJava {
options.compilerArgs += [
'--add-modules', 'jdk.incubator.vector'
]
}

// Configuration of the JMH plugin extension used by `./gradlew jmh`.
//
// Supported project properties (passed with -P):
//   jmh.profilersEnabled - when it evaluates to true, profilers are attached (Linux and macOS only)
//   jmh.includes         - benchmark include pattern, e.g. '.*ParseAndSelectBenchmark.*'
jmh {
// One fork, three warmup iterations and five measurement iterations per benchmark.
fork = 1
warmupIterations = 3
iterations = 5
// The benchmark JVMs need the incubating Vector API module as well.
jvmArgsPrepend = [
'--add-modules=jdk.incubator.vector'
]
if (getBooleanProperty('jmh.profilersEnabled', false)) {
if (OperatingSystem.current().isLinux()) {
// Linux: perf, perfasm (Intel assembly syntax) and async-profiler flame graphs.
// The async-profiler output goes to ./profilers/async; its libPath is taken from
// LD_LIBRARY_PATH, falling back to the java.library.path system property.
profilers = [
'perf',
'perfasm:intelSyntax=true',
'async:verbose=true;output=flamegraph;event=cpu;dir=./profilers/async;libPath=' + getAsyncProfilerLibPath('LD_LIBRARY_PATH')
]
} else if (OperatingSystem.current().isMacOsX()) {
// macOS: only async-profiler is configured; libPath is taken from DYLD_LIBRARY_PATH,
// falling back to the java.library.path system property.
profilers = [
'async:verbose=true;output=flamegraph;event=cpu;dir=./profilers/async;libPath=' + getAsyncProfilerLibPath('DYLD_LIBRARY_PATH')
]
}
// On any other operating system no profilers are configured.
}
// Restrict the run to the benchmarks matching -Pjmh.includes, when that property is present.
if (project.hasProperty('jmh.includes')) {
includes = [project.findProperty('jmh.includes')]
}
}

/**
 * Reads a project property and interprets it as a boolean.
 *
 * When the property is missing (or its value is falsy by Groovy truth, e.g. an empty string),
 * {@code defaultValue} is used instead. The chosen value is converted to a String and parsed
 * with {@link Boolean#valueOf(String)}, so only "true" (case-insensitive) yields {@code true}.
 *
 * @param name         name of the project property to look up
 * @param defaultValue value used when the property is absent or falsy
 * @return the parsed boolean
 */
def getBooleanProperty(String name, boolean defaultValue) {
    def configured = project.findProperty(name)
    def effective = configured ? configured : defaultValue
    return Boolean.valueOf(effective as String)
}

/**
 * Resolves the value used as the async-profiler {@code libPath} option.
 *
 * @param envVarName name of the environment variable to consult first
 * @return the value of that environment variable when it is set and non-empty,
 *         otherwise the {@code java.library.path} system property
 */
static def getAsyncProfilerLibPath(String envVarName) {
    def fromEnvironment = System.getenv(envVarName)
    if (fromEnvironment) {
        return fromEnvironment
    }
    return System.getProperty('java.library.path')
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
package com.github.piotrrzysko.simdjson;

import com.alibaba.fastjson2.JSON;
import com.alibaba.fastjson2.JSONObject;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.jsoniter.JsonIterator;
import com.jsoniter.any.Any;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Level;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;

import java.io.IOException;
import java.io.InputStream;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import java.util.concurrent.TimeUnit;

import static com.github.piotrrzysko.simdjson.SimdJsonPaddingUtil.padded;

/**
 * JMH throughput benchmark that parses the {@code twitter.json} dataset and counts the unique
 * screen names of users whose {@code default_profile} flag is set.
 *
 * <p>The same query is implemented once per JSON library (Jackson, fastjson2, jsoniter and
 * simdjson-java) so that their throughput can be compared. Every benchmark method returns the
 * number of unique users, which hands the result to JMH.
 */
@State(Scope.Benchmark)
@BenchmarkMode(Mode.Throughput)
@OutputTimeUnit(TimeUnit.SECONDS)
public class ParseAndSelectBenchmark {

    /** Classpath location of the dataset parsed by every benchmark in this class. */
    private static final String DATASET = "/twitter.json";

    private final SimdJsonParser simdJsonParser = new SimdJsonParser();
    private final ObjectMapper objectMapper = new ObjectMapper();

    /** Raw bytes of the dataset. */
    private byte[] buffer;
    /** Copy of {@link #buffer} produced by {@code SimdJsonPaddingUtil.padded}. */
    private byte[] bufferPadded;

    /**
     * Loads the dataset once per trial and prepares its padded copy.
     *
     * @throws IOException if the dataset is not on the classpath or cannot be read
     */
    @Setup(Level.Trial)
    public void setup() throws IOException {
        // Resolve the resource through this class rather than through a sibling benchmark class.
        try (InputStream is = ParseAndSelectBenchmark.class.getResourceAsStream(DATASET)) {
            if (is == null) {
                // getResourceAsStream signals a missing resource with null; fail with a clear message.
                throw new IOException("Benchmark resource not found on the classpath: " + DATASET);
            }
            buffer = is.readAllBytes();
            bufferPadded = padded(buffer);
        }
    }

    /**
     * Runs the query with Jackson's tree model.
     *
     * @return number of unique screen names of users with a default profile
     * @throws IOException if Jackson fails to parse the buffer
     */
    @Benchmark
    public int countUniqueUsersWithDefaultProfile_jackson() throws IOException {
        JsonNode jacksonJsonNode = objectMapper.readTree(buffer);
        Set<String> defaultUsers = new HashSet<>();
        Iterator<JsonNode> tweets = jacksonJsonNode.get("statuses").elements();
        while (tweets.hasNext()) {
            JsonNode tweet = tweets.next();
            JsonNode user = tweet.get("user");
            if (user.get("default_profile").asBoolean()) {
                defaultUsers.add(user.get("screen_name").textValue());
            }
        }
        return defaultUsers.size();
    }

    /**
     * Runs the query with fastjson2.
     *
     * @return number of unique screen names of users with a default profile
     */
    @Benchmark
    public int countUniqueUsersWithDefaultProfile_fastjson() {
        JSONObject jsonObject = (JSONObject) JSON.parse(buffer);
        Set<String> defaultUsers = new HashSet<>();
        Iterator<Object> tweets = jsonObject.getJSONArray("statuses").iterator();
        while (tweets.hasNext()) {
            JSONObject tweet = (JSONObject) tweets.next();
            JSONObject user = (JSONObject) tweet.get("user");
            if (user.getBoolean("default_profile")) {
                defaultUsers.add(user.getString("screen_name"));
            }
        }
        return defaultUsers.size();
    }

    /**
     * Runs the query with jsoniter.
     *
     * @return number of unique screen names of users with a default profile
     */
    @Benchmark
    public int countUniqueUsersWithDefaultProfile_jsoniter() {
        Any json = JsonIterator.deserialize(buffer);
        Set<String> defaultUsers = new HashSet<>();
        for (Any tweet : json.get("statuses")) {
            Any user = tweet.get("user");
            if (user.get("default_profile").toBoolean()) {
                defaultUsers.add(user.get("screen_name").toString());
            }
        }
        return defaultUsers.size();
    }

    /**
     * Runs the query with simdjson-java on the unpadded buffer.
     *
     * @return number of unique screen names of users with a default profile
     */
    @Benchmark
    public int countUniqueUsersWithDefaultProfile_simdjson() {
        JsonValue simdJsonValue = simdJsonParser.parse(buffer, buffer.length);
        Set<String> defaultUsers = new HashSet<>();
        Iterator<JsonValue> tweets = simdJsonValue.get("statuses").arrayIterator();
        while (tweets.hasNext()) {
            JsonValue tweet = tweets.next();
            JsonValue user = tweet.get("user");
            if (user.get("default_profile").asBoolean()) {
                defaultUsers.add(user.get("screen_name").asString());
            }
        }
        return defaultUsers.size();
    }

    /**
     * Runs the query with simdjson-java on the padded buffer.
     *
     * @return number of unique screen names of users with a default profile
     */
    @Benchmark
    public int countUniqueUsersWithDefaultProfile_simdjsonPadded() {
        // The length passed is that of the original document, not of the padded array.
        JsonValue simdJsonValue = simdJsonParser.parse(bufferPadded, buffer.length);
        Set<String> defaultUsers = new HashSet<>();
        Iterator<JsonValue> tweets = simdJsonValue.get("statuses").arrayIterator();
        while (tweets.hasNext()) {
            JsonValue tweet = tweets.next();
            JsonValue user = tweet.get("user");
            if (user.get("default_profile").asBoolean()) {
                defaultUsers.add(user.get("screen_name").asString());
            }
        }
        return defaultUsers.size();
    }
}
49 changes: 49 additions & 0 deletions src/jmh/java/com/github/piotrrzysko/simdjson/ParseBenchmark.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
package com.github.piotrrzysko.simdjson;

import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Level;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;

import java.io.IOException;
import java.io.InputStream;
import java.util.concurrent.TimeUnit;

import static com.github.piotrrzysko.simdjson.SimdJsonPaddingUtil.padded;

/**
 * JMH throughput benchmark measuring how fast simdjson-java parses each of the JSON files
 * listed in {@link #fileName}, both from the raw buffer and from a padded copy of it.
 */
@State(Scope.Benchmark)
@BenchmarkMode(Mode.Throughput)
@OutputTimeUnit(TimeUnit.SECONDS)
public class ParseBenchmark {

    /** Classpath location of the JSON file parsed in the current trial. */
    @Param({"/twitter.json" /*, "/gsoc-2018.json - unicode is not supported yet"*/, "/github_events.json"})
    String fileName;

    private final SimdJsonParser simdJsonParser = new SimdJsonParser();

    /** Raw bytes of the file named by {@link #fileName}. */
    private byte[] buffer;
    /** Copy of {@link #buffer} produced by {@code SimdJsonPaddingUtil.padded}. */
    private byte[] bufferPadded;

    /**
     * Loads the selected file once per trial and prepares its padded copy.
     *
     * @throws IOException if the file is not on the classpath or cannot be read
     */
    @Setup(Level.Trial)
    public void setup() throws IOException {
        try (InputStream is = ParseBenchmark.class.getResourceAsStream(fileName)) {
            if (is == null) {
                // getResourceAsStream signals a missing resource with null; name the file so that
                // a mistyped or overridden parameter value is easy to diagnose.
                throw new IOException("Benchmark resource not found on the classpath: " + fileName);
            }
            buffer = is.readAllBytes();
            bufferPadded = padded(buffer);
        }
    }

    /**
     * Parses the unpadded buffer.
     *
     * @return the value returned by the parser, handed to JMH as the benchmark result
     */
    @Benchmark
    public JsonValue simdjson() {
        return simdJsonParser.parse(buffer, buffer.length);
    }

    /**
     * Parses the padded buffer.
     *
     * @return the value returned by the parser, handed to JMH as the benchmark result
     */
    @Benchmark
    public JsonValue simdjsonPadded() {
        // The length passed is that of the original document, not of the padded array.
        return simdJsonParser.parse(bufferPadded, buffer.length);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
package com.github.piotrrzysko.simdjson;

/**
 * Helper used by the benchmarks to obtain a zero-padded copy of a JSON buffer.
 */
class SimdJsonPaddingUtil {

    /**
     * Returns a new array that starts with the bytes of {@code src} and is followed by
     * 64 zero bytes. The input array is not modified.
     *
     * @param src the bytes to copy; must not be {@code null}
     * @return a fresh array of length {@code src.length + 64}
     */
    static byte[] padded(byte[] src) {
        int paddedLength = src.length + 64;
        // Arrays.copyOf fills the positions beyond the source length with zeros.
        return java.util.Arrays.copyOf(src, paddedLength);
    }
}
Loading

0 comments on commit ebcb106

Please sign in to comment.