Migrate from fastutil to HPPC. Randomize tests. #16

Status: Open. Wants to merge 6 commits into master.
4 changes: 4 additions & 0 deletions .gitignore
@@ -0,0 +1,4 @@
+.classpath
+.project
+.settings/
+target/
38 changes: 8 additions & 30 deletions pom.xml
@@ -75,6 +75,7 @@
             <plugin>
                 <groupId>org.apache.maven.plugins</groupId>
                 <artifactId>maven-gpg-plugin</artifactId>
+                <version>1.6</version>
                 <executions>
                     <execution>
                         <id>sign-artifacts</id>
@@ -148,37 +149,16 @@
     <dependencies>
         <!-- Production dependencies :: Default scope -->
         <dependency>
-            <groupId>it.unimi.dsi</groupId>
-            <artifactId>fastutil</artifactId>
-            <version>${fastutil-version}</version>
+            <groupId>com.carrotsearch</groupId>
+            <artifactId>hppc</artifactId>
+            <version>0.7.1</version>
         </dependency>

         <!-- Test dependencies :: Test scope -->
-        <dependency>
-            <groupId>org.easymock</groupId>
-            <artifactId>easymock</artifactId>
-            <version>${easymock-version}</version>
-            <scope>test</scope>
-        </dependency>
-        <dependency>
-            <groupId>org.powermock</groupId>
-            <artifactId>powermock-module-junit4</artifactId>
-            <version>${powermock-version}</version>
-            <scope>test</scope>
-        </dependency>
-        <dependency>
-            <groupId>org.powermock</groupId>
-            <artifactId>powermock-api-easymock</artifactId>
-            <version>${powermock-version}</version>
-            <scope>test</scope>
-        </dependency>
-        <!-- NOTE: the "jdk15" classifier is "JDK 1.5+" -->
         <dependency>
-            <groupId>org.testng</groupId>
-            <artifactId>testng</artifactId>
-            <version>${testng-version}</version>
-            <scope>test</scope>
-            <classifier>jdk15</classifier>
+            <groupId>com.carrotsearch.randomizedtesting</groupId>
+            <artifactId>randomizedtesting-runner</artifactId>
+            <version>2.1.14</version>
+            <scope>test</scope>
         </dependency>
     </dependencies>

@@ -190,7 +170,5 @@
         <!-- Testing versions -->
         <easymock-version>3.0</easymock-version>
         <powermock-version>1.4.8</powermock-version>
-        <testng-version>5.7</testng-version>
-        <fastutil-version>6.5.11</fastutil-version>
     </properties>
 </project>
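
Note on the test-dependency swap above: with randomizedtesting-runner on the test classpath, a test class can extend RandomizedTest to get seed-driven, reproducible randomness (the failing seed is printed and can be replayed with -Dtests.seed=...). A minimal sketch of what a migrated test might look like; the class name and bounds are illustrative, not taken from this PR:

    import static org.junit.Assert.assertTrue;

    import org.junit.Test;

    import com.carrotsearch.randomizedtesting.RandomizedTest;

    import net.agkn.hll.HLL;

    // Hypothetical test class, not part of this PR.
    public class HLLRandomizedTest extends RandomizedTest {
        @Test
        public void cardinalityIsNonNegative() {
            final HLL hll = new HLL(13/*log2m*/, 5/*regwidth*/);
            final int n = randomIntBetween(1, 10000);
            for(int i=0; i<n; i++) {
                hll.addRaw(randomLong()); // raw 64-bit hashed values
            }
            assertTrue(hll.cardinality() >= 0L);
        }
    }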
112 changes: 61 additions & 51 deletions src/main/java/net/agkn/hll/HLL.java
@@ -18,8 +18,11 @@

 import java.util.Arrays;

-import it.unimi.dsi.fastutil.ints.Int2ByteOpenHashMap;
-import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
+import com.carrotsearch.hppc.IntByteHashMap;
+import com.carrotsearch.hppc.LongHashSet;
+import com.carrotsearch.hppc.cursors.IntByteCursor;
+import com.carrotsearch.hppc.cursors.LongCursor;

 import net.agkn.hll.serialization.HLLMetadata;
 import net.agkn.hll.serialization.IHLLMetadata;
 import net.agkn.hll.serialization.ISchemaVersion;
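
The import swap above also changes the iteration idiom: HPPC containers yield a reused cursor object per element instead of values directly, which is why the loops below read c.value and c.key. A self-contained sketch of the pattern, assuming only HPPC 0.7.x on the classpath:

    import com.carrotsearch.hppc.IntByteHashMap;
    import com.carrotsearch.hppc.LongHashSet;
    import com.carrotsearch.hppc.cursors.IntByteCursor;
    import com.carrotsearch.hppc.cursors.LongCursor;

    public class CursorIterationSketch {
        public static void main(final String[] args) {
            final LongHashSet set = new LongHashSet();
            set.add(42L);
            // The same cursor instance is reused across iterations;
            // read its fields, but do not retain the cursor itself.
            for(LongCursor c : set) {
                System.out.println(c.value);
            }

            final IntByteHashMap map = new IntByteHashMap();
            map.put(7, (byte) 3);
            // Map cursors expose the entry as c.key and c.value.
            for(IntByteCursor c : map) {
                System.out.println(c.key + " -> " + c.value);
            }
        }
    }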
@@ -79,11 +82,11 @@ public class HLL implements Cloneable {
     // ************************************************************************
     // Storage
     // storage used when #type is EXPLICIT, null otherwise
-    private LongOpenHashSet explicitStorage;
+    LongHashSet explicitStorage;
     // storage used when #type is SPARSE, null otherwise
-    private Int2ByteOpenHashMap sparseProbabilisticStorage;
+    IntByteHashMap sparseProbabilisticStorage;
     // storage used when #type is FULL, null otherwise
-    private BitVector probabilisticStorage;
+    BitVector probabilisticStorage;

     // current type of this HLL instance, if this changes then so should the
     // storage used (see above)
@@ -347,13 +350,13 @@ public void addRaw(final long rawValue) {
                 if(explicitStorage.size() > explicitThreshold) {
                     if(!sparseOff) {
                         initializeStorage(HLLType.SPARSE);
-                        for(final long value : explicitStorage) {
-                            addRawSparseProbabilistic(value);
+                        for (LongCursor c : explicitStorage) {
+                            addRawSparseProbabilistic(c.value);
                         }
                     } else {
                         initializeStorage(HLLType.FULL);
-                        for(final long value : explicitStorage) {
-                            addRawProbabilistic(value);
+                        for (LongCursor c : explicitStorage) {
+                            addRawProbabilistic(c.value);
                         }
                     }
                     explicitStorage = null;
@@ -366,8 +369,9 @@ public void addRaw(final long rawValue) {
                 // promotion, if necessary
                 if(sparseProbabilisticStorage.size() > sparseThreshold) {
                     initializeStorage(HLLType.FULL);
-                    for(final int registerIndex : sparseProbabilisticStorage.keySet()) {
-                        final byte registerValue = sparseProbabilisticStorage.get(registerIndex);
+                    for(IntByteCursor c : sparseProbabilisticStorage) {
+                        final int registerIndex = c.key;
+                        final byte registerValue = c.value;
                         probabilisticStorage.setMaxRegister(registerIndex, registerValue);
                     }
                     sparseProbabilisticStorage = null;
@@ -423,7 +427,7 @@ private void addRawSparseProbabilistic(final long rawValue) {
         // NOTE: no +1 as in paper since 0-based indexing
         final int j = (int)(rawValue & mBitsMask);

-        final byte currentValue = sparseProbabilisticStorage.get(j);
+        final byte currentValue = sparseProbabilisticStorage.getOrDefault(j, (byte) 0);
         if(p_w > currentValue) {
             sparseProbabilisticStorage.put(j, p_w);
         }
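
Worth noting: HPPC's get() already returns 0 for an absent key (the map's default value), so getOrDefault(j, (byte) 0) is behaviorally equivalent here and simply makes the fallback explicit at the call site. A tiny sketch of the semantics, assuming HPPC 0.7.x:

    final IntByteHashMap registers = new IntByteHashMap();

    // Both calls return 0 for a key that was never put():
    final byte implicitDefault = registers.get(123);                    // 0
    final byte explicitDefault = registers.getOrDefault(123, (byte) 0); // 0
    assert implicitDefault == explicitDefault;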
@@ -488,10 +492,10 @@ private void initializeStorage(final HLLType type) {
                 // nothing to be done
                 break;
             case EXPLICIT:
-                this.explicitStorage = new LongOpenHashSet();
+                this.explicitStorage = new LongHashSet();
                 break;
             case SPARSE:
-                this.sparseProbabilisticStorage = new Int2ByteOpenHashMap();
+                this.sparseProbabilisticStorage = new IntByteHashMap();
                 break;
             case FULL:
                 this.probabilisticStorage = new BitVector(regwidth, m);
@@ -541,7 +545,7 @@ public long cardinality() {
         double sum = 0;
         int numberOfZeroes = 0/*"V" in the paper*/;
         for(int j=0; j<m; j++) {
-            final long register = sparseProbabilisticStorage.get(j);
+            final long register = sparseProbabilisticStorage.getOrDefault(j, (byte) 0);

             sum += 1.0 / (1L << register);
             if(register == 0L) numberOfZeroes++;
@@ -607,10 +611,10 @@ public void clear() {
             case EMPTY:
                 return /*do nothing*/;
             case EXPLICIT:
-                explicitStorage.clear();
+                explicitStorage.release();
                 return;
             case SPARSE:
-                sparseProbabilisticStorage.clear();
+                sparseProbabilisticStorage.release();
                 return;
             case FULL:
                 probabilisticStorage.fill(0);
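
The clear()-to-release() change is about memory rather than behavior: in HPPC, clear() empties a container but keeps its internal buffers allocated for reuse, while release() also frees those buffers. Since these storage fields are thrown away when the HLL switches representation, releasing is the better fit. A short sketch of the distinction, assuming HPPC 0.7.x:

    final LongHashSet values = new LongHashSet();
    for(long i=0; i<100000; i++) {
        values.add(i);
    }

    values.clear();   // size() == 0, but the large internal arrays are retained
    values.release(); // size() == 0, and the internal arrays are freed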
@@ -683,8 +687,8 @@ public void union(final HLL other) {
                     } else {
                         initializeStorage(HLLType.FULL);
                     }
-                    for(final long value : other.explicitStorage) {
-                        addRaw(value);
+                    for(LongCursor c : other.explicitStorage) {
+                        addRaw(c.value);
                     }
                 }
                 return;
@@ -698,9 +702,10 @@ public void union(final HLL other) {
                     sparseProbabilisticStorage = other.sparseProbabilisticStorage.clone();
                 } else {
                     initializeStorage(HLLType.FULL);
-                    for(final int registerIndex : other.sparseProbabilisticStorage.keySet()) {
-                        final byte registerValue = other.sparseProbabilisticStorage.get(registerIndex);
-                        probabilisticStorage.setMaxRegister(registerIndex, registerValue);
+                    for(IntByteCursor c : other.sparseProbabilisticStorage) {
+                        final int registerIndex = c.key;
+                        final byte registerValue = c.value;
+                        probabilisticStorage.setMaxRegister(registerIndex, registerValue);
                     }
                 }
                 return;
@@ -739,17 +744,18 @@ public void union(final HLL other) {
                     sparseProbabilisticStorage = other.sparseProbabilisticStorage.clone();
                 } else {
                     initializeStorage(HLLType.FULL);
-                    for(final int registerIndex : other.sparseProbabilisticStorage.keySet()) {
-                        final byte registerValue = other.sparseProbabilisticStorage.get(registerIndex);
-                        probabilisticStorage.setMaxRegister(registerIndex, registerValue);
+                    for(IntByteCursor c : other.sparseProbabilisticStorage) {
+                        final int registerIndex = c.key;
+                        final byte registerValue = c.value;
+                        probabilisticStorage.setMaxRegister(registerIndex, registerValue);
                     }
                 }
             } else /*source is HLLType.FULL*/ {
                 type = HLLType.FULL;
                 probabilisticStorage = other.probabilisticStorage.clone();
             }
-            for(final long value : explicitStorage) {
-                addRaw(value);
+            for(LongCursor c : explicitStorage) {
+                addRaw(c.value);
             }
             explicitStorage = null;
             return;
@@ -760,8 +766,8 @@ public void union(final HLL other) {
                 // dest: SPARSE
                 // Add the raw values from the source to the destination.

-                for(final long value : other.explicitStorage) {
-                    addRaw(value);
+                for(LongCursor c : other.explicitStorage) {
+                    addRaw(c.value);
                 }
                 // NOTE: addRaw will handle promotion cleanup
             } else /*source is HLLType.FULL*/ {
@@ -774,9 +780,10 @@ public void union(final HLL other) {

                 type = HLLType.FULL;
                 probabilisticStorage = other.probabilisticStorage.clone();
-                for(final int registerIndex : sparseProbabilisticStorage.keySet()) {
-                    final byte registerValue = sparseProbabilisticStorage.get(registerIndex);
-                    probabilisticStorage.setMaxRegister(registerIndex, registerValue);
+                for(IntByteCursor c : sparseProbabilisticStorage) {
+                    final int registerIndex = c.key;
+                    final byte registerValue = c.value;
+                    probabilisticStorage.setMaxRegister(registerIndex, registerValue);
                 }
                 sparseProbabilisticStorage = null;
             }
@@ -789,21 +796,21 @@ public void union(final HLL other) {
                 // Add the raw values from the source to the destination.
                 // Promotion is not possible, so don't bother checking.

-                for(final long value : other.explicitStorage) {
-                    addRaw(value);
+                for(LongCursor c : other.explicitStorage) {
+                    addRaw(c.value);
                 }
             } else /*source is HLLType.SPARSE*/ {
                 // src: SPARSE
                 // dest: FULL
                 // Merge the registers from the source into the destination.
                 // Promotion is not possible, so don't bother checking.

-                for(final int registerIndex : other.sparseProbabilisticStorage.keySet()) {
-                    final byte registerValue = other.sparseProbabilisticStorage.get(registerIndex);
-                    probabilisticStorage.setMaxRegister(registerIndex, registerValue);
+                for(IntByteCursor c : other.sparseProbabilisticStorage) {
+                    final int registerIndex = c.key;
+                    final byte registerValue = c.value;
+                    probabilisticStorage.setMaxRegister(registerIndex, registerValue);
                 }
             }
-
         }
     }
 }
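
For reference while reading these union() hunks: union() mutates the receiver in place and performs any required type promotion (EXPLICIT to SPARSE to FULL) internally. A minimal usage sketch; hashOf() stands in for whatever 64-bit hash (e.g. MurmurHash3) the caller applies, and the parameters are illustrative:

    final HLL left = new HLL(13/*log2m*/, 5/*regwidth*/);
    final HLL right = new HLL(13, 5);

    left.addRaw(hashOf("alpha"));  // hashOf() is a hypothetical 64-bit hash
    right.addRaw(hashOf("beta"));

    left.union(right);             // left now estimates the union's cardinality
    final long estimate = left.cardinality();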
@@ -821,28 +828,26 @@ private void homogeneousUnion(final HLL other) {
             // union of empty and empty is empty
             return;
         case EXPLICIT:
-            for(final long value : other.explicitStorage) {
-                addRaw(value);
+            for(LongCursor c : other.explicitStorage) {
+                addRaw(c.value);
             }
             // NOTE: #addRaw() will handle promotion, if necessary
             return;
         case SPARSE:
-            for(final int registerIndex : other.sparseProbabilisticStorage.keySet()) {
-                final byte registerValue = other.sparseProbabilisticStorage.get(registerIndex);
-                final byte currentRegisterValue = sparseProbabilisticStorage.get(registerIndex);
-                if(registerValue > currentRegisterValue) {
-                    sparseProbabilisticStorage.put(registerIndex, registerValue);
-                }
+            for(IntByteCursor c : other.sparseProbabilisticStorage) {
+                final int registerIndex = c.key;
+                final byte registerValue = c.value;
+                final byte currentRegisterValue = sparseProbabilisticStorage.get(registerIndex);
+                if(registerValue > currentRegisterValue) {
+                    sparseProbabilisticStorage.put(registerIndex, registerValue);
+                }
             }

             // promotion, if necessary
             if(sparseProbabilisticStorage.size() > sparseThreshold) {
                 initializeStorage(HLLType.FULL);
-                for(final int registerIndex : sparseProbabilisticStorage.keySet()) {
-                    final byte registerValue = sparseProbabilisticStorage.get(registerIndex);
-                    probabilisticStorage.setMaxRegister(registerIndex, registerValue);
+                for(IntByteCursor c : sparseProbabilisticStorage) {
+                    final int registerIndex = c.key;
+                    final byte registerValue = c.value;
+                    probabilisticStorage.setMaxRegister(registerIndex, registerValue);
                 }
                 sparseProbabilisticStorage = null;
             }
@@ -887,7 +896,7 @@ public byte[] toBytes(final ISchemaVersion schemaVersion) {
                 final IWordSerializer serializer =
                     schemaVersion.getSerializer(type, Long.SIZE, explicitStorage.size());

-                final long[] values = explicitStorage.toLongArray();
+                final long[] values = explicitStorage.toArray();
                 Arrays.sort(values);
                 for(final long value : values) {
                     serializer.writeWord(value);
@@ -900,9 +909,10 @@ public byte[] toBytes(final ISchemaVersion schemaVersion) {
                 final IWordSerializer serializer =
                     schemaVersion.getSerializer(type, shortWordLength, sparseProbabilisticStorage.size());

-                final int[] indices = sparseProbabilisticStorage.keySet().toIntArray();
+                final int[] indices = sparseProbabilisticStorage.keys().toArray();
                 Arrays.sort(indices);
                 for(final int registerIndex : indices) {
+                    assert sparseProbabilisticStorage.containsKey(registerIndex);
                     final long registerValue = sparseProbabilisticStorage.get(registerIndex);
                     // pack index and value into "short word"
                     final long shortWord = ((registerIndex << regwidth) | registerValue);
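
One subtlety in the toBytes() hunks above: neither fastutil nor HPPC guarantees any particular toArray() order for hash containers, so the existing Arrays.sort() calls are what keep the serialized byte form canonical across runs. A small sketch of the pattern, assuming HPPC 0.7.x:

    final LongHashSet explicit = new LongHashSet();
    explicit.add(3L);
    explicit.add(1L);
    explicit.add(2L);

    final long[] values = explicit.toArray(); // hash order: unspecified
    java.util.Arrays.sort(values);            // canonical order for stable bytes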
6 changes: 3 additions & 3 deletions src/main/java/net/agkn/hll/util/HLLUtil.java
@@ -59,7 +59,7 @@ public final class HLLUtil {
      *
      * @see #largeEstimator(int, int, double)
      * @see #largeEstimatorCutoff(int, int)
-     * @see <a href='http://research.neustar.biz/2013/01/24/hyperloglog-googles-take-on-engineering-hll/'>Blog post with section on 2^L</a>
+     * @see "<a href='http://research.neustar.biz/2013/01/24/hyperloglog-googles-take-on-engineering-hll/'>Blog post with section on 2^L</a>"
      */
     private static final double[] TWO_TO_L = new double[(HLL.MAXIMUM_REGWIDTH_PARAM + 1) * (HLL.MAXIMUM_LOG2M_PARAM + 1)];

@@ -178,7 +178,7 @@ public static double smallEstimator(final int m, final int numberOfZeroes) {
      * @param registerSizeInBits the size of the HLL registers, in bits.
      * @return the cutoff for the large range correction.
      * @see #largeEstimator(int, int, double)
-     * @see <a href='http://research.neustar.biz/2013/01/24/hyperloglog-googles-take-on-engineering-hll/'>Blog post with section on 64 bit hashes and "large range correction" cutoff</a>
+     * @see "<a href='http://research.neustar.biz/2013/01/24/hyperloglog-googles-take-on-engineering-hll/'>Blog post with section on 64 bit hashes and 'large range correction' cutoff</a>"
      */
     public static double largeEstimatorCutoff(final int log2m, final int registerSizeInBits) {
         return (TWO_TO_L[(REG_WIDTH_INDEX_MULTIPLIER * registerSizeInBits) + log2m]) / 30.0;
@@ -193,7 +193,7 @@ public static double largeEstimatorCutoff(final int log2m, final int registerSizeInBits) {
      * @param registerSizeInBits the size of the HLL registers, in bits.
      * @param estimator the original estimator ("E" in the paper).
      * @return a corrected cardinality estimate.
-     * @see <a href='http://research.neustar.biz/2013/01/24/hyperloglog-googles-take-on-engineering-hll/'>Blog post with section on 64 bit hashes and "large range correction"</a>
+     * @see "<a href='http://research.neustar.biz/2013/01/24/hyperloglog-googles-take-on-engineering-hll/'>Blog post with section on 64 bit hashes and 'large range correction'</a>"
      */
     public static double largeEstimator(final int log2m, final int registerSizeInBits, final double estimator) {
         final double twoToL = TWO_TO_L[(REG_WIDTH_INDEX_MULTIPLIER * registerSizeInBits) + log2m];