From 5d61ac32bf34a82846559ac7e2771cc26237016c Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Tue, 2 Jan 2024 16:16:25 -0800 Subject: [PATCH 1/3] Preparing PR for datasketches-java-kll_weighted_updates_group2 --- .../kll/KllDirectDoublesSketch.java | 4 +- .../kll/KllDirectFloatsSketch.java | 4 +- .../datasketches/kll/KllDoublesHelper.java | 33 ++-- .../datasketches/kll/KllDoublesSketch.java | 26 ++- .../kll/KllDoublesSketchSortedView.java | 2 +- .../kll/KllFloatsSketchSortedView.java | 2 +- .../kll/KllHeapDoublesSketch.java | 15 +- .../datasketches/kll/KllHeapFloatsSketch.java | 6 +- .../apache/datasketches/kll/KllHelper.java | 122 ++++++++------ .../datasketches/kll/KllItemsSketch.java | 2 +- .../kll/KllItemsSketchSortedView.java | 2 +- .../datasketches/kll/KllPreambleUtil.java | 6 +- .../apache/datasketches/kll/KllSketch.java | 16 +- .../datasketches/kll/KllMiscDoublesTest.java | 153 +++++++++++------- .../datasketches/kll/KllMiscFloatsTest.java | 2 +- 15 files changed, 229 insertions(+), 166 deletions(-) diff --git a/src/main/java/org/apache/datasketches/kll/KllDirectDoublesSketch.java b/src/main/java/org/apache/datasketches/kll/KllDirectDoublesSketch.java index 21a46069c..031a4e2f6 100644 --- a/src/main/java/org/apache/datasketches/kll/KllDirectDoublesSketch.java +++ b/src/main/java/org/apache/datasketches/kll/KllDirectDoublesSketch.java @@ -178,7 +178,7 @@ public long getN() { //restricted - @Override //returns updatable, expanded array including empty/garbage space at bottom + @Override //returns updatable, expanded array including free space at bottom double[] getDoubleItemsArray() { final int k = getK(); if (sketchStructure == COMPACT_EMPTY) { return new double[k]; } @@ -196,7 +196,7 @@ public long getN() { return doubleItemsArr; } - @Override //returns compact items array of retained items, no empty/garbage. + @Override //returns compact items array of retained items, no free space. double[] getDoubleRetainedItemsArray() { if (sketchStructure == COMPACT_EMPTY) { return new double[0]; } if (sketchStructure == COMPACT_SINGLE) { return new double[] { getDoubleSingleItem() }; } diff --git a/src/main/java/org/apache/datasketches/kll/KllDirectFloatsSketch.java b/src/main/java/org/apache/datasketches/kll/KllDirectFloatsSketch.java index 542eda596..80baf76c6 100644 --- a/src/main/java/org/apache/datasketches/kll/KllDirectFloatsSketch.java +++ b/src/main/java/org/apache/datasketches/kll/KllDirectFloatsSketch.java @@ -178,7 +178,7 @@ public long getN() { //restricted - @Override //returns updatable, expanded array including empty/garbage space at bottom + @Override //returns updatable, expanded array including free space at bottom float[] getFloatItemsArray() { final int k = getK(); if (sketchStructure == COMPACT_EMPTY) { return new float[k]; } @@ -196,7 +196,7 @@ float[] getFloatItemsArray() { return floatItemsArr; } - @Override //returns compact items array of retained items, no empty/garbage. + @Override //returns compact items array of retained items, no free space. float[] getFloatRetainedItemsArray() { if (sketchStructure == COMPACT_EMPTY) { return new float[0]; } if (sketchStructure == COMPACT_SINGLE) { return new float[] { getFloatSingleItem() }; } diff --git a/src/main/java/org/apache/datasketches/kll/KllDoublesHelper.java b/src/main/java/org/apache/datasketches/kll/KllDoublesHelper.java index e92709463..d10a93383 100644 --- a/src/main/java/org/apache/datasketches/kll/KllDoublesHelper.java +++ b/src/main/java/org/apache/datasketches/kll/KllDoublesHelper.java @@ -24,6 +24,8 @@ import static org.apache.datasketches.common.Util.isEven; import static org.apache.datasketches.common.Util.isOdd; import static org.apache.datasketches.kll.KllHelper.findLevelToCompact; +import static org.apache.datasketches.kll.KllSketch.DEFAULT_M; +import static org.apache.datasketches.kll.KllSketch.SketchStructure.UPDATABLE; import java.util.Arrays; import java.util.Random; @@ -313,30 +315,31 @@ private static void randomlyHalveUpDoubles(final double[] buf, final int start, } } - //Called from KllDoublesSketch::update and this + //Called from KllDoublesSketch::update and merge static void updateDouble(final KllDoublesSketch dblSk, final double item) { - if (Double.isNaN(item)) { return; } //ignore - if (dblSk.isEmpty()) { - dblSk.setMinItem(item); - dblSk.setMaxItem(item); - } else { - dblSk.setMinItem(min(dblSk.getMinItem(), item)); - dblSk.setMaxItem(max(dblSk.getMaxItem(), item)); - } - int level0space = dblSk.levelsArr[0]; - assert (level0space >= 0); - if (level0space == 0) { + int freeSpace = dblSk.levelsArr[0]; + assert (freeSpace >= 0); + if (freeSpace == 0) { compressWhileUpdatingSketch(dblSk); - level0space = dblSk.levelsArr[0]; - assert (level0space > 0); + freeSpace = dblSk.levelsArr[0]; + assert (freeSpace > 0); } dblSk.incN(); dblSk.setLevelZeroSorted(false); - final int nextPos = level0space - 1; + final int nextPos = freeSpace - 1; dblSk.setLevelsArrayAt(0, nextPos); dblSk.setDoubleItemsArrayAt(nextPos, item); } + static void updateDouble(final KllDoublesSketch dblSk, final double item, final int weight) { + if (weight < dblSk.getLevelsArray(UPDATABLE)[0]) { + for (int i = 0; i < weight; i++) { dblSk.update(item); } + } else { + final KllHeapDoublesSketch tmpSk = new KllHeapDoublesSketch(dblSk.getK(), DEFAULT_M, item, weight); + dblSk.merge(tmpSk); + } + } + /** * Compression algorithm used to merge higher levels. *

Here is what we do for each level:

diff --git a/src/main/java/org/apache/datasketches/kll/KllDoublesSketch.java b/src/main/java/org/apache/datasketches/kll/KllDoublesSketch.java index d2f3dc38a..3f1206fa2 100644 --- a/src/main/java/org/apache/datasketches/kll/KllDoublesSketch.java +++ b/src/main/java/org/apache/datasketches/kll/KllDoublesSketch.java @@ -307,33 +307,37 @@ public byte[] toByteArray() { } @Override - public String toString(final boolean withSummary, final boolean withData) { + public String toString(final boolean withSummary, final boolean withDetail) { KllSketch sketch = this; - if (withData && sketchStructure != UPDATABLE) { + if (withDetail && sketchStructure != UPDATABLE) { final Memory mem = getWritableMemory(); assert mem != null; sketch = KllDoublesSketch.heapify(getWritableMemory()); } - return KllHelper.toStringImpl(sketch, withSummary, withData, getSerDe()); + return KllHelper.toStringImpl(sketch, withSummary, withDetail, getSerDe()); } @Override public void update(final double item) { + if (Double.isNaN(item)) { return; } //ignore if (readOnly) { throw new SketchesArgumentException(TGT_IS_READ_ONLY_MSG); } + updateMinMax(item); KllDoublesHelper.updateDouble(this, item); kllDoublesSV = null; } /** - * Updates this sketch with the given item the number of times specified by the given weight. + * Weighted update. Updates this sketch with the given item the number of times specified by the given weight. * @param item the item to be repeated. NaNs are ignored. * @param weight the number of times the update of item is to be repeated. It must be ≥ one. */ - public void weightedUpdate(final double item, final int weight) { + public void update(final double item, final int weight) { + if (Double.isNaN(item)) { return; } //ignore if (readOnly) { throw new SketchesArgumentException(TGT_IS_READ_ONLY_MSG); } if (weight < 1) { throw new SketchesArgumentException("Weight is less than one."); } if (Double.isNaN(item)) { return; } //ignore - KllHeapDoublesSketch.weightedUpdateDouble(this, item, weight); + updateMinMax(item); + KllDoublesHelper.updateDouble(this, item, weight); kllDoublesSV = null; } @@ -403,4 +407,14 @@ private final void refreshSortedView() { abstract void setMinItem(double item); + private void updateMinMax(final double item) { + if (isEmpty()) { + setMinItem(item); + setMaxItem(item); + } else { + setMinItem(min(getMinItem(), item)); + setMaxItem(max(getMaxItem(), item)); + } + } + } diff --git a/src/main/java/org/apache/datasketches/kll/KllDoublesSketchSortedView.java b/src/main/java/org/apache/datasketches/kll/KllDoublesSketchSortedView.java index cac663695..13f0a9df0 100644 --- a/src/main/java/org/apache/datasketches/kll/KllDoublesSketchSortedView.java +++ b/src/main/java/org/apache/datasketches/kll/KllDoublesSketchSortedView.java @@ -77,7 +77,7 @@ public KllDoublesSketchSortedView(final KllDoublesSketch sketch) { if (!sketch.hasMemory()) { sketch.setLevelZeroSorted(true); } } - final int numQuantiles = srcLevels[srcNumLevels] - srcLevels[0]; //remove garbage + final int numQuantiles = srcLevels[srcNumLevels] - srcLevels[0]; //remove free space quantiles = new double[numQuantiles]; cumWeights = new long[numQuantiles]; populateFromSketch(srcQuantiles, srcLevels, srcNumLevels, numQuantiles); diff --git a/src/main/java/org/apache/datasketches/kll/KllFloatsSketchSortedView.java b/src/main/java/org/apache/datasketches/kll/KllFloatsSketchSortedView.java index ebad5f397..db13bc19e 100644 --- a/src/main/java/org/apache/datasketches/kll/KllFloatsSketchSortedView.java +++ b/src/main/java/org/apache/datasketches/kll/KllFloatsSketchSortedView.java @@ -77,7 +77,7 @@ public KllFloatsSketchSortedView(final KllFloatsSketch sketch) { if (!sketch.hasMemory()) { sketch.setLevelZeroSorted(true); } } - final int numQuantiles = srcLevels[srcNumLevels] - srcLevels[0]; //remove garbage + final int numQuantiles = srcLevels[srcNumLevels] - srcLevels[0]; //remove free space quantiles = new float[numQuantiles]; cumWeights = new long[numQuantiles]; populateFromSketch(srcQuantiles, srcLevels, srcNumLevels, numQuantiles); diff --git a/src/main/java/org/apache/datasketches/kll/KllHeapDoublesSketch.java b/src/main/java/org/apache/datasketches/kll/KllHeapDoublesSketch.java index 8e0ef93d5..0065b4a8c 100644 --- a/src/main/java/org/apache/datasketches/kll/KllHeapDoublesSketch.java +++ b/src/main/java/org/apache/datasketches/kll/KllHeapDoublesSketch.java @@ -136,10 +136,10 @@ else if (memStructure == COMPACT_FULL) { maxDoubleItem = srcMem.getDouble(offsetBytes); offsetBytes += Double.BYTES; final int capacityItems = levelsArr[getNumLevels()]; - final int garbageItems = levelsArr[0]; - final int retainedItems = capacityItems - garbageItems; + final int freeItems = levelsArr[0]; + final int retainedItems = capacityItems - freeItems; doubleItems = new double[capacityItems]; - srcMem.getDoubleArray(offsetBytes, doubleItems, garbageItems, retainedItems); + srcMem.getDoubleArray(offsetBytes, doubleItems, freeItems, retainedItems); } else { //(memStructure == UPDATABLE) int offsetBytes = DATA_START_ADR; @@ -301,13 +301,4 @@ void setNumLevels(final int numLevels) { @Override void setWritableMemory(final WritableMemory wmem) { } - static void weightedUpdateDouble(final KllDoublesSketch dblSk, final double item, final int weight) { - if (weight < dblSk.getLevelsArray(UPDATABLE)[0]) { - for (int i = 0; i < weight; i++) { dblSk.update(item); } - } else { - final KllHeapDoublesSketch tmpSk = new KllHeapDoublesSketch(dblSk.getK(), DEFAULT_M, item, weight); - dblSk.merge(tmpSk); - } - } - } diff --git a/src/main/java/org/apache/datasketches/kll/KllHeapFloatsSketch.java b/src/main/java/org/apache/datasketches/kll/KllHeapFloatsSketch.java index 472871854..545e02c8f 100644 --- a/src/main/java/org/apache/datasketches/kll/KllHeapFloatsSketch.java +++ b/src/main/java/org/apache/datasketches/kll/KllHeapFloatsSketch.java @@ -117,10 +117,10 @@ else if (memStructure == COMPACT_FULL) { maxFloatItem = srcMem.getFloat(offsetBytes); offsetBytes += Float.BYTES; final int capacityItems = levelsArr[getNumLevels()]; - final int garbageItems = levelsArr[0]; - final int retainedItems = capacityItems - garbageItems; + final int freeItems = levelsArr[0]; + final int retainedItems = capacityItems - freeItems; floatItems = new float[capacityItems]; - srcMem.getFloatArray(offsetBytes, floatItems, garbageItems, retainedItems); + srcMem.getFloatArray(offsetBytes, floatItems, freeItems, retainedItems); } else { //(memStructure == UPDATABLE) int offsetBytes = DATA_START_ADR; diff --git a/src/main/java/org/apache/datasketches/kll/KllHelper.java b/src/main/java/org/apache/datasketches/kll/KllHelper.java index 6ef4197d2..7eb681329 100644 --- a/src/main/java/org/apache/datasketches/kll/KllHelper.java +++ b/src/main/java/org/apache/datasketches/kll/KllHelper.java @@ -365,10 +365,10 @@ private static String outputData(final KllSketch sketch) { final int k = sketch.getK(); final int m = sketch.getM(); final StringBuilder sb = new StringBuilder(); - sb.append("### KllSketch itemsArray & levelsArray data:").append(LS); + sb.append(LS + "### KLL ItemsArray & LevelsArray Detail:").append(LS); sb.append("Index, Value").append(LS); if (levelsArr[0] > 0) { - final String gbg = " Empty or Garbage, size = " + levelsArr[0]; + final String gbg = " Free Space, Size = " + levelsArr[0]; for (int i = 0; i < levelsArr[0]; i++) { sb.append(" ").append(i + ", ").append(sketch.getItemAsString(i)); if (i == 0) { sb.append(gbg); } @@ -381,10 +381,10 @@ private static String outputData(final KllSketch sketch) { final int toIndex = levelsArr[level + 1]; // exclusive String lvlData = ""; if (fromIndex < toIndex) { - lvlData = " level[" + level + "]=" + levelsArr[level] - + ", cap=" + KllHelper.levelCapacity(k, numLevels, level, m) - + ", size=" + KllHelper.currentLevelSizeItems(level, numLevels, levelsArr) - + ", wt=" + (1 << level) + LS; + lvlData = " Level[" + level + "]=" + levelsArr[level] + + ", Cap=" + KllHelper.levelCapacity(k, numLevels, level, m) + + ", Size=" + KllHelper.currentLevelSizeItems(level, numLevels, levelsArr) + + ", Wt=" + (1 << level) + LS; } for (int i = fromIndex; i < toIndex; i++) { @@ -393,10 +393,25 @@ private static String outputData(final KllSketch sketch) { } level++; } - sb.append(" ----------level[" + level + "]=" + levelsArr[level] + ": itemsArray[].length"); + sb.append(" ----------Level[" + level + "]=" + levelsArr[level] + ": ItemsArray[].length"); sb.append(LS); - sb.append("### End data").append(LS); + sb.append("### End ItemsArray & LevelsArray Detail").append(LS); + return sb.toString(); + } + static String outputLevels(final int k, final int m, final int numLevels, final int[] levelsArr) { + final StringBuilder sb = new StringBuilder(); + sb.append(LS + "### KLL Levels Array:").append(LS) + .append(" Level, Offset: Nominal Capacity, Actual Capacity").append(LS); + int level = 0; + for ( ; level < numLevels; level++) { + sb.append(" ").append(level).append(", ").append(levelsArr[level]).append(": ") + .append(KllHelper.levelCapacity(k, numLevels, level, m)) + .append(", ").append(KllHelper.currentLevelSizeItems(level, numLevels, levelsArr)).append(LS); + } + sb.append(" ").append(level).append(", ").append(levelsArr[level]).append(": ----ItemsArray[].length") + .append(LS); + sb.append("### End Levels Array").append(LS); return sb.toString(); } @@ -479,55 +494,58 @@ static byte[] toByteArray(final KllSketch srcSk, final boolean updatable) { static String toStringImpl(final KllSketch sketch, final boolean withSummary, final boolean withData, final ArrayOfItemsSerDe serDe) { - final SketchType sketchType = sketch.sketchType; - final boolean hasMemory = sketch.hasMemory(); + final StringBuilder sb = new StringBuilder(); final int k = sketch.getK(); final int m = sketch.getM(); - final long n = sketch.getN(); final int numLevels = sketch.getNumLevels(); final int[] fullLevelsArr = sketch.getLevelsArray(UPDATABLE); - //final int[] levelsArr = sketch.getLevelsArray(sketch.sketchStructure); - final String epsPct = String.format("%.3f%%", sketch.getNormalizedRankError(false) * 100); - final String epsPMFPct = String.format("%.3f%%", sketch.getNormalizedRankError(true) * 100); - final boolean compact = sketch.isCompactMemoryFormat(); - final StringBuilder sb = new StringBuilder(); - final String directStr = hasMemory ? "Direct" : ""; - final String compactStr = compact ? "Compact" : ""; - final String readOnlyStr = sketch.isReadOnly() ? "true" + ("(" + (compact ? "Format" : "Memory") + ")") : "false"; - final String skTypeStr = sketchType.getName(); - final String className = "Kll" + directStr + compactStr + skTypeStr; - - sb.append(LS).append("### ").append(className).append(" Summary:").append(LS); - sb.append(" K : ").append(k).append(LS); - sb.append(" Dynamic min K : ").append(sketch.getMinK()).append(LS); - sb.append(" M : ").append(m).append(LS); - sb.append(" N : ").append(n).append(LS); - sb.append(" Epsilon : ").append(epsPct).append(LS); - sb.append(" Epsilon PMF : ").append(epsPMFPct).append(LS); - sb.append(" Empty : ").append(sketch.isEmpty()).append(LS); - sb.append(" Estimation Mode : ").append(sketch.isEstimationMode()).append(LS); - sb.append(" Levels : ").append(numLevels).append(LS); - sb.append(" Level 0 Sorted : ").append(sketch.isLevelZeroSorted()).append(LS); - sb.append(" Capacity Items : ").append(fullLevelsArr[numLevels]).append(LS); - sb.append(" Retained Items : ").append(sketch.getNumRetained()).append(LS); - sb.append(" Empty/Garbage Items : ").append(sketch.levelsArr[0]).append(LS); - sb.append(" ReadOnly : ").append(readOnlyStr).append(LS); - if (sketchType != ITEMS_SKETCH) { - sb.append(" Updatable Storage Bytes: ").append(sketch.currentSerializedSizeBytes(true)).append(LS); - } - sb.append(" Compact Storage Bytes : ").append(sketch.currentSerializedSizeBytes(false)).append(LS); - - final String emptyStr = (sketchType == ITEMS_SKETCH) ? "Null" : "NaN"; - - sb.append(" Min Item : ").append(sketch.isEmpty() ? emptyStr : sketch.getMinItemAsString()) - .append(LS); - sb.append(" Max Item : ").append(sketch.isEmpty() ? emptyStr : sketch.getMaxItemAsString()) - .append(LS); - sb.append("### End sketch summary").append(LS); - - if (! withSummary) { sb.setLength(0); } - if (withData) { sb.append(outputData(sketch)); } + if (withSummary) { + final SketchType sketchType = sketch.sketchType; + final boolean hasMemory = sketch.hasMemory(); + final long n = sketch.getN(); + final String epsPct = String.format("%.3f%%", sketch.getNormalizedRankError(false) * 100); + final String epsPMFPct = String.format("%.3f%%", sketch.getNormalizedRankError(true) * 100); + final boolean compact = sketch.isCompactMemoryFormat(); + + final String directStr = hasMemory ? "Direct" : ""; + final String compactStr = compact ? "Compact" : ""; + final String readOnlyStr = sketch.isReadOnly() ? "true" + ("(" + (compact ? "Format" : "Memory") + ")") : "false"; + final String skTypeStr = sketchType.getName(); + final String className = "Kll" + directStr + compactStr + skTypeStr; + + sb.append(LS + "### ").append(className).append(" Summary:").append(LS); + sb.append(" K : ").append(k).append(LS); + sb.append(" Dynamic min K : ").append(sketch.getMinK()).append(LS); + sb.append(" M : ").append(m).append(LS); + sb.append(" N : ").append(n).append(LS); + sb.append(" Epsilon : ").append(epsPct).append(LS); + sb.append(" Epsilon PMF : ").append(epsPMFPct).append(LS); + sb.append(" Empty : ").append(sketch.isEmpty()).append(LS); + sb.append(" Estimation Mode : ").append(sketch.isEstimationMode()).append(LS); + sb.append(" Levels : ").append(numLevels).append(LS); + sb.append(" Level 0 Sorted : ").append(sketch.isLevelZeroSorted()).append(LS); + sb.append(" Capacity Items : ").append(fullLevelsArr[numLevels]).append(LS); + sb.append(" Retained Items : ").append(sketch.getNumRetained()).append(LS); + sb.append(" Free Space : ").append(sketch.levelsArr[0]).append(LS); + sb.append(" ReadOnly : ").append(readOnlyStr).append(LS); + if (sketchType != ITEMS_SKETCH) { + sb.append(" Updatable Storage Bytes: ").append(sketch.currentSerializedSizeBytes(true)).append(LS); + } + sb.append(" Compact Storage Bytes : ").append(sketch.currentSerializedSizeBytes(false)).append(LS); + + final String emptyStr = (sketchType == ITEMS_SKETCH) ? "Null" : "NaN"; + + sb.append(" Min Item : ").append(sketch.isEmpty() ? emptyStr : sketch.getMinItemAsString()) + .append(LS); + sb.append(" Max Item : ").append(sketch.isEmpty() ? emptyStr : sketch.getMaxItemAsString()) + .append(LS); + sb.append("### End sketch summary").append(LS); + } + if (withData) { + sb.append(outputLevels(k, m, numLevels, fullLevelsArr)); + sb.append(outputData(sketch)); + } return sb.toString(); } diff --git a/src/main/java/org/apache/datasketches/kll/KllItemsSketch.java b/src/main/java/org/apache/datasketches/kll/KllItemsSketch.java index 589c1fa30..6911a6013 100644 --- a/src/main/java/org/apache/datasketches/kll/KllItemsSketch.java +++ b/src/main/java/org/apache/datasketches/kll/KllItemsSketch.java @@ -337,7 +337,7 @@ private final KllItemsSketchSortedView refreshSortedView() { /** * @return a full array of items as if the sketch was in COMPACT_FULL or UPDATABLE format. - * This will include zeros and possibly some garbage items. + * This will include zeros and possibly some free space. */ abstract T[] getTotalItemsArray(); diff --git a/src/main/java/org/apache/datasketches/kll/KllItemsSketchSortedView.java b/src/main/java/org/apache/datasketches/kll/KllItemsSketchSortedView.java index fffb5d704..ee1278826 100644 --- a/src/main/java/org/apache/datasketches/kll/KllItemsSketchSortedView.java +++ b/src/main/java/org/apache/datasketches/kll/KllItemsSketchSortedView.java @@ -103,7 +103,7 @@ public class KllItemsSketchSortedView implements GenericSortedView, Partit if (!sketch.hasMemory()) { sketch.setLevelZeroSorted(true); } } - final int numQuantiles = srcLevels[srcNumLevels] - srcLevels[0]; //remove garbage + final int numQuantiles = srcLevels[srcNumLevels] - srcLevels[0]; //remove free space quantiles = (T[]) Array.newInstance(sketch.serDe.getClassOfT(), numQuantiles); cumWeights = new long[numQuantiles]; populateFromSketch(srcQuantiles, srcLevels, srcNumLevels, numQuantiles); diff --git a/src/main/java/org/apache/datasketches/kll/KllPreambleUtil.java b/src/main/java/org/apache/datasketches/kll/KllPreambleUtil.java index 97d71494b..805ba23a4 100644 --- a/src/main/java/org/apache/datasketches/kll/KllPreambleUtil.java +++ b/src/main/java/org/apache/datasketches/kll/KllPreambleUtil.java @@ -67,13 +67,13 @@ * in the table below. * The 5 int preamble is followed by the levelsArr int[numLevels] as bytes, * followed by the min and max values as bytes, - * followed by a packed items data array as bytes. There are no empty or garbage slots in this structure. + * followed by a packed items data array as bytes. There are no free slots in this structure. * It is not updatable. * It is identified by the enum SketchStructure.COMPACT_FULL. * *
  • A serialized, n > 1 non-compact, updatable structure requires 20 bytes of preamble (5 ints). * This is followed by the LevelsArr int[NumLevels + 1], followed by the min and max values, and then - * followed by an items data array that may include empty or garbage slots. It is updatable. + * followed by an items data array that may include free slots. It is updatable. * The details of these fields can be found in the code.. * It is identified by the enum SketchStructure.UPDATABLE. This structure may not be implemented by * some sketches.
  • @@ -300,7 +300,7 @@ static String toString(final Memory mem, final SketchType sketchType, final sb.append("<<>>").append(LS); } - sb.append("ALL DATA (including empty & garbage data)").append(LS); + sb.append("ALL DATA (including free space)").append(LS); final int itemsSpace = (sketchBytes - offsetBytes) / typeBytes; if (sketchType == DOUBLES_SKETCH) { for (int i = 0; i < itemsSpace; i++) { diff --git a/src/main/java/org/apache/datasketches/kll/KllSketch.java b/src/main/java/org/apache/datasketches/kll/KllSketch.java index 684cfd841..bbe4ce807 100644 --- a/src/main/java/org/apache/datasketches/kll/KllSketch.java +++ b/src/main/java/org/apache/datasketches/kll/KllSketch.java @@ -53,7 +53,7 @@ * The data for level i lies in positions levelsArr[i] through levelsArr[i + 1] - 1 inclusive. * Hence, the levelsArr must contain (numLevels + 1) elements. * The valid portion of the itemsArr is completely packed and sorted, except for level 0, - * which is filled from the top down. Any items below the index levelsArr[0] is garbage and will be + * which is filled from the top down. Any items below the index levelsArr[0] is free space and will be * overwritten by subsequent updates. * * Invariants: @@ -287,11 +287,11 @@ public final String toString() { /** * Returns a summary of the sketch as a string. * @param withSummary if true includes sketch summary information - * @param withData if true include sketch data + * @param withDetail if true include detail of levels array and items array * @return string representation of sketch summary */ - public String toString(final boolean withSummary, final boolean withData) { - return KllHelper.toStringImpl(this, withSummary, withData, getSerDe()); + public String toString(final boolean withSummary, final boolean withDetail) { + return KllHelper.toStringImpl(this, withSummary, withDetail, getSerDe()); } //restricted @@ -390,14 +390,14 @@ final int getNumLevels() { /** * Gets the serialized byte array of the valid retained items as a byte array. - * It does not include the preamble, the levels array, minimum or maximum items, or garbage data. + * It does not include the preamble, the levels array, minimum or maximum items, or free space. * @return the serialized bytes of the retained data. */ abstract byte[] getRetainedItemsByteArr(); /** * Gets the size in bytes of the valid retained items. - * It does not include the preamble, the levels array, minimum or maximum items, or garbage data. + * It does not include the preamble, the levels array, minimum or maximum items, or free space. * @return the size of the retained data in bytes. */ abstract int getRetainedItemsSizeBytes(); @@ -423,7 +423,7 @@ final int getNumLevels() { /** * Gets the serialized byte array of the entire internal items hypothetical structure. * It does not include the preamble, the levels array, or minimum or maximum items. - * It may include empty or garbage items. + * It may include empty or free space. * @return the serialized bytes of the retained data. */ abstract byte[] getTotalItemsByteArr(); @@ -431,7 +431,7 @@ final int getNumLevels() { /** * Gets the size in bytes of the entire internal items hypothetical structure. * It does not include the preamble, the levels array, or minimum or maximum items. - * It may include empty or garbage items. + * It may include empty or free space. * @return the size of the retained data in bytes. */ abstract int getTotalItemsNumBytes(); diff --git a/src/test/java/org/apache/datasketches/kll/KllMiscDoublesTest.java b/src/test/java/org/apache/datasketches/kll/KllMiscDoublesTest.java index 59a845fc7..046e82d76 100644 --- a/src/test/java/org/apache/datasketches/kll/KllMiscDoublesTest.java +++ b/src/test/java/org/apache/datasketches/kll/KllMiscDoublesTest.java @@ -22,6 +22,7 @@ import static org.apache.datasketches.common.Util.bitAt; import static org.apache.datasketches.kll.KllHelper.getGrowthSchemeForGivenN; import static org.apache.datasketches.kll.KllSketch.SketchType.DOUBLES_SKETCH; +import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertFalse; import static org.testng.Assert.assertTrue; @@ -168,50 +169,124 @@ public void visualCheckToString() { assertEquals(sk2.getNumRetained(), 56); } - //Disable this test for releases @Test //set static enablePrinting = true for visual checking public void viewHeapCompactions() { int k = 20; int n = 108; boolean withSummary = false; - boolean withData = true; + boolean withDetail = true; int compaction = 0; - WritableMemory wmem = WritableMemory.allocate(1 << 20); - MemoryRequestServer memReqSvr = new DefaultMemoryRequestServer(); + KllDoublesSketch sk = KllDoublesSketch.newHeapInstance(k); + for (int i = 1; i <= n; i++) { + sk.update(i); + if (sk.levelsArr[0] == 0) { + println(LS + "#<<< BEFORE COMPACTION # " + (++compaction) + " >>>"); + println(sk.toString(withSummary, withDetail)); + sk.update(++i); + println(LS + "#<<< AFTER COMPACTION # " + (compaction) + " >>>"); + println(sk.toString(withSummary, withDetail)); + assertEquals(sk.getDoubleItemsArray()[sk.levelsArr[0]], i); + } + } + println(LS + "#<<< END STATE # >>>"); + println(sk.toString(withSummary, withDetail)); + println(""); + } + + @Test //set static enablePrinting = true for visual checking + public void viewDirectCompactions() { + int k = 20; + int n = 108; + boolean withSummary = false; + boolean withDetail = true; + int compaction = 0; + int sizeBytes = KllSketch.getMaxSerializedSizeBytes(k, n, DOUBLES_SKETCH, true); + WritableMemory wmem = WritableMemory.allocate(sizeBytes); KllDoublesSketch sk = KllDoublesSketch.newDirectInstance(k, wmem, memReqSvr); for (int i = 1; i <= n; i++) { sk.update(i); if (sk.levelsArr[0] == 0) { println(LS + "#<<< BEFORE COMPACTION # " + (++compaction) + " >>>"); - println(sk.toString(withSummary, withData)); + println(sk.toString(withSummary, withDetail)); sk.update(++i); println(LS + "#<<< AFTER COMPACTION # " + (compaction) + " >>>"); - println(sk.toString(withSummary, withData)); + println(sk.toString(withSummary, withDetail)); assertEquals(sk.getDoubleItemsArray()[sk.levelsArr[0]], i); } } println(LS + "#<<< END STATE # >>>"); - println(sk.toString(withSummary, withData)); + println(sk.toString(withSummary, withDetail)); println(""); } @Test //set static enablePrinting = true for visual checking - public void checkWeightedUpdates() { + public void viewCompactionAndSortedView() { + int n = 43; + KllDoublesSketch sk = KllDoublesSketch.newHeapInstance(20); + for (int i = 1; i <= n; i++) { sk.update(i); } + println(sk.toString(true, true)); + DoublesSortedView sv = sk.getSortedView(); + DoublesSortedViewIterator itr = sv.iterator(); + println("### SORTED VIEW"); + printf("%12s%12s\n", "Value", "Weight"); + long[] correct = {2,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}; + int i = 0; + while (itr.next()) { + double v = itr.getQuantile(); + long wt = itr.getWeight(); + printf("%12.1f%12d\n", v, wt); + assertEquals(wt, correct[i++]); + } + } + + @Test //set static enablePrinting = true for visual checking + public void checkWeightedUpdates1() { int k = 20; - int n1 = 0; int weight = 127; double item = 10.0; KllDoublesSketch sk = KllDoublesSketch.newHeapInstance(k); println(sk.toString(true, true)); - sk.weightedUpdate(item, weight); -// sk.weightedUpdate(item, n2); -// println(sk.toString(true, true)); -// assertEquals(sk.getNumRetained(), 8); -// assertEquals(sk.getN(), 216); + sk.update(item, weight); + println(sk.toString(true, true)); + assertEquals(sk.getNumRetained(), 7); + assertEquals(sk.getN(), 127); + sk.update(item, weight); + println(sk.toString(true, true)); + assertEquals(sk.getNumRetained(), 14); + assertEquals(sk.getN(), 254); } @Test //set static enablePrinting = true for visual checking - public void checkCreateItemsArray() { + public void checkWeightedUpdates2() { + int k = 20; + int initial = 1000; + int weight = 127; + double item = 10.0; + KllDoublesSketch sk = KllDoublesSketch.newHeapInstance(k); + for (int i = 1; i <= initial; i++) { sk.update(i + 1000); } + println(sk.toString(true, true)); + sk.update(item, weight); + println(sk.toString(true, true)); + assertEquals(sk.getNumRetained(), 65); + assertEquals(sk.getN(), 1127); + + DoublesSortedViewIterator itr = sk.getSortedView().iterator(); + println("### SORTED VIEW"); + printf("%12s %12s %12s\n", "Value", "Weight", "NaturalRank"); + long cumWt = 0; + while (itr.next()) { + double v = itr.getQuantile(); + long wt = itr.getWeight(); + long natRank = itr.getNaturalRank(INCLUSIVE); + cumWt += wt; + assertEquals(cumWt, natRank); + printf("%12.1f %12d %12d\n", v, wt, natRank); + } + assertEquals(cumWt, sk.getN()); + } + + @Test //set static enablePrinting = true for visual checking + public void checkCreateItemsArray() { //used with weighted updates double item = 10.0; int weight = 108; double[] itemsArr = KllDoublesHelper.createItemsArray(item, weight); @@ -233,7 +308,7 @@ private static void outputItems(double[] itemsArr) { } @Test //set static enablePrinting = true for visual checking - public void checkCreateLevelsArray() { + public void checkCreateLevelsArray() { //used with weighted updates int weight = 108; int[] levelsArr = KllHelper.createLevelsArray(weight); assertEquals(levelsArr.length, 8); @@ -267,13 +342,13 @@ public void viewMemorySketchData() { int k = 20; int n = 109; boolean withSummary = true; - boolean withData = true; + boolean withDetail = true; KllDoublesSketch sk = KllDoublesSketch.newHeapInstance(k); for (int i = 1; i <= n; i++) { sk.update(i); } byte[] byteArr = sk.toByteArray(); Memory mem = Memory.wrap(byteArr); KllDoublesSketch ddSk = KllDoublesSketch.wrap(mem); - println(ddSk.toString(withSummary, withData)); + println(ddSk.toString(withSummary, withDetail)); assertEquals(ddSk.getN(), n); } @@ -324,44 +399,6 @@ public void checkIntCapAuxAux() { } } - @Test //set static enablePrinting = true for visual checking - public void viewDirectCompactions() { - int k = 20; - int n = 108; - int sizeBytes = KllSketch.getMaxSerializedSizeBytes(k, n, DOUBLES_SKETCH, true); - WritableMemory wmem = WritableMemory.allocate(sizeBytes); - KllDoublesSketch sk = KllDoublesSketch.newDirectInstance(k, wmem, memReqSvr); - for (int i = 1; i <= n; i++) { - sk.update(i); - if (sk.levelsArr[0] == 0) { - println(sk.toString(true, true)); - sk.update(++i); - println(sk.toString(true, true)); - assertEquals(sk.getDoubleItemsArray()[sk.levelsArr[0]], i); - } - } - } - - @Test //set static enablePrinting = true for visual checking - public void viewCompactionAndSortedView() { - int n = 43; - KllDoublesSketch sk = KllDoublesSketch.newHeapInstance(20); - for (int i = 1; i <= n; i++) { sk.update(i); } - println(sk.toString(true, true)); - DoublesSortedView sv = sk.getSortedView(); - DoublesSortedViewIterator itr = sv.iterator(); - println("### SORTED VIEW"); - printf("%12s%12s\n", "Value", "CumWeight"); - long[] correct = {2,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}; - int i = 0; - while (itr.next()) { - double v = itr.getQuantile(); - long wt = itr.getWeight(); - printf("%12.1f%12d\n", v, wt); - assertEquals(wt, correct[i++]); - } - } - @Test public void checkGrowLevels() { KllDoublesSketch sk = KllDoublesSketch.newHeapInstance(20); @@ -650,7 +687,7 @@ public void checkMemoryToStringDoubleUpdatable() { wmem = WritableMemory.writableWrap(upBytes2); s = KllPreambleUtil.toString(wmem, DOUBLES_SKETCH, true); println("step 2: memory to heap sketch, to byte[]/memory & analyze memory. Should match above"); - println(s); //note: heapify does not copy garbage, while toUpdatableByteArray does + println(s); //note: heapify does not copy free space, while toUpdatableByteArray does assertEquals(sk.getN(), sk2.getN()); assertEquals(sk.getMinItem(), sk2.getMinItem()); assertEquals(sk.getMaxItem(), sk2.getMaxItem()); @@ -736,7 +773,7 @@ public void printlnTest() { printf("%s\n", s); } - private final static boolean enablePrinting = false; + private final static boolean enablePrinting = true; /** * @param format the format diff --git a/src/test/java/org/apache/datasketches/kll/KllMiscFloatsTest.java b/src/test/java/org/apache/datasketches/kll/KllMiscFloatsTest.java index a7b699cd5..6cc495575 100644 --- a/src/test/java/org/apache/datasketches/kll/KllMiscFloatsTest.java +++ b/src/test/java/org/apache/datasketches/kll/KllMiscFloatsTest.java @@ -506,7 +506,7 @@ public void checkMemoryToStringFloatUpdatable() { wmem = WritableMemory.writableWrap(upBytes2); s = KllPreambleUtil.toString(wmem, FLOATS_SKETCH, true); println("step 2: memory to heap sketch, to byte[]/memory & analyze memory. Should match above"); - println(s); //note: heapify does not copy garbage, while toUpdatableByteArray does + println(s); //note: heapify does not copy free space, while toUpdatableByteArray does assertEquals(sk.getN(), sk2.getN()); assertEquals(sk.getMinItem(), sk2.getMinItem()); assertEquals(sk.getMaxItem(), sk2.getMaxItem()); From 00cdbe5ca597c2a3fd42c2e3386a8a5017ed0ed6 Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Wed, 3 Jan 2024 11:36:28 -0800 Subject: [PATCH 2/3] minor test update --- .../java/org/apache/datasketches/kll/KllMiscDoublesTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/java/org/apache/datasketches/kll/KllMiscDoublesTest.java b/src/test/java/org/apache/datasketches/kll/KllMiscDoublesTest.java index 046e82d76..8e3c615fe 100644 --- a/src/test/java/org/apache/datasketches/kll/KllMiscDoublesTest.java +++ b/src/test/java/org/apache/datasketches/kll/KllMiscDoublesTest.java @@ -249,7 +249,7 @@ public void checkWeightedUpdates1() { sk.update(item, weight); println(sk.toString(true, true)); assertEquals(sk.getNumRetained(), 7); - assertEquals(sk.getN(), 127); + assertEquals(sk.getN(), weight); sk.update(item, weight); println(sk.toString(true, true)); assertEquals(sk.getNumRetained(), 14); From 1db34d9bd7bb1813e8800fa2856f91ae589a29f3 Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Wed, 3 Jan 2024 16:18:01 -0800 Subject: [PATCH 3/3] minor updates to documentation or variable names. --- .../org/apache/datasketches/kll/KllHeapDoublesSketch.java | 6 +++--- .../org/apache/datasketches/kll/KllHeapFloatsSketch.java | 6 +++--- .../java/org/apache/datasketches/kll/KllPreambleUtil.java | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/main/java/org/apache/datasketches/kll/KllHeapDoublesSketch.java b/src/main/java/org/apache/datasketches/kll/KllHeapDoublesSketch.java index 0065b4a8c..c83bd8d19 100644 --- a/src/main/java/org/apache/datasketches/kll/KllHeapDoublesSketch.java +++ b/src/main/java/org/apache/datasketches/kll/KllHeapDoublesSketch.java @@ -136,10 +136,10 @@ else if (memStructure == COMPACT_FULL) { maxDoubleItem = srcMem.getDouble(offsetBytes); offsetBytes += Double.BYTES; final int capacityItems = levelsArr[getNumLevels()]; - final int freeItems = levelsArr[0]; - final int retainedItems = capacityItems - freeItems; + final int freeSpace = levelsArr[0]; + final int retainedItems = capacityItems - freeSpace; doubleItems = new double[capacityItems]; - srcMem.getDoubleArray(offsetBytes, doubleItems, freeItems, retainedItems); + srcMem.getDoubleArray(offsetBytes, doubleItems, freeSpace, retainedItems); } else { //(memStructure == UPDATABLE) int offsetBytes = DATA_START_ADR; diff --git a/src/main/java/org/apache/datasketches/kll/KllHeapFloatsSketch.java b/src/main/java/org/apache/datasketches/kll/KllHeapFloatsSketch.java index 545e02c8f..9b2595b96 100644 --- a/src/main/java/org/apache/datasketches/kll/KllHeapFloatsSketch.java +++ b/src/main/java/org/apache/datasketches/kll/KllHeapFloatsSketch.java @@ -117,10 +117,10 @@ else if (memStructure == COMPACT_FULL) { maxFloatItem = srcMem.getFloat(offsetBytes); offsetBytes += Float.BYTES; final int capacityItems = levelsArr[getNumLevels()]; - final int freeItems = levelsArr[0]; - final int retainedItems = capacityItems - freeItems; + final int freeSpace = levelsArr[0]; + final int retainedItems = capacityItems - freeSpace; floatItems = new float[capacityItems]; - srcMem.getFloatArray(offsetBytes, floatItems, freeItems, retainedItems); + srcMem.getFloatArray(offsetBytes, floatItems, freeSpace, retainedItems); } else { //(memStructure == UPDATABLE) int offsetBytes = DATA_START_ADR; diff --git a/src/main/java/org/apache/datasketches/kll/KllPreambleUtil.java b/src/main/java/org/apache/datasketches/kll/KllPreambleUtil.java index 805ba23a4..367dde647 100644 --- a/src/main/java/org/apache/datasketches/kll/KllPreambleUtil.java +++ b/src/main/java/org/apache/datasketches/kll/KllPreambleUtil.java @@ -67,13 +67,13 @@ * in the table below. * The 5 int preamble is followed by the levelsArr int[numLevels] as bytes, * followed by the min and max values as bytes, - * followed by a packed items data array as bytes. There are no free slots in this structure. + * followed by a packed items data array as bytes. There is no free space in this structure. * It is not updatable. * It is identified by the enum SketchStructure.COMPACT_FULL. * *
  • A serialized, n > 1 non-compact, updatable structure requires 20 bytes of preamble (5 ints). * This is followed by the LevelsArr int[NumLevels + 1], followed by the min and max values, and then - * followed by an items data array that may include free slots. It is updatable. + * followed by an items data array that may include free space. It is updatable. * The details of these fields can be found in the code.. * It is identified by the enum SketchStructure.UPDATABLE. This structure may not be implemented by * some sketches.