Skip to content

Commit

Permalink
Merge pull request Qbeast-io#234 from alexeiakimov/backport-string-hi…
Browse files Browse the repository at this point in the history
…st-encoding-to-main

String hist encoding to main
  • Loading branch information
osopardo1 authored Nov 27, 2023
2 parents f066acf + 2152a20 commit 0cbf7aa
Show file tree
Hide file tree
Showing 18 changed files with 593 additions and 45 deletions.
52 changes: 19 additions & 33 deletions core/src/main/scala/io/qbeast/core/model/CubeId.scala
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ package io.qbeast.core.model
import io.qbeast.core.model.CubeId.{ChildrenIterator, Codec}

import java.nio.ByteBuffer
import java.util.Arrays
import scala.collection.immutable.BitSet
import scala.collection.mutable

Expand Down Expand Up @@ -95,20 +94,6 @@ object CubeId {
containers(point).drop(depth).next()
}

/**
 * Drops the trailing zero words of a bit mask.
 *
 * @param bitMask the bit mask as an array of 64-bit words
 * @return the same array instance when its last word is non-zero (or the array is empty),
 *         otherwise a new, shorter array ending at the last non-zero word
 */
private def trimBitMask(bitMask: Array[Long]): Array[Long] = {
  // Index of the last non-zero word, or -1 when every word is zero.
  val lastNonZero = bitMask.lastIndexWhere(_ != 0)
  if (lastNonZero == bitMask.length - 1) bitMask
  else bitMask.take(lastNonZero + 1)
}

private class ContainersIterator(point: Point, parent: Option[CubeId])
extends Iterator[CubeId] {

Expand Down Expand Up @@ -270,14 +255,17 @@ case class CubeId(dimensionCount: Int, depth: Int, bitMask: Array[Long])
* is less than, equal to, or greater than the other CubeId.
*/
override def compare(that: CubeId): Int = {
val thisBitset = BitSet.fromBitMaskNoCopy(bitMask)
val thatBitset = BitSet.fromBitMaskNoCopy(that.bitMask)
val commonDepth = math.min(depth, that.depth)
for (depthOffset <- 0.until(commonDepth * dimensionCount)) {
val firstBit = thisBitset.contains(depthOffset)
val secondBit = thatBitset.contains(depthOffset)
if (firstBit != secondBit) {
if (firstBit) {
require(
that.dimensionCount == dimensionCount,
"The two cubes must have the same dimension count.")
val thisBits = BitSet.fromBitMaskNoCopy(bitMask)
val thatBits = BitSet.fromBitMaskNoCopy(that.bitMask)
val end = dimensionCount * math.min(depth, that.depth)
for (i <- (0 until end)) {
val thisBit = thisBits.contains(i)
val thatBit = thatBits.contains(i)
if (thisBit != thatBit) {
if (thisBit) {
return 1
} else {
return -1
Expand All @@ -302,14 +290,13 @@ case class CubeId(dimensionCount: Int, depth: Int, bitMask: Array[Long])
require(
other.dimensionCount == dimensionCount,
"The two cubes must have the same dimension count.")

if (depth > other.depth) {
false
} else {
val end = dimensionCount * depth
val ancestorBitMask = BitSet.fromBitMaskNoCopy(other.bitMask).until(end).toBitMask
Arrays.equals(CubeId.trimBitMask(bitMask), CubeId.trimBitMask(ancestorBitMask))
return false
}
val end = dimensionCount * depth
val bits = BitSet.fromBitMaskNoCopy(bitMask)
val otherBits = BitSet.fromBitMask(other.bitMask).until(end)
bits == otherBits
}

/**
Expand Down Expand Up @@ -402,9 +389,8 @@ case class CubeId(dimensionCount: Int, depth: Int, bitMask: Array[Long])

override def equals(obj: Any): Boolean = obj match {
case other: CubeId =>
dimensionCount == other.dimensionCount && depth == other.depth && Arrays.equals(
CubeId.trimBitMask(bitMask),
CubeId.trimBitMask(other.bitMask))
dimensionCount == other.dimensionCount && depth == other.depth && BitSet.fromBitMaskNoCopy(
bitMask) == BitSet.fromBitMaskNoCopy(other.bitMask)
case _ => false
}

Expand All @@ -413,7 +399,7 @@ case class CubeId(dimensionCount: Int, depth: Int, bitMask: Array[Long])
var result = 1
result = prime * result + dimensionCount
result = prime * result + depth
result = prime * result + Arrays.hashCode(CubeId.trimBitMask(bitMask))
result = prime * result + BitSet.fromBitMaskNoCopy(bitMask).hashCode()
result
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,4 +30,5 @@ case class HashTransformation(nullValue: Any = Random.nextInt()) extends Transfo
override def isSupersededBy(newTransformation: Transformation): Boolean = false

override def merge(other: Transformation): Transformation = this

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
package io.qbeast.core.transform

import io.qbeast.core.model.QDataType

/**
 * A [[Transformation]] that is driven by a histogram of the values of the
 * associated column.
 */
trait HistogramTransformation extends Transformation {

  /**
   * The data type of the associated column.
   *
   * @return the QDataType of the column
   */
  def dataType: QDataType

  /**
   * The histogram of the associated column, reflecting how the column values
   * are distributed.
   *
   * @return the histogram bins
   */
  def histogram: IndexedSeq[Any]

  /**
   * Tells whether the histogram in use is the default one.
   *
   * @return true if the histogram is the default one
   */
  def isDefault: Boolean

  // The members below are re-declared from Transformation and remain abstract here.

  override def transform(value: Any): Double

  override def isSupersededBy(newTransformation: Transformation): Boolean

  override def merge(other: Transformation): Transformation
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
package io.qbeast.core.transform

import io.qbeast.core.model.{QDataType, StringDataType}

/**
 * Transformer type for histogram-based transformers, registered under the
 * simple name "histogram".
 */
object HistogramTransformer extends TransformerType {

  override def transformerSimpleName: String = "histogram"

  /**
   * Creates the histogram transformer for the given column.
   *
   * Only StringDataType is handled; any other data type makes this method throw
   * an Exception.
   *
   * @param columnName the name of the column
   * @param dataType the data type of the column
   * @return the transformer for the column
   */
  override def apply(columnName: String, dataType: QDataType): Transformer =
    dataType match {
      case StringDataType =>
        StringHistogramTransformer(columnName, dataType)
      case dt =>
        throw new Exception(s"DataType not supported for HistogramTransformers: $dt")
    }

  /**
   * The default histogram for strings: the single-letter strings "a" to "z".
   *
   * @return the 26 lowercase letters in ascending order
   */
  def defaultStringHistogram: IndexedSeq[String] = ('a' to 'z').map(_.toString)
}

/**
 * Base trait of the transformers whose type is the HistogramTransformer object.
 */
trait HistogramTransformer extends Transformer {

  override protected def transformerType: TransformerType = HistogramTransformer

  /**
   * The name of the column this transformer applies to.
   *
   * @return the column name
   */
  override def columnName: String

  /**
   * The statistics this transformer declares for its column.
   *
   * @return the column stats
   */
  override def stats: ColumnStats

  /**
   * Builds the Transformation from a row representation of the values.
   *
   * @param row function giving access to the values by name
   * @return the transformation
   */
  override def makeTransformation(row: String => Any): Transformation

}
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ case class LinearTransformer(columnName: String, dataType: QDataType) extends Tr
} else if (minAux == maxAux) {
// If both values are equal we return an IdentityTransformation
IdentityToZeroTransformation(minAux)
} else { // otherwhise we pick the min and max
} else { // otherwise we pick the min and max
val min = getValue(minAux)
val max = getValue(maxAux)
dataType match {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
package io.qbeast.core.transform

import com.fasterxml.jackson.core.{JsonFactory, JsonGenerator, JsonParser, TreeNode}
import com.fasterxml.jackson.databind.annotation.{JsonDeserialize, JsonSerialize}
import com.fasterxml.jackson.databind.deser.std.StdDeserializer
import com.fasterxml.jackson.databind.jsontype.TypeSerializer
import com.fasterxml.jackson.databind.node.ArrayNode
import com.fasterxml.jackson.databind.ser.std.StdSerializer
import com.fasterxml.jackson.databind.{DeserializationContext, SerializerProvider}
import io.qbeast.core.model.{QDataType, StringDataType}
import io.qbeast.core.transform.HistogramTransformer.defaultStringHistogram

import scala.collection.Searching._

/**
 * Histogram-based transformation for string columns.
 *
 * @param histogram the histogram bins; at least two are required. The bins are looked up with
 *                  a binary search, so they are assumed to be sorted in ascending order
 *                  (not validated here).
 */
@JsonSerialize(using = classOf[StringHistogramTransformationSerializer])
@JsonDeserialize(using = classOf[StringHistogramTransformationDeserializer])
case class StringHistogramTransformation(histogram: IndexedSeq[String])
    extends HistogramTransformation {
  require(histogram.length > 1, s"Histogram length has to be > 1: ${histogram.length}")

  override val dataType: QDataType = StringDataType

  override def isDefault: Boolean = histogram == defaultStringHistogram

  /**
   * Converts a value to a normalized position within the histogram.
   *
   * Strings are used as they are, null is treated as the string "null" and any other
   * value is converted with toString.
   *
   * @param value the value to convert
   * @return a real number between 0 and 1
   */
  override def transform(value: Any): Double = {
    val v: String = value match {
      case s: String => s
      case null => "null"
      case _ => value.toString
    }

    val lastIndex = histogram.length - 1
    histogram.search(v) match {
      // Exact bin match: the relative position of the bin.
      case Found(foundIndex) => foundIndex.toDouble / lastIndex
      case InsertionPoint(insertionPoint) =>
        // Smaller than every bin.
        if (insertionPoint == 0) 0d
        // Greater than every bin. The insertion point never exceeds histogram.length,
        // so comparing against length + 1 (as done previously) could never match.
        else if (insertionPoint >= histogram.length) 1d
        // In between: the relative position of the closest smaller bin.
        else (insertionPoint - 1).toDouble / lastIndex
    }
  }

  /**
   * Determines whether the new data should cause the creation of a new revision.
   *
   * A default histogram is superseded by any non-default one, a non-default histogram
   * is never superseded by the default one, and two non-default histograms supersede
   * each other only when they differ. Transformations of any other kind never supersede
   * this one.
   *
   * @param newTransformation the new transformation created with statistics over the new data
   * @return true if this transformation should be replaced by the new one
   */
  override def isSupersededBy(newTransformation: Transformation): Boolean =
    newTransformation match {
      case nt @ StringHistogramTransformation(hist) =>
        if (isDefault) !nt.isDefault
        else if (nt.isDefault) false
        else histogram != hist
      case _ => false
    }

  /**
   * Merges two transformations. No histograms are combined: the other transformation
   * wins when it is a StringHistogramTransformation, otherwise this one is kept.
   *
   * @param other the transformation to merge with
   * @return other if it is a StringHistogramTransformation, this otherwise
   */
  override def merge(other: Transformation): Transformation = other match {
    case _: StringHistogramTransformation => other
    case _ => this
  }

}

/**
 * Jackson serializer for StringHistogramTransformation. The transformation is written
 * as a JSON object with a "histogram" array of strings, optionally preceded by the
 * type id property.
 */
class StringHistogramTransformationSerializer
    extends StdSerializer[StringHistogramTransformation](classOf[StringHistogramTransformation]) {
  // Not used by this serializer; kept because it is a public member of the class.
  val jsonFactory = new JsonFactory()

  /**
   * Writes the transformation together with its type id, using the property name and
   * the id resolver supplied by the type serializer.
   *
   * @param value the transformation to write
   * @param gen the generator to write to
   * @param serializers the serializer provider (unused)
   * @param typeSer the type serializer providing the type property name and id
   */
  override def serializeWithType(
      value: StringHistogramTransformation,
      gen: JsonGenerator,
      serializers: SerializerProvider,
      typeSer: TypeSerializer): Unit = {
    gen.writeStartObject()
    gen.writeStringField(typeSer.getPropertyName, typeSer.getTypeIdResolver.idFromValue(value))
    writeHistogramField(value, gen)
    gen.writeEndObject()
  }

  /**
   * Writes the transformation without type information.
   *
   * @param value the transformation to write
   * @param gen the generator to write to
   * @param provider the serializer provider (unused)
   */
  override def serialize(
      value: StringHistogramTransformation,
      gen: JsonGenerator,
      provider: SerializerProvider): Unit = {
    gen.writeStartObject()
    writeHistogramField(value, gen)
    gen.writeEndObject()
  }

  // Writes the "histogram" field as an array of strings into the currently open object.
  private def writeHistogramField(
      value: StringHistogramTransformation,
      gen: JsonGenerator): Unit = {
    gen.writeFieldName("histogram")
    gen.writeStartArray()
    value.histogram.foreach(bin => gen.writeString(bin))
    gen.writeEndArray()
  }

}

/**
 * Jackson deserializer for StringHistogramTransformation. It reads the "histogram"
 * field of the JSON object, which has to be an array.
 */
class StringHistogramTransformationDeserializer
    extends StdDeserializer[StringHistogramTransformation](
      classOf[StringHistogramTransformation]) {

  /**
   * Reads a StringHistogramTransformation from the parser.
   *
   * @param p the parser positioned at the transformation object
   * @param ctxt the deserialization context (unused)
   * @return the transformation holding the text of every element of the "histogram" array
   * @throws IllegalArgumentException if the "histogram" field is missing or is not an array,
   *                                  or if it has fewer than two elements
   */
  override def deserialize(
      p: JsonParser,
      ctxt: DeserializationContext): StringHistogramTransformation = {
    val tree: TreeNode = p.getCodec.readTree(p)
    val histogram: IndexedSeq[String] = tree.get("histogram") match {
      case an: ArrayNode =>
        Vector.tabulate(an.size())(i => an.get(i).asText())
      case other =>
        // A missing field yields null and a non-array node does not match ArrayNode;
        // fail with an explicit message instead of an opaque MatchError.
        throw new IllegalArgumentException(
          s"Expected the 'histogram' field to be a JSON array but found: $other")
    }

    StringHistogramTransformation(histogram)
  }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
package io.qbeast.core.transform

import io.qbeast.core.model.QDataType
import io.qbeast.core.transform.HistogramTransformer.defaultStringHistogram

/**
 * Histogram transformer for string columns.
 *
 * @param columnName the name of the column
 * @param dataType the data type of the column
 */
case class StringHistogramTransformer(columnName: String, dataType: QDataType)
    extends HistogramTransformer {
  // Name under which the histogram of the column is looked up in the row.
  private val columnHistogram = s"${columnName}_histogram"

  /**
   * Returns the stats: a single entry named after the column histogram, whose SQL
   * predicate is the default string histogram written as an Array literal.
   *
   * @return the column stats
   */
  override def stats: ColumnStats = {
    val defaultArrayLiteral = defaultStringHistogram.mkString("Array('", "', '", "')")
    val names = List(columnHistogram)
    val predicates = List(s"$defaultArrayLiteral AS $columnHistogram")
    ColumnStats(statsNames = names, statsSqlPredicates = predicates)
  }

  /**
   * Builds the Transformation from a row representation of the values. When the
   * histogram entry of the row is a sequence its elements are used (converted with
   * toString); otherwise the default string histogram is used.
   *
   * @param row function giving access to the values by name
   * @return the transformation
   */
  override def makeTransformation(row: String => Any): Transformation = {
    val histogram: IndexedSeq[String] = row(columnHistogram) match {
      case values: Seq[_] => values.map(element => element.toString).toIndexedSeq
      case _ => defaultStringHistogram
    }

    StringHistogramTransformation(histogram)
  }

}
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@ import java.util.Locale
object Transformer {

private val transformersRegistry: Map[String, TransformerType] =
Seq(LinearTransformer, HashTransformer).map(a => (a.transformerSimpleName, a)).toMap
Seq(LinearTransformer, HashTransformer, HistogramTransformer)
.map(a => (a.transformerSimpleName, a))
.toMap

/**
* Returns the transformer for the given column and type of transformer
Expand Down
11 changes: 11 additions & 0 deletions core/src/test/scala/io/qbeast/core/model/CubeIdTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@ class CubeIdTest extends AnyFlatSpec with Matchers {
val id7 =
CubeId(2, "wQwwwQwwQwwQwwwwwQwQwwwQQwwwwQQwQwwwQwwQwwQwwwwwQwwQQQQQQQQQQQQQ")
id6 == id7 shouldBe true
val id8 =
CubeId(1, 4, Array(9L)).parent.get.parent.get.parent.get
val id9 = CubeId(1, 1, Array(1L))
id8 == id9 shouldBe true
}

it should "implement hashCode correctly" in {
Expand Down Expand Up @@ -153,6 +157,13 @@ class CubeIdTest extends AnyFlatSpec with Matchers {
id4.nextSibling shouldBe None
}

it should "implement children iterator throwing NoSuchElementException after last child" in {
  // The test expects the one-dimensional root to yield exactly two children:
  // the first child and its next sibling.
  val root = CubeId.root(1)
  val childIterator = root.children
  childIterator.next() shouldBe root.firstChild
  childIterator.next() shouldBe root.firstChild.nextSibling.get
  // Once exhausted, the iterator has to throw instead of returning a value.
  assertThrows[NoSuchElementException](childIterator.next())
}

it should "return a correct container with specified depth" in {
val point = Point(0.66, 0.83)
val id = CubeId.container(point, 2)
Expand Down
Loading

0 comments on commit 0cbf7aa

Please sign in to comment.