forked from Qbeast-io/qbeast-spark
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request Qbeast-io#234 from alexeiakimov/backport-string-hist-encoding-to-main
String hist encoding to main
- Loading branch information
Showing
18 changed files
with
593 additions
and
45 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
29 changes: 29 additions & 0 deletions
29
core/src/main/scala/io/qbeast/core/transform/HistogramTransformation.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
package io.qbeast.core.transform | ||
|
||
import io.qbeast.core.model.QDataType | ||
|
||
/**
 * A [[Transformation]] that carries a histogram of the associated column.
 */
trait HistogramTransformation extends Transformation {

  /**
   * The QDataType of the column this transformation is associated with.
   */
  def dataType: QDataType

  /**
   * The histogram of the associated column, reflecting how the column values are distributed.
   *
   * @return the histogram
   */
  def histogram: IndexedSeq[Any]

  /**
   * Tells whether the associated histogram is the default one.
   *
   * @return true if the histogram is the default one
   */
  def isDefault: Boolean

  override def transform(value: Any): Double

  override def isSupersededBy(newTransformation: Transformation): Boolean

  override def merge(other: Transformation): Transformation
}
43 changes: 43 additions & 0 deletions
43
core/src/main/scala/io/qbeast/core/transform/HistogramTransformer.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
package io.qbeast.core.transform | ||
|
||
import io.qbeast.core.model.{QDataType, StringDataType} | ||
|
||
/**
 * Transformer type whose simple name is "histogram".
 */
object HistogramTransformer extends TransformerType {

  override def transformerSimpleName: String = "histogram"

  /**
   * Creates the histogram transformer for a column.
   *
   * Only [[StringDataType]] is handled; any other data type results in an Exception.
   *
   * @param columnName the name of the column
   * @param dataType the data type of the column
   * @return the transformer for the column
   */
  override def apply(columnName: String, dataType: QDataType): Transformer =
    dataType match {
      case StringDataType => StringHistogramTransformer(columnName, dataType)
      case unsupported =>
        throw new Exception(s"DataType not supported for HistogramTransformers: $unsupported")
    }

  /**
   * Default histogram for strings: the single-letter strings "a" to "z".
   */
  def defaultStringHistogram: IndexedSeq[String] = ('a' to 'z').map(_.toString)
}
|
||
/**
 * A [[Transformer]] whose transformer type is [[HistogramTransformer]].
 */
trait HistogramTransformer extends Transformer {

  override protected def transformerType: TransformerType = HistogramTransformer

  /**
   * The name of the column.
   *
   * @return the column name
   */
  override def columnName: String

  /**
   * The stats of the column.
   *
   * @return the column stats
   */
  override def stats: ColumnStats

  /**
   * Builds the Transformation from a row representation of the values.
   *
   * @param row the values
   * @return the transformation
   */
  override def makeTransformation(row: String => Any): Transformation

}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
130 changes: 130 additions & 0 deletions
130
core/src/main/scala/io/qbeast/core/transform/StringHistogramTransformation.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,130 @@ | ||
package io.qbeast.core.transform | ||
|
||
import com.fasterxml.jackson.core.{JsonFactory, JsonGenerator, JsonParser, TreeNode} | ||
import com.fasterxml.jackson.databind.annotation.{JsonDeserialize, JsonSerialize} | ||
import com.fasterxml.jackson.databind.deser.std.StdDeserializer | ||
import com.fasterxml.jackson.databind.jsontype.TypeSerializer | ||
import com.fasterxml.jackson.databind.node.ArrayNode | ||
import com.fasterxml.jackson.databind.ser.std.StdSerializer | ||
import com.fasterxml.jackson.databind.{DeserializationContext, SerializerProvider} | ||
import io.qbeast.core.model.{QDataType, StringDataType} | ||
import io.qbeast.core.transform.HistogramTransformer.defaultStringHistogram | ||
|
||
import scala.collection.Searching._ | ||
|
||
/**
 * Histogram-based transformation for string columns.
 *
 * @param histogram the histogram of the column; it must contain more than one element.
 *   `transform` runs a binary search over it, so it is assumed to be sorted in
 *   ascending order (NOTE(review): not enforced here, confirm at the producers).
 */
@JsonSerialize(using = classOf[StringHistogramTransformationSerializer])
@JsonDeserialize(using = classOf[StringHistogramTransformationDeserializer])
case class StringHistogramTransformation(histogram: IndexedSeq[String])
    extends HistogramTransformation {
  require(histogram.length > 1, s"Histogram length has to be > 1: ${histogram.length}")

  override val dataType: QDataType = StringDataType

  override def isDefault: Boolean = histogram == defaultStringHistogram

  /**
   * Converts a value to a normalized position within the histogram.
   *
   * Non-string values are converted with `toString` and `null` is treated as the
   * string "null". A value found in the histogram maps to its index divided by the
   * last index; a value that is not found maps to the position of the closest
   * smaller element, clamped to 0 below the first element and to 1 above the last.
   *
   * @param value the value to convert
   * @return a real number between 0 and 1
   */
  override def transform(value: Any): Double = {
    val v: String = value match {
      case s: String => s
      case null => "null"
      case _ => value.toString
    }

    val lastIndex = histogram.length - 1
    histogram.search(v) match {
      case Found(foundIndex) => foundIndex.toDouble / lastIndex
      case InsertionPoint(insertionPoint) =>
        // The insertion point ranges over [0, histogram.length]; histogram.length
        // means the value is greater than every element of the histogram.
        if (insertionPoint == 0) 0d
        else if (insertionPoint == histogram.length) 1d
        else (insertionPoint - 1).toDouble / lastIndex
    }
  }

  /**
   * Determines whether the new transformation supersedes this one, which would cause
   * the creation of a new revision.
   *
   * A default histogram is superseded by any non-default one, a non-default histogram
   * is never superseded by the default one, and two non-default histograms supersede
   * each other when they differ. Transformations of any other type never supersede
   * this one.
   *
   * @param newTransformation the new transformation created with statistics over the new data
   * @return true if this transformation is superseded by newTransformation
   */
  override def isSupersededBy(newTransformation: Transformation): Boolean =
    newTransformation match {
      case nt @ StringHistogramTransformation(hist) =>
        if (isDefault) !nt.isDefault
        else if (nt.isDefault) false
        else histogram != hist
      case _ => false
    }

  /**
   * Merges two transformations. Histograms are not combined: the other transformation
   * is returned when it is a StringHistogramTransformation, otherwise this one is kept.
   *
   * @param other Transformation
   * @return other if it is a StringHistogramTransformation, this otherwise
   */
  override def merge(other: Transformation): Transformation = other match {
    case _: StringHistogramTransformation => other
    case _ => this
  }

}
|
||
/**
 * JSON serializer for [[StringHistogramTransformation]]. The histogram is written as
 * an array of strings under the "histogram" field.
 */
class StringHistogramTransformationSerializer
    extends StdSerializer[StringHistogramTransformation](classOf[StringHistogramTransformation]) {
  // Not used within this class; kept because it is a public member.
  val jsonFactory = new JsonFactory()

  /**
   * Writes the transformation as a JSON object that starts with the type id, stored
   * under the property name given by the type serializer, followed by the histogram.
   *
   * @param value the transformation to serialize
   * @param gen the generator to write to
   * @param serializers the serializer provider (unused)
   * @param typeSer the type serializer providing the property name and the type id
   */
  override def serializeWithType(
      value: StringHistogramTransformation,
      gen: JsonGenerator,
      serializers: SerializerProvider,
      typeSer: TypeSerializer): Unit = {
    gen.writeStartObject()
    gen.writeStringField(typeSer.getPropertyName, typeSer.getTypeIdResolver.idFromValue(value))
    writeHistogramField(value, gen)
    gen.writeEndObject()
  }

  /**
   * Writes the transformation as a JSON object containing only the histogram.
   *
   * @param value the transformation to serialize
   * @param gen the generator to write to
   * @param provider the serializer provider (unused)
   */
  override def serialize(
      value: StringHistogramTransformation,
      gen: JsonGenerator,
      provider: SerializerProvider): Unit = {
    gen.writeStartObject()
    writeHistogramField(value, gen)
    gen.writeEndObject()
  }

  // Writes the "histogram" field as an array of strings; shared by both entry points.
  private def writeHistogramField(
      value: StringHistogramTransformation,
      gen: JsonGenerator): Unit = {
    gen.writeFieldName("histogram")
    gen.writeStartArray()
    value.histogram.foreach(bin => gen.writeString(bin))
    gen.writeEndArray()
  }

}
|
||
/**
 * JSON deserializer for [[StringHistogramTransformation]]. The histogram is read from
 * the array stored under the "histogram" field.
 */
class StringHistogramTransformationDeserializer
    extends StdDeserializer[StringHistogramTransformation](
      classOf[StringHistogramTransformation]) {

  /**
   * Reads the JSON tree at the parser position and builds the transformation from
   * the text of every element of its "histogram" array.
   *
   * An input-mismatch error is reported through the deserialization context when the
   * "histogram" field is missing or is not an array.
   *
   * @param p the parser positioned at the transformation
   * @param ctxt the deserialization context
   * @return the deserialized transformation
   */
  override def deserialize(
      p: JsonParser,
      ctxt: DeserializationContext): StringHistogramTransformation = {
    val tree: TreeNode = p.getCodec.readTree(p)
    // Both the tree and the field may be null, so they are wrapped before matching.
    val histogramNode = Option(tree).flatMap(node => Option(node.get("histogram")))

    val histogram: IndexedSeq[String] = histogramNode match {
      case Some(an: ArrayNode) => (0 until an.size()).map(i => an.get(i).asText())
      case _ =>
        // Always throws; replaces the MatchError of a non-exhaustive match.
        ctxt.reportInputMismatch[IndexedSeq[String]](
          classOf[StringHistogramTransformation],
          "Expected field 'histogram' to be a JSON array of strings")
    }

    StringHistogramTransformation(histogram)
  }

}
37 changes: 37 additions & 0 deletions
37
core/src/main/scala/io/qbeast/core/transform/StringHistogramTransformer.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
package io.qbeast.core.transform | ||
|
||
import io.qbeast.core.model.QDataType | ||
import io.qbeast.core.transform.HistogramTransformer.defaultStringHistogram | ||
|
||
/**
 * Histogram transformer for string columns.
 *
 * @param columnName the name of the column
 * @param dataType the data type of the column
 */
case class StringHistogramTransformer(columnName: String, dataType: QDataType)
    extends HistogramTransformer {
  // Name of the stat that holds the histogram of the column.
  private val columnHistogram = s"${columnName}_histogram"

  /**
   * Stats of the column: a single stat, named after the column, whose SQL predicate
   * is the default string histogram written as an array literal.
   *
   * @return the column stats
   */
  override def stats: ColumnStats = {
    val histogramSqlArray = defaultStringHistogram.mkString("Array('", "', '", "')")
    val predicate = s"$histogramSqlArray AS $columnHistogram"
    ColumnStats(statsNames = List(columnHistogram), statsSqlPredicates = List(predicate))
  }

  /**
   * Builds the Transformation from a row representation of the values. The histogram
   * is taken from the row when it holds a sequence for this column, and the default
   * string histogram is used otherwise.
   *
   * @param row the values
   * @return the transformation
   */
  override def makeTransformation(row: String => Any): Transformation =
    row(columnHistogram) match {
      case values: Seq[_] =>
        StringHistogramTransformation(values.map(entry => entry.toString).toIndexedSeq)
      case _ =>
        StringHistogramTransformation(defaultStringHistogram)
    }

}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.