Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Backport string hist encoding to main #234

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 19 additions & 33 deletions core/src/main/scala/io/qbeast/core/model/CubeId.scala
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ package io.qbeast.core.model
import io.qbeast.core.model.CubeId.{ChildrenIterator, Codec}

import java.nio.ByteBuffer
import java.util.Arrays
import scala.collection.immutable.BitSet
import scala.collection.mutable

Expand Down Expand Up @@ -95,20 +94,6 @@ object CubeId {
containers(point).drop(depth).next()
}

/**
 * Drops trailing zero words from a bit mask.
 *
 * @param bitMask the bit mask words
 * @return the same array instance when its last word is non-zero (or the array is
 *         empty), otherwise a new array holding the words up to and including
 *         the last non-zero one
 */
private def trimBitMask(bitMask: Array[Long]): Array[Long] = {
  // Number of leading words to keep: index of the last non-zero word plus one.
  val significantWords = bitMask.lastIndexWhere(_ != 0) + 1
  if (significantWords == bitMask.length) bitMask
  else bitMask.take(significantWords)
}

private class ContainersIterator(point: Point, parent: Option[CubeId])
extends Iterator[CubeId] {

Expand Down Expand Up @@ -270,14 +255,17 @@ case class CubeId(dimensionCount: Int, depth: Int, bitMask: Array[Long])
* is less than, equal to, or greater than the other CubeId.
*/
override def compare(that: CubeId): Int = {
val thisBitset = BitSet.fromBitMaskNoCopy(bitMask)
val thatBitset = BitSet.fromBitMaskNoCopy(that.bitMask)
val commonDepth = math.min(depth, that.depth)
for (depthOffset <- 0.until(commonDepth * dimensionCount)) {
val firstBit = thisBitset.contains(depthOffset)
val secondBit = thatBitset.contains(depthOffset)
if (firstBit != secondBit) {
if (firstBit) {
require(
that.dimensionCount == dimensionCount,
"The two cubes must have the same dimension count.")
val thisBits = BitSet.fromBitMaskNoCopy(bitMask)
val thatBits = BitSet.fromBitMaskNoCopy(that.bitMask)
val end = dimensionCount * math.min(depth, that.depth)
for (i <- (0 until end)) {
val thisBit = thisBits.contains(i)
val thatBit = thatBits.contains(i)
if (thisBit != thatBit) {
if (thisBit) {
return 1
} else {
return -1
Expand All @@ -302,14 +290,13 @@ case class CubeId(dimensionCount: Int, depth: Int, bitMask: Array[Long])
require(
other.dimensionCount == dimensionCount,
"The two cubes must have the same dimension count.")

if (depth > other.depth) {
false
} else {
val end = dimensionCount * depth
val ancestorBitMask = BitSet.fromBitMaskNoCopy(other.bitMask).until(end).toBitMask
Arrays.equals(CubeId.trimBitMask(bitMask), CubeId.trimBitMask(ancestorBitMask))
return false
}
val end = dimensionCount * depth
val bits = BitSet.fromBitMaskNoCopy(bitMask)
val otherBits = BitSet.fromBitMask(other.bitMask).until(end)
bits == otherBits
}

/**
Expand Down Expand Up @@ -402,9 +389,8 @@ case class CubeId(dimensionCount: Int, depth: Int, bitMask: Array[Long])

override def equals(obj: Any): Boolean = obj match {
case other: CubeId =>
dimensionCount == other.dimensionCount && depth == other.depth && Arrays.equals(
CubeId.trimBitMask(bitMask),
CubeId.trimBitMask(other.bitMask))
dimensionCount == other.dimensionCount && depth == other.depth && BitSet.fromBitMaskNoCopy(
bitMask) == BitSet.fromBitMaskNoCopy(other.bitMask)
case _ => false
}

Expand All @@ -413,7 +399,7 @@ case class CubeId(dimensionCount: Int, depth: Int, bitMask: Array[Long])
var result = 1
result = prime * result + dimensionCount
result = prime * result + depth
result = prime * result + Arrays.hashCode(CubeId.trimBitMask(bitMask))
result = prime * result + BitSet.fromBitMaskNoCopy(bitMask).hashCode()
result
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,4 +30,5 @@ case class HashTransformation(nullValue: Any = Random.nextInt()) extends Transfo
override def isSupersededBy(newTransformation: Transformation): Boolean = false

override def merge(other: Transformation): Transformation = this

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
package io.qbeast.core.transform

import io.qbeast.core.model.QDataType

/**
* Contract for transformations that are backed by a histogram of column values.
* Implementations supply the column data type, the histogram itself and a flag
* telling whether that histogram is the default one.
*/
trait HistogramTransformation extends Transformation {

/**
* QDataType for the associated column.
*/
def dataType: QDataType

/**
* Histogram of the associated column that reflects the distribution of the column values.
* @return the histogram values as an indexed sequence
*/
def histogram: IndexedSeq[Any]

/**
* Determines whether the associated histogram is the default one
* @return true if the histogram is the default one, false otherwise
*/
def isDefault: Boolean

// The members below are redeclared without a body, so every concrete
// histogram transformation has to provide its own implementation.
override def transform(value: Any): Double

override def isSupersededBy(newTransformation: Transformation): Boolean

override def merge(other: Transformation): Transformation
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
package io.qbeast.core.transform

import io.qbeast.core.model.{QDataType, StringDataType}

/**
 * Factory for histogram-based transformers, registered under the simple name "histogram".
 */
object HistogramTransformer extends TransformerType {
  override def transformerSimpleName: String = "histogram"

  /**
   * Creates the histogram transformer that matches the column data type.
   *
   * @param columnName the name of the column
   * @param dataType the data type of the column; only StringDataType is supported
   * @return a StringHistogramTransformer for String columns
   * @throws IllegalArgumentException if the data type is not supported
   */
  override def apply(columnName: String, dataType: QDataType): Transformer = dataType match {
    case StringDataType => StringHistogramTransformer(columnName, dataType)
    case dt =>
      // An unsupported data type is a bad argument, so signal it with a specific exception type.
      throw new IllegalArgumentException(
        s"DataType not supported for HistogramTransformers: $dt")
  }

  // Built once: the default histogram is compared against on every isDefault call.
  private val cachedDefaultStringHistogram: IndexedSeq[String] =
    ('a' to 'z').map(_.toString)

  /**
   * The default String histogram: the single-letter strings "a" to "z".
   */
  def defaultStringHistogram: IndexedSeq[String] = cachedDefaultStringHistogram
}

/**
* Base trait for histogram-based transformers. It fixes the transformer type to
* the HistogramTransformer object and leaves the remaining members to be
* implemented by concrete transformers.
*/
trait HistogramTransformer extends Transformer {

// All histogram transformers report the HistogramTransformer object as their type.
override protected def transformerType: TransformerType = HistogramTransformer

/**
* Returns the name of the column
*
* @return the column name
*/
override def columnName: String

/**
* Returns the stats
*
* @return the column stats required by this transformer
*/
override def stats: ColumnStats

/**
* Returns the Transformation given a row representation of the values
*
* @param row the values
* @return the transformation
*/
override def makeTransformation(row: String => Any): Transformation

}
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ case class LinearTransformer(columnName: String, dataType: QDataType) extends Tr
} else if (minAux == maxAux) {
// If both values are equal we return an IdentityTransformation
IdentityToZeroTransformation(minAux)
} else { // otherwhise we pick the min and max
} else { // otherwise we pick the min and max
val min = getValue(minAux)
val max = getValue(maxAux)
dataType match {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
package io.qbeast.core.transform

import com.fasterxml.jackson.core.{JsonFactory, JsonGenerator, JsonParser, TreeNode}
import com.fasterxml.jackson.databind.annotation.{JsonDeserialize, JsonSerialize}
import com.fasterxml.jackson.databind.deser.std.StdDeserializer
import com.fasterxml.jackson.databind.jsontype.TypeSerializer
import com.fasterxml.jackson.databind.node.ArrayNode
import com.fasterxml.jackson.databind.ser.std.StdSerializer
import com.fasterxml.jackson.databind.{DeserializationContext, SerializerProvider}
import io.qbeast.core.model.{QDataType, StringDataType}
import io.qbeast.core.transform.HistogramTransformer.defaultStringHistogram

import scala.collection.Searching._

/**
 * Transformation that maps String values to the range [0, 1] by locating them
 * in a histogram of String values.
 *
 * @param histogram the histogram values; must contain more than one element.
 *                  NOTE(review): transform uses a binary search, so the values are
 *                  presumably expected to be sorted in natural String order -
 *                  confirm against the code that produces the histogram.
 */
@JsonSerialize(using = classOf[StringHistogramTransformationSerializer])
@JsonDeserialize(using = classOf[StringHistogramTransformationDeserializer])
case class StringHistogramTransformation(histogram: IndexedSeq[String])
    extends HistogramTransformation {
  require(histogram.length > 1, s"Histogram length has to be > 1: ${histogram.length}")

  override val dataType: QDataType = StringDataType

  /**
   * Determines whether the histogram equals the default String histogram.
   *
   * @return true if the histogram is the default one
   */
  override def isDefault: Boolean = histogram == defaultStringHistogram

  /**
   * Converts a value to a normalized position within the histogram.
   *
   * Strings are used as they are, null is mapped to the literal string "null"
   * and any other value is converted with toString.
   *
   * @param value the value to convert
   * @return a real number between 0 and 1
   */
  override def transform(value: Any): Double = {
    val v: String = value match {
      case s: String => s
      case null => "null"
      case _ => value.toString
    }

    val lastIndex = histogram.length - 1
    histogram.search(v) match {
      case Found(foundIndex) => foundIndex.toDouble / lastIndex
      // Values smaller than the first histogram element are clamped to 0.
      case InsertionPoint(0) => 0d
      // The insertion point never exceeds histogram.length, so values greater
      // than the last element yield lastIndex / lastIndex = 1 without a special case.
      case InsertionPoint(insertionPoint) => (insertionPoint - 1).toDouble / lastIndex
    }
  }

  /**
   * This method should determine if the new data will cause the creation of a new revision.
   *
   * @param newTransformation the new transformation created with statistics over the new data
   * @return true if this histogram is the default one and the new one is not, or if
   *         both are non-default and differ; false in any other case, including when
   *         newTransformation is not a StringHistogramTransformation
   */
  override def isSupersededBy(newTransformation: Transformation): Boolean =
    newTransformation match {
      case nt @ StringHistogramTransformation(hist) =>
        if (isDefault) !nt.isDefault
        else if (nt.isDefault) false
        else histogram != hist
      case _ => false
    }

  /**
   * Merges two transformations. Histograms are not combined: the other
   * transformation simply replaces this one when it is also a
   * StringHistogramTransformation.
   *
   * @param other Transformation
   * @return other if it is a StringHistogramTransformation, this otherwise
   */
  override def merge(other: Transformation): Transformation = other match {
    case _: StringHistogramTransformation => other
    case _ => this
  }

}

/**
 * Jackson serializer that writes a StringHistogramTransformation as a JSON object
 * with a single "histogram" array field, optionally preceded by the type id property.
 */
class StringHistogramTransformationSerializer
    extends StdSerializer[StringHistogramTransformation](classOf[StringHistogramTransformation]) {
  val jsonFactory = new JsonFactory()

  /**
   * Writes the transformation together with its type id.
   *
   * @param value the transformation to serialize
   * @param gen the generator to write to
   * @param serializers the serializer provider (not used)
   * @param typeSer supplies the type property name and the type id of the value
   */
  override def serializeWithType(
      value: StringHistogramTransformation,
      gen: JsonGenerator,
      serializers: SerializerProvider,
      typeSer: TypeSerializer): Unit = {
    gen.writeStartObject()
    // The type id goes first, under the property name configured in the type serializer.
    gen.writeStringField(typeSer.getPropertyName, typeSer.getTypeIdResolver.idFromValue(value))
    writeHistogramField(value, gen)
    gen.writeEndObject()
  }

  /**
   * Writes the transformation without type information.
   *
   * @param value the transformation to serialize
   * @param gen the generator to write to
   * @param provider the serializer provider (not used)
   */
  override def serialize(
      value: StringHistogramTransformation,
      gen: JsonGenerator,
      provider: SerializerProvider): Unit = {
    gen.writeStartObject()
    writeHistogramField(value, gen)
    gen.writeEndObject()
  }

  // Writes the "histogram" field as a JSON array of strings.
  private def writeHistogramField(
      value: StringHistogramTransformation,
      gen: JsonGenerator): Unit = {
    gen.writeFieldName("histogram")
    gen.writeStartArray()
    value.histogram.foreach(bin => gen.writeString(bin))
    gen.writeEndArray()
  }

}

/**
 * Jackson deserializer that rebuilds a StringHistogramTransformation from a JSON
 * object containing a "histogram" array field.
 */
class StringHistogramTransformationDeserializer
    extends StdDeserializer[StringHistogramTransformation](
      classOf[StringHistogramTransformation]) {

  /**
   * Reads the "histogram" array and creates the transformation from its elements,
   * each converted to text.
   *
   * @param p the parser positioned at the transformation object
   * @param ctxt the deserialization context (not used)
   * @return the deserialized transformation
   * @throws com.fasterxml.jackson.databind.JsonMappingException if the "histogram"
   *         field is missing or is not an array
   */
  override def deserialize(
      p: JsonParser,
      ctxt: DeserializationContext): StringHistogramTransformation = {
    val tree: TreeNode = p.getCodec.readTree(p)
    val histogram: IndexedSeq[String] = tree.get("histogram") match {
      case an: ArrayNode =>
        (0 until an.size()).map(i => an.get(i).asText())
      case other =>
        // A missing (null) or non-array field used to surface as a scala.MatchError;
        // report it as a regular Jackson mapping failure instead.
        throw com.fasterxml.jackson.databind.JsonMappingException
          .from(p, s"Expected an array field 'histogram' but found: $other")
    }

    StringHistogramTransformation(histogram)
  }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
package io.qbeast.core.transform

import io.qbeast.core.model.QDataType
import io.qbeast.core.transform.HistogramTransformer.defaultStringHistogram

/**
 * Histogram transformer for String columns.
 *
 * @param columnName the name of the column
 * @param dataType the data type of the column
 */
case class StringHistogramTransformer(columnName: String, dataType: QDataType)
    extends HistogramTransformer {
  // Name of the stat under which the histogram of this column is exposed.
  private val histogramStatName = s"${columnName}_histogram"

  /**
   * Returns the stats: a single stat whose SQL predicate is an array literal
   * built from the default String histogram.
   *
   * @return the column stats
   */
  override def stats: ColumnStats = {
    val arrayLiteral = defaultStringHistogram.mkString("Array('", "', '", "')")
    val predicate = s"$arrayLiteral AS $histogramStatName"
    ColumnStats(
      statsNames = List(histogramStatName),
      statsSqlPredicates = List(predicate))
  }

  /**
   * Returns the Transformation given a row representation of the values.
   * A sequence stored under the histogram stat name is used as the histogram
   * (elements converted with toString); any other value yields the default one.
   *
   * @param row the values
   * @return the transformation
   */
  override def makeTransformation(row: String => Any): Transformation =
    row(histogramStatName) match {
      case values: Seq[_] => StringHistogramTransformation(values.map(_.toString).toIndexedSeq)
      case _ => StringHistogramTransformation(defaultStringHistogram)
    }

}
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@ import java.util.Locale
object Transformer {

private val transformersRegistry: Map[String, TransformerType] =
Seq(LinearTransformer, HashTransformer).map(a => (a.transformerSimpleName, a)).toMap
Seq(LinearTransformer, HashTransformer, HistogramTransformer)
.map(a => (a.transformerSimpleName, a))
.toMap

/**
* Returns the transformer for the given column and type of transformer
Expand Down
11 changes: 11 additions & 0 deletions core/src/test/scala/io/qbeast/core/model/CubeIdTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@ class CubeIdTest extends AnyFlatSpec with Matchers {
val id7 =
CubeId(2, "wQwwwQwwQwwQwwwwwQwQwwwQQwwwwQQwQwwwQwwQwwQwwwwwQwwQQQQQQQQQQQQQ")
id6 == id7 shouldBe true
val id8 =
CubeId(1, 4, Array(9L)).parent.get.parent.get.parent.get
val id9 = CubeId(1, 1, Array(1L))
id8 == id9 shouldBe true
}

it should "implement hashCode correctly" in {
Expand Down Expand Up @@ -153,6 +157,13 @@ class CubeIdTest extends AnyFlatSpec with Matchers {
id4.nextSibling shouldBe None
}

// After yielding both children of the one-dimensional root cube, the iterator
// must fail with NoSuchElementException instead of returning another element.
it should "implement children iterator throwing NoSuchElementException after last child" in {
  val root = CubeId.root(1)
  val first = root.firstChild
  val iterator = root.children
  iterator.next() shouldBe first
  iterator.next() shouldBe first.nextSibling.get
  assertThrows[NoSuchElementException](iterator.next())
}

it should "return a correct container with specified depth" in {
val point = Point(0.66, 0.83)
val id = CubeId.container(point, 2)
Expand Down
Loading
Loading