Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix time-series filtering #203

Merged
merged 6 commits into from
Jul 26, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
[![Notebook](https://img.shields.io/badge/_-Jupyter_Notebook_example-0053B3?style=for-the-badge&logo=jupyter)](./docs/sample_pushdown_demo.ipynb)
<br />
[![Slack](https://img.shields.io/badge/_-Slack-blue?style=for-the-badge&logo=slack)](https://join.slack.com/t/qbeast-users/shared_invite/zt-w0zy8qrm-tJ2di1kZpXhjDq_hAl1LHw)
[![Medium](https://img.shields.io/badge/_-Medium-yellowgreen?style=for-the-badge&logo=medium)](https://blog.qbeast.io/)
[![Academy](https://img.shields.io/badge/_-Medium-yellowgreen?style=for-the-badge&logo=medium)](https://qbeast.io/academy-courses-index/)
[![Website](https://img.shields.io/badge/_-Website-dc005f?style=for-the-badge&logo=)](https://qbeast.io)

---
Expand Down
27 changes: 21 additions & 6 deletions src/main/scala/io/qbeast/spark/index/query/QueryFiltersUtils.scala
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,15 @@ import io.qbeast.spark.internal.expressions.QbeastMurmur3Hash
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.analysis.Resolver
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.util.DateTimeUtils.{daysToMicros, getZoneId}
import org.apache.spark.sql.catalyst.util.TypeUtils
import org.apache.spark.sql.execution.InSubqueryExec
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.{DateType, TimestampType}
import org.apache.spark.unsafe.types.UTF8String

import java.util.concurrent.TimeUnit

private[query] trait QueryFiltersUtils {

lazy val spark: SparkSession = SparkSession.active
Expand Down Expand Up @@ -91,15 +96,25 @@ private[query] trait QueryFiltersUtils {
}

/**
* Convert an Spark String type to a Scala core type
* @param value the value to convert
* Convert a Literal value from Spark to a Qbeast/Scala core type
* @param l the Literal to convert
* @return
*/

def sparkTypeToCoreType(value: Any): Any = {
value match {
case s: UTF8String => s.toString
case _ => value
def sparkTypeToCoreType(l: Literal): Any = {

(l.value, l.dataType) match {
case (int: Integer, _: DateType) =>
// convert DateType to Milliseconds
lazy val zoneId = getZoneId(SQLConf.get.sessionLocalTimeZone)
alexeiakimov marked this conversation as resolved.
Show resolved Hide resolved
val dateInMicros = daysToMicros(int, zoneId)
val dateInMillis = TimeUnit.MICROSECONDS.toMillis(dateInMicros)
dateInMillis
case (long: Long, _: TimestampType) =>
// convert Timestamp from Microseconds to Milliseconds
TimeUnit.MICROSECONDS.toMillis(long)
case (s: UTF8String, _) => s.toString
case _ => l.value
}
}

Expand Down
24 changes: 12 additions & 12 deletions src/main/scala/io/qbeast/spark/index/query/QuerySpecBuilder.scala
Original file line number Diff line number Diff line change
Expand Up @@ -88,23 +88,23 @@ private[spark] class QuerySpecBuilder(sparkFilters: Seq[Expression])
// if not found, use the overall coordinates
val columnFrom = columnFilters
.collectFirst {
case GreaterThan(_, Literal(value, _)) => sparkTypeToCoreType(value)
case GreaterThanOrEqual(_, Literal(value, _)) => sparkTypeToCoreType(value)
case LessThan(Literal(value, _), _) => sparkTypeToCoreType(value)
case LessThanOrEqual(Literal(value, _), _) => sparkTypeToCoreType(value)
case EqualTo(_, Literal(value, _)) => sparkTypeToCoreType(value)
case EqualTo(Literal(value, _), _) => sparkTypeToCoreType(value)
case GreaterThan(_, l: Literal) => sparkTypeToCoreType(l)
case GreaterThanOrEqual(_, l: Literal) => sparkTypeToCoreType(l)
case LessThan(l: Literal, _) => sparkTypeToCoreType(l)
case LessThanOrEqual(l: Literal, _) => sparkTypeToCoreType(l)
case EqualTo(_, l: Literal) => sparkTypeToCoreType(l)
case EqualTo(l: Literal, _) => sparkTypeToCoreType(l)
case IsNull(_) => null
}

val columnTo = columnFilters
.collectFirst {
case LessThan(_, Literal(value, _)) => sparkTypeToCoreType(value)
case LessThanOrEqual(_, Literal(value, _)) => sparkTypeToCoreType(value)
case GreaterThan(Literal(value, _), _) => sparkTypeToCoreType(value)
case GreaterThanOrEqual(Literal(value, _), _) => sparkTypeToCoreType(value)
case EqualTo(_, Literal(value, _)) => sparkTypeToCoreType(value)
case EqualTo(Literal(value, _), _) => sparkTypeToCoreType(value)
case LessThan(_, l: Literal) => sparkTypeToCoreType(l)
case LessThanOrEqual(_, l: Literal) => sparkTypeToCoreType(l)
case GreaterThan(l: Literal, _) => sparkTypeToCoreType(l)
case GreaterThanOrEqual(l: Literal, _) => sparkTypeToCoreType(l)
case EqualTo(_, l: Literal) => sparkTypeToCoreType(l)
case EqualTo(l: Literal, _) => sparkTypeToCoreType(l)
case IsNull(_) => null
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
package io.qbeast.spark.index.query

import io.qbeast.spark.{QbeastIntegrationTestSpec}
import org.apache.spark.sql.functions.{to_date, to_timestamp, unix_timestamp}

class TimeSeriesQueryTest extends QbeastIntegrationTestSpec with QueryTestSpec {

"Qbeast Format" should "filter correctly values with Timestamp" in withSparkAndTmpDir(
(spark, tmpDir) => {
import spark.implicits._
val df =
Seq(
"2017-01-03 12:02:00",
"2017-01-02 12:02:00",
"2017-01-02 12:02:00",
"2017-01-02 12:02:00",
"2017-01-01 12:02:00",
"2017-01-01 12:02:00")
.toDF("date")
.withColumn("date", to_timestamp($"date"))

df.write.format("qbeast").option("columnsToIndex", "date").save(tmpDir)

val indexed = spark.read.format("qbeast").load(tmpDir)

indexed.filter("date == '2017-01-02 12:02:00'").count() shouldBe 3

indexed.filter("date > '2017-01-02 12:02:00'").count() shouldBe 1

indexed.filter("date < '2017-01-02 12:02:00'").count() shouldBe 2

})

it should "filter correctly values with Date" in withSparkAndTmpDir((spark, tmpDir) => {
import spark.implicits._
val df =
Seq("2017-01-01", "2017-01-02", "2017-01-03", "2017-01-04")
.toDF("date")
.withColumn("date", to_date($"date"))

df.write.format("qbeast").option("columnsToIndex", "date").save(tmpDir)

val indexed = spark.read.format("qbeast").load(tmpDir)

indexed.filter("date == '2017-01-03'").count() shouldBe 1

})

it should "filter correctly values with complex Dates" in withSparkAndTmpDir(
(spark, tmpDir) => {
import spark.implicits._
val df =
Seq(
"2017-01-03 12:02:00",
"2017-01-02 12:02:00",
"2017-01-02 12:02:00",
"2017-01-02 12:02:00",
"2017-01-01 12:02:00",
"2017-01-01 12:02:00")
.toDF("date")
.withColumn("date", to_date($"date"))

df.write.format("qbeast").option("columnsToIndex", "date").save(tmpDir)

val indexed = spark.read.format("qbeast").load(tmpDir)

indexed.filter("date == '2017-01-02 12:02:00'").count() shouldBe 3

})

it should "filter correctly values with Unix Timestamp" in withSparkAndTmpDir(
(spark, tmpDir) => {
import spark.implicits._
val df =
Seq(
"2017-01-03 12:02:00",
"2017-01-02 12:02:00",
"2017-01-02 12:02:00",
"2017-01-02 12:02:00",
"2017-01-01 12:02:00",
"2017-01-01 12:02:00")
.toDF("date")
.withColumn("date", unix_timestamp($"date"))

df.write.format("qbeast").option("columnsToIndex", "date").save(tmpDir)

val indexed = spark.read.format("qbeast").load(tmpDir)
val valueToFilter = df.first().getLong(0)

indexed.filter(s"date == $valueToFilter").count() shouldBe df
.filter(s"date == $valueToFilter")
.count()

})
}
Loading