diff --git a/tutorials/analyzers.ipynb b/tutorials/analyzers.ipynb index 77a1d6d..1fe5517 100644 --- a/tutorials/analyzers.ipynb +++ b/tutorials/analyzers.ipynb @@ -6,14 +6,116 @@ "source": [ "# Analyzers Basic Tutorial\n", "\n", + "__Updated June 2024 to use a new dataset__\n", + "\n", "This Jupyter notebook will give a basic tutorial on how to use PyDeequ's Analyzers module." ] }, { "cell_type": "code", "execution_count": 1, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], + "source": [ + "import os\n", + "# indicate your Spark version, here we use Spark 3.5 with pydeequ 1.4.0\n", + "os.environ[\"SPARK_VERSION\"] = '3.5'" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ":: loading settings :: url = jar:file:/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Ivy Default Cache set to: /home/ec2-user/.ivy2/cache\n", + "The jars for the packages stored in: /home/ec2-user/.ivy2/jars\n", + "com.amazon.deequ#deequ added as a dependency\n", + ":: resolving dependencies :: org.apache.spark#spark-submit-parent-23421fea-77b3-4d69-9251-54adf6371fd9;1.0\n", + "\tconfs: [default]\n", + "\tfound com.amazon.deequ#deequ;2.0.3-spark-3.3 in central\n", + "\tfound org.scala-lang#scala-reflect;2.12.10 in central\n", + "\tfound org.scalanlp#breeze_2.12;0.13.2 in central\n", + "\tfound org.scalanlp#breeze-macros_2.12;0.13.2 in central\n", + "\tfound com.github.fommil.netlib#core;1.1.2 in central\n", + "\tfound net.sf.opencsv#opencsv;2.3 in central\n", + "\tfound com.github.rwl#jtransforms;2.4.0 in central\n", + "\tfound junit#junit;4.8.2 in central\n", + "\tfound org.apache.commons#commons-math3;3.2 in central\n", + "\tfound org.spire-math#spire_2.12;0.13.0 in central\n", + "\tfound org.spire-math#spire-macros_2.12;0.13.0 in central\n", + "\tfound org.typelevel#machinist_2.12;0.6.1 in central\n", + "\tfound com.chuusai#shapeless_2.12;2.3.2 in central\n", + "\tfound org.typelevel#macro-compat_2.12;1.1.1 in central\n", + "\tfound org.slf4j#slf4j-api;1.7.5 in central\n", + ":: resolution report :: resolve 435ms :: artifacts dl 12ms\n", + "\t:: modules in use:\n", + "\tcom.amazon.deequ#deequ;2.0.3-spark-3.3 from central in [default]\n", + "\tcom.chuusai#shapeless_2.12;2.3.2 from central in [default]\n", + "\tcom.github.fommil.netlib#core;1.1.2 from central in [default]\n", + "\tcom.github.rwl#jtransforms;2.4.0 from central in [default]\n", + "\tjunit#junit;4.8.2 from central in [default]\n", + "\tnet.sf.opencsv#opencsv;2.3 from central in [default]\n", + "\torg.apache.commons#commons-math3;3.2 from central in [default]\n", + "\torg.scala-lang#scala-reflect;2.12.10 from central in [default]\n", + "\torg.scalanlp#breeze-macros_2.12;0.13.2 from central in [default]\n", + "\torg.scalanlp#breeze_2.12;0.13.2 from central in [default]\n", + "\torg.slf4j#slf4j-api;1.7.5 from central in [default]\n", + "\torg.spire-math#spire-macros_2.12;0.13.0 from central in [default]\n", + "\torg.spire-math#spire_2.12;0.13.0 from central in [default]\n", + "\torg.typelevel#machinist_2.12;0.6.1 from central in [default]\n", + "\torg.typelevel#macro-compat_2.12;1.1.1 from central in [default]\n", + "\t:: evicted modules:\n", + "\torg.scala-lang#scala-reflect;2.12.1 by [org.scala-lang#scala-reflect;2.12.10] in [default]\n", + "\torg.scala-lang#scala-reflect;2.12.0 by [org.scala-lang#scala-reflect;2.12.10] in [default]\n", + "\t---------------------------------------------------------------------\n", + "\t| | modules || artifacts |\n", + "\t| conf | number| search|dwnlded|evicted|| number|dwnlded|\n", + "\t---------------------------------------------------------------------\n", + "\t| default | 17 | 0 | 0 | 2 || 15 | 0 |\n", + "\t---------------------------------------------------------------------\n", + ":: retrieving :: org.apache.spark#spark-submit-parent-23421fea-77b3-4d69-9251-54adf6371fd9\n", + "\tconfs: [default]\n", + "\t0 artifacts copied, 15 already retrieved (0kB/9ms)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "24/06/14 23:25:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Setting default log level to \"WARN\".\n", + "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "24/06/14 23:25:59 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.\n" + ] + } + ], "source": [ "from pyspark.sql import SparkSession, Row, DataFrame\n", "import json\n", @@ -36,14 +138,30 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### We will be using the Amazon Product Reviews dataset -- specifically the Electronics subset. " + "### We will be using the synthetic reviews dataset for Electronics products" ] }, { "cell_type": "code", - "execution_count": 2, - "metadata": {}, + "execution_count": 3, + "metadata": { + "tags": [] + }, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "24/06/14 23:26:01 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, { "name": "stdout", "output_type": "stream", @@ -53,32 +171,46 @@ " |-- customer_id: string (nullable = true)\n", " |-- review_id: string (nullable = true)\n", " |-- product_id: string (nullable = true)\n", - " |-- product_parent: string (nullable = true)\n", " |-- product_title: string (nullable = true)\n", - " |-- star_rating: integer (nullable = true)\n", - " |-- helpful_votes: integer (nullable = true)\n", - " |-- total_votes: integer (nullable = true)\n", - " |-- vine: string (nullable = true)\n", - " |-- verified_purchase: string (nullable = true)\n", + " |-- star_rating: long (nullable = true)\n", + " |-- helpful_votes: long (nullable = true)\n", + " |-- total_votes: long (nullable = true)\n", + " |-- insight: string (nullable = true)\n", " |-- review_headline: string (nullable = true)\n", " |-- review_body: string (nullable = true)\n", - " |-- review_date: date (nullable = true)\n", - " |-- year: integer (nullable = true)\n", + " |-- review_date: timestamp (nullable = true)\n", + " |-- review_year: long (nullable = true)\n", "\n" ] } ], "source": [ - "df = spark.read.parquet(\"s3a://amazon-reviews-pds/parquet/product_category=Electronics/\")\n", + "df = spark.read.parquet(\"s3a://aws-bigdata-blog/generated_synthetic_reviews/data/product_category=Electronics/\")\n", "\n", "df.printSchema()" ] }, { "cell_type": "code", - "execution_count": 3, - "metadata": {}, + "execution_count": 4, + "metadata": { + "tags": [] + }, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "24/06/14 23:26:06 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, { "name": "stdout", "output_type": "stream", @@ -87,15 +219,23 @@ "| entity| instance| name| value|\n", "+-----------+--------------------+-------------------+--------------------+\n", "| Column| review_id| Completeness| 1.0|\n", - "| Column| review_id|ApproxCountDistinct| 3010972.0|\n", - "|Mutlicolumn|total_votes,star_...| Correlation|-0.03451097996538765|\n", - "| Dataset| *| Size| 3120938.0|\n", - "| Column| star_rating| Mean| 4.036143941340712|\n", - "| Column| top star_rating| Compliance| 0.7494070692849394|\n", - "|Mutlicolumn|total_votes,helpf...| Correlation| 0.9936463809903863|\n", + "| Column| review_id|ApproxCountDistinct| 3160409.0|\n", + "|Mutlicolumn|total_votes,star_...| Correlation|-7.38808965018615...|\n", + "| Dataset| *| Size| 3010972.0|\n", + "| Column| star_rating| Mean| 3.9999973430506826|\n", + "| Column| top star_rating| Compliance| 0.7499993357626706|\n", + "|Mutlicolumn|total_votes,helpf...| Correlation| 0.9817922803462663|\n", "+-----------+--------------------+-------------------+--------------------+\n", "\n" ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/pyspark/sql/dataframe.py:127: UserWarning: DataFrame constructor is internal. Do not directly use it.\n", + " warnings.warn(\"DataFrame constructor is internal. Do not directly use it.\")\n" + ] } ], "source": [ @@ -119,7 +259,9 @@ { "cell_type": "code", "execution_count": 5, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [ { "data": { @@ -161,42 +303,42 @@ "
\n", + " | check | \n", + "check_level | \n", + "check_status | \n", + "constraint | \n", + "constraint_status | \n", + "constraint_message | \n", + "
---|---|---|---|---|---|---|
0 | \n", + "Synthetic Product Reviews | \n", + "Warning | \n", + "Warning | \n", + "SizeConstraint(Size(None)) | \n", + "Failure | \n", + "org.apache.spark.sql.AnalysisException: Column 'year' does not exist. Did you mean one of the following? [insight, review_year, review_id, marketplace, review_date, star_rating, customer_id, product_id, review_body, total_votes, helpful_votes, product_title, review_headline]; line 1 pos 14;\n", + "'Aggregate [sum(cast(isnotnull(review_id#2) as int)) AS sum(CAST((review_id IS NOT NULL) AS INT))#629L, count(1) AS count(1)#630L, cast(min(review_year#12L) as double) AS CAST(min(review_year) AS DOUBLE)#631, cast(max(review_year#12L) as double) AS CAST(max(review_year) AS DOUBLE)#632, count(1) AS count(1)#633L, cast(min(star_rating#5L) as double) AS CAST(min(star_rating) AS DOUBLE)#634, cast(max(star_rating#5L) as double) AS CAST(max(star_rating) AS DOUBLE)#635, sum(cast((isnull(marketplace#0) OR marketplace#0 IN (US,UK,DE,JP,FR)) as int)) AS sum(CAST(((marketplace IS NULL) OR (marketplace IN (US, UK, DE, JP, FR))) AS INT))#636L, count(1) AS count(1)#637L, sum(cast(isnotnull(marketplace#0) as int)) AS sum(CAST((marketplace IS NOT NULL) AS INT))#638L, count(1) AS count(1)#639L, sum(cast(('COALESCE(cast('year as decimal(20,10)), 0.0) >= 0) as int)) AS sum(CAST((COALESCE(CAST(year AS DECIMAL(20,10)), 0.0) >= 0) AS INT))#640, count(1) AS count(1)#641L]\n", + "+- Relation [marketplace#0,customer_id#1,review_id#2,product_id#3,product_title#4,star_rating#5L,helpful_votes#6L,total_votes#7L,insight#8,review_headline#9,review_body#10,review_date#11,review_year#12L] parquet\n", + " | \n", + "
1 | \n", + "Synthetic Product Reviews | \n", + "Warning | \n", + "Warning | \n", + "MinimumConstraint(Minimum(star_rating,None)) | \n", + "Failure | \n", + "org.apache.spark.sql.AnalysisException: Column 'year' does not exist. Did you mean one of the following? [insight, review_year, review_id, marketplace, review_date, star_rating, customer_id, product_id, review_body, total_votes, helpful_votes, product_title, review_headline]; line 1 pos 14;\n", + "'Aggregate [sum(cast(isnotnull(review_id#2) as int)) AS sum(CAST((review_id IS NOT NULL) AS INT))#629L, count(1) AS count(1)#630L, cast(min(review_year#12L) as double) AS CAST(min(review_year) AS DOUBLE)#631, cast(max(review_year#12L) as double) AS CAST(max(review_year) AS DOUBLE)#632, count(1) AS count(1)#633L, cast(min(star_rating#5L) as double) AS CAST(min(star_rating) AS DOUBLE)#634, cast(max(star_rating#5L) as double) AS CAST(max(star_rating) AS DOUBLE)#635, sum(cast((isnull(marketplace#0) OR marketplace#0 IN (US,UK,DE,JP,FR)) as int)) AS sum(CAST(((marketplace IS NULL) OR (marketplace IN (US, UK, DE, JP, FR))) AS INT))#636L, count(1) AS count(1)#637L, sum(cast(isnotnull(marketplace#0) as int)) AS sum(CAST((marketplace IS NOT NULL) AS INT))#638L, count(1) AS count(1)#639L, sum(cast(('COALESCE(cast('year as decimal(20,10)), 0.0) >= 0) as int)) AS sum(CAST((COALESCE(CAST(year AS DECIMAL(20,10)), 0.0) >= 0) AS INT))#640, count(1) AS count(1)#641L]\n", + "+- Relation [marketplace#0,customer_id#1,review_id#2,product_id#3,product_title#4,star_rating#5L,helpful_votes#6L,total_votes#7L,insight#8,review_headline#9,review_body#10,review_date#11,review_year#12L] parquet\n", + " | \n", + "
2 | \n", + "Synthetic Product Reviews | \n", + "Warning | \n", + "Warning | \n", + "MaximumConstraint(Maximum(star_rating,None)) | \n", + "Failure | \n", + "org.apache.spark.sql.AnalysisException: Column 'year' does not exist. Did you mean one of the following? [insight, review_year, review_id, marketplace, review_date, star_rating, customer_id, product_id, review_body, total_votes, helpful_votes, product_title, review_headline]; line 1 pos 14;\n", + "'Aggregate [sum(cast(isnotnull(review_id#2) as int)) AS sum(CAST((review_id IS NOT NULL) AS INT))#629L, count(1) AS count(1)#630L, cast(min(review_year#12L) as double) AS CAST(min(review_year) AS DOUBLE)#631, cast(max(review_year#12L) as double) AS CAST(max(review_year) AS DOUBLE)#632, count(1) AS count(1)#633L, cast(min(star_rating#5L) as double) AS CAST(min(star_rating) AS DOUBLE)#634, cast(max(star_rating#5L) as double) AS CAST(max(star_rating) AS DOUBLE)#635, sum(cast((isnull(marketplace#0) OR marketplace#0 IN (US,UK,DE,JP,FR)) as int)) AS sum(CAST(((marketplace IS NULL) OR (marketplace IN (US, UK, DE, JP, FR))) AS INT))#636L, count(1) AS count(1)#637L, sum(cast(isnotnull(marketplace#0) as int)) AS sum(CAST((marketplace IS NOT NULL) AS INT))#638L, count(1) AS count(1)#639L, sum(cast(('COALESCE(cast('year as decimal(20,10)), 0.0) >= 0) as int)) AS sum(CAST((COALESCE(CAST(year AS DECIMAL(20,10)), 0.0) >= 0) AS INT))#640, count(1) AS count(1)#641L]\n", + "+- Relation [marketplace#0,customer_id#1,review_id#2,product_id#3,product_title#4,star_rating#5L,helpful_votes#6L,total_votes#7L,insight#8,review_headline#9,review_body#10,review_date#11,review_year#12L] parquet\n", + " | \n", + "
3 | \n", + "Synthetic Product Reviews | \n", + "Warning | \n", + "Warning | \n", + "CompletenessConstraint(Completeness(review_id,None)) | \n", + "Failure | \n", + "org.apache.spark.sql.AnalysisException: Column 'year' does not exist. Did you mean one of the following? [insight, review_year, review_id, marketplace, review_date, star_rating, customer_id, product_id, review_body, total_votes, helpful_votes, product_title, review_headline]; line 1 pos 14;\n", + "'Aggregate [sum(cast(isnotnull(review_id#2) as int)) AS sum(CAST((review_id IS NOT NULL) AS INT))#629L, count(1) AS count(1)#630L, cast(min(review_year#12L) as double) AS CAST(min(review_year) AS DOUBLE)#631, cast(max(review_year#12L) as double) AS CAST(max(review_year) AS DOUBLE)#632, count(1) AS count(1)#633L, cast(min(star_rating#5L) as double) AS CAST(min(star_rating) AS DOUBLE)#634, cast(max(star_rating#5L) as double) AS CAST(max(star_rating) AS DOUBLE)#635, sum(cast((isnull(marketplace#0) OR marketplace#0 IN (US,UK,DE,JP,FR)) as int)) AS sum(CAST(((marketplace IS NULL) OR (marketplace IN (US, UK, DE, JP, FR))) AS INT))#636L, count(1) AS count(1)#637L, sum(cast(isnotnull(marketplace#0) as int)) AS sum(CAST((marketplace IS NOT NULL) AS INT))#638L, count(1) AS count(1)#639L, sum(cast(('COALESCE(cast('year as decimal(20,10)), 0.0) >= 0) as int)) AS sum(CAST((COALESCE(CAST(year AS DECIMAL(20,10)), 0.0) >= 0) AS INT))#640, count(1) AS count(1)#641L]\n", + "+- Relation [marketplace#0,customer_id#1,review_id#2,product_id#3,product_title#4,star_rating#5L,helpful_votes#6L,total_votes#7L,insight#8,review_headline#9,review_body#10,review_date#11,review_year#12L] parquet\n", + " | \n", + "
4 | \n", + "Synthetic Product Reviews | \n", + "Warning | \n", + "Warning | \n", + "UniquenessConstraint(Uniqueness(List(review_id),None)) | \n", + "Failure | \n", + "Value: 0.9853137126482744 does not meet the constraint requirement! | \n", + "
5 | \n", + "Synthetic Product Reviews | \n", + "Warning | \n", + "Warning | \n", + "CompletenessConstraint(Completeness(marketplace,None)) | \n", + "Failure | \n", + "org.apache.spark.sql.AnalysisException: Column 'year' does not exist. Did you mean one of the following? [insight, review_year, review_id, marketplace, review_date, star_rating, customer_id, product_id, review_body, total_votes, helpful_votes, product_title, review_headline]; line 1 pos 14;\n", + "'Aggregate [sum(cast(isnotnull(review_id#2) as int)) AS sum(CAST((review_id IS NOT NULL) AS INT))#629L, count(1) AS count(1)#630L, cast(min(review_year#12L) as double) AS CAST(min(review_year) AS DOUBLE)#631, cast(max(review_year#12L) as double) AS CAST(max(review_year) AS DOUBLE)#632, count(1) AS count(1)#633L, cast(min(star_rating#5L) as double) AS CAST(min(star_rating) AS DOUBLE)#634, cast(max(star_rating#5L) as double) AS CAST(max(star_rating) AS DOUBLE)#635, sum(cast((isnull(marketplace#0) OR marketplace#0 IN (US,UK,DE,JP,FR)) as int)) AS sum(CAST(((marketplace IS NULL) OR (marketplace IN (US, UK, DE, JP, FR))) AS INT))#636L, count(1) AS count(1)#637L, sum(cast(isnotnull(marketplace#0) as int)) AS sum(CAST((marketplace IS NOT NULL) AS INT))#638L, count(1) AS count(1)#639L, sum(cast(('COALESCE(cast('year as decimal(20,10)), 0.0) >= 0) as int)) AS sum(CAST((COALESCE(CAST(year AS DECIMAL(20,10)), 0.0) >= 0) AS INT))#640, count(1) AS count(1)#641L]\n", + "+- Relation [marketplace#0,customer_id#1,review_id#2,product_id#3,product_title#4,star_rating#5L,helpful_votes#6L,total_votes#7L,insight#8,review_headline#9,review_body#10,review_date#11,review_year#12L] parquet\n", + " | \n", + "
6 | \n", + "Synthetic Product Reviews | \n", + "Warning | \n", + "Warning | \n", + "ComplianceConstraint(Compliance(marketplace contained in US,UK,DE,JP,FR,`marketplace` IS NULL OR `marketplace` IN ('US','UK','DE','JP','FR'),None)) | \n", + "Failure | \n", + "org.apache.spark.sql.AnalysisException: Column 'year' does not exist. Did you mean one of the following? [insight, review_year, review_id, marketplace, review_date, star_rating, customer_id, product_id, review_body, total_votes, helpful_votes, product_title, review_headline]; line 1 pos 14;\n", + "'Aggregate [sum(cast(isnotnull(review_id#2) as int)) AS sum(CAST((review_id IS NOT NULL) AS INT))#629L, count(1) AS count(1)#630L, cast(min(review_year#12L) as double) AS CAST(min(review_year) AS DOUBLE)#631, cast(max(review_year#12L) as double) AS CAST(max(review_year) AS DOUBLE)#632, count(1) AS count(1)#633L, cast(min(star_rating#5L) as double) AS CAST(min(star_rating) AS DOUBLE)#634, cast(max(star_rating#5L) as double) AS CAST(max(star_rating) AS DOUBLE)#635, sum(cast((isnull(marketplace#0) OR marketplace#0 IN (US,UK,DE,JP,FR)) as int)) AS sum(CAST(((marketplace IS NULL) OR (marketplace IN (US, UK, DE, JP, FR))) AS INT))#636L, count(1) AS count(1)#637L, sum(cast(isnotnull(marketplace#0) as int)) AS sum(CAST((marketplace IS NOT NULL) AS INT))#638L, count(1) AS count(1)#639L, sum(cast(('COALESCE(cast('year as decimal(20,10)), 0.0) >= 0) as int)) AS sum(CAST((COALESCE(CAST(year AS DECIMAL(20,10)), 0.0) >= 0) AS INT))#640, count(1) AS count(1)#641L]\n", + "+- Relation [marketplace#0,customer_id#1,review_id#2,product_id#3,product_title#4,star_rating#5L,helpful_votes#6L,total_votes#7L,insight#8,review_headline#9,review_body#10,review_date#11,review_year#12L] parquet\n", + " | \n", + "
7 | \n", + "Synthetic Product Reviews | \n", + "Warning | \n", + "Warning | \n", + "ComplianceConstraint(Compliance(year is non-negative,COALESCE(CAST(year AS DECIMAL(20,10)), 0.0) >= 0,None)) | \n", + "Failure | \n", + "org.apache.spark.sql.AnalysisException: Column 'year' does not exist. Did you mean one of the following? [insight, review_year, review_id, marketplace, review_date, star_rating, customer_id, product_id, review_body, total_votes, helpful_votes, product_title, review_headline]; line 1 pos 14;\n", + "'Aggregate [sum(cast(isnotnull(review_id#2) as int)) AS sum(CAST((review_id IS NOT NULL) AS INT))#629L, count(1) AS count(1)#630L, cast(min(review_year#12L) as double) AS CAST(min(review_year) AS DOUBLE)#631, cast(max(review_year#12L) as double) AS CAST(max(review_year) AS DOUBLE)#632, count(1) AS count(1)#633L, cast(min(star_rating#5L) as double) AS CAST(min(star_rating) AS DOUBLE)#634, cast(max(star_rating#5L) as double) AS CAST(max(star_rating) AS DOUBLE)#635, sum(cast((isnull(marketplace#0) OR marketplace#0 IN (US,UK,DE,JP,FR)) as int)) AS sum(CAST(((marketplace IS NULL) OR (marketplace IN (US, UK, DE, JP, FR))) AS INT))#636L, count(1) AS count(1)#637L, sum(cast(isnotnull(marketplace#0) as int)) AS sum(CAST((marketplace IS NOT NULL) AS INT))#638L, count(1) AS count(1)#639L, sum(cast(('COALESCE(cast('year as decimal(20,10)), 0.0) >= 0) as int)) AS sum(CAST((COALESCE(CAST(year AS DECIMAL(20,10)), 0.0) >= 0) AS INT))#640, count(1) AS count(1)#641L]\n", + "+- Relation [marketplace#0,customer_id#1,review_id#2,product_id#3,product_title#4,star_rating#5L,helpful_votes#6L,total_votes#7L,insight#8,review_headline#9,review_body#10,review_date#11,review_year#12L] parquet\n", + " | \n", + "
8 | \n", + "Synthetic Product Reviews | \n", + "Warning | \n", + "Warning | \n", + "MinimumConstraint(Minimum(review_year,None)) | \n", + "Failure | \n", + "org.apache.spark.sql.AnalysisException: Column 'year' does not exist. Did you mean one of the following? [insight, review_year, review_id, marketplace, review_date, star_rating, customer_id, product_id, review_body, total_votes, helpful_votes, product_title, review_headline]; line 1 pos 14;\n", + "'Aggregate [sum(cast(isnotnull(review_id#2) as int)) AS sum(CAST((review_id IS NOT NULL) AS INT))#629L, count(1) AS count(1)#630L, cast(min(review_year#12L) as double) AS CAST(min(review_year) AS DOUBLE)#631, cast(max(review_year#12L) as double) AS CAST(max(review_year) AS DOUBLE)#632, count(1) AS count(1)#633L, cast(min(star_rating#5L) as double) AS CAST(min(star_rating) AS DOUBLE)#634, cast(max(star_rating#5L) as double) AS CAST(max(star_rating) AS DOUBLE)#635, sum(cast((isnull(marketplace#0) OR marketplace#0 IN (US,UK,DE,JP,FR)) as int)) AS sum(CAST(((marketplace IS NULL) OR (marketplace IN (US, UK, DE, JP, FR))) AS INT))#636L, count(1) AS count(1)#637L, sum(cast(isnotnull(marketplace#0) as int)) AS sum(CAST((marketplace IS NOT NULL) AS INT))#638L, count(1) AS count(1)#639L, sum(cast(('COALESCE(cast('year as decimal(20,10)), 0.0) >= 0) as int)) AS sum(CAST((COALESCE(CAST(year AS DECIMAL(20,10)), 0.0) >= 0) AS INT))#640, count(1) AS count(1)#641L]\n", + "+- Relation [marketplace#0,customer_id#1,review_id#2,product_id#3,product_title#4,star_rating#5L,helpful_votes#6L,total_votes#7L,insight#8,review_headline#9,review_body#10,review_date#11,review_year#12L] parquet\n", + " | \n", + "
9 | \n", + "Synthetic Product Reviews | \n", + "Warning | \n", + "Warning | \n", + "MaximumConstraint(Maximum(review_year,None)) | \n", + "Failure | \n", + "org.apache.spark.sql.AnalysisException: Column 'year' does not exist. Did you mean one of the following? [insight, review_year, review_id, marketplace, review_date, star_rating, customer_id, product_id, review_body, total_votes, helpful_votes, product_title, review_headline]; line 1 pos 14;\n", + "'Aggregate [sum(cast(isnotnull(review_id#2) as int)) AS sum(CAST((review_id IS NOT NULL) AS INT))#629L, count(1) AS count(1)#630L, cast(min(review_year#12L) as double) AS CAST(min(review_year) AS DOUBLE)#631, cast(max(review_year#12L) as double) AS CAST(max(review_year) AS DOUBLE)#632, count(1) AS count(1)#633L, cast(min(star_rating#5L) as double) AS CAST(min(star_rating) AS DOUBLE)#634, cast(max(star_rating#5L) as double) AS CAST(max(star_rating) AS DOUBLE)#635, sum(cast((isnull(marketplace#0) OR marketplace#0 IN (US,UK,DE,JP,FR)) as int)) AS sum(CAST(((marketplace IS NULL) OR (marketplace IN (US, UK, DE, JP, FR))) AS INT))#636L, count(1) AS count(1)#637L, sum(cast(isnotnull(marketplace#0) as int)) AS sum(CAST((marketplace IS NOT NULL) AS INT))#638L, count(1) AS count(1)#639L, sum(cast(('COALESCE(cast('year as decimal(20,10)), 0.0) >= 0) as int)) AS sum(CAST((COALESCE(CAST(year AS DECIMAL(20,10)), 0.0) >= 0) AS INT))#640, count(1) AS count(1)#641L]\n", + "+- Relation [marketplace#0,customer_id#1,review_id#2,product_id#3,product_title#4,star_rating#5L,helpful_votes#6L,total_votes#7L,insight#8,review_headline#9,review_body#10,review_date#11,review_year#12L] parquet\n", + " | \n", + "