diff --git a/tutorials/analyzers.ipynb b/tutorials/analyzers.ipynb index 77a1d6d..1fe5517 100644 --- a/tutorials/analyzers.ipynb +++ b/tutorials/analyzers.ipynb @@ -6,14 +6,116 @@ "source": [ "# Analyzers Basic Tutorial\n", "\n", + "__Updated June 2024 to use a new dataset__\n", + "\n", "This Jupyter notebook will give a basic tutorial on how to use PyDeequ's Analyzers module." ] }, { "cell_type": "code", "execution_count": 1, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], + "source": [ + "import os\n", + "# Indicate your Spark version; here we use Spark 3.5 with PyDeequ 1.4.0\n", + "os.environ[\"SPARK_VERSION\"] = '3.5'" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ":: loading settings :: url = jar:file:/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Ivy Default Cache set to: /home/ec2-user/.ivy2/cache\n", + "The jars for the packages stored in: /home/ec2-user/.ivy2/jars\n", + "com.amazon.deequ#deequ added as a dependency\n", + ":: resolving dependencies :: org.apache.spark#spark-submit-parent-23421fea-77b3-4d69-9251-54adf6371fd9;1.0\n", + "\tconfs: [default]\n", + "\tfound com.amazon.deequ#deequ;2.0.3-spark-3.3 in central\n", + "\tfound org.scala-lang#scala-reflect;2.12.10 in central\n", + "\tfound org.scalanlp#breeze_2.12;0.13.2 in central\n", + "\tfound org.scalanlp#breeze-macros_2.12;0.13.2 in central\n", + "\tfound com.github.fommil.netlib#core;1.1.2 in central\n", + "\tfound net.sf.opencsv#opencsv;2.3 in central\n", + "\tfound com.github.rwl#jtransforms;2.4.0 in central\n", + "\tfound junit#junit;4.8.2 in central\n", + "\tfound org.apache.commons#commons-math3;3.2 in central\n", + "\tfound org.spire-math#spire_2.12;0.13.0 in central\n", + "\tfound org.spire-math#spire-macros_2.12;0.13.0 in central\n", + "\tfound org.typelevel#machinist_2.12;0.6.1 in central\n", + "\tfound com.chuusai#shapeless_2.12;2.3.2 in central\n", + "\tfound org.typelevel#macro-compat_2.12;1.1.1 in central\n", + "\tfound org.slf4j#slf4j-api;1.7.5 in central\n", + ":: resolution report :: resolve 435ms :: artifacts dl 12ms\n", + "\t:: modules in use:\n", + "\tcom.amazon.deequ#deequ;2.0.3-spark-3.3 from central in [default]\n", + "\tcom.chuusai#shapeless_2.12;2.3.2 from central in [default]\n", + "\tcom.github.fommil.netlib#core;1.1.2 from central in [default]\n", + "\tcom.github.rwl#jtransforms;2.4.0 from central in [default]\n", + "\tjunit#junit;4.8.2 from central in [default]\n", + "\tnet.sf.opencsv#opencsv;2.3 from central in [default]\n", + "\torg.apache.commons#commons-math3;3.2 from central in [default]\n", + "\torg.scala-lang#scala-reflect;2.12.10 from central in [default]\n", + "\torg.scalanlp#breeze-macros_2.12;0.13.2 from central in [default]\n", + "\torg.scalanlp#breeze_2.12;0.13.2 from central in [default]\n", + "\torg.slf4j#slf4j-api;1.7.5 from central in [default]\n", + "\torg.spire-math#spire-macros_2.12;0.13.0 from central in [default]\n", + "\torg.spire-math#spire_2.12;0.13.0 from central in [default]\n", + "\torg.typelevel#machinist_2.12;0.6.1 from central in [default]\n", + "\torg.typelevel#macro-compat_2.12;1.1.1 from central in [default]\n", + "\t:: evicted modules:\n", + "\torg.scala-lang#scala-reflect;2.12.1 by [org.scala-lang#scala-reflect;2.12.10] in [default]\n", + 
"\torg.scala-lang#scala-reflect;2.12.0 by [org.scala-lang#scala-reflect;2.12.10] in [default]\n", + "\t---------------------------------------------------------------------\n", + "\t| | modules || artifacts |\n", + "\t| conf | number| search|dwnlded|evicted|| number|dwnlded|\n", + "\t---------------------------------------------------------------------\n", + "\t| default | 17 | 0 | 0 | 2 || 15 | 0 |\n", + "\t---------------------------------------------------------------------\n", + ":: retrieving :: org.apache.spark#spark-submit-parent-23421fea-77b3-4d69-9251-54adf6371fd9\n", + "\tconfs: [default]\n", + "\t0 artifacts copied, 15 already retrieved (0kB/9ms)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "24/06/14 23:25:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Setting default log level to \"WARN\".\n", + "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "24/06/14 23:25:59 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.\n" + ] + } + ], "source": [ "from pyspark.sql import SparkSession, Row, DataFrame\n", "import json\n", @@ -36,14 +138,30 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### We will be using the Amazon Product Reviews dataset -- specifically the Electronics subset. " + "### We will be using the synthetic reviews dataset for Electronics products" ] }, { "cell_type": "code", - "execution_count": 2, - "metadata": {}, + "execution_count": 3, + "metadata": { + "tags": [] + }, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "24/06/14 23:26:01 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, { "name": "stdout", "output_type": "stream", @@ -53,32 +171,46 @@ " |-- customer_id: string (nullable = true)\n", " |-- review_id: string (nullable = true)\n", " |-- product_id: string (nullable = true)\n", - " |-- product_parent: string (nullable = true)\n", " |-- product_title: string (nullable = true)\n", - " |-- star_rating: integer (nullable = true)\n", - " |-- helpful_votes: integer (nullable = true)\n", - " |-- total_votes: integer (nullable = true)\n", - " |-- vine: string (nullable = true)\n", - " |-- verified_purchase: string (nullable = true)\n", + " |-- star_rating: long (nullable = true)\n", + " |-- helpful_votes: long (nullable = true)\n", + " |-- total_votes: long (nullable = true)\n", + " |-- insight: string (nullable = true)\n", " |-- review_headline: string (nullable = true)\n", " |-- review_body: string (nullable = true)\n", - " |-- review_date: date (nullable = true)\n", - " |-- year: integer (nullable = true)\n", + " |-- review_date: timestamp (nullable = true)\n", + " |-- review_year: long (nullable = true)\n", "\n" ] } ], "source": [ - "df = spark.read.parquet(\"s3a://amazon-reviews-pds/parquet/product_category=Electronics/\")\n", + "df = spark.read.parquet(\"s3a://aws-bigdata-blog/generated_synthetic_reviews/data/product_category=Electronics/\")\n", "\n", "df.printSchema()" ] }, { "cell_type": "code", - "execution_count": 3, - "metadata": {}, + "execution_count": 4, + "metadata": { + "tags": [] + }, "outputs": [ + { + "name": 
"stdout", + "output_type": "stream", + "text": [ + "24/06/14 23:26:06 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, { "name": "stdout", "output_type": "stream", @@ -87,15 +219,23 @@ "| entity| instance| name| value|\n", "+-----------+--------------------+-------------------+--------------------+\n", "| Column| review_id| Completeness| 1.0|\n", - "| Column| review_id|ApproxCountDistinct| 3010972.0|\n", - "|Mutlicolumn|total_votes,star_...| Correlation|-0.03451097996538765|\n", - "| Dataset| *| Size| 3120938.0|\n", - "| Column| star_rating| Mean| 4.036143941340712|\n", - "| Column| top star_rating| Compliance| 0.7494070692849394|\n", - "|Mutlicolumn|total_votes,helpf...| Correlation| 0.9936463809903863|\n", + "| Column| review_id|ApproxCountDistinct| 3160409.0|\n", + "|Mutlicolumn|total_votes,star_...| Correlation|-7.38808965018615...|\n", + "| Dataset| *| Size| 3010972.0|\n", + "| Column| star_rating| Mean| 3.9999973430506826|\n", + "| Column| top star_rating| Compliance| 0.7499993357626706|\n", + "|Mutlicolumn|total_votes,helpf...| Correlation| 0.9817922803462663|\n", "+-----------+--------------------+-------------------+--------------------+\n", "\n" ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/pyspark/sql/dataframe.py:127: UserWarning: DataFrame constructor is internal. Do not directly use it.\n", + " warnings.warn(\"DataFrame constructor is internal. Do not directly use it.\")\n" + ] } ], "source": [ @@ -119,7 +259,9 @@ { "cell_type": "code", "execution_count": 5, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [ { "data": { @@ -161,42 +303,42 @@ " Column\n", " review_id\n", " ApproxCountDistinct\n", - " 3.010972e+06\n", + " 3.160409e+06\n", " \n", " \n", " 2\n", " Mutlicolumn\n", " total_votes,star_rating\n", " Correlation\n", - " -3.451098e-02\n", + " -7.388090e-04\n", " \n", " \n", " 3\n", " Dataset\n", " *\n", " Size\n", - " 3.120938e+06\n", + " 3.010972e+06\n", " \n", " \n", " 4\n", " Column\n", " star_rating\n", " Mean\n", - " 4.036144e+00\n", + " 3.999997e+00\n", " \n", " \n", " 5\n", " Column\n", " top star_rating\n", " Compliance\n", - " 7.494071e-01\n", + " 7.499993e-01\n", " \n", " \n", " 6\n", " Mutlicolumn\n", " total_votes,helpful_votes\n", " Correlation\n", - " 9.936464e-01\n", + " 9.817923e-01\n", " \n", " \n", "\n", @@ -205,12 +347,12 @@ "text/plain": [ " entity instance name value\n", "0 Column review_id Completeness 1.000000e+00\n", - "1 Column review_id ApproxCountDistinct 3.010972e+06\n", - "2 Mutlicolumn total_votes,star_rating Correlation -3.451098e-02\n", - "3 Dataset * Size 3.120938e+06\n", - "4 Column star_rating Mean 4.036144e+00\n", - "5 Column top star_rating Compliance 7.494071e-01\n", - "6 Mutlicolumn total_votes,helpful_votes Correlation 9.936464e-01" + "1 Column review_id ApproxCountDistinct 3.160409e+06\n", + "2 Mutlicolumn total_votes,star_rating Correlation -7.388090e-04\n", + "3 Dataset * Size 3.010972e+06\n", + "4 Column star_rating Mean 3.999997e+00\n", + "5 Column top star_rating Compliance 7.499993e-01\n", + "6 Mutlicolumn total_votes,helpful_votes Correlation 9.817923e-01" ] }, "execution_count": 5, @@ -247,7 +389,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.10" + "version": 
"3.10.14" } }, "nbformat": 4, diff --git a/tutorials/profiles.ipynb b/tutorials/profiles.ipynb index 2a274a4..b968217 100644 --- a/tutorials/profiles.ipynb +++ b/tutorials/profiles.ipynb @@ -6,14 +6,117 @@ "source": [ "# Profiles Basic Tutorial\n", "\n", + "__Updated June 2024 to use a new dataset__\n", + "\n", "This Jupyter notebook will give a basic tutorial on how to use PyDeequ's Profiles module." ] }, { "cell_type": "code", "execution_count": 1, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], + "source": [ + "import os\n", + "# indicate your Spark version, here we use Spark 3.5 with pydeequ 1.4.0\n", + "os.environ[\"SPARK_VERSION\"] = '3.5'" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ":: loading settings :: url = jar:file:/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Ivy Default Cache set to: /home/ec2-user/.ivy2/cache\n", + "The jars for the packages stored in: /home/ec2-user/.ivy2/jars\n", + "com.amazon.deequ#deequ added as a dependency\n", + ":: resolving dependencies :: org.apache.spark#spark-submit-parent-37bbf6aa-f0ee-4f60-8485-4571bfefeb8f;1.0\n", + "\tconfs: [default]\n", + "\tfound com.amazon.deequ#deequ;2.0.3-spark-3.3 in central\n", + "\tfound org.scala-lang#scala-reflect;2.12.10 in central\n", + "\tfound org.scalanlp#breeze_2.12;0.13.2 in central\n", + "\tfound org.scalanlp#breeze-macros_2.12;0.13.2 in central\n", + "\tfound com.github.fommil.netlib#core;1.1.2 in central\n", + "\tfound net.sf.opencsv#opencsv;2.3 in central\n", + "\tfound com.github.rwl#jtransforms;2.4.0 in central\n", + "\tfound junit#junit;4.8.2 in central\n", + "\tfound org.apache.commons#commons-math3;3.2 in central\n", + "\tfound org.spire-math#spire_2.12;0.13.0 in central\n", + "\tfound org.spire-math#spire-macros_2.12;0.13.0 in central\n", + "\tfound org.typelevel#machinist_2.12;0.6.1 in central\n", + "\tfound com.chuusai#shapeless_2.12;2.3.2 in central\n", + "\tfound org.typelevel#macro-compat_2.12;1.1.1 in central\n", + "\tfound org.slf4j#slf4j-api;1.7.5 in central\n", + ":: resolution report :: resolve 385ms :: artifacts dl 12ms\n", + "\t:: modules in use:\n", + "\tcom.amazon.deequ#deequ;2.0.3-spark-3.3 from central in [default]\n", + "\tcom.chuusai#shapeless_2.12;2.3.2 from central in [default]\n", + "\tcom.github.fommil.netlib#core;1.1.2 from central in [default]\n", + "\tcom.github.rwl#jtransforms;2.4.0 from central in [default]\n", + "\tjunit#junit;4.8.2 from central in [default]\n", + "\tnet.sf.opencsv#opencsv;2.3 from central in [default]\n", + "\torg.apache.commons#commons-math3;3.2 from central in [default]\n", + "\torg.scala-lang#scala-reflect;2.12.10 from central in [default]\n", + "\torg.scalanlp#breeze-macros_2.12;0.13.2 from central in [default]\n", + "\torg.scalanlp#breeze_2.12;0.13.2 from central in [default]\n", + "\torg.slf4j#slf4j-api;1.7.5 from central in [default]\n", + "\torg.spire-math#spire-macros_2.12;0.13.0 from central in [default]\n", + "\torg.spire-math#spire_2.12;0.13.0 from central in [default]\n", + "\torg.typelevel#machinist_2.12;0.6.1 from central in [default]\n", + "\torg.typelevel#macro-compat_2.12;1.1.1 from central in [default]\n", + "\t:: evicted modules:\n", + "\torg.scala-lang#scala-reflect;2.12.1 by [org.scala-lang#scala-reflect;2.12.10] in 
[default]\n", + "\torg.scala-lang#scala-reflect;2.12.0 by [org.scala-lang#scala-reflect;2.12.10] in [default]\n", + "\t---------------------------------------------------------------------\n", + "\t| | modules || artifacts |\n", + "\t| conf | number| search|dwnlded|evicted|| number|dwnlded|\n", + "\t---------------------------------------------------------------------\n", + "\t| default | 17 | 0 | 0 | 2 || 15 | 0 |\n", + "\t---------------------------------------------------------------------\n", + ":: retrieving :: org.apache.spark#spark-submit-parent-37bbf6aa-f0ee-4f60-8485-4571bfefeb8f\n", + "\tconfs: [default]\n", + "\t0 artifacts copied, 15 already retrieved (0kB/10ms)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "24/06/14 23:33:34 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Setting default log level to \"WARN\".\n", + "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "24/06/14 23:33:36 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.\n", + "24/06/14 23:33:36 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.\n" + ] + } + ], "source": [ "from pyspark.sql import SparkSession, Row, DataFrame\n", "import json\n", @@ -29,6 +132,8 @@ " .config(\"spark.driver.extraClassPath\", classpath)\n", " .config(\"spark.jars.packages\", pydeequ.deequ_maven_coord)\n", " .config(\"spark.jars.excludes\", pydeequ.f2j_maven_coord)\n", + " .config(\"spark.driver.memory\", \"15g\")\n", + " .config(\"spark.sql.parquet.int96RebaseModeInRead\", \"CORRECTED\")\n", " .getOrCreate())" ] }, @@ -36,14 +141,30 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### We will be using the Amazon Product Reviews dataset -- specifically the Electronics subset. 
" + "### We will be using the synthetic reviews dataset for Electronics products" ] }, { "cell_type": "code", - "execution_count": 2, - "metadata": {}, + "execution_count": 3, + "metadata": { + "tags": [] + }, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "24/06/14 23:33:38 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, { "name": "stdout", "output_type": "stream", @@ -53,23 +174,21 @@ " |-- customer_id: string (nullable = true)\n", " |-- review_id: string (nullable = true)\n", " |-- product_id: string (nullable = true)\n", - " |-- product_parent: string (nullable = true)\n", " |-- product_title: string (nullable = true)\n", - " |-- star_rating: integer (nullable = true)\n", - " |-- helpful_votes: integer (nullable = true)\n", - " |-- total_votes: integer (nullable = true)\n", - " |-- vine: string (nullable = true)\n", - " |-- verified_purchase: string (nullable = true)\n", + " |-- star_rating: long (nullable = true)\n", + " |-- helpful_votes: long (nullable = true)\n", + " |-- total_votes: long (nullable = true)\n", + " |-- insight: string (nullable = true)\n", " |-- review_headline: string (nullable = true)\n", " |-- review_body: string (nullable = true)\n", - " |-- review_date: date (nullable = true)\n", - " |-- year: integer (nullable = true)\n", + " |-- review_date: timestamp (nullable = true)\n", + " |-- review_year: long (nullable = true)\n", "\n" ] } ], "source": [ - "df = spark.read.parquet(\"s3a://amazon-reviews-pds/parquet/product_category=Electronics/\")\n", + "df = spark.read.parquet(\"s3a://aws-bigdata-blog/generated_synthetic_reviews/data/product_category=Electronics/\")\n", "\n", "df.printSchema()" ] @@ -77,8 +196,25 @@ { "cell_type": "code", "execution_count": 4, - "metadata": {}, - "outputs": [], + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "24/06/14 23:33:43 WARN package: Truncated the string representation of a plan since it was too large. 
This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], "source": [ "from pydeequ.profiles import *\n", "\n", @@ -90,93 +226,724 @@ { "cell_type": "code", "execution_count": 5, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Column review_id:\n", - "completeness: 1.0\n", - "approx distinct: 3010972\n", - "datatype: String\n", - "\n", - "Statistics of customer_id\n", - "Min:10005.0\n", - "Max:53096582.0\n", - "Mean:28806032.68895954\n", - "StandardDeviation:15415072.111267326\n", - "\n", - "Column review_date:\n", - "completeness: 1.0\n", - "approx distinct: 5898\n", - "datatype: Unknown\n", - "\n", - "Statistics of helpful_votes\n", - "Min:0.0\n", - "Max:12786.0\n", - "Mean:1.865194053838942\n", - "StandardDeviation:21.296393520562624\n", - "\n", - "Statistics of star_rating\n", - "Min:1.0\n", - "Max:5.0\n", - "Mean:4.036143941340712\n", - "StandardDeviation:1.3866747032700206\n", - "\n", - "Statistics of year\n", - "Min:1999.0\n", - "Max:2015.0\n", - "Mean:2012.8595236432125\n", - "StandardDeviation:2.464162689284542\n", - "\n", - "Column product_title:\n", - "completeness: 1.0\n", - "approx distinct: 164112\n", - "datatype: String\n", - "\n", - "Column review_headline:\n", - "completeness: 0.9999987183340393\n", - "approx distinct: 1694860\n", - "datatype: String\n", - "\n", - "Column product_id:\n", - "completeness: 1.0\n", - "approx distinct: 169835\n", - "datatype: String\n", - "\n", - "Statistics of total_votes\n", - "Min:0.0\n", - "Max:12944.0\n", - "Mean:2.3798239503636407\n", - "StandardDeviation:22.457108543167916\n", - "\n", - "Statistics of product_parent\n", - "Min:6478.0\n", - "Max:999998189.0\n", - "Mean:510045457.18261784\n", - "StandardDeviation:286789206.496994\n", - "\n", - "Column review_body:\n", - "completeness: 0.9999724441818453\n", - "approx distinct: 3024295\n", - "datatype: String\n", - "\n", - "Column vine:\n", - "completeness: 1.0\n", - "approx distinct: 2\n", - "datatype: String\n", - "\n", - "Column marketplace:\n", - "completeness: 1.0\n", - "approx distinct: 5\n", - "datatype: String\n", - "\n", - "Column verified_purchase:\n", - "completeness: 1.0\n", - "approx distinct: 2\n", - "datatype: String\n", - "\n" + "StandardProfiles for column: insight: {\n", + " \"completeness\": 1.0,\n", + " \"approximateNumDistinctValues\": 2,\n", + " \"dataType\": \"String\",\n", + " \"isDataTypeInferred\": false,\n", + " \"typeCounts\": {\n", + " \"Boolean\": 0,\n", + " \"Fractional\": 0,\n", + " \"Integral\": 0,\n", + " \"Unknown\": 0,\n", + " \"String\": 3010972\n", + " },\n", + " \"histogram\": [\n", + " [\n", + " \"N\",\n", + " 1701683,\n", + " 0.5651606856523408\n", + " ],\n", + " [\n", + " \"Y\",\n", + " 1309289,\n", + " 0.43483931434765916\n", + " ]\n", + " ]\n", + "}\n", + "StandardProfiles for column: review_id: {\n", + " \"completeness\": 1.0,\n", + " \"approximateNumDistinctValues\": 3160409,\n", + " \"dataType\": \"String\",\n", + " \"isDataTypeInferred\": false,\n", + " \"typeCounts\": {\n", + " \"Boolean\": 0,\n", + " \"Fractional\": 0,\n", + " \"Integral\": 0,\n", + " \"Unknown\": 0,\n", + " \"String\": 3010972\n", + " },\n", + " \"histogram\": null\n", + "}\n", + "NumericProfiles for column: customer_id: {\n", + " \"completeness\": 1.0,\n", + " \"approximateNumDistinctValues\": 866021,\n", + " \"dataType\": \"Integral\",\n", + " \"isDataTypeInferred\": false,\n", + " 
\"typeCounts\": {\n", + " \"Boolean\": 0,\n", + " \"Fractional\": 0,\n", + " \"Integral\": 3010972,\n", + " \"Unknown\": 0,\n", + " \"String\": 0\n", + " },\n", + " \"histogram\": null,\n", + " \"kll\": \"None\",\n", + " \"mean\": 514989.4746510429,\n", + " \"maximum\": 929121.0,\n", + " \"minimum\": 100000.0,\n", + " \"sum\": 1550618888469.0,\n", + " \"stdDev\": 239465.84713597817,\n", + " \"approxPercentiles\": []\n", + "}\n", + "StandardProfiles for column: review_date: {\n", + " \"completeness\": 1.0,\n", + " \"approximateNumDistinctValues\": 7916,\n", + " \"dataType\": \"String\",\n", + " \"isDataTypeInferred\": false,\n", + " \"typeCounts\": {},\n", + " \"histogram\": null\n", + "}\n", + "NumericProfiles for column: helpful_votes: {\n", + " \"completeness\": 1.0,\n", + " \"approximateNumDistinctValues\": 24,\n", + " \"dataType\": \"Integral\",\n", + " \"isDataTypeInferred\": false,\n", + " \"typeCounts\": {},\n", + " \"histogram\": [\n", + " [\n", + " \"12\",\n", + " 287924,\n", + " 0.09562493440656372\n", + " ],\n", + " [\n", + " \"8\",\n", + " 8337,\n", + " 0.0027688733073572254\n", + " ],\n", + " [\n", + " \"19\",\n", + " 72602,\n", + " 0.02411247929240126\n", + " ],\n", + " [\n", + " \"23\",\n", + " 434,\n", + " 0.00014413950046695883\n", + " ],\n", + " [\n", + " \"4\",\n", + " 13,\n", + " 4.317542640715357e-06\n", + " ],\n", + " [\n", + " \"15\",\n", + " 520006,\n", + " 0.1727036983406023\n", + " ],\n", + " [\n", + " \"11\",\n", + " 159423,\n", + " 0.052947353877751104\n", + " ],\n", + " [\n", + " \"9\",\n", + " 27286,\n", + " 0.009062189884196863\n", + " ],\n", + " [\n", + " \"22\",\n", + " 2218,\n", + " 0.000736639198238974\n", + " ],\n", + " [\n", + " \"26\",\n", + " 2,\n", + " 6.642373293408242e-07\n", + " ],\n", + " [\n", + " \"13\",\n", + " 427526,\n", + " 0.1419893642318826\n", + " ],\n", + " [\n", + " \"24\",\n", + " 70,\n", + " 2.3248306526928844e-05\n", + " ],\n", + " [\n", + " \"16\",\n", + " 426940,\n", + " 0.14179474269438574\n", + " ],\n", + " [\n", + " \"5\",\n", + " 87,\n", + " 2.889432382632585e-05\n", + " ],\n", + " [\n", + " \"10\",\n", + " 72476,\n", + " 0.024070632340652785\n", + " ],\n", + " [\n", + " \"21\",\n", + " 8287,\n", + " 0.0027522673741237048\n", + " ],\n", + " [\n", + " \"6\",\n", + " 450,\n", + " 0.00014945339910168542\n", + " ],\n", + " [\n", + " \"17\",\n", + " 288496,\n", + " 0.0958149062827552\n", + " ],\n", + " [\n", + " \"25\",\n", + " 12,\n", + " 3.985423976044945e-06\n", + " ],\n", + " [\n", + " \"14\",\n", + " 519410,\n", + " 0.17250575561645873\n", + " ],\n", + " [\n", + " \"20\",\n", + " 27100,\n", + " 0.009000415812568167\n", + " ],\n", + " [\n", + " \"18\",\n", + " 159692,\n", + " 0.053036693798547446\n", + " ],\n", + " [\n", + " \"7\",\n", + " 2180,\n", + " 0.0007240186889814984\n", + " ],\n", + " [\n", + " \"3\",\n", + " 1,\n", + " 3.321186646704121e-07\n", + " ]\n", + " ],\n", + " \"kll\": \"None\",\n", + " \"mean\": 14.500331454427341,\n", + " \"maximum\": 26.0,\n", + " \"minimum\": 3.0,\n", + " \"sum\": 43660092.0,\n", + " \"stdDev\": 2.2547987149482704,\n", + " \"approxPercentiles\": []\n", + "}\n", + "NumericProfiles for column: star_rating: {\n", + " \"completeness\": 1.0,\n", + " \"approximateNumDistinctValues\": 5,\n", + " \"dataType\": \"Integral\",\n", + " \"isDataTypeInferred\": false,\n", + " \"typeCounts\": {},\n", + " \"histogram\": [\n", + " [\n", + " \"4\",\n", + " 1053843,\n", + " 0.35000092993226106\n", + " ],\n", + " [\n", + " \"5\",\n", + " 1204384,\n", + " 0.3999984058304096\n", + " ],\n", + " [\n", + " 
\"1\",\n", + " 150549,\n", + " 0.05000013284746587\n", + " ],\n", + " [\n", + " \"2\",\n", + " 150549,\n", + " 0.05000013284746587\n", + " ],\n", + " [\n", + " \"3\",\n", + " 451647,\n", + " 0.1500003985423976\n", + " ]\n", + " ],\n", + " \"kll\": \"None\",\n", + " \"mean\": 3.9999973430506826,\n", + " \"maximum\": 5.0,\n", + " \"minimum\": 1.0,\n", + " \"sum\": 12043880.0,\n", + " \"stdDev\": 1.0954453575522467,\n", + " \"approxPercentiles\": []\n", + "}\n", + "StandardProfiles for column: product_title: {\n", + " \"completeness\": 1.0,\n", + " \"approximateNumDistinctValues\": 5847,\n", + " \"dataType\": \"String\",\n", + " \"isDataTypeInferred\": false,\n", + " \"typeCounts\": {\n", + " \"Boolean\": 0,\n", + " \"Fractional\": 0,\n", + " \"Integral\": 0,\n", + " \"Unknown\": 0,\n", + " \"String\": 3010972\n", + " },\n", + " \"histogram\": null\n", + "}\n", + "StandardProfiles for column: review_headline: {\n", + " \"completeness\": 1.0,\n", + " \"approximateNumDistinctValues\": 3263448,\n", + " \"dataType\": \"String\",\n", + " \"isDataTypeInferred\": false,\n", + " \"typeCounts\": {\n", + " \"Boolean\": 0,\n", + " \"Fractional\": 0,\n", + " \"Integral\": 0,\n", + " \"Unknown\": 0,\n", + " \"String\": 3010972\n", + " },\n", + " \"histogram\": null\n", + "}\n", + "NumericProfiles for column: review_year: {\n", + " \"completeness\": 1.0,\n", + " \"approximateNumDistinctValues\": 25,\n", + " \"dataType\": \"Integral\",\n", + " \"isDataTypeInferred\": false,\n", + " \"typeCounts\": {},\n", + " \"histogram\": [\n", + " [\n", + " \"2014\",\n", + " 321464,\n", + " 0.10676419441960935\n", + " ],\n", + " [\n", + " \"2003\",\n", + " 52307,\n", + " 0.017372130992915246\n", + " ],\n", + " [\n", + " \"1997\",\n", + " 10723,\n", + " 0.0035613084412608287\n", + " ],\n", + " [\n", + " \"2101\",\n", + " 2,\n", + " 6.642373293408242e-07\n", + " ],\n", + " [\n", + " \"2013\",\n", + " 316199,\n", + " 0.10501558965011963\n", + " ],\n", + " [\n", + " \"2002\",\n", + " 42388,\n", + " 0.014077845958049427\n", + " ],\n", + " [\n", + " \"2202\",\n", + " 1,\n", + " 3.321186646704121e-07\n", + " ],\n", + " [\n", + " \"2007\",\n", + " 147466,\n", + " 0.048976211004286986\n", + " ],\n", + " [\n", + " \"2004\",\n", + " 55873,\n", + " 0.018556466151129936\n", + " ],\n", + " [\n", + " \"1996\",\n", + " 9521,\n", + " 0.0031621018063269935\n", + " ],\n", + " [\n", + " \"2015\",\n", + " 325725,\n", + " 0.10817935204976997\n", + " ],\n", + " [\n", + " \"2011\",\n", + " 197133,\n", + " 0.06547154872247235\n", + " ],\n", + " [\n", + " \"2001\",\n", + " 38141,\n", + " 0.012667337989194187\n", + " ],\n", + " [\n", + " \"2008\",\n", + " 166847,\n", + " 0.055413002844264245\n", + " ],\n", + " [\n", + " \"1696\",\n", + " 2,\n", + " 6.642373293408242e-07\n", + " ],\n", + " [\n", + " \"2012\",\n", + " 234672,\n", + " 0.07793895127553495\n", + " ],\n", + " [\n", + " \"1999\",\n", + " 12318,\n", + " 0.004091037711410136\n", + " ],\n", + " [\n", + " \"2005\",\n", + " 122344,\n", + " 0.040632725910436894\n", + " ],\n", + " [\n", + " \"2010\",\n", + " 190961,\n", + " 0.06342171232412656\n", + " ],\n", + " [\n", + " \"2009\",\n", + " 169434,\n", + " 0.0562721938297666\n", + " ],\n", + " [\n", + " \"2000\",\n", + " 20756,\n", + " 0.0068934550038990735\n", + " ],\n", + " [\n", + " \"2016\",\n", + " 423293,\n", + " 0.14058350592433275\n", + " ],\n", + " [\n", + " \"1998\",\n", + " 11633,\n", + " 0.003863536426110904\n", + " ],\n", + " [\n", + " \"2006\",\n", + " 141769,\n", + " 0.04708413097165965\n", + " ]\n", + " ],\n", + " \"kll\": 
\"None\",\n", + " \"mean\": 2010.9318020891592,\n", + " \"maximum\": 2202.0,\n", + " \"minimum\": 1696.0,\n", + " \"sum\": 6054859350.0,\n", + " \"stdDev\": 4.348285411927376,\n", + " \"approxPercentiles\": []\n", + "}\n", + "NumericProfiles for column: product_id: {\n", + " \"completeness\": 1.0,\n", + " \"approximateNumDistinctValues\": 9375,\n", + " \"dataType\": \"Integral\",\n", + " \"isDataTypeInferred\": false,\n", + " \"typeCounts\": {\n", + " \"Boolean\": 0,\n", + " \"Fractional\": 0,\n", + " \"Integral\": 3010972,\n", + " \"Unknown\": 0,\n", + " \"String\": 0\n", + " },\n", + " \"histogram\": null,\n", + " \"kll\": \"None\",\n", + " \"mean\": 54624.17527828223,\n", + " \"maximum\": 99991.0,\n", + " \"minimum\": 10000.0,\n", + " \"sum\": 164471862286.0,\n", + " \"stdDev\": 26182.48676113594,\n", + " \"approxPercentiles\": []\n", + "}\n", + "NumericProfiles for column: total_votes: {\n", + " \"completeness\": 1.0,\n", + " \"approximateNumDistinctValues\": 41,\n", + " \"dataType\": \"Integral\",\n", + " \"isDataTypeInferred\": false,\n", + " \"typeCounts\": {},\n", + " \"histogram\": [\n", + " [\n", + " \"34\",\n", + " 292,\n", + " 9.697865008376033e-05\n", + " ],\n", + " [\n", + " \"12\",\n", + " 47972,\n", + " 0.01593239658156901\n", + " ],\n", + " [\n", + " \"8\",\n", + " 3891,\n", + " 0.0012922737242325735\n", + " ],\n", + " [\n", + " \"19\",\n", + " 306985,\n", + " 0.10195544827384645\n", + " ],\n", + " [\n", + " \"23\",\n", + " 206056,\n", + " 0.06843504356732644\n", + " ],\n", + " [\n", + " \"4\",\n", + " 125,\n", + " 4.151483308380151e-05\n", + " ],\n", + " [\n", + " \"15\",\n", + " 158682,\n", + " 0.05270125394723033\n", + " ],\n", + " [\n", + " \"11\",\n", + " 28004,\n", + " 0.00930065108543022\n", + " ],\n", + " [\n", + " \"9\",\n", + " 7913,\n", + " 0.002628054993536971\n", + " ],\n", + " [\n", + " \"33\",\n", + " 773,\n", + " 0.0002567277277902285\n", + " ],\n", + " [\n", + " \"22\",\n", + " 251211,\n", + " 0.08343186187051889\n", + " ],\n", + " [\n", + " \"26\",\n", + " 76151,\n", + " 0.02529116843331655\n", + " ],\n", + " [\n", + " \"37\",\n", + " 13,\n", + " 4.317542640715357e-06\n", + " ],\n", + " [\n", + " \"13\",\n", + " 75950,\n", + " 0.025224412581717797\n", + " ],\n", + " [\n", + " \"24\",\n", + " 158507,\n", + " 0.05264313318091301\n", + " ],\n", + " [\n", + " \"35\",\n", + " 100,\n", + " 3.321186646704121e-05\n", + " ],\n", + " [\n", + " \"16\",\n", + " 205878,\n", + " 0.0683759264450151\n", + " ],\n", + " [\n", + " \"5\",\n", + " 300,\n", + " 9.963559940112363e-05\n", + " ],\n", + " [\n", + " \"10\",\n", + " 15691,\n", + " 0.005211273967343436\n", + " ],\n", + " [\n", + " \"21\",\n", + " 287397,\n", + " 0.09544990787028242\n", + " ],\n", + " [\n", + " \"32\",\n", + " 1790,\n", + " 0.0005944924097600376\n", + " ],\n", + " [\n", + " \"6\",\n", + " 748,\n", + " 0.00024842476117346824\n", + " ],\n", + " [\n", + " \"36\",\n", + " 38,\n", + " 1.262050925747566e-05\n", + " ],\n", + " [\n", + " \"1\",\n", + " 2,\n", + " 6.642373293408242e-07\n", + " ],\n", + " [\n", + " \"39\",\n", + " 2,\n", + " 6.642373293408242e-07\n", + " ],\n", + " [\n", + " \"17\",\n", + " 251277,\n", + " 0.08345378170238714\n", + " ],\n", + " [\n", + " \"25\",\n", + " 113552,\n", + " 0.037712738610654634\n", + " ],\n", + " [\n", + " \"14\",\n", + " 113205,\n", + " 0.037597493434014\n", + " ],\n", + " [\n", + " \"31\",\n", + " 3833,\n", + " 0.0012730108416816896\n", + " ],\n", + " [\n", + " \"0\",\n", + " 1,\n", + " 3.321186646704121e-07\n", + " ],\n", + " [\n", + " \"20\",\n", + " 306210,\n", 
+ " 0.10169805630872689\n", + " ],\n", + " [\n", + " \"27\",\n", + " 47680,\n", + " 0.015835417931485248\n", + " ],\n", + " [\n", + " \"2\",\n", + " 13,\n", + " 4.317542640715357e-06\n", + " ],\n", + " [\n", + " \"38\",\n", + " 4,\n", + " 1.3284746586816483e-06\n", + " ],\n", + " [\n", + " \"18\",\n", + " 286997,\n", + " 0.09531706040441426\n", + " ],\n", + " [\n", + " \"30\",\n", + " 7922,\n", + " 0.0026310440615190047\n", + " ],\n", + " [\n", + " \"7\",\n", + " 1782,\n", + " 0.0005918354604426743\n", + " ],\n", + " [\n", + " \"29\",\n", + " 15535,\n", + " 0.005159463455654852\n", + " ],\n", + " [\n", + " \"3\",\n", + " 49,\n", + " 1.6273814568850192e-05\n", + " ],\n", + " [\n", + " \"28\",\n", + " 28441,\n", + " 0.00944578694189119\n", + " ]\n", + " ],\n", + " \"kll\": \"None\",\n", + " \"mean\": 19.500789446065923,\n", + " \"maximum\": 39.0,\n", + " \"minimum\": 0.0,\n", + " \"sum\": 58716331.0,\n", + " \"stdDev\": 3.884513343386856,\n", + " \"approxPercentiles\": []\n", + "}\n", + "StandardProfiles for column: review_body: {\n", + " \"completeness\": 1.0,\n", + " \"approximateNumDistinctValues\": 3111148,\n", + " \"dataType\": \"String\",\n", + " \"isDataTypeInferred\": false,\n", + " \"typeCounts\": {\n", + " \"Boolean\": 0,\n", + " \"Fractional\": 0,\n", + " \"Integral\": 0,\n", + " \"Unknown\": 0,\n", + " \"String\": 3010972\n", + " },\n", + " \"histogram\": null\n", + "}\n", + "StandardProfiles for column: marketplace: {\n", + " \"completeness\": 0.8570391886739565,\n", + " \"approximateNumDistinctValues\": 6,\n", + " \"dataType\": \"String\",\n", + " \"isDataTypeInferred\": false,\n", + " \"typeCounts\": {\n", + " \"Boolean\": 0,\n", + " \"Fractional\": 0,\n", + " \"Integral\": 0,\n", + " \"Unknown\": 430451,\n", + " \"String\": 2580521\n", + " },\n", + " \"histogram\": [\n", + " [\n", + " \"\",\n", + " 430828,\n", + " 0.1430860200626243\n", + " ],\n", + " [\n", + " \"NullValue\",\n", + " 430451,\n", + " 0.14296081132604355\n", + " ],\n", + " [\n", + " \"US\",\n", + " 429002,\n", + " 0.14247957138093612\n", + " ],\n", + " [\n", + " \"FR\",\n", + " 430382,\n", + " 0.14293789513818128\n", + " ],\n", + " [\n", + " \"UK\",\n", + " 430142,\n", + " 0.1428581866586604\n", + " ],\n", + " [\n", + " \"JP\",\n", + " 430345,\n", + " 0.14292560674758847\n", + " ],\n", + " [\n", + " \"DE\",\n", + " 429822,\n", + " 0.14275190868596585\n", + " ]\n", + " ]\n", + "}\n" ] } ], @@ -216,9 +983,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.5" + "version": "3.10.14" } }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} diff --git a/tutorials/repository.ipynb b/tutorials/repository.ipynb index c64847e..2786e74 100644 --- a/tutorials/repository.ipynb +++ b/tutorials/repository.ipynb @@ -6,14 +6,118 @@ "source": [ "# Storing Computed Metrics in a MetricsRepository\n", "\n", + "__Updated June 2024 to use a new dataset__\n", + "\n", "PyDeequ allows us to persist the metrics we computed on dataframes in a so-called MetricsRepository. In the following example, we showcase how to store metrics in a filesystem and query them later on." 
] }, { "cell_type": "code", - "execution_count": 2, - "metadata": {}, + "execution_count": 1, + "metadata": { + "tags": [] + }, "outputs": [], + "source": [ + "import os\n", + "# Indicate your Spark version; here we use Spark 3.5 with PyDeequ 1.4.0\n", + "os.environ[\"SPARK_VERSION\"] = '3.5'" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ":: loading settings :: url = jar:file:/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Ivy Default Cache set to: /home/ec2-user/.ivy2/cache\n", + "The jars for the packages stored in: /home/ec2-user/.ivy2/jars\n", + "com.amazon.deequ#deequ added as a dependency\n", + ":: resolving dependencies :: org.apache.spark#spark-submit-parent-3fd29e82-4619-4f88-ba49-669eee4ba096;1.0\n", + "\tconfs: [default]\n", + "\tfound com.amazon.deequ#deequ;2.0.3-spark-3.3 in central\n", + "\tfound org.scala-lang#scala-reflect;2.12.10 in central\n", + "\tfound org.scalanlp#breeze_2.12;0.13.2 in central\n", + "\tfound org.scalanlp#breeze-macros_2.12;0.13.2 in central\n", + "\tfound com.github.fommil.netlib#core;1.1.2 in central\n", + "\tfound net.sf.opencsv#opencsv;2.3 in central\n", + "\tfound com.github.rwl#jtransforms;2.4.0 in central\n", + "\tfound junit#junit;4.8.2 in central\n", + "\tfound org.apache.commons#commons-math3;3.2 in central\n", + "\tfound org.spire-math#spire_2.12;0.13.0 in central\n", + "\tfound org.spire-math#spire-macros_2.12;0.13.0 in central\n", + "\tfound org.typelevel#machinist_2.12;0.6.1 in central\n", + "\tfound com.chuusai#shapeless_2.12;2.3.2 in central\n", + "\tfound org.typelevel#macro-compat_2.12;1.1.1 in central\n", + "\tfound org.slf4j#slf4j-api;1.7.5 in central\n", + ":: resolution report :: resolve 408ms :: artifacts dl 15ms\n", + "\t:: modules in use:\n", + "\tcom.amazon.deequ#deequ;2.0.3-spark-3.3 from central in [default]\n", + "\tcom.chuusai#shapeless_2.12;2.3.2 from central in [default]\n", + "\tcom.github.fommil.netlib#core;1.1.2 from central in [default]\n", + "\tcom.github.rwl#jtransforms;2.4.0 from central in [default]\n", + "\tjunit#junit;4.8.2 from central in [default]\n", + "\tnet.sf.opencsv#opencsv;2.3 from central in [default]\n", + "\torg.apache.commons#commons-math3;3.2 from central in [default]\n", + "\torg.scala-lang#scala-reflect;2.12.10 from central in [default]\n", + "\torg.scalanlp#breeze-macros_2.12;0.13.2 from central in [default]\n", + "\torg.scalanlp#breeze_2.12;0.13.2 from central in [default]\n", + "\torg.slf4j#slf4j-api;1.7.5 from central in [default]\n", + "\torg.spire-math#spire-macros_2.12;0.13.0 from central in [default]\n", + "\torg.spire-math#spire_2.12;0.13.0 from central in [default]\n", + "\torg.typelevel#machinist_2.12;0.6.1 from central in [default]\n", + "\torg.typelevel#macro-compat_2.12;1.1.1 from central in [default]\n", + "\t:: evicted modules:\n", + "\torg.scala-lang#scala-reflect;2.12.1 by [org.scala-lang#scala-reflect;2.12.10] in [default]\n", + "\torg.scala-lang#scala-reflect;2.12.0 by [org.scala-lang#scala-reflect;2.12.10] in [default]\n", + "\t---------------------------------------------------------------------\n", + "\t| | modules || artifacts |\n", + "\t| conf | number| search|dwnlded|evicted|| number|dwnlded|\n", + "\t---------------------------------------------------------------------\n", + 
"\t| default | 17 | 0 | 0 | 2 || 15 | 0 |\n", + "\t---------------------------------------------------------------------\n", + ":: retrieving :: org.apache.spark#spark-submit-parent-3fd29e82-4619-4f88-ba49-669eee4ba096\n", + "\tconfs: [default]\n", + "\t0 artifacts copied, 15 already retrieved (0kB/9ms)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "24/06/14 23:36:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Setting default log level to \"WARN\".\n", + "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "24/06/14 23:36:10 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.\n", + "24/06/14 23:36:10 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.\n", + "24/06/14 23:36:10 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.\n" + ] + } + ], "source": [ "from pyspark.sql import SparkSession, Row, DataFrame\n", "import json\n", @@ -36,7 +140,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### We will be using the Amazon Product Reviews dataset\n", + "### We will be using the synthetic reviews dataset\n", "\n", "Specifically the Electronics and Books subset." ] @@ -44,8 +148,24 @@ { "cell_type": "code", "execution_count": 3, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "24/06/14 23:36:38 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, { "name": "stdout", "output_type": "stream", @@ -55,43 +175,39 @@ " |-- customer_id: string (nullable = true)\n", " |-- review_id: string (nullable = true)\n", " |-- product_id: string (nullable = true)\n", - " |-- product_parent: string (nullable = true)\n", " |-- product_title: string (nullable = true)\n", - " |-- star_rating: integer (nullable = true)\n", - " |-- helpful_votes: integer (nullable = true)\n", - " |-- total_votes: integer (nullable = true)\n", - " |-- vine: string (nullable = true)\n", - " |-- verified_purchase: string (nullable = true)\n", + " |-- star_rating: long (nullable = true)\n", + " |-- helpful_votes: long (nullable = true)\n", + " |-- total_votes: long (nullable = true)\n", + " |-- insight: string (nullable = true)\n", " |-- review_headline: string (nullable = true)\n", " |-- review_body: string (nullable = true)\n", - " |-- review_date: date (nullable = true)\n", - " |-- year: integer (nullable = true)\n", + " |-- review_date: timestamp (nullable = true)\n", + " |-- review_year: long (nullable = true)\n", "\n", "root\n", " |-- marketplace: string (nullable = true)\n", " |-- customer_id: string (nullable = true)\n", " |-- review_id: string (nullable = true)\n", " |-- product_id: string (nullable = true)\n", - " |-- product_parent: string (nullable = true)\n", " |-- product_title: string (nullable = true)\n", - " |-- star_rating: integer (nullable = true)\n", - " |-- helpful_votes: integer (nullable = true)\n", - " |-- total_votes: integer (nullable = true)\n", - " |-- vine: string (nullable = true)\n", - " |-- verified_purchase: string (nullable = true)\n", + " |-- star_rating: long 
(nullable = true)\n", + " |-- helpful_votes: long (nullable = true)\n", + " |-- total_votes: long (nullable = true)\n", + " |-- insight: string (nullable = true)\n", " |-- review_headline: string (nullable = true)\n", " |-- review_body: string (nullable = true)\n", - " |-- review_date: date (nullable = true)\n", - " |-- year: integer (nullable = true)\n", + " |-- review_date: timestamp (nullable = true)\n", + " |-- review_year: long (nullable = true)\n", "\n", "None None\n" ] } ], "source": [ - "df_electronics = spark.read.parquet(\"s3a://amazon-reviews-pds/parquet/product_category=Electronics/\")\n", + "df_electronics = spark.read.parquet(\"s3a://aws-bigdata-blog/generated_synthetic_reviews/data/product_category=Electronics/\")\n", "\n", - "df_books = spark.read.parquet(\"s3a://amazon-reviews-pds/parquet/product_category=Books/\")\n", + "df_books = spark.read.parquet(\"s3a://aws-bigdata-blog/generated_synthetic_reviews/data/product_category=Books/\")\n", "\n", "print(df_electronics.printSchema(), df_books.printSchema())" ] @@ -110,13 +226,15 @@ { "cell_type": "code", "execution_count": 4, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "metrics_file path: /tmp/1595457441222-0/metrics.json\n" + "metrics_file path: /tmp/1718408214845-0/metrics.json\n" ] } ], @@ -138,7 +256,9 @@ { "cell_type": "code", "execution_count": 5, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "key_tags = {'tag': 'electronics'}\n", @@ -157,8 +277,24 @@ { "cell_type": "code", "execution_count": 6, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "24/06/14 23:37:18 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, { "name": "stdout", "output_type": "stream", @@ -167,15 +303,23 @@ "| entity| instance| name| value|\n", "+-----------+--------------------+-------------------+--------------------+\n", "| Column| review_id| Completeness| 1.0|\n", - "| Column| review_id|ApproxCountDistinct| 3010972.0|\n", - "|Mutlicolumn|total_votes,star_...| Correlation|-0.03451097996538765|\n", - "| Dataset| *| Size| 3120938.0|\n", - "| Column| star_rating| Mean| 4.036143941340712|\n", - "| Column| top star_rating| Compliance| 0.7494070692849394|\n", - "|Mutlicolumn|total_votes,helpf...| Correlation| 0.9936463809903863|\n", + "| Column| review_id|ApproxCountDistinct| 3160409.0|\n", + "|Mutlicolumn|total_votes,star_...| Correlation|-7.38808965018615...|\n", + "| Dataset| *| Size| 3010972.0|\n", + "| Column| star_rating| Mean| 3.9999973430506826|\n", + "| Column| top star_rating| Compliance| 0.7499993357626706|\n", + "|Mutlicolumn|total_votes,helpf...| Correlation| 0.9817922803462663|\n", "+-----------+--------------------+-------------------+--------------------+\n", "\n" ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/pyspark/sql/dataframe.py:127: UserWarning: DataFrame constructor is internal. Do not directly use it.\n", + " warnings.warn(\"DataFrame constructor is internal. 
Do not directly use it.\")\n" + ] } ], "source": [ @@ -210,7 +354,9 @@ { "cell_type": "code", "execution_count": 7, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [ { "name": "stdout", @@ -219,13 +365,13 @@ "+-----------+--------------------+-------------------+--------------------+-------------+-----------+\n", "| entity| instance| name| value| dataset_date| tag|\n", "+-----------+--------------------+-------------------+--------------------+-------------+-----------+\n", - "| Column| review_id| Completeness| 1.0|1595457441235|electronics|\n", - "| Column| review_id|ApproxCountDistinct| 3010972.0|1595457441235|electronics|\n", - "|Mutlicolumn|total_votes,star_...| Correlation|-0.03451097996538765|1595457441235|electronics|\n", - "| Dataset| *| Size| 3120938.0|1595457441235|electronics|\n", - "| Column| star_rating| Mean| 4.036143941340712|1595457441235|electronics|\n", - "| Column| top star_rating| Compliance| 0.7494070692849394|1595457441235|electronics|\n", - "|Mutlicolumn|total_votes,helpf...| Correlation| 0.9936463809903863|1595457441235|electronics|\n", + "| Column| review_id| Completeness| 1.0|1718408220742|electronics|\n", + "| Column| review_id|ApproxCountDistinct| 3160409.0|1718408220742|electronics|\n", + "|Mutlicolumn|total_votes,star_...| Correlation|-7.38808965018615...|1718408220742|electronics|\n", + "| Dataset| *| Size| 3010972.0|1718408220742|electronics|\n", + "| Column| star_rating| Mean| 3.9999973430506826|1718408220742|electronics|\n", + "| Column| top star_rating| Compliance| 0.7499993357626706|1718408220742|electronics|\n", + "|Mutlicolumn|total_votes,helpf...| Correlation| 0.9817922803462663|1718408220742|electronics|\n", "+-----------+--------------------+-------------------+--------------------+-------------+-----------+\n", "\n" ] @@ -249,8 +395,17 @@ { "cell_type": "code", "execution_count": 8, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 5:=======================================================> (33 + 1) / 34]\r" + ] + }, { "name": "stdout", "output_type": "stream", @@ -259,15 +414,22 @@ "| entity| instance| name| value|\n", "+-----------+--------------------+-------------------+--------------------+\n", "| Column| review_id| Completeness| 1.0|\n", - "| Column| review_id|ApproxCountDistinct| 2.005151E7|\n", - "|Mutlicolumn|total_votes,star_...| Correlation|-0.13092955077624202|\n", - "| Dataset| *| Size| 2.072616E7|\n", - "| Column| star_rating| Mean| 4.340540167594962|\n", - "| Column| top star_rating| Compliance| 0.8302768095971468|\n", - "|Mutlicolumn|total_votes,helpf...| Correlation| 0.9613189372804929|\n", + "| Column| review_id|ApproxCountDistinct| 1.0865041E7|\n", + "|Mutlicolumn|total_votes,star_...| Correlation|1.747345622996871...|\n", + "| Dataset| *| Size| 9672664.0|\n", + "| Column| star_rating| Mean| 2.9938504015026264|\n", + "| Column| top star_rating| Compliance| 0.33738967878962817|\n", + "|Mutlicolumn|total_votes,helpf...| Correlation|8.085328839629536E-5|\n", "+-----------+--------------------+-------------------+--------------------+\n", "\n" ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] } ], "source": [ @@ -301,30 +463,32 @@ { "cell_type": "code", "execution_count": 9, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - 
"+-----------+-------------------------+-------------------+--------------------+-------------+-----------+\n", - "|entity |instance |name |value |dataset_date |tag |\n", - "+-----------+-------------------------+-------------------+--------------------+-------------+-----------+\n", - "|Column |review_id |Completeness |1.0 |1595457441235|electronics|\n", - "|Column |review_id |ApproxCountDistinct|3010972.0 |1595457441235|electronics|\n", - "|Mutlicolumn|total_votes,star_rating |Correlation |-0.03451097996538765|1595457441235|electronics|\n", - "|Dataset |* |Size |3120938.0 |1595457441235|electronics|\n", - "|Column |star_rating |Mean |4.036143941340712 |1595457441235|electronics|\n", - "|Column |top star_rating |Compliance |0.7494070692849394 |1595457441235|electronics|\n", - "|Mutlicolumn|total_votes,helpful_votes|Correlation |0.9936463809903863 |1595457441235|electronics|\n", - "|Column |review_id |Completeness |1.0 |1595457494596|books |\n", - "|Column |review_id |ApproxCountDistinct|2.005151E7 |1595457494596|books |\n", - "|Mutlicolumn|total_votes,star_rating |Correlation |-0.13092955077624202|1595457494596|books |\n", - "|Dataset |* |Size |2.072616E7 |1595457494596|books |\n", - "|Column |star_rating |Mean |4.340540167594962 |1595457494596|books |\n", - "|Column |top star_rating |Compliance |0.8302768095971468 |1595457494596|books |\n", - "|Mutlicolumn|total_votes,helpful_votes|Correlation |0.9613189372804929 |1595457494596|books |\n", - "+-----------+-------------------------+-------------------+--------------------+-------------+-----------+\n", + "+-----------+-------------------------+-------------------+---------------------+-------------+-----------+\n", + "|entity |instance |name |value |dataset_date |tag |\n", + "+-----------+-------------------------+-------------------+---------------------+-------------+-----------+\n", + "|Column |review_id |Completeness |1.0 |1718408220742|electronics|\n", + "|Column |review_id |ApproxCountDistinct|3160409.0 |1718408220742|electronics|\n", + "|Mutlicolumn|total_votes,star_rating |Correlation |-7.388089650186156E-4|1718408220742|electronics|\n", + "|Dataset |* |Size |3010972.0 |1718408220742|electronics|\n", + "|Column |star_rating |Mean |3.9999973430506826 |1718408220742|electronics|\n", + "|Column |top star_rating |Compliance |0.7499993357626706 |1718408220742|electronics|\n", + "|Mutlicolumn|total_votes,helpful_votes|Correlation |0.9817922803462663 |1718408220742|electronics|\n", + "|Column |review_id |Completeness |1.0 |1718408257243|books |\n", + "|Column |review_id |ApproxCountDistinct|1.0865041E7 |1718408257243|books |\n", + "|Mutlicolumn|total_votes,star_rating |Correlation |1.7473456229968713E-4|1718408257243|books |\n", + "|Dataset |* |Size |9672664.0 |1718408257243|books |\n", + "|Column |star_rating |Mean |2.9938504015026264 |1718408257243|books |\n", + "|Column |top star_rating |Compliance |0.33738967878962817 |1718408257243|books |\n", + "|Mutlicolumn|total_votes,helpful_votes|Correlation |8.085328839629536E-5 |1718408257243|books |\n", + "+-----------+-------------------------+-------------------+---------------------+-------------+-----------+\n", "\n" ] } @@ -347,23 +511,25 @@ { "cell_type": "code", "execution_count": 10, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "+-----------+-------------------------+-------------------+--------------------+-------------+-----+\n", - "|entity |instance |name |value |dataset_date |tag |\n", - 
"+-----------+-------------------------+-------------------+--------------------+-------------+-----+\n", - "|Column |review_id |Completeness |1.0 |1595457494596|books|\n", - "|Column |review_id |ApproxCountDistinct|2.005151E7 |1595457494596|books|\n", - "|Mutlicolumn|total_votes,star_rating |Correlation |-0.13092955077624202|1595457494596|books|\n", - "|Dataset |* |Size |2.072616E7 |1595457494596|books|\n", - "|Column |star_rating |Mean |4.340540167594962 |1595457494596|books|\n", - "|Column |top star_rating |Compliance |0.8302768095971468 |1595457494596|books|\n", - "|Mutlicolumn|total_votes,helpful_votes|Correlation |0.9613189372804929 |1595457494596|books|\n", - "+-----------+-------------------------+-------------------+--------------------+-------------+-----+\n", + "+-----------+-------------------------+-------------------+---------------------+-------------+-----+\n", + "|entity |instance |name |value |dataset_date |tag |\n", + "+-----------+-------------------------+-------------------+---------------------+-------------+-----+\n", + "|Column |review_id |Completeness |1.0 |1718408257243|books|\n", + "|Column |review_id |ApproxCountDistinct|1.0865041E7 |1718408257243|books|\n", + "|Mutlicolumn|total_votes,star_rating |Correlation |1.7473456229968713E-4|1718408257243|books|\n", + "|Dataset |* |Size |9672664.0 |1718408257243|books|\n", + "|Column |star_rating |Mean |2.9938504015026264 |1718408257243|books|\n", + "|Column |top star_rating |Compliance |0.33738967878962817 |1718408257243|books|\n", + "|Mutlicolumn|total_votes,helpful_votes|Correlation |8.085328839629536E-5 |1718408257243|books|\n", + "+-----------+-------------------------+-------------------+---------------------+-------------+-----+\n", "\n" ] } @@ -378,24 +544,33 @@ }, { "cell_type": "code", - "execution_count": 15, - "metadata": {}, + "execution_count": 11, + "metadata": { + "tags": [] + }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "+-----------+-------------------------+-------------------+--------------------+-------------+-----+\n", - "|entity |instance |name |value |dataset_date |tag |\n", - "+-----------+-------------------------+-------------------+--------------------+-------------+-----+\n", - "|Column |review_id |Completeness |1.0 |1595457494596|books|\n", - "|Column |review_id |ApproxCountDistinct|2.005151E7 |1595457494596|books|\n", - "|Mutlicolumn|total_votes,star_rating |Correlation |-0.13092955077624202|1595457494596|books|\n", - "|Dataset |* |Size |2.072616E7 |1595457494596|books|\n", - "|Column |star_rating |Mean |4.340540167594962 |1595457494596|books|\n", - "|Column |top star_rating |Compliance |0.8302768095971468 |1595457494596|books|\n", - "|Mutlicolumn|total_votes,helpful_votes|Correlation |0.9613189372804929 |1595457494596|books|\n", - "+-----------+-------------------------+-------------------+--------------------+-------------+-----+\n", + "+-----------+-------------------------+-------------------+---------------------+-------------+-----------+\n", + "|entity |instance |name |value |dataset_date |tag |\n", + "+-----------+-------------------------+-------------------+---------------------+-------------+-----------+\n", + "|Column |review_id |Completeness |1.0 |1718408220742|electronics|\n", + "|Column |review_id |ApproxCountDistinct|3160409.0 |1718408220742|electronics|\n", + "|Mutlicolumn|total_votes,star_rating |Correlation |-7.388089650186156E-4|1718408220742|electronics|\n", + "|Dataset |* |Size |3010972.0 |1718408220742|electronics|\n", + "|Column 
|star_rating |Mean |3.9999973430506826 |1718408220742|electronics|\n", + "|Column |top star_rating |Compliance |0.7499993357626706 |1718408220742|electronics|\n", + "|Mutlicolumn|total_votes,helpful_votes|Correlation |0.9817922803462663 |1718408220742|electronics|\n", + "|Column |review_id |Completeness |1.0 |1718408257243|books |\n", + "|Column |review_id |ApproxCountDistinct|1.0865041E7 |1718408257243|books |\n", + "|Mutlicolumn|total_votes,star_rating |Correlation |1.7473456229968713E-4|1718408257243|books |\n", + "|Dataset |* |Size |9672664.0 |1718408257243|books |\n", + "|Column |star_rating |Mean |2.9938504015026264 |1718408257243|books |\n", + "|Column |top star_rating |Compliance |0.33738967878962817 |1718408257243|books |\n", + "|Mutlicolumn|total_votes,helpful_votes|Correlation |8.085328839629536E-5 |1718408257243|books |\n", + "+-----------+-------------------------+-------------------+---------------------+-------------+-----------+\n", "\n" ] } @@ -414,6 +589,13 @@ "source": [ "### For more info ... look at full list of Metrics Repository in `docs/repository.md` " ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -432,7 +614,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.10" + "version": "3.10.14" } }, "nbformat": 4, diff --git a/tutorials/suggestions.ipynb b/tutorials/suggestions.ipynb index 0a2d8f5..c80400e 100644 --- a/tutorials/suggestions.ipynb +++ b/tutorials/suggestions.ipynb @@ -6,14 +6,119 @@ "source": [ "# Constraint Suggestions Basic Tutorial\n", "\n", + "__Updated June 2024 to use a new dataset__\n", + "\n", "This Jupyter notebook will give a basic tutorial on how to use PyDeequ's Constraint Suggestions module." 
] }, { "cell_type": "code", "execution_count": 1, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], + "source": [ + "import os\n", + "# indicate your Spark version, here we use Spark 3.5 with pydeequ 1.4.0\n", + "os.environ[\"SPARK_VERSION\"] = '3.5' " + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ":: loading settings :: url = jar:file:/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Ivy Default Cache set to: /home/ec2-user/.ivy2/cache\n", + "The jars for the packages stored in: /home/ec2-user/.ivy2/jars\n", + "com.amazon.deequ#deequ added as a dependency\n", + ":: resolving dependencies :: org.apache.spark#spark-submit-parent-af15f371-12cc-40de-ae72-695f78861e66;1.0\n", + "\tconfs: [default]\n", + "\tfound com.amazon.deequ#deequ;2.0.3-spark-3.3 in central\n", + "\tfound org.scala-lang#scala-reflect;2.12.10 in central\n", + "\tfound org.scalanlp#breeze_2.12;0.13.2 in central\n", + "\tfound org.scalanlp#breeze-macros_2.12;0.13.2 in central\n", + "\tfound com.github.fommil.netlib#core;1.1.2 in central\n", + "\tfound net.sf.opencsv#opencsv;2.3 in central\n", + "\tfound com.github.rwl#jtransforms;2.4.0 in central\n", + "\tfound junit#junit;4.8.2 in central\n", + "\tfound org.apache.commons#commons-math3;3.2 in central\n", + "\tfound org.spire-math#spire_2.12;0.13.0 in central\n", + "\tfound org.spire-math#spire-macros_2.12;0.13.0 in central\n", + "\tfound org.typelevel#machinist_2.12;0.6.1 in central\n", + "\tfound com.chuusai#shapeless_2.12;2.3.2 in central\n", + "\tfound org.typelevel#macro-compat_2.12;1.1.1 in central\n", + "\tfound org.slf4j#slf4j-api;1.7.5 in central\n", + ":: resolution report :: resolve 395ms :: artifacts dl 15ms\n", + "\t:: modules in use:\n", + "\tcom.amazon.deequ#deequ;2.0.3-spark-3.3 from central in [default]\n", + "\tcom.chuusai#shapeless_2.12;2.3.2 from central in [default]\n", + "\tcom.github.fommil.netlib#core;1.1.2 from central in [default]\n", + "\tcom.github.rwl#jtransforms;2.4.0 from central in [default]\n", + "\tjunit#junit;4.8.2 from central in [default]\n", + "\tnet.sf.opencsv#opencsv;2.3 from central in [default]\n", + "\torg.apache.commons#commons-math3;3.2 from central in [default]\n", + "\torg.scala-lang#scala-reflect;2.12.10 from central in [default]\n", + "\torg.scalanlp#breeze-macros_2.12;0.13.2 from central in [default]\n", + "\torg.scalanlp#breeze_2.12;0.13.2 from central in [default]\n", + "\torg.slf4j#slf4j-api;1.7.5 from central in [default]\n", + "\torg.spire-math#spire-macros_2.12;0.13.0 from central in [default]\n", + "\torg.spire-math#spire_2.12;0.13.0 from central in [default]\n", + "\torg.typelevel#machinist_2.12;0.6.1 from central in [default]\n", + "\torg.typelevel#macro-compat_2.12;1.1.1 from central in [default]\n", + "\t:: evicted modules:\n", + "\torg.scala-lang#scala-reflect;2.12.1 by [org.scala-lang#scala-reflect;2.12.10] in [default]\n", + "\torg.scala-lang#scala-reflect;2.12.0 by [org.scala-lang#scala-reflect;2.12.10] in [default]\n", + "\t---------------------------------------------------------------------\n", + "\t| | modules || artifacts |\n", + "\t| conf | number| search|dwnlded|evicted|| number|dwnlded|\n", + "\t---------------------------------------------------------------------\n", + "\t| default | 17 | 0 | 0 | 
2 || 15 | 0 |\n", + "\t---------------------------------------------------------------------\n", + ":: retrieving :: org.apache.spark#spark-submit-parent-af15f371-12cc-40de-ae72-695f78861e66\n", + "\tconfs: [default]\n", + "\t0 artifacts copied, 15 already retrieved (0kB/10ms)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "24/06/14 23:56:34 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Setting default log level to \"WARN\".\n", + "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "24/06/14 23:56:36 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.\n", + "24/06/14 23:56:36 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.\n", + "24/06/14 23:56:36 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.\n", + "24/06/14 23:56:36 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.\n" + ] + } + ], "source": [ "from pyspark.sql import SparkSession, Row, DataFrame\n", "import json\n", @@ -29,6 +134,8 @@ " .config(\"spark.driver.extraClassPath\", classpath)\n", " .config(\"spark.jars.packages\", pydeequ.deequ_maven_coord)\n", " .config(\"spark.jars.excludes\", pydeequ.f2j_maven_coord)\n", + " .config(\"spark.driver.memory\", \"15g\")\n", + " .config(\"spark.sql.parquet.int96RebaseModeInRead\", \"CORRECTED\")\n", " .getOrCreate())" ] }, @@ -36,14 +143,30 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### We will be using the Amazon Product Reviews dataset -- specifically the Electronics subset. 
" + "### We will be using the synthetic reviews dataset for Electronics products" ] }, { "cell_type": "code", - "execution_count": 2, - "metadata": {}, + "execution_count": 3, + "metadata": { + "tags": [] + }, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "24/06/14 23:56:38 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, { "name": "stdout", "output_type": "stream", @@ -53,32 +176,46 @@ " |-- customer_id: string (nullable = true)\n", " |-- review_id: string (nullable = true)\n", " |-- product_id: string (nullable = true)\n", - " |-- product_parent: string (nullable = true)\n", " |-- product_title: string (nullable = true)\n", - " |-- star_rating: integer (nullable = true)\n", - " |-- helpful_votes: integer (nullable = true)\n", - " |-- total_votes: integer (nullable = true)\n", - " |-- vine: string (nullable = true)\n", - " |-- verified_purchase: string (nullable = true)\n", + " |-- star_rating: long (nullable = true)\n", + " |-- helpful_votes: long (nullable = true)\n", + " |-- total_votes: long (nullable = true)\n", + " |-- insight: string (nullable = true)\n", " |-- review_headline: string (nullable = true)\n", " |-- review_body: string (nullable = true)\n", - " |-- review_date: date (nullable = true)\n", - " |-- year: integer (nullable = true)\n", + " |-- review_date: timestamp (nullable = true)\n", + " |-- review_year: long (nullable = true)\n", "\n" ] } ], "source": [ - "df = spark.read.parquet(\"s3a://amazon-reviews-pds/parquet/product_category=Electronics/\")\n", + "df = spark.read.parquet(\"s3a://aws-bigdata-blog/generated_synthetic_reviews/data/product_category=Electronics/\")\n", "\n", "df.printSchema()" ] }, { "cell_type": "code", - "execution_count": 3, - "metadata": {}, + "execution_count": 4, + "metadata": { + "tags": [] + }, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "24/06/14 23:56:43 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 7:===================================================> (10 + 1) / 11]\r" + ] + }, { "name": "stdout", "output_type": "stream", @@ -86,6 +223,24 @@ "{\n", " \"constraint_suggestions\": [\n", " {\n", + " \"constraint_name\": \"ComplianceConstraint(Compliance('insight' has value range 'N', 'Y',`insight` IN ('N', 'Y'),None))\",\n", + " \"column_name\": \"insight\",\n", + " \"current_value\": \"Compliance: 1\",\n", + " \"description\": \"'insight' has value range 'N', 'Y'\",\n", + " \"suggesting_rule\": \"CategoricalRangeRule(com.amazon.deequ.suggestions.rules.CategoricalRangeRule$$$Lambda$2307/0x00000008012b0040@31a0d64e)\",\n", + " \"rule_description\": \"If we see a categorical range for a column, we suggest an IS IN (...) 
constraint\",\n", + " \"code_for_constraint\": \".isContainedIn(\\\"insight\\\", [\\\"N\\\", \\\"Y\\\"])\"\n", + " },\n", + " {\n", + " \"constraint_name\": \"CompletenessConstraint(Completeness(insight,None))\",\n", + " \"column_name\": \"insight\",\n", + " \"current_value\": \"Completeness: 1.0\",\n", + " \"description\": \"'insight' is not null\",\n", + " \"suggesting_rule\": \"CompleteIfCompleteRule()\",\n", + " \"rule_description\": \"If a column is complete in the sample, we suggest a NOT NULL constraint\",\n", + " \"code_for_constraint\": \".isComplete(\\\"insight\\\")\"\n", + " },\n", + " {\n", " \"constraint_name\": \"CompletenessConstraint(Completeness(review_id,None))\",\n", " \"column_name\": \"review_id\",\n", " \"current_value\": \"Completeness: 1.0\",\n", @@ -97,7 +252,7 @@ " {\n", " \"constraint_name\": \"UniquenessConstraint(Uniqueness(List(review_id),None))\",\n", " \"column_name\": \"review_id\",\n", - " \"current_value\": \"ApproxDistinctness: 0.9647650802419017\",\n", + " \"current_value\": \"ApproxDistinctness: 1.0496308168923523\",\n", " \"description\": \"'review_id' is unique\",\n", " \"suggesting_rule\": \"UniqueIfApproximatelyUniqueRule()\",\n", " \"rule_description\": \"If the ratio of approximate num distinct values in a column is close to the number of records (within the error of the HLL sketch), we suggest a UNIQUE constraint\",\n", @@ -115,7 +270,7 @@ " {\n", " \"constraint_name\": \"ComplianceConstraint(Compliance('customer_id' has no negative values,customer_id >= 0,None))\",\n", " \"column_name\": \"customer_id\",\n", - " \"current_value\": \"Minimum: 10005.0\",\n", + " \"current_value\": \"Minimum: 100000.0\",\n", " \"description\": \"'customer_id' has no negative values\",\n", " \"suggesting_rule\": \"NonNegativeNumbersRule()\",\n", " \"rule_description\": \"If we see only non-negative numbers in a column, we suggest a corresponding constraint\",\n", @@ -140,6 +295,15 @@ " \"code_for_constraint\": \".isComplete(\\\"review_date\\\")\"\n", " },\n", " {\n", + " \"constraint_name\": \"ComplianceConstraint(Compliance('helpful_votes' has value range '15', '14', '13', '16', '17', '12', '18', '11', '19', '10', '9', '20', '8', '21', '22', '7', '6', '23', '5', '24', '4', '25', '26', '3',`helpful_votes` IN ('15', '14', '13', '16', '17', '12', '18', '11', '19', '10', '9', '20', '8', '21', '22', '7', '6', '23', '5', '24', '4', '25', '26', '3'),None))\",\n", + " \"column_name\": \"helpful_votes\",\n", + " \"current_value\": \"Compliance: 1\",\n", + " \"description\": \"'helpful_votes' has value range '15', '14', '13', '16', '17', '12', '18', '11', '19', '10', '9', '20', '8', '21', '22', '7', '6', '23', '5', '24', '4', '25', '26', '3'\",\n", + " \"suggesting_rule\": \"CategoricalRangeRule(com.amazon.deequ.suggestions.rules.CategoricalRangeRule$$$Lambda$2307/0x00000008012b0040@31a0d64e)\",\n", + " \"rule_description\": \"If we see a categorical range for a column, we suggest an IS IN (...) 
constraint\",\n", + " \"code_for_constraint\": \".isContainedIn(\\\"helpful_votes\\\", [\\\"15\\\", \\\"14\\\", \\\"13\\\", \\\"16\\\", \\\"17\\\", \\\"12\\\", \\\"18\\\", \\\"11\\\", \\\"19\\\", \\\"10\\\", \\\"9\\\", \\\"20\\\", \\\"8\\\", \\\"21\\\", \\\"22\\\", \\\"7\\\", \\\"6\\\", \\\"23\\\", \\\"5\\\", \\\"24\\\", \\\"4\\\", \\\"25\\\", \\\"26\\\", \\\"3\\\"])\"\n", + " },\n", + " {\n", " \"constraint_name\": \"CompletenessConstraint(Completeness(helpful_votes,None))\",\n", " \"column_name\": \"helpful_votes\",\n", " \"current_value\": \"Completeness: 1.0\",\n", @@ -149,15 +313,33 @@ " \"code_for_constraint\": \".isComplete(\\\"helpful_votes\\\")\"\n", " },\n", " {\n", + " \"constraint_name\": \"ComplianceConstraint(Compliance('helpful_votes' has value range '15', '14', '13', '16', '17', '12', '18', '11' for at least 92.0% of values,`helpful_votes` IN ('15', '14', '13', '16', '17', '12', '18', '11'),None))\",\n", + " \"column_name\": \"helpful_votes\",\n", + " \"current_value\": \"Compliance: 0.926417449248947\",\n", + " \"description\": \"'helpful_votes' has value range '15', '14', '13', '16', '17', '12', '18', '11' for at least 92.0% of values\",\n", + " \"suggesting_rule\": \"FractionalCategoricalRangeRule(0.9,com.amazon.deequ.suggestions.rules.FractionalCategoricalRangeRule$$$Lambda$2308/0x00000008012b0840@227699b9)\",\n", + " \"rule_description\": \"If we see a categorical range for most values in a column, we suggest an IS IN (...) constraint that should hold for most values\",\n", + " \"code_for_constraint\": \".isContainedIn(\\\"helpful_votes\\\", [\\\"15\\\", \\\"14\\\", \\\"13\\\", \\\"16\\\", \\\"17\\\", \\\"12\\\", \\\"18\\\", \\\"11\\\"], lambda x: x >= 0.92, \\\"It should be above 0.92!\\\")\"\n", + " },\n", + " {\n", " \"constraint_name\": \"ComplianceConstraint(Compliance('helpful_votes' has no negative values,helpful_votes >= 0,None))\",\n", " \"column_name\": \"helpful_votes\",\n", - " \"current_value\": \"Minimum: 0.0\",\n", + " \"current_value\": \"Minimum: 3.0\",\n", " \"description\": \"'helpful_votes' has no negative values\",\n", " \"suggesting_rule\": \"NonNegativeNumbersRule()\",\n", " \"rule_description\": \"If we see only non-negative numbers in a column, we suggest a corresponding constraint\",\n", " \"code_for_constraint\": \".isNonNegative(\\\"helpful_votes\\\")\"\n", " },\n", " {\n", + " \"constraint_name\": \"ComplianceConstraint(Compliance('star_rating' has value range '5', '4', '3', '2', '1',`star_rating` IN ('5', '4', '3', '2', '1'),None))\",\n", + " \"column_name\": \"star_rating\",\n", + " \"current_value\": \"Compliance: 1\",\n", + " \"description\": \"'star_rating' has value range '5', '4', '3', '2', '1'\",\n", + " \"suggesting_rule\": \"CategoricalRangeRule(com.amazon.deequ.suggestions.rules.CategoricalRangeRule$$$Lambda$2307/0x00000008012b0040@31a0d64e)\",\n", + " \"rule_description\": \"If we see a categorical range for a column, we suggest an IS IN (...) 
constraint\",\n", + " \"code_for_constraint\": \".isContainedIn(\\\"star_rating\\\", [\\\"5\\\", \\\"4\\\", \\\"3\\\", \\\"2\\\", \\\"1\\\"])\"\n", + " },\n", + " {\n", " \"constraint_name\": \"CompletenessConstraint(Completeness(star_rating,None))\",\n", " \"column_name\": \"star_rating\",\n", " \"current_value\": \"Completeness: 1.0\",\n", @@ -167,6 +349,15 @@ " \"code_for_constraint\": \".isComplete(\\\"star_rating\\\")\"\n", " },\n", " {\n", + " \"constraint_name\": \"ComplianceConstraint(Compliance('star_rating' has value range '5', '4', '3', '2' for at least 94.0% of values,`star_rating` IN ('5', '4', '3', '2'),None))\",\n", + " \"column_name\": \"star_rating\",\n", + " \"current_value\": \"Compliance: 0.9499998671525341\",\n", + " \"description\": \"'star_rating' has value range '5', '4', '3', '2' for at least 94.0% of values\",\n", + " \"suggesting_rule\": \"FractionalCategoricalRangeRule(0.9,com.amazon.deequ.suggestions.rules.FractionalCategoricalRangeRule$$$Lambda$2308/0x00000008012b0840@227699b9)\",\n", + " \"rule_description\": \"If we see a categorical range for most values in a column, we suggest an IS IN (...) constraint that should hold for most values\",\n", + " \"code_for_constraint\": \".isContainedIn(\\\"star_rating\\\", [\\\"5\\\", \\\"4\\\", \\\"3\\\", \\\"2\\\"], lambda x: x >= 0.94, \\\"It should be above 0.94!\\\")\"\n", + " },\n", + " {\n", " \"constraint_name\": \"ComplianceConstraint(Compliance('star_rating' has no negative values,star_rating >= 0,None))\",\n", " \"column_name\": \"star_rating\",\n", " \"current_value\": \"Minimum: 1.0\",\n", @@ -176,24 +367,6 @@ " \"code_for_constraint\": \".isNonNegative(\\\"star_rating\\\")\"\n", " },\n", " {\n", - " \"constraint_name\": \"CompletenessConstraint(Completeness(year,None))\",\n", - " \"column_name\": \"year\",\n", - " \"current_value\": \"Completeness: 1.0\",\n", - " \"description\": \"'year' is not null\",\n", - " \"suggesting_rule\": \"CompleteIfCompleteRule()\",\n", - " \"rule_description\": \"If a column is complete in the sample, we suggest a NOT NULL constraint\",\n", - " \"code_for_constraint\": \".isComplete(\\\"year\\\")\"\n", - " },\n", - " {\n", - " \"constraint_name\": \"ComplianceConstraint(Compliance('year' has no negative values,year >= 0,None))\",\n", - " \"column_name\": \"year\",\n", - " \"current_value\": \"Minimum: 1999.0\",\n", - " \"description\": \"'year' has no negative values\",\n", - " \"suggesting_rule\": \"NonNegativeNumbersRule()\",\n", - " \"rule_description\": \"If we see only non-negative numbers in a column, we suggest a corresponding constraint\",\n", - " \"code_for_constraint\": \".isNonNegative(\\\"year\\\")\"\n", - " },\n", - " {\n", " \"constraint_name\": \"CompletenessConstraint(Completeness(product_title,None))\",\n", " \"column_name\": \"product_title\",\n", " \"current_value\": \"Completeness: 1.0\",\n", @@ -205,150 +378,157 @@ " {\n", " \"constraint_name\": \"CompletenessConstraint(Completeness(review_headline,None))\",\n", " \"column_name\": \"review_headline\",\n", - " \"current_value\": \"Completeness: 0.9999987183340393\",\n", - " \"description\": \"'review_headline' has less than 1% missing values\",\n", - " \"suggesting_rule\": \"RetainCompletenessRule()\",\n", - " \"rule_description\": \"If a column is incomplete in the sample, we model its completeness as a binomial variable, estimate a confidence interval and use this to define a lower bound for the completeness\",\n", - " \"code_for_constraint\": \".hasCompleteness(\\\"review_headline\\\", lambda x: x >= 0.99, 
\\\"It should be above 0.99!\\\")\"\n", - " },\n", - " {\n", - " \"constraint_name\": \"CompletenessConstraint(Completeness(product_id,None))\",\n", - " \"column_name\": \"product_id\",\n", " \"current_value\": \"Completeness: 1.0\",\n", - " \"description\": \"'product_id' is not null\",\n", + " \"description\": \"'review_headline' is not null\",\n", " \"suggesting_rule\": \"CompleteIfCompleteRule()\",\n", " \"rule_description\": \"If a column is complete in the sample, we suggest a NOT NULL constraint\",\n", - " \"code_for_constraint\": \".isComplete(\\\"product_id\\\")\"\n", + " \"code_for_constraint\": \".isComplete(\\\"review_headline\\\")\"\n", " },\n", " {\n", - " \"constraint_name\": \"CompletenessConstraint(Completeness(total_votes,None))\",\n", - " \"column_name\": \"total_votes\",\n", + " \"constraint_name\": \"ComplianceConstraint(Compliance('review_year' has value range '2016', '2015', '2014', '2013', '2012', '2011', '2010', '2009', '2008', '2007', '2006', '2005', '2004', '2003', '2002', '2001', '2000', '1999', '1998', '1997', '1996', '1696', '2101', '2202',`review_year` IN ('2016', '2015', '2014', '2013', '2012', '2011', '2010', '2009', '2008', '2007', '2006', '2005', '2004', '2003', '2002', '2001', '2000', '1999', '1998', '1997', '1996', '1696', '2101', '2202'),None))\",\n", + " \"column_name\": \"review_year\",\n", + " \"current_value\": \"Compliance: 1\",\n", + " \"description\": \"'review_year' has value range '2016', '2015', '2014', '2013', '2012', '2011', '2010', '2009', '2008', '2007', '2006', '2005', '2004', '2003', '2002', '2001', '2000', '1999', '1998', '1997', '1996', '1696', '2101', '2202'\",\n", + " \"suggesting_rule\": \"CategoricalRangeRule(com.amazon.deequ.suggestions.rules.CategoricalRangeRule$$$Lambda$2307/0x00000008012b0040@31a0d64e)\",\n", + " \"rule_description\": \"If we see a categorical range for a column, we suggest an IS IN (...) 
constraint\",\n", + " \"code_for_constraint\": \".isContainedIn(\\\"review_year\\\", [\\\"2016\\\", \\\"2015\\\", \\\"2014\\\", \\\"2013\\\", \\\"2012\\\", \\\"2011\\\", \\\"2010\\\", \\\"2009\\\", \\\"2008\\\", \\\"2007\\\", \\\"2006\\\", \\\"2005\\\", \\\"2004\\\", \\\"2003\\\", \\\"2002\\\", \\\"2001\\\", \\\"2000\\\", \\\"1999\\\", \\\"1998\\\", \\\"1997\\\", \\\"1996\\\", \\\"1696\\\", \\\"2101\\\", \\\"2202\\\"])\"\n", + " },\n", + " {\n", + " \"constraint_name\": \"CompletenessConstraint(Completeness(review_year,None))\",\n", + " \"column_name\": \"review_year\",\n", " \"current_value\": \"Completeness: 1.0\",\n", - " \"description\": \"'total_votes' is not null\",\n", + " \"description\": \"'review_year' is not null\",\n", " \"suggesting_rule\": \"CompleteIfCompleteRule()\",\n", " \"rule_description\": \"If a column is complete in the sample, we suggest a NOT NULL constraint\",\n", - " \"code_for_constraint\": \".isComplete(\\\"total_votes\\\")\"\n", + " \"code_for_constraint\": \".isComplete(\\\"review_year\\\")\"\n", " },\n", " {\n", - " \"constraint_name\": \"ComplianceConstraint(Compliance('total_votes' has no negative values,total_votes >= 0,None))\",\n", - " \"column_name\": \"total_votes\",\n", - " \"current_value\": \"Minimum: 0.0\",\n", - " \"description\": \"'total_votes' has no negative values\",\n", + " \"constraint_name\": \"ComplianceConstraint(Compliance('review_year' has value range '2016', '2015', '2014', '2013', '2012', '2011', '2010', '2009', '2008', '2007', '2006', '2005' for at least 91.0% of values,`review_year` IN ('2016', '2015', '2014', '2013', '2012', '2011', '2010', '2009', '2008', '2007', '2006', '2005'),None))\",\n", + " \"column_name\": \"review_year\",\n", + " \"current_value\": \"Compliance: 0.9157531189263799\",\n", + " \"description\": \"'review_year' has value range '2016', '2015', '2014', '2013', '2012', '2011', '2010', '2009', '2008', '2007', '2006', '2005' for at least 91.0% of values\",\n", + " \"suggesting_rule\": \"FractionalCategoricalRangeRule(0.9,com.amazon.deequ.suggestions.rules.FractionalCategoricalRangeRule$$$Lambda$2308/0x00000008012b0840@227699b9)\",\n", + " \"rule_description\": \"If we see a categorical range for most values in a column, we suggest an IS IN (...) 
constraint that should hold for most values\",\n", + " \"code_for_constraint\": \".isContainedIn(\\\"review_year\\\", [\\\"2016\\\", \\\"2015\\\", \\\"2014\\\", \\\"2013\\\", \\\"2012\\\", \\\"2011\\\", \\\"2010\\\", \\\"2009\\\", \\\"2008\\\", \\\"2007\\\", \\\"2006\\\", \\\"2005\\\"], lambda x: x >= 0.91, \\\"It should be above 0.91!\\\")\"\n", + " },\n", + " {\n", + " \"constraint_name\": \"ComplianceConstraint(Compliance('review_year' has no negative values,review_year >= 0,None))\",\n", + " \"column_name\": \"review_year\",\n", + " \"current_value\": \"Minimum: 1696.0\",\n", + " \"description\": \"'review_year' has no negative values\",\n", " \"suggesting_rule\": \"NonNegativeNumbersRule()\",\n", " \"rule_description\": \"If we see only non-negative numbers in a column, we suggest a corresponding constraint\",\n", - " \"code_for_constraint\": \".isNonNegative(\\\"total_votes\\\")\"\n", + " \"code_for_constraint\": \".isNonNegative(\\\"review_year\\\")\"\n", " },\n", " {\n", - " \"constraint_name\": \"CompletenessConstraint(Completeness(product_parent,None))\",\n", - " \"column_name\": \"product_parent\",\n", + " \"constraint_name\": \"CompletenessConstraint(Completeness(product_id,None))\",\n", + " \"column_name\": \"product_id\",\n", " \"current_value\": \"Completeness: 1.0\",\n", - " \"description\": \"'product_parent' is not null\",\n", + " \"description\": \"'product_id' is not null\",\n", " \"suggesting_rule\": \"CompleteIfCompleteRule()\",\n", " \"rule_description\": \"If a column is complete in the sample, we suggest a NOT NULL constraint\",\n", - " \"code_for_constraint\": \".isComplete(\\\"product_parent\\\")\"\n", + " \"code_for_constraint\": \".isComplete(\\\"product_id\\\")\"\n", " },\n", " {\n", - " \"constraint_name\": \"ComplianceConstraint(Compliance('product_parent' has no negative values,product_parent >= 0,None))\",\n", - " \"column_name\": \"product_parent\",\n", - " \"current_value\": \"Minimum: 6478.0\",\n", - " \"description\": \"'product_parent' has no negative values\",\n", + " \"constraint_name\": \"ComplianceConstraint(Compliance('product_id' has no negative values,product_id >= 0,None))\",\n", + " \"column_name\": \"product_id\",\n", + " \"current_value\": \"Minimum: 10000.0\",\n", + " \"description\": \"'product_id' has no negative values\",\n", " \"suggesting_rule\": \"NonNegativeNumbersRule()\",\n", " \"rule_description\": \"If we see only non-negative numbers in a column, we suggest a corresponding constraint\",\n", - " \"code_for_constraint\": \".isNonNegative(\\\"product_parent\\\")\"\n", + " \"code_for_constraint\": \".isNonNegative(\\\"product_id\\\")\"\n", " },\n", " {\n", - " \"constraint_name\": \"AnalysisBasedConstraint(DataType(product_parent,None),,Some(),None)\",\n", - " \"column_name\": \"product_parent\",\n", + " \"constraint_name\": \"AnalysisBasedConstraint(DataType(product_id,None),,Some(),None)\",\n", + " \"column_name\": \"product_id\",\n", " \"current_value\": \"DataType: Integral\",\n", - " \"description\": \"'product_parent' has type Integral\",\n", + " \"description\": \"'product_id' has type Integral\",\n", " \"suggesting_rule\": \"RetainTypeRule()\",\n", " \"rule_description\": \"If we detect a non-string type, we suggest a type constraint\",\n", - " \"code_for_constraint\": \".hasDataType(\\\"product_parent\\\", ConstrainableDataTypes.Integral)\"\n", + " \"code_for_constraint\": \".hasDataType(\\\"product_id\\\", ConstrainableDataTypes.Integral)\"\n", " },\n", " {\n", - " \"constraint_name\": 
\"CompletenessConstraint(Completeness(review_body,None))\",\n", - " \"column_name\": \"review_body\",\n", - " \"current_value\": \"Completeness: 0.9999724441818453\",\n", - " \"description\": \"'review_body' has less than 1% missing values\",\n", - " \"suggesting_rule\": \"RetainCompletenessRule()\",\n", - " \"rule_description\": \"If a column is incomplete in the sample, we model its completeness as a binomial variable, estimate a confidence interval and use this to define a lower bound for the completeness\",\n", - " \"code_for_constraint\": \".hasCompleteness(\\\"review_body\\\", lambda x: x >= 0.99, \\\"It should be above 0.99!\\\")\"\n", - " },\n", - " {\n", - " \"constraint_name\": \"ComplianceConstraint(Compliance('vine' has value range 'N', 'Y',`vine` IN ('N', 'Y'),None))\",\n", - " \"column_name\": \"vine\",\n", + " \"constraint_name\": \"ComplianceConstraint(Compliance('total_votes' has value range '19', '20', '21', '18', '17', '22', '23', '16', '15', '24', '25', '14', '26', '13', '12', '27', '28', '11', '10', '29', '30', '9', '8', '31', '32', '7', '33', '6', '5', '34', '4', '35', '3', '36', '2', '37', '38', '39', '1', '0',`total_votes` IN ('19', '20', '21', '18', '17', '22', '23', '16', '15', '24', '25', '14', '26', '13', '12', '27', '28', '11', '10', '29', '30', '9', '8', '31', '32', '7', '33', '6', '5', '34', '4', '35', '3', '36', '2', '37', '38', '39', '1', '0'),None))\",\n", + " \"column_name\": \"total_votes\",\n", " \"current_value\": \"Compliance: 1\",\n", - " \"description\": \"'vine' has value range 'N', 'Y'\",\n", - " \"suggesting_rule\": \"CategoricalRangeRule()\",\n", + " \"description\": \"'total_votes' has value range '19', '20', '21', '18', '17', '22', '23', '16', '15', '24', '25', '14', '26', '13', '12', '27', '28', '11', '10', '29', '30', '9', '8', '31', '32', '7', '33', '6', '5', '34', '4', '35', '3', '36', '2', '37', '38', '39', '1', '0'\",\n", + " \"suggesting_rule\": \"CategoricalRangeRule(com.amazon.deequ.suggestions.rules.CategoricalRangeRule$$$Lambda$2307/0x00000008012b0040@31a0d64e)\",\n", " \"rule_description\": \"If we see a categorical range for a column, we suggest an IS IN (...) 
constraint\",\n", - " \"code_for_constraint\": \".isContainedIn(\\\"vine\\\", [\\\"N\\\", \\\"Y\\\"])\"\n", + " \"code_for_constraint\": \".isContainedIn(\\\"total_votes\\\", [\\\"19\\\", \\\"20\\\", \\\"21\\\", \\\"18\\\", \\\"17\\\", \\\"22\\\", \\\"23\\\", \\\"16\\\", \\\"15\\\", \\\"24\\\", \\\"25\\\", \\\"14\\\", \\\"26\\\", \\\"13\\\", \\\"12\\\", \\\"27\\\", \\\"28\\\", \\\"11\\\", \\\"10\\\", \\\"29\\\", \\\"30\\\", \\\"9\\\", \\\"8\\\", \\\"31\\\", \\\"32\\\", \\\"7\\\", \\\"33\\\", \\\"6\\\", \\\"5\\\", \\\"34\\\", \\\"4\\\", \\\"35\\\", \\\"3\\\", \\\"36\\\", \\\"2\\\", \\\"37\\\", \\\"38\\\", \\\"39\\\", \\\"1\\\", \\\"0\\\"])\"\n", " },\n", " {\n", - " \"constraint_name\": \"CompletenessConstraint(Completeness(vine,None))\",\n", - " \"column_name\": \"vine\",\n", + " \"constraint_name\": \"CompletenessConstraint(Completeness(total_votes,None))\",\n", + " \"column_name\": \"total_votes\",\n", " \"current_value\": \"Completeness: 1.0\",\n", - " \"description\": \"'vine' is not null\",\n", + " \"description\": \"'total_votes' is not null\",\n", " \"suggesting_rule\": \"CompleteIfCompleteRule()\",\n", " \"rule_description\": \"If a column is complete in the sample, we suggest a NOT NULL constraint\",\n", - " \"code_for_constraint\": \".isComplete(\\\"vine\\\")\"\n", + " \"code_for_constraint\": \".isComplete(\\\"total_votes\\\")\"\n", " },\n", " {\n", - " \"constraint_name\": \"ComplianceConstraint(Compliance('vine' has value range 'N' for at least 99.0% of values,`vine` IN ('N'),None))\",\n", - " \"column_name\": \"vine\",\n", - " \"current_value\": \"Compliance: 0.9939271462617969\",\n", - " \"description\": \"'vine' has value range 'N' for at least 99.0% of values\",\n", - " \"suggesting_rule\": \"FractionalCategoricalRangeRule(0.9)\",\n", + " \"constraint_name\": \"ComplianceConstraint(Compliance('total_votes' has value range '19', '20', '21', '18', '17', '22', '23', '16', '15', '24', '25', '14', '26' for at least 90.0% of values,`total_votes` IN ('19', '20', '21', '18', '17', '22', '23', '16', '15', '24', '25', '14', '26'),None))\",\n", + " \"column_name\": \"total_votes\",\n", + " \"current_value\": \"Compliance: 0.904062874048646\",\n", + " \"description\": \"'total_votes' has value range '19', '20', '21', '18', '17', '22', '23', '16', '15', '24', '25', '14', '26' for at least 90.0% of values\",\n", + " \"suggesting_rule\": \"FractionalCategoricalRangeRule(0.9,com.amazon.deequ.suggestions.rules.FractionalCategoricalRangeRule$$$Lambda$2308/0x00000008012b0840@227699b9)\",\n", " \"rule_description\": \"If we see a categorical range for most values in a column, we suggest an IS IN (...) 
constraint that should hold for most values\",\n", - " \"code_for_constraint\": \".isContainedIn(\\\"vine\\\", [\\\"N\\\"], lambda x: x >= 0.99, \\\"It should be above 0.99!\\\")\"\n", + " \"code_for_constraint\": \".isContainedIn(\\\"total_votes\\\", [\\\"19\\\", \\\"20\\\", \\\"21\\\", \\\"18\\\", \\\"17\\\", \\\"22\\\", \\\"23\\\", \\\"16\\\", \\\"15\\\", \\\"24\\\", \\\"25\\\", \\\"14\\\", \\\"26\\\"], lambda x: x >= 0.9, \\\"It should be above 0.9!\\\")\"\n", " },\n", " {\n", - " \"constraint_name\": \"ComplianceConstraint(Compliance('marketplace' has value range 'US', 'UK', 'DE', 'JP', 'FR',`marketplace` IN ('US', 'UK', 'DE', 'JP', 'FR'),None))\",\n", - " \"column_name\": \"marketplace\",\n", - " \"current_value\": \"Compliance: 1\",\n", - " \"description\": \"'marketplace' has value range 'US', 'UK', 'DE', 'JP', 'FR'\",\n", - " \"suggesting_rule\": \"CategoricalRangeRule()\",\n", - " \"rule_description\": \"If we see a categorical range for a column, we suggest an IS IN (...) constraint\",\n", - " \"code_for_constraint\": \".isContainedIn(\\\"marketplace\\\", [\\\"US\\\", \\\"UK\\\", \\\"DE\\\", \\\"JP\\\", \\\"FR\\\"])\"\n", + " \"constraint_name\": \"ComplianceConstraint(Compliance('total_votes' has no negative values,total_votes >= 0,None))\",\n", + " \"column_name\": \"total_votes\",\n", + " \"current_value\": \"Minimum: 0.0\",\n", + " \"description\": \"'total_votes' has no negative values\",\n", + " \"suggesting_rule\": \"NonNegativeNumbersRule()\",\n", + " \"rule_description\": \"If we see only non-negative numbers in a column, we suggest a corresponding constraint\",\n", + " \"code_for_constraint\": \".isNonNegative(\\\"total_votes\\\")\"\n", " },\n", " {\n", - " \"constraint_name\": \"CompletenessConstraint(Completeness(marketplace,None))\",\n", - " \"column_name\": \"marketplace\",\n", + " \"constraint_name\": \"CompletenessConstraint(Completeness(review_body,None))\",\n", + " \"column_name\": \"review_body\",\n", " \"current_value\": \"Completeness: 1.0\",\n", - " \"description\": \"'marketplace' is not null\",\n", + " \"description\": \"'review_body' is not null\",\n", " \"suggesting_rule\": \"CompleteIfCompleteRule()\",\n", " \"rule_description\": \"If a column is complete in the sample, we suggest a NOT NULL constraint\",\n", - " \"code_for_constraint\": \".isComplete(\\\"marketplace\\\")\"\n", + " \"code_for_constraint\": \".isComplete(\\\"review_body\\\")\"\n", " },\n", " {\n", - " \"constraint_name\": \"ComplianceConstraint(Compliance('marketplace' has value range 'US' for at least 99.0% of values,`marketplace` IN ('US'),None))\",\n", - " \"column_name\": \"marketplace\",\n", - " \"current_value\": \"Compliance: 0.9949982985884372\",\n", - " \"description\": \"'marketplace' has value range 'US' for at least 99.0% of values\",\n", - " \"suggesting_rule\": \"FractionalCategoricalRangeRule(0.9)\",\n", - " \"rule_description\": \"If we see a categorical range for most values in a column, we suggest an IS IN (...) 
constraint that should hold for most values\",\n", - " \"code_for_constraint\": \".isContainedIn(\\\"marketplace\\\", [\\\"US\\\"], lambda x: x >= 0.99, \\\"It should be above 0.99!\\\")\"\n", + " \"constraint_name\": \"UniquenessConstraint(Uniqueness(List(review_body),None))\",\n", + " \"column_name\": \"review_body\",\n", + " \"current_value\": \"ApproxDistinctness: 1.0332703193520232\",\n", + " \"description\": \"'review_body' is unique\",\n", + " \"suggesting_rule\": \"UniqueIfApproximatelyUniqueRule()\",\n", + " \"rule_description\": \"If the ratio of approximate num distinct values in a column is close to the number of records (within the error of the HLL sketch), we suggest a UNIQUE constraint\",\n", + " \"code_for_constraint\": \".isUnique(\\\"review_body\\\")\"\n", " },\n", " {\n", - " \"constraint_name\": \"ComplianceConstraint(Compliance('verified_purchase' has value range 'Y', 'N',`verified_purchase` IN ('Y', 'N'),None))\",\n", - " \"column_name\": \"verified_purchase\",\n", + " \"constraint_name\": \"ComplianceConstraint(Compliance('marketplace' has value range '', 'FR', 'JP', 'UK', 'DE', 'US',`marketplace` IN ('', 'FR', 'JP', 'UK', 'DE', 'US'),None))\",\n", + " \"column_name\": \"marketplace\",\n", " \"current_value\": \"Compliance: 1\",\n", - " \"description\": \"'verified_purchase' has value range 'Y', 'N'\",\n", - " \"suggesting_rule\": \"CategoricalRangeRule()\",\n", + " \"description\": \"'marketplace' has value range '', 'FR', 'JP', 'UK', 'DE', 'US'\",\n", + " \"suggesting_rule\": \"CategoricalRangeRule(com.amazon.deequ.suggestions.rules.CategoricalRangeRule$$$Lambda$2307/0x00000008012b0040@31a0d64e)\",\n", " \"rule_description\": \"If we see a categorical range for a column, we suggest an IS IN (...) constraint\",\n", - " \"code_for_constraint\": \".isContainedIn(\\\"verified_purchase\\\", [\\\"Y\\\", \\\"N\\\"])\"\n", + " \"code_for_constraint\": \".isContainedIn(\\\"marketplace\\\", [\\\"\\\", \\\"FR\\\", \\\"JP\\\", \\\"UK\\\", \\\"DE\\\", \\\"US\\\"])\"\n", " },\n", " {\n", - " \"constraint_name\": \"CompletenessConstraint(Completeness(verified_purchase,None))\",\n", - " \"column_name\": \"verified_purchase\",\n", - " \"current_value\": \"Completeness: 1.0\",\n", - " \"description\": \"'verified_purchase' is not null\",\n", - " \"suggesting_rule\": \"CompleteIfCompleteRule()\",\n", - " \"rule_description\": \"If a column is complete in the sample, we suggest a NOT NULL constraint\",\n", - " \"code_for_constraint\": \".isComplete(\\\"verified_purchase\\\")\"\n", + " \"constraint_name\": \"CompletenessConstraint(Completeness(marketplace,None))\",\n", + " \"column_name\": \"marketplace\",\n", + " \"current_value\": \"Completeness: 0.8570391886739565\",\n", + " \"description\": \"'marketplace' has less than 15% missing values\",\n", + " \"suggesting_rule\": \"RetainCompletenessRule()\",\n", + " \"rule_description\": \"If a column is incomplete in the sample, we model its completeness as a binomial variable, estimate a confidence interval and use this to define a lower bound for the completeness\",\n", + " \"code_for_constraint\": \".hasCompleteness(\\\"marketplace\\\", lambda x: x >= 0.85, \\\"It should be above 0.85!\\\")\"\n", " }\n", " ]\n", "}\n" ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] } ], "source": [ @@ -393,7 +573,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.5" + "version": "3.10.14" } }, "nbformat": 4, diff --git a/tutorials/test_data_quality_at_scale.ipynb 
b/tutorials/test_data_quality_at_scale.ipynb index 8d82827..0963c66 100644 --- a/tutorials/test_data_quality_at_scale.ipynb +++ b/tutorials/test_data_quality_at_scale.ipynb @@ -4,85 +4,252 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Test data quality at scale with PyDeequ\n", + "# Testing data quality at scale with PyDeequ\n", "\n", - "Authors: Calvin Wang (calviwan@), Chris Ghyzel (cghyzel@), Joan Aoanan (jaoanan@), Veronika Megler (meglerv@) " + "Authors: Vitalina Komashko (komashk@), Calvin Wang (calviwan@), Chris Ghyzel (cghyzel@), Joan Aoanan (jaoanan@), Veronika Megler (meglerv@) \n", + "\n", + "__Updated June 2024 to use a new dataset and add additional library usage examples.__\n" ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "You generally write unit tests for your code, but do you also test your data? Incoming data quality can make or break your machine learning application. Incorrect, missing or malformed data can have a large impact on production systems. Examples of data quality issues are:\n", + "This notebook accompanies the AWS Blog post [Testing data quality at scale with PyDeequ](https://aws.amazon.com/blogs/big-data/testing-data-quality-at-scale-with-pydeequ/).\n", + "\n", + "You generally write unit tests for your code, but do you also test your data? Incoming data quality can make or break your application. Incorrect, missing, or malformed data can have a large impact on production systems. Examples of data quality issues include the following:\n", "\n", - "* Missing values can lead to failures in production system that require non-null values (NullPointerException).\n", - "* Changes in the distribution of data can lead to unexpected outputs of machine learning models.\n", - "* Aggregations of incorrect data can lead to wrong business decisions.\n", + "- Missing values can lead to failures in the production system that require non-null values (`NullPointerException`)\n", + "- Changes in the distribution of data can lead to unexpected outputs of machine learning (ML) models\n", + "- Aggregations of incorrect data can lead to misguided business decisions\n", "\n", - "In this blog post, we introduce PyDeequ, an open source Python wrapper over [Deequ](https://aws.amazon.com/blogs/big-data/test-data-quality-at-scale-with-deequ/) (an open source tool developed and used at Amazon). While Deequ is written in Scala, PyDeequ allows you to use its data quality and testing capabilities from Python and PySpark, the language of choice of many data scientists. PyDeequ democratizes and extends the power of Deequ by allowing you to use it alongside the many data science libraries that are available in that language. Furthermore, PyDeequ allows for fluid interface with [Pandas](https://pandas.pydata.org/) DataFrame as opposed to restricting within Spark DataFrames. \n", + "In this post, we introduce PyDeequ, an open source Python wrapper over [Deequ](https://aws.amazon.com/blogs/big-data/test-data-quality-at-scale-with-deequ/) (an open source tool developed and used at Amazon). Deequ is written in [Scala](https://www.scala-lang.org/), whereas PyDeequ allows you to use its data quality and testing capabilities from Python and PySpark, the language of choice for many data scientists. PyDeequ democratizes and extends the power of Deequ by allowing you to use it alongside the many data science libraries that are available in that language. 
Furthermore, PyDeequ offers a fluid interface to [pandas](https://pandas.pydata.org/) DataFrames rather than restricting you to [Apache Spark](https://spark.apache.org/) DataFrames.\n", "\n", - "Deequ allows you to calculate data quality metrics on your dataset, define and verify data quality constraints, and be informed about changes in the data distribution. Instead of implementing checks and verification algorithms on your own, you can focus on describing how your data should look. Deequ supports you by suggesting checks for you. Deequ is implemented on top of [Apache Spark](https://spark.apache.org/) and is designed to scale with large datasets (think billions of rows) that typically live in a distributed filesystem or a data warehouse. PyDeequ gives you access to this capability, but also allows you to use it from the familiar environment of your Python Jupyter notebook.\n", + "Deequ allows you to calculate data quality metrics for your dataset, define and verify data quality constraints, and be informed about changes in data distribution. Instead of implementing checks and verification algorithms on your own, you can focus on describing how your data should look. Deequ supports you by suggesting checks for you. Deequ is implemented on top of Apache Spark and is designed to scale with large datasets (billions of rows) that typically live in a data lake, distributed file system, or a data warehouse. PyDeequ gives you access to this capability, but also allows you to use it from the familiar environment of your Python [Jupyter](https://jupyter.org/) notebook.\n", "\n", "## Deequ at Amazon \n", "\n", - "Deequ is being used internally at Amazon for verifying the quality of many large production datasets. Dataset producers can add and edit data quality constraints. The system computes data quality metrics on a regular basis (with every new version of a dataset), verifies constraints defined by dataset producers, and publishes datasets to consumers in case of success. In error cases, dataset publication can be stopped, and producers are notified to take action. Data quality issues do not propagate to consumer data pipelines, reducing their blast radius. \n", + "Deequ is used internally at Amazon to verify the quality of many large production datasets. Dataset producers can add and edit data quality constraints. The system computes data quality metrics on a regular basis (with every new version of a dataset), verifies constraints defined by dataset producers, and publishes datasets to consumers in case of success. In error cases, dataset publication can be stopped, and producers are notified to take action. Data quality issues don’t propagate to consumer data pipelines, reducing their area of impact.\n", "\n", - "Deequ is also used within [Amazon SageMaker Model Monitor](https://docs.aws.amazon.com/sagemaker/latest/dg/model-monitor.html#model-monitor-how-it-works). 
Now with the availability of PyDeequ, you can use it from a broader set of environments — [Amazon SageMaker](https://aws.amazon.com/sagemaker/), [AWS Glue](https://aws.amazon.com/glue/), [Amazon EMR](https://aws.amazon.com/emr/), and more.\n", "\n", "## Overview of PyDeequ\n", "\n", - "Let’s look at PyDeequ’s main components, and how they relate to Deequ (shown in Figure 1). \n", + "Let’s look at PyDeequ’s main components, and how they relate to Deequ (shown in the following diagram). \n", + "\n", + "- __Metrics computation__ – Deequ computes data quality metrics, which are statistics such as completeness, maximum, or correlation. Deequ uses Spark to read from sources such as [Amazon Simple Storage Service](https://aws.amazon.com/s3/) (Amazon S3) and compute metrics through an optimized set of aggregation queries. You have direct access to the raw metrics computed on the data.\n", + "- __Constraint verification__ – As a user, you focus on defining a set of data quality constraints to be verified. Deequ takes care of deriving the required set of metrics to be computed on the data. Deequ generates a data quality report, which contains the result of the constraint verification.\n", + "- __Constraint suggestion__ – You can choose to define your own custom data quality constraints or use the automated constraint suggestion methods that profile the data to infer useful constraints.\n", + "- __Python wrappers__ – You can call each Deequ function using Python syntax. The wrappers translate the commands to the underlying Deequ calls and return their response.\n", + "\n", + "![pydeequ-spark-components](../imgs/pydeequ_architecture.jpg)\n", "\n", - "* Metrics Computation — Deequ computes data quality metrics, that is, statistics such as completeness, maximum, or correlation. Deequ uses Spark to read from sources such as Amazon S3, and to compute metrics through an optimized set of aggregation queries. You have direct access to the raw metrics computed on the data.\n", - "* Constraint Verification — As a user, you focus on defining a set of data quality constraints to be verified. Deequ takes care of deriving the required set of metrics to be computed on the data. Deequ generates a data quality report, which contains the result of the constraint verification.\n", - "* Constraint Suggestion — You can choose to define your own custom data quality constraints, or use the automated constraint suggestion methods that profile the data to infer useful constraints.\n", - "* Python wrappers — You can call each of the Deequ functions using Python syntax. The wrappers translate the commands to the underlying Deequ calls, and return their response.\n", + "**Figure 1. Overview of PyDeequ components.** \n", "\n", - "![image.png](../imgs/pydeequ_architecture.png)\n", + "## Solution overview \n", "\n", - "Figure 1. Overview of PyDeequ components. \n", + "As a running example, we have generated a synthetic reviews dataset and introduced various data issues. We demonstrate how to detect these issues using PyDeequ. We begin the way many data science projects do: with initial data exploration and assessment in a Jupyter notebook.\n", "\n", - "## Example \n", + "During the data exploration phase, we want to answer some basic questions about the data:\n", "\n", - "As a running example, we use [a customer review dataset provided by Amazon](https://s3.amazonaws.com/amazon-reviews-pds/readme.html) on Amazon S3. 
We have intentionally followed the example in the [Deequ blog](https://aws.amazon.com/blogs/big-data/test-data-quality-at-scale-with-deequ/), to show the similarity in functionality and execution. We begin the way many data science projects do: with initial data exploration and assessment in a Jupyter notebook. \n", + "- Are there fields that have missing values?\n", + "- How many distinct categories are there in the categorical fields?\n", + "- Are there correlations between some key features?\n", + "- If there are two supposedly similar datasets (such as different categories or different time periods), are they really similar?\n", + "\n", + "We also show you how to scale this approach to large-scale datasets, using the same code on an EMR cluster. This is how you’d likely do your ML training as you move into a production setting.\n", + "\n", "\n", - "During the data exploration phase, you’d like to easily answer some basic questions about the data: \n", "\n", - "* Are the fields that are supposed to contain unique values, really unique? Are there fields that are missing values? \n", - "* How many distinct categories are there in the categorical fields?\n", - "* Are there correlations between some key features?\n", - "* If there are two supposedly similar datasets (different categories, or different time periods, say), are they really similar?\n", "\n", - "Then, we’ll show you how to scale this approach to large-scale datasets, using the same code on an EMR cluster. This is how you’d likely do your ML training, and later as you move into a production setting.\n", "\n", - "### Setup: Start a PySpark Session in a SageMaker Notebook" + "## Setup\n", + "\n", + "In this section, we show how to set up PyDeequ in [SageMaker Notebooks](https://docs.aws.amazon.com/sagemaker/latest/dg/nbi.html).\n", + "\n", + "We use the default VPC for SageMaker Notebooks. The examples presented here use PyDeequ library version 1.2.0 (the latest at the time of this update) and were tested on an ml.m5.2xlarge SageMaker Notebook instance with the `conda_python3` kernel.\n", + " \n", + "1. Create a new notebook instance. \n", + "\n", + "As of version 1.1.0, PyDeequ supports Spark up to version 3.3.0. Your PyDeequ version must be compatible with your Spark version.\n", + "\n", + "2. In the notebook, run the following lines in a code cell to specify `SPARK_VERSION`:" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "tags": [] }, "outputs": [], "source": [ "import os\n", "os.environ[\"SPARK_VERSION\"] = '3.3' " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "3. Install the PyDeequ module. For consistency, we pin the PyDeequ version too."
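If you want to confirm which Deequ build PyDeequ resolved from `SPARK_VERSION`, the following quick check (an illustrative addition, not a cell from the original notebook) prints the Maven coordinates that the session setup below relies on:

```python
import pydeequ

# pydeequ maps SPARK_VERSION to a matching Deequ Maven artifact; with
# SPARK_VERSION set to '3.3', this should print a 2.0.x-spark-3.3 coordinate.
print(pydeequ.deequ_maven_coord)

# The netlib f2j artifact is excluded later via spark.jars.excludes
# to avoid dependency conflicts.
print(pydeequ.f2j_maven_coord)
```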
] }, { "cell_type": "code", "execution_count": 2, "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: pydeequ==1.2.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (1.2.0)\n", "Requirement already satisfied: numpy>=1.14.1 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from pydeequ==1.2.0) (1.22.4)\n", "Requirement already satisfied: pandas>=0.23.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from pydeequ==1.2.0) (2.2.1)\n", "Requirement already satisfied: python-dateutil>=2.8.2 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from pandas>=0.23.0->pydeequ==1.2.0) (2.9.0)\n", "Requirement already satisfied: pytz>=2020.1 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from pandas>=0.23.0->pydeequ==1.2.0) (2024.1)\n", "Requirement already satisfied: tzdata>=2022.7 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from pandas>=0.23.0->pydeequ==1.2.0) (2024.1)\n", "Requirement already satisfied: six>=1.5 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from python-dateutil>=2.8.2->pandas>=0.23.0->pydeequ==1.2.0) (1.16.0)\n", "Note: you may need to restart the kernel to use updated packages.\n" ] } ], "source": [ "pip install pydeequ==1.2.0" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "4. To import the modules, run the following commands in a code cell:" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "tags": [] }, "outputs": [], "source": [ "import sagemaker_pyspark\n", "import pydeequ" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This completes the steps specific to SageMaker Notebooks." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Start a PySpark Session\n", "\n", "In the cell below, we import modules and set up a Spark session with the following configurations:\n", "\n", "- `config(\"spark.driver.extraClassPath\", classpath)` to prepend extra classpath entries to the classpath of the driver\n", "- `config(\"spark.jars.packages\", pydeequ.deequ_maven_coord)` to provide the Maven coordinates of the jars to include on the driver and executor classpaths\n", "- `config(\"spark.jars.excludes\", pydeequ.f2j_maven_coord)` to exclude jars to avoid conflicts\n", "- `config(\"spark.driver.memory\", \"15g\")` to increase Java heap space\n", "- `config(\"spark.sql.parquet.int96RebaseModeInRead\", \"CORRECTED\")` to read the datetime values as is. In our synthetic dataset, we introduced review years and dates such as 1696 to simulate a manual entry error; this configuration is necessary to ensure that these timestamps are read correctly. See [Spark issue SPARK-31404](https://issues.apache.org/jira/browse/SPARK-31404) about the calendar switch in version 3.0.\n", "\n", "For a detailed explanation about these parameters, see [Spark Configuration](https://spark.apache.org/docs/latest/configuration.html)."
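Once the session in the next cell exists, you can verify that these settings took effect with `spark.conf.get`. A small illustrative check, not part of the original notebook:

```python
# Run after the SparkSession below has been created; confirms that the
# rebase mode and driver memory settings actually took effect.
assert spark.conf.get("spark.sql.parquet.int96RebaseModeInRead") == "CORRECTED"
print(spark.conf.get("spark.driver.memory"))  # expected: 15g
```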
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ":: loading settings :: url = jar:file:/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Ivy Default Cache set to: /home/ec2-user/.ivy2/cache\n", + "The jars for the packages stored in: /home/ec2-user/.ivy2/jars\n", + "com.amazon.deequ#deequ added as a dependency\n", + ":: resolving dependencies :: org.apache.spark#spark-submit-parent-eb3a177b-ccc4-4677-a9b0-104653b54fc7;1.0\n", + "\tconfs: [default]\n", + "\tfound com.amazon.deequ#deequ;2.0.3-spark-3.3 in central\n", + "\tfound org.scala-lang#scala-reflect;2.12.10 in central\n", + "\tfound org.scalanlp#breeze_2.12;0.13.2 in central\n", + "\tfound org.scalanlp#breeze-macros_2.12;0.13.2 in central\n", + "\tfound com.github.fommil.netlib#core;1.1.2 in central\n", + "\tfound net.sf.opencsv#opencsv;2.3 in central\n", + "\tfound com.github.rwl#jtransforms;2.4.0 in central\n", + "\tfound junit#junit;4.8.2 in central\n", + "\tfound org.apache.commons#commons-math3;3.2 in central\n", + "\tfound org.spire-math#spire_2.12;0.13.0 in central\n", + "\tfound org.spire-math#spire-macros_2.12;0.13.0 in central\n", + "\tfound org.typelevel#machinist_2.12;0.6.1 in central\n", + "\tfound com.chuusai#shapeless_2.12;2.3.2 in central\n", + "\tfound org.typelevel#macro-compat_2.12;1.1.1 in central\n", + "\tfound org.slf4j#slf4j-api;1.7.5 in central\n", + ":: resolution report :: resolve 493ms :: artifacts dl 18ms\n", + "\t:: modules in use:\n", + "\tcom.amazon.deequ#deequ;2.0.3-spark-3.3 from central in [default]\n", + "\tcom.chuusai#shapeless_2.12;2.3.2 from central in [default]\n", + "\tcom.github.fommil.netlib#core;1.1.2 from central in [default]\n", + "\tcom.github.rwl#jtransforms;2.4.0 from central in [default]\n", + "\tjunit#junit;4.8.2 from central in [default]\n", + "\tnet.sf.opencsv#opencsv;2.3 from central in [default]\n", + "\torg.apache.commons#commons-math3;3.2 from central in [default]\n", + "\torg.scala-lang#scala-reflect;2.12.10 from central in [default]\n", + "\torg.scalanlp#breeze-macros_2.12;0.13.2 from central in [default]\n", + "\torg.scalanlp#breeze_2.12;0.13.2 from central in [default]\n", + "\torg.slf4j#slf4j-api;1.7.5 from central in [default]\n", + "\torg.spire-math#spire-macros_2.12;0.13.0 from central in [default]\n", + "\torg.spire-math#spire_2.12;0.13.0 from central in [default]\n", + "\torg.typelevel#machinist_2.12;0.6.1 from central in [default]\n", + "\torg.typelevel#macro-compat_2.12;1.1.1 from central in [default]\n", + "\t:: evicted modules:\n", + "\torg.scala-lang#scala-reflect;2.12.1 by [org.scala-lang#scala-reflect;2.12.10] in [default]\n", + "\torg.scala-lang#scala-reflect;2.12.0 by [org.scala-lang#scala-reflect;2.12.10] in [default]\n", + "\t---------------------------------------------------------------------\n", + "\t| | modules || artifacts |\n", + "\t| conf | number| search|dwnlded|evicted|| number|dwnlded|\n", + "\t---------------------------------------------------------------------\n", + "\t| default | 17 | 0 | 0 | 2 || 15 | 0 |\n", + "\t---------------------------------------------------------------------\n", + ":: retrieving :: org.apache.spark#spark-submit-parent-eb3a177b-ccc4-4677-a9b0-104653b54fc7\n", + "\tconfs: [default]\n", + "\t0 artifacts copied, 15 already retrieved 
(0kB/13ms)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "24/06/14 23:15:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Setting default log level to \"WARN\".\n", + "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n" + ] + } + ], "source": [ "from pyspark.sql import SparkSession, Row, DataFrame\n", "import json\n", "import pandas as pd\n", - "import sagemaker_pyspark\n", - "\n", - "import pydeequ\n", "\n", "classpath = \":\".join(sagemaker_pyspark.classpath_jars())\n", "\n", @@ -91,6 +258,8 @@ " .config(\"spark.driver.extraClassPath\", classpath)\n", " .config(\"spark.jars.packages\", pydeequ.deequ_maven_coord)\n", " .config(\"spark.jars.excludes\", pydeequ.f2j_maven_coord)\n", + " .config(\"spark.driver.memory\", \"15g\")\n", + " .config(\"spark.sql.parquet.int96RebaseModeInRead\", \"CORRECTED\")\n", " .getOrCreate())" ] }, @@ -98,13 +267,48 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### We will be using the Amazon Product Reviews dataset -- specifically the Electronics subset. " + "### Read the dataset " ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 5, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "24/06/14 23:15:33 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "df = spark.read.parquet(\"s3a://aws-bigdata-blog/generated_synthetic_reviews/data/product_category=Electronics/\")" + ] + }, + { + "cell_type": "markdown", "metadata": {}, + "source": [ + "After you load the DataFrame, you can run `df.printSchema()` to view the schema of the dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "tags": [] + }, "outputs": [ { "name": "stdout", @@ -115,24 +319,20 @@ " |-- customer_id: string (nullable = true)\n", " |-- review_id: string (nullable = true)\n", " |-- product_id: string (nullable = true)\n", - " |-- product_parent: string (nullable = true)\n", " |-- product_title: string (nullable = true)\n", - " |-- star_rating: integer (nullable = true)\n", - " |-- helpful_votes: integer (nullable = true)\n", - " |-- total_votes: integer (nullable = true)\n", - " |-- vine: string (nullable = true)\n", - " |-- verified_purchase: string (nullable = true)\n", + " |-- star_rating: long (nullable = true)\n", + " |-- helpful_votes: long (nullable = true)\n", + " |-- total_votes: long (nullable = true)\n", + " |-- insight: string (nullable = true)\n", " |-- review_headline: string (nullable = true)\n", " |-- review_body: string (nullable = true)\n", - " |-- review_date: date (nullable = true)\n", - " |-- year: integer (nullable = true)\n", + " |-- review_date: timestamp (nullable = true)\n", + " |-- review_year: long (nullable = true)\n", "\n" ] } ], "source": [ - "df = spark.read.parquet(\"s3a://amazon-reviews-pds/parquet/product_category=Electronics/\")\n", - "\n", "df.printSchema()" ] }, @@ -142,30 +342,22 @@ "source": [ "## Data Analysis \n", "\n", - "Before we define checks on the data, we want to calculate some statistics on the dataset; we call them metrics. 
As with Deequ, PyDeequ supports a rich set of metrics (they are described in this blog (https://aws.amazon.com/blogs/big-data/test-data-quality-at-scale-with-deequ/) and in this Deequ package (https://github.com/awslabs/deequ/tree/master/src/main/scala/com/amazon/deequ/analyzers)). In the following example, we show how to use the _AnalysisRunner (https://github.com/awslabs/deequ/blob/master/src/main/scala/com/amazon/deequ/analyzers/runners/AnalysisRunner.scala)_ to capture the metrics you are interested in. " + "Before we define checks on the data, we want to calculate some statistics for the dataset. As with Deequ, PyDeequ supports a rich set of metrics. For more information, see [Test data quality at scale with Deequ](https://aws.amazon.com/blogs/big-data/test-data-quality-at-scale-with-deequ/) or the [GitHub repo](https://github.com/awslabs/deequ/tree/master/src/main/scala/com/amazon/deequ/analyzers). In the following example, we use the [AnalysisRunner](https://github.com/awslabs/deequ/blob/master/src/main/scala/com/amazon/deequ/analyzers/runners/AnalysisRunner.scala) to capture the metrics we’re interested in:" ] }, { "cell_type": "code", - "execution_count": 3, - "metadata": {}, + "execution_count": 7, + "metadata": { + "tags": [] + }, "outputs": [ { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "+-----------+--------------------+-------------------+--------------------+\n", - "| entity| instance| name| value|\n", - "+-----------+--------------------+-------------------+--------------------+\n", - "| Column| review_id| Completeness| 1.0|\n", - "| Column| review_id|ApproxCountDistinct| 3010972.0|\n", - "|Mutlicolumn|total_votes,star_...| Correlation|-0.03451097996538765|\n", - "| Dataset| *| Size| 3120938.0|\n", - "| Column| star_rating| Mean| 4.036143941340712|\n", - "| Column| top star_rating| Compliance| 0.7494070692849394|\n", - "|Mutlicolumn|total_votes,helpf...| Correlation| 0.9936463809903863|\n", - "+-----------+--------------------+-------------------+--------------------+\n", - "\n" + "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/pyspark/sql/dataframe.py:127: UserWarning: DataFrame constructor is internal. Do not directly use it.\n", + " warnings.warn(\"DataFrame constructor is internal. 
Do not directly use it.\")\n" ] } ], @@ -176,14 +368,44 @@ " .onData(df) \\\n", " .addAnalyzer(Size()) \\\n", " .addAnalyzer(Completeness(\"review_id\")) \\\n", - " .addAnalyzer(ApproxCountDistinct(\"review_id\")) \\\n", + " .addAnalyzer(Distinctness(\"review_id\")) \\\n", " .addAnalyzer(Mean(\"star_rating\")) \\\n", " .addAnalyzer(Compliance(\"top star_rating\", \"star_rating >= 4.0\")) \\\n", " .addAnalyzer(Correlation(\"total_votes\", \"star_rating\")) \\\n", " .addAnalyzer(Correlation(\"total_votes\", \"helpful_votes\")) \\\n", " .run()\n", " \n", - "analysisResult_df = AnalyzerContext.successMetricsAsDataFrame(spark, analysisResult)\n", + "analysisResult_df = AnalyzerContext.successMetricsAsDataFrame(spark, analysisResult)" ] }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-----------+--------------------+------------+--------------------+\n", + "| entity| instance| name| value|\n", + "+-----------+--------------------+------------+--------------------+\n", + "| Column| review_id|Completeness| 1.0|\n", + "|Mutlicolumn|total_votes,star_...| Correlation|-7.38808965018615...|\n", + "| Column| review_id|Distinctness| 0.9926568563241371|\n", + "| Dataset| *| Size| 3010972.0|\n", + "| Column| star_rating| Mean| 3.9999973430506826|\n", + "| Column| top star_rating| Compliance| 0.7499993357626706|\n", + "|Mutlicolumn|total_votes,helpf...| Correlation| 0.9817922803462663|\n", + "+-----------+--------------------+------------+--------------------+\n", + "\n" + ] + } + ], + "source": [ + "pd.options.display.float_format = '{:,.7g}'.format\n", "analysisResult_df.show()" ] }, @@ -191,16 +413,32 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### You can also get that result in a Pandas Dataframe!\n", + "From this, we learn the following:\n", + "\n", + "- `review_id` has no missing values and approximately 99.27% of the values are distinct\n", + "- 74.99% of reviews have a `star_rating` of 4 or higher\n", + "- `total_votes` and `star_rating` are not correlated\n", + "- `helpful_votes` and `total_votes` are strongly correlated\n", + "- The average `star_rating` is approximately 4.0\n", + "- The dataset contains 3,010,972 reviews\n", "\n", - "Passing `pandas=True` in any call for getting metrics as DataFrames will return the dataframe in Pandas form! We'll see more of it down the line! " + "Sometimes, you may want to run multiple metrics on a single column. For example, suppose you want to check both that all reviews were written in 1996 or later and that they were all written in 2017 or earlier. 
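For intuition, a `Compliance` metric is just the fraction of rows that satisfy a SQL predicate. The sketch below is our own illustration rather than a cell from the notebook; it assumes the synthetic-reviews `df` loaded above and computes the same two ratios with plain PySpark, which the analyzers that follow produce in a single pass over the data:

```python
# Hand-rolled equivalents of two Compliance metrics (illustration only;
# assumes the `df` DataFrame read from the synthetic reviews dataset above).
from pyspark.sql import functions as F

total = df.count()
after_1996 = df.filter(F.col("review_year") >= 1996).count() / total
before_2017 = df.filter(F.col("review_year") <= 2017).count() / total

# Each ratio is what Compliance reports in the metric's "value" column.
print(f"after-1996: {after_1996:.7f}  before-2017: {before_2017:.7f}")
```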
In this case, it’s helpful to provide a name for each metric in order to distinguish the results in the output:" ] }, { "cell_type": "code", - "execution_count": 4, - "metadata": {}, + "execution_count": 9, + "metadata": { + "tags": [] + }, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, { "data": { "text/html": [ @@ -232,74 +470,42 @@ " \n", " 0\n", " Column\n", - " review_id\n", - " Completeness\n", - " 1.000000e+00\n", + " after-1996 review_year\n", + " Compliance\n", + " 0.9999993\n", " \n", " \n", " 1\n", " Column\n", - " review_id\n", - " ApproxCountDistinct\n", - " 3.010972e+06\n", - " \n", - " \n", - " 2\n", - " Mutlicolumn\n", - " total_votes,star_rating\n", - " Correlation\n", - " -3.451098e-02\n", - " \n", - " \n", - " 3\n", - " Dataset\n", - " *\n", - " Size\n", - " 3.120938e+06\n", - " \n", - " \n", - " 4\n", - " Column\n", - " star_rating\n", - " Mean\n", - " 4.036144e+00\n", - " \n", - " \n", - " 5\n", - " Column\n", - " top star_rating\n", + " before-2017 review_year\n", " Compliance\n", - " 7.494071e-01\n", - " \n", - " \n", - " 6\n", - " Mutlicolumn\n", - " total_votes,helpful_votes\n", - " Correlation\n", - " 9.936464e-01\n", + " 0.999999\n", " \n", " \n", "\n", "" ], "text/plain": [ - " entity instance name value\n", - "0 Column review_id Completeness 1.000000e+00\n", - "1 Column review_id ApproxCountDistinct 3.010972e+06\n", - "2 Mutlicolumn total_votes,star_rating Correlation -3.451098e-02\n", - "3 Dataset * Size 3.120938e+06\n", - "4 Column star_rating Mean 4.036144e+00\n", - "5 Column top star_rating Compliance 7.494071e-01\n", - "6 Mutlicolumn total_votes,helpful_votes Correlation 9.936464e-01" + " entity instance name value\n", + "0 Column after-1996 review_year Compliance 0.9999993\n", + "1 Column before-2017 review_year Compliance 0.999999" ] }, - "execution_count": 4, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "analysisResult_pd_df = AnalyzerContext.successMetricsAsDataFrame(spark, analysisResult, pandas=True)\n", + "analysisResult = AnalysisRunner(spark) \\\n", + " .onData(df) \\\n", + " .addAnalyzer(Compliance(\"after-1996 review_year\", \n", + "\"review_year >= 1996\")) \\\n", + " .addAnalyzer(Compliance(\"before-2017 review_year\", \n", + "\"review_year <= 2017\")) \\\n", + " .run()\n", + "analysisResult_pd_df = AnalyzerContext.successMetricsAsDataFrame(spark,\n", + "analysisResult, pandas=True)\n", "analysisResult_pd_df" ] }, @@ -307,42 +513,22 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "From this, we learn that: \n", - "\n", - "* review_id has no missing values and approximately 3,010,972 unique values. \n", - "* 74.9% of reviews have a star_rating of 4 or higher \n", - "* total_votes and star_rating are not correlated. \n", - "* helpful_votes and total_votes are strongly correlated \n", - "* the average star_rating is 4.0 \n", - "* The dataset contains 3,120,938 reviews. \n", - "\n", - "## Define and Run Tests for Data\n", - "\n", - "After analyzing and understanding the data, we want to verify that the properties we have derived also hold for new versions of the dataset. 
By defining assertions on the data distribution as part of a data pipeline, we can ensure that every processed dataset is of high quality, and that any application consuming the data can rely on it.\n", - "\n", - "For writing tests on data, we start with the _VerificationSuite (https://github.com/awslabs/deequ/blob/master/src/main/scala/com/amazon/deequ/VerificationSuite.scala)_ and add _Checks (https://github.com/awslabs/deequ/blob/master/src/main/scala/com/amazon/deequ/checks/Check.scala)_ on attributes of the data. In this example, we test for the following properties of our data:\n", - "\n", - "* There are at least 3 million rows in total. \n", - "* review_id is never NULL.\n", - "* review_id is unique. \n", - "* star_rating has a minimum of 1.0 and maximum of 5.0. \n", - "* marketplace only contains “US”, “UK”, “DE”, “JP”, or “FR”.\n", - "* year does not contain negative values. \n", - "\n", - "This is the code that reflects the previous statements. For information about all available checks, see _this GitHub repository (https://github.com/awslabs/deequ/blob/master/src/main/scala/com/amazon/deequ/checks/Check.scala)_. You can run this directly in the Spark shell as previously explained:\n" + "Alternatively, you can combine the conditions by using logical operators `and` and `or`. In the following example, we check that the years for the reviews are between 1996 and 2017 and the values for the `insight` column are either 'Y' or 'N':" ] }, { "cell_type": "code", - "execution_count": 5, - "metadata": {}, + "execution_count": 10, + "metadata": { + "tags": [] + }, "outputs": [ { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "Python Callback server started!\n", - "Verification Run Status: Warning\n" + "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/pyspark/sql/dataframe.py:127: UserWarning: DataFrame constructor is internal. Do not directly use it.\n", + " warnings.warn(\"DataFrame constructor is internal. 
Do not directly use it.\")\n" ] }, { @@ -366,133 +552,140 @@ " \n", " \n", " \n", - " check\n", - " check_level\n", - " check_status\n", - " constraint\n", - " constraint_status\n", - " constraint_message\n", + " entity\n", + " instance\n", + " name\n", + " value\n", " \n", " \n", " \n", " \n", " 0\n", - " Amazon Electronic Products Reviews\n", - " Warning\n", - " Warning\n", - " SizeConstraint(Size(None))\n", - " Success\n", - " \n", + " Column\n", + " range1996to2017 review_year\n", + " Compliance\n", + " 0.9999983\n", " \n", " \n", " 1\n", - " Amazon Electronic Products Reviews\n", - " Warning\n", - " Warning\n", - " MinimumConstraint(Minimum(star_rating,None))\n", - " Success\n", - " \n", - " \n", - " \n", - " 2\n", - " Amazon Electronic Products Reviews\n", - " Warning\n", - " Warning\n", - " MaximumConstraint(Maximum(star_rating,None))\n", - " Success\n", - " \n", - " \n", - " \n", - " 3\n", - " Amazon Electronic Products Reviews\n", - " Warning\n", - " Warning\n", - " CompletenessConstraint(Completeness(review_id,...\n", - " Success\n", - " \n", - " \n", - " \n", - " 4\n", - " Amazon Electronic Products Reviews\n", - " Warning\n", - " Warning\n", - " UniquenessConstraint(Uniqueness(List(review_id...\n", - " Failure\n", - " Value: 0.9926566948782706 does not meet the co...\n", - " \n", - " \n", - " 5\n", - " Amazon Electronic Products Reviews\n", - " Warning\n", - " Warning\n", - " CompletenessConstraint(Completeness(marketplac...\n", - " Success\n", - " \n", - " \n", - " \n", - " 6\n", - " Amazon Electronic Products Reviews\n", - " Warning\n", - " Warning\n", - " ComplianceConstraint(Compliance(marketplace co...\n", - " Success\n", - " \n", - " \n", - " \n", - " 7\n", - " Amazon Electronic Products Reviews\n", - " Warning\n", - " Warning\n", - " ComplianceConstraint(Compliance(year is non-ne...\n", - " Success\n", - " \n", + " Column\n", + " values insight\n", + " Compliance\n", + " 1\n", " \n", " \n", "\n", "" ], "text/plain": [ - " check check_level check_status \\\n", - "0 Amazon Electronic Products Reviews Warning Warning \n", - "1 Amazon Electronic Products Reviews Warning Warning \n", - "2 Amazon Electronic Products Reviews Warning Warning \n", - "3 Amazon Electronic Products Reviews Warning Warning \n", - "4 Amazon Electronic Products Reviews Warning Warning \n", - "5 Amazon Electronic Products Reviews Warning Warning \n", - "6 Amazon Electronic Products Reviews Warning Warning \n", - "7 Amazon Electronic Products Reviews Warning Warning \n", - "\n", - " constraint constraint_status \\\n", - "0 SizeConstraint(Size(None)) Success \n", - "1 MinimumConstraint(Minimum(star_rating,None)) Success \n", - "2 MaximumConstraint(Maximum(star_rating,None)) Success \n", - "3 CompletenessConstraint(Completeness(review_id,... Success \n", - "4 UniquenessConstraint(Uniqueness(List(review_id... Failure \n", - "5 CompletenessConstraint(Completeness(marketplac... Success \n", - "6 ComplianceConstraint(Compliance(marketplace co... Success \n", - "7 ComplianceConstraint(Compliance(year is non-ne... Success \n", - "\n", - " constraint_message \n", - "0 \n", - "1 \n", - "2 \n", - "3 \n", - "4 Value: 0.9926566948782706 does not meet the co... 
\n", - "5 \n", - "6 \n", - "7 " + " entity instance name value\n", + "0 Column range1996to2017 review_year Compliance 0.9999983\n", + "1 Column values insight Compliance 1" ] }, - "execution_count": 5, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], + "source": [ + "analysisResult = AnalysisRunner(spark) \\\n", + " .onData(df) \\\n", + " .addAnalyzer(Compliance(\"range1996to2017 review_year\",\n", + "\"review_year >= 1996 and review_year <= 2017\")) \\\n", + " .addAnalyzer(Compliance(\"values insight\", \n", + "\"insight == 'Y' or insight == 'N'\")) \\\n", + " .run()\n", + "analysisResult_pd_df = AnalyzerContext.successMetricsAsDataFrame(spark, analysisResult, pandas=True)\n", + "analysisResult_pd_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In some cases, you might prefer a different format for the output. PyDeequ allows you to output the results in a JSON format:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'entity': 'Column',\n", + " 'instance': 'range1996to2017 review_year',\n", + " 'name': 'Compliance',\n", + " 'value': 0.9999983394066766},\n", + " {'entity': 'Column',\n", + " 'instance': 'values insight',\n", + " 'name': 'Compliance',\n", + " 'value': 1.0}]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "analysisResult_json = AnalyzerContext.successMetricsAsJson(spark, analysisResult)\n", + "analysisResult_json" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define and Run Tests for Data\n", + "\n", + "After analyzing and understanding the data, we want to verify that the properties we have derived also hold for new versions of the dataset. By defining assertions on the data distribution as part of a data pipeline, we can make sure every processed dataset is of high quality, and that any application consuming the data can rely on it.\n", + "\n", + "For writing tests on data, we start with the `VerificationSuite` and add [checks](https://github.com/awslabs/deequ/blob/master/src/main/scala/com/amazon/deequ/checks/Check.scala) on attributes of the data. In this example, we test for the following properties of our data:\n", + "\n", + "- At least 3 million rows in total\n", + "- `review_id` is never null\n", + "- `review_id` is unique\n", + "- `star_rating` has a minimum of 1.0 and maximum of 5.0\n", + "- `marketplace` only contains `US`, `UK`, `DE`, `JP`, or `FR`\n", + "- `year` does not contain negative values\n", + "- `year` is between 1996 and 2017\n", + "\n", + "The following code reflects the previous statements. For information about all available checks, see the [GitHub repo](https://github.com/awslabs/deequ/blob/master/src/main/scala/com/amazon/deequ/checks/Check.scala). You can run this directly in the Spark shell as previously explained:\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Python Callback server started!\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/pyspark/sql/dataframe.py:127: UserWarning: DataFrame constructor is internal. Do not directly use it.\n", + " warnings.warn(\"DataFrame constructor is internal. 
Do not directly use it.\")\n" + ] + } + ], "source": [ "from pydeequ.checks import *\n", "from pydeequ.verification import *\n", "\n", - "check = Check(spark, CheckLevel.Warning, \"Amazon Electronic Products Reviews\")\n", + "check = Check(spark, CheckLevel.Warning, \"Synthetic Product Reviews\")\n", "\n", "checkResult = VerificationSuite(spark) \\\n", " .onData(df) \\\n", @@ -504,28 +697,213 @@ " .isUnique(\"review_id\") \\\n", " .isComplete(\"marketplace\") \\\n", " .isContainedIn(\"marketplace\", [\"US\", \"UK\", \"DE\", \"JP\", \"FR\"]) \\\n", - " .isNonNegative(\"year\")) \\\n", + " .isNonNegative(\"year\") \\\n", + " .hasMin(\"review_year\", lambda x: x == '1996') \\\n", + " .hasMax(\"review_year\", lambda x: x == '2017')) \\\n", " .run()\n", - "\n", - "print(f\"Verification Run Status: {checkResult.status}\")\n", - "checkResult_df = VerificationResult.checkResultsAsDataFrame(spark, checkResult, pandas=True)\n", - "checkResult_df" + " \n", + "checkResult_df = VerificationResult.checkResultsAsDataFrame(spark,\n", + "checkResult, pandas=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "After calling run(), PyDeequ translates your test description into Deequ, which in its turn translates it into a series of Spark jobs which are executed to compute metrics on the data. Afterwards, it invokes your assertion functions (e.g., lambda x: x == 1.0 for the minimum star-rating check) on these metrics to see if the constraints hold on the data. \n", - "\n", - "Interestingly, the review_id column is not unique, which resulted in a failure of the check on uniqueness. We can also look at all the metrics that Deequ computed for this check by running: " + "Here we change the display settings of the DataFrame to ensure that the entire constraint message is visible." ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 13, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
check | check_level | check_status | constraint | constraint_status | constraint_message
0Synthetic Product ReviewsWarningWarningSizeConstraint(Size(None))Failureorg.apache.spark.sql.AnalysisException: Column 'year' does not exist. Did you mean one of the following? [insight, review_year, review_id, marketplace, review_date, star_rating, customer_id, product_id, review_body, total_votes, helpful_votes, product_title, review_headline]; line 1 pos 14;\n", + "'Aggregate [sum(cast(isnotnull(review_id#2) as int)) AS sum(CAST((review_id IS NOT NULL) AS INT))#629L, count(1) AS count(1)#630L, cast(min(review_year#12L) as double) AS CAST(min(review_year) AS DOUBLE)#631, cast(max(review_year#12L) as double) AS CAST(max(review_year) AS DOUBLE)#632, count(1) AS count(1)#633L, cast(min(star_rating#5L) as double) AS CAST(min(star_rating) AS DOUBLE)#634, cast(max(star_rating#5L) as double) AS CAST(max(star_rating) AS DOUBLE)#635, sum(cast((isnull(marketplace#0) OR marketplace#0 IN (US,UK,DE,JP,FR)) as int)) AS sum(CAST(((marketplace IS NULL) OR (marketplace IN (US, UK, DE, JP, FR))) AS INT))#636L, count(1) AS count(1)#637L, sum(cast(isnotnull(marketplace#0) as int)) AS sum(CAST((marketplace IS NOT NULL) AS INT))#638L, count(1) AS count(1)#639L, sum(cast(('COALESCE(cast('year as decimal(20,10)), 0.0) >= 0) as int)) AS sum(CAST((COALESCE(CAST(year AS DECIMAL(20,10)), 0.0) >= 0) AS INT))#640, count(1) AS count(1)#641L]\n", + "+- Relation [marketplace#0,customer_id#1,review_id#2,product_id#3,product_title#4,star_rating#5L,helpful_votes#6L,total_votes#7L,insight#8,review_headline#9,review_body#10,review_date#11,review_year#12L] parquet\n", + "
1Synthetic Product ReviewsWarningWarningMinimumConstraint(Minimum(star_rating,None))Failureorg.apache.spark.sql.AnalysisException: Column 'year' does not exist. Did you mean one of the following? [insight, review_year, review_id, marketplace, review_date, star_rating, customer_id, product_id, review_body, total_votes, helpful_votes, product_title, review_headline]; line 1 pos 14;\n", + "'Aggregate [sum(cast(isnotnull(review_id#2) as int)) AS sum(CAST((review_id IS NOT NULL) AS INT))#629L, count(1) AS count(1)#630L, cast(min(review_year#12L) as double) AS CAST(min(review_year) AS DOUBLE)#631, cast(max(review_year#12L) as double) AS CAST(max(review_year) AS DOUBLE)#632, count(1) AS count(1)#633L, cast(min(star_rating#5L) as double) AS CAST(min(star_rating) AS DOUBLE)#634, cast(max(star_rating#5L) as double) AS CAST(max(star_rating) AS DOUBLE)#635, sum(cast((isnull(marketplace#0) OR marketplace#0 IN (US,UK,DE,JP,FR)) as int)) AS sum(CAST(((marketplace IS NULL) OR (marketplace IN (US, UK, DE, JP, FR))) AS INT))#636L, count(1) AS count(1)#637L, sum(cast(isnotnull(marketplace#0) as int)) AS sum(CAST((marketplace IS NOT NULL) AS INT))#638L, count(1) AS count(1)#639L, sum(cast(('COALESCE(cast('year as decimal(20,10)), 0.0) >= 0) as int)) AS sum(CAST((COALESCE(CAST(year AS DECIMAL(20,10)), 0.0) >= 0) AS INT))#640, count(1) AS count(1)#641L]\n", + "+- Relation [marketplace#0,customer_id#1,review_id#2,product_id#3,product_title#4,star_rating#5L,helpful_votes#6L,total_votes#7L,insight#8,review_headline#9,review_body#10,review_date#11,review_year#12L] parquet\n", + "
2Synthetic Product ReviewsWarningWarningMaximumConstraint(Maximum(star_rating,None))Failureorg.apache.spark.sql.AnalysisException: Column 'year' does not exist. Did you mean one of the following? [insight, review_year, review_id, marketplace, review_date, star_rating, customer_id, product_id, review_body, total_votes, helpful_votes, product_title, review_headline]; line 1 pos 14;\n", + "'Aggregate [sum(cast(isnotnull(review_id#2) as int)) AS sum(CAST((review_id IS NOT NULL) AS INT))#629L, count(1) AS count(1)#630L, cast(min(review_year#12L) as double) AS CAST(min(review_year) AS DOUBLE)#631, cast(max(review_year#12L) as double) AS CAST(max(review_year) AS DOUBLE)#632, count(1) AS count(1)#633L, cast(min(star_rating#5L) as double) AS CAST(min(star_rating) AS DOUBLE)#634, cast(max(star_rating#5L) as double) AS CAST(max(star_rating) AS DOUBLE)#635, sum(cast((isnull(marketplace#0) OR marketplace#0 IN (US,UK,DE,JP,FR)) as int)) AS sum(CAST(((marketplace IS NULL) OR (marketplace IN (US, UK, DE, JP, FR))) AS INT))#636L, count(1) AS count(1)#637L, sum(cast(isnotnull(marketplace#0) as int)) AS sum(CAST((marketplace IS NOT NULL) AS INT))#638L, count(1) AS count(1)#639L, sum(cast(('COALESCE(cast('year as decimal(20,10)), 0.0) >= 0) as int)) AS sum(CAST((COALESCE(CAST(year AS DECIMAL(20,10)), 0.0) >= 0) AS INT))#640, count(1) AS count(1)#641L]\n", + "+- Relation [marketplace#0,customer_id#1,review_id#2,product_id#3,product_title#4,star_rating#5L,helpful_votes#6L,total_votes#7L,insight#8,review_headline#9,review_body#10,review_date#11,review_year#12L] parquet\n", + "
3Synthetic Product ReviewsWarningWarningCompletenessConstraint(Completeness(review_id,None))Failureorg.apache.spark.sql.AnalysisException: Column 'year' does not exist. Did you mean one of the following? [insight, review_year, review_id, marketplace, review_date, star_rating, customer_id, product_id, review_body, total_votes, helpful_votes, product_title, review_headline]; line 1 pos 14;\n", + "'Aggregate [sum(cast(isnotnull(review_id#2) as int)) AS sum(CAST((review_id IS NOT NULL) AS INT))#629L, count(1) AS count(1)#630L, cast(min(review_year#12L) as double) AS CAST(min(review_year) AS DOUBLE)#631, cast(max(review_year#12L) as double) AS CAST(max(review_year) AS DOUBLE)#632, count(1) AS count(1)#633L, cast(min(star_rating#5L) as double) AS CAST(min(star_rating) AS DOUBLE)#634, cast(max(star_rating#5L) as double) AS CAST(max(star_rating) AS DOUBLE)#635, sum(cast((isnull(marketplace#0) OR marketplace#0 IN (US,UK,DE,JP,FR)) as int)) AS sum(CAST(((marketplace IS NULL) OR (marketplace IN (US, UK, DE, JP, FR))) AS INT))#636L, count(1) AS count(1)#637L, sum(cast(isnotnull(marketplace#0) as int)) AS sum(CAST((marketplace IS NOT NULL) AS INT))#638L, count(1) AS count(1)#639L, sum(cast(('COALESCE(cast('year as decimal(20,10)), 0.0) >= 0) as int)) AS sum(CAST((COALESCE(CAST(year AS DECIMAL(20,10)), 0.0) >= 0) AS INT))#640, count(1) AS count(1)#641L]\n", + "+- Relation [marketplace#0,customer_id#1,review_id#2,product_id#3,product_title#4,star_rating#5L,helpful_votes#6L,total_votes#7L,insight#8,review_headline#9,review_body#10,review_date#11,review_year#12L] parquet\n", + "
4 | Synthetic Product Reviews | Warning | Warning | UniquenessConstraint(Uniqueness(List(review_id),None)) | Failure | Value: 0.9853137126482744 does not meet the constraint requirement!
5Synthetic Product ReviewsWarningWarningCompletenessConstraint(Completeness(marketplace,None))Failureorg.apache.spark.sql.AnalysisException: Column 'year' does not exist. Did you mean one of the following? [insight, review_year, review_id, marketplace, review_date, star_rating, customer_id, product_id, review_body, total_votes, helpful_votes, product_title, review_headline]; line 1 pos 14;\n", + "'Aggregate [sum(cast(isnotnull(review_id#2) as int)) AS sum(CAST((review_id IS NOT NULL) AS INT))#629L, count(1) AS count(1)#630L, cast(min(review_year#12L) as double) AS CAST(min(review_year) AS DOUBLE)#631, cast(max(review_year#12L) as double) AS CAST(max(review_year) AS DOUBLE)#632, count(1) AS count(1)#633L, cast(min(star_rating#5L) as double) AS CAST(min(star_rating) AS DOUBLE)#634, cast(max(star_rating#5L) as double) AS CAST(max(star_rating) AS DOUBLE)#635, sum(cast((isnull(marketplace#0) OR marketplace#0 IN (US,UK,DE,JP,FR)) as int)) AS sum(CAST(((marketplace IS NULL) OR (marketplace IN (US, UK, DE, JP, FR))) AS INT))#636L, count(1) AS count(1)#637L, sum(cast(isnotnull(marketplace#0) as int)) AS sum(CAST((marketplace IS NOT NULL) AS INT))#638L, count(1) AS count(1)#639L, sum(cast(('COALESCE(cast('year as decimal(20,10)), 0.0) >= 0) as int)) AS sum(CAST((COALESCE(CAST(year AS DECIMAL(20,10)), 0.0) >= 0) AS INT))#640, count(1) AS count(1)#641L]\n", + "+- Relation [marketplace#0,customer_id#1,review_id#2,product_id#3,product_title#4,star_rating#5L,helpful_votes#6L,total_votes#7L,insight#8,review_headline#9,review_body#10,review_date#11,review_year#12L] parquet\n", + "
6Synthetic Product ReviewsWarningWarningComplianceConstraint(Compliance(marketplace contained in US,UK,DE,JP,FR,`marketplace` IS NULL OR `marketplace` IN ('US','UK','DE','JP','FR'),None))Failureorg.apache.spark.sql.AnalysisException: Column 'year' does not exist. Did you mean one of the following? [insight, review_year, review_id, marketplace, review_date, star_rating, customer_id, product_id, review_body, total_votes, helpful_votes, product_title, review_headline]; line 1 pos 14;\n", + "'Aggregate [sum(cast(isnotnull(review_id#2) as int)) AS sum(CAST((review_id IS NOT NULL) AS INT))#629L, count(1) AS count(1)#630L, cast(min(review_year#12L) as double) AS CAST(min(review_year) AS DOUBLE)#631, cast(max(review_year#12L) as double) AS CAST(max(review_year) AS DOUBLE)#632, count(1) AS count(1)#633L, cast(min(star_rating#5L) as double) AS CAST(min(star_rating) AS DOUBLE)#634, cast(max(star_rating#5L) as double) AS CAST(max(star_rating) AS DOUBLE)#635, sum(cast((isnull(marketplace#0) OR marketplace#0 IN (US,UK,DE,JP,FR)) as int)) AS sum(CAST(((marketplace IS NULL) OR (marketplace IN (US, UK, DE, JP, FR))) AS INT))#636L, count(1) AS count(1)#637L, sum(cast(isnotnull(marketplace#0) as int)) AS sum(CAST((marketplace IS NOT NULL) AS INT))#638L, count(1) AS count(1)#639L, sum(cast(('COALESCE(cast('year as decimal(20,10)), 0.0) >= 0) as int)) AS sum(CAST((COALESCE(CAST(year AS DECIMAL(20,10)), 0.0) >= 0) AS INT))#640, count(1) AS count(1)#641L]\n", + "+- Relation [marketplace#0,customer_id#1,review_id#2,product_id#3,product_title#4,star_rating#5L,helpful_votes#6L,total_votes#7L,insight#8,review_headline#9,review_body#10,review_date#11,review_year#12L] parquet\n", + "
7Synthetic Product ReviewsWarningWarningComplianceConstraint(Compliance(year is non-negative,COALESCE(CAST(year AS DECIMAL(20,10)), 0.0) >= 0,None))Failureorg.apache.spark.sql.AnalysisException: Column 'year' does not exist. Did you mean one of the following? [insight, review_year, review_id, marketplace, review_date, star_rating, customer_id, product_id, review_body, total_votes, helpful_votes, product_title, review_headline]; line 1 pos 14;\n", + "'Aggregate [sum(cast(isnotnull(review_id#2) as int)) AS sum(CAST((review_id IS NOT NULL) AS INT))#629L, count(1) AS count(1)#630L, cast(min(review_year#12L) as double) AS CAST(min(review_year) AS DOUBLE)#631, cast(max(review_year#12L) as double) AS CAST(max(review_year) AS DOUBLE)#632, count(1) AS count(1)#633L, cast(min(star_rating#5L) as double) AS CAST(min(star_rating) AS DOUBLE)#634, cast(max(star_rating#5L) as double) AS CAST(max(star_rating) AS DOUBLE)#635, sum(cast((isnull(marketplace#0) OR marketplace#0 IN (US,UK,DE,JP,FR)) as int)) AS sum(CAST(((marketplace IS NULL) OR (marketplace IN (US, UK, DE, JP, FR))) AS INT))#636L, count(1) AS count(1)#637L, sum(cast(isnotnull(marketplace#0) as int)) AS sum(CAST((marketplace IS NOT NULL) AS INT))#638L, count(1) AS count(1)#639L, sum(cast(('COALESCE(cast('year as decimal(20,10)), 0.0) >= 0) as int)) AS sum(CAST((COALESCE(CAST(year AS DECIMAL(20,10)), 0.0) >= 0) AS INT))#640, count(1) AS count(1)#641L]\n", + "+- Relation [marketplace#0,customer_id#1,review_id#2,product_id#3,product_title#4,star_rating#5L,helpful_votes#6L,total_votes#7L,insight#8,review_headline#9,review_body#10,review_date#11,review_year#12L] parquet\n", + "
8Synthetic Product ReviewsWarningWarningMinimumConstraint(Minimum(review_year,None))Failureorg.apache.spark.sql.AnalysisException: Column 'year' does not exist. Did you mean one of the following? [insight, review_year, review_id, marketplace, review_date, star_rating, customer_id, product_id, review_body, total_votes, helpful_votes, product_title, review_headline]; line 1 pos 14;\n", + "'Aggregate [sum(cast(isnotnull(review_id#2) as int)) AS sum(CAST((review_id IS NOT NULL) AS INT))#629L, count(1) AS count(1)#630L, cast(min(review_year#12L) as double) AS CAST(min(review_year) AS DOUBLE)#631, cast(max(review_year#12L) as double) AS CAST(max(review_year) AS DOUBLE)#632, count(1) AS count(1)#633L, cast(min(star_rating#5L) as double) AS CAST(min(star_rating) AS DOUBLE)#634, cast(max(star_rating#5L) as double) AS CAST(max(star_rating) AS DOUBLE)#635, sum(cast((isnull(marketplace#0) OR marketplace#0 IN (US,UK,DE,JP,FR)) as int)) AS sum(CAST(((marketplace IS NULL) OR (marketplace IN (US, UK, DE, JP, FR))) AS INT))#636L, count(1) AS count(1)#637L, sum(cast(isnotnull(marketplace#0) as int)) AS sum(CAST((marketplace IS NOT NULL) AS INT))#638L, count(1) AS count(1)#639L, sum(cast(('COALESCE(cast('year as decimal(20,10)), 0.0) >= 0) as int)) AS sum(CAST((COALESCE(CAST(year AS DECIMAL(20,10)), 0.0) >= 0) AS INT))#640, count(1) AS count(1)#641L]\n", + "+- Relation [marketplace#0,customer_id#1,review_id#2,product_id#3,product_title#4,star_rating#5L,helpful_votes#6L,total_votes#7L,insight#8,review_headline#9,review_body#10,review_date#11,review_year#12L] parquet\n", + "
9Synthetic Product ReviewsWarningWarningMaximumConstraint(Maximum(review_year,None))Failureorg.apache.spark.sql.AnalysisException: Column 'year' does not exist. Did you mean one of the following? [insight, review_year, review_id, marketplace, review_date, star_rating, customer_id, product_id, review_body, total_votes, helpful_votes, product_title, review_headline]; line 1 pos 14;\n", + "'Aggregate [sum(cast(isnotnull(review_id#2) as int)) AS sum(CAST((review_id IS NOT NULL) AS INT))#629L, count(1) AS count(1)#630L, cast(min(review_year#12L) as double) AS CAST(min(review_year) AS DOUBLE)#631, cast(max(review_year#12L) as double) AS CAST(max(review_year) AS DOUBLE)#632, count(1) AS count(1)#633L, cast(min(star_rating#5L) as double) AS CAST(min(star_rating) AS DOUBLE)#634, cast(max(star_rating#5L) as double) AS CAST(max(star_rating) AS DOUBLE)#635, sum(cast((isnull(marketplace#0) OR marketplace#0 IN (US,UK,DE,JP,FR)) as int)) AS sum(CAST(((marketplace IS NULL) OR (marketplace IN (US, UK, DE, JP, FR))) AS INT))#636L, count(1) AS count(1)#637L, sum(cast(isnotnull(marketplace#0) as int)) AS sum(CAST((marketplace IS NOT NULL) AS INT))#638L, count(1) AS count(1)#639L, sum(cast(('COALESCE(cast('year as decimal(20,10)), 0.0) >= 0) as int)) AS sum(CAST((COALESCE(CAST(year AS DECIMAL(20,10)), 0.0) >= 0) AS INT))#640, count(1) AS count(1)#641L]\n", + "+- Relation [marketplace#0,customer_id#1,review_id#2,product_id#3,product_title#4,star_rating#5L,helpful_votes#6L,total_votes#7L,insight#8,review_headline#9,review_body#10,review_date#11,review_year#12L] parquet\n", + "
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "checkResult_df.style.set_properties(\n", + " **{\n", + " 'overflow-wrap': 'break-word',\n", + " 'inline-size': '10px',\n", + " }\n", + ")" + ] + }, + { + "cell_type": "markdown", "metadata": {}, + "source": [ + "After calling `run()`, PyDeequ translates your test description into Deequ, which translates it into a series of Spark jobs that are run to compute metrics on the data. Afterwards, it invokes your assertion functions (for example, `lambda x: x == 1.0` for the minimum star rating check) on these metrics to see if the constraints hold on the data. \n", + "\n", + "Interestingly, the `review_id` column isn’t unique, which resulted in a failure of the check on uniqueness. We can also look at all the metrics that Deequ computed for this check by running the following:" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "tags": [] + }, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/pyspark/sql/dataframe.py:127: UserWarning: DataFrame constructor is internal. Do not directly use it.\n", + " warnings.warn(\"DataFrame constructor is internal. Do not directly use it.\")\n" + ] + }, { "data": { "text/html": [ @@ -558,92 +936,26 @@ " 0\n", " Column\n", " review_id\n", - " Completeness\n", - " 1.000000e+00\n", - " \n", - " \n", - " 1\n", - " Column\n", - " review_id\n", " Uniqueness\n", - " 9.926567e-01\n", - " \n", - " \n", - " 2\n", - " Dataset\n", - " *\n", - " Size\n", - " 3.120938e+06\n", - " \n", - " \n", - " 3\n", - " Column\n", - " star_rating\n", - " Maximum\n", - " 5.000000e+00\n", - " \n", - " \n", - " 4\n", - " Column\n", - " star_rating\n", - " Minimum\n", - " 1.000000e+00\n", - " \n", - " \n", - " 5\n", - " Column\n", - " year is non-negative\n", - " Compliance\n", - " 1.000000e+00\n", - " \n", - " \n", - " 6\n", - " Column\n", - " marketplace contained in US,UK,DE,JP,FR\n", - " Compliance\n", - " 1.000000e+00\n", - " \n", - " \n", - " 7\n", - " Column\n", - " marketplace\n", - " Completeness\n", - " 1.000000e+00\n", + " 0.9853137\n", " \n", " \n", "\n", "" ], "text/plain": [ - " entity instance name \\\n", - "0 Column review_id Completeness \n", - "1 Column review_id Uniqueness \n", - "2 Dataset * Size \n", - "3 Column star_rating Maximum \n", - "4 Column star_rating Minimum \n", - "5 Column year is non-negative Compliance \n", - "6 Column marketplace contained in US,UK,DE,JP,FR Compliance \n", - "7 Column marketplace Completeness \n", - "\n", - " value \n", - "0 1.000000e+00 \n", - "1 9.926567e-01 \n", - "2 3.120938e+06 \n", - "3 5.000000e+00 \n", - "4 1.000000e+00 \n", - "5 1.000000e+00 \n", - "6 1.000000e+00 \n", - "7 1.000000e+00 " + " entity instance name value\n", + "0 Column review_id Uniqueness 0.9853137" ] }, - "execution_count": 6, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "checkResult_df = VerificationResult.successMetricsAsDataFrame(spark, checkResult, pandas=True)\n", - "checkResult_df" + "checkResults_df = VerificationResult.successMetricsAsDataFrame(spark, checkResult, pandas = True)\n", + "checkResults_df" ] }, { @@ -652,14 +964,30 @@ "source": [ "## Automated Constraint Suggestion \n", "\n", - "If you own a large number of datasets or if your dataset has many columns, it may be challenging for you to manually define appropriate constraints. 
Deequ can automatically suggest useful constraints based on the data distribution. Deequ first runs a data profiling method and then applies a set of rules on the result. For more information about how to run a data profiling method, see _this GitHub repository. (https://github.com/awslabs/deequ/blob/master/src/main/scala/com/amazon/deequ/examples/data_profiling_example.md)_" + "If you own a large number of datasets or if your dataset has many columns, it may be challenging for you to manually define appropriate constraints. Deequ can automatically suggest useful constraints based on the data distribution. Deequ first runs a data profiling method and then applies a set of rules on the result. For more information about how to run a data profiling method, see the [GitHub repo](https://github.com/awslabs/deequ/blob/master/src/main/scala/com/amazon/deequ/examples/data_profiling_example.md)." ] }, { "cell_type": "code", - "execution_count": 7, - "metadata": {}, + "execution_count": 15, + "metadata": { + "tags": [] + }, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "24/06/14 23:15:56 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 34:==================================================> (10 + 1) / 11]\r" + ] + }, { "name": "stdout", "output_type": "stream", @@ -667,6 +995,24 @@ "{\n", " \"constraint_suggestions\": [\n", " {\n", + " \"constraint_name\": \"ComplianceConstraint(Compliance('insight' has value range 'N', 'Y',`insight` IN ('N', 'Y'),None))\",\n", + " \"column_name\": \"insight\",\n", + " \"current_value\": \"Compliance: 1\",\n", + " \"description\": \"'insight' has value range 'N', 'Y'\",\n", + " \"suggesting_rule\": \"CategoricalRangeRule(com.amazon.deequ.suggestions.rules.CategoricalRangeRule$$$Lambda$4119/0x000000080197e840@74f276b0)\",\n", + " \"rule_description\": \"If we see a categorical range for a column, we suggest an IS IN (...) 
constraint\",\n", + " \"code_for_constraint\": \".isContainedIn(\\\"insight\\\", [\\\"N\\\", \\\"Y\\\"])\"\n", + " },\n", + " {\n", + " \"constraint_name\": \"CompletenessConstraint(Completeness(insight,None))\",\n", + " \"column_name\": \"insight\",\n", + " \"current_value\": \"Completeness: 1.0\",\n", + " \"description\": \"'insight' is not null\",\n", + " \"suggesting_rule\": \"CompleteIfCompleteRule()\",\n", + " \"rule_description\": \"If a column is complete in the sample, we suggest a NOT NULL constraint\",\n", + " \"code_for_constraint\": \".isComplete(\\\"insight\\\")\"\n", + " },\n", + " {\n", " \"constraint_name\": \"CompletenessConstraint(Completeness(review_id,None))\",\n", " \"column_name\": \"review_id\",\n", " \"current_value\": \"Completeness: 1.0\",\n", @@ -678,7 +1024,7 @@ " {\n", " \"constraint_name\": \"UniquenessConstraint(Uniqueness(List(review_id),None))\",\n", " \"column_name\": \"review_id\",\n", - " \"current_value\": \"ApproxDistinctness: 0.9647650802419017\",\n", + " \"current_value\": \"ApproxDistinctness: 1.0496308168923523\",\n", " \"description\": \"'review_id' is unique\",\n", " \"suggesting_rule\": \"UniqueIfApproximatelyUniqueRule()\",\n", " \"rule_description\": \"If the ratio of approximate num distinct values in a column is close to the number of records (within the error of the HLL sketch), we suggest a UNIQUE constraint\",\n", @@ -696,7 +1042,7 @@ " {\n", " \"constraint_name\": \"ComplianceConstraint(Compliance('customer_id' has no negative values,customer_id >= 0,None))\",\n", " \"column_name\": \"customer_id\",\n", - " \"current_value\": \"Minimum: 10005.0\",\n", + " \"current_value\": \"Minimum: 100000.0\",\n", " \"description\": \"'customer_id' has no negative values\",\n", " \"suggesting_rule\": \"NonNegativeNumbersRule()\",\n", " \"rule_description\": \"If we see only non-negative numbers in a column, we suggest a corresponding constraint\",\n", @@ -721,6 +1067,15 @@ " \"code_for_constraint\": \".isComplete(\\\"review_date\\\")\"\n", " },\n", " {\n", + " \"constraint_name\": \"ComplianceConstraint(Compliance('helpful_votes' has value range '15', '14', '13', '16', '17', '12', '18', '11', '19', '10', '9', '20', '8', '21', '22', '7', '6', '23', '5', '24', '4', '25', '26', '3',`helpful_votes` IN ('15', '14', '13', '16', '17', '12', '18', '11', '19', '10', '9', '20', '8', '21', '22', '7', '6', '23', '5', '24', '4', '25', '26', '3'),None))\",\n", + " \"column_name\": \"helpful_votes\",\n", + " \"current_value\": \"Compliance: 1\",\n", + " \"description\": \"'helpful_votes' has value range '15', '14', '13', '16', '17', '12', '18', '11', '19', '10', '9', '20', '8', '21', '22', '7', '6', '23', '5', '24', '4', '25', '26', '3'\",\n", + " \"suggesting_rule\": \"CategoricalRangeRule(com.amazon.deequ.suggestions.rules.CategoricalRangeRule$$$Lambda$4119/0x000000080197e840@74f276b0)\",\n", + " \"rule_description\": \"If we see a categorical range for a column, we suggest an IS IN (...) 
constraint\",\n", + " \"code_for_constraint\": \".isContainedIn(\\\"helpful_votes\\\", [\\\"15\\\", \\\"14\\\", \\\"13\\\", \\\"16\\\", \\\"17\\\", \\\"12\\\", \\\"18\\\", \\\"11\\\", \\\"19\\\", \\\"10\\\", \\\"9\\\", \\\"20\\\", \\\"8\\\", \\\"21\\\", \\\"22\\\", \\\"7\\\", \\\"6\\\", \\\"23\\\", \\\"5\\\", \\\"24\\\", \\\"4\\\", \\\"25\\\", \\\"26\\\", \\\"3\\\"])\"\n", + " },\n", + " {\n", " \"constraint_name\": \"CompletenessConstraint(Completeness(helpful_votes,None))\",\n", " \"column_name\": \"helpful_votes\",\n", " \"current_value\": \"Completeness: 1.0\",\n", @@ -730,15 +1085,33 @@ " \"code_for_constraint\": \".isComplete(\\\"helpful_votes\\\")\"\n", " },\n", " {\n", + " \"constraint_name\": \"ComplianceConstraint(Compliance('helpful_votes' has value range '15', '14', '13', '16', '17', '12', '18', '11' for at least 92.0% of values,`helpful_votes` IN ('15', '14', '13', '16', '17', '12', '18', '11'),None))\",\n", + " \"column_name\": \"helpful_votes\",\n", + " \"current_value\": \"Compliance: 0.926417449248947\",\n", + " \"description\": \"'helpful_votes' has value range '15', '14', '13', '16', '17', '12', '18', '11' for at least 92.0% of values\",\n", + " \"suggesting_rule\": \"FractionalCategoricalRangeRule(0.9,com.amazon.deequ.suggestions.rules.FractionalCategoricalRangeRule$$$Lambda$4120/0x000000080197f040@65f67611)\",\n", + " \"rule_description\": \"If we see a categorical range for most values in a column, we suggest an IS IN (...) constraint that should hold for most values\",\n", + " \"code_for_constraint\": \".isContainedIn(\\\"helpful_votes\\\", [\\\"15\\\", \\\"14\\\", \\\"13\\\", \\\"16\\\", \\\"17\\\", \\\"12\\\", \\\"18\\\", \\\"11\\\"], lambda x: x >= 0.92, \\\"It should be above 0.92!\\\")\"\n", + " },\n", + " {\n", " \"constraint_name\": \"ComplianceConstraint(Compliance('helpful_votes' has no negative values,helpful_votes >= 0,None))\",\n", " \"column_name\": \"helpful_votes\",\n", - " \"current_value\": \"Minimum: 0.0\",\n", + " \"current_value\": \"Minimum: 3.0\",\n", " \"description\": \"'helpful_votes' has no negative values\",\n", " \"suggesting_rule\": \"NonNegativeNumbersRule()\",\n", " \"rule_description\": \"If we see only non-negative numbers in a column, we suggest a corresponding constraint\",\n", " \"code_for_constraint\": \".isNonNegative(\\\"helpful_votes\\\")\"\n", " },\n", " {\n", + " \"constraint_name\": \"ComplianceConstraint(Compliance('star_rating' has value range '5', '4', '3', '2', '1',`star_rating` IN ('5', '4', '3', '2', '1'),None))\",\n", + " \"column_name\": \"star_rating\",\n", + " \"current_value\": \"Compliance: 1\",\n", + " \"description\": \"'star_rating' has value range '5', '4', '3', '2', '1'\",\n", + " \"suggesting_rule\": \"CategoricalRangeRule(com.amazon.deequ.suggestions.rules.CategoricalRangeRule$$$Lambda$4119/0x000000080197e840@74f276b0)\",\n", + " \"rule_description\": \"If we see a categorical range for a column, we suggest an IS IN (...) 
constraint\",\n", + " \"code_for_constraint\": \".isContainedIn(\\\"star_rating\\\", [\\\"5\\\", \\\"4\\\", \\\"3\\\", \\\"2\\\", \\\"1\\\"])\"\n", + " },\n", + " {\n", " \"constraint_name\": \"CompletenessConstraint(Completeness(star_rating,None))\",\n", " \"column_name\": \"star_rating\",\n", " \"current_value\": \"Completeness: 1.0\",\n", @@ -748,6 +1121,15 @@ " \"code_for_constraint\": \".isComplete(\\\"star_rating\\\")\"\n", " },\n", " {\n", + " \"constraint_name\": \"ComplianceConstraint(Compliance('star_rating' has value range '5', '4', '3', '2' for at least 94.0% of values,`star_rating` IN ('5', '4', '3', '2'),None))\",\n", + " \"column_name\": \"star_rating\",\n", + " \"current_value\": \"Compliance: 0.9499998671525341\",\n", + " \"description\": \"'star_rating' has value range '5', '4', '3', '2' for at least 94.0% of values\",\n", + " \"suggesting_rule\": \"FractionalCategoricalRangeRule(0.9,com.amazon.deequ.suggestions.rules.FractionalCategoricalRangeRule$$$Lambda$4120/0x000000080197f040@65f67611)\",\n", + " \"rule_description\": \"If we see a categorical range for most values in a column, we suggest an IS IN (...) constraint that should hold for most values\",\n", + " \"code_for_constraint\": \".isContainedIn(\\\"star_rating\\\", [\\\"5\\\", \\\"4\\\", \\\"3\\\", \\\"2\\\"], lambda x: x >= 0.94, \\\"It should be above 0.94!\\\")\"\n", + " },\n", + " {\n", " \"constraint_name\": \"ComplianceConstraint(Compliance('star_rating' has no negative values,star_rating >= 0,None))\",\n", " \"column_name\": \"star_rating\",\n", " \"current_value\": \"Minimum: 1.0\",\n", @@ -757,24 +1139,6 @@ " \"code_for_constraint\": \".isNonNegative(\\\"star_rating\\\")\"\n", " },\n", " {\n", - " \"constraint_name\": \"CompletenessConstraint(Completeness(year,None))\",\n", - " \"column_name\": \"year\",\n", - " \"current_value\": \"Completeness: 1.0\",\n", - " \"description\": \"'year' is not null\",\n", - " \"suggesting_rule\": \"CompleteIfCompleteRule()\",\n", - " \"rule_description\": \"If a column is complete in the sample, we suggest a NOT NULL constraint\",\n", - " \"code_for_constraint\": \".isComplete(\\\"year\\\")\"\n", - " },\n", - " {\n", - " \"constraint_name\": \"ComplianceConstraint(Compliance('year' has no negative values,year >= 0,None))\",\n", - " \"column_name\": \"year\",\n", - " \"current_value\": \"Minimum: 1999.0\",\n", - " \"description\": \"'year' has no negative values\",\n", - " \"suggesting_rule\": \"NonNegativeNumbersRule()\",\n", - " \"rule_description\": \"If we see only non-negative numbers in a column, we suggest a corresponding constraint\",\n", - " \"code_for_constraint\": \".isNonNegative(\\\"year\\\")\"\n", - " },\n", - " {\n", " \"constraint_name\": \"CompletenessConstraint(Completeness(product_title,None))\",\n", " \"column_name\": \"product_title\",\n", " \"current_value\": \"Completeness: 1.0\",\n", @@ -786,150 +1150,157 @@ " {\n", " \"constraint_name\": \"CompletenessConstraint(Completeness(review_headline,None))\",\n", " \"column_name\": \"review_headline\",\n", - " \"current_value\": \"Completeness: 0.9999987183340393\",\n", - " \"description\": \"'review_headline' has less than 1% missing values\",\n", - " \"suggesting_rule\": \"RetainCompletenessRule()\",\n", - " \"rule_description\": \"If a column is incomplete in the sample, we model its completeness as a binomial variable, estimate a confidence interval and use this to define a lower bound for the completeness\",\n", - " \"code_for_constraint\": \".hasCompleteness(\\\"review_headline\\\", lambda x: x >= 
0.99, \\\"It should be above 0.99!\\\")\"\n", - " },\n", - " {\n", - " \"constraint_name\": \"CompletenessConstraint(Completeness(product_id,None))\",\n", - " \"column_name\": \"product_id\",\n", " \"current_value\": \"Completeness: 1.0\",\n", - " \"description\": \"'product_id' is not null\",\n", + " \"description\": \"'review_headline' is not null\",\n", " \"suggesting_rule\": \"CompleteIfCompleteRule()\",\n", " \"rule_description\": \"If a column is complete in the sample, we suggest a NOT NULL constraint\",\n", - " \"code_for_constraint\": \".isComplete(\\\"product_id\\\")\"\n", + " \"code_for_constraint\": \".isComplete(\\\"review_headline\\\")\"\n", " },\n", " {\n", - " \"constraint_name\": \"CompletenessConstraint(Completeness(total_votes,None))\",\n", - " \"column_name\": \"total_votes\",\n", + " \"constraint_name\": \"ComplianceConstraint(Compliance('review_year' has value range '2016', '2015', '2014', '2013', '2012', '2011', '2010', '2009', '2008', '2007', '2006', '2005', '2004', '2003', '2002', '2001', '2000', '1999', '1998', '1997', '1996', '1696', '2101', '2202',`review_year` IN ('2016', '2015', '2014', '2013', '2012', '2011', '2010', '2009', '2008', '2007', '2006', '2005', '2004', '2003', '2002', '2001', '2000', '1999', '1998', '1997', '1996', '1696', '2101', '2202'),None))\",\n", + " \"column_name\": \"review_year\",\n", + " \"current_value\": \"Compliance: 1\",\n", + " \"description\": \"'review_year' has value range '2016', '2015', '2014', '2013', '2012', '2011', '2010', '2009', '2008', '2007', '2006', '2005', '2004', '2003', '2002', '2001', '2000', '1999', '1998', '1997', '1996', '1696', '2101', '2202'\",\n", + " \"suggesting_rule\": \"CategoricalRangeRule(com.amazon.deequ.suggestions.rules.CategoricalRangeRule$$$Lambda$4119/0x000000080197e840@74f276b0)\",\n", + " \"rule_description\": \"If we see a categorical range for a column, we suggest an IS IN (...) 
constraint\",\n", + " \"code_for_constraint\": \".isContainedIn(\\\"review_year\\\", [\\\"2016\\\", \\\"2015\\\", \\\"2014\\\", \\\"2013\\\", \\\"2012\\\", \\\"2011\\\", \\\"2010\\\", \\\"2009\\\", \\\"2008\\\", \\\"2007\\\", \\\"2006\\\", \\\"2005\\\", \\\"2004\\\", \\\"2003\\\", \\\"2002\\\", \\\"2001\\\", \\\"2000\\\", \\\"1999\\\", \\\"1998\\\", \\\"1997\\\", \\\"1996\\\", \\\"1696\\\", \\\"2101\\\", \\\"2202\\\"])\"\n", + " },\n", + " {\n", + " \"constraint_name\": \"CompletenessConstraint(Completeness(review_year,None))\",\n", + " \"column_name\": \"review_year\",\n", " \"current_value\": \"Completeness: 1.0\",\n", - " \"description\": \"'total_votes' is not null\",\n", + " \"description\": \"'review_year' is not null\",\n", " \"suggesting_rule\": \"CompleteIfCompleteRule()\",\n", " \"rule_description\": \"If a column is complete in the sample, we suggest a NOT NULL constraint\",\n", - " \"code_for_constraint\": \".isComplete(\\\"total_votes\\\")\"\n", + " \"code_for_constraint\": \".isComplete(\\\"review_year\\\")\"\n", " },\n", " {\n", - " \"constraint_name\": \"ComplianceConstraint(Compliance('total_votes' has no negative values,total_votes >= 0,None))\",\n", - " \"column_name\": \"total_votes\",\n", - " \"current_value\": \"Minimum: 0.0\",\n", - " \"description\": \"'total_votes' has no negative values\",\n", + " \"constraint_name\": \"ComplianceConstraint(Compliance('review_year' has value range '2016', '2015', '2014', '2013', '2012', '2011', '2010', '2009', '2008', '2007', '2006', '2005' for at least 91.0% of values,`review_year` IN ('2016', '2015', '2014', '2013', '2012', '2011', '2010', '2009', '2008', '2007', '2006', '2005'),None))\",\n", + " \"column_name\": \"review_year\",\n", + " \"current_value\": \"Compliance: 0.9157531189263799\",\n", + " \"description\": \"'review_year' has value range '2016', '2015', '2014', '2013', '2012', '2011', '2010', '2009', '2008', '2007', '2006', '2005' for at least 91.0% of values\",\n", + " \"suggesting_rule\": \"FractionalCategoricalRangeRule(0.9,com.amazon.deequ.suggestions.rules.FractionalCategoricalRangeRule$$$Lambda$4120/0x000000080197f040@65f67611)\",\n", + " \"rule_description\": \"If we see a categorical range for most values in a column, we suggest an IS IN (...) 
constraint that should hold for most values\",\n", + " \"code_for_constraint\": \".isContainedIn(\\\"review_year\\\", [\\\"2016\\\", \\\"2015\\\", \\\"2014\\\", \\\"2013\\\", \\\"2012\\\", \\\"2011\\\", \\\"2010\\\", \\\"2009\\\", \\\"2008\\\", \\\"2007\\\", \\\"2006\\\", \\\"2005\\\"], lambda x: x >= 0.91, \\\"It should be above 0.91!\\\")\"\n", + " },\n", + " {\n", + " \"constraint_name\": \"ComplianceConstraint(Compliance('review_year' has no negative values,review_year >= 0,None))\",\n", + " \"column_name\": \"review_year\",\n", + " \"current_value\": \"Minimum: 1696.0\",\n", + " \"description\": \"'review_year' has no negative values\",\n", " \"suggesting_rule\": \"NonNegativeNumbersRule()\",\n", " \"rule_description\": \"If we see only non-negative numbers in a column, we suggest a corresponding constraint\",\n", - " \"code_for_constraint\": \".isNonNegative(\\\"total_votes\\\")\"\n", + " \"code_for_constraint\": \".isNonNegative(\\\"review_year\\\")\"\n", " },\n", " {\n", - " \"constraint_name\": \"CompletenessConstraint(Completeness(product_parent,None))\",\n", - " \"column_name\": \"product_parent\",\n", + " \"constraint_name\": \"CompletenessConstraint(Completeness(product_id,None))\",\n", + " \"column_name\": \"product_id\",\n", " \"current_value\": \"Completeness: 1.0\",\n", - " \"description\": \"'product_parent' is not null\",\n", + " \"description\": \"'product_id' is not null\",\n", " \"suggesting_rule\": \"CompleteIfCompleteRule()\",\n", " \"rule_description\": \"If a column is complete in the sample, we suggest a NOT NULL constraint\",\n", - " \"code_for_constraint\": \".isComplete(\\\"product_parent\\\")\"\n", + " \"code_for_constraint\": \".isComplete(\\\"product_id\\\")\"\n", " },\n", " {\n", - " \"constraint_name\": \"ComplianceConstraint(Compliance('product_parent' has no negative values,product_parent >= 0,None))\",\n", - " \"column_name\": \"product_parent\",\n", - " \"current_value\": \"Minimum: 6478.0\",\n", - " \"description\": \"'product_parent' has no negative values\",\n", + " \"constraint_name\": \"ComplianceConstraint(Compliance('product_id' has no negative values,product_id >= 0,None))\",\n", + " \"column_name\": \"product_id\",\n", + " \"current_value\": \"Minimum: 10000.0\",\n", + " \"description\": \"'product_id' has no negative values\",\n", " \"suggesting_rule\": \"NonNegativeNumbersRule()\",\n", " \"rule_description\": \"If we see only non-negative numbers in a column, we suggest a corresponding constraint\",\n", - " \"code_for_constraint\": \".isNonNegative(\\\"product_parent\\\")\"\n", + " \"code_for_constraint\": \".isNonNegative(\\\"product_id\\\")\"\n", " },\n", " {\n", - " \"constraint_name\": \"AnalysisBasedConstraint(DataType(product_parent,None),,Some(),None)\",\n", - " \"column_name\": \"product_parent\",\n", + " \"constraint_name\": \"AnalysisBasedConstraint(DataType(product_id,None),,Some(),None)\",\n", + " \"column_name\": \"product_id\",\n", " \"current_value\": \"DataType: Integral\",\n", - " \"description\": \"'product_parent' has type Integral\",\n", + " \"description\": \"'product_id' has type Integral\",\n", " \"suggesting_rule\": \"RetainTypeRule()\",\n", " \"rule_description\": \"If we detect a non-string type, we suggest a type constraint\",\n", - " \"code_for_constraint\": \".hasDataType(\\\"product_parent\\\", ConstrainableDataTypes.Integral)\"\n", - " },\n", - " {\n", - " \"constraint_name\": \"CompletenessConstraint(Completeness(review_body,None))\",\n", - " \"column_name\": \"review_body\",\n", - " \"current_value\": 
\"Completeness: 0.9999724441818453\",\n", - " \"description\": \"'review_body' has less than 1% missing values\",\n", - " \"suggesting_rule\": \"RetainCompletenessRule()\",\n", - " \"rule_description\": \"If a column is incomplete in the sample, we model its completeness as a binomial variable, estimate a confidence interval and use this to define a lower bound for the completeness\",\n", - " \"code_for_constraint\": \".hasCompleteness(\\\"review_body\\\", lambda x: x >= 0.99, \\\"It should be above 0.99!\\\")\"\n", + " \"code_for_constraint\": \".hasDataType(\\\"product_id\\\", ConstrainableDataTypes.Integral)\"\n", " },\n", " {\n", - " \"constraint_name\": \"ComplianceConstraint(Compliance('vine' has value range 'N', 'Y',`vine` IN ('N', 'Y'),None))\",\n", - " \"column_name\": \"vine\",\n", + " \"constraint_name\": \"ComplianceConstraint(Compliance('total_votes' has value range '19', '20', '21', '18', '17', '22', '23', '16', '15', '24', '25', '14', '26', '13', '12', '27', '28', '11', '10', '29', '30', '9', '8', '31', '32', '7', '33', '6', '5', '34', '4', '35', '3', '36', '2', '37', '38', '39', '1', '0',`total_votes` IN ('19', '20', '21', '18', '17', '22', '23', '16', '15', '24', '25', '14', '26', '13', '12', '27', '28', '11', '10', '29', '30', '9', '8', '31', '32', '7', '33', '6', '5', '34', '4', '35', '3', '36', '2', '37', '38', '39', '1', '0'),None))\",\n", + " \"column_name\": \"total_votes\",\n", " \"current_value\": \"Compliance: 1\",\n", - " \"description\": \"'vine' has value range 'N', 'Y'\",\n", - " \"suggesting_rule\": \"CategoricalRangeRule()\",\n", + " \"description\": \"'total_votes' has value range '19', '20', '21', '18', '17', '22', '23', '16', '15', '24', '25', '14', '26', '13', '12', '27', '28', '11', '10', '29', '30', '9', '8', '31', '32', '7', '33', '6', '5', '34', '4', '35', '3', '36', '2', '37', '38', '39', '1', '0'\",\n", + " \"suggesting_rule\": \"CategoricalRangeRule(com.amazon.deequ.suggestions.rules.CategoricalRangeRule$$$Lambda$4119/0x000000080197e840@74f276b0)\",\n", " \"rule_description\": \"If we see a categorical range for a column, we suggest an IS IN (...) 
constraint\",\n", - " \"code_for_constraint\": \".isContainedIn(\\\"vine\\\", [\\\"N\\\", \\\"Y\\\"])\"\n", + " \"code_for_constraint\": \".isContainedIn(\\\"total_votes\\\", [\\\"19\\\", \\\"20\\\", \\\"21\\\", \\\"18\\\", \\\"17\\\", \\\"22\\\", \\\"23\\\", \\\"16\\\", \\\"15\\\", \\\"24\\\", \\\"25\\\", \\\"14\\\", \\\"26\\\", \\\"13\\\", \\\"12\\\", \\\"27\\\", \\\"28\\\", \\\"11\\\", \\\"10\\\", \\\"29\\\", \\\"30\\\", \\\"9\\\", \\\"8\\\", \\\"31\\\", \\\"32\\\", \\\"7\\\", \\\"33\\\", \\\"6\\\", \\\"5\\\", \\\"34\\\", \\\"4\\\", \\\"35\\\", \\\"3\\\", \\\"36\\\", \\\"2\\\", \\\"37\\\", \\\"38\\\", \\\"39\\\", \\\"1\\\", \\\"0\\\"])\"\n", " },\n", " {\n", - " \"constraint_name\": \"CompletenessConstraint(Completeness(vine,None))\",\n", - " \"column_name\": \"vine\",\n", + " \"constraint_name\": \"CompletenessConstraint(Completeness(total_votes,None))\",\n", + " \"column_name\": \"total_votes\",\n", " \"current_value\": \"Completeness: 1.0\",\n", - " \"description\": \"'vine' is not null\",\n", + " \"description\": \"'total_votes' is not null\",\n", " \"suggesting_rule\": \"CompleteIfCompleteRule()\",\n", " \"rule_description\": \"If a column is complete in the sample, we suggest a NOT NULL constraint\",\n", - " \"code_for_constraint\": \".isComplete(\\\"vine\\\")\"\n", + " \"code_for_constraint\": \".isComplete(\\\"total_votes\\\")\"\n", " },\n", " {\n", - " \"constraint_name\": \"ComplianceConstraint(Compliance('vine' has value range 'N' for at least 99.0% of values,`vine` IN ('N'),None))\",\n", - " \"column_name\": \"vine\",\n", - " \"current_value\": \"Compliance: 0.9939271462617969\",\n", - " \"description\": \"'vine' has value range 'N' for at least 99.0% of values\",\n", - " \"suggesting_rule\": \"FractionalCategoricalRangeRule(0.9)\",\n", + " \"constraint_name\": \"ComplianceConstraint(Compliance('total_votes' has value range '19', '20', '21', '18', '17', '22', '23', '16', '15', '24', '25', '14', '26' for at least 90.0% of values,`total_votes` IN ('19', '20', '21', '18', '17', '22', '23', '16', '15', '24', '25', '14', '26'),None))\",\n", + " \"column_name\": \"total_votes\",\n", + " \"current_value\": \"Compliance: 0.904062874048646\",\n", + " \"description\": \"'total_votes' has value range '19', '20', '21', '18', '17', '22', '23', '16', '15', '24', '25', '14', '26' for at least 90.0% of values\",\n", + " \"suggesting_rule\": \"FractionalCategoricalRangeRule(0.9,com.amazon.deequ.suggestions.rules.FractionalCategoricalRangeRule$$$Lambda$4120/0x000000080197f040@65f67611)\",\n", " \"rule_description\": \"If we see a categorical range for most values in a column, we suggest an IS IN (...) 
constraint that should hold for most values\",\n", - " \"code_for_constraint\": \".isContainedIn(\\\"vine\\\", [\\\"N\\\"], lambda x: x >= 0.99, \\\"It should be above 0.99!\\\")\"\n", + " \"code_for_constraint\": \".isContainedIn(\\\"total_votes\\\", [\\\"19\\\", \\\"20\\\", \\\"21\\\", \\\"18\\\", \\\"17\\\", \\\"22\\\", \\\"23\\\", \\\"16\\\", \\\"15\\\", \\\"24\\\", \\\"25\\\", \\\"14\\\", \\\"26\\\"], lambda x: x >= 0.9, \\\"It should be above 0.9!\\\")\"\n", " },\n", " {\n", - " \"constraint_name\": \"ComplianceConstraint(Compliance('marketplace' has value range 'US', 'UK', 'DE', 'JP', 'FR',`marketplace` IN ('US', 'UK', 'DE', 'JP', 'FR'),None))\",\n", - " \"column_name\": \"marketplace\",\n", - " \"current_value\": \"Compliance: 1\",\n", - " \"description\": \"'marketplace' has value range 'US', 'UK', 'DE', 'JP', 'FR'\",\n", - " \"suggesting_rule\": \"CategoricalRangeRule()\",\n", - " \"rule_description\": \"If we see a categorical range for a column, we suggest an IS IN (...) constraint\",\n", - " \"code_for_constraint\": \".isContainedIn(\\\"marketplace\\\", [\\\"US\\\", \\\"UK\\\", \\\"DE\\\", \\\"JP\\\", \\\"FR\\\"])\"\n", + " \"constraint_name\": \"ComplianceConstraint(Compliance('total_votes' has no negative values,total_votes >= 0,None))\",\n", + " \"column_name\": \"total_votes\",\n", + " \"current_value\": \"Minimum: 0.0\",\n", + " \"description\": \"'total_votes' has no negative values\",\n", + " \"suggesting_rule\": \"NonNegativeNumbersRule()\",\n", + " \"rule_description\": \"If we see only non-negative numbers in a column, we suggest a corresponding constraint\",\n", + " \"code_for_constraint\": \".isNonNegative(\\\"total_votes\\\")\"\n", " },\n", " {\n", - " \"constraint_name\": \"CompletenessConstraint(Completeness(marketplace,None))\",\n", - " \"column_name\": \"marketplace\",\n", + " \"constraint_name\": \"CompletenessConstraint(Completeness(review_body,None))\",\n", + " \"column_name\": \"review_body\",\n", " \"current_value\": \"Completeness: 1.0\",\n", - " \"description\": \"'marketplace' is not null\",\n", + " \"description\": \"'review_body' is not null\",\n", " \"suggesting_rule\": \"CompleteIfCompleteRule()\",\n", " \"rule_description\": \"If a column is complete in the sample, we suggest a NOT NULL constraint\",\n", - " \"code_for_constraint\": \".isComplete(\\\"marketplace\\\")\"\n", + " \"code_for_constraint\": \".isComplete(\\\"review_body\\\")\"\n", " },\n", " {\n", - " \"constraint_name\": \"ComplianceConstraint(Compliance('marketplace' has value range 'US' for at least 99.0% of values,`marketplace` IN ('US'),None))\",\n", - " \"column_name\": \"marketplace\",\n", - " \"current_value\": \"Compliance: 0.9949982985884372\",\n", - " \"description\": \"'marketplace' has value range 'US' for at least 99.0% of values\",\n", - " \"suggesting_rule\": \"FractionalCategoricalRangeRule(0.9)\",\n", - " \"rule_description\": \"If we see a categorical range for most values in a column, we suggest an IS IN (...) 
constraint that should hold for most values\",\n", - " \"code_for_constraint\": \".isContainedIn(\\\"marketplace\\\", [\\\"US\\\"], lambda x: x >= 0.99, \\\"It should be above 0.99!\\\")\"\n", + " \"constraint_name\": \"UniquenessConstraint(Uniqueness(List(review_body),None))\",\n", + " \"column_name\": \"review_body\",\n", + " \"current_value\": \"ApproxDistinctness: 1.0332703193520232\",\n", + " \"description\": \"'review_body' is unique\",\n", + " \"suggesting_rule\": \"UniqueIfApproximatelyUniqueRule()\",\n", + " \"rule_description\": \"If the ratio of approximate num distinct values in a column is close to the number of records (within the error of the HLL sketch), we suggest a UNIQUE constraint\",\n", + " \"code_for_constraint\": \".isUnique(\\\"review_body\\\")\"\n", " },\n", " {\n", - " \"constraint_name\": \"ComplianceConstraint(Compliance('verified_purchase' has value range 'Y', 'N',`verified_purchase` IN ('Y', 'N'),None))\",\n", - " \"column_name\": \"verified_purchase\",\n", + " \"constraint_name\": \"ComplianceConstraint(Compliance('marketplace' has value range '', 'FR', 'JP', 'UK', 'DE', 'US',`marketplace` IN ('', 'FR', 'JP', 'UK', 'DE', 'US'),None))\",\n", + " \"column_name\": \"marketplace\",\n", " \"current_value\": \"Compliance: 1\",\n", - " \"description\": \"'verified_purchase' has value range 'Y', 'N'\",\n", - " \"suggesting_rule\": \"CategoricalRangeRule()\",\n", + " \"description\": \"'marketplace' has value range '', 'FR', 'JP', 'UK', 'DE', 'US'\",\n", + " \"suggesting_rule\": \"CategoricalRangeRule(com.amazon.deequ.suggestions.rules.CategoricalRangeRule$$$Lambda$4119/0x000000080197e840@74f276b0)\",\n", " \"rule_description\": \"If we see a categorical range for a column, we suggest an IS IN (...) constraint\",\n", - " \"code_for_constraint\": \".isContainedIn(\\\"verified_purchase\\\", [\\\"Y\\\", \\\"N\\\"])\"\n", + " \"code_for_constraint\": \".isContainedIn(\\\"marketplace\\\", [\\\"\\\", \\\"FR\\\", \\\"JP\\\", \\\"UK\\\", \\\"DE\\\", \\\"US\\\"])\"\n", " },\n", " {\n", - " \"constraint_name\": \"CompletenessConstraint(Completeness(verified_purchase,None))\",\n", - " \"column_name\": \"verified_purchase\",\n", - " \"current_value\": \"Completeness: 1.0\",\n", - " \"description\": \"'verified_purchase' is not null\",\n", - " \"suggesting_rule\": \"CompleteIfCompleteRule()\",\n", - " \"rule_description\": \"If a column is complete in the sample, we suggest a NOT NULL constraint\",\n", - " \"code_for_constraint\": \".isComplete(\\\"verified_purchase\\\")\"\n", + " \"constraint_name\": \"CompletenessConstraint(Completeness(marketplace,None))\",\n", + " \"column_name\": \"marketplace\",\n", + " \"current_value\": \"Completeness: 0.8570391886739565\",\n", + " \"description\": \"'marketplace' has less than 15% missing values\",\n", + " \"suggesting_rule\": \"RetainCompletenessRule()\",\n", + " \"rule_description\": \"If a column is incomplete in the sample, we model its completeness as a binomial variable, estimate a confidence interval and use this to define a lower bound for the completeness\",\n", + " \"code_for_constraint\": \".hasCompleteness(\\\"marketplace\\\", lambda x: x >= 0.85, \\\"It should be above 0.85!\\\")\"\n", " }\n", " ]\n", "}\n" ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] } ], "source": [ @@ -948,21 +1319,22 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The above result contains a list of constraints with descriptions and Python code, so that you can directly apply it in your data quality checks." 
+ "The result contains a list of constraints with descriptions and Python code, so that you can directly apply it in your data quality checks. You can call `print(json.dumps(result_json))` to inspect the suggested constraints." ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# Scaling to Production \n", "\n", - "So far we’ve shown you how to use these capabilities in the context of data exploration using a Jupyter notebook running on a SageMaker Notebook instance. As your project matures, you’ll want to use the same capabilities on larger and larger datasets, and in a production environment. With PyDeequ, it’s easy to make that transition.\n", + "So far, we’ve shown you how to use these capabilities in the context of data exploration using a Jupyter notebook running on a SageMaker notebook instance. As your project matures, you need to use the same capabilities on larger and larger datasets, and in a production environment. With PyDeequ, it’s straightforward to make that transition. The following diagram illustrates deployment options for local and production purposes on AWS.\n", + "\n", + "![pydeequ-in-production](../imgs/pydeequ_deployment.png)\n", "\n", - "![image.png](../imgs/pydeequ_deployment.png)\n", + "**Figure 3. Deployment of PyDeequ in production.** \n", "\n", - "As seen in the diagram above, you can leverage both an AWS EMR cluster and/or AWS Glue for larger or production purposes. " + "As seen in the diagram above, you can leverage both an AWS EMR cluster and/or AWS Glue for larger or production purposes. To learn more about how to configure an EMR cluster with PyDeequ to explore much larger volumes of data please refer to the AWS blog post [Testing data quality at scale with PyDeequ](https://aws.amazon.com/blogs/big-data/testing-data-quality-at-scale-with-pydeequ/)." ] }, { @@ -971,19 +1343,17 @@ "source": [ "## More Examples on GitHub\n", "\n", - "You can find examples of more advanced features at _Deequ’s GitHub page (https://github.com/awslabs/deequ)_:\n", + "You can find examples of more advanced features on the [Deequ GitHub repo](https://github.com/awslabs/deequ):\n", "\n", - "* Deequ not only provides data quality checks with fixed thresholds. Learn how to use _anomaly detection on data quality metrics (https://github.com/awslabs/deequ/blob/master/src/main/scala/com/amazon/deequ/examples/anomaly_detection_example.md)_ to apply tests on metrics that change over time.\n", - "* Deequ offers support for storing and loading metrics. Learn how to use the _MetricsRepository (https://github.com/awslabs/deequ/blob/master/src/main/scala/com/amazon/deequ/examples/metrics_repository_example.md)_ for this use case.\n", - "* If your dataset grows over time or is partitioned, you can use Deequ’s _incremental metrics computation (https://github.com/awslabs/deequ/blob/master/src/main/scala/com/amazon/deequ/examples/algebraic_states_example.md)_ capability. For each partition, Deequ stores a state for each computed metric. To compute metrics for the union of partitions, Deequ can use these states to efficiently derive overall metrics without reloading the data.\n", - "\n", - "## Additional Resources\n", - "\n", - "Learn more about the inner workings of Deequ in the VLDB 2018 paper “_Automating large-scale data quality verification. (http://www.vldb.org/pvldb/vol11/p1781-schelter.pdf)_”\n", + "- Deequ provides more than data quality checks with fixed thresholds. 
   {
-   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "# Scaling to Production \n",
     "\n",
-    "So far we’ve shown you how to use these capabilities in the context of data exploration using a Jupyter notebook running on a SageMaker Notebook instance. As your project matures, you’ll want to use the same capabilities on larger and larger datasets, and in a production environment. With PyDeequ, it’s easy to make that transition.\n",
+    "So far, we’ve shown you how to use these capabilities in the context of data exploration using a Jupyter notebook running on a SageMaker notebook instance. As your project matures, you’ll need to use the same capabilities on larger and larger datasets, and in a production environment. With PyDeequ, it’s straightforward to make that transition. The following diagram illustrates deployment options for local and production use on AWS.\n",
+    "\n",
+    "![pydeequ-in-production](../imgs/pydeequ_deployment.png)\n",
     "\n",
-    "![image.png](../imgs/pydeequ_deployment.png)\n",
+    "**Figure 3. Deployment of PyDeequ in production.** \n",
     "\n",
-    "As seen in the diagram above, you can leverage both an AWS EMR cluster and/or AWS Glue for larger or production purposes. "
+    "As seen in the diagram above, you can leverage an Amazon EMR cluster and/or AWS Glue for larger-scale or production workloads. To learn more about configuring an EMR cluster with PyDeequ to explore much larger volumes of data, refer to the AWS blog post [Testing data quality at scale with PyDeequ](https://aws.amazon.com/blogs/big-data/testing-data-quality-at-scale-with-pydeequ/)."
    ]
   },
   {
@@ -971,19 +1343,17 @@
    "source": [
     "## More Examples on GitHub\n",
     "\n",
-    "You can find examples of more advanced features at _Deequ’s GitHub page (https://github.com/awslabs/deequ)_:\n",
+    "You can find examples of more advanced features on the [Deequ GitHub repo](https://github.com/awslabs/deequ):\n",
     "\n",
-    "* Deequ not only provides data quality checks with fixed thresholds. Learn how to use _anomaly detection on data quality metrics (https://github.com/awslabs/deequ/blob/master/src/main/scala/com/amazon/deequ/examples/anomaly_detection_example.md)_ to apply tests on metrics that change over time.\n",
-    "* Deequ offers support for storing and loading metrics. Learn how to use the _MetricsRepository (https://github.com/awslabs/deequ/blob/master/src/main/scala/com/amazon/deequ/examples/metrics_repository_example.md)_ for this use case.\n",
-    "* If your dataset grows over time or is partitioned, you can use Deequ’s _incremental metrics computation (https://github.com/awslabs/deequ/blob/master/src/main/scala/com/amazon/deequ/examples/algebraic_states_example.md)_ capability. For each partition, Deequ stores a state for each computed metric. To compute metrics for the union of partitions, Deequ can use these states to efficiently derive overall metrics without reloading the data.\n",
-    "\n",
-    "## Additional Resources\n",
-    "\n",
-    "Learn more about the inner workings of Deequ in the VLDB 2018 paper “_Automating large-scale data quality verification. (http://www.vldb.org/pvldb/vol11/p1781-schelter.pdf)_”\n",
+    "- Deequ provides more than data quality checks with fixed thresholds. Learn how to use [anomaly detection on data quality metrics](https://github.com/awslabs/deequ/blob/master/src/main/scala/com/amazon/deequ/examples/anomaly_detection_example.md) to apply tests on metrics that change over time.\n",
+    "- Deequ offers support for storing and loading metrics. Learn how to use the [MetricsRepository](https://github.com/awslabs/deequ/blob/master/src/main/scala/com/amazon/deequ/examples/metrics_repository_example.md) for this use case.\n",
+    "- If your dataset grows over time or is partitioned, you can use Deequ’s [incremental metrics computation](https://github.com/awslabs/deequ/blob/master/src/main/scala/com/amazon/deequ/examples/algebraic_states_example.md). For each partition, Deequ stores a state for each computed metric. To compute metrics for the union of partitions, Deequ can use these states to efficiently derive overall metrics without reloading the data.\n",
     "\n",
     "## Conclusion\n",
     "\n",
-    "This blog post showed you how to use PyDeequ for calculating data quality metrics, verifying data quality metrics, and profiling data to automate the configuration of data quality checks. PyDeequ is available for you now to build your own data quality management pipeline.\n"
+    "This notebook showed you how to use PyDeequ for calculating data quality metrics, verifying them against constraints, and profiling data to automate the configuration of data quality checks, all from an Amazon SageMaker notebook. PyDeequ is available via `pip install pydeequ` and on GitHub for you to build your own data quality management pipeline.\n",
+    "\n",
+    "Learn more about the inner workings of Deequ in the VLDB 2018 paper [Automating large-scale data quality verification](https://www.vldb.org/pvldb/vol11/p1781-schelter.pdf)."
    ]
   },
   {
@@ -996,7 +1366,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -1010,7 +1380,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.3"
+   "version": "3.9.6"
   }
  },
  "nbformat": 4,
diff --git a/tutorials/verification.ipynb b/tutorials/verification.ipynb
index e24dac7..f098a53 100644
--- a/tutorials/verification.ipynb
+++ b/tutorials/verification.ipynb
@@ -6,14 +6,120 @@
    "source": [
     "# Constraint Verification Basic Tutorial\n",
     "\n",
+    "__Updated June 2024 to use a new dataset__\n",
+    "\n",
     "This Jupyter notebook will give a basic tutorial on how to use PyDeequ's Constraint Verification module."
] }, { "cell_type": "code", "execution_count": 1, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], + "source": [ + "import os\n", + "# indicate your Spark version, here we use Spark 3.5 with pydeequ 1.4.0\n", + "os.environ[\"SPARK_VERSION\"] = '3.5'" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ":: loading settings :: url = jar:file:/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Ivy Default Cache set to: /home/ec2-user/.ivy2/cache\n", + "The jars for the packages stored in: /home/ec2-user/.ivy2/jars\n", + "com.amazon.deequ#deequ added as a dependency\n", + ":: resolving dependencies :: org.apache.spark#spark-submit-parent-b34517df-498e-49a1-8270-59aa779256fd;1.0\n", + "\tconfs: [default]\n", + "\tfound com.amazon.deequ#deequ;2.0.3-spark-3.3 in central\n", + "\tfound org.scala-lang#scala-reflect;2.12.10 in central\n", + "\tfound org.scalanlp#breeze_2.12;0.13.2 in central\n", + "\tfound org.scalanlp#breeze-macros_2.12;0.13.2 in central\n", + "\tfound com.github.fommil.netlib#core;1.1.2 in central\n", + "\tfound net.sf.opencsv#opencsv;2.3 in central\n", + "\tfound com.github.rwl#jtransforms;2.4.0 in central\n", + "\tfound junit#junit;4.8.2 in central\n", + "\tfound org.apache.commons#commons-math3;3.2 in central\n", + "\tfound org.spire-math#spire_2.12;0.13.0 in central\n", + "\tfound org.spire-math#spire-macros_2.12;0.13.0 in central\n", + "\tfound org.typelevel#machinist_2.12;0.6.1 in central\n", + "\tfound com.chuusai#shapeless_2.12;2.3.2 in central\n", + "\tfound org.typelevel#macro-compat_2.12;1.1.1 in central\n", + "\tfound org.slf4j#slf4j-api;1.7.5 in central\n", + ":: resolution report :: resolve 473ms :: artifacts dl 19ms\n", + "\t:: modules in use:\n", + "\tcom.amazon.deequ#deequ;2.0.3-spark-3.3 from central in [default]\n", + "\tcom.chuusai#shapeless_2.12;2.3.2 from central in [default]\n", + "\tcom.github.fommil.netlib#core;1.1.2 from central in [default]\n", + "\tcom.github.rwl#jtransforms;2.4.0 from central in [default]\n", + "\tjunit#junit;4.8.2 from central in [default]\n", + "\tnet.sf.opencsv#opencsv;2.3 from central in [default]\n", + "\torg.apache.commons#commons-math3;3.2 from central in [default]\n", + "\torg.scala-lang#scala-reflect;2.12.10 from central in [default]\n", + "\torg.scalanlp#breeze-macros_2.12;0.13.2 from central in [default]\n", + "\torg.scalanlp#breeze_2.12;0.13.2 from central in [default]\n", + "\torg.slf4j#slf4j-api;1.7.5 from central in [default]\n", + "\torg.spire-math#spire-macros_2.12;0.13.0 from central in [default]\n", + "\torg.spire-math#spire_2.12;0.13.0 from central in [default]\n", + "\torg.typelevel#machinist_2.12;0.6.1 from central in [default]\n", + "\torg.typelevel#macro-compat_2.12;1.1.1 from central in [default]\n", + "\t:: evicted modules:\n", + "\torg.scala-lang#scala-reflect;2.12.1 by [org.scala-lang#scala-reflect;2.12.10] in [default]\n", + "\torg.scala-lang#scala-reflect;2.12.0 by [org.scala-lang#scala-reflect;2.12.10] in [default]\n", + "\t---------------------------------------------------------------------\n", + "\t| | modules || artifacts |\n", + "\t| conf | number| search|dwnlded|evicted|| number|dwnlded|\n", + "\t---------------------------------------------------------------------\n", + "\t| default | 17 | 0 | 0 | 
2 || 15 | 0 |\n", + "\t---------------------------------------------------------------------\n", + ":: retrieving :: org.apache.spark#spark-submit-parent-b34517df-498e-49a1-8270-59aa779256fd\n", + "\tconfs: [default]\n", + "\t0 artifacts copied, 15 already retrieved (0kB/14ms)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "24/06/15 00:01:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Setting default log level to \"WARN\".\n", + "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "24/06/15 00:01:23 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.\n", + "24/06/15 00:01:23 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.\n", + "24/06/15 00:01:23 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.\n", + "24/06/15 00:01:23 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.\n", + "24/06/15 00:01:23 WARN Utils: Service 'SparkUI' could not bind on port 4044. Attempting port 4045.\n" + ] + } + ], "source": [ "from pyspark.sql import SparkSession, Row, DataFrame\n", "import json\n", @@ -36,14 +142,30 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### We will be using the Amazon Product Reviews dataset -- specifically the Electronics subset. " + "### We will be using the synthetic reviews dataset for Electronics products" ] }, { "cell_type": "code", - "execution_count": 2, - "metadata": {}, + "execution_count": 3, + "metadata": { + "tags": [] + }, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "24/06/15 00:01:25 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, { "name": "stdout", "output_type": "stream", @@ -53,51 +175,73 @@ " |-- customer_id: string (nullable = true)\n", " |-- review_id: string (nullable = true)\n", " |-- product_id: string (nullable = true)\n", - " |-- product_parent: string (nullable = true)\n", " |-- product_title: string (nullable = true)\n", - " |-- star_rating: integer (nullable = true)\n", - " |-- helpful_votes: integer (nullable = true)\n", - " |-- total_votes: integer (nullable = true)\n", - " |-- vine: string (nullable = true)\n", - " |-- verified_purchase: string (nullable = true)\n", + " |-- star_rating: long (nullable = true)\n", + " |-- helpful_votes: long (nullable = true)\n", + " |-- total_votes: long (nullable = true)\n", + " |-- insight: string (nullable = true)\n", " |-- review_headline: string (nullable = true)\n", " |-- review_body: string (nullable = true)\n", - " |-- review_date: date (nullable = true)\n", - " |-- year: integer (nullable = true)\n", + " |-- review_date: timestamp (nullable = true)\n", + " |-- review_year: long (nullable = true)\n", "\n" ] } ], "source": [ - "df = spark.read.parquet(\"s3a://amazon-reviews-pds/parquet/product_category=Electronics/\")\n", + "df = spark.read.parquet(\"s3a://aws-bigdata-blog/generated_synthetic_reviews/data/product_category=Electronics/\")\n", "\n", "df.printSchema()" ] }, { "cell_type": "code", - "execution_count": 3, - "metadata": {}, + "execution_count": 4, + "metadata": { + 
"tags": [] + }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Python Callback server started!\n", + "Python Callback server started!\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Verification Run Status: Warning\n", "+--------------------+-----------+------------+--------------------+-----------------+--------------------+\n", "| check|check_level|check_status| constraint|constraint_status| constraint_message|\n", "+--------------------+-----------+------------+--------------------+-----------------+--------------------+\n", - "|Amazon Electronic...| Warning| Warning|SizeConstraint(Si...| Success| |\n", - "|Amazon Electronic...| Warning| Warning|MinimumConstraint...| Success| |\n", - "|Amazon Electronic...| Warning| Warning|MaximumConstraint...| Success| |\n", - "|Amazon Electronic...| Warning| Warning|CompletenessConst...| Success| |\n", - "|Amazon Electronic...| Warning| Warning|UniquenessConstra...| Failure|Value: 0.99265669...|\n", - "|Amazon Electronic...| Warning| Warning|CompletenessConst...| Success| |\n", - "|Amazon Electronic...| Warning| Warning|ComplianceConstra...| Success| |\n", - "|Amazon Electronic...| Warning| Warning|ComplianceConstra...| Success| |\n", + "|Amazon Electronic...| Warning| Warning|SizeConstraint(Si...| Failure|org.apache.spark....|\n", + "|Amazon Electronic...| Warning| Warning|MinimumConstraint...| Failure|org.apache.spark....|\n", + "|Amazon Electronic...| Warning| Warning|MaximumConstraint...| Failure|org.apache.spark....|\n", + "|Amazon Electronic...| Warning| Warning|CompletenessConst...| Failure|org.apache.spark....|\n", + "|Amazon Electronic...| Warning| Warning|UniquenessConstra...| Failure|Value: 0.98531371...|\n", + "|Amazon Electronic...| Warning| Warning|CompletenessConst...| Failure|org.apache.spark....|\n", + "|Amazon Electronic...| Warning| Warning|ComplianceConstra...| Failure|org.apache.spark....|\n", + "|Amazon Electronic...| Warning| Warning|ComplianceConstra...| Failure|org.apache.spark....|\n", "+--------------------+-----------+------------+--------------------+-----------------+--------------------+\n", "\n" ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/pyspark/sql/dataframe.py:127: UserWarning: DataFrame constructor is internal. Do not directly use it.\n", + " warnings.warn(\"DataFrame constructor is internal. Do not directly use it.\")\n" + ] } ], "source": [ @@ -148,9 +292,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.5" + "version": "3.10.14" } }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +}