From dd9687adb25f0b8049c1feaf8e31914e8e00fffe Mon Sep 17 00:00:00 2001 From: joke1196 Date: Thu, 30 Jan 2025 14:55:28 +0000 Subject: [PATCH 1/3] Create rule S7195 --- rules/S7195/metadata.json | 2 ++ rules/S7195/python/metadata.json | 25 ++++++++++++++++++ rules/S7195/python/rule.adoc | 44 ++++++++++++++++++++++++++++++++ 3 files changed, 71 insertions(+) create mode 100644 rules/S7195/metadata.json create mode 100644 rules/S7195/python/metadata.json create mode 100644 rules/S7195/python/rule.adoc diff --git a/rules/S7195/metadata.json b/rules/S7195/metadata.json new file mode 100644 index 00000000000..2c63c085104 --- /dev/null +++ b/rules/S7195/metadata.json @@ -0,0 +1,2 @@ +{ +} diff --git a/rules/S7195/python/metadata.json b/rules/S7195/python/metadata.json new file mode 100644 index 00000000000..65025f01d74 --- /dev/null +++ b/rules/S7195/python/metadata.json @@ -0,0 +1,25 @@ +{ + "title": "FIXME", + "type": "CODE_SMELL", + "status": "ready", + "remediation": { + "func": "Constant\/Issue", + "constantCost": "5min" + }, + "tags": [ + ], + "defaultSeverity": "Major", + "ruleSpecification": "RSPEC-7195", + "sqKey": "S7195", + "scope": "All", + "defaultQualityProfiles": ["Sonar way"], + "quickfix": "unknown", + "code": { + "impacts": { + "MAINTAINABILITY": "HIGH", + "RELIABILITY": "MEDIUM", + "SECURITY": "LOW" + }, + "attribute": "CONVENTIONAL" + } +} diff --git a/rules/S7195/python/rule.adoc b/rules/S7195/python/rule.adoc new file mode 100644 index 00000000000..caae0d69054 --- /dev/null +++ b/rules/S7195/python/rule.adoc @@ -0,0 +1,44 @@ +FIXME: add a description + +// If you want to factorize the description uncomment the following line and create the file. +//include::../description.adoc[] + +== Why is this an issue? + +FIXME: remove the unused optional headers (that are commented out) + +//=== What is the potential impact? 
+ +== How to fix it +//== How to fix it in FRAMEWORK NAME + +=== Code examples + +==== Noncompliant code example + +[source,python,diff-id=1,diff-type=noncompliant] +---- +FIXME +---- + +==== Compliant solution + +[source,python,diff-id=1,diff-type=compliant] +---- +FIXME +---- + +//=== How does this work? + +//=== Pitfalls + +//=== Going the extra mile + + +//== Resources +//=== Documentation +//=== Articles & blog posts +//=== Conference presentations +//=== Standards +//=== External coding guidelines +//=== Benchmarks From fbb298d50d37af6ae233fa389d272b76735707e5 Mon Sep 17 00:00:00 2001 From: David Kunzmann Date: Fri, 31 Jan 2025 15:37:08 +0100 Subject: [PATCH 2/3] Create rule S7195: PySpark lit(None) should be used when populating empty columns. --- rules/S7195/python/metadata.json | 9 ++--- rules/S7195/python/rule.adoc | 60 ++++++++++++++++++++++---------- 2 files changed, 46 insertions(+), 23 deletions(-) diff --git a/rules/S7195/python/metadata.json b/rules/S7195/python/metadata.json index 65025f01d74..db85439bfdd 100644 --- a/rules/S7195/python/metadata.json +++ b/rules/S7195/python/metadata.json @@ -1,5 +1,5 @@ { - "title": "FIXME", + "title": "PySpark lit(None) should be used when populating empty columns", "type": "CODE_SMELL", "status": "ready", "remediation": { @@ -7,6 +7,8 @@ "constantCost": "5min" }, "tags": [ + "data-science", + "pyspark" ], "defaultSeverity": "Major", "ruleSpecification": "RSPEC-7195", @@ -16,9 +18,8 @@ "quickfix": "unknown", "code": { "impacts": { - "MAINTAINABILITY": "HIGH", - "RELIABILITY": "MEDIUM", - "SECURITY": "LOW" + "MAINTAINABILITY": "MEDIUM", + "RELIABILITY": "MEDIUM" }, "attribute": "CONVENTIONAL" } diff --git a/rules/S7195/python/rule.adoc b/rules/S7195/python/rule.adoc index caae0d69054..a083b611422 100644 --- a/rules/S7195/python/rule.adoc +++ b/rules/S7195/python/rule.adoc @@ -1,16 +1,16 @@ -FIXME: add a description - -// If you want to factorize the description uncomment the following line and create the file. 
-//include::../description.adoc[] +This rule raises an issue when a column of a PySpark DataFrame is populated with `lit('')`. == Why is this an issue? -FIXME: remove the unused optional headers (that are commented out) +In PySpark, when populating a DataFrame columns with empty or null values, it is recommended to use `lit(None)`. +Using literals such as `lit('')` as a placeholder for absent values can lead to data misinterpretation and inconsistencies. -//=== What is the potential impact? +The usage of `lit(None)` ensures clarity and consistency in the codebase, making it explicit that the column is intentionally populated with null values. +Using `lit(None)` also preserves the ability to use functions such as `isnull` or `isnotnull` to check for null values in the DataFrame. == How to fix it -//== How to fix it in FRAMEWORK NAME + +To fix this issue, replace `lit('')` with `lit(None)` when populating a DataFrame columns with empty/null values. === Code examples @@ -18,27 +18,49 @@ FIXME: remove the unused optional headers (that are commented out) [source,python,diff-id=1,diff-type=noncompliant] ---- -FIXME +from pyspark.sql import SparkSession +from pyspark.sql.functions import lit + +spark = SparkSession.builder.appName("Example").getOrCreate() + +data = [ + (1, "Alice"), + (2, "Bob"), + (3, "Charlie") +] + +df = spark.createDataFrame(data, ["id", "name"]) + +df_with_empty_column = df.withColumn("middle_name", lit('')) # Noncompliant: usage of lit('') to represent an empty value ---- ==== Compliant solution [source,python,diff-id=1,diff-type=compliant] ---- -FIXME +from pyspark.sql import SparkSession +from pyspark.sql.functions import lit + +spark = SparkSession.builder.appName("Example").getOrCreate() + +data = [ + (1, "Alice"), + (2, "Bob"), + (3, "Charlie") +] + +df = spark.createDataFrame(data, ["id", "name"]) + +df_with_empty_column = df.withColumn("middle_name", lit(None)) # Compliant ---- -//=== How does this work? 
+== Resources +=== Documentation -//=== Pitfalls +* PySpark Documentation - https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.lit.html#pyspark-sql-functions-lit[pyspark-sql-functions-lit] +* PySpark Documentation - https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.isnull.html#pyspark-sql-functions-isnull[pyspark-sql-functions-isnull] -//=== Going the extra mile +=== Standards +* Palantir PySpark Style Guide - https://github.com/palantir/pyspark-style-guide?tab=readme-ov-file#empty-columns[empty-columns] -//== Resources -//=== Documentation -//=== Articles & blog posts -//=== Conference presentations -//=== Standards -//=== External coding guidelines -//=== Benchmarks From a819212b4ccf5ea3b2cb597a1ebbf4939a6daae2 Mon Sep 17 00:00:00 2001 From: David Kunzmann Date: Mon, 3 Feb 2025 11:35:18 +0100 Subject: [PATCH 3/3] Fix after release --- rules/S7195/python/metadata.json | 1 - rules/S7195/python/rule.adoc | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/rules/S7195/python/metadata.json b/rules/S7195/python/metadata.json index db85439bfdd..d858e9c6653 100644 --- a/rules/S7195/python/metadata.json +++ b/rules/S7195/python/metadata.json @@ -18,7 +18,6 @@ "quickfix": "unknown", "code": { "impacts": { - "MAINTAINABILITY": "MEDIUM", "RELIABILITY": "MEDIUM" }, "attribute": "CONVENTIONAL" diff --git a/rules/S7195/python/rule.adoc b/rules/S7195/python/rule.adoc index a083b611422..e8edfd9af26 100644 --- a/rules/S7195/python/rule.adoc +++ b/rules/S7195/python/rule.adoc @@ -2,7 +2,7 @@ This rule raises an issue when a column of a PySpark DataFrame is populated with == Why is this an issue? -In PySpark, when populating a DataFrame columns with empty or null values, it is recommended to use `lit(None)`. +In PySpark, when populating a DataFrame column with empty or null values, it is recommended to use `lit(None)`. 
Using literals such as `lit('')` as a placeholder for absent values can lead to data misinterpretation and inconsistencies. The usage of `lit(None)` ensures clarity and consistency in the codebase, making it explicit that the column is intentionally populated with null values. @@ -10,7 +10,7 @@ Using `lit(None)` also preserves the ability to use functions such as `isnull` o == How to fix it -To fix this issue, replace `lit('')` with `lit(None)` when populating a DataFrame columns with empty/null values. +To fix this issue, replace `lit('')` with `lit(None)` when populating a DataFrame column with empty/null values. === Code examples