From dd9687adb25f0b8049c1feaf8e31914e8e00fffe Mon Sep 17 00:00:00 2001
From: joke1196 <joke1196@users.noreply.github.com>
Date: Thu, 30 Jan 2025 14:55:28 +0000
Subject: [PATCH 1/3] Create rule S7195

---
 rules/S7195/metadata.json        |  2 ++
 rules/S7195/python/metadata.json | 25 ++++++++++++++++++
 rules/S7195/python/rule.adoc     | 44 ++++++++++++++++++++++++++++++++
 3 files changed, 71 insertions(+)
 create mode 100644 rules/S7195/metadata.json
 create mode 100644 rules/S7195/python/metadata.json
 create mode 100644 rules/S7195/python/rule.adoc

diff --git a/rules/S7195/metadata.json b/rules/S7195/metadata.json
new file mode 100644
index 00000000000..2c63c085104
--- /dev/null
+++ b/rules/S7195/metadata.json
@@ -0,0 +1,2 @@
+{
+}
diff --git a/rules/S7195/python/metadata.json b/rules/S7195/python/metadata.json
new file mode 100644
index 00000000000..65025f01d74
--- /dev/null
+++ b/rules/S7195/python/metadata.json
@@ -0,0 +1,25 @@
+{
+  "title": "FIXME",
+  "type": "CODE_SMELL",
+  "status": "ready",
+  "remediation": {
+    "func": "Constant\/Issue",
+    "constantCost": "5min"
+  },
+  "tags": [
+  ],
+  "defaultSeverity": "Major",
+  "ruleSpecification": "RSPEC-7195",
+  "sqKey": "S7195",
+  "scope": "All",
+  "defaultQualityProfiles": ["Sonar way"],
+  "quickfix": "unknown",
+  "code": {
+    "impacts": {
+      "MAINTAINABILITY": "HIGH",
+      "RELIABILITY": "MEDIUM",
+      "SECURITY": "LOW"
+    },
+    "attribute": "CONVENTIONAL"
+  }
+}
diff --git a/rules/S7195/python/rule.adoc b/rules/S7195/python/rule.adoc
new file mode 100644
index 00000000000..caae0d69054
--- /dev/null
+++ b/rules/S7195/python/rule.adoc
@@ -0,0 +1,44 @@
+FIXME: add a description
+
+// If you want to factorize the description uncomment the following line and create the file.
+//include::../description.adoc[]
+
+== Why is this an issue?
+
+FIXME: remove the unused optional headers (that are commented out)
+
+//=== What is the potential impact?
+
+== How to fix it
+//== How to fix it in FRAMEWORK NAME
+
+=== Code examples
+
+==== Noncompliant code example
+
+[source,python,diff-id=1,diff-type=noncompliant]
+----
+FIXME
+----
+
+==== Compliant solution
+
+[source,python,diff-id=1,diff-type=compliant]
+----
+FIXME
+----
+
+//=== How does this work?
+
+//=== Pitfalls
+
+//=== Going the extra mile
+
+
+//== Resources
+//=== Documentation
+//=== Articles & blog posts
+//=== Conference presentations
+//=== Standards
+//=== External coding guidelines
+//=== Benchmarks

From fbb298d50d37af6ae233fa389d272b76735707e5 Mon Sep 17 00:00:00 2001
From: David Kunzmann <david.kunzmann@sonarsource.com>
Date: Fri, 31 Jan 2025 15:37:08 +0100
Subject: [PATCH 2/3] Create rule S7195: PySpark lit(None) should be used when
 populating empty columns.

---
 rules/S7195/python/metadata.json |  9 ++---
 rules/S7195/python/rule.adoc     | 60 ++++++++++++++++++++++----------
 2 files changed, 46 insertions(+), 23 deletions(-)

diff --git a/rules/S7195/python/metadata.json b/rules/S7195/python/metadata.json
index 65025f01d74..db85439bfdd 100644
--- a/rules/S7195/python/metadata.json
+++ b/rules/S7195/python/metadata.json
@@ -1,5 +1,5 @@
 {
-  "title": "FIXME",
+  "title": "PySpark lit(None) should be used when populating empty columns",
   "type": "CODE_SMELL",
   "status": "ready",
   "remediation": {
@@ -7,6 +7,8 @@
     "constantCost": "5min"
   },
   "tags": [
+    "data-science",
+    "pyspark"
   ],
   "defaultSeverity": "Major",
   "ruleSpecification": "RSPEC-7195",
@@ -16,9 +18,8 @@
   "quickfix": "unknown",
   "code": {
     "impacts": {
-      "MAINTAINABILITY": "HIGH",
-      "RELIABILITY": "MEDIUM",
-      "SECURITY": "LOW"
+      "MAINTAINABILITY": "MEDIUM",
+      "RELIABILITY": "MEDIUM"
     },
     "attribute": "CONVENTIONAL"
   }
diff --git a/rules/S7195/python/rule.adoc b/rules/S7195/python/rule.adoc
index caae0d69054..a083b611422 100644
--- a/rules/S7195/python/rule.adoc
+++ b/rules/S7195/python/rule.adoc
@@ -1,16 +1,16 @@
-FIXME: add a description
-
-// If you want to factorize the description uncomment the following line and create the file.
-//include::../description.adoc[]
+This rule raises an issue when a column of a PySpark DataFrame is populated with `lit('')`.
 
 == Why is this an issue?
 
-FIXME: remove the unused optional headers (that are commented out)
+In PySpark, when populating a DataFrame columns with empty or null values, it is recommended to use `lit(None)`. 
+Using literals such as `lit('')` as a placeholder for absent values can lead to data misinterpretation and inconsistencies.
 
-//=== What is the potential impact?
+The usage of `lit(None)` ensures clarity and consistency in the codebase, making it explicit that the column is intentionally populated with null values.
+Using `lit(None)` also preserves the ability to use functions such as `isnull` or `isnotnull` to check for null values in the DataFrame.
 
 == How to fix it
-//== How to fix it in FRAMEWORK NAME
+
+To fix this issue, replace `lit('')` with `lit(None)` when populating a DataFrame columns with empty/null values.
 
 === Code examples
 
@@ -18,27 +18,49 @@ FIXME: remove the unused optional headers (that are commented out)
 
 [source,python,diff-id=1,diff-type=noncompliant]
 ----
-FIXME
+from pyspark.sql import SparkSession
+from pyspark.sql.functions import lit
+
+spark = SparkSession.builder.appName("Example").getOrCreate()
+
+data = [
+    (1, "Alice"),
+    (2, "Bob"),
+    (3, "Charlie")
+]
+
+df = spark.createDataFrame(data, ["id", "name"])
+
+df_with_empty_column = df.withColumn("middle_name", lit('')) # Noncompliant: usage of lit('') to represent an empty value
 ----
 
 ==== Compliant solution
 
 [source,python,diff-id=1,diff-type=compliant]
 ----
-FIXME
+from pyspark.sql import SparkSession
+from pyspark.sql.functions import lit
+
+spark = SparkSession.builder.appName("Example").getOrCreate()
+
+data = [
+    (1, "Alice"),
+    (2, "Bob"),
+    (3, "Charlie")
+]
+
+df = spark.createDataFrame(data, ["id", "name"])
+
+df_with_empty_column = df.withColumn("middle_name", lit(None)) # Compliant
 ----
 
-//=== How does this work?
+== Resources
+=== Documentation
 
-//=== Pitfalls
+* PySpark Documentation - https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.lit.html#pyspark-sql-functions-lit[pyspark-sql-functions-lit]
+* PySpark Documentation - https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.isnull.html#pyspark-sql-functions-isnull[pyspark-sql-functions-isnull]
 
-//=== Going the extra mile
+=== Standards
 
+* Palantir PySpark Style Guide - https://github.com/palantir/pyspark-style-guide?tab=readme-ov-file#empty-columns[empty-columns]
 
-//== Resources
-//=== Documentation
-//=== Articles & blog posts
-//=== Conference presentations
-//=== Standards
-//=== External coding guidelines
-//=== Benchmarks

From a819212b4ccf5ea3b2cb597a1ebbf4939a6daae2 Mon Sep 17 00:00:00 2001
From: David Kunzmann <david.kunzmann@sonarsource.com>
Date: Mon, 3 Feb 2025 11:35:18 +0100
Subject: [PATCH 3/3] Fix after release

---
 rules/S7195/python/metadata.json | 1 -
 rules/S7195/python/rule.adoc     | 4 ++--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/rules/S7195/python/metadata.json b/rules/S7195/python/metadata.json
index db85439bfdd..d858e9c6653 100644
--- a/rules/S7195/python/metadata.json
+++ b/rules/S7195/python/metadata.json
@@ -18,7 +18,6 @@
   "quickfix": "unknown",
   "code": {
     "impacts": {
-      "MAINTAINABILITY": "MEDIUM",
       "RELIABILITY": "MEDIUM"
     },
     "attribute": "CONVENTIONAL"
diff --git a/rules/S7195/python/rule.adoc b/rules/S7195/python/rule.adoc
index a083b611422..e8edfd9af26 100644
--- a/rules/S7195/python/rule.adoc
+++ b/rules/S7195/python/rule.adoc
@@ -2,7 +2,7 @@ This rule raises an issue when a column of a PySpark DataFrame is populated with
 
 == Why is this an issue?
 
-In PySpark, when populating a DataFrame columns with empty or null values, it is recommended to use `lit(None)`. 
+In PySpark, when populating a DataFrame column with empty or null values, it is recommended to use `lit(None)`. 
 Using literals such as `lit('')` as a placeholder for absent values can lead to data misinterpretation and inconsistencies.
 
 The usage of `lit(None)` ensures clarity and consistency in the codebase, making it explicit that the column is intentionally populated with null values.
@@ -10,7 +10,7 @@ Using `lit(None)` also preserves the ability to use functions such as `isnull` o
 
 == How to fix it
 
-To fix this issue, replace `lit('')` with `lit(None)` when populating a DataFrame columns with empty/null values.
+To fix this issue, replace `lit('')` with `lit(None)` when populating a DataFrame column with empty/null values.
 
 === Code examples