From 8725d6445fb04aab550818087fd8b2549a6f9448 Mon Sep 17 00:00:00 2001
From: Roman Babenko <babenek@users.noreply.github.com>
Date: Mon, 16 Dec 2024 12:19:37 +0200
Subject: [PATCH] Add &quot; and &apos; cases for keyword pattern (#635)

* update the GitHub Action test workflow

* fix keyword pattern to handle HTML-escaped quotes (&quot; and &apos;)

* add a right-boundary check to the MailChimp API Key pattern
---
 .github/workflows/action.yml          |  5 +-
 credsweeper/common/keyword_pattern.py |  7 +--
 credsweeper/rules/config.yaml         |  2 +-
 tests/__init__.py                     |  6 +--
 tests/data/depth_3.json               | 78 +++++++++++++++++++++++++++
 tests/data/doc.json                   | 52 ++++++++++++++++++
 tests/data/ml_threshold.json          | 78 +++++++++++++++++++++++++++
 tests/data/output.json                | 78 +++++++++++++++++++++++++++
 tests/samples/sample.html             | 24 +++++++++
 9 files changed, 320 insertions(+), 10 deletions(-)
 create mode 100644 tests/samples/sample.html
diff --git a/.github/workflows/action.yml b/.github/workflows/action.yml
index e3ea37420..581c63ba7 100644
--- a/.github/workflows/action.yml
+++ b/.github/workflows/action.yml
@@ -26,9 +26,8 @@ jobs:
       with:
         ref: ${{ github.event.pull_request.head.sha }}
 
-    - name: CredSweeper test
-      # TODO: change @action to release point with the action
-      uses: Samsung/CredSweeper@8682ea7d68bf3dfe96e2ea1fad3f04b9b167993b # main
+    - name: CredSweeper action
+      uses: Samsung/CredSweeper@v1.9.6 # may be changed to any tag
       with:
         # args - arguments to credsweeper tool. See default values in action.yml
         args: --path ./tests/samples/ --save-json
diff --git a/credsweeper/common/keyword_pattern.py b/credsweeper/common/keyword_pattern.py
index 4c37d3b1f..3927606c0 100644
--- a/credsweeper/common/keyword_pattern.py
+++ b/credsweeper/common/keyword_pattern.py
@@ -8,7 +8,7 @@ class KeywordPattern:
                r"(?P<keyword>"
     # there will be inserted a keyword
     key_right = r")" \
-                r"[^%:='\"`<>{?!&]*)[`'\"]*)"  # <variable>
+                r"(&(quot|apos);|[^%:='\"`<>{?!&]*)[`'\"]*))"  # <variable>
     separator = r"(\s|\\+[tnr])*\]?(\s|\\+[tnr])*" \
                 r"(?P<separator>:( [a-z]{3,9}[?]? )?=|:|=(>|&gt;|\\u0026gt;)|!=|===|==|=|%3d)" \
                 r"(\s|\\+[tnr])*"
@@ -21,15 +21,16 @@ class KeywordPattern:
            r"([0-9a-z_]{1,32}=)?" \
            r")+)?"
     string_prefix = r"(((b|r|br|rb|u|f|rf|fr|l|@)(?=(\\*[`'\"])))?"
-    left_quote = r"(?P<value_leftquote>((?P<esq>\\{1,8})?[`'\"]){1,4}))?"
+    left_quote = r"(?P<value_leftquote>((?P<esq>\\{1,8})?([`'\"]|&(quot|apos);)){1,4}))?"
     # Authentication scheme ( oauth | basic | bearer | apikey ) precedes to credential
     auth_keywords = r"( ?(oauth|bot|basic|bearer|apikey|accesskey) )?"
     value = r"(?P<value>" \
             r"(?(value_leftquote)" \
             r"(" \
             r"(?!(?P=value_leftquote))" \
-            r"(?(esq)((?!(?P=esq)['`\"]).)|((?!(?P=value_leftquote)).)))" \
+            r"(?(esq)((?!(?P=esq)([`'\"]|&(quot|apos);)).)|((?!(?P=value_leftquote)).)))" \
             r"|" \
+            r"(?!&(quot|apos);)" \
             r"(\\+([ tnr]|[^\s`'\"])|[^\s`'\",;\\])" \
             r"){3,8000}" \
             r"|(\{[^}]{3,8000}\})" \
diff --git a/credsweeper/rules/config.yaml b/credsweeper/rules/config.yaml
index 7bcdc8c0f..6baf16d67 100644
--- a/credsweeper/rules/config.yaml
+++ b/credsweeper/rules/config.yaml
@@ -368,7 +368,7 @@
   confidence: moderate
   type: pattern
   values:
-    - (?:(?<![0-9A-Za-z_-])|\\[0abfnrtv]|(%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu]([0-9A-Fa-f]{4}){1,2}|\x1B\[[0-9;]{0,80}m)(?P<value>[0-9A-Za-z_-]{32}-us[0-9]{1,2})
+    - (?:(?<![0-9A-Za-z_-])|\\[0abfnrtv]|(%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu]([0-9A-Fa-f]{4}){1,2}|\x1B\[[0-9;]{0,80}m)(?P<value>[0-9A-Za-z_-]{32}-us[0-9]{1,2})(?![0-9A-Za-z_-])
   filter_type: GeneralPattern
   required_substrings:
     - -us
diff --git a/tests/__init__.py b/tests/__init__.py
index 99d18fcaf..4f1fde9c2 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -1,13 +1,13 @@
 from pathlib import Path
 
 # total number of files in test samples
-SAMPLES_FILES_COUNT = 138
+SAMPLES_FILES_COUNT = 139
 
 # the lowest value of ML threshold is used to display possible lowest values
 NEGLIGIBLE_ML_THRESHOLD = 0.0001
 
 # credentials count after scan with negligible ML threshold
-SAMPLES_CRED_COUNT = 425
+SAMPLES_CRED_COUNT = 428
 SAMPLES_CRED_LINE_COUNT = SAMPLES_CRED_COUNT + 19
 
 # Number of filtered credentials with ML
@@ -17,7 +17,7 @@
 SAMPLES_POST_CRED_COUNT = SAMPLES_CRED_COUNT - ML_FILTERED
 
 # with option --doc
-SAMPLES_IN_DOC = 468
+SAMPLES_IN_DOC = 470
 
 # archived credentials that are not found without --depth
 SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 33
diff --git a/tests/data/depth_3.json b/tests/data/depth_3.json
index e10acceed..ab5b0d0e4 100644
--- a/tests/data/depth_3.json
+++ b/tests/data/depth_3.json
@@ -10687,6 +10687,84 @@
             }
         ]
     },
+    {
+        "ml_validation": "VALIDATED_KEY",
+        "ml_probability": 0.96,
+        "rule": "Token",
+        "severity": "medium",
+        "confidence": "moderate",
+        "line_data_list": [
+            {
+                "line": "<a href=\"http://example.com?token=&quot;g1re0g1T0keN3zWx&quot;\">TokenRequest</a>",
+                "line_num": 9,
+                "path": "./tests/samples/sample.html",
+                "info": "./tests/samples/sample.html|RAW",
+                "value": "g1re0g1T0keN3zWx",
+                "value_start": 40,
+                "value_end": 56,
+                "variable": "token",
+                "variable_start": 28,
+                "variable_end": 33,
+                "entropy_validation": {
+                    "iterator": "BASE64_CHARS",
+                    "entropy": 3.5,
+                    "valid": false
+                }
+            }
+        ]
+    },
+    {
+        "ml_validation": "VALIDATED_KEY",
+        "ml_probability": 0.999,
+        "rule": "Password",
+        "severity": "medium",
+        "confidence": "moderate",
+        "line_data_list": [
+            {
+                "line": "    placeholder=\"Your password: &quot;g1re0g1Pa5$w0Rd&quot;\"",
+                "line_num": 16,
+                "path": "./tests/samples/sample.html",
+                "info": "./tests/samples/sample.html|RAW",
+                "value": "g1re0g1Pa5$w0Rd",
+                "value_start": 38,
+                "value_end": 53,
+                "variable": "Your password",
+                "variable_start": 17,
+                "variable_end": 30,
+                "entropy_validation": {
+                    "iterator": "BASE64_CHARS",
+                    "entropy": 3.246431222567951,
+                    "valid": false
+                }
+            }
+        ]
+    },
+    {
+        "ml_validation": "VALIDATED_KEY",
+        "ml_probability": 0.998,
+        "rule": "Password",
+        "severity": "medium",
+        "confidence": "moderate",
+        "line_data_list": [
+            {
+                "line": " <button onclick=\"alert(&quot;password:g1re0g2Pa5$w0Rd&quot;)\">ShowPass</button>",
+                "line_num": 21,
+                "path": "./tests/samples/sample.html",
+                "info": "./tests/samples/sample.html|RAW",
+                "value": "g1re0g2Pa5$w0Rd",
+                "value_start": 39,
+                "value_end": 54,
+                "variable": "password",
+                "variable_start": 30,
+                "variable_end": 38,
+                "entropy_validation": {
+                    "iterator": "BASE64_CHARS",
+                    "entropy": 3.379764555901284,
+                    "valid": false
+                }
+            }
+        ]
+    },
     {
         "ml_validation": "NOT_AVAILABLE",
         "ml_probability": null,
diff --git a/tests/data/doc.json b/tests/data/doc.json
index b242989ba..47dd349fe 100644
--- a/tests/data/doc.json
+++ b/tests/data/doc.json
@@ -12874,6 +12874,58 @@
             }
         ]
     },
+    {
+        "ml_validation": "NOT_AVAILABLE",
+        "ml_probability": null,
+        "rule": "PASSWD_PAIR",
+        "severity": "medium",
+        "confidence": "moderate",
+        "line_data_list": [
+            {
+                "line": "placeholder=\"Your password: &quot;g1re0g1Pa5$w0Rd&quot;\"",
+                "line_num": 16,
+                "path": "./tests/samples/sample.html",
+                "info": "./tests/samples/sample.html|RAW",
+                "value": "&quot;g1re0g1Pa5$w0Rd&quot;\"",
+                "value_start": 32,
+                "value_end": 60,
+                "variable": "password",
+                "variable_start": 22,
+                "variable_end": 30,
+                "entropy_validation": {
+                    "iterator": "BASE64_CHARS",
+                    "entropy": 3.2772074387595462,
+                    "valid": false
+                }
+            }
+        ]
+    },
+    {
+        "ml_validation": "NOT_AVAILABLE",
+        "ml_probability": null,
+        "rule": "PASSWD_PAIR",
+        "severity": "medium",
+        "confidence": "moderate",
+        "line_data_list": [
+            {
+                "line": "<button onclick=\"alert(&quot;password:g1re0g2Pa5$w0Rd&quot;)\">ShowPass</button>",
+                "line_num": 21,
+                "path": "./tests/samples/sample.html",
+                "info": "./tests/samples/sample.html|RAW",
+                "value": "g1re0g2Pa5$w0Rd&quot;",
+                "value_start": 39,
+                "value_end": 60,
+                "variable": "password",
+                "variable_start": 30,
+                "variable_end": 38,
+                "entropy_validation": {
+                    "iterator": "BASE36_CHARS",
+                    "entropy": 3.1560513697361983,
+                    "valid": true
+                }
+            }
+        ]
+    },
     {
         "ml_validation": "NOT_AVAILABLE",
         "ml_probability": null,
diff --git a/tests/data/ml_threshold.json b/tests/data/ml_threshold.json
index b7cdb268d..69653ba74 100644
--- a/tests/data/ml_threshold.json
+++ b/tests/data/ml_threshold.json
@@ -10280,6 +10280,84 @@
             }
         ]
     },
+    {
+        "ml_validation": "VALIDATED_KEY",
+        "ml_probability": 0.96,
+        "rule": "Token",
+        "severity": "medium",
+        "confidence": "moderate",
+        "line_data_list": [
+            {
+                "line": "<a href=\"http://example.com?token=&quot;g1re0g1T0keN3zWx&quot;\">TokenRequest</a>",
+                "line_num": 9,
+                "path": "./tests/samples/sample.html",
+                "info": "",
+                "value": "g1re0g1T0keN3zWx",
+                "value_start": 40,
+                "value_end": 56,
+                "variable": "token",
+                "variable_start": 28,
+                "variable_end": 33,
+                "entropy_validation": {
+                    "iterator": "BASE64_CHARS",
+                    "entropy": 3.5,
+                    "valid": false
+                }
+            }
+        ]
+    },
+    {
+        "ml_validation": "VALIDATED_KEY",
+        "ml_probability": 0.999,
+        "rule": "Password",
+        "severity": "medium",
+        "confidence": "moderate",
+        "line_data_list": [
+            {
+                "line": "    placeholder=\"Your password: &quot;g1re0g1Pa5$w0Rd&quot;\"",
+                "line_num": 16,
+                "path": "./tests/samples/sample.html",
+                "info": "",
+                "value": "g1re0g1Pa5$w0Rd",
+                "value_start": 38,
+                "value_end": 53,
+                "variable": "Your password",
+                "variable_start": 17,
+                "variable_end": 30,
+                "entropy_validation": {
+                    "iterator": "BASE64_CHARS",
+                    "entropy": 3.246431222567951,
+                    "valid": false
+                }
+            }
+        ]
+    },
+    {
+        "ml_validation": "VALIDATED_KEY",
+        "ml_probability": 0.998,
+        "rule": "Password",
+        "severity": "medium",
+        "confidence": "moderate",
+        "line_data_list": [
+            {
+                "line": " <button onclick=\"alert(&quot;password:g1re0g2Pa5$w0Rd&quot;)\">ShowPass</button>",
+                "line_num": 21,
+                "path": "./tests/samples/sample.html",
+                "info": "",
+                "value": "g1re0g2Pa5$w0Rd",
+                "value_start": 39,
+                "value_end": 54,
+                "variable": "password",
+                "variable_start": 30,
+                "variable_end": 38,
+                "entropy_validation": {
+                    "iterator": "BASE64_CHARS",
+                    "entropy": 3.379764555901284,
+                    "valid": false
+                }
+            }
+        ]
+    },
     {
         "ml_validation": "VALIDATED_KEY",
         "ml_probability": 0.672,
diff --git a/tests/data/output.json b/tests/data/output.json
index 55c5797eb..8135c2645 100644
--- a/tests/data/output.json
+++ b/tests/data/output.json
@@ -9188,6 +9188,84 @@
             }
         ]
     },
+    {
+        "ml_validation": "VALIDATED_KEY",
+        "ml_probability": 0.96,
+        "rule": "Token",
+        "severity": "medium",
+        "confidence": "moderate",
+        "line_data_list": [
+            {
+                "line": "<a href=\"http://example.com?token=&quot;g1re0g1T0keN3zWx&quot;\">TokenRequest</a>",
+                "line_num": 9,
+                "path": "./tests/samples/sample.html",
+                "info": "",
+                "value": "g1re0g1T0keN3zWx",
+                "value_start": 40,
+                "value_end": 56,
+                "variable": "token",
+                "variable_start": 28,
+                "variable_end": 33,
+                "entropy_validation": {
+                    "iterator": "BASE64_CHARS",
+                    "entropy": 3.5,
+                    "valid": false
+                }
+            }
+        ]
+    },
+    {
+        "ml_validation": "VALIDATED_KEY",
+        "ml_probability": 0.999,
+        "rule": "Password",
+        "severity": "medium",
+        "confidence": "moderate",
+        "line_data_list": [
+            {
+                "line": "    placeholder=\"Your password: &quot;g1re0g1Pa5$w0Rd&quot;\"",
+                "line_num": 16,
+                "path": "./tests/samples/sample.html",
+                "info": "",
+                "value": "g1re0g1Pa5$w0Rd",
+                "value_start": 38,
+                "value_end": 53,
+                "variable": "Your password",
+                "variable_start": 17,
+                "variable_end": 30,
+                "entropy_validation": {
+                    "iterator": "BASE64_CHARS",
+                    "entropy": 3.246431222567951,
+                    "valid": false
+                }
+            }
+        ]
+    },
+    {
+        "ml_validation": "VALIDATED_KEY",
+        "ml_probability": 0.998,
+        "rule": "Password",
+        "severity": "medium",
+        "confidence": "moderate",
+        "line_data_list": [
+            {
+                "line": " <button onclick=\"alert(&quot;password:g1re0g2Pa5$w0Rd&quot;)\">ShowPass</button>",
+                "line_num": 21,
+                "path": "./tests/samples/sample.html",
+                "info": "",
+                "value": "g1re0g2Pa5$w0Rd",
+                "value_start": 39,
+                "value_end": 54,
+                "variable": "password",
+                "variable_start": 30,
+                "variable_end": 38,
+                "entropy_validation": {
+                    "iterator": "BASE64_CHARS",
+                    "entropy": 3.379764555901284,
+                    "valid": false
+                }
+            }
+        ]
+    },
     {
         "ml_validation": "VALIDATED_KEY",
         "ml_probability": 0.672,
diff --git a/tests/samples/sample.html b/tests/samples/sample.html
new file mode 100644
index 000000000..8e5051c3a
--- /dev/null
+++ b/tests/samples/sample.html
@@ -0,0 +1,24 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <title>Title</title>
+</head>
+<body>
+
+<a href="http://example.com?token=&quot;g1re0g1T0keN3zWx&quot;">TokenRequest</a>
+<form>
+  <label for="password">Password:</label>
+  <input
+    type="password"
+    id="password"
+    name="password"
+    placeholder="Your password: &quot;g1re0g1Pa5$w0Rd&quot;"
+  />
+  <button type="submit">Login</button>
+</form>
+
+ <button onclick="alert(&quot;password:g1re0g2Pa5$w0Rd&quot;)">ShowPass</button>
+
+</body>
+</html>
\ No newline at end of file