Remove extra 'use_ml' from candidate

Samsung · Jan 15, 2025 · e2faf9f
1 parent dc1a700
commit e2faf9f
Show file tree

Hide file tree

Showing 4 changed files with 19 additions and 21 deletions.
diff --git a/credsweeper/app.py b/credsweeper/app.py
@@ -167,7 +167,8 @@ def _use_ml_validation(self) -> bool:
             logger.info("Skip ML validation because no candidates were found")
             return False
         for i in self.credential_manager.candidates:
-            if i.use_ml:
+            # ml_probability is None while the candidate still awaits ML processing
+            if i.ml_probability is None:
                 # any() or all() is not used to speedup
                 return True
         logger.info("Skip ML validation because no candidates support it")
@@ -353,7 +354,7 @@ def post_processing(self) -> None:
             for group_key, group_candidates in cred_groups.items():
                 # Analyze with ML if any candidate in group require ML
                 for candidate in group_candidates:
-                    if candidate.use_ml:
+                    if candidate.ml_probability is None:
                         ml_cred_groups.append((group_key, group_candidates))
                         break
                 else:
@@ -366,7 +367,7 @@ def post_processing(self) -> None:
                 is_cred, probability = self.ml_validator.validate_groups(ml_cred_groups, self.ml_batch_size)
                 for i, (_, group_candidates) in enumerate(ml_cred_groups):
                     for candidate in group_candidates:
-                        if candidate.use_ml:
+                        if candidate.ml_probability is None:
                             if is_cred[i]:
                                 candidate.ml_probability = probability[i]
                                 new_cred_list.append(candidate)

diff --git a/credsweeper/credentials/candidate.py b/credsweeper/credentials/candidate.py
@@ -20,8 +20,7 @@ class Candidate:
         severity: critical/high/medium/low
         confidence: strong/moderate/weak
         config: user configs
-        validations: List of Validation objects that can check this credential using external API
-        use_ml: Should ML work on this credential or not. If not prediction based on regular expression and filter only
+        use_ml: Whether to validate the candidate with ML. If not, ml_probability is set to -1 instead of None
     """
 
     def __init__(self,
@@ -37,16 +36,15 @@ def __init__(self,
         self.rule_name = rule_name
         self.severity = severity
         self.config = config
-        self.use_ml = use_ml
-        self.confidence = confidence
+        # None: ML is applicable but not processed yet; -1: ML is not applicable; [0.0, 1.0]: the ML probability
         self.ml_probability: Union[None, int, float] = None if use_ml else -1
+        self.confidence = confidence
 
     def compare(self, other: 'Candidate') -> bool:
         """Comparison method - checks only result of final cred"""
         if self.rule_name == other.rule_name \
                 and self.severity == other.severity \
                 and self.confidence == other.confidence \
-                and self.use_ml == other.use_ml \
                 and self.ml_probability == other.ml_probability \
                 and len(self.line_data_list) == len(other.line_data_list):
             for i, j in zip(self.line_data_list, other.line_data_list):
@@ -76,8 +74,8 @@ def to_str(self, subtext: bool = False, hashed: bool = False) -> str:
         return f"rule: {self.rule_name}" \
                f" | severity: {self.severity.value}" \
                f" | confidence: {self.confidence.value}" \
-               f" | line_data_list: [{', '.join([x.to_str(subtext, hashed) for x in self.line_data_list])}]" \
-               f" | ml_probability: {self.ml_probability}"
+               f" | ml_probability: {self.ml_probability}" \
+               f" | line_data_list: [{', '.join([x.to_str(subtext, hashed) for x in self.line_data_list])}]"
 
     def __str__(self):
         return self.to_str()
@@ -98,7 +96,6 @@ def to_json(self, hashed: bool, subtext: bool) -> Dict:
             "rule": self.rule_name,
             "severity": self.severity.value,
             "confidence": self.confidence.value,
-            "use_ml": self.use_ml,
             # put the array to end to make json more readable
             "line_data_list": [line_data.to_json(hashed, subtext) for line_data in self.line_data_list],
         }

diff --git a/docs/source/guide.rst b/docs/source/guide.rst
@@ -132,7 +132,7 @@ Get CLI output only:
 
 .. code-block:: text
 
-    rule: Password | severity: medium | confidence: moderate | line_data_list: [line: 'password = "cackle!"' | line_num: 1 | path: tests/samples/password.gradle | value: 'cackle!' | entropy_validation: BASE64STDPAD_CHARS 2.120590 False] | ml_probability: 0.9857242107391357
+    rule: Password | severity: medium | confidence: moderate | ml_probability: 0.9857242107391357 | line_data_list: [line: 'password = "cackle!"' | line_num: 1 | path: tests/samples/password.gradle | value: 'cackle!' | entropy_validation: BASE64STDPAD_CHARS 2.120590 False]
 
 
 Exclude outputs using CLI:
@@ -193,7 +193,7 @@ Minimal example for scanning line list:
 
 .. code-block:: text
 
-    rule: Password | severity: medium | confidence: moderate | line_data_list: [line: 'password = "cackle!"' | line_num: 1 | path:  | value: 'cackle!' | entropy_validation: BASE64STDPAD_CHARS 2.120590 False] | ml_probability: 0.9857242107391357
+    rule: Password | severity: medium | confidence: moderate | ml_probability: 0.9857242107391357 | line_data_list: [line: 'password = "cackle!"' | line_num: 1 | path:  | value: 'cackle!' | entropy_validation: BASE64STDPAD_CHARS 2.120590 False]
 
 Minimal example for scanning bytes:
 
@@ -211,7 +211,7 @@ Minimal example for scanning bytes:
 
 .. code-block:: text
 
-    rule: Password | severity: medium | confidence: moderate | line_data_list: [line: 'password = "cackle!"' | line_num: 2 | path:  | value: 'cackle!' | entropy_validation: BASE64STDPAD_CHARS 2.120590 False] | ml_probability: 0.9857242107391357
+    rule: Password | severity: medium | confidence: moderate | ml_probability: 0.9857242107391357 | line_data_list: [line: 'password = "cackle!"' | line_num: 2 | path:  | value: 'cackle!' | entropy_validation: BASE64STDPAD_CHARS 2.120590 False]
 
 
 Minimal example for the ML validation:
@@ -240,7 +240,7 @@ Note that `"secret='template'"` is not reported due to failing check by the `MlV
 
 .. code-block:: text
 
-    rule: Password | severity: medium | confidence: moderate | line_data_list: [line: 'password = "cackle!"' | line_num: 2 | path:  | value: 'cackle!' | entropy_validation: BASE64STDPAD_CHARS 2.120590 False] | ml_probability: 0.9857242107391357
+    rule: Password | severity: medium | confidence: moderate | ml_probability: 0.9857242107391357 | line_data_list: [line: 'password = "cackle!"' | line_num: 2 | path:  | value: 'cackle!' | entropy_validation: BASE64STDPAD_CHARS 2.120590 False]
 
 Configurations
 --------------

diff --git a/tests/test_app.py b/tests/test_app.py
@@ -52,13 +52,13 @@ def test_it_works_p(self) -> None:
                     rule: UUID
                     | severity: info
                     | confidence: strong
+                    | ml_probability: -1\n
                     | line_data_list:
                         [line: 'bace4d19-fa7e-beef-cafe-9129474bcd81 # tp'
                         | line_num: 1
                         | path: {target_path}
                         | value: 'bace4d19-fa7e-beef-cafe-9129474bcd81'
                         | entropy_validation: BASE36_CHARS 3.237326 True]
-                    | ml_probability: -1\n
                     Detected Credentials: 1\n
                     Time Elapsed:
                     """
@@ -102,23 +102,23 @@ def test_it_works_with_patch_p(self) -> None:
                     rule: UUID
                     | severity: info
                     | confidence: strong
+                    | ml_probability: -1\n
                     | line_data_list:
                     [line: 'bace4d19-fa7e-dead-beef-9129474bcd81'
                         | line_num: 1
                         | path: uuid
                         | value: 'bace4d19-fa7e-dead-beef-9129474bcd81'
                         | entropy_validation: BASE36_CHARS 3.223709 True]
-                    | ml_probability: -1\n
                     rule: UUID
                     | severity: info
                     | confidence: strong
+                    | ml_probability: -1\n
                     | line_data_list:
                     [line: 'bace4d19-fa7e-beef-cafe-9129474bcd81'
                         | line_num: 1
                         | path: uuid
                         | value: 'bace4d19-fa7e-beef-cafe-9129474bcd81'
                         | entropy_validation: BASE36_CHARS 3.237326 True]
-                    | ml_probability: -1\n
                     Added File Credentials: 1\n
                     Deleted File Credentials: 1\n
                     Time Elapsed:
@@ -137,16 +137,17 @@ def test_it_works_with_multiline_in_patch_p(self) -> None:
                     rule: AWS Client ID
                         | severity: high
                         | confidence: moderate
+                        | ml_probability: -1
                         | line_data_list:
                             [line: ' clid = "AKIAQWADE5R42RDZ4JEM"'
                             | line_num: 4
                             | path: creds.py
                             | value: 'AKIAQWADE5R42RDZ4JEM'
                             | entropy_validation: BASE64STDPAD_CHARS 3.684184 False]
-                        | ml_probability: -1
                     rule: AWS Multi
                         | severity: high
                         | confidence: moderate
+                        | ml_probability: -1
                         | line_data_list:
                             [line: ' clid = "AKIAQWADE5R42RDZ4JEM"'
                             | line_num: 4
@@ -158,17 +159,16 @@ def test_it_works_with_multiline_in_patch_p(self) -> None:
                             | path: creds.py
                             | value: 'V84C7sDU001tFFodKU95USNy97TkqXymnvsFmYhQ'
                             | entropy_validation: BASE64STDPAD_CHARS 4.784184 True]
-                        | ml_probability: -1
                     rule: Token
                         | severity: medium
                         | confidence: moderate
+                        | ml_probability: 0.9982267618179321\n
                         | line_data_list:
                             [line: ' token = "V84C7sDU001tFFodKU95USNy97TkqXymnvsFmYhQ"'
                             | line_num: 5
                             | path: creds.py
                             | value: 'V84C7sDU001tFFodKU95USNy97TkqXymnvsFmYhQ'
                             | entropy_validation: BASE64STDPAD_CHARS 4.784184 True]
-                        | ml_probability: 0.9982267618179321\n
                     Added File Credentials: 3\n
                     Deleted File Credentials: 0\n
                     Time Elapsed: