Skip to content

Commit

Permalink
Remove extra 'use_ml' from candidate
Browse files Browse the repository at this point in the history
  • Loading branch information
babenek committed Jan 15, 2025
1 parent dc1a700 commit e2faf9f
Show file tree
Hide file tree
Showing 4 changed files with 19 additions and 21 deletions.
7 changes: 4 additions & 3 deletions credsweeper/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,8 @@ def _use_ml_validation(self) -> bool:
logger.info("Skip ML validation because no candidates were found")
return False
for i in self.credential_manager.candidates:
if i.use_ml:
# None value means ml should be processed for the candidate
if i.ml_probability is None:
# any() / all() are deliberately not used, to speed up the check
return True
logger.info("Skip ML validation because no candidates support it")
Expand Down Expand Up @@ -353,7 +354,7 @@ def post_processing(self) -> None:
for group_key, group_candidates in cred_groups.items():
# Analyze with ML if any candidate in the group requires ML
for candidate in group_candidates:
if candidate.use_ml:
if candidate.ml_probability is None:
ml_cred_groups.append((group_key, group_candidates))
break
else:
Expand All @@ -366,7 +367,7 @@ def post_processing(self) -> None:
is_cred, probability = self.ml_validator.validate_groups(ml_cred_groups, self.ml_batch_size)
for i, (_, group_candidates) in enumerate(ml_cred_groups):
for candidate in group_candidates:
if candidate.use_ml:
if candidate.ml_probability is None:
if is_cred[i]:
candidate.ml_probability = probability[i]
new_cred_list.append(candidate)
Expand Down
13 changes: 5 additions & 8 deletions credsweeper/credentials/candidate.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,7 @@ class Candidate:
severity: critical/high/medium/low
confidence: strong/moderate/weak
config: user configs
validations: List of Validation objects that can check this credential using external API
use_ml: Should ML work on this credential or not. If not prediction based on regular expression and filter only
use_ml: Whether the candidate should be validated with ML. If not - ml_probability is set to -1
"""

def __init__(self,
Expand All @@ -37,16 +36,15 @@ def __init__(self,
self.rule_name = rule_name
self.severity = severity
self.config = config
self.use_ml = use_ml
self.confidence = confidence
# None - ML is applicable but not processed yet; -1 - ML is not applicable; [0.0, 1.0] - the ml decision
self.ml_probability: Union[None, int, float] = None if use_ml else -1
self.confidence = confidence

def compare(self, other: 'Candidate') -> bool:
"""Comparison method - checks only result of final cred"""
if self.rule_name == other.rule_name \
and self.severity == other.severity \
and self.confidence == other.confidence \
and self.use_ml == other.use_ml \
and self.ml_probability == other.ml_probability \
and len(self.line_data_list) == len(other.line_data_list):
for i, j in zip(self.line_data_list, other.line_data_list):
Expand Down Expand Up @@ -76,8 +74,8 @@ def to_str(self, subtext: bool = False, hashed: bool = False) -> str:
return f"rule: {self.rule_name}" \
f" | severity: {self.severity.value}" \
f" | confidence: {self.confidence.value}" \
f" | line_data_list: [{', '.join([x.to_str(subtext, hashed) for x in self.line_data_list])}]" \
f" | ml_probability: {self.ml_probability}"
f" | ml_probability: {self.ml_probability}" \
f" | line_data_list: [{', '.join([x.to_str(subtext, hashed) for x in self.line_data_list])}]"

def __str__(self):
return self.to_str()
Expand All @@ -98,7 +96,6 @@ def to_json(self, hashed: bool, subtext: bool) -> Dict:
"rule": self.rule_name,
"severity": self.severity.value,
"confidence": self.confidence.value,
"use_ml": self.use_ml,
# put the array to end to make json more readable
"line_data_list": [line_data.to_json(hashed, subtext) for line_data in self.line_data_list],
}
Expand Down
8 changes: 4 additions & 4 deletions docs/source/guide.rst
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ Get CLI output only:
.. code-block:: text
rule: Password | severity: medium | confidence: moderate | line_data_list: [line: 'password = "cackle!"' | line_num: 1 | path: tests/samples/password.gradle | value: 'cackle!' | entropy_validation: BASE64STDPAD_CHARS 2.120590 False] | ml_probability: 0.9857242107391357
rule: Password | severity: medium | confidence: moderate | ml_probability: 0.9857242107391357 | line_data_list: [line: 'password = "cackle!"' | line_num: 1 | path: tests/samples/password.gradle | value: 'cackle!' | entropy_validation: BASE64STDPAD_CHARS 2.120590 False]
Exclude outputs using CLI:
Expand Down Expand Up @@ -193,7 +193,7 @@ Minimal example for scanning line list:
.. code-block:: text
rule: Password | severity: medium | confidence: moderate | line_data_list: [line: 'password = "cackle!"' | line_num: 1 | path: | value: 'cackle!' | entropy_validation: BASE64STDPAD_CHARS 2.120590 False] | ml_probability: 0.9857242107391357
rule: Password | severity: medium | confidence: moderate | ml_probability: 0.9857242107391357 | line_data_list: [line: 'password = "cackle!"' | line_num: 1 | path: | value: 'cackle!' | entropy_validation: BASE64STDPAD_CHARS 2.120590 False]
Minimal example for scanning bytes:

Expand All @@ -211,7 +211,7 @@ Minimal example for scanning bytes:
.. code-block:: text
rule: Password | severity: medium | confidence: moderate | line_data_list: [line: 'password = "cackle!"' | line_num: 2 | path: | value: 'cackle!' | entropy_validation: BASE64STDPAD_CHARS 2.120590 False] | ml_probability: 0.9857242107391357
rule: Password | severity: medium | confidence: moderate | ml_probability: 0.9857242107391357 | line_data_list: [line: 'password = "cackle!"' | line_num: 2 | path: | value: 'cackle!' | entropy_validation: BASE64STDPAD_CHARS 2.120590 False]
Minimal example for the ML validation:
Expand Down Expand Up @@ -240,7 +240,7 @@ Note that `"secret='template'"` is not reported due to failing check by the `MlV

.. code-block:: text
rule: Password | severity: medium | confidence: moderate | line_data_list: [line: 'password = "cackle!"' | line_num: 2 | path: | value: 'cackle!' | entropy_validation: BASE64STDPAD_CHARS 2.120590 False] | ml_probability: 0.9857242107391357
rule: Password | severity: medium | confidence: moderate | ml_probability: 0.9857242107391357 | line_data_list: [line: 'password = "cackle!"' | line_num: 2 | path: | value: 'cackle!' | entropy_validation: BASE64STDPAD_CHARS 2.120590 False]
Configurations
--------------
Expand Down
12 changes: 6 additions & 6 deletions tests/test_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,13 +52,13 @@ def test_it_works_p(self) -> None:
rule: UUID
| severity: info
| confidence: strong
| ml_probability: -1\n
| line_data_list:
[line: 'bace4d19-fa7e-beef-cafe-9129474bcd81 # tp'
| line_num: 1
| path: {target_path}
| value: 'bace4d19-fa7e-beef-cafe-9129474bcd81'
| entropy_validation: BASE36_CHARS 3.237326 True]
| ml_probability: -1\n
Detected Credentials: 1\n
Time Elapsed:
"""
Expand Down Expand Up @@ -102,23 +102,23 @@ def test_it_works_with_patch_p(self) -> None:
rule: UUID
| severity: info
| confidence: strong
| ml_probability: -1\n
| line_data_list:
[line: 'bace4d19-fa7e-dead-beef-9129474bcd81'
| line_num: 1
| path: uuid
| value: 'bace4d19-fa7e-dead-beef-9129474bcd81'
| entropy_validation: BASE36_CHARS 3.223709 True]
| ml_probability: -1\n
rule: UUID
| severity: info
| confidence: strong
| ml_probability: -1\n
| line_data_list:
[line: 'bace4d19-fa7e-beef-cafe-9129474bcd81'
| line_num: 1
| path: uuid
| value: 'bace4d19-fa7e-beef-cafe-9129474bcd81'
| entropy_validation: BASE36_CHARS 3.237326 True]
| ml_probability: -1\n
Added File Credentials: 1\n
Deleted File Credentials: 1\n
Time Elapsed:
Expand All @@ -137,16 +137,17 @@ def test_it_works_with_multiline_in_patch_p(self) -> None:
rule: AWS Client ID
| severity: high
| confidence: moderate
| ml_probability: -1
| line_data_list:
[line: ' clid = "AKIAQWADE5R42RDZ4JEM"'
| line_num: 4
| path: creds.py
| value: 'AKIAQWADE5R42RDZ4JEM'
| entropy_validation: BASE64STDPAD_CHARS 3.684184 False]
| ml_probability: -1
rule: AWS Multi
| severity: high
| confidence: moderate
| ml_probability: -1
| line_data_list:
[line: ' clid = "AKIAQWADE5R42RDZ4JEM"'
| line_num: 4
Expand All @@ -158,17 +159,16 @@ def test_it_works_with_multiline_in_patch_p(self) -> None:
| path: creds.py
| value: 'V84C7sDU001tFFodKU95USNy97TkqXymnvsFmYhQ'
| entropy_validation: BASE64STDPAD_CHARS 4.784184 True]
| ml_probability: -1
rule: Token
| severity: medium
| confidence: moderate
| ml_probability: 0.9982267618179321\n
| line_data_list:
[line: ' token = "V84C7sDU001tFFodKU95USNy97TkqXymnvsFmYhQ"'
| line_num: 5
| path: creds.py
| value: 'V84C7sDU001tFFodKU95USNy97TkqXymnvsFmYhQ'
| entropy_validation: BASE64STDPAD_CHARS 4.784184 True]
| ml_probability: 0.9982267618179321\n
Added File Credentials: 3\n
Deleted File Credentials: 0\n
Time Elapsed:
Expand Down

0 comments on commit e2faf9f

Please sign in to comment.