removed api_validations

Samsung · Dec 12, 2024 · fc0e34a
1 parent 84a5ed8
commit fc0e34a
Show file tree

Hide file tree

Showing 51 changed files with 26 additions and 3,058 deletions.
diff --git a/credsweeper/__init__.py b/credsweeper/__init__.py
@@ -4,10 +4,8 @@
     DataContentProvider, \
     TextContentProvider
 from credsweeper.ml_model.ml_validator import MlValidator
-from credsweeper.validations.apply_validation import ApplyValidation
 
 __all__ = [
-    'ApplyValidation',  #
     'ByteContentProvider',  #
     'ContentProvider',  #
     'CredSweeper',  #

diff --git a/credsweeper/__main__.py b/credsweeper/__main__.py
@@ -196,11 +196,6 @@ def get_arguments() -> Namespace:
                         dest="ml_providers",
                         required=False,
                         metavar="STR")
-    parser.add_argument("--api_validation",
-                        help="add credential api validation option to credsweeper pipeline. "
-                        "External API is used to reduce FP for some rule types.",
-                        dest="api_validation",
-                        action="store_true")
     parser.add_argument("--jobs",
                         "-j",
                         help="number of parallel processes to use (default: 1)",
@@ -296,7 +291,6 @@ def scan(args: Namespace, content_provider: AbstractProvider, json_filename: Opt
 
         credsweeper = CredSweeper(rule_path=args.rule_path,
                                   config_path=args.config_path,
-                                  api_validation=args.api_validation,
                                   json_filename=json_filename,
                                   xlsx_filename=xlsx_filename,
                                   hashed=args.hashed,

diff --git a/credsweeper/app.py b/credsweeper/app.py
@@ -19,7 +19,6 @@
 from credsweeper.file_handler.text_content_provider import TextContentProvider
 from credsweeper.scanner import Scanner
 from credsweeper.utils import Util
-from credsweeper.validations.apply_validation import ApplyValidation
 
 logger = logging.getLogger(__name__)
 
@@ -39,7 +38,6 @@ class CredSweeper:
     def __init__(self,
                  rule_path: Union[None, str, Path] = None,
                  config_path: Optional[str] = None,
-                 api_validation: bool = False,
                  json_filename: Union[None, str, Path] = None,
                  xlsx_filename: Union[None, str, Path] = None,
                  hashed: bool = False,
@@ -67,8 +65,6 @@ def __init__(self,
                 validation was the grained candidate model on machine learning
             config_path: optional str variable, path of CredSweeper config file
                 default built-in config is used if None
-            api_validation: optional boolean variable, specifying the need of
-                parallel API validation
             json_filename: optional string variable, path to save result
                 to json
             xlsx_filename: optional string variable, path to save result
@@ -97,7 +93,6 @@ def __init__(self,
             raise RuntimeError(f"Severity level provided: {severity}"
                                f" -- must be one of: {' | '.join([i.value for i in Severity])}")
         config_dict = self._get_config_dict(config_path=config_path,
-                                            api_validation=api_validation,
                                             use_filters=use_filters,
                                             find_by_ext=find_by_ext,
                                             depth=depth,
@@ -137,7 +132,6 @@ def _get_config_path(config_path: Optional[str]) -> Path:
     def _get_config_dict(
             self,  #
             config_path: Optional[str],  #
-            api_validation: bool,  #
             use_filters: bool,  #
             find_by_ext: bool,  #
             depth: int,  #
@@ -147,8 +141,6 @@ def _get_config_dict(
             exclude_lines: Optional[List[str]],  #
             exclude_values: Optional[List[str]]) -> Dict[str, Any]:
         config_dict = Util.json_load(self._get_config_path(config_path))
-        config_dict["validation"] = {}
-        config_dict["validation"]["api_validation"] = api_validation
         config_dict["use_filters"] = use_filters
         config_dict["find_by_ext"] = find_by_ext
         config_dict["size_limit"] = size_limit
@@ -268,14 +260,7 @@ def scan(self, content_providers: Sequence[Union[DiffContentProvider, TextConten
     def __single_job_scan(self, content_providers: Sequence[Union[DiffContentProvider, TextContentProvider]]) -> None:
         """Performs scan in main thread"""
         all_cred = self.files_scan(content_providers)
-        if self.config.api_validation:
-            api_validation = ApplyValidation()
-            for cred in all_cred:
-                logger.info("Run API Validation")
-                cred.api_validation = api_validation.validate(cred)
-                self.credential_manager.add_credential(cred)
-        else:
-            self.credential_manager.set_credentials(all_cred)
+        self.credential_manager.set_credentials(all_cred)
 
     # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
 
@@ -289,8 +274,6 @@ def __multi_jobs_scan(self, content_providers: Sequence[Union[DiffContentProvide
             if "SILENCE" == self.__log_level:
                 logging.addLevelName(60, "SILENCE")
             log_kwargs["level"] = self.__log_level
-        # providers_map: List[Sequence[Union[DiffContentProvider, TextContentProvider]]] = \
-        #     [content_providers[x::self.pool_count] for x in range(self.pool_count)]
         with multiprocessing.get_context("spawn").Pool(processes=self.pool_count,
                                                        initializer=self.pool_initializer,
                                                        initargs=(log_kwargs, )) as pool:
@@ -299,10 +282,6 @@ def __multi_jobs_scan(self, content_providers: Sequence[Union[DiffContentProvide
                                                                           for x in range(self.pool_count))):
                     for cred in scan_results:
                         self.credential_manager.add_credential(cred)
-                if self.config.api_validation:
-                    logger.info("Run API Validation")
-                    api_validation = ApplyValidation()
-                    api_validation.validate_credentials(pool, self.credential_manager)
             except KeyboardInterrupt:
                 pool.terminate()
                 pool.join()

diff --git a/credsweeper/config/config.py b/credsweeper/config/config.py
@@ -29,7 +29,6 @@ def __init__(self, config: Dict[str, Any]) -> None:
         self.check_for_literals: bool = config["check_for_literals"]
         self.not_allowed_path_pattern = re.compile(f"{Util.get_regex_combine_or(self.NOT_ALLOWED_PATH)}",
                                                    flags=re.IGNORECASE)
-        self.api_validation: bool = config["validation"]["api_validation"]
         self.use_filters: bool = config["use_filters"]
         self.line_data_output: List[str] = config["line_data_output"]
         self.candidate_output: List[str] = config["candidate_output"]

diff --git a/credsweeper/credentials/candidate.py b/credsweeper/credentials/candidate.py
@@ -6,7 +6,6 @@
 from credsweeper.common.constants import KeyValidationOption, Severity, Confidence
 from credsweeper.config import Config
 from credsweeper.credentials.line_data import LineData
-from credsweeper.validations.validation import Validation
 
 
 class Candidate:
@@ -31,19 +30,15 @@ def __init__(self,
                  rule_name: str,
                  severity: Severity,
                  config: Optional[Config] = None,
-                 validations: List[Validation] = None,
                  use_ml: bool = False,
                  confidence: Confidence = Confidence.MODERATE) -> None:
         self.line_data_list = line_data_list
         self.patterns = patterns
         self.rule_name = rule_name
         self.severity = severity
         self.config = config
-        self.validations: List[Validation] = validations if validations is not None else []
         self.use_ml = use_ml
         self.confidence = confidence
-
-        self.api_validation = KeyValidationOption.NOT_AVAILABLE
         self.ml_validation = KeyValidationOption.NOT_AVAILABLE
         self.ml_probability: Optional[float] = None
 
@@ -52,7 +47,6 @@ def compare(self, other: 'Candidate') -> bool:
         if self.rule_name == other.rule_name \
                 and self.severity == other.severity \
                 and self.confidence == other.confidence \
-                and self.api_validation == other.api_validation \
                 and self.use_ml == other.use_ml \
                 and self.ml_validation == other.ml_validation \
                 and self.ml_probability == other.ml_probability \
@@ -79,22 +73,12 @@ def _encode(value: Any) -> Any:
         else:
             return value
 
-    def is_api_validation_available(self) -> bool:
-        """Check if current credential candidate can be validated with external API.
-
-        Return:
-            True if any validation available, False otherwise
-
-        """
-        return len(self.validations) > 0
-
     def to_str(self, subtext: bool = False, hashed: bool = False) -> str:
         """Represent candidate with subtext or|and hashed values"""
         return f"rule: {self.rule_name}" \
                f" | severity: {self.severity.value}" \
                f" | confidence: {self.confidence.value}" \
                f" | line_data_list: [{', '.join([x.to_str(subtext, hashed) for x in self.line_data_list])}]" \
-               f" | api_validation: {self.api_validation.name}" \
                f" | ml_validation: {self.ml_validation.name}"
 
     def __str__(self):
@@ -111,7 +95,6 @@ def to_json(self, hashed: bool, subtext: bool) -> Dict:
 
         """
         full_output = {
-            "api_validation": self.api_validation.name,
             "ml_validation": self.ml_validation.name,
             "patterns": [pattern.pattern for pattern in self.patterns],
             "ml_probability": self.ml_probability,

diff --git a/credsweeper/rules/rule.py b/credsweeper/rules/rule.py
@@ -4,13 +4,12 @@
 from functools import cached_property
 from typing import Dict, List, Optional, Union, Set
 
-from credsweeper import validations, filters
+from credsweeper import filters
 from credsweeper.common.constants import RuleType, Severity, MAX_LINE_LENGTH, Confidence
 from credsweeper.common.keyword_pattern import KeywordPattern
 from credsweeper.config import Config
 from credsweeper.filters import Filter, group
 from credsweeper.filters.group import Group
-from credsweeper.validations import Validation
 
 logger = logging.getLogger(__name__)
 
@@ -73,7 +72,6 @@ def __init__(self, config: Config, rule_dict: Dict) -> None:
         # auxiliary fields
         self.__filters = self._init_filters(rule_dict.get(Rule.FILTER_TYPE, []))
         self.__use_ml = bool(rule_dict.get(Rule.USE_ML))
-        self.__validations = self._init_validations(rule_dict.get(Rule.VALIDATIONS))
         self.__required_substrings = set(i.strip().lower() for i in rule_dict.get(Rule.REQUIRED_SUBSTRINGS, []))
         self.__has_required_substrings = bool(self.__required_substrings)
         required_regex = rule_dict.get(Rule.REQUIRED_REGEX)
@@ -198,40 +196,6 @@ def use_ml(self) -> bool:
         """use_ml getter"""
         return self.__use_ml
 
-    @cached_property
-    def validations(self) -> List[Validation]:
-        """validations getter"""
-        return self.__validations
-
-    def _init_validations(self, validation_names: Union[None, str, List[str]]) -> List[Validation]:
-        """Set api validations to the current rule.
-
-        All string in `validation_names` should be class names from `credsweeper.validations`
-
-        Args:
-            validation_names: validation names
-
-        """
-        if not validation_names:
-            # empty string check to avoid exceptions for getattr
-            return []
-        elif isinstance(validation_names, str):
-            # more convenience way in case of single validator - only one line in YAML
-            if validation_template := getattr(validations, validation_names, None):
-                return [validation_template]
-        elif isinstance(validation_names, list):
-            _validations: List[Validation] = []
-            for vn in validation_names:
-                if validation_template := getattr(validations, vn, None):
-                    _validations.append(validation_template())
-                else:
-                    break
-            else:
-                return _validations
-        raise ValueError(f"Malformed rule '{self.__rule_name}'."
-                         f" field '{Rule.VALIDATIONS}' has invalid value"
-                         f" '{validation_names}'")
-
     @staticmethod
     def _assert_rule_mandatory_fields(rule_template: Dict) -> None:
         """Assert that rule_template have all required fields.

diff --git a/credsweeper/scanner/scan_type/scan_type.py b/credsweeper/scanner/scan_type/scan_type.py
@@ -171,9 +171,13 @@ def _get_candidates(cls, config: Config, rule: Rule, target: AnalysisTarget) ->
             for line_data in line_data_list:
                 if config.exclude_values and line_data.value.strip() in config.exclude_values:
                     continue
-
-                candidate = Candidate([line_data], rule.patterns, rule.rule_name, rule.severity, config,
-                                      rule.validations, rule.use_ml, rule.confidence)
+                candidate = Candidate(line_data_list=[line_data],
+                                      patterns=rule.patterns,
+                                      rule_name=rule.rule_name,
+                                      severity=rule.severity,
+                                      config=config,
+                                      use_ml=rule.use_ml,
+                                      confidence=rule.confidence)
                 # single pattern with multiple values means all the patterns must matched in target
                 if 1 < len(rule.patterns) and rule.rule_type in (RuleType.PATTERN, RuleType.KEYWORD):
                     # additional check whether all patterns match

diff --git a/credsweeper/secret/config.json b/credsweeper/secret/config.json
@@ -164,7 +164,6 @@
         "rule",
         "severity",
         "confidence",
-        "api_validation",
         "ml_validation",
         "ml_probability",
         "line_data_list"

diff --git a/credsweeper/validations/__init__.py b/credsweeper/validations/__init__.py
diff --git a/credsweeper/validations/apply_validation.py b/credsweeper/validations/apply_validation.py
diff --git a/credsweeper/validations/github_token_validation.py b/credsweeper/validations/github_token_validation.py