diff --git a/.ci/benchmark.txt b/.ci/benchmark.txt index dcff96867..c58554957 100644 --- a/.ci/benchmark.txt +++ b/.ci/benchmark.txt @@ -1,6 +1,6 @@ -META MD5 414228344bac7e55c5127be7b244e460 -DATA MD5 abd9c025d5c323af814fbeb33f469c90 -DATA: 16342283 interested lines. MARKUP: 62020 items +META MD5 825045827e958406a8bfd2731c57f77c +DATA MD5 6444eafe650282d2407960f82ef5d014 +DATA: 16342283 interested lines. MARKUP: 62022 items FileType FileNumber ValidLines Positives Negatives Templates --------------- ------------ ------------ ----------- ----------- ----------- 194 28318 71 418 90 @@ -82,7 +82,7 @@ FileType FileNumber ValidLines Positives Negatives Templat .ipynb 1 134 5 .j 1 241 4 .j2 30 5530 6 186 10 -.java 621 134132 362 1365 171 +.java 621 134132 368 1365 171 .jenkinsfile 1 58 2 6 .jinja2 1 64 2 .js 659 536413 531 2497 331 @@ -209,7 +209,7 @@ FileType FileNumber ValidLines Positives Negatives Templat .ts 583 106730 157 1800 203 .tsx 54 7914 1 114 5 .ttar 1 452 1 -.txt 440 78102 5287 6354 49 +.txt 440 78102 5288 6354 49 .utf8 1 77 2 .vsixmanifest 1 36 1 .vsmdi 1 6 2 @@ -222,53 +222,5 @@ FileType FileNumber ValidLines Positives Negatives Templat .yml 419 36169 559 889 376 .zsh 6 872 12 .zsh-theme 1 97 1 -TOTAL: 10232 16342283 12255 49692 5101 -credsweeper result_cnt : 11517, lost_cnt : 0, true_cnt : 11342, false_cnt : 175 -Rules Positives Negatives Templates Reported TP FP TN FN FPR FNR ACC PRC RCL F1 ------------------------------- ----------- ----------- ----------- ---------- ----- ---- ----- ---- -------- -------- -------- -------- -------- -------- -API 130 3166 188 125 123 2 3352 7 0.000596 0.053846 0.997417 0.984000 0.946154 0.964706 -AWS Client ID 168 21 0 160 160 0 21 8 0.000000 0.047619 0.957672 1.000000 0.952381 0.975610 -AWS Multi 82 10 0 84 82 1 9 0 0.100000 0.000000 0.989130 0.987952 1.000000 0.993939 -AWS S3 Bucket 67 23 0 92 67 23 0 0 1.000000 0.000000 0.744444 0.744444 1.000000 0.853503 -Atlassian Old PAT token 27 308 3 12 3 8 303 24 0.025723 0.888889 0.905325 0.272727 0.111111 0.157895 -Auth 414 2739 82 390 387 3 2818 27 0.001063 0.065217 0.990726 0.992308 0.934783 0.962687 -Azure Access Token 19 0 0 12 12 0 0 7 0.368421 0.631579 1.000000 0.631579 0.774194 -BASE64 Private Key 7 4 0 7 7 0 4 0 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000 -BASE64 encoded PEM Private Key 7 0 0 5 5 0 0 2 0.285714 0.714286 1.000000 0.714286 0.833333 -Bitbucket Client ID 143 2095 9 48 28 19 2085 115 0.009030 0.804196 0.940365 0.595745 0.195804 0.294737 -Bitbucket Client Secret 301 807 10 40 29 11 806 272 0.013464 0.903654 0.746869 0.725000 0.096346 0.170088 -CMD ConvertTo-SecureString 13 4 0 13 13 0 4 0 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000 -CMD Password 21 128 6 18 18 0 134 3 0.000000 0.142857 0.980645 1.000000 0.857143 0.923077 -CMD Secret 1 1 0 1 1 0 1 0 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000 -CMD Token 6 0 0 6 6 0 0 0 0.000000 1.000000 1.000000 1.000000 1.000000 -Certificate 24 471 0 20 20 0 471 4 0.000000 0.166667 0.991919 1.000000 0.833333 0.909091 -Credential 91 421 76 92 91 1 496 0 0.002012 0.000000 0.998299 0.989130 1.000000 0.994536 -Docker Swarm Token 2 0 0 1 1 0 0 1 0.500000 0.500000 1.000000 0.500000 0.666667 -Dropbox App secret 64 139 1 46 35 10 130 29 0.071429 0.453125 0.808824 0.777778 0.546875 0.642202 -Facebook Access Token 0 1 0 0 0 1 0 0.000000 1.000000 -Firebase Domain 6 1 0 7 6 1 0 0 1.000000 0.000000 0.857143 0.857143 1.000000 0.923077 -Github Old Token 1 0 0 1 1 0 0 0 0.000000 1.000000 1.000000 1.000000 1.000000 -Gitlab Feed Token 189 751 87 56 44 11 827 145 0.013126 0.767196 0.848101 0.800000 0.232804 0.360656 -Gitlab Incoming Email Token 37 8 0 21 19 2 6 18 0.250000 0.486486 0.555556 0.904762 0.513514 0.655172 -Google API Key 12 0 0 12 12 0 0 0 0.000000 1.000000 1.000000 1.000000 1.000000 -Google Multi 10 2 0 11 10 1 1 0 0.500000 0.000000 0.916667 0.909091 1.000000 0.952381 -Google OAuth Access Token 3 0 0 3 3 0 0 0 0.000000 1.000000 1.000000 1.000000 1.000000 -Grafana Provisioned API Key 22 1 0 5 5 0 1 17 0.000000 0.772727 0.260870 1.000000 0.227273 0.370370 -JSON Web Token 170 61 0 131 131 0 61 39 0.000000 0.229412 0.831169 1.000000 0.770588 0.870432 -Jira / Confluence PAT token 0 4 0 0 0 4 0 0.000000 1.000000 -Jira 2FA 15 6 1 12 12 0 7 3 0.000000 0.200000 0.863636 1.000000 0.800000 0.888889 -Key 3909 15717 485 3944 3893 51 16151 16 0.003148 0.004093 0.996668 0.987069 0.995907 0.991468 -Nonce 91 49 0 89 88 1 48 3 0.020408 0.032967 0.971429 0.988764 0.967033 0.977778 -Other 8 7445 1 0 0 7446 8 0.000000 1.000000 0.998927 0.000000 -PEM Private Key 1019 1483 0 1023 1019 4 1479 0 0.002697 0.000000 0.998401 0.996090 1.000000 0.998041 -Password 1869 7536 2680 1776 1758 18 10198 111 0.001762 0.059390 0.989326 0.989865 0.940610 0.964609 -Salt 47 76 1 44 44 0 77 3 0.000000 0.063830 0.975806 1.000000 0.936170 0.967033 -Secret 1297 1576 802 1288 1283 5 2373 14 0.002103 0.010794 0.994830 0.996118 0.989206 0.992650 -Seed 1 6 0 0 0 6 1 0.000000 1.000000 0.857143 0.000000 -Slack Token 4 1 0 4 4 0 1 0 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000 -Tencent WeChat API App ID 6 0 0 6 6 0 0 0 0.000000 1.000000 1.000000 1.000000 1.000000 -Token 643 4170 454 616 614 2 4622 29 0.000433 0.045101 0.994114 0.996753 0.954899 0.975377 -Twilio Credentials 30 39 0 30 30 0 39 0 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000 -URL Credentials 210 157 215 205 205 0 372 5 0.000000 0.023810 0.991409 1.000000 0.976190 0.987952 -UUID 1069 265 0 1068 1067 1 264 2 0.003774 0.001871 0.997751 0.999064 0.998129 0.998596 - 12255 49692 5101 11524 11342 175 49517 913 0.003522 0.074500 0.982437 0.984805 0.925500 0.954232 +TOTAL: 10232 16342283 12262 49692 5101 + diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 11e71c1da..f710d8288 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -30,8 +30,8 @@ jobs: - name: Checkout CredData uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: - repository: Samsung/CredData - ref: main + repository: babenek/CredData + ref: 3d - name: Markup hashing run: | @@ -86,8 +86,8 @@ jobs: - name: Checkout CredData uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: - repository: Samsung/CredData - ref: main + repository: babenek/CredData + ref: 3d - name: Markup hashing run: | @@ -189,8 +189,8 @@ jobs: - name: Checkout CredData uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: - repository: Samsung/CredData - ref: main + repository: babenek/CredData + ref: 3d - name: Markup hashing run: | @@ -377,8 +377,8 @@ jobs: - name: Checkout CredData uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: - repository: Samsung/CredData - ref: main + repository: babenek/CredData + ref: 3d - name: Markup hashing run: | diff --git a/Dockerfile b/Dockerfile index ceccdb102..44ff30529 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,16 +1,10 @@ -FROM python:3.10@sha256:fd0fa50d997eb56ce560c6e5ca6a1f5cf8fdff87572a16ac07fb1f5ca01eb608 +FROM python:3.8 -WORKDIR /app +WORKDIR /user -ADD credsweeper /app/credsweeper +ADD tests/samples /user -COPY pyproject.toml /app/ -COPY README.md /app/ +RUN pip install credsweeper -RUN pip install . -COPY entrypoint.sh /entrypoint.sh - -RUN chmod a+x /entrypoint.sh - -ENTRYPOINT ["/entrypoint.sh"] +ENTRYPOINT ["credsweeper", "--path", "/user"] diff --git a/credsweeper/common/keyword_pattern.py b/credsweeper/common/keyword_pattern.py index db26d51b5..4c37d3b1f 100644 --- a/credsweeper/common/keyword_pattern.py +++ b/credsweeper/common/keyword_pattern.py @@ -3,14 +3,14 @@ class KeywordPattern: """Pattern set of keyword types""" - key_left = r"(\\[nrt])?"\ - r"(?P(([`'\"]+[^:='\"`}<>\\/&?]*|[^:='\"`}<>\s()\\/&?;,]*)" \ + key_left = r"(\\[nrt]|%[0-9a-f]{2})?"\ + r"(?P(([`'\"]+[^:='\"`}<>\\/&?]*|[^:='\"`}<>\s()\\/&?;,%]*)" \ r"(?P" # there will be inserted a keyword key_right = r")" \ - r"[^:='\"`<>{?!&]*)[`'\"]*)" # + r"[^%:='\"`<>{?!&]*)[`'\"]*)" # separator = r"(\s|\\+[tnr])*\]?(\s|\\+[tnr])*" \ - r"(?P:( [a-z]{3,9}[?]? )?=|:|=(>|>|\\u0026gt;)|!=|===|==|=)" \ + r"(?P:( [a-z]{3,9}[?]? )?=|:|=(>|>|\\u0026gt;)|!=|===|==|=|%3d)" \ r"(\s|\\+[tnr])*" # might be curly, square or parenthesis with words before wrap = r"(?P(" \ diff --git a/credsweeper/credentials/line_data.py b/credsweeper/credentials/line_data.py index fe19189f9..dac4ecd03 100644 --- a/credsweeper/credentials/line_data.py +++ b/credsweeper/credentials/line_data.py @@ -33,7 +33,9 @@ class LineData: comment_starts = ("//", "* ", "#", "/*", "|\\w+?\\>|\\&)") line_endings = re.compile(r"\\{1,8}[nr]") - url_param_split = re.compile(r"(%|\\u(00){0,2})(26|3f)", flags=re.IGNORECASE) + # https://en.wikipedia.org/wiki/Percent-encoding + url_param_split = re.compile( + r"(\\u(00){0,2}|%)(21|23|24|26|27|28|29|2a|2b|2c|2f|3a|3b|3d|3f|40|5b|5d)", flags=re.IGNORECASE) # some symbols e.g. double quotes cannot be in URL string https://www.ietf.org/rfc/rfc1738.txt # \ - was added for case of url in escaped string \u0026amp; - means escaped & in HTML url_scheme_part_regex = re.compile(r"[0-9A-Za-z.-]{3}") @@ -159,6 +161,7 @@ def check_url_part(self) -> bool: self.url_part &= not self.url_chars_not_allowed_pattern.search(line_before_value, pos=url_pos + 3) self.url_part |= self.line[self.variable_start - 1] in "?&" if 0 < self.variable_start else False self.url_part |= bool(self.url_value_pattern.match(self.value)) + self.url_part |= bool(self.separator) and "%3D" == self.separator.upper() return self.url_part def clean_url_parameters(self) -> None: @@ -198,7 +201,7 @@ def clean_toml_parameters(self) -> None: cleaning_required = False for left, right in [('{', '}'), ('[', ']'), ('(', ')')]: if self.value.endswith(right) and left not in self.value \ - and line_before_value.count(left) > line_before_value.count(right): + and line_before_value.count(left) > line_before_value.count(right): # full match does not reasonable to implement due open character may be in other line self.value = self.value[:-1] cleaning_required = True @@ -262,15 +265,15 @@ def is_well_quoted_value(self) -> bool: rightquote = "" result = bool(leftquote) and ( # - bool(rightquote) and (leftquote == rightquote) # normal case - or '\\' == self.value_rightquote and '\\' == self.line[-1] # line wrap + bool(rightquote) and (leftquote == rightquote) # normal case + or '\\' == self.value_rightquote and '\\' == self.line[-1] # line wrap ) elif self.value_leftquote: result = ( # - ('\\' == self.value_rightquote or '\\' == self.value[-1]) and '\\' == self.line[-1] # line wrap - or '.php' == self.file_type # php may use multiline string - or 3 == self.value_leftquote.count('"') or 3 == self.value_leftquote.count("'") # python multiline + ('\\' == self.value_rightquote or '\\' == self.value[-1]) and '\\' == self.line[-1] # line wrap + or '.php' == self.file_type # php may use multiline string + or 3 == self.value_leftquote.count('"') or 3 == self.value_leftquote.count("'") # python multiline ) return result diff --git a/tests/__init__.py b/tests/__init__.py index 8d0f5d19d..35584b1aa 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -7,11 +7,11 @@ NEGLIGIBLE_ML_THRESHOLD = 0.0001 # credentials count after scan -SAMPLES_CRED_COUNT: int = 412 -SAMPLES_CRED_LINE_COUNT: int = 430 +SAMPLES_CRED_COUNT: int = 412+1 +SAMPLES_CRED_LINE_COUNT: int = 430+1 # credentials count after post-processing -SAMPLES_POST_CRED_COUNT: int = 369 +SAMPLES_POST_CRED_COUNT: int = 369+1 # with option --doc SAMPLES_IN_DOC = 448 diff --git a/tests/data/depth_3.json b/tests/data/depth_3.json index bbf77268d..db69a431e 100644 --- a/tests/data/depth_3.json +++ b/tests/data/depth_3.json @@ -13055,15 +13055,15 @@ "line_num": 3, "path": "./tests/samples/url_cred.js", "info": "./tests/samples/url_cred.js|RAW", - "value": "546DFS64N90P3AW7DX%2Fkeep", + "value": "546DFS64N90P3AW7DX", "value_start": 49, - "value_end": 74, + "value_end": 67, "variable": "Credential", "variable_start": 38, "variable_end": 48, "entropy_validation": { "iterator": "BASE64_CHARS", - "entropy": 4.058101942183734, + "entropy": 3.836591668108979, "valid": false } } @@ -13177,6 +13177,33 @@ } ] }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "VALIDATED_KEY", + "ml_probability": 0.999, + "rule": "Token", + "severity": "medium", + "confidence": "moderate", + "line_data_list": [ + { + "line": "url3d = \"https://localhost.com/013948?26timestamp%3D1395782596%26token%3Dh1d3Me4ch534d801sl3jdk%26version%3D3.14%26si\";", + "line_num": 19, + "path": "./tests/samples/url_cred.js", + "info": "./tests/samples/url_cred.js|RAW", + "value": "h1d3Me4ch534d801sl3jdk", + "value_start": 73, + "value_end": 95, + "variable": "token", + "variable_start": 65, + "variable_end": 70, + "entropy_validation": { + "iterator": "BASE36_CHARS", + "entropy": 3.551740408502559, + "valid": true + } + } + ] + }, { "api_validation": "NOT_AVAILABLE", "ml_validation": "NOT_AVAILABLE", diff --git a/tests/data/ml_threshold.json b/tests/data/ml_threshold.json index f379bc15a..d29f6c886 100644 --- a/tests/data/ml_threshold.json +++ b/tests/data/ml_threshold.json @@ -11064,15 +11064,15 @@ "line_num": 3, "path": "./tests/samples/url_cred.js", "info": "", - "value": "b813406be181d57d69c77210296df2c06b109df201288f4757d48c7c8e05f4a9", + "value": "b93170f86433c19c76825330f8e83d282021114a6d9abada7cfc0be1ddd1c705", "value_start": 49, - "value_end": 74, + "value_end": 67, "variable": "b1c42b3ce118093bc656bf16e7b87e069403a18246d2ea36d3c667850cb5bda1", "variable_start": 38, "variable_end": 48, "entropy_validation": { "iterator": "BASE64_CHARS", - "entropy": 4.058101942183734, + "entropy": 3.836591668108979, "valid": false } } @@ -11186,6 +11186,33 @@ } ] }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "VALIDATED_KEY", + "ml_probability": 0.999, + "rule": "Token", + "severity": "medium", + "confidence": "moderate", + "line_data_list": [ + { + "line": "26a0a7907cf3a2488fc8a50880c51c1ceb8b65229cf66f9d49e4ea0175ab2ce9", + "line_num": 19, + "path": "./tests/samples/url_cred.js", + "info": "", + "value": "ee2b690c88702370d6e18673a460b5800a3ffa73b582cb57d22a3f757199bcb3", + "value_start": 73, + "value_end": 95, + "variable": "3c469e9d6c5875d37a43f353d4f88e61fcf812c66eee3457465a40b0da4153e0", + "variable_start": 65, + "variable_end": 70, + "entropy_validation": { + "iterator": "BASE36_CHARS", + "entropy": 3.551740408502559, + "valid": true + } + } + ] + }, { "api_validation": "NOT_AVAILABLE", "ml_validation": "NOT_AVAILABLE", diff --git a/tests/data/output.json b/tests/data/output.json index 717688b9c..e965e3b34 100644 --- a/tests/data/output.json +++ b/tests/data/output.json @@ -9930,15 +9930,15 @@ "line_num": 3, "path": "./tests/samples/url_cred.js", "info": "", - "value": "546DFS64N90P3AW7DX%2Fkeep", + "value": "546DFS64N90P3AW7DX", "value_start": 49, - "value_end": 74, + "value_end": 67, "variable": "Credential", "variable_start": 38, "variable_end": 48, "entropy_validation": { "iterator": "BASE64_CHARS", - "entropy": 4.058101942183734, + "entropy": 3.836591668108979, "valid": false } } @@ -10052,6 +10052,33 @@ } ] }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "VALIDATED_KEY", + "ml_probability": 0.999, + "rule": "Token", + "severity": "medium", + "confidence": "moderate", + "line_data_list": [ + { + "line": "url3d = \"https://localhost.com/013948?26timestamp%3D1395782596%26token%3Dh1d3Me4ch534d801sl3jdk%26version%3D3.14%26si\";", + "line_num": 19, + "path": "./tests/samples/url_cred.js", + "info": "", + "value": "h1d3Me4ch534d801sl3jdk", + "value_start": 73, + "value_end": 95, + "variable": "token", + "variable_start": 65, + "variable_end": 70, + "entropy_validation": { + "iterator": "BASE36_CHARS", + "entropy": 3.551740408502559, + "valid": true + } + } + ] + }, { "api_validation": "NOT_AVAILABLE", "ml_validation": "NOT_AVAILABLE", diff --git a/tests/samples/url_cred.js b/tests/samples/url_cred.js index 2bcbeb844..a309f6ea7 100644 --- a/tests/samples/url_cred.js +++ b/tests/samples/url_cred.js @@ -15,3 +15,5 @@ email_as_login = "smtps://example@gmail.com:FnD83JZs@smtp.gmail.com:465"; /* @"otpauth://host/port?set=VNMXQKAZFVOYOJCDNBIYXYIWX2&info=should_not_be_found_even_in_ml_threshold */ + +url3d = "https://localhost.com/013948?26timestamp%3D1395782596%26token%3Dh1d3Me4ch534d801sl3jdk%26version%3D3.14%26si";