Skip to content

Commit

Permalink
softreset
Browse files Browse the repository at this point in the history
  • Loading branch information
babenek committed Dec 5, 2024
1 parent caf6bd6 commit f869244
Show file tree
Hide file tree
Showing 10 changed files with 129 additions and 97 deletions.
62 changes: 7 additions & 55 deletions .ci/benchmark.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
META MD5 414228344bac7e55c5127be7b244e460
DATA MD5 abd9c025d5c323af814fbeb33f469c90
DATA: 16342283 interested lines. MARKUP: 62020 items
META MD5 825045827e958406a8bfd2731c57f77c
DATA MD5 6444eafe650282d2407960f82ef5d014
DATA: 16342283 interested lines. MARKUP: 62022 items
FileType FileNumber ValidLines Positives Negatives Templates
--------------- ------------ ------------ ----------- ----------- -----------
194 28318 71 418 90
Expand Down Expand Up @@ -82,7 +82,7 @@ FileType FileNumber ValidLines Positives Negatives Templat
.ipynb 1 134 5
.j 1 241 4
.j2 30 5530 6 186 10
.java 621 134132 362 1365 171
.java 621 134132 368 1365 171
.jenkinsfile 1 58 2 6
.jinja2 1 64 2
.js 659 536413 531 2497 331
Expand Down Expand Up @@ -209,7 +209,7 @@ FileType FileNumber ValidLines Positives Negatives Templat
.ts 583 106730 157 1800 203
.tsx 54 7914 1 114 5
.ttar 1 452 1
.txt 440 78102 5287 6354 49
.txt 440 78102 5288 6354 49
.utf8 1 77 2
.vsixmanifest 1 36 1
.vsmdi 1 6 2
Expand All @@ -222,53 +222,5 @@ FileType FileNumber ValidLines Positives Negatives Templat
.yml 419 36169 559 889 376
.zsh 6 872 12
.zsh-theme 1 97 1
TOTAL: 10232 16342283 12255 49692 5101
credsweeper result_cnt : 11517, lost_cnt : 0, true_cnt : 11342, false_cnt : 175
Rules Positives Negatives Templates Reported TP FP TN FN FPR FNR ACC PRC RCL F1
------------------------------ ----------- ----------- ----------- ---------- ----- ---- ----- ---- -------- -------- -------- -------- -------- --------
API 130 3166 188 125 123 2 3352 7 0.000596 0.053846 0.997417 0.984000 0.946154 0.964706
AWS Client ID 168 21 0 160 160 0 21 8 0.000000 0.047619 0.957672 1.000000 0.952381 0.975610
AWS Multi 82 10 0 84 82 1 9 0 0.100000 0.000000 0.989130 0.987952 1.000000 0.993939
AWS S3 Bucket 67 23 0 92 67 23 0 0 1.000000 0.000000 0.744444 0.744444 1.000000 0.853503
Atlassian Old PAT token 27 308 3 12 3 8 303 24 0.025723 0.888889 0.905325 0.272727 0.111111 0.157895
Auth 414 2739 82 390 387 3 2818 27 0.001063 0.065217 0.990726 0.992308 0.934783 0.962687
Azure Access Token 19 0 0 12 12 0 0 7 0.368421 0.631579 1.000000 0.631579 0.774194
BASE64 Private Key 7 4 0 7 7 0 4 0 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000
BASE64 encoded PEM Private Key 7 0 0 5 5 0 0 2 0.285714 0.714286 1.000000 0.714286 0.833333
Bitbucket Client ID 143 2095 9 48 28 19 2085 115 0.009030 0.804196 0.940365 0.595745 0.195804 0.294737
Bitbucket Client Secret 301 807 10 40 29 11 806 272 0.013464 0.903654 0.746869 0.725000 0.096346 0.170088
CMD ConvertTo-SecureString 13 4 0 13 13 0 4 0 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000
CMD Password 21 128 6 18 18 0 134 3 0.000000 0.142857 0.980645 1.000000 0.857143 0.923077
CMD Secret 1 1 0 1 1 0 1 0 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000
CMD Token 6 0 0 6 6 0 0 0 0.000000 1.000000 1.000000 1.000000 1.000000
Certificate 24 471 0 20 20 0 471 4 0.000000 0.166667 0.991919 1.000000 0.833333 0.909091
Credential 91 421 76 92 91 1 496 0 0.002012 0.000000 0.998299 0.989130 1.000000 0.994536
Docker Swarm Token 2 0 0 1 1 0 0 1 0.500000 0.500000 1.000000 0.500000 0.666667
Dropbox App secret 64 139 1 46 35 10 130 29 0.071429 0.453125 0.808824 0.777778 0.546875 0.642202
Facebook Access Token 0 1 0 0 0 1 0 0.000000 1.000000
Firebase Domain 6 1 0 7 6 1 0 0 1.000000 0.000000 0.857143 0.857143 1.000000 0.923077
Github Old Token 1 0 0 1 1 0 0 0 0.000000 1.000000 1.000000 1.000000 1.000000
Gitlab Feed Token 189 751 87 56 44 11 827 145 0.013126 0.767196 0.848101 0.800000 0.232804 0.360656
Gitlab Incoming Email Token 37 8 0 21 19 2 6 18 0.250000 0.486486 0.555556 0.904762 0.513514 0.655172
Google API Key 12 0 0 12 12 0 0 0 0.000000 1.000000 1.000000 1.000000 1.000000
Google Multi 10 2 0 11 10 1 1 0 0.500000 0.000000 0.916667 0.909091 1.000000 0.952381
Google OAuth Access Token 3 0 0 3 3 0 0 0 0.000000 1.000000 1.000000 1.000000 1.000000
Grafana Provisioned API Key 22 1 0 5 5 0 1 17 0.000000 0.772727 0.260870 1.000000 0.227273 0.370370
JSON Web Token 170 61 0 131 131 0 61 39 0.000000 0.229412 0.831169 1.000000 0.770588 0.870432
Jira / Confluence PAT token 0 4 0 0 0 4 0 0.000000 1.000000
Jira 2FA 15 6 1 12 12 0 7 3 0.000000 0.200000 0.863636 1.000000 0.800000 0.888889
Key 3909 15717 485 3944 3893 51 16151 16 0.003148 0.004093 0.996668 0.987069 0.995907 0.991468
Nonce 91 49 0 89 88 1 48 3 0.020408 0.032967 0.971429 0.988764 0.967033 0.977778
Other 8 7445 1 0 0 7446 8 0.000000 1.000000 0.998927 0.000000
PEM Private Key 1019 1483 0 1023 1019 4 1479 0 0.002697 0.000000 0.998401 0.996090 1.000000 0.998041
Password 1869 7536 2680 1776 1758 18 10198 111 0.001762 0.059390 0.989326 0.989865 0.940610 0.964609
Salt 47 76 1 44 44 0 77 3 0.000000 0.063830 0.975806 1.000000 0.936170 0.967033
Secret 1297 1576 802 1288 1283 5 2373 14 0.002103 0.010794 0.994830 0.996118 0.989206 0.992650
Seed 1 6 0 0 0 6 1 0.000000 1.000000 0.857143 0.000000
Slack Token 4 1 0 4 4 0 1 0 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000
Tencent WeChat API App ID 6 0 0 6 6 0 0 0 0.000000 1.000000 1.000000 1.000000 1.000000
Token 643 4170 454 616 614 2 4622 29 0.000433 0.045101 0.994114 0.996753 0.954899 0.975377
Twilio Credentials 30 39 0 30 30 0 39 0 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000
URL Credentials 210 157 215 205 205 0 372 5 0.000000 0.023810 0.991409 1.000000 0.976190 0.987952
UUID 1069 265 0 1068 1067 1 264 2 0.003774 0.001871 0.997751 0.999064 0.998129 0.998596
12255 49692 5101 11524 11342 175 49517 913 0.003522 0.074500 0.982437 0.984805 0.925500 0.954232
TOTAL: 10232 16342283 12262 49692 5101

16 changes: 8 additions & 8 deletions .github/workflows/benchmark.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@ jobs:
- name: Checkout CredData
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
repository: Samsung/CredData
ref: main
repository: babenek/CredData
ref: 3d

- name: Markup hashing
run: |
Expand Down Expand Up @@ -86,8 +86,8 @@ jobs:
- name: Checkout CredData
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
repository: Samsung/CredData
ref: main
repository: babenek/CredData
ref: 3d

- name: Markup hashing
run: |
Expand Down Expand Up @@ -189,8 +189,8 @@ jobs:
- name: Checkout CredData
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
repository: Samsung/CredData
ref: main
repository: babenek/CredData
ref: 3d

- name: Markup hashing
run: |
Expand Down Expand Up @@ -377,8 +377,8 @@ jobs:
- name: Checkout CredData
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
repository: Samsung/CredData
ref: main
repository: babenek/CredData
ref: 3d

- name: Markup hashing
run: |
Expand Down
16 changes: 5 additions & 11 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,16 +1,10 @@
FROM python:3.10@sha256:fd0fa50d997eb56ce560c6e5ca6a1f5cf8fdff87572a16ac07fb1f5ca01eb608
FROM python:3.8

WORKDIR /app
WORKDIR /user

ADD credsweeper /app/credsweeper
ADD tests/samples /user

COPY pyproject.toml /app/
COPY README.md /app/
RUN pip install credsweeper

RUN pip install .

COPY entrypoint.sh /entrypoint.sh

RUN chmod a+x /entrypoint.sh

ENTRYPOINT ["/entrypoint.sh"]
ENTRYPOINT ["credsweeper", "--path", "/user"]
8 changes: 4 additions & 4 deletions credsweeper/common/keyword_pattern.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,14 @@

class KeywordPattern:
"""Pattern set of keyword types"""
key_left = r"(\\[nrt])?"\
r"(?P<variable>(([`'\"]+[^:='\"`}<>\\/&?]*|[^:='\"`}<>\s()\\/&?;,]*)" \
key_left = r"(\\[nrt]|%[0-9a-f]{2})?"\
r"(?P<variable>(([`'\"]+[^:='\"`}<>\\/&?]*|[^:='\"`}<>\s()\\/&?;,%]*)" \
r"(?P<keyword>"
# there will be inserted a keyword
key_right = r")" \
r"[^:='\"`<>{?!&]*)[`'\"]*)" # <variable>
r"[^%:='\"`<>{?!&]*)[`'\"]*)" # <variable>
separator = r"(\s|\\+[tnr])*\]?(\s|\\+[tnr])*" \
r"(?P<separator>:( [a-z]{3,9}[?]? )?=|:|=(>|&gt;|\\u0026gt;)|!=|===|==|=)" \
r"(?P<separator>:( [a-z]{3,9}[?]? )?=|:|=(>|&gt;|\\u0026gt;)|!=|===|==|=|%3d)" \
r"(\s|\\+[tnr])*"
# might be curly, square or parenthesis with words before
wrap = r"(?P<wrap>(" \
Expand Down
17 changes: 10 additions & 7 deletions credsweeper/credentials/line_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,9 @@ class LineData:
comment_starts = ("//", "* ", "#", "/*", "<!––", "%{", "%", "...", "(*", "--", "--[[", "#=")
bash_param_split = re.compile("\\s+(\\-|\\||\\>|\\w+?\\>|\\&)")
line_endings = re.compile(r"\\{1,8}[nr]")
url_param_split = re.compile(r"(%|\\u(00){0,2})(26|3f)", flags=re.IGNORECASE)
# https://en.wikipedia.org/wiki/Percent-encoding
url_param_split = re.compile(
r"(\\u(00){0,2}|%)(21|23|24|26|27|28|29|2a|2b|2c|2f|3a|3b|3d|3f|40|5b|5d)", flags=re.IGNORECASE)
# some symbols, e.g. double quotes, cannot appear in a URL string: https://www.ietf.org/rfc/rfc1738.txt
# \ was added for the case of a URL inside an escaped string; \u0026amp; means an escaped & in HTML
url_scheme_part_regex = re.compile(r"[0-9A-Za-z.-]{3}")
Expand Down Expand Up @@ -159,6 +161,7 @@ def check_url_part(self) -> bool:
self.url_part &= not self.url_chars_not_allowed_pattern.search(line_before_value, pos=url_pos + 3)
self.url_part |= self.line[self.variable_start - 1] in "?&" if 0 < self.variable_start else False
self.url_part |= bool(self.url_value_pattern.match(self.value))
self.url_part |= bool(self.separator) and "%3D" == self.separator.upper()
return self.url_part

def clean_url_parameters(self) -> None:
Expand Down Expand Up @@ -198,7 +201,7 @@ def clean_toml_parameters(self) -> None:
cleaning_required = False
for left, right in [('{', '}'), ('[', ']'), ('(', ')')]:
if self.value.endswith(right) and left not in self.value \
and line_before_value.count(left) > line_before_value.count(right):
and line_before_value.count(left) > line_before_value.count(right):
# a full match is not reasonable to implement because the opening character may be on another line
self.value = self.value[:-1]
cleaning_required = True
Expand Down Expand Up @@ -262,15 +265,15 @@ def is_well_quoted_value(self) -> bool:
rightquote = ""

result = bool(leftquote) and ( #
bool(rightquote) and (leftquote == rightquote) # normal case
or '\\' == self.value_rightquote and '\\' == self.line[-1] # line wrap
bool(rightquote) and (leftquote == rightquote) # normal case
or '\\' == self.value_rightquote and '\\' == self.line[-1] # line wrap
)

elif self.value_leftquote:
result = ( #
('\\' == self.value_rightquote or '\\' == self.value[-1]) and '\\' == self.line[-1] # line wrap
or '.php' == self.file_type # php may use multiline string
or 3 == self.value_leftquote.count('"') or 3 == self.value_leftquote.count("'") # python multiline
('\\' == self.value_rightquote or '\\' == self.value[-1]) and '\\' == self.line[-1] # line wrap
or '.php' == self.file_type # php may use multiline string
or 3 == self.value_leftquote.count('"') or 3 == self.value_leftquote.count("'") # python multiline
)

return result
Expand Down
6 changes: 3 additions & 3 deletions tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,11 @@
NEGLIGIBLE_ML_THRESHOLD = 0.0001

# credentials count after scan
SAMPLES_CRED_COUNT: int = 412
SAMPLES_CRED_LINE_COUNT: int = 430
SAMPLES_CRED_COUNT: int = 412+1
SAMPLES_CRED_LINE_COUNT: int = 430+1

# credentials count after post-processing
SAMPLES_POST_CRED_COUNT: int = 369
SAMPLES_POST_CRED_COUNT: int = 369+1

# with option --doc
SAMPLES_IN_DOC = 448
Expand Down
33 changes: 30 additions & 3 deletions tests/data/depth_3.json
Original file line number Diff line number Diff line change
Expand Up @@ -13055,15 +13055,15 @@
"line_num": 3,
"path": "./tests/samples/url_cred.js",
"info": "./tests/samples/url_cred.js|RAW",
"value": "546DFS64N90P3AW7DX%2Fkeep",
"value": "546DFS64N90P3AW7DX",
"value_start": 49,
"value_end": 74,
"value_end": 67,
"variable": "Credential",
"variable_start": 38,
"variable_end": 48,
"entropy_validation": {
"iterator": "BASE64_CHARS",
"entropy": 4.058101942183734,
"entropy": 3.836591668108979,
"valid": false
}
}
Expand Down Expand Up @@ -13177,6 +13177,33 @@
}
]
},
{
"api_validation": "NOT_AVAILABLE",
"ml_validation": "VALIDATED_KEY",
"ml_probability": 0.999,
"rule": "Token",
"severity": "medium",
"confidence": "moderate",
"line_data_list": [
{
"line": "url3d = \"https://localhost.com/013948?26timestamp%3D1395782596%26token%3Dh1d3Me4ch534d801sl3jdk%26version%3D3.14%26si\";",
"line_num": 19,
"path": "./tests/samples/url_cred.js",
"info": "./tests/samples/url_cred.js|RAW",
"value": "h1d3Me4ch534d801sl3jdk",
"value_start": 73,
"value_end": 95,
"variable": "token",
"variable_start": 65,
"variable_end": 70,
"entropy_validation": {
"iterator": "BASE36_CHARS",
"entropy": 3.551740408502559,
"valid": true
}
}
]
},
{
"api_validation": "NOT_AVAILABLE",
"ml_validation": "NOT_AVAILABLE",
Expand Down
Loading

0 comments on commit f869244

Please sign in to comment.