Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Escaped pattern discovering #629

Merged
merged 12 commits into from
Dec 12, 2024
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
144 changes: 72 additions & 72 deletions .ci/benchmark.txt

Large diffs are not rendered by default.

6 changes: 4 additions & 2 deletions credsweeper/filters/group/weird_base64_token.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from credsweeper.common.constants import GroupType
from credsweeper.config import Config
from credsweeper.filters import ValueCoupleKeywordCheck, ValueNotPartEncodedCheck, \
ValueBase64DataCheck, ValueEntropyBase64Check, ValuePatternCheck, ValueNumberCheck, ValueTokenBase64Check
ValueBase64DataCheck, ValueEntropyBase64Check, ValuePatternCheck, ValueNumberCheck, ValueTokenBase64Check, \
ValueBase64PartCheck
from credsweeper.filters.group import Group


Expand All @@ -17,5 +18,6 @@ def __init__(self, config: Config) -> None:
ValueTokenBase64Check(),
ValueEntropyBase64Check(),
ValuePatternCheck(config),
ValueNotPartEncodedCheck()
ValueNotPartEncodedCheck(),
ValueBase64PartCheck(),
]
4 changes: 2 additions & 2 deletions credsweeper/filters/value_atlassian_token_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,8 @@ def check_atlassian_struct(value: str) -> bool:
# there is limit for big integer value: math.log10(1<<64) = 19.265919722494797
if 0 < delimiter_pos <= 20:
val = decoded[:delimiter_pos].decode(LATIN_1)
# at least 3 digits in the token
if 100 < int(val):
# at least 4 digits in the token
if 1000 <= int(val):
# test for ascii and Shannon entropy - there should be random data
data = decoded[delimiter_pos + 1:]
return Util.is_ascii_entropy_validate(data)
Expand Down
73 changes: 59 additions & 14 deletions credsweeper/filters/value_base64_part_check.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import contextlib
import re
import statistics

from credsweeper.common.constants import Chars
from credsweeper.config import Config
from credsweeper.credentials import LineData
from credsweeper.file_handler.analysis_target import AnalysisTarget
from credsweeper.filters import Filter
from credsweeper.filters.value_entropy_base64_check import ValueEntropyBase64Check
from credsweeper.utils import Util


Expand All @@ -14,6 +16,9 @@ class ValueBase64PartCheck(Filter):
Check that candidate is NOT a part of a long base64 line
"""

base64_pattern = re.compile(r"^(\\{1,8}[0abfnrtv]|[0-9A-Za-z+/=]){1,4000}")
base64_set = set(Chars.BASE64_CHARS.value)

def __init__(self, config: Config = None) -> None:
pass

Expand All @@ -30,26 +35,66 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
"""

with contextlib.suppress(Exception):
if line_data.value_start and line_data.line[line_data.value_start - 1] in ('/', '+'):
if '-' in line_data.value or '_' in line_data.value:
# the value contains url-safe chars, so '/' is a delimiter
line = line_data.line
len_line = len(line)
value = line_data.value
len_value = len(value)
if 0 == line_data.value_start and len_line >= 2 * len_value \
or 0 < line_data.value_start and line[line_data.value_start - 1] in ('/', '+', '\\', '%') \
or 0 < line_data.value_end < len_line and line[line_data.value_end] in ('/', '+', '\\', '%'):

if '-' in value or '_' in value:
# the value contains url-safe chars, so '/' or '+' is a delimiter
return False
value_entropy = Util.get_shannon_entropy(line_data.value, Chars.BASE64STD_CHARS.value)
left_start = line_data.value_start - len(line_data.value)

left_start = line_data.value_start - len_value
if 0 > left_start:
left_start = 0
left_entropy = Util.get_shannon_entropy(line_data.line[left_start:line_data.value_start],
Chars.BASE64STD_CHARS.value)
right_end = line_data.value_end + len(line_data.value)
if len(line_data.line) < right_end:
right_end = len(line_data.line)
right_entropy = Util.get_shannon_entropy(line_data.line[line_data.value_end:right_end],
Chars.BASE64STD_CHARS.value)
data = [value_entropy, left_entropy, right_entropy]
right_end = line_data.value_end + len_value
if len_line < right_end:
right_end = len_line

hunk_size = right_end - left_start

if hunk_size == 3 * len_value:
# simple analysis for maximal data size
if self.base64_pattern.match(line[left_start:right_end]):
# obvious case: all characters are base64 standard
babenek marked this conversation as resolved.
Show resolved Hide resolved
return True
elif right_end - left_start >= 2 * len_value:
# simple analysis for data too large to yield sensible insights
part_set = set(line[left_start:right_end])
if not part_set.difference(self.base64_set):
# obvious case: all characters are base64 standard
return True

left_part = line[left_start:line_data.value_start]
len_left = len(left_part)
right_part = line[line_data.value_end:right_end]
len_right = len(right_part)

min_entropy_value = ValueEntropyBase64Check.get_min_data_entropy(len_value)
value_entropy = Util.get_shannon_entropy(value, Chars.BASE64STD_CHARS.value)

if ValueEntropyBase64Check.min_length < len_left:
left_entropy = Util.get_shannon_entropy(left_part, Chars.BASE64STD_CHARS.value)
if len_left < len_value:
left_entropy *= len_value / len_left
else:
left_entropy = min_entropy_value

if ValueEntropyBase64Check.min_length < len_right:
right_entropy = Util.get_shannon_entropy(right_part, Chars.BASE64STD_CHARS.value)
if len_right < len_value:
left_entropy *= len_right / len_left
else:
right_entropy = min_entropy_value

data = [left_entropy, value_entropy, right_entropy, min_entropy_value]
avg = statistics.mean(data)
stdev = statistics.stdev(data, avg)
avg_min = avg - 1.1 * stdev
if avg_min < left_entropy and avg_min < right_entropy:
if avg_min <= left_entropy and avg_min <= right_entropy:
# high entropy of bound parts looks like a part of base64 long line
return True

Expand Down
5 changes: 4 additions & 1 deletion credsweeper/filters/value_entropy_base64_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@
class ValueEntropyBase64Check(Filter):
"""Check that candidate has Shannon Entropy > 3 (for HEX_CHARS or BASE36_CHARS) or > 4.5 (for BASE64_CHARS)."""

# for a smaller size the entropy is not stable - will be zero
babenek marked this conversation as resolved.
Show resolved Hide resolved
min_length = 12

def __init__(self, config: Config = None) -> None:
pass

Expand All @@ -25,7 +28,7 @@ def get_min_data_entropy(x: int) -> float:
y = 4.1
elif 32 == x:
y = 4.4
elif 12 <= x < 35:
elif ValueEntropyBase64Check.min_length <= x < 35:
# logarithm base 2 - slow, but precise. Approximation does not exceed stdev
y = 0.77 * math.log2(x) + 0.62
elif 35 <= x < 60:
Expand Down
14 changes: 7 additions & 7 deletions credsweeper/filters/value_not_part_encoded_check.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import re
from typing import Optional

from credsweeper.common import static_keyword_checklist
from credsweeper.config import Config
from credsweeper.credentials import LineData
from credsweeper.file_handler.analysis_target import AnalysisTarget
Expand Down Expand Up @@ -29,15 +30,14 @@ def check_line_target_fit(line_data: LineData, target: AnalysisTarget) -> bool:

@staticmethod
def check_val(line: str, pattern: re.Pattern) -> Optional[bool]:
"""Verifies whether the line looks like a pattern"""
match_obj = pattern.match(line)
if match_obj:
"""Verifies whether the line looks like a base64 pattern"""
if match_obj := pattern.match(line):
val = match_obj.group("val")
# not a path-like
if not val.startswith('/'):
return True
# padding sign
if '=' == val[-1]:
if not val.startswith('/') \
or not static_keyword_checklist.check_morphemes(val.lower(), 2) \
or '=' == val[-1]:
# padding char is a marker too
return True
return None

Expand Down
2 changes: 1 addition & 1 deletion credsweeper/filters/value_token_base64_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def get_min_strength(x: int) -> float:
elif x < 40:
y = ((0.0000405 * x - 0.004117) * x + 0.141) * x - 0.65
else:
y = 1
y = 0.9999
return y

def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
Expand Down
Loading
Loading