Skip to content

Commit

Permalink
Line info enhancement (#660)
Browse files Browse the repository at this point in the history
* line info refactoring

* Excel style for info

* bump actions/upload-artifact

* optimize --doc scan

* style
  • Loading branch information
babenek authored Jan 17, 2025
1 parent 7ee8853 commit 06c54e7
Show file tree
Hide file tree
Showing 22 changed files with 1,619 additions and 1,577 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ jobs:
- name: CredSweeper report
if: always()
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
with:
name: output.json
path: output.json
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/benchmark.yml
Original file line number Diff line number Diff line change
Expand Up @@ -144,21 +144,21 @@ jobs:
- name: Upload CredSweeper log
if: always()
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
with:
name: credsweeper
path: credsweeper.${{ github.event.pull_request.head.sha }}.log

- name: Upload CredSweeper report
if: always()
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
with:
name: report
path: report.${{ github.event.pull_request.head.sha }}.json

- name: Upload benchmark output
if: always()
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
with:
name: benchmark
path: benchmark.${{ github.event.pull_request.head.sha }}.log
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ jobs:
- name: FLAKE 8 reports
if: ${{ failure() && steps.test_flake8.conclusion == 'failure' }}
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
with:
name: flake8_report
path: flake8.txt
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/fuzz.yml
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ jobs:
- name: Store coverage report
if: always()
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
with:
name: htmlcov
path: htmlcov
Expand All @@ -81,7 +81,7 @@ jobs:
- name: New corpus upload
if: ${{ env.NEW_CORPUS > 0 }}
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
with:
name: new_corpus
path: new_corpus
Expand All @@ -104,7 +104,7 @@ jobs:
- name: Crash corpus upload
if: ${{ env.CRASH_CORPUS > 0 }}
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
with:
name: crash_corpus
path: crash_corpus
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ jobs:
- name: HTML coverage reports
if: always()
uses: actions/upload-artifact@ff15f0306b3f739f7b6fd43fb5d26cd321bd4de5 # v3.2.1
uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
with:
name: coverage_html-${{ matrix.python-version }}
path: coverage_html
Expand Down
2 changes: 1 addition & 1 deletion credsweeper/deep_scanner/bzip2_scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def data_scan(
bzip2_content_provider = DataContentProvider(data=bz2.decompress(data_provider.data),
file_path=new_path,
file_type=Util.get_extension(new_path),
info=f"{data_provider.info}|BZIP2|{new_path}")
info=f"{data_provider.info}|BZIP2:{new_path}")
new_limit = recursive_limit_size - len(bzip2_content_provider.data)
bzip2_candidates = self.recursive_scan(bzip2_content_provider, depth, new_limit)
return bzip2_candidates
Expand Down
32 changes: 20 additions & 12 deletions credsweeper/deep_scanner/deep_scanner.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import datetime
import logging
from pathlib import Path
from typing import List, Optional, Any, Tuple, Union

from credsweeper.common.constants import RECURSIVE_SCAN_LIMITATION
Expand Down Expand Up @@ -77,22 +76,26 @@ def scanner(self) -> Scanner:
return self.__scanner

@staticmethod
def get_deep_scanners(data: bytes, file_type: str) -> List[Any]:
def get_deep_scanners(data: bytes, file_type: str, depth: int) -> List[Any]:
"""Returns possibly scan methods for the data depends on content"""
deep_scanners: List[Any] = []
if Util.is_zip(data):
deep_scanners.append(ZipScanner)
if 0 < depth:
deep_scanners.append(ZipScanner)
# probably, there might be a docx, xlxs and so on.
# It might be scanned with text representation in third-party libraries.
deep_scanners.append(XlsxScanner)
deep_scanners.append(DocxScanner)
deep_scanners.append(PptxScanner)
elif Util.is_bzip2(data):
deep_scanners.append(Bzip2Scanner)
if 0 < depth:
deep_scanners.append(Bzip2Scanner)
elif Util.is_tar(data):
deep_scanners.append(TarScanner)
if 0 < depth:
deep_scanners.append(TarScanner)
elif Util.is_gzip(data):
deep_scanners.append(GzipScanner)
if 0 < depth:
deep_scanners.append(GzipScanner)
elif Util.is_pdf(data):
deep_scanners.append(PdfScanner)
elif Util.is_jks(data):
Expand All @@ -113,7 +116,10 @@ def get_deep_scanners(data: bytes, file_type: str) -> List[Any]:
deep_scanners.append(MxfileScanner)
deep_scanners.append(XmlScanner)
else:
deep_scanners = [EncoderScanner, LangScanner, ByteScanner]
if 0 < depth:
deep_scanners.append(EncoderScanner)
deep_scanners.append(LangScanner)
deep_scanners.append(ByteScanner)
return deep_scanners

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
Expand All @@ -136,27 +142,29 @@ def scan(self,
if isinstance(content_provider, TextContentProvider) or isinstance(content_provider, ByteContentProvider):
# Feature to scan files which might be containers
data = content_provider.data
info = "FILE"
elif isinstance(content_provider, DiffContentProvider) and content_provider.diff:
candidates = self.scanner.scan(content_provider)
# Feature to scan binary diffs
diff = content_provider.diff[0].get("line")
# the check for legal fix mypy issue
if isinstance(diff, bytes):
data = diff
info = "DIFF"
else:
logger.warning(f"Content provider {type(content_provider)} does not support deep scan")
info = "NA"

if data:
data_provider = DataContentProvider(data=data,
file_path=content_provider.file_path,
file_type=content_provider.file_type,
info=Path(content_provider.file_path).as_posix())
info=content_provider.info or info)
# iterate for all possibly scanner methods WITHOUT ByteContentProvider for TextContentProvider
scanner_classes = self.get_deep_scanners(data, content_provider.file_type)
scanner_classes = self.get_deep_scanners(data, content_provider.file_type, depth)
fallback = True
for scan_class in scanner_classes:
if new_candidates := scan_class.data_scan(self, data_provider, depth - 1,
recursive_limit_size - len(data)):
if new_candidates := scan_class.data_scan(self, data_provider, depth, recursive_limit_size - len(data)):
augment_candidates(candidates, new_candidates)
fallback = False
if fallback and ByteScanner not in scanner_classes and not Util.is_binary(data):
Expand Down Expand Up @@ -196,7 +204,7 @@ def recursive_scan(
else:
fallback = True
# iterate for all possibly scanner methods
scanner_classes = self.get_deep_scanners(data_provider.data, data_provider.file_type)
scanner_classes = self.get_deep_scanners(data_provider.data, data_provider.file_type, depth)
for scanner_class in scanner_classes:
if new_candidates := scanner_class.data_scan(self, data_provider, depth, recursive_limit_size):
augment_candidates(candidates, new_candidates)
Expand Down
2 changes: 1 addition & 1 deletion credsweeper/deep_scanner/encoder_scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def data_scan(
decoded_data_provider = DataContentProvider(data=data_provider.decoded,
file_path=data_provider.file_path,
file_type=data_provider.file_type,
info=f"{data_provider.info}|ENCODED")
info=f"{data_provider.info}|BASE64")
new_limit = recursive_limit_size - len(decoded_data_provider.data)
return self.recursive_scan(decoded_data_provider, depth, new_limit)
return None
2 changes: 1 addition & 1 deletion credsweeper/deep_scanner/gzip_scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def data_scan(
gzip_content_provider = DataContentProvider(data=f.read(),
file_path=new_path,
file_type=Util.get_extension(new_path),
info=f"{data_provider.info}|GZIP|{new_path}")
info=f"{data_provider.info}|GZIP:{new_path}")
new_limit = recursive_limit_size - len(gzip_content_provider.data)
gzip_candidates = self.recursive_scan(gzip_content_provider, depth, new_limit)
return gzip_candidates
Expand Down
8 changes: 4 additions & 4 deletions credsweeper/deep_scanner/jks_scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,11 @@ def data_scan(
if keystore.private_keys or keystore.secret_keys:
candidate = Candidate.get_dummy_candidate(self.config, data_provider.file_path,
data_provider.file_type,
f"{data_provider.info}:'{pw_probe}' - has keys")
f"{data_provider.info}|JKS:'{pw_probe}' - has keys")
else:
candidate = Candidate.get_dummy_candidate(self.config, data_provider.file_path,
data_provider.file_type,
f"{data_provider.info}:'{pw_probe}' - default password")
candidate = Candidate.get_dummy_candidate(
self.config, data_provider.file_path, data_provider.file_type,
f"{data_provider.info}|JKS:'{pw_probe}' - default password")
candidates.append(candidate)
except Exception as jks_exc:
logger.debug(f"{data_provider.file_path}:{pw_probe}:{jks_exc}")
Expand Down
22 changes: 9 additions & 13 deletions credsweeper/deep_scanner/pdf_scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,35 +27,31 @@ def data_scan(
# pdfminer.six - splits text in table to many lines. Allows to walk through elements
try:
candidates = []
pdf_lines = []
for page in extract_pages(io.BytesIO(data_provider.data), laparams=LAParams()):
for element in page:
if isinstance(element, LTText):
element_text = element.get_text().strip()
if element_text:
element_candidates = []
if 0 < depth and element_text:
if MIN_DATA_LEN < len(element_text):
pdf_content_provider = DataContentProvider(
data=element_text.encode(),
file_path=data_provider.file_path,
file_type=data_provider.file_type,
info=f"{data_provider.info}|PDF[{page.pageid}]")
info=f"{data_provider.info}|PDF:{page.pageid}")
new_limit = recursive_limit_size - len(pdf_content_provider.data)
element_candidates = self.recursive_scan(pdf_content_provider, depth, new_limit)
candidates.extend(element_candidates)
if not element_candidates:
# skip to decrease duplicates of candidates
pdf_lines.append(element_text)
else:
string_data_provider = StringContentProvider(lines=[element_text],
file_path=data_provider.file_path,
file_type=data_provider.file_type,
info=f"{data_provider.info}|PDF:{page.pageid}")
pdf_candidates = self.scanner.scan(string_data_provider)
candidates.extend(pdf_candidates)
elif isinstance(element, LTItem):
pass
else:
logger.error(f"Unsupported {element}")
string_data_provider = StringContentProvider(lines=pdf_lines,
file_path=data_provider.file_path,
file_type=data_provider.file_type,
info=f"{data_provider.info}|PDF[{page.pageid}]")
pdf_candidates = self.scanner.scan(string_data_provider)
candidates.extend(pdf_candidates)
return candidates
except Exception as pdf_exc:
logger.error(f"{data_provider.file_path}:{pdf_exc}")
Expand Down
4 changes: 2 additions & 2 deletions credsweeper/deep_scanner/pkcs12_scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,13 +31,13 @@ def data_scan(
self.config, #
data_provider.file_path, #
data_provider.file_type, #
f"{data_provider.info}:'{pw_probe.decode()}' - has keys PKCS12")
f"{data_provider.info}|PKCS12:'{pw_probe.decode()}' - has keys PKCS12")
else:
candidate = Candidate.get_dummy_candidate(
self.config, #
data_provider.file_path, #
data_provider.file_type, #
f"{data_provider.info}:'{pw_probe.decode()}' - default password PKCS12")
f"{data_provider.info}|PKCS12:'{pw_probe.decode()}' - default password PKCS12")
candidates.append(candidate)
except Exception as pkcs_exc:
logger.debug(f"{data_provider.file_path}:{pw_probe.decode()}:{pkcs_exc}")
Expand Down
2 changes: 1 addition & 1 deletion credsweeper/deep_scanner/pptx_scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def data_scan(
string_data_provider = StringContentProvider(lines=pptx_lines,
file_path=data_provider.file_path,
file_type=data_provider.file_type,
info=f"{data_provider.info}|PPTX[{n+1}]")
info=f"{data_provider.info}|PPTX:{n+1}")
pptx_candidates = self.scanner.scan(string_data_provider)
candidates.extend(pptx_candidates)
return candidates
Expand Down
2 changes: 1 addition & 1 deletion credsweeper/deep_scanner/tar_scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def data_scan(
tar_content_provider = DataContentProvider(data=f.read(),
file_path=data_provider.file_path,
file_type=Util.get_extension(tfi.name),
info=f"{data_provider.info}|TAR|{tfi.name}")
info=f"{data_provider.info}|TAR:{tfi.name}")
# Nevertheless, use extracted data size
new_limit = recursive_limit_size - len(tar_content_provider.data)
tar_candidates = self.recursive_scan(tar_content_provider, depth, new_limit)
Expand Down
27 changes: 20 additions & 7 deletions credsweeper/deep_scanner/xlsx_scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,11 @@
import pandas as pd

from credsweeper.credentials import Candidate
from credsweeper.credentials.augment_candidates import augment_candidates
from credsweeper.deep_scanner.abstract_scanner import AbstractScanner
from credsweeper.file_handler.data_content_provider import DataContentProvider
from credsweeper.file_handler.string_content_provider import StringContentProvider
from credsweeper.utils import Util

logger = logging.getLogger(__name__)

Expand All @@ -26,15 +28,26 @@ def data_scan(
candidates = []
book = pd.read_excel(io.BytesIO(data_provider.data), sheet_name=None, header=None)
for sheet_name, sheet_data in book.items():
sheet_info = f"{data_provider.info}|{sheet_name}"
# replace open xml carriage returns _x000D_ before line feed only
df = sheet_data.replace(to_replace="_x000D_\n", value='\n', regex=True).fillna('').astype(str)
sheet_lines = ['\t'.join(x) for x in df.values]
string_data_provider = StringContentProvider(lines=sheet_lines,
file_path=data_provider.file_path,
file_type=data_provider.file_type,
info=f"{data_provider.info}|xlsx:{sheet_name}")
sheet_candidates = self.scanner.scan(string_data_provider)
candidates.extend(sheet_candidates)
for row_pos, row in enumerate(df.values):
for col_pos, cell in enumerate(row):
cell_info = f"{sheet_info}:{Util.get_excel_column_name(col_pos)}{row_pos + 1}"
cell_provider = StringContentProvider(lines=cell.splitlines(),
file_path=data_provider.file_path,
file_type=data_provider.file_type,
info=cell_info)
cell_candidates = self.scanner.scan(cell_provider)
candidates.extend(cell_candidates)
row_line = '\t'.join(row)
row_provider = StringContentProvider(lines=[row_line],
file_path=data_provider.file_path,
file_type=data_provider.file_type,
info=f"{sheet_info}:R{row_pos + 1}")
row_candidates = self.scanner.scan(row_provider)
augment_candidates(candidates, row_candidates)

return candidates
except Exception as xlsx_exc:
logger.error(f"{data_provider.file_path}:{xlsx_exc}")
Expand Down
Loading

0 comments on commit 06c54e7

Please sign in to comment.