diff --git a/credsweeper/deep_scanner/deep_scanner.py b/credsweeper/deep_scanner/deep_scanner.py
index 808ea76dd..dbc006d7c 100644
--- a/credsweeper/deep_scanner/deep_scanner.py
+++ b/credsweeper/deep_scanner/deep_scanner.py
@@ -102,16 +102,15 @@ def get_deep_scanners(data: bytes, file_type: str) -> List[Any]:
elif file_type in [".eml", ".mht"]:
if Util.is_eml(data):
deep_scanners.append(EmlScanner)
- elif Util.is_html(data):
+ elif Util.is_xml(data) and Util.is_html(data):
deep_scanners.append(HtmlScanner)
else:
deep_scanners.append(ByteScanner)
- elif Util.is_html(data):
- deep_scanners.append(HtmlScanner)
- deep_scanners.append(XmlScanner)
- elif Util.is_mxfile(data):
- deep_scanners.append(MxfileScanner)
elif Util.is_xml(data):
+ if Util.is_html(data):
+ deep_scanners.append(HtmlScanner)
+ elif Util.is_mxfile(data):
+ deep_scanners.append(MxfileScanner)
deep_scanners.append(XmlScanner)
else:
deep_scanners = [EncoderScanner, LangScanner, ByteScanner]
diff --git a/credsweeper/utils/util.py b/credsweeper/utils/util.py
index 4f9f49436..de249edfe 100644
--- a/credsweeper/utils/util.py
+++ b/credsweeper/utils/util.py
@@ -459,29 +459,38 @@ def is_elf(data: Union[bytes, bytearray]) -> bool:
@staticmethod
def is_html(data: Union[bytes, bytearray]) -> bool:
- """Used to detect html format of eml"""
+ """Detect HTML format. Assumes is_xml() has already returned True for the data."""
if isinstance(data, (bytes, bytearray)):
- if 0 <= data.find(b"" in data:
- return True
+ for opening_tag, closing_tag in [(b"", b""), (b"
"), (b"", b"
"),
+ (b"", b""), (b"", b"
"), (b"", b""),
+ (b"", b"
"), (b""), (b"", b" | "),
+ (b"", b"
"), (b"", b" | ")]:
+ opening_pos = data.find(opening_tag, 0, MAX_LINE_LENGTH)
+ if 0 <= opening_pos < data.find(closing_tag, opening_pos):
+ # both the opening and the closing tag were found - assume the data is HTML
+ return True
return False
@staticmethod
def is_mxfile(data: Union[bytes, bytearray]) -> bool:
- """Used to detect mxfile format"""
+ """Detect mxfile (drawio) format. Assumes is_xml() has already returned True for the data."""
if isinstance(data, (bytes, bytearray)):
- if 0 <= data.find(b"" in data:
+ mxfile_tag_pos = data.find(b"", mxfile_tag_pos):
return True
return False
+ # Well-formed XML must start with '<', optionally preceded by whitespace
+ XML_FIRST_TAG_PATTERN = re.compile(rb"^\s*<")
XML_CLOSE_TAG_PATTERN = re.compile(rb"[0-9A-Za-z_]{1,80}>")
@staticmethod
def is_xml(data: Union[bytes, bytearray]) -> bool:
- """Used to detect xml format"""
+ """Detect XML in raw bytes: '<' after optional whitespace, then XML_CLOSE_TAG_PATTERN, within MAX_LINE_LENGTH."""
if isinstance(data, (bytes, bytearray)):
- start = data.find(b'<', 0, CHUNK_SIZE)
- if 0 <= start and 0 <= data.find(b'>', start + 1, CHUNK_SIZE):
- return bool(re.search(Util.XML_CLOSE_TAG_PATTERN, data))
+ if first_bracket_match := Util.XML_FIRST_TAG_PATTERN.search(data, 0, MAX_LINE_LENGTH):
+ if Util.XML_CLOSE_TAG_PATTERN.search(data, first_bracket_match.end(), MAX_LINE_LENGTH):  # resume just past '<'
+ return True
return False
@staticmethod
diff --git a/tests/utils/test_util.py b/tests/utils/test_util.py
index f7dc4d5bb..7cba01864 100644
--- a/tests/utils/test_util.py
+++ b/tests/utils/test_util.py
@@ -616,3 +616,28 @@ def test_subtext_p(self):
self.assertEqual("the lazy dog", Util.subtext(AZ_STRING, len(AZ_STRING) - 2, 6))
self.assertEqual(AZ_STRING[:39], Util.subtext(AZ_STRING, 15, 20))
self.assertEqual(AZ_STRING[-40:], Util.subtext(AZ_STRING, 33, 20))
+
+ def test_is_xml_n(self):
+ self.assertFalse(Util.is_xml(b''))
+ self.assertFalse(Util.is_xml(b'!<>'))
+ self.assertFalse(Util.is_xml(b'
'))
+ self.assertFalse(Util.is_xml(bytearray(b'\n' * MAX_LINE_LENGTH) + bytearray(b' far far away')))
+
+ def test_is_html_n(self):
+ self.assertFalse(Util.is_html(b""))
+
+ def test_is_mxfile_n(self):
+ self.assertFalse(Util.is_mxfile(b""))
+ self.assertFalse(Util.is_mxfile(b""))
+
+ def test_xml_n(self):
+ self.assertFalse(Util.is_xml(None))  # each detector must reject non-bytes input
+ self.assertFalse(Util.is_xml(''))  # str is not bytes
+ self.assertFalse(Util.is_html(None))
+ self.assertFalse(Util.is_mxfile(None))
+
+ def test_xml_p(self):
+ data = b""
+ self.assertTrue(Util.is_xml(data))
+ self.assertTrue(Util.is_html(data))
+ self.assertTrue(Util.is_mxfile(data))