Skip to content

Commit

Permalink
xml detect enhancement
Browse files Browse the repository at this point in the history
  • Loading branch information
babenek committed Jan 16, 2025
1 parent 17e3bd8 commit 3ae27a3
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 15 deletions.
11 changes: 5 additions & 6 deletions credsweeper/deep_scanner/deep_scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,16 +102,15 @@ def get_deep_scanners(data: bytes, file_type: str) -> List[Any]:
elif file_type in [".eml", ".mht"]:
if Util.is_eml(data):
deep_scanners.append(EmlScanner)
elif Util.is_html(data):
elif Util.is_xml(data) and Util.is_html(data):
deep_scanners.append(HtmlScanner)
else:
deep_scanners.append(ByteScanner)
elif Util.is_html(data):
deep_scanners.append(HtmlScanner)
deep_scanners.append(XmlScanner)
elif Util.is_mxfile(data):
deep_scanners.append(MxfileScanner)
elif Util.is_xml(data):
if Util.is_html(data):
deep_scanners.append(HtmlScanner)
elif Util.is_mxfile(data):
deep_scanners.append(MxfileScanner)
deep_scanners.append(XmlScanner)
else:
deep_scanners = [EncoderScanner, LangScanner, ByteScanner]
Expand Down
27 changes: 18 additions & 9 deletions credsweeper/utils/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -459,29 +459,38 @@ def is_elf(data: Union[bytes, bytearray]) -> bool:

@staticmethod
def is_html(data: Union[bytes, bytearray]) -> bool:
"""Used to detect html format of eml"""
"""Used to detect html format. Suppose, invocation of is_xml() was True before."""
if isinstance(data, (bytes, bytearray)):
if 0 <= data.find(b"<html", 0, CHUNK_SIZE) and b"</html>" in data:
return True
for opening_tag, closing_tag in [(b"<html>", b"</html>"), (b"<table", b"</table>"), (b"<p>", b"</p>"),
(b"<span>", b"</span>"), (b"<div>", b"</div>"), (b"<li>", b"</li>"),
(b"<ol>", b"</ol>"), (b"<ul>", b"</ul>"), (b"<th>", b"</th>"),
(b"<tr>", b"</tr>"), (b"<td>", b"</td>")]:
opening_pos = data.find(opening_tag, 0, MAX_LINE_LENGTH)
if 0 <= opening_pos < data.find(closing_tag, opening_pos):
# opening and closing tags were found - suppose it is an HTML
return True
return False

@staticmethod
def is_mxfile(data: Union[bytes, bytearray]) -> bool:
"""Used to detect mxfile format"""
"""Used to detect mxfile (drawio) format. Suppose, invocation of is_xml() was True before."""
if isinstance(data, (bytes, bytearray)):
if 0 <= data.find(b"<mxfile", 0, CHUNK_SIZE) and b"</mxfile>" in data:
mxfile_tag_pos = data.find(b"<mxfile", 0, MAX_LINE_LENGTH)
if 0 <= mxfile_tag_pos < data.find(b"</mxfile>", mxfile_tag_pos):
return True
return False

# A well-formed XML must start from < or a whitespace character
XML_FIRST_TAG_PATTERN = re.compile(rb"^\s*<")
XML_CLOSE_TAG_PATTERN = re.compile(rb"</[0-9A-Za-z_]{1,80}>")

@staticmethod
def is_xml(data: Union[bytes, bytearray]) -> bool:
"""Used to detect xml format"""
"""Used to detect xml format from raw bytes"""
if isinstance(data, (bytes, bytearray)):
start = data.find(b'<', 0, CHUNK_SIZE)
if 0 <= start and 0 <= data.find(b'>', start + 1, CHUNK_SIZE):
return bool(re.search(Util.XML_CLOSE_TAG_PATTERN, data))
if first_bracket_match := Util.XML_FIRST_TAG_PATTERN.search(data, 0, MAX_LINE_LENGTH):
if Util.XML_CLOSE_TAG_PATTERN.search(data, first_bracket_match.start() + 1, MAX_LINE_LENGTH):
return True
return False

@staticmethod
Expand Down
25 changes: 25 additions & 0 deletions tests/utils/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -616,3 +616,28 @@ def test_subtext_p(self):
self.assertEqual("the lazy dog", Util.subtext(AZ_STRING, len(AZ_STRING) - 2, 6))
self.assertEqual(AZ_STRING[:39], Util.subtext(AZ_STRING, 15, 20))
self.assertEqual(AZ_STRING[-40:], Util.subtext(AZ_STRING, 33, 20))

def test_is_xml_n(self):
    """is_xml() must reject data that does not look like XML"""
    # the markup is preceded by MAX_LINE_LENGTH line feeds
    far_away_xml = bytearray(b'\n' * MAX_LINE_LENGTH) + bytearray(b' <xml>far far away</xml>')
    for not_xml in (b'', b'!<>', b'<br />', far_away_xml):
        self.assertFalse(Util.is_xml(not_xml))

def test_is_html_n(self):
    """A closing tag placed before the opening one must not be detected as HTML"""
    reversed_tags = b"</html><html>"
    self.assertFalse(Util.is_html(reversed_tags))

def test_is_mxfile_n(self):
    """An unclosed mxfile tag and a reversed tag order must not be detected as mxfile"""
    for not_mxfile in (b"<mxfile>", b"</mxfile><mxfile>"):
        self.assertFalse(Util.is_mxfile(not_mxfile))

def test_xml_n(self):
    """Arguments that are neither bytes nor bytearray must be rejected by every detector"""
    self.assertFalse(Util.is_xml(None))
    self.assertFalse(Util.is_xml(''))
    self.assertFalse(Util.is_html(None))
    # is_mxfile() has the same type guard as the other detectors and is checked here as well
    self.assertFalse(Util.is_mxfile(None))

def test_xml_p(self):
    """A single sample with nested mxfile and table tags must satisfy all three detectors"""
    data = b"<mxfile atr=0><table></table></mxfile>"
    for detector in (Util.is_xml, Util.is_html, Util.is_mxfile):
        self.assertTrue(detector(data))

0 comments on commit 3ae27a3

Please sign in to comment.