Skip to content

Commit

Permalink
xml spec case
Browse files Browse the repository at this point in the history
  • Loading branch information
babenek committed Jan 16, 2025
1 parent 306190a commit d932db0
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 8 deletions.
8 changes: 4 additions & 4 deletions credsweeper/utils/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -481,14 +481,14 @@ def is_mxfile(data: Union[bytes, bytearray]) -> bool:
return False

# A well-formed XML document must start with "<", optionally preceded by whitespace
XML_FIRST_TAG_PATTERN = re.compile(rb"^\s*<([0-9A-Za-z_]{1,256})")
XML_CLOSE_TAG_PATTERN = re.compile(rb"</[0-9A-Za-z_]{1,256}>")
XML_FIRST_BRACKET_PATTERN = re.compile(rb"^\s*<")
XML_OPENING_TAG_PATTERN = re.compile(rb"<([0-9A-Za-z_]{1,256})")

@staticmethod
def is_xml(data: Union[bytes, bytearray]) -> bool:
"""Used to detect xml format from raw bytes"""
if isinstance(data, (bytes, bytearray)):
if first_bracket_match := Util.XML_FIRST_TAG_PATTERN.search(data, 0, MAX_LINE_LENGTH):
if isinstance(data, (bytes, bytearray)) and Util.XML_FIRST_BRACKET_PATTERN.search(data, 0, MAX_LINE_LENGTH):
if first_bracket_match := Util.XML_OPENING_TAG_PATTERN.search(data, 0, MAX_LINE_LENGTH):
start_pos = first_bracket_match.start()
closing_tag = b"</" + first_bracket_match.group(1) + b">"
if start_pos < data.find(closing_tag, start_pos):
Expand Down
12 changes: 8 additions & 4 deletions tests/utils/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -619,10 +619,13 @@ def test_subtext_p(self):

def test_is_xml_n(self):
self.assertFalse(Util.is_xml(b''))
self.assertFalse(Util.is_xml(b'!<>'))
self.assertFalse(Util.is_xml(b'<br />'))
self.assertFalse(Util.is_xml(bytearray(b'\n' * MAX_LINE_LENGTH) + bytearray(b' <xml>far far away</xml>')))
self.assertFalse(Util.is_xml(b'<html> unmatched tags </xml>'))
self.assertFalse(Util.is_xml(b"!<>"))
self.assertFalse(Util.is_xml(b"</onlyClosingTagIsFail>"))
self.assertFalse(Util.is_xml(b"</p><p>"))
self.assertFalse(Util.is_xml(b"<br />"))
self.assertFalse(Util.is_xml(bytearray(b'\n' * MAX_LINE_LENGTH) + bytearray(b" <xml>far far away</xml>")))
self.assertFalse(Util.is_xml(b"<html> unmatched tags </xml>"))
self.assertFalse(Util.is_xml(b"<?xml version='1.0' encoding='utf-8'?>"))

def test_is_html_n(self):
self.assertFalse(Util.is_html(b"</html><html>"))
Expand All @@ -638,6 +641,7 @@ def test_xml_n(self):
self.assertFalse(Util.is_html(None))

def test_xml_p(self):
self.assertTrue(Util.is_xml(b"<?xml version='1.0' encoding='utf-8'?><xml> matched tags </xml>"))
data = b"<mxfile atr=0><table></table></mxfile>"
self.assertTrue(Util.is_xml(data))
self.assertTrue(Util.is_html(data))
Expand Down

0 comments on commit d932db0

Please sign in to comment.