diff --git a/credsweeper/utils/util.py b/credsweeper/utils/util.py index f86ffa6c9..1e45fe3fc 100644 --- a/credsweeper/utils/util.py +++ b/credsweeper/utils/util.py @@ -481,14 +481,14 @@ def is_mxfile(data: Union[bytes, bytearray]) -> bool: return False # A well-formed XML must start from < or a whitespace character - XML_FIRST_TAG_PATTERN = re.compile(rb"^\s*<([0-9A-Za-z_]{1,256})") - XML_CLOSE_TAG_PATTERN = re.compile(rb"") + XML_FIRST_BRACKET_PATTERN = re.compile(rb"^\s*<") + XML_OPENING_TAG_PATTERN = re.compile(rb"<([0-9A-Za-z_]{1,256})") @staticmethod def is_xml(data: Union[bytes, bytearray]) -> bool: """Used to detect xml format from raw bytes""" - if isinstance(data, (bytes, bytearray)): - if first_bracket_match := Util.XML_FIRST_TAG_PATTERN.search(data, 0, MAX_LINE_LENGTH): + if isinstance(data, (bytes, bytearray)) and Util.XML_FIRST_BRACKET_PATTERN.search(data, 0, MAX_LINE_LENGTH): + if first_bracket_match := Util.XML_OPENING_TAG_PATTERN.search(data, 0, MAX_LINE_LENGTH): start_pos = first_bracket_match.start() closing_tag = b"" if start_pos < data.find(closing_tag, start_pos): diff --git a/tests/utils/test_util.py b/tests/utils/test_util.py index 4aa919622..a4af145f3 100644 --- a/tests/utils/test_util.py +++ b/tests/utils/test_util.py @@ -619,10 +619,13 @@ def test_subtext_p(self): def test_is_xml_n(self): self.assertFalse(Util.is_xml(b'')) - self.assertFalse(Util.is_xml(b'!<>')) - self.assertFalse(Util.is_xml(b'
')) - self.assertFalse(Util.is_xml(bytearray(b'\n' * MAX_LINE_LENGTH) + bytearray(b' far far away'))) - self.assertFalse(Util.is_xml(b' unmatched tags ')) + self.assertFalse(Util.is_xml(b"!<>")) + self.assertFalse(Util.is_xml(b"")) + self.assertFalse(Util.is_xml(b"

")) + self.assertFalse(Util.is_xml(b"
")) + self.assertFalse(Util.is_xml(bytearray(b'\n' * MAX_LINE_LENGTH) + bytearray(b" far far away"))) + self.assertFalse(Util.is_xml(b" unmatched tags ")) + self.assertFalse(Util.is_xml(b"")) def test_is_html_n(self): self.assertFalse(Util.is_html(b"")) @@ -638,6 +641,7 @@ def test_xml_n(self): self.assertFalse(Util.is_html(None)) def test_xml_p(self): + self.assertTrue(Util.is_xml(b" matched tags ")) data = b"
" self.assertTrue(Util.is_xml(data)) self.assertTrue(Util.is_html(data))