Skip to content

Commit

Permalink
Fix #424: ckp
Browse files: browse the repository at this point in the history
  • Loading branch information
git-user committed Jan 29, 2024
1 parent a3b3035 commit 278c35b
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 16 deletions.
29 changes: 14 additions & 15 deletions pykern/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,7 @@

_DEFAULT_ROOT = "run"
_DEV_ONLY_FILES = ("setup.py", "pyproject.toml")
_STRANGE_CONTROL_CODES = [
bytes((n,))
for n in range(0, 32)
if bytes((n,)) not in b"\x07\x08\t\n\x0b\x0c\r\x1b"
]
_VALID_ASCII_CONTROL_CODES = frozenset((0x7, 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0x1B))


def cfg_absolute_dir(value):
Expand Down Expand Up @@ -83,17 +79,18 @@ def is_pure_text(value, test_size=512):
bool: True if bytes_data is likely pure text, false if likely binary
"""

def _is_accepted_control_code_ratio(value):
def _is_accepted_control_code_ratio(text_value):
c = 0
for i in range(len(value)):
if bytes([value[i]]) in _STRANGE_CONTROL_CODES:
for char in text_value:
if ord(char) == 0:
return False
if ord(char) < 32 and ord(char) not in _VALID_ASCII_CONTROL_CODES:
c += 1
return (c / len(value)) < (1 / 3)
return (c / len(text_value)) < 0.33

def _try(chunk):
try:
chunk.decode("utf-8", "strict")
return b"\x00" not in value
return chunk.decode("utf-8", "strict")
except UnicodeDecodeError:
return False

Expand All @@ -105,13 +102,15 @@ def _valid_unicode(value, test_size):
# is truncated by test_size, we need to probe back
# a bit to find the end of the char.
for _ in range(4):
if _try(b):
return True
if d := _try(b):
return d
if len(b) <= 1:
return False
b = b[:-1]
return False

if _valid_unicode(value, test_size):
return _is_accepted_control_code_ratio(value)
if value == b"":
return True
if d := _valid_unicode(value, test_size):
return _is_accepted_control_code_ratio(d)
return False
5 changes: 4 additions & 1 deletion tests/util_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,15 @@ def test_is_pure_text():

a = "a".encode("utf-8")
for case in [
not util.is_pure_text(b"\0"),
not util.is_pure_text(
b"\0" + "Valid text to see that zero byte causes failure".encode("utf-8")
),
not util.is_pure_text(a + b"\xc2", 2),
not util.is_pure_text(
b"\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
),
not util.is_pure_text(b"\xd4\x16\xc0\xd6\xec\xbf\x92\xe6\x84T\xc9 \xe9\xbf"),
util.is_pure_text(b""),
util.is_pure_text(b"This is example text"),
util.is_pure_text(b"\x07\x08\t\n\x0b\x0c\r\x0e\x0f"),
util.is_pure_text(a + "¡".encode("utf-8"), 2),
Expand Down

0 comments on commit 278c35b

Please sign in to comment.