From 51c8a8a22c052add1158da8fae1e5772ad990d3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C5=91rinc?= Date: Tue, 13 Feb 2024 12:01:44 +0100 Subject: [PATCH] Fix whitespace catastrophic backtracking --- tests/test_encoding.py | 2 +- tiktoken_ext/openai_public.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_encoding.py b/tests/test_encoding.py index 687dbdcc..0e02b47a 100644 --- a/tests/test_encoding.py +++ b/tests/test_encoding.py @@ -14,7 +14,7 @@ @pytest.mark.parametrize("make_enc", ENCODING_FACTORIES) def test_extremely_big_encoding(make_enc: Callable[[], tiktoken.Encoding]): enc = make_enc() - for c in ["^", "0", "a", "'s"]: # TODO " ", "\n" are still failing + for c in ["^", "0", "a", "'s", " ", "\n"]: print(f"Validating `{c}`") big_value = c * 10_000 diff --git a/tiktoken_ext/openai_public.py b/tiktoken_ext/openai_public.py index c7b41541..ce33973b 100644 --- a/tiktoken_ext/openai_public.py +++ b/tiktoken_ext/openai_public.py @@ -9,7 +9,7 @@ # The pattern in the original GPT-2 release is: # r"""'s|'t|'re|'ve|'m|'ll|'d| ?[\p{L}]+| ?[\p{N}]+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""" # This is equivalent, but executes faster: -_legacy_splitter_regex = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}++| ?\p{N}++| ?[^\s\p{L}\p{N}]++|\s+(?!\S)|\s++""" +_legacy_splitter_regex = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}++| ?\p{N}++| ?[^\s\p{L}\p{N}]++|\s++$|\s+(?!\S)|\s""" def gpt2(): @@ -84,7 +84,7 @@ def cl100k_base(): } return { "name": "cl100k_base", - "pat_str": r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}++|\p{N}{1,3}+| ?[^\s\p{L}\p{N}]++[\r\n]*+|\s*[\r\n]|\s+(?!\S)|\s++""", + "pat_str": r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}++|\p{N}{1,3}+| ?[^\s\p{L}\p{N}]++[\r\n]*+|\s++$|\s*[\r\n]|\s+(?!\S)|\s""", "mergeable_ranks": mergeable_ranks, "special_tokens": special_tokens, }