performance improvements

KoljaB · Feb 15, 2025 · df2368d · df2368d
1 parent c49751b
commit df2368d
Show file tree

Hide file tree

Showing 2 changed files with 53 additions and 49 deletions.
diff --git a/stream2sentence/stream2sentence_time_based.py b/stream2sentence/stream2sentence_time_based.py
@@ -138,6 +138,7 @@ def generate_sentences(
     preferred_sentence_fragment_delimiters_global = set(preferred_sentence_fragment_delimiters)
     sentence_fragment_delimiters_global = set(sentence_fragment_delimiters)
     delimiter_ignore_prefixes_global = set(delimiter_ignore_prefixes)
+    punkt_sentence_tokenizer = PunktSentenceTokenizer()
 
     start_time = time.time()
     last_sentence_time = time.time()
@@ -148,7 +149,7 @@ def generate_sentences(
 
 
     def handle_output(output, sentence_boundary_index=None):
-        nonlocal has_output_started, llm_buffer_full, output_sentences, min_output_lengths, start_time, token, last_sentence_time
+        nonlocal has_output_started, llm_buffer_full, output_sentences, min_output_lengths, start_time, last_sentence_time
         if not has_output_started:
             #once output has started we go based on TTS start for deadline
             start_time = time.time()
@@ -165,15 +166,14 @@ def handle_output(output, sentence_boundary_index=None):
     for token in generator:
         llm_buffer_full += token
         llm_buffer_full = llm_buffer_full.lstrip()
-
-        split_buffer = llm_buffer_full.split()[:-1] #remove last word
-        if len(split_buffer) == 0:
+        if len(llm_buffer_full.split(None, 2)) < 2:
             #must have at least two words since last token may not be a full word
             continue
-        llm_buffer = ' '.join(split_buffer)
+
+        llm_buffer = llm_buffer_full.rsplit(" ", 1)[0] #remove last word
         sentences_on_buffer = nltk.tokenize.sent_tokenize(llm_buffer)
-        sentence_boundaries = list(PunktSentenceTokenizer().span_tokenize(llm_buffer_full)) #handle white space descrepancies in full_buffer and buffer after split()
-        
+        sentence_boundaries = list(punkt_sentence_tokenizer.span_tokenize(llm_buffer_full)) #handle whitespace discrepancies in full_buffer and buffer after split()
+
         num_sentences_output = len(output_sentences)
         min_output_length = get_index_or_last(min_output_lengths, num_sentences_output)
         sentences_needed_for_min_len = get_sentences_needed_for_min_length(sentences_on_buffer, min_output_length)
@@ -203,7 +203,10 @@ def handle_output(output, sentence_boundary_index=None):
                 is_not_min_length = get_num_words(output) < min_output_length
                 max_wait_for_fragment = get_index_or_last(max_wait_for_fragments, num_sentences_output)
                 waiting_for_fragment = (time.time() - last_sentence_time < max_wait_for_fragment)
-                last_word_avoid_pause = output.split()[-1] in wait_for_if_non_fragment
+
+                _, last_word = output.rsplit(" ", 1)
+                last_word_avoid_pause = last_word in wait_for_if_non_fragment
+
                 if is_not_min_length or waiting_for_fragment or last_word_avoid_pause:
                     continue
 

diff --git a/tests/test_time_based.py b/tests/test_time_based.py
@@ -64,37 +64,37 @@ def is_within_tolerance(num1, num2, tolerance):
 
 def compare_results(result, expected_result):
     for i in range(len(result)):
-        if expected_result[i][0] != result[i][0] or not is_within_tolerance(float(expected_result[i][1]), float(result[i][1]), 0.2):
+        if expected_result[i][0] != result[i][0] or not is_within_tolerance(float(expected_result[i][1]), float(result[i][1]), 0.25):
             raise ValueError(f"RESULT MISMATCH - expected={expected_result[i]} - actual={result[i]}")
 
 
 result_1 = run_test(input_stewart_wiki, 9, True)
 expected_result_1 = [
     ['In 1996 Mr. Stewart hosted a short-lived talk show entitled,', '1.8'],
-    ["Where's Elvis This Week?, which was a half-hour,", '3.0'],
-    ['weekly comedy television program.', '5.4'],
-    ['It aired on Sunday nights in the United Kingdom on BBC Two.', '7.0'],
-    ['It was filmed at the CBS Broadcast Center in New York City and featured a set of panelists, two from the UK and two from the United States,', '9.5'],
-    ['who discussed news items and cultural issues.', '12.8'],
-    ['The show premiered in the UK on October 6, 1996; five episodes aired in total.', '16.3'],
-    ['Notable panelists included Dave Chappelle, Eddie Izzard, Phill Jupitus, Nora Ephron, Craig Kilborn, Christopher Hitchens, Armando Iannucci, Norm Macdonald, and Helen Gurley Brown.', '19.5'],
-    ["In 1997, Stewart was chosen as the host and interviewer for George Carlin's tenth HBO special, George Carlin: 40 Years of Comedy.", '24.0'],
-    ['Stewart had a recurring role in The Larry Sanders Show, playing himself as an occasional substitute and possible successor to late-night talk show host Larry Sanders (played by Garry Shandling).', '25.1'],
-    ["Stewart also headlined the 1997 White House Correspondents' dinner.", '25.1']
+    ["Where's Elvis This Week?, which was a half-hour,", '2.8'],
+    ['weekly comedy television program.', '5.2'],
+    ['It aired on Sunday nights in the United Kingdom on BBC Two.', '6.8'],
+    ['It was filmed at the CBS Broadcast Center in New York City and featured a set of panelists, two from the UK and two from the United States,', '9.3'],
+    ['who discussed news items and cultural issues.', '12.6'],
+    ['The show premiered in the UK on October 6, 1996; five episodes aired in total.', '16.1'],
+    ['Notable panelists included Dave Chappelle, Eddie Izzard, Phill Jupitus, Nora Ephron, Craig Kilborn, Christopher Hitchens, Armando Iannucci, Norm Macdonald, and Helen Gurley Brown.', '19.3'],
+    ["In 1997, Stewart was chosen as the host and interviewer for George Carlin's tenth HBO special, George Carlin: 40 Years of Comedy.", '23.8'],
+    ['Stewart had a recurring role in The Larry Sanders Show, playing himself as an occasional substitute and possible successor to late-night talk show host Larry Sanders (played by Garry Shandling).', '24.9'],
+    ["Stewart also headlined the 1997 White House Correspondents' dinner.", '24.9']
 ]
 compare_results(result_1, expected_result_1)
 
 result_2 = run_test(input_stewart_wiki, 5)
 expected_result_2 = [
-    ['In 1996 Mr. Stewart hosted a short-lived talk show entitled,', '3.2'],
-    ["Where's Elvis This Week?, which was a half-hour,", '5.6'],
-    ['weekly comedy television program.', '8.3'],
-    ['It aired on Sunday nights in the United Kingdom on BBC Two.', '9.6'],
-    ['It was filmed at the CBS Broadcast Center in New York City and featured a set', '13.6'],
-    ['of panelists, two from the UK and two from the United States, who discussed news items and cultural issues.', '19.0'],
-    ['The show premiered in the UK on October 6, 1996; five episodes aired in total.', '25.4'],
-    ['Notable panelists included Dave Chappelle, Eddie Izzard, Phill Jupitus, Nora Ephron, Craig Kilborn, Christopher Hitchens, Armando Iannucci, Norm Macdonald, and Helen Gurley Brown.', '30.2'],
-    ["In 1997, Stewart was chosen as the host and interviewer for George Carlin's tenth HBO special, George Carlin: 40 Years of Comedy.", '38.0'],
+    ['In 1996 Mr. Stewart hosted a short-lived talk show entitled,', '3.0'],
+    ["Where's Elvis This Week?, which was a half-hour,", '5.4'],
+    ['weekly comedy television program.', '8.0'],
+    ['It aired on Sunday nights in the United Kingdom on BBC Two.', '9.4'],
+    ['It was filmed at the CBS Broadcast Center in New York City and featured a set', '13.4'],
+    ['of panelists, two from the UK and two from the United States, who discussed news items and cultural issues.', '18.8'],
+    ['The show premiered in the UK on October 6, 1996; five episodes aired in total.', '25.1'],
+    ['Notable panelists included Dave Chappelle, Eddie Izzard, Phill Jupitus, Nora Ephron, Craig Kilborn, Christopher Hitchens, Armando Iannucci, Norm Macdonald, and Helen Gurley Brown.', '30.0'],
+    ["In 1997, Stewart was chosen as the host and interviewer for George Carlin's tenth HBO special, George Carlin: 40 Years of Comedy.", '37.7'],
     ['Stewart had a recurring role in The Larry Sanders Show, playing himself as an occasional substitute and possible successor to late-night talk show host Larry Sanders (played by Garry Shandling).', '44.9'],
     ["Stewart also headlined the 1997 White House Correspondents' dinner.", '44.9']
 ]
@@ -103,31 +103,32 @@ def compare_results(result, expected_result):
 result_3 = run_test(input_problematic, 9)
 expected_result_3 = [
     ['First sentence is short.', '1.0'],
-    ['Second sentence is very long,', '1.6'],
+    ['Second sentence is very long,', '1.5'],
     ['and totally a run on,', '3.1'],
-    ['and would definitely cause problems if this is what the output of the llm was and we only had a quick', '5.4'],
-    ['yield value of one this needs to be broken up thanks.', '11.8'],
-    ['Third sentence also very long that lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.', '15.5'],
-    ['Fourth sentence also very long fourth sentence that Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.', '25.2'],
-    ['Fifth sentence also long Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.', '25.2']
+    ['and would definitely cause problems if this is what the output of the llm was and we only had a quick', '5.2'],
+    ['yield value of one this needs to be broken up thanks.', '11.7'],
+    ['Third sentence also very long that lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.', '15.3'],
+    ['Fourth sentence also very long fourth sentence that Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.', '25.0'],
+    ['Fifth sentence also long Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.', '25.0']
 ]
 compare_results(result_3, expected_result_3)
 
-result_4 = run_test(input_problematic, 5)
+result_4 = run_test(input_problematic, 5, True)
 expected_result_4 = [
-    ['First sentence is short.', '1.6'],
-    ['Second sentence is very long,', '2.9'],
-    ['and totally a run on,', '4.3'],
-    ['and would definitely cause problems if this is what the output', '6.9'],
-    ['of the llm was and we only had a quick', '9.6'],
-    ['yield value of one this needs to be broken up', '12.3'],
-    ['thanks. Third sentence also very long that lorem ipsum dolor sit amet,', '15.8'],
-    ['consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua ut enim', '19.8'],
-    ['ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.', '25.1'],
-    ['Fourth sentence also very long fourth sentence that Lorem ipsum dolor sit amet, consectetur adipiscing elit,', '29.9'],
-    ['sed do eiusmod tempor incididunt ut labore et dolore magna aliqua ut enim ad minim veniam,', '35.3'],
-    ['quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.', '40.6'],
-    ['Fifth sentence also long Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur excepteur sint occaecat cupidatat non proident,', '44.7'],
-    ['sunt in culpa qui officia deserunt mollit anim id est laborum.', '45.2']
+    ['First sentence is short.', '1.3'],
+    ['Second sentence is very long,', '2.7'],
+    ['and totally a run on,', '4.0'],
+    ['and would definitely cause problems if this is what the output', '6.7'],
+    ['of the llm was and we only had a quick', '9.4'],
+    ['yield value of one this needs to be broken', '11.8'],
+    ['up thanks. Third sentence also very long that lorem', '14.2'],
+    ['ipsum dolor sit amet, consectetur adipiscing elit,', '17.4'],
+    ['sed do eiusmod tempor incididunt ut labore et dolore magna aliqua ut enim', '19.5'],
+    ['ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.', '23.5'],
+    ['Fourth sentence also very long fourth sentence that Lorem ipsum dolor sit amet, consectetur adipiscing elit,', '28.3'],
+    ['sed do eiusmod tempor incididunt ut labore et dolore magna aliqua ut enim ad minim veniam,', '33.6'],
+    ['quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.', '39.3'],
+    ['Fifth sentence also long Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur excepteur sint occaecat cupidatat non', '42.0'],
+    ['proident, sunt in culpa qui officia deserunt mollit anim id est laborum.', '45.2']
 ]
 compare_results(result_4, expected_result_4)