Skip to content

Commit

Permalink
performance improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
ekcrisp committed Feb 15, 2025
1 parent c49751b commit df2368d
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 49 deletions.
19 changes: 11 additions & 8 deletions stream2sentence/stream2sentence_time_based.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ def generate_sentences(
preferred_sentence_fragment_delimiters_global = set(preferred_sentence_fragment_delimiters)
sentence_fragment_delimiters_global = set(sentence_fragment_delimiters)
delimiter_ignore_prefixes_global = set(delimiter_ignore_prefixes)
punkt_sentence_tokenizer = PunktSentenceTokenizer()

start_time = time.time()
last_sentence_time = time.time()
Expand All @@ -148,7 +149,7 @@ def generate_sentences(


def handle_output(output, sentence_boundary_index=None):
nonlocal has_output_started, llm_buffer_full, output_sentences, min_output_lengths, start_time, token, last_sentence_time
nonlocal has_output_started, llm_buffer_full, output_sentences, min_output_lengths, start_time, last_sentence_time
if not has_output_started:
# Once output has started, the deadline is measured from the TTS start time.
start_time = time.time()
Expand All @@ -165,15 +166,14 @@ def handle_output(output, sentence_boundary_index=None):
for token in generator:
llm_buffer_full += token
llm_buffer_full = llm_buffer_full.lstrip()

split_buffer = llm_buffer_full.split()[:-1] #remove last word
if len(split_buffer) == 0:
if len(llm_buffer_full.split(None, 2)) < 2:
#must have at least two words since last token may not be a full word
continue
llm_buffer = ' '.join(split_buffer)

llm_buffer = llm_buffer_full.rsplit(" ", 1)[0] #remove last word
sentences_on_buffer = nltk.tokenize.sent_tokenize(llm_buffer)
sentence_boundaries = list(PunktSentenceTokenizer().span_tokenize(llm_buffer_full)) #handle white space discrepancies in full_buffer and buffer after split()
sentence_boundaries = list(punkt_sentence_tokenizer.span_tokenize(llm_buffer_full)) #handle white space discrepancies in full_buffer and buffer after split()

num_sentences_output = len(output_sentences)
min_output_length = get_index_or_last(min_output_lengths, num_sentences_output)
sentences_needed_for_min_len = get_sentences_needed_for_min_length(sentences_on_buffer, min_output_length)
Expand Down Expand Up @@ -203,7 +203,10 @@ def handle_output(output, sentence_boundary_index=None):
is_not_min_length = get_num_words(output) < min_output_length
max_wait_for_fragment = get_index_or_last(max_wait_for_fragments, num_sentences_output)
waiting_for_fragment = (time.time() - last_sentence_time < max_wait_for_fragment)
last_word_avoid_pause = output.split()[-1] in wait_for_if_non_fragment

_, last_word = output.rsplit(" ", 1)
last_word_avoid_pause = last_word in wait_for_if_non_fragment

if is_not_min_length or waiting_for_fragment or last_word_avoid_pause:
continue

Expand Down
83 changes: 42 additions & 41 deletions tests/test_time_based.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,37 +64,37 @@ def is_within_tolerance(num1, num2, tolerance):

def compare_results(result, expected_result):
for i in range(len(result)):
if expected_result[i][0] != result[i][0] or not is_within_tolerance(float(expected_result[i][1]), float(result[i][1]), 0.2):
if expected_result[i][0] != result[i][0] or not is_within_tolerance(float(expected_result[i][1]), float(result[i][1]), 0.25):
raise ValueError(f"RESULT MISMATCH - expected={expected_result[i]} - actual={result[i]}")


result_1 = run_test(input_stewart_wiki, 9, True)
expected_result_1 = [
['In 1996 Mr. Stewart hosted a short-lived talk show entitled,', '1.8'],
["Where's Elvis This Week?, which was a half-hour,", '3.0'],
['weekly comedy television program.', '5.4'],
['It aired on Sunday nights in the United Kingdom on BBC Two.', '7.0'],
['It was filmed at the CBS Broadcast Center in New York City and featured a set of panelists, two from the UK and two from the United States,', '9.5'],
['who discussed news items and cultural issues.', '12.8'],
['The show premiered in the UK on October 6, 1996; five episodes aired in total.', '16.3'],
['Notable panelists included Dave Chappelle, Eddie Izzard, Phill Jupitus, Nora Ephron, Craig Kilborn, Christopher Hitchens, Armando Iannucci, Norm Macdonald, and Helen Gurley Brown.', '19.5'],
["In 1997, Stewart was chosen as the host and interviewer for George Carlin's tenth HBO special, George Carlin: 40 Years of Comedy.", '24.0'],
['Stewart had a recurring role in The Larry Sanders Show, playing himself as an occasional substitute and possible successor to late-night talk show host Larry Sanders (played by Garry Shandling).', '25.1'],
["Stewart also headlined the 1997 White House Correspondents' dinner.", '25.1']
["Where's Elvis This Week?, which was a half-hour,", '2.8'],
['weekly comedy television program.', '5.2'],
['It aired on Sunday nights in the United Kingdom on BBC Two.', '6.8'],
['It was filmed at the CBS Broadcast Center in New York City and featured a set of panelists, two from the UK and two from the United States,', '9.3'],
['who discussed news items and cultural issues.', '12.6'],
['The show premiered in the UK on October 6, 1996; five episodes aired in total.', '16.1'],
['Notable panelists included Dave Chappelle, Eddie Izzard, Phill Jupitus, Nora Ephron, Craig Kilborn, Christopher Hitchens, Armando Iannucci, Norm Macdonald, and Helen Gurley Brown.', '19.3'],
["In 1997, Stewart was chosen as the host and interviewer for George Carlin's tenth HBO special, George Carlin: 40 Years of Comedy.", '23.8'],
['Stewart had a recurring role in The Larry Sanders Show, playing himself as an occasional substitute and possible successor to late-night talk show host Larry Sanders (played by Garry Shandling).', '24.9'],
["Stewart also headlined the 1997 White House Correspondents' dinner.", '24.9']
]
compare_results(result_1, expected_result_1)

result_2 = run_test(input_stewart_wiki, 5)
expected_result_2 = [
['In 1996 Mr. Stewart hosted a short-lived talk show entitled,', '3.2'],
["Where's Elvis This Week?, which was a half-hour,", '5.6'],
['weekly comedy television program.', '8.3'],
['It aired on Sunday nights in the United Kingdom on BBC Two.', '9.6'],
['It was filmed at the CBS Broadcast Center in New York City and featured a set', '13.6'],
['of panelists, two from the UK and two from the United States, who discussed news items and cultural issues.', '19.0'],
['The show premiered in the UK on October 6, 1996; five episodes aired in total.', '25.4'],
['Notable panelists included Dave Chappelle, Eddie Izzard, Phill Jupitus, Nora Ephron, Craig Kilborn, Christopher Hitchens, Armando Iannucci, Norm Macdonald, and Helen Gurley Brown.', '30.2'],
["In 1997, Stewart was chosen as the host and interviewer for George Carlin's tenth HBO special, George Carlin: 40 Years of Comedy.", '38.0'],
['In 1996 Mr. Stewart hosted a short-lived talk show entitled,', '3.0'],
["Where's Elvis This Week?, which was a half-hour,", '5.4'],
['weekly comedy television program.', '8.0'],
['It aired on Sunday nights in the United Kingdom on BBC Two.', '9.4'],
['It was filmed at the CBS Broadcast Center in New York City and featured a set', '13.4'],
['of panelists, two from the UK and two from the United States, who discussed news items and cultural issues.', '18.8'],
['The show premiered in the UK on October 6, 1996; five episodes aired in total.', '25.1'],
['Notable panelists included Dave Chappelle, Eddie Izzard, Phill Jupitus, Nora Ephron, Craig Kilborn, Christopher Hitchens, Armando Iannucci, Norm Macdonald, and Helen Gurley Brown.', '30.0'],
["In 1997, Stewart was chosen as the host and interviewer for George Carlin's tenth HBO special, George Carlin: 40 Years of Comedy.", '37.7'],
['Stewart had a recurring role in The Larry Sanders Show, playing himself as an occasional substitute and possible successor to late-night talk show host Larry Sanders (played by Garry Shandling).', '44.9'],
["Stewart also headlined the 1997 White House Correspondents' dinner.", '44.9']
]
Expand All @@ -103,31 +103,32 @@ def compare_results(result, expected_result):
result_3 = run_test(input_problematic, 9)
expected_result_3 = [
['First sentence is short.', '1.0'],
['Second sentence is very long,', '1.6'],
['Second sentence is very long,', '1.5'],
['and totally a run on,', '3.1'],
['and would definitely cause problems if this is what the output of the llm was and we only had a quick', '5.4'],
['yield value of one this needs to be broken up thanks.', '11.8'],
['Third sentence also very long that lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.', '15.5'],
['Fourth sentence also very long fourth sentence that Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.', '25.2'],
['Fifth sentence also long Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.', '25.2']
['and would definitely cause problems if this is what the output of the llm was and we only had a quick', '5.2'],
['yield value of one this needs to be broken up thanks.', '11.7'],
['Third sentence also very long that lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.', '15.3'],
['Fourth sentence also very long fourth sentence that Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.', '25.0'],
['Fifth sentence also long Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.', '25.0']
]
compare_results(result_3, expected_result_3)

result_4 = run_test(input_problematic, 5)
result_4 = run_test(input_problematic, 5, True)
expected_result_4 = [
['First sentence is short.', '1.6'],
['Second sentence is very long,', '2.9'],
['and totally a run on,', '4.3'],
['and would definitely cause problems if this is what the output', '6.9'],
['of the llm was and we only had a quick', '9.6'],
['yield value of one this needs to be broken up', '12.3'],
['thanks. Third sentence also very long that lorem ipsum dolor sit amet,', '15.8'],
['consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua ut enim', '19.8'],
['ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.', '25.1'],
['Fourth sentence also very long fourth sentence that Lorem ipsum dolor sit amet, consectetur adipiscing elit,', '29.9'],
['sed do eiusmod tempor incididunt ut labore et dolore magna aliqua ut enim ad minim veniam,', '35.3'],
['quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.', '40.6'],
['Fifth sentence also long Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur excepteur sint occaecat cupidatat non proident,', '44.7'],
['sunt in culpa qui officia deserunt mollit anim id est laborum.', '45.2']
['First sentence is short.', '1.3'],
['Second sentence is very long,', '2.7'],
['and totally a run on,', '4.0'],
['and would definitely cause problems if this is what the output', '6.7'],
['of the llm was and we only had a quick', '9.4'],
['yield value of one this needs to be broken', '11.8'],
['up thanks. Third sentence also very long that lorem', '14.2'],
['ipsum dolor sit amet, consectetur adipiscing elit,', '17.4'],
['sed do eiusmod tempor incididunt ut labore et dolore magna aliqua ut enim', '19.5'],
['ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.', '23.5'],
['Fourth sentence also very long fourth sentence that Lorem ipsum dolor sit amet, consectetur adipiscing elit,', '28.3'],
['sed do eiusmod tempor incididunt ut labore et dolore magna aliqua ut enim ad minim veniam,', '33.6'],
['quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.', '39.3'],
['Fifth sentence also long Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur excepteur sint occaecat cupidatat non', '42.0'],
['proident, sunt in culpa qui officia deserunt mollit anim id est laborum.', '45.2']
]
compare_results(result_4, expected_result_4)

0 comments on commit df2368d

Please sign in to comment.