diff --git a/worker/tests/data/test_strict_sentence_paragraphs-doest_combine_long_para.json b/worker/tests/data/test_strict_sentence_paragraphs-doest_combine_long_para.json new file mode 100644 index 00000000..d7742749 --- /dev/null +++ b/worker/tests/data/test_strict_sentence_paragraphs-doest_combine_long_para.json @@ -0,0 +1,90 @@ +{ + "input": [ + { + "type": "paragraph", + "speaker": null, + "lang": "de", + "children": [ + { + "text": "Willkommen ", + "start": 0.0, + "end": 0.82, + "conf": 0.4493102431297302, + "conf_ts": 0.0 + }, + { + "text": "zum ", + "start": 0.82, + "end": 30.07, + "conf": 0.9744400978088379, + "conf_ts": 0.005903157405555248 + } + ] + }, + { + "type": "paragraph", + "speaker": null, + "lang": "de", + "children": [ + { + "text": "letzten ", + "start": 30.07, + "end": 31.65, + "conf": 0.9838394522666931, + "conf_ts": 0.01149927917867899 + }, + { + "text": "Token.", + "start": 31.65, + "end": 32.06, + "conf": 0.9566531777381897, + "conf_ts": 0.0096774036064744 + } + ] + } + ], + "expected": [ + { + "type": "paragraph", + "speaker": null, + "lang": "de", + "children": [ + { + "text": "Willkommen ", + "start": 0.0, + "end": 0.82, + "conf": 0.4493102431297302, + "conf_ts": 0.0 + }, + { + "text": "zum ", + "start": 0.82, + "end": 30.07, + "conf": 0.9744400978088379, + "conf_ts": 0.005903157405555248 + } + ] + }, + { + "type": "paragraph", + "speaker": null, + "lang": "de", + "children": [ + { + "text": "letzten ", + "start": 30.07, + "end": 31.65, + "conf": 0.9838394522666931, + "conf_ts": 0.01149927917867899 + }, + { + "text": "Token.", + "start": 31.65, + "end": 32.06, + "conf": 0.9566531777381897, + "conf_ts": 0.0096774036064744 + } + ] + } + ] +} diff --git a/worker/tests/data/test_strict_sentence_paragraphs-leaves_special_paras.json b/worker/tests/data/test_strict_sentence_paragraphs-leaves_special_paras.json new file mode 100644 index 00000000..169b2b98 --- /dev/null +++ b/worker/tests/data/test_strict_sentence_paragraphs-leaves_special_paras.json @@ -0,0 +1,167 @@ +{ + "input": [ + { + "type": "paragraph", + "speaker": null, + "lang": "de", + "children": [ + { + "text": "*", + "start": 0.0, + "end": 0.82, + "conf": 0.4493102431297302, + "conf_ts": 0.0 + }, + { + "text": "Klirren", + "start": 0.82, + "end": 1.0, + "conf": 0.9744400978088379, + "conf_ts": 0.005903157405555248 + }, + { + "text": "*", + "start": 1.0, + "end": 1.07, + "conf": 0.9744400978088379, + "conf_ts": 0.005903157405555248 + } + ] + }, + { + "type": "paragraph", + "speaker": null, + "lang": "de", + "children": [ + { + "text": "Ich ", + "start": 1.07, + "end": 1.65, + "conf": 0.9838394522666931, + "conf_ts": 0.01149927917867899 + }, + { + "text": "will ", + "start": 1.65, + "end": 2.0, + "conf": 0.9566531777381897, + "conf_ts": 0.0096774036064744 + }, + { + "text": "Infos, ", + "start": 2.0, + "end": 2.07, + "conf": 0.9566531777381897, + "conf_ts": 0.0096774036064744 + } + ] + }, + { + "type": "paragraph", + "speaker": null, + "lang": "de", + "children": [ + { + "text": "Ich ", + "start": 2.07, + "end": 2.65, + "conf": 0.9838394522666931, + "conf_ts": 0.01149927917867899 + }, + { + "text": "will ", + "start": 2.65, + "end": 3.06, + "conf": 0.9566531777381897, + "conf_ts": 0.0096774036064744 + }, + { + "text": "Fakten, ", + "start": 3.06, + "end": 3.09, + "conf": 0.9566531777381897, + "conf_ts": 0.0096774036064744 + } + ] + } + ], + "expected": [ + { + "type": "paragraph", + "speaker": null, + "lang": "de", + "children": [ + { + "text": "*", + "start": 0.0, + "end": 0.82, + "conf": 0.4493102431297302, + "conf_ts": 0.0 + }, + { + "text": "Klirren", + "start": 0.82, + "end": 1.0, + "conf": 0.9744400978088379, + "conf_ts": 0.005903157405555248 + }, + { + "text": "*", + "start": 1.0, + "end": 1.07, + "conf": 0.9744400978088379, + "conf_ts": 0.005903157405555248 + } + ] + }, + { + "type": "paragraph", + "speaker": null, + "lang": "de", + "children": [ + { + "text": "Ich ", + "start": 1.07, + "end": 1.65, + "conf": 0.9838394522666931, + "conf_ts": 0.01149927917867899 + }, + { + "text": "will ", + "start": 1.65, + "end": 2.0, + "conf": 0.9566531777381897, + "conf_ts": 0.0096774036064744 + }, + { + "text": "Infos, ", + "start": 2.0, + "end": 2.07, + "conf": 0.9566531777381897, + "conf_ts": 0.0096774036064744 + }, + { + "text": "Ich ", + "start": 2.07, + "end": 2.65, + "conf": 0.9838394522666931, + "conf_ts": 0.01149927917867899 + }, + { + "text": "will ", + "start": 2.65, + "end": 3.06, + "conf": 0.9566531777381897, + "conf_ts": 0.0096774036064744 + }, + { + "text": "Fakten, ", + "start": 3.06, + "end": 3.09, + "conf": 0.9566531777381897, + "conf_ts": 0.0096774036064744 + } + ] + } + ] +} diff --git a/worker/tests/test_transcribe.py b/worker/tests/test_transcribe.py index 3489fb25..fa6faab0 100644 --- a/worker/tests/test_transcribe.py +++ b/worker/tests/test_transcribe.py @@ -67,6 +67,7 @@ def test_strict_sentence_paragraphs(data_file): async_doc_chain_func_to_list(strict_sentence_paragraphs)(test_data.input) ) ) + assert [x.text() for x in output] == [x.text() for x in test_data.expected] assert output == test_data.expected diff --git a/worker/transcribee_worker/whisper_transcribe.py b/worker/transcribee_worker/whisper_transcribe.py index 71cbcd8b..caec0dcf 100644 --- a/worker/transcribee_worker/whisper_transcribe.py +++ b/worker/transcribee_worker/whisper_transcribe.py @@ -251,12 +251,30 @@ async def strict_sentence_paragraphs( iter: AsyncIterator[Paragraph], ) -> AsyncIterator[Paragraph]: acc_paragraph = None + acc_used_paras = [] + combination_active = True async for paragraph in iter: - if acc_paragraph is None: + if not combination_active: + yield paragraph + continue + elif acc_paragraph is None: acc_paragraph = Paragraph( lang=paragraph.lang, speaker=paragraph.speaker, children=[] ) - + acc_used_paras = [] + elif ( + (start := acc_paragraph.start()) is not None + and (end := paragraph.end()) is not None + and end - start > 30 + ): + # It seems like whisper doesn't produce sentence breaks. Ignore the + # current `acc_paragraph` and yield the original paras instead, + # disable this step until the end of the document + combination_active = False + for para in acc_used_paras: + yield para + yield paragraph + continue elif ( acc_paragraph.lang != paragraph.lang or acc_paragraph.speaker != paragraph.speaker @@ -266,8 +284,9 @@ async def strict_sentence_paragraphs( acc_paragraph = Paragraph( lang=paragraph.lang, speaker=paragraph.speaker, children=[] ) + acc_used_paras = [] - elif any(regex.search(paragraph.text()) for regex in DONT_COMBINE_RES): + if any(regex.search(paragraph.text()) for regex in DONT_COMBINE_RES): if acc_paragraph.children: yield acc_paragraph acc_paragraph = None @@ -285,7 +304,9 @@ async def strict_sentence_paragraphs( acc_paragraph = Paragraph( lang=paragraph.lang, speaker=paragraph.speaker, children=[] ) - for atom in paragraph.children: + acc_used_paras = [] + acc_yield_offset = 0 + for i, atom in enumerate(paragraph.children): acc_paragraph.children.append(atom) text = acc_paragraph.text() if offset + len(text) in breaks and not any( @@ -296,7 +317,15 @@ async def strict_sentence_paragraphs( acc_paragraph = Paragraph( lang=paragraph.lang, speaker=paragraph.speaker, children=[] ) - if acc_paragraph is not None and acc_paragraph.children: + acc_yield_offset = i + acc_used_paras.append( + Paragraph( + lang=paragraph.lang, + speaker=paragraph.speaker, + children=paragraph.children[acc_yield_offset:], + ) + ) + if acc_paragraph is not None and acc_paragraph.children and combination_active: yield acc_paragraph