From 1e78b2d6b411ce2cef4a5af8001e3e326514cc1f Mon Sep 17 00:00:00 2001 From: pajowu Date: Mon, 27 Nov 2023 23:27:31 +0100 Subject: [PATCH 1/2] =?UTF-8?q?=F0=9F=90=9B=20Skip=20para=20recombination?= =?UTF-8?q?=20if=20produced=20para=20is=20>=2030s?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ...ce_paragraphs-doest_combine_long_para.json | 90 +++++++++++++++++++ .../transcribee_worker/whisper_transcribe.py | 37 +++++++- 2 files changed, 123 insertions(+), 4 deletions(-) create mode 100644 worker/tests/data/test_strict_sentence_paragraphs-doest_combine_long_para.json diff --git a/worker/tests/data/test_strict_sentence_paragraphs-doest_combine_long_para.json b/worker/tests/data/test_strict_sentence_paragraphs-doest_combine_long_para.json new file mode 100644 index 00000000..d7742749 --- /dev/null +++ b/worker/tests/data/test_strict_sentence_paragraphs-doest_combine_long_para.json @@ -0,0 +1,90 @@ +{ + "input": [ + { + "type": "paragraph", + "speaker": null, + "lang": "de", + "children": [ + { + "text": "Willkommen ", + "start": 0.0, + "end": 0.82, + "conf": 0.4493102431297302, + "conf_ts": 0.0 + }, + { + "text": "zum ", + "start": 0.82, + "end": 30.07, + "conf": 0.9744400978088379, + "conf_ts": 0.005903157405555248 + } + ] + }, + { + "type": "paragraph", + "speaker": null, + "lang": "de", + "children": [ + { + "text": "letzten ", + "start": 30.07, + "end": 31.65, + "conf": 0.9838394522666931, + "conf_ts": 0.01149927917867899 + }, + { + "text": "Token.", + "start": 31.65, + "end": 32.06, + "conf": 0.9566531777381897, + "conf_ts": 0.0096774036064744 + } + ] + } + ], + "expected": [ + { + "type": "paragraph", + "speaker": null, + "lang": "de", + "children": [ + { + "text": "Willkommen ", + "start": 0.0, + "end": 0.82, + "conf": 0.4493102431297302, + "conf_ts": 0.0 + }, + { + "text": "zum ", + "start": 0.82, + "end": 30.07, + "conf": 0.9744400978088379, + "conf_ts": 0.005903157405555248 + } + ] + }, + { + "type": "paragraph", + "speaker": null, + "lang": "de", + "children": [ + { + "text": "letzten ", + "start": 30.07, + "end": 31.65, + "conf": 0.9838394522666931, + "conf_ts": 0.01149927917867899 + }, + { + "text": "Token.", + "start": 31.65, + "end": 32.06, + "conf": 0.9566531777381897, + "conf_ts": 0.0096774036064744 + } + ] + } + ] +} diff --git a/worker/transcribee_worker/whisper_transcribe.py b/worker/transcribee_worker/whisper_transcribe.py index 71cbcd8b..1e7785a9 100644 --- a/worker/transcribee_worker/whisper_transcribe.py +++ b/worker/transcribee_worker/whisper_transcribe.py @@ -251,12 +251,30 @@ async def strict_sentence_paragraphs( iter: AsyncIterator[Paragraph], ) -> AsyncIterator[Paragraph]: acc_paragraph = None + acc_used_paras = [] + combination_active = True async for paragraph in iter: - if acc_paragraph is None: + if not combination_active: + yield paragraph + continue + elif acc_paragraph is None: acc_paragraph = Paragraph( lang=paragraph.lang, speaker=paragraph.speaker, children=[] ) - + acc_used_paras = [] + elif ( + (start := acc_paragraph.start()) is not None + and (end := paragraph.end()) is not None + and end - start > 30 + ): + # It seems like whisper doesn't produce sentence breaks. Ignore the + # current `acc_paragraph` and yield the original paras instead, + # disable this step until the end of the document + combination_active = False + for para in acc_used_paras: + yield para + yield paragraph + continue elif ( acc_paragraph.lang != paragraph.lang or acc_paragraph.speaker != paragraph.speaker @@ -266,6 +284,7 @@ async def strict_sentence_paragraphs( acc_paragraph = Paragraph( lang=paragraph.lang, speaker=paragraph.speaker, children=[] ) + acc_used_paras = [] elif any(regex.search(paragraph.text()) for regex in DONT_COMBINE_RES): if acc_paragraph.children: @@ -285,7 +304,9 @@ async def strict_sentence_paragraphs( acc_paragraph = Paragraph( lang=paragraph.lang, speaker=paragraph.speaker, children=[] ) - for atom in paragraph.children: + acc_used_paras = [] + acc_yield_offset = 0 + for i, atom in enumerate(paragraph.children): acc_paragraph.children.append(atom) text = acc_paragraph.text() if offset + len(text) in breaks and not any( @@ -296,7 +317,15 @@ async def strict_sentence_paragraphs( acc_paragraph = Paragraph( lang=paragraph.lang, speaker=paragraph.speaker, children=[] ) - if acc_paragraph is not None and acc_paragraph.children: + acc_yield_offset = i + acc_used_paras.append( + Paragraph( + lang=paragraph.lang, + speaker=paragraph.speaker, + children=paragraph.children[acc_yield_offset:], + ) + ) + if acc_paragraph is not None and acc_paragraph.children and combination_active: yield acc_paragraph From 21a5ec2bfff617890b0cbf97900fba6d6ddc5508 Mon Sep 17 00:00:00 2001 From: pajowu Date: Tue, 28 Nov 2023 00:13:30 +0100 Subject: [PATCH 2/2] =?UTF-8?q?=F0=9F=90=9B=20Fix=20DONT=5FCOMBINE=5FRES?= =?UTF-8?q?=20not=20doing=20anything?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ...tence_paragraphs-leaves_special_paras.json | 167 ++++++++++++++++++ worker/tests/test_transcribe.py | 1 + .../transcribee_worker/whisper_transcribe.py | 2 +- 3 files changed, 169 insertions(+), 1 deletion(-) create mode 100644 worker/tests/data/test_strict_sentence_paragraphs-leaves_special_paras.json diff --git a/worker/tests/data/test_strict_sentence_paragraphs-leaves_special_paras.json b/worker/tests/data/test_strict_sentence_paragraphs-leaves_special_paras.json new file mode 100644 index 00000000..169b2b98 --- /dev/null +++ b/worker/tests/data/test_strict_sentence_paragraphs-leaves_special_paras.json @@ -0,0 +1,167 @@ +{ + "input": [ + { + "type": "paragraph", + "speaker": null, + "lang": "de", + "children": [ + { + "text": "*", + "start": 0.0, + "end": 0.82, + "conf": 0.4493102431297302, + "conf_ts": 0.0 + }, + { + "text": "Klirren", + "start": 0.82, + "end": 1.0, + "conf": 0.9744400978088379, + "conf_ts": 0.005903157405555248 + }, + { + "text": "*", + "start": 1.0, + "end": 1.07, + "conf": 0.9744400978088379, + "conf_ts": 0.005903157405555248 + } + ] + }, + { + "type": "paragraph", + "speaker": null, + "lang": "de", + "children": [ + { + "text": "Ich ", + "start": 1.07, + "end": 1.65, + "conf": 0.9838394522666931, + "conf_ts": 0.01149927917867899 + }, + { + "text": "will ", + "start": 1.65, + "end": 2.0, + "conf": 0.9566531777381897, + "conf_ts": 0.0096774036064744 + }, + { + "text": "Infos, ", + "start": 2.0, + "end": 2.07, + "conf": 0.9566531777381897, + "conf_ts": 0.0096774036064744 + } + ] + }, + { + "type": "paragraph", + "speaker": null, + "lang": "de", + "children": [ + { + "text": "Ich ", + "start": 2.07, + "end": 2.65, + "conf": 0.9838394522666931, + "conf_ts": 0.01149927917867899 + }, + { + "text": "will ", + "start": 2.65, + "end": 3.06, + "conf": 0.9566531777381897, + "conf_ts": 0.0096774036064744 + }, + { + "text": "Fakten, ", + "start": 3.06, + "end": 3.09, + "conf": 0.9566531777381897, + "conf_ts": 0.0096774036064744 + } + ] + } + ], + "expected": [ + { + "type": "paragraph", + "speaker": null, + "lang": "de", + "children": [ + { + "text": "*", + "start": 0.0, + "end": 0.82, + "conf": 0.4493102431297302, + "conf_ts": 0.0 + }, + { + "text": "Klirren", + "start": 0.82, + "end": 1.0, + "conf": 0.9744400978088379, + "conf_ts": 0.005903157405555248 + }, + { + "text": "*", + "start": 1.0, + "end": 1.07, + "conf": 0.9744400978088379, + "conf_ts": 0.005903157405555248 + } + ] + }, + { + "type": "paragraph", + "speaker": null, + "lang": "de", + "children": [ + { + "text": "Ich ", + "start": 1.07, + "end": 1.65, + "conf": 0.9838394522666931, + "conf_ts": 0.01149927917867899 + }, + { + "text": "will ", + "start": 1.65, + "end": 2.0, + "conf": 0.9566531777381897, + "conf_ts": 0.0096774036064744 + }, + { + "text": "Infos, ", + "start": 2.0, + "end": 2.07, + "conf": 0.9566531777381897, + "conf_ts": 0.0096774036064744 + }, + { + "text": "Ich ", + "start": 2.07, + "end": 2.65, + "conf": 0.9838394522666931, + "conf_ts": 0.01149927917867899 + }, + { + "text": "will ", + "start": 2.65, + "end": 3.06, + "conf": 0.9566531777381897, + "conf_ts": 0.0096774036064744 + }, + { + "text": "Fakten, ", + "start": 3.06, + "end": 3.09, + "conf": 0.9566531777381897, + "conf_ts": 0.0096774036064744 + } + ] + } + ] +} diff --git a/worker/tests/test_transcribe.py b/worker/tests/test_transcribe.py index 3489fb25..fa6faab0 100644 --- a/worker/tests/test_transcribe.py +++ b/worker/tests/test_transcribe.py @@ -67,6 +67,7 @@ def test_strict_sentence_paragraphs(data_file): async_doc_chain_func_to_list(strict_sentence_paragraphs)(test_data.input) ) ) + assert [x.text() for x in output] == [x.text() for x in test_data.expected] assert output == test_data.expected diff --git a/worker/transcribee_worker/whisper_transcribe.py b/worker/transcribee_worker/whisper_transcribe.py index 1e7785a9..caec0dcf 100644 --- a/worker/transcribee_worker/whisper_transcribe.py +++ b/worker/transcribee_worker/whisper_transcribe.py @@ -286,7 +286,7 @@ async def strict_sentence_paragraphs( ) acc_used_paras = [] - elif any(regex.search(paragraph.text()) for regex in DONT_COMBINE_RES): + if any(regex.search(paragraph.text()) for regex in DONT_COMBINE_RES): if acc_paragraph.children: yield acc_paragraph acc_paragraph = None