diff --git a/worker/tests/data/test_strict_sentence_paragraphs-leaves_special_paras.json b/worker/tests/data/test_strict_sentence_paragraphs-leaves_special_paras.json new file mode 100644 index 00000000..169b2b98 --- /dev/null +++ b/worker/tests/data/test_strict_sentence_paragraphs-leaves_special_paras.json @@ -0,0 +1,167 @@ +{ + "input": [ + { + "type": "paragraph", + "speaker": null, + "lang": "de", + "children": [ + { + "text": "*", + "start": 0.0, + "end": 0.82, + "conf": 0.4493102431297302, + "conf_ts": 0.0 + }, + { + "text": "Klirren", + "start": 0.82, + "end": 1.0, + "conf": 0.9744400978088379, + "conf_ts": 0.005903157405555248 + }, + { + "text": "*", + "start": 1.0, + "end": 1.07, + "conf": 0.9744400978088379, + "conf_ts": 0.005903157405555248 + } + ] + }, + { + "type": "paragraph", + "speaker": null, + "lang": "de", + "children": [ + { + "text": "Ich ", + "start": 1.07, + "end": 1.65, + "conf": 0.9838394522666931, + "conf_ts": 0.01149927917867899 + }, + { + "text": "will ", + "start": 1.65, + "end": 2.0, + "conf": 0.9566531777381897, + "conf_ts": 0.0096774036064744 + }, + { + "text": "Infos, ", + "start": 2.0, + "end": 2.07, + "conf": 0.9566531777381897, + "conf_ts": 0.0096774036064744 + } + ] + }, + { + "type": "paragraph", + "speaker": null, + "lang": "de", + "children": [ + { + "text": "Ich ", + "start": 2.07, + "end": 2.65, + "conf": 0.9838394522666931, + "conf_ts": 0.01149927917867899 + }, + { + "text": "will ", + "start": 2.65, + "end": 3.06, + "conf": 0.9566531777381897, + "conf_ts": 0.0096774036064744 + }, + { + "text": "Fakten, ", + "start": 3.06, + "end": 3.09, + "conf": 0.9566531777381897, + "conf_ts": 0.0096774036064744 + } + ] + } + ], + "expected": [ + { + "type": "paragraph", + "speaker": null, + "lang": "de", + "children": [ + { + "text": "*", + "start": 0.0, + "end": 0.82, + "conf": 0.4493102431297302, + "conf_ts": 0.0 + }, + { + "text": "Klirren", + "start": 0.82, + "end": 1.0, + "conf": 0.9744400978088379, + "conf_ts": 0.005903157405555248 + }, + { + "text": "*", + "start": 1.0, + "end": 1.07, + "conf": 0.9744400978088379, + "conf_ts": 0.005903157405555248 + } + ] + }, + { + "type": "paragraph", + "speaker": null, + "lang": "de", + "children": [ + { + "text": "Ich ", + "start": 1.07, + "end": 1.65, + "conf": 0.9838394522666931, + "conf_ts": 0.01149927917867899 + }, + { + "text": "will ", + "start": 1.65, + "end": 2.0, + "conf": 0.9566531777381897, + "conf_ts": 0.0096774036064744 + }, + { + "text": "Infos, ", + "start": 2.0, + "end": 2.07, + "conf": 0.9566531777381897, + "conf_ts": 0.0096774036064744 + }, + { + "text": "Ich ", + "start": 2.07, + "end": 2.65, + "conf": 0.9838394522666931, + "conf_ts": 0.01149927917867899 + }, + { + "text": "will ", + "start": 2.65, + "end": 3.06, + "conf": 0.9566531777381897, + "conf_ts": 0.0096774036064744 + }, + { + "text": "Fakten, ", + "start": 3.06, + "end": 3.09, + "conf": 0.9566531777381897, + "conf_ts": 0.0096774036064744 + } + ] + } + ] +} diff --git a/worker/tests/test_transcribe.py b/worker/tests/test_transcribe.py index 3489fb25..15e905b9 100644 --- a/worker/tests/test_transcribe.py +++ b/worker/tests/test_transcribe.py @@ -67,6 +67,7 @@ def test_strict_sentence_paragraphs(data_file): async_doc_chain_func_to_list(strict_sentence_paragraphs)(test_data.input) ) ) + assert [x.text() for x in output] == [x.text() for x in test_data.expected] assert output == test_data.expected @@ -103,6 +104,6 @@ def test_space_and_sentences(data_file): )(test_data.input) ) ) - for p in output: - print(p.json()) + # for p in output: + # print(p.json()) assert output == test_data.expected diff --git a/worker/transcribee_worker/whisper_transcribe.py b/worker/transcribee_worker/whisper_transcribe.py index 1e7785a9..caec0dcf 100644 --- a/worker/transcribee_worker/whisper_transcribe.py +++ b/worker/transcribee_worker/whisper_transcribe.py @@ -286,7 +286,7 @@ async def strict_sentence_paragraphs( ) acc_used_paras = [] - elif any(regex.search(paragraph.text()) for regex in DONT_COMBINE_RES): + if any(regex.search(paragraph.text()) for regex in DONT_COMBINE_RES): if acc_paragraph.children: yield acc_paragraph acc_paragraph = None