From 21a5ec2bfff617890b0cbf97900fba6d6ddc5508 Mon Sep 17 00:00:00 2001
From: pajowu <git@ca.pajowu.de>
Date: Tue, 28 Nov 2023 00:13:30 +0100
Subject: [PATCH] =?UTF-8?q?=F0=9F=90=9B=20Fix=20DONT=5FCOMBINE=5FRES=20not?=
 =?UTF-8?q?=20doing=20anything?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ...tence_paragraphs-leaves_special_paras.json | 167 ++++++++++++++++++
 worker/tests/test_transcribe.py               |   1 +
 .../transcribee_worker/whisper_transcribe.py  |   2 +-
 3 files changed, 169 insertions(+), 1 deletion(-)
 create mode 100644 worker/tests/data/test_strict_sentence_paragraphs-leaves_special_paras.json

diff --git a/worker/tests/data/test_strict_sentence_paragraphs-leaves_special_paras.json b/worker/tests/data/test_strict_sentence_paragraphs-leaves_special_paras.json
new file mode 100644
index 00000000..169b2b98
--- /dev/null
+++ b/worker/tests/data/test_strict_sentence_paragraphs-leaves_special_paras.json
@@ -0,0 +1,167 @@
+{
+  "input": [
+    {
+      "type": "paragraph",
+      "speaker": null,
+      "lang": "de",
+      "children": [
+        {
+          "text": "*",
+          "start": 0.0,
+          "end": 0.82,
+          "conf": 0.4493102431297302,
+          "conf_ts": 0.0
+        },
+        {
+          "text": "Klirren",
+          "start": 0.82,
+          "end": 1.0,
+          "conf": 0.9744400978088379,
+          "conf_ts": 0.005903157405555248
+        },
+        {
+          "text": "*",
+          "start": 1.0,
+          "end": 1.07,
+          "conf": 0.9744400978088379,
+          "conf_ts": 0.005903157405555248
+        }
+      ]
+    },
+    {
+      "type": "paragraph",
+      "speaker": null,
+      "lang": "de",
+      "children": [
+        {
+          "text": "Ich ",
+          "start": 1.07,
+          "end": 1.65,
+          "conf": 0.9838394522666931,
+          "conf_ts": 0.01149927917867899
+        },
+        {
+          "text": "will ",
+          "start": 1.65,
+          "end": 2.0,
+          "conf": 0.9566531777381897,
+          "conf_ts": 0.0096774036064744
+        },
+        {
+          "text": "Infos, ",
+          "start": 2.0,
+          "end": 2.07,
+          "conf": 0.9566531777381897,
+          "conf_ts": 0.0096774036064744
+        }
+      ]
+    },
+    {
+      "type": "paragraph",
+      "speaker": null,
+      "lang": "de",
+      "children": [
+        {
+          "text": "Ich ",
+          "start": 2.07,
+          "end": 2.65,
+          "conf": 0.9838394522666931,
+          "conf_ts": 0.01149927917867899
+        },
+        {
+          "text": "will ",
+          "start": 2.65,
+          "end": 3.06,
+          "conf": 0.9566531777381897,
+          "conf_ts": 0.0096774036064744
+        },
+        {
+          "text": "Fakten, ",
+          "start": 3.06,
+          "end": 3.09,
+          "conf": 0.9566531777381897,
+          "conf_ts": 0.0096774036064744
+        }
+      ]
+    }
+  ],
+  "expected": [
+    {
+      "type": "paragraph",
+      "speaker": null,
+      "lang": "de",
+      "children": [
+        {
+          "text": "*",
+          "start": 0.0,
+          "end": 0.82,
+          "conf": 0.4493102431297302,
+          "conf_ts": 0.0
+        },
+        {
+          "text": "Klirren",
+          "start": 0.82,
+          "end": 1.0,
+          "conf": 0.9744400978088379,
+          "conf_ts": 0.005903157405555248
+        },
+        {
+          "text": "*",
+          "start": 1.0,
+          "end": 1.07,
+          "conf": 0.9744400978088379,
+          "conf_ts": 0.005903157405555248
+        }
+      ]
+    },
+    {
+      "type": "paragraph",
+      "speaker": null,
+      "lang": "de",
+      "children": [
+        {
+          "text": "Ich ",
+          "start": 1.07,
+          "end": 1.65,
+          "conf": 0.9838394522666931,
+          "conf_ts": 0.01149927917867899
+        },
+        {
+          "text": "will ",
+          "start": 1.65,
+          "end": 2.0,
+          "conf": 0.9566531777381897,
+          "conf_ts": 0.0096774036064744
+        },
+        {
+          "text": "Infos, ",
+          "start": 2.0,
+          "end": 2.07,
+          "conf": 0.9566531777381897,
+          "conf_ts": 0.0096774036064744
+        },
+        {
+          "text": "Ich ",
+          "start": 2.07,
+          "end": 2.65,
+          "conf": 0.9838394522666931,
+          "conf_ts": 0.01149927917867899
+        },
+        {
+          "text": "will ",
+          "start": 2.65,
+          "end": 3.06,
+          "conf": 0.9566531777381897,
+          "conf_ts": 0.0096774036064744
+        },
+        {
+          "text": "Fakten, ",
+          "start": 3.06,
+          "end": 3.09,
+          "conf": 0.9566531777381897,
+          "conf_ts": 0.0096774036064744
+        }
+      ]
+    }
+  ]
+}
diff --git a/worker/tests/test_transcribe.py b/worker/tests/test_transcribe.py
index 3489fb25..fa6faab0 100644
--- a/worker/tests/test_transcribe.py
+++ b/worker/tests/test_transcribe.py
@@ -67,6 +67,7 @@ def test_strict_sentence_paragraphs(data_file):
             async_doc_chain_func_to_list(strict_sentence_paragraphs)(test_data.input)
         )
     )
+    assert [x.text() for x in output] == [x.text() for x in test_data.expected]
     assert output == test_data.expected
 
 
diff --git a/worker/transcribee_worker/whisper_transcribe.py b/worker/transcribee_worker/whisper_transcribe.py
index 1e7785a9..caec0dcf 100644
--- a/worker/transcribee_worker/whisper_transcribe.py
+++ b/worker/transcribee_worker/whisper_transcribe.py
@@ -286,7 +286,7 @@ async def strict_sentence_paragraphs(
             )
             acc_used_paras = []
 
-        elif any(regex.search(paragraph.text()) for regex in DONT_COMBINE_RES):
+        if any(regex.search(paragraph.text()) for regex in DONT_COMBINE_RES):
             if acc_paragraph.children:
                 yield acc_paragraph
             acc_paragraph = None