From 1e78b2d6b411ce2cef4a5af8001e3e326514cc1f Mon Sep 17 00:00:00 2001
From: pajowu <git@ca.pajowu.de>
Date: Mon, 27 Nov 2023 23:27:31 +0100
Subject: [PATCH 1/2] =?UTF-8?q?=F0=9F=90=9B=20Skip=20para=20recombination?=
 =?UTF-8?q?=20if=20produced=20para=20is=20>=2030s?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ...ce_paragraphs-doest_combine_long_para.json | 90 +++++++++++++++++++
 .../transcribee_worker/whisper_transcribe.py  | 37 +++++++-
 2 files changed, 123 insertions(+), 4 deletions(-)
 create mode 100644 worker/tests/data/test_strict_sentence_paragraphs-doest_combine_long_para.json

diff --git a/worker/tests/data/test_strict_sentence_paragraphs-doest_combine_long_para.json b/worker/tests/data/test_strict_sentence_paragraphs-doest_combine_long_para.json
new file mode 100644
index 00000000..d7742749
--- /dev/null
+++ b/worker/tests/data/test_strict_sentence_paragraphs-doest_combine_long_para.json
@@ -0,0 +1,90 @@
+{
+  "input": [
+    {
+      "type": "paragraph",
+      "speaker": null,
+      "lang": "de",
+      "children": [
+        {
+          "text": "Willkommen ",
+          "start": 0.0,
+          "end": 0.82,
+          "conf": 0.4493102431297302,
+          "conf_ts": 0.0
+        },
+        {
+          "text": "zum ",
+          "start": 0.82,
+          "end": 30.07,
+          "conf": 0.9744400978088379,
+          "conf_ts": 0.005903157405555248
+        }
+      ]
+    },
+    {
+      "type": "paragraph",
+      "speaker": null,
+      "lang": "de",
+      "children": [
+        {
+          "text": "letzten ",
+          "start": 30.07,
+          "end": 31.65,
+          "conf": 0.9838394522666931,
+          "conf_ts": 0.01149927917867899
+        },
+        {
+          "text": "Token.",
+          "start": 31.65,
+          "end": 32.06,
+          "conf": 0.9566531777381897,
+          "conf_ts": 0.0096774036064744
+        }
+      ]
+    }
+  ],
+  "expected": [
+    {
+      "type": "paragraph",
+      "speaker": null,
+      "lang": "de",
+      "children": [
+        {
+          "text": "Willkommen ",
+          "start": 0.0,
+          "end": 0.82,
+          "conf": 0.4493102431297302,
+          "conf_ts": 0.0
+        },
+        {
+          "text": "zum ",
+          "start": 0.82,
+          "end": 30.07,
+          "conf": 0.9744400978088379,
+          "conf_ts": 0.005903157405555248
+        }
+      ]
+    },
+    {
+      "type": "paragraph",
+      "speaker": null,
+      "lang": "de",
+      "children": [
+        {
+          "text": "letzten ",
+          "start": 30.07,
+          "end": 31.65,
+          "conf": 0.9838394522666931,
+          "conf_ts": 0.01149927917867899
+        },
+        {
+          "text": "Token.",
+          "start": 31.65,
+          "end": 32.06,
+          "conf": 0.9566531777381897,
+          "conf_ts": 0.0096774036064744
+        }
+      ]
+    }
+  ]
+}
diff --git a/worker/transcribee_worker/whisper_transcribe.py b/worker/transcribee_worker/whisper_transcribe.py
index 71cbcd8b..1e7785a9 100644
--- a/worker/transcribee_worker/whisper_transcribe.py
+++ b/worker/transcribee_worker/whisper_transcribe.py
@@ -251,12 +251,30 @@ async def strict_sentence_paragraphs(
     iter: AsyncIterator[Paragraph],
 ) -> AsyncIterator[Paragraph]:
     acc_paragraph = None
+    acc_used_paras = []
+    combination_active = True
     async for paragraph in iter:
-        if acc_paragraph is None:
+        if not combination_active:
+            yield paragraph
+            continue
+        elif acc_paragraph is None:
             acc_paragraph = Paragraph(
                 lang=paragraph.lang, speaker=paragraph.speaker, children=[]
             )
-
+            acc_used_paras = []
+        elif (
+            (start := acc_paragraph.start()) is not None
+            and (end := paragraph.end()) is not None
+            and end - start > 30
+        ):
+            # It seems like whisper doesn't produce sentence breaks. Ignore the
+            # current `acc_paragraph`, yield the original paras instead, and
+            # disable recombination until the end of the document.
+            combination_active = False
+            for para in acc_used_paras:
+                yield para
+            yield paragraph
+            continue
         elif (
             acc_paragraph.lang != paragraph.lang
             or acc_paragraph.speaker != paragraph.speaker
@@ -266,6 +284,7 @@ async def strict_sentence_paragraphs(
             acc_paragraph = Paragraph(
                 lang=paragraph.lang, speaker=paragraph.speaker, children=[]
             )
+            acc_used_paras = []
 
         elif any(regex.search(paragraph.text()) for regex in DONT_COMBINE_RES):
             if acc_paragraph.children:
@@ -285,7 +304,9 @@ async def strict_sentence_paragraphs(
             acc_paragraph = Paragraph(
                 lang=paragraph.lang, speaker=paragraph.speaker, children=[]
             )
-        for atom in paragraph.children:
+            acc_used_paras = []
+        acc_yield_offset = 0
+        for i, atom in enumerate(paragraph.children):
             acc_paragraph.children.append(atom)
             text = acc_paragraph.text()
             if offset + len(text) in breaks and not any(
@@ -296,7 +317,15 @@ async def strict_sentence_paragraphs(
                 acc_paragraph = Paragraph(
                     lang=paragraph.lang, speaker=paragraph.speaker, children=[]
                 )
-    if acc_paragraph is not None and acc_paragraph.children:
+                acc_yield_offset = i
+        acc_used_paras.append(
+            Paragraph(
+                lang=paragraph.lang,
+                speaker=paragraph.speaker,
+                children=paragraph.children[acc_yield_offset:],
+            )
+        )
+    if acc_paragraph is not None and acc_paragraph.children and combination_active:
         yield acc_paragraph
 
 

From 21a5ec2bfff617890b0cbf97900fba6d6ddc5508 Mon Sep 17 00:00:00 2001
From: pajowu <git@ca.pajowu.de>
Date: Tue, 28 Nov 2023 00:13:30 +0100
Subject: [PATCH 2/2] =?UTF-8?q?=F0=9F=90=9B=20Fix=20DONT=5FCOMBINE=5FRES?=
 =?UTF-8?q?=20not=20doing=20anything?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ...tence_paragraphs-leaves_special_paras.json | 167 ++++++++++++++++++
 worker/tests/test_transcribe.py               |   1 +
 .../transcribee_worker/whisper_transcribe.py  |   2 +-
 3 files changed, 169 insertions(+), 1 deletion(-)
 create mode 100644 worker/tests/data/test_strict_sentence_paragraphs-leaves_special_paras.json

diff --git a/worker/tests/data/test_strict_sentence_paragraphs-leaves_special_paras.json b/worker/tests/data/test_strict_sentence_paragraphs-leaves_special_paras.json
new file mode 100644
index 00000000..169b2b98
--- /dev/null
+++ b/worker/tests/data/test_strict_sentence_paragraphs-leaves_special_paras.json
@@ -0,0 +1,167 @@
+{
+  "input": [
+    {
+      "type": "paragraph",
+      "speaker": null,
+      "lang": "de",
+      "children": [
+        {
+          "text": "*",
+          "start": 0.0,
+          "end": 0.82,
+          "conf": 0.4493102431297302,
+          "conf_ts": 0.0
+        },
+        {
+          "text": "Klirren",
+          "start": 0.82,
+          "end": 1.0,
+          "conf": 0.9744400978088379,
+          "conf_ts": 0.005903157405555248
+        },
+        {
+          "text": "*",
+          "start": 1.0,
+          "end": 1.07,
+          "conf": 0.9744400978088379,
+          "conf_ts": 0.005903157405555248
+        }
+      ]
+    },
+    {
+      "type": "paragraph",
+      "speaker": null,
+      "lang": "de",
+      "children": [
+        {
+          "text": "Ich ",
+          "start": 1.07,
+          "end": 1.65,
+          "conf": 0.9838394522666931,
+          "conf_ts": 0.01149927917867899
+        },
+        {
+          "text": "will ",
+          "start": 1.65,
+          "end": 2.0,
+          "conf": 0.9566531777381897,
+          "conf_ts": 0.0096774036064744
+        },
+        {
+          "text": "Infos, ",
+          "start": 2.0,
+          "end": 2.07,
+          "conf": 0.9566531777381897,
+          "conf_ts": 0.0096774036064744
+        }
+      ]
+    },
+    {
+      "type": "paragraph",
+      "speaker": null,
+      "lang": "de",
+      "children": [
+        {
+          "text": "Ich ",
+          "start": 2.07,
+          "end": 2.65,
+          "conf": 0.9838394522666931,
+          "conf_ts": 0.01149927917867899
+        },
+        {
+          "text": "will ",
+          "start": 2.65,
+          "end": 3.06,
+          "conf": 0.9566531777381897,
+          "conf_ts": 0.0096774036064744
+        },
+        {
+          "text": "Fakten, ",
+          "start": 3.06,
+          "end": 3.09,
+          "conf": 0.9566531777381897,
+          "conf_ts": 0.0096774036064744
+        }
+      ]
+    }
+  ],
+  "expected": [
+    {
+      "type": "paragraph",
+      "speaker": null,
+      "lang": "de",
+      "children": [
+        {
+          "text": "*",
+          "start": 0.0,
+          "end": 0.82,
+          "conf": 0.4493102431297302,
+          "conf_ts": 0.0
+        },
+        {
+          "text": "Klirren",
+          "start": 0.82,
+          "end": 1.0,
+          "conf": 0.9744400978088379,
+          "conf_ts": 0.005903157405555248
+        },
+        {
+          "text": "*",
+          "start": 1.0,
+          "end": 1.07,
+          "conf": 0.9744400978088379,
+          "conf_ts": 0.005903157405555248
+        }
+      ]
+    },
+    {
+      "type": "paragraph",
+      "speaker": null,
+      "lang": "de",
+      "children": [
+        {
+          "text": "Ich ",
+          "start": 1.07,
+          "end": 1.65,
+          "conf": 0.9838394522666931,
+          "conf_ts": 0.01149927917867899
+        },
+        {
+          "text": "will ",
+          "start": 1.65,
+          "end": 2.0,
+          "conf": 0.9566531777381897,
+          "conf_ts": 0.0096774036064744
+        },
+        {
+          "text": "Infos, ",
+          "start": 2.0,
+          "end": 2.07,
+          "conf": 0.9566531777381897,
+          "conf_ts": 0.0096774036064744
+        },
+        {
+          "text": "Ich ",
+          "start": 2.07,
+          "end": 2.65,
+          "conf": 0.9838394522666931,
+          "conf_ts": 0.01149927917867899
+        },
+        {
+          "text": "will ",
+          "start": 2.65,
+          "end": 3.06,
+          "conf": 0.9566531777381897,
+          "conf_ts": 0.0096774036064744
+        },
+        {
+          "text": "Fakten, ",
+          "start": 3.06,
+          "end": 3.09,
+          "conf": 0.9566531777381897,
+          "conf_ts": 0.0096774036064744
+        }
+      ]
+    }
+  ]
+}
diff --git a/worker/tests/test_transcribe.py b/worker/tests/test_transcribe.py
index 3489fb25..fa6faab0 100644
--- a/worker/tests/test_transcribe.py
+++ b/worker/tests/test_transcribe.py
@@ -67,6 +67,7 @@ def test_strict_sentence_paragraphs(data_file):
             async_doc_chain_func_to_list(strict_sentence_paragraphs)(test_data.input)
         )
     )
+    assert [x.text() for x in output] == [x.text() for x in test_data.expected]
     assert output == test_data.expected
 
 
diff --git a/worker/transcribee_worker/whisper_transcribe.py b/worker/transcribee_worker/whisper_transcribe.py
index 1e7785a9..caec0dcf 100644
--- a/worker/transcribee_worker/whisper_transcribe.py
+++ b/worker/transcribee_worker/whisper_transcribe.py
@@ -286,7 +286,7 @@ async def strict_sentence_paragraphs(
             )
             acc_used_paras = []
 
-        elif any(regex.search(paragraph.text()) for regex in DONT_COMBINE_RES):
+        if any(regex.search(paragraph.text()) for regex in DONT_COMBINE_RES):
             if acc_paragraph.children:
                 yield acc_paragraph
             acc_paragraph = None