bugbakery · pajowu · Nov 28, 2023 · Nov 27, 2023 · Nov 27, 2023
diff --git a/worker/tests/data/test_strict_sentence_paragraphs-doest_combine_long_para.json b/worker/tests/data/test_strict_sentence_paragraphs-doest_combine_long_para.json
@@ -0,0 +1,90 @@
+{
+  "input": [
+    {
+      "type": "paragraph",
+      "speaker": null,
+      "lang": "de",
+      "children": [
+        {
+          "text": "Willkommen ",
+          "start": 0.0,
+          "end": 0.82,
+          "conf": 0.4493102431297302,
+          "conf_ts": 0.0
+        },
+        {
+          "text": "zum ",
+          "start": 0.82,
+          "end": 30.07,
+          "conf": 0.9744400978088379,
+          "conf_ts": 0.005903157405555248
+        }
+      ]
+    },
+    {
+      "type": "paragraph",
+      "speaker": null,
+      "lang": "de",
+      "children": [
+        {
+          "text": "letzten ",
+          "start": 30.07,
+          "end": 31.65,
+          "conf": 0.9838394522666931,
+          "conf_ts": 0.01149927917867899
+        },
+        {
+          "text": "Token.",
+          "start": 31.65,
+          "end": 32.06,
+          "conf": 0.9566531777381897,
+          "conf_ts": 0.0096774036064744
+        }
+      ]
+    }
+  ],
+  "expected": [
+    {
+      "type": "paragraph",
+      "speaker": null,
+      "lang": "de",
+      "children": [
+        {
+          "text": "Willkommen ",
+          "start": 0.0,
+          "end": 0.82,
+          "conf": 0.4493102431297302,
+          "conf_ts": 0.0
+        },
+        {
+          "text": "zum ",
+          "start": 0.82,
+          "end": 30.07,
+          "conf": 0.9744400978088379,
+          "conf_ts": 0.005903157405555248
+        }
+      ]
+    },
+    {
+      "type": "paragraph",
+      "speaker": null,
+      "lang": "de",
+      "children": [
+        {
+          "text": "letzten ",
+          "start": 30.07,
+          "end": 31.65,
+          "conf": 0.9838394522666931,
+          "conf_ts": 0.01149927917867899
+        },
+        {
+          "text": "Token.",
+          "start": 31.65,
+          "end": 32.06,
+          "conf": 0.9566531777381897,
+          "conf_ts": 0.0096774036064744
+        }
+      ]
+    }
+  ]
+}
diff --git a/worker/tests/data/test_strict_sentence_paragraphs-leaves_special_paras.json b/worker/tests/data/test_strict_sentence_paragraphs-leaves_special_paras.json
@@ -0,0 +1,167 @@
+{
+  "input": [
+    {
+      "type": "paragraph",
+      "speaker": null,
+      "lang": "de",
+      "children": [
+        {
+          "text": "*",
+          "start": 0.0,
+          "end": 0.82,
+          "conf": 0.4493102431297302,
+          "conf_ts": 0.0
+        },
+        {
+          "text": "Klirren",
+          "start": 0.82,
+          "end": 1.0,
+          "conf": 0.9744400978088379,
+          "conf_ts": 0.005903157405555248
+        },
+        {
+          "text": "*",
+          "start": 1.0,
+          "end": 1.07,
+          "conf": 0.9744400978088379,
+          "conf_ts": 0.005903157405555248
+        }
+      ]
+    },
+    {
+      "type": "paragraph",
+      "speaker": null,
+      "lang": "de",
+      "children": [
+        {
+          "text": "Ich ",
+          "start": 1.07,
+          "end": 1.65,
+          "conf": 0.9838394522666931,
+          "conf_ts": 0.01149927917867899
+        },
+        {
+          "text": "will ",
+          "start": 1.65,
+          "end": 2.0,
+          "conf": 0.9566531777381897,
+          "conf_ts": 0.0096774036064744
+        },
+        {
+          "text": "Infos, ",
+          "start": 2.0,
+          "end": 2.07,
+          "conf": 0.9566531777381897,
+          "conf_ts": 0.0096774036064744
+        }
+      ]
+    },
+    {
+      "type": "paragraph",
+      "speaker": null,
+      "lang": "de",
+      "children": [
+        {
+          "text": "Ich ",
+          "start": 2.07,
+          "end": 2.65,
+          "conf": 0.9838394522666931,
+          "conf_ts": 0.01149927917867899
+        },
+        {
+          "text": "will ",
+          "start": 2.65,
+          "end": 3.06,
+          "conf": 0.9566531777381897,
+          "conf_ts": 0.0096774036064744
+        },
+        {
+          "text": "Fakten, ",
+          "start": 3.06,
+          "end": 3.09,
+          "conf": 0.9566531777381897,
+          "conf_ts": 0.0096774036064744
+        }
+      ]
+    }
+  ],
+  "expected": [
+    {
+      "type": "paragraph",
+      "speaker": null,
+      "lang": "de",
+      "children": [
+        {
+          "text": "*",
+          "start": 0.0,
+          "end": 0.82,
+          "conf": 0.4493102431297302,
+          "conf_ts": 0.0
+        },
+        {
+          "text": "Klirren",
+          "start": 0.82,
+          "end": 1.0,
+          "conf": 0.9744400978088379,
+          "conf_ts": 0.005903157405555248
+        },
+        {
+          "text": "*",
+          "start": 1.0,
+          "end": 1.07,
+          "conf": 0.9744400978088379,
+          "conf_ts": 0.005903157405555248
+        }
+      ]
+    },
+    {
+      "type": "paragraph",
+      "speaker": null,
+      "lang": "de",
+      "children": [
+        {
+          "text": "Ich ",
+          "start": 1.07,
+          "end": 1.65,
+          "conf": 0.9838394522666931,
+          "conf_ts": 0.01149927917867899
+        },
+        {
+          "text": "will ",
+          "start": 1.65,
+          "end": 2.0,
+          "conf": 0.9566531777381897,
+          "conf_ts": 0.0096774036064744
+        },
+        {
+          "text": "Infos, ",
+          "start": 2.0,
+          "end": 2.07,
+          "conf": 0.9566531777381897,
+          "conf_ts": 0.0096774036064744
+        },
+        {
+          "text": "Ich ",
+          "start": 2.07,
+          "end": 2.65,
+          "conf": 0.9838394522666931,
+          "conf_ts": 0.01149927917867899
+        },
+        {
+          "text": "will ",
+          "start": 2.65,
+          "end": 3.06,
+          "conf": 0.9566531777381897,
+          "conf_ts": 0.0096774036064744
+        },
+        {
+          "text": "Fakten, ",
+          "start": 3.06,
+          "end": 3.09,
+          "conf": 0.9566531777381897,
+          "conf_ts": 0.0096774036064744
+        }
+      ]
+    }
+  ]
+}
diff --git a/worker/tests/test_transcribe.py b/worker/tests/test_transcribe.py
@@ -67,6 +67,7 @@ def test_strict_sentence_paragraphs(data_file):
             async_doc_chain_func_to_list(strict_sentence_paragraphs)(test_data.input)
         )
     )
+    assert [x.text() for x in output] == [x.text() for x in test_data.expected]
     assert output == test_data.expected
 
 

diff --git a/worker/transcribee_worker/whisper_transcribe.py b/worker/transcribee_worker/whisper_transcribe.py
@@ -251,12 +251,30 @@ async def strict_sentence_paragraphs(
     iter: AsyncIterator[Paragraph],
 ) -> AsyncIterator[Paragraph]:
     acc_paragraph = None
+    acc_used_paras = []
+    combination_active = True
     async for paragraph in iter:
-        if acc_paragraph is None:
+        if not combination_active:
+            yield paragraph
+            continue
+        elif acc_paragraph is None:
             acc_paragraph = Paragraph(
                 lang=paragraph.lang, speaker=paragraph.speaker, children=[]
             )
-
+            acc_used_paras = []
+        elif (
+            (start := acc_paragraph.start()) is not None
+            and (end := paragraph.end()) is not None
+            and end - start > 30
+        ):
+            # It seems like whisper doesn't produce sentence breaks. Ignore the
+            # current `acc_paragraph`, yield the original paras instead, and
+            # disable this step until the end of the document.
+            combination_active = False
+            for para in acc_used_paras:
+                yield para
+            yield paragraph
+            continue
         elif (
             acc_paragraph.lang != paragraph.lang
             or acc_paragraph.speaker != paragraph.speaker
@@ -266,8 +284,9 @@ async def strict_sentence_paragraphs(
             acc_paragraph = Paragraph(
                 lang=paragraph.lang, speaker=paragraph.speaker, children=[]
             )
+            acc_used_paras = []
 
-        elif any(regex.search(paragraph.text()) for regex in DONT_COMBINE_RES):
+        if any(regex.search(paragraph.text()) for regex in DONT_COMBINE_RES):
             if acc_paragraph.children:
                 yield acc_paragraph
             acc_paragraph = None
@@ -285,7 +304,9 @@ async def strict_sentence_paragraphs(
             acc_paragraph = Paragraph(
                 lang=paragraph.lang, speaker=paragraph.speaker, children=[]
             )
-        for atom in paragraph.children:
+            acc_used_paras = []
+        acc_yield_offset = 0
+        for i, atom in enumerate(paragraph.children):
             acc_paragraph.children.append(atom)
             text = acc_paragraph.text()
             if offset + len(text) in breaks and not any(
@@ -296,7 +317,15 @@ async def strict_sentence_paragraphs(
                 acc_paragraph = Paragraph(
                     lang=paragraph.lang, speaker=paragraph.speaker, children=[]
                 )
-    if acc_paragraph is not None and acc_paragraph.children:
+                acc_yield_offset = i
+        acc_used_paras.append(
+            Paragraph(
+                lang=paragraph.lang,
+                speaker=paragraph.speaker,
+                children=paragraph.children[acc_yield_offset:],
+            )
+        )
+    if acc_paragraph is not None and acc_paragraph.children and combination_active:
         yield acc_paragraph
-Original file line number
+Diff line change
@@ Expand Up / @@ -67,6 +67,7 @@ def test_strict_sentence_paragraphs(data_file): @@
                 async_doc_chain_func_to_list(strict_sentence_paragraphs)(test_data.input)
             )
         )
+        assert [x.text() for x in output] == [x.text() for x in test_data.expected]
         assert output == test_data.expected
@@ Expand Down @@