Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Paragraph recombination fixes #395

Merged
merged 2 commits into from
Nov 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
{
"input": [
{
"type": "paragraph",
"speaker": null,
"lang": "de",
"children": [
{
"text": "Willkommen ",
"start": 0.0,
"end": 0.82,
"conf": 0.4493102431297302,
"conf_ts": 0.0
},
{
"text": "zum ",
"start": 0.82,
"end": 30.07,
"conf": 0.9744400978088379,
"conf_ts": 0.005903157405555248
}
]
},
{
"type": "paragraph",
"speaker": null,
"lang": "de",
"children": [
{
"text": "letzten ",
"start": 30.07,
"end": 31.65,
"conf": 0.9838394522666931,
"conf_ts": 0.01149927917867899
},
{
"text": "Token.",
"start": 31.65,
"end": 32.06,
"conf": 0.9566531777381897,
"conf_ts": 0.0096774036064744
}
]
}
],
"expected": [
{
"type": "paragraph",
"speaker": null,
"lang": "de",
"children": [
{
"text": "Willkommen ",
"start": 0.0,
"end": 0.82,
"conf": 0.4493102431297302,
"conf_ts": 0.0
},
{
"text": "zum ",
"start": 0.82,
"end": 30.07,
"conf": 0.9744400978088379,
"conf_ts": 0.005903157405555248
}
]
},
{
"type": "paragraph",
"speaker": null,
"lang": "de",
"children": [
{
"text": "letzten ",
"start": 30.07,
"end": 31.65,
"conf": 0.9838394522666931,
"conf_ts": 0.01149927917867899
},
{
"text": "Token.",
"start": 31.65,
"end": 32.06,
"conf": 0.9566531777381897,
"conf_ts": 0.0096774036064744
}
]
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
{
"input": [
{
"type": "paragraph",
"speaker": null,
"lang": "de",
"children": [
{
"text": "*",
"start": 0.0,
"end": 0.82,
"conf": 0.4493102431297302,
"conf_ts": 0.0
},
{
"text": "Klirren",
"start": 0.82,
"end": 1.0,
"conf": 0.9744400978088379,
"conf_ts": 0.005903157405555248
},
{
"text": "*",
"start": 1.0,
"end": 1.07,
"conf": 0.9744400978088379,
"conf_ts": 0.005903157405555248
}
]
},
{
"type": "paragraph",
"speaker": null,
"lang": "de",
"children": [
{
"text": "Ich ",
"start": 1.07,
"end": 1.65,
"conf": 0.9838394522666931,
"conf_ts": 0.01149927917867899
},
{
"text": "will ",
"start": 1.65,
"end": 2.0,
"conf": 0.9566531777381897,
"conf_ts": 0.0096774036064744
},
{
"text": "Infos, ",
"start": 2.0,
"end": 2.07,
"conf": 0.9566531777381897,
"conf_ts": 0.0096774036064744
}
]
},
{
"type": "paragraph",
"speaker": null,
"lang": "de",
"children": [
{
"text": "Ich ",
"start": 2.07,
"end": 2.65,
"conf": 0.9838394522666931,
"conf_ts": 0.01149927917867899
},
{
"text": "will ",
"start": 2.65,
"end": 3.06,
"conf": 0.9566531777381897,
"conf_ts": 0.0096774036064744
},
{
"text": "Fakten, ",
"start": 3.06,
"end": 3.09,
"conf": 0.9566531777381897,
"conf_ts": 0.0096774036064744
}
]
}
],
"expected": [
{
"type": "paragraph",
"speaker": null,
"lang": "de",
"children": [
{
"text": "*",
"start": 0.0,
"end": 0.82,
"conf": 0.4493102431297302,
"conf_ts": 0.0
},
{
"text": "Klirren",
"start": 0.82,
"end": 1.0,
"conf": 0.9744400978088379,
"conf_ts": 0.005903157405555248
},
{
"text": "*",
"start": 1.0,
"end": 1.07,
"conf": 0.9744400978088379,
"conf_ts": 0.005903157405555248
}
]
},
{
"type": "paragraph",
"speaker": null,
"lang": "de",
"children": [
{
"text": "Ich ",
"start": 1.07,
"end": 1.65,
"conf": 0.9838394522666931,
"conf_ts": 0.01149927917867899
},
{
"text": "will ",
"start": 1.65,
"end": 2.0,
"conf": 0.9566531777381897,
"conf_ts": 0.0096774036064744
},
{
"text": "Infos, ",
"start": 2.0,
"end": 2.07,
"conf": 0.9566531777381897,
"conf_ts": 0.0096774036064744
},
{
"text": "Ich ",
"start": 2.07,
"end": 2.65,
"conf": 0.9838394522666931,
"conf_ts": 0.01149927917867899
},
{
"text": "will ",
"start": 2.65,
"end": 3.06,
"conf": 0.9566531777381897,
"conf_ts": 0.0096774036064744
},
{
"text": "Fakten, ",
"start": 3.06,
"end": 3.09,
"conf": 0.9566531777381897,
"conf_ts": 0.0096774036064744
}
]
}
]
}
1 change: 1 addition & 0 deletions worker/tests/test_transcribe.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ def test_strict_sentence_paragraphs(data_file):
async_doc_chain_func_to_list(strict_sentence_paragraphs)(test_data.input)
)
)
assert [x.text() for x in output] == [x.text() for x in test_data.expected]
assert output == test_data.expected


Expand Down
39 changes: 34 additions & 5 deletions worker/transcribee_worker/whisper_transcribe.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,12 +251,30 @@ async def strict_sentence_paragraphs(
iter: AsyncIterator[Paragraph],
) -> AsyncIterator[Paragraph]:
acc_paragraph = None
acc_used_paras = []
combination_active = True
async for paragraph in iter:
if acc_paragraph is None:
if not combination_active:
yield paragraph
continue
elif acc_paragraph is None:
acc_paragraph = Paragraph(
lang=paragraph.lang, speaker=paragraph.speaker, children=[]
)

acc_used_paras = []
elif (
(start := acc_paragraph.start()) is not None
and (end := paragraph.end()) is not None
and end - start > 30
):
# It seems like whisper doesn't produce sentence breaks. Discard the
# current `acc_paragraph`, yield the original paragraphs instead, and
# disable recombination until the end of the document.
combination_active = False
for para in acc_used_paras:
yield para
yield paragraph
continue
elif (
acc_paragraph.lang != paragraph.lang
or acc_paragraph.speaker != paragraph.speaker
Expand All @@ -266,8 +284,9 @@ async def strict_sentence_paragraphs(
acc_paragraph = Paragraph(
lang=paragraph.lang, speaker=paragraph.speaker, children=[]
)
acc_used_paras = []

elif any(regex.search(paragraph.text()) for regex in DONT_COMBINE_RES):
if any(regex.search(paragraph.text()) for regex in DONT_COMBINE_RES):
if acc_paragraph.children:
yield acc_paragraph
acc_paragraph = None
Expand All @@ -285,7 +304,9 @@ async def strict_sentence_paragraphs(
acc_paragraph = Paragraph(
lang=paragraph.lang, speaker=paragraph.speaker, children=[]
)
for atom in paragraph.children:
acc_used_paras = []
acc_yield_offset = 0
for i, atom in enumerate(paragraph.children):
acc_paragraph.children.append(atom)
text = acc_paragraph.text()
if offset + len(text) in breaks and not any(
Expand All @@ -296,7 +317,15 @@ async def strict_sentence_paragraphs(
acc_paragraph = Paragraph(
lang=paragraph.lang, speaker=paragraph.speaker, children=[]
)
if acc_paragraph is not None and acc_paragraph.children:
acc_yield_offset = i
acc_used_paras.append(
Paragraph(
lang=paragraph.lang,
speaker=paragraph.speaker,
children=paragraph.children[acc_yield_offset:],
)
)
if acc_paragraph is not None and acc_paragraph.children and combination_active:
yield acc_paragraph


Expand Down