Skip to content

Commit

Permalink
Merge pull request #337 from dsavinov-actionengine/support_vtt
Browse files Browse the repository at this point in the history
Adding support for WebVTT (Web Video Text Tracks) (.vtt) format
  • Loading branch information
kbairak authored Jun 11, 2024
2 parents 5b2f43e + 3fdc783 commit f286be5
Show file tree
Hide file tree
Showing 10 changed files with 413 additions and 2 deletions.
3 changes: 2 additions & 1 deletion bin/create_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from io import open

from openformats.formats import (android, github_markdown_v2, json, plaintext,
po, srt)
po, srt, vtt)
from openformats.tests.utils import translate_stringset

sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
Expand All @@ -29,6 +29,7 @@ def get_handler(ext):
return {
'txt': plaintext.PlaintextHandler(),
'srt': srt.SrtHandler(),
'vtt': vtt.VttHandler(),
'xml': android.AndroidHandler(),
'json': json.JsonHandler(),
'po': po.PoHandler(),
Expand Down
1 change: 1 addition & 0 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends $PKGS && \
ENV PYTHONDONTWRITEBYTECODE=1

COPY requirements.txt /requirements.txt
RUN pip install --upgrade pip
RUN pip install -r /requirements.txt

WORKDIR /app
Expand Down
185 changes: 185 additions & 0 deletions openformats/formats/vtt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
from itertools import count
import re

from ..handlers import Handler
from openformats.exceptions import ParseError
from openformats.strings import OpenString
from openformats.transcribers import Transcriber


class VttHandler(Handler):
    """Handler for WebVTT (Web Video Text Tracks, ``.vtt``) subtitle files.

    ``parse`` turns a source file into a template (every cue payload is
    replaced by the ``template_replacement`` of its ``OpenString``) plus the
    list of extracted strings. ``compile`` performs the reverse operation:
    it fills a template with the strings of a stringset and removes the cue
    sections whose placeholder does not match the next expected string.
    """

    name = "VTT"
    extension = "vtt"
    EXTRACTS_RAW = False

    # Matches the first non-whitespace character of a section.
    NON_SPACE_PAT = re.compile(r'[^\s]')

    def _generate_split_subtitles(self, content, **kwargs):
        """Yield ``(start, section)`` for each non-blank section of `content`.

        Sections are the pieces of `content` separated by blank lines
        (``'\\n\\n'``). ``start`` is the index in `content` of the section's
        first non-whitespace character and ``section`` is the section text
        with surrounding whitespace stripped. Sections made up only of
        whitespace are not yielded.
        """
        start = 0
        for section in content.split('\n\n'):
            # Find the first non-space character of the section so that the
            # yielded position matches the stripped text.
            match = self.NON_SPACE_PAT.search(section)
            if match:
                yield start + match.start(), section.strip()
            # +2 accounts for the '\n\n' separator consumed by `split`.
            start += len(section) + 2

    def parse(self, content):
        """Extract the translatable strings of a VTT file.

        :param content: the text of the VTT file.
        :return: a ``(template, stringset)`` tuple; `stringset` is a list of
            ``OpenString`` objects in the order they appear in the file.
        :raises ParseError: if a cue is malformed or if the resulting
            template does not start with ``WEBVTT``.
        """
        # The transcriber and the order counter are kept on the instance
        # because `_parse_section` and `_format_timing` read them.
        self.transcriber = Transcriber(content)
        source = self.transcriber.source
        stringset = []
        self._order = count()

        for start, subtitle_section in self._generate_split_subtitles(source):
            self.transcriber.copy_until(start)
            offset, string = self._parse_section(start, subtitle_section)

            if string:
                stringset.append(string)

                # Copy identifier and timings verbatim, then swap the cue
                # payload for its placeholder.
                self.transcriber.copy_until(offset)
                self.transcriber.add(string.template_replacement)
                self.transcriber.skip(len(string.string))
            else:
                # Not a cue (no timings line found): copy it unchanged.
                self.transcriber.copy_until(start + len(subtitle_section))

        self.transcriber.copy_until(len(source))

        template = self.transcriber.get_destination()
        if not template.startswith('WEBVTT'):
            raise ParseError("VTT file should start with 'WEBVTT'!")
        return template, stringset

    def _parse_section(self, offset, section):
        """Parse one section and build the ``OpenString`` of its cue.

        :param offset: position of `section` inside the source text.
        :param section: the stripped text of the section.
        :return: ``(content_offset, string)`` where `content_offset` is the
            position in the source at which the cue payload starts, or
            ``(None, None)`` if no line of the section contains ``-->``.
        :raises ParseError: if the timings line is malformed or the cue has
            no payload.
        """
        src_strings = section.split('\n')

        # The timings line is the first line that contains an arrow.
        timings_index = next(
            (i for i, line in enumerate(src_strings) if "-->" in line), -1
        )
        if timings_index < 0:
            return None, None
        timings = src_strings[timings_index]

        # Identifier (lines preceding the line with timings) is optional in
        # VTT. Identifier can be either numeric or textual, and it is not
        # necessarily unique.
        identifier = '\n'.join(src_strings[:timings_index])

        # Timings: "[start] --> [end]" optionally followed by settings,
        # which end up (unused) in the fourth element of the split.
        timing_parts = timings.split(None, 3)
        if len(timing_parts) < 3 or timing_parts[1] != "-->":
            raise ParseError(
                f"Timings on line {self.transcriber.line_number + 1} "
                "don't follow '[start] --> [end] (position)' pattern"
            )
        start, end = timing_parts[0], timing_parts[2]

        try:
            start = self._format_timing(start)
        except ValueError:
            raise ParseError(
                "Problem with start of timing at line "
                f"{self.transcriber.line_number + 1}: '{start}'"
            )
        try:
            end = self._format_timing(end)
        except ValueError:
            raise ParseError(
                "Problem with end of timing at line "
                f"{self.transcriber.line_number + 1}: '{end}'"
            )

        # Content: everything after the timings line.
        string_to_translate = '\n'.join(src_strings[timings_index + 1:])
        if string_to_translate == "":
            raise ParseError(
                "Subtitle is empty on line "
                f"{self.transcriber.line_number + 2}"
            )

        string = OpenString(timings, string_to_translate,
                            occurrences=f"{start},{end}",
                            order=next(self._order))

        # The payload starts after the identifier (plus its newline, when an
        # identifier is present), the timings line and the timings' newline.
        content_offset = offset + len(identifier) + len(timings) + 1
        if identifier:
            content_offset += 1
        return content_offset, string

    def _format_timing(self, timing):
        """Normalize a timestamp to ``[HH:]MM:SS.mmm``.

        The hours part is optional and is kept only if `timing` has it. A
        missing fractional part is treated as ``000`` and a short one is
        right-padded with zeros.

        :raises ValueError: if a component is not an integer.
        :raises ParseError: if `timing` does not have one or two colons
            before the fractional part.
        """
        try:
            rest, milliseconds = timing.split('.')
        except ValueError:
            # Zero or several dots: treat the whole value as the
            # "[HH:]MM:SS" part and let the checks below validate it.
            rest, milliseconds = timing, "000"
        else:
            # Right-pad with zeros so that e.g. ".5" means 500 ms.
            milliseconds = f"{milliseconds:<03}"

        # Timing may or may not contain the hours part.
        fields = rest.split(':')
        if len(fields) not in (2, 3):
            raise ParseError(
                "Unexpected timing format on line "
                f"{self.transcriber.line_number + 2}"
            )

        numbers = [int(field) for field in fields]
        milliseconds = int(milliseconds)
        clock = ":".join(f"{number:02}" for number in numbers)
        return f"{clock}.{milliseconds:03}"

    def compile(self, template, stringset, **kwargs):
        """Fill `template` with the strings of `stringset`.

        Sections are visited in order and the strings of `stringset` are
        consumed in order. A cue whose placeholder matches the current string
        gets the string's text; a cue with a different placeholder is
        removed; sections without a cue payload are copied unchanged.

        :param template: a template as produced by `parse`.
        :param stringset: an iterable of ``OpenString`` objects.
        :return: the compiled file content.
        :raises ParseError: if `stringset` is empty.
        """
        transcriber = Transcriber(template)
        template = transcriber.source
        stringset = iter(stringset)
        try:
            string = next(stringset)
        except StopIteration:
            raise ParseError("stringset cannot be empty")

        arrow = '-->'
        for start, subtitle_section in self._generate_split_subtitles(template):
            transcriber.copy_until(start)
            transcriber.mark_section_start()
            section_end = start + len(subtitle_section)

            # The placeholder, if any, starts on the line right after the
            # first line that contains an arrow.
            hash_position = -1
            arrow_pos = subtitle_section.find(arrow)
            if arrow_pos >= 0:
                end_of_timings = subtitle_section.find(
                    '\n', arrow_pos + len(arrow)
                )
                # No newline after the timings means the subtitle is missing.
                if end_of_timings >= 0:
                    hash_position = end_of_timings + 1

            replacement = string.template_replacement
            if hash_position < 0:
                # Not a cue with a payload: keep the section as it is.
                transcriber.copy_until(section_end)
                transcriber.mark_section_end()
            elif subtitle_section.startswith(replacement, hash_position):
                # Found it: substitute the placeholder with the string.
                transcriber.copy_until(start + hash_position)
                transcriber.add(string.string)
                transcriber.skip(len(replacement))
                transcriber.copy_until(section_end)
                transcriber.mark_section_end()
                try:
                    string = next(stringset)
                except StopIteration:
                    # No strings left; `string` keeps its last value, which
                    # has already been used for the section above.
                    pass
            else:
                # Did not find it, must remove the section.
                transcriber.copy_until(section_end)
                transcriber.mark_section_end()
                transcriber.remove_section()

        transcriber.copy_until(len(template))
        return transcriber.get_destination()
Empty file.
35 changes: 35 additions & 0 deletions openformats/tests/formats/vtt/files/1_el.vtt
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
WEBVTT
STYLE here
some long,
long style

1
00:01:28.797 --> 00:01:30.297
Γεια σου, Κόσμε!

NOTE some note
here
2
00:01:45.105 --> 00:01:47.940 X:350 Y:240
Pinky: Brain, ρι θες να κάνουμε απόψε;
Brain: Ό,τι και κάθε βράδυ, Pinky: θα κατακτήσουμε τον κόσμο!

3
00:02:45.105 --> 00:02:47.940
el:A phrase with escaped &lt;HTML tags&gt;

4
00:03:45.105 --> 00:03:47.940
el:<font color="#00ff00">A phrase with <b>HTML</b> characters</font>

5
00:05:45.105 --> 00:05:47.940
el:A phrase with unicode characters: ΑβΓδΕ → ♡ Ш

6
00:06:45.105 --> 00:06:47.940
el:Three lines: First
Second
Third
35 changes: 35 additions & 0 deletions openformats/tests/formats/vtt/files/1_en.vtt
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
WEBVTT
STYLE here
some long,
long style

1
00:01:28.797 --> 00:01:30.297
Hello, World!

NOTE some note
here
2
00:01:45.105 --> 00:01:47.940 X:350 Y:240
Pinky: Gee, Brain, what do you want to do tonight?
Brain: The same thing we do every night, Pinky - try to take over the world!

3
00:02:45.105 --> 00:02:47.940
A phrase with escaped &lt;HTML tags&gt;

4
00:03:45.105 --> 00:03:47.940
<font color="#00ff00">A phrase with <b>HTML</b> characters</font>

5
00:05:45.105 --> 00:05:47.940
A phrase with unicode characters: ΑβΓδΕ → ♡ Ш

6
00:06:45.105 --> 00:06:47.940
Three lines: First
Second
Third
32 changes: 32 additions & 0 deletions openformats/tests/formats/vtt/files/1_tpl.vtt
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
WEBVTT
STYLE here
some long,
long style

1
00:01:28.797 --> 00:01:30.297
c386a46eaaa5ecd18e760683c3e36987_tr

NOTE some note
here
2
00:01:45.105 --> 00:01:47.940 X:350 Y:240
f3736d657f04cedbb1eefd07e7fb4e53_tr

3
00:02:45.105 --> 00:02:47.940
12a3c29d1c2ead6744096c2bcf5cb5a0_tr

4
00:03:45.105 --> 00:03:47.940
32189023ec2e2af1c96ff6e50889a8e5_tr

5
00:05:45.105 --> 00:05:47.940
df27c645bb92280c825e3e1c94a3f0b8_tr

6
00:06:45.105 --> 00:06:47.940
22394ab09ce61d63e1f9d56ef64c4e40_tr
Loading

0 comments on commit f286be5

Please sign in to comment.