Merge pull request #337 from dsavinov-actionengine/support_vtt

Adding support for WebVTT (Web Video Text Tracks) (.vtt) format
transifex · Jun 11, 2024 · f286be5 · f286be5
2 parents 5b2f43e + 3fdc783
commit f286be5
Show file tree

Hide file tree

Showing 10 changed files with 413 additions and 2 deletions.
diff --git a/bin/create_files.py b/bin/create_files.py
@@ -15,7 +15,7 @@
 from io import open
 
 from openformats.formats import (android, github_markdown_v2, json, plaintext,
-                                 po, srt)
+                                 po, srt, vtt)
 from openformats.tests.utils import translate_stringset
 
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
@@ -29,6 +29,7 @@ def get_handler(ext):
     return {
         'txt': plaintext.PlaintextHandler(),
         'srt': srt.SrtHandler(),
+        'vtt': vtt.VttHandler(),
         'xml': android.AndroidHandler(),
         'json': json.JsonHandler(),
         'po': po.PoHandler(),

diff --git a/docker/Dockerfile b/docker/Dockerfile
@@ -20,6 +20,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends $PKGS && \
 ENV PYTHONDONTWRITEBYTECODE=1
 
 COPY requirements.txt /requirements.txt
+RUN pip install --upgrade pip
 RUN pip install -r /requirements.txt
 
 WORKDIR /app

diff --git a/openformats/formats/vtt.py b/openformats/formats/vtt.py
@@ -0,0 +1,185 @@
+from itertools import count
+import re
+
+from ..handlers import Handler
+from openformats.exceptions import ParseError
+from openformats.strings import OpenString
+from openformats.transcribers import Transcriber
+
+
+class VttHandler(Handler):
+    """Handler for WebVTT (Web Video Text Tracks) ``.vtt`` files.
+
+    A file is treated as a sequence of sections separated by blank lines.
+    A section that contains a timings line (a line with ``-->``) is a cue:
+    ``parse`` extracts the text after the timings line as an ``OpenString``
+    and puts the string's ``template_replacement`` in its place. Every other
+    section (e.g. the ``WEBVTT`` header) is copied to the template verbatim.
+    ``compile`` puts the strings back in place of their placeholders and
+    removes cues whose placeholder does not match the next expected string.
+    """
+
+    name = "VTT"
+    extension = "vtt"
+    EXTRACTS_RAW = False
+
+    NON_SPACE_PAT = re.compile(r'[^\s]')
+
+    def _generate_split_subtitles(self, content, **kwargs):
+        """Yield ``(position, text)`` for each non-blank section of `content`.
+
+        Sections are the pieces of `content` between two consecutive
+        newlines. `position` is the index inside `content` of the section's
+        first non-whitespace character and `text` is the section with the
+        surrounding whitespace stripped. Whitespace-only sections are
+        skipped. `kwargs` is accepted but not used.
+        """
+        section_start = 0
+        for section in content.split('\n\n'):
+            # find first non-space character of section
+            first_char = self.NON_SPACE_PAT.search(section)
+            if first_char:
+                yield section_start + first_char.start(), section.strip()
+            # step over the section and the two-character separator
+            section_start += len(section) + 2
+
+    def parse(self, content):
+        """Parse `content` and return a ``(template, stringset)`` tuple.
+
+        `template` is the source with the text of every cue replaced by the
+        ``template_replacement`` of the matching string; `stringset` is the
+        list of extracted strings in order of appearance.
+
+        Raises ``ParseError`` when a cue is malformed or when the resulting
+        template does not start with ``WEBVTT``.
+        """
+        self.transcriber = Transcriber(content)
+        source = self.transcriber.source
+        stringset = []
+        self._order = count()
+        for start, subtitle_section in self._generate_split_subtitles(source):
+            self.transcriber.copy_until(start)
+            offset, string = self._parse_section(start, subtitle_section)
+
+            if not string:
+                # Not a cue: keep the whole section as it is
+                self.transcriber.copy_until(start + len(subtitle_section))
+                continue
+
+            stringset.append(string)
+            # Keep identifier and timings, swap the text for its placeholder
+            self.transcriber.copy_until(offset)
+            self.transcriber.add(string.template_replacement)
+            self.transcriber.skip(len(string.string))
+
+        self.transcriber.copy_until(len(source))
+
+        template = self.transcriber.get_destination()
+        if not template.startswith('WEBVTT'):
+            raise ParseError("VTT file should start with 'WEBVTT'!")
+        return template, stringset
+
+    def _parse_section(self, offset, section):
+        """Parse one stripped section that starts at `offset` in the source.
+
+        Returns ``(None, None)`` when no line of `section` contains ``-->``
+        (the section is not a cue). Otherwise returns ``(text_offset,
+        string)`` where `text_offset` is the position in the source of the
+        first character after the timings line and `string` is an
+        ``OpenString`` built from the timings line and the cue text, with
+        the normalized ``start,end`` timings as its occurrences.
+
+        Raises ``ParseError`` when the timings line is malformed or the cue
+        has no text.
+        """
+        src_strings = section.split('\n')
+
+        # The first line containing an arrow is the timings line
+        timings_index = next(
+            (i for i, line in enumerate(src_strings) if "-->" in line), -1
+        )
+        if timings_index < 0:
+            return None, None
+        timings = src_strings[timings_index]
+
+        # Identifier (lines preceding the line with timings) is optional in
+        # VTT. Identifier can be either numeric or textual, and it is not
+        # necessarily unique.
+        identifier = '\n'.join(src_strings[:timings_index])
+
+        # timings: "[start] --> [end]" optionally followed by cue settings,
+        # which end up (unsplit) in the fourth part and are ignored here
+        parts = timings.split(None, 3)
+        if len(parts) < 3 or parts[1] != "-->":
+            raise ParseError(
+                f"Timings on line {self.transcriber.line_number + 1} "
+                "don't follow '[start] --> [end] (position)' pattern"
+            )
+        start, end = parts[0], parts[2]
+        try:
+            start = self._format_timing(start)
+        except ValueError as err:
+            raise ParseError(
+                f"Problem with start of timing at line {self.transcriber.line_number + 1}: '{start}'"
+            ) from err
+        try:
+            end = self._format_timing(end)
+        except ValueError as err:
+            raise ParseError(
+                f"Problem with end of timing at line {self.transcriber.line_number + 1}: '{end}'"
+            ) from err
+
+        # Content
+        string_to_translate = '\n'.join(src_strings[timings_index + 1:])
+        if string_to_translate == "":
+            raise ParseError(f"Subtitle is empty on line {self.transcriber.line_number + 2}")
+
+        string = OpenString(timings, string_to_translate,
+                            occurrences=f"{start},{end}",
+                            order=next(self._order))
+
+        # Move past the timings line and its newline ...
+        offset += len(timings) + 1
+        if identifier:
+            # ... and past the identifier and its newline, when present
+            offset += len(identifier) + 1
+        return offset, string
+
+    def _format_timing(self, timing):
+        """Return `timing` normalized to ``[HH:]MM:SS.mmm``.
+
+        The fraction is right-padded with zeros to three digits (``000``
+        when `timing` has no single ``.`` separator) and every other field
+        is zero-padded to two digits.
+
+        Raises ``ValueError`` when a field is not an integer and
+        ``ParseError`` when the part before the fraction does not consist of
+        two or three ``:``-separated fields.
+        """
+        try:
+            rest, milliseconds = timing.split('.')
+            milliseconds = f"{milliseconds:<03}"
+        except ValueError:
+            rest, milliseconds = timing, "000"
+
+        # timing may or may not contain hours part
+        fields = rest.split(':')
+        if len(fields) not in (2, 3):
+            raise ParseError(f"Unexpected timing format on line {self.transcriber.line_number + 2}")
+
+        numbers = [int(field) for field in fields]
+        milliseconds = int(milliseconds)
+        clock = ':'.join(f"{number:02}" for number in numbers)
+        return f"{clock}.{milliseconds:03}"
+
+    def compile(self, template, stringset, **kwargs):
+        """Build a VTT file from `template` and `stringset`.
+
+        Strings are consumed in order: a cue whose placeholder (the text
+        right after the timings line) equals the current string's
+        ``template_replacement`` gets that string's text; a cue with any
+        other text after its timings line is removed from the output.
+        Sections without a timings line, or with nothing after it, are
+        copied unchanged. `kwargs` is accepted but not used.
+
+        Raises ``ParseError`` when `stringset` is empty.
+        """
+        transcriber = Transcriber(template)
+        template = transcriber.source
+        stringset = iter(stringset)
+        try:
+            string = next(stringset)
+        except StopIteration:
+            raise ParseError("stringset cannot be empty") from None
+
+        for start, subtitle_section in self._generate_split_subtitles(template):
+            transcriber.copy_until(start)
+            transcriber.mark_section_start()
+            section_end = start + len(subtitle_section)
+
+            # Find hash after timings; stays negative when the section has
+            # no timings line or no newline after it (subtitle is missing)
+            hash_position = -1
+            arrow_pos = subtitle_section.find('-->')
+            if arrow_pos >= 0:
+                end_of_timings = subtitle_section.find(
+                    '\n', arrow_pos + len('-->')
+                )
+                if end_of_timings >= 0:
+                    hash_position = end_of_timings + 1
+
+            found = (
+                hash_position >= 0
+                and subtitle_section[
+                    hash_position:
+                    hash_position + len(string.template_replacement)
+                ] == string.template_replacement
+            )
+            if found:
+                # Swap the placeholder for the actual text
+                transcriber.copy_until(start + hash_position)
+                transcriber.add(string.string)
+                transcriber.skip(len(string.template_replacement))
+
+            transcriber.copy_until(section_end)
+            transcriber.mark_section_end()
+
+            if found:
+                # Keep the last string when the stringset is exhausted
+                string = next(stringset, string)
+            elif hash_position >= 0:
+                # did not find it, must remove section
+                transcriber.remove_section()
+
+        transcriber.copy_until(len(template))
+        return transcriber.get_destination()
diff --git a/openformats/tests/formats/vtt/__init__.py b/openformats/tests/formats/vtt/__init__.py
diff --git a/openformats/tests/formats/vtt/files/1_el.vtt b/openformats/tests/formats/vtt/files/1_el.vtt
@@ -0,0 +1,35 @@
+WEBVTT
+
+STYLE here
+some long,
+long style
+
+1
+00:01:28.797 --> 00:01:30.297
+Γεια σου, Κόσμε!
+
+NOTE some note
+here
+
+2
+00:01:45.105 --> 00:01:47.940 X:350 Y:240
+Pinky: Brain, ρι θες να κάνουμε απόψε;
+Brain: Ό,τι και κάθε βράδυ, Pinky: θα κατακτήσουμε τον κόσμο!
+
+3
+00:02:45.105 --> 00:02:47.940
+el:A phrase with escaped &lt;HTML tags&gt;
+
+4
+00:03:45.105 --> 00:03:47.940
+el:<font color="#00ff00">A phrase with <b>HTML</b> characters</font>
+
+5
+00:05:45.105 --> 00:05:47.940
+el:A phrase with unicode characters: ΑβΓδΕ → ♡ Ш
+
+6
+00:06:45.105 --> 00:06:47.940
+el:Three lines: First
+Second
+Third
diff --git a/openformats/tests/formats/vtt/files/1_en.vtt b/openformats/tests/formats/vtt/files/1_en.vtt
@@ -0,0 +1,35 @@
+WEBVTT
+
+STYLE here
+some long,
+long style
+
+1
+00:01:28.797 --> 00:01:30.297
+Hello, World!
+
+NOTE some note
+here
+
+2
+00:01:45.105 --> 00:01:47.940 X:350 Y:240
+Pinky: Gee, Brain, what do you want to do tonight?
+Brain: The same thing we do every night, Pinky - try to take over the world!
+
+3
+00:02:45.105 --> 00:02:47.940
+A phrase with escaped &lt;HTML tags&gt;
+
+4
+00:03:45.105 --> 00:03:47.940
+<font color="#00ff00">A phrase with <b>HTML</b> characters</font>
+
+5
+00:05:45.105 --> 00:05:47.940
+A phrase with unicode characters: ΑβΓδΕ → ♡ Ш
+
+6
+00:06:45.105 --> 00:06:47.940
+Three lines: First
+Second
+Third
diff --git a/openformats/tests/formats/vtt/files/1_tpl.vtt b/openformats/tests/formats/vtt/files/1_tpl.vtt
@@ -0,0 +1,32 @@
+WEBVTT
+
+STYLE here
+some long,
+long style
+
+1
+00:01:28.797 --> 00:01:30.297
+c386a46eaaa5ecd18e760683c3e36987_tr
+
+NOTE some note
+here
+
+2
+00:01:45.105 --> 00:01:47.940 X:350 Y:240
+f3736d657f04cedbb1eefd07e7fb4e53_tr
+
+3
+00:02:45.105 --> 00:02:47.940
+12a3c29d1c2ead6744096c2bcf5cb5a0_tr
+
+4
+00:03:45.105 --> 00:03:47.940
+32189023ec2e2af1c96ff6e50889a8e5_tr
+
+5
+00:05:45.105 --> 00:05:47.940
+df27c645bb92280c825e3e1c94a3f0b8_tr
+
+6
+00:06:45.105 --> 00:06:47.940
+22394ab09ce61d63e1f9d56ef64c4e40_tr