-
Notifications
You must be signed in to change notification settings - Fork 0
/
srt-to-json.py
54 lines (42 loc) · 1.8 KB
/
srt-to-json.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# Convert word-by-word SRT from whisper.cpp to the Podcast Namespace JSON structure
# Usage: python srt-to-json.py <input.srt>
# The result is written next to the input as <input.srt>.json
import sys
import re
import json
# fail with a readable message instead of an IndexError traceback when the
# input path is missing
if len(sys.argv) < 2:
    sys.exit('Usage: python srt-to-json.py <input.srt>')

# read the srt file
# Decode explicitly as UTF-8 rather than relying on the platform's locale
# encoding; 'utf-8-sig' additionally drops a leading byte-order mark if the
# file has one. Text mode normalises line endings to '\n'.
with open(sys.argv[1], 'r', encoding='utf-8-sig') as input_file:
    srt = input_file.read()

# create a transcript object
transcript = {
    'version': '1.0.0',
    'segments': []
}

# split the srt file into blocks: a blank line followed by a digit (the next
# cue's index) marks the boundary between two cues
blocks = re.split(r"\n\n(?=\d+)", srt)
def to_seconds(time):
    """Convert a split SRT timestamp into a number of seconds.

    Args:
        time: Sequence of three strings ``[hours, minutes, seconds]`` obtained
            by splitting a timestamp such as ``"00:01:02,500"`` on ``':'``.
            The seconds field may carry a millisecond part after a ``','``
            (SRT) or a ``'.'``; when that part is absent it counts as zero.

    Returns:
        The timestamp as a float number of seconds.

    Raises:
        ValueError: If any field is not an integer.
        IndexError: If fewer than three fields are given.
    """
    hours, minutes, seconds = time[0], time[1], time[2]
    # Normalise a '.' separator to ',' so a single partition handles both.
    # The digits after the separator are read as milliseconds.
    whole, _, fraction = seconds.replace('.', ',').partition(',')
    millis = int(fraction) if fraction.strip() else 0
    return int(hours) * 3600 + int(minutes) * 60 + int(whole) + millis / 1000
# loop over the blocks
for block in blocks:
    # drop surrounding blank lines so the cue index is always the first line;
    # an empty input file produces a single blank block, which is skipped
    cue = block.strip()
    if not cue:
        continue

    # split the block into lines: index, timing, then the text line(s)
    lines = cue.split('\n')
    if len(lines) < 2 or ' --> ' not in lines[1]:
        raise ValueError('malformed SRT block: %r' % cue)

    # split the timing line into start and end timestamps
    start_end = lines[1].split(' --> ')

    # a cue may have no text line or several; keep all non-empty text lines
    # rather than only the first one
    text_lines = [line.strip() for line in lines[2:]]
    body = ' '.join(line for line in text_lines if line)

    # convert the start and end to seconds and append a segment to the
    # transcript segments array
    transcript['segments'].append({
        'startTime': to_seconds(start_end[0].split(':')),
        'endTime': to_seconds(start_end[1].split(':')),
        'body': body
    })
# if a segment has no body, remove it
transcript['segments'] = [
    segment for segment in transcript['segments'] if segment['body'] != ''
]

# if a segment contains only punctuation, combine it with the previous segment
# Walk backwards so popping an entry never shifts an index still to be
# visited. Testing every character (instead of matching the whole body against
# single marks) also merges tokens such as '...' and runs like '?' + '!':
# once '!' has been folded into '?', the resulting '?!' is still recognised
# and folded into the word before it.
_PUNCTUATION = '.,!?'
_segments = transcript['segments']
for i in range(len(_segments) - 1, 0, -1):
    _body = _segments[i]['body']
    if _body and all(char in _PUNCTUATION for char in _body):
        _segments[i - 1]['body'] += _body
        # the merged segment now ends where the punctuation ended
        _segments[i - 1]['endTime'] = _segments[i]['endTime']
        _segments.pop(i)
# save the transcript as JSON next to the input, under <input path>.json
output_path = sys.argv[1] + '.json'
with open(output_path, 'w') as output_file:
    # serialise with a 4-space indent, then write the text in one go
    serialized = json.dumps(transcript, indent=4)
    output_file.write(serialized)