-
Notifications
You must be signed in to change notification settings - Fork 6
/
populatedb.py
executable file
·94 lines (71 loc) · 2.81 KB
/
populatedb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
"""Script for processing Seinfeld scripts and populating a SQLite3 DB. """
import argparse
import sqlite3
import sys
from scrape import scrape_episode
class DatabasePopulator(object):
    """Insert scraped episode data into a SQLite database.

    The class only issues INSERT and SELECT statements against the
    ``episode`` and ``utterance`` tables; it never creates them, so both
    tables must already exist in the database file.
    """

    def __init__(self, filename):
        """Open a connection (and one shared cursor) to *filename*."""
        self.con = sqlite3.connect(filename)
        self.cur = self.con.cursor()

    def commit(self):
        """Commit the pending transaction on the underlying connection."""
        self.con.commit()

    def _add_episode(self, season_num, episode_num, title, date, writer,
                     director):
        """Insert one ``episode`` row and return its ``id``.

        The id is looked up again by ``(season_number, episode_number)``
        after the insert, so the first matching row's id is returned.
        """
        self.cur.execute("""
            INSERT INTO episode
            (season_number, episode_number, title, the_date, writer, director)
            VALUES(?, ?, ?, ?, ?, ?)""",
            (season_num, episode_num, title, date, writer, director))
        self.cur.execute("""
            SELECT id
            FROM episode
            WHERE season_number = ? AND episode_number = ?
            """, (season_num, episode_num))
        # fetchone() exists on both Python 2 and 3 cursors; the previous
        # ``cursor.next()`` call is Python 2 only.
        return self.cur.fetchone()[0]

    def _add_utterance(self, episode_id, utterance_number, speaker, utterance):
        """Insert one ``utterance`` row and return its ``id``.

        The id is looked up again by ``(episode_id, utterance_number)``
        after the insert, so the first matching row's id is returned.
        """
        self.cur.execute("""
            INSERT INTO utterance
            (episode_id, utterance_number, speaker, text)
            VALUES(?, ?, ?, ?)""",
            (episode_id, utterance_number, speaker, utterance))
        self.cur.execute("""
            SELECT id
            FROM utterance
            WHERE episode_id = ? AND utterance_number = ?
            """, (episode_id, utterance_number))
        # See _add_episode: fetchone() replaces the Python 2 ``next()``.
        return self.cur.fetchone()[0]

    def add_episode(self, html):
        """Scrape *html* and store the episode together with its utterances.

        ``scrape_episode(html)`` must return ``(info, utterances)`` where
        ``info`` is a mapping with the keys ``season_num``, ``episode_num``,
        ``title``, ``date``, ``writers`` (an iterable of strings, stored
        comma-separated) and ``director``, and ``utterances`` is an iterable
        of ``(speaker, utterance)`` pairs. Utterances are numbered from 1 in
        iteration order. Nothing is committed here; call ``commit()``.
        """
        info, utterances = scrape_episode(html)
        episode_id = self._add_episode(
            info['season_num'],
            info['episode_num'],
            info['title'],
            info['date'],
            ', '.join(info['writers']),
            info['director'],
        )
        for utt_num, (speaker, utterance) in enumerate(utterances, start=1):
            self._add_utterance(episode_id, utt_num, speaker, utterance)
def main(args):
    """Load one script file into the database and commit the result.

    Args:
        args: Parsed command-line namespace providing ``db_filepath`` (the
            SQLite file handed to ``DatabasePopulator``) and
            ``scripts_path`` (the file whose full contents are passed to
            ``DatabasePopulator.add_episode``).
    """
    pop = DatabasePopulator(args.db_filepath)
    try:
        with open(args.scripts_path, 'r') as fh:
            html = fh.read()
        pop.add_episode(html)
        pop.commit()
    finally:
        # Release the SQLite connection on success and on failure alike;
        # if an error occurred before commit(), the pending inserts are
        # discarded rather than half-written.
        pop.con.close()
if __name__ == '__main__':
    # Command-line entry point: parse the two positional paths and pass
    # them to main().
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'db_filepath',
        help='Path to SQLite DB file to be created.'
    )
    parser.add_argument(
        'scripts_path',
        # main() opens this path with open() and reads it as a single
        # document, so it must name a file rather than a directory.
        help='Path to the HTML script file of a single episode.'
    )
    main(parser.parse_args(sys.argv[1:]))