-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathorcr_scraper.py
71 lines (59 loc) · 1.87 KB
/
orcr_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
from bs4 import BeautifulSoup
import csv
import sqlite3
from os import path
import conf
def html_to_csv():
    """Convert the configured HTML table into a comma-separated text file.

    Reads ``<conf.html_loc>/<conf.record_name>.html``, locates the table with
    CSS class ``border_table_at`` and writes one line per ``<tr>`` to
    ``<conf.csv_loc>/<conf.record_name>.txt``. Every ``<td>`` becomes one
    field followed by a comma, so each line ends with a trailing comma.
    Rows that contain no ``<td>`` cells produce an empty line.

    If the output file already exists, nothing is converted and only a notice
    is printed. In both cases the function then blocks on ``input()`` until
    the user confirms.

    Raises:
        ValueError: if the HTML contains no table with class
            ``border_table_at``.
        OSError: if the HTML file cannot be read or the output file cannot
            be written.
    """
    csv_path = path.join(conf.csv_loc, conf.record_name + ".txt")

    if path.exists(csv_path):
        print("File already exists. This may be the data file, so not reprocessing")
        print("Please type any key to continue or Ctrl-C to stop execution")
    else:
        html_path = path.join(conf.html_loc, conf.record_name + ".html")

        # Read and parse the input BEFORE creating the output file. Otherwise
        # a missing or unexpected HTML file would leave an empty output file
        # behind, and the next run would refuse to reprocess it.
        with open(html_path, "r") as infile:
            soup = BeautifulSoup(infile, "lxml")

        main_table = soup.find("table", {"class": "border_table_at"})
        if main_table is None:
            raise ValueError(
                "No table with class 'border_table_at' found in " + html_path
            )

        with open(csv_path, "w") as outfile:
            for row in main_table.find_all("tr"):
                for cell in row.find_all("td"):
                    field = cell.text.strip()
                    if field == "Female-only (including Supernumerary)":
                        field = field.replace(" (including Supernumerary)", "")
                    # Commas inside a value would split the field, so drop them.
                    # NOTE(review): as visible here both arguments of the second
                    # replace() look identical (a no-op); the original may have
                    # targeted a non-breaking or doubled space - confirm.
                    outfile.write(field.replace(",", "").replace(" ", " ") + ",")
                outfile.write("\n")

        print("HTML converted to CSV")
        print("Please check for errors and type any key to continue")

    # Both branches prompt the user, so wait for confirmation either way.
    input()
def csv_to_sqlite():
    """Insert the rows of the generated CSV file into the SQLite database.

    Reads ``<conf.csv_loc>/<conf.record_name>.txt``, connects to
    ``conf.db_loc``, runs ``conf.queries["create_table_query"]`` and then
    executes ``conf.queries["insert_record_query"]`` once per CSV row.

    Before insertion every row gets two extra flag columns:

    * if field 5 ends with ``"P"``, the suffix is removed and ``"1"`` is
      inserted at index 6, otherwise ``"0"`` is inserted there (reported as
      "OPR preparatory");
    * the same is then done for field 7 (the original field 6, shifted by
      the first insertion) with the flag inserted at index 8 (reported as
      "CPR preparatory").

    Blank lines in the CSV are skipped. The CSV file and the database
    connection are always closed, even when an insert fails; in that case
    the inserts of this run are not committed.
    """
    csv_path = path.join(conf.csv_loc, conf.record_name + ".txt")

    # newline="" is what the csv module expects for files handed to a reader.
    with open(csv_path, "r", newline="") as csv_file:
        conn = sqlite3.connect(conf.db_loc)
        try:
            c = conn.cursor()

            # Create table
            c.execute(conf.queries["create_table_query"])
            conn.commit()

            query = conf.queries["insert_record_query"]
            for row in csv.reader(csv_file):
                if not row:
                    # Blank line (e.g. a table row without <td> cells).
                    continue

                if row[5].endswith("P"):
                    # Strip only the trailing marker that was tested for.
                    row[5] = row[5][:-1]
                    row.insert(6, "1")
                    print("OPR preparatory for " + str(row))
                else:
                    row.insert(6, "0")

                # Index 7 is the original field 6, shifted by the insert above.
                if row[7].endswith("P"):
                    row[7] = row[7][:-1]
                    row.insert(8, "1")
                    print("CPR preparatory for " + str(row))
                else:
                    row.insert(8, "0")

                c.execute(query, row)

            conn.commit()
        finally:
            conn.close()

    print("Successfully inserted all rows into db")
def main():
    """Run the pipeline: HTML to CSV first, then CSV into SQLite."""
    pipeline = (html_to_csv, csv_to_sqlite)
    for step in pipeline:
        step()
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()