scraper_functions.py
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
# for reading pdfs
import camelot
# import ghostscript
import tkinter
import pandas as pd
import re
import time
import os

def get_html(url, driver_path=r"C:\Users\tungl\Downloads\chromedriver_win32\chromedriver"):
    '''Get the HTML of a webpage. You need to have a Chrome driver
    installed in order to execute this function.

    Args:
        url (str): URL of the website
        driver_path (str): path to the Chrome driver executable

    Return:
        html (str): HTML of the rendered website
    '''
    # Create a headless webdriver instance and interact with the website
    # (Selenium 4 style: the driver path is wrapped in a Service object)
    options = Options()
    options.add_argument("--headless")
    driver = webdriver.Chrome(service=Service(driver_path), options=options)
    driver.get(url)
    # Give the browser some time to render and load the data
    time.sleep(5)
    # Grab the HTML of the rendered page
    html = driver.page_source
    # Close the webdriver
    driver.quit()
    return html
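
# Example usage (a sketch; the URL is a hypothetical placeholder and
# driver_path must point at your own chromedriver):
# html = get_html("https://example.com/zoning-code")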

def parse_table(html):
    '''Parse the HTML and strip superscript tags so tables are clean.

    Arg:
        html (str): HTML of the rendered website

    Return:
        soup (BeautifulSoup): parsed HTML with <sup> elements removed,
            ready for table extraction
    '''
    # use BeautifulSoup to parse the HTML
    soup = BeautifulSoup(html, "html.parser")
    # remove superscripts (e.g. footnote markers) from the HTML
    for sup in soup.select('sup'):
        sup.extract()
    return soup
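
# Example usage (a sketch with a hypothetical URL; note parse_table returns
# the cleaned soup, so pulling the <table> elements is left to the caller):
# soup = parse_table(get_html("https://example.com/zoning-code"))
# tables = soup.find_all("table")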

# read pdfs
def read_pdf(pdf_file, page_numbers):
    '''Extract tables from a PDF in the "pdfs" folder and stack them
    into a single DataFrame, using the first row as the header.

    Args:
        pdf_file (str): file name of the PDF inside the "pdfs" folder
        page_numbers (list): page numbers to read, e.g. [1, 2, 3]

    Return:
        pandas DataFrame of all extracted tables concatenated together
    '''
    tables = camelot.read_pdf(os.path.join('pdfs', pdf_file), pages=",".join(str(x) for x in page_numbers))
    print("Total tables extracted:", tables.n)
    all_tables = pd.DataFrame()
    for table in tables:
        # replace newlines inside cells with spaces before stacking
        table = table.df.replace('\n', ' ', regex=True)
        all_tables = pd.concat([all_tables, table])
    # all_tables = pd.concat([table.df for table in tables])
    # promote the first row to the header and drop it from the data
    all_tables.columns = all_tables.iloc[0]
    all_tables = all_tables[1:]
    print(f'{all_tables}\n\n\n')
    return all_tables
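
# Example usage (the file name is a hypothetical placeholder): read the
# tables on pages 1-3 of a PDF stored in the "pdfs" folder.
# df = read_pdf("parking_requirements.pdf", [1, 2, 3])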

def url_text_to_table(url, section_name, separator):
    '''Parse the HTML to find all the bullet-point parking requirements.

    Args:
        url: URL of the webpage
        section_name: the section name that contains the bullet-point requirements
        separator: a separator, usually ":" or ".", that divides each sentence
            into "Use" and "Parking Requirement"

    Return:
        pandas DataFrame with "Use" and "Number of Spaces" columns
    '''
    html = get_html(url)
    soup = BeautifulSoup(html, "html.parser")
    title = soup.find("div", string=section_name)
    section = title.find_parent("li")
    # finding the "content" class with the biggest number at the end
    raw_contents = section.find_all('p', class_=lambda value: value and value.startswith("content"))
    content_names = set(' '.join(content['class']) for content in raw_contents)
    # pull the number after "content" (e.g. "content12" -> 12) rather than
    # taking only the last character, so multi-digit suffixes work
    content_nums = [int(re.search(r'content(\d+)', name).group(1)) for name in content_names]
    wanted_class = f"content{max(content_nums)}"
    wanted_contents = section.find_all('p', class_=wanted_class)
    requirement_dicts = []
    for requirement in wanted_contents:
        # TODO: find a way to separate properly; this may break on entries
        # like "Single-family dwelling—Two spaces."
        info = requirement.text.replace("\n", "").split(separator, 1)
        requirement_dicts.append({"Use": info[0],
                                  "Number of Spaces": info[1]})
    return pd.DataFrame(requirement_dicts)
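
# Minimal demo sketch, guarded so it only runs when this module is executed
# directly. The URL, section name, and separator below are hypothetical
# placeholders, not values from the original project.
if __name__ == "__main__":
    demo_df = url_text_to_table(
        "https://example.com/municipal-code",  # hypothetical URL
        "Off-Street Parking Requirements",     # hypothetical section name
        ":",
    )
    print(demo_df.head())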