tobacco_scraper.py
# Python script for web scraping all registered tobacco retailers in Scotland based on the register hosted here:
# https://www.tobaccoregisterscotland.org/
#Import required libraries
import requests
from bs4 import BeautifulSoup
import os
import csv
from datetime import datetime
print("Program started at {}".format(datetime.now().strftime("%H:%M:%S %m/%d/%Y")))
#Sort out file saving and other bits
#First create the headers we need
headers = ['Date of Query', 'Time of Query', 'Name', 'Address', 'Postcode', 'Local Authority', 'Type', 'Products Sold', 'Company Name', 'Status']
#Then the filename and path
file = "TobaccoRegister_{}.csv".format(datetime.now().strftime("%m-%d-%Y"))
#Then create the blank file with the column headers
with open(file, "w", encoding='utf-8', newline='') as filename:
    our_writer = csv.writer(filename, delimiter="|", lineterminator=os.linesep)
    our_writer.writerow(headers)
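#A pipe delimiter is used rather than a comma because the address values are
#likely to contain commas themselves (an assumption about the register data)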
#Collect the total results on the page
#First do an initial beautiful soup search before the loop to pull the page
URL = ('https://www.tobaccoregisterscotland.org/search-the-register/'
       '?Name=&Postcode=&LocalAuthority=&BusinessType='
       '&ProductType=&PremisesStatus=&page=1')
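#All of the search filters are left blank, which returns the full register rather
#than a filtered subset - this is what lets the script scrape every retailer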
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
total_results = int(soup.find('div', attrs={'class':'premises-search-results__total'}).get_text().split()[3])
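#The total count is read as the fourth whitespace-separated token of the results
#summary text - this assumes the site keeps that summary wording stable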
#now start the loop
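#The register appears to return 20 results per page; range() excludes its end
#value, so +2 covers the final partial page (an exact multiple of 20 just visits
#one extra, empty page, which writes no rows)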
for PageNumber in range(1,int(total_results/20)+2):
    #Specify the URL we want to scrape
    URL = ('https://www.tobaccoregisterscotland.org/search-the-register/'
           '?Name=&Postcode=&LocalAuthority=&BusinessType='
           '&ProductType=&PremisesStatus=&page={}').format(PageNumber)
    #Retrieve the contents of that URL
    page = requests.get(URL)
    #Take this content and turn it into a beautiful soup object
    soup = BeautifulSoup(page.content, 'html.parser')
    #Find all of the 'dd' html tags and place in a list called 'items'
    items = soup.find_all('dd')
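    #Judging by the slicing further down, each premises contributes six 'dd' values
    #in the order: address, local authority, type, products sold, company name,
    #status - an assumption about the register's markup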
    #Remove all of the HTML codes from the items
    cleaned_items = []
    for item in items:
        cleaned_items.append(item.contents[0])
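    #The premises name is not one of the six 'dd' fields, so it is taken from the
    #page's 'b' tags below - this assumes each result renders its name in bold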
    #Obtain the missing premises names
    premise_names = []
    for item in soup.find_all('b'):
        premise_names.append(item.contents[0])
    #Create separate lists for each of the remaining fields
    statuses = cleaned_items[5::6]
    addresses = cleaned_items[0::6]
    local_authorities = cleaned_items[1::6]
    types = cleaned_items[2::6]
    products_solds = cleaned_items[3::6]
    company_names = cleaned_items[4::6]
    #And query dates and times
    date = [datetime.now().strftime("%m/%d/%Y")] * int(len(items)/6)
    time = [datetime.now().strftime("%H:%M:%S")] * int(len(items)/6)
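    #len(items)/6 is the number of premises on this page, so the query date and
    #time are repeated once per output row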
    #Do the same for the postcodes by extracting from the address field
    postcodes = []
    for address in addresses:
        postcodes.append(' '.join(address.split()[-2:]))
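    #The last two whitespace-separated tokens of the address are assumed to form a
    #standard two-part UK postcode (e.g. "EH1 1AA")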
    #Combine the lists into a "zip" object in preparation for writing to a CSV
    rows = zip(date, time, premise_names, addresses, postcodes, local_authorities, types, products_solds, company_names, statuses)
    #Append the rows to the bottom of our csv
    with open(file, "a", encoding='utf-8', newline='') as filename:
        writer = csv.writer(filename, delimiter='|', lineterminator=os.linesep)
        for row in rows:
            writer.writerow(row)
    print("Page {} completed".format(PageNumber))
#Print completion statement
print("Scrape completed at {}. All {} pages scraped.".format(datetime.now().strftime("%H:%M:%S %m/%d/%Y"), total_results))