from urllib.request import urlopen, Request
import time, random, json
from bs4 import BeautifulSoup as bsoup
from datetime import timedelta, date
from Classes.BreitbartArticleInformation import BreitbartArticleInformation
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.expected_conditions import presence_of_element_located
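# NOTE: this script assumes the Selenium 3.x API (executable_path below), plus beautifulsoup4
# and a local ChromeDriver binary that matches the installed Chrome version.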
# How many random articles to sample from the collected results
numberOfRandomSamples = 300 # TODO change in live samples
# keywords to use: covid-19, covid-2019, coronavirus, pandemic, epidemic
keyword = 'epidemic'
# the cse_tok value expires periodically and has to be refreshed (copy a fresh one from dev tools)
internalAPIurlFormat = 'https://cse.google.com/cse/element/v1' + \
'?rsz=large' + \
'&num=8' + \
'&hl=en' + \
'&source=gcsc' + \
'&gss=.com' + \
'&cselibv=57975621473fd078&cx=partner-pub-9229289037503472:6795176714' + \
'&q={keyword}' + \
'&safe=active' + \
'&cse_tok=AJvRUv3tKIOK1jYKh_6b1gT47O8F:1591207563717' + \
'&filter=1' + \
'&sort=date:r:{date}:{date}' + \
'&exp=csqr,cc,4355061' + \
'&oq={keyword}' + \
'&gs_l=partner-generic.12...0.0.0.13174.0.0.0.0.0.0.0.0..0.0.csems%2Cnrl%3D13...0.0....34.partner-generic..0.0.0.&callback=google.search.cse.api8105'
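# The callback=... parameter makes the endpoint return JSONP - the JSON payload wrapped in a
# function call - which is stripped off below before parsing. The sort=date:r:{date}:{date}
# restriction limits results to a single day, e.g. sort=date:r:20200101:20200101 for Jan 1, 2020.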
# Override the user agent for requests
# so the server sees a browser (e.g. Chrome) instead of urlopen's default Python user agent
pretendChrome = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
pretendDesktopSafari = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A'
pretendMobileSafari = 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/83.0.4103.88 Mobile/15E148 Safari/604.1'
# TODO in the headers below, swap in the user agent that matches the device being mimicked - mobile or desktop (Chrome or Safari)
refererWithKeyword = 'https://www.breitbart.com/search/?s={}'.format(keyword) # TODO change referer if using in another file!!
# TODO get it from dev tools - Request Headers - referer
headers = {'User-Agent': pretendDesktopSafari, 'Referer': refererWithKeyword, 'DNT': '1'}
# Random sleep intervals between requests (rather than always 2 seconds)
def randomSleepsFunction():
    return random.randrange(10, 70, 1) / 10.0  # 0.1 step size: yields 1.0, 1.1, 1.2, ..., 6.9 seconds (upper bound exclusive)
# Generator used by the main loop below: yields each date from startDate up to (but not including) endDate
def dateRangeFunction(startDate, endDate):
    for n in range(int((endDate - startDate).days)):
        yield startDate + timedelta(n)
startDate = date(2020, 1, 1) # TODO change dates before live
endDate = date(2020, 5, 1)  # exclusive - the last day (5/1) itself is not scraped
finalListOfAllArticles = []
pageCountURLAddOn = '&start={}'  # appended to the API url below, with {} filled in by the page offset
for singleDate in dateRangeFunction(startDate, endDate):
    time.sleep(randomSleepsFunction())
    dateOfArticle = singleDate.strftime("%Y%m%d")
    internalAPIurl = internalAPIurlFormat.format(keyword=keyword, date=dateOfArticle)
    print(internalAPIurl)
    pretendRequestFromChrome = Request(internalAPIurl, data=None, headers=headers)
    openingURL = urlopen(pretendRequestFromChrome).read().decode('utf-8')  # JSONP response
    # Strip the JSONP wrapper - everything up to the first '(' plus the trailing ');' - so the payload parses as JSON
    locationOfFirstParenthesis = openingURL.index('(') + 1
    openingURL = openingURL[locationOfFirstParenthesis:-2]
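    # e.g. 'google.search.cse.api8105({...});'  ->  '{...}'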
    #print(openingURL)
    # Parse as JSON rather than as a plain string (yields a dictionary)
    realJSONDict = json.loads(openingURL)
    # Collect every entry under the 'results' key (keeping its full metadata), skipping unwanted URL types and wire-service authors
    if 'results' in realJSONDict:
        inProcessListOfAllArticles = realJSONDict['results']
        listParsedOutURLs = []
        for eachArticle in inProcessListOfAllArticles:
            metatags = eachArticle.get('richSnippet', {}).get('metatags', {})  # .get guards against entries missing these keys
            if '/video/' in eachArticle['url'] \
                    or '/clips/' in eachArticle['url'] \
                    or '/tag/' in eachArticle['url'] \
                    or 'author' not in metatags \
                    or 'AP' in metatags['author'] \
                    or 'AFP' in metatags['author'] \
                    or 'UPI' in metatags['author']:  # TODO add any url exceptions here
                continue
            listParsedOutURLs.append(eachArticle)
        finalListOfAllArticles.extend(listParsedOutURLs)  # .extend adds each article from this page individually;
        # .append would nest the whole page's list as a single element
    # Work out how many result pages there are in total for this date
    print('Working on ' + dateOfArticle)
    if 'resultCount' in realJSONDict.get('cursor', {}):  # .get avoids a KeyError when no cursor is returned
        numberOfResults = int(realJSONDict['cursor']['resultCount'].replace(',', ''))  # total number of results
        if numberOfResults > 99:
            numberOfResults = 99  # Google CSE only allows max 100 results per day/query at once
            # technically, I am randomizing not from all articles, but from GCSE's top articles
            # (based on whatever "relevancy" alg.) from my query (w/ specific date incrementing)
    else:
        continue
    # Page through the remaining results (Google CSE returns at most 10 results per request)
    for startingPageCount in range(10, numberOfResults, 10):
        print('on page count ' + str(startingPageCount))
        time.sleep(randomSleepsFunction())
        newAPI = internalAPIurl + pageCountURLAddOn.format(startingPageCount)
        pretendRequestFromChrome = Request(newAPI, data=None, headers=headers)
        openingURL = urlopen(pretendRequestFromChrome).read().decode('utf-8')  # JSONP response
        # Strip the JSONP wrapper again before parsing
        locationOfFirstParenthesis = openingURL.index('(') + 1
        openingURL = openingURL[locationOfFirstParenthesis:-2]
        realJSONDict = json.loads(openingURL)
        if 'results' in realJSONDict:
            inProcessListOfAllArticles = realJSONDict['results']
            listParsedOutURLs = []
            for eachArticle in inProcessListOfAllArticles:
                metatags = eachArticle.get('richSnippet', {}).get('metatags', {})
                if '/video/' in eachArticle['url'] \
                        or '/clips/' in eachArticle['url'] \
                        or '/tag/' in eachArticle['url'] \
                        or 'author' not in metatags \
                        or 'AP' in metatags['author'] \
                        or 'AFP' in metatags['author'] \
                        or 'UPI' in metatags['author']:  # TODO keep in sync with the filter in the first pass above
                    continue
                listParsedOutURLs.append(eachArticle)
            finalListOfAllArticles.extend(listParsedOutURLs)
# Randomly sample the desired number of articles from the full collected list
print(len(finalListOfAllArticles))
if len(finalListOfAllArticles) < numberOfRandomSamples:
    print('Using ' + str(len(finalListOfAllArticles)) + ' articles')
    desiredNumberOfArticles = finalListOfAllArticles
else:
    desiredNumberOfArticles = random.sample(finalListOfAllArticles, numberOfRandomSamples)
print(len(desiredNumberOfArticles))
# Pull just the urls out of the sampled list (which holds full article dictionaries)
listOfURLs = [eachPartOfList['url'] for eachPartOfList in desiredNumberOfArticles]
#print(listOfURLs)
# Set up the headless Chrome driver for Selenium:
chromeDriverPath = '/Users/yanisa/GoogleDrive/Publications_Conferences/Code/2020.CovidMetaphorMetonymyBookChptCollab/chromedriver'
chromeOptions = Options()
chromeOptions.add_argument('--headless')
webDriver = webdriver.Chrome(executable_path=chromeDriverPath, options=chromeOptions)
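# NOTE: executable_path was deprecated (and later removed) in Selenium 4; under Selenium 4+ the
# equivalent is (untested sketch): webdriver.Chrome(service=Service(chromeDriverPath), options=chromeOptions)
# with `from selenium.webdriver.chrome.service import Service`.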
listOfInformation = []
with webDriver as driver:
    # Set timeout time
    wait = WebDriverWait(driver, 20)
    driver.implicitly_wait(10)
    driver.maximize_window()
    print('Opening Selenium...')
    # Have Selenium go through the list and open each URL
    index = 1
    for eachURL in listOfURLs:
        time.sleep(randomSleepsFunction())
        driver.get(eachURL)
        print('Starting: ' + eachURL)
        # Protect against timeout errors/articles that don't load for some reason:
        retries = 0
        while retries < 2:
            try:
                wait.until(presence_of_element_located((By.CLASS_NAME, "the-article")))
                break
            except TimeoutException:
                driver.refresh()
                retries = retries + 1
                print('Retry #' + str(retries))
        if retries == 2:  # if it has tried a bunch and STILL doesn't work, just skip it and keep going
            print('URL skipped - retries exceeded: ' + eachURL)
            continue
        html = driver.execute_script("return document.documentElement.outerHTML;")
        soup = bsoup(html, 'html.parser')
        firstElement = soup.find_all('article', attrs={'class': 'the-article'})
        # Wrapped in try/except so one page that doesn't match the template doesn't stop the run
        try:
            for articleInfo in firstElement:
                # Headline
                inProcessHeadline = articleInfo.find('h1')
                headline = inProcessHeadline.text
                # Date (named articleDate so it doesn't shadow datetime.date imported above)
                inProcessDate = articleInfo.find('time')
                articleDate = inProcessDate.text
                # Authors
                inProcessAuthorsFirst = articleInfo.find('div', attrs={'class': 'header_byline'})
                if inProcessAuthorsFirst.find('a') is not None:
                    inProcessAuthorsSecond = inProcessAuthorsFirst.find('a')
                    author = inProcessAuthorsSecond.text
                else:
                    author = 'unable to retrieve author'
                # Article Type
                articleType = 'Non-specific article type - Breitbart'
                # Text
                text = ''
                inProcessText = articleInfo.find('div', attrs={'class': 'entry-content'})
                inProcessTextSecond = inProcessText.find_all('p')
                for paragraph in inProcessTextSecond:
                    text = text + ' ' + paragraph.text
                #print('Got info for Article #' + str(index))
                print('Finished Article #' + str(index) + ': ' + eachURL)
                putThingsInClass = BreitbartArticleInformation(eachURL, headline, articleDate, author, articleType, text)
                listOfInformation.append(putThingsInClass)
                index = index + 1
        except Exception as exception:
            print('URL skipped - page doesn\'t match template: ' + eachURL)
            print(exception)
# Save each article as an individual .txt file
beginningFilePath = '/Users/yanisa/GoogleDrive/Publications_Conferences/Code/2020.CovidMetaphorMetonymyBookChptCollab/FinalData/Breitbart/PrelimData/'
for article in listOfInformation:
    safeHeadline = article.headline.replace('/', '-')  # a slash in a headline would otherwise be read as a path separator
    with open(beginningFilePath + safeHeadline + '.txt', 'w') as makeFile:  # with-block closes the file after the write
        makeFile.write(article.headline + '\n' + article.url + '\n' + str(article.date) + '\n' + article.author + '\n' + article.articleType + '\n\n' + article.text)
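# Each output file: headline, url, date, author, article type, a blank line, then the full article text.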
print('Done!!!')
# Change date range and number of random samples before doing it for real!!
# BEFORE RUNNING!!!
# - Did you change the keyword?
# - Is the date range correct?
# - Are you sampling the correct amount? (currently taking 300 of each, then will take 300 of that total)
# - Is the destination file correct?
# (Keywords) Go in this order:
# covid-19, covid-2019, coronavirus, pandemic, epidemic