the_economist_scraper.py
import datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup

url = "https://www.economist.com"
# Some sites reject requests without a browser-like User-Agent; sending one
# here is a precaution, not something the original script relied on.
response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
response.raise_for_status()

te_soup = BeautifulSoup(response.content, "html.parser")
te_headlines = te_soup.find("section", {"id": "new-relic-top-stories"})
te_h3 = te_headlines.find_all("h3")

data = {
    "org": url,
    "scraped_at": datetime.datetime.now(),
    "headline_1": "",
    "headline_2": "",
    "headline_3": "",
}

headlines = []
links = []
for h in te_h3[:3]:
    # The anchor may wrap the <h3> rather than sit inside it, so fall back
    # to the enclosing <a>; skip the headline entirely if neither exists,
    # which avoids storing None and crashing on links[i]["href"] below.
    link = h.find("a", href=True) or h.find_parent("a", href=True)
    if link is not None:
        headlines.append(h.get_text(strip=True))
        links.append(link)

for i, headline in enumerate(headlines):
    # hrefs on the page are site-relative, so prepend the base URL.
    data[f"headline_{i + 1}"] = f"{headline}, {url}{links[i]['href']}"

df = pd.DataFrame(data, index=[0])
print(df.head())

# Prepend this run's row to any previously scraped headlines.
try:
    existing_df = pd.read_csv("updated_headlines.csv")
except FileNotFoundError:
    existing_df = pd.DataFrame()
combined = pd.concat([df, existing_df], ignore_index=True)
combined.to_csv("updated_headlines.csv", index=False)