-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraper.py
33 lines (24 loc) · 1.06 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import requests
import xml.etree.ElementTree as ET
import csv
base_url = "https://www.nola.com/tncms/sitemap/editorial.xml?year="

# Namespaced path matching every <loc> element of a sitemaps.org document.
SITEMAP_LOC_PATH = './/{http://www.sitemaps.org/schemas/sitemap/0.9}loc'

# Seconds to wait on a request; without a timeout, requests can block forever.
REQUEST_TIMEOUT = 30


def extract_locs(xml_content):
    """Return the non-empty <loc> values found in a sitemap document.

    Args:
        xml_content: The XML document, as ``bytes`` or ``str``.

    Returns:
        A list with the whitespace-stripped text of each sitemap ``<loc>``
        element, in document order. Elements with no text are skipped.

    Raises:
        xml.etree.ElementTree.ParseError: If the content is not well-formed XML.
    """
    root = ET.fromstring(xml_content)
    locs = []
    for element in root.findall(SITEMAP_LOC_PATH):
        # element.text is None for an empty <loc/>; skip it rather than
        # passing None on as if it were a URL.
        text = (element.text or '').strip()
        if text:
            locs.append(text)
    return locs


def fetch_locs(url, session=None):
    """Download the XML document at ``url`` and return its <loc> values.

    Args:
        url: Address of the XML document to download.
        session: Optional ``requests.Session`` to reuse connections; the
            module-level ``requests`` API is used when omitted.

    Returns:
        The list produced by :func:`extract_locs` for the response body.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            4xx/5xx response status.
        xml.etree.ElementTree.ParseError: If the body is not well-formed XML.
    """
    http = requests if session is None else session
    response = http.get(url, timeout=REQUEST_TIMEOUT)
    # Fail with the HTTP error itself instead of a confusing XML parse error
    # caused by trying to parse an error page.
    response.raise_for_status()
    # Hand the parser the raw bytes so it determines the encoding itself,
    # rather than relying on the charset requests guesses for .text.
    return extract_locs(response.content)


def collect_articles_by_year(years, session=None):
    """Gather the <loc> values reachable from each year's document.

    For every year, the document at ``base_url + str(year)`` is fetched; each
    ``<loc>`` it lists is then fetched and parsed in turn, and the ``<loc>``
    values of those second-level documents are collected.

    Args:
        years: Iterable of years to process.
        session: Optional ``requests.Session`` passed to :func:`fetch_locs`.

    Returns:
        A dict mapping each year to the list of collected values.
    """
    articles_by_year = {}
    for year in years:
        articles_year = []
        for link in fetch_locs(base_url + str(year), session):
            articles_year.extend(fetch_locs(link, session))
        articles_by_year[year] = articles_year
    return articles_by_year


def write_articles_csv(articles_by_year, path='articles_by_year.csv'):
    """Write one ``Year,Article`` CSV row per collected value.

    Args:
        articles_by_year: Mapping of year to a list of values.
        path: Destination file; it is overwritten if it already exists.
    """
    with open(path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['Year', 'Article'])
        for year, articles in articles_by_year.items():
            writer.writerows([year, article] for article in articles)


if __name__ == "__main__":
    # One session for the whole run so connections to the host are reused.
    with requests.Session() as session:
        all_articles_by_year = collect_articles_by_year(range(2016, 2024), session)
    write_articles_csv(all_articles_by_year)