-
Notifications
You must be signed in to change notification settings - Fork 50
/
Copy pathWebCrawler.py
47 lines (33 loc) · 1.32 KB
/
WebCrawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
# Crawl a news listing page: collect every headline link found inside the
# container <div>, download each linked article, and save its title and
# paragraph text to "<headline>.txt" in the current working directory.

# Assign the listing page you want to scrape to the `url` variable.
url = 'https://bandera.inquirer.net/balita/'

# Headers sent with every request made by this script.
_REQUEST_HEADERS = {'User-Agent': 'Mozilla/5.0'}

# Headline text comes from the remote page, so it cannot be trusted as a file
# name: path separators would escape the working directory (or make open()
# fail), and the remaining characters are rejected by Windows filesystems.
_FILENAME_TRANSLATION = str.maketrans({ch: '_' for ch in '\\/:*?"<>|\0'})


def _fetch_soup(page_url):
    """Download *page_url* and return its HTML parsed with the lxml parser."""
    page_request = Request(page_url, headers=_REQUEST_HEADERS)
    # The context manager closes the HTTP response even if read() raises.
    with urlopen(page_request) as response:
        return BeautifulSoup(response.read(), 'lxml')


def _safe_filename(title):
    """Return *title* with path separators and reserved characters replaced by '_'."""
    return title.translate(_FILENAME_TRANSLATION)


soup = _fetch_soup(url)

div_id_with_news_links = 'landing-main-default'
div = soup.find('div', {'id': div_id_with_news_links})
if div is None:
    # find() returns None when nothing matches; fail with a message that names
    # the missing container instead of an AttributeError further down.
    raise RuntimeError(
        'No <div id="{}"> found at {}'.format(div_id_with_news_links, url)
    )

news_links = {}
for h2 in div.find_all('h2'):
    a = h2.find('a')
    # Skip headings that carry no usable link rather than crashing on them.
    if a is None or not a.get('href'):
        continue

    print(a['href'])
    news_links[a.text] = a['href']  # Save each link as {'title': 'link'}

    # Export the article content, one file per news item.
    news_soup = _fetch_soup(a['href'])
    heading = news_soup.h1
    # Fall back to the link text when the article page has no <h1>.
    news_title = heading.text if heading is not None else a.text
    p_content = ''.join(
        paragraph.text + "\n" for paragraph in news_soup.find_all('p')
    )

    # Explicit UTF-8 so non-ASCII article text is written the same way on
    # every platform; `with` closes the file even if a write fails.
    with open(_safe_filename(a.text) + ".txt", "w", encoding="utf-8") as news_file:
        news_file.write('Title: ' + news_title + '\n')
        news_file.write('Content: ' + p_content + '\n')

# In case you only need the links, they are kept in the `news_links`
# dictionary as {'title': 'link'}.
print("\nDictionary of Links: \n")
print(news_links)