PakTrack.py
# PakTrack scrapes Pakistani daily-news RSS feeds and keyword-filters
# international feeds for Pakistan-related stories, recording new links in CSVs.
from bs4 import BeautifulSoup
import requests
import csv
import urllib.parse
import os.path
# RSS feeds for Pakistani dailies.
dailynews_links = {
    'Dawn': 'https://www.dawn.com/feeds/home',
    'The News Intl': 'https://www.thenews.com.pk/rss/1/1',
    'Tribune': 'https://tribune.com.pk/feed/pakistan',
}

# RSS feeds for international outlets, filtered below by keyword.
globalnews_links = {
    'NY Times': 'https://rss.nytimes.com/services/xml/rss/nyt/AsiaPacific.xml',
    'WashPost': 'http://feeds.washingtonpost.com/rss/rss_blogpost',
    'BBC': 'http://feeds.bbci.co.uk/news/world/rss.xml?edition=uk',
    'CBC': 'https://www.cbc.ca/cmlink/rss-world',
    'Aljazeera': 'https://www.aljazeera.com/xml/rss/all.xml',
}

# Pakistan-related search terms, in both capitalised and lower-case forms,
# since the matching below is case-sensitive.
globalnews_terms = [
    'pakistan', 'Pakistan', 'Pak', 'pak', 'Qureshi', 'qureshi', 'Khan', 'khan',
    'Bajwa', 'bajwa', 'Kashmir', 'kashmir', 'Islamabad', 'islamabad',
    'Karachi', 'karachi', 'Lahore', 'lahore', 'Rawalpindi', 'rawalpindi',
    'Peshawar', 'peshawar', 'Multan', 'multan', 'Faisalabad', 'faisalabad',
    'quetta', 'Quetta', 'Hyderabad', 'hyderabad', 'Sindh', 'sindh',
    'Gujranwala', 'gujranwala', 'Durand Line', 'durand line',
    'Balochistan', 'balochistan', 'Khyber Pakhtunkhwa', 'khyber pakhtunkhwa',
    'Punjab', 'punjab', 'gilgit-baltistan', 'Gilgit-Baltistan',
    'gilgit baltistan', 'Gilgit Baltistan',
]

def dailynews_scraper(dict_of_links):
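    """Fetch every <item> from each daily-news RSS feed and return a list of
    {'news agency', 'title', 'link'} dicts, resolving relative links against
    the feed's base URL."""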
    potential_links = []
    for site, link in dict_of_links.items():
        res = requests.get(link)
        soup = BeautifulSoup(res.content, features="xml")
        # Base URL (scheme + host) for resolving relative item links.
        parsed_url = urllib.parse.urlparse(link)
        base_url = f'{parsed_url.scheme}://{parsed_url.netloc}'
        list_of_links = soup.find_all('item')
        for item in list_of_links:
            try:
                if 'http' in item.link.text:
                    potential_links.append({'news agency': site,
                                            'title': item.title.text.strip().split('\n')[0],
                                            'link': item.link.text})
                else:
                    # Relative path: prepend the feed's base URL.
                    relative_path = item.link.text
                    complete_url = f'{base_url}{relative_path}'
                    potential_links.append({'news agency': site,
                                            'title': item.title.text.strip().split('\n')[0],
                                            'link': complete_url})
            except Exception as exc:
                print(f"Error parsing item from {site}: {exc}")
    return potential_links

def globalnews_scraper(dict_of_links, search_terms):
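    """Fetch each global RSS feed and return {'news agency', 'title', 'link'}
    dicts only for items whose text contains at least one of search_terms,
    resolving relative links against the feed's base URL."""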
    potential_links = []
    for site, link in dict_of_links.items():
        res = requests.get(link)
        parsed_url = urllib.parse.urlparse(link)
        base_url = f'{parsed_url.scheme}://{parsed_url.netloc}'
        soup = BeautifulSoup(res.content, features="xml")
        list_of_links = soup.find_all('item')
        for item in list_of_links:
            # Keep only items whose text mentions one of the search terms.
            if any(term in item.text for term in search_terms):
                try:
                    if 'http' in item.link.text:
                        potential_links.append({'news agency': site,
                                                'title': item.title.text.strip().split('\n')[0],
                                                'link': item.link.text})
                    else:
                        relative_path = item.link.text
                        complete_url = f'{base_url}{relative_path}'
                        potential_links.append({'news agency': site,
                                                'title': item.title.text.strip().split('\n')[0],
                                                'link': complete_url})
                except Exception as exc:
                    print(f"Error parsing item from {site}: {exc}")
    return potential_links

def dailylist_to_csvs(scraper_results):
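    """Write scraper results to CSV: links not yet in all_daily_results.csv go
    to new_daily_results.csv (overwritten each run) and are also appended to
    all_daily_results.csv (the cumulative record)."""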
    # Create the cumulative CSV with a header row the first time the script runs.
    if not os.path.isfile('all_daily_results.csv'):
        with open('all_daily_results.csv', 'w', newline='') as csv_file:
            csv_writer = csv.DictWriter(csv_file, fieldnames=['news agency', 'title', 'link'])
            csv_writer.writeheader()
    # Links already recorded, so only genuinely new articles are written below.
    oldlinks = []
    oldlinks2 = []
    with open('all_daily_results.csv', 'r', encoding='utf-8') as csv_file:
        csv_reader = csv.DictReader(csv_file)
        for row in csv_reader:
            oldlinks.append(row['link'])
            oldlinks2.append(row['link'])
    # new_daily_results.csv is overwritten each run and holds only this run's new links.
    with open('new_daily_results.csv', 'w', encoding='utf-8', newline='') as csv_file:
        csv_writer = csv.DictWriter(csv_file, fieldnames=['news agency', 'title', 'link'])
        csv_writer.writeheader()
        for result in scraper_results:
            if result['link'] not in oldlinks:
                csv_writer.writerow(result)
                oldlinks.append(result['link'])
    # all_daily_results.csv accumulates every link ever seen.
    with open('all_daily_results.csv', 'a+', encoding='utf-8', newline='') as csv_file:
        csv_writer = csv.DictWriter(csv_file, fieldnames=['news agency', 'title', 'link'])
        for result in scraper_results:
            if result['link'] not in oldlinks2:
                csv_writer.writerow(result)
                oldlinks2.append(result['link'])

def globallist_to_csvs(scraper_results):
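    """Same bookkeeping as dailylist_to_csvs, but for the global feeds: new
    links go to new_global_results.csv and are appended to
    all_global_results.csv."""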
    # Create the cumulative CSV with a header row the first time the script runs.
    if not os.path.isfile('all_global_results.csv'):
        with open('all_global_results.csv', 'w', newline='') as csv_file:
            csv_writer = csv.DictWriter(csv_file, fieldnames=['news agency', 'title', 'link'])
            csv_writer.writeheader()
    # Links already recorded, so only genuinely new articles are written below.
    oldlinks = []
    oldlinks2 = []
    with open('all_global_results.csv', 'r', encoding='utf-8') as csv_file:
        csv_reader = csv.DictReader(csv_file)
        for row in csv_reader:
            oldlinks.append(row['link'])
            oldlinks2.append(row['link'])
    # new_global_results.csv is overwritten each run and holds only this run's new links.
    with open('new_global_results.csv', 'w', encoding='utf-8', newline='') as csv_file:
        csv_writer = csv.DictWriter(csv_file, fieldnames=['news agency', 'title', 'link'])
        csv_writer.writeheader()
        for result in scraper_results:
            if result['link'] not in oldlinks:
                csv_writer.writerow(result)
                oldlinks.append(result['link'])
    # all_global_results.csv accumulates every link ever seen.
    with open('all_global_results.csv', 'a+', encoding='utf-8', newline='') as csv_file:
        csv_writer = csv.DictWriter(csv_file, fieldnames=['news agency', 'title', 'link'])
        for result in scraper_results:
            if result['link'] not in oldlinks2:
                csv_writer.writerow(result)
                oldlinks2.append(result['link'])

if __name__ == '__main__':
    # Scrape the Pakistani dailies and the keyword-filtered global feeds,
    # then record any links not already present in the cumulative CSVs.
    dailyresults = dailynews_scraper(dailynews_links)
    dailylist_to_csvs(dailyresults)
    globalresults = globalnews_scraper(globalnews_links, globalnews_terms)
    globallist_to_csvs(globalresults)