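"""crawler_for_bdnews.py

Crawler for the Bangla front page of bdnews24.com. It collects every
article linked from the homepage, extracts the title, dateline, summary
(the og:description meta tag) and body text, and writes each article to
a JSON file under bdnews24.bangla/<date>/<MD5 of URL>.json.
"""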
import hashlib
import json
import os

import requests
from bs4 import BeautifulSoup


def compute_MD5_hash(string):
    """Return the hex MD5 digest of a string (used to build unique file names)."""
    m = hashlib.md5()
    m.update(string.encode('utf-8'))
    return m.hexdigest()


def get_soup(request_url):
    """Download a page and return its parsed BeautifulSoup tree."""
    page = requests.get(request_url)
    return BeautifulSoup(page.text, 'html.parser')


def get_all_news_url(soup):
    """Collect the URL of every article linked from the front page."""
    news_list = soup.find_all('div', class_='article news-bn first default')
    article_url_list = []
    for news in news_list:
        for link in news.find_all('a'):
            article_url_list.append(link.get('href'))
    return article_url_list


def get_all_titles(articles):
    """Fetch each article page and extract its headline."""
    titles = []
    for url in articles:
        soup = get_soup(url)
        title = soup.find('h1', class_='print-only')
        titles.append(title.string)
    return titles


def get_all_datetime(articles):
    """Fetch each article page and extract its publication date string."""
    datetime_list = []
    for url in articles:
        soup = get_soup(url)
        dateline = soup.find('p', class_='dateline print-only')
        # The dateline text embeds the date at a fixed offset; slice it out
        # and strip the surrounding whitespace before storing it.
        date_text = dateline.get_text()[11:33].strip()
        datetime_list.append(date_text)
    return datetime_list


def get_primary_contents(articles):
    """Extract each article's summary from its og:description meta tag."""
    primary_contents = []
    for url in articles:
        soup = get_soup(url)
        summary = soup.find('meta', property='og:description')
        primary_contents.append(summary.get('content'))
    return primary_contents


def get_full_article_data(articles, primary_contents):
    """Build the full text of each article: the summary, then the body paragraphs."""
    list_article = []
    for i, url in enumerate(articles):
        soup = get_soup(url)
        article_body = soup.find('div', class_='custombody print-only')
        # Flatten every <p> in the body into a single string.
        paragraphs = [p.text.strip().replace('\n', ' ')
                      for p in article_body.find_all('p')]
        article = ''.join(paragraphs)
        list_article.append(primary_contents[i] + '\n' + article)
    return list_article


def get_json_file(articles, titles, datetime_list, list_article):
    """Write one JSON file per article under bdnews24.bangla/<date>/."""
    basic_path = os.path.dirname(__file__)
    for i in range(len(articles)):
        json_file = {'title': titles[i],
                     'date': datetime_list[i],
                     'content': list_article[i]}
        # Name each file after the MD5 hash of its URL so names are unique.
        file_name = compute_MD5_hash(articles[i]).upper() + '.json'
        # Group articles into one directory per publication date.
        date_directory = datetime_list[i].lstrip()[0:11].replace(' ', '-')
        file_path = os.path.join(basic_path, 'bdnews24.bangla', date_directory)
        os.makedirs(file_path, mode=0o777, exist_ok=True)
        with open(os.path.join(file_path, file_name), 'w', encoding='utf-8') as f:
            json.dump(json_file, f, ensure_ascii=False)


if __name__ == '__main__':
    # Crawl the front page, then fetch each linked article and save it as JSON.
    request_url = 'https://bangla.bdnews24.com/'
    soup = get_soup(request_url)
    article_url_list = get_all_news_url(soup)
    titles = get_all_titles(article_url_list)
    datetime_list = get_all_datetime(article_url_list)
    primary_contents = get_primary_contents(article_url_list)
    list_article = get_full_article_data(article_url_list, primary_contents)
    get_json_file(article_url_list, titles, datetime_list, list_article)