-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathblog_crawler.py
124 lines (104 loc) · 4.85 KB
/
blog_crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import errno
import hashlib
import logging
import os
import time
import urllib.error
import urllib.parse
import urllib.request
from os.path import exists, dirname
import requests
import yaml
from pyquery import PyQuery as pq
# Module-level logger, registered under the fixed name 'blog_crawler'.
logger = logging.getLogger('blog_crawler')
# Seconds to wait on any single HTTP request before giving up, so one
# unresponsive server cannot hang the whole crawl.
_HTTP_TIMEOUT = 30

# Directory (relative to the working directory) that receives downloaded
# preview images.
_HOTLINK_DIR = "images/hotlinks"


def _parse_post_date(date):
    """Parse a scraped post date string into a ``time.struct_time``.

    Two layouts are recognised, told apart by the number of
    whitespace-separated tokens: six tokens are parsed with
    ``'%B %d, %Y at %I:%M %p'`` and seven with a leading ``%A`` weekday.
    Returns ``None`` for any other token count. ``time.strptime`` may still
    raise ``ValueError`` when the token count matches but the text does not.
    """
    token_count = len(date.split())
    if token_count == 6:
        return time.strptime(date, '%B %d, %Y at %I:%M %p')
    if token_count == 7:
        return time.strptime(date, '%A %B %d, %Y at %I:%M %p')
    return None


def _scrape_listing(argurl, exclude_urls):
    """Scrape one listing page and return a result tuple per post.

    Each tuple is ``(date_object, url, title, date, authors, summary)``.
    Posts whose URL is in *exclude_urls* are skipped, as are posts whose
    date string cannot be parsed.
    """
    logger.info('Fetching blogs from %s', argurl)
    r = requests.get(argurl, timeout=_HTTP_TIMEOUT)
    r.encoding = 'utf-8'
    d = pq(r.text)

    found = []
    for elem in d.find("h2.post-title a"):
        link = pq(elem)
        url = link.attr["href"]
        if url in exclude_urls:
            continue
        title = link.text()
        pubinfo = link.parent().parent().find(".post-metadata .post-published")
        date = pubinfo.find(".post-date").html().strip()
        authors = pubinfo.find(".post-authors").html().strip()
        summary = link.parent().parent().parent().find(".post-summary").text()

        date_object = _parse_post_date(date)
        if date_object is None:
            # Skip the post: appending without a parsed date would either
            # raise a NameError or silently reuse the previous post's date.
            logger.error("cannot parse date str %s", date)
            continue
        found.append((date_object, url, title, date, authors, summary))
    return found


def _find_preview_image(d):
    """Return the ``src`` of the image to preview a post with, or ``None``.

    *d* is the parsed post page. The first choice is an image inside
    ``div.figure``; the fallback is an image inside ``div.pull-right``.
    """
    imgsrc = pq(d.find("div.figure img")).attr["src"]
    if imgsrc is None:
        # try to get the first author's pic
        imgsrc = pq(d.find("div.pull-right img")).attr["src"]
    # Hard-coded substitution carried over unchanged.
    if imgsrc == "http://hackingdistributed.com/images/dinomark.png":
        imgsrc = "http://hackingdistributed.com/images/vzamfir.jpg"
    return imgsrc


def _cache_image(image_url):
    """Download *image_url* into ``images/hotlinks/`` and return its path.

    The file is named after the SHA-256 hex digest of its contents, with
    the extension taken from the URL path.
    """
    # The context manager closes the HTTP response even if read() fails.
    with urllib.request.urlopen(image_url, timeout=_HTTP_TIMEOUT) as response:
        contents = response.read()

    # Use only the path component so a query string or fragment never ends
    # up inside the file extension.
    _, ext = os.path.splitext(urllib.parse.urlparse(image_url).path)
    local_path = _HOTLINK_DIR + "/" + hashlib.sha256(contents).hexdigest() + ext

    os.makedirs(_HOTLINK_DIR, exist_ok=True)
    with open(local_path, "wb") as out:
        out.write(contents)
    return local_path


def fetchall(num_recent_blogs=4):
    """Collect blog posts and build image previews for the newest ones.

    Posts come from two places: listing pages named in ``target`` (currently
    empty, so nothing is scraped) and the entries of
    ``./content/blogs.yaml``. Each YAML entry must provide ``date`` (an
    object with ``timetuple()`` and ``strftime()``), ``url``, ``title``,
    ``authors`` and ``summary``.

    Side effects: reads ``./content/blogs.yaml``; issues one HTTP GET per
    recent post plus one per preview image; writes the downloaded images
    under ``images/hotlinks/`` relative to the working directory.

    Args:
        num_recent_blogs: How many of the newest posts get a preview.

    Returns:
        A ``(recent, posts)`` tuple. ``posts`` is a list of dicts with keys
        ``date`` (formatted string), ``url``, ``title``, ``authors`` and
        ``summary``, newest first. ``recent`` holds the first
        ``num_recent_blogs`` of those, each with an extra ``imgsrc`` key
        that is the local image path, or ``None`` when the page had no
        usable image.
    """
    # Fetch from Hacking Distributed
    exclude_urls = {
        "http://hackingdistributed.com/2015/08/17/coin-needs-a-board/",
        "http://hackingdistributed.com/2014/12/17/changetip-must-die/",
        "http://hackingdistributed.com/2014/11/30/reasonable-bitcoin-security-precautions/",
        "http://hackingdistributed.com/2014/06/13/time-for-a-hard-bitcoin-fork/",
        "http://hackingdistributed.com/2014/05/27/mtgox-willy-markus/",
        "http://hackingdistributed.com/2014/03/22/just-so-mtgox/",
        "http://hackingdistributed.com/2013/11/19/why-da-man-loves-bitcoin/",
        "http://hackingdistributed.com/2013/11/10/clearest-description-of-bitcoin/",
        "http://hackingdistributed.com/2013/11/09/no-you-dint/",
        "http://hackingdistributed.com/2013/11/06/comment-policy/",
        "http://hackingdistributed.com/2013/06/20/virtual-notary-intro/",
    }
    # No listing pages are scraped at the moment; the commented-out tag
    # pages are kept for reference.
    # target = [
    #     "http://hackingdistributed.com/tag/bitcoin/",
    #     "http://hackingdistributed.com/tag/ethereum/",
    #     "http://hackingdistributed.com/tag/decentralization/",
    # ]
    target = []

    results = []
    for argurl in target:
        results.extend(_scrape_listing(argurl, exclude_urls))

    # dedupe using url. note that p[1] is url
    results = list({p[1]: p for p in results}.values())

    # Read from yaml
    with open('./content/blogs.yaml', 'r', encoding='utf-8') as c:
        data = yaml.safe_load(c)
    # An empty YAML document loads as None; treat it as "no posts".
    for post in data or []:
        results.append((post['date'].timetuple(), post['url'], post['title'],
                        post['date'].strftime('%B %d, %Y'), post['authors'],
                        post['summary']))

    # Newest first. Order by (parsed date, url) only, so a tie never falls
    # through to comparing authors/summary values of possibly mixed types.
    results.sort(key=lambda p: (p[0], p[1]), reverse=True)

    posts = [
        dict(date=date, url=url, title=title, authors=authors, summary=summary)
        for _, url, title, date, authors, summary in results
    ]

    recent = []
    for _, url, title, date, authors, summary in results[:num_recent_blogs]:
        r = requests.get(url, timeout=_HTTP_TIMEOUT)
        r.encoding = 'utf-8'
        d = pq(r.text)
        logger.info('Generating preview for recent blog: %s', title)

        imgsrc = _find_preview_image(d)
        if imgsrc is not None:
            # Resolve against the post URL so a relative src still
            # downloads; an absolute src passes through unchanged.
            imgsrc = _cache_image(urllib.parse.urljoin(url, imgsrc))
        recent.append(dict(date=date, url=url, title=title, authors=authors,
                           imgsrc=imgsrc, summary=summary))
    return recent, posts