-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathethcrash_scrape.py
125 lines (102 loc) · 3.79 KB
/
ethcrash_scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import requests
from bs4 import BeautifulSoup as BS
from aiohttp import ClientSession
import asyncio
import subprocess
import os
import csv
import time
# First game number to scrape when the CSV output file does not exist yet.
start = 100
# Last game number to scrape (inclusive).
end = 632120
# Number of game pages requested concurrently in one batch.
batch_size = 5
# CSV file that scraped rows are appended to; it is also read on startup
# to decide which game number to resume from.
store_filename = "ethcrash_stats.csv"
# URL template of a game page; {0} is replaced with the game number.
GET_URL = "https://www.ethercrash.io/game/{0}"
async def fetch(url, session):
    """Download ``url`` through ``session`` and return the response body as text.

    ``session`` must provide ``get(url)`` returning an async context manager
    whose value has an awaitable ``text()`` method (e.g. an aiohttp
    ``ClientSession``).
    """
    # While waiting for the server to answer, the event loop is free to
    # drive the other pending requests.
    async with session.get(url) as resp:
        print("Url processed: {0}".format(url))
        # Reading the body is an asynchronous operation as well.
        body = await resp.text()
    return body
async def run(indexes):
    """Fetch the game pages for every game number in ``indexes`` concurrently.

    All requests share a single ``ClientSession`` so connections can be kept
    alive between them. Returns the list of response bodies produced by
    ``asyncio.gather``, i.e. one entry per index, in the order of ``indexes``.
    """
    async with ClientSession() as session:
        # Schedule one request per game number before waiting on any of them.
        pending = []
        for game_number in indexes:
            request = fetch(GET_URL.format(game_number), session)
            pending.append(asyncio.ensure_future(request))
        # Wait until every response has come back.
        return await asyncio.gather(*pending)
def get_next_indexes_results(indexes):
    """Synchronously download the game pages for ``indexes``.

    ``indexes`` is an iterable of game numbers. The coroutine ``run(indexes)``
    is driven to completion and its result (the list of page bodies) is
    returned.

    ``asyncio.run`` is used instead of ``asyncio.get_event_loop()`` plus a
    loop-less ``asyncio.ensure_future()``: calling those outside a running
    event loop is deprecated and raises ``RuntimeError`` on recent Python
    versions. Must not be called from inside an already running event loop.
    """
    return asyncio.run(run(indexes))
def update_start(filename, default):
    """Return the game number at which scraping should resume.

    The first comma-separated field of the last non-blank line of
    ``filename`` is taken as the last stored game number, and that number
    plus one is returned. ``default`` is returned when the file does not
    exist or contains no non-blank line.

    Raises:
        ValueError: if the first field of the last non-blank line is not an
            integer.
    """
    if not os.path.exists(filename):
        return default

    # Read the file directly instead of shelling out to ``tail`` so this
    # works on platforms without that command.
    last_line = ""
    with open(filename) as csv_file:
        for line in csv_file:
            # Skip blank lines so a trailing empty line does not hide the
            # last real row.
            if line.strip():
                last_line = line

    if not last_line:
        # Empty file: nothing has been stored yet.
        return default

    game_number = last_line.split(",")[0]
    return int(game_number) + 1
def get_url(game_number):
    """Return the game page URL for ``game_number``.

    NOTE(review): the host used here differs from the one in the
    module-level ``GET_URL`` template -- confirm which one is current.
    """
    return "https://www.ethcrash.io/game/{0}".format(game_number)
def make_request(url, timeout=30):
    """Fetch ``url`` with a blocking GET request.

    Args:
        url: Address to request.
        timeout: Value passed to ``requests.get`` as its ``timeout``
            argument. Without a timeout a stalled server could block the
            call indefinitely.

    Returns:
        The response body as text when the response status is OK, otherwise
        ``False``.

    Exceptions raised by ``requests.get`` itself (connection errors,
    timeouts) are not caught here and propagate to the caller.
    """
    r = requests.get(url, timeout=timeout)
    if r.ok:
        print("Successfully requested url: {0}".format(url))
        return r.text
    print("Failed to request url: {0}".format(url))
    return False
def parse_request(html):
    """Extract ``(game_number, date, crash)`` from a game page.

    All three values are strings taken from the page text after non-ASCII
    characters have been replaced with ``?``:

    * ``game_number`` -- the part of the ``h4.lb-title`` heading before the
      first ``|``, taken after the word ``Game`` and stripped.
    * ``date`` -- whatever follows the last ``"on "`` in the
      ``p`` element with class ``text-muted mb-0``.
    * ``crash`` -- the segment of the heading following the first ``@``.

    Raises ``AttributeError`` when an expected element is missing and
    ``IndexError`` when the heading lacks the ``Game`` or ``@`` marker.
    """
    soup = BS(html, 'html.parser')

    def ascii_text(tag):
        # Force the text down to plain ASCII, substituting "?" for anything else.
        return tag.text.encode("ascii", 'replace').decode()

    # The heading carries both the game number and the crash value.
    title = ascii_text(soup.find("h4", class_="lb-title"))
    before_bar = title.split("|")[0]
    game_number = before_bar.split("Game")[1].strip()

    subtitle = ascii_text(soup.find("p", class_="text-muted mb-0"))
    date = subtitle.split("on ")[-1]

    crash = title.split("@")[1]
    return game_number, date, crash
def write_to_csv(filename, data):
    """Append every row of ``data`` to the CSV file ``filename``.

    Args:
        filename: Path of the CSV file; created if it does not exist.
        data: Iterable of rows, each row an iterable of field values.

    Returns:
        ``True`` once all rows have been written.
    """
    # newline='' lets the csv module control line endings itself; without
    # it, platforms that translate "\n" would emit "\r\r\n" and produce
    # blank rows between records.
    with open(filename, 'a', newline='') as csv_file:
        csv.writer(csv_file).writerows(data)
    return True
def batches(iterable, n=1):
    """Yield the items of ``iterable`` as consecutive lists of ``n`` items.

    The final list may hold fewer than ``n`` items; nothing is yielded for
    an empty iterable. Each yielded list is a fresh object.
    """
    pending = []
    for element in iterable:
        pending.append(element)
        if len(pending) != n:
            continue
        yield pending
        # Start a new list rather than clearing, so the one just handed out
        # is left untouched.
        pending = []
    # Hand out whatever is left over as a final, shorter batch.
    if pending:
        yield pending
def main():
    """Scrape game pages batch by batch and append the results to the CSV.

    Scraping starts at the game number reported by ``update_start`` and runs
    through ``end`` (inclusive) in batches of ``batch_size``. For each batch
    the pages are downloaded, parsed, sorted by their date field and
    appended to ``store_filename``; pages that fail to parse are skipped.
    Timing information is printed per batch and at the end.
    """
    now = time.time()
    done = time.time()
    updated_start = update_start(store_filename, start)
    for batch in batches(range(updated_start, end + 1), batch_size):
        start_game = time.time()
        data = []
        html_data = get_next_indexes_results(batch)
        print("All Urls processed")
        for html in html_data:
            try:
                row = parse_request(html)
            except Exception:
                # Best effort: a page that cannot be parsed is skipped.
                # ``continue`` is essential -- without it the append below
                # would either fail on an unbound ``row`` or store the
                # previous page's row a second time.
                print("Passed.")
                continue
            data.append(row)
        data.sort(key=lambda x: x[1])
        end_game = time.time()
        print(
            "Elapsed time for game numbers from {0} to {1} is: {2}".format(
                batch[0],
                batch[-1],
                end_game - start_game
            )
        )
        write_to_csv(store_filename, data)
        done = time.time()
        # Pause between batches before issuing the next round of requests.
        time.sleep(3)
    print("Elapsed time: {0}".format(done - now))
    total_games = end + 1 - updated_start
    # When nothing was left to scrape there is no meaningful average, and
    # dividing would raise ZeroDivisionError.
    if total_games > 0:
        print("Average Elapsed time: {0}".format((done - now) / total_games))


if __name__ == '__main__':
    main()