-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathportablefreeware_com_crawl.py
66 lines (55 loc) · 2.62 KB
/
portablefreeware_com_crawl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import progressbar
import crawler
import re
class PortableFreewareCrawler(crawler.Crawler):
    """Crawler that downloads .zip/.exe samples linked from portablefreeware.com.

    The class walks the site's listing pages one at a time, collects the
    download links of every entry on the page (resolving the site's redirect
    links first) and downloads them until the requested number of samples has
    been fetched or no further page is available.

    ``base_url``, ``xpath_selector``, ``get_document``, ``follow_redirect`` and
    ``download_file`` are not defined here; they are expected to be provided
    by ``crawler.Crawler``.
    """

    # First "/<name>.zip" or "/<name>.exe" occurrence in a URL, where <name>
    # contains neither dots nor slashes. Group 1 is "<name>.<ext>".
    _FILE_NAME_PATTERN = re.compile(r"/([^./]+\.(zip|exe))")

    # Everything after the first "=" of a relative download link; used as the
    # application id when looking for a sibling 64-bit link.
    _APP_ID_PATTERN = re.compile(r"=(.*)")

    def __init__(self):
        """Initialise the crawler so that crawling starts at the site root."""
        crawler.Crawler.__init__(self, "https://www.portablefreeware.com/")
        # Resolved download URLs of the page currently being processed.
        self.downloads = None
        # URL of the next listing page to fetch; None once no page is left.
        self.next_page_url = self.base_url
        # Number of leading download entries to skip, and how many of them
        # have been skipped so far.
        self.skip_count = 0
        self.count_skipped = 0

    def fetch_executables(self, count, skip_count=0):
        """Download up to ``count`` samples, skipping the first ``skip_count`` entries.

        Pages are fetched starting from ``self.next_page_url``. Downloads that
        fail for any reason are skipped (best effort) and do not count towards
        ``count``. A summary line is printed when the crawl ends.

        Args:
            count: Number of successful downloads to aim for.
            skip_count: Number of download-link entries to ignore before any
                link is collected.
        """
        count_downloaded = 0
        self.count_skipped = 0
        self.skip_count = skip_count
        with progressbar.ProgressBar(max_value=count) as bar:
            # Stop as soon as enough samples were fetched or the site has no
            # further page to offer.
            while count_downloaded < count and self.next_page_url is not None:
                self.get_document(self.next_page_url)
                self.extract_download_links()
                while self.downloads and count_downloaded < count:
                    if self._try_download(self.downloads.pop(0)):
                        count_downloaded += 1
                    bar.update(count_downloaded)
        print("Successfully downloaded " + str(count_downloaded) + " samples.")

    def _try_download(self, download_url):
        """Download one URL; return True on success, False if it was skipped."""
        try:
            match = self._FILE_NAME_PATTERN.search(download_url)
            if match is None:
                # Not a direct .zip/.exe link: nothing to download.
                return False
            # NOTE(review): file_name is presumably read by download_file to
            # name the stored sample - confirm against crawler.Crawler.
            self.file_name = match.group(1)
            self.download_file(download_url)
        except Exception:
            # Deliberate best effort: one broken link must not abort the crawl.
            return False
        return True

    def extract_download_links(self):
        """Collect the download URLs of the current page into ``self.downloads``.

        Also updates ``self.next_page_url`` from the page's navigation links
        (None when no link can be determined). Entries are ignored until
        ``self.skip_count`` of them have been skipped. When an entry is
        directly followed by a link containing the same application id and
        ``"dd64"``, that following (64-bit) link is used instead.
        """
        try:
            navs = self.xpath_selector.xpath('//li[@class="nav"]/a/@href')
            # A single navigation link is taken as-is; otherwise the second
            # one is used. An empty list raises IndexError and ends the crawl.
            # NOTE(review): if the last page exposes only a "previous" link,
            # this would walk backwards - confirm against the site markup.
            self.next_page_url = self.base_url + navs[0 if len(navs) == 1 else 1]
        except Exception:
            self.next_page_url = None

        entries = self.xpath_selector.xpath('//div[contains(@class, "entry ")]/div/ul[@class="inline app-links group"]/li[@class="download"]/a/@href')
        self.downloads = []
        index = 0
        total = len(entries)
        while index < total:
            current_entry = entries[index]
            index += 1
            if self.count_skipped < self.skip_count:
                self.count_skipped += 1
                continue
            try:
                app_id = self._APP_ID_PATTERN.search(current_entry).group(1)
                # Go for 64-bits if possible: the 64-bit link, when present,
                # immediately follows the regular link of the same app.
                if index < total and app_id in entries[index] and "dd64" in entries[index]:
                    current_entry = entries[index]
                    index += 1
                before_redirect = self.base_url + current_entry
                self.downloads.append(self.follow_redirect(before_redirect))
            except Exception:
                # Best effort: an entry that cannot be parsed or resolved is
                # dropped and the remaining entries are still processed.
                continue
# Script entry point (runs whenever this module is executed or imported):
# ignore the first 120 download entries, then fetch 150 samples.
c = PortableFreewareCrawler()
c.fetch_executables(count=150, skip_count=120)