-
Notifications
You must be signed in to change notification settings - Fork 4
/
downld-step-2.py
executable file
·123 lines (100 loc) · 3.55 KB
/
downld-step-2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/usr/bin/env python2
# coding=utf-8
import requests
from bs4 import BeautifulSoup as Bs4
import time
# Yaowen Xu
# 2021-02-01 15:49 (date comment translated from Chinese)
#
# Step 2 of a ROM scraper: walks the WinKawaks full ROM list, finds every
# per-game page, and appends one `curl -OL <zip-url>` line per discovered
# .zip link to download.sh.
head_url = "http://www.winkawaks.org/roms/full-rom-list.htm"
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
}
print("roms-download: " + head_url)
# https://www.winkawaks.org/roms/full-rom-list.htm
zip_list = []  # module-level accumulator of .zip links, filled by get_zip_file_url()
# Opened in append mode so re-runs extend the existing script; written by
# get_zip_file_url() and closed at the end of __main__.
o_file = open('download.sh', 'a')
# # construct download script
# o_file.writelines("#!/bin/bash\n")
# o_file.writelines("set -x\n")
# o_file.writelines("# auto gen by yaowenxu\n")
# o_file.writelines("\n")
# o_file.writelines("PA=`pwd`\n")
# o_file.writelines("cd $PA/roms\n")
# o_file.writelines("\n")
def get_index_url():
    """Fetch the master ROM list page and return the deduplicated list of
    absolute links found on it.

    Hrefs beginning with "/" (site navigation) are skipped; every other
    href is resolved against the directory portion of ``head_url``.
    Order of the result is unspecified (set-based dedup).
    """
    resp = requests.get(head_url, headers=headers)
    baseurl = head_url[0:head_url.rindex('/')] + '/'
    page = Bs4(resp.text, "lxml")
    hrefs = (anchor.get("href") for anchor in page.find_all("a"))
    collected = [
        baseurl + str(href)
        for href in hrefs
        if href and href[0] != "/"
    ]
    return list(set(collected))
# https://www.winkawaks.org/roms/neogeo/garoup.htm
def get_download_url(urllist):
    """Visit each index page in *urllist*, collect per-game page links,
    and pass each new link to get_zip_file_url() to harvest .zip URLs.

    Links already present in *urllist* are treated as index pages and
    skipped. Returns the deduplicated list of per-game page URLs.
    """
    found = []
    for page_url in urllist:
        print("deal-with: " + page_url)
        base = page_url[0:page_url.rindex('/')] + '/'
        page = Bs4(requests.get(page_url, headers=headers).text, "lxml")
        for anchor in page.find_all("a"):
            href = anchor.get("href")
            # Skip missing hrefs and root-relative site-navigation links.
            if not href or href.startswith("/"):
                continue
            candidate = base + str(href)
            # Skip anything that loops back to one of the index pages.
            if candidate in urllist:
                continue
            found.append(candidate)
            print("get-download-link: " + candidate)
            # Get the download link of the zip file and output it to the file;
            get_zip_file_url(candidate)
    return list(set(found))
def get_zip_file_url(url):
    """Scan a single game page for anchors whose href ends in ".zip",
    record each link in the module-level ``zip_list``, and append a
    ``curl -OL <link>`` line for it to ``o_file`` (download.sh).

    Fixes vs. original: the unused local ``baseurl_3`` is removed, and
    the manual 4-character suffix slice is replaced with
    ``str.endswith(".zip")``, which also handles hrefs shorter than four
    characters cleanly.

    NOTE(review): hrefs are written exactly as found on the page. If the
    site ever emits a *relative* .zip link, the generated curl line would
    need it resolved against the page URL first — confirm the links are
    absolute before relying on this.
    """
    response = requests.get(url, headers=headers)
    soup = Bs4(response.text, "lxml")
    urls = soup.find_all("a")
    for anchor in urls:
        href = anchor.get("href")
        if href and href.endswith(".zip"):
            ziplink_out = str(href)
            print(ziplink_out)
            zip_list.append(ziplink_out)
            print("get-zipfile-link: " + ziplink_out)
            o_file.writelines("curl -OL " + ziplink_out + "\n")
            # Throttle: one second per harvested link, to be polite to
            # the server on subsequent page fetches.
            time.sleep(1)
    return
if __name__ == "__main__":
    urllist = get_index_url()
    print("index link numbers: " + str(len(urllist)))
    leng = len(urllist)
    # Only the second half of the index pages is processed here — this is
    # "step 2"; presumably step 1 handled the first half. TODO confirm.
    #
    # Fix: use floor division. The original `leng/2` is an int only under
    # Python 2; under Python 3 it is a float and list slicing raises
    # TypeError. `leng // 2` is identical on Python 2 and correct on 3.
    downlist = get_download_url(urllist[leng // 2:leng])
    print("download link numbers: " + str(len(downlist)))
    zip_list = list(set(zip_list))
    print("zipfile numbers: " + str(len(zip_list)))
    # Trailing summary comment written into download.sh before closing.
    o_file.writelines("\n")
    o_file.writelines("# Zip File Numbers: " + str(len(zip_list)))
    o_file.writelines("\n")
    o_file.close()