#!/usr/bin/env python
# coding: utf-8

# NOTE: this script requires tqdm

# In[1]:


from importlib import reload
from pikax import *
import multiprocessing as mp
from tqdm import tqdm
import os
import itertools
import requests

# disable all logs
settings.LOG_STD = False
settings.LOG_INFORM = False
settings.LOG_WARN = False
settings.LOG_NORMAL = False
settings.MAX_PAGES_PER_ARTWORK = 1
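
# Note: AndroidAPIClient in main() below is constructed with settings.username and
# settings.password, so those are assumed to be configured in your pikax settings
# before running this script.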
# In[2]:

# multi-process worker
def worker_proc(fn, queue, stop_item):
    while True:
        item = queue.get()
        if item == stop_item:
            break
        fn(item)


# multi-process init
def concurrent_run(fn, inputs, num_workers=4, stop_item=None):
    queue = mp.Queue(1)
    # stop_item is the sentinel that tells a worker to stop
    workers = []
    for _ in range(num_workers):
        workers.append(mp.Process(target=worker_proc, args=(fn, queue, stop_item)))
    for worker in workers:
        worker.start()
    tqdm_iter = tqdm(inputs, ncols=100, desc=tqdm_desc_pre)
    for input_ in tqdm_iter:
        tqdm_iter.set_description_str(desc=f'{tqdm_desc_pre} {input_[0]}')
        queue.put(input_)
    for _ in workers:
        queue.put(stop_item)
    for worker in workers:
        worker.join()
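
# How the two functions above fit together (explanatory note): concurrent_run acts as
# the producer, feeding input items (here, (id, path) tuples) into a bounded queue
# (mp.Queue(1), so the producer never runs far ahead of the workers), while each
# worker_proc process consumes items and calls fn(item) until it receives the
# stop_item sentinel. One sentinel is queued per worker so every process exits
# before join() returns.
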
# In[3]:

# given an illust id and a download path,
# download the given artwork
def download_id(item):
    id_, path_ = item
    try:
        # init an illustration
        illust = Illust(id_)
        # filter by likes
        if illust.likes < like_threshold:
            return
        # get one page from the artwork; alternatively you could use a for loop,
        # but settings.MAX_PAGES_PER_ARTWORK has been set to one
        status, (download_url, headers), filename = next(iter(illust))
        os.makedirs(path_, exist_ok=True)
        image_path = os.path.join(path_, filename)
        if os.path.exists(image_path):
            # return if we have already downloaded the artwork
            return
        # actual download
        with open(image_path, 'wb') as file:
            with requests.get(download_url, headers=headers, stream=True) as req:
                req.raise_for_status()
                for chunk in req.iter_content(1024):
                    file.write(chunk)
    except (ArtworkError, requests.RequestException, OSError):
        tqdm.write(f'{id_} failed')
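
# Minimal usage sketch (hypothetical artwork id, not part of the demo):
#
#     download_id((12345678, 'images_data/data/example'))
#
# would download that single illustration into the given folder, provided it has at
# least like_threshold likes (configured below) and has not been downloaded already.
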
# In[4]:

# the maximum number of artworks to download per search
search_limit = 200
# the minimum number of likes an artwork needs to be downloaded
like_threshold = 50
# number of worker processes to use
num_workers = 4
# download path
path = 'images_data/data/{name}'
# tqdm setting
tqdm_desc_pre = 'Process Artworks'
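
# path is a template string; main() below fills in {name} with each search keyword,
# e.g. path.format(name='lovelive') -> 'images_data/data/lovelive'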

def main():
    # tag names to download
    search_names = ['黑子的篮球', 'lovelive', '進撃の巨人', '银魂', 'one piece', '十六夜咲夜', '名探偵コナン']
    # add some if you like!
    # '琪露诺', '碧蓝航线', '鬼灭之刃', 'JoJo的奇妙冒险', '龙珠', '游戏王', '芙兰朵露・斯卡蕾特', '少女前线',
    # '晓美焰', '轻音少女', '鹿目圆', '灵梦', '东京喰种'

    # init a client to search for artwork ids
    client = AndroidAPIClient(settings.username, settings.password)
    tqdm_names = tqdm(search_names, ncols=75)
    for name in tqdm_names:
        tqdm_names.set_description_str(desc=f'Class: {name}')
        # get ids using the client
        ids = client.search(keyword=name, limit=search_limit)
        download_path = path.format(name=name)
        # run the concurrent download
        concurrent_run(download_id, list(zip(ids, itertools.repeat(download_path))), num_workers=num_workers)

# In[5]:

if __name__ == '__main__':
    main()