ao3_work_ids.py
# Retrieve fic ids from an AO3 search
# Returns ids in the order the search lists them
# Saves ids to a csv for later use, e.g. to retrieve fic text
# Options:
#   Only retrieve multichapter fics
#   Modify the search to include a list of tags
#   (e.g. you want all fics tagged either "romance" or "fluff")
import argparse
import csv
import datetime
import os
import re
import sys
import time

import requests
from bs4 import BeautifulSoup
page_empty = False
base_url = ""
url = ""
num_requested_fic = 0
num_recorded_fic = 0
csv_name = ""
multichap_only = False
tags = []
# keep track of all processed ids to avoid repeats:
# this is separate from the temporary batch of ids
# that are written to the csv and then forgotten
seen_ids = set()
#
# Ask the user for:
# a url of a works listed page
# e.g.
# https://archiveofourown.org/works?utf8=%E2%9C%93&work_search%5Bsort_column%5D=word_count&work_search%5Bother_tag_names%5D=&work_search%5Bquery%5D=&work_search%5Blanguage_id%5D=&work_search%5Bcomplete%5D=0&commit=Sort+and+Filter&tag_id=Harry+Potter+-+J*d*+K*d*+Rowling
# https://archiveofourown.org/tags/Harry%20Potter%20-%20J*d*%20K*d*%20Rowling/works?commit=Sort+and+Filter&page=2&utf8=%E2%9C%93&work_search%5Bcomplete%5D=0&work_search%5Blanguage_id%5D=&work_search%5Bother_tag_names%5D=&work_search%5Bquery%5D=&work_search%5Bsort_column%5D=word_count
# how many fics they want
# what to call the output csv
#
# If you would like to add additional search terms (i.e. the fics should contain at least one of them, but not necessarily all),
# specify these in the tag csv, one per row.
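#
# Example invocation (the file names and user-agent string below are placeholders,
# not values the script requires):
#   python ao3_work_ids.py "https://archiveofourown.org/works?...&tag_id=Harry+Potter+-+J*d*+K*d*+Rowling" \
#       --out_csv hp_work_ids --num_to_retrieve 500 \
#       --header "my_scraper/1.0 (contact@example.com)" --tag_csv my_tags.csv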
def get_args():
    global base_url
    global url
    global csv_name
    global num_requested_fic
    global multichap_only
    global tags
    parser = argparse.ArgumentParser(description='Scrape AO3 work IDs given a search URL')
    parser.add_argument(
        'url', metavar='URL',
        help='a single URL pointing to an AO3 search page')
    parser.add_argument(
        '--out_csv', default='work_ids',
        help='csv output file name')
    parser.add_argument(
        '--header', default='',
        help='user http header')
    parser.add_argument(
        '--num_to_retrieve', default='a',
        help='how many fic ids you want (default: all)')
    parser.add_argument(
        '--multichapter_only', default='',
        help='only retrieve ids for multichapter fics')
    parser.add_argument(
        '--tag_csv', default='',
        help='an optional csv of tags; retrieved fics must have at least one of them')
    args = parser.parse_args()
    url = args.url
    # keep an unpaginated copy of the search url so tags can be added to it later
    base_url = args.url
    csv_name = str(args.out_csv)
    # 'a' (the default) means retrieve everything the search returns
    if str(args.num_to_retrieve) == 'a':
        num_requested_fic = -1
    else:
        num_requested_fic = int(args.num_to_retrieve)
    # any non-empty value turns the multichapter filter on
    multichap_only = str(args.multichapter_only) != ""
    tag_csv = str(args.tag_csv)
    if tag_csv:
        with open(tag_csv, "r") as tags_f:
            tags_reader = csv.reader(tags_f)
            for row in tags_reader:
                tags.append(row[0])
    header_info = str(args.header)
    return header_info
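# Example contents of the optional tag csv (a hypothetical "my_tags.csv"; only the
# first column of each row is read, one tag per row):
#   Fluff
#   Romance
#   Hurt/Comfort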
#
# navigate to a works listed page,
# then extract all work ids
#
def get_ids(header_info=''):
    global page_empty
    global seen_ids
    # make the request. if we get a 429 (rate limited), wait and try again
    headers = {'user-agent': header_info}
    req = requests.get(url, headers=headers)
    while req.status_code == 429:
        # >5 second delay between requests as per AO3's terms of service
        print("Request answered with Status-Code 429, retrying...")
        time.sleep(10)
        req = requests.get(url, headers=headers)
    soup = BeautifulSoup(req.text, "lxml")
    # some responsiveness in the "UI"
    sys.stdout.write('.')
    sys.stdout.flush()
    works = soup.select("li.work.blurb.group")
    # see if we've gone too far and run out of fic:
    if len(works) == 0:
        page_empty = True
    # process the list for new fic ids
    ids = []
    for tag in works:
        if multichap_only:
            # skip single-chapter works ("1/1")
            chaps = tag.find('dd', class_="chapters")
            if chaps.text == "1/1":
                continue
        # each blurb's id attribute looks like "work_12345678"; strip the "work_" prefix
        t = tag.get('id')
        t = t[5:]
        if t not in seen_ids:
            ids.append(t)
            seen_ids.add(t)
    return ids
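# For reference, a search-result blurb typically looks like the snippet below
# (illustrative markup; exact attributes may differ), which is what the
# "li.work.blurb.group" selector and the t[5:] slice above rely on:
#   <li id="work_12345678" class="work blurb group"> ... </li>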
#
# update the url to move to the next page
# note that if you go too far, ao3 won't error,
# but there will be no works listed
#
def update_url_to_next_page():
    global url
    key = "page="
    start = url.find(key)
    # there is already a page indicator in the url
    if start != -1:
        # find where in the url the page number starts and ends
        page_start_index = start + len(key)
        page_end_index = url.find("&", page_start_index)
        # if it's in the middle of the url
        if page_end_index != -1:
            page = int(url[page_start_index:page_end_index]) + 1
            url = url[:page_start_index] + str(page) + url[page_end_index:]
        # if it's at the end of the url
        else:
            page = int(url[page_start_index:]) + 1
            url = url[:page_start_index] + str(page)
    # there is no page indicator, so we are on page 1
    else:
        # there are other query parameters already
        if url.find("?") != -1:
            url = url + "&page=2"
        # there are no query parameters yet
        else:
            url = url + "?page=2"
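# Example of the pagination step (illustrative URL):
#   before: https://archiveofourown.org/works?tag_id=Fluff&page=2&work_search%5Bcomplete%5D=0
#   after:  https://archiveofourown.org/works?tag_id=Fluff&page=3&work_search%5Bcomplete%5D=0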
# modify the base_url to include the new tag, and save to the global url
def add_tag_to_url(tag):
    global url
    key = "&work_search%5Bother_tag_names%5D="
    # note: find() returns -1 when the key is missing, so compare explicitly
    if base_url.find(key) != -1:
        start = base_url.find(key) + len(key)
        new_url = base_url[:start] + tag + "%2C" + base_url[start:]
        url = new_url
    else:
        url = base_url + "&work_search%5Bother_tag_names%5D=" + tag
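# Example (illustrative): adding the tag "Fluff" to a base_url that already has an
# other_tag_names parameter prepends it to that comma-separated list
# ("%2C" is a URL-encoded comma):
#   ...&work_search%5Bother_tag_names%5D=Fluff%2C<existing tags>&...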
#
# after every page, write the gathered ids
# to the csv, so a crash doesn't lose everything.
# include the url where it was found,
# so an interrupted search can be restarted
#
def write_ids_to_csv(ids):
    global num_recorded_fic
    with open(csv_name + ".csv", 'a', newline="") as csvfile:
        wr = csv.writer(csvfile, delimiter=',')
        for work_id in ids:
            if not_finished():
                wr.writerow([work_id, url])
                num_recorded_fic = num_recorded_fic + 1
            else:
                break
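# A row of the output csv therefore looks like (illustrative values):
#   12345678,https://archiveofourown.org/works?...&page=3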
#
# if you want everything, you're not done
# otherwise compare recorded against requested.
# recorded doesn't update until it's actually written to the csv.
# If you've gone too far and there are no more fic, end.
#
def not_finished():
    if page_empty:
        return False
    if num_requested_fic == -1:
        return True
    else:
        if num_recorded_fic < num_requested_fic:
            return True
        else:
            return False
#
# include a text file with the starting url,
# and the number of requested fics
#
def make_readme():
    with open(csv_name + "_readme.txt", "w") as text_file:
        text_file.write("url: " + url + "\n" +
                        "num_requested_fic: " + str(num_requested_fic) + "\n" +
                        "retrieved on: " + str(datetime.datetime.now()))
# reset flags to run again
# note: do not reset seen_ids
def reset():
    global page_empty
    global num_recorded_fic
    page_empty = False
    num_recorded_fic = 0


def process_for_ids(header_info=''):
    while not_finished():
        # 5 second delay between requests as per AO3's terms of service
        time.sleep(5)
        ids = get_ids(header_info)
        write_ids_to_csv(ids)
        update_url_to_next_page()
def load_existing_ids():
    global seen_ids
    if os.path.exists(csv_name + ".csv"):
        print("skipping existing IDs...\n")
        with open(csv_name + ".csv", 'r') as csvfile:
            id_reader = csv.reader(csvfile)
            for row in id_reader:
                seen_ids.add(row[0])
    else:
        print("no existing file; creating new file...\n")
def main():
    header_info = get_args()
    make_readme()
    print("loading existing file...\n")
    load_existing_ids()
    print("processing...\n")
    if tags:
        for t in tags:
            print("Getting tag: ", t)
            reset()
            add_tag_to_url(t)
            process_for_ids(header_info)
    else:
        process_for_ids(header_info)
    print("That's all, folks.")


if __name__ == "__main__":
    main()