#! python3
# coding: utf-8
import json
import logging
import os
import time
from collections import deque
import requests
import schedule
import tweepy
__version__ = "1.3.0"
# File and directory names
CONFIG_FILE = "config.json"
IMG_DIR = "img"
RECENT_IDS_FILE = "recentids.txt"
DB_DUMP_FILE = "danbooru_dump.txt"
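# For reference, a config.json that satisfies verify_keys() below looks
# roughly like this (all values are illustrative placeholders):
#   {
#       "tags": ["tag1", "tag2"],                  <- 1 or 2 search tags
#       "blacklist": {"tags": [], "artists": []},
#       "twitter_keys": {
#           "consumer": "...", "consumer_secret": "...",
#           "access": "...", "access_secret": "..."
#       },
#       "score": 5,                                <- minimum post score
#       "favorites": 10,                           <- minimum favorite count
#       "limit": 100                               <- posts per request (1-100) or null
#   }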
# Templates
DB_URL = "http://danbooru.donmai.us"
DB_API_URL = DB_URL + "/{endpoint}.json{params}"
PIXIV_URL = "http://www.pixiv.net/member_illust.php?mode=medium&illust_id={id}"
DA_URL = "http://{artist}.deviantart.com/gallery/#/{id}"
LOG_FMT = "%(levelname)s (%(name)s): %(message)s"
# Preset tag blacklist, mostly tags that are too explicit
TAG_BLACKLIST = (
    "pregnant",
    "diaper",
    "inflation",
    "panties",
    "guro",
    "scat",
    "peeing",
    "comic",
    "bikini",
    "chastity_belt",
    "trefoil",
    "undressing",
    "spread_legs",
    "pussy",
    "nipples",
    "censored",
    "cum",
    "nude",
    "sex",
    "facial",
    "vaginal",
    "cum_on_body",
    "convenient_censoring",
    "bottomless",
    "covering_breasts",
    "groin",
    "cameltoe",
    "panty_lift",
    "french_kiss",
    "underboob",
    "between_breasts",
    "lingerie",
    "ebola",
    "navel_cutout",
    "partially_visible_vulva",
    "ball_gag",
    "bdsm",
    "bondage",
    "gag",
    "gagged",
    "spoilers",
    "penis",
    "disembodied_penis")
# Mostly usernames of artists who don't want their art reposted
USER_BLACKLIST = (
    "khee",
    "bakakhee",
    "cactuskhee",
    "junkhee")
# Domains accepted as source URLs
# (except Twitter, Pixiv, and deviantart.net, which are processed separately)
SOURCE_DOMAINS = (
    "tumblr.com",
    "deviantart.com",
    "twitpic.com",
    "seiga.nicovideo.jp")
# Use for verifying content type before downloading an image
ALLOWED_CONTENT_TYPES = ("image/jpeg", "image/png", "image/gif",
                         "binary/octet-stream")
# Post ID number of "TrainerTrish" art
TRISH_ID = 2575437
logger = logging.getLogger(__name__)
config_dict = {}
# For debugging purposes
db_request_raw = ""
class ImageQueue():

    def __init__(self):
        self._items = []

    def enqueue(self, post_id: str, image_uri: str, source: str = None):
        # New items go in at the front; dequeue() pops from the back (FIFO)
        item = (str(post_id), image_uri, source)
        self._items.insert(0, item)

    def dequeue(self):
        return self._items.pop()

    def __len__(self):
        return len(self._items)

    def __str__(self):
        return str(self._items)

    def is_empty(self):
        return len(self) < 1

    def get_first_item(self):
        # Peek at the next item to be dequeued without removing it
        return self._items[-1] if self._items else None

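# Usage sketch (illustrative values): enqueue() inserts at the front and
# dequeue() pops from the back, so posts come out first-in, first-out:
#   q = ImageQueue()
#   q.enqueue(100, "http://example.com/a.png")
#   q.enqueue(200, "http://example.com/b.png", "@artist")
#   q.get_first_item()  # peeks ("100", "http://example.com/a.png", None)
#   q.dequeue()         # returns the same tuple and removes it
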
class TweetPicBot():

    def __init__(self, keys: dict):
        auth = tweepy.OAuthHandler(keys["consumer"], keys["consumer_secret"])
        auth.set_access_token(keys["access"], keys["access_secret"])
        self._api = tweepy.API(auth)
        self._authenticate()

    def _authenticate(self):
        try:
            user = self._api.verify_credentials().screen_name
            logger.info(
                "Twitter API keys verified successfully, authenticated as @%s",
                user)
        except tweepy.TweepError as t:
            log_tweepy_err(t, "Can't verify Twitter API keys")
            raise SystemExit

    def send_tweet(self, media_path: str, tweet=""):
        # Return True if tweet was sent successfully, otherwise False
        try:
            logger.debug("Uploading %s", media_path)
            media_id = self._api.media_upload(media_path).media_id_string
            logger.debug("Sending tweet")
            self._api.update_status(status=tweet, media_ids=[media_id])
            return True
        except tweepy.TweepError as t:
            log_tweepy_err(t, "Failed to send tweet")
            return False

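# Usage sketch (placeholder keys): the constructor authenticates immediately
# and raises SystemExit on bad keys; send_tweet() reports success as a bool:
#   bot = TweetPicBot({"consumer": "...", "consumer_secret": "...",
#                      "access": "...", "access_secret": "..."})
#   if bot.send_tweet("img/100.png", "Source: @artist"):
#       print("tweet sent")
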
image_queue = ImageQueue()
recent_ids = deque([], 25)
def log_tweepy_err(e: tweepy.TweepError, prefix: str = ""):
    # Tweepy's TweepError exception class is weird,
    # that's why I have this set up
    errmsg = prefix + ": " if prefix else ""
    if e.api_code:
        code = e.api_code
        msg = e.args[0][0]["message"]
        errmsg += "{0} (error code {1})".format(msg, code)
    else:
        errmsg += str(e)
    logger.error(errmsg)

def parse_config():
    global config_dict
    with open(CONFIG_FILE) as f:
        config_dict = json.load(f)
    verify_keys()

def verify_keys():
    # TODO: clean up this function
    def do_assert(condition, err_msg: str):
        assert condition, err_msg

    def verify_blacklist_keys():
        blacklist = config_dict["blacklist"]
        for k in ("tags", "artists"):
            do_assert(k in blacklist,
                      "Required key \"%s\" not found in blacklist config" % k)
            do_assert(isinstance(blacklist[k], list), "Required blacklist key "
                      "\"%s\" must have value of type array (list)" % k)

    def verify_twitter_keys():
        twkeys = config_dict["twitter_keys"]
        for k in ("consumer", "consumer_secret", "access", "access_secret"):
            do_assert(k in twkeys,
                      "Required key \"%s\" not found in Twitter keys config" % k)
            do_assert(isinstance(twkeys[k], str) and twkeys[k] != "",
                      "Required key \"%s\" must have value of type string "
                      "and can't be blank" % k)

    for k in ("tags", "blacklist", "twitter_keys", "score", "favorites",
              "limit"):
        do_assert(k in config_dict,
                  "Required key \"%s\" not found in config" % k)
        v = config_dict[k]
        if k in ("blacklist", "twitter_keys"):
            do_assert(isinstance(v, dict), "Required key "
                      "\"%s\" must have value of type object (dict)" % k)
        elif k == "tags":
            do_assert(isinstance(v, list), "Required key "
                      "\"%s\" must have value of type array (list)" % k)
            do_assert(len(v) < 3,
                      "Search queries are limited to 2 tags")
            do_assert(len(v) > 0, "Tags cannot be blank")
        elif k in ("score", "favorites"):
            do_assert(isinstance(v, int), "Required key "
                      "\"%s\" must have value of type integer" % k)
        elif k == "limit":
            do_assert(isinstance(v, int) or v is None, "Required key "
                      "\"%s\" must have value of type integer or null" % k)
            if isinstance(v, int):
                do_assert(0 < v <= 100, "Required key "
                          "\"%s\" is out of range (must be between 1 and 100)" % k)
        else:
            do_assert(isinstance(v, str), "Required key "
                      "\"%s\" must have value of type string and can't be blank" % k)

    verify_twitter_keys()
    verify_blacklist_keys()

def get_danbooru_request(endpoint: str, params: dict):
    # Convert params dict to URI string (non-string keys/values are skipped)
    if params:
        params_list = []
        params_str = "?"
        for k in params:
            if not (isinstance(k, str) and isinstance(params[k], str)):
                continue
            params_list.append("{0}={1}".format(k, params[k]))
        params_str += "&".join(params_list)
    else:
        params_str = ""
    global db_request_raw
    r = requests.get(DB_API_URL.format(endpoint=endpoint, params=params_str))
    db_request_raw = r.content.decode()
    return r.json()

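# Illustrative example (hypothetical tag): get_danbooru_request("posts",
# {"tags": "cat_girl", "random": "true", "limit": "20"}) issues
# GET http://danbooru.donmai.us/posts.json?tags=cat_girl&random=true&limit=20
# and returns the decoded JSON response (a list of post dicts here).
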
def populate_queue(attempts=1):
    # Step 1: Assemble URI parameters
    tags_str = "+".join(config_dict["tags"])
    logger.info("Building post queue for tag(s) \"%s\"", tags_str)
    params = {
        "tags": tags_str,
        "random": "true"
    }
    if config_dict["limit"]:
        params["limit"] = str(config_dict["limit"])
    # Step 2: Get request and check if it returned any posts
    posts = get_danbooru_request("posts", params)
    assert posts, "Provided tag(s) \"%s\" returned no posts" % tags_str
    # Step 3: Iterate through and filter posts
    # Unfiltered posts are added to image queue
    postcount = 0
    for post in posts:
        # Evaluate post data for filtering
        if not eval_post(post):
            continue
        # Enqueue post info
        postid = post["id"]
        # Use "large_file_url" just in case post's actual image is too big
        url = post["large_file_url"]
        # Check if image URL is within Danbooru domain
        # (i.e. root directory of website) and add domain prefix if true
        if url.startswith("/"):
            url = DB_URL + url
        source = get_source(post)
        image_queue.enqueue(postid, url, source)
        logger.debug("Added post ID %s to queue", postid)
        postcount += 1
    # Step 4: Log queue size when done, otherwise run function again
    if postcount > 0:
        logger.info("%s/%s images added to queue, current queue size is now %s",
                    postcount, len(posts), len(image_queue))
        return
    # Give up after 3 attempts
    if attempts >= 3:
        raise SystemExit
    logger.info("No matching images added to queue, retrying in 5s")
    attempts += 1
    time.sleep(5)
    populate_queue(attempts)

def eval_post(post: dict):
    # Returns False if given post is caught by any filters below
    postid = post["id"]
    # Check if post is banned (no image available)
    if post["is_banned"]:
        logger.debug("Post ID %s is banned and skipped", postid)
        return False
    # Check if rating is q(uestionable) or e(xplicit)
    if post["rating"] != "s":
        logger.debug("Post ID %s skipped due to rating (rated %s)", postid,
                     post["rating"])
        return False
    # Evaluate tags, score, favorite count, and filetype
    return (eval_tags(post["tag_string"], postid) and
            eval_score(post["score"], postid) and
            eval_favorites(post["fav_count"], postid) and
            eval_filetype(post["large_file_url"], postid))

def eval_tags(tag_string: str, postid):
    # Return True if no tags are in blacklist, otherwise return False
    tags = tag_string.split()
    blacklist_config = config_dict["blacklist"]
    for t in tags:
        if t in TAG_BLACKLIST or t in blacklist_config["tags"]:
            logger.debug("Post ID %s contains blacklisted tag: %s", postid, t)
            return False
        if t in USER_BLACKLIST or t in blacklist_config["artists"]:
            logger.debug("Post ID %s is by blacklisted artist: %s", postid, t)
            return False
    return True

def eval_score(score: int, postid):
    # Return True if post's score meets threshold, otherwise return False
    if score >= config_dict["score"]:
        return True
    logger.debug("Post ID %s did not meet score threshold of %s", postid,
                 config_dict["score"])
    return False

def eval_favorites(count: int, postid):
    # Same as eval_score, but evaluates based on post's favorite count
    if count >= config_dict["favorites"]:
        return True
    logger.debug("Post ID %s did not meet favorite count threshold of %s",
                 postid, config_dict["favorites"])
    return False

def eval_filetype(filename: str, postid):
    # Allow only common image extensions
    filetype = filename.split(".")[-1]
    if filetype.lower() in ("jpg", "jpeg", "png", "gif"):
        return True
    logger.debug("Post ID %s has invalid filetype of .%s", postid, filetype)
    return False

def get_source(post: dict):

    def get_da_permalink(url: str):
        # Extract artist name and art ID from a deviantart.net filename
        info = url.split("_by_")[-1].split(".")[0]
        info_dash_split = info.split("-")
        if len(info_dash_split) < 2:
            return url
        # The art ID is the last dash-separated piece; rejoin the rest so
        # hyphenated usernames survive the split
        art_id = info_dash_split[-1]
        artist = "-".join(info_dash_split[:-1])
        return DA_URL.format(artist=artist, id=art_id)

    if "bad_id" in post["tag_string_general"]:
        logger.debug("Post ID %s contains tag \"bad_id\"", post["id"])
        return
    if post["pixiv_id"]:
        return PIXIV_URL.format(id=post["pixiv_id"])
    source = post["source"]
    if source.startswith("https://twitter.com/"):
        return "@" + source.split("/")[3]
    if "deviantart.net/" in source:
        return get_da_permalink(source)
    for domain in SOURCE_DOMAINS:
        if domain + "/" in source:
            return source
    return

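# Example of the deviantart.net handling above (hypothetical source URL):
# "http://fc00.deviantart.net/artwork_by_someartist-d1a2b3c.png" splits into
# artist "someartist" and ID "d1a2b3c", yielding the permalink
# "http://someartist.deviantart.com/gallery/#/d1a2b3c".
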
def dump_db_request(error):
    # Dumps response content to file when bot runs into an error
    # while building queue
    if type(error) in (SystemExit, KeyboardInterrupt, AssertionError):
        return
    with open(DB_DUMP_FILE, "w", encoding="utf_8") as f:
        f.write(db_request_raw)
    logger.info("Error occurred while populating queue, "
                "response content dumped to %s", DB_DUMP_FILE)

def post_image(bot: TweetPicBot):
    # Step 1: Repopulate queue if size is less than 5
    if len(image_queue) < 5:
        try:
            populate_queue()
        except Exception as e:
            dump_db_request(e)
            raise
    # Step 2: Check if post ID was already posted in the last 25 tweets
    postdata = image_queue.get_first_item()
    postid = postdata[0]
    while postid in recent_ids:
        # Discard post in queue
        image_queue.dequeue()
        logger.debug("Post ID %s was uploaded in the last 25 tweets", postid)
        postdata = image_queue.get_first_item()
        if postdata is None:
            # Every queued post was a recent repeat; try again later
            logger.info("Queue exhausted, will retry at next scheduled interval")
            return
        postid = postdata[0]
    # Step 3: Download image to file
    url = postdata[1]
    try:
        file_path = download_file(postid, url)
    except IOError as err:
        # If received content type is not in ALLOWED_CONTENT_TYPES list above,
        # then move on to next post in queue
        logger.error("%s, moving on to next post in queue", err)
        image_queue.dequeue()
        # Add 1s delay to make sure we're not flooding Danbooru with requests
        time.sleep(1)
        post_image(bot)
        return
    except Exception:
        logger.exception("Failed to download image for post ID %s, "
                         "will retry at next scheduled interval", postid)
        return
    # Step 4: Prepare tweet content
    # (the queue stores post IDs as strings, so compare against str(TRISH_ID))
    if postid == str(TRISH_ID):
        source_str = "Trish-chan (aka @PlayerOneTimmy as a girl)\n"
    else:
        source_str = ""
    source = postdata[2]
    if source:
        source_str += "Source: %s" % source
    # Step 5: Send tweet and add post ID to recent IDs list
    if bot.send_tweet(file_path, source_str):
        logger.info("Tweet sent successfully! "
                    "Post ID of uploaded image was %s", postid)
        # Discard post from queue when done
        image_queue.dequeue()
        logger.debug("%s post(s) remaining in queue", len(image_queue))
        # Save recent IDs file
        recent_ids.append(postid)
        save_recent_ids()
    else:
        logger.info(
            "Tweet for post ID %s will be sent at next scheduled interval",
            postid)

def download_file(postid: str, url: str):
    # based on http://stackoverflow.com/a/16696317
    local_filename = "{0}.{1}".format(postid, url.split(".")[-1])
    path = "{0}/{1}".format(IMG_DIR, local_filename)
    if local_filename in os.listdir(IMG_DIR):
        logger.debug("Image already exists: %s", path)
        return path
    logger.info("Downloading post ID %s to %s", postid, path)
    time_start = time.time()
    chunks = 0  # for checking 0-byte files
    r = requests.get(url, stream=True)
    # Check Content-Type header in case Danbooru returns HTML/XML file
    # instead of an image for any reason
    if "Content-Type" in r.headers:
        # Strip out extra stuff in content headers for some images
        # (example: "image/jpeg; charset=utf-8")
        content_type = r.headers["Content-Type"].split("; ")[0]
        if content_type not in ALLOWED_CONTENT_TYPES:
            raise IOError("Content type '%s' is invalid for media upload"
                          % content_type)
    with open(path, "wb") as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)
                chunks += 1  # count each valid chunk
    time_end = time.time()
    # Raise error if no valid chunks were downloaded
    if chunks < 1:
        if local_filename in os.listdir(IMG_DIR):
            os.remove(path)
        raise IOError("0-byte file downloaded")
    # Raise error if downloaded size doesn't match expected size
    if "Content-Length" in r.headers:
        expected_size = int(r.headers["Content-Length"])
        filesize = os.stat(path).st_size
        if filesize != expected_size:
            os.remove(path)
            raise IOError(
                "Filesize mismatch (possible corrupted/interrupted download)"
            )
    elapsed = round(time_end - time_start, 3)
    logger.info("Completed downloading %s in %ss", local_filename, elapsed)
    return path

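# Downloads land in img/ named "<post id>.<extension>"; e.g. a call like
# download_file("2575437", "http://danbooru.donmai.us/data/sample.png")
# (illustrative URL) would save to img/2575437.png.
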
def logging_setup():
    logging.basicConfig(format=LOG_FMT, level=logging.INFO)
    logging.Formatter.converter = time.gmtime
    filehandler = logging.FileHandler("events.log")
    fmt = logging.Formatter("[%(asctime)s] " + LOG_FMT, "%Y-%m-%dT%H:%M:%SZ")
    filehandler.setFormatter(fmt)
    logging.getLogger().addHandler(filehandler)
    # Quiet down noisy third-party loggers
    logging.getLogger("oauthlib").setLevel(logging.WARNING)
    logging.getLogger("requests").setLevel(logging.WARNING)
    logging.getLogger("requests_oauthlib").setLevel(logging.WARNING)
    logging.getLogger("schedule").setLevel(logging.WARNING)
    logging.getLogger("tweepy").setLevel(logging.WARNING)

def load_recent_ids():
    if RECENT_IDS_FILE not in os.listdir():
        logger.debug("%s not found in current directory, skipping",
                     RECENT_IDS_FILE)
        return
    with open(RECENT_IDS_FILE) as f:
        id_count = 0
        for line in f:
            line = line.strip("\n")
            if line.isdigit():
                recent_ids.append(line)
                id_count += 1
        logger.debug("Found %s post ID(s) in file", id_count)
    logger.info("Recent post IDs loaded")

def save_recent_ids():
    with open(RECENT_IDS_FILE, mode="w") as f:
        f.write("\n".join(recent_ids))
    logger.debug("Saved last 25 post IDs to %s", RECENT_IDS_FILE)

def main_loop(interval: int = 30):
    # Check interval range
    assert 60 > interval > 0, "Interval must be between 1 and 59 minutes"
    # Make images directory if it doesn't exist
    if IMG_DIR not in os.listdir():
        logger.info("Creating images directory")
        os.mkdir(IMG_DIR)
    # Set up Twitter API client
    bot = TweetPicBot(config_dict["twitter_keys"])
    # Build initial queue, then set up schedule
    # Post immediately if current UTC minute is divisible by interval
    current_min = time.gmtime().tm_min
    if current_min % interval == 0:
        post_image(bot)
    else:
        try:
            populate_queue()
        except Exception as e:
            dump_db_request(e)
            raise
    # For hourly jobs, schedule's at() takes "MM:SS", so zero-pad the minute
    # and fire on the first second of each scheduled minute
    for m in range(0, 60, interval):
        schedule.every().hour.at("%02d:00" % m).do(post_image, bot)
    while True:
        schedule.run_pending()
        time.sleep(1)

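# With the default interval of 30, the loop above registers hourly jobs at
# "00:00" and "30:00", i.e. the bot posts on the hour and on the half hour.
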
if __name__ == "__main__":
    logging_setup()
    logger.info("Timmy's Danbooru Twitter Bot v%s is starting up",
                __version__)
    try:
        load_recent_ids()
        parse_config()
        main_loop()
    except (KeyboardInterrupt, SystemExit):
        # Use Ctrl-C to terminate the bot
        logger.info("Now shutting down")
    except AssertionError as e:
        logger.error(e)
    except Exception:
        logger.exception("Exception occurred, now shutting down")
    schedule.clear()