-
Notifications
You must be signed in to change notification settings - Fork 0
/
fetcher.py
84 lines (78 loc) · 3.05 KB
/
fetcher.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/usr/bin/env python3
from requests import get, post
from json import dump, load
from sys import argv
from os import environ, makedirs
import os.path
# API key sent as the "key" query parameter on the googleapis.com requests
# below; read from the YOUTUBE_KEY environment variable (None when unset).
key = environ.get("YOUTUBE_KEY")
def get_vids(channel_id, cache=True):
    """Yield the video IDs listed for a channel, preferring the disk cache.

    When *cache* is true and ``cache/<channel_id>/ids`` can be read, the
    IDs stored there (one per line) are yielded and no request is made.
    Otherwise the ``playlistItems`` endpoint is paged through, each ID is
    yielded as soon as its page arrives, and the full list is written to
    the cache file after the last page has been consumed.

    Args:
        channel_id: Channel ID string. The playlist queried is ``"UU"``
            followed by the ID minus its first two characters
            (NOTE(review): presumably the channel's uploads playlist --
            confirm against the YouTube API docs).
        cache: When false, ignore any cache file and always query the API.

    Yields:
        Video ID strings.

    Raises:
        KeyError: If an API response lacks the expected ``"items"`` /
            ``"snippet"`` structure (e.g. an error payload).
    """
    cache_dir = os.path.join("cache", channel_id)
    ids_path = os.path.join(cache_dir, "ids")

    if cache:
        # Read the whole file *before* yielding so that no ``yield`` sits
        # inside the try block: closing the generator early must not be
        # mistaken for a cache miss, and a read error can never produce a
        # partial cached sequence followed by a duplicate API sequence.
        try:
            with open(ids_path) as f:
                cached_ids = [line.strip() for line in f]
        except (OSError, UnicodeError):
            cached_ids = None  # missing or unreadable cache: fall through
        if cached_ids is not None:
            yield from cached_ids
            return

    api_url = "https://www.googleapis.com/youtube/v3/playlistItems"
    params = {
        "part": "snippet",
        "maxResults": "50",
        "playlistId": "UU" + channel_id[2:],
        "key": key,
    }
    ids = []
    while True:
        page = get(api_url, params=params).json()
        for item in page["items"]:
            video_id = item["snippet"]["resourceId"]["videoId"]
            ids.append(video_id)
            yield video_id
        if "nextPageToken" not in page:
            break
        params["pageToken"] = page["nextPageToken"]

    # Only reached when every page was consumed, so the cache is never
    # written with a truncated list.
    os.makedirs(cache_dir, exist_ok=True)
    with open(ids_path, "w") as f:
        f.write("\n".join(ids))
def get_subs_data(video_id, cache=True, channel_id=None):
    """Return the subtitle-service JSON for a video, using the disk cache.

    When *cache* is true and *channel_id* is given, the parsed contents of
    ``cache/<channel_id>/<video_id>.json`` are returned if that file can
    be read and decoded. Otherwise the ``getSubtitles`` endpoint is POSTed
    to with ``videoID=<video_id>``, and the decoded response is written to
    the cache before being returned.

    Args:
        video_id: Video ID string.
        cache: When false, skip the cache lookup and always query the
            service.
        channel_id: Channel directory to look in for a cached copy; with
            no channel ID the cache lookup is skipped.

    Returns:
        The decoded JSON object, from cache or from the service.

    Raises:
        KeyError: If the service response has no
            ``["video-details"]["channelId"]`` entry.
    """
    if cache and channel_id:
        try:
            with open(os.path.join("cache", channel_id, video_id + ".json")) as f:
                return load(f)
        except (OSError, ValueError):
            # Missing, unreadable or corrupt cache entry (JSON and Unicode
            # decode errors are ValueError subclasses): fetch it again.
            pass

    data = post(
        "https://vznx16favj.execute-api.us-east-1.amazonaws.com/default/getSubtitles",
        params={"videoID": video_id},
    ).json()

    # The cache directory is derived from the response, not from the
    # channel_id argument, so it may not exist yet; create it on demand.
    target_dir = os.path.join("cache", "UC" + data["video-details"]["channelId"][2:])
    os.makedirs(target_dir, exist_ok=True)
    with open(os.path.join(target_dir, video_id + ".json"), "w") as f:
        dump(data, f)
    return data
def get_yt_subs_data(video_id, channel_id, cache=True):
    """Return the ``captions`` API listing for a video, using the disk cache.

    When *cache* is true and *channel_id* is non-empty, the parsed
    contents of ``cache/<channel_id>/<video_id>.yt`` are returned if that
    file can be read and decoded. Otherwise the ``captions`` endpoint is
    queried with ``part=snippet`` for the video, and the decoded response
    is written to that cache file before being returned.

    Args:
        video_id: Video ID string.
        channel_id: Channel ID string naming the cache sub-directory.
        cache: When false, skip the cache lookup and always query the API.

    Returns:
        The decoded JSON object, from cache or from the API.
    """
    cache_dir = os.path.join("cache", channel_id)
    cache_path = os.path.join(cache_dir, video_id + ".yt")

    if cache and channel_id:
        try:
            with open(cache_path) as f:
                return load(f)
        except (OSError, ValueError):
            # Missing, unreadable or corrupt cache entry (JSON and Unicode
            # decode errors are ValueError subclasses): fetch it again.
            pass

    api_url = "https://www.googleapis.com/youtube/v3/captions"
    params = {"part": "snippet", "videoId": video_id, "key": key}
    data = get(api_url, params=params).json()

    # Make sure the directory exists so the fetched response is not lost
    # to a failed write when the function is used outside main().
    os.makedirs(cache_dir, exist_ok=True)
    with open(cache_path, "w") as f:
        dump(data, f)
    return data
def main():
    """Fetch and cache subtitle data for every video of one channel.

    Command line: a positional ``channel`` ID plus an optional
    ``--no-cache`` flag. Ensures ``cache/<channel>`` exists, collects the
    channel's video IDs, then for each video fetches both subtitle
    sources and prints a one-line summary per source.

    Returns:
        0, for use as the process exit status.
    """
    from argparse import ArgumentParser

    cli = ArgumentParser()
    cli.add_argument("channel")
    cli.add_argument("--no-cache", action="store_true")
    options = cli.parse_args()

    channel = options.channel
    use_cache = not options.no_cache

    cache_dir = os.path.join("cache", channel)
    try:
        os.makedirs(cache_dir)
    except FileExistsError:
        pass

    video_ids = list(get_vids(channel, cache=use_cache))
    print("retrieved list of video ids: {} videos".format(len(video_ids)))

    for video_id in video_ids:
        subs = get_subs_data(video_id, cache=use_cache, channel_id=channel)
        print("got {}: {} subs".format(video_id, subs["subtitles"]["Count"]))
        # The captions lookup always uses the cache, even with --no-cache.
        # NOTE(review): confirm this is intentional.
        yt_subs = get_yt_subs_data(video_id, channel, cache=True)
        print("yt {}: {} subs".format(video_id, len(yt_subs["items"])))
    return 0
if __name__ == "__main__":
    # Run as a script: use main()'s return value as the process exit status.
    raise SystemExit(main())