-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathamara_langs.py
134 lines (122 loc) · 4.87 KB
/
amara_langs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import requests, json
from itertools import groupby
import amara_vids as av
import datetime
import sys
# remainder: objects, code using the objects
class AmaraInfoSet(object):
    """Gather and store subtitle/translation metrics for a set of Amara videos.

    The set of videos is whatever the supplied RelevantVideos instance
    exposes through its ``ids`` mapping.  Metrics are fetched lazily by
    ``get_info`` (which ``__str__`` and ``write_csv`` trigger on demand).
    """

    def __init__(self, relv):  # relv should be a RelevantVideos instance
        """Load the language-code map and prepare empty counters.

        Reads ``iana_subtag_registry.txt`` from the working directory and
        calls ``relv.manage_links()`` before taking the video ids from
        ``relv.ids``.
        """
        # manage and create langmap -- parse iana subtag (language code) registry
        with open("iana_subtag_registry.txt") as registry:
            codelangs = self._parse_registry(registry.read())
        relvid = relv
        relvid.manage_links()
        # structured for .format arg (the video id)
        self.baseurl = "http://www.amara.org/api2/partners/videos/{}/languages/?format=json"
        self.vid_ids = relvid.ids.keys()
        self.flag = False  # becomes True once get_info has run
        self.total_indiv_subtitles = 0
        # Initialised here so they exist even before get_info has run.
        self.total_transls = 0
        self.non_eng_langs = []
        self.lang_names = []
        self.langs = {}  # language name -> number of subtitle entries seen
        self.lang_map = codelangs  # created dict above
        self.lang_map['swa'] = "Swahili"  # because of a multiple version problem, see error correction
        self.lang_map['zh'] = "Chinese"  # same thing
        self.lang_map['zhx'] = "Chinese"  # same thing

    @staticmethod
    def _parse_registry(text):
        """Return a dict mapping each subtag in registry *text* to its first description.

        Records are separated by ``%%``; a record contributes an entry only
        when it has both a ``Subtag`` and a ``Description`` field.
        """
        codelangs = {}
        for record in text.split("%%"):
            subtag = None
            description = None
            for line in record.strip().split("\n"):
                # Split on the first colon only, so values that themselves
                # contain a colon are kept whole.
                field, sep, value = line.partition(":")
                if not sep:
                    continue
                if field == "Subtag" and subtag is None:
                    subtag = value.strip()
                elif field == "Description" and description is None:
                    description = value.strip()
            if subtag is not None and description is not None:
                codelangs[subtag] = description
        return codelangs

    def _language_name(self, code):
        """Return the display name for a language *code* such as ``pt-br``.

        Only the part before the first ``-`` is looked up.  A code missing
        from ``lang_map`` is returned as-is (its base part), so one unknown
        code no longer aborts processing of the rest of a video's languages.
        """
        base = code.split("-")[0].strip()
        return self.lang_map.get(base, base)

    def get_info(self):
        """Fetch language data for every video and tally it per language.

        Side effects: sets ``flag``, updates ``total_indiv_subtitles``,
        ``langs`` and ``lang_names``, writes one video-id/language row per
        subtitle entry to ``video_ids_langs_<date> <hour>h.csv``, and finally
        refreshes the non-English summary via ``get_non_english_langs``.
        A failure on one video is reported on the console and skipped.
        """
        self.flag = True  # notes that get info has been run on this instance
        # Timestamp truncated to the hour, e.g. "2015-03-02 14h".
        now = str(datetime.datetime.now()).split(":")[0] + "h"
        # open csv file and write videoid-language pairs to it
        with open("video_ids_langs_{}.csv".format(now), "w") as ft:
            ft.write("Video ID,Language Translation\n")
            for i in self.vid_ids:
                if i == "openmichigan":
                    continue
                try:
                    t = json.loads(requests.get(self.baseurl.format(i)).text)
                    self.total_indiv_subtitles += int(t["meta"]["total_count"])  # weird number TODO check
                    for ob in t["objects"]:
                        if not ob["language_code"]:
                            continue
                        name = self._language_name(ob["language_code"])
                        self.langs[name] = self.langs.get(name, 0) + 1
                        # NOTE: rows are tab-separated although the header uses
                        # a comma; left unchanged so existing readers of this
                        # file are unaffected.
                        ft.write("{}\t{}\n".format(i, name))
                        if ob["name"] != "english":  # using for non-english languages, primary use case
                            self.lang_names.append(ob["name"])
                except Exception as e:
                    # Deliberate best-effort: report the video and move on.
                    print("Exception raised - {}".format(e))
                    print("Looking at video: {}".format(i))
                    continue
        self.get_non_english_langs()

    def get_total_subtitles(self):
        """Gets and returns the number of total non-English subtitles extant in this InfoSet"""
        # Keys of self.langs are language names, so match on "nglish" exactly
        # as get_non_english_langs does; a bare "en" test would wrongly drop
        # names such as "French" while still counting "English".
        return sum(count for name, count in self.langs.items() if "nglish" not in name)

    def get_non_english_langs(self):
        """Saves a list of all languages, besides English, subtitles have been translated into"""
        self.non_eng_langs = []
        self.total_transls = 0  # maybe not necessary, TODO consider
        for k in self.langs:
            if "nglish" in k:
                continue
            self.total_transls += self.langs[k]
            # NOTE(review): the IANA registry appears to spell this language
            # "Abkhazian", so this filter may never match -- TODO confirm intent.
            if k != "Abhkazian":  # TODO: what is going on there
                self.non_eng_langs.append(k)

    def __str__(self):
        """Provides print-to-console representation of InfoSet"""
        if not self.flag:
            self.get_info()
        header = (
            "\n"
            "Number of total languages including English: {}\n"
            "Total non-English translations: {}\n"
            "Languages:\n\n"
        ).format(len(self.langs), self.total_transls)
        # Languages listed from fewest to most subtitles.
        rows = [
            "- {} {}\n".format(self.langs[name], name)
            for name in sorted(self.langs, key=self.langs.get)
        ]
        return header + "".join(rows)

    def __repr__(self):
        """Return a short summary string; never triggers a network fetch."""
        # TODO: return some form of structured non-C/TSV data.
        return "{}(videos={}, languages={}, fetched={})".format(
            type(self).__name__, len(self.vid_ids), len(self.langs), self.flag
        )

    def write_csv(self):
        """Write the per-language totals to ``amara_info_<timestamp>.csv``.

        Runs ``get_info`` first if it has not been run on this instance.
        """
        if not self.flag:
            self.get_info()
        now = str(datetime.datetime.now())
        # creates csv file dated with current date/time
        with open("amara_info_{}.csv".format(now), "w") as f:
            f.write("Language Name,Number of Subtitles\n")
            f.write("Total Subtitles (Including English),{}\n".format(self.total_indiv_subtitles))
            f.write("Total Non-English Subtitles,{}\n".format(int(self.total_transls)))
            for l in sorted(self.langs, key=self.langs.get):
                f.write("{},{}\n".format(l, self.langs[l]))
if __name__ == '__main__':
    # Pick the account names to report on: the two defaults when no
    # command-line arguments are given, otherwise every argument.
    if len(sys.argv) == 1:
        account_names = ["openmichigan.video", "kludewig"]
    else:
        account_names = sys.argv[1:]
    # Save each account in an AmaraAccount instance. Names from the command
    # line are wrapped too, so RelevantVideos receives the same kind of
    # object on both paths.
    accounts = [av.AmaraAccount(name) for name in account_names]
    # get relevant videos from the account objects
    relvs = av.RelevantVideos(*accounts)
    # create an AmaraInfoSet instance and write file/print information to console
    total_info = AmaraInfoSet(relvs)
    # print information to console (parenthesised form works on Python 2 and 3)
    print(total_info)
    # write aggregate file
    total_info.write_csv()