-
Notifications
You must be signed in to change notification settings - Fork 0
/
common_utils.py
95 lines (73 loc) · 2.86 KB
/
common_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import os
def file_to_str(filename):
    """
    Return the contents of a word-list file as a single string, with
    every newline character replaced by a space.

    Args:
        filename (string): filename corresponding to a list of words.
            Assumes each word is separated by a newline character.

    Returns:
        wordlist (string): the words in the given file separated by
            spaces. A trailing newline in the file becomes a trailing
            space in the result.

    Raises:
        AssertionError: if filename does not exist.
    """
    # Kept as an assert so callers that catch AssertionError keep working.
    assert os.path.exists(filename), 'file does not exist'
    # The context manager closes the handle even if read() raises.
    with open(filename) as f:
        return f.read().replace('\n', ' ')
def file_to_list(filename):
    """
    Return a list of the lines (words) in the given word-list file.

    Args:
        filename (string): filename corresponding to a list of words.
            Assumes each word is separated by a newline character.

    Returns:
        wordlist (list): one string per line of the file, with the
            newline characters stripped. Blank lines yield empty strings.

    Raises:
        AssertionError: if filename does not exist.
    """
    # Kept as an assert so callers that catch AssertionError keep working.
    assert os.path.exists(filename), 'file does not exist'
    # The context manager closes the handle even if reading raises.
    with open(filename) as f:
        return [line.strip('\n') for line in f]
# TODO: need generic method to fetch publisher/company info from filename. This only works for chinese-games/dataset2
def get_publisher_name(filename):
    """
    Each word list in citizen-lab-data/chinese-games/dataset2 is
    prefaced by the publisher or developer that was used to discover
    the list. Return the name of the publisher given the file name.

    The publisher is the part of the base name before the first '#'.
    If the base name has no '#', it is the part before the first '.'.
    If it has neither, the whole base name is returned.

    Args:
        filename (string): filename corresponding to a list of words.
            The file must be located in citizen-lab-data/chinese-games/dataset2
            or in citizen-lab-data/chinese-games/dataset2-grouped

    Returns:
        publisher_name (string): name of the publisher

    Raises:
        AssertionError: if filename does not exist, or if its path does
            not contain 'citizen-lab-data/chinese-games/dataset2'.
    """
    assert os.path.exists(filename), 'file does not exist'
    assert 'citizen-lab-data/chinese-games/dataset2' in filename, \
        'invalid file name, see docstring for usage'
    base = os.path.basename(filename)
    separator = '#' if '#' in base else '.'
    # partition() returns the whole name when the separator is absent,
    # so an extensionless name no longer raises ValueError.
    return base.partition(separator)[0]
def publisher_to_filename(dir):
    """
    Return a mapping of publisher names to all the filenames in the
    directory corresponding to wordlists from that publisher.

    Only entries whose extension is exactly '.txt' are considered.
    The publisher is the part of the entry name before the first '#',
    or before the first '.' when the name contains no '#'.

    Args:
        dir: directory containing wordlists

    Returns:
        publishers (dict): a mapping from publisher names to a list of
            filenames in the given directory. The filenames are the bare
            names returned by os.listdir, not joined with dir.

    Raises:
        AssertionError: if dir does not exist.
    """
    # TODO: different logic for open-source directory.
    # currently this is for citizen-lab-data/chinese-games/dataset2 only
    assert os.path.exists(dir), 'directory does not exist'
    publishers = {}
    for entry in os.listdir(dir):
        # Filter on extension first so unrelated entries (READMEs,
        # subdirectories, hidden files) are skipped instead of aborting
        # the whole scan with ValueError for lacking a '#'.
        if os.path.splitext(entry)[1] != '.txt':  # only .txt for now
            continue
        separator = '#' if '#' in entry else '.'
        publisher = entry.partition(separator)[0]
        publishers.setdefault(publisher, []).append(entry)
    return publishers