This repository has been archived by the owner on Sep 4, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprocess_data.py
63 lines (56 loc) · 1.9 KB
/
process_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
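# usage: python utils/process_data.py "input-texts/*.txt" path/to/metadata.json
# (quote the text glob so the shell does not expand it; the metadata argument is a single JSON file)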
import glob, json, sys, os, sqlite3, re, uuid, collections, hashlib
min_count = 1  # minimum count a (token, year) pair needs to be retained

# anything that is neither a word character nor whitespace is stripped as punctuation
_PUNCT_RE = re.compile(r'[^\w\s]')


def _db_name(txt_files, meta_file, threshold):
    """Return a db filename that is a hash of the params used for this run."""
    params = ''.join(str(p) for p in (txt_files, meta_file, threshold))
    return hashlib.sha224(params.encode('utf8')).hexdigest() + '.db'


def _load_metadata(meta_file):
    """Return d[filename] = {'year': ...other meta attrs...} read from meta_file.

    The JSON document must be a list of objects; records without a
    'filename' key are stored under the empty string.
    """
    with open(meta_file, encoding='utf8') as f:
        records = json.load(f)
    return {record.get('filename', ''): record for record in records}


def _parse_year(value):
    """Return value as a non-zero int year, or None when it is unusable.

    Only strings made purely of decimal digits are accepted: isdecimal()
    matches exactly what int() can parse, whereas isnumeric() also accepts
    characters such as superscripts and fractions that make int() raise.
    """
    text = str(value)
    if not value or not text.isdecimal():
        return None
    year = int(text)
    return year if year else None


def _count_tokens(txt_files, metadata):
    """Count tokens per year over every text file matching the glob.

    Files whose basename has no usable 'year' in metadata are skipped.
    Returns (counts, years, total) where counts[token][year] is the word
    count, years is the set of years seen and total is the token total.
    """
    counts = collections.defaultdict(collections.Counter)
    years = set()
    total = 0
    for path in glob.glob(txt_files):
        record = metadata.get(os.path.basename(path), {})
        year = _parse_year(record.get('year', False))
        if year is None:
            continue
        years.add(year)
        with open(path, encoding='utf8') as f:
            text = f.read().lower()
        words = _PUNCT_RE.sub('', text).split()
        for word in words:
            counts[word][year] += 1
        total += len(words)
    return counts, years, total


def _write_db(db_name, counts, threshold):
    """Store every (token, year, count) with count >= threshold in db_name."""
    rows = (
        (word, year, count)
        for word, by_year in counts.items()
        for year, count in by_year.items()
        if count >= threshold
    )
    conn = sqlite3.connect(db_name)
    try:
        cur = conn.cursor()
        cur.execute('''CREATE TABLE IF NOT EXISTS onegrams
            (token text, year integer, count integer)''')
        # the db name is deterministic for a given set of params, so a rerun
        # reopens the same file; clear it to avoid appending duplicate rows
        cur.execute('DELETE FROM onegrams')
        # placeholders instead of string formatting: no quoting pitfalls
        cur.executemany('INSERT INTO onegrams VALUES (?, ?, ?)', rows)
        conn.commit()
    finally:
        conn.close()


def _write_config(db_name, years, path='config.json'):
    """Write the config file that records the db name and the year range."""
    with open(path, 'w') as out:
        json.dump({
            'db': db_name,
            'year_min': min(years),
            'year_max': max(years),
            'default_query': ['spring', 'summer', 'fall', 'winter'],
        }, out)


def main(argv=None):
    """Build the onegram database and config.json.

    argv holds the text-file glob followed by the metadata JSON path and
    defaults to the command-line arguments. The database is written to
    '<sha224 of params>.db' and config.json to the current directory.

    Raises SystemExit when an argument is missing and ValueError when no
    text file could be matched to a usable year.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 2:
        raise SystemExit(
            'usage: python utils/process_data.py "input-texts/*.txt" '
            'path/to/metadata.json')
    txt_files, meta_file = args[0], args[1]

    metadata = _load_metadata(meta_file)
    counts, years, total = _count_tokens(txt_files, metadata)
    print(' * total tokens:', total)
    print(' * total types:', len(counts))
    if not years:
        # fail before touching the db instead of crashing later in min()
        raise ValueError(
            'no text file matching {!r} has a usable year in {!r}'.format(
                txt_files, meta_file))

    db_name = _db_name(txt_files, meta_file, min_count)
    _write_db(db_name, counts, min_count)
    _write_config(db_name, years)


if __name__ == '__main__':
    main()