-
Notifications
You must be signed in to change notification settings - Fork 0
/
cooccurrence_df.py
120 lines (96 loc) · 2.86 KB
/
cooccurrence_df.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
"""
For fast loading of the cooccurrence network for key words without needing to
recalculate statistics.
Useful for presenting results without requiring too much computation.
"""
import time
import pandas as pd
import NL_helpers
# Wall-clock start time; read again later in the script to print the time taken.
t0 = time.time()
def _cooc_row(coocs):
    """Flatten a cooccurrence result into one row: its index labels, then its values."""
    # Plain list concatenation replaces Series.append, which pandas 2.0 removed.
    return coocs.index.to_list() + coocs.to_list()


def populate_df(df, stat, search_terms):
    """Populate ``df`` with cooccurrence rows for a list of search terms.

    For every term in ``search_terms`` a row labelled ``'<term>_<stat>'`` is
    written to ``df``. Each word cooccurring with that term then gets its own
    ``'<word>_<stat>'`` row, unless ``df`` already has one. A row holds the
    cooccurring terms followed by their scores, so ``df`` needs as many
    columns as that combined list has entries.

    The cooccurrence functions are called with the module-level ``dtm``,
    ``ttm`` and ``NUM_COOCS``, which must be defined before this is called.
    Their results are assumed to be pandas Series indexed by term.

    Args:
        df: DataFrame that is filled in place.
        stat: Statistic to use, either ``'log dice'`` or ``'ml'``.
        search_terms: Iterable of terms to look up.

    Raises:
        ValueError: If ``stat`` is not one of the supported statistics.
    """
    if stat == 'log dice':
        cooc_function = NL_helpers.log_dice_coocs
    elif stat == 'ml':
        cooc_function = NL_helpers.ml_coocs
    else:
        # Fail up front instead of hitting an unbound name inside the loop.
        raise ValueError(
            f"unsupported stat {stat!r}; expected 'log dice' or 'ml'"
        )

    for term in search_terms:
        try:
            coocs = cooc_function(term, dtm, ttm, num_coocs=NUM_COOCS)
            # Write to the frame that was passed in, not to a module global.
            df.loc[f'{term}_{stat}'] = _cooc_row(coocs)
            for cooc_term in coocs.index:
                row_label = f'{cooc_term}_{stat}'
                if row_label not in df.index:
                    secondary_coocs = cooc_function(
                        cooc_term, dtm, ttm, num_coocs=NUM_COOCS
                    )
                    df.loc[row_label] = _cooc_row(secondary_coocs)
        except KeyError:
            # A KeyError anywhere above abandons the rest of this term's
            # secondary lookups and moves on to the next search term.
            print(f'{term} not in dictionary')
# Key words whose cooccurrence rows are computed from the 16k-word matrices.
search_terms = [
    'philosophy', 'theology', 'speculative', 'ethics',
    'metaphysics', 'theosophy', 'materialism', 'idealism',
    'liberalism', 'socialism', 'stout', 'freethinker',
]

# Number of cooccurrences requested per term; also sets the table width below.
NUM_COOCS = 50

dtm = pd.read_pickle('pickles/dtm_aggressive_16kwords.tar.gz')
# NOTE(review): 'agressive' is spelled differently here than in the dtm path
# above - confirm it matches the file name on disk.
ttm = pd.read_pickle('pickles/tt_agressive_16kwords.pickle')

# All 'Term i' columns first, followed by all 'Score i' columns.
cooc_df = pd.DataFrame(
    columns=[
        f'{label} {i}'
        for label in ('Term', 'Score')
        for i in range(NUM_COOCS)
    ]
)

stats = ['log dice', 'ml']
for stat in stats:
    populate_df(cooc_df, stat, search_terms)

print(f'time taken: {time.time()-t0}')
# Bare expression: presumably left over from interactive use, where it would
# display the frame; it has no effect when the file runs as a script.
cooc_df
cooc_df.to_pickle('pickles/cooc_df.pickle')

# Drop the references to the loaded matrices before the next ones are loaded.
del dtm, ttm
# Matrix varieties to process; each name is interpolated into the input and
# output pickle paths below.
to_be_generated = [
    'BOW_entities', 'TF-IDF_entities',
    'BOW_propn', 'TF-IDF_propn',
    'TF-IDF_25kwords',
]

# Search terms used for the '*entities' varieties.
entity_terms = [
    'plato', 'stout', 'theosophy', 'university',
    'canterbury college', 'the new zealand institute',
    'the church', 'new zealand',
]

# Search terms used for the '*propn' varieties.
propn_terms = [
    'Besant', 'Stout', 'Vogel', 'Plato', 'Aristotle', 'Spencer',
    'Carlyle', 'Darwin', 'Hosking', 'Worthington', 'Collins',
]

for variety in to_be_generated:
    # Pick the term list that goes with this variety. The three conditions
    # are mutually exclusive, so their order does not matter.
    if variety.endswith('propn'):
        keywords = propn_terms
    elif variety.endswith('entities'):
        keywords = entity_terms
    elif variety == 'TF-IDF_25kwords':
        keywords = search_terms

    dtm = pd.read_pickle(f'pickles/dtm_{variety}.tar.gz')
    ttm = pd.read_pickle(f'pickles/ttm_{variety}.tar.gz')

    # Fresh table per variety: all 'Term i' columns, then all 'Score i' columns.
    cooc_df = pd.DataFrame(
        columns=[
            f'{label} {i}'
            for label in ('Term', 'Score')
            for i in range(NUM_COOCS)
        ]
    )

    stats = ['log dice', 'ml']
    for stat in stats:
        populate_df(cooc_df, stat, keywords)

    cooc_df.to_pickle(f'pickles/cooc_{variety}_df.tar.gz')

    # Drop the references to this variety's matrices before loading the next.
    del dtm, ttm