-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathget_feature_frequency.py
50 lines (43 loc) · 2.04 KB
/
get_feature_frequency.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import numpy as np
import os
cwd = os.getcwd()
from sklearn.feature_extraction.text import CountVectorizer
# Path to the review corpus (roughly 7 MB of text), relative to the current
# working directory. The original literal ended in a stray backtick
# ("...review.txt`"), which would not match the file on disk.
document = os.path.join('yelp_dataset_challenge_academic_dataset', 'corpus_1useful_review.txt')

# Count uni-, bi- and tri-grams, keeping only the 200 most frequent terms.
# (stop_words='english' can be added here to drop common English words.)
vectorizer = CountVectorizer(ngram_range=(1, 3), max_features=200)

# fit_transform expects an iterable of documents, not a path string: passing
# the path itself raises ValueError. Stream the file instead so each line is
# treated as one document and the corpus is never loaded in one piece.
# The file is opened in binary mode on purpose: CountVectorizer decodes bytes
# itself using its `encoding` setting (UTF-8 by default).
# NOTE(review): assumes the corpus is UTF-8 encoded -- confirm.
with open(document, 'rb') as corpus:
    X = vectorizer.fit_transform(corpus)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); prefer the new name and fall back to the old one
# so the script runs on either side of that change.
_get_feature_names = (
    getattr(vectorizer, 'get_feature_names_out', None)
    or vectorizer.get_feature_names
)
matrix_terms = np.array(_get_feature_names())

# Summing over axis 0 (the documents) gives the total count of each term.
matrix_freq = np.asarray(X.sum(axis=0)).ravel()

# Row 0 holds the terms, row 1 the matching frequencies.
final_matrix = np.array([matrix_terms, matrix_freq])

# The array mixes text and numbers, so the default float format of savetxt
# ('%.18e') raises TypeError; write every cell with '%s' instead. The explicit
# encoding keeps terms outside Latin-1 from failing on write.
np.savetxt(
    "uni_tri-gram_frequency.csv",
    final_matrix,
    delimiter=",",
    fmt="%s",
    encoding="utf-8",
)
# EDIT: If you want a dictionary from term to frequency, try this after calling fit_transform:
#
# terms = vectorizer.get_feature_names()
# freqs = X.sum(axis=0).A1
# result = dict(zip(terms, freqs))
# Note that fit_transform accepts any iterable of documents, so an open file handle can be passed directly (each line becomes one document) and there's no need to read the whole file into memory. In code:
# import io
# from collections import Counter
#
# import numpy as np
# from sklearn.feature_extraction.text import CountVectorizer
#
# infile = '/path/to/input.txt'
#
# ngram_vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 1), min_df=1)
#
# with io.open(infile, 'r', encoding='utf8') as fin:
# X = ngram_vectorizer.fit_transform(fin)
# vocab = ngram_vectorizer.get_feature_names()
# counts = X.sum(axis=0).A1
# freq_distribution = Counter(dict(zip(vocab, counts)))
# print (freq_distribution.most_common(10))
# from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer
# word_vectorizer = CountVectorizer(ngram_range=(1,2), analyzer='word')
# sparse_matrix = word_vectorizer.fit_transform(df['description'])
# frequencies = sum(sparse_matrix).toarray()[0]
# # pd.DataFrame(frequencies, index=word_vectorizer.get_feature_names(), columns=['frequency'])