-
Notifications
You must be signed in to change notification settings - Fork 0
/
LSA.py
91 lines (78 loc) · 2.72 KB
/
LSA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# -*- coding: utf-8 -*-
"""
"""
from numpy import zeros
import numpy as np
import matplotlib.pyplot as plt
from scipy.linalg import svd
# Demo corpus: each book title is treated as one document.
titles = [
    "The Neatest Little Guide to Stock Market Investing",
    "Investing For Dummies, 4th Edition",
    "The Little Book of Common Sense Investing: The Only Way to Guarantee Your Fair Share of Stock Market Returns",
    "The Little Book of Value Investing",
    "Value Investing: From Graham to Buffett and Beyond",
    "Rich Dad's Guide to Investing: What the Rich Invest in, That the Poor and the Middle Class Do Not!",
    "Investing in Real Estate, 5th Edition",
    "Stock Investing For Dummies",
    "Rich Dad's Advisors: The ABC's of Real Estate Investing: The Secrets of Finding Hidden Profits Most Investors Miss",
]

# Lower-case words that are never indexed.
stopwords = [
    'and', 'edition', 'for', 'in',
    'little', 'of', 'the', 'to',
]

# Punctuation characters handed to the LSA parser: apostrophe, comma, colon
# and exclamation mark (the apostrophe is repeated, exactly as in the
# original triple-quoted literal).
ignorechars = "'',:'!"
class LSA(object):
    """Minimal Latent Semantic Analysis over a small set of documents.

    Typical use: call parse() once per document, then build() to create the
    word-by-document count matrix, then printA() to print its SVD and plot it.

    Attributes:
        stopwords: container of lower-case words that are never indexed.
        ignorechars: string of characters deleted from every word.
        wdict: maps each indexed word to a list of document indices, one
            entry per occurrence of the word.
        dcount: number of documents parsed so far.
        keys: (set by build) sorted words that occurred more than once.
        A: (set by build) len(keys) x dcount array of occurrence counts.
    """

    def __init__(self, stopwords, ignorechars):
        """Store the filtering configuration and start with an empty index."""
        self.stopwords = stopwords
        self.ignorechars = ignorechars
        self.wdict = {}
        self.dcount = 0

    def parse(self, doc):
        """Index the words of one document under the next document number.

        Each whitespace-separated token is lower-cased and stripped of the
        characters in ``ignorechars``; stop words and tokens left empty by
        the stripping are skipped.

        Args:
            doc: the document text.
        """
        # str.translate() expects a translation table. Passing the raw
        # character string is a silent no-op on Python 3, which left the
        # punctuation attached ("investing," never matched "investing").
        strip_table = str.maketrans('', '', self.ignorechars)
        for token in doc.split():
            w = token.lower().translate(strip_table)
            if not w or w in self.stopwords:
                continue
            self.wdict.setdefault(w, []).append(self.dcount)
        # One increment per document, not per word.
        self.dcount += 1

    def build(self):
        """Build the count matrix ``A`` from the parsed documents.

        Only words recorded more than once are kept as rows; ``A[i, d]`` is
        the number of times ``keys[i]`` occurred in document ``d``. The
        selected keys are printed.
        """
        self.keys = sorted(
            k for k, docs in self.wdict.items() if len(docs) > 1
        )
        print(self.keys)
        self.A = zeros([len(self.keys), self.dcount])
        for i, k in enumerate(self.keys):
            for d in self.wdict[k]:
                self.A[i, d] += 1

    def printA(self):
        """Print ``A`` and its SVD, then plot documents and words.

        Two figures are shown, one for documents (labelled T1, T2, ...) and
        one for words. Both use rows 1 and 2 of V^T and U^T respectively, so
        build() must have produced at least three documents and three
        index words.
        """
        print(self.A)
        u, s, vt = svd(self.A)
        for part in (u, s, vt):
            print("\r")
            print(part)
        print("\r")

        # Documents: one label per parsed document instead of a fixed T1..T9.
        plt.title("LSA")
        plt.xlabel(u'dimention2')
        plt.ylabel(u'dimention3')
        labels = ['T%d' % (j + 1) for j in range(self.dcount)]
        doc_x, doc_y = vt[1], vt[2]
        for x, y, label in zip(doc_x, doc_y, labels):
            plt.text(x, y, label)
        plt.plot(doc_x, doc_y, '.')
        plt.show()

        # Words: same two dimensions, taken from the columns of U.
        ut = u.T
        word_x, word_y = ut[1], ut[2]
        for x, y, word in zip(word_x, word_y, self.keys):
            plt.text(x, y, word)
        plt.plot(word_x, word_y, '.')
        plt.show()
# Run the demo: index every title as its own document, build the count
# matrix, then print the SVD and show the plots.
mylsa = LSA(stopwords, ignorechars)
for t in titles:
    mylsa.parse(t)

mylsa.build()
mylsa.printA()