改进后的基于web领域术语抽取 (improved web-corpus-based domain term extraction)
#!/usr/bin/env python
# -*- coding: gb2312 -*-
# The coding declaration above may be either utf8 or gb2312.
import os
import gensim
from gensim.models import word2vec

class MySentences(object):
    """Stream pre-segmented, space-separated sentences from every file in a directory."""
    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        for fname in os.listdir(self.dirname):
            for line in open(os.path.join(self.dirname, fname)):
                yield line.split()

# Train on the preprocessed mobile-phone corpus; alternative corpus: 'D:\\手机-正文\\花卉\\'
sentences = MySentences('D:\\手机-预处理后\\')
model = gensim.models.Word2Vec(sentences, min_count=2, size=500)
model.save('D:\\model\\mymodelphone')

# Reload the saved model and query the 20 terms closest to the seed word '小米'.
new_model = gensim.models.Word2Vec.load('D:\\model\\mymodelphone')
similar_words = new_model.most_similar('小米', topn=20)
# print(similar_words[1][0])
# similar_words = new_model.most_similar('牡丹')
for s in similar_words:
    t = s[0]
    print t, s[1]  # printing directly works here; no decode('utf8')/encode needed
    print s[1]
'''
# Alternative: read a single document, train on it, and test.
sentences = []
# sentences = word2vec.Text8Corpus("D:\\test.txt")
f = open("D:\\test.txt", "r")
for line in f.readlines():
    # print "create %s" % line
    sentences.append(line.split(" "))
f.close()
# print sentences
model = word2vec.Word2Vec(sentences, min_count=3, size=50)
# To print the Chinese terms, simply print the first element of each result pair.
similar_words = model.most_similar('普吉岛')
# print(similar_words[1][0])
for s in similar_words:
    t = s[0]
    print t.decode('utf8')  # decoding to utf8 is enough here; no encode needed
# s = "蒙牛 -- 伊利 间的余弦距离:"
# print s.decode('utf-8')
# print model.similarity('旅游', '中国')
'''
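For reference, a minimal sketch of the same train-and-query flow under Python 3 and gensim 4.x, where the Word2Vec parameter size was renamed vector_size and similarity queries moved to model.wv. This is an assumption-laden illustration, not the original project's code: the corpus directory 'corpus_dir' and the model path 'mymodelphone.model' below are placeholders.

#!/usr/bin/env python3
# Sketch of the same pipeline on gensim 4.x (paths are placeholders, not from the original project).
import os
from gensim.models import Word2Vec

class DirSentences(object):
    """Stream pre-segmented, space-separated sentences from every file in a directory."""
    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        for fname in os.listdir(self.dirname):
            with open(os.path.join(self.dirname, fname), encoding='utf-8') as fh:
                for line in fh:
                    yield line.split()

sentences = DirSentences('corpus_dir')            # placeholder corpus directory
model = Word2Vec(sentences, min_count=2, vector_size=500)
model.save('mymodelphone.model')                  # placeholder model path

reloaded = Word2Vec.load('mymodelphone.model')
for term, score in reloaded.wv.most_similar('小米', topn=20):
    print(term, score)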