-
Notifications
You must be signed in to change notification settings - Fork 0
/
bow_engine.py
44 lines (35 loc) · 1.21 KB
/
bow_engine.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import re
from SearchEngine.engine_base import SearchEngineBase, main
class BOWEngine(SearchEngineBase):
    """Bag-of-words search engine.

    Every document is reduced to the set of distinct, lower-cased words it
    contains. A query matches a document when each word of the query is
    present in that set; word order and word frequency play no role.
    """

    def __init__(self):
        super().__init__()
        # Maps a document id to the set of words parsed from its text.
        self.__id_to_words = {}

    def process_corpus(self, id, text):
        """Parse *text* into a word set and store it under *id*.

        Storing a second text under an id that is already present replaces
        the earlier word set.
        """
        word_set = self.parse_text_to_words(text)
        self.__id_to_words[id] = word_set

    def search(self, query):
        """Return a list of the ids whose documents contain every query word.

        The query goes through the same parsing as the documents, so the
        comparison ignores case and punctuation. Ids appear in the iteration
        order of the internal dict. A query that parses to no words matches
        every stored document.
        """
        wanted = self.parse_text_to_words(query)
        return [
            doc_id
            for doc_id, doc_words in self.__id_to_words.items()
            if self.query_match(wanted, doc_words)
        ]

    @staticmethod
    def query_match(query_words, words):
        """Return True when every item of *query_words* is found in *words*.

        An empty *query_words* yields True.
        """
        return all(term in words for term in query_words)

    @staticmethod
    def parse_text_to_words(text):
        """Split *text* into a set of distinct, lower-cased words."""
        # Anything that is neither a word character nor a plain space
        # (punctuation, newlines, tabs, ...) is turned into a space.
        spaced = re.sub(r'[^\w ]', ' ', text)
        # Splitting on a single space produces empty strings wherever two
        # spaces were adjacent; the condition discards them.
        return {token for token in spaced.lower().split(' ') if token}
# Build the bag-of-words engine and hand it to the `main` driver imported
# from SearchEngine.engine_base.
# NOTE(review): there is no `if __name__ == "__main__":` guard, so both
# statements run whenever this module is imported, not only when it is
# executed as a script. What `main` does with the engine is defined in
# engine_base and is not visible from this file.
search_engine = BOWEngine()
main(search_engine)