-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindexing.py
34 lines (30 loc) · 1.03 KB
/
indexing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import json
from elasticsearch import Elasticsearch
from os import listdir
# cine: 1347
# doisong: 630
# sport: 808
# xahoi: 709
def _parse_news_line(line):
    """Turn one crawl-data record into a news document.

    A record is a single line of the form
    ``timestamp|category|title|content``.

    Args:
        line: One raw line read from a crawl-data file.

    Returns:
        A dict with the keys ``timestamp``, ``category``, ``title`` and
        ``content``, or ``None`` when the line has fewer than four
        ``|``-separated fields (for example a blank line).
    """
    # Drop the line terminator so it is not stored as part of the last field.
    parts = line.rstrip('\n').split('|')
    if len(parts) < 4:
        return None
    # NOTE(review): any fields after the fourth are ignored, so a '|' inside
    # the content truncates it. If the crawl format allows that, switch to
    # split('|', 3) -- confirm against the crawler output.
    return {
        'timestamp': parts[0],
        'category': parts[1],
        'title': parts[2],
        'content': parts[3],
    }


# Sorted so document ids are assigned in the same order on every run;
# listdir() itself returns entries in arbitrary order.
files = sorted(listdir('./crawl-data'))
es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
i = 1  # Id of the next document to index; ids start from 1.
for file in files:
    with open('./crawl-data/' + file, encoding='utf8') as fp:
        # Iterate the file lazily instead of loading every line up front.
        for line in fp:
            news = _parse_news_line(line)
            if news is None:
                # Blank or malformed line: nothing to index, no id consumed.
                continue
            # The dict only holds strings, so it is passed to the client
            # directly; no JSON round-trip is needed.
            es.index(index='kenh14', doc_type='news', id=i, body=news)
            # Report the id that was just indexed, then advance the counter.
            print("Indexing.. %d" % i)
            i = i + 1
### test query
# q = es.get(index='kenh14', doc_type='news', id=5)
# q = es.search(index="swapi", body={"query": {"prefix" : { "name" : "Darth Vader" }}})
# q = es.count(index="swapi", doc_type="people")
# q = es.search(index="swapi", body={"query": {"fuzzy_field" : { "name" : {"like_text": "lu", "max_query_terms":5}}}})
# print(q)