forked from esitarski/CrossMgr
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathHelpIndex.py
98 lines (74 loc) · 2.85 KB
/
HelpIndex.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
from whoosh.index import create_in, open_dir
from whoosh.analysis import StemmingAnalyzer
from whoosh.fields import *
import os
import shutil
import glob
import re
from bs4 import BeautifulSoup
# Directory that is scanned for the '*.html' help pages to index.
htmlDocDir = "CrossMgrHtmlDoc"
# Directory in which the Whoosh index is (re)created and later opened.
indexDir = "CrossMgrHelpIndex"
def BuildHelpIndex():
    """Rebuild the Whoosh full-text index of the HTML help pages from scratch.

    Every ``*.html`` file in ``htmlDocDir`` is parsed with BeautifulSoup.  The
    direct children of its ``<div class="content">`` element are split at
    heading tags (h1-h5); each heading, together with the text collected up
    to the next heading, is stored as one document in a new index created in
    ``indexDir``.  Pages without such a div are skipped, and so are headings
    that have no text following them.

    Stored fields per document:
        path    -- file base name, plus ``#<id>`` of the heading when it has one
        section -- '|'-joined chain of enclosing heading texts
        title   -- text of the heading itself
        level   -- numeric part of the heading tag name (``h2`` -> 2)
        content -- collected text with runs of newlines collapsed

    If indexing fails, the pending writes are cancelled (instead of leaving
    the writer open) and the exception propagates to the caller.
    """
    # Start from an empty directory so no stale index files survive.
    if os.path.exists( indexDir ):
        shutil.rmtree( indexDir, ignore_errors = True )
    # rmtree() ignores errors, so the directory may still be present;
    # only create it when it is really gone.
    if not os.path.isdir( indexDir ):
        os.mkdir( indexDir )

    stemmingAnalyzer = StemmingAnalyzer()
    schema = Schema(
        path=ID(stored=True, unique=True),
        section=TEXT(stored=True),
        title=TEXT(stored=True, analyzer=stemmingAnalyzer),
        level=NUMERIC(stored=True),
        content=TEXT(stored=True, analyzer=stemmingAnalyzer)
    )

    titleTags = {u'h1', u'h2', u'h3', u'h4', u'h5'}
    newLines = re.compile( '\n+' )
    nonNumeric = re.compile( r'[^\d]' )

    def headingLevel( tagName ):
        # 'h3' -> 3: strip everything that is not a digit.
        return int( nonNumeric.sub(u'', tagName) )

    def addDocument( writer, fname, section, lastTitle, textCur ):
        # Nothing to index before the first heading or for a heading without text.
        if not (lastTitle and textCur):
            return
        sectionPath = '|'.join( section ) if section else lastTitle.get_text()
        content = newLines.sub( u'\n', u'\n'.join(textCur) )
        # A heading without an id attribute is still indexed; its path then
        # points at the page itself rather than at an anchor inside it.
        anchor = lastTitle.get( 'id' )
        writer.add_document(
            path = os.path.basename(fname) + (u'#' + anchor if anchor else u''),
            title = lastTitle.get_text(),
            section = sectionPath,
            level = headingLevel( lastTitle.name ),
            content = content
        )

    def indexFile( writer, fname ):
        # Read the raw bytes and close the file promptly; BeautifulSoup
        # works out the character encoding of the page on its own.
        with open( fname, 'rb' ) as fp:
            doc = BeautifulSoup( fp.read(), 'html.parser' )
        div = doc.find( 'div', class_='content' )
        if not div:
            return

        lastTitle = None    # heading element that opened the current chunk
        textCur = []        # text pieces collected since lastTitle
        section = []        # heading texts from the top level down to lastTitle
        for child in div.contents:
            # Children that are not tags may lack these attributes;
            # they are then treated as non-headings / contribute no text.
            tag = getattr( child, 'name', None )
            if tag not in titleTags:
                getText = getattr( child, 'get_text', None )
                if getText is not None:
                    textCur.append( getText() )
                continue

            # A new heading closes the chunk collected so far.
            addDocument( writer, fname, section, lastTitle, textCur )
            # Keep only the ancestors above this heading's level, then add it.
            section = section[:headingLevel(tag) - 1]
            section.append( child.get_text() )
            lastTitle = child
            textCur = []
        # Flush the chunk that follows the last heading of the page.
        addDocument( writer, fname, section, lastTitle, textCur )

    ix = create_in( indexDir, schema )
    try:
        # The writer commits on normal exit and cancels if an exception escapes.
        with ix.writer() as writer:
            # Extract content sections from the html pages.
            for fname in glob.iglob( os.path.join(htmlDocDir, '*.html') ):
                indexFile( writer, fname )
    finally:
        ix.close()
#---------------------------------------------------------------------------------------------
if __name__ == '__main__':
    # Smoke test: rebuild the index, run one sample query against the
    # 'content' field and write the hits as an HTML table to search.html.
    BuildHelpIndex()

    from whoosh.qparser import QueryParser

    ix = open_dir( indexDir, readonly=True )
    try:
        with ix.searcher() as searcher, open('search.html', 'w') as f:
            query = QueryParser('content', ix.schema).parse(u'fastest lap')
            results = searcher.search(query, limit=20)
            f.write( '<table><tr><th></th><th align="left">Section</th><th align="left">Match</th></tr>\n' )
            for rank, hit in enumerate(results, 1):
                f.write( '<tr><td align="left">%d.</td><td><a href="%s">%s</a></td><td>%s</td></tr>\n' % (rank, hit['path'], hit['section'], hit.highlights('content')) )
            f.write( '</table>\n' )
    finally:
        # Close the index even when the search or the file write fails.
        ix.close()