Skip to content

Commit

Permalink
search: implement whoosh search engine
Browse files Browse the repository at this point in the history
* Updates README.
* Also adds some cosmetic fixes.
* Adds a CLI with commands to initialize the index and index all pages.

Co-authored-by: Pascal Repond <[email protected]>
  • Loading branch information
PascalRepond committed Aug 24, 2023
1 parent 89410af commit a937b1f
Show file tree
Hide file tree
Showing 10 changed files with 212 additions and 67 deletions.
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -128,4 +128,7 @@ dmypy.json
*.mo

# Macosx
.DS_Store
.DS_Store

# Whoosh index
/examples/index/
15 changes: 9 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,18 @@ Simple file based wiki for Flask.

### Requirements

* Python >=3.6.2
* Python >=3.8.0,<4.0.0
* [Poetry](https://python-poetry.org/)

### Install
### Install dev environment

- Clone the git repository
- run `poetry install`
- If you want to enable debug mode, run `export FLASK_ENV=development`
- `cd examples`, then `poetry run flask run`
- go to http://localhost:5000/wiki
- `cd examples`
- Run `poetry run flask flask_wiki init-index` to create the search index
- Run `poetry run flask flask_wiki index` to index all pages
- Run `poetry run flask run --debug` to start the development server
- Go to http://localhost:5000/help

## Configuration

Expand All @@ -36,8 +38,9 @@ Simple file based wiki for Flask.
- WIKI_HOME = 'home'
- WIKI_CURRENT_LANGUAGE = lambda: 'en'
- WIKI_LANGUAGES = ['en']
- WIKI_URL_PREFIX = '/wiki'
- WIKI_URL_PREFIX = '/help'
- WIKI_CONTENT_DIR = './data'
- WIKI_INDEX_DIR = './index'
- WIKI_UPLOAD_FOLDER = os.path.join(WIKI_CONTENT_DIR, 'files')
- WIKI_ALLOWED_EXTENSIONS = {'png', 'jpg', 'jpeg', 'gif', 'svg'}
- WIKI_MARKDOWN_EXTENSIONS = set(('codehilite', 'fenced_code'))
Expand Down
2 changes: 1 addition & 1 deletion examples/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def get_locale():
app.config.from_mapping(test_config)
Bootstrap4(app)
Wiki(app)
babel = Babel(app, locale_selector=get_locale)
Babel(app, locale_selector=get_locale)

@app.context_processor
def inject_conf_var():
Expand Down
144 changes: 97 additions & 47 deletions flask_wiki/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,19 @@
"""Core classes."""

import os
import re
from collections import OrderedDict
from datetime import datetime
from io import open
from pathlib import Path

import markdown
from bs4 import BeautifulSoup
from flask import abort, current_app, g
from werkzeug.local import LocalProxy
from whoosh import index, qparser
from whoosh.analysis import LanguageAnalyzer
from whoosh.fields import ID, TEXT, Schema
from whoosh.writing import AsyncWriter

from .markdown_ext import BootstrapExtension
from .utils import clean_url, wikilink
Expand All @@ -44,7 +48,9 @@ def __init__(self, text):

self.md = markdown.Markdown(extensions={
BootstrapExtension(),
'codehilite', 'fenced_code', 'toc', 'meta', 'tables'
'codehilite',
'fenced_code',
'toc', 'meta', 'tables'
}.union(markdown_ext))

self.input = text
Expand Down Expand Up @@ -167,6 +173,19 @@ def render(self):
self.modification_datetime = datetime.fromtimestamp(
os.path.getmtime(self.path))

def index(self):
    """Write this page's data to the whoosh search index.

    The index is opened from the directory configured as
    ``WIKI_INDEX_DIR``; the page's url, title, raw body, tags and
    language are passed to ``update_document`` and the writer is
    committed.
    """
    search_index = index.open_dir(
        current_app.config.get('WIKI_INDEX_DIR'))
    index_writer = AsyncWriter(search_index)
    # One keyword per field name used by the index schema.
    document = dict(
        url=self.url,
        title=self.title,
        body=self.raw_body,
        tags=self.tags,
        language=self.language,
    )
    index_writer.update_document(**document)
    index_writer.commit()

def save(self, update=True):
"""Save a page."""
folder = os.path.dirname(self.path)
Expand All @@ -178,6 +197,7 @@ def save(self, update=True):
f.write(line)
f.write(u'\n')
f.write(self.body.replace(u'\r\n', u'\n'))
self.index()
if update:
self.load()
self.render()
Expand Down Expand Up @@ -214,7 +234,6 @@ def title(self):

@title.setter
def title(self, value):
    """Store ``value`` under the ``'title'`` key of this page."""
    self['title'] = value

@property
Expand All @@ -227,9 +246,23 @@ def tags(self):

@tags.setter
def tags(self, value):
    """Store ``value`` under the ``'tags'`` key of this page."""
    self['tags'] = value

@property
def raw_body(self):
    """Return the body as plain text.

    The body is converted from markdown to html, then the text is
    extracted from that html so no markup remains. The result is used
    for indexing and search results display.
    """
    rendered = markdown.markdown(self.body)
    soup = BeautifulSoup(rendered, 'html.parser')
    # Text nodes are joined with a single space.
    return soup.get_text(separator=' ')

@raw_body.setter
def raw_body(self, value):
    """Store ``value`` under the ``'raw_body'`` key of this page."""
    self['raw_body'] = value

@property
def language(self):
"""Return page language.
Expand All @@ -239,11 +272,11 @@ def language(self):
"""
filename = Path(self.path).stem
return filename.split('_')[-1] if '_' in filename\
else current_wiki.languages[0]
else list(current_wiki.languages.keys())[0]


class WikiBase(object):
"""."""
"""Utility class for wiki management methods."""

def __init__(self, root):
"""."""
Expand Down Expand Up @@ -309,9 +342,54 @@ def delete(self, url):
if not self.exists(url):
return False
os.remove(path)
index_dir = index.open_dir(current_app.config.get('WIKI_INDEX_DIR'))
writer = AsyncWriter(index_dir)
writer.delete_by_term('path', path)
writer.commit()
return True

def index(self):
def init_search_index(self):
    """Create a new whoosh search index for the wiki.

    The index is created in the directory configured as
    ``WIKI_INDEX_DIR``. The directory, including any missing parent
    directory, is created first when it does not exist yet.
    """
    index_dir = current_app.config.get('WIKI_INDEX_DIR')
    # initialize whoosh index schema
    # NOTE(review): title and body are analyzed as French ("fr")
    # whatever the wiki languages are -- confirm this is intended.
    schema = Schema(
        url=ID(stored=True, unique=True),
        title=TEXT(stored=True, analyzer=LanguageAnalyzer("fr")),
        tags=TEXT(stored=True),
        body=TEXT(stored=True, analyzer=LanguageAnalyzer("fr")),
        language=ID(stored=True)
    )
    # makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it has
    # no check-then-create race and also creates missing parents.
    os.makedirs(index_dir, exist_ok=True)
    # NOTE(review): assumes the index is (re)created on every call, as
    # the docstring says -- confirm against the original indentation.
    index.create_in(index_dir, schema)

def search(self, query, ix, searcher):
    """Run a query against the whoosh index.

    :param str query: the search query
    :param whoosh.index ix: the whoosh index to use
    :param whoosh.searcher searcher: an active whoosh searcher instance
    :returns: a whoosh.results object instance
    """
    # Build a parser covering every field of the schema, OR-grouped.
    parser = qparser.MultifieldParser(
        ix.schema.names(),
        schema=ix.schema,
        group=qparser.OrGroup
    )
    hits = searcher.search(parser.parse(query))
    # Tune how highlights are rendered before handing the results back:
    # fragment surround value of 50 and a visible separator between
    # fragments.
    hits.fragmenter.surround = 50
    hits.formatter.between = '<strong> [...] </strong>'
    return hits

def list_pages(self):
"""Build up a list of all the available pages.
:returns: a list of all the wiki pages
Expand All @@ -332,6 +410,11 @@ def index(self):
pages.append(page)
return sorted(pages, key=lambda x: x.title.lower())

def index_all_pages(self):
    """Index every page returned by ``list_pages`` for the search engine."""
    wiki_pages = self.list_pages()
    for wiki_page in wiki_pages:
        # Called through the class, passing the page explicitly.
        Page.index(wiki_page)

def index_by(self, key):
"""Get an index based on the given key.
Expand All @@ -352,13 +435,13 @@ def index_by(self, key):
return pages

def get_by_title(self, title):
    """Get the page that has the given title.

    :param str title: the exact title to look for
    :returns: the first page (in ``list_pages`` order) whose title equals
        ``title``, or None when no page matches
    """
    # list_pages() takes no argument and returns a list, so the lookup is
    # done by scanning it rather than calling ``.get`` on the result.
    return next(
        (page for page in self.list_pages() if page.title == title),
        None
    )

def get_tags(self):
"""."""
pages = self.index()
"""Get all tags."""
pages = self.list_pages()
tags = {}
for page in pages:
pagetags = page.tags.split(',')
Expand All @@ -372,9 +455,9 @@ def get_tags(self):
tags[tag] = [page]
return tags

def list_tagged_pages(self, tag):
    """Get a list of all pages that have a tag.

    :param str tag: the tag to look for
    :returns: the pages carrying ``tag``, sorted by lower-cased title
    """
    wanted = tag.strip()
    # Page tags are a comma-separated string; compare whole tags so that
    # e.g. "py" does not match a page tagged "python".
    tagged = [
        page for page in self.list_pages()
        if wanted in (name.strip() for name in page.tags.split(','))
    ]
    return sorted(tagged, key=lambda page: page.title.lower())

Expand All @@ -388,39 +471,6 @@ def languages(self):
"""."""
return current_app.config.get('WIKI_LANGUAGES')

def search(self, term, ignore_case=True, attrs=None):
    """Search the wiki pages for a term with a regular expression.

    Every page gets its ``"score"`` reset to 0 first. The term ``"*"``
    returns all pages; an empty term returns all pages in the current
    language. Otherwise the escaped term is looked up in the given
    attributes of the current language pages, the score of a page being
    its total number of occurrences.

    :param str term: the text to look for
    :param bool ignore_case: match regardless of case when True
    :param list attrs: page attributes to search, defaults to
        ``['title', 'tags', 'body']``
    :returns: the matching pages, highest score first
    """
    searched_attrs = ['title', 'tags', 'body'] if attrs is None else attrs
    all_pages = self.index()

    for entry in all_pages:
        entry["score"] = 0

    # When searching for "*", return ALL pages
    if term == "*":
        return all_pages

    localized = [
        entry for entry in all_pages
        if entry.language == self.current_language
    ]

    # If no query term, return all current language pages
    if not term:
        return localized

    flags = re.IGNORECASE if ignore_case else 0
    pattern = re.compile(re.escape(term), flags)

    hits = []
    for entry in localized:
        for attr_name in searched_attrs:
            occurrences = pattern.findall(getattr(entry, attr_name))
            if not occurrences:
                continue
            entry["score"] += len(occurrences)
            if entry not in hits:
                hits.append(entry)
    # Sort results by score
    return sorted(hits, key=lambda entry: entry["score"], reverse=True)


def get_wiki():
"""."""
Expand Down
35 changes: 35 additions & 0 deletions flask_wiki/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# -*- coding: utf-8 -*-
#
# This file is part of Flask-Wiki
# Copyright (C) 2023 RERO
#
# Flask-Wiki is free software; you can redistribute it and/or modify
# it under the terms of the Revised BSD License; see LICENSE file for
# more details.

"""Click command-line interface for flask-wiki."""

import click
from flask.cli import with_appcontext

from .api import get_wiki


@click.group()
def flask_wiki():
    """Command-line interface for flask-wiki."""


@flask_wiki.command()
@with_appcontext
def init_index():
    """Init whoosh search index."""
    wiki = get_wiki()
    wiki.init_search_index()


@flask_wiki.command()
@with_appcontext
def index():
    """Index all wiki pages for whoosh search."""
    wiki = get_wiki()
    wiki.index_all_pages()
3 changes: 2 additions & 1 deletion flask_wiki/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,10 @@
WIKI_HOME = 'home'
WIKI_CURRENT_LANGUAGE = lambda: 'en'
WIKI_LANGUAGES = ['en']
WIKI_URL_PREFIX = '/wiki'
WIKI_URL_PREFIX = '/help'
WIKI_CONTENT_DIR = './data'
WIKI_UPLOAD_FOLDER = os.path.join(WIKI_CONTENT_DIR, 'files')
WIKI_INDEX_DIR = './index'
WIKI_ALLOWED_EXTENSIONS = {'png', 'jpg', 'jpeg', 'gif', 'svg'}

"""Markdown Extensions.
Expand Down
6 changes: 3 additions & 3 deletions flask_wiki/templates/wiki/search.html
Original file line number Diff line number Diff line change
Expand Up @@ -19,20 +19,20 @@
<header>
<div class="pb-3">
{{ results | length }} {{ ngettext('result', 'results', results | length) }}
<a href="{{ url_for('wiki.search', q='*') }}" class="btn btn-sm btn-outline-primary ml-2">{{ _("All languages") }}</a>
</div>
</header>
<div>
<ul class="list-group list-group-flush">
{%- for result in results -%}
<li class="list-group-item">
<div class="d-lg-flex justify-content-lg-between">
<article class="mb-2">
<article class="mb-2 col-10">
<h5 class="m-0">
<a class="mr-2" href="{{ url_for('wiki.page', url=result.url) }}">
{{ result.title }}
</a>
</h5>
<p>{{ result.highlights("body", top=3) | safe }}</p>
<ul class="m-0 p-0">
<li class="badge badge-secondary">
{{ result.language | upper }}
Expand All @@ -45,7 +45,7 @@ <h5 class="m-0">
</ul>
</article>
{% if can_edit_wiki %}
<footer>
<footer class="ml-3 col-4">
<button data-name="{{ result.title }}"
data-link="{{ url_for('wiki.page', url=result.url) }}"
class="copy-file-code btn btn-sm btn-outline-primary">
Expand Down
Loading

0 comments on commit a937b1f

Please sign in to comment.