diff --git a/README.md b/README.md index 41e3a9f7..707fe201 100644 --- a/README.md +++ b/README.md @@ -38,10 +38,6 @@ The recommended way is to install Annif from source annif-venv/bin/activate pip install annif -You will also need NLTK data files: - - python -m nltk.downloader punkt_tab - Start up the application: annif @@ -113,10 +109,6 @@ Enter the virtual environment: poetry shell -You will also need NLTK data files: - - python -m nltk.downloader punkt_tab - Start up the application: annif diff --git a/annif/analyzer/analyzer.py b/annif/analyzer/analyzer.py index 25bdb6b5..129b882a 100644 --- a/annif/analyzer/analyzer.py +++ b/annif/analyzer/analyzer.py @@ -6,7 +6,12 @@ import functools import unicodedata +import annif + +logger = annif.logger + _KEY_TOKEN_MIN_LENGTH = "token_min_length" +_NLTK_TOKENIZER_DATA = "punkt_tab" class Analyzer(metaclass=abc.ABCMeta): @@ -21,6 +26,21 @@ def __init__(self, **kwargs) -> None: if _KEY_TOKEN_MIN_LENGTH in kwargs: self.token_min_length = int(kwargs[_KEY_TOKEN_MIN_LENGTH]) + import nltk.data + + try: + nltk.data.find("tokenizers/" + _NLTK_TOKENIZER_DATA) + except LookupError as err: + logger.debug(str(err)) + if _NLTK_TOKENIZER_DATA in str(err): + logger.warning( + f'NLTK datapackage "{_NLTK_TOKENIZER_DATA}" not found, ' + "downloading it now." + ) + nltk.download(_NLTK_TOKENIZER_DATA) + else: + raise + def tokenize_sentences(self, text: str) -> list[str]: """Tokenize a piece of text (e.g. a document) into sentences.""" import nltk.tokenize diff --git a/tests/test_analyzer.py b/tests/test_analyzer.py index ecddfbb3..ff4b7079 100644 --- a/tests/test_analyzer.py +++ b/tests/test_analyzer.py @@ -1,5 +1,7 @@ """Unit tests for analyzers in Annif""" +from unittest import mock + import pytest import annif.analyzer @@ -15,6 +17,15 @@ def test_get_analyzer_badspec(): annif.analyzer.get_analyzer("()") +@mock.patch("nltk.data.find", side_effect=LookupError("Resource punkt_tab not found")) +@mock.patch("nltk.download") +def test_nltk_data_missing(download, find): + annif.analyzer.get_analyzer("snowball(english)") + assert find.called + assert download.called + assert download.call_args == mock.call("punkt_tab") + + def test_english_analyzer_normalize_word(): analyzer = annif.analyzer.get_analyzer("snowball(english)") assert analyzer._normalize_word("running") == "run"