"NLTK Download" takes too much time!How can I just read the cache! #16295
Replies: 2 comments 1 reply
-
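Hey there, @JulianChenlol! I'm here to assist you with any bugs, questions, or contributions. Let's tackle this issue together. To configure NLTK to read from the cache instead of downloading the package every time you start the app, you can set the `NLTK_DATA` environment variable to a directory that already contains the data. The helper below reads that variable and only downloads a package when it cannot be found in that directory:
import os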
import nltk
class GlobalsHelper:
    """Helper to retrieve globals.

    Helpful for global caching of certain variables that can be expensive to
    load (e.g. tokenization).

    The NLTK data directory is taken from the ``NLTK_DATA`` environment
    variable when it is set; otherwise a ``_static/nltk_cache`` directory next
    to this file is used. Packages are only downloaded when they cannot be
    found in that directory.
    """

    # Lazily populated list of English stopwords (see the ``stopwords`` property).
    _stopwords: Optional[List[str]] = None
    # Directory that NLTK data is looked up in and downloaded to.
    _nltk_data_dir: Optional[str] = None

    def __init__(self) -> None:
        """Initialize NLTK stopwords and punkt."""
        self._nltk_data_dir = os.environ.get(
            "NLTK_DATA",
            os.path.join(
                os.path.dirname(os.path.abspath(__file__)),
                "_static/nltk_cache",
            ),
        )

        # Register the directory so that later NLTK lookups can see it too.
        if self._nltk_data_dir not in nltk.data.path:
            nltk.data.path.append(self._nltk_data_dir)

        # ensure access to data is there
        self._ensure_resource("corpora/stopwords", "stopwords")
        self._ensure_resource("tokenizers/punkt", "punkt")

    def _ensure_resource(self, resource: str, package: str) -> None:
        """Download ``package`` only if ``resource`` is not already cached.

        Args:
            resource: Resource path passed to ``nltk.data.find``
                (e.g. ``"corpora/stopwords"``).
            package: Package identifier passed to ``nltk.download``
                (e.g. ``"stopwords"``).
        """
        try:
            # Look only in our own data directory, not the whole search path.
            nltk.data.find(resource, paths=[self._nltk_data_dir])
        except LookupError:
            nltk.download(package, download_dir=self._nltk_data_dir)

    @property
    def stopwords(self) -> List[str]:
        """Get stopwords.

        The English stopword list is loaded on first access and cached on the
        instance afterwards.

        Returns:
            The list of English stopwords.

        Raises:
            ImportError: If ``nltk.corpus`` cannot be imported.
        """
        if self._stopwords is None:
            try:
                # Aliased so the import does not shadow this property's name.
                from nltk.corpus import stopwords as stopwords_corpus
            except ImportError as err:
                raise ImportError(
                    "`nltk` package not found, please run `pip install nltk`"
                ) from err

            # The cache may have been removed since __init__ ran; re-check
            # before reading the corpus.
            self._ensure_resource("corpora/stopwords", "stopwords")
            self._stopwords = stopwords_corpus.words("english")
        return self._stopwords
globals_helper = GlobalsHelper()
This code sets up a cache directory for NLTK data and ensures that the data is downloaded to this directory only if it is not already present. You can use this approach to avoid downloading the NLTK data every time you start the app [1].
Beta Was this translation helpful? Give feedback.
-
[nltk_data] Downloading package punkt_tab to |
Beta Was this translation helpful? Give feedback.
-
[nltk_data] Downloading package punkt_tab to
[nltk_data] D:\software\manbaforge\envs\flask-
[nltk_data] python31011\lib\site-
[nltk_data] packages\llama_index\core_static/nltk_cache...
[nltk_data] Package punkt_tab is already up-to-date!
It downloads the package every time I start up the app! How can I just read the cache?
Thank you!
Beta Was this translation helpful? Give feedback.
All reactions