From 2dd15e9f632d045bd7715c25c8dbc8d9423c31e3 Mon Sep 17 00:00:00 2001 From: sabaimran <65192171+sabaimran@users.noreply.github.com> Date: Mon, 18 Sep 2023 14:41:26 -0700 Subject: [PATCH] Resolve issues with GPT4All and fix prompt for yesterday extract questions date filter (#483) - GPT4All integration had ceased working with 0.1.7 specification. Update to use 1.0.12. At a later date, we should also use first party support for llama v2 via gpt4all - Update the system prompt for the extract_questions flow to add start and end date to the yesterday date filter example. - Update all setup data in conftest.py to use new client-server indexing pattern --- pyproject.toml | 4 +- .../processor/conversation/gpt4all/utils.py | 7 ++-- src/khoj/processor/conversation/prompts.py | 2 +- tests/conftest.py | 37 +++++-------------- 4 files changed, 16 insertions(+), 34 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 61d6bef32..a52fc9b6d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,8 +59,8 @@ dependencies = [ "bs4 >= 0.0.1", "anyio == 3.7.1", "pymupdf >= 1.23.3", - "gpt4all == 0.1.9; platform_system == 'Linux' and platform_machine == 'x86_64'", - "gpt4all == 0.1.9; platform_system == 'Windows' or platform_system == 'Darwin'", + "gpt4all == 1.0.12; platform_system == 'Linux' and platform_machine == 'x86_64'", + "gpt4all == 1.0.12; platform_system == 'Windows' or platform_system == 'Darwin'", ] dynamic = ["version"] diff --git a/src/khoj/processor/conversation/gpt4all/utils.py b/src/khoj/processor/conversation/gpt4all/utils.py index 95eeb4966..4042fbe2a 100644 --- a/src/khoj/processor/conversation/gpt4all/utils.py +++ b/src/khoj/processor/conversation/gpt4all/utils.py @@ -28,9 +28,10 @@ def download_model(model_name: str): raise e url = model_metadata.model_name_to_url.get(model_name) + model_path = os.path.expanduser(f"~/.cache/gpt4all/") if not url: logger.debug(f"Model {model_name} not found in model metadata. Skipping download.") - return GPT4All(model_name) + return GPT4All(model_name=model_name, model_path=model_path) filename = os.path.expanduser(f"~/.cache/gpt4all/{model_name}") if os.path.exists(filename): @@ -39,8 +40,8 @@ def download_model(model_name: str): requests.get("https://www.google.com/", timeout=5) except: logger.debug("User is offline. Disabling allowed download flag") - return GPT4All(model_name, allow_download=False) - return GPT4All(model_name) + return GPT4All(model_name=model_name, model_path=model_path, allow_download=False) + return GPT4All(model_name=model_name, model_path=model_path) # Download the model to a tmp file. Once the download is completed, move the tmp file to the actual file tmp_filename = filename + ".tmp" diff --git a/src/khoj/processor/conversation/prompts.py b/src/khoj/processor/conversation/prompts.py index cb9ecdcc1..4de3c623b 100644 --- a/src/khoj/processor/conversation/prompts.py +++ b/src/khoj/processor/conversation/prompts.py @@ -209,7 +209,7 @@ Q: What does yesterday's note say? -["Note from {yesterday_date} dt='{yesterday_date}'"] +["Note from {yesterday_date} dt>='{yesterday_date}' dt<'{current_date}'"] A: Yesterday's note contains the following information: ... diff --git a/tests/conftest.py b/tests/conftest.py index 45df8ffbd..be332eae4 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -26,7 +26,8 @@ TextSearchConfig, ImageSearchConfig, ) -from khoj.utils import state +from khoj.utils import state, fs_syncer +from khoj.routers.indexer import configure_content from khoj.processor.jsonl.jsonl_to_jsonl import JsonlToJsonl from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl from khoj.search_filter.date_filter import DateFilter @@ -220,15 +221,10 @@ def chat_client(md_content_config: ContentConfig, search_config: SearchConfig, p state.SearchType = configure_search_types(state.config) # Index Markdown Content for Search - filters = [DateFilter(), WordFilter(), FileFilter()] state.search_models.text_search = text_search.initialize_model(search_config.asymmetric) - state.content_index.markdown = text_search.setup( - MarkdownToJsonl, - get_sample_data("markdown"), - md_content_config.markdown, - state.search_models.text_search.bi_encoder, - regenerate=False, - filters=filters, + all_files = fs_syncer.collect_files(state.config.content_type) + state.content_index = configure_content( + state.content_index, state.config.content_type, all_files, state.search_models ) # Initialize Processor from Config @@ -273,7 +269,7 @@ def client(content_config: ContentConfig, search_config: SearchConfig, processor @pytest.fixture(scope="function") def client_offline_chat( - md_content_config: ContentConfig, search_config: SearchConfig, processor_config_offline_chat: ProcessorConfig + search_config: SearchConfig, processor_config_offline_chat: ProcessorConfig, content_config: ContentConfig ): # Initialize app state state.config.content_type = md_content_config @@ -281,27 +277,12 @@ def client_offline_chat( state.SearchType = configure_search_types(state.config) # Index Markdown Content for Search - filters = [DateFilter(), WordFilter(), FileFilter()] state.search_models.text_search = text_search.initialize_model(search_config.asymmetric) state.search_models.image_search = image_search.initialize_model(search_config.image) - state.content_index.org = text_search.setup( - OrgToJsonl, - get_sample_data("org"), - content_config.org, - state.search_models.text_search.bi_encoder, - regenerate=False, - ) - state.content_index.image = image_search.setup( - content_config.image, state.search_models.image_search, regenerate=False - ) - state.content_index.markdown = text_search.setup( - MarkdownToJsonl, - get_sample_data("markdown"), - md_content_config.markdown, - state.search_models.text_search.bi_encoder, - regenerate=False, - filters=filters, + all_files = fs_syncer.collect_files(content_config.content_type) + state.content_index = configure_content( + state.content_index, state.config.content_type, all_files, state.search_models ) # Initialize Processor from Config