From 343854752c442aefd5eeccf09eaa1ef904d32a86 Mon Sep 17 00:00:00 2001
From: sabaimran <65192171+sabaimran@users.noreply.github.com>
Date: Fri, 8 Sep 2023 17:07:26 -0700
Subject: [PATCH] Improve docker builds for local hosting (#476)

* Remove GPT4All dependency in pyproject.toml and use multiplatform builds in the dockerization setup in GH actions
* Move configure_search method into indexer
* Add conditional installation for gpt4all
* Add hint to go to localhost:42110 in the docs. Addresses #477
---
 .github/workflows/dockerize.yml               |  1 +
 docker-compose.yml                            |  7 ++-
 docs/setup.md                                 | 19 +++++-
 pyproject.toml                                |  3 +-
 src/khoj/configure.py                         | 26 +-------
 src/khoj/main.py                              |  4 ++
 .../conversation/gpt4all/chat_model.py        | 33 +++++++++--
 .../processor/conversation/gpt4all/utils.py   |  7 ++-
 src/khoj/routers/indexer.py                   | 59 +++++++++++++++++--
 tests/test_gpt4all_chat_actors.py             |  5 +-
 10 files changed, 122 insertions(+), 42 deletions(-)

diff --git a/.github/workflows/dockerize.yml b/.github/workflows/dockerize.yml
index 3ae927106..70098040f 100644
--- a/.github/workflows/dockerize.yml
+++ b/.github/workflows/dockerize.yml
@@ -41,6 +41,7 @@ jobs:
         with:
           context: .
           file: Dockerfile
+          platforms: linux/amd64, linux/arm64
           push: true
           tags: ghcr.io/${{ github.repository }}:${{ env.DOCKER_IMAGE_TAG }}
           build-args: |
diff --git a/docker-compose.yml b/docker-compose.yml
index c15d1025b..5f1bb1f96 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -24,5 +24,10 @@ services:
       # You can set these volumes to point to empty directories on host
       - ./tests/data/embeddings/:/root/.khoj/content/
       - ./tests/data/models/:/root/.khoj/search/
+      - khoj_config:/root/.khoj/
     # Use 0.0.0.0 to explicitly set the host ip for the service on the container. https://pythonspeed.com/articles/docker-connection-refused/
-    command: --host="0.0.0.0" --port=42110 -c=config/khoj_docker.yml -vv
+    command: --host="0.0.0.0" --port=42110 -vv
+
+
+volumes:
+  khoj_config:
diff --git a/docs/setup.md b/docs/setup.md
index 4354678c5..d273618b3 100644
--- a/docs/setup.md
+++ b/docs/setup.md
@@ -8,6 +8,8 @@ These are the general setup instructions for Khoj.
   Its simpler as it can skip the *configure* step below.
 
 ### 1. Install
+
+#### 1.1 Local Setup
 Run the following command in your terminal to install the Khoj backend.
 
 - On Linux/MacOS
@@ -22,7 +24,7 @@ Run the following command in your terminal to install the Khoj backend.
 For more detailed Windows installation and troubleshooting, see [Windows Install](./windows_install.md).
 
 
-### 2. Start
+##### 1.1.1 Start
 
 Run the following command from your terminal to start the Khoj backend and open Khoj in your browser.
 
@@ -30,16 +32,27 @@ Run the following command from your terminal to start the Khoj backend and open
 khoj
 ```
 
+Khoj should now be running at http://localhost:42110. You can see the web UI in your browser.
+
 Note: To start Khoj automatically in the background use [Task scheduler](https://www.windowscentral.com/how-create-automated-task-using-task-scheduler-windows-10) on Windows or [Cron](https://en.wikipedia.org/wiki/Cron) on Mac, Linux (e.g with `@reboot khoj`)
 
-### 3. Configure
+#### 1.2 Docker Setup
+Use the sample docker-compose [in GitHub](https://github.com/khoj-ai/khoj/blob/master/docker-compose.yml) to run Khoj in Docker. To start the container, run the following command in the same directory as the docker-compose.yml file. You'll have to configure the mounted directories to match your local knowledge base.
+
+```shell
+docker-compose up
+```
+
+Khoj should now be running at http://localhost:42110. You can see the web UI in your browser.
+
+### 2. Configure
 1. Set `File`, `Folder` and hit `Save` in each Plugins you want to enable for Search on the Khoj config page
 2. Add your OpenAI API key to Chat Feature settings if you want to use Chat
 3. Click `Configure` and wait. The app will download ML models and index the content for search and (optionally) chat
 
 ![configure demo](https://user-images.githubusercontent.com/6413477/255307879-61247d3f-c69a-46ef-b058-9bc533cb5c72.mp4 ':include :type=mp4')
 
-### 4. Install Interface Plugins (Optional)
+### 3. Install Interface Plugins (Optional)
 Khoj exposes a web interface to search, chat and configure by default.<br />
 The optional steps below allow using Khoj from within an existing application like Obsidian or Emacs.
 
diff --git a/pyproject.toml b/pyproject.toml
index 20a675054..61d6bef32 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -57,9 +57,10 @@ dependencies = [
     "langchain >= 0.0.187",
     "requests >= 2.26.0",
     "bs4 >= 0.0.1",
-    "gpt4all >= 1.0.7",
     "anyio == 3.7.1",
     "pymupdf >= 1.23.3",
+    "gpt4all == 0.1.9; platform_system == 'Linux' and platform_machine == 'x86_64'",
+    "gpt4all == 0.1.9; platform_system == 'Windows' or platform_system == 'Darwin'",
 ]
 dynamic = ["version"]
 
diff --git a/src/khoj/configure.py b/src/khoj/configure.py
index 6a358c1f9..7e6cc4090 100644
--- a/src/khoj/configure.py
+++ b/src/khoj/configure.py
@@ -11,18 +11,16 @@
 from fastapi.staticfiles import StaticFiles
 
 # Internal Packages
-from khoj.search_type import image_search, text_search
 from khoj.utils import constants, state
 from khoj.utils.config import (
     SearchType,
-    SearchModels,
     ProcessorConfigModel,
     ConversationProcessorConfigModel,
 )
 from khoj.utils.helpers import resolve_absolute_path, merge_dicts
 from khoj.utils.fs_syncer import collect_files
-from khoj.utils.rawconfig import FullConfig, ProcessorConfig, SearchConfig, ConversationProcessorConfig
-from khoj.routers.indexer import configure_content, load_content
+from khoj.utils.rawconfig import FullConfig, ProcessorConfig, ConversationProcessorConfig
+from khoj.routers.indexer import configure_content, load_content, configure_search
 
 
 logger = logging.getLogger(__name__)
@@ -136,26 +134,6 @@ def configure_search_types(config: FullConfig):
     return Enum("SearchType", merge_dicts(core_search_types, plugin_search_types))
 
 
-def configure_search(search_models: SearchModels, search_config: Optional[SearchConfig]) -> Optional[SearchModels]:
-    # Run Validation Checks
-    if search_config is None:
-        logger.warning("🚨 No Search configuration available.")
-        return None
-    if search_models is None:
-        search_models = SearchModels()
-
-    # Initialize Search Models
-    if search_config.asymmetric:
-        logger.info("🔍 📜 Setting up text search model")
-        search_models.text_search = text_search.initialize_model(search_config.asymmetric)
-
-    if search_config.image:
-        logger.info("🔍 🌄 Setting up image search model")
-        search_models.image_search = image_search.initialize_model(search_config.image)
-
-    return search_models
-
-
 def configure_processor(
     processor_config: Optional[ProcessorConfig], state_processor_config: Optional[ProcessorConfigModel] = None
 ):
diff --git a/src/khoj/main.py b/src/khoj/main.py
index ab88480da..6710ed050 100644
--- a/src/khoj/main.py
+++ b/src/khoj/main.py
@@ -100,3 +100,7 @@ def poll_task_scheduler():
     timer_thread.daemon = True
     timer_thread.start()
     schedule.run_pending()
+
+
+if __name__ == "__main__":
+    run()
diff --git a/src/khoj/processor/conversation/gpt4all/chat_model.py b/src/khoj/processor/conversation/gpt4all/chat_model.py
index 6f91fdf44..9bc9ea52a 100644
--- a/src/khoj/processor/conversation/gpt4all/chat_model.py
+++ b/src/khoj/processor/conversation/gpt4all/chat_model.py
@@ -1,12 +1,10 @@
-from typing import Iterator, Union, List
+from typing import Iterator, Union, List, Any
 from datetime import datetime
 import logging
 from threading import Thread
 
 from langchain.schema import ChatMessage
 
-from gpt4all import GPT4All
-
 from khoj.processor.conversation.utils import ThreadedGenerator, generate_chatml_messages_with_context
 from khoj.processor.conversation import prompts
 from khoj.utils.constants import empty_escape_sequences
@@ -19,7 +17,7 @@
 def extract_questions_offline(
     text: str,
     model: str = "llama-2-7b-chat.ggmlv3.q4_K_S.bin",
-    loaded_model: Union[GPT4All, None] = None,
+    loaded_model: Union[Any, None] = None,
     conversation_log={},
     use_history: bool = True,
     should_extract_questions: bool = True,
@@ -27,6 +25,15 @@ def extract_questions_offline(
     """
     Infer search queries to retrieve relevant notes to answer user query
     """
+    try:
+        from gpt4all import GPT4All
+    except ModuleNotFoundError as e:
+        logger.info("There was an error importing GPT4All. Please run pip install gpt4all in order to install it.")
+        raise e
+
+    # Assert that loaded_model is either None or of type GPT4All
+    assert loaded_model is None or isinstance(loaded_model, GPT4All), "loaded_model must be of type GPT4All or None"
+
     all_questions = text.split("? ")
     all_questions = [q + "?" for q in all_questions[:-1]] + [all_questions[-1]]
 
@@ -117,13 +124,20 @@ def converse_offline(
     user_query,
     conversation_log={},
     model: str = "llama-2-7b-chat.ggmlv3.q4_K_S.bin",
-    loaded_model: Union[GPT4All, None] = None,
+    loaded_model: Union[Any, None] = None,
     completion_func=None,
     conversation_command=ConversationCommand.Default,
 ) -> Union[ThreadedGenerator, Iterator[str]]:
     """
     Converse with user using Llama
     """
+    try:
+        from gpt4all import GPT4All
+    except ModuleNotFoundError as e:
+        logger.info("There was an error importing GPT4All. Please run pip install gpt4all in order to install it.")
+        raise e
+
+    assert loaded_model is None or isinstance(loaded_model, GPT4All), "loaded_model must be of type GPT4All or None"
     gpt4all_model = loaded_model or GPT4All(model)
     # Initialize Variables
     compiled_references_message = "\n\n".join({f"{item}" for item in references})
@@ -152,7 +166,14 @@ def converse_offline(
     return g
 
 
-def llm_thread(g, messages: List[ChatMessage], model: GPT4All):
+def llm_thread(g, messages: List[ChatMessage], model: Any):
+    try:
+        from gpt4all import GPT4All
+    except ModuleNotFoundError as e:
+        logger.info("There was an error importing GPT4All. Please run pip install gpt4all in order to install it.")
+        raise e
+
+    assert isinstance(model, GPT4All), "model should be of type GPT4All"
     user_message = messages[-1]
     system_message = messages[0]
     conversation_history = messages[1:-1]
diff --git a/src/khoj/processor/conversation/gpt4all/utils.py b/src/khoj/processor/conversation/gpt4all/utils.py
index 21f3afb45..95eeb4966 100644
--- a/src/khoj/processor/conversation/gpt4all/utils.py
+++ b/src/khoj/processor/conversation/gpt4all/utils.py
@@ -3,7 +3,6 @@
 import requests
 import hashlib
 
-from gpt4all import GPT4All
 from tqdm import tqdm
 
 from khoj.processor.conversation.gpt4all import model_metadata
@@ -22,6 +21,12 @@ def get_md5_checksum(filename: str):
 
 
 def download_model(model_name: str):
+    try:
+        from gpt4all import GPT4All
+    except ModuleNotFoundError as e:
+        logger.info("There was an error importing GPT4All. Please run pip install gpt4all in order to install it.")
+        raise e
+
     url = model_metadata.model_name_to_url.get(model_name)
     if not url:
         logger.debug(f"Model {model_name} not found in model metadata. Skipping download.")
diff --git a/src/khoj/routers/indexer.py b/src/khoj/routers/indexer.py
index ca2d884ef..1d89ee112 100644
--- a/src/khoj/routers/indexer.py
+++ b/src/khoj/routers/indexer.py
@@ -1,6 +1,7 @@
 # Standard Packages
 import logging
 import sys
+import json
 from typing import Optional, Union, Dict
 
 # External Packages
@@ -8,7 +9,7 @@
 from pydantic import BaseModel
 
 # Internal Packages
-from khoj.utils import state
+from khoj.utils import state, constants
 from khoj.processor.jsonl.jsonl_to_jsonl import JsonlToJsonl
 from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
 from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
@@ -18,11 +19,14 @@
 from khoj.processor.plaintext.plaintext_to_jsonl import PlaintextToJsonl
 from khoj.utils.rawconfig import ContentConfig, TextContentConfig
 from khoj.search_type import text_search, image_search
+from khoj.utils.yaml import save_config_to_file_updated_state
 from khoj.utils.config import SearchModels
 from khoj.utils.constants import default_config
 from khoj.utils.helpers import LRU, get_file_type
 from khoj.utils.rawconfig import (
     ContentConfig,
+    FullConfig,
+    SearchConfig,
 )
 from khoj.search_filter.date_filter import DateFilter
 from khoj.search_filter.word_filter import WordFilter
@@ -111,6 +115,28 @@ async def index_batch(
             plaintext=plaintext_files,
         )
 
+        if state.config == None:
+            logger.info("First run, initializing state.")
+            default_full_config = FullConfig(
+                content_type=None,
+                search_type=SearchConfig.parse_obj(constants.default_config["search-type"]),
+                processor=None,
+            )
+            state.config = default_full_config
+            default_content_config = ContentConfig(
+                org=None,
+                markdown=None,
+                pdf=None,
+                image=None,
+                github=None,
+                notion=None,
+                plaintext=None,
+                plugins=None,
+            )
+            state.config.content_type = default_content_config
+            save_config_to_file_updated_state()
+            configure_search(state.search_models, state.config.search_type)
+
         # Extract required fields from config
         state.content_index = configure_content(
             state.content_index,
@@ -129,6 +155,26 @@ async def index_batch(
     return Response(content="OK", status_code=200)
 
 
+def configure_search(search_models: SearchModels, search_config: Optional[SearchConfig]) -> Optional[SearchModels]:
+    # Validate inputs: return None when no search config is given; create a fresh SearchModels when none is passed in
+    if search_config is None:
+        logger.warning("🚨 No Search configuration available.")
+        return None
+    if search_models is None:
+        search_models = SearchModels()
+
+    # Initialize only the models whose config section is set (asymmetric -> text search, image -> image search)
+    if search_config.asymmetric:
+        logger.info("🔍 📜 Setting up text search model")
+        search_models.text_search = text_search.initialize_model(search_config.asymmetric)
+
+    if search_config.image:
+        logger.info("🔍 🌄 Setting up image search model")
+        search_models.image_search = image_search.initialize_model(search_config.image)
+
+    return search_models
+
+
 def configure_content(
     content_index: Optional[ContentIndex],
     content_config: Optional[ContentConfig],
@@ -138,6 +184,9 @@ def configure_content(
     t: Optional[Union[state.SearchType, str]] = None,
     full_corpus: bool = True,
 ) -> Optional[ContentIndex]:
+    def has_valid_text_config(config: TextContentConfig):
+        return config.input_files or config.input_filter
+
     # Run Validation Checks
     if content_config is None:
         logger.warning("🚨 No Content configuration available.")
@@ -158,7 +207,7 @@ def configure_content(
         # Initialize Org Notes Search
         if (
             (t == None or t == state.SearchType.Org.value)
-            and (content_config.org or files["org"])
+            and ((content_config.org and has_valid_text_config(content_config.org)) or files["org"])
             and search_models.text_search
         ):
             if content_config.org == None:
@@ -187,7 +236,7 @@ def configure_content(
         # Initialize Markdown Search
         if (
             (t == None or t == state.SearchType.Markdown.value)
-            and (content_config.markdown or files["markdown"])
+            and ((content_config.markdown and has_valid_text_config(content_config.markdown)) or files["markdown"])
             and search_models.text_search
             and files["markdown"]
         ):
@@ -218,7 +267,7 @@ def configure_content(
         # Initialize PDF Search
         if (
             (t == None or t == state.SearchType.Pdf.value)
-            and (content_config.pdf or files["pdf"])
+            and ((content_config.pdf and has_valid_text_config(content_config.pdf)) or files["pdf"])
             and search_models.text_search
             and files["pdf"]
         ):
@@ -249,7 +298,7 @@ def configure_content(
         # Initialize Plaintext Search
         if (
             (t == None or t == state.SearchType.Plaintext.value)
-            and (content_config.plaintext or files["plaintext"])
+            and ((content_config.plaintext and has_valid_text_config(content_config.plaintext)) or files["plaintext"])
             and search_models.text_search
             and files["plaintext"]
         ):
diff --git a/tests/test_gpt4all_chat_actors.py b/tests/test_gpt4all_chat_actors.py
index 92b3f9562..d7904ff83 100644
--- a/tests/test_gpt4all_chat_actors.py
+++ b/tests/test_gpt4all_chat_actors.py
@@ -13,7 +13,10 @@
 import freezegun
 from freezegun import freeze_time
 
-from gpt4all import GPT4All
+try:
+    from gpt4all import GPT4All
+except ModuleNotFoundError as e:
+    print("There was an error importing GPT4All. Please run pip install gpt4all in order to install it.")
 
 # Internal Packages
 from khoj.processor.conversation.gpt4all.chat_model import converse_offline, extract_questions_offline, filter_questions