Commit: Merge branch 'main' into patch-1

yuhongsun96 authored Oct 31, 2023
2 parents ebecb44 + c6663d8 commit 8812601

Showing 256 changed files with 10,892 additions and 5,060 deletions.
82 changes: 27 additions & 55 deletions CONTRIBUTING.md
@@ -6,7 +6,7 @@ As an open source project in a rapidly changing space, we welcome all contributi

## 💃 Guidelines
### Contribution Opportunities
The [GitHub issues](https://github.com/danswer-ai/danswer/issues) page is a great place to start for contribution ideas.
The [GitHub Issues](https://github.com/danswer-ai/danswer/issues) page is a great place to start for contribution ideas.

Issues that have been explicitly approved by the maintainers (aligned with the direction of the project)
will be marked with the `approved by maintainers` label.
@@ -19,7 +19,9 @@ If you have a new/different contribution in mind, we'd love to hear about it!
Your input is vital to making sure that Danswer moves in the right direction.
Before starting on implementation, please raise a GitHub issue.

And always feel free to message us (Chris Weaver / Yuhong Sun) on Slack / Discord directly about anything at all.
And always feel free to message us (Chris Weaver / Yuhong Sun) on
[Slack](https://join.slack.com/t/danswer/shared_invite/zt-1u3h3ke3b-VGh1idW19R8oiNRiKBYv2w) /
[Discord](https://discord.gg/TDJ59cGV2X) directly about anything at all.


### Contributing Code
@@ -44,8 +46,8 @@ We would love to see you there!


## Get Started 🚀
Danswer being a fully functional app, relies on several external pieces of software, specifically:
- Postgres (Relational DB)
Danswer, being a fully functional app, relies on some external pieces of software, specifically:
- [Postgres](https://www.postgresql.org/) (Relational DB)
- [Vespa](https://vespa.ai/) (Vector DB/Search Engine)

This guide provides instructions to set up the Danswer-specific services outside of Docker because it's easier for
@@ -54,11 +56,9 @@ development purposes but also feel free to just use the containers and update wi


### Local Set Up
We've tested primarily with Python versions >= 3.11 but the code should work with Python >= 3.9.
It is recommended to use Python versions >= 3.11.

This guide skips a few optional features for simplicity, reach out if you need any of these:
- User Authentication feature
- File Connector background job
This guide skips setting up User Authentication for the sake of simplicity.


#### Installing Requirements
@@ -93,18 +93,11 @@ playwright install


#### Dependent Docker Containers
First navigate to `danswer/deployment/docker_compose`, then start up the containers with:

Postgres:
```bash
docker compose -f docker-compose.dev.yml -p danswer-stack up -d relational_db
```

Vespa:
First navigate to `danswer/deployment/docker_compose`, then start up Vespa and Postgres with:
```bash
docker compose -f docker-compose.dev.yml -p danswer-stack up -d index
docker compose -f docker-compose.dev.yml -p danswer-stack up -d index relational_db
```

(`index` refers to Vespa and `relational_db` refers to Postgres)
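
To confirm both containers came up, you can optionally check their status (not part of the original instructions; `docker compose ps` is the standard way to list the services of a compose project):
```bash
# Optional: verify that the index (Vespa) and relational_db (Postgres) services are running
docker compose -f docker-compose.dev.yml -p danswer-stack ps
```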

#### Running Danswer

@@ -115,71 +108,50 @@ mkdir dynamic_config_storage

To start the frontend, navigate to `danswer/web` and run:
```bash
AUTH_TYPE=disabled npm run dev
npm run dev
```
_for Windows, run:_

Package the Vespa schema. This will only need to be done when the Vespa schema is updated locally.

Navigate to `danswer/backend/danswer/document_index/vespa/app_config` and run:
```bash
(SET "AUTH_TYPE=disabled" && npm run dev)
zip -r ../vespa-app.zip .
```
- Note: If you don't have the `zip` utility, you will need to install it prior to running the above
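
If `zip` is missing, it can typically be installed with the system package manager, for example (illustrative commands for Debian/Ubuntu and Homebrew; adjust for your platform):
```bash
# Debian/Ubuntu
sudo apt-get install -y zip
# macOS (Homebrew)
brew install zip
```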

The first time you run Danswer, you will also need to run the DB migrations for Postgres.
After the first time, this is no longer required unless the DB models change.

The first time running Danswer, you will need to run the DB migrations for Postgres.
Navigate to `danswer/backend` and with the venv active, run:
```bash
alembic upgrade head
```
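
Optionally, you can confirm which revision the database is now at using standard Alembic commands (not part of the original guide):
```bash
# Show the revision currently applied to the database
alembic current
# Show the migration history
alembic history
```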

Additionally, we have to package the Vespa schema deployment:
Nagivate to `danswer/backend/danswer/datastores/vespa/app_config` and run:
Next, start the task queue, which orchestrates the background jobs.
Jobs that take more time are run asynchronously, outside of the API server.

Still in `danswer/backend`, run:
```bash
zip -r ../vespa-app.zip .
python ./scripts/dev_run_background_jobs.py
```
- Note: If you don't have the `zip` utility, you will need to install it prior to running the above

To run the backend API server, navigate back to `danswer/backend` and run:
```bash
AUTH_TYPE=disabled \
DYNAMIC_CONFIG_DIR_PATH=./dynamic_config_storage \
VESPA_DEPLOYMENT_ZIP=./danswer/datastores/vespa/vespa-app.zip \
VESPA_DEPLOYMENT_ZIP=./danswer/document_index/vespa/vespa-app.zip \
uvicorn danswer.main:app --reload --port 8080
```
_For Windows (for compatibility with both PowerShell and Command Prompt):_
```bash
powershell -Command "
$env:AUTH_TYPE='disabled'
$env:DYNAMIC_CONFIG_DIR_PATH='./dynamic_config_storage'
$env:VESPA_DEPLOYMENT_ZIP='./danswer/datastores/vespa/vespa-app.zip'
$env:VESPA_DEPLOYMENT_ZIP='./danswer/document_index/vespa/vespa-app.zip'
uvicorn danswer.main:app --reload --port 8080
"
```
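
Once uvicorn reports that it is running, you can optionally confirm the server responds. The `/docs` path below assumes FastAPI's default interactive docs are enabled, which this guide does not state explicitly:
```bash
# Optional sanity check; /docs is assumed to be the FastAPI default docs route
curl -s -o /dev/null -w "%{http_code}\n" http://localhost:8080/docs
```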

To run the background job to check for connector updates and index documents, navigate to `danswer/backend` and run:
```bash
PYTHONPATH=. DYNAMIC_CONFIG_DIR_PATH=./dynamic_config_storage python danswer/background/update.py
```
_For Windows:_
```bash
powershell -Command " $env:PYTHONPATH='.'; $env:DYNAMIC_CONFIG_DIR_PATH='./dynamic_config_storage'; python danswer/background/update.py "
```

To run the background job to periodically check for document set updates, navigate to `danswer/backend` and run:
```bash
PYTHONPATH=. DYNAMIC_CONFIG_DIR_PATH=./dynamic_config_storage python danswer/background/document_set_sync_script.py
```
_For Windows:_
```bash
powershell -Command " $env:PYTHONPATH='.'; $env:DYNAMIC_CONFIG_DIR_PATH='./dynamic_config_storage'; python danswer/background/document_set_sync_script.py "
```

To run Celery, which handles deletion of connectors + syncing of document sets, navigate to `danswer/backend` and run:
```bash
PYTHONPATH=. DYNAMIC_CONFIG_DIR_PATH=./dynamic_config_storage celery -A danswer.background.celery worker --loglevel=info --concurrency=1
```
_For Windows:_
```bash
powershell -Command " $env:PYTHONPATH='.'; $env:DYNAMIC_CONFIG_DIR_PATH='./dynamic_config_storage'; celery -A danswer.background.celery worker --loglevel=info --concurrency=1 "
```

Note: if you need finer logging, add the environment variable `LOG_LEVEL=DEBUG` to the relevant services.
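
For example, to run the API server with debug logging, prepend `LOG_LEVEL=DEBUG` to the command shown earlier (a sketch combining it with the variables already used above):
```bash
# Same API server invocation as above, with debug-level logging enabled
LOG_LEVEL=DEBUG \
AUTH_TYPE=disabled \
DYNAMIC_CONFIG_DIR_PATH=./dynamic_config_storage \
VESPA_DEPLOYMENT_ZIP=./danswer/document_index/vespa/vespa-app.zip \
uvicorn danswer.main:app --reload --port 8080
```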

### Formatting and Linting
2 changes: 0 additions & 2 deletions backend/.gitignore
@@ -4,8 +4,6 @@ site_crawls/
.ipynb_checkpoints/
api_keys.py
*ipynb
qdrant-data/
typesense-data/
.env
vespa-app.zip
dynamic_config_storage/
59 changes: 29 additions & 30 deletions backend/Dockerfile
@@ -1,54 +1,52 @@
FROM python:3.11.4-slim-bookworm

RUN apt-get update \
&& apt-get install -y git cmake pkg-config libprotobuf-c-dev protobuf-compiler \
# Install system dependencies
RUN apt-get update && \
apt-get install -y git cmake pkg-config libprotobuf-c-dev protobuf-compiler \
libprotobuf-dev libgoogle-perftools-dev libpq-dev build-essential cron curl \
supervisor zip \
&& rm -rf /var/lib/apt/lists/*

COPY ./requirements/default.txt /tmp/requirements.txt
RUN pip install --no-cache-dir --upgrade -r /tmp/requirements.txt
supervisor zip ca-certificates gnupg && \
rm -rf /var/lib/apt/lists/* && \
apt-get clean

# Install Python dependencies
# Remove py which is pulled in by retry, py is not needed and is a CVE
RUN pip uninstall -y py

RUN playwright install chromium
RUN playwright install-deps chromium
COPY ./requirements/default.txt /tmp/requirements.txt
RUN pip install --no-cache-dir --upgrade -r /tmp/requirements.txt && \
pip uninstall -y py && \
playwright install chromium && \
playwright install-deps chromium

# install nodejs and replace nodejs packaged with playwright (18.17.0) with the one installed below
# install nodejs and replace nodejs packaged with playwright (18.17.0) with the one installed below
# based on the instructions found here:
# https://nodejs.org/en/download/package-manager#debian-and-ubuntu-based-linux-distributions
# this is temporarily needed until playwright updates their packaged node version to
# 20.5.1+
RUN apt-get update
RUN apt-get install -y ca-certificates curl gnupg
RUN mkdir -p /etc/apt/keyrings
RUN curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg
RUN echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_20.x nodistro main" | tee /etc/apt/sources.list.d/nodesource.list
RUN apt-get update
RUN apt-get install nodejs -y
# replace nodejs packaged with playwright (18.17.0) with the one installed above
RUN cp /usr/bin/node /usr/local/lib/python3.11/site-packages/playwright/driver/node
# remove nodejs (except for the binary we moved into playwright)
RUN apt-get remove -y nodejs
RUN mkdir -p /etc/apt/keyrings && \
curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg && \
echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_20.x nodistro main" | tee /etc/apt/sources.list.d/nodesource.list && \
apt-get update && \
apt-get install -y nodejs && \
cp /usr/bin/node /usr/local/lib/python3.11/site-packages/playwright/driver/node && \
apt-get remove -y nodejs

# Cleanup for CVEs and size reduction
RUN apt-get remove -y linux-libc-dev \
&& apt-get autoremove -y \
&& rm -rf /var/lib/apt/lists/*

# Remove tornado test key to placate vulnerability scanners
# More details can be found here:
# https://github.com/tornadoweb/tornado/issues/3107
RUN rm /usr/local/lib/python3.11/site-packages/tornado/test/test.key
RUN apt-get remove -y linux-libc-dev && \
apt-get autoremove -y && \
rm -rf /var/lib/apt/lists/* && \
rm /usr/local/lib/python3.11/site-packages/tornado/test/test.key

# Set up application files
WORKDIR /app
COPY ./danswer /app/danswer
COPY ./alembic /app/alembic
COPY ./alembic.ini /app/alembic.ini
COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf

WORKDIR /app/danswer/datastores/vespa/app_config
# Create Vespa app zip
WORKDIR /app/danswer/document_index/vespa/app_config
RUN zip -r /app/danswer/vespa-app.zip .
WORKDIR /app

@@ -57,5 +55,6 @@ COPY ./scripts/migrate_vespa_to_acl.py /app/migrate_vespa_to_acl.py

ENV PYTHONPATH /app

# By default this container does nothing, it is used by api server and background which specify their own CMD
# Default command which does nothing
# This container is used by api server and background which specify their own CMD
CMD ["tail", "-f", "/dev/null"]
7 changes: 4 additions & 3 deletions backend/alembic/env.py
@@ -7,6 +7,7 @@
from sqlalchemy import pool
from sqlalchemy.engine import Connection
from sqlalchemy.ext.asyncio import create_async_engine
from celery.backends.database.session import ResultModelBase # type: ignore

# this is the Alembic Config object, which provides
# access to the values within the .ini file in use.
@@ -21,7 +22,7 @@
# for 'autogenerate' support
# from myapp import mymodel
# target_metadata = mymodel.Base.metadata
target_metadata = Base.metadata
target_metadata = [Base.metadata, ResultModelBase.metadata]

# other values from the config, defined by the needs of env.py,
# can be acquired:
@@ -44,7 +45,7 @@ def run_migrations_offline() -> None:
url = build_connection_string()
context.configure(
url=url,
target_metadata=target_metadata,
target_metadata=target_metadata, # type: ignore
literal_binds=True,
dialect_opts={"paramstyle": "named"},
)
@@ -54,7 +55,7 @@


def do_run_migrations(connection: Connection) -> None:
context.configure(connection=connection, target_metadata=target_metadata)
context.configure(connection=connection, target_metadata=target_metadata) # type: ignore

with context.begin_transaction():
context.run_migrations()
37 changes: 37 additions & 0 deletions backend/alembic/versions/30c1d5744104_persona_datetime_aware.py
@@ -0,0 +1,37 @@
"""Persona Datetime Aware
Revision ID: 30c1d5744104
Revises: 7f99be1cb9f5
Create Date: 2023-10-16 23:21:01.283424
"""
from alembic import op
import sqlalchemy as sa

# revision identifiers, used by Alembic.
revision = "30c1d5744104"
down_revision = "7f99be1cb9f5"
branch_labels = None
depends_on = None


def upgrade() -> None:
op.add_column("persona", sa.Column("datetime_aware", sa.Boolean(), nullable=True))
op.execute("UPDATE persona SET datetime_aware = TRUE")
op.alter_column("persona", "datetime_aware", nullable=False)
op.create_index(
"_default_persona_name_idx",
"persona",
["name"],
unique=True,
postgresql_where=sa.text("default_persona = true"),
)


def downgrade() -> None:
op.drop_index(
"_default_persona_name_idx",
table_name="persona",
postgresql_where=sa.text("default_persona = true"),
)
op.drop_column("persona", "datetime_aware")
49 changes: 49 additions & 0 deletions backend/alembic/versions/3b25685ff73c_move_is_public_to_cc_pair.py
@@ -0,0 +1,49 @@
"""Move is_public to cc_pair
Revision ID: 3b25685ff73c
Revises: e0a68a81d434
Create Date: 2023-10-05 18:47:09.582849
"""
from alembic import op
import sqlalchemy as sa

# revision identifiers, used by Alembic.
revision = "3b25685ff73c"
down_revision = "e0a68a81d434"
branch_labels = None
depends_on = None


def upgrade() -> None:
op.add_column(
"connector_credential_pair",
sa.Column("is_public", sa.Boolean(), nullable=True),
)
# fill in is_public for existing rows
op.execute(
"UPDATE connector_credential_pair SET is_public = true WHERE is_public IS NULL"
)
op.alter_column("connector_credential_pair", "is_public", nullable=False)

op.add_column(
"credential",
sa.Column("is_admin", sa.Boolean(), nullable=True),
)
op.execute("UPDATE credential SET is_admin = true WHERE is_admin IS NULL")
op.alter_column("credential", "is_admin", nullable=False)

op.drop_column("credential", "public_doc")


def downgrade() -> None:
op.add_column(
"credential",
sa.Column("public_doc", sa.Boolean(), nullable=True),
)
# setting public_doc to false for all existing rows to be safe
# NOTE: this is likely not the correct state of the world but it's the best we can do
op.execute("UPDATE credential SET public_doc = false WHERE public_doc IS NULL")
op.alter_column("credential", "public_doc", nullable=False)
op.drop_column("connector_credential_pair", "is_public")
op.drop_column("credential", "is_admin")
@@ -35,6 +35,7 @@ def upgrade() -> None:
"SUCCESS",
"FAILED",
name="indexingstatus",
native_enum=False,
),
nullable=False,
),