Update DocumentsManagers (#121)
* remove sqlite db support

* update document manager API

* remove entry points for document ingestion

* update README + example app for quickstart

* update tests

* rename documents to documents_manager throughout the code

* make openai embedding model the default choice; add progress bar when computing embeddings; save intermediate embedding file

* remove unused functions

* have option to specify where the csv gets saved

* parallelize embedding computation (see the sketch after this list)

* add test for embedding computation in parallel

* use same embedding function everywhere

* make required columns optional, pass the csv arg in add instead of init

* add try/except when parsing the docs to skip problematic files
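
The bullets above describe the new embedding workflow. Below is a minimal sketch of what parallelized embedding computation with a progress bar could look like; it is not code from this commit. `ThreadPoolExecutor`, `tqdm`, the function name, and the worker count are illustrative assumptions; only `get_embedding` and the `content`/`embedding` columns come from the existing `docparser.py`.

```python
from concurrent.futures import ThreadPoolExecutor

import numpy as np
import pandas as pd
from openai.embeddings_utils import get_embedding  # same helper used in docparser.py
from tqdm import tqdm

EMBEDDING_MODEL = "text-embedding-ada-002"


def compute_embeddings_parallel(df: pd.DataFrame, engine: str = EMBEDDING_MODEL, num_workers: int = 8) -> pd.DataFrame:
    """Embed the 'content' column concurrently; the OpenAI calls are network-bound, so threads help."""
    with ThreadPoolExecutor(max_workers=num_workers) as pool:
        embeddings = list(
            tqdm(pool.map(lambda text: get_embedding(text, engine=engine), df.content), total=len(df))
        )
    df["embedding"] = [np.asarray(emb, dtype=np.float32) for emb in embeddings]
    return df
```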
jerpint authored Aug 10, 2023
1 parent 4a730ce commit c662476
Showing 25 changed files with 394 additions and 880 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -2,6 +2,7 @@
*.db

buster/apps/data/
deeplake_store/
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
48 changes: 37 additions & 11 deletions README.md
@@ -11,27 +11,53 @@ You can try out our [live demo here](https://huggingface.co/spaces/jerpint/buste

Here is a quick guide to help you deploy buster on your own dataset!

First step, install buster locally. Note that buster requires python>=3.10.
We will look at deploying a simple app locally.
First step, install buster. Note that buster requires python>=3.10.

```
git clone https://github.com/jerpint/buster.git
pip install -e .
pip install .
```

Then, go to the examples folder. We've attached a sample `stackoverflow.csv` file to help you get started. You will convert the .csv to a `documents.db` file.
Then, go to the examples folder:

```
buster_csv_parser stackoverflow.csv --output-filepath documents.db
cd buster/buster/examples
```

This will generate the embeddings and save them locally. Finally, run
We've attached a sample `stackoverflow.csv` file to help you get started.

```
gradio gradio_app.py
```
First, ingest the documents so buster can use them. In this example, we use Deeplake's vector store, but you can always write your own custom `DocumentsManager` (see the sketch further below):


```python
import pandas as pd
from buster.documents_manager import DeepLakeDocumentsManager

# Read the csv
df = pd.read_csv("stackoverflow.csv")

# Generate the embeddings for our documents and store them in a deeplake format
dm = DeepLakeDocumentsManager(vector_store_path="deeplake_store", overwrite=True)
dm.add(df)
```

You can also simply run the script:

```
python generate_embeddings.py
```


This will generate the embeddings and save them locally in the `deeplake_store` folder.
Note: You only need to run this operation one time.
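
If Deeplake doesn't fit your setup, a custom manager only needs to expose the same `add(df)` interface used above. The class below is a rough, hypothetical sketch that keeps everything in memory and skips embeddings entirely; the actual `DocumentsManager` base class in `buster.documents_manager` may require different hooks, so check its definition before writing your own.

```python
import pandas as pd


class InMemoryDocumentsManager:
    """Toy manager that just accumulates documents in a DataFrame (illustration only)."""

    def __init__(self):
        self.documents = pd.DataFrame()

    def add(self, df: pd.DataFrame):
        # A real manager would compute embeddings here and persist them to a vector store.
        self.documents = pd.concat([self.documents, df], ignore_index=True)
```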

Now, you can launch your gradio app:

```
gradio gradio_app.py
```

This will launch the gradio app locally, which you should be able to view at [localhost](http://127.0.0.1:7860).

In the .csv, we expect the columns ["title", "url", "content", "source"] for each row (a minimal example follows this list):

* title: this will be the title of the url to display
* url: the actual link that will be shown to the user
* source: where the content was originally sourced from (e.g. wikipedia, medium, etc.)
* content: plaintext of the document to be embedded. Note that we do not do any chunking (yet). It is your responsibility to ensure each document is of an appropriate context length.
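
As a reference, here is a minimal sketch of a dataframe with the expected columns that could be passed to `dm.add(df)` as above; the row values are made up.

```python
import pandas as pd

# Dummy single-row dataframe with the four expected columns.
df = pd.DataFrame(
    [
        {
            "title": "How do I flatten a list of lists?",
            "url": "https://example.com/questions/flatten-a-list",
            "source": "stackoverflow",
            "content": "You can use itertools.chain.from_iterable to flatten a list of lists ...",
        }
    ]
)
```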

## How does Buster work?

@@ -57,4 +83,4 @@ For more information, you can watch the livestream where explain how buster work

- [Livestream recording](https://youtu.be/LB5g-AhfPG8)

- [Livestream notebook](https://colab.research.google.com/drive/1CosxSNod48KrkyBn5_vkeleb7u0CrBa6)
- [Livestream notebook](https://colab.research.google.com/drive/1CosxSNod48KrkyBn5_vkeleb7u0CrBa6) (Note this notebook is deprecated and not maintained anymore)
5 changes: 4 additions & 1 deletion buster/completers/base.py
@@ -1,5 +1,6 @@
import logging
import os
import warnings
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from typing import Any, Iterator, Optional
@@ -261,7 +262,9 @@ def get_completion(
logger.info(f"{user_input=}")

if len(matched_documents) == 0:
logger.warning("no documents found...")
warning_msg = "No documents found during retrieval."
warnings.warn(warning_msg)
logger.warning(warning_msg)

# empty dataframe
matched_documents = pd.DataFrame(columns=matched_documents.columns)
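
Switching from a silent log line to `warnings.warn` lets callers escalate the empty-retrieval case with the standard `warnings` filters. Here is a self-contained usage sketch; the stub below only mirrors the new warning behavior and is not the real completer.

```python
import warnings


def get_completion_stub(matched_documents: list):
    # Mirrors the new behavior: warn when retrieval comes back empty.
    if len(matched_documents) == 0:
        warnings.warn("No documents found during retrieval.")
    return "..."


# Escalate the warning to a hard error, e.g. in a batch pipeline.
with warnings.catch_warnings():
    warnings.simplefilter("error")  # turn warnings.warn(...) into exceptions
    try:
        get_completion_stub(matched_documents=[])
    except UserWarning as warning:
        print(f"Retrieval returned nothing: {warning}")
```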
116 changes: 7 additions & 109 deletions buster/docparser.py
@@ -4,24 +4,16 @@
from pathlib import Path
from typing import Type

import click
import numpy as np
import pandas as pd
import tiktoken
from bs4 import BeautifulSoup
from openai.embeddings_utils import get_embedding

from buster.documents import DocumentsManager
from buster.documents.sqlite.documents import DocumentsDB
from buster.parser import HuggingfaceParser, Parser, SphinxParser

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

EMBEDDING_MODEL = "text-embedding-ada-002"
EMBEDDING_ENCODING = "cl100k_base" # this the encoding for text-embedding-ada-002


supported_docs = {
"mila": {
"base_url": "https://docs.mila.quebec/",
@@ -104,108 +96,14 @@ def get_all_documents(

dfs = []
for file in files:
filepath = os.path.join(root_dir, file)
df = get_document(filepath, base_url, parser_cls, min_section_length, max_section_length)
dfs.append(df)
try:
filepath = os.path.join(root_dir, file)
df = get_document(filepath, base_url, parser_cls, min_section_length, max_section_length)
dfs.append(df)
except:
print(f"Skipping {filepath}...")
continue

documents_df = pd.concat(dfs, ignore_index=True)

return documents_df


def compute_n_tokens(
df: pd.DataFrame, embedding_encoding: str = EMBEDDING_ENCODING, col: str = "content"
) -> pd.DataFrame:
"""Counts the tokens in the content column and adds the count to a n_tokens column."""
logger.info("Computing tokens counts...")
encoding = tiktoken.get_encoding(encoding_name=embedding_encoding)
# TODO are there unexpected consequences of allowing endoftext?
df["n_tokens"] = df[col].apply(lambda x: len(encoding.encode(x, allowed_special={"<|endoftext|>"})))
return df


def max_word_count(df: pd.DataFrame, max_words: int, col: str = "content") -> pd.DataFrame:
"""Trim the word count of an entry to max_words"""
assert df[col].apply(lambda s: isinstance(s, str)).all(), f"Column {col} must contain only strings"
word_counts_before = df[col].apply(lambda x: len(x.split()))
df[col] = df[col].apply(lambda x: " ".join(x.split()[:max_words]))
word_counts_after = df[col].apply(lambda x: len(x.split()))

trimmed = df[word_counts_before == word_counts_after]
logger.info(f"trimmed {len(trimmed)} documents to {max_words} words.")

return df


def compute_embeddings(df: pd.DataFrame, engine: str = EMBEDDING_MODEL, col="embedding") -> pd.DataFrame:
logger.info(f"Computing embeddings for {len(df)} documents...")
df[col] = df.content.apply(lambda x: np.asarray(get_embedding(x, engine=engine), dtype=np.float32))
logger.info(f"Done computing embeddings for {len(df)} documents.")
return df


def generate_embeddings_parser(root_dir: str, output_filepath: str, source: str) -> pd.DataFrame:
documents = get_all_documents(root_dir, supported_docs[source]["base_url"], supported_docs[source]["parser"])
return generate_embeddings(documents, output_filepath)


def documents_to_db(
documents: pd.DataFrame,
documents_manager: DocumentsManager,
):
logger.info("Preparing database...")
sources = documents["source"].unique()
for source in sources:
df = documents[documents.source == source]
documents_manager.add(df)
logger.info(f"Documents saved to documents manager: {documents_manager}")


def update_source(source: str, documents_manager: DocumentsManager, display_name: str = None, note: str = None):
documents_manager.update_source(source, display_name, note)


def generate_embeddings(
documents: pd.DataFrame,
documents_manager: DocumentsManager,
max_words=500,
embedding_engine: str = EMBEDDING_MODEL,
) -> pd.DataFrame:
# check that we have the appropriate columns in our dataframe

assert set(required_cols := ["content", "title", "url"]).issubset(
set(documents.columns)
), f"Your dataframe must contain {required_cols}."

# Get all documents and precompute their embeddings
documents = max_word_count(documents, max_words=max_words)
documents = compute_n_tokens(documents)
documents = compute_embeddings(documents, engine=embedding_engine)

# save the documents to a db for later use
documents_to_db(documents, documents_manager)

return documents


@click.command()
@click.argument("documents-csv")
@click.option(
"--output-filepath", default="documents.db", help='Where your database will be saved. Default is "documents.db"'
)
@click.option(
"--max-words", default=500, help="Number of maximum allowed words per document, excess is trimmed. Default is 500"
)
@click.option(
"--embeddings-engine", default=EMBEDDING_MODEL, help=f"Embedding model to use. Default is {EMBEDDING_MODEL}"
)
def main(documents_csv: str, output_filepath: str, max_words: int, embeddings_engine: str):
# prepare the documents manager
documents_manager = DocumentsDB(output_filepath)

documents = pd.read_csv(documents_csv)
documents = generate_embeddings(documents, documents_manager, max_words, embeddings_engine)


if __name__ == "__main__":
main()
21 changes: 0 additions & 21 deletions buster/documents/base.py

This file was deleted.

102 changes: 0 additions & 102 deletions buster/documents/deeplake.py

This file was deleted.

3 changes: 0 additions & 3 deletions buster/documents/sqlite/__init__.py

This file was deleted.
