diff --git a/buster/docparser.py b/buster/docparser.py
index 05c7116..24e1b7e 100644
--- a/buster/docparser.py
+++ b/buster/docparser.py
@@ -8,6 +8,7 @@
 import pandas as pd
 from bs4 import BeautifulSoup
 from openai.embeddings_utils import get_embedding
+from tqdm import tqdm
 
 from buster.parser import HuggingfaceParser, Parser, SphinxParser
 
@@ -49,7 +50,8 @@
 
 
 def get_document(
-    filepath: str,
+    root_dir: str,
+    file: str,
     base_url: str,
     parser_cls: Type[Parser],
     min_section_length: int = 100,
@@ -60,12 +62,12 @@ def get_document(
     Sections are broken into subsections if they are longer than `max_section_length`.
     Sections correspond to `section` HTML tags that have a headerlink attached.
     """
+    filepath = os.path.join(root_dir, file)
     with open(filepath, "r") as f:
         source = f.read()
 
-    filename = Path(filepath).name
     soup = BeautifulSoup(source, "html.parser")
-    parser = parser_cls(soup, base_url, filename, min_section_length, max_section_length)
+    parser = parser_cls(soup, base_url, root_dir, filepath, min_section_length, max_section_length)
 
     sections = []
     urls = []
@@ -95,13 +97,12 @@ def get_all_documents(
     files = glob.glob("**/*.html", root_dir=root_dir, recursive=True)
 
     dfs = []
-    for file in files:
+    for file in tqdm(files):
         try:
-            filepath = os.path.join(root_dir, file)
-            df = get_document(filepath, base_url, parser_cls, min_section_length, max_section_length)
+            df = get_document(root_dir, file, base_url, parser_cls, min_section_length, max_section_length)
             dfs.append(df)
-        except:
-            print(f"Skipping {filepath}...")
+        except Exception as e:
+            print(f"Skipping {file} due to the following error: {e}")
             continue
 
     documents_df = pd.concat(dfs, ignore_index=True)
diff --git a/buster/examples/generate_embeddings.py b/buster/examples/generate_embeddings.py
index b527c9b..a0f7926 100644
--- a/buster/examples/generate_embeddings.py
+++ b/buster/examples/generate_embeddings.py
@@ -11,4 +11,4 @@
 dm = DeepLakeDocumentsManager(vector_store_path="deeplake_store", overwrite=True, required_columns=REQUIRED_COLUMNS)
 
 # Generate the embeddings for our documents and store them to the deeplake store
-dm.add(df, csv_checkpoint="embeddings.csv")
+dm.add(df, csv_filename="embeddings.csv")
diff --git a/buster/examples/gradio_app.py b/buster/examples/gradio_app.py
index e5456d8..c46a708 100644
--- a/buster/examples/gradio_app.py
+++ b/buster/examples/gradio_app.py
@@ -49,7 +49,7 @@ def chat(history):
         yield history, completion
 
 
-block = gr.Blocks(css="#chatbot .overflow-y-auto{height:500px}")
+block = gr.Blocks()
 
 with block:
     with gr.Row():
diff --git a/buster/parser.py b/buster/parser.py
index 0ea2e6e..e79470b 100644
--- a/buster/parser.py
+++ b/buster/parser.py
@@ -3,6 +3,7 @@
 from abc import ABC, abstractmethod
 from dataclasses import InitVar, dataclass, field
 from itertools import takewhile, zip_longest
+from pathlib import Path
 from typing import Iterator
 
 import bs4
@@ -73,13 +74,25 @@ def get_chunks(self, min_length: int, max_length: int) -> Iterator["Section"]:
 class Parser(ABC):
     soup: BeautifulSoup
     base_url: str
-    filename: str
+    root_dir: str
+    filepath: str
     min_section_length: int = 100
     max_section_length: int = 2000
 
-    @abstractmethod
+    @property
+    def relative_path(self) -> str:
+        """Gets the relative path of the file to the root dir.
+
+        This is particularly useful for websites with pages, subdomains, etc.
+        The split is to remove the .html extension
+        """
+        parent = Path(self.root_dir)
+        son = Path(self.filepath)
+        self._relative_path = str(son.relative_to(parent)).split(".")[0]
+        return self._relative_path
+
     def build_url(self, suffix: str) -> str:
-        ...
+        return self.base_url + self.relative_path + suffix
 
     @abstractmethod
     def find_sections(self) -> Iterator[Section]:
@@ -111,9 +124,6 @@ def find_sections(self) -> Iterator[Section]:
             yield section
         return
 
-    def build_url(self, suffix: str) -> str:
-        return self.base_url + self.filename + suffix
-
 
 class HuggingfaceParser(Parser):
     def find_sections(self) -> Iterator[Section]:
@@ -122,12 +132,8 @@ def find_sections(self) -> Iterator[Section]:
             href = section.find("a", href=True, class_="header-link")
             nodes = list(takewhile(lambda sibling: sibling != next_section, section.find_next_siblings()))
 
-            url = self.build_url(href["href"].strip().replace("\n", ""))
+            suffix = href["href"].strip().replace("\n", "")
+            url = self.build_url(suffix)
             name = section.text.strip().replace("\n", "")
 
             yield Section(url, name, nodes)
-        return
-
-    def build_url(self, suffix: str) -> str:
-        # The splitext is to remove the .html extension
-        return self.base_url + os.path.splitext(self.filename)[0] + suffix
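
Note (not part of the patch): a minimal sketch of how the new Parser.relative_path property and the shared Parser.build_url are expected to compose page URLs after this change. The root_dir, filepath, base_url, and anchor values below are made-up examples, not values taken from the repository.

    from pathlib import Path

    # Hypothetical inputs, mirroring what get_all_documents/get_document pass to the parser.
    root_dir = "/tmp/docs"
    filepath = "/tmp/docs/en/chapter1/intro.html"
    base_url = "https://example.com/docs/"

    # Same computation as the new Parser.relative_path property:
    # the path relative to root_dir, with the .html extension split off.
    relative_path = str(Path(filepath).relative_to(root_dir)).split(".")[0]
    print(relative_path)  # en/chapter1/intro

    # Same computation as the shared Parser.build_url, given a section anchor as the suffix.
    suffix = "#some-section"
    print(base_url + relative_path + suffix)  # https://example.com/docs/en/chapter1/intro#some-section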