
Commit

fix relative paths in parser (#125)
* fix relative paths in parser
jerpint authored Aug 17, 2023
1 parent e0fbbd6 commit 24acaac
Showing 4 changed files with 29 additions and 22 deletions.
buster/docparser.py (17 changes: 9 additions & 8 deletions)

@@ -8,6 +8,7 @@
 import pandas as pd
 from bs4 import BeautifulSoup
 from openai.embeddings_utils import get_embedding
+from tqdm import tqdm

 from buster.parser import HuggingfaceParser, Parser, SphinxParser
@@ -49,7 +50,8 @@


 def get_document(
-    filepath: str,
+    root_dir: str,
+    file: str,
     base_url: str,
     parser_cls: Type[Parser],
     min_section_length: int = 100,
@@ -60,12 +62,12 @@ def get_document(
     Sections are broken into subsections if they are longer than `max_section_length`.
     Sections correspond to `section` HTML tags that have a headerlink attached.
     """
+    filepath = os.path.join(root_dir, file)
     with open(filepath, "r") as f:
         source = f.read()

-    filename = Path(filepath).name
     soup = BeautifulSoup(source, "html.parser")
-    parser = parser_cls(soup, base_url, filename, min_section_length, max_section_length)
+    parser = parser_cls(soup, base_url, root_dir, filepath, min_section_length, max_section_length)

     sections = []
     urls = []
@@ -95,13 +97,12 @@ def get_all_documents(
     files = glob.glob("**/*.html", root_dir=root_dir, recursive=True)

     dfs = []
-    for file in files:
+    for file in tqdm(files):
         try:
-            filepath = os.path.join(root_dir, file)
-            df = get_document(filepath, base_url, parser_cls, min_section_length, max_section_length)
+            df = get_document(root_dir, file, base_url, parser_cls, min_section_length, max_section_length)
             dfs.append(df)
-        except:
-            print(f"Skipping {filepath}...")
+        except Exception as e:
+            print(f"Skipping {file} due to the following error: {e}")
             continue

     documents_df = pd.concat(dfs, ignore_index=True)
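For context, a minimal sketch of calling the updated get_document with hypothetical paths (root_dir and file are now passed separately, so each page's location relative to the documentation root is preserved instead of being reduced to a bare filename):

from buster.docparser import get_document
from buster.parser import HuggingfaceParser

# Hypothetical paths: `file` is relative to `root_dir`, and get_document
# now joins the two itself rather than receiving a pre-joined filepath.
df = get_document(
    root_dir="/tmp/docs",
    file="guide/intro.html",
    base_url="https://example.com/docs/",
    parser_cls=HuggingfaceParser,
)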
buster/examples/generate_embeddings.py (2 changes: 1 addition & 1 deletion)

@@ -11,4 +11,4 @@
 dm = DeepLakeDocumentsManager(vector_store_path="deeplake_store", overwrite=True, required_columns=REQUIRED_COLUMNS)

 # Generate the embeddings for our documents and store them to the deeplake store
-dm.add(df, csv_checkpoint="embeddings.csv")
+dm.add(df, csv_filename="embeddings.csv")
buster/examples/gradio_app.py (2 changes: 1 addition & 1 deletion)

@@ -49,7 +49,7 @@ def chat(history):
         yield history, completion


-block = gr.Blocks(css="#chatbot .overflow-y-auto{height:500px}")
+block = gr.Blocks()

 with block:
     with gr.Row():
buster/parser.py (30 changes: 18 additions & 12 deletions)

@@ -3,6 +3,7 @@
 from abc import ABC, abstractmethod
 from dataclasses import InitVar, dataclass, field
 from itertools import takewhile, zip_longest
+from pathlib import Path
 from typing import Iterator

 import bs4
@@ -73,13 +74,25 @@ def get_chunks(self, min_length: int, max_length: int) -> Iterator["Section"]:
 class Parser(ABC):
     soup: BeautifulSoup
     base_url: str
-    filename: str
+    root_dir: str
+    filepath: str
     min_section_length: int = 100
     max_section_length: int = 2000

-    @abstractmethod
+    @property
+    def relative_path(self) -> str:
+        """Gets the relative path of the file to the root dir.
+        This is particularly useful for websites with pages, subdomains, etc.
+        The split is to remove the .html extension
+        """
+        parent = Path(self.root_dir)
+        son = Path(self.filepath)
+        self._relative_path = str(son.relative_to(parent)).split(".")[0]
+        return self._relative_path
+
     def build_url(self, suffix: str) -> str:
-        ...
+        return self.base_url + self.relative_path + suffix

     @abstractmethod
     def find_sections(self) -> Iterator[Section]:
@@ -111,9 +124,6 @@ def find_sections(self) -> Iterator[Section]:
             yield section
         return

-    def build_url(self, suffix: str) -> str:
-        return self.base_url + self.filename + suffix
-

 class HuggingfaceParser(Parser):
     def find_sections(self) -> Iterator[Section]:
@@ -122,12 +132,8 @@ def find_sections(self) -> Iterator[Section]:
             href = section.find("a", href=True, class_="header-link")
             nodes = list(takewhile(lambda sibling: sibling != next_section, section.find_next_siblings()))

-            url = self.build_url(href["href"].strip().replace("\n", ""))
+            suffix = href["href"].strip().replace("\n", "")
+            url = self.build_url(suffix)
             name = section.text.strip().replace("\n", "")
             yield Section(url, name, nodes)

         return
-
-    def build_url(self, suffix: str) -> str:
-        # The splitext is to remove the .html extension
-        return self.base_url + os.path.splitext(self.filename)[0] + suffix
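To see what the new relative_path property computes, here is a small self-contained sketch of the same pathlib logic with hypothetical paths. The old code kept only Path(filepath).name, so pages in subdirectories lost their directory prefix when URLs were built; relative_to() keeps it:

from pathlib import Path

root_dir = "/tmp/docs"                   # hypothetical docs root
filepath = "/tmp/docs/guide/intro.html"  # hypothetical page under that root

# Same computation as Parser.relative_path: relative_to() preserves the
# subdirectory structure, and the split drops everything from the first
# "." onward, i.e. the .html extension.
relative_path = str(Path(filepath).relative_to(Path(root_dir))).split(".")[0]
print(relative_path)  # guide/intro

# build_url() then joins the base URL, the relative path, and a section anchor:
base_url = "https://example.com/docs/"   # hypothetical base URL
print(base_url + relative_path + "#section-1")
# https://example.com/docs/guide/intro#section-1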
