
Commit

fix relative paths in parser (#125)
* fix relative paths in parser
jerpint authored Aug 17, 2023
1 parent e0fbbd6 commit 24acaac
Showing 4 changed files with 29 additions and 22 deletions.
buster/docparser.py (17 changes: 9 additions & 8 deletions)

@@ -8,6 +8,7 @@
 import pandas as pd
 from bs4 import BeautifulSoup
 from openai.embeddings_utils import get_embedding
+from tqdm import tqdm

 from buster.parser import HuggingfaceParser, Parser, SphinxParser
@@ -49,7 +50,8 @@


 def get_document(
-    filepath: str,
+    root_dir: str,
+    file: str,
     base_url: str,
     parser_cls: Type[Parser],
     min_section_length: int = 100,
@@ -60,12 +62,12 @@ def get_document(
     Sections are broken into subsections if they are longer than `max_section_length`.
     Sections correspond to `section` HTML tags that have a headerlink attached.
     """
+    filepath = os.path.join(root_dir, file)
     with open(filepath, "r") as f:
         source = f.read()

-    filename = Path(filepath).name
     soup = BeautifulSoup(source, "html.parser")
-    parser = parser_cls(soup, base_url, filename, min_section_length, max_section_length)
+    parser = parser_cls(soup, base_url, root_dir, filepath, min_section_length, max_section_length)

     sections = []
     urls = []
@@ -95,13 +97,12 @@ def get_all_documents(
     files = glob.glob("**/*.html", root_dir=root_dir, recursive=True)

     dfs = []
-    for file in files:
+    for file in tqdm(files):
         try:
-            filepath = os.path.join(root_dir, file)
-            df = get_document(filepath, base_url, parser_cls, min_section_length, max_section_length)
+            df = get_document(root_dir, file, base_url, parser_cls, min_section_length, max_section_length)
             dfs.append(df)
-        except:
-            print(f"Skipping {filepath}...")
+        except Exception as e:
+            print(f"Skipping {file} due to the following error: {e}")
             continue

     documents_df = pd.concat(dfs, ignore_index=True)
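For context, a minimal sketch of calling the updated get_document with hypothetical paths (root_dir and file are now passed separately, so each page's location relative to the documentation root is preserved instead of being reduced to a bare filename):

from buster.docparser import get_document
from buster.parser import HuggingfaceParser

# Hypothetical paths: `file` is relative to `root_dir`, and get_document
# now joins the two itself rather than receiving a pre-joined filepath.
df = get_document(
    root_dir="/tmp/docs",
    file="guide/intro.html",
    base_url="https://example.com/docs/",
    parser_cls=HuggingfaceParser,
)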
buster/examples/generate_embeddings.py (2 changes: 1 addition & 1 deletion)

@@ -11,4 +11,4 @@
 dm = DeepLakeDocumentsManager(vector_store_path="deeplake_store", overwrite=True, required_columns=REQUIRED_COLUMNS)

 # Generate the embeddings for our documents and store them to the deeplake store
-dm.add(df, csv_checkpoint="embeddings.csv")
+dm.add(df, csv_filename="embeddings.csv")
buster/examples/gradio_app.py (2 changes: 1 addition & 1 deletion)

@@ -49,7 +49,7 @@ def chat(history):
         yield history, completion


-block = gr.Blocks(css="#chatbot .overflow-y-auto{height:500px}")
+block = gr.Blocks()

 with block:
     with gr.Row():
buster/parser.py (30 changes: 18 additions & 12 deletions)

@@ -3,6 +3,7 @@
 from abc import ABC, abstractmethod
 from dataclasses import InitVar, dataclass, field
 from itertools import takewhile, zip_longest
+from pathlib import Path
 from typing import Iterator

 import bs4
@@ -73,13 +74,25 @@ def get_chunks(self, min_length: int, max_length: int) -> Iterator["Section"]:
 class Parser(ABC):
     soup: BeautifulSoup
     base_url: str
-    filename: str
+    root_dir: str
+    filepath: str
     min_section_length: int = 100
     max_section_length: int = 2000

-    @abstractmethod
+    @property
+    def relative_path(self) -> str:
+        """Gets the relative path of the file to the root dir.
+        This is particularly useful for websites with pages, subdomains, etc.
+        The split is to remove the .html extension
+        """
+        parent = Path(self.root_dir)
+        son = Path(self.filepath)
+        self._relative_path = str(son.relative_to(parent)).split(".")[0]
+        return self._relative_path
+
     def build_url(self, suffix: str) -> str:
-        ...
+        return self.base_url + self.relative_path + suffix

     @abstractmethod
     def find_sections(self) -> Iterator[Section]:
@@ -111,9 +124,6 @@ def find_sections(self) -> Iterator[Section]:
             yield section
         return

-    def build_url(self, suffix: str) -> str:
-        return self.base_url + self.filename + suffix
-

 class HuggingfaceParser(Parser):
     def find_sections(self) -> Iterator[Section]:
@@ -122,12 +132,8 @@ def find_sections(self) -> Iterator[Section]:
             href = section.find("a", href=True, class_="header-link")
             nodes = list(takewhile(lambda sibling: sibling != next_section, section.find_next_siblings()))

-            url = self.build_url(href["href"].strip().replace("\n", ""))
+            suffix = href["href"].strip().replace("\n", "")
+            url = self.build_url(suffix)
             name = section.text.strip().replace("\n", "")
             yield Section(url, name, nodes)

         return
-
-    def build_url(self, suffix: str) -> str:
-        # The splitext is to remove the .html extension
-        return self.base_url + os.path.splitext(self.filename)[0] + suffix
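To see what the new relative_path property computes, here is a small self-contained sketch of the same pathlib logic with hypothetical paths. The old code kept only Path(filepath).name, so pages in subdirectories lost their directory prefix when URLs were built; relative_to() keeps it:

from pathlib import Path

root_dir = "/tmp/docs"                   # hypothetical docs root
filepath = "/tmp/docs/guide/intro.html"  # hypothetical page under that root

# Same computation as Parser.relative_path: relative_to() preserves the
# subdirectory structure, and the split drops everything from the first
# "." onward, i.e. the .html extension.
relative_path = str(Path(filepath).relative_to(Path(root_dir))).split(".")[0]
print(relative_path)  # guide/intro

# build_url() then joins the base URL, the relative path, and a section anchor:
base_url = "https://example.com/docs/"   # hypothetical base URL
print(base_url + relative_path + "#section-1")
# https://example.com/docs/guide/intro#section-1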
