diff --git a/metaphor/notion/README.md b/metaphor/notion/README.md index dc625ea9..223b06d3 100644 --- a/metaphor/notion/README.md +++ b/metaphor/notion/README.md @@ -35,7 +35,7 @@ azure_openAI_version: # "2023-12-01-preview" azure_openAI_model_name: # "Embedding_ada002" azure_openAI_model: # "text-embedding-ada-002" -notion_api_version: # "2022-06-08" +notion_api_version: # "2022-06-28" include_text: # False ``` diff --git a/metaphor/notion/config.py b/metaphor/notion/config.py index 4ab16855..61fe6a2f 100644 --- a/metaphor/notion/config.py +++ b/metaphor/notion/config.py @@ -22,4 +22,4 @@ class NotionRunConfig(BaseConfig): include_text: bool = False # Notion API version - notion_api_version: str = "2022-06-08" + notion_api_version: str = "2022-06-28" diff --git a/metaphor/static_web/README.md b/metaphor/static_web/README.md new file mode 100644 index 00000000..5cdda983 --- /dev/null +++ b/metaphor/static_web/README.md @@ -0,0 +1,31 @@ +# Static Webpage Connector + +## Setup + +## Config File + +Create a YAML config file based on the following template. + +`depth = 1` corresponds to scraping the specified page and its subpages only. Higher configured depths will recursively perform the same action on subpages `n` times. + +### Required Configurations + +```yaml +output: + file: + directory: +``` + +### Optional Configurations + +## Testing + +Follow the [Installation](../../README.md) instructions to install `metaphor-connectors` in your environment (or virtualenv). Make sure to include either `all` or `static_web` extra. + +To test the connector locally, change the config file to output to a local path and run the following command + +```shell +metaphor static_web +``` + +Manually verify the output after the command finishes. diff --git a/metaphor/static_web/__init__.py b/metaphor/static_web/__init__.py new file mode 100644 index 00000000..b4fe4bdc --- /dev/null +++ b/metaphor/static_web/__init__.py @@ -0,0 +1,6 @@ +from metaphor.common.cli import cli_main +from metaphor.static_web.extractor import StaticWebExtractor + + +def main(config_file: str): + cli_main(StaticWebExtractor, config_file) diff --git a/metaphor/static_web/config.py b/metaphor/static_web/config.py new file mode 100644 index 00000000..881e0b86 --- /dev/null +++ b/metaphor/static_web/config.py @@ -0,0 +1,25 @@ +from pydantic.dataclasses import dataclass + +from metaphor.common.base_config import BaseConfig +from metaphor.common.dataclass import ConnectorConfig + + +@dataclass(config=ConnectorConfig) +class StaticWebRunConfig(BaseConfig): + # Top-level URLs to scrape content from + links: list + + # Configurable scraping depth + depths: list + + # Azure OpenAI services configs + azure_openAI_key: str + azure_openAI_endpoint: str + + # Default Azure OpenAI services configs + azure_openAI_version: str = "2023-12-01-preview" + azure_openAI_model: str = "text-embedding-ada-002" + azure_openAI_model_name: str = "Embedding_ada002" + + # Store the document's content alongside embeddings + include_text: bool = False diff --git a/metaphor/static_web/extractor.py b/metaphor/static_web/extractor.py new file mode 100644 index 00000000..e4a6c8c3 --- /dev/null +++ b/metaphor/static_web/extractor.py @@ -0,0 +1,241 @@ +import datetime +from typing import Collection, List, Tuple +from urllib.parse import urljoin, urlparse + +import requests +from bs4 import BeautifulSoup +from bs4.element import Comment +from llama_index import Document +from requests.exceptions import HTTPError, RequestException + +from metaphor.common.base_extractor 
import BaseExtractor +from metaphor.common.embeddings import embed_documents, map_metadata, sanitize_text +from metaphor.common.logger import get_logger +from metaphor.common.utils import md5_digest +from metaphor.models.crawler_run_metadata import Platform +from metaphor.static_web.config import StaticWebRunConfig + +logger = get_logger() + +embedding_chunk_size = 512 +embedding_overlap_size = 50 + + +class StaticWebExtractor(BaseExtractor): + """Static webpage extractor.""" + + _description = "Crawls webpages and and extracts documents & embeddings." + _platform = Platform.UNKNOWN + + @staticmethod + def from_config_file(config_file: str) -> "StaticWebExtractor": + return StaticWebExtractor(StaticWebRunConfig.from_yaml_file(config_file)) + + def __init__(self, config: StaticWebRunConfig): + super().__init__(config=config) + + self.target_URLs = config.links + self.target_depths = config.depths + + self.azure_openAI_key = config.azure_openAI_key + self.azure_openAI_version = config.azure_openAI_version + self.azure_openAI_endpoint = config.azure_openAI_endpoint + self.azure_openAI_model = config.azure_openAI_model + self.azure_openAI_model_name = config.azure_openAI_model_name + + self.include_text = config.include_text + + async def extract(self) -> Collection[dict]: + logger.info("Scraping provided URLs") + self.docs = list() # type: List[Document] + self.visited_pages = set() # type: set + + for page, depth in zip(self.target_URLs, self.target_depths): + logger.info(f"Processing {page} with depth {depth}") + self.current_parent_page = page + + # Fetch target content + success, content = self._check_page_make_document(page) + + if success: + logger.info(f"Done with parent page {page}") + if depth: # recursive subpage processing + await self._process_subpages(page, content, depth) + + # Embedding process + logger.info("Starting embedding process") + vector_store_index = embed_documents( + self.docs, + self.azure_openAI_key, + self.azure_openAI_version, + self.azure_openAI_endpoint, + self.azure_openAI_model, + self.azure_openAI_model_name, + embedding_chunk_size, + embedding_overlap_size, + ) + + embedded_nodes = map_metadata( + vector_store_index, include_text=self.include_text + ) + + return embedded_nodes + + async def _process_subpages( + self, + parent_URL: str, + parent_content: str, + target_depth: int, + current_depth: int = 1, + ) -> None: + logger.info(f"Processing subpages of {parent_URL}") + subpages = self._get_subpages_from_HTML(parent_content, parent_URL) + + if current_depth > target_depth: # on recursion depth reached + return + + for subpage in subpages: + if subpage in self.visited_pages: + continue + + logger.info(f"Processing subpage {subpage} of parent {parent_URL}") + success, content = self._check_page_make_document(subpage) + + if success: + logger.info(f"Done with subpage {subpage}") + await self._process_subpages( + subpage, content, target_depth, current_depth + 1 + ) + + def _check_page_make_document(self, page: str) -> Tuple[bool, str]: + """ + Gets a page's HTML and adds to the visited pages set. + If page has valid content, extracts the text and title and generates + a Document object for the page. + + Returns a bool and the page_content: + out[0]: False if the page content is invalid, True otherwise. 
+ out[1]: "" if the page content is invalid, page_content otherwise + """ + + page_content = self._get_page_HTML(page) + self.visited_pages.add(page) + + if page_content == "ERROR IN PAGE RETRIEVAL": + return (False, "") + else: + page_text = self._get_text_from_HTML(page_content) + page_title = self._get_title_from_HTML(page_content) + + page_doc = self._make_document(page, page_title, page_text) + self.docs.append(page_doc) + + return (True, page_content) + + def _get_page_HTML(self, input_URL: str) -> str: + """ + Fetches a webpage's content, returning an error message on failure. + """ + try: + r = requests.get(input_URL, timeout=5) + r.raise_for_status() + return r.text + except (HTTPError, RequestException) as e: + logger.warning(f"Error in retrieving {input_URL}, error {e}") + return "ERROR IN PAGE RETRIEVAL" + + def _get_subpages_from_HTML(self, html_content: str, input_URL: str) -> List[str]: + """ + Extracts and returns a list of subpage URLs from a given page's HTML and URL. + Subpage URLs are reconstructed to be absolute URLs and anchor links are trimmed. + """ + # Retrieve input page + + soup = BeautifulSoup(html_content, "lxml") + links = soup.find_all("a", href=True) + + # Parse the domain of the input URL + input_domain = urlparse(self.current_parent_page).netloc + subpages = [input_URL] + + # Find eligible links + for link in links: + href = link["href"] + full_url = urljoin(input_URL, href) + + # Check if the domain of the full URL matches the input domain + if urlparse(full_url).netloc == input_domain: + # Remove any query parameters or fragments + full_url = urljoin(full_url, urlparse(full_url).path) + if full_url not in subpages: + subpages.append(full_url) + + return subpages + + def _get_text_from_HTML(self, html_content: str) -> str: + """ + Extracts and returns visible text from given HTML content as a single string. + Designed to handle output from get_page_HTML. + """ + + def filter_visible(el): + if el.parent.name in [ + "style", + "script", + "head", + "title", + "meta", + "[document]", + ]: + return False + elif isinstance(el, Comment): + return False + else: + return True + + # Use bs4 to find visible text elements + soup = BeautifulSoup(html_content, "lxml") + visible_text = filter(filter_visible, soup.findAll(string=True)) + return "\n".join(t.strip() for t in visible_text) + + def _get_title_from_HTML(self, html_content: str) -> str: + """ + Extracts the title of a webpage given HTML content as a single string. + Designed to handle output from get_page_HTML. + """ + + soup = BeautifulSoup(html_content, "lxml") + title_tag = soup.find("title") + + if title_tag: + return title_tag.text + + else: + return "" + + def _make_document( + self, page_URL: str, page_title: str, page_text: str + ) -> Document: + """ + Constructs Document objects from webpage URLs + and their content, including extra metadata. + + Cleans text content and includes data like page title, + platform URL, page link, refresh timestamp, and page ID. + """ + netloc = urlparse(page_URL).netloc + current_time = str(datetime.datetime.utcnow()) + + doc = Document( + text=sanitize_text(page_text), + extra_info={ + "title": page_title, + "platform": netloc, + "link": page_URL, + "lastRefreshed": current_time, + # Create a pageId based on page_URL - is this necessary? 
+ "pageId": md5_digest(page_URL.encode()), + }, + ) + + return doc diff --git a/poetry.lock b/poetry.lock index 7be6b576..01c9c3f9 100644 --- a/poetry.lock +++ b/poetry.lock @@ -482,6 +482,27 @@ test = ["beautifulsoup4 (>=4.8.0)", "coverage (>=4.5.4)", "fixtures (>=3.0.0)", toml = ["tomli (>=1.1.0)"] yaml = ["PyYAML"] +[[package]] +name = "beautifulsoup4" +version = "4.12.3" +description = "Screen-scraping library" +optional = true +python-versions = ">=3.6.0" +files = [ + {file = "beautifulsoup4-4.12.3-py3-none-any.whl", hash = "sha256:b80878c9f40111313e55da8ba20bdba06d8fa3969fc68304167741bbf9e082ed"}, + {file = "beautifulsoup4-4.12.3.tar.gz", hash = "sha256:74e3d1928edc070d21748185c46e3fb33490f22f52a3addee9aee0f4f7781051"}, +] + +[package.dependencies] +soupsieve = ">1.2" + +[package.extras] +cchardet = ["cchardet"] +chardet = ["chardet"] +charset-normalizer = ["charset-normalizer"] +html5lib = ["html5lib"] +lxml = ["lxml"] + [[package]] name = "black" version = "23.12.1" @@ -530,17 +551,17 @@ uvloop = ["uvloop (>=0.15.2)"] [[package]] name = "boto3" -version = "1.34.39" +version = "1.34.40" description = "The AWS SDK for Python" optional = false python-versions = ">= 3.8" files = [ - {file = "boto3-1.34.39-py3-none-any.whl", hash = "sha256:476896e70d36c9134d4125834280c597c17b54bff4902baf2e5fcde74f8acec8"}, - {file = "boto3-1.34.39.tar.gz", hash = "sha256:35bcbecf1b5d3620c93f0062d2994177f8bda25a9d2cba144d6462793c16065b"}, + {file = "boto3-1.34.40-py3-none-any.whl", hash = "sha256:49eb215e4142d441e26eedaf5d0b43065200f0849d82c904bc9a62d1328016cd"}, + {file = "boto3-1.34.40.tar.gz", hash = "sha256:81d026ed8c8305b880c71f9f287f9b745b52bd358a91cfc133844c907db4d7ee"}, ] [package.dependencies] -botocore = ">=1.34.39,<1.35.0" +botocore = ">=1.34.40,<1.35.0" jmespath = ">=0.7.1,<2.0.0" s3transfer = ">=0.10.0,<0.11.0" @@ -549,13 +570,13 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] [[package]] name = "botocore" -version = "1.34.39" +version = "1.34.40" description = "Low-level, data-driven core of boto 3." optional = false python-versions = ">= 3.8" files = [ - {file = "botocore-1.34.39-py3-none-any.whl", hash = "sha256:e175360445424b83b0e28ae20d301b99cf44ff2c9d5ab1d8670899bec05a9753"}, - {file = "botocore-1.34.39.tar.gz", hash = "sha256:9f00bd5e4698bcdd37ce6e224a896baf58d209678ed92834944b767de9061cc5"}, + {file = "botocore-1.34.40-py3-none-any.whl", hash = "sha256:a3edd774653a61a1b211e4ea88cdb1c2655ffcc7660ba77b41a4027b097d145d"}, + {file = "botocore-1.34.40.tar.gz", hash = "sha256:cb794bdb5b3d41845749a182ec93cb1453560e52b97ae0ab43ace81deb011f6d"}, ] [package.dependencies] @@ -2403,6 +2424,111 @@ cattrs = {version = ">=1.3", markers = "python_version >= \"3.7\""} requests = ">=2.22" typing-extensions = ">=4.1.1" +[[package]] +name = "lxml" +version = "5.0.1" +description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API." 
+optional = true +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, != 3.4.*" +files = [ + {file = "lxml-5.0.1-cp27-cp27m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1d78e91cbffe733ff325e0d258bb64702c8d91f8f652db088bd2989c6a8f90cc"}, + {file = "lxml-5.0.1-cp27-cp27m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:19251f782ea51a4841e747158195312ef63e06b47889e159dc5f1b2e5d668465"}, + {file = "lxml-5.0.1-cp27-cp27m-win32.whl", hash = "sha256:8689c54483b1f16b577b8194a58fd6feab6b9d5699e297ffbc552acb0874dfe1"}, + {file = "lxml-5.0.1-cp27-cp27m-win_amd64.whl", hash = "sha256:099eacbfdda668eda3e7e0705eced115a2e9425bb66cfce41a79fef1821a319c"}, + {file = "lxml-5.0.1-cp27-cp27mu-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b6bb5a0a87ab1e01f086cbb418be9e409719cd216954aa38b1cceee36a561ce1"}, + {file = "lxml-5.0.1-cp27-cp27mu-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:4b49a1569ed6d05808f4d163a316e7bf4a230e0c36855b59f56020ae27ae586a"}, + {file = "lxml-5.0.1-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:dbff288e1869db78f8731ca257553dd699edef07e173b35e71b1122b630d6008"}, + {file = "lxml-5.0.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:0f70e5de6b3e24ababeca597f776e5f37973f05d28a4d9f467aa5b45745af762"}, + {file = "lxml-5.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:32a135d4ef8f966bc087d450d641df73fc6874f04cf6608111541b50090e6f13"}, + {file = "lxml-5.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:b4eef43c5dc5c579d0804e55a32dd1bacbd008c8191ed4d65be278bbb11ddc61"}, + {file = "lxml-5.0.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:7febf50135363e981097eeada84781eeae92bfc3c203495f63d6b542a7132ba7"}, + {file = "lxml-5.0.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:0a79eca2ef5e032c8ed9da07f84a07a29105f220b777613dfe7fc31445691ee3"}, + {file = "lxml-5.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8de180f748a17382dd695b3999be03a647d09af16ae589c4e9c37138ddb6d4c6"}, + {file = "lxml-5.0.1-cp310-cp310-win32.whl", hash = "sha256:6af86081c888ce81ca7e361ed7fa2ba1678e2a86eb5a059c96d5a719364d319e"}, + {file = "lxml-5.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:0dc36ec06514fe8848c4733d47f96a0636f82d9ca3eaa2132373426bc03f178f"}, + {file = "lxml-5.0.1-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:f56e6a38c64a79e228d48344bb3bec265ac035fc1277ce8c049445bb18e4cd41"}, + {file = "lxml-5.0.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:4d58af4ebd711dad40f1c024a779950d9918d04d74f49230edf5d271dcd33c28"}, + {file = "lxml-5.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:00bfccab28f710bb13f7f644c980ad50ce3e5b6a412b5bb9a6c30136b298fb2c"}, + {file = "lxml-5.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:45e747b4e82390ffd802528b9e7b39333c1ce71423057bf5961b30ec0d52f76f"}, + {file = "lxml-5.0.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:03c977ffc9a4bf17b3e0f8db0451dc38e9f4ec92cfdb5df462d38fbe6e6e0825"}, + {file = "lxml-5.0.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:d0047c90e0ebd0d8f3c1e6636e10f597b8f25e4ef9e6416dd2e5c4c0960270cc"}, + {file = "lxml-5.0.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ff29353c12f0abc9cb3395899b7192a970d5a63f80ac1e7f0c3247ed83f5dcd4"}, + {file = 
"lxml-5.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:2ec9fa65e0638551a5ad31cb9fa160b321f19632e5ec517fe68d7b4110133e69"}, + {file = "lxml-5.0.1-cp311-cp311-win32.whl", hash = "sha256:9a4eff4d8ad0bbc9f470a9be19c5e718af4baf47111d7c2d9b036b9986107e7c"}, + {file = "lxml-5.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:3714f339449d2738b4fadd078a6022704a2af3cab06bec9627b19eaa4205090d"}, + {file = "lxml-5.0.1-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:473f2d0511dd84697326ee9362b0c0c2e9f99a433dcb1fbb5aa8df3d1b2185db"}, + {file = "lxml-5.0.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:62a3c0fdf70f785cd29824666d1dcea88c207f0b73ddbc28fb7a6a1a5bbb1af7"}, + {file = "lxml-5.0.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:83ff41e1bc0666f31acda52407d869ea257f232c2d9394806647a0e7454de73e"}, + {file = "lxml-5.0.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:42069ce16dab9755b381b90defd846ca407b9ead05dc20732fd5502b5cc49b87"}, + {file = "lxml-5.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:737a4dba9f7ee842c6597f719dda2eabeeaefe42443f7f24de20c262f88527cd"}, + {file = "lxml-5.0.1-cp312-cp312-win32.whl", hash = "sha256:67ddff0272905a0b78a2c3ea01487e0551cc38094cd5994f73af370f355ecb47"}, + {file = "lxml-5.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:a28eab2d9ea79b830be50e3350d827ae8abf5b23e278e14929824d5ab2069008"}, + {file = "lxml-5.0.1-cp35-cp35m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:3013823b0069cb4bd9b558e87076a18142703c6d2ac3a5d5521dd35734d23a72"}, + {file = "lxml-5.0.1-cp35-cp35m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:b005101a257c494e84d36ecb62b02ba195b02b7f8892f57b1f5aaa352ed44467"}, + {file = "lxml-5.0.1-cp36-cp36m-macosx_11_0_x86_64.whl", hash = "sha256:f9464ff2fd1f2ba4d0d246a560d369ee7e7213c556a30db9ae9426850ce1baf9"}, + {file = "lxml-5.0.1-cp36-cp36m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:9e8a4782ecaaacf8e9355f39179b1f00e7f27b774306eccbe80f6215724be4cd"}, + {file = "lxml-5.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dcb25128c9e7f01c00ad116d2c762c3942724981a35c6e5c551ab55d4c2bfcfe"}, + {file = "lxml-5.0.1-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:39a3cf581e9040e66aaa719b2f338b2b7eb43ce1db059089c82ae72e0fd48b47"}, + {file = "lxml-5.0.1-cp36-cp36m-manylinux_2_28_x86_64.whl", hash = "sha256:6cdd0fb749c80ffcf408f659b209e82333f10b517786083a3dd3c3b5adc60111"}, + {file = "lxml-5.0.1-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:cfdac95815d3025e4c9edce2ef2ebe4e034edc35a2c44a606dd846554387ae38"}, + {file = "lxml-5.0.1-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:c21e60c017cab3a7e7786185cc8853b8614b01ccd69cc8b24608e5356784631b"}, + {file = "lxml-5.0.1-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:5892cff0a6817743fe470b7402677310ffc8514a74de14d4e591cecdc457ff61"}, + {file = "lxml-5.0.1-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:57f5c362cbd9d688fb1fa07c8955cec26d5c066fbcb4163aa341ff751eba7587"}, + {file = "lxml-5.0.1-cp36-cp36m-win32.whl", hash = "sha256:8a70c47c14f89b8bfb430f85b608aa460204fe7c005545d79afd31b925cc6669"}, + {file = "lxml-5.0.1-cp36-cp36m-win_amd64.whl", hash = "sha256:8ba56c3686fa60cc04191d22d1733aad484c9cbc153cdc3e8eb9bdfcad30f704"}, + {file = "lxml-5.0.1-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = 
"sha256:c4eaa83b610595ef9f20aa69b96053d5b7f3f70c67c7a3af8f433136a9d315ab"}, + {file = "lxml-5.0.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:ae3a0ec0f1b6cf1e8bca41bc86cd64ba02e31c71716efbf149a0f7ebc168cf0b"}, + {file = "lxml-5.0.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:c5b23f63fcec652bf1e775eca5e03a713a4994d2a7ce2e70a91e964a26220e0d"}, + {file = "lxml-5.0.1-cp37-cp37m-manylinux_2_28_x86_64.whl", hash = "sha256:d05cf827f160340f67c25ce6f271689a844605aa123849f1a80e21c9bd21f00b"}, + {file = "lxml-5.0.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:bcded868b05583d41ab0b024f39f90a04e486a2349a9b493d8d17024f1433aa6"}, + {file = "lxml-5.0.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:821fb791565536617b058c2cd5b8d28a1285b3375eecc5bd6b5c6d87add0d3dd"}, + {file = "lxml-5.0.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:1546fa25a6dde5698271e19787062446f433c98ca7eab35545f665dde2c1bd34"}, + {file = "lxml-5.0.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:31362b900b8fd2a026b9d24695ebe5449ea8f0c948af2445d7880b738d9fc368"}, + {file = "lxml-5.0.1-cp37-cp37m-win32.whl", hash = "sha256:0963de4fe463caff48e6ce4d83d19a3c099126874185d10cf490c29057ca518d"}, + {file = "lxml-5.0.1-cp37-cp37m-win_amd64.whl", hash = "sha256:b7bb0010c2969c23bf3d2b3892e638a7cb83e7daeb749e3db5f3c002fd191e11"}, + {file = "lxml-5.0.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:e2bd8faf6a9682ca385e4bca1a38a057be716dc303f16ddec9e4c9cf01b7d722"}, + {file = "lxml-5.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:0fb55d77e685def5218701d5d296fca62f595752c88402404da685864b06b67e"}, + {file = "lxml-5.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:73e71b95c5215310a92e560369ac1f0e2cd018d5a36be182da88958f3d6084f5"}, + {file = "lxml-5.0.1-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:0ecc0f1e1d901b66f2f68edff85b8ff421fa9683d02eaea6742a42c145d741b6"}, + {file = "lxml-5.0.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f36c3103a6f2641777b64f1da860f37cbaa718ce3feea621583627269e68eb03"}, + {file = "lxml-5.0.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:dcc7dc4b9b65f185aa8533abc78f0a3b2ac38fe075bb23d3c1590ba0990f6c80"}, + {file = "lxml-5.0.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:1215c8e57a25ad68488abb83a36734f6c6b3f0ccd880f0c68da98682d463ef09"}, + {file = "lxml-5.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:070469a23f2ef3a9e72165af7e0b12eca9a6e47c3a8ec1cad60d14fb2f2c3aa8"}, + {file = "lxml-5.0.1-cp38-cp38-win32.whl", hash = "sha256:b889c0b9be774466308c3590e093ce9a6f9115c78fc8624aa6ba8dfeaf5188ab"}, + {file = "lxml-5.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:0499310df9afc0ce634ce14cacbb333d62f561038ea4db640494e4a22ac4f2e9"}, + {file = "lxml-5.0.1-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:18b456f1bfb338be94a916166ac5507e73b9eb9f6e1f0fbd1c8caff2f3fa5535"}, + {file = "lxml-5.0.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:60974986aa80a8bb883da5663f90bc632bd4ce0d0508e66a9132412facec65f6"}, + {file = "lxml-5.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:d6d3ce5638cd4ed3fa684507f164e7039e1b07475bc8f37ba6e12271c1a2e9e0"}, + {file = 
"lxml-5.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:e97c74725e86d84a477df081eef69b70f048128afee841dbd8c690a9e3d2e8e0"}, + {file = "lxml-5.0.1-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:387c5416b8bb4b8ad7306797cb2719a841f5f3836b3c39fcaa56b9af5448dd2a"}, + {file = "lxml-5.0.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:feb1102d9e5de08120d46a1011110c43b2547ecb3ae80030801e0e2dacd1ee18"}, + {file = "lxml-5.0.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:745383c124f096cc03fb942c8a05ea1e8cb4f44c5b28887adce6224e4540808e"}, + {file = "lxml-5.0.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:4ae66b6b0f82e7839b6b8d009182c652d48e7d2ea21a6709f3033ce5fbf199c4"}, + {file = "lxml-5.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:430780608d16b3bb96ef99a67a1a0626c8a295193af53ce9c4d5ec3fef2fbc79"}, + {file = "lxml-5.0.1-cp39-cp39-win32.whl", hash = "sha256:331237209fe76951450c1119af0879f04f32d1b07b21e83a34ba439520492feb"}, + {file = "lxml-5.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:c32a4fbae218b336d547bc626897325202e4e9f1794ac5f4d0bb3caacf41df21"}, + {file = "lxml-5.0.1-pp310-pypy310_pp73-macosx_11_0_x86_64.whl", hash = "sha256:96c2abcbcb24f850f00f279c4660ce6498cae16ff1659845838d752c26890748"}, + {file = "lxml-5.0.1-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:99b5ca5775435aa296d32ea711e194aaae871b21dbf0d57cb7d4431e5d3ad699"}, + {file = "lxml-5.0.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:9a99dae826c80cf0a21dd21eb66db16310d1f58ac4c27c804da980212fbcabe2"}, + {file = "lxml-5.0.1-pp38-pypy38_pp73-macosx_11_0_x86_64.whl", hash = "sha256:4df6be79be4d7574e9e4002aeb6aa03d3f3809582db07abb166df7fc6e7438af"}, + {file = "lxml-5.0.1-pp38-pypy38_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:49dc4dcf14a08f160bb3f5f571f63514d918b8933a25c221536571a5fc010271"}, + {file = "lxml-5.0.1-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:47f46a2ebade07f3afa47695882e7725440c49bf77cba39c3762a42597e5aad3"}, + {file = "lxml-5.0.1-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:18caa0d3925902949cb060fe5f2da70c953d60bd9ef78657bd389f6db30533cc"}, + {file = "lxml-5.0.1-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:8991837fdf8086486f1c300d936bacd757e2e5398be84cd54a1fba0e6b6d5591"}, + {file = "lxml-5.0.1-pp39-pypy39_pp73-macosx_11_0_x86_64.whl", hash = "sha256:d64e543b07964ff73b4eb994bee894803a80e19fd3b29a5ffbe3c637fe43e788"}, + {file = "lxml-5.0.1-pp39-pypy39_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:d80d9f4d986bb6ad65bae86f07391152f7b6c65cfc63d118616b18b0be2e79da"}, + {file = "lxml-5.0.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:ea5e4b3eff9029a02fe7736540675ab8fca44277232f0027397b0d7111d04b1c"}, + {file = "lxml-5.0.1-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:e2388a792f9c239510d62a9e615662b8202e4ca275aafcc9c4af154654462a14"}, + {file = "lxml-5.0.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:3ffc56d68b9782919d69ae9a6fac99efd7547f2666ccb7ecfd12871564d16133"}, + {file = "lxml-5.0.1.tar.gz", hash = "sha256:4432a1d89a9b340bc6bd1201aef3ba03112f151d3f340d9218247dc0c85028ab"}, +] + +[package.extras] +cssselect = ["cssselect (>=0.7)"] +html5 = ["html5lib"] +htmlsoup = ["BeautifulSoup4"] +source = ["Cython (>=3.0.7)"] + [[package]] name = "lz4" version = "4.3.3" @@ 
-4767,6 +4893,17 @@ files = [ {file = "sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88"}, ] +[[package]] +name = "soupsieve" +version = "2.5" +description = "A modern CSS selector implementation for Beautiful Soup." +optional = true +python-versions = ">=3.8" +files = [ + {file = "soupsieve-2.5-py3-none-any.whl", hash = "sha256:eaa337ff55a1579b6549dc679565eac1e3d000563bcb1c8ab0d0fefbc0c2cdc7"}, + {file = "soupsieve-2.5.tar.gz", hash = "sha256:5663d5a7b3bfaeee0bc4372e7fc48f9cff4940b3eec54a6451cc5299f1097690"}, +] + [[package]] name = "sql-metadata" version = "2.10.0" @@ -5544,7 +5681,7 @@ docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.link testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy (>=0.9.1)", "pytest-ruff"] [extras] -all = ["GitPython", "SQLAlchemy", "asyncpg", "avro", "azure-identity", "azure-mgmt-datafactory", "confluent-kafka", "databricks-sdk", "databricks-sql-connector", "fastavro", "google-cloud-bigquery", "google-cloud-logging", "gql", "grpcio-tools", "lkml", "llama-hub", "looker-sdk", "more-itertools", "msal", "msgraph-beta-sdk", "parse", "pycarlo", "pyhive", "pymssql", "pymysql", "sasl", "snowflake-connector-python", "sql-metadata", "sqllineage", "tableauserverclient", "thoughtspot_rest_api_v1", "thrift", "thrift-sasl", "trino"] +all = ["GitPython", "SQLAlchemy", "asyncpg", "avro", "azure-identity", "azure-mgmt-datafactory", "beautifulsoup4", "confluent-kafka", "databricks-sdk", "databricks-sql-connector", "fastavro", "google-cloud-bigquery", "google-cloud-logging", "gql", "grpcio-tools", "lkml", "llama-hub", "llama-index", "looker-sdk", "lxml", "more-itertools", "msal", "msgraph-beta-sdk", "parse", "pycarlo", "pyhive", "pymssql", "pymysql", "sasl", "snowflake-connector-python", "sql-metadata", "sqllineage", "tableauserverclient", "thoughtspot_rest_api_v1", "thrift", "thrift-sasl", "trino"] bigquery = ["google-cloud-bigquery", "google-cloud-logging", "sql-metadata"] datafactory = ["azure-identity", "azure-mgmt-datafactory"] datahub = ["gql"] @@ -5556,12 +5693,13 @@ metabase = ["sql-metadata"] monte-carlo = ["pycarlo"] mssql = ["pymssql"] mysql = ["SQLAlchemy", "pymysql"] -notion = ["llama-hub"] +notion = ["llama-hub", "llama-index"] postgresql = ["asyncpg"] power-bi = ["msal", "msgraph-beta-sdk", "sql-metadata"] redshift = ["asyncpg", "sqllineage"] s3 = ["fastavro", "more-itertools", "parse"] snowflake = ["snowflake-connector-python", "sql-metadata"] +static-web = ["beautifulsoup4", "llama-hub", "llama-index", "lxml"] synapse = ["pymssql"] tableau = ["sqllineage", "tableauserverclient"] throughtspot = ["thoughtspot_rest_api_v1"] @@ -5571,4 +5709,4 @@ unity-catalog = ["databricks-sdk", "databricks-sql-connector"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.12" -content-hash = "049550a1e52650775b228975d8b6d00c54183c53c0dfdf682edfef46319ae291" +content-hash = "a724f91f53a69aacaca10721789b90211a93800bd21d681d8e460a79b3e4a735" diff --git a/pyproject.toml b/pyproject.toml index 6905fac3..d570987b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "metaphor-connectors" -version = "0.13.123" +version = "0.13.124" license = "Apache-2.0" description = "A collection of Python-based 'connectors' that extract metadata from various sources to ingest into the Metaphor app." 
authors = ["Metaphor "] @@ -20,6 +20,7 @@ avro = { version = "^1.11.3", optional = true } aws-assume-role-lib = "^2.10.0" azure-identity = { version = "^1.14.0", optional = true } azure-mgmt-datafactory = { version = "^3.1.0", optional = true } +beautifulsoup4 = { version = "^4.12.3", optional = true } boto3 = "^1.28.57" botocore = "^1.31.58" canonicaljson = "^2.0.0" @@ -35,7 +36,9 @@ grpcio-tools = { version = "^1.59.3", optional = true } jsonschema = "^4.18.6" lkml = { version = "^1.3.1", optional = true } llama-hub = {version = "0.0.67", optional = true } +llama-index = {version = "0.9.48", optional = true} looker-sdk = { version = "^23.6.0", optional = true } +lxml = { version = "~=5.0.0", optional = true } metaphor-models = "0.30.17" more-itertools = { version = "^10.1.0", optional = true } msal = { version = "^1.20.0", optional = true } @@ -71,6 +74,7 @@ all = [ "avro", "azure-identity", "azure-mgmt-datafactory", + "beautifulsoup4", "confluent-kafka", "databricks-sdk", "databricks-sql-connector", @@ -82,10 +86,12 @@ all = [ "grpcio-tools", "lkml", "looker-sdk", + "llama-hub", + "llama-index", + "lxml", "more-itertools", "msal", "msgraph-beta-sdk", - "llama-hub", "parse", "pycarlo", "pyhive", @@ -114,12 +120,13 @@ metabase = ["sql-metadata"] monte_carlo = ["pycarlo"] mssql = ["pymssql"] mysql = ["pymysql", "SQLAlchemy"] -notion = ["llama-hub"] +notion = ["llama-hub", "llama-index"] postgresql = ["asyncpg"] power_bi = ["msal", "msgraph-beta-sdk", "sql-metadata"] redshift = ["asyncpg", "sqllineage"] s3 = ["fastavro", "more-itertools", "parse"] snowflake = ["snowflake-connector-python", "sql-metadata"] +static_web = ["beautifulsoup4", "llama-hub", "llama-index", "lxml"] synapse = ["pymssql"] tableau = ["tableauserverclient", "sqllineage"] throughtspot = ["thoughtspot-rest-api-v1"] diff --git a/tests/static_web/__init__.py b/tests/static_web/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/static_web/config.yml b/tests/static_web/config.yml new file mode 100644 index 00000000..99f13cd3 --- /dev/null +++ b/tests/static_web/config.yml @@ -0,0 +1,12 @@ +--- +links: + - https://metaphor.io/ +depths: + - 1 +azure_openAI_key: azure_openAI_key +azure_openAI_version: azure_openAI_version +azure_openAI_endpoint: azure_openAI_endpoint +azure_openAI_model: text-embedding-ada-002 +azure_openAI_model_name: EmbeddingModel +include_text: true +output: {} diff --git a/tests/static_web/expected.json b/tests/static_web/expected.json new file mode 100644 index 00000000..57de2757 --- /dev/null +++ b/tests/static_web/expected.json @@ -0,0 +1,24 @@ +[ + { + "externalSearchDocument": { + "entityId": "EXTERNAL_DOCUMENT~ABCD1234", + "documentId": "EXTERNAL_DOCUMENT~ABCD1234", + "embedding_1": [ + 0.1, + 0.2, + 0.3, + 0.4 + ], + "pageId": "e19d5cd5af0378da05f63f891c7467af", + "lastRefreshed": "2024-02-05 00:00:00.000000", + "metadata": { + "title": "Hello World!", + "platform": "example.com", + "pageId": "e19d5cd5af0378da05f63f891c7467af", + "link": "https://example.com", + "lastRefreshed": "2024-02-05 00:00:00.000000" + }, + "embeddedString_1": "Title: Hello World!\nHello World!" + } + } +] diff --git a/tests/static_web/sample_pages/main.html b/tests/static_web/sample_pages/main.html new file mode 100644 index 00000000..36e9e97e --- /dev/null +++ b/tests/static_web/sample_pages/main.html @@ -0,0 +1,10 @@ + + + Main Page + + +

Main Page
+ Go to Subpage 1 (p1)
+ Go to Subpage 2 (p2)
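main.html links to the page1 and page2 fixtures, and `_get_subpages_from_HTML` keeps only such same-domain links. A standalone sketch of that link handling — resolve relative hrefs, keep links on the parent page's domain, strip queries and fragments — using illustrative URLs rather than the fixture contents:

```python
from urllib.parse import urljoin, urlparse

parent = "https://example.com/main"
hrefs = ["page1.html", "page2.html?ref=nav", "page1.html#top", "https://other.site/x"]

subpages = [parent]
for href in hrefs:
    full_url = urljoin(parent, href)                           # make relative links absolute
    if urlparse(full_url).netloc == urlparse(parent).netloc:   # keep same-domain links only
        full_url = urljoin(full_url, urlparse(full_url).path)  # drop query params and fragments
        if full_url not in subpages:
            subpages.append(full_url)

print(subpages)
# ['https://example.com/main', 'https://example.com/page1.html', 'https://example.com/page2.html']
```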

+ + diff --git a/tests/static_web/sample_pages/page1.html b/tests/static_web/sample_pages/page1.html new file mode 100644 index 00000000..ebbccae3 --- /dev/null +++ b/tests/static_web/sample_pages/page1.html @@ -0,0 +1,9 @@ + + + Subpage 1 + + +

Subpage 1
+ Back to Main Page

+ + diff --git a/tests/static_web/sample_pages/page2.html b/tests/static_web/sample_pages/page2.html new file mode 100644 index 00000000..c66ebd1b --- /dev/null +++ b/tests/static_web/sample_pages/page2.html @@ -0,0 +1,10 @@ + + + Subpage 2 + + +

Subpage 2
+ Go to Child Page 1 (p3)
+ Go to Child Page 2 (p4)

+ + diff --git a/tests/static_web/sample_pages/page3.html b/tests/static_web/sample_pages/page3.html new file mode 100644 index 00000000..dc092687 --- /dev/null +++ b/tests/static_web/sample_pages/page3.html @@ -0,0 +1,8 @@ + + + Child Page 1 + + +

Child Page 1 (p3)

+ + diff --git a/tests/static_web/sample_pages/page4.html b/tests/static_web/sample_pages/page4.html new file mode 100644 index 00000000..97bfc9dd --- /dev/null +++ b/tests/static_web/sample_pages/page4.html @@ -0,0 +1,8 @@ + + + Child Page 2 + + +

Child Page 2 (p4)
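Together, main.html through page4.html form the fixture graph used by the recursion tests further down: main links to page1 and page2, page2 links to page3 and page4, and page1 links back to main. A minimal model of the depth rule in `_process_subpages` — just the traversal logic over a hypothetical link graph, not the real extractor — reproduces the expected page counts:

```python
# Hypothetical link graph mirroring the sample_pages fixtures.
LINKS = {
    "main": ["page1", "page2"],
    "page1": ["main"],            # backlink; must not be revisited
    "page2": ["page3", "page4"],
    "page3": [],
    "page4": [],
}

def crawl(page, target_depth, current_depth=1, visited=None):
    """Visit `page`, then recurse into unvisited subpages until the depth limit."""
    visited = set() if visited is None else visited
    visited.add(page)
    if current_depth > target_depth:   # same cut-off as _process_subpages
        return visited
    for sub in LINKS[page]:
        if sub not in visited:
            crawl(sub, target_depth, current_depth + 1, visited)
    return visited

print(len(crawl("main", 1)))  # 3 -> matches test_shallow_recursion
print(len(crawl("main", 2)))  # 5 -> matches test_infinite_recursion
```

Depth 1 yields the three documents asserted in `test_shallow_recursion`; depth 2 reaches all five fixture pages, with the backlink to main visited only once, as in `test_infinite_recursion`.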

+ + diff --git a/tests/static_web/sample_pages/titles_text.html b/tests/static_web/sample_pages/titles_text.html new file mode 100644 index 00000000..dfe820da --- /dev/null +++ b/tests/static_web/sample_pages/titles_text.html @@ -0,0 +1,26 @@ + + + Test Title + + + + + + +

Visible paragraph 1.
+ Visible paragraph 2.

+ + + +
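titles_text.html exercises `_get_text_from_HTML`: only rendered text should survive, while title, style, script, meta content and comments are dropped. A self-contained sketch of that visibility filter (the inline HTML below is illustrative rather than the fixture itself, and it assumes `beautifulsoup4` and `lxml` are installed, as added to the `static_web` extra in this PR):

```python
from bs4 import BeautifulSoup
from bs4.element import Comment

html = """<html><head><title>Test Title</title><style>p {color: red;}</style></head>
<body><p>Visible paragraph 1.</p><!-- Commented text --><p>Visible paragraph 2.</p></body></html>"""

def filter_visible(el):
    # Same checks as StaticWebExtractor._get_text_from_HTML: drop text nodes that
    # live inside non-rendered tags or inside HTML comments.
    if el.parent.name in ["style", "script", "head", "title", "meta", "[document]"]:
        return False
    if isinstance(el, Comment):
        return False
    return True

soup = BeautifulSoup(html, "lxml")
visible = filter(filter_visible, soup.findAll(string=True))
print("\n".join(t.strip() for t in visible if t.strip()))
# Visible paragraph 1.
# Visible paragraph 2.
```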
+ + diff --git a/tests/static_web/test_config.py b/tests/static_web/test_config.py new file mode 100644 index 00000000..da93a74e --- /dev/null +++ b/tests/static_web/test_config.py @@ -0,0 +1,18 @@ +from metaphor.common.base_config import OutputConfig +from metaphor.static_web.config import StaticWebRunConfig + + +def test_config(test_root_dir): + config = StaticWebRunConfig.from_yaml_file(f"{test_root_dir}/static_web/config.yml") + + assert config == StaticWebRunConfig( + links=["https://metaphor.io/"], + depths=[1], + azure_openAI_key="azure_openAI_key", + azure_openAI_version="azure_openAI_version", + azure_openAI_endpoint="azure_openAI_endpoint", + azure_openAI_model="text-embedding-ada-002", + azure_openAI_model_name="EmbeddingModel", + include_text=True, + output=OutputConfig(), + ) diff --git a/tests/static_web/test_extractor.py b/tests/static_web/test_extractor.py new file mode 100644 index 00000000..32d480a6 --- /dev/null +++ b/tests/static_web/test_extractor.py @@ -0,0 +1,257 @@ +from unittest.mock import MagicMock, patch + +import pytest +import requests + +from metaphor.common.base_config import OutputConfig +from metaphor.static_web.config import StaticWebRunConfig +from metaphor.static_web.extractor import StaticWebExtractor +from tests.test_utils import load_json, load_text + + +@pytest.fixture +def static_web_extractor(): + config = StaticWebRunConfig( + links=["https://example.com"], + depths=[1], + azure_openAI_key="key", + azure_openAI_version="version", + azure_openAI_endpoint="endpoint", + azure_openAI_model="text-embedding-ada-002", + azure_openAI_model_name="model_name", + include_text=True, + output=OutputConfig(), + ) + return StaticWebExtractor(config) + + +# Test for successful page HTML retrieval +@patch("requests.get") +def test_get_page_HTML_success(mock_get: MagicMock, static_web_extractor): + mock_get_val = MagicMock() + mock_get_val.text = "Test" + + mock_get.return_value = mock_get_val + + content = static_web_extractor._get_page_HTML("https://example.com") + assert content == "Test" + + +# Test for handling retrieval failure +@patch("requests.get") +def test_get_page_HTML_failure(mock_get: MagicMock, static_web_extractor): + mock_get_val = MagicMock() + + mock_get.side_effect = requests.RequestException() + mock_get.return_value = mock_get_val + + content = static_web_extractor._get_page_HTML("https://example.com") + assert content == "ERROR IN PAGE RETRIEVAL" + + +# Test for extracting subpages from HTML +def test_get_subpages_from_HTML(static_web_extractor): + html_content = "Link" + static_web_extractor.current_parent_page = "https://example.com" + result = static_web_extractor._get_subpages_from_HTML( + html_content, "https://example.com" + ) + assert "https://example.com/test" in result + + +# Test for extracting visible text from HTML, with filtering +def test_get_text_from_HTML_with_filtering(static_web_extractor, test_root_dir: str): + html_content = load_text( + f"{test_root_dir}/static_web/sample_pages/titles_text.html" + ) + text = static_web_extractor._get_text_from_HTML(html_content) + assert "Visible paragraph 1." in text + assert "Visible paragraph 2." 
in text + assert "Test Title" not in text + assert "Some style" not in text + assert "Some script" not in text + assert "Some meta" not in text + assert "Commented text" not in text + assert "Script text" not in text + assert "Style text" not in text + + +# Test for extracting title from HTML +def test_get_title_from_HTML_success(static_web_extractor): + html_content = "Test Title" + title = static_web_extractor._get_title_from_HTML(html_content) + assert title == "Test Title" + + +# Test for extracting empty title +def test_get_title_from_HTML_failure(static_web_extractor): + html_content = "

Hello World!

" + title = static_web_extractor._get_title_from_HTML(html_content) + assert title == "" + + +# Test for making a document +def test_make_document(static_web_extractor): + doc = static_web_extractor._make_document( + "https://example.com", "Test Title", "Test Content" + ) + assert doc.text == "Test Content" + assert doc.extra_info["title"] == "Test Title" + assert doc.extra_info["link"] == "https://example.com" + + +# Test process subpages +@patch("metaphor.static_web.StaticWebExtractor._get_page_HTML") +@patch("metaphor.static_web.StaticWebExtractor._get_subpages_from_HTML") +@pytest.mark.asyncio +async def test_process_subpages( + mock_get_subpages_from_HTML: MagicMock, + mock_get_page_HTML: MagicMock, + static_web_extractor, +): + # Mocking the responses for page HTML and subpages + parent_html = "Subpage 1" + subpage1_html = "

Content of Subpage 1

" + + mock_get_page_HTML.side_effect = [parent_html, subpage1_html] + mock_get_subpages_from_HTML.return_value = ["https://example.com/subpage1"] + + # Call the _process_subpages method + static_web_extractor.visited_pages = set() + static_web_extractor.docs = list() + await static_web_extractor._process_subpages("https://example.com", parent_html, 2) + + # Check if _get_page_HTML is called for subpages + mock_get_page_HTML.assert_any_call("https://example.com/subpage1") + + # Verify that documents are added correctly + assert len(static_web_extractor.docs) > 0 + assert ( + static_web_extractor.docs[0].extra_info["link"] + == "https://example.com/subpage1" + ) + # Further assertions can be made based on the structure of your Document objects + + # Check if the depth limit is respected + assert not any( + "subpage2" in doc.extra_info["link"] for doc in static_web_extractor.docs + ) + + +# Test 1 layer recursion +@patch("metaphor.static_web.StaticWebExtractor._get_page_HTML") +@patch("metaphor.static_web.extractor.embed_documents") +@patch("metaphor.static_web.extractor.map_metadata") +@pytest.mark.asyncio +async def test_shallow_recursion( + mock_map_metadata: MagicMock, + mock_embed_docs: MagicMock, + mock_get_HTML: MagicMock, + static_web_extractor, + test_root_dir: str, +): + mock_map_metadata.return_value = [] + mock_embed_docs.return_value = [] + + # Mock pages appropriately + page_folder = f"{test_root_dir}/static_web/sample_pages" + + mock_get_HTML.side_effect = [ + load_text(f"{page_folder}/main.html"), + load_text(f"{page_folder}/page1.html"), + load_text(f"{page_folder}/page2.html"), + load_text(f"{page_folder}/page3.html"), + load_text(f"{page_folder}/page4.html"), + ] + + # Initialize extractor attributes for shallow recursion test + static_web_extractor.target_URLs = ["https://example.com/main"] + static_web_extractor.target_depths = [1] + + await static_web_extractor.extract() + + assert len(static_web_extractor.visited_pages) == 3 + assert len(static_web_extractor.docs) == 3 + + +# Test infinite +@patch("metaphor.static_web.StaticWebExtractor._get_page_HTML") +@patch("metaphor.static_web.extractor.embed_documents") +@patch("metaphor.static_web.extractor.map_metadata") +@pytest.mark.asyncio +async def test_infinite_recursion( + mock_map_metadata: MagicMock, + mock_embed_docs: MagicMock, + mock_get_HTML: MagicMock, + static_web_extractor, + test_root_dir: str, +): + mock_map_metadata.return_value = [] + mock_embed_docs.return_value = [] + + # Mock pages appropriately + page_folder = f"{test_root_dir}/static_web/sample_pages" + + mock_get_HTML.side_effect = [ + load_text(f"{page_folder}/main.html"), + load_text(f"{page_folder}/page1.html"), + load_text(f"{page_folder}/page2.html"), + load_text(f"{page_folder}/page3.html"), + load_text(f"{page_folder}/page4.html"), + ] + + # Initialize extractor attributes for infinite recursion test + # page1 has a backlink to main, so we should not see multiple instances + static_web_extractor.target_URLs = ["https://example.com/main"] + static_web_extractor.target_depths = [2] + + await static_web_extractor.extract() + + assert len(static_web_extractor.visited_pages) == 5 + assert len(static_web_extractor.docs) == 5 + + +# Test extract +@patch("metaphor.static_web.StaticWebExtractor._get_page_HTML") +@patch("metaphor.static_web.StaticWebExtractor._process_subpages") +@patch("metaphor.static_web.extractor.embed_documents") +@pytest.mark.asyncio +async def test_extractor( + mock_embed_docs: MagicMock, + mock_process_subpages: MagicMock, + 
mock_get_HTML: MagicMock, + static_web_extractor, + test_root_dir: str, +): + # mock VSI + mock_vector_store_index = MagicMock() + + mock_vector_store_index.storage_context.to_dict.return_value = { + "vector_store": { + "default": { + "embedding_dict": {"abcd1234": [0.1, 0.2, 0.3, 0.4]}, + "metadata_dict": { + "abcd1234": { + "title": "Hello World!", + "pageId": "e19d5cd5af0378da05f63f891c7467af", + "platform": "example.com", + "link": "https://example.com", + "lastRefreshed": "2024-02-05 00:00:00.000000", + } + }, + } + }, + "doc_store": { + "docstore/data": {"abcd1234": {"__data__": {"text": "Hello World!"}}} + }, + } + + mock_embed_docs.return_value = mock_vector_store_index + + mock_process_subpages.return_value = None + + mock_get_HTML.return_value = "

Hello World!

Link" + + events = await static_web_extractor.extract() + + assert events == load_json(f"{test_root_dir}/static_web/expected.json") diff --git a/tests/test_utils.py b/tests/test_utils.py index e1e63a9d..15ac22e7 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -8,6 +8,11 @@ def load_json(path): return json.load(f) +def load_text(path): + with open(path, "r") as f: + return f.read() + + def compare_list_ignore_order(a: list, b: list): t = list(b) # make a mutable copy try:
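Finally, for anyone trying the new connector end to end: a filled-in config sketch assembled from the static_web README template and the fields of `StaticWebRunConfig` (every value below is a placeholder, and the output directory is hypothetical):

```yaml
links:
  - https://example.com/docs/
depths:
  - 2                     # the page above, its subpages, and their subpages
azure_openAI_key: <key>
azure_openAI_endpoint: <endpoint>
# Optional overrides; defaults come from metaphor/static_web/config.py
azure_openAI_version: "2023-12-01-preview"
azure_openAI_model: text-embedding-ada-002
azure_openAI_model_name: Embedding_ada002
include_text: false
output:
  file:
    directory: /tmp/static_web_output
```

Saved as `config.yml`, this should run with `metaphor static_web config.yml`, which dispatches to `StaticWebExtractor` through the `main` entry point added in `metaphor/static_web/__init__.py`.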