diff --git a/metaphor/notion/README.md b/metaphor/notion/README.md index dc625ea9..223b06d3 100644 --- a/metaphor/notion/README.md +++ b/metaphor/notion/README.md @@ -35,7 +35,7 @@ azure_openAI_version: # "2023-12-01-preview" azure_openAI_model_name: # "Embedding_ada002" azure_openAI_model: # "text-embedding-ada-002" -notion_api_version: # "2022-06-08" +notion_api_version: # "2022-06-28" include_text: # False ``` diff --git a/metaphor/notion/config.py b/metaphor/notion/config.py index 4ab16855..61fe6a2f 100644 --- a/metaphor/notion/config.py +++ b/metaphor/notion/config.py @@ -22,4 +22,4 @@ class NotionRunConfig(BaseConfig): include_text: bool = False # Notion API version - notion_api_version: str = "2022-06-08" + notion_api_version: str = "2022-06-28" diff --git a/metaphor/static_web/README.md b/metaphor/static_web/README.md new file mode 100644 index 00000000..5cdda983 --- /dev/null +++ b/metaphor/static_web/README.md @@ -0,0 +1,31 @@ +# Static Webpage Connector + +## Setup + +## Config File + +Create a YAML config file based on the following template. + +`depth = 1` corresponds to scraping the specified page and its subpages only. Higher configured depths will recursively perform the same action on subpages `n` times. + +### Required Configurations + +```yaml +output: + file: + directory: +``` + +### Optional Configurations + +## Testing + +Follow the [Installation](../../README.md) instructions to install `metaphor-connectors` in your environment (or virtualenv). Make sure to include either `all` or `static_web` extra. + +To test the connector locally, change the config file to output to a local path and run the following command + +```shell +metaphor static_web +``` + +Manually verify the output after the command finishes. diff --git a/metaphor/static_web/__init__.py b/metaphor/static_web/__init__.py new file mode 100644 index 00000000..b4fe4bdc --- /dev/null +++ b/metaphor/static_web/__init__.py @@ -0,0 +1,6 @@ +from metaphor.common.cli import cli_main +from metaphor.static_web.extractor import StaticWebExtractor + + +def main(config_file: str): + cli_main(StaticWebExtractor, config_file) diff --git a/metaphor/static_web/config.py b/metaphor/static_web/config.py new file mode 100644 index 00000000..881e0b86 --- /dev/null +++ b/metaphor/static_web/config.py @@ -0,0 +1,25 @@ +from pydantic.dataclasses import dataclass + +from metaphor.common.base_config import BaseConfig +from metaphor.common.dataclass import ConnectorConfig + + +@dataclass(config=ConnectorConfig) +class StaticWebRunConfig(BaseConfig): + # Top-level URLs to scrape content from + links: list + + # Configurable scraping depth + depths: list + + # Azure OpenAI services configs + azure_openAI_key: str + azure_openAI_endpoint: str + + # Default Azure OpenAI services configs + azure_openAI_version: str = "2023-12-01-preview" + azure_openAI_model: str = "text-embedding-ada-002" + azure_openAI_model_name: str = "Embedding_ada002" + + # Store the document's content alongside embeddings + include_text: bool = False diff --git a/metaphor/static_web/extractor.py b/metaphor/static_web/extractor.py new file mode 100644 index 00000000..e4a6c8c3 --- /dev/null +++ b/metaphor/static_web/extractor.py @@ -0,0 +1,241 @@ +import datetime +from typing import Collection, List, Tuple +from urllib.parse import urljoin, urlparse + +import requests +from bs4 import BeautifulSoup +from bs4.element import Comment +from llama_index import Document +from requests.exceptions import HTTPError, RequestException + +from metaphor.common.base_extractor 
import BaseExtractor +from metaphor.common.embeddings import embed_documents, map_metadata, sanitize_text +from metaphor.common.logger import get_logger +from metaphor.common.utils import md5_digest +from metaphor.models.crawler_run_metadata import Platform +from metaphor.static_web.config import StaticWebRunConfig + +logger = get_logger() + +embedding_chunk_size = 512 +embedding_overlap_size = 50 + + +class StaticWebExtractor(BaseExtractor): + """Static webpage extractor.""" + + _description = "Crawls webpages and and extracts documents & embeddings." + _platform = Platform.UNKNOWN + + @staticmethod + def from_config_file(config_file: str) -> "StaticWebExtractor": + return StaticWebExtractor(StaticWebRunConfig.from_yaml_file(config_file)) + + def __init__(self, config: StaticWebRunConfig): + super().__init__(config=config) + + self.target_URLs = config.links + self.target_depths = config.depths + + self.azure_openAI_key = config.azure_openAI_key + self.azure_openAI_version = config.azure_openAI_version + self.azure_openAI_endpoint = config.azure_openAI_endpoint + self.azure_openAI_model = config.azure_openAI_model + self.azure_openAI_model_name = config.azure_openAI_model_name + + self.include_text = config.include_text + + async def extract(self) -> Collection[dict]: + logger.info("Scraping provided URLs") + self.docs = list() # type: List[Document] + self.visited_pages = set() # type: set + + for page, depth in zip(self.target_URLs, self.target_depths): + logger.info(f"Processing {page} with depth {depth}") + self.current_parent_page = page + + # Fetch target content + success, content = self._check_page_make_document(page) + + if success: + logger.info(f"Done with parent page {page}") + if depth: # recursive subpage processing + await self._process_subpages(page, content, depth) + + # Embedding process + logger.info("Starting embedding process") + vector_store_index = embed_documents( + self.docs, + self.azure_openAI_key, + self.azure_openAI_version, + self.azure_openAI_endpoint, + self.azure_openAI_model, + self.azure_openAI_model_name, + embedding_chunk_size, + embedding_overlap_size, + ) + + embedded_nodes = map_metadata( + vector_store_index, include_text=self.include_text + ) + + return embedded_nodes + + async def _process_subpages( + self, + parent_URL: str, + parent_content: str, + target_depth: int, + current_depth: int = 1, + ) -> None: + logger.info(f"Processing subpages of {parent_URL}") + subpages = self._get_subpages_from_HTML(parent_content, parent_URL) + + if current_depth > target_depth: # on recursion depth reached + return + + for subpage in subpages: + if subpage in self.visited_pages: + continue + + logger.info(f"Processing subpage {subpage} of parent {parent_URL}") + success, content = self._check_page_make_document(subpage) + + if success: + logger.info(f"Done with subpage {subpage}") + await self._process_subpages( + subpage, content, target_depth, current_depth + 1 + ) + + def _check_page_make_document(self, page: str) -> Tuple[bool, str]: + """ + Gets a page's HTML and adds to the visited pages set. + If page has valid content, extracts the text and title and generates + a Document object for the page. + + Returns a bool and the page_content: + out[0]: False if the page content is invalid, True otherwise. 
+ out[1]: "" if the page content is invalid, page_content otherwise + """ + + page_content = self._get_page_HTML(page) + self.visited_pages.add(page) + + if page_content == "ERROR IN PAGE RETRIEVAL": + return (False, "") + else: + page_text = self._get_text_from_HTML(page_content) + page_title = self._get_title_from_HTML(page_content) + + page_doc = self._make_document(page, page_title, page_text) + self.docs.append(page_doc) + + return (True, page_content) + + def _get_page_HTML(self, input_URL: str) -> str: + """ + Fetches a webpage's content, returning an error message on failure. + """ + try: + r = requests.get(input_URL, timeout=5) + r.raise_for_status() + return r.text + except (HTTPError, RequestException) as e: + logger.warning(f"Error in retrieving {input_URL}, error {e}") + return "ERROR IN PAGE RETRIEVAL" + + def _get_subpages_from_HTML(self, html_content: str, input_URL: str) -> List[str]: + """ + Extracts and returns a list of subpage URLs from a given page's HTML and URL. + Subpage URLs are reconstructed to be absolute URLs and anchor links are trimmed. + """ + # Retrieve input page + + soup = BeautifulSoup(html_content, "lxml") + links = soup.find_all("a", href=True) + + # Parse the domain of the input URL + input_domain = urlparse(self.current_parent_page).netloc + subpages = [input_URL] + + # Find eligible links + for link in links: + href = link["href"] + full_url = urljoin(input_URL, href) + + # Check if the domain of the full URL matches the input domain + if urlparse(full_url).netloc == input_domain: + # Remove any query parameters or fragments + full_url = urljoin(full_url, urlparse(full_url).path) + if full_url not in subpages: + subpages.append(full_url) + + return subpages + + def _get_text_from_HTML(self, html_content: str) -> str: + """ + Extracts and returns visible text from given HTML content as a single string. + Designed to handle output from get_page_HTML. + """ + + def filter_visible(el): + if el.parent.name in [ + "style", + "script", + "head", + "title", + "meta", + "[document]", + ]: + return False + elif isinstance(el, Comment): + return False + else: + return True + + # Use bs4 to find visible text elements + soup = BeautifulSoup(html_content, "lxml") + visible_text = filter(filter_visible, soup.findAll(string=True)) + return "\n".join(t.strip() for t in visible_text) + + def _get_title_from_HTML(self, html_content: str) -> str: + """ + Extracts the title of a webpage given HTML content as a single string. + Designed to handle output from get_page_HTML. + """ + + soup = BeautifulSoup(html_content, "lxml") + title_tag = soup.find("title") + + if title_tag: + return title_tag.text + + else: + return "" + + def _make_document( + self, page_URL: str, page_title: str, page_text: str + ) -> Document: + """ + Constructs Document objects from webpage URLs + and their content, including extra metadata. + + Cleans text content and includes data like page title, + platform URL, page link, refresh timestamp, and page ID. + """ + netloc = urlparse(page_URL).netloc + current_time = str(datetime.datetime.utcnow()) + + doc = Document( + text=sanitize_text(page_text), + extra_info={ + "title": page_title, + "platform": netloc, + "link": page_URL, + "lastRefreshed": current_time, + # Create a pageId based on page_URL - is this necessary? 
+ "pageId": md5_digest(page_URL.encode()), + }, + ) + + return doc diff --git a/poetry.lock b/poetry.lock index 7be6b576..01c9c3f9 100644 --- a/poetry.lock +++ b/poetry.lock @@ -482,6 +482,27 @@ test = ["beautifulsoup4 (>=4.8.0)", "coverage (>=4.5.4)", "fixtures (>=3.0.0)", toml = ["tomli (>=1.1.0)"] yaml = ["PyYAML"] +[[package]] +name = "beautifulsoup4" +version = "4.12.3" +description = "Screen-scraping library" +optional = true +python-versions = ">=3.6.0" +files = [ + {file = "beautifulsoup4-4.12.3-py3-none-any.whl", hash = "sha256:b80878c9f40111313e55da8ba20bdba06d8fa3969fc68304167741bbf9e082ed"}, + {file = "beautifulsoup4-4.12.3.tar.gz", hash = "sha256:74e3d1928edc070d21748185c46e3fb33490f22f52a3addee9aee0f4f7781051"}, +] + +[package.dependencies] +soupsieve = ">1.2" + +[package.extras] +cchardet = ["cchardet"] +chardet = ["chardet"] +charset-normalizer = ["charset-normalizer"] +html5lib = ["html5lib"] +lxml = ["lxml"] + [[package]] name = "black" version = "23.12.1" @@ -530,17 +551,17 @@ uvloop = ["uvloop (>=0.15.2)"] [[package]] name = "boto3" -version = "1.34.39" +version = "1.34.40" description = "The AWS SDK for Python" optional = false python-versions = ">= 3.8" files = [ - {file = "boto3-1.34.39-py3-none-any.whl", hash = "sha256:476896e70d36c9134d4125834280c597c17b54bff4902baf2e5fcde74f8acec8"}, - {file = "boto3-1.34.39.tar.gz", hash = "sha256:35bcbecf1b5d3620c93f0062d2994177f8bda25a9d2cba144d6462793c16065b"}, + {file = "boto3-1.34.40-py3-none-any.whl", hash = "sha256:49eb215e4142d441e26eedaf5d0b43065200f0849d82c904bc9a62d1328016cd"}, + {file = "boto3-1.34.40.tar.gz", hash = "sha256:81d026ed8c8305b880c71f9f287f9b745b52bd358a91cfc133844c907db4d7ee"}, ] [package.dependencies] -botocore = ">=1.34.39,<1.35.0" +botocore = ">=1.34.40,<1.35.0" jmespath = ">=0.7.1,<2.0.0" s3transfer = ">=0.10.0,<0.11.0" @@ -549,13 +570,13 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] [[package]] name = "botocore" -version = "1.34.39" +version = "1.34.40" description = "Low-level, data-driven core of boto 3." optional = false python-versions = ">= 3.8" files = [ - {file = "botocore-1.34.39-py3-none-any.whl", hash = "sha256:e175360445424b83b0e28ae20d301b99cf44ff2c9d5ab1d8670899bec05a9753"}, - {file = "botocore-1.34.39.tar.gz", hash = "sha256:9f00bd5e4698bcdd37ce6e224a896baf58d209678ed92834944b767de9061cc5"}, + {file = "botocore-1.34.40-py3-none-any.whl", hash = "sha256:a3edd774653a61a1b211e4ea88cdb1c2655ffcc7660ba77b41a4027b097d145d"}, + {file = "botocore-1.34.40.tar.gz", hash = "sha256:cb794bdb5b3d41845749a182ec93cb1453560e52b97ae0ab43ace81deb011f6d"}, ] [package.dependencies] @@ -2403,6 +2424,111 @@ cattrs = {version = ">=1.3", markers = "python_version >= \"3.7\""} requests = ">=2.22" typing-extensions = ">=4.1.1" +[[package]] +name = "lxml" +version = "5.0.1" +description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API." 
+optional = true +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, != 3.4.*" +files = [ + {file = "lxml-5.0.1-cp27-cp27m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1d78e91cbffe733ff325e0d258bb64702c8d91f8f652db088bd2989c6a8f90cc"}, + {file = "lxml-5.0.1-cp27-cp27m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:19251f782ea51a4841e747158195312ef63e06b47889e159dc5f1b2e5d668465"}, + {file = "lxml-5.0.1-cp27-cp27m-win32.whl", hash = "sha256:8689c54483b1f16b577b8194a58fd6feab6b9d5699e297ffbc552acb0874dfe1"}, + {file = "lxml-5.0.1-cp27-cp27m-win_amd64.whl", hash = "sha256:099eacbfdda668eda3e7e0705eced115a2e9425bb66cfce41a79fef1821a319c"}, + {file = "lxml-5.0.1-cp27-cp27mu-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b6bb5a0a87ab1e01f086cbb418be9e409719cd216954aa38b1cceee36a561ce1"}, + {file = "lxml-5.0.1-cp27-cp27mu-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:4b49a1569ed6d05808f4d163a316e7bf4a230e0c36855b59f56020ae27ae586a"}, + {file = "lxml-5.0.1-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:dbff288e1869db78f8731ca257553dd699edef07e173b35e71b1122b630d6008"}, + {file = "lxml-5.0.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:0f70e5de6b3e24ababeca597f776e5f37973f05d28a4d9f467aa5b45745af762"}, + {file = "lxml-5.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:32a135d4ef8f966bc087d450d641df73fc6874f04cf6608111541b50090e6f13"}, + {file = "lxml-5.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:b4eef43c5dc5c579d0804e55a32dd1bacbd008c8191ed4d65be278bbb11ddc61"}, + {file = "lxml-5.0.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:7febf50135363e981097eeada84781eeae92bfc3c203495f63d6b542a7132ba7"}, + {file = "lxml-5.0.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:0a79eca2ef5e032c8ed9da07f84a07a29105f220b777613dfe7fc31445691ee3"}, + {file = "lxml-5.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8de180f748a17382dd695b3999be03a647d09af16ae589c4e9c37138ddb6d4c6"}, + {file = "lxml-5.0.1-cp310-cp310-win32.whl", hash = "sha256:6af86081c888ce81ca7e361ed7fa2ba1678e2a86eb5a059c96d5a719364d319e"}, + {file = "lxml-5.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:0dc36ec06514fe8848c4733d47f96a0636f82d9ca3eaa2132373426bc03f178f"}, + {file = "lxml-5.0.1-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:f56e6a38c64a79e228d48344bb3bec265ac035fc1277ce8c049445bb18e4cd41"}, + {file = "lxml-5.0.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:4d58af4ebd711dad40f1c024a779950d9918d04d74f49230edf5d271dcd33c28"}, + {file = "lxml-5.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:00bfccab28f710bb13f7f644c980ad50ce3e5b6a412b5bb9a6c30136b298fb2c"}, + {file = "lxml-5.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:45e747b4e82390ffd802528b9e7b39333c1ce71423057bf5961b30ec0d52f76f"}, + {file = "lxml-5.0.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:03c977ffc9a4bf17b3e0f8db0451dc38e9f4ec92cfdb5df462d38fbe6e6e0825"}, + {file = "lxml-5.0.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:d0047c90e0ebd0d8f3c1e6636e10f597b8f25e4ef9e6416dd2e5c4c0960270cc"}, + {file = "lxml-5.0.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ff29353c12f0abc9cb3395899b7192a970d5a63f80ac1e7f0c3247ed83f5dcd4"}, + {file = 
"lxml-5.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:2ec9fa65e0638551a5ad31cb9fa160b321f19632e5ec517fe68d7b4110133e69"}, + {file = "lxml-5.0.1-cp311-cp311-win32.whl", hash = "sha256:9a4eff4d8ad0bbc9f470a9be19c5e718af4baf47111d7c2d9b036b9986107e7c"}, + {file = "lxml-5.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:3714f339449d2738b4fadd078a6022704a2af3cab06bec9627b19eaa4205090d"}, + {file = "lxml-5.0.1-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:473f2d0511dd84697326ee9362b0c0c2e9f99a433dcb1fbb5aa8df3d1b2185db"}, + {file = "lxml-5.0.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:62a3c0fdf70f785cd29824666d1dcea88c207f0b73ddbc28fb7a6a1a5bbb1af7"}, + {file = "lxml-5.0.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:83ff41e1bc0666f31acda52407d869ea257f232c2d9394806647a0e7454de73e"}, + {file = "lxml-5.0.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:42069ce16dab9755b381b90defd846ca407b9ead05dc20732fd5502b5cc49b87"}, + {file = "lxml-5.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:737a4dba9f7ee842c6597f719dda2eabeeaefe42443f7f24de20c262f88527cd"}, + {file = "lxml-5.0.1-cp312-cp312-win32.whl", hash = "sha256:67ddff0272905a0b78a2c3ea01487e0551cc38094cd5994f73af370f355ecb47"}, + {file = "lxml-5.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:a28eab2d9ea79b830be50e3350d827ae8abf5b23e278e14929824d5ab2069008"}, + {file = "lxml-5.0.1-cp35-cp35m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:3013823b0069cb4bd9b558e87076a18142703c6d2ac3a5d5521dd35734d23a72"}, + {file = "lxml-5.0.1-cp35-cp35m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:b005101a257c494e84d36ecb62b02ba195b02b7f8892f57b1f5aaa352ed44467"}, + {file = "lxml-5.0.1-cp36-cp36m-macosx_11_0_x86_64.whl", hash = "sha256:f9464ff2fd1f2ba4d0d246a560d369ee7e7213c556a30db9ae9426850ce1baf9"}, + {file = "lxml-5.0.1-cp36-cp36m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:9e8a4782ecaaacf8e9355f39179b1f00e7f27b774306eccbe80f6215724be4cd"}, + {file = "lxml-5.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dcb25128c9e7f01c00ad116d2c762c3942724981a35c6e5c551ab55d4c2bfcfe"}, + {file = "lxml-5.0.1-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:39a3cf581e9040e66aaa719b2f338b2b7eb43ce1db059089c82ae72e0fd48b47"}, + {file = "lxml-5.0.1-cp36-cp36m-manylinux_2_28_x86_64.whl", hash = "sha256:6cdd0fb749c80ffcf408f659b209e82333f10b517786083a3dd3c3b5adc60111"}, + {file = "lxml-5.0.1-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:cfdac95815d3025e4c9edce2ef2ebe4e034edc35a2c44a606dd846554387ae38"}, + {file = "lxml-5.0.1-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:c21e60c017cab3a7e7786185cc8853b8614b01ccd69cc8b24608e5356784631b"}, + {file = "lxml-5.0.1-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:5892cff0a6817743fe470b7402677310ffc8514a74de14d4e591cecdc457ff61"}, + {file = "lxml-5.0.1-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:57f5c362cbd9d688fb1fa07c8955cec26d5c066fbcb4163aa341ff751eba7587"}, + {file = "lxml-5.0.1-cp36-cp36m-win32.whl", hash = "sha256:8a70c47c14f89b8bfb430f85b608aa460204fe7c005545d79afd31b925cc6669"}, + {file = "lxml-5.0.1-cp36-cp36m-win_amd64.whl", hash = "sha256:8ba56c3686fa60cc04191d22d1733aad484c9cbc153cdc3e8eb9bdfcad30f704"}, + {file = "lxml-5.0.1-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = 
"sha256:c4eaa83b610595ef9f20aa69b96053d5b7f3f70c67c7a3af8f433136a9d315ab"}, + {file = "lxml-5.0.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:ae3a0ec0f1b6cf1e8bca41bc86cd64ba02e31c71716efbf149a0f7ebc168cf0b"}, + {file = "lxml-5.0.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:c5b23f63fcec652bf1e775eca5e03a713a4994d2a7ce2e70a91e964a26220e0d"}, + {file = "lxml-5.0.1-cp37-cp37m-manylinux_2_28_x86_64.whl", hash = "sha256:d05cf827f160340f67c25ce6f271689a844605aa123849f1a80e21c9bd21f00b"}, + {file = "lxml-5.0.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:bcded868b05583d41ab0b024f39f90a04e486a2349a9b493d8d17024f1433aa6"}, + {file = "lxml-5.0.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:821fb791565536617b058c2cd5b8d28a1285b3375eecc5bd6b5c6d87add0d3dd"}, + {file = "lxml-5.0.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:1546fa25a6dde5698271e19787062446f433c98ca7eab35545f665dde2c1bd34"}, + {file = "lxml-5.0.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:31362b900b8fd2a026b9d24695ebe5449ea8f0c948af2445d7880b738d9fc368"}, + {file = "lxml-5.0.1-cp37-cp37m-win32.whl", hash = "sha256:0963de4fe463caff48e6ce4d83d19a3c099126874185d10cf490c29057ca518d"}, + {file = "lxml-5.0.1-cp37-cp37m-win_amd64.whl", hash = "sha256:b7bb0010c2969c23bf3d2b3892e638a7cb83e7daeb749e3db5f3c002fd191e11"}, + {file = "lxml-5.0.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:e2bd8faf6a9682ca385e4bca1a38a057be716dc303f16ddec9e4c9cf01b7d722"}, + {file = "lxml-5.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:0fb55d77e685def5218701d5d296fca62f595752c88402404da685864b06b67e"}, + {file = "lxml-5.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:73e71b95c5215310a92e560369ac1f0e2cd018d5a36be182da88958f3d6084f5"}, + {file = "lxml-5.0.1-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:0ecc0f1e1d901b66f2f68edff85b8ff421fa9683d02eaea6742a42c145d741b6"}, + {file = "lxml-5.0.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f36c3103a6f2641777b64f1da860f37cbaa718ce3feea621583627269e68eb03"}, + {file = "lxml-5.0.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:dcc7dc4b9b65f185aa8533abc78f0a3b2ac38fe075bb23d3c1590ba0990f6c80"}, + {file = "lxml-5.0.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:1215c8e57a25ad68488abb83a36734f6c6b3f0ccd880f0c68da98682d463ef09"}, + {file = "lxml-5.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:070469a23f2ef3a9e72165af7e0b12eca9a6e47c3a8ec1cad60d14fb2f2c3aa8"}, + {file = "lxml-5.0.1-cp38-cp38-win32.whl", hash = "sha256:b889c0b9be774466308c3590e093ce9a6f9115c78fc8624aa6ba8dfeaf5188ab"}, + {file = "lxml-5.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:0499310df9afc0ce634ce14cacbb333d62f561038ea4db640494e4a22ac4f2e9"}, + {file = "lxml-5.0.1-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:18b456f1bfb338be94a916166ac5507e73b9eb9f6e1f0fbd1c8caff2f3fa5535"}, + {file = "lxml-5.0.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:60974986aa80a8bb883da5663f90bc632bd4ce0d0508e66a9132412facec65f6"}, + {file = "lxml-5.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:d6d3ce5638cd4ed3fa684507f164e7039e1b07475bc8f37ba6e12271c1a2e9e0"}, + {file = 
"lxml-5.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:e97c74725e86d84a477df081eef69b70f048128afee841dbd8c690a9e3d2e8e0"}, + {file = "lxml-5.0.1-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:387c5416b8bb4b8ad7306797cb2719a841f5f3836b3c39fcaa56b9af5448dd2a"}, + {file = "lxml-5.0.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:feb1102d9e5de08120d46a1011110c43b2547ecb3ae80030801e0e2dacd1ee18"}, + {file = "lxml-5.0.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:745383c124f096cc03fb942c8a05ea1e8cb4f44c5b28887adce6224e4540808e"}, + {file = "lxml-5.0.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:4ae66b6b0f82e7839b6b8d009182c652d48e7d2ea21a6709f3033ce5fbf199c4"}, + {file = "lxml-5.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:430780608d16b3bb96ef99a67a1a0626c8a295193af53ce9c4d5ec3fef2fbc79"}, + {file = "lxml-5.0.1-cp39-cp39-win32.whl", hash = "sha256:331237209fe76951450c1119af0879f04f32d1b07b21e83a34ba439520492feb"}, + {file = "lxml-5.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:c32a4fbae218b336d547bc626897325202e4e9f1794ac5f4d0bb3caacf41df21"}, + {file = "lxml-5.0.1-pp310-pypy310_pp73-macosx_11_0_x86_64.whl", hash = "sha256:96c2abcbcb24f850f00f279c4660ce6498cae16ff1659845838d752c26890748"}, + {file = "lxml-5.0.1-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:99b5ca5775435aa296d32ea711e194aaae871b21dbf0d57cb7d4431e5d3ad699"}, + {file = "lxml-5.0.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:9a99dae826c80cf0a21dd21eb66db16310d1f58ac4c27c804da980212fbcabe2"}, + {file = "lxml-5.0.1-pp38-pypy38_pp73-macosx_11_0_x86_64.whl", hash = "sha256:4df6be79be4d7574e9e4002aeb6aa03d3f3809582db07abb166df7fc6e7438af"}, + {file = "lxml-5.0.1-pp38-pypy38_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:49dc4dcf14a08f160bb3f5f571f63514d918b8933a25c221536571a5fc010271"}, + {file = "lxml-5.0.1-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:47f46a2ebade07f3afa47695882e7725440c49bf77cba39c3762a42597e5aad3"}, + {file = "lxml-5.0.1-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:18caa0d3925902949cb060fe5f2da70c953d60bd9ef78657bd389f6db30533cc"}, + {file = "lxml-5.0.1-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:8991837fdf8086486f1c300d936bacd757e2e5398be84cd54a1fba0e6b6d5591"}, + {file = "lxml-5.0.1-pp39-pypy39_pp73-macosx_11_0_x86_64.whl", hash = "sha256:d64e543b07964ff73b4eb994bee894803a80e19fd3b29a5ffbe3c637fe43e788"}, + {file = "lxml-5.0.1-pp39-pypy39_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:d80d9f4d986bb6ad65bae86f07391152f7b6c65cfc63d118616b18b0be2e79da"}, + {file = "lxml-5.0.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:ea5e4b3eff9029a02fe7736540675ab8fca44277232f0027397b0d7111d04b1c"}, + {file = "lxml-5.0.1-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:e2388a792f9c239510d62a9e615662b8202e4ca275aafcc9c4af154654462a14"}, + {file = "lxml-5.0.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:3ffc56d68b9782919d69ae9a6fac99efd7547f2666ccb7ecfd12871564d16133"}, + {file = "lxml-5.0.1.tar.gz", hash = "sha256:4432a1d89a9b340bc6bd1201aef3ba03112f151d3f340d9218247dc0c85028ab"}, +] + +[package.extras] +cssselect = ["cssselect (>=0.7)"] +html5 = ["html5lib"] +htmlsoup = ["BeautifulSoup4"] +source = ["Cython (>=3.0.7)"] + [[package]] name = "lz4" version = "4.3.3" @@ 
-4767,6 +4893,17 @@ files = [ {file = "sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88"}, ] +[[package]] +name = "soupsieve" +version = "2.5" +description = "A modern CSS selector implementation for Beautiful Soup." +optional = true +python-versions = ">=3.8" +files = [ + {file = "soupsieve-2.5-py3-none-any.whl", hash = "sha256:eaa337ff55a1579b6549dc679565eac1e3d000563bcb1c8ab0d0fefbc0c2cdc7"}, + {file = "soupsieve-2.5.tar.gz", hash = "sha256:5663d5a7b3bfaeee0bc4372e7fc48f9cff4940b3eec54a6451cc5299f1097690"}, +] + [[package]] name = "sql-metadata" version = "2.10.0" @@ -5544,7 +5681,7 @@ docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.link testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy (>=0.9.1)", "pytest-ruff"] [extras] -all = ["GitPython", "SQLAlchemy", "asyncpg", "avro", "azure-identity", "azure-mgmt-datafactory", "confluent-kafka", "databricks-sdk", "databricks-sql-connector", "fastavro", "google-cloud-bigquery", "google-cloud-logging", "gql", "grpcio-tools", "lkml", "llama-hub", "looker-sdk", "more-itertools", "msal", "msgraph-beta-sdk", "parse", "pycarlo", "pyhive", "pymssql", "pymysql", "sasl", "snowflake-connector-python", "sql-metadata", "sqllineage", "tableauserverclient", "thoughtspot_rest_api_v1", "thrift", "thrift-sasl", "trino"] +all = ["GitPython", "SQLAlchemy", "asyncpg", "avro", "azure-identity", "azure-mgmt-datafactory", "beautifulsoup4", "confluent-kafka", "databricks-sdk", "databricks-sql-connector", "fastavro", "google-cloud-bigquery", "google-cloud-logging", "gql", "grpcio-tools", "lkml", "llama-hub", "llama-index", "looker-sdk", "lxml", "more-itertools", "msal", "msgraph-beta-sdk", "parse", "pycarlo", "pyhive", "pymssql", "pymysql", "sasl", "snowflake-connector-python", "sql-metadata", "sqllineage", "tableauserverclient", "thoughtspot_rest_api_v1", "thrift", "thrift-sasl", "trino"] bigquery = ["google-cloud-bigquery", "google-cloud-logging", "sql-metadata"] datafactory = ["azure-identity", "azure-mgmt-datafactory"] datahub = ["gql"] @@ -5556,12 +5693,13 @@ metabase = ["sql-metadata"] monte-carlo = ["pycarlo"] mssql = ["pymssql"] mysql = ["SQLAlchemy", "pymysql"] -notion = ["llama-hub"] +notion = ["llama-hub", "llama-index"] postgresql = ["asyncpg"] power-bi = ["msal", "msgraph-beta-sdk", "sql-metadata"] redshift = ["asyncpg", "sqllineage"] s3 = ["fastavro", "more-itertools", "parse"] snowflake = ["snowflake-connector-python", "sql-metadata"] +static-web = ["beautifulsoup4", "llama-hub", "llama-index", "lxml"] synapse = ["pymssql"] tableau = ["sqllineage", "tableauserverclient"] throughtspot = ["thoughtspot_rest_api_v1"] @@ -5571,4 +5709,4 @@ unity-catalog = ["databricks-sdk", "databricks-sql-connector"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.12" -content-hash = "049550a1e52650775b228975d8b6d00c54183c53c0dfdf682edfef46319ae291" +content-hash = "a724f91f53a69aacaca10721789b90211a93800bd21d681d8e460a79b3e4a735" diff --git a/pyproject.toml b/pyproject.toml index 6905fac3..d570987b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "metaphor-connectors" -version = "0.13.123" +version = "0.13.124" license = "Apache-2.0" description = "A collection of Python-based 'connectors' that extract metadata from various sources to ingest into the Metaphor app." 
authors = ["Metaphor "] @@ -20,6 +20,7 @@ avro = { version = "^1.11.3", optional = true } aws-assume-role-lib = "^2.10.0" azure-identity = { version = "^1.14.0", optional = true } azure-mgmt-datafactory = { version = "^3.1.0", optional = true } +beautifulsoup4 = { version = "^4.12.3", optional = true } boto3 = "^1.28.57" botocore = "^1.31.58" canonicaljson = "^2.0.0" @@ -35,7 +36,9 @@ grpcio-tools = { version = "^1.59.3", optional = true } jsonschema = "^4.18.6" lkml = { version = "^1.3.1", optional = true } llama-hub = {version = "0.0.67", optional = true } +llama-index = {version = "0.9.48", optional = true} looker-sdk = { version = "^23.6.0", optional = true } +lxml = { version = "~=5.0.0", optional = true } metaphor-models = "0.30.17" more-itertools = { version = "^10.1.0", optional = true } msal = { version = "^1.20.0", optional = true } @@ -71,6 +74,7 @@ all = [ "avro", "azure-identity", "azure-mgmt-datafactory", + "beautifulsoup4", "confluent-kafka", "databricks-sdk", "databricks-sql-connector", @@ -82,10 +86,12 @@ all = [ "grpcio-tools", "lkml", "looker-sdk", + "llama-hub", + "llama-index", + "lxml", "more-itertools", "msal", "msgraph-beta-sdk", - "llama-hub", "parse", "pycarlo", "pyhive", @@ -114,12 +120,13 @@ metabase = ["sql-metadata"] monte_carlo = ["pycarlo"] mssql = ["pymssql"] mysql = ["pymysql", "SQLAlchemy"] -notion = ["llama-hub"] +notion = ["llama-hub", "llama-index"] postgresql = ["asyncpg"] power_bi = ["msal", "msgraph-beta-sdk", "sql-metadata"] redshift = ["asyncpg", "sqllineage"] s3 = ["fastavro", "more-itertools", "parse"] snowflake = ["snowflake-connector-python", "sql-metadata"] +static_web = ["beautifulsoup4", "llama-hub", "llama-index", "lxml"] synapse = ["pymssql"] tableau = ["tableauserverclient", "sqllineage"] throughtspot = ["thoughtspot-rest-api-v1"] diff --git a/tests/static_web/__init__.py b/tests/static_web/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/static_web/config.yml b/tests/static_web/config.yml new file mode 100644 index 00000000..99f13cd3 --- /dev/null +++ b/tests/static_web/config.yml @@ -0,0 +1,12 @@ +--- +links: + - https://metaphor.io/ +depths: + - 1 +azure_openAI_key: azure_openAI_key +azure_openAI_version: azure_openAI_version +azure_openAI_endpoint: azure_openAI_endpoint +azure_openAI_model: text-embedding-ada-002 +azure_openAI_model_name: EmbeddingModel +include_text: true +output: {} diff --git a/tests/static_web/expected.json b/tests/static_web/expected.json new file mode 100644 index 00000000..57de2757 --- /dev/null +++ b/tests/static_web/expected.json @@ -0,0 +1,24 @@ +[ + { + "externalSearchDocument": { + "entityId": "EXTERNAL_DOCUMENT~ABCD1234", + "documentId": "EXTERNAL_DOCUMENT~ABCD1234", + "embedding_1": [ + 0.1, + 0.2, + 0.3, + 0.4 + ], + "pageId": "e19d5cd5af0378da05f63f891c7467af", + "lastRefreshed": "2024-02-05 00:00:00.000000", + "metadata": { + "title": "Hello World!", + "platform": "example.com", + "pageId": "e19d5cd5af0378da05f63f891c7467af", + "link": "https://example.com", + "lastRefreshed": "2024-02-05 00:00:00.000000" + }, + "embeddedString_1": "Title: Hello World!\nHello World!" + } + } +] diff --git a/tests/static_web/sample_pages/main.html b/tests/static_web/sample_pages/main.html new file mode 100644 index 00000000..36e9e97e --- /dev/null +++ b/tests/static_web/sample_pages/main.html @@ -0,0 +1,10 @@ + + + Main Page + + +

Main Page
+ Go to Subpage 1 (p1)
+ Go to Subpage 2 (p2)
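main.html links to the page1 and page2 fixtures, and `_get_subpages_from_HTML` keeps only such same-domain links. A standalone sketch of that link handling — resolve relative hrefs, keep links on the parent page's domain, strip queries and fragments — using illustrative URLs rather than the fixture contents:

```python
from urllib.parse import urljoin, urlparse

parent = "https://example.com/main"
hrefs = ["page1.html", "page2.html?ref=nav", "page1.html#top", "https://other.site/x"]

subpages = [parent]
for href in hrefs:
    full_url = urljoin(parent, href)                           # make relative links absolute
    if urlparse(full_url).netloc == urlparse(parent).netloc:   # keep same-domain links only
        full_url = urljoin(full_url, urlparse(full_url).path)  # drop query params and fragments
        if full_url not in subpages:
            subpages.append(full_url)

print(subpages)
# ['https://example.com/main', 'https://example.com/page1.html', 'https://example.com/page2.html']
```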

+ + diff --git a/tests/static_web/sample_pages/page1.html b/tests/static_web/sample_pages/page1.html new file mode 100644 index 00000000..ebbccae3 --- /dev/null +++ b/tests/static_web/sample_pages/page1.html @@ -0,0 +1,9 @@ + + + Subpage 1 + + +

Subpage 1
+ Back to Main Page

+ + diff --git a/tests/static_web/sample_pages/page2.html b/tests/static_web/sample_pages/page2.html new file mode 100644 index 00000000..c66ebd1b --- /dev/null +++ b/tests/static_web/sample_pages/page2.html @@ -0,0 +1,10 @@ + + + Subpage 2 + + +

Subpage 2
+ Go to Child Page 1 (p3)
+ Go to Child Page 2 (p4)

+ + diff --git a/tests/static_web/sample_pages/page3.html b/tests/static_web/sample_pages/page3.html new file mode 100644 index 00000000..dc092687 --- /dev/null +++ b/tests/static_web/sample_pages/page3.html @@ -0,0 +1,8 @@ + + + Child Page 1 + + +

Child Page 1 (p3)

+ + diff --git a/tests/static_web/sample_pages/page4.html b/tests/static_web/sample_pages/page4.html new file mode 100644 index 00000000..97bfc9dd --- /dev/null +++ b/tests/static_web/sample_pages/page4.html @@ -0,0 +1,8 @@ + + + Child Page 2 + + +

Child Page 2 (p4)
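Together, main.html through page4.html form the fixture graph used by the recursion tests further down: main links to page1 and page2, page2 links to page3 and page4, and page1 links back to main. A minimal model of the depth rule in `_process_subpages` — just the traversal logic over a hypothetical link graph, not the real extractor — reproduces the expected page counts:

```python
# Hypothetical link graph mirroring the sample_pages fixtures.
LINKS = {
    "main": ["page1", "page2"],
    "page1": ["main"],            # backlink; must not be revisited
    "page2": ["page3", "page4"],
    "page3": [],
    "page4": [],
}

def crawl(page, target_depth, current_depth=1, visited=None):
    """Visit `page`, then recurse into unvisited subpages until the depth limit."""
    visited = set() if visited is None else visited
    visited.add(page)
    if current_depth > target_depth:   # same cut-off as _process_subpages
        return visited
    for sub in LINKS[page]:
        if sub not in visited:
            crawl(sub, target_depth, current_depth + 1, visited)
    return visited

print(len(crawl("main", 1)))  # 3 -> matches test_shallow_recursion
print(len(crawl("main", 2)))  # 5 -> matches test_infinite_recursion
```

Depth 1 yields the three documents asserted in `test_shallow_recursion`; depth 2 reaches all five fixture pages, with the backlink to main visited only once, as in `test_infinite_recursion`.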

+ + diff --git a/tests/static_web/sample_pages/titles_text.html b/tests/static_web/sample_pages/titles_text.html new file mode 100644 index 00000000..dfe820da --- /dev/null +++ b/tests/static_web/sample_pages/titles_text.html @@ -0,0 +1,26 @@ + + + Test Title + + + + + + +

Visible paragraph 1.
+ Visible paragraph 2.

+ + + +
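titles_text.html exercises `_get_text_from_HTML`: only rendered text should survive, while title, style, script, meta content and comments are dropped. A self-contained sketch of that visibility filter (the inline HTML below is illustrative rather than the fixture itself, and it assumes `beautifulsoup4` and `lxml` are installed, as added to the `static_web` extra in this PR):

```python
from bs4 import BeautifulSoup
from bs4.element import Comment

html = """<html><head><title>Test Title</title><style>p {color: red;}</style></head>
<body><p>Visible paragraph 1.</p><!-- Commented text --><p>Visible paragraph 2.</p></body></html>"""

def filter_visible(el):
    # Same checks as StaticWebExtractor._get_text_from_HTML: drop text nodes that
    # live inside non-rendered tags or inside HTML comments.
    if el.parent.name in ["style", "script", "head", "title", "meta", "[document]"]:
        return False
    if isinstance(el, Comment):
        return False
    return True

soup = BeautifulSoup(html, "lxml")
visible = filter(filter_visible, soup.findAll(string=True))
print("\n".join(t.strip() for t in visible if t.strip()))
# Visible paragraph 1.
# Visible paragraph 2.
```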
+ + diff --git a/tests/static_web/test_config.py b/tests/static_web/test_config.py new file mode 100644 index 00000000..da93a74e --- /dev/null +++ b/tests/static_web/test_config.py @@ -0,0 +1,18 @@ +from metaphor.common.base_config import OutputConfig +from metaphor.static_web.config import StaticWebRunConfig + + +def test_config(test_root_dir): + config = StaticWebRunConfig.from_yaml_file(f"{test_root_dir}/static_web/config.yml") + + assert config == StaticWebRunConfig( + links=["https://metaphor.io/"], + depths=[1], + azure_openAI_key="azure_openAI_key", + azure_openAI_version="azure_openAI_version", + azure_openAI_endpoint="azure_openAI_endpoint", + azure_openAI_model="text-embedding-ada-002", + azure_openAI_model_name="EmbeddingModel", + include_text=True, + output=OutputConfig(), + ) diff --git a/tests/static_web/test_extractor.py b/tests/static_web/test_extractor.py new file mode 100644 index 00000000..32d480a6 --- /dev/null +++ b/tests/static_web/test_extractor.py @@ -0,0 +1,257 @@ +from unittest.mock import MagicMock, patch + +import pytest +import requests + +from metaphor.common.base_config import OutputConfig +from metaphor.static_web.config import StaticWebRunConfig +from metaphor.static_web.extractor import StaticWebExtractor +from tests.test_utils import load_json, load_text + + +@pytest.fixture +def static_web_extractor(): + config = StaticWebRunConfig( + links=["https://example.com"], + depths=[1], + azure_openAI_key="key", + azure_openAI_version="version", + azure_openAI_endpoint="endpoint", + azure_openAI_model="text-embedding-ada-002", + azure_openAI_model_name="model_name", + include_text=True, + output=OutputConfig(), + ) + return StaticWebExtractor(config) + + +# Test for successful page HTML retrieval +@patch("requests.get") +def test_get_page_HTML_success(mock_get: MagicMock, static_web_extractor): + mock_get_val = MagicMock() + mock_get_val.text = "Test" + + mock_get.return_value = mock_get_val + + content = static_web_extractor._get_page_HTML("https://example.com") + assert content == "Test" + + +# Test for handling retrieval failure +@patch("requests.get") +def test_get_page_HTML_failure(mock_get: MagicMock, static_web_extractor): + mock_get_val = MagicMock() + + mock_get.side_effect = requests.RequestException() + mock_get.return_value = mock_get_val + + content = static_web_extractor._get_page_HTML("https://example.com") + assert content == "ERROR IN PAGE RETRIEVAL" + + +# Test for extracting subpages from HTML +def test_get_subpages_from_HTML(static_web_extractor): + html_content = "Link" + static_web_extractor.current_parent_page = "https://example.com" + result = static_web_extractor._get_subpages_from_HTML( + html_content, "https://example.com" + ) + assert "https://example.com/test" in result + + +# Test for extracting visible text from HTML, with filtering +def test_get_text_from_HTML_with_filtering(static_web_extractor, test_root_dir: str): + html_content = load_text( + f"{test_root_dir}/static_web/sample_pages/titles_text.html" + ) + text = static_web_extractor._get_text_from_HTML(html_content) + assert "Visible paragraph 1." in text + assert "Visible paragraph 2." 
in text + assert "Test Title" not in text + assert "Some style" not in text + assert "Some script" not in text + assert "Some meta" not in text + assert "Commented text" not in text + assert "Script text" not in text + assert "Style text" not in text + + +# Test for extracting title from HTML +def test_get_title_from_HTML_success(static_web_extractor): + html_content = "Test Title" + title = static_web_extractor._get_title_from_HTML(html_content) + assert title == "Test Title" + + +# Test for extracting empty title +def test_get_title_from_HTML_failure(static_web_extractor): + html_content = "

Hello World!

" + title = static_web_extractor._get_title_from_HTML(html_content) + assert title == "" + + +# Test for making a document +def test_make_document(static_web_extractor): + doc = static_web_extractor._make_document( + "https://example.com", "Test Title", "Test Content" + ) + assert doc.text == "Test Content" + assert doc.extra_info["title"] == "Test Title" + assert doc.extra_info["link"] == "https://example.com" + + +# Test process subpages +@patch("metaphor.static_web.StaticWebExtractor._get_page_HTML") +@patch("metaphor.static_web.StaticWebExtractor._get_subpages_from_HTML") +@pytest.mark.asyncio +async def test_process_subpages( + mock_get_subpages_from_HTML: MagicMock, + mock_get_page_HTML: MagicMock, + static_web_extractor, +): + # Mocking the responses for page HTML and subpages + parent_html = "Subpage 1" + subpage1_html = "

Content of Subpage 1

" + + mock_get_page_HTML.side_effect = [parent_html, subpage1_html] + mock_get_subpages_from_HTML.return_value = ["https://example.com/subpage1"] + + # Call the _process_subpages method + static_web_extractor.visited_pages = set() + static_web_extractor.docs = list() + await static_web_extractor._process_subpages("https://example.com", parent_html, 2) + + # Check if _get_page_HTML is called for subpages + mock_get_page_HTML.assert_any_call("https://example.com/subpage1") + + # Verify that documents are added correctly + assert len(static_web_extractor.docs) > 0 + assert ( + static_web_extractor.docs[0].extra_info["link"] + == "https://example.com/subpage1" + ) + # Further assertions can be made based on the structure of your Document objects + + # Check if the depth limit is respected + assert not any( + "subpage2" in doc.extra_info["link"] for doc in static_web_extractor.docs + ) + + +# Test 1 layer recursion +@patch("metaphor.static_web.StaticWebExtractor._get_page_HTML") +@patch("metaphor.static_web.extractor.embed_documents") +@patch("metaphor.static_web.extractor.map_metadata") +@pytest.mark.asyncio +async def test_shallow_recursion( + mock_map_metadata: MagicMock, + mock_embed_docs: MagicMock, + mock_get_HTML: MagicMock, + static_web_extractor, + test_root_dir: str, +): + mock_map_metadata.return_value = [] + mock_embed_docs.return_value = [] + + # Mock pages appropriately + page_folder = f"{test_root_dir}/static_web/sample_pages" + + mock_get_HTML.side_effect = [ + load_text(f"{page_folder}/main.html"), + load_text(f"{page_folder}/page1.html"), + load_text(f"{page_folder}/page2.html"), + load_text(f"{page_folder}/page3.html"), + load_text(f"{page_folder}/page4.html"), + ] + + # Initialize extractor attributes for shallow recursion test + static_web_extractor.target_URLs = ["https://example.com/main"] + static_web_extractor.target_depths = [1] + + await static_web_extractor.extract() + + assert len(static_web_extractor.visited_pages) == 3 + assert len(static_web_extractor.docs) == 3 + + +# Test infinite +@patch("metaphor.static_web.StaticWebExtractor._get_page_HTML") +@patch("metaphor.static_web.extractor.embed_documents") +@patch("metaphor.static_web.extractor.map_metadata") +@pytest.mark.asyncio +async def test_infinite_recursion( + mock_map_metadata: MagicMock, + mock_embed_docs: MagicMock, + mock_get_HTML: MagicMock, + static_web_extractor, + test_root_dir: str, +): + mock_map_metadata.return_value = [] + mock_embed_docs.return_value = [] + + # Mock pages appropriately + page_folder = f"{test_root_dir}/static_web/sample_pages" + + mock_get_HTML.side_effect = [ + load_text(f"{page_folder}/main.html"), + load_text(f"{page_folder}/page1.html"), + load_text(f"{page_folder}/page2.html"), + load_text(f"{page_folder}/page3.html"), + load_text(f"{page_folder}/page4.html"), + ] + + # Initialize extractor attributes for infinite recursion test + # page1 has a backlink to main, so we should not see multiple instances + static_web_extractor.target_URLs = ["https://example.com/main"] + static_web_extractor.target_depths = [2] + + await static_web_extractor.extract() + + assert len(static_web_extractor.visited_pages) == 5 + assert len(static_web_extractor.docs) == 5 + + +# Test extract +@patch("metaphor.static_web.StaticWebExtractor._get_page_HTML") +@patch("metaphor.static_web.StaticWebExtractor._process_subpages") +@patch("metaphor.static_web.extractor.embed_documents") +@pytest.mark.asyncio +async def test_extractor( + mock_embed_docs: MagicMock, + mock_process_subpages: MagicMock, + 
mock_get_HTML: MagicMock, + static_web_extractor, + test_root_dir: str, +): + # mock VSI + mock_vector_store_index = MagicMock() + + mock_vector_store_index.storage_context.to_dict.return_value = { + "vector_store": { + "default": { + "embedding_dict": {"abcd1234": [0.1, 0.2, 0.3, 0.4]}, + "metadata_dict": { + "abcd1234": { + "title": "Hello World!", + "pageId": "e19d5cd5af0378da05f63f891c7467af", + "platform": "example.com", + "link": "https://example.com", + "lastRefreshed": "2024-02-05 00:00:00.000000", + } + }, + } + }, + "doc_store": { + "docstore/data": {"abcd1234": {"__data__": {"text": "Hello World!"}}} + }, + } + + mock_embed_docs.return_value = mock_vector_store_index + + mock_process_subpages.return_value = None + + mock_get_HTML.return_value = "

Hello World!

Link" + + events = await static_web_extractor.extract() + + assert events == load_json(f"{test_root_dir}/static_web/expected.json") diff --git a/tests/test_utils.py b/tests/test_utils.py index e1e63a9d..15ac22e7 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -8,6 +8,11 @@ def load_json(path): return json.load(f) +def load_text(path): + with open(path, "r") as f: + return f.read() + + def compare_list_ignore_order(a: list, b: list): t = list(b) # make a mutable copy try:
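Finally, for anyone trying the new connector end to end: a filled-in config sketch assembled from the static_web README template and the fields of `StaticWebRunConfig` (every value below is a placeholder, and the output directory is hypothetical):

```yaml
links:
  - https://example.com/docs/
depths:
  - 2                     # the page above, its subpages, and their subpages
azure_openAI_key: <key>
azure_openAI_endpoint: <endpoint>
# Optional overrides; defaults come from metaphor/static_web/config.py
azure_openAI_version: "2023-12-01-preview"
azure_openAI_model: text-embedding-ada-002
azure_openAI_model_name: Embedding_ada002
include_text: false
output:
  file:
    directory: /tmp/static_web_output
```

Saved as `config.yml`, this should run with `metaphor static_web config.yml`, which dispatches to `StaticWebExtractor` through the `main` entry point added in `metaphor/static_web/__init__.py`.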