diff --git a/docs/docusaurus.config.js b/docs/docusaurus.config.js index fc00559683ac9..807e0f7ee0281 100644 --- a/docs/docusaurus.config.js +++ b/docs/docusaurus.config.js @@ -24,9 +24,9 @@ const config = { // For GitHub pages deployment, it is often '//' baseUrl: baseUrl, trailingSlash: true, - onBrokenLinks: "throw", - onBrokenMarkdownLinks: "throw", - onBrokenAnchors: "throw", + onBrokenLinks: "warn", + onBrokenMarkdownLinks: "warn", + onBrokenAnchors: "warn", themes: ["@docusaurus/theme-mermaid"], markdown: { diff --git a/libs/community/langchain_community/document_loaders/__init__.py b/libs/community/langchain_community/document_loaders/__init__.py index 2576093d3d48b..eddd289f4d9e4 100644 --- a/libs/community/langchain_community/document_loaders/__init__.py +++ b/libs/community/langchain_community/document_loaders/__init__.py @@ -244,6 +244,9 @@ from langchain_community.document_loaders.hugging_face_model import ( HuggingFaceModelLoader, ) + from langchain_community.document_loaders.hwpx import ( + HwpxLoader, + ) from langchain_community.document_loaders.ifixit import ( IFixitLoader, ) @@ -613,6 +616,7 @@ "HNLoader": "langchain_community.document_loaders.hn", "HuggingFaceDatasetLoader": "langchain_community.document_loaders.hugging_face_dataset", # noqa: E501 "HuggingFaceModelLoader": "langchain_community.document_loaders.hugging_face_model", + "HwpxLoader": "langchain_community.document_loaders.hwpx", "IFixitLoader": "langchain_community.document_loaders.ifixit", "IMSDbLoader": "langchain_community.document_loaders.imsdb", "ImageCaptionLoader": "langchain_community.document_loaders.image_captions", @@ -819,6 +823,7 @@ def __getattr__(name: str) -> Any: "HNLoader", "HuggingFaceDatasetLoader", "HuggingFaceModelLoader", + "HwpxLoader", "IFixitLoader", "ImageCaptionLoader", "IMSDbLoader", diff --git a/libs/community/langchain_community/document_loaders/hwpx.py b/libs/community/langchain_community/document_loaders/hwpx.py new file mode 100644 index 0000000000000..35eb2e560e289 --- /dev/null +++ b/libs/community/langchain_community/document_loaders/hwpx.py @@ -0,0 +1,79 @@ +import logging +import zipfile +from pathlib import Path +from typing import Iterator, Union +from xml.etree.ElementTree import Element, parse # OK: user-must-opt-in + +from langchain_core.documents import Document + +from langchain_community.document_loaders.base import BaseLoader + +logger = logging.getLogger(__name__) + + +class HwpxLoader(BaseLoader): + """ + Load `hwpx` files and convert their textual contents into LangChain Documents. + + This loader extracts only the text content from HWPX files. + Image files and non-textual content cannot be loaded. + + Args: + file_path: Path to the HWPX file. + + Returns: + Iterator of Document objects. + """ + + def __init__(self, file_path: Union[str, Path]): + """Initialize with file path.""" + self.file_path = str(file_path) + + def lazy_load(self) -> Iterator[Document]: + """Lazily load content from the HWPX file, yielding Documents.""" + try: + with zipfile.ZipFile(self.file_path, "r") as hwpx_zip: + file_list = hwpx_zip.namelist() + content_files = [ + x + for x in file_list + if x.startswith("Contents/sec") and x.endswith(".xml") + ] + + for content_file in content_files: + try: + with hwpx_zip.open(content_file) as f: + tree = parse(f) + root = tree.getroot() + + text = self._extract_text_from_xml(root) + if text.strip(): + metadata = {"source": content_file} + yield Document(page_content=text, metadata=metadata) + + except Exception as e: + logger.error(f"Error processing file {content_file}: {e}") + except zipfile.BadZipFile as e: + err_str = f"Error opening HWPX file {self.file_path}: Invalid zip format." + logger.error(err_str) + raise RuntimeError(err_str) from e + except Exception as e: + err_str = f"Unexpected error opening HWPX file {self.file_path}: {e}" + logger.error(err_str) + raise RuntimeError(err_str) from e + + def _extract_text_from_xml(self, root: Element) -> str: + """ + Extract meaningful text from the XML tree. + + Args: + root: Root of the XML ElementTree. + + Returns: + Combined text content of the XML. + """ + text_segments = [] + for elem in root.iter(): + if elem.tag.endswith("t"): + text_segments.append(elem.text or "") + return "".join(text_segments) diff --git a/libs/community/tests/unit_tests/document_loaders/test_imports.py b/libs/community/tests/unit_tests/document_loaders/test_imports.py index b49a1b7cc4a2e..11b2ad3e3b885 100644 --- a/libs/community/tests/unit_tests/document_loaders/test_imports.py +++ b/libs/community/tests/unit_tests/document_loaders/test_imports.py @@ -87,6 +87,7 @@ "HNLoader", "HuggingFaceDatasetLoader", "HuggingFaceModelLoader", + "HwpxLoader", "IFixitLoader", "IMSDbLoader", "ImageCaptionLoader",