feat(scraping)!: scrape empty ebd sections and provide paragraph text (…

…#263) * WIP * WIP * fix linting * updated snapshots * add EbdNoTableSection to structure remarks * updated docstring * updated snapshots * Changed default value for empty ebd string and made it optional * Modified docstring * fix small bug * cover some edge cases when collecting ebd sections without tables * lint typecheck * updated snapshots * raise error if table does not exist at all * cleaned unused code
Hochfrequenz · Nov 7, 2024 · b02a186 · b02a186
1 parent 751a6c0
commit b02a186
Show file tree

Hide file tree

Showing 7 changed files with 1,568 additions and 644 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -17,7 +17,7 @@ classifiers = [
     "Programming Language :: Python :: 3.12",
 ]
 dependencies = [
-    "rebdhuhn>=0.2.3",
+    "rebdhuhn>=0.4.1",
     "python-docx",
     "more_itertools",
     "attrs",

diff --git a/requirements.in b/requirements.in
@@ -1,4 +1,4 @@
-rebdhuhn>=0.2.3
+rebdhuhn>=0.4.1
 python-docx
 more_itertools
 attrs

diff --git a/requirements.txt b/requirements.txt
@@ -17,6 +17,8 @@ charset-normalizer==3.3.2
     # via requests
 click==8.1.7
     # via -r requirements.in
+colorama==0.4.6
+    # via click
 idna==3.7
     # via requests
 lxml==5.2.1
@@ -30,7 +32,7 @@ networkx==3.3
     # via rebdhuhn
 python-docx==1.1.2
     # via -r requirements.in
-rebdhuhn==0.4.0
+rebdhuhn==0.4.1
     # via -r requirements.in
 requests==2.32.0
     # via rebdhuhn

diff --git a/src/ebdamame/__init__.py b/src/ebdamame/__init__.py
@@ -104,19 +104,56 @@ def _table_is_an_ebd_table(table: Table) -> bool:
     return False
 
 
+@attrs.define(kw_only=True, frozen=True)
+class EbdNoTableSection:
+    """
+    Represents an empty section in the document
+    """
+
+    ebd_key: str = attrs.field(validator=attrs.validators.instance_of(str))
+    remark: str = attrs.field(validator=attrs.validators.instance_of(str))
+
+
 # pylint:disable=too-many-branches
-def get_ebd_docx_tables(docx_file_path: Path, ebd_key: str) -> List[Table]:
+def is_heading(paragraph: Paragraph) -> bool:
+    """
+    Returns True if the paragraph is a heading.
+    """
+    return paragraph.style is not None and paragraph.style.style_id in {
+        "berschrift1",
+        "berschrift2",
+        "berschrift3",
+    }
+
+
+def get_ebd_docx_tables(docx_file_path: Path, ebd_key: str) -> List[Table] | EbdNoTableSection:
     """
-    Opens the file specified in docx_file_path and returns the tables that relate to the given ebd_key.
-    There might be more than 1 docx table for 1 EBD table.
-    This is because of inconsistencies and manual editing during creation of the documents by EDI@Energy.
-    Raises an TableNotFoundError if the table was not found.
+    Opens the file specified in `docx_file_path` and returns the tables that relate to the given `ebd_key`.
+
+    This function processes the document to find tables associated with the given `ebd_key`.
+    There might be more than one table for a single EBD table due to inconsistencies and manual editing during
+    the creation of the documents by EDI@Energy.
+    There are sections relating to the EBD key without any tables.
+    In this case, the section is identified and the related paragraph is captured as a remark
+    (e.g. 'Es ist das EBD E_0556 zu nutzen.' for EBD_0561).
+
+    Args:
+        docx_file_path (Path): The path to the .docx file to be processed.
+        ebd_key (str): The EBD key to search for in the document.
+
+    Returns:
+        List[Table] | EbdNoTableSection: A list of `Table` objects if tables are found, or an `EbdNoTableSection` object
+        if no tables are found but the section is identified and a remark is captured.
+
+    Raises:
+        TableNotFoundError: If the section for `ebd_key` is missing, or it has unscrapable tables and no paragraph text.
     """
     if _ebd_key_pattern.match(ebd_key) is None:
         raise ValueError(f"The ebd_key '{ebd_key}' does not match {_ebd_key_pattern.pattern}")
     document = get_document(docx_file_path)
 
-    found_subsection_of_requested_table: bool = False
+    empty_ebd_text: str | None = None  # paragraph text if there is no ebd table
+    found_table_in_subsection: bool = False
     is_inside_subsection_of_requested_table: bool = False
     tables: List[Table] = []
     tables_and_paragraphs = _get_tables_and_paragraphs(document)
@@ -127,21 +164,20 @@ def get_ebd_docx_tables(docx_file_path: Path, ebd_key: str) -> List[Table]:
             # 1. before each EbdTable there is a paragraph whose text starts with the respective EBD key
             # 2. there are no duplicates
             is_ebd_heading_of_requested_ebd_key = paragraph.text.startswith(ebd_key)
-            if _ebd_key_with_heading_pattern.match(paragraph.text) is not None and found_subsection_of_requested_table:
+            if is_inside_subsection_of_requested_table and is_heading(paragraph):
                 _logger.warning("No EBD table found in subsection for: '%s'", ebd_key)
                 break
-            if is_ebd_heading_of_requested_ebd_key:
-                found_subsection_of_requested_table = True
+            if is_inside_subsection_of_requested_table and paragraph.text.strip() != "":
+                if empty_ebd_text is None:
+                    # the first text paragraph after we found the correct section containing the ebd key
+                    empty_ebd_text = paragraph.text.strip()
+                else:
+                    empty_ebd_text += ("\n") + paragraph.text.strip()
             is_inside_subsection_of_requested_table = (
                 is_ebd_heading_of_requested_ebd_key or is_inside_subsection_of_requested_table
             )
-            if (
-                is_inside_subsection_of_requested_table
-                and paragraph.text.strip().startswith("Es ist das EBD")
-                and paragraph.text.strip().endswith("zu nutzen.")
-            ):
-                # that's kind of a dirty hack. But it works.
-                break
+        if isinstance(table_or_paragraph, Table) and is_inside_subsection_of_requested_table:
+            found_table_in_subsection = True
         if (
             isinstance(table_or_paragraph, Table)
             and is_inside_subsection_of_requested_table
@@ -173,7 +209,14 @@ def get_ebd_docx_tables(docx_file_path: Path, ebd_key: str) -> List[Table]:
             # break the outer loop, too; no need to iterate any further
             break
     if not any(tables):
-        raise TableNotFoundError(ebd_key=ebd_key)
+        if not is_inside_subsection_of_requested_table:
+            raise TableNotFoundError(ebd_key=ebd_key)
+        if empty_ebd_text is None:
+            if found_table_in_subsection:
+                # probably there is an error while scraping the tables
+                raise TableNotFoundError(ebd_key=ebd_key)
+            return EbdNoTableSection(ebd_key=ebd_key, remark="")
+        return EbdNoTableSection(ebd_key=ebd_key, remark=empty_ebd_text.strip())
     return tables
 
 

diff --git a/unittests/__init__.py b/unittests/__init__.py
@@ -20,7 +20,7 @@ def get_document(datafiles, filename: str) -> DocumentType:
     return ebdamame.get_document(path)
 
 
-def get_ebd_docx_tables(datafiles, filename: str, ebd_key: str) -> List[Table]:
+def get_ebd_docx_tables(datafiles, filename: str, ebd_key: str) -> List[Table] | ebdamame.EbdNoTableSection:
     """
     a datafiles compatible wrapper around ebdamame.get_ebd_docx_tables
     """