Skip to content

Commit

Permalink
feat(scraping)!: scrape empty ebd sections and provide paragraph text (
Browse files Browse the repository at this point in the history
…#263)

* WIP

* WIP

* fix linting

* updated snapshots

* add EbdNoTableSection to structure remarks

* updated docstring

* updated snapshots

* Changed default value for empty ebd string and made it optional

* Modified docstring

* fix small bug

* cover some edge cases when collecting ebd sections without tables

* lint typecheck

* updated snapshots

* raise error if table does not exist at all

* cleaned unused code
  • Loading branch information
DeltaDaniel authored Nov 7, 2024
1 parent 751a6c0 commit b02a186
Show file tree
Hide file tree
Showing 7 changed files with 1,568 additions and 644 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ classifiers = [
"Programming Language :: Python :: 3.12",
]
dependencies = [
"rebdhuhn>=0.2.3",
"rebdhuhn>=0.4.1",
"python-docx",
"more_itertools",
"attrs",
Expand Down
2 changes: 1 addition & 1 deletion requirements.in
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
rebdhuhn>=0.2.3
rebdhuhn>=0.4.1
python-docx
more_itertools
attrs
Expand Down
4 changes: 3 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ charset-normalizer==3.3.2
# via requests
click==8.1.7
# via -r requirements.in
colorama==0.4.6
# via click
idna==3.7
# via requests
lxml==5.2.1
Expand All @@ -30,7 +32,7 @@ networkx==3.3
# via rebdhuhn
python-docx==1.1.2
# via -r requirements.in
rebdhuhn==0.4.0
rebdhuhn==0.4.1
# via -r requirements.in
requests==2.32.0
# via rebdhuhn
Expand Down
77 changes: 60 additions & 17 deletions src/ebdamame/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,19 +104,56 @@ def _table_is_an_ebd_table(table: Table) -> bool:
return False


@attrs.define(kw_only=True, frozen=True)
class EbdNoTableSection:
    """
    Represents a section of the document that belongs to an EBD key but does not contain any EBD table.
    Such a section may consist of plain paragraph text only (e.g. 'Es ist das EBD E_0556 zu nutzen.'),
    which is kept as a remark.
    Instances are frozen and have to be created with keyword arguments.
    """

    # the EBD key the section belongs to; validated to be a str
    ebd_key: str = attrs.field(validator=attrs.validators.instance_of(str))
    # the paragraph text found in the section (may be an empty string); validated to be a str
    remark: str = attrs.field(validator=attrs.validators.instance_of(str))


# pylint:disable=too-many-branches
def get_ebd_docx_tables(docx_file_path: Path, ebd_key: str) -> List[Table]:
def is_heading(paragraph: Paragraph) -> bool:
    """
    Checks whether the given paragraph is styled as a heading.

    A paragraph counts as a heading if it has a style and the ID of that style is one of
    "berschrift1", "berschrift2" or "berschrift3". Paragraphs without any style are never headings.
    """
    heading_style_ids = ("berschrift1", "berschrift2", "berschrift3")
    paragraph_style = paragraph.style
    if paragraph_style is None:
        # no style information at all => cannot be a heading
        return False
    # NOTE(review): the IDs look like the German "Ueberschrift 1-3" with the umlaut dropped - confirm against the docx
    return paragraph_style.style_id in heading_style_ids


def get_ebd_docx_tables(docx_file_path: Path, ebd_key: str) -> List[Table] | EbdNoTableSection:
"""
Opens the file specified in docx_file_path and returns the tables that relate to the given ebd_key.
There might be more than 1 docx table for 1 EBD table.
This is because of inconsistencies and manual editing during creation of the documents by EDI@Energy.
Raises an TableNotFoundError if the table was not found.
Opens the file specified in `docx_file_path` and returns the tables that relate to the given `ebd_key`.
This function processes the document to find tables associated with the given `ebd_key`.
There might be more than one table for a single EBD table due to inconsistencies and manual editing during
the creation of the documents by EDI@Energy.
There are sections relating to the EBD key without any tables.
In this case, the section is identified and the related paragraph is captured as a remark
(e.g. 'Es ist das EBD E_0556 zu nutzen.' for EBD_0561).
Args:
docx_file_path (Path): The path to the .docx file to be processed.
ebd_key (str): The EBD key to search for in the document.
Returns:
List[Table] | EbdNoTableSection: A list of `Table` objects if tables are found, or an `EbdNoTableSection` object
if no tables are found but the section is identified and a remark is captured.
Raises:
TableNotFoundError: If no tables related to the given `ebd_key` are found in the document.
"""
if _ebd_key_pattern.match(ebd_key) is None:
raise ValueError(f"The ebd_key '{ebd_key}' does not match {_ebd_key_pattern.pattern}")
document = get_document(docx_file_path)

found_subsection_of_requested_table: bool = False
empty_ebd_text: str | None = None # paragraph text if there is no ebd table
found_table_in_subsection: bool = False
is_inside_subsection_of_requested_table: bool = False
tables: List[Table] = []
tables_and_paragraphs = _get_tables_and_paragraphs(document)
Expand All @@ -127,21 +164,20 @@ def get_ebd_docx_tables(docx_file_path: Path, ebd_key: str) -> List[Table]:
# 1. before each EbdTable there is a paragraph whose text starts with the respective EBD key
# 2. there are no duplicates
is_ebd_heading_of_requested_ebd_key = paragraph.text.startswith(ebd_key)
if _ebd_key_with_heading_pattern.match(paragraph.text) is not None and found_subsection_of_requested_table:
if is_inside_subsection_of_requested_table and is_heading(paragraph):
_logger.warning("No EBD table found in subsection for: '%s'", ebd_key)
break
if is_ebd_heading_of_requested_ebd_key:
found_subsection_of_requested_table = True
if is_inside_subsection_of_requested_table and paragraph.text.strip() != "":
if empty_ebd_text is None:
# the first text paragraph after we found the correct section containing the ebd key
empty_ebd_text = paragraph.text.strip()
else:
empty_ebd_text += ("\n") + paragraph.text.strip()
is_inside_subsection_of_requested_table = (
is_ebd_heading_of_requested_ebd_key or is_inside_subsection_of_requested_table
)
if (
is_inside_subsection_of_requested_table
and paragraph.text.strip().startswith("Es ist das EBD")
and paragraph.text.strip().endswith("zu nutzen.")
):
# that's kind of a dirty hack. But it works.
break
if isinstance(table_or_paragraph, Table) and is_inside_subsection_of_requested_table:
found_table_in_subsection = True
if (
isinstance(table_or_paragraph, Table)
and is_inside_subsection_of_requested_table
Expand Down Expand Up @@ -173,7 +209,14 @@ def get_ebd_docx_tables(docx_file_path: Path, ebd_key: str) -> List[Table]:
# break the outer loop, too; no need to iterate any further
break
if not any(tables):
raise TableNotFoundError(ebd_key=ebd_key)
if not is_inside_subsection_of_requested_table:
raise TableNotFoundError(ebd_key=ebd_key)
if empty_ebd_text is None:
if found_table_in_subsection:
# probably there is an error while scraping the tables
raise TableNotFoundError(ebd_key=ebd_key)
return EbdNoTableSection(ebd_key=ebd_key, remark="")
return EbdNoTableSection(ebd_key=ebd_key, remark=empty_ebd_text.strip())
return tables


Expand Down
2 changes: 1 addition & 1 deletion unittests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def get_document(datafiles, filename: str) -> DocumentType:
return ebdamame.get_document(path)


def get_ebd_docx_tables(datafiles, filename: str, ebd_key: str) -> List[Table]:
def get_ebd_docx_tables(datafiles, filename: str, ebd_key: str) -> List[Table] | ebdamame.EbdNoTableSection:
"""
a datafiles compatible wrapper around ebdamame.get_ebd_docx_tables
"""
Expand Down
Loading

0 comments on commit b02a186

Please sign in to comment.