diff --git a/CHANGELOG.md b/CHANGELOG.md
index af8711b0..b7241945 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,6 @@
+## 0.7.35
+* Fix syntax for generated HTML tables
+
## 0.7.34
* Reduce excessive logging
diff --git a/test_unstructured_inference/models/test_tables.py b/test_unstructured_inference/models/test_tables.py
index 10f845e7..15c467cd 100644
--- a/test_unstructured_inference/models/test_tables.py
+++ b/test_unstructured_inference/models/test_tables.py
@@ -970,7 +970,7 @@ def test_table_prediction_runs_with_empty_recognize(
def test_table_prediction_with_ocr_tokens(table_transformer, example_image, mocked_ocr_tokens):
prediction = table_transformer.predict(example_image, ocr_tokens=mocked_ocr_tokens)
- assert '
' in prediction
+ assert '' in prediction
assert " |
---|
Blind | 5 | 1 | 4 | 34.5%, n=1 | " in prediction
@@ -1216,26 +1216,6 @@ def test_header_supercell_tree(supercells, expected_len):
assert len(supercells) == expected_len
-def test_cells_to_html():
- # example table
- # +----------+---------------------+
- # | two | two columns |
- # | |----------+----------|
- # | rows |sub cell 1|sub cell 2|
- # +----------+----------+----------+
- cells = [
- {"row_nums": [0, 1], "column_nums": [0], "cell text": "two row", "column header": False},
- {"row_nums": [0], "column_nums": [1, 2], "cell text": "two cols", "column header": False},
- {"row_nums": [1], "column_nums": [1], "cell text": "sub cell 1", "column header": False},
- {"row_nums": [1], "column_nums": [2], "cell text": "sub cell 2", "column header": False},
- ]
- expected = (
- 'two row | two '
- "cols | | sub cell 1 | sub cell 2 | "
- )
- assert tables.cells_to_html(cells) == expected
-
-
@pytest.mark.parametrize("zoom", [1, 0.1, 5, -1, 0])
def test_zoom_image(example_image, zoom):
width, height = example_image.size
@@ -1247,6 +1227,534 @@ def test_zoom_image(example_image, zoom):
assert new_h == np.round(height * zoom, 0)
+@pytest.mark.parametrize(
+ ("input_cells", "expected_html"),
+ [
+ # +----------+----------+----------+
+ # | row1col1 | row1col2 | row1col3 |
+ # |----------|----------+----------|
+ # | row2col1 | row2col2 | row2col3 |
+ # +----------+----------+----------+
+ pytest.param(
+ [
+ {
+ "row_nums": [0],
+ "column_nums": [0],
+ "cell text": "row1col1",
+ "column header": False,
+ },
+ {
+ "row_nums": [0],
+ "column_nums": [1],
+ "cell text": "row1col2",
+ "column header": False,
+ },
+ {
+ "row_nums": [0],
+ "column_nums": [2],
+ "cell text": "row1col3",
+ "column header": False,
+ },
+ {
+ "row_nums": [1],
+ "column_nums": [0],
+ "cell text": "row2col1",
+ "column header": False,
+ },
+ {
+ "row_nums": [1],
+ "column_nums": [1],
+ "cell text": "row2col2",
+ "column header": False,
+ },
+ {
+ "row_nums": [1],
+ "column_nums": [2],
+ "cell text": "row2col3",
+ "column header": False,
+ },
+ ],
+ (
+ "row1col1 | row1col2 | row1col3 | "
+ "row2col1 | row2col2 | row2col3 | "
+ ),
+ id="simple table without header",
+ ),
+ # +----------+----------+----------+
+ # | h1col1 | h1col2 | h1col3 |
+ # |----------|----------+----------|
+ # | row1col1 | row1col2 | row1col3 |
+ # |----------|----------+----------|
+ # | row2col1 | row2col2 | row2col3 |
+ # +----------+----------+----------+
+ pytest.param(
+ [
+ {"row_nums": [0], "column_nums": [0], "cell text": "h1col1", "column header": True},
+ {"row_nums": [0], "column_nums": [1], "cell text": "h1col2", "column header": True},
+ {"row_nums": [0], "column_nums": [2], "cell text": "h1col2", "column header": True},
+ {
+ "row_nums": [1],
+ "column_nums": [0],
+ "cell text": "row1col1",
+ "column header": False,
+ },
+ {
+ "row_nums": [1],
+ "column_nums": [1],
+ "cell text": "row1col2",
+ "column header": False,
+ },
+ {
+ "row_nums": [1],
+ "column_nums": [2],
+ "cell text": "row1col3",
+ "column header": False,
+ },
+ {
+ "row_nums": [2],
+ "column_nums": [0],
+ "cell text": "row2col1",
+ "column header": False,
+ },
+ {
+ "row_nums": [2],
+ "column_nums": [1],
+ "cell text": "row2col2",
+ "column header": False,
+ },
+ {
+ "row_nums": [2],
+ "column_nums": [2],
+ "cell text": "row2col3",
+ "column header": False,
+ },
+ ],
+ (
+ "h1col1 | h1col2 | h1col2 | "
+ "row1col1 | row1col2 | row1col3 | "
+ "row2col1 | row2col2 | row2col3 | "
+ ),
+ id="simple table with header",
+ ),
+ # +----------+----------+----------+
+ # | h1col1 | h1col2 | h1col3 |
+ # |----------|----------+----------|
+ # | row1col1 | row1col2 | row1col3 |
+ # |----------|----------+----------|
+ # | row2col1 | row2col2 | row2col3 |
+ # +----------+----------+----------+
+ pytest.param(
+ [
+ {"row_nums": [0], "column_nums": [1], "cell text": "h1col2", "column header": True},
+ {
+ "row_nums": [2],
+ "column_nums": [0],
+ "cell text": "row2col1",
+ "column header": False,
+ },
+ {
+ "row_nums": [1],
+ "column_nums": [0],
+ "cell text": "row1col1",
+ "column header": False,
+ },
+ {
+ "row_nums": [2],
+ "column_nums": [1],
+ "cell text": "row2col2",
+ "column header": False,
+ },
+ {
+ "row_nums": [1],
+ "column_nums": [1],
+ "cell text": "row1col2",
+ "column header": False,
+ },
+ {
+ "row_nums": [2],
+ "column_nums": [2],
+ "cell text": "row2col3",
+ "column header": False,
+ },
+ {"row_nums": [0], "column_nums": [0], "cell text": "h1col1", "column header": True},
+ {
+ "row_nums": [1],
+ "column_nums": [2],
+ "cell text": "row1col3",
+ "column header": False,
+ },
+ {"row_nums": [0], "column_nums": [2], "cell text": "h1col2", "column header": True},
+ ],
+ (
+ "h1col1 | h1col2 | h1col2 | "
+ "row1col1 | row1col2 | row1col3 | "
+ "row2col1 | row2col2 | row2col3 | "
+ ),
+ id="simple table with header, mixed elements",
+ ),
+ # +----------+---------------------+
+ # | two | two columns |
+ # | |----------+----------|
+ # | rows |sub cell 1|sub cell 2|
+ # +----------+----------+----------+
+ pytest.param(
+ [
+ {
+ "row_nums": [0, 1],
+ "column_nums": [0],
+ "cell text": "two row",
+ "column header": False,
+ },
+ {
+ "row_nums": [0],
+ "column_nums": [1, 2],
+ "cell text": "two cols",
+ "column header": False,
+ },
+ {
+ "row_nums": [1],
+ "column_nums": [1],
+ "cell text": "sub cell 1",
+ "column header": False,
+ },
+ {
+ "row_nums": [1],
+ "column_nums": [2],
+ "cell text": "sub cell 2",
+ "column header": False,
+ },
+ ],
+ (
+ 'two row | two '
+ "cols | sub cell 1 | sub cell 2 | "
+ " "
+ ),
+ id="various spans, no headers",
+ ),
+ # +----------+---------------------+----------+
+ # | | h1col23 | h1col4 |
+ # | h12col1 |----------+----------+----------|
+ # | | h2col2 | h2col34 |
+ # |----------|----------+----------+----------+
+ # | r3col1 | r3col2 | |
+ # |----------+----------| r34col34 |
+ # | r4col12 | |
+ # +----------+----------+----------+----------+
+ pytest.param(
+ [
+ {
+ "row_nums": [0, 1],
+ "column_nums": [0],
+ "cell text": "h12col1",
+ "column header": True,
+ },
+ {
+ "row_nums": [0],
+ "column_nums": [1, 2],
+ "cell text": "h1col23",
+ "column header": True,
+ },
+ {"row_nums": [0], "column_nums": [3], "cell text": "h1col4", "column header": True},
+ {"row_nums": [1], "column_nums": [1], "cell text": "h2col2", "column header": True},
+ {
+ "row_nums": [1],
+ "column_nums": [2, 3],
+ "cell text": "h2col34",
+ "column header": True,
+ },
+ {
+ "row_nums": [2],
+ "column_nums": [0],
+ "cell text": "r3col1",
+ "column header": False,
+ },
+ {
+ "row_nums": [2],
+ "column_nums": [1],
+ "cell text": "r3col2",
+ "column header": False,
+ },
+ {
+ "row_nums": [2, 3],
+ "column_nums": [2, 3],
+ "cell text": "r34col34",
+ "column header": False,
+ },
+ {
+ "row_nums": [3],
+ "column_nums": [0, 1],
+ "cell text": "r4col12",
+ "column header": False,
+ },
+ ],
+ (
+ 'h12col1 | '
+ 'h1col23 | h1col4 | '
+ 'h2col2 | h2col34 | '
+ 'r3col1 | r3col2 | r34col34 | '
+ 'r4col12 | '
+ ),
+ id="various spans, with 2 row header",
+ ),
+ ],
+)
+def test_cells_to_html(input_cells, expected_html):
+ assert tables.cells_to_html(input_cells) == expected_html
+
+
+@pytest.mark.parametrize(
+ ("input_cells", "expected_cells"),
+ [
+ pytest.param(
+ [
+ {"row_nums": [0], "column_nums": [0], "cell text": "h1col1", "column header": True},
+ {"row_nums": [0], "column_nums": [1], "cell text": "h1col2", "column header": True},
+ {"row_nums": [0], "column_nums": [2], "cell text": "h1col2", "column header": True},
+ {
+ "row_nums": [1],
+ "column_nums": [0],
+ "cell text": "row1col1",
+ "column header": False,
+ },
+ {
+ "row_nums": [1],
+ "column_nums": [1],
+ "cell text": "row1col2",
+ "column header": False,
+ },
+ {
+ "row_nums": [1],
+ "column_nums": [2],
+ "cell text": "row1col3",
+ "column header": False,
+ },
+ {
+ "row_nums": [2],
+ "column_nums": [0],
+ "cell text": "row2col1",
+ "column header": False,
+ },
+ {
+ "row_nums": [2],
+ "column_nums": [1],
+ "cell text": "row2col2",
+ "column header": False,
+ },
+ {
+ "row_nums": [2],
+ "column_nums": [2],
+ "cell text": "row2col3",
+ "column header": False,
+ },
+ ],
+ [
+ {"row_nums": [0], "column_nums": [0], "cell text": "h1col1", "column header": True},
+ {"row_nums": [0], "column_nums": [1], "cell text": "h1col2", "column header": True},
+ {"row_nums": [0], "column_nums": [2], "cell text": "h1col2", "column header": True},
+ {
+ "row_nums": [1],
+ "column_nums": [0],
+ "cell text": "row1col1",
+ "column header": False,
+ },
+ {
+ "row_nums": [1],
+ "column_nums": [1],
+ "cell text": "row1col2",
+ "column header": False,
+ },
+ {
+ "row_nums": [1],
+ "column_nums": [2],
+ "cell text": "row1col3",
+ "column header": False,
+ },
+ {
+ "row_nums": [2],
+ "column_nums": [0],
+ "cell text": "row2col1",
+ "column header": False,
+ },
+ {
+ "row_nums": [2],
+ "column_nums": [1],
+ "cell text": "row2col2",
+ "column header": False,
+ },
+ {
+ "row_nums": [2],
+ "column_nums": [2],
+ "cell text": "row2col3",
+ "column header": False,
+ },
+ ],
+ id="identical tables, no changes expected",
+ ),
+ pytest.param(
+ [
+ {"row_nums": [0], "column_nums": [0], "cell text": "h1col1", "column header": True},
+ {"row_nums": [0], "column_nums": [2], "cell text": "h1col2", "column header": True},
+ {
+ "row_nums": [1],
+ "column_nums": [0],
+ "cell text": "row1col1",
+ "column header": False,
+ },
+ {
+ "row_nums": [1],
+ "column_nums": [1],
+ "cell text": "row1col2",
+ "column header": False,
+ },
+ {
+ "row_nums": [2],
+ "column_nums": [0],
+ "cell text": "row2col1",
+ "column header": False,
+ },
+ {
+ "row_nums": [2],
+ "column_nums": [1],
+ "cell text": "row2col2",
+ "column header": False,
+ },
+ {
+ "row_nums": [2],
+ "column_nums": [2],
+ "cell text": "row2col3",
+ "column header": False,
+ },
+ ],
+ [
+ {"row_nums": [0], "column_nums": [0], "cell text": "h1col1", "column header": True},
+ {"row_nums": [0], "column_nums": [1], "cell text": "", "column header": True},
+ {"row_nums": [0], "column_nums": [2], "cell text": "h1col2", "column header": True},
+ {
+ "row_nums": [1],
+ "column_nums": [0],
+ "cell text": "row1col1",
+ "column header": False,
+ },
+ {
+ "row_nums": [1],
+ "column_nums": [1],
+ "cell text": "row1col2",
+ "column header": False,
+ },
+ {"row_nums": [1], "column_nums": [2], "cell text": "", "column header": False},
+ {
+ "row_nums": [2],
+ "column_nums": [0],
+ "cell text": "row2col1",
+ "column header": False,
+ },
+ {
+ "row_nums": [2],
+ "column_nums": [1],
+ "cell text": "row2col2",
+ "column header": False,
+ },
+ {
+ "row_nums": [2],
+ "column_nums": [2],
+ "cell text": "row2col3",
+ "column header": False,
+ },
+ ],
+ id="missing column in header and in the middle",
+ ),
+ pytest.param(
+ [
+ {
+ "row_nums": [0, 1],
+ "column_nums": [0],
+ "cell text": "h12col1",
+ "column header": True,
+ },
+ {
+ "row_nums": [0],
+ "column_nums": [1, 2],
+ "cell text": "h1col23",
+ "column header": True,
+ },
+ {"row_nums": [1], "column_nums": [1], "cell text": "h2col2", "column header": True},
+ {
+ "row_nums": [1],
+ "column_nums": [2, 3],
+ "cell text": "h2col34",
+ "column header": True,
+ },
+ {
+ "row_nums": [2],
+ "column_nums": [0],
+ "cell text": "r3col1",
+ "column header": False,
+ },
+ {
+ "row_nums": [2, 3],
+ "column_nums": [2, 3],
+ "cell text": "r34col34",
+ "column header": False,
+ },
+ {
+ "row_nums": [3],
+ "column_nums": [0, 1],
+ "cell text": "r4col12",
+ "column header": False,
+ },
+ ],
+ [
+ {
+ "row_nums": [0, 1],
+ "column_nums": [0],
+ "cell text": "h12col1",
+ "column header": True,
+ },
+ {
+ "row_nums": [0],
+ "column_nums": [1, 2],
+ "cell text": "h1col23",
+ "column header": True,
+ },
+ {"row_nums": [0], "column_nums": [3], "cell text": "", "column header": True},
+ {"row_nums": [1], "column_nums": [1], "cell text": "h2col2", "column header": True},
+ {
+ "row_nums": [1],
+ "column_nums": [2, 3],
+ "cell text": "h2col34",
+ "column header": True,
+ },
+ {
+ "row_nums": [2],
+ "column_nums": [0],
+ "cell text": "r3col1",
+ "column header": False,
+ },
+ {"row_nums": [2], "column_nums": [1], "cell text": "", "column header": False},
+ {
+ "row_nums": [2, 3],
+ "column_nums": [2, 3],
+ "cell text": "r34col34",
+ "column header": False,
+ },
+ {
+ "row_nums": [3],
+ "column_nums": [0, 1],
+ "cell text": "r4col12",
+ "column header": False,
+ },
+ ],
+ id="missing column in header and in the middle in table with spans",
+ ),
+ ],
+)
+def test_fill_cells(input_cells, expected_cells):
+ def sort_cells(cells):
+ return sorted(cells, key=lambda x: (x["row_nums"], x["column_nums"]))
+
+ assert sort_cells(tables.fill_cells(input_cells)) == sort_cells(expected_cells)
+
+
def test_padded_results_has_right_dimensions(table_transformer, example_image):
str_class_name2idx = tables.get_class_map("structure")
# a simpler mapping so we keep all structure in the returned objs below for test
diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py
index e6fd9f15..d0586119 100644
--- a/unstructured_inference/__version__.py
+++ b/unstructured_inference/__version__.py
@@ -1 +1 @@
-__version__ = "0.7.34" # pragma: no cover
+__version__ = "0.7.35" # pragma: no cover
diff --git a/unstructured_inference/models/tables.py b/unstructured_inference/models/tables.py
index 48f4c383..d639eb62 100644
--- a/unstructured_inference/models/tables.py
+++ b/unstructured_inference/models/tables.py
@@ -648,11 +648,8 @@ def structure_to_cells(table_structure, tokens):
def fill_cells(cells: List[dict]) -> List[dict]:
- """add empty cells to pad cells that spans multiple rows for html conversion
-
- For example if a cell takes row 0 and 1 and column 0, we add a new empty cell at row 1 and
- column 0. This padding ensures the structure of the output table is intact. In this example the
- cell data is {"row_nums": [0, 1], "column_nums": [0], ...}
+ """Fill the missing cells in the table by adding cells with empty text
+ where no cells were detected by the model.
A cell contains the following keys relevent to the html conversion:
row_nums: List[int]
@@ -663,28 +660,60 @@ def fill_cells(cells: List[dict]) -> List[dict]:
than one numbers
cell text: str
the text in this cell
+ column header: bool
+ whether this cell is a column header
"""
- new_cells = cells.copy()
+ table_rows_no = max({row for cell in cells for row in cell["row_nums"]})
+ table_cols_no = max({col for cell in cells for col in cell["column_nums"]})
+ filled = np.zeros((table_rows_no + 1, table_cols_no + 1), dtype=bool)
for cell in cells:
- for extra_row in sorted(cell["row_nums"][1:]):
- new_cell = cell.copy()
- new_cell["row_nums"] = [extra_row]
- new_cell["cell text"] = ""
- new_cells.append(new_cell)
+ for row in cell["row_nums"]:
+ for col in cell["column_nums"]:
+ filled[row, col] = True
+ # add cells for which filled is false
+ header_rows = {row for cell in cells if cell["column header"] for row in cell["row_nums"]}
+ new_cells = cells.copy()
+ not_filled_idx = np.where(filled == False) # noqa: E712
+ for row, col in zip(not_filled_idx[0], not_filled_idx[1]):
+ new_cell = {
+ "row_nums": [row],
+ "column_nums": [col],
+ "cell text": "",
+ "column header": row in header_rows,
+ }
+ new_cells.append(new_cell)
return new_cells
-def cells_to_html(cells):
- """Convert table structure to html format."""
+def cells_to_html(cells: List[dict]) -> str:
+ """Convert table structure to html format.
+
+ Args:
+ cells: List of dictionaries representing table cells, where each dictionary has the
+ following format:
+ {
+ "row_nums": List[int],
+ "column_nums": List[int],
+ "cell text": str,
+ "column header": bool,
+ }
+ Returns:
+ str: HTML table string
+ """
cells = sorted(fill_cells(cells), key=lambda k: (min(k["row_nums"]), min(k["column_nums"])))
table = ET.Element("table")
current_row = -1
+ table_header = None
+ table_has_header = any(cell["column header"] for cell in cells)
+ if table_has_header:
+ table_header = ET.SubElement(table, "thead")
+
+ table_body = ET.SubElement(table, "tbody")
for cell in cells:
this_row = min(cell["row_nums"])
-
attrib = {}
colspan = len(cell["column_nums"])
if colspan > 1:
@@ -695,11 +724,12 @@ def cells_to_html(cells):
if this_row > current_row:
current_row = this_row
if cell["column header"]:
+ table_subelement = table_header
cell_tag = "th"
- row = ET.SubElement(table, "thead")
else:
+ table_subelement = table_body
cell_tag = "td"
- row = ET.SubElement(table, "tr")
+ row = ET.SubElement(table_subelement, "tr") # type: ignore
tcell = ET.SubElement(row, cell_tag, attrib=attrib)
tcell.text = cell["cell text"]
|