Change the text processing to drop lines and blocks that are empty or contain only whitespace

crocs-muni · Oct 13, 2023 · 0ca0d9a · 0ca0d9a
1 parent 05f91e3
commit 0ca0d9a
Showing 1 changed file with 6 additions and 2 deletions.
diff --git a/src/sec_certs/utils/pdf.py b/src/sec_certs/utils/pdf.py
@@ -176,8 +176,12 @@ def segmented_pdf_to_text(segmented_pdf: list[dict[str, Any]]) -> str:
                         spans = []
                         for span in line:
                             spans.append(span.strip())
-                        lines.append(" ".join(spans))
-                    block_texts.append("\n".join(lines)) # lines are separated by "\n"
+                        line = " ".join(spans)
+                        if len(line.strip()) > 0:
+                            lines.append(line)
+                    block_text = "\n".join(lines) # TODO maybe change to " ", depends how we wanna view it
+                    if len(block_text.strip()) > 0:
+                        block_texts.append(block_text) # lines are separated by "\n"
                 # deal with table which has header and rows
                 elif block["type"] == "table":
                     row_texts = []