diff --git a/scripts/html-table-to-csv b/scripts/html-table-to-csv new file mode 100644 index 0000000..2d7eecf --- /dev/null +++ b/scripts/html-table-to-csv @@ -0,0 +1,63 @@ +#!/usr/bin/env python3 + +__author__ = "Nathan Lloyd" + +from html.parser import HTMLParser +from html import escape +from sys import stdin, stdout, exit + +DATA_ELEMENTS = ["td", "th"] +ROW_ELEMENTS = ["tr", "thead", "tbody", "tfoot"] +BUFFER_SIZE = 4096 + +class TableParser(HTMLParser): + """ + A custom HTML parser that extracts tabular data from HTML tables. + + This parser inherits from the `HTMLParser` class and overrides its methods + to handle table-related HTML elements and extract data from them. + """ + def __init__(self, separator='\t', newline='\n'): + """ Initializes the TableParser with optional separator + and newline characters. """ + super().__init__() + self.separator = separator + self.newline = newline + self._in_table: bool = False + self._at_data = False + + def handle_starttag(self, tag, attrs): + """ Method handles by detecting table elements and data elements + (e.g.,