#!/usr/bin/env python3
"""Convert HTML tables read from stdin into delimiter-separated text on stdout.

Provenance: added as ``scripts/html-table-to-csv`` by the patch
"Create html-table-to-csv" (commit ecdb3f9ec1e963b9f93825697a6a2d2506e35a31,
Nathan Lloyd, Sat, 29 Jul 2023 13:23:53 +0100).

Notes carried over from the commit message:

* This does not produce "official" CSV. The default separator is a tab, not a
  comma; pass a different ``separator`` to ``TableParser`` if needed. No CSV
  quoting is performed, so a separator occurring inside cell text is not
  protected.
* The document is never held in memory as a whole: input is parsed in
  ``BUFFER_SIZE`` chunks and each cell is written out as soon as it is
  complete, to keep memory consumption low.
* Cell text is HTML-escaped on output by default (the original author notes
  they are not an HTML security expert).

TODO (from the commit message; the element names in the first two items were
lost from the message text and are left blank rather than guessed):
- Implement `` and ``
- Add minimum `` and `` features e.g. horizontal divider line.
- Manage multiple tables by `id`
"""

__author__ = "Nathan Lloyd"

from html.parser import HTMLParser
from html import escape
from sys import stdin, stdout, exit

DATA_ELEMENTS = ["td", "th"]
# Elements whose end tag terminates the row currently being written (if any).
ROW_ELEMENTS = ["tr", "thead", "tbody", "tfoot"]
BUFFER_SIZE = 4096


class TableParser(HTMLParser):
    """Extract tabular data from HTML tables and write it as separated text.

    The parser overrides the ``HTMLParser`` callbacks to track whether it is
    inside a ``<table>`` and inside a ``<td>``/``<th>`` cell. Each finished
    cell is written to ``out`` immediately; cells are joined by ``separator``
    and rows are terminated by ``newline``.

    NOTE: nested tables are not treated specially; the first ``</table>``
    encountered ends extraction until the next ``<table>`` start tag.
    """

    def __init__(self, separator: str = '\t', newline: str = '\n', out=None,
                 escape_output: bool = True) -> None:
        """Initialise the parser.

        Args:
            separator: Text written between two cells of the same row.
            newline: Text written at the end of every row.
            out: Object with a ``write(str)`` method that receives the
                output. Defaults to ``sys.stdout``.
            escape_output: When true (the default), cell text is passed
                through ``html.escape`` before it is written.
        """
        super().__init__()
        self.separator = separator
        self.newline = newline
        self._out = stdout if out is None else out
        self._escape_output = escape_output
        self._in_table: bool = False
        # Kept separately from ``_in_table`` so that an empty or value-less
        # ``id`` attribute cannot make the "inside a table" flag falsy.
        self._table_id = None
        self._at_data = False
        self._row_open = False
        self._cells_in_row = 0
        self._cell_parts = []

    def _end_cell(self) -> None:
        """Write the cell being collected, if any, and reset the cell state."""
        if not self._at_data:
            return
        # Join first, normalise second: the parser may deliver one cell's text
        # in several pieces (inline tags, feed boundaries), and stripping each
        # piece separately would glue neighbouring words together. Collapsing
        # whitespace also keeps stray tabs/newlines out of the output.
        text = " ".join("".join(self._cell_parts).split())
        if self._escape_output:
            text = escape(text)
        if self._cells_in_row:
            # Separator goes *between* cells, never after the last one.
            self._out.write(self.separator)
        self._out.write(text)
        self._cells_in_row += 1
        self._cell_parts.clear()
        self._at_data = False

    def _end_row(self) -> None:
        """Finish the open cell and row, if any, and reset the row state."""
        self._end_cell()
        if self._row_open:
            self._out.write(self.newline)
        self._row_open = False
        self._cells_in_row = 0

    def handle_starttag(self, tag, attrs) -> None:
        """Track entry into tables, rows and data cells (``<td>``/``<th>``).

        ``</tr>`` and ``</td>``/``</th>`` are optional in HTML, so a new row
        or cell start tag also finishes whatever row or cell is still open.
        """
        if "table" == tag:
            # The id is remembered for future multi-table selection.
            self._in_table = True
            self._table_id = next((v for k, v in attrs if "id" == k), None)
            return
        if not self._in_table:
            return
        if "tr" == tag:
            self._end_row()
            self._row_open = True
        elif tag in DATA_ELEMENTS:
            self._end_cell()
            self._row_open = True
            self._at_data = True

    def handle_endtag(self, tag) -> None:
        """Finish cells, rows and tables when their end tags are seen.

        A newline is written only when a row is actually open, so the end
        tags of ``<thead>``/``<tbody>``/``<tfoot>`` following a ``</tr>`` do
        not add blank lines.
        """
        if not self._in_table:
            return
        if tag in DATA_ELEMENTS:
            self._end_cell()
        elif tag in ROW_ELEMENTS:
            self._end_row()
        elif "table" == tag:
            self._end_row()
            self._in_table = False
            self._table_id = None

    def handle_data(self, data) -> None:
        """Collect text found inside a data cell (``<td>``/``<th>``) of a table."""
        if self._in_table and self._at_data:
            self._cell_parts.append(data)

    def close(self) -> None:
        """Process any buffered input and flush a row left open at end of input."""
        super().close()
        if self._in_table:
            self._end_row()


def main() -> int:
    """Stream stdin through a ``TableParser`` writing to stdout; return 0."""
    parser = TableParser()
    while (data := stdin.read(BUFFER_SIZE)):
        # Feed the markup unmodified: escaping it first would turn every "<"
        # into "&lt;" and the parser would never see a single tag.
        parser.feed(data)
    parser.close()
    return 0


if __name__ == "__main__":
    exit(main())