From 6a8ea32b8574bd762f5f6b3a8384ed7d67d044aa Mon Sep 17 00:00:00 2001 From: Jackson Date: Mon, 14 Oct 2024 11:30:23 +0200 Subject: [PATCH 1/3] make scrape_docs.py csv consistent with format - ignoring h1 - no indents from h2 --- scrape_docs.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/scrape_docs.py b/scrape_docs.py index 0320082..49b2432 100755 --- a/scrape_docs.py +++ b/scrape_docs.py @@ -32,12 +32,9 @@ def scrape(output, url): apidoc = requests.get(chapter).text soup = BeautifulSoup(apidoc, 'html.parser') - any_heading_tag = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'] + any_heading_tag = ['h2', 'h3', 'h4', 'h5', 'h6'] elements = soup.find_all([*any_heading_tag, 'a'],) - if output == 'csv': - print('"Synapse Admin API","synadm command(s)"') - for e in elements: if e.name in any_heading_tag and output == 'debug': print(f'{e.name}: {e.text}') @@ -52,10 +49,10 @@ def scrape(output, url): spacing = '' for h in any_heading_tag: if e.parent.name == h: - # h1 is no spacing (decrease by 1), - # h2 is 2 spaces, h3 is 4.... + # h2 is no spacing (decrease by 2), + # h3 is 2 spaces, h4 is 4.... 
# two literal spaces are replaced by '|indent| ' - spacing_count = int(e.parent.name[-1]) - 1 + spacing_count = int(e.parent.name[-1]) - 2 for val in range(0, spacing_count * 2): spacing += '|indent| ' rst = f'{spacing}`{e.text} <{fulllink}>`_' From 612298cf400da8be9cee6257e50bbb26c13286aa Mon Sep 17 00:00:00 2001 From: Jackson Date: Mon, 14 Oct 2024 11:42:45 +0200 Subject: [PATCH 2/3] CONTRIBUTING.md: add pip install, and handling commands --- CONTRIBUTING.md | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index a2b0da1..4ccb830 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -64,7 +64,13 @@ We are maintaining `synadm` in our spare time and currently are not sponsored by We keep track of which Synapse Admin API's `synadm` supports in a set of tables on [API to CLI Mapping](https://synadm.readthedocs.io/en/latest/features.html). The structure of this page follows the layout of the official [Synapse Admin API documentation](https://element-hq.github.io/synapse/latest/usage/administration/admin_api/index.html). Each table represents one main chapter of the Synapse documentation. -In our documentation source, the page is defined by [features.rst](https://github.com/JOJ0/synadm/blob/master/doc/source/features.rst), which contains multiple CSV files, each representing a table. To assist with maintaining this page, we offer a [web scraper tool](https://github.com/JOJ0/synadm/blob/master/scrape_docs.py) that pulls data from the Synapse Admin API docs and creates an initial version of such a CSV table. A basic usage example is +In our documentation source, the page is defined by [features.rst](https://github.com/JOJ0/synadm/blob/master/doc/source/features.rst), which contains multiple CSV files, each representing a table. 
To assist with maintaining this page, we offer a [web scraper tool](https://github.com/JOJ0/synadm/blob/master/scrape_docs.py) that pulls data from the Synapse Admin API docs and creates an initial version of such a CSV table. To get started with using the tool, first run this command in the repository: + +``` +pip install -e '.[scrape_docs]' +``` + +A basic usage example is: ``` ./scrape_docs.py -o csv https://element-hq.github.io/synapse/latest/admin_api/rooms.html @@ -77,12 +83,20 @@ which prints a two-column CSV table containing restructuredText formatted hyperl ``` This would directly link to the `USER_ID` argument's documentation of that command. + Linking to an option is also possible: ``` :option:`synadm media list -r` ``` +If there's no `synadm` command for the corresponding item, leave the right +column empty. If the item is a section that has no real API and is nested with +commands (e.g. [delete local media][dellocalmedia]), use the `—` character. + +[dellocalmedia]:https://element-hq.github.io/synapse/latest/admin_api/media_admin_api.html#delete-local-media + + Due to a shortcoming of Sphinx it is currently not possible to link to a plain command (without any option or argument). Also see `scrape_docs.py --help` and the [existing CSV files](https://github.com/JOJ0/synadm/tree/master/doc/source/features). From bc60b173c85adf85fe1f0ab65f1c2609bcb90a59 Mon Sep 17 00:00:00 2001 From: J0J0 Todos Date: Tue, 15 Oct 2024 08:18:43 +0200 Subject: [PATCH 3/3] scrape_docs: Fix h1 exclusion and refactor - h1 tag shouldn't be excluded in output - Refactor for readability: Add a separate function for finding "how many indentations" are required. 
--- scrape_docs.py | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/scrape_docs.py b/scrape_docs.py index 49b2432..79255cf 100755 --- a/scrape_docs.py +++ b/scrape_docs.py @@ -28,11 +28,24 @@ def scrape(output, url): The default output format is "csv", which gives a two column CSV table containing restructuredText formatted hyperlinks and a headline. ''' + def get_indentation_levels(heading_tags, heading_tag): + """Returns how many indentation levels are required depending on the + passed heading tag + + h1 is no indentation, + h2 is one indentation level, + h3 is two, and so on... + """ + for h in heading_tags: + if heading_tag == h: + return int(heading_tag[-1]) - 1 + return 0 + chapter = url apidoc = requests.get(chapter).text soup = BeautifulSoup(apidoc, 'html.parser') - any_heading_tag = ['h2', 'h3', 'h4', 'h5', 'h6'] + any_heading_tag = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'] elements = soup.find_all([*any_heading_tag, 'a'],) for e in elements: @@ -43,18 +56,18 @@ def scrape(output, url): link = e['href'] if output == 'debug': print(f'Element text:\t{e.text}\nLink/Anchor:\t{link}') + indent_count = get_indentation_levels(any_heading_tag, + e.parent.name) + print(f'Indentations:\t{indent_count}') if output in ['rst', 'csv']: parts = chapter.split('/admin_api/') fulllink = f'{parts[0]}/admin_api/{parts[1]}{link}' + indent_count = get_indentation_levels(any_heading_tag, + e.parent.name) spacing = '' - for h in any_heading_tag: - if e.parent.name == h: - # h2 is no spacing (decrease by 2), - # h3 is 2 spaces, h4 is 4.... - # two literal spaces are replaced by '|indent| ' - spacing_count = int(e.parent.name[-1]) - 2 - for val in range(0, spacing_count * 2): - spacing += '|indent| ' + for val in range(0, indent_count): + # '|indent| ' represents one indentation level + spacing += '|indent| ' rst = f'{spacing}`{e.text} <{fulllink}>`_' if output == 'csv': left_col = f'"{rst}"'