-
Notifications
You must be signed in to change notification settings - Fork 27
/
scrape_docs.py
executable file
·84 lines (72 loc) · 3.27 KB
/
scrape_docs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/usr/bin/env python
# Prerequisites: pip install -e '.[scrape_docs]'
# Usage: scrape_docs.py --help
from urllib.parse import urljoin

import click
from bs4 import BeautifulSoup
import requests
# Heading tags whose child anchors are scraped.
_HEADING_TAGS = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')

# Marker standing in for one indentation level. Leading spaces have a special
# meaning in rst documents, so a replacement marker is emitted instead.
_INDENT_MARKER = '|indent| '

# Seconds to wait for the documentation server before giving up, so that an
# unresponsive host cannot hang the script indefinitely.
_REQUEST_TIMEOUT = 30


def _get_indentation_levels(heading_tags, heading_tag):
    """Return how many indentation levels the passed heading tag requires.

    h1 is no indentation, h2 is one indentation level, h3 is two, and so
    on. A tag that is not listed in ``heading_tags`` gets no indentation.
    """
    if heading_tag in heading_tags:
        return int(heading_tag[1:]) - 1
    return 0


def _format_rst_link(text, target, indent_count):
    """Return an rst hyperlink prefixed with one marker per indent level."""
    return f'{_INDENT_MARKER * indent_count}`{text} <{target}>`_'


def _format_csv_row(left_col):
    """Return a two-column CSV row: ``left_col`` quoted, right column empty.

    Embedded double quotes are doubled, as CSV requires, so that a headline
    containing a quote character does not break the row.
    """
    escaped = left_col.replace('"', '""')
    return f'"{escaped}",'


@click.command()
@click.option(
    '--output', '-o', default='csv',
    type=click.Choice(['debug', 'rst', 'csv']), show_choices=True,
    help='''Output format "debug" prints headline level, headline text and the
    scraped link/anchor; "rst" gives a restructuredText formatted hyperlink
    including indentation according to headline levels. Spaces at the beginning
    of lines have a special meaning in rst documents, thus the hardcoded
    replacement marker "|indent|" is used; "csv" is a two-column comma
    separated value format that includes the results of "rst" as the left
    column's contents.''')
@click.argument('URL')
def scrape(output, url):
    '''Scrape one chapter of Admin API docs and spit out in various formats.

    URL is the address of the Synapse Admin API docs chapter. For example:
    https://element-hq.github.io/synapse/latest/admin_api/rooms.html

    The default output format is "csv", which gives a two column CSV table
    containing restructuredText formatted hyperlinks and a headline.
    '''
    # NOTE: the docstring above is what click shows as the --help text, so
    # implementation notes are kept in comments instead.
    try:
        response = requests.get(url, timeout=_REQUEST_TIMEOUT)
        # Fail loudly on 4xx/5xx instead of scraping an error page.
        response.raise_for_status()
    except requests.RequestException as err:
        raise click.ClickException(f'Could not fetch {url}: {err}') from err

    soup = BeautifulSoup(response.text, 'html.parser')

    # Headings and anchors are collected in document order, so in debug
    # output each heading line is followed by the details of its anchor.
    for e in soup.find_all([*_HEADING_TAGS, 'a']):
        if e.name in _HEADING_TAGS:
            if output == 'debug':
                print(f'{e.name}: {e.text}')
            continue

        # Only anchors that sit directly inside a heading are of interest.
        if e.parent.name not in _HEADING_TAGS:
            continue

        # Anchors without an href cannot be linked to; skip them rather
        # than crashing with a KeyError.
        link = e.get('href')
        if link is None:
            continue

        indent_count = _get_indentation_levels(_HEADING_TAGS, e.parent.name)

        if output == 'debug':
            print(f'Element text:\t{e.text}\nLink/Anchor:\t{link}')
            print(f'Indentations:\t{indent_count}')
            print()  # Final spacing only with debug format
            continue

        # Resolve the scraped link/anchor against the chapter URL. This
        # works for any chapter address, not only ones that contain an
        # '/admin_api/' path segment.
        fulllink = urljoin(url, link)
        rst = _format_rst_link(e.text, fulllink, indent_count)
        if output == 'csv':
            print(_format_csv_row(rst))
        else:
            print(rst)
# Run the click command only when executed as a script, not when imported.
if __name__ == "__main__":
    scrape()