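# report.py
# Build a CSV report of references to code elements found in a repository's documentation,
# plus a summary of references that appear to become outdated between the snapshot
# revision (rev_1) and the current revision (rev_2).
# Expects two command-line arguments: the repository name in owner/repo form and the
# GitHub Actions run id (used only to link to the full report).
# Reads output/matches.csv, output/pages.csv, output/revisions.csv and output/sources.csv;
# writes output/report.csv, output/summary.csv and output/summary.md.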
import sys
from pathlib import Path
from numpy import where
from pandas import DataFrame, read_csv
repo_name = sys.argv[1]
run_id = sys.argv[2]
report_outfile = 'output/report.csv'
summary_outfile = 'output/summary.csv'
# Read the intermediate CSV files from the output directory
matches = read_csv('output/matches.csv', keep_default_na=False)
pages = read_csv('output/pages.csv', keep_default_na=False)
revisions = read_csv('output/revisions.csv', keep_default_na=False)
sources = read_csv('output/sources.csv', keep_default_na=False)
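# Expected columns, inferred from how the frames are used below (each file may contain more):
#   matches:   code_element, page_id, rev_id, count
#   pages:     page_id, page_type, page_name
#   revisions: page_id, rev_id, rev_SHA, rev_timestamp, doc_SHA, doc_timestamp
#   sources:   page_id, code_element, file_name, line_number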
# Link to source code on GitHub
sources = sources.merge(revisions[['page_id', 'rev_id', 'rev_SHA']])
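# Keep only the most recent location (highest rev_id) of each code element on each page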
sources = sources.sort_values(by=['rev_id', 'file_name'], ascending=False)
sources = sources.drop_duplicates(subset=['page_id', 'code_element'])
sources['source_link'] = f"https://github.com/{repo_name}/blob/" + \
    sources['rev_SHA'] + '/' + sources['file_name'] + \
    '#L' + sources['line_number'].astype(str)
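# e.g. https://github.com/<owner>/<repo>/blob/<rev_SHA>/<file_name>#L<line_number> (placeholders illustrative)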
# Reshape revisions as columns, group by pages, select timestamp and SHA
details = revisions[['page_id', 'rev_id', 'rev_SHA', 'rev_timestamp']]
details = details.pivot(index=['page_id'], columns=['rev_id'])
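# The pivot produces MultiIndex columns such as ('rev_SHA', 1), ('rev_timestamp', 1), ('rev_SHA', 2), ...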
# Rename columns (rev_col, id) as rev_col_id
details.columns = details.columns.map('{0[0]}_{0[1]}'.format)
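# Order the flattened column names by their trailing revision number (rev_SHA_1, rev_timestamp_1, rev_SHA_2, ...)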
details_cols = sorted(details.columns.to_list(), \
    key=lambda col_name: int(col_name.split('_')[-1]))
# Combine the pivoted revision details with the page info and the per-page documentation SHA and timestamp
unique = revisions[['page_id', 'doc_SHA', 'doc_timestamp']].drop_duplicates()
details = DataFrame(details.to_records()).merge(pages).merge(unique)
details = details.reindex(columns=['page_id', 'page_type', 'page_name', \
    *details_cols, 'doc_SHA', 'doc_timestamp'])
# Link to the documentation on GitHub
details['doc_link'] = where(details['page_type'] == 'repo', \
    # Repository
    f"https://github.com/{repo_name}/blob/" + \
    details['doc_SHA'] + '/' + details['page_name'], \
    # Wiki
    f"https://github.com/{repo_name}/wiki/" + \
    details['page_name'].str.rsplit('.', n=1).str[0] + '/' + details['doc_SHA'])
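# Repository pages link to the file at the documentation commit (doc_SHA);
# wiki pages link to the page revision at wiki/<page name without extension>/<doc_SHA>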
# Reshape revisions as columns, group by code elements and pages
matches = matches.groupby(['code_element', 'page_id', 'rev_id'])['count'].sum().unstack('rev_id')
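# Result: one row per (code_element, page_id) with a column of summed counts for each rev_id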
# Drop rows whose values are all zero or NaN
matches = matches.loc[~(matches.isna() | matches.eq(0)).all(axis=1)]
# Remove files if there are no results
if not len(matches):
    Path(report_outfile).unlink(missing_ok=True)
    Path(summary_outfile).unlink(missing_ok=True)
    sys.exit()
# Rename each revision-id column <id> to rev_<id>
matches.columns = ['rev_' + str(col) for col in matches.columns]
# Create columns for missing revisions
rev_cols = [f"rev_{i+1}" for i in range(revisions['rev_id'].max())]
matches = matches.reindex(columns=rev_cols)
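# Revisions with no matches at all become columns of NaN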
# Include these columns in the output
output_cols = ['code_element', 'page_type', 'page_name', *rev_cols, \
    *details_cols, 'doc_SHA', 'doc_timestamp', 'doc_link', 'source_link']
# Combine the pivoted match counts with the page and revision details into the report
report = DataFrame(matches.to_records()).merge(details)
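# Missing counts and revision details are rendered as '.' in the report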
report = report.fillna('.')
report = report.merge(sources[['page_id', 'code_element', 'source_link']], how='left')
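# Rows with no entry in sources have no source link; the fill value below marks them as file-name references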
report = report.fillna('code_element is a file name')
report = report.sort_values(by=['page_type', 'page_name', 'code_element'])
report.to_csv(report_outfile, columns=output_cols, index=None)
# Select references counted in the snapshot revision (rev_1) whose count drops to zero in the current revision (rev_2)
snapshot = report['rev_1'] > 0
current = report['rev_2'] == 0
outdated = report[snapshot & current]
# Remove file if there are no results
if not len(outdated):
    Path(summary_outfile).unlink(missing_ok=True)
    sys.exit()
outdated.to_csv(summary_outfile, columns=output_cols, index=None)
num_outdated_references = len(outdated.index)
max_references_to_show = 10
if num_outdated_references:
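    # Render each outdated reference as a Markdown task-list item:
    #   - [ ] [page_name](doc_link) - [`code_element`](source_link)
    # summary.md gets the full list; stdout shows at most max_references_to_show items
    # plus a link to the complete report for the run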
    header = f"Merging this pull request may result in {num_outdated_references} outdated {'reference' if num_outdated_references == 1 else 'references'} in the documentation:"
    output_format = DataFrame('- [ ] [' + outdated['page_name'] + '](' + outdated['doc_link'] + \
        ') - [`' + outdated['code_element'] + '`](' + outdated['source_link'] + ')')
    outdated_references = [outdated_reference.strip() for outdated_reference in output_format.to_string(header=False, index=False, max_colwidth=None).split('\n')]
    partial_references = '\n'.join(outdated_references[:max_references_to_show])
    all_references = '\n'.join(outdated_references)
    with open('output/summary.md', 'w') as f:
        f.write(f"{header}\n\n{all_references}\n")
    print(f"{header}\n\n{partial_references}")
    if num_outdated_references > max_references_to_show: print('... (and more)')
    print(f"\n[View the full report](https://github.com/{repo_name}/actions/runs/{run_id})")