-
Notifications
You must be signed in to change notification settings - Fork 1
/
analysis.sh
executable file
·191 lines (157 loc) · 6.99 KB
/
analysis.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
#!/usr/bin/env bash
#######################################
# Entry point of the analysis: set up the configuration, (re)create the CSV
# output files with their header rows, enumerate the documentation pages and
# pass each one to evaluate_page.
#
# Page enumeration:
#   - README.md in the root of $repo_dir is emitted with page id 0.
#   - Files under $wiki_dir with a supported markup extension whose file name
#     does not start with '_' are emitted with ids 1, 2, ... in sorted order.
#
# Globals:
#   exclude_path, regex_path, repo_dir, wiki_dir (written and exported)
#   repo_name (exported; not assigned anywhere in this function)
# Arguments:
#   None
# Outputs:
#   Truncates and writes headers to output/{matches,sources,pages,revisions}.csv
# Returns:
#   Non-zero if the output directory cannot be created; otherwise the status
#   of the page loop.
#######################################
run_evaluate() {
  local page

  # File of code elements to ignore (read by evaluate_revision)
  exclude_path="repo/.DOCER_exclude"
  # List of regular expressions to match code elements
  regex_path="tool/regex_list.txt"
  # Set directories for the current repository
  repo_dir="repo"
  wiki_dir="wiki"
  # Export every configuration value, so the exported worker functions see the
  # same settings when they are run from a child bash process.
  export exclude_path
  export regex_path
  export repo_name
  export repo_dir
  export wiki_dir

  # Create output directory; without it every redirect below would fail
  mkdir -p output || return

  # Print headers for the CSV output files
  printf '%s\n' "page_id,rev_id,code_element,count" > "output/matches.csv"
  printf '%s\n' "page_id,rev_id,code_element,file_name,line_number" > "output/sources.csv"
  printf '%s\n' "page_id,page_type,page_name" > "output/pages.csv"
  printf '%s\n' "page_id,rev_id,rev_SHA,rev_timestamp,doc_SHA,doc_timestamp" > "output/revisions.csv"

  # Records are NUL-delimited "<page_id> <page_name>" strings
  while IFS= read -r -d '' page; do
    evaluate_page "$page"
  done < <(
    # Find README.md in the root of the source code repository (page id 0)
    find "$repo_dir" -type f -printf '%P\0' 2> /dev/null \
      | sort -z \
      | grep -axzF README.md \
      | awk 'BEGIN { RS="\0"; ORS="\0" }; { print 0, $0 }'

    # Find documentation files in the wiki repository (page ids 1..N).
    # The first grep keeps the list of valid markup extensions:
    # https://github.com/github/markup#markups
    # The second grep drops file names that start with '_'.
    find "$wiki_dir" -type f -printf '%P\0' 2> /dev/null \
      | sort -z \
      | grep -aizP '\.(markdown|mdown|mkdn|md|textile|rdoc|org|creole|mediawiki|wiki|rst|asciidoc|adoc|asc|pod)$' \
      | grep -aivzE '(^|/)_[^/]*\.' \
      | awk 'BEGIN { RS="\0"; ORS="\0" }; { print NR, $0 }'
  )
}
#######################################
# Record one documentation page in output/pages.csv and evaluate it against
# two revisions of the source repository:
#   revision 1 - the "snapshot": the last commit printed by
#                `git rev-list --max-age=<page timestamp>` on $repo_dir, where
#                the timestamp is that of the page's latest commit. Falls back
#                to HEAD when no such commit exists (page updated after the
#                latest revision, or the lookup failed).
#   revision 2 - the current HEAD of $repo_dir.
#
# Globals:
#   repo_dir, wiki_dir (read)
#   page_id, page_dir, page_name (written and exported)
#   snapshot_timestamp, snapshot_SHA (written)
# Arguments:
#   $1 - "<page_id> <page_name>"; page id 0 denotes a page in $repo_dir,
#        any other id a page in $wiki_dir
# Outputs:
#   Appends one row to output/pages.csv
# Returns:
#   0 when $repo_dir has no resolvable HEAD, otherwise the status of the last
#   evaluate_revision call.
#######################################
evaluate_page() {
  local latest_SHA

  # Split on the first space only, so that page names with leading, trailing
  # or repeated spaces are kept exactly as they are on disk.
  page_id="${1%% *}"
  if [[ "$1" == *' '* ]]; then
    page_name="${1#* }"
  else
    page_name=""
  fi

  # Set the directory to the page location
  if ((page_id == 0)); then
    page_dir="$repo_dir"
  else
    page_dir="$wiki_dir"
  fi

  # The page type is the last path component of the page directory
  printf '%s,%s,"%s"\n' "$page_id" "${page_dir##*/}" "${page_name//\"/\"\"}" >> "output/pages.csv"

  # Get when the page was last updated. The './' prefix matches the pathspec
  # form used by evaluate_revision and keeps a name starting with ':' from
  # being read as pathspec magic.
  snapshot_timestamp="$(git -C "$page_dir" log -1 --first-parent --pretty=format:%ct HEAD -- "./$page_name")"
  snapshot_SHA="$(
    git -C "$repo_dir" rev-list --max-age="$snapshot_timestamp" --first-parent HEAD 2> /dev/null \
      | tail -n 1
  )"

  export page_id
  export page_dir
  export page_name

  latest_SHA="$(git -C "$repo_dir" rev-list -1 --first-parent HEAD 2> /dev/null)"
  # Nothing to compare against when the source repository has no HEAD
  if [[ -z "$latest_SHA" ]]; then
    return 0
  fi

  # Compare the snapshot and the latest revision. If the page was updated
  # after the latest revision, the snapshot is the latest revision itself.
  # stdin is detached so the callee can never consume the caller's page list.
  evaluate_revision "1 ${snapshot_SHA:-$latest_SHA}" < /dev/null
  evaluate_revision "2 $latest_SHA" < /dev/null
}
#######################################
# Evaluate the current page against one revision of the source repository.
#
# Steps:
#   1. Resolve the revision timestamp and the page commit selected by
#      `git rev-list -1 --min-age=<revision timestamp>` for the page.
#   2. Append the revision row to output/revisions.csv.
#   3. Extract the code elements of the page (matches of the regular
#      expressions in $regex_path, minus the lines listed in $exclude_path).
#   4. Look the elements up in the source tree of the revision, both as
#      whole-word occurrences in file contents (README.md excluded) and as
#      file names / trailing sub-paths.
#   5. Append the per-occurrence rows to output/sources.csv and the
#      per-element counts (0 for unmatched elements) to output/matches.csv.
#
# Globals:
#   repo_dir, page_dir, page_id, page_name, regex_path, exclude_path (read)
# Arguments:
#   $1 - "<rev_id> <rev_SHA>"
# Outputs:
#   Appends to output/revisions.csv, output/sources.csv, output/matches.csv
# Returns:
#   0 when the page does not exist at the revision or has no code elements;
#   otherwise the status of the last output loop.
#######################################
evaluate_revision() {
  local rev_id rev_SHA rev_timestamp doc_SHA doc_timestamp page_found
  local file_names code_elements matched_elements
  local SHA file_name line_number code_element count

  read -r rev_id rev_SHA <<< "$1"
  rev_timestamp="$(git -C "$repo_dir" log -1 --first-parent --pretty=format:%ct "$rev_SHA")"
  doc_SHA="$(git -C "$page_dir" rev-list -1 --min-age="$rev_timestamp" --first-parent HEAD -- "./$page_name")"
  # Return early if documentation SHA is not found
  if [[ -z "$doc_SHA" ]]; then
    return 0
  fi

  doc_timestamp="$(git -C "$page_dir" log -1 --first-parent --pretty=format:%ct "$doc_SHA")"
  page_found="$(git -C "$page_dir" ls-tree "$doc_SHA" --name-only "./$page_name")"
  # Return early if page is not found
  if [[ -z "$page_found" ]]; then
    return 0
  fi

  printf '%s,%s,%s,%s,%s,%s\n' "$page_id" "$rev_id" "$rev_SHA" "$rev_timestamp" "$doc_SHA" "$doc_timestamp" >> "output/revisions.csv"

  # List of unique code elements in the current documentation page
  # that match the list of regular expressions provided.
  # The regex file is addressed through $PWD because git runs with -C.
  # A missing exclude file yields an empty pattern list, i.e. no exclusions.
  code_elements="$(
    git -C "$page_dir" grep -howIP -f "$PWD/$regex_path" "$doc_SHA" -- "./$page_name" \
      | sort -u \
      | grep -vxF -f <(cat "$exclude_path" 2> /dev/null)
  )"
  # Nothing to look up: the searches below need at least one pattern, and
  # every output loop would be empty anyway.
  if [[ -z "$code_elements" ]]; then
    return 0
  fi

  # List of file names found in this revision, one per line
  # (names containing a newline are dropped)
  file_names="$(
    git -C "$repo_dir" ls-tree -rz "$rev_SHA" --name-only \
      | sort -uz \
      | sed -nz '/\n/!p' \
      | tr '\0' '\n'
  )"

  # List of code elements in the repository (excluding ./README.md)
  # that match the code elements found in the documentation and file names.
  # One line per occurrence, sorted so that uniq -c can count them.
  matched_elements="$(
    {
      # Search for code elements in the file contents
      git -C "$repo_dir" grep -howFI -f <(printf '%s' "$code_elements") "$rev_SHA" -- ':!./README.md'
      # Intersection of code elements and file names.
      # Recursively get subpaths (path/to/file -> to/file -> file); the loop
      # runs in a pipeline subshell, so the outer file_names is untouched.
      while ((${#file_names})); do
        # Remove empty lines and print the file names
        grep -v '^$' <(printf '%s' "$file_names")
        # Remove first part of the path component
        file_names="$(sed -r 's/[^\/]*(\/|$)//' <(printf '%s' "$file_names"))"
      done \
        | sed -r 's/(.*)/\/\1\n\1/g' \
        | grep -xF -f <(printf '%s' "$code_elements")
      # The sed above emits every path twice: with and without a leading '/'
    } | sort
  )"

  # Extract source information of code elements.
  # The loop parses each record as "<SHA>:<file>" NUL "<line>" NUL "<match>" LF.
  while IFS=: read -r -d '' SHA file_name \
    && read -r -d '' line_number \
    && read -r code_element; do
    printf '%s,%s,"%s","%s",%s\n' "$page_id" "$rev_id" "${code_element//\"/\"\"}" "${file_name//\"/\"\"}" "$line_number"
  done < <(
    git -C "$repo_dir" grep -aznowFI -f <(printf '%s' "$code_elements") "$rev_SHA" -- ':!./README.md'
  ) >> "output/sources.csv"

  # List of code elements that are not matched (count 0)
  while read -r code_element; do
    printf '%s,%s,"%s",0\n' "$page_id" "$rev_id" "${code_element//\"/\"\"}"
  done < <(
    # Subtraction of matched elements from code elements
    grep -vxF -f <(printf '%s' "$matched_elements") <(printf '%s' "$code_elements")
  ) >> "output/matches.csv"

  # List of code elements that are matched, with their occurrence count.
  # printf (not a here-string) keeps an empty list from producing a blank row.
  while read -r count code_element; do
    printf '%s,%s,"%s",%s\n' "$page_id" "$rev_id" "${code_element//\"/\"\"}" "$count"
  done < <(
    printf '%s' "$matched_elements" | uniq -c
  ) >> "output/matches.csv"
}
# Export the per-page and per-revision workers to child bash processes,
# then start the analysis.
export -f evaluate_page evaluate_revision
run_evaluate