-
-
Notifications
You must be signed in to change notification settings - Fork 14
/
Copy pathcompose-db.js
182 lines (147 loc) · 8.02 KB
/
compose-db.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
const DataFetcher = require('./build/utils/compose-fetcher.js');
const DataProcessor = require('./build/utils/compose-processor.js');
const DataIO = require('./build/utils/compose-io.js');
// Maximum number of commit hashes sent to the fetcher in a single batch ("page")
// when commit data is requested from GitHub.
const COMMITS_PER_PAGE = 150;
/**
 * Stops the process right away when a previous step has left a positive
 * `process.exitCode` behind.
 *
 * NOTE(review): this assumes the helper classes report failures by setting
 * `process.exitCode` rather than by throwing — confirm against their sources.
 */
function checkForExit() {
    if (process.exitCode > 0) {
        console.log(` Terminating with an exit code ${process.exitCode}.`);
        process.exit();
    }
}

/**
 * Extracts the commit log for the configured range, then the commit log of every
 * configured release, and hands each of them to the processor.
 *
 * @param {object} dataIO - Configuration holder; `first_commit`, `last_commit`,
 *     `checkout_dir` and `releases` are read from it.
 * @param {object} dataFetcher - Fetcher used to read the commit history.
 * @param {object} dataProcessor - Processor that receives the extracted logs.
 * @returns {Promise<void>}
 */
async function extractCommitLogs(dataIO, dataFetcher, dataProcessor) {
    console.log(`[*] Extracting the commit log between "${dataIO.first_commit}" and "${dataIO.last_commit}".`);
    const commitLogSize = await dataFetcher.countCommitHistory(dataIO.first_commit, dataIO.last_commit, dataIO.checkout_dir);
    const commitLog = await dataFetcher.getCommitHistory(dataIO.first_commit, dataIO.last_commit, dataIO.checkout_dir);
    checkForExit();

    // Second, we parse the extracted commit log, to generate a list of commit hashes
    // for the next step. We also try to extract the information about this being a
    // cherry-pick, and not the original commit. We can rely on the commit message body
    // containing a certain string, from which we can take the original commit hash.
    dataProcessor.processLog(commitLog, commitLogSize);
    checkForExit();

    // We also need to keep track of the commit history of each release within a version.
    // Releases can, and most often do, include commits outside of the defined range. This
    // happens when a contribution is authored before the defined range, but merged within
    // it.
    console.log(`[*] Extracting commit logs for releases.`);
    for (const release of dataIO.releases) {
        console.log(` Extracting the commit log for "${release.name}" (between "${release.from_ref}" and "${release.ref}").`);
        const releaseLog = await dataFetcher.getCommitsBetween(release.from_ref, release.ref, dataIO.checkout_dir);
        checkForExit();

        console.log(` Processing the commit log for "${release.name}".`);
        dataProcessor._processReleaseLog(release.name, releaseLog);
        checkForExit();
    }
}

/**
 * Fetches commit data from GitHub in batches of `COMMITS_PER_PAGE` hashes and
 * merges the per-batch results into one object.
 *
 * The passed array is only read; it is never modified.
 *
 * @param {object} dataFetcher - Fetcher used for the requests and the delays.
 * @param {string[]} commitHashes - Hashes of the commits to fetch.
 * @returns {Promise<object>} Merged results of all `fetchCommits()` calls.
 */
async function fetchCommitData(dataFetcher, commitHashes) {
    // GraphQL API doesn't have a filter to extract data for a list of commit hashes,
    // but it supports having multiple sub-queries within the same request, which is
    // our way in.
    //
    // While paginated queries are limited to 100 entries per page, sub-queries do not
    // appear to be similarly limited. We are still limited by the total number of nodes
    // we can theoretically fetch, which is 500 000. As such, we still want to do this
    // in batches, so the number of nodes in each request is manageable.
    console.log("[*] Fetching commit data from GitHub.");

    const commitsRaw = {};
    const totalPages = Math.ceil(commitHashes.length / COMMITS_PER_PAGE);

    // Pages are starting with 1 for better presentation.
    for (let page = 1; page <= totalPages; page++) {
        // Take a copy of the current window instead of splicing, so the caller's
        // array stays intact.
        const offset = (page - 1) * COMMITS_PER_PAGE;
        const batchHashes = commitHashes.slice(offset, offset + COMMITS_PER_PAGE);

        const batchCommits = await dataFetcher.fetchCommits(batchHashes, page, totalPages);
        checkForExit();
        Object.assign(commitsRaw, batchCommits);

        // Wait for a bit before proceeding to avoid hitting the secondary rate limit in GitHub API.
        // See https://docs.github.com/en/rest/guides/best-practices-for-integrators#dealing-with-secondary-rate-limits.
        await dataFetcher.delay(DataFetcher.API_DELAY_MSEC);

        // Add an extra delay every few requests, because the chance to trigger the hidden rate issue
        // seems to grow with the number of queries. The check looks at the number of the upcoming
        // page, so the extra pause happens after pages 7, 15, 23, and so on.
        if ((page + 1) % 8 === 0) {
            console.log("[*] Waiting a bit for the API to cool down...");
            await dataFetcher.delay(DataFetcher.API_DELAY_MSEC * 4);
        }
    }

    return commitsRaw;
}

/**
 * Builds the local commit and pull request database for the configured
 * repository and version, and saves it as
 * `<data_owner>.<data_repo>.<data_version>.json`.
 *
 * After most steps `checkForExit()` is called, which terminates the process if a
 * positive `process.exitCode` has been set in the meantime.
 *
 * @returns {Promise<void>}
 */
async function main() {
    // Getting PRs between two commits is a complicated task, and must be done in
    // multiple steps. GitHub API does not have a method for that, so we must improvise.
    // We also need to consider that there is no easy way to fetch information for
    // an arbitrary list of commits; the API can work on ranges, but not on lists.
    //
    // We do not need to run this operation constantly. Release versions don't change.
    // (Though some metadata of PRs can change, so re-indexing should be possible, on
    // demand.)
    // We also have to preconfigure some information, e.g. manually supply the tags
    // or hashes, which serve as release boundaries.
    console.log("[*] Building local commit and pull request database.");

    const dataIO = new DataIO();
    dataIO.parseArgs();
    checkForExit();
    await dataIO.loadConfig();
    checkForExit();

    const repositoryName = `${dataIO.data_owner}/${dataIO.data_repo}`;
    const databaseName = `${dataIO.data_owner}.${dataIO.data_repo}.${dataIO.data_version}.json`;
    console.log(`[*] Configured for the "${repositoryName}" repository; version ${dataIO.data_version}.`);

    const dataFetcher = new DataFetcher(dataIO.data_owner, dataIO.data_repo);
    const dataProcessor = new DataProcessor();

    if (dataIO.update_data) {
        console.log(`[*] Loading existing data to perform an update.`);
        const oldData = await dataIO.loadData(databaseName);
        dataProcessor.takeData(oldData);
    }

    console.log("[*] Checking the rate limits before.");
    await dataFetcher.checkRates();
    checkForExit();

    // First, we checkout the repository for the specified branch/tag/hash. We will
    // use it to retrieve a clean commit log. This step creates a shallow copy of the
    // repository, as we are only interested in the history of the branch.
    // Still, it extracts all of the current files, so it may take a bit of time.
    if (dataIO.skip_checkout) {
        console.log(`[*] Skipping the repository checkout.`);
    } else {
        console.log(`[*] Checking out the repository at "${dataIO.last_commit}".`);
        await dataFetcher.checkoutRepo(dataIO.git_tag, dataIO.last_commit);
        checkForExit();
    }

    if (dataIO.checkout_dir !== "") {
        console.log(`[*] Using the local clone at "${dataIO.checkout_dir}".`);
    }

    if (dataIO.skip_gitlog) {
        console.log(`[*] Skipping the commit log extraction.`);
        dataProcessor.consumeOldLog();
    } else {
        await extractCommitLogs(dataIO, dataFetcher, dataProcessor);
    }

    // This method returns only non-merge commits; we don't need to fetch anything about
    // merge commits. We only need them for a complete commit history.
    const commitHashes = dataProcessor.getCommitHashes();

    if (dataIO.skip_github) {
        console.log(`[*] Skipping the commit data fetching from GitHub.`);
        dataProcessor.consumeOldCommits();
    } else {
        // Third, we query the GraphQL API to fetch the information about linked PRs.
        const commitsRaw = await fetchCommitData(dataFetcher, commitHashes);

        // Fourth, we consolidate the information. Commits are populated with links to their
        // respective PRs, and PRs store references to their commits. We will save this to
        // a file for the specified range, which should be between two stable releases.
        //
        // For intermediate releases (developer previews) we have preconfigured hashes and
        // can simply pass them to the final data. Frontend will handle the rest.
        console.log(`[*] Processing ${Object.keys(commitsRaw).length} commits.`);
        dataProcessor.processCommits(commitsRaw, repositoryName);
        checkForExit();
    }

    console.log("[*] Checking the rate limits after.");
    await dataFetcher.checkRates();
    checkForExit();

    console.log("[*] Finalizing database.");
    const output = {
        "generated_at": Date.now(),
        "log": dataProcessor.log,
        "release_logs": dataProcessor.releaseLogs,
        "authors": dataProcessor.authors,
        "commits": dataProcessor.commits,
        "pulls": dataProcessor.pulls,
    };

    await dataIO.saveData(databaseName, output);
    checkForExit();

    console.log("[*] Database built.");
}
// Run the script. A rejection from any awaited step would otherwise be left
// unhandled, so report it and make sure the process ends with a non-zero
// exit code (an already set non-zero code is kept).
main().catch((err) => {
    console.error(err);
    process.exitCode = process.exitCode || 1;
});