From f99f523a2097c68febffc6fc893f49b8d652dec7 Mon Sep 17 00:00:00 2001 From: Aurelien Lourot Date: Thu, 15 Nov 2018 15:03:31 +0100 Subject: [PATCH] db module // incomplete, abandoned #13 #22 ghuser-io/ghuser.io#190 --- calculateContribsAndMeta.js | 30 +++++------------- fetchOrgs.js | 26 ++++----------- fetchRepos.js | 17 ++-------- fetchUserDetailsAndContribs.js | 34 +++++++------------- findUsersToRemove.js | 17 ++++------ impl/data.js | 2 ++ impl/db.js | 58 ++++++++++++++++++++++++++++++++++ impl/dbFile.js | 4 +++ printDataStats.js | 26 +++++++-------- 9 files changed, 109 insertions(+), 105 deletions(-) create mode 100755 impl/db.js diff --git a/calculateContribsAndMeta.js b/calculateContribsAndMeta.js index 277e8f2b7..c71d41481 100755 --- a/calculateContribsAndMeta.js +++ b/calculateContribsAndMeta.js @@ -9,6 +9,7 @@ const sleep = require('await-sleep'); const data = require('./impl/data'); + const db = require('./impl/db'); const DbFile = require('./impl/dbFile'); const scriptUtils = require('./impl/scriptUtils'); @@ -18,31 +19,16 @@ return; async function calculateContribsAndMeta() { - let spinner; - - let spinnerText = 'Reading users from DB...'; - spinner = ora(spinnerText).start(); - const users = {}; let numUsers = 0; - for (const file of fs.readdirSync(data.users)) { - await sleep(0); // make loop interruptible + for await (const user of db.asyncNonRemovedUsers()) { + ++numUsers; - if (file.endsWith('.json')) { - const user = new DbFile(path.join(data.users, file)); - if (!user.ghuser_deleted_because && !user.removed_from_github) { - users[file] = user; - ++numUsers; - spinner.text = `${spinnerText} [${numUsers}]`; - - // Make sure the corresponding contrib file exists (not the case if it's a new user): - (new DbFile(path.join(data.contribs, file))).write(); - } - } + // Make sure the corresponding contrib file exists (not the case if it's a new user): + db.createUserContribList(user.login); //LA_TODO to be tested } - spinner.succeed(`Found ${numUsers} 
users in DB`); - spinnerText = 'Reading contribution lists from DB...'; - spinner = ora(spinnerText).start(); + const spinnerText = 'Reading contribution lists from DB...'; + let spinner = ora(spinnerText).start(); const contribs = {}; for (const file of fs.readdirSync(data.contribs)) { await sleep(0); // make loop interruptible @@ -79,7 +65,7 @@ const toBeDeleted = []; for (const contribList in contribs) { - if (!users[contribList]) { + if (!users[contribList]) { //LA_TODO I've removed this var toBeDeleted.push(contribList); } } diff --git a/fetchOrgs.js b/fetchOrgs.js index b8e788cc9..a443508eb 100755 --- a/fetchOrgs.js +++ b/fetchOrgs.js @@ -8,6 +8,7 @@ const path = require('path'); const data = require('./impl/data'); + const db = require('./impl/db'); const DbFile = require('./impl/dbFile'); const github = require('./impl/github'); const scriptUtils = require('./impl/scriptUtils'); @@ -18,36 +19,21 @@ return; async function fetchOrgs() { - let spinner; - // In this file we store repo owners that we know aren't organizations. This avoids querying // them next time. 
const nonOrgs = new DbFile(data.nonOrgs); nonOrgs.non_orgs = nonOrgs.non_orgs || []; - const users = []; - for (const file of fs.readdirSync(data.users)) { - if (file.endsWith('.json')) { - const user = new DbFile(path.join(data.users, file)); - if (!user.ghuser_deleted_because && !user.removed_from_github) { - users.push(user); - } - } - } - let userOrgs = new Set([]); - for (const user of users) { - userOrgs = new Set([...userOrgs, ...user.organizations]); - } - await fetchOrgs(userOrgs); - let contribOwners = new Set([]); - for (const user of users) { + for await (const user of db.asyncNonRemovedUsers()) { //LA_TODO to be tested + userOrgs = new Set([...userOrgs, ...user.organizations]); contribOwners = new Set([ ...contribOwners, ...(user.contribs && user.contribs.repos.map(repo => repo.split('/')[0]) || []) ]); } + await fetchOrgs(userOrgs); await fetchOrgs(contribOwners); stripUnreferencedOrgs(); @@ -57,7 +43,7 @@ async function fetchOrgs(owners) { owners: for (const owner of owners) { - spinner = ora(`Fetching owner ${owner}...`).start(); + const spinner = ora(`Fetching owner ${owner}...`).start(); const org = new DbFile(path.join(data.orgs, `${owner}.json`)); if (org.avatar_url) { spinner.succeed(`Organization ${owner} is already known`); @@ -67,7 +53,7 @@ spinner.succeed(`${owner} is a user`); continue; } - for (const user of users) { + for (const user of users) { //LA_TODO I removed this var if (user.login === owner) { spinner.succeed(`${owner} is a user`); nonOrgs.non_orgs.push(owner); diff --git a/fetchRepos.js b/fetchRepos.js index 1858fb9be..a73725930 100755 --- a/fetchRepos.js +++ b/fetchRepos.js @@ -10,6 +10,7 @@ const sleep = require('await-sleep'); const data = require('./impl/data'); + const db = require('./impl/db'); const DbFile = require('./impl/dbFile'); const fetchJson = require('./impl/fetchJson'); @@ -46,23 +47,9 @@ optional arguments: return; async function fetchRepos(firsttime) { - console.log('Reading users from DB...') - const users = 
[]; - for (const file of fs.readdirSync(data.users)) { - await sleep(0); // make loop interruptible - - if (file.endsWith('.json')) { - const user = new DbFile(path.join(data.users, file)); - if (!user.ghuser_deleted_because && !user.removed_from_github) { - users.push(user); - } - } - } - console.log(`Found ${users.length} users in DB`); - console.log('Searching repos referenced by users...'); const referencedRepos = new Set([]); - for (const user of users) { + for await (const user of db.asyncNonRemovedUsers()) { //LA_TODO to be tested for (const repo in (user.contribs && user.contribs.repos || [])) { await sleep(0); // make loop interruptible diff --git a/fetchUserDetailsAndContribs.js b/fetchUserDetailsAndContribs.js index 311cd6e89..33b10df2f 100755 --- a/fetchUserDetailsAndContribs.js +++ b/fetchUserDetailsAndContribs.js @@ -10,6 +10,7 @@ let path = require('path'); const data = require('./impl/data'); + const db = require('./impl/db'); const DbFile = require('./impl/dbFile'); const fetchJson = require('./impl/fetchJson'); const github = require('./impl/github'); @@ -67,34 +68,21 @@ optional arguments: } if (cli.input.length === 1) { - await fetchUserDetailsAndContribs(`${cli.input[0].toLowerCase()}.json`); + const userFilePath = path.join(data.users, `${cli.input[0].toLowerCase()}.json`); + const user = new DbFile(userFilePath); + await fetchUserDetailsAndContribs(user); //LA_TODO to be tested } else { - for (const file of fs.readdirSync(data.users)) { - if (file.endsWith('.json')) { - await fetchUserDetailsAndContribs(file); - } + for await (const user of db.asyncNonRemovedUsers()) { + await fetchUserDetailsAndContribs(user); //LA_TODO to be tested } } return; - async function fetchUserDetailsAndContribs(userFileName) { - let spinner; - - const userFilePath = path.join(data.users, userFileName); - const userFile = new DbFile(userFilePath); + async function fetchUserDetailsAndContribs(userFile) { if (!userFile.login) { throw `${userFilePath} is malformed. 
Did you run ./addUser.js ?`; } - if (userFile.ghuser_deleted_because) { - console.log(`${userFile.login} has been deleted, skipping...`); - return; - } - if (userFile.removed_from_github) { - // For now ok, but maybe some day we'll have to deal with resurrected users. - console.log(`${userFile.login} was removed from GitHub in the past, skipping...`); - return; - } { const now = new Date; @@ -117,7 +105,7 @@ optional arguments: async function fetchDetails(userFile) { const ghUserUrl = `https://api.github.com/users/${userFile.login}`; - spinner = ora(`Fetching ${ghUserUrl}...`).start(); + const spinner = ora(`Fetching ${ghUserUrl}...`).start(); const ghDataJson = await github.fetchGHJson( ghUserUrl, spinner, [304, 404], userFile.contribs && userFile.contribs.fetched_at && new Date(userFile.contribs.fetched_at) @@ -151,7 +139,7 @@ optional arguments: async function fetchOrgs(userFile) { const orgsUrl = userFile.organizations_url; - spinner = ora(`Fetching ${orgsUrl}...`).start(); + const spinner = ora(`Fetching ${orgsUrl}...`).start(); const orgsDataJson = await github.fetchGHJson(orgsUrl, spinner); spinner.succeed(`Fetched ${orgsUrl}`); @@ -194,7 +182,7 @@ optional arguments: // fetchUserContribs() won't find forks as they are not considered to be contributions. But // the user might well have popular forks. 
- spinner = ora(`Fetching ${userFile.login}'s popular forks...`).start(); + const spinner = ora(`Fetching ${userFile.login}'s popular forks...`).start(); const perPage = 100; for (let page = 1; page <= 5; ++page) { @@ -219,7 +207,7 @@ optional arguments: async function fetchSettings(userFile) { const url = `https://raw.githubusercontent.com/${userFile.login}/ghuser.io.settings/master/ghuser.io.json`; - spinner = ora(`Fetching ${userFile.login}'s settings...`).start(); + const spinner = ora(`Fetching ${userFile.login}'s settings...`).start(); const dataJson = await fetchJson(url, spinner, [404]); if (dataJson == 404) { diff --git a/findUsersToRemove.js b/findUsersToRemove.js index bca5af89f..2af37e60b 100755 --- a/findUsersToRemove.js +++ b/findUsersToRemove.js @@ -8,6 +8,7 @@ const path = require('path'); const data = require('./impl/data'); + const db = require('./impl/db'); const DbFile = require('./impl/dbFile'); const github = require('./impl/github'); const scriptUtils = require('./impl/scriptUtils'); @@ -23,20 +24,16 @@ // * aren't marked not to be deleted, and // * haven't starred the project. 
- let spinner; const now = new Date; const minAgeMonths = 1; const users = []; - for (const file of fs.readdirSync(data.users)) { - if (file.endsWith('.json')) { - const user = new DbFile(path.join(data.users, file)); - if (!user.ghuser_deleted_because && !user.ghuser_keep_because && !user.removed_from_github - && now - Date.parse(user.ghuser_created_at) > minAgeMonths * 30 * 24 * 60 * 60 * 1000) { - users.push(user); - } + for await (const user of db.asyncNonRemovedUsers()) { + if (!user.ghuser_keep_because + && now - Date.parse(user.ghuser_created_at) > minAgeMonths * 30 * 24 * 60 * 60 * 1000) { + users.push(user); } - } + } //LA_TODO to be tested const stargazers = await fetchStargazers('ghuser-io/ghuser.io'); const toRemove = users.map(user => user.login).filter(user => stargazers.indexOf(user) === -1); @@ -62,7 +59,7 @@ to make sure we're not wasting resources, I'd like to know if you'd like to keep async function fetchStargazers(repo) { const ghUrl = `https://api.github.com/repos/${repo}/stargazers`; - spinner = ora(`Fetching ${ghUrl}...`).start(); + const spinner = ora(`Fetching ${ghUrl}...`).start(); const ghDataJson = await github.fetchGHJson(ghUrl, spinner); spinner.succeed(`Fetched ${ghUrl}`); return ghDataJson.map(stargazer => stargazer.login); diff --git a/impl/data.js b/impl/data.js index 7614bfd04..82dd675a4 100755 --- a/impl/data.js +++ b/impl/data.js @@ -1,6 +1,8 @@ #!/usr/bin/env node 'use strict'; +//TODO all this should move inside db.js and this file should be removed. + (() => { const os = require('os'); const fs = require('fs'); diff --git a/impl/db.js b/impl/db.js new file mode 100755 index 000000000..efc7ae7cc --- /dev/null +++ b/impl/db.js @@ -0,0 +1,58 @@ +#!/usr/bin/env node +'use strict'; + +module.exports = { + + // Async generator yielding an instance of DbFile for each user present in the database and not + // marked as removed from ghuser or GitHub. 
+ asyncNonRemovedUsers, + + // Creates the list of all contribs of a user if it doesn't exist already. In other words, writes + // an instance of DbFile on disk. + // @param login Case insensitive. + createUserContribList, +}; + + +const fs = require('fs'); +const path = require('path'); + +const ora = require('ora'); +const sleep = require('await-sleep'); + +const data = require('./data'); +const DbFile = require('./dbFile'); //LA_TODO should be the only include of this file, i.e. move content here? + + +async function* asyncNonRemovedUsers() { + const spinnerText = 'Reading users from DB...'; + const spinner = ora(spinnerText).start(); + let numUsers = 0; + + for (const file of fs.readdirSync(data.users)) { + await sleep(0); // make loop interruptible + + if (file.endsWith(DB_FILE_EXT)) { + const pathToFile = path.join(data.users, file); + const user = new DbFile(pathToFile); + if (!user.ghuser_deleted_because && !user.removed_from_github) { + ++numUsers; + spinner.text = `${spinnerText} [${numUsers}]`; + + yield user; + } + } + } + + spinner.succeed(`Found ${numUsers} users in DB`); +} + +function createUserContribList(login) { + if (!login) { + throw 'login is mandatory'; + } + (new DbFile(path.join(data.contribs, login.toLowerCase() + DB_FILE_EXT))).write(); +} + + +const DB_FILE_EXT = '.json'; //TODO should be the only occurrence of this string in the codebase diff --git a/impl/dbFile.js b/impl/dbFile.js index 630d9d165..4ae61cadf 100755 --- a/impl/dbFile.js +++ b/impl/dbFile.js @@ -27,6 +27,10 @@ fs.writeFileSync(this._path(), JSON.stringify(this, null, 2) + '\n', 'utf-8'); } + sizeBytes() { + return fs.statSync(this._path()).size; + } + deleteAllPropsBut(exceptions) { Object.keys(this).forEach(prop => { if (prop.startsWith('_') || prop in exceptions) { diff --git a/printDataStats.js b/printDataStats.js index ba048d4e4..32e333d7e 100755 --- a/printDataStats.js +++ b/printDataStats.js @@ -7,39 +7,35 @@ const path = require('path'); const data = 
require('./impl/data'); + const db = require('./impl/db'); const DbFile = require('./impl/dbFile'); const scriptUtils = require('./impl/scriptUtils'); scriptUtils.printUnhandledRejections(); - printDataStats(); + asyncPrintDataStats(); return; - function printDataStats() { + async function asyncPrintDataStats() { let numUsers = 0; let largestUserFileName; let largestUserFileSize = 0; let totalUserSize = 0; - for (const file of fs.readdirSync(data.users)) { - if (file.endsWith('.json')) { - const pathToFile = path.join(data.users, file); - const user = new DbFile(pathToFile); - if (!user.ghuser_deleted_because && !user.removed_from_github) { - ++numUsers; - } - const userFileSize = fs.statSync(pathToFile).size; - if (userFileSize > largestUserFileSize) { - largestUserFileSize = userFileSize; - largestUserFileName = file; - } - totalUserSize += userFileSize; + for await (const user of db.asyncNonRemovedUsers()) { + ++numUsers; + const userFileSize = user.sizeBytes(); + if (userFileSize > largestUserFileSize) { + largestUserFileSize = userFileSize; + largestUserFileName = user.login; } + totalUserSize += userFileSize; } console.log(data.users); console.log(` ${numUsers} users`); console.log(` largest: ${largestUserFileName} (${toKB(largestUserFileSize)})`); console.log(` total: ${toKB(totalUserSize)}`); + throw 'LA_TEMP'; let largestContribFileName; let largestContribFileSize = 0;