extend download script and minor changes
leilafedd committed Jun 8, 2023
1 parent 7899543 commit a857b92
Showing 6 changed files with 134 additions and 102 deletions.
6 changes: 3 additions & 3 deletions codemeta.json
@@ -5,11 +5,11 @@
"codeRepository": "https://github.com/fusion-jena/KeySearchWiki",
"dateCreated": "2021-06-14",
"datePublished": "2021-06-14",
"dateModified": "2022-05-24",
"dateModified": "2023-06-08",
"downloadUrl": "",
"issueTracker": "https://github.com/fusion-jena/KeySearchWiki/issues",
"name": "KeySearchWiki-generation-workflow",
"version": "1.2.1",
"version": "1.2.2",
"identifier": "",
"description": "Source code for generating KeySearchWiki - An Automatically Generated Dataset for Keyword Search over Wikidata",
"referencePublication": "",
@@ -64,4 +64,4 @@
}
}
]
-}
+}
2 changes: 0 additions & 2 deletions src/cache-population/config.js
@@ -34,8 +34,6 @@ export default {
downloadError: path.join(__dirname,'..', '..','wikipedia-dump/download-error-log.json'),
wikidataDumpLangList: path.join(__dirname,'..', '..','wikipedia-dump/wd-lang-list.json'),
catGroupedBylang: path.join(__dirname,'..', '..','wikipedia-dump/cat-by-lang.json'),
-postImport: path.join(__dirname,'..', '..','wikipedia-dump/postimport.sql.gz'),
-preImport: path.join(__dirname,'..', '..','wikipedia-dump/preimport.sql.gz'),
maxValues: 10000

};
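For orientation, a minimal sketch of the config shape the new download script reads. Every value below is an illustrative assumption, not the repository's actual setting:

import path from 'path';

export default {
  linkPrefix: 'https://dumps.wikimedia.org/',        // assumed dump mirror
  dumpDate: '20230601',                              // assumed dump date
  tables: ['page.sql.gz', 'categorylinks.sql.gz'],   // assumed table files
  dumpLocation: 'wikipedia-dump/',                   // assumed download target
  downloadError: path.join('wikipedia-dump', 'download-error-log.json'),
  maxRetries: 3,                                     // assumed retry budget
  defaultDelay: 5,                                   // assumed retry delay in seconds
  maxValues: 10000
};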
77 changes: 0 additions & 77 deletions src/cache-population/queries/phase2/download.js

This file was deleted.

114 changes: 114 additions & 0 deletions src/cache-population/queries/phase2/downloadAll.js
@@ -0,0 +1,114 @@
import Config from './../../config';
import mkdirp from 'mkdirp';
import fs from 'fs';
import https from 'https';
import { downloadLog } from './../../../util/logger';

const { performance } = require('perf_hooks');

let totalTemp = 0, failed = 0;

/**
 * download specific sql dump tables for all languages given as input
 *
 * @param {Array} langs list of languages
 */
export default async function downloadAll(langs){

  console.log('File download started ...');
  downloadLog.info('File download started ...');

  // total to download
  let total = langs.length * Config.tables.length;

  mkdirp.sync(Config.dumpLocation);

  let errorLog = fs.createWriteStream(Config.downloadError, { flags: 'a' });

  const t0 = performance.now();

  for await (const lang of langs) {

    // download needed tables
    for await (const table of Config.tables){

      let link = Config.linkPrefix + lang + 'wiki' + '/' + Config.dumpDate + '/' + lang + 'wiki-' + Config.dumpDate + '-' + table;
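      // for illustration, with assumed values linkPrefix = 'https://dumps.wikimedia.org/'
      // and dumpDate = '20230601', lang 'de' and table 'categorylinks.sql.gz' give:
      // https://dumps.wikimedia.org/dewiki/20230601/dewiki-20230601-categorylinks.sql.gz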

      await downloadOne(link, lang, table, total, errorLog);

    }

    // reduce the load by delaying the execution
    await new Promise(resolve => setTimeout(resolve, 5*1000));
  }

  const t1 = performance.now();

  downloadLog.info(`{time: ${(t1 - t0) / 1000}, failed: ${failed}}`);
  console.log(`
  Time taken: ${(t1 - t0) / 1000} s
  Failed : ${failed}
  `);

}

/**
 * function to download one file with retry
 */
async function downloadOne(link, lang, table, total, errorLog, leftRetries = Config.maxRetries){

  if(leftRetries <= 0){
    throw new Error('limit # of download retries reached !');
  }

  try{
    await new Promise((resolve, reject) => {

      https.get(link, res => {
        if(res.statusCode == 200){
          // create language folder
          mkdirp.sync(Config.dumpLocation + lang);
          const path = Config.dumpLocation + lang + '/' + lang + 'wiki-' + Config.dumpDate + '-' + table;
          const filePath = fs.createWriteStream(path);
          res.pipe(filePath).on('finish',() => {
            filePath.close();
            totalTemp ++ ;
            downloadLog.info(`${totalTemp} / ${total} files downloaded ! (${lang})`);
            console.log(`${totalTemp} / ${total} files downloaded ! (${lang})`);
            resolve();
          });
        }
        else {
          errorLog.write(`{lang: ${lang} , link: ${link} , statusCode: ${res.statusCode}}`);
          errorLog.write('\r\n');
          failed ++;
          resolve();
        }
      // network failures are emitted as 'error' events, so reject here to reach the catch below
      }).on('error', reject);

    });
  }
  catch(e){
    // if some network failure occurs, retry
    if (e.code == 'ECONNREFUSED' || e.code == 'ECONNRESET' || e.code == 'ETIMEDOUT' || e.code == 'EAI_AGAIN' || e.code == 'EHOSTUNREACH') {
      failed ++;
      // write error in log together with the failing link
      errorLog.write(`{ errorCode: ${e.code}, errorMessage: ${e.message}, lang: ${lang} , link: ${link} , leftRetries: ${leftRetries}}`);
      errorLog.write('\r\n');
      // reduce the load by delaying the execution
      await new Promise(resolve => setTimeout(resolve, Config.defaultDelay * 1000));
      // retry
      return await downloadOne(link, lang, table, total, errorLog, leftRetries - 1);
    }
    else {
      console.log(e);
      return;
    }
  }
}
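
A minimal usage sketch of the new entry point (the language codes here are hypothetical; the actual caller in runner.js passes grouped.langList.both):

import downloadAll from './src/cache-population/queries/phase2/downloadAll';

// fetch the configured dump tables for two (hypothetical) languages
downloadAll(['de', 'fr']).then(() => console.log('all dumps fetched'));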
29 changes: 13 additions & 16 deletions src/cache-population/queries/phase2/populateCaches.js
@@ -46,26 +46,23 @@ export default async function populateCaches(grouped){
//console.log(`-> ${lang}:`);
wikipediaCachePopLog.info(`-> ${lang}:`);
let t0 = performance.now();
-if(lang != 'en'){
-  for(let table of Config.tables){
-    const { stdout } = await exec(`zcat ${Config.preImport} ${Config.dumpLocation}${lang}/${lang}wiki-${Config.dumpDate}-${table} ${Config.postImport} | mysql -u '${Config.user}' -p${Config.password} ${Config.databaseName}`);
-    if(stdout == ''){
-      //console.log(` ${table} imported !`);
-      wikipediaCachePopLog.info(` ${table} imported !`);
-    }
-    else{
-      //console.log(stdout);
-      errorLog.write(`{lang:${lang}, table:${table}, error:${stdout} }`);
-      errorLog.write('\r\n');
-    }
-  }
-  let t1 = performance.now();
-  timeLog.write(`{action: import , lang: ${lang}, time: ${(t1 - t0) / 1000}}`);
-  timeLog.write('\r\n');
-}
+for(let table of Config.tables){
+  const { stdout } = await exec(`zcat ${Config.dumpLocation}${lang}/${lang}wiki-${Config.dumpDate}-${table} | mysql -u '${Config.user}' -p${Config.password} ${Config.databaseName}`);
+  if(stdout == ''){
+    //console.log(` ${table} imported !`);
+    wikipediaCachePopLog.info(` ${table} imported !`);
+  }
+  else{
+    //console.log(stdout);
+    errorLog.write(`{lang:${lang}, table:${table}, error:${stdout} }`);
+    errorLog.write('\r\n');
+  }
+}
+
+let t1 = performance.now();
+timeLog.write(`{action: import , lang: ${lang}, time: ${(t1 - t0) / 1000}}`);
+timeLog.write('\r\n');

// for progress tracking
let numberCats = grouped[lang].length;
let tempNCats = 1 ;
@@ -115,7 +112,7 @@ export default async function populateCaches(grouped){

if(pageidspage.length > 0){

-//allow only pageids of namespace = 0 => main page (noticed that also e.g., ns = 10 is considered as 'page')
+//allow only pageids of namespace = 0 => main page (noticed that also e.g., ns = 10 is considered as 'page')
let namespaces = await mariadb.getNS(conn, pageidspage);

namespaces.forEach(ns => {
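As background for the `await exec(...)` calls above: populateCaches presumably uses a promisified child_process.exec so that `{ stdout }` can be awaited. A minimal sketch, assuming Node's built-in modules:

import util from 'util';
import child_process from 'child_process';

// resolves with { stdout, stderr }; the code above treats an empty stdout as a successful import
const exec = util.promisify(child_process.exec);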
8 changes: 4 additions & 4 deletions src/cache-population/runner.js
@@ -4,7 +4,7 @@ import { cachePopLog as Log } from './../util/logger';
import { isInstanceOf } from './util';
import groupByLang from './queries/phase2/groupByLang';
import getCatsTarget from './queries/phase2/getCatsTarget';
-import download from './queries/phase2/download';
+import downloadAll from './queries/phase2/downloadAll';
import populateCaches from './queries/phase2/populateCaches';

import Glob from 'glob-promise';
@@ -107,19 +107,19 @@ import Zlib from 'zlib';
let grouped = await groupByLang(catToTarget.exclude , catToTarget.catToTarget);

//download wikipedia dump tables for all languages
-await download(grouped.langList.both);
+await downloadAll(grouped.langList.both);


Log.info( '---------------- Phase 2 (Wikipedia cache population) ----------------' );

-let grouped = JSON.parse(Fs.readFileSync(CacheConfig.catGroupedBylang));
+let group = JSON.parse(Fs.readFileSync(CacheConfig.catGroupedBylang));

//just temporary to avoid importing english language (put it in the beginning of grouped object)
/*let temp = grouped['en'];
delete grouped.en;
let newgrouped = Object.assign({en: temp}, grouped);*/

-await populateCaches(grouped);
+await populateCaches(group);


/* XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX Third Phase XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX */
