-
Notifications
You must be signed in to change notification settings - Fork 0
/
index.js
84 lines (78 loc) · 2.95 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
const http = require('https');
const jsdom = require('jsdom');
const { JSDOM } = jsdom;
const fs = require('fs');
const readline = require('readline');
const events = require('events');
const fd = new events.EventEmitter();
const host = 'https://wiki.ihris.org';
// Matches hrefs that point at a MediaWiki category page. The previous pattern
// ended in `*` (a glob habit); in a regex that only made the final "y" optional.
const CATEGORY_LINK = /\/wiki\/Category/;

/**
 * Download a page over HTTPS and hand its parsed document to a callback.
 *
 * Request failures and non-200 responses are logged and the callback is not
 * invoked, so one bad page cannot take down the whole crawl.
 *
 * @param {String} url absolute URL to fetch
 * @param {Function} onDocument called with the JSDOM `document` of the response body
 */
function fetchDocument(url, onDocument) {
  http.get(url, (res) => {
    if (res.statusCode !== 200) {
      console.error(`Got an error: ${url} responded with status ${res.statusCode}`);
      res.resume(); // drain the body so the socket is released
      return;
    }
    // Decode as UTF-8 at the stream level so a multi-byte character split
    // across two chunks is not corrupted by per-chunk string conversion.
    res.setEncoding('utf8');
    let body = '';
    res.on('data', (chunk) => body += chunk);
    res.on('end', () => onDocument(new JSDOM(body).window.document));
  }).on('error', (e) => {
    console.error(`Got an error: ${e.message}`);
  });
}

/**
 * Fetch one category page and append every non-category link found on it to
 * data/all_urls.txt, tagged with the category name.
 *
 * @param {Object} categoryEntry parsed line of data/categories.txt (`url`, `category`)
 */
function collectCategoryLinks(categoryEntry) {
  fetchDocument(`${host}${categoryEntry.url}`, (doc) => {
    // Prefer the page listing; fall back to the whole content area.
    const linksRoot = doc.getElementById('mw-pages') || doc.getElementById('mw-content-text');
    if (!linksRoot) {
      console.error(`Got an error: no link container found on ${categoryEntry.url}`);
      return;
    }
    const pageLinks = Array.from(linksRoot.getElementsByTagName('a'))
      .filter((anchor) => !CATEGORY_LINK.test(anchor.href));
    dataToFile('all_urls.txt', pageLinks, true, categoryEntry.category);
  });
}

// Read all categories, store them, then crawl each category page.
fetchDocument(`${host}/mediawiki/index.php?title=Special:Categories&offset=&limit=500`, (dom) => {
  const content = dom.getElementById('mw-content-text');
  if (!content) {
    console.error('Got an error: category listing has no mw-content-text element');
    return;
  }
  const categoryLinks = Array.from(content.getElementsByTagName('a'))
    .filter((anchor) => CATEGORY_LINK.test(anchor.href));

  // dataToFile emits 'close' on fd once the categories file has been written.
  // Subscribe before starting the write, and only once, so the signal can be
  // neither missed nor handled twice.
  fd.once('close', () => {
    const rl = readline.createInterface({
      input: fs.createReadStream('data/categories.txt'),
      crlfDelay: Infinity // treat \r followed by \n as a single line break
    });
    rl.on('line', (line) => {
      if (line.trim() === '') {
        return; // tolerate blank lines in the categories file
      }
      let lineJson;
      try {
        lineJson = JSON.parse(line);
      } catch (e) {
        console.error(`Got an error: skipping malformed category line: ${e.message}`);
        return;
      }
      collectCategoryLinks(lineJson);
    });
  });

  dataToFile('categories.txt', categoryLinks, true); // write all categories to file
});
/**
 * Write data to `data/<filename>` as one JSON object per line. The `data`
 * directory and the file are created if they do not exist.
 *
 * Without `category`, each entry is serialized with createCategoryObject and
 * `fd` emits 'close' once the stream has finished. With `category`, each entry
 * is serialized with createURLObject and no event is emitted.
 *
 * @param {String} filename name of the file inside `data/` to write data to
 * @param {Array} data an array of anchor-like objects (their `href` and `innerHTML` are read)
 * @param {boolean} append if true, data will be appended to filename; otherwise the file is truncated
 * @param {String} [category] category name to attach to every entry
 */
function dataToFile(filename, data, append, category){
  // createWriteStream does not create missing parent directories.
  fs.mkdirSync('data', { recursive: true });

  const ws = fs.createWriteStream(`data/${filename}`, {flags: append ? 'a' : 'w'});
  // Without a listener a stream 'error' is thrown as an uncaught exception.
  ws.on('error', (e) => console.error(`Got an error: ${e.message}`));

  const toRecord = category
    ? (d) => createURLObject(d, category)
    : (d) => createCategoryObject(d);

  if (!category) {
    // Only the category listing signals completion; the crawl waits on it.
    ws.on('finish', () => fd.emit('close'));
  }

  for (const d of data) {
    ws.write(`${JSON.stringify(toRecord(d))}\n`);
  }

  // Always end the stream: the category branch used to return without doing
  // so, leaving one open file descriptor per category page.
  ws.end();
}
/**
 * Build the record stored for a category link.
 *
 * @param {Object} dom_obj anchor-like object; its `href` and `innerHTML` are read
 * @returns {Object} object with `url` (the href) and `category` (the inner HTML), in that key order
 */
function createCategoryObject(dom_obj){
  const { href: url, innerHTML: category } = dom_obj;
  return { url, category };
}
/**
 * Build the record stored for a page link found on a category page.
 *
 * @param {Object} dom_obj anchor-like object; its `href` and `innerHTML` are read
 * @param {String} category category the link was found under
 * @returns {Object} object with `category`, `url` (the href) and `text` (the inner HTML), in that key order
 */
function createURLObject(dom_obj, category) {
  const record = { category };
  record.url = dom_obj.href;
  record.text = dom_obj.innerHTML;
  return record;
}