-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcrawlSite.js
102 lines (85 loc) · 3.17 KB
/
crawlSite.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
const request = require('request');
const cheerio = require('cheerio');
const path = require('path');
const fs = require('fs');
// Positional CLI arguments (everything after `node <script>`); each is
// `undefined` when not supplied on the command line.
const [arg1, arg2, arg3] = process.argv.slice(2);
/**
 * Crawls the pages reachable from `url` and records which matching
 * stylesheets each page links to.
 *
 * For every fetched page, the `href` of each `<link rel="stylesheet">` that
 * contains `styleSheetPathIdentifier` is collected. Pages with at least one
 * match are appended to a CSV file as `pageUrl,stylesheet1,stylesheet2,...`.
 * The CSV is created next to this script at
 * `csv/crawled_<siteName>_pages-<timestamp>.csv`.
 *
 * Only links that resolve to the same protocol and origin as `url`, with a
 * path under the path of `url`, are followed. Fragments (`#...`) are dropped
 * so the same document is fetched once.
 *
 * @param {string} url - Absolute URL to start from; also defines the crawl scope.
 * @param {string} siteName - Label embedded in the output file name.
 * @param {string} styleSheetPathIdentifier - Substring a stylesheet href must
 *   contain for it to be recorded.
 * @returns {void} Results are written to disk and progress is logged.
 */
const crawlSite = (url, siteName, styleSheetPathIdentifier) => {
  let base;
  try {
    base = new URL(`${url}`);
  } catch (error) {
    // Without a valid absolute start URL there is nothing to crawl.
    console.log(`Error: ${error}`);
    return;
  }
  base.hash = '';

  const pagesVisited = new Set();
  const outputFileName = `csv/crawled_${siteName}_pages-`;

  /**
   * Creates an empty, timestamped CSV file next to this script.
   *
   * @param {string} fileName - Relative path prefix of the file to create.
   * @returns {string} Absolute path of the created file.
   */
  const createFile = (fileName) => {
    const date = new Date().toLocaleString().replace(/[\W_]+/g, '-');
    const name = `${fileName}${date}.csv`;
    const filePath = path.join(__dirname, name);
    console.log('filePath is ', filePath);
    // Create synchronously (including the csv/ directory) so the file is
    // guaranteed to exist before the first append.
    fs.mkdirSync(path.dirname(filePath), { recursive: true });
    fs.writeFileSync(filePath, '');
    console.log(`The file "${name}" has been created.`);
    // Return the absolute path: later appends must hit this exact file
    // regardless of the process's current working directory.
    return filePath;
  };

  /**
   * Resolves an anchor href found on `pageUrl` to an absolute, fragment-free
   * URL, or returns `null` when the link should not be crawled.
   *
   * @param {string} href - Raw `href` attribute value.
   * @param {string} pageUrl - URL of the page the link was found on.
   * @returns {string|null} Normalized URL inside the crawl scope, else `null`.
   */
  const toCrawlableUrl = (href, pageUrl) => {
    let target;
    try {
      target = new URL(href, pageUrl);
    } catch (error) {
      // Malformed href: nothing to request.
      return null;
    }
    target.hash = '';
    // Comparing protocol and origin (rather than substring-matching the base
    // URL) rejects mailto:/javascript: links and external URLs that merely
    // embed the base URL, e.g. in a share link's query string.
    const isInScope =
      target.protocol === base.protocol &&
      target.origin === base.origin &&
      target.pathname.startsWith(base.pathname);
    return isInScope ? target.href : null;
  };

  const file = createFile(outputFileName);

  /**
   * Fetches one page, records its matching stylesheets, and recurses into
   * every in-scope link that has not been visited yet.
   *
   * @param {string} pageUrl - Normalized absolute URL of the page to fetch.
   * @returns {void}
   */
  const crawl = (pageUrl) => {
    if (pagesVisited.has(pageUrl)) {
      return;
    }
    pagesVisited.add(pageUrl);

    request(pageUrl, (error, response, body) => {
      if (error) {
        console.log(`Error: ${error}`);
        return;
      }

      const $ = cheerio.load(body);
      console.log(`Crawled: ${pageUrl}`);

      const stylesheets = $('link[rel="stylesheet"]')
        .map((i, el) => $(el).attr('href'))
        .get()
        .filter((stylesheetUrl) => stylesheetUrl.includes(styleSheetPathIdentifier));

      if (stylesheets.length > 0) {
        console.log(`Stylesheets: ${stylesheets.join(', ')}`);
        fs.appendFileSync(file, `${pageUrl},${stylesheets.join(',')}\n`);
      }

      $('a[href]')
        .map((i, el) => $(el).attr('href'))
        .get()
        .forEach((href) => {
          const nextUrl = toCrawlableUrl(href, pageUrl);
          if (nextUrl !== null) {
            crawl(nextUrl);
          }
        });
    });
  };

  crawl(base.href);
};
// Run the crawl only when all three positional arguments were supplied;
// otherwise print usage instead of crawling the literal string "undefined".
if (arg1 && arg2 && arg3) {
  crawlSite(arg1, arg2, arg3);
} else {
  console.error('Usage: node crawlSite.js <url> <siteName> <styleSheetPathIdentifier>');
  process.exitCode = 1;
}