diff --git a/API.md b/API.md
index 97ba6e05..32ce43ff 100644
--- a/API.md
+++ b/API.md
@@ -75,6 +75,9 @@ const HCCrawler = require('headless-chrome-crawler');
     * `options` <[Object]> [crawler.queue()](#crawlerqueueoptions)'s options with default values.
   * `onSuccess(response)` <[Function]> Function to be called when `evaluatePage()` successes.
     * `response` <[Object]>
+      * `redirectChain` <[Array]<[Object]>> Redirect chain of requests.
+        * `url` <[string]> Requested url.
+        * `headers` <[Object]> Request headers.
       * `response` <[Object]>
         * `ok` <[boolean]> whether the status code in the range 200-299 or not.
         * `status` <[string]> status code of the request.
@@ -83,7 +86,7 @@ const HCCrawler = require('headless-chrome-crawler');
       * `options` <[Object]> [crawler.queue()](#crawlerqueueoptions)'s options with default values.
       * `result` <[Serializable]> The result resolved from `evaluatePage()` option.
       * `screenshot` <[Buffer]> Buffer with the screenshot image, which is `null` when `screenshot` option not passed.
-      * `links` <[Array]> List of links found in the requested page.
+      * `links` <[Array]<[string]>> List of links found in the requested page.
       * `depth` <[number]> Depth of the followed links.
   * `onError(error)` <[Function]> Function to be called when request fails.
     * `error` <[Error]> Error object.
@@ -158,7 +161,8 @@ url, allowedDomains, deniedDomains, timeout, priority, depthPriority, delay, ret
   * `maxDepth` <[number]> Maximum depth for the crawler to follow links automatically, default to 1. Leave default to disable following links.
   * `priority` <[number]> Basic priority of queues, defaults to `1`. Priority with larger number is preferred.
   * `depthPriority` <[boolean]> Whether to adjust priority based on its depth, defaults to `true`. Leave default to increase priority for higher depth, which is [depth-first search](https://en.wikipedia.org/wiki/Depth-first_search).
-  * `skipDuplicates` <[boolean]> Whether to skip duplicate requests, default to `null`. The request is considered to be the same if `url`, `userAgent`, `device` and `extraHeaders` are strictly the same.
+  * `skipDuplicates` <[boolean]> Whether to skip duplicate requests, default to `true`. The request is considered to be the same if `url`, `userAgent`, `device` and `extraHeaders` are strictly the same.
+  * `skipRequestedRedirect` <[boolean]> Whether to skip requests that have already appeared in redirect chains of requests, default to `false`. This option is ignored when `skipDuplicates` is set to `false`.
   * `obeyRobotsTxt` <[boolean]> Whether to obey [robots.txt](https://developers.google.com/search/reference/robots_txt), default to `true`.
   * `followSitemapXml` <[boolean]> Whether to use [sitemap.xml](https://www.sitemaps.org/) to find locations, default to `false`.
   * `allowedDomains` <[Array]<[string]|[RegExp]>> List of domains allowed to request. Pass `null` or leave default to skip checking allowed domains.
diff --git a/lib/crawler.js b/lib/crawler.js
index 37d58f73..0994f6fa 100644
--- a/lib/crawler.js
+++ b/lib/crawler.js
@@ -1,8 +1,8 @@
+const map = require('lodash/map');
 const reduce = require('lodash/reduce');
 const pick = require('lodash/pick');
 const isEmpty = require('lodash/isEmpty');
 const uniq = require('lodash/uniq');
-const noop = require('lodash/noop');
 const devices = require('puppeteer/DeviceDescriptors');
 const {
   resolveUrl,
@@ -15,6 +15,10 @@ const GOTO_OPTIONS = [
   'timeout',
   'waitUntil',
 ];
+const REQUEST_FIELDS = [
+  'url',
+  'headers',
+];
 const RESPONSE_FIELDS = [
   'ok',
   'url',
@@ -46,6 +50,7 @@ class Crawler {
     const links = await this._collectLinks(response.url);
     return {
       response: this._reduceResponse(response),
+      redirectChain: this._getRedirectChain(response),
       result,
       screenshot,
       links,
@@ -209,9 +214,9 @@ class Crawler {
    * @private
    */
   async _scrape() {
-    const evaluatePage = this._options.evaluatePage || noop;
+    if (!this._options.evaluatePage) return null;
     await this._addJQuery();
-    return this._page.evaluate(evaluatePage);
+    return this._page.evaluate(this._options.evaluatePage);
   }
 
   /**
@@ -267,7 +272,19 @@ class Crawler {
   }
 
   /**
-   * @param {!Response} response
+   * @param {!Puppeteer.Request} request
+   * @return {!Object}
+   * @private
+   */
+  _reduceRequest(request) {
+    return reduce(REQUEST_FIELDS, (memo, field) => {
+      memo[field] = request[field]();
+      return memo;
+    }, {});
+  }
+
+  /**
+   * @param {!Puppeteer.Response} response
    * @return {!Object}
    * @private
    */
@@ -277,6 +294,15 @@ class Crawler {
       return memo;
     }, {});
   }
+
+  /**
+   * @param {!Puppeteer.Response} response
+   * @return {!Array}
+   * @private
+   */
+  _getRedirectChain(response) {
+    return map(response.request().redirectChain(), this._reduceRequest);
+  }
 }
 
 tracePublicAPI(Crawler);
diff --git a/lib/hccrawler.js b/lib/hccrawler.js
index fdba1760..9ca479f7 100644
--- a/lib/hccrawler.js
+++ b/lib/hccrawler.js
@@ -8,7 +8,7 @@ const each = require('lodash/each');
 const includes = require('lodash/includes');
 const isString = require('lodash/isString');
 const isArray = require('lodash/isArray');
-const request = require('request-promise');
+const rp = require('request-promise');
 // @ts-ignore
 const robotsParser = require('robots-parser');
 const Puppeteer = require('puppeteer');
@@ -119,6 +119,7 @@ class HCCrawler extends EventEmitter {
       depthPriority: true,
       obeyRobotsTxt: true,
       followSitemapXml: false,
+      skipRequestedRedirect: false,
       screenshot: null,
       viewport: null,
     }, options);
@@ -284,14 +285,15 @@
    */
   async _startRequest(options, depth) {
     const skip = await this._skipRequest(options);
-    await this._markRequested(options);
     if (skip) {
       this.emit(HCCrawler.Events.RequestSkipped, options);
+      await this._markRequested(options);
       return;
     }
     const allowed = await this._checkAllowedRobots(options);
     if (!allowed) {
       this.emit(HCCrawler.Events.RequestDisallowed, options);
+      await this._markRequested(options);
       return;
     }
     await this._followSitemap(options, depth);
@@ -329,10 +331,14 @@
     this.emit(HCCrawler.Events.NewPage, crawler.page());
     try {
       const res = await crawler.crawl();
+      const requested = await this._checkRequestedRedirect(options, res.response);
+      if (requested) return [];
       extend(res, { options, depth });
       await crawler.close();
       this.emit(HCCrawler.Events.RequestFinished, res);
       await this._success(res);
+      await this._markRequested(options);
+      await this._markRequestedRedirects(options, res.redirectChain, res.response);
       this._exportLine(res);
       return res.links;
     } catch (error) {
@@ -387,7 +393,7 @@
     let sitemapXml = await this._cache.get(sitemapUrl);
     if (!sitemapXml) {
       try {
-        sitemapXml = await request(sitemapUrl);
+        sitemapXml = await rp(sitemapUrl);
       } catch (error) {
         this.emit(HCCrawler.Events.SitemapXmlRequestFailed, error);
         sitemapXml = EMPTY_TXT;
@@ -408,7 +414,7 @@
     let robotsTxt = await this._cache.get(robotsUrl);
     if (!robotsTxt) {
       try {
-        robotsTxt = await request(robotsUrl);
+        robotsTxt = await rp(robotsUrl);
       } catch (error) {
         this.emit(HCCrawler.Events.RobotsTxtRequestFailed, error);
         robotsTxt = EMPTY_TXT;
@@ -454,6 +460,18 @@
     return !!value;
   }
 
+  /**
+   * @param {!Object} options
+   * @param {!Object} response
+   * @return {!Promise}
+   * @private
+   */
+  async _checkRequestedRedirect(options, response) {
+    if (!options.skipRequestedRedirect) return false;
+    const requested = await this._checkRequested(extend({}, options, { url: response.url }));
+    return requested;
+  }
+
   /**
    * @param {!Object} options
    * @return {!Promise}
@@ -465,6 +483,21 @@
     await this._cache.set(key, '1');
   }
 
+  /**
+   * @param {!Object} options
+   * @param {!Array} redirectChain
+   * @param {!Object} response
+   * @return {!Promise}
+   * @private
+   */
+  async _markRequestedRedirects(options, redirectChain, response) {
+    if (!options.skipRequestedRedirect) return;
+    await Promise.all(map(redirectChain, async request => {
+      await this._markRequested(extend({}, options, { url: request.url }));
+    }));
+    await this._markRequested(extend({}, options, { url: response.url }));
+  }
+
   /**
    * @param {!Object} options
    * @return {!Promise}
diff --git a/lib/puppeteer.d.ts b/lib/puppeteer.d.ts
index b8dca753..637b5fd1 100644
--- a/lib/puppeteer.d.ts
+++ b/lib/puppeteer.d.ts
@@ -1,4 +1,4 @@
-import { Browser, Page, Dialog } from 'puppeteer';
+import { Browser, Page, Request, Response, Dialog } from 'puppeteer';
 
 export as namespace Puppeteer;
-export { Browser, Page, Dialog };
+export { Browser, Page, Request, Response, Dialog };
diff --git a/test/hccrawler/index.test.js b/test/hccrawler/index.test.js
index 6bb44f57..1736122e 100644
--- a/test/hccrawler/index.test.js
+++ b/test/hccrawler/index.test.js
@@ -98,7 +98,10 @@ describe('HCCrawler', function () {
 
   context('when the crawler is launched with necessary options', function () {
     beforeEach(async function () {
-      crawler = await HCCrawler.launch(extend({ onSuccess }, DEFAULT_OPTIONS));
+      crawler = await HCCrawler.launch(extend({
+        evaluatePage,
+        onSuccess,
+      }, DEFAULT_OPTIONS));
     });
 
     it('shows the browser version', async function () {
@@ -334,10 +337,7 @@ describe('HCCrawler', function () {
       });
 
       it('fails evaluating the delayed content without the waitFor option', async function () {
-        await crawler.queue({
-          url: INDEX_PAGE,
-          evaluatePage,
-        });
+        await crawler.queue(INDEX_PAGE);
         await crawler.onIdle();
         assert.equal(onSuccess.callCount, 1);
         assert.equal(onSuccess.firstCall.args[0].result, '');
@@ -347,7 +347,6 @@ describe('HCCrawler', function () {
         await crawler.queue({
           url: INDEX_PAGE,
           waitFor: { selectorOrFunctionOrTimeout: 150 },
-          evaluatePage,
         });
         await crawler.onIdle();
         assert.equal(onSuccess.callCount, 1);
@@ -358,7 +357,6 @@ describe('HCCrawler', function () {
         await crawler.queue({
           url: INDEX_PAGE,
           waitFor: { selectorOrFunctionOrTimeout: 'h1' },
-          evaluatePage,
         });
         await crawler.onIdle();
         assert.equal(onSuccess.callCount, 1);
@@ -374,7 +372,6 @@ describe('HCCrawler', function () {
             )),
             args: ['Welcome to'],
           },
-          evaluatePage,
         });
         await crawler.onIdle();
         assert.equal(onSuccess.callCount, 1);
@@ -382,6 +379,41 @@ describe('HCCrawler', function () {
       });
     });
 
+    context('when the page is redirected multiple times', function () {
+      beforeEach(function () {
+        server.setRedirect('/1.html', '/2.html');
+        server.setRedirect('/2.html', '/3.html');
+      });
+
+      it('resolves a redirect chain', async function () {
+        await crawler.queue(`${PREFIX}/1.html`);
+        await crawler.onIdle();
+        assert.equal(onSuccess.callCount, 1);
+        assert.equal(onSuccess.firstCall.args[0].result, '/3.html');
+        assert.equal(onSuccess.firstCall.args[0].redirectChain.length, 2);
+        assert.equal(onSuccess.firstCall.args[0].redirectChain[0].url, `${PREFIX}/1.html`);
+        assert.equal(onSuccess.firstCall.args[0].redirectChain[1].url, `${PREFIX}/2.html`);
+      });
+
+      it('requests already requested redirects', async function () {
+        await crawler.queue(`${PREFIX}/1.html`);
+        await crawler.onIdle();
+        await crawler.queue(`${PREFIX}/2.html`);
+        await crawler.queue(`${PREFIX}/3.html`);
+        await crawler.onIdle();
+        assert.equal(onSuccess.callCount, 3);
+      });
+
+      it('skips already requested redirects with skipRequestedRedirect = true', async function () {
+        await crawler.queue({ url: `${PREFIX}/1.html`, skipRequestedRedirect: true });
+        await crawler.onIdle();
+        await crawler.queue({ url: `${PREFIX}/2.html`, skipRequestedRedirect: true });
+        await crawler.queue({ url: `${PREFIX}/3.html`, skipRequestedRedirect: true });
+        await crawler.onIdle();
+        assert.equal(onSuccess.callCount, 1);
+      });
+    });
+
     context('when the page requires the basic authentication', function () {
       beforeEach(function () {
         server.setContent('/', 'Authorization succeeded!');
@@ -389,10 +421,7 @@ describe('HCCrawler', function () {
       });
 
       it('fails authentication when username and password options are not set', async function () {
-        await crawler.queue({
-          url: INDEX_PAGE,
-          evaluatePage,
-        });
+        await crawler.queue(INDEX_PAGE);
         await crawler.onIdle();
         assert.equal(onSuccess.callCount, 1);
         assert.equal(onSuccess.firstCall.args[0].result, 'HTTP Error 401 Unauthorized: Access is denied');
@@ -403,7 +432,6 @@ describe('HCCrawler', function () {
           url: INDEX_PAGE,
           username: 'password',
           password: 'username',
-          evaluatePage,
         });
         await crawler.onIdle();
         assert.equal(onSuccess.callCount, 1);
@@ -415,7 +443,6 @@ describe('HCCrawler', function () {
           url: INDEX_PAGE,
           username: 'username',
           password: 'password',
-          evaluatePage,
         });
         await crawler.onIdle();
         assert.equal(onSuccess.callCount, 1);
@@ -454,8 +481,8 @@ describe('HCCrawler', function () {
     beforeEach(async function () {
       server.setContent('/', '');
       crawler = await HCCrawler.launch(extend({
-        onSuccess,
         evaluatePage,
+        onSuccess,
         device: 'iPhone 6',
       }, DEFAULT_OPTIONS));
     });
@@ -485,6 +512,7 @@ describe('HCCrawler', function () {
   context('when the crawler is launched with retryCount = 0', function () {
     beforeEach(async function () {
       crawler = await HCCrawler.launch(extend({
+        evaluatePage,
         onSuccess,
         onError,
         retryCount: 0,
@@ -492,14 +520,14 @@ describe('HCCrawler', function () {
     });
 
     it('succeeds evaluating page', async function () {
-      await crawler.queue({ url: INDEX_PAGE, evaluatePage });
+      await crawler.queue(INDEX_PAGE);
       await crawler.onIdle();
       assert.equal(onSuccess.callCount, 1);
       assert.equal(onSuccess.firstCall.args[0].result, '/');
     });
 
     it('fails evaluating page with jQuery = false', async function () {
-      await crawler.queue({ url: INDEX_PAGE, jQuery: false, evaluatePage });
+      await crawler.queue({ url: INDEX_PAGE, jQuery: false });
       await crawler.onIdle();
       assert.equal(onError.callCount, 1);
       assert.ok(includes(onError.firstCall.args[0].message, 'Evaluation failed:'));
@@ -514,7 +542,7 @@ describe('HCCrawler', function () {
     });
 
     it('succeeds evaluating page', async function () {
-      await crawler.queue({ url: `${PREFIX}/csp.html`, evaluatePage });
+      await crawler.queue(`${PREFIX}/csp.html`);
       await crawler.onIdle();
       assert.equal(onSuccess.callCount, 1);
       assert.ok(includes(onSuccess.firstCall.args[0].result, 'Welcome to'));
@@ -527,7 +555,7 @@ describe('HCCrawler', function () {
     });
 
     it('succeeds evaluating page', async function () {
-      await crawler.queue({ url: `${INDEX_PAGE}`, evaluatePage });
+      await crawler.queue(`${INDEX_PAGE}`);
      await crawler.onIdle();
       assert.equal(onSuccess.callCount, 1);
       assert.equal(onSuccess.firstCall.args[0].result, '/');
@@ -540,7 +568,7 @@ describe('HCCrawler', function () {
     });
 
     it('succeeds request when the timeout option is not set', async function () {
-      await crawler.queue({ url: INDEX_PAGE });
+      await crawler.queue(INDEX_PAGE);
       await crawler.onIdle();
       assert.equal(onSuccess.callCount, 1);
     });
diff --git a/test/server/index.js b/test/server/index.js
index 088f6657..b986dbb0 100644
--- a/test/server/index.js
+++ b/test/server/index.js
@@ -20,6 +20,7 @@ class Server {
   constructor(port) {
     this._server = createServer(this._onRequest.bind(this));
     this._server.listen(port);
+    this._routes = new Map();
     this._delays = new Map();
     this._auths = new Map();
     this._csps = new Map();
@@ -27,6 +28,7 @@ class Server {
   }
 
   reset() {
+    this._routes.clear();
     this._delays.clear();
     this._auths.clear();
     this._contents.clear();
@@ -73,6 +75,17 @@ class Server {
     this._delays.set(path, delay);
   }
 
+  /**
+   * @param {string} from
+   * @param {string} to
+   */
+  setRedirect(from, to) {
+    this._routes.set(from, (request, response) => {
+      response.writeHead(302, { location: to });
+      response.end();
+    });
+  }
+
   /**
    * @param {!IncomingMessage} request
    * @param {!ServerResponse} response
@@ -90,6 +103,11 @@ class Server {
     }
     const delay = this._delays.get(path) || 0;
     setTimeout(() => {
+      const route = this._routes.get(path);
+      if (route) {
+        route(request, response);
+        return;
+      }
      const content = this._contents.get(path);
       if (content) {
         response.end(content);
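
Taken together, these changes expose each request's redirect chain to `onSuccess` and add a `skipRequestedRedirect` option that skips URLs already seen in earlier redirect chains. A minimal usage sketch of the new surface, assuming the API documented in API.md above (the queued URL is illustrative, not part of the diff):

```js
const HCCrawler = require('headless-chrome-crawler');

(async () => {
  const crawler = await HCCrawler.launch({
    // Called for each successfully crawled page.
    onSuccess: result => {
      // result.redirectChain lists the requests that redirected to the final
      // response; each entry carries its url and headers.
      console.log(result.response.url, result.redirectChain.map(request => request.url));
    },
  });
  // With skipRequestedRedirect: true, a URL that already appeared in an earlier
  // redirect chain is skipped instead of being crawled again. This only applies
  // while skipDuplicates (enabled by default) stays on.
  await crawler.queue({ url: 'https://example.com/', skipRequestedRedirect: true });
  await crawler.onIdle();
  await crawler.close();
})();
```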