Merge pull request #223 from yujiosaka/support-redirect-chain
Support redirect chain
yujiosaka authored Apr 20, 2018
2 parents 37ded4f + f9fab14 commit c485126
Showing 6 changed files with 141 additions and 32 deletions.
API.md (8 changes: 6 additions & 2 deletions)
@@ -75,6 +75,9 @@ const HCCrawler = require('headless-chrome-crawler');
* `options` <[Object]> [crawler.queue()](#crawlerqueueoptions)'s options with default values.
* `onSuccess(response)` <[Function]> Function to be called when `evaluatePage()` succeeds.
* `response` <[Object]>
* `redirectChain` <[Array]<[Object]>> Redirect chain of requests.
* `url` <[string]> Requested url.
* `headers` <[Object]> Request headers.
* `response` <[Object]>
* `ok` <[boolean]> Whether the status code is in the range 200-299 or not.
* `status` <[string]> Status code of the request.
@@ -83,7 +86,7 @@ const HCCrawler = require('headless-chrome-crawler');
* `options` <[Object]> [crawler.queue()](#crawlerqueueoptions)'s options with default values.
* `result` <[Serializable]> The result resolved from `evaluatePage()` option.
* `screenshot` <[Buffer]> Buffer with the screenshot image, which is `null` when the `screenshot` option is not passed.
* `links` <[Array]> List of links found in the requested page.
* `links` <[Array]<[string]>> List of links found in the requested page.
* `depth` <[number]> Depth of the followed links.
* `onError(error)` <[Function]> Function to be called when request fails.
* `error` <[Error]> Error object.
@@ -158,7 +161,8 @@ url, allowedDomains, deniedDomains, timeout, priority, depthPriority, delay, ret
* `maxDepth` <[number]> Maximum depth for the crawler to follow links automatically, defaults to `1`. Leave default to disable following links.
* `priority` <[number]> Basic priority of queues, defaults to `1`. Priority with larger number is preferred.
* `depthPriority` <[boolean]> Whether to adjust priority based on its depth, defaults to `true`. Leave default to increase priority for higher depth, which is [depth-first search](https://en.wikipedia.org/wiki/Depth-first_search).
* `skipDuplicates` <[boolean]> Whether to skip duplicate requests, defaults to `null`. Two requests are considered duplicates when their `url`, `userAgent`, `device` and `extraHeaders` are strictly the same.
* `skipDuplicates` <[boolean]> Whether to skip duplicate requests, defaults to `true`. Two requests are considered duplicates when their `url`, `userAgent`, `device` and `extraHeaders` are strictly the same.
* `skipRequestedRedirect` <[boolean]> Whether to skip requests whose urls already appeared in the redirect chains of earlier requests, defaults to `false`. This option is ignored when `skipDuplicates` is set to `false`.
* `obeyRobotsTxt` <[boolean]> Whether to obey [robots.txt](https://developers.google.com/search/reference/robots_txt), defaults to `true`.
* `followSitemapXml` <[boolean]> Whether to use [sitemap.xml](https://www.sitemaps.org/) to find locations, defaults to `false`.
* `allowedDomains` <[Array]<[string]|[RegExp]>> List of domains allowed to request. Pass `null` or leave default to skip checking allowed domains.
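
Taken together, these API.md changes document the two user-facing additions: `onSuccess()` now receives a `redirectChain` array of `{ url, headers }` objects alongside `response`, and queue options gain `skipRequestedRedirect`. A minimal usage sketch, assuming the launch/queue/onIdle flow from the project's README (the queued url is only a placeholder):

const HCCrawler = require('headless-chrome-crawler');

(async () => {
  const crawler = await HCCrawler.launch({
    onSuccess: result => {
      // Each redirectChain entry carries the requested url and its headers.
      result.redirectChain.forEach(request => {
        console.log(`redirected via ${request.url}`);
      });
      console.log(`landed on ${result.response.url} (status ${result.response.status})`);
    },
  });
  // Pages reached through an already-recorded redirect chain are skipped.
  await crawler.queue({ url: 'http://example.com/', skipRequestedRedirect: true });
  await crawler.onIdle();
  await crawler.close();
})();
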
lib/crawler.js (34 changes: 30 additions & 4 deletions)
@@ -1,8 +1,8 @@
const map = require('lodash/map');
const reduce = require('lodash/reduce');
const pick = require('lodash/pick');
const isEmpty = require('lodash/isEmpty');
const uniq = require('lodash/uniq');
const noop = require('lodash/noop');
const devices = require('puppeteer/DeviceDescriptors');
const {
resolveUrl,
@@ -15,6 +15,10 @@ const GOTO_OPTIONS = [
'timeout',
'waitUntil',
];
const REQUEST_FIELDS = [
'url',
'headers',
];
const RESPONSE_FIELDS = [
'ok',
'url',
@@ -46,6 +50,7 @@ class Crawler {
const links = await this._collectLinks(response.url);
return {
response: this._reduceResponse(response),
redirectChain: this._getRedirectChain(response),
result,
screenshot,
links,
@@ -209,9 +214,9 @@ class Crawler {
* @private
*/
async _scrape() {
const evaluatePage = this._options.evaluatePage || noop;
if (!this._options.evaluatePage) return null;
await this._addJQuery();
return this._page.evaluate(evaluatePage);
return this._page.evaluate(this._options.evaluatePage);
}

/**
@@ -267,7 +272,19 @@ class Crawler {
}

/**
* @param {!Response} response
* @param {!Puppeteer.Request} request
* @return {!Object}
* @private
*/
_reduceRequest(request) {
return reduce(REQUEST_FIELDS, (memo, field) => {
memo[field] = request[field]();
return memo;
}, {});
}

/**
* @param {!Puppeteer.Response} response
* @return {!Object}
* @private
*/
@@ -277,6 +294,15 @@
return memo;
}, {});
}

/**
* @param {!Puppeteer.Response} response
* @return {!Array<!Object>}
* @private
*/
_getRedirectChain(response) {
return map(response.request().redirectChain(), this._reduceRequest);
}
}

tracePublicAPI(Crawler);
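
Two notes on the crawler.js changes: `_scrape()` now returns `null` outright when no `evaluatePage` option is set (instead of evaluating a lodash `noop` in the page), and the new `_reduceRequest()` / `_getRedirectChain()` pair converts Puppeteer's redirect chain into plain serializable objects. On a Puppeteer `Request`, `url` and `headers` are accessor methods rather than properties, which is why the reducer calls `request[field]()`. A standalone sketch of that pattern, using a hypothetical stub request:

const reduce = require('lodash/reduce');

const REQUEST_FIELDS = ['url', 'headers'];

// Hypothetical stand-in for a Puppeteer.Request: both fields are methods,
// not plain properties, so the reducer has to invoke them.
const stubRequest = {
  url: () => 'http://example.com/old-path',
  headers: () => ({ 'user-agent': 'HeadlessChrome' }),
};

// Same shape as _reduceRequest() above: pick each field accessor into a plain object.
const reduced = reduce(REQUEST_FIELDS, (memo, field) => {
  memo[field] = stubRequest[field]();
  return memo;
}, {});

console.log(reduced);
// => { url: 'http://example.com/old-path', headers: { 'user-agent': 'HeadlessChrome' } }
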
lib/hccrawler.js (41 changes: 37 additions & 4 deletions)
@@ -8,7 +8,7 @@ const each = require('lodash/each');
const includes = require('lodash/includes');
const isString = require('lodash/isString');
const isArray = require('lodash/isArray');
const request = require('request-promise');
const rp = require('request-promise');
// @ts-ignore
const robotsParser = require('robots-parser');
const Puppeteer = require('puppeteer');
@@ -119,6 +119,7 @@
depthPriority: true,
obeyRobotsTxt: true,
followSitemapXml: false,
skipRequestedRedirect: false,
screenshot: null,
viewport: null,
}, options);
@@ -284,14 +285,15 @@
*/
async _startRequest(options, depth) {
const skip = await this._skipRequest(options);
await this._markRequested(options);
if (skip) {
this.emit(HCCrawler.Events.RequestSkipped, options);
await this._markRequested(options);
return;
}
const allowed = await this._checkAllowedRobots(options);
if (!allowed) {
this.emit(HCCrawler.Events.RequestDisallowed, options);
await this._markRequested(options);
return;
}
await this._followSitemap(options, depth);
@@ -329,10 +331,14 @@
this.emit(HCCrawler.Events.NewPage, crawler.page());
try {
const res = await crawler.crawl();
const requested = await this._checkRequestedRedirect(options, res.response);
if (requested) return [];
extend(res, { options, depth });
await crawler.close();
this.emit(HCCrawler.Events.RequestFinished, res);
await this._success(res);
await this._markRequested(options);
await this._markRequestedRedirects(options, res.redirectChain, res.response);
this._exportLine(res);
return res.links;
} catch (error) {
@@ -387,7 +393,7 @@
let sitemapXml = await this._cache.get(sitemapUrl);
if (!sitemapXml) {
try {
sitemapXml = await request(sitemapUrl);
sitemapXml = await rp(sitemapUrl);
} catch (error) {
this.emit(HCCrawler.Events.SitemapXmlRequestFailed, error);
sitemapXml = EMPTY_TXT;
@@ -408,7 +414,7 @@
let robotsTxt = await this._cache.get(robotsUrl);
if (!robotsTxt) {
try {
robotsTxt = await request(robotsUrl);
robotsTxt = await rp(robotsUrl);
} catch (error) {
this.emit(HCCrawler.Events.RobotsTxtRequestFailed, error);
robotsTxt = EMPTY_TXT;
@@ -454,6 +460,18 @@
return !!value;
}

/**
* @param {!Object} options
* @param {!Object} response
* @return {!Promise<!boolean>}
* @private
*/
async _checkRequestedRedirect(options, response) {
if (!options.skipRequestedRedirect) return false;
const requested = await this._checkRequested(extend({}, options, { url: response.url }));
return requested;
}

/**
* @param {!Object} options
* @return {!Promise}
@@ -465,6 +483,21 @@
await this._cache.set(key, '1');
}

/**
* @param {!Object} options
* @param {!Array<!Object>} redirectChain
* @param {!Object} response
* @return {!Promise}
* @private
*/
async _markRequestedRedirects(options, redirectChain, response) {
if (!options.skipRequestedRedirect) return;
await Promise.all(map(redirectChain, async request => {
await this._markRequested(extend({}, options, { url: request.url }));
}));
await this._markRequested(extend({}, options, { url: response.url }));
}

/**
* @param {!Object} options
* @return {!Promise<?boolean>}
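
The hccrawler.js changes wire the new option into the crawl lifecycle: a url is now marked as requested when its request is skipped, disallowed, or finished (rather than up front), `_checkRequestedRedirect()` discards a finished crawl (returning `[]`) when the final response url was already recorded, and `_markRequestedRedirects()` records every url in the redirect chain plus the final url. A sketch of that bookkeeping with an in-memory Set standing in for the session cache; note the real implementation keys on a hash of the full options (`url`, `userAgent`, `device`, `extraHeaders`), not the url alone, and these function names are illustrative:

const requestedUrls = new Set();

function checkRequestedRedirect(options, response) {
  if (!options.skipRequestedRedirect) return false;
  // Was the final landing url already seen, e.g. as an earlier redirect hop?
  return requestedUrls.has(response.url);
}

function markRequestedRedirects(options, redirectChain, response) {
  if (!options.skipRequestedRedirect) return;
  redirectChain.forEach(request => requestedUrls.add(request.url)); // every hop
  requestedUrls.add(response.url); // plus the final landing url
}
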
lib/puppeteer.d.ts (4 changes: 2 additions & 2 deletions)
@@ -1,4 +1,4 @@
import { Browser, Page, Dialog } from 'puppeteer';
import { Browser, Page, Request, Response, Dialog } from 'puppeteer';

export as namespace Puppeteer;
export { Browser, Page, Dialog };
export { Browser, Page, Request, Response, Dialog };