/**
 * Lower-case substrings that identify known crawlers, link-preview fetchers
 * and SEO tools when found anywhere inside a `User-Agent` string.
 *
 * Exported so tests can iterate over every default entry.
 */
export const DEFAULT_BLOCKED_UA_STRS = [
    'ahrefsbot',
    'applebot',
    'baiduspider',
    'bingbot',
    'bingpreview',
    'bot.htm',
    'bot.php',
    'crawler',
    'duckduckbot',
    'facebookexternal',
    'facebookcatalog',
    'gptbot',
    'hubspot',
    'linkedinbot',
    'mj12bot',
    'petalbot',
    'pinterest',
    'prerender',
    'rogerbot',
    'screaming frog',
    'semrushbot',
    'sitebulb',
    'twitterbot',
    'yahoo! slurp',
    'yandexbot',

    // a whole bunch of goog-specific crawlers
    // https://developers.google.com/search/docs/advanced/crawling/overview-google-crawlers
    'adsbot-google',
    'apis-google',
    'duplexweb-google',
    'feedfetcher-google',
    'google favicon',
    'google web preview',
    'google-read-aloud',
    'googlebot',
    'googleweblight',
    'mediapartners-google',
    'storebot-google',
]

/**
 * Decides whether a user agent belongs to a bot, so that web spiders executing
 * our JS do not send false capturing data.
 *
 * Matching is a case-insensitive substring test: real crawlers announce
 * themselves with mixed case (`Googlebot`, `YandexBot`, `AhrefsBot`) while the
 * block list is written in lower case, so both sides are lower-cased before
 * comparing. Entries are treated as plain text, never as regex patterns, so
 * `bot.htm` only matches a literal dot.
 *
 * @param ua - the user agent string to test (callers pass `navigator.userAgent`)
 * @param customBlockedUserAgents - extra substrings from the
 *   `custom_blocked_useragents` config option; these are **added to**
 *   `DEFAULT_BLOCKED_UA_STRS`, not a replacement for it. Defaults to the empty
 *   array, matching the config default.
 * @returns `true` when `ua` contains any default or custom blocked substring
 */
export const _isBlockedUA = function (ua: string, customBlockedUserAgents: string[] = []): boolean {
    if (!ua) {
        // nothing to match against, and it keeps `toLowerCase` below safe
        return false
    }

    const lowerCasedUA = ua.toLowerCase()

    return DEFAULT_BLOCKED_UA_STRS.concat(customBlockedUserAgents).some((blockedUA) => {
        if (!blockedUA) {
            // an empty (or missing) entry is a substring of every user agent;
            // skip it rather than silently blocking all traffic
            return false
        }
        // `indexOf` instead of `includes`: IE 11 has no String.prototype.includes
        return lowerCasedUA.indexOf(blockedUA.toLowerCase()) !== -1
    })
}