diff --git a/src/__tests__/utils.js b/src/__tests__/utils.js
index a3ca73dfd..b60b98b70 100644
--- a/src/__tests__/utils.js
+++ b/src/__tests__/utils.js
@@ -5,7 +5,13 @@
  * currently not supported in the browser lib).
  */
 
-import { _copyAndTruncateStrings, _info, loadScript } from '../utils'
+import { _copyAndTruncateStrings, _info, _isBlockedUA, DEFAULT_BLOCKED_UA_STRS, loadScript } from '../utils'
+
+function userAgentFor(botString) {
+    const randOne = (Math.random() + 1).toString(36).substring(7)
+    const randTwo = (Math.random() + 1).toString(36).substring(7)
+    return `Mozilla/5.0 (compatible; ${botString}/${randOne}; +http://a.com/bot/${randTwo})`
+}
 
 describe(`utils.js`, () => {
     it('should have $host and $pathname in properties', () => {
@@ -208,4 +214,15 @@ describe('loadScript', () => {
         new_script.onerror('uh-oh')
         expect(callback).toHaveBeenCalledWith('uh-oh')
     })
+
+    describe('user agent blocking', () => {
+        it.each(DEFAULT_BLOCKED_UA_STRS.concat('testington'))(
+            'blocks a bot based on the user agent %s',
+            (botString) => {
+                const randomisedUserAgent = userAgentFor(botString)
+
+                expect(_isBlockedUA(randomisedUserAgent, ['testington'])).toBe(true)
+            }
+        )
+    })
 })
diff --git a/src/posthog-core.ts b/src/posthog-core.ts
index 81826d2f3..edd056947 100644
--- a/src/posthog-core.ts
+++ b/src/posthog-core.ts
@@ -114,6 +114,7 @@ const defaultConfig = (): PostHogConfig => ({
     loaded: __NOOP,
     store_google: true,
     custom_campaign_params: [],
+    custom_blocked_useragents: [],
     save_referrer: true,
     test: false,
     verbose: false,
@@ -853,7 +854,7 @@ export class PostHog {
            return
        }
 
-        if (_isBlockedUA(userAgent)) {
+        if (_isBlockedUA(userAgent, this.get_config('custom_blocked_useragents'))) {
            return
        }
 
diff --git a/src/types.ts b/src/types.ts
index e39a4f2a1..a041dc894 100644
--- a/src/types.ts
+++ b/src/types.ts
@@ -63,6 +63,10 @@ export interface PostHogConfig {
     loaded: (posthog_instance: PostHog) => void
     store_google: boolean
     custom_campaign_params: string[]
+    // a list of strings to be tested against navigator.userAgent to determine if the source is a bot
+    // this is **added to** the default list of bots that we check
+    // defaults to the empty array
+    custom_blocked_useragents: string[]
     save_referrer: boolean
     test: boolean
     verbose: boolean
diff --git a/src/utils.ts b/src/utils.ts
index 0a786882c..2ee2ab003 100644
--- a/src/utils.ts
+++ b/src/utils.ts
@@ -461,18 +461,60 @@ export const _utf8Encode = function (string: string): string {
     return utftext
 }
 
+export const DEFAULT_BLOCKED_UA_STRS = [
+    'ahrefsbot',
+    'applebot',
+    'baiduspider',
+    'bingbot',
+    'bingpreview',
+    'bot.htm',
+    'bot.php',
+    'crawler',
+    'duckduckbot',
+    'facebookexternal',
+    'facebookcatalog',
+    'gptbot',
+    'hubspot',
+    'linkedinbot',
+    'mj12bot',
+    'petalbot',
+    'pinterest',
+    'prerender',
+    'rogerbot',
+    'screaming frog',
+    'semrushbot',
+    'sitebulb',
+    'twitterbot',
+    'yahoo! slurp',
+    'yandexbot',
+
+    // a whole bunch of goog-specific crawlers
+    // https://developers.google.com/search/docs/advanced/crawling/overview-google-crawlers
+    'adsbot-google',
+    'apis-google',
+    'duplexweb-google',
+    'feedfetcher-google',
+    'google favicon',
+    'google web preview',
+    'google-read-aloud',
+    'googlebot',
+    'googleweblight',
+    'mediapartners-google',
+    'storebot-google',
+]
+
 // _.isBlockedUA()
 // This is to block various web spiders from executing our JS and
 // sending false capturing data
-export const _isBlockedUA = function (ua: string): boolean {
-    if (
-        /(google web preview|baiduspider|yandexbot|bingbot|googlebot|yahoo! slurp|ahrefsbot|facebookexternalhit|facebookcatalog|applebot|semrushbot|duckduckbot|twitterbot|rogerbot|linkedinbot|mj12bot|sitebulb|bot.htm|bot.php|hubspot|crawler|prerender|gptbot)/i.test(
-            ua
-        )
-    ) {
-        return true
-    }
-    return false
+export const _isBlockedUA = function (ua: string, customBlockedUserAgents: string[]): boolean {
+    return DEFAULT_BLOCKED_UA_STRS.concat(customBlockedUserAgents).some((blockedUA) => {
+        if (ua.includes) {
+            return ua.includes(blockedUA)
+        } else {
+            // IE 11 :/
+            return ua.indexOf(blockedUA) !== -1
+        }
+    })
 }
 
 /**
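
A minimal usage sketch of the new custom_blocked_useragents option, not part of the diff. It assumes the standard posthog.init entry point from posthog-js; the 'my-internal-monitor' string is a hypothetical user agent substring used only for illustration. Entries are matched as substrings of navigator.userAgent on top of DEFAULT_BLOCKED_UA_STRS, and the SDK returns early for matching user agents (per the posthog-core.ts change above).

// usage sketch, not part of the diff
import posthog from 'posthog-js'

posthog.init('<project-api-key>', {
    // hypothetical substring to block, checked alongside DEFAULT_BLOCKED_UA_STRS
    custom_blocked_useragents: ['my-internal-monitor'],
})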