feat(cdp): Modify bot filter to use global list (#37893)

This commit is contained in:
Ben White
2025-09-23 14:21:23 +02:00
committed by GitHub
parent 074082a4a9
commit fa27c01966
11 changed files with 13393 additions and 354 deletions

View File

@@ -43,6 +43,7 @@
!requirements.txt
!rust
!share/GeoLite2-City.mmdb
!share/bot-ips.txt
!staticfiles
!tailwind.config.js
!test-runner-jest-environment.js

63
.github/workflows/update-bot-ips.yml vendored Normal file
View File

@@ -0,0 +1,63 @@
# Refreshes share/bot-ips.txt from the GoodBots repository once a day and opens
# a pull request when the downloaded list differs from the committed one.
name: Update Bot IPs

on:
    schedule:
        # Run daily at 2 AM UTC
        - cron: '0 2 * * *'
    workflow_dispatch: # Allow manual triggering

permissions:
    contents: read
    pull-requests: write

jobs:
    update-bot-ips:
        runs-on: ubuntu-latest
        steps:
            - name: Checkout repository
              uses: actions/checkout@v4
              with:
                  token: ${{ secrets.POSTHOG_BOT_GITHUB_TOKEN }}
                  fetch-depth: 0

            - name: Run update script
              run: |
                  ./bin/update-bots-list

            - name: Check for changes
              id: check-changes
              run: |
                  if git diff --quiet share/bot-ips.txt; then
                    echo "changes=false" >> "$GITHUB_OUTPUT"
                    echo "No changes detected in bot IPs"
                  else
                    echo "changes=true" >> "$GITHUB_OUTPUT"
                    echo "Changes detected in bot IPs"
                    git diff --stat share/bot-ips.txt
                  fi

            - name: Create Pull Request
              if: steps.check-changes.outputs.changes == 'true'
              uses: peter-evans/create-pull-request@271a8d0340265f705b14b6d32b9829c1cb33d45e # v5
              with:
                  # Use the same bot token as the checkout step. The default GITHUB_TOKEN is
                  # limited to `contents: read` by the permissions block above, so it cannot
                  # push the PR branch; it also cannot request team reviews, and pull requests
                  # it opens do not trigger other workflows.
                  token: ${{ secrets.POSTHOG_BOT_GITHUB_TOKEN }}
                  commit-message: 'Update bot IPs from GoodBots repository'
                  title: '🤖 Update bot IPs from GoodBots repository'
                  body: |
                      This PR updates the bot IPs list from the [GoodBots repository](https://github.com/AnTheMaker/GoodBots).
                      **Changes:**
                      - Updated `share/bot-ips.txt` with latest bot IPs
                      **Source:** https://raw.githubusercontent.com/AnTheMaker/GoodBots/main/all.ips
                      This is an automated update that runs daily.
                  base: master
                  delete-branch: true
                  labels: |
                      automated
                      bot-ips
                      dependencies
                  # Action inputs must be strings (comma or newline separated), not YAML
                  # sequences, and teams are requested through `team-reviewers` by team slug;
                  # `reviewers` only accepts individual usernames.
                  team-reviewers: team-messaging

33
bin/update-bots-list Executable file
View File

@@ -0,0 +1,33 @@
#!/bin/bash
# Script to update bot IPs from GoodBots repository
# Downloads IPs from GitHub and saves to share/bot-ips.txt
#
# The list is downloaded into a temporary file first and only copied over the
# target once the download succeeded and produced a non-empty file, so a failed
# run never leaves a truncated or bogus share/bot-ips.txt behind.
set -euo pipefail

# Configuration
GITHUB_URL="https://raw.githubusercontent.com/AnTheMaker/GoodBots/main/all.ips"
TARGET_FILE="share/bot-ips.txt"

# Get the directory of this script
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
TARGET_PATH="$PROJECT_ROOT/$TARGET_FILE"

echo "Downloading bot IPs from: $GITHUB_URL"
echo "Saving to: $TARGET_PATH"

# Create directory if it doesn't exist
mkdir -p "$(dirname "$TARGET_PATH")"

# Scratch file for the download; always removed when the script exits.
TMP_PATH="$(mktemp)"
trap 'rm -f "$TMP_PATH"' EXIT

# Download and verify the IPs.
#   -f  fail on HTTP errors instead of saving the error page as the IP list
#   -sS stay quiet but still print the reason when the transfer fails
#   -L  follow redirects
# Running curl inside the `if` keeps `set -e` from aborting before the error
# message below can be printed.
if curl -fsSL "$GITHUB_URL" -o "$TMP_PATH" && [ -s "$TMP_PATH" ]; then
    # Write through redirection (rather than mv) so the target keeps its
    # existing file mode instead of inheriting mktemp's restrictive one.
    cat "$TMP_PATH" > "$TARGET_PATH"
    LINE_COUNT=$(wc -l < "$TARGET_PATH")
    echo "Successfully downloaded $LINE_COUNT IP addresses to $TARGET_PATH"
else
    echo "Error: Failed to download or file is empty" >&2
    exit 1
fi

View File

@@ -1,41 +0,0 @@
import itertools
import requests
def get_prefixes_from_url(url):
    """Fetch a JSON IP-range document and return the prefixes it lists.

    The document is expected to be an object with a "prefixes" array whose
    items carry an "ipv4Prefix" and/or an "ipv6Prefix" key.

    Args:
        url: Address of the JSON document to download.

    Returns:
        A flat list of the non-empty ipv4/ipv6 prefix values, in document order.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if the server does not respond within the timeout.
        KeyError: if the document has no "prefixes" key.
    """
    # Without an explicit timeout requests waits forever on a stalled connection.
    response = requests.get(url, timeout=30)
    # Fail loudly on an error status instead of trying to parse an error page as JSON.
    response.raise_for_status()
    data = response.json()
    return [
        x
        for x in itertools.chain.from_iterable(
            (prefix.get("ipv4Prefix", None), prefix.get("ipv6Prefix", None)) for prefix in data["prefixes"]
        )
        if x
    ]
def to_hog_friendly_string(ip_ranges):
    """Render a mapping of source name -> prefixes as a Hog array literal.

    Each source becomes a `// name` comment line followed by one quoted,
    comma-terminated line per prefix; sources are separated by a blank line.

    Args:
        ip_ranges: Mapping from a source label to an iterable of prefix strings.

    Returns:
        The text of a `let known_bot_ip_prefixes := [...]` statement, ending in a newline.
    """
    sections = []
    for key, value in ip_ranges.items():
        entries = "".join(f" '{ip}',\n" for ip in value)
        sections.append(f" // {key}\n{entries}")
    # Joining with "\n" puts the blank separator line between sections only.
    return "let known_bot_ip_prefixes := [\n" + "\n".join(sections) + "];\n"
def update_bot_ip_ranges():
    """Download the published crawler ranges and print them as a Hog array literal."""
    # this list is incomplete, you can help by expanding it
    sources = (
        ("ahrefs", "https://api.ahrefs.com/v3/public/crawler-ip-ranges"),
        ("bing", "https://www.bing.com/toolbox/bingbot.json"),
        ("google", "https://www.gstatic.com/ipranges/goog.json"),
    )
    # Fetched in declaration order; the dict keeps that order for rendering.
    ip_ranges = {name: get_prefixes_from_url(url) for name, url in sources}
    rendered = to_hog_friendly_string(ip_ranges)
    print(rendered)  # noqa: T201


if __name__ == "__main__":
    update_bot_ip_ranges()

View File

@@ -0,0 +1,240 @@
import fs from 'fs'
import path from 'path'
/**
 * Known bot IP entries, loaded synchronously from `share/bot-ips.txt` when this module is
 * first imported (one entry per line, surrounding whitespace removed).
 *
 * Blank lines are dropped: the file's trailing newline would otherwise leave an empty-string
 * entry in the list, which an exact-match lookup would treat as a known bot IP.
 */
export const KNOWN_BOT_IP_LIST = fs
    .readFileSync(path.join(__dirname, '../../../../..', 'share', 'bot-ips.txt'), 'utf8')
    .split('\n')
    .map((line) => line.trim())
    .filter((line) => line !== '')
/**
 * Lowercase fragments that identify a bot user agent.
 *
 * `isKnownBotUserAgent` lowercases the incoming user agent and reports a bot when it contains
 * any of these fragments anywhere in the string, so entries must stay lowercase and short,
 * generic entries (e.g. 'bot', 'fetch') match very broadly.
 *
 * NOTE(review): 'webmon ' carries a trailing space; it was carried over verbatim from the
 * list previously embedded in the bot-detection template — presumably intentional, confirm
 * before "fixing" it.
 */
export const KNOWN_BOT_UA_LIST = [
'bot',
'crawler',
'spider',
'feedfetcher-google',
'mediapartners-google',
'apis-google',
'slurp',
'python-urllib',
'python-requests',
'aiohttp',
'httpx',
'libwww-perl',
'httpunit',
'nutch',
'go-http-client',
'biglotron',
'teoma',
'convera',
'gigablast',
'ia_archiver',
'webmon ',
'httrack',
'grub.org',
'netresearchserver',
'speedy',
'fluffy',
'findlink',
'panscient',
'ips-agent',
'yanga',
'yandex',
'yadirectfetcher',
'cyberpatrol',
'postrank',
'page2rss',
'linkdex',
'ezooms',
'heritrix',
'findthatfile',
'europarchive.org',
'mappydata',
'eright',
'apercite',
'aboundex',
'summify',
'ec2linkfinder',
'facebookexternalhit',
'yeti',
'retrevopageanalyzer',
'sogou',
'wotbox',
'ichiro',
'drupact',
'coccoc',
'integromedb',
'siteexplorer.info',
'proximic',
'changedetection',
'cc metadata scaper',
'g00g1e.net',
'binlar',
'a6-indexer',
'admantx',
'megaindex',
'ltx71',
'bubing',
'qwantify',
'lipperhey',
'addthis',
'metauri',
'scrapy',
'capsulechecker',
'sonic',
'sysomos',
'trove',
'deadlinkchecker',
'slack-imgproxy',
'embedly',
'iskanie',
'skypeuripreview',
'google-adwords-instant',
'whatsapp',
'electricmonk',
'yahoo link preview',
'xenu link sleuth',
'pcore-http',
'appinsights',
'phantomjs',
'jetslide',
'newsharecounts',
'tineye',
'linkarchiver',
'digg deeper',
'snacktory',
'okhttp',
'nuzzel',
'omgili',
'pocketparser',
'um-ln',
'muckrack',
'netcraftsurveyagent',
'appengine-google',
'jetty',
'upflow',
'thinklab',
'traackr.com',
'twurly',
'mastodon',
'http_get',
'brandverity',
'check_http',
'ezid',
'genieo',
'meltwaternews',
'moreover',
'scoutjet',
'seoscanners',
'hatena',
'google web preview',
'adscanner',
'netvibes',
'baidu-yunguance',
'btwebclient',
'disqus',
'feedly',
'fever',
'flamingo_searchengine',
'flipboardproxy',
'g2 web services',
'vkshare',
'siteimprove.com',
'dareboost',
'feedspot',
'seokicks',
'tracemyfile',
'zgrab',
'pr-cy.ru',
'datafeedwatch',
'zabbix',
'google-xrawler',
'axios',
'amazon cloudfront',
'pulsepoint',
'cloudflare-alwaysonline',
'google-structured-data-testing-tool',
'wordupinfosearch',
'webdatastats',
'httpurlconnection',
'outbrain',
'w3c_validator',
'w3c-checklink',
'w3c-mobileok',
'w3c_i18n-checker',
'feedvalidator',
'w3c_css_validator',
'w3c_unicorn',
'google-physicalweb',
'blackboard',
'bazqux',
'twingly',
'rivva',
'dataprovider.com',
'theoldreader.com',
'anyevent',
'nmap scripting engine',
'2ip.ru',
'clickagy',
'google favicon',
'hubspot',
'chrome-lighthouse',
'headlesschrome',
'simplescraper',
'fedoraplanet',
'friendica',
'nextcloud',
'tiny tiny rss',
'datanyze',
'google-site-verification',
'trendsmapresolver',
'tweetedtimes',
'gwene',
'simplepie',
'searchatlas',
'superfeedr',
'freewebmonitoring sitechecker',
'pandalytics',
'seewithkids',
'cincraw',
'freshrss',
'google-certificates-bridge',
'viber',
'evc-batch',
'virustotal',
'uptime-kuma',
'feedbin',
'snap url preview service',
'ruxitsynthetic',
'google-read-aloud',
'mediapartners',
'wget',
'ahrefsgot',
'ahrefssiteaudit',
'wesee:search',
'y!j',
'collection@infegy.com',
'deusu',
'bingpreview',
'daum',
'pingdom',
'barkrowler',
'yak',
'ning',
'ahc',
'apache-httpclient',
'buck',
'newspaper',
'sentry',
'fetch',
'miniflux',
'validator.nu',
'grouphigh',
'checkmarknetwork',
'www.uptime.com',
'mixnodecache',
'domains project',
'pagepeeker',
'vigil',
'php-curl-class',
'ptst',
'seostar.co',
]

View File

@@ -16,7 +16,7 @@ import { HogFunctionMonitoringService } from '../services/monitoring/hog-functio
import { HogWatcherService, HogWatcherState } from '../services/monitoring/hog-watcher.service'
import { convertToHogFunctionFilterGlobal, filterFunctionInstrumented } from '../utils/hog-function-filtering'
import { createInvocation, createInvocationResult } from '../utils/invocation-utils'
import { cleanNullValues } from './transformation-functions'
import { getTransformationFunctions } from './transformation-functions'
export const hogTransformationDroppedEvents = new Counter({
name: 'hog_transformation_dropped_events',
@@ -102,15 +102,7 @@ export class HogTransformerService {
private async getTransformationFunctions() {
const geoipLookup = await this.hub.geoipService.get()
return {
geoipLookup: (val: unknown): any => {
return typeof val === 'string' ? geoipLookup.city(val) : null
},
cleanNullValues,
postHogCapture: () => {
throw new Error('posthogCapture is not supported in transformations')
},
}
return getTransformationFunctions(geoipLookup)
}
private createInvocationGlobals(event: PluginEvent): HogFunctionInvocationGlobals {

View File

@@ -1,3 +1,7 @@
import { GeoIp } from '~/utils/geoip'
import { KNOWN_BOT_IP_LIST, KNOWN_BOT_UA_LIST } from './bots/bots'
const MAX_DEPTH = 3
function cleanNullValuesInternal(value: unknown, depth: number): unknown {
@@ -32,3 +36,35 @@ function cleanNullValuesInternal(value: unknown, depth: number): unknown {
/**
 * Public entry point for null-value cleaning: hands the value to
 * `cleanNullValuesInternal`, starting its depth counter at 1.
 */
export function cleanNullValues(value: unknown): unknown {
    const initialDepth = 1
    return cleanNullValuesInternal(value, initialDepth)
}
/**
 * Reports whether a user agent contains any fragment from `KNOWN_BOT_UA_LIST`.
 *
 * The comparison is case-insensitive on the user agent side (it is lowercased before
 * matching). Anything that is not a string is never considered a bot.
 */
export const isKnownBotUserAgent = (value: unknown): boolean => {
    if (typeof value === 'string') {
        const normalized = value.toLowerCase()
        for (const fragment of KNOWN_BOT_UA_LIST) {
            if (normalized.includes(fragment)) {
                return true
            }
        }
    }
    return false
}
// Built once at module load so every lookup is a hash probe rather than a linear scan of the
// whole IP list for each processed event.
const KNOWN_BOT_IP_SET: ReadonlySet<string> = new Set(KNOWN_BOT_IP_LIST)

/**
 * Reports whether `ip` is an exact entry of the known bot IP list.
 *
 * Non-string and empty/whitespace-only values are never treated as bots. The explicit
 * empty check matters because the list is produced by splitting a text file on newlines,
 * which can leave an empty-string entry behind.
 *
 * NOTE(review): this is an exact string comparison. If `share/bot-ips.txt` contains CIDR
 * ranges rather than single addresses, those entries can never match here (the previous
 * template logic used `isIPAddressInRange`) — confirm the file format.
 */
export const isKnownBotIp = (ip: unknown): boolean => {
    if (typeof ip !== 'string') {
        return false
    }
    // List entries are stored trimmed, so compare against the trimmed input as well.
    const candidate = ip.trim()
    return candidate !== '' && KNOWN_BOT_IP_SET.has(candidate)
}
/**
 * Builds the extra functions made available to transformation invocations.
 *
 * @param geoipLookup GeoIP reader used to back the `geoipLookup` function.
 * @returns An object with `geoipLookup`, `cleanNullValues`, `isKnownBotUserAgent`,
 *          `isKnownBotIp` and a `postHogCapture` stub that always throws.
 */
export const getTransformationFunctions = (geoipLookup: GeoIp) => {
    // Only string inputs are looked up; everything else resolves to null.
    const lookupCity = (val: unknown): any => {
        if (typeof val !== 'string') {
            return null
        }
        return geoipLookup.city(val)
    }

    // Capturing events is deliberately unavailable inside transformations.
    const postHogCapture = () => {
        throw new Error('posthogCapture is not supported in transformations')
    }

    return {
        geoipLookup: lookupCity,
        cleanNullValues,
        isKnownBotUserAgent,
        isKnownBotIp,
        postHogCapture,
    }
}

View File

@@ -7,6 +7,8 @@ const DEFAULT_INPUTS = {
userAgent: '$raw_user_agent',
customBotPatterns: '',
customIpPrefixes: '',
filterKnownBotUserAgents: true,
filterKnownBotIps: true,
keepUndefinedUseragent: 'Yes',
}
@@ -52,35 +54,17 @@ describe('bot-detection.template', () => {
})
it.each([
['Yes', true],
['No', false],
['Yes', true, undefined],
['No', false, undefined],
['Yes', true, ''],
['No', false, ''],
])(
'should treat missing user agent when keepUndefinedUseragent is %s',
async (keepUndefinedUseragent, shouldKeepEvent) => {
mockGlobals = tester.createGlobals({
event: {
properties: {},
},
})
const response = await tester.invoke({ ...DEFAULT_INPUTS, keepUndefinedUseragent }, mockGlobals)
expect(response.finished).toBeTruthy()
expect(response.error).toBeFalsy()
shouldKeepEvent ? expect(response.execResult).toBeTruthy() : expect(response.execResult).toBeFalsy()
}
)
it.each([
['Yes', true],
['No', false],
])(
'should treat empty user agent when keepUndefinedUseragent is %s',
async (keepUndefinedUseragent, shouldKeepEvent) => {
async (keepUndefinedUseragent, shouldKeepEvent, ua) => {
mockGlobals = tester.createGlobals({
event: {
properties: {
$raw_user_agent: '',
$raw_user_agent: ua,
},
},
})
@@ -212,4 +196,51 @@ describe('bot-detection.template', () => {
expect(response.error).toBeFalsy()
expect(response.execResult).toBeFalsy()
})
it('should not filter out known bot user agents if filterKnownBotUserAgents is false', async () => {
mockGlobals = tester.createGlobals({
event: {
properties: {
$raw_user_agent:
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
},
},
})
const response = await tester.invoke(
{
...DEFAULT_INPUTS,
filterKnownBotUserAgents: false,
},
mockGlobals
)
expect(response.finished).toBeTruthy()
expect(response.error).toBeFalsy()
expect(response.execResult).toBeTruthy()
})
it('should not filter out known bot ips if filterKnownBotIps is false', async () => {
mockGlobals = tester.createGlobals({
event: {
properties: {
$ip: '5.39.1.225',
$raw_user_agent:
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
},
},
})
const response = await tester.invoke(
{
...DEFAULT_INPUTS,
filterKnownBotIps: false,
},
mockGlobals
)
expect(response.finished).toBeTruthy()
expect(response.error).toBeFalsy()
expect(response.execResult).toBeTruthy()
})
})

View File

@@ -12,77 +12,20 @@ export const template: HogFunctionTemplate = {
category: ['Custom'],
code_language: 'hog',
code: `
// List of known bot user agents
let known_bot_filter_list := ['bot', 'crawler', 'spider', 'feedfetcher-google',
'mediapartners-google', 'apis-google', 'slurp', 'python-urllib',
'python-requests', 'aiohttp', 'httpx', 'libwww-perl',
'httpunit', 'nutch', 'go-http-client', 'biglotron', 'teoma',
'convera', 'gigablast', 'ia_archiver', 'webmon ', 'httrack',
'grub.org', 'netresearchserver', 'speedy', 'fluffy',
'findlink', 'panscient', 'ips-agent', 'yanga', 'yandex',
'yadirectfetcher', 'cyberpatrol', 'postrank', 'page2rss',
'linkdex', 'ezooms', 'heritrix', 'findthatfile', 'europarchive.org',
'mappydata', 'eright', 'apercite', 'aboundex', 'summify', 'ec2linkfinder',
'facebookexternalhit', 'yeti', 'retrevopageanalyzer', 'sogou', 'wotbox',
'ichiro', 'drupact', 'coccoc', 'integromedb', 'siteexplorer.info',
'proximic', 'changedetection', 'cc metadata scaper', 'g00g1e.net',
'binlar', 'a6-indexer', 'admantx', 'megaindex', 'ltx71', 'bubing',
'qwantify', 'lipperhey', 'addthis', 'metauri', 'scrapy', 'capsulechecker',
'sonic', 'sysomos', 'trove', 'deadlinkchecker', 'slack-imgproxy', 'embedly',
'iskanie', 'skypeuripreview', 'google-adwords-instant', 'whatsapp',
'electricmonk', 'yahoo link preview', 'xenu link sleuth', 'pcore-http',
'appinsights', 'phantomjs', 'jetslide', 'newsharecounts', 'tineye',
'linkarchiver', 'digg deeper', 'snacktory', 'okhttp', 'nuzzel', 'omgili',
'pocketparser', 'um-ln', 'muckrack', 'netcraftsurveyagent', 'appengine-google',
'jetty', 'upflow', 'thinklab', 'traackr.com', 'twurly', 'mastodon', 'http_get',
'brandverity', 'check_http', 'ezid', 'genieo', 'meltwaternews', 'moreover',
'scoutjet', 'seoscanners', 'hatena', 'google web preview', 'adscanner',
'netvibes', 'baidu-yunguance', 'btwebclient', 'disqus', 'feedly', 'fever',
'flamingo_searchengine', 'flipboardproxy', 'g2 web services', 'vkshare',
'siteimprove.com', 'dareboost', 'feedspot', 'seokicks', 'tracemyfile',
'zgrab', 'pr-cy.ru', 'datafeedwatch', 'zabbix', 'google-xrawler', 'axios',
'amazon cloudfront', 'pulsepoint', 'cloudflare-alwaysonline',
'google-structured-data-testing-tool', 'wordupinfosearch', 'webdatastats',
'httpurlconnection', 'outbrain', 'w3c_validator', 'w3c-checklink',
'w3c-mobileok', 'w3c_i18n-checker', 'feedvalidator', 'w3c_css_validator',
'w3c_unicorn', 'google-physicalweb', 'blackboard', 'bazqux', 'twingly',
'rivva', 'dataprovider.com', 'theoldreader.com', 'anyevent',
'nmap scripting engine', '2ip.ru', 'clickagy', 'google favicon',
'hubspot', 'chrome-lighthouse', 'headlesschrome', 'simplescraper',
'fedoraplanet', 'friendica', 'nextcloud', 'tiny tiny rss', 'datanyze',
'google-site-verification', 'trendsmapresolver', 'tweetedtimes', 'gwene',
'simplepie', 'searchatlas', 'superfeedr', 'freewebmonitoring sitechecker',
'pandalytics', 'seewithkids', 'cincraw', 'freshrss', 'google-certificates-bridge',
'viber', 'evc-batch', 'virustotal', 'uptime-kuma', 'feedbin',
'snap url preview service', 'ruxitsynthetic', 'google-read-aloud',
'mediapartners', 'wget', 'wget', 'ahrefsgot', 'ahrefssiteaudit',
'wesee:search', 'y!j', 'collection@infegy.com', 'deusu', 'bingpreview',
'daum', 'pingdom', 'barkrowler', 'yak', 'ning', 'ahc', 'apache-httpclient',
'buck', 'newspaper', 'sentry', 'fetch', 'miniflux', 'validator.nu',
'grouphigh', 'checkmarknetwork', 'www.uptime.com', 'mixnodecache',
'domains project', 'pagepeeker', 'vigil', 'php-curl-class', 'ptst',
'seostar.co']
let userAgentProperty := inputs.userAgent
// Get the user agent value
let user_agent := event.properties[inputs.userAgent]
// Check if user agent property exists in event
if (empty(event.properties[userAgentProperty]) and inputs.keepUndefinedUseragent == 'No') {
if (empty(user_agent) and inputs.keepUndefinedUseragent == 'No') {
return null
}
// Get the user agent value
let user_agent := event.properties[userAgentProperty]
// Check for empty string
if (user_agent == '' and inputs.keepUndefinedUseragent == 'No') {
if (inputs.filterKnownBotUserAgents and isKnownBotUserAgent(user_agent)) {
return null
}
// Now that we know we have a valid user agent, convert to lower case
user_agent := lower(user_agent)
let bot_list := []
// Handle custom bot patterns
let bot_list := known_bot_filter_list
if (notEmpty(inputs.customBotPatterns)) {
let custom_patterns := splitByString(',', inputs.customBotPatterns)
// Add each custom pattern to the list
@@ -91,227 +34,34 @@ if (notEmpty(inputs.customBotPatterns)) {
}
}
// Function to check if user agent contains any bot identifier
fun isBotUa(ua) {
for (let bot_name in bot_list) {
if (ua =~* bot_name) {
return true
}
// If bot is detected, return null to filter out the event
for (let bot_name in bot_list) {
if (user_agent =~* bot_name) {
return null
}
return false
}
// If bot is detected, return null to filter out the event
// Otherwise return the original event
if (isBotUa(user_agent)) {
let ip := event.properties['$ip']
if (empty(ip)) {
return event
}
if (inputs.filterKnownBotIps and isKnownBotIp(ip)) {
return null
}
let known_bot_ip_prefixes := [
// ahrefs
'5.39.1.224/27',
'5.39.109.160/27',
'15.235.27.0/24',
'15.235.96.0/24',
'15.235.98.0/24',
'37.59.204.128/27',
'51.68.247.192/27',
'51.75.236.128/27',
'51.89.129.0/24',
'51.161.37.0/24',
'51.161.65.0/24',
'51.195.183.0/24',
'51.195.215.0/24',
'51.195.244.0/24',
'51.222.95.0/24',
'51.222.168.0/24',
'51.222.253.0/26',
'54.36.148.0/23',
'54.37.118.64/27',
'54.38.147.0/24',
'54.39.0.0/24',
'54.39.6.0/24',
'54.39.89.0/24',
'54.39.136.0/24',
'54.39.203.0/24',
'54.39.210.0/24',
'92.222.104.192/27',
'92.222.108.96/27',
'94.23.188.192/27',
'142.44.220.0/24',
'142.44.225.0/24',
'142.44.228.0/24',
'142.44.233.0/24',
'148.113.128.0/24',
'148.113.130.0/24',
'167.114.139.0/24',
'168.100.149.0/24',
'176.31.139.0/27',
'198.244.168.0/24',
'198.244.183.0/24',
'198.244.186.193/32',
'198.244.186.194/31',
'198.244.186.196/30',
'198.244.186.200/31',
'198.244.186.202/32',
'198.244.226.0/24',
'198.244.240.0/24',
'198.244.242.0/24',
'202.8.40.0/22',
'202.94.84.110/31',
'202.94.84.112/31',
// bing
'157.55.39.0/24',
'207.46.13.0/24',
'40.77.167.0/24',
'13.66.139.0/24',
'13.66.144.0/24',
'52.167.144.0/24',
'13.67.10.16/28',
'13.69.66.240/28',
'13.71.172.224/28',
'139.217.52.0/28',
'191.233.204.224/28',
'20.36.108.32/28',
'20.43.120.16/28',
'40.79.131.208/28',
'40.79.186.176/28',
'52.231.148.0/28',
'20.79.107.240/28',
'51.105.67.0/28',
'20.125.163.80/28',
'40.77.188.0/22',
'65.55.210.0/24',
'199.30.24.0/23',
'40.77.202.0/24',
'40.77.139.0/25',
'20.74.197.0/28',
'20.15.133.160/27',
'40.77.177.0/24',
'40.77.178.0/23',
// google
'8.8.4.0/24',
'8.8.8.0/24',
'8.34.208.0/20',
'8.35.192.0/20',
'23.236.48.0/20',
'23.251.128.0/19',
'34.0.0.0/15',
'34.2.0.0/16',
'34.3.0.0/23',
'34.3.3.0/24',
'34.3.4.0/24',
'34.3.8.0/21',
'34.3.16.0/20',
'34.3.32.0/19',
'34.3.64.0/18',
'34.4.0.0/14',
'34.8.0.0/13',
'34.16.0.0/12',
'34.32.0.0/11',
'34.64.0.0/10',
'34.128.0.0/10',
'35.184.0.0/13',
'35.192.0.0/14',
'35.196.0.0/15',
'35.198.0.0/16',
'35.199.0.0/17',
'35.199.128.0/18',
'35.200.0.0/13',
'35.208.0.0/12',
'35.224.0.0/12',
'35.240.0.0/13',
'57.140.192.0/18',
'64.15.112.0/20',
'64.233.160.0/19',
'66.22.228.0/23',
'66.102.0.0/20',
'66.249.64.0/19',
'70.32.128.0/19',
'72.14.192.0/18',
'74.125.0.0/16',
'104.154.0.0/15',
'104.196.0.0/14',
'104.237.160.0/19',
'107.167.160.0/19',
'107.178.192.0/18',
'108.59.80.0/20',
'108.170.192.0/18',
'108.177.0.0/17',
'130.211.0.0/16',
'136.22.160.0/20',
'136.22.176.0/21',
'136.22.184.0/23',
'136.22.186.0/24',
'136.124.0.0/15',
'142.250.0.0/15',
'146.148.0.0/17',
'152.65.208.0/22',
'152.65.214.0/23',
'152.65.218.0/23',
'152.65.222.0/23',
'152.65.224.0/19',
'162.120.128.0/17',
'162.216.148.0/22',
'162.222.176.0/21',
'172.110.32.0/21',
'172.217.0.0/16',
'172.253.0.0/16',
'173.194.0.0/16',
'173.255.112.0/20',
'192.158.28.0/22',
'192.178.0.0/15',
'193.186.4.0/24',
'199.36.154.0/23',
'199.36.156.0/24',
'199.192.112.0/22',
'199.223.232.0/21',
'207.223.160.0/20',
'208.65.152.0/22',
'208.68.108.0/22',
'208.81.188.0/22',
'208.117.224.0/19',
'209.85.128.0/17',
'216.58.192.0/19',
'216.73.80.0/20',
'216.239.32.0/19',
'2001:4860::/32',
'2404:6800::/32',
'2404:f340::/32',
'2600:1900::/28',
'2605:ef80::/32',
'2606:40::/32',
'2606:73c0::/32',
'2607:1c0:241:40::/60',
'2607:1c0:300::/40',
'2607:f8b0::/32',
'2620:11a:a000::/40',
'2620:120:e000::/40',
'2800:3f0::/32',
'2a00:1450::/32',
'2c0f:fb50::/32',
];
let bot_ip_prefixes := []
if (inputs.customIpPrefixes and notEmpty(inputs.customIpPrefixes)) {
let custom_prefixes := splitByString(',', inputs.customIpPrefixes)
// Add each custom prefix to the list
for (let prefix in custom_prefixes) {
known_bot_ip_prefixes := arrayPushBack(known_bot_ip_prefixes, trim(prefix))
if (isIPAddressInRange(ip, trim(prefix))) {
return null
}
}
}
let ip := event.properties['$ip']
if (notEmpty(ip)) {
for (let prefix in known_bot_ip_prefixes) {
if (isIPAddressInRange(ip, prefix)) {
return null
}
}
}
return event
`,
inputs_schema: [
@@ -324,6 +74,16 @@ return event
secret: false,
required: true,
},
{
key: 'filterKnownBotUserAgents',
type: 'boolean',
label: 'Filter out known bot user agents',
description:
"Filter out known bot user agents using PostHog's known bot user agents list. This is kept up to date dynamically.",
default: true,
secret: false,
required: true,
},
{
key: 'customBotPatterns',
type: 'string',
@@ -333,6 +93,16 @@ return event
secret: false,
required: false,
},
{
key: 'filterKnownBotIps',
type: 'boolean',
label: 'Filter out known bot ips',
description:
"Filter out known bot ips using PostHog's known bot ips list. This is kept up to date dynamically.",
default: true,
secret: false,
required: true,
},
{
key: 'customIpPrefixes',
type: 'string',

View File

@@ -2,6 +2,7 @@ import Chance from 'chance'
import merge from 'deepmerge'
import { Settings } from 'luxon'
import { getTransformationFunctions } from '~/cdp/hog-transformations/transformation-functions'
import { formatLiquidInput } from '~/cdp/services/hog-inputs.service'
import { NativeDestinationExecutorService } from '~/cdp/services/native-destination-executor.service'
import { isNativeHogFunction } from '~/cdp/utils'
@@ -10,7 +11,6 @@ import { CyclotronInputType } from '~/schema/cyclotron'
import { GeoIPService, GeoIp } from '~/utils/geoip'
import { Hub } from '../../../types'
import { cleanNullValues } from '../../hog-transformations/transformation-functions'
import { HogExecutorService } from '../../services/hog-executor.service'
import {
CyclotronJobInvocationHogFunction,
@@ -246,16 +246,8 @@ export class TemplateTester {
}
const globalsWithInputs = await this.hogExecutor.buildInputsWithGlobals(hogFunction, globals)
const invocation = createInvocation(globalsWithInputs, hogFunction)
const transformationFunctions = {
geoipLookup: (val: unknown): any => {
return typeof val === 'string' ? this.geoIp?.city(val) : null
},
cleanNullValues,
}
const transformationFunctions = getTransformationFunctions(this.geoIp!)
const extraFunctions = invocation.hogFunction.type === 'transformation' ? transformationFunctions : {}
return this.getExecutor().execute(invocation, { functions: extraFunctions })

12922
share/bot-ips.txt Normal file

File diff suppressed because it is too large Load Diff