mirror of
https://github.com/BillyOutlast/posthog.git
synced 2026-02-04 03:01:23 +01:00
feat(cdp): Revert Modify bot filter to use global list (#38513)
This commit is contained in:
@@ -43,7 +43,6 @@
|
||||
!requirements.txt
|
||||
!rust
|
||||
!share/GeoLite2-City.mmdb
|
||||
!share/bot-ips.txt
|
||||
!staticfiles
|
||||
!tailwind.config.js
|
||||
!test-runner-jest-environment.js
|
||||
|
||||
63
.github/workflows/update-bot-ips.yml
vendored
63
.github/workflows/update-bot-ips.yml
vendored
@@ -1,63 +0,0 @@
|
||||
# Scheduled job that runs ./bin/update-bots-list and, when share/bot-ips.txt
# differs from the committed copy afterwards, opens a pull request with the
# refreshed list.
name: Update Bot IPs

on:
    schedule:
        # Run daily at 2 AM UTC
        - cron: '0 2 * * *'
    workflow_dispatch: # Allow manual triggering

permissions:
    contents: read
    pull-requests: write

jobs:
    update-bot-ips:
        runs-on: ubuntu-latest

        steps:
            - name: Checkout repository
              uses: actions/checkout@v4
              with:
                  token: ${{ secrets.POSTHOG_BOT_GITHUB_TOKEN }}
                  fetch-depth: 0

            - name: Run update script
              run: |
                  ./bin/update-bots-list

            - name: Check for changes
              id: check-changes
              run: |
                  if git diff --quiet share/bot-ips.txt; then
                    echo "changes=false" >> "$GITHUB_OUTPUT"
                    echo "No changes detected in bot IPs"
                  else
                    echo "changes=true" >> "$GITHUB_OUTPUT"
                    echo "Changes detected in bot IPs"
                    git diff --stat share/bot-ips.txt
                  fi

            - name: Create Pull Request
              if: steps.check-changes.outputs.changes == 'true'
              uses: peter-evans/create-pull-request@271a8d0340265f705b14b6d32b9829c1cb33d45e # v5
              with:
                  # Use the same token as the checkout step. The action pushes the
                  # branch with this token, and the job-scoped GITHUB_TOKEN is limited
                  # to `contents: read` above, which is not enough to push.
                  # NOTE(review): team review requests also need a repo-scoped token -
                  # confirm POSTHOG_BOT_GITHUB_TOKEN has that scope.
                  token: ${{ secrets.POSTHOG_BOT_GITHUB_TOKEN }}
                  commit-message: 'Update bot IPs from GoodBots repository'
                  title: '🤖 Update bot IPs from GoodBots repository'
                  body: |
                      This PR updates the bot IPs list from the [GoodBots repository](https://github.com/AnTheMaker/GoodBots).

                      **Changes:**
                      - Updated `share/bot-ips.txt` with latest bot IPs

                      **Source:** https://raw.githubusercontent.com/AnTheMaker/GoodBots/main/all.ips

                      This is an automated update that runs daily.
                  base: master
                  delete-branch: true
                  labels: |
                      automated
                      bot-ips
                      dependencies
                  # Action inputs must be strings (a YAML list is rejected), and teams
                  # are requested through `team-reviewers`, not `reviewers`, which only
                  # takes usernames.
                  team-reviewers: |
                      PostHog/team-messaging
|
||||
@@ -1,33 +0,0 @@
|
||||
#!/bin/bash

# Script to update bot IPs from GoodBots repository
# Downloads IPs from GitHub and saves to share/bot-ips.txt
#
# The list is first downloaded to a temporary file and only copied over the
# target once the download succeeded and is non-empty, so a failed run never
# truncates or corrupts the existing list.

set -euo pipefail

# Configuration
GITHUB_URL="https://raw.githubusercontent.com/AnTheMaker/GoodBots/main/all.ips"
TARGET_FILE="share/bot-ips.txt"

# Get the directory of this script
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
TARGET_PATH="$PROJECT_ROOT/$TARGET_FILE"

echo "Downloading bot IPs from: $GITHUB_URL"
echo "Saving to: $TARGET_PATH"

# Create directory if it doesn't exist
mkdir -p "$(dirname "$TARGET_PATH")"

# Scratch file for the download; removed on every exit path
TMP_PATH="$(mktemp)"
trap 'rm -f "$TMP_PATH"' EXIT

# Download and validate the IPs.
# -f makes curl fail on HTTP errors instead of saving the error page,
# -sS keeps it quiet but still prints real errors.
# The check lives in the `if` condition because `set -e` would otherwise abort
# the script before any explicit error handling could run.
if curl -fsS "$GITHUB_URL" -o "$TMP_PATH" && [ -s "$TMP_PATH" ]; then
    # Overwrite in place (rather than mv) so the target keeps its permissions
    cat "$TMP_PATH" > "$TARGET_PATH"
    LINE_COUNT=$(wc -l < "$TARGET_PATH")
    echo "Successfully downloaded $LINE_COUNT IP addresses to $TARGET_PATH"
else
    echo "Error: Failed to download or file is empty" >&2
    exit 1
fi
|
||||
41
plugin-server/bin/update_bot_ip_prefixes.py
Normal file
41
plugin-server/bin/update_bot_ip_prefixes.py
Normal file
@@ -0,0 +1,41 @@
|
||||
import itertools
|
||||
|
||||
import requests
|
||||
|
||||
|
||||
def get_prefixes_from_url(url, timeout=30):
    """Fetch a JSON document of IP ranges and return its prefixes as a flat list.

    The document must contain a top-level ``"prefixes"`` list whose entries are
    objects that may carry an ``"ipv4Prefix"`` and/or an ``"ipv6Prefix"`` key.

    Args:
        url: Address of the JSON document to download.
        timeout: Seconds to wait on the server before giving up. Without a
            timeout ``requests`` can block indefinitely on an unresponsive host.

    Returns:
        The non-empty prefix values in document order; within one entry the
        IPv4 prefix comes before the IPv6 prefix.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If the document has no ``"prefixes"`` key.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error body.
    response.raise_for_status()

    prefixes = []
    for entry in response.json()["prefixes"]:
        for field in ("ipv4Prefix", "ipv6Prefix"):
            value = entry.get(field)
            if value:
                prefixes.append(value)
    return prefixes
|
||||
|
||||
|
||||
def to_hog_friendly_string(ip_ranges):
    """Render a mapping of source name -> prefixes as a Hog array declaration.

    Each source becomes a ``// name`` comment line followed by one quoted,
    comma-terminated line per prefix. Sources are separated by one blank line,
    and the whole list is wrapped in ``let known_bot_ip_prefixes := [ ... ];``.

    Args:
        ip_ranges: Mapping whose keys are source names and whose values are
            iterables of prefix strings; iteration order is preserved.

    Returns:
        The Hog source text, ending with a newline.
    """
    sections = []
    for name, prefixes in ip_ranges.items():
        lines = [f" // {name}\n"]
        lines.extend(f" '{prefix}',\n" for prefix in prefixes)
        sections.append("".join(lines))

    # Joining with "\n" yields the blank separator line between sources.
    return "let known_bot_ip_prefixes := [\n" + "\n".join(sections) + "];\n"
|
||||
|
||||
|
||||
def update_bot_ip_ranges():
    """Download the crawler IP ranges of each listed source and print them.

    Every source URL is fetched with ``get_prefixes_from_url`` in the order
    listed, and the combined result is printed to stdout in the format
    produced by ``to_hog_friendly_string``.
    """
    # this list is incomplete, you can help by expanding it
    sources = (
        ("ahrefs", "https://api.ahrefs.com/v3/public/crawler-ip-ranges"),
        ("bing", "https://www.bing.com/toolbox/bingbot.json"),
        ("google", "https://www.gstatic.com/ipranges/goog.json"),
    )
    ip_ranges = {name: get_prefixes_from_url(url) for name, url in sources}

    print(to_hog_friendly_string(ip_ranges))  # noqa: T201


if __name__ == "__main__":
    update_bot_ip_ranges()
|
||||
@@ -1,240 +0,0 @@
|
||||
import fs from 'fs'
|
||||
import path from 'path'
|
||||
|
||||
/**
 * Entries read once, at module load, from `share/bot-ips.txt` (resolved five
 * directories above this file), one entry per line.
 *
 * Lines are trimmed and blank lines are dropped, so a trailing newline or a
 * stray empty line in the file cannot put `''` into the list and make an
 * empty IP string look like a known bot.
 */
export const KNOWN_BOT_IP_LIST = fs
    .readFileSync(path.join(__dirname, '../../../../..', 'share', 'bot-ips.txt'), 'utf8')
    .split('\n')
    .map((line) => line.trim())
    .filter((line) => line.length > 0)
|
||||
|
||||
export const KNOWN_BOT_UA_LIST = [
|
||||
'bot',
|
||||
'crawler',
|
||||
'spider',
|
||||
'feedfetcher-google',
|
||||
'mediapartners-google',
|
||||
'apis-google',
|
||||
'slurp',
|
||||
'python-urllib',
|
||||
'python-requests',
|
||||
'aiohttp',
|
||||
'httpx',
|
||||
'libwww-perl',
|
||||
'httpunit',
|
||||
'nutch',
|
||||
'go-http-client',
|
||||
'biglotron',
|
||||
'teoma',
|
||||
'convera',
|
||||
'gigablast',
|
||||
'ia_archiver',
|
||||
'webmon ',
|
||||
'httrack',
|
||||
'grub.org',
|
||||
'netresearchserver',
|
||||
'speedy',
|
||||
'fluffy',
|
||||
'findlink',
|
||||
'panscient',
|
||||
'ips-agent',
|
||||
'yanga',
|
||||
'yandex',
|
||||
'yadirectfetcher',
|
||||
'cyberpatrol',
|
||||
'postrank',
|
||||
'page2rss',
|
||||
'linkdex',
|
||||
'ezooms',
|
||||
'heritrix',
|
||||
'findthatfile',
|
||||
'europarchive.org',
|
||||
'mappydata',
|
||||
'eright',
|
||||
'apercite',
|
||||
'aboundex',
|
||||
'summify',
|
||||
'ec2linkfinder',
|
||||
'facebookexternalhit',
|
||||
'yeti',
|
||||
'retrevopageanalyzer',
|
||||
'sogou',
|
||||
'wotbox',
|
||||
'ichiro',
|
||||
'drupact',
|
||||
'coccoc',
|
||||
'integromedb',
|
||||
'siteexplorer.info',
|
||||
'proximic',
|
||||
'changedetection',
|
||||
'cc metadata scaper',
|
||||
'g00g1e.net',
|
||||
'binlar',
|
||||
'a6-indexer',
|
||||
'admantx',
|
||||
'megaindex',
|
||||
'ltx71',
|
||||
'bubing',
|
||||
'qwantify',
|
||||
'lipperhey',
|
||||
'addthis',
|
||||
'metauri',
|
||||
'scrapy',
|
||||
'capsulechecker',
|
||||
'sonic',
|
||||
'sysomos',
|
||||
'trove',
|
||||
'deadlinkchecker',
|
||||
'slack-imgproxy',
|
||||
'embedly',
|
||||
'iskanie',
|
||||
'skypeuripreview',
|
||||
'google-adwords-instant',
|
||||
'whatsapp',
|
||||
'electricmonk',
|
||||
'yahoo link preview',
|
||||
'xenu link sleuth',
|
||||
'pcore-http',
|
||||
'appinsights',
|
||||
'phantomjs',
|
||||
'jetslide',
|
||||
'newsharecounts',
|
||||
'tineye',
|
||||
'linkarchiver',
|
||||
'digg deeper',
|
||||
'snacktory',
|
||||
'okhttp',
|
||||
'nuzzel',
|
||||
'omgili',
|
||||
'pocketparser',
|
||||
'um-ln',
|
||||
'muckrack',
|
||||
'netcraftsurveyagent',
|
||||
'appengine-google',
|
||||
'jetty',
|
||||
'upflow',
|
||||
'thinklab',
|
||||
'traackr.com',
|
||||
'twurly',
|
||||
'mastodon',
|
||||
'http_get',
|
||||
'brandverity',
|
||||
'check_http',
|
||||
'ezid',
|
||||
'genieo',
|
||||
'meltwaternews',
|
||||
'moreover',
|
||||
'scoutjet',
|
||||
'seoscanners',
|
||||
'hatena',
|
||||
'google web preview',
|
||||
'adscanner',
|
||||
'netvibes',
|
||||
'baidu-yunguance',
|
||||
'btwebclient',
|
||||
'disqus',
|
||||
'feedly',
|
||||
'fever',
|
||||
'flamingo_searchengine',
|
||||
'flipboardproxy',
|
||||
'g2 web services',
|
||||
'vkshare',
|
||||
'siteimprove.com',
|
||||
'dareboost',
|
||||
'feedspot',
|
||||
'seokicks',
|
||||
'tracemyfile',
|
||||
'zgrab',
|
||||
'pr-cy.ru',
|
||||
'datafeedwatch',
|
||||
'zabbix',
|
||||
'google-xrawler',
|
||||
'axios',
|
||||
'amazon cloudfront',
|
||||
'pulsepoint',
|
||||
'cloudflare-alwaysonline',
|
||||
'google-structured-data-testing-tool',
|
||||
'wordupinfosearch',
|
||||
'webdatastats',
|
||||
'httpurlconnection',
|
||||
'outbrain',
|
||||
'w3c_validator',
|
||||
'w3c-checklink',
|
||||
'w3c-mobileok',
|
||||
'w3c_i18n-checker',
|
||||
'feedvalidator',
|
||||
'w3c_css_validator',
|
||||
'w3c_unicorn',
|
||||
'google-physicalweb',
|
||||
'blackboard',
|
||||
'bazqux',
|
||||
'twingly',
|
||||
'rivva',
|
||||
'dataprovider.com',
|
||||
'theoldreader.com',
|
||||
'anyevent',
|
||||
'nmap scripting engine',
|
||||
'2ip.ru',
|
||||
'clickagy',
|
||||
'google favicon',
|
||||
'hubspot',
|
||||
'chrome-lighthouse',
|
||||
'headlesschrome',
|
||||
'simplescraper',
|
||||
'fedoraplanet',
|
||||
'friendica',
|
||||
'nextcloud',
|
||||
'tiny tiny rss',
|
||||
'datanyze',
|
||||
'google-site-verification',
|
||||
'trendsmapresolver',
|
||||
'tweetedtimes',
|
||||
'gwene',
|
||||
'simplepie',
|
||||
'searchatlas',
|
||||
'superfeedr',
|
||||
'freewebmonitoring sitechecker',
|
||||
'pandalytics',
|
||||
'seewithkids',
|
||||
'cincraw',
|
||||
'freshrss',
|
||||
'google-certificates-bridge',
|
||||
'viber',
|
||||
'evc-batch',
|
||||
'virustotal',
|
||||
'uptime-kuma',
|
||||
'feedbin',
|
||||
'snap url preview service',
|
||||
'ruxitsynthetic',
|
||||
'google-read-aloud',
|
||||
'mediapartners',
|
||||
'wget',
|
||||
'ahrefsgot',
|
||||
'ahrefssiteaudit',
|
||||
'wesee:search',
|
||||
'y!j',
|
||||
'collection@infegy.com',
|
||||
'deusu',
|
||||
'bingpreview',
|
||||
'daum',
|
||||
'pingdom',
|
||||
'barkrowler',
|
||||
'yak',
|
||||
'ning',
|
||||
'ahc',
|
||||
'apache-httpclient',
|
||||
'buck',
|
||||
'newspaper',
|
||||
'sentry',
|
||||
'fetch',
|
||||
'miniflux',
|
||||
'validator.nu',
|
||||
'grouphigh',
|
||||
'checkmarknetwork',
|
||||
'www.uptime.com',
|
||||
'mixnodecache',
|
||||
'domains project',
|
||||
'pagepeeker',
|
||||
'vigil',
|
||||
'php-curl-class',
|
||||
'ptst',
|
||||
'seostar.co',
|
||||
]
|
||||
@@ -16,7 +16,7 @@ import { HogFunctionMonitoringService } from '../services/monitoring/hog-functio
|
||||
import { HogWatcherService, HogWatcherState } from '../services/monitoring/hog-watcher.service'
|
||||
import { convertToHogFunctionFilterGlobal, filterFunctionInstrumented } from '../utils/hog-function-filtering'
|
||||
import { createInvocation, createInvocationResult } from '../utils/invocation-utils'
|
||||
import { getTransformationFunctions } from './transformation-functions'
|
||||
import { cleanNullValues } from './transformation-functions'
|
||||
|
||||
export const hogTransformationDroppedEvents = new Counter({
|
||||
name: 'hog_transformation_dropped_events',
|
||||
@@ -102,7 +102,15 @@ export class HogTransformerService {
|
||||
|
||||
private async getTransformationFunctions() {
|
||||
const geoipLookup = await this.hub.geoipService.get()
|
||||
return getTransformationFunctions(geoipLookup)
|
||||
return {
|
||||
geoipLookup: (val: unknown): any => {
|
||||
return typeof val === 'string' ? geoipLookup.city(val) : null
|
||||
},
|
||||
cleanNullValues,
|
||||
postHogCapture: () => {
|
||||
throw new Error('posthogCapture is not supported in transformations')
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
private createInvocationGlobals(event: PluginEvent): HogFunctionInvocationGlobals {
|
||||
|
||||
@@ -1,7 +1,3 @@
|
||||
import { GeoIp } from '~/utils/geoip'
|
||||
|
||||
import { KNOWN_BOT_IP_LIST, KNOWN_BOT_UA_LIST } from './bots/bots'
|
||||
|
||||
const MAX_DEPTH = 3
|
||||
|
||||
function cleanNullValuesInternal(value: unknown, depth: number): unknown {
|
||||
@@ -36,35 +32,3 @@ function cleanNullValuesInternal(value: unknown, depth: number): unknown {
|
||||
export function cleanNullValues(value: unknown): unknown {
|
||||
return cleanNullValuesInternal(value, 1)
|
||||
}
|
||||
|
||||
/**
 * Reports whether a user agent matches the known-bot list.
 *
 * @param value - Candidate user agent; anything that is not a string is
 * treated as "not a bot".
 * @returns `true` when the lower-cased user agent contains at least one entry
 * of `KNOWN_BOT_UA_LIST` as a substring.
 */
export const isKnownBotUserAgent = (value: unknown): boolean => {
    if (typeof value === 'string') {
        const lowered = value.toLowerCase()
        for (const marker of KNOWN_BOT_UA_LIST) {
            if (lowered.includes(marker)) {
                return true
            }
        }
    }
    return false
}
|
||||
|
||||
// Set view of KNOWN_BOT_IP_LIST, built once at module load so that each lookup
// is a hash probe instead of a linear scan over the whole list.
const knownBotIpSet: ReadonlySet<string> = new Set(KNOWN_BOT_IP_LIST)

/**
 * Reports whether an IP address is on the known-bot list.
 *
 * @param ip - Candidate address; anything that is not a string is treated as
 * "not a bot".
 * @returns `true` when `ip` is exactly equal to an entry of
 * `KNOWN_BOT_IP_LIST` (plain string equality, no normalisation or CIDR
 * matching).
 */
export const isKnownBotIp = (ip: unknown): boolean => {
    return typeof ip === 'string' && knownBotIpSet.has(ip)
}
|
||||
|
||||
/**
 * Builds the table of extra functions offered to transformation code.
 *
 * @param geoipLookup - GeoIP reader whose `city` method resolves an IP string.
 * @returns An object with:
 * - `geoipLookup`: city lookup for string input, `null` for anything else;
 * - `cleanNullValues`, `isKnownBotUserAgent`, `isKnownBotIp`: passed through
 *   unchanged;
 * - `postHogCapture`: always throws, since capturing is not supported here.
 */
export const getTransformationFunctions = (geoipLookup: GeoIp) => {
    const lookupCity = (val: unknown): any => {
        if (typeof val !== 'string') {
            return null
        }
        return geoipLookup.city(val)
    }

    const rejectCapture = () => {
        throw new Error('posthogCapture is not supported in transformations')
    }

    return {
        geoipLookup: lookupCity,
        cleanNullValues,
        isKnownBotUserAgent,
        isKnownBotIp,
        postHogCapture: rejectCapture,
    }
}
|
||||
|
||||
@@ -7,8 +7,6 @@ const DEFAULT_INPUTS = {
|
||||
userAgent: '$raw_user_agent',
|
||||
customBotPatterns: '',
|
||||
customIpPrefixes: '',
|
||||
filterKnownBotUserAgents: true,
|
||||
filterKnownBotIps: true,
|
||||
keepUndefinedUseragent: 'Yes',
|
||||
}
|
||||
|
||||
@@ -54,17 +52,35 @@ describe('bot-detection.template', () => {
|
||||
})
|
||||
|
||||
it.each([
|
||||
['Yes', true, undefined],
|
||||
['No', false, undefined],
|
||||
['Yes', true, ''],
|
||||
['No', false, ''],
|
||||
['Yes', true],
|
||||
['No', false],
|
||||
])(
|
||||
'should treat missing user agent when keepUndefinedUseragent is %s',
|
||||
async (keepUndefinedUseragent, shouldKeepEvent, ua) => {
|
||||
async (keepUndefinedUseragent, shouldKeepEvent) => {
|
||||
mockGlobals = tester.createGlobals({
|
||||
event: {
|
||||
properties: {},
|
||||
},
|
||||
})
|
||||
|
||||
const response = await tester.invoke({ ...DEFAULT_INPUTS, keepUndefinedUseragent }, mockGlobals)
|
||||
|
||||
expect(response.finished).toBeTruthy()
|
||||
expect(response.error).toBeFalsy()
|
||||
shouldKeepEvent ? expect(response.execResult).toBeTruthy() : expect(response.execResult).toBeFalsy()
|
||||
}
|
||||
)
|
||||
|
||||
it.each([
|
||||
['Yes', true],
|
||||
['No', false],
|
||||
])(
|
||||
'should treat empty user agent when keepUndefinedUseragent is %s',
|
||||
async (keepUndefinedUseragent, shouldKeepEvent) => {
|
||||
mockGlobals = tester.createGlobals({
|
||||
event: {
|
||||
properties: {
|
||||
$raw_user_agent: ua,
|
||||
$raw_user_agent: '',
|
||||
},
|
||||
},
|
||||
})
|
||||
@@ -196,51 +212,4 @@ describe('bot-detection.template', () => {
|
||||
expect(response.error).toBeFalsy()
|
||||
expect(response.execResult).toBeFalsy()
|
||||
})
|
||||
|
||||
it('should not filter out known bot user agents if filterKnownBotUserAgents is false', async () => {
    // Use a user agent that the known-bot list WOULD match (it contains 'bot').
    // With an ordinary browser user agent the event is kept no matter what the
    // flag is set to, so the test could never fail.
    mockGlobals = tester.createGlobals({
        event: {
            properties: {
                $raw_user_agent: 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
            },
        },
    })

    const response = await tester.invoke(
        {
            ...DEFAULT_INPUTS,
            filterKnownBotUserAgents: false,
        },
        mockGlobals
    )

    expect(response.finished).toBeTruthy()
    expect(response.error).toBeFalsy()
    // The event survives only because the built-in user agent filter is off.
    expect(response.execResult).toBeTruthy()
})
|
||||
|
||||
it('should not filter out known bot ips if filterKnownBotIps is false', async () => {
    // Event with an explicit $ip and a regular desktop Chrome user agent.
    const properties = {
        $ip: '5.39.1.225',
        $raw_user_agent:
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    }
    mockGlobals = tester.createGlobals({ event: { properties } })

    // Default inputs with only the known-bot IP filter switched off.
    const inputs = { ...DEFAULT_INPUTS, filterKnownBotIps: false }
    const response = await tester.invoke(inputs, mockGlobals)

    expect(response.finished).toBeTruthy()
    expect(response.error).toBeFalsy()
    expect(response.execResult).toBeTruthy()
})
|
||||
})
|
||||
|
||||
@@ -12,20 +12,77 @@ export const template: HogFunctionTemplate = {
|
||||
category: ['Custom'],
|
||||
code_language: 'hog',
|
||||
code: `
|
||||
// Get the user agent value
|
||||
let user_agent := event.properties[inputs.userAgent]
|
||||
// List of known bot user agents
|
||||
let known_bot_filter_list := ['bot', 'crawler', 'spider', 'feedfetcher-google',
|
||||
'mediapartners-google', 'apis-google', 'slurp', 'python-urllib',
|
||||
'python-requests', 'aiohttp', 'httpx', 'libwww-perl',
|
||||
'httpunit', 'nutch', 'go-http-client', 'biglotron', 'teoma',
|
||||
'convera', 'gigablast', 'ia_archiver', 'webmon ', 'httrack',
|
||||
'grub.org', 'netresearchserver', 'speedy', 'fluffy',
|
||||
'findlink', 'panscient', 'ips-agent', 'yanga', 'yandex',
|
||||
'yadirectfetcher', 'cyberpatrol', 'postrank', 'page2rss',
|
||||
'linkdex', 'ezooms', 'heritrix', 'findthatfile', 'europarchive.org',
|
||||
'mappydata', 'eright', 'apercite', 'aboundex', 'summify', 'ec2linkfinder',
|
||||
'facebookexternalhit', 'yeti', 'retrevopageanalyzer', 'sogou', 'wotbox',
|
||||
'ichiro', 'drupact', 'coccoc', 'integromedb', 'siteexplorer.info',
|
||||
'proximic', 'changedetection', 'cc metadata scaper', 'g00g1e.net',
|
||||
'binlar', 'a6-indexer', 'admantx', 'megaindex', 'ltx71', 'bubing',
|
||||
'qwantify', 'lipperhey', 'addthis', 'metauri', 'scrapy', 'capsulechecker',
|
||||
'sonic', 'sysomos', 'trove', 'deadlinkchecker', 'slack-imgproxy', 'embedly',
|
||||
'iskanie', 'skypeuripreview', 'google-adwords-instant', 'whatsapp',
|
||||
'electricmonk', 'yahoo link preview', 'xenu link sleuth', 'pcore-http',
|
||||
'appinsights', 'phantomjs', 'jetslide', 'newsharecounts', 'tineye',
|
||||
'linkarchiver', 'digg deeper', 'snacktory', 'okhttp', 'nuzzel', 'omgili',
|
||||
'pocketparser', 'um-ln', 'muckrack', 'netcraftsurveyagent', 'appengine-google',
|
||||
'jetty', 'upflow', 'thinklab', 'traackr.com', 'twurly', 'mastodon', 'http_get',
|
||||
'brandverity', 'check_http', 'ezid', 'genieo', 'meltwaternews', 'moreover',
|
||||
'scoutjet', 'seoscanners', 'hatena', 'google web preview', 'adscanner',
|
||||
'netvibes', 'baidu-yunguance', 'btwebclient', 'disqus', 'feedly', 'fever',
|
||||
'flamingo_searchengine', 'flipboardproxy', 'g2 web services', 'vkshare',
|
||||
'siteimprove.com', 'dareboost', 'feedspot', 'seokicks', 'tracemyfile',
|
||||
'zgrab', 'pr-cy.ru', 'datafeedwatch', 'zabbix', 'google-xrawler', 'axios',
|
||||
'amazon cloudfront', 'pulsepoint', 'cloudflare-alwaysonline',
|
||||
'google-structured-data-testing-tool', 'wordupinfosearch', 'webdatastats',
|
||||
'httpurlconnection', 'outbrain', 'w3c_validator', 'w3c-checklink',
|
||||
'w3c-mobileok', 'w3c_i18n-checker', 'feedvalidator', 'w3c_css_validator',
|
||||
'w3c_unicorn', 'google-physicalweb', 'blackboard', 'bazqux', 'twingly',
|
||||
'rivva', 'dataprovider.com', 'theoldreader.com', 'anyevent',
|
||||
'nmap scripting engine', '2ip.ru', 'clickagy', 'google favicon',
|
||||
'hubspot', 'chrome-lighthouse', 'headlesschrome', 'simplescraper',
|
||||
'fedoraplanet', 'friendica', 'nextcloud', 'tiny tiny rss', 'datanyze',
|
||||
'google-site-verification', 'trendsmapresolver', 'tweetedtimes', 'gwene',
|
||||
'simplepie', 'searchatlas', 'superfeedr', 'freewebmonitoring sitechecker',
|
||||
'pandalytics', 'seewithkids', 'cincraw', 'freshrss', 'google-certificates-bridge',
|
||||
'viber', 'evc-batch', 'virustotal', 'uptime-kuma', 'feedbin',
|
||||
'snap url preview service', 'ruxitsynthetic', 'google-read-aloud',
|
||||
'mediapartners', 'wget', 'wget', 'ahrefsgot', 'ahrefssiteaudit',
|
||||
'wesee:search', 'y!j', 'collection@infegy.com', 'deusu', 'bingpreview',
|
||||
'daum', 'pingdom', 'barkrowler', 'yak', 'ning', 'ahc', 'apache-httpclient',
|
||||
'buck', 'newspaper', 'sentry', 'fetch', 'miniflux', 'validator.nu',
|
||||
'grouphigh', 'checkmarknetwork', 'www.uptime.com', 'mixnodecache',
|
||||
'domains project', 'pagepeeker', 'vigil', 'php-curl-class', 'ptst',
|
||||
'seostar.co']
|
||||
|
||||
let userAgentProperty := inputs.userAgent
|
||||
|
||||
// Check if user agent property exists in event
|
||||
if (empty(user_agent) and inputs.keepUndefinedUseragent == 'No') {
|
||||
if (empty(event.properties[userAgentProperty]) and inputs.keepUndefinedUseragent == 'No') {
|
||||
return null
|
||||
}
|
||||
|
||||
if (inputs.filterKnownBotUserAgents and isKnownBotUserAgent(user_agent)) {
|
||||
// Get the user agent value
|
||||
let user_agent := event.properties[userAgentProperty]
|
||||
|
||||
// Check for empty string
|
||||
if (user_agent == '' and inputs.keepUndefinedUseragent == 'No') {
|
||||
return null
|
||||
}
|
||||
|
||||
let bot_list := []
|
||||
// Now that we know we have a valid user agent, convert to lower case
|
||||
user_agent := lower(user_agent)
|
||||
|
||||
// Handle custom bot patterns
|
||||
let bot_list := known_bot_filter_list
|
||||
if (notEmpty(inputs.customBotPatterns)) {
|
||||
let custom_patterns := splitByString(',', inputs.customBotPatterns)
|
||||
// Add each custom pattern to the list
|
||||
@@ -34,34 +91,227 @@ if (notEmpty(inputs.customBotPatterns)) {
|
||||
}
|
||||
}
|
||||
|
||||
// If bot is detected, return null to filter out the event
|
||||
for (let bot_name in bot_list) {
|
||||
if (user_agent =~* bot_name) {
|
||||
return null
|
||||
// Function to check if user agent contains any bot identifier
|
||||
fun isBotUa(ua) {
|
||||
for (let bot_name in bot_list) {
|
||||
if (ua =~* bot_name) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
let ip := event.properties['$ip']
|
||||
if (empty(ip)) {
|
||||
return event
|
||||
}
|
||||
|
||||
if (inputs.filterKnownBotIps and isKnownBotIp(ip)) {
|
||||
// If bot is detected, return null to filter out the event
|
||||
// Otherwise return the original event
|
||||
if (isBotUa(user_agent)) {
|
||||
return null
|
||||
}
|
||||
|
||||
let bot_ip_prefixes := []
|
||||
let known_bot_ip_prefixes := [
|
||||
// ahrefs
|
||||
'5.39.1.224/27',
|
||||
'5.39.109.160/27',
|
||||
'15.235.27.0/24',
|
||||
'15.235.96.0/24',
|
||||
'15.235.98.0/24',
|
||||
'37.59.204.128/27',
|
||||
'51.68.247.192/27',
|
||||
'51.75.236.128/27',
|
||||
'51.89.129.0/24',
|
||||
'51.161.37.0/24',
|
||||
'51.161.65.0/24',
|
||||
'51.195.183.0/24',
|
||||
'51.195.215.0/24',
|
||||
'51.195.244.0/24',
|
||||
'51.222.95.0/24',
|
||||
'51.222.168.0/24',
|
||||
'51.222.253.0/26',
|
||||
'54.36.148.0/23',
|
||||
'54.37.118.64/27',
|
||||
'54.38.147.0/24',
|
||||
'54.39.0.0/24',
|
||||
'54.39.6.0/24',
|
||||
'54.39.89.0/24',
|
||||
'54.39.136.0/24',
|
||||
'54.39.203.0/24',
|
||||
'54.39.210.0/24',
|
||||
'92.222.104.192/27',
|
||||
'92.222.108.96/27',
|
||||
'94.23.188.192/27',
|
||||
'142.44.220.0/24',
|
||||
'142.44.225.0/24',
|
||||
'142.44.228.0/24',
|
||||
'142.44.233.0/24',
|
||||
'148.113.128.0/24',
|
||||
'148.113.130.0/24',
|
||||
'167.114.139.0/24',
|
||||
'168.100.149.0/24',
|
||||
'176.31.139.0/27',
|
||||
'198.244.168.0/24',
|
||||
'198.244.183.0/24',
|
||||
'198.244.186.193/32',
|
||||
'198.244.186.194/31',
|
||||
'198.244.186.196/30',
|
||||
'198.244.186.200/31',
|
||||
'198.244.186.202/32',
|
||||
'198.244.226.0/24',
|
||||
'198.244.240.0/24',
|
||||
'198.244.242.0/24',
|
||||
'202.8.40.0/22',
|
||||
'202.94.84.110/31',
|
||||
'202.94.84.112/31',
|
||||
|
||||
// bing
|
||||
'157.55.39.0/24',
|
||||
'207.46.13.0/24',
|
||||
'40.77.167.0/24',
|
||||
'13.66.139.0/24',
|
||||
'13.66.144.0/24',
|
||||
'52.167.144.0/24',
|
||||
'13.67.10.16/28',
|
||||
'13.69.66.240/28',
|
||||
'13.71.172.224/28',
|
||||
'139.217.52.0/28',
|
||||
'191.233.204.224/28',
|
||||
'20.36.108.32/28',
|
||||
'20.43.120.16/28',
|
||||
'40.79.131.208/28',
|
||||
'40.79.186.176/28',
|
||||
'52.231.148.0/28',
|
||||
'20.79.107.240/28',
|
||||
'51.105.67.0/28',
|
||||
'20.125.163.80/28',
|
||||
'40.77.188.0/22',
|
||||
'65.55.210.0/24',
|
||||
'199.30.24.0/23',
|
||||
'40.77.202.0/24',
|
||||
'40.77.139.0/25',
|
||||
'20.74.197.0/28',
|
||||
'20.15.133.160/27',
|
||||
'40.77.177.0/24',
|
||||
'40.77.178.0/23',
|
||||
|
||||
// google
|
||||
'8.8.4.0/24',
|
||||
'8.8.8.0/24',
|
||||
'8.34.208.0/20',
|
||||
'8.35.192.0/20',
|
||||
'23.236.48.0/20',
|
||||
'23.251.128.0/19',
|
||||
'34.0.0.0/15',
|
||||
'34.2.0.0/16',
|
||||
'34.3.0.0/23',
|
||||
'34.3.3.0/24',
|
||||
'34.3.4.0/24',
|
||||
'34.3.8.0/21',
|
||||
'34.3.16.0/20',
|
||||
'34.3.32.0/19',
|
||||
'34.3.64.0/18',
|
||||
'34.4.0.0/14',
|
||||
'34.8.0.0/13',
|
||||
'34.16.0.0/12',
|
||||
'34.32.0.0/11',
|
||||
'34.64.0.0/10',
|
||||
'34.128.0.0/10',
|
||||
'35.184.0.0/13',
|
||||
'35.192.0.0/14',
|
||||
'35.196.0.0/15',
|
||||
'35.198.0.0/16',
|
||||
'35.199.0.0/17',
|
||||
'35.199.128.0/18',
|
||||
'35.200.0.0/13',
|
||||
'35.208.0.0/12',
|
||||
'35.224.0.0/12',
|
||||
'35.240.0.0/13',
|
||||
'57.140.192.0/18',
|
||||
'64.15.112.0/20',
|
||||
'64.233.160.0/19',
|
||||
'66.22.228.0/23',
|
||||
'66.102.0.0/20',
|
||||
'66.249.64.0/19',
|
||||
'70.32.128.0/19',
|
||||
'72.14.192.0/18',
|
||||
'74.125.0.0/16',
|
||||
'104.154.0.0/15',
|
||||
'104.196.0.0/14',
|
||||
'104.237.160.0/19',
|
||||
'107.167.160.0/19',
|
||||
'107.178.192.0/18',
|
||||
'108.59.80.0/20',
|
||||
'108.170.192.0/18',
|
||||
'108.177.0.0/17',
|
||||
'130.211.0.0/16',
|
||||
'136.22.160.0/20',
|
||||
'136.22.176.0/21',
|
||||
'136.22.184.0/23',
|
||||
'136.22.186.0/24',
|
||||
'136.124.0.0/15',
|
||||
'142.250.0.0/15',
|
||||
'146.148.0.0/17',
|
||||
'152.65.208.0/22',
|
||||
'152.65.214.0/23',
|
||||
'152.65.218.0/23',
|
||||
'152.65.222.0/23',
|
||||
'152.65.224.0/19',
|
||||
'162.120.128.0/17',
|
||||
'162.216.148.0/22',
|
||||
'162.222.176.0/21',
|
||||
'172.110.32.0/21',
|
||||
'172.217.0.0/16',
|
||||
'172.253.0.0/16',
|
||||
'173.194.0.0/16',
|
||||
'173.255.112.0/20',
|
||||
'192.158.28.0/22',
|
||||
'192.178.0.0/15',
|
||||
'193.186.4.0/24',
|
||||
'199.36.154.0/23',
|
||||
'199.36.156.0/24',
|
||||
'199.192.112.0/22',
|
||||
'199.223.232.0/21',
|
||||
'207.223.160.0/20',
|
||||
'208.65.152.0/22',
|
||||
'208.68.108.0/22',
|
||||
'208.81.188.0/22',
|
||||
'208.117.224.0/19',
|
||||
'209.85.128.0/17',
|
||||
'216.58.192.0/19',
|
||||
'216.73.80.0/20',
|
||||
'216.239.32.0/19',
|
||||
'2001:4860::/32',
|
||||
'2404:6800::/32',
|
||||
'2404:f340::/32',
|
||||
'2600:1900::/28',
|
||||
'2605:ef80::/32',
|
||||
'2606:40::/32',
|
||||
'2606:73c0::/32',
|
||||
'2607:1c0:241:40::/60',
|
||||
'2607:1c0:300::/40',
|
||||
'2607:f8b0::/32',
|
||||
'2620:11a:a000::/40',
|
||||
'2620:120:e000::/40',
|
||||
'2800:3f0::/32',
|
||||
'2a00:1450::/32',
|
||||
'2c0f:fb50::/32',
|
||||
];
|
||||
|
||||
if (inputs.customIpPrefixes and notEmpty(inputs.customIpPrefixes)) {
|
||||
let custom_prefixes := splitByString(',', inputs.customIpPrefixes)
|
||||
// Add each custom prefix to the list
|
||||
for (let prefix in custom_prefixes) {
|
||||
if (isIPAddressInRange(ip, trim(prefix))) {
|
||||
return null
|
||||
}
|
||||
known_bot_ip_prefixes := arrayPushBack(known_bot_ip_prefixes, trim(prefix))
|
||||
}
|
||||
}
|
||||
|
||||
let ip := event.properties['$ip']
|
||||
if (notEmpty(ip)) {
|
||||
for (let prefix in known_bot_ip_prefixes) {
|
||||
if (isIPAddressInRange(ip, prefix)) {
|
||||
return null
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return event
|
||||
`,
|
||||
inputs_schema: [
|
||||
@@ -74,16 +324,6 @@ return event
|
||||
secret: false,
|
||||
required: true,
|
||||
},
|
||||
{
|
||||
key: 'filterKnownBotUserAgents',
|
||||
type: 'boolean',
|
||||
label: 'Filter out known bot user agents',
|
||||
description:
|
||||
"Filter out known bot user agents using PostHog's known bot user agents list. This is kept up to date dynamically.",
|
||||
default: true,
|
||||
secret: false,
|
||||
required: true,
|
||||
},
|
||||
{
|
||||
key: 'customBotPatterns',
|
||||
type: 'string',
|
||||
@@ -93,16 +333,6 @@ return event
|
||||
secret: false,
|
||||
required: false,
|
||||
},
|
||||
{
|
||||
key: 'filterKnownBotIps',
|
||||
type: 'boolean',
|
||||
label: 'Filter out known bot ips',
|
||||
description:
|
||||
"Filter out known bot ips using PostHog's known bot ips list. This is kept up to date dynamically.",
|
||||
default: true,
|
||||
secret: false,
|
||||
required: true,
|
||||
},
|
||||
{
|
||||
key: 'customIpPrefixes',
|
||||
type: 'string',
|
||||
|
||||
@@ -2,7 +2,6 @@ import Chance from 'chance'
|
||||
import merge from 'deepmerge'
|
||||
import { Settings } from 'luxon'
|
||||
|
||||
import { getTransformationFunctions } from '~/cdp/hog-transformations/transformation-functions'
|
||||
import { formatLiquidInput } from '~/cdp/services/hog-inputs.service'
|
||||
import { NativeDestinationExecutorService } from '~/cdp/services/native-destination-executor.service'
|
||||
import { isNativeHogFunction } from '~/cdp/utils'
|
||||
@@ -11,6 +10,7 @@ import { CyclotronInputType } from '~/schema/cyclotron'
|
||||
import { GeoIPService, GeoIp } from '~/utils/geoip'
|
||||
|
||||
import { Hub } from '../../../types'
|
||||
import { cleanNullValues } from '../../hog-transformations/transformation-functions'
|
||||
import { HogExecutorService } from '../../services/hog-executor.service'
|
||||
import {
|
||||
CyclotronJobInvocationHogFunction,
|
||||
@@ -246,8 +246,16 @@ export class TemplateTester {
|
||||
}
|
||||
|
||||
const globalsWithInputs = await this.hogExecutor.buildInputsWithGlobals(hogFunction, globals)
|
||||
|
||||
const invocation = createInvocation(globalsWithInputs, hogFunction)
|
||||
const transformationFunctions = getTransformationFunctions(this.geoIp!)
|
||||
|
||||
const transformationFunctions = {
|
||||
geoipLookup: (val: unknown): any => {
|
||||
return typeof val === 'string' ? this.geoIp?.city(val) : null
|
||||
},
|
||||
cleanNullValues,
|
||||
}
|
||||
|
||||
const extraFunctions = invocation.hogFunction.type === 'transformation' ? transformationFunctions : {}
|
||||
|
||||
return this.getExecutor().execute(invocation, { functions: extraFunctions })
|
||||
|
||||
12922
share/bot-ips.txt
12922
share/bot-ips.txt
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user