import Piscina from '@posthog/piscina'
import * as Sentry from '@sentry/node'
import { Server } from 'http'
import { Consumer, KafkaJSProtocolError } from 'kafkajs'
import * as schedule from 'node-schedule'
import { Counter } from 'prom-client'

import { getPluginServerCapabilities } from '../capabilities'
import { defaultConfig } from '../config/config'
import { Hub, PluginServerCapabilities, PluginsServerConfig } from '../types'
import { createHub, KafkaConfig } from '../utils/db/hub'
import { killProcess } from '../utils/kill'
import { captureEventLoopMetrics } from '../utils/metrics'
import { cancelAllScheduledJobs } from '../utils/node-schedule'
import { PubSub } from '../utils/pubsub'
import { status } from '../utils/status'
import { createPostgresPool, delay, getPiscinaStats, stalenessCheck } from '../utils/utils'
import { TeamManager } from '../worker/ingestion/team-manager'
import { makePiscina as defaultMakePiscina } from '../worker/piscina'
import { GraphileWorker } from './graphile-worker/graphile-worker'
import { loadPluginSchedule } from './graphile-worker/schedule'
import { startGraphileWorker } from './graphile-worker/worker-setup'
import { startAnalyticsEventsIngestionConsumer } from './ingestion-queues/analytics-events-ingestion-consumer'
import { startAnalyticsEventsIngestionOverflowConsumer } from './ingestion-queues/analytics-events-ingestion-overflow-consumer'
import { startAnonymousEventBufferConsumer } from './ingestion-queues/anonymous-event-buffer-consumer'
import { startJobsConsumer } from './ingestion-queues/jobs-consumer'
import { IngestionConsumer } from './ingestion-queues/kafka-queue'
import { startOnEventHandlerConsumer } from './ingestion-queues/on-event-handler-consumer'
import { startScheduledTasksConsumer } from './ingestion-queues/scheduled-tasks-consumer'
import { SessionRecordingBlobIngester } from './ingestion-queues/session-recording/session-recordings-blob-consumer'
import { startSessionRecordingEventsConsumer } from './ingestion-queues/session-recording/session-recordings-consumer'
import { createHttpServer } from './services/http-server'
import { getObjectStorage } from './services/object_storage'

const { version } = require('../../package.json')

// TODO: refactor this into a class, removing the need for many different Servers
export type ServerInstance = {
    hub: Hub
    piscina: Piscina
    queue: IngestionConsumer | null
    stop: () => Promise<void>
}

export async function startPluginsServer(
    config: Partial<PluginsServerConfig>,
    makePiscina: (config: PluginsServerConfig) => Piscina = defaultMakePiscina,
    capabilities: PluginServerCapabilities | undefined
): Promise<Partial<ServerInstance>> {
    const timer = new Date()

    const serverConfig: PluginsServerConfig = {
        ...defaultConfig,
        ...config,
    }

    status.updatePrompt(serverConfig.PLUGIN_SERVER_MODE)
    status.info('ℹ️', `${serverConfig.WORKER_CONCURRENCY} workers, ${serverConfig.TASKS_PER_WORKER} tasks per worker`)

    // Structure containing initialized clients for Postgres, Kafka, Redis, etc.
    let hub: Hub | undefined

    // Used to trigger reloads of plugin code/config
    let pubSub: PubSub | undefined

    // A Node Worker Thread pool
    let piscina: Piscina | undefined
    // Ingestion Kafka consumer. Handles both analytics events and screen
    // recording events. The functionality roughly looks like:
    //
    // 1. events come in via the /e/ and friends endpoints and are published to
    //    the plugin_events_ingestion Kafka topic.
    // 2. this queue consumes from the plugin_events_ingestion topic.
    // 3. updates or creates people in the Persons table in pg with the new
    //    event data.
    // 4. passes the event through `processEvent` on any plugins that the team
    //    has enabled.
    // 5. publishes the resulting event to a Kafka topic on which ClickHouse is
    //    listening.
    let analyticsEventsIngestionConsumer: IngestionConsumer | undefined
    let analyticsEventsIngestionOverflowConsumer: IngestionConsumer | undefined

    let onEventHandlerConsumer: IngestionConsumer | undefined

    // Kafka consumer. Handles events for which we couldn't find an existing
    // person to associate. The buffer delays the ingestion of these events
    // (default 60 seconds) to allow for the person to be created in the
    // meantime.
    let bufferConsumer: Consumer | undefined
    let stopSessionRecordingEventsConsumer: (() => void) | undefined
    let stopSessionRecordingBlobConsumer: (() => void) | undefined
    let joinSessionRecordingEventsConsumer: ((timeout?: number) => Promise<void>) | undefined
    let joinSessionRecordingBlobConsumer: ((timeout?: number) => Promise<void>) | undefined
    let jobsConsumer: Consumer | undefined
    let schedulerTasksConsumer: Consumer | undefined

    let httpServer: Server | undefined // healthcheck server

    let graphileWorker: GraphileWorker | undefined

    let closeHub: (() => Promise<void>) | undefined

    let lastActivityCheck: NodeJS.Timeout | undefined
    let stopEventLoopMetrics: (() => void) | undefined

    let shuttingDown = false
    async function closeJobs(): Promise<void> {
        shuttingDown = true
        status.info('💤', ' Shutting down gracefully...')
        lastActivityCheck && clearInterval(lastActivityCheck)

        // HACKY: Stop all consumers and the graphile worker, as well as the
        // http server. Note that we close the http server before the others to
        // ensure that e.g. if something goes wrong and we deadlock, then if
        // we're running in k8s, the liveness check will fail, and thus k8s will
        // kill the pod.
        //
        // I say hacky because we've got a weak dependency on the liveness check
        // configuration.
        httpServer?.close()
        cancelAllScheduledJobs()
        stopEventLoopMetrics?.()
        await Promise.allSettled([
            pubSub?.stop(),
            graphileWorker?.stop(),
            analyticsEventsIngestionConsumer?.stop(),
            analyticsEventsIngestionOverflowConsumer?.stop(),
            onEventHandlerConsumer?.stop(),
            bufferConsumer?.disconnect(),
            jobsConsumer?.disconnect(),
            stopSessionRecordingEventsConsumer?.(),
            stopSessionRecordingBlobConsumer?.(),
            schedulerTasksConsumer?.disconnect(),
        ])

        if (piscina) {
            await stopPiscina(piscina)
        }

        await closeHub?.()

        status.info('👋', 'Over and out!')
    }
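    // Trap termination signals and funnel them into 'beforeExit' so the graceful
    // shutdown in closeJobs() runs before the process exits.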
    for (const signal of ['SIGINT', 'SIGTERM', 'SIGHUP']) {
        process.on(signal, () => process.emit('beforeExit', 0))
    }

    process.on('beforeExit', async () => {
        // This makes async exit possible with the process waiting until jobs are closed
        await closeJobs()
        process.exit(0)
    })

    // Code list in https://kafka.apache.org/0100/protocol.html
    const kafkaJSIgnorableCodes = new Set([
        22, // ILLEGAL_GENERATION
        25, // UNKNOWN_MEMBER_ID
        27, // REBALANCE_IN_PROGRESS
    ])
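    // Log unhandled promise rejections, count Kafka protocol errors in Prometheus,
    // and report anything that isn't a routine rebalance error to Sentry.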
    process.on('unhandledRejection', (error: Error) => {
        status.error('🤮', `Unhandled Promise Rejection: ${error.stack}`)

        if (error instanceof KafkaJSProtocolError) {
            kafkaProtocolErrors.inc({
                type: error.type,
                code: error.code,
            })

            // Ignore some "business as usual" Kafka errors, send the rest to Sentry
            if (kafkaJSIgnorableCodes.has(error.code)) {
                return
            }
        }

        Sentry.captureException(error, {
            extra: { detected_at: `pluginServer.ts on unhandledRejection` },
        })
    })

    process.on('uncaughtException', async (error: Error) => {
        // If there are unhandled exceptions anywhere, perform a graceful
        // shutdown. The initial trigger for including this handler is due to
        // the graphile-worker code throwing an exception when it can't call
        // `nudge` on a worker. Unsure as to why this happens, but at any rate,
        // to ensure that we gracefully shut down Kafka consumers, for which
        // unclean shutdowns can cause considerable delay in starting to consume
        // again, we try to shut down gracefully.
        //
        // See https://nodejs.org/api/process.html#event-uncaughtexception for
        // details on the handler.
        if (shuttingDown) {
            return
        }
        status.error('🤮', `uncaught_exception`, { error: error.stack })
        await closeJobs()

        process.exit(1)
    })
    capabilities = capabilities ?? getPluginServerCapabilities(serverConfig)
    let serverInstance: (Partial<ServerInstance> & Pick<ServerInstance, 'hub'>) | undefined

    // A collection of healthchecks that should be used to validate the
    // health of the plugin-server. These are used by the /_health endpoint
    // to determine if we should trigger a restart of the pod. These should
    // be super lightweight and ideally not do any IO.
    const healthChecks: { [service: string]: () => Promise<boolean> | boolean } = {}

    try {
        // Based on the mode the plugin server was started in, we start a number
        // of different services. Mostly this is reasonably obvious from the
        // name. There is however the `queue`, which is a little more
        // complicated. Depending on the capabilities we start with, it will
        // consume from one of:
        //
        // 1. plugin_events_ingestion
        // 2. clickhouse_events_json
        // 3. clickhouse_events_json and plugin_events_ingestion
        // 4. conversion_events_buffer
        //
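        // Plugin jobs and scheduled plugin tasks both need the Graphile worker
        // and the Piscina pool set up, which happens here before their
        // consumers are started.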
        if (capabilities.processPluginJobs || capabilities.pluginScheduledTasks) {
            ;[hub, closeHub] = hub ? [hub, closeHub] : await createHub(serverConfig, null, capabilities)
            serverInstance = serverInstance ? serverInstance : { hub }

            graphileWorker = new GraphileWorker(hub)
            // `connectProducer` just runs the PostgreSQL migrations. Ideally it
            // would be great to move the migration to bin/migrate and ensure we
            // have a way for the pods to wait for the migrations to complete as
            // we do with other migrations. However, I couldn't find a
            // `graphile-worker` supported way to do this, and I don't think
            // it's that heavy so it may be fine, but something to watch out
            // for.
            await graphileWorker.connectProducer()
            piscina = piscina ?? makePiscina(serverConfig)
            await startGraphileWorker(hub, graphileWorker, piscina)

            if (capabilities.pluginScheduledTasks) {
                schedulerTasksConsumer = await startScheduledTasksConsumer({
                    piscina: piscina,
                    kafka: hub.kafka,
                    producer: hub.kafkaProducer.producer,
                    partitionConcurrency: serverConfig.KAFKA_PARTITIONS_CONSUMED_CONCURRENTLY,
                    statsd: hub.statsd,
                })
            }

            if (capabilities.processPluginJobs) {
                jobsConsumer = await startJobsConsumer({
                    kafka: hub.kafka,
                    producer: hub.kafkaProducer.producer,
                    graphileWorker: graphileWorker,
                    statsd: hub.statsd,
                })
            }
        }
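        // Analytics events ingestion: consumes events from Kafka and processes
        // them via the Piscina worker pool. The anonymous event buffer consumer
        // handles the delayed events described above.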
        if (capabilities.ingestion) {
            ;[hub, closeHub] = hub ? [hub, closeHub] : await createHub(serverConfig, null, capabilities)
            serverInstance = serverInstance ? serverInstance : { hub }

            piscina = piscina ?? makePiscina(serverConfig)
            const { queue, isHealthy: isAnalyticsEventsIngestionHealthy } = await startAnalyticsEventsIngestionConsumer(
                {
                    hub: hub,
                    piscina: piscina,
                }
            )

            analyticsEventsIngestionConsumer = queue
            healthChecks['analytics-ingestion'] = isAnalyticsEventsIngestionHealthy

            bufferConsumer = await startAnonymousEventBufferConsumer({
                hub: hub,
                piscina: piscina,
                kafka: hub.kafka,
                producer: hub.kafkaProducer,
                statsd: hub.statsd,
            })
        }
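        // Overflow variant of the analytics events ingestion consumer.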
        if (capabilities.ingestionOverflow) {
            ;[hub, closeHub] = hub ? [hub, closeHub] : await createHub(serverConfig, null, capabilities)
            serverInstance = serverInstance ? serverInstance : { hub }

            piscina = piscina ?? makePiscina(serverConfig)
            analyticsEventsIngestionOverflowConsumer = await startAnalyticsEventsIngestionOverflowConsumer({
                hub: hub,
                piscina: piscina,
            })
        }
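        // Async handlers: consume already-processed events and run the onEvent
        // plugin handlers on them.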
        if (capabilities.processAsyncHandlers) {
            ;[hub, closeHub] = hub ? [hub, closeHub] : await createHub(serverConfig, null, capabilities)
            serverInstance = serverInstance ? serverInstance : { hub }

            piscina = piscina ?? makePiscina(serverConfig)
            const { queue: onEventQueue, isHealthy: isOnEventsIngestionHealthy } = await startOnEventHandlerConsumer({
                hub: hub,
                piscina: piscina,
            })

            onEventHandlerConsumer = onEventQueue

            healthChecks['on-event-ingestion'] = isOnEventsIngestionHealthy
        }
        // If we have a hub (i.e. any of the capabilities above were started),
        // set up pub/sub listeners, the recurring scheduled jobs, event loop
        // metrics and the staleness check.
        if (hub && serverInstance) {
            pubSub = new PubSub(hub, {
                [hub.PLUGINS_RELOAD_PUBSUB_CHANNEL]: async () => {
                    status.info('⚡', 'Reloading plugins!')
                    await piscina?.broadcastTask({ task: 'reloadPlugins' })

                    if (hub?.capabilities.pluginScheduledTasks && piscina) {
                        await piscina.broadcastTask({ task: 'reloadSchedule' })
                        hub.pluginSchedule = await loadPluginSchedule(piscina)
                    }
                },
                'reset-available-features-cache': async (message) => {
                    await piscina?.broadcastTask({ task: 'resetAvailableFeaturesCache', args: JSON.parse(message) })
                },
                ...(capabilities.processAsyncHandlers
                    ? {
                          'reload-action': async (message) =>
                              await piscina?.broadcastTask({ task: 'reloadAction', args: JSON.parse(message) }),
                          'drop-action': async (message) =>
                              await piscina?.broadcastTask({ task: 'dropAction', args: JSON.parse(message) }),
                      }
                    : {}),
            })

            await pubSub.start()

            // every 5 minutes all ActionManager caches are reloaded for eventual consistency
            schedule.scheduleJob('*/5 * * * *', async () => {
                await piscina?.broadcastTask({ task: 'reloadAllActions' })
            })
            // every 5 seconds set Redis keys @posthog-plugin-server/ping and @posthog-plugin-server/version
            schedule.scheduleJob('*/5 * * * * *', async () => {
                await hub!.db!.redisSet('@posthog-plugin-server/ping', new Date().toISOString(), 60, {
                    jsonSerialize: false,
                })
                await hub!.db!.redisSet('@posthog-plugin-server/version', version, undefined, { jsonSerialize: false })
            })
            // every 10 seconds send Piscina worker pool stats to StatsD
            schedule.scheduleJob('*/10 * * * * *', () => {
                if (piscina) {
                    for (const [key, value] of Object.entries(getPiscinaStats(piscina))) {
                        if (value !== undefined) {
                            hub!.statsd?.gauge(`piscina.${key}`, value)
                        }
                    }
                }
            })

            if (hub.statsd) {
                stopEventLoopMetrics = captureEventLoopMetrics(hub.statsd, hub.instanceId)
            }

            if (serverConfig.STALENESS_RESTART_SECONDS > 0) {
                // check every 10 sec how long it has been since the last activity

                let lastFoundActivity: number
                lastActivityCheck = setInterval(() => {
                    const stalenessCheckResult = stalenessCheck(hub, serverConfig.STALENESS_RESTART_SECONDS)

                    if (
                        hub?.lastActivity &&
                        stalenessCheckResult.isServerStale &&
                        lastFoundActivity !== hub?.lastActivity
                    ) {
                        lastFoundActivity = hub?.lastActivity
                        const extra = {
                            piscina: piscina ? JSON.stringify(getPiscinaStats(piscina)) : null,
                            ...stalenessCheckResult,
                        }
                        Sentry.captureMessage(
                            `Plugin Server has not ingested events for over ${serverConfig.STALENESS_RESTART_SECONDS} seconds! Rebooting.`,
                            {
                                extra,
                            }
                        )
                        console.log(
                            `Plugin Server has not ingested events for over ${serverConfig.STALENESS_RESTART_SECONDS} seconds! Rebooting.`,
                            extra
                        )
                        hub?.statsd?.increment(`alerts.stale_plugin_server_restarted`)

                        killProcess()
                    }
                }, Math.min(serverConfig.STALENESS_RESTART_SECONDS, 10000))
            }
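            // Expose the worker pool, the main ingestion queue and the shutdown
            // hook on the returned server instance.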
            serverInstance.piscina = piscina
            serverInstance.queue = analyticsEventsIngestionConsumer
            serverInstance.stop = closeJobs

            hub.statsd?.timing('total_setup_time', timer)
            status.info('🚀', 'All systems go')

            hub.lastActivity = new Date().valueOf()
            hub.lastActivityType = 'serverStart'
        }
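        // Session recording events consumer. It does not need the full hub:
        // a Postgres pool and a TeamManager are enough.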
        if (capabilities.sessionRecordingIngestion) {
            const postgres = hub?.postgres ?? createPostgresPool(serverConfig.DATABASE_URL)
            const teamManager = hub?.teamManager ?? new TeamManager(postgres, serverConfig)
            const {
                stop,
                isHealthy: isSessionRecordingsHealthy,
                join,
            } = await startSessionRecordingEventsConsumer({
                teamManager: teamManager,
                kafkaConfig: serverConfig as KafkaConfig,
                consumerMaxBytes: serverConfig.KAFKA_CONSUMPTION_MAX_BYTES,
                consumerMaxBytesPerPartition: serverConfig.KAFKA_CONSUMPTION_MAX_BYTES_PER_PARTITION,
                consumerMaxWaitMs: serverConfig.KAFKA_CONSUMPTION_MAX_WAIT_MS,
            })
            stopSessionRecordingEventsConsumer = stop
            joinSessionRecordingEventsConsumer = join
            healthChecks['session-recordings'] = isSessionRecordingsHealthy
        }
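        // Session recording blob ingestion writes recording data to object
        // storage, so it cannot start without object storage configured.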
        if (capabilities.sessionRecordingBlobIngestion) {
            const postgres = hub?.postgres ?? createPostgresPool(serverConfig.DATABASE_URL)
            const teamManager = hub?.teamManager ?? new TeamManager(postgres, serverConfig)
            const s3 = hub?.objectStorage ?? getObjectStorage(serverConfig)
            if (!s3) {
                throw new Error("Can't start session recording blob ingestion without object storage")
            }
            const ingester = new SessionRecordingBlobIngester(teamManager, serverConfig, s3)
            await ingester.start()
            const batchConsumer = ingester.batchConsumer
            if (batchConsumer) {
                stopSessionRecordingBlobConsumer = () => ingester.stop()
                joinSessionRecordingBlobConsumer = () => batchConsumer.join()
                healthChecks['session-recordings-blob'] = () => batchConsumer.isHealthy() ?? false
            }
        }
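        // Serve the health checks collected above over HTTP, for liveness and
        // readiness probes.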
        if (capabilities.http) {
            httpServer = createHttpServer(healthChecks, analyticsEventsIngestionConsumer, piscina)
        }

        // If session recordings consumer is defined, then join it. If join
        // resolves, then the consumer has stopped and we should shut down
        // everything else. Ideally we would also join all the other background
        // tasks as well to ensure we stop the server if we hit any errors and
        // don't end up with zombie instances, but I'll leave that refactoring
        // for another time. Note that we have the liveness health checks
        // already, so in K8s cases zombies should be reaped anyway, albeit not
        // in the most efficient way.
        //
        // When extending to other consumers, we would want to do something like
        //
        // ```
        // try {
        //     await Promise.race([sessionConsumer.join(), analyticsConsumer.join(), ...])
        // } finally {
        //     await closeJobs()
        // }
        // ```
        if (joinSessionRecordingEventsConsumer) {
            joinSessionRecordingEventsConsumer().catch(closeJobs)
        }
        if (joinSessionRecordingBlobConsumer) {
            joinSessionRecordingBlobConsumer().catch(closeJobs)
        }

        return serverInstance ?? { stop: closeJobs }
    } catch (error) {
        Sentry.captureException(error)
        status.error('💥', 'Launchpad failure!', { error: error.stack ?? error })
        void Sentry.flush().catch(() => null) // Flush Sentry in the background
        await closeJobs()
        process.exit(1)
    }
}
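// Gracefully shut down the worker pool: give plugin VMs up to 5 seconds to tear
// down, flush outstanding Kafka messages, then destroy the pool.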
export async function stopPiscina(piscina: Piscina): Promise<void> {
    // Wait *up to* 5 seconds to shut down VMs.
    await Promise.race([piscina.broadcastTask({ task: 'teardownPlugins' }), delay(5000)])
    // Wait 2 seconds to flush the last queues and caches
    await Promise.all([piscina.broadcastTask({ task: 'flushKafkaMessages' }), delay(2000)])
    try {
        await piscina.destroy()
    } catch {}
}
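// Prometheus counter incremented in the unhandledRejection handler above.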
const kafkaProtocolErrors = new Counter({
    name: 'kafka_protocol_errors_total',
    help: 'Kafka protocol errors encountered, by type',
    labelNames: ['type', 'code'],
})