posthog/plugin-server/src/main/pluginsServer.ts

import Piscina from '@posthog/piscina'
import * as Sentry from '@sentry/node'
import { Server } from 'http'
import { Consumer, KafkaJSProtocolError } from 'kafkajs'
import * as schedule from 'node-schedule'
import { Counter } from 'prom-client'
import { getPluginServerCapabilities } from '../capabilities'
import { defaultConfig } from '../config/config'
import { Hub, PluginServerCapabilities, PluginsServerConfig } from '../types'
import { createHub, KafkaConfig } from '../utils/db/hub'
import { killProcess } from '../utils/kill'
import { captureEventLoopMetrics } from '../utils/metrics'
import { cancelAllScheduledJobs } from '../utils/node-schedule'
import { PubSub } from '../utils/pubsub'
import { status } from '../utils/status'
import { createPostgresPool, delay, getPiscinaStats, stalenessCheck } from '../utils/utils'
import { TeamManager } from '../worker/ingestion/team-manager'
import { makePiscina as defaultMakePiscina } from '../worker/piscina'
import { GraphileWorker } from './graphile-worker/graphile-worker'
import { loadPluginSchedule } from './graphile-worker/schedule'
import { startGraphileWorker } from './graphile-worker/worker-setup'
import { startAnalyticsEventsIngestionConsumer } from './ingestion-queues/analytics-events-ingestion-consumer'
import { startAnalyticsEventsIngestionOverflowConsumer } from './ingestion-queues/analytics-events-ingestion-overflow-consumer'
import { startAnonymousEventBufferConsumer } from './ingestion-queues/anonymous-event-buffer-consumer'
import { startJobsConsumer } from './ingestion-queues/jobs-consumer'
import { IngestionConsumer } from './ingestion-queues/kafka-queue'
import { startOnEventHandlerConsumer } from './ingestion-queues/on-event-handler-consumer'
import { startScheduledTasksConsumer } from './ingestion-queues/scheduled-tasks-consumer'
import { SessionRecordingBlobIngester } from './ingestion-queues/session-recording/session-recordings-blob-consumer'
import { startSessionRecordingEventsConsumer } from './ingestion-queues/session-recording/session-recordings-consumer'
import { createHttpServer } from './services/http-server'
import { getObjectStorage } from './services/object_storage'

const { version } = require('../../package.json')

// TODO: refactor this into a class, removing the need for many different Servers
export type ServerInstance = {
    hub: Hub
    piscina: Piscina
    queue: IngestionConsumer | null
    stop: () => Promise<void>
}

export async function startPluginsServer(
    config: Partial<PluginsServerConfig>,
    makePiscina: (config: PluginsServerConfig) => Piscina = defaultMakePiscina,
    capabilities: PluginServerCapabilities | undefined
): Promise<Partial<ServerInstance>> {
    const timer = new Date()

    const serverConfig: PluginsServerConfig = {
        ...defaultConfig,
        ...config,
    }

    status.updatePrompt(serverConfig.PLUGIN_SERVER_MODE)
    status.info('', `${serverConfig.WORKER_CONCURRENCY} workers, ${serverConfig.TASKS_PER_WORKER} tasks per worker`)

    // Structure containing initialized clients for Postgres, Kafka, Redis, etc.
    let hub: Hub | undefined

    // Used to trigger reloads of plugin code/config
    let pubSub: PubSub | undefined

    // A Node Worker Thread pool
    let piscina: Piscina | undefined

    // Ingestion Kafka consumer. Handles both analytics events and screen
    // recording events. The functionality roughly looks like:
    //
    // 1. Events come in via the /e/ and friends endpoints and are published
    //    to the plugin_events_ingestion Kafka topic.
    // 2. This queue consumes from the plugin_events_ingestion topic.
    // 3. It updates or creates people in the Persons table in Postgres with
    //    the new event data.
    // 4. It passes the event through `processEvent` on any plugins that the
    //    team has enabled.
    // 5. It publishes the resulting event to a Kafka topic on which
    //    ClickHouse is listening.
    let analyticsEventsIngestionConsumer: IngestionConsumer | undefined
    let analyticsEventsIngestionOverflowConsumer: IngestionConsumer | undefined
    let onEventHandlerConsumer: IngestionConsumer | undefined

    // Kafka consumer. Handles events for which we couldn't find an existing
    // person to associate them with. The buffer delays the ingestion of these
    // events (default 60 seconds) to allow for the person to be created in
    // the meantime.
    let bufferConsumer: Consumer | undefined

    let stopSessionRecordingEventsConsumer: (() => void) | undefined
    let stopSessionRecordingBlobConsumer: (() => void) | undefined
    let joinSessionRecordingEventsConsumer: ((timeout?: number) => Promise<void>) | undefined
    let joinSessionRecordingBlobConsumer: ((timeout?: number) => Promise<void>) | undefined
    let jobsConsumer: Consumer | undefined
    let schedulerTasksConsumer: Consumer | undefined
    let httpServer: Server | undefined // healthcheck server
    let graphileWorker: GraphileWorker | undefined

    let closeHub: (() => Promise<void>) | undefined
    let lastActivityCheck: NodeJS.Timeout | undefined
    let stopEventLoopMetrics: (() => void) | undefined
    let shuttingDown = false

    async function closeJobs(): Promise<void> {
        shuttingDown = true
        status.info('💤', ' Shutting down gracefully...')
        lastActivityCheck && clearInterval(lastActivityCheck)

        // HACKY: Stop all consumers and the graphile worker, as well as the
        // http server. Note that we close the http server before the others to
        // ensure that e.g. if something goes wrong and we deadlock, then if
        // we're running in k8s, the liveness check will fail, and thus k8s will
        // kill the pod.
        //
        // I say hacky because we've got a weak dependency on the liveness check
        // configuration.
        httpServer?.close()
        cancelAllScheduledJobs()
        stopEventLoopMetrics?.()
        await Promise.allSettled([
            pubSub?.stop(),
            graphileWorker?.stop(),
            analyticsEventsIngestionConsumer?.stop(),
            analyticsEventsIngestionOverflowConsumer?.stop(),
            onEventHandlerConsumer?.stop(),
            bufferConsumer?.disconnect(),
            jobsConsumer?.disconnect(),
            stopSessionRecordingEventsConsumer?.(),
            stopSessionRecordingBlobConsumer?.(),
            schedulerTasksConsumer?.disconnect(),
        ])

        if (piscina) {
            await stopPiscina(piscina)
        }

        await closeHub?.()
        status.info('👋', 'Over and out!')
    }
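
    // Route the usual termination signals through 'beforeExit' so they all share
    // the graceful shutdown path in closeJobs().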
    for (const signal of ['SIGINT', 'SIGTERM', 'SIGHUP']) {
        process.on(signal, () => process.emit('beforeExit', 0))
    }

    process.on('beforeExit', async () => {
        // This makes async exit possible with the process waiting until jobs are closed
        await closeJobs()
        process.exit(0)
    })

    // Code list in https://kafka.apache.org/0100/protocol.html
    const kafkaJSIgnorableCodes = new Set([
        22, // ILLEGAL_GENERATION
        25, // UNKNOWN_MEMBER_ID
        27, // REBALANCE_IN_PROGRESS
    ])

    process.on('unhandledRejection', (error: Error) => {
        status.error('🤮', `Unhandled Promise Rejection: ${error.stack}`)

        if (error instanceof KafkaJSProtocolError) {
            kafkaProtocolErrors.inc({
                type: error.type,
                code: error.code,
            })

            // Ignore some "business as usual" Kafka errors, send the rest to Sentry
            if (kafkaJSIgnorableCodes.has(error.code)) {
                return
            }
        }

        Sentry.captureException(error, {
            extra: { detected_at: `pluginServer.ts on unhandledRejection` },
        })
    })

    process.on('uncaughtException', async (error: Error) => {
        // If there are unhandled exceptions anywhere, perform a graceful
        // shutdown. The initial trigger for including this handler is due to
        // the graphile-worker code throwing an exception when it can't call
        // `nudge` on a worker. Unsure as to why this happens, but at any rate,
        // unclean shutdowns of Kafka consumers can cause considerable delay in
        // starting to consume again, so we try to shut down gracefully.
        //
        // See https://nodejs.org/api/process.html#event-uncaughtexception for
        // details on the handler.
        if (shuttingDown) {
            return
        }

        status.error('🤮', `uncaught_exception`, { error: error.stack })
        await closeJobs()

        process.exit(1)
    })

    capabilities = capabilities ?? getPluginServerCapabilities(serverConfig)
    let serverInstance: (Partial<ServerInstance> & Pick<ServerInstance, 'hub'>) | undefined

    // A collection of healthchecks that should be used to validate the
    // health of the plugin-server. These are used by the /_health endpoint
    // to determine if we should trigger a restart of the pod. These should
    // be super lightweight and ideally not do any IO.
    const healthChecks: { [service: string]: () => Promise<boolean> | boolean } = {}

    try {
        // Based on the mode the plugin server was started in, we start a
        // number of different services. Mostly this is reasonably obvious from
        // the name. There is however the `queue`, which is a little more
        // complicated. Depending on the capabilities we start with, it will
        // consume from one of:
        //
        //  1. plugin_events_ingestion
        //  2. clickhouse_events_json
        //  3. clickhouse_events_json and plugin_events_ingestion
        //  4. conversion_events_buffer
        //
        if (capabilities.processPluginJobs || capabilities.pluginScheduledTasks) {
            ;[hub, closeHub] = hub ? [hub, closeHub] : await createHub(serverConfig, null, capabilities)
            serverInstance = serverInstance ? serverInstance : { hub }

            graphileWorker = new GraphileWorker(hub)
            // `connectProducer` just runs the PostgreSQL migrations. Ideally it
            // would be great to move the migration to bin/migrate and ensure we
            // have a way for the pods to wait for the migrations to complete as
            // we do with other migrations. However, I couldn't find a
            // `graphile-worker` supported way to do this, and I don't think
            // it's that heavy so it may be fine, but something to watch out
            // for.
            await graphileWorker.connectProducer()

            piscina = piscina ?? makePiscina(serverConfig)
            await startGraphileWorker(hub, graphileWorker, piscina)
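
            // Plugin scheduled tasks are dispatched via a Kafka topic; this consumer
            // picks them up and runs them on the Piscina worker pool.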
            if (capabilities.pluginScheduledTasks) {
                schedulerTasksConsumer = await startScheduledTasksConsumer({
                    piscina: piscina,
                    kafka: hub.kafka,
                    producer: hub.kafkaProducer.producer,
                    partitionConcurrency: serverConfig.KAFKA_PARTITIONS_CONSUMED_CONCURRENTLY,
                    statsd: hub.statsd,
                })
            }
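
            // Plugin job requests also arrive via Kafka; this consumer hands them to
            // the Graphile worker, which is what actually executes the jobs.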
            if (capabilities.processPluginJobs) {
                jobsConsumer = await startJobsConsumer({
                    kafka: hub.kafka,
                    producer: hub.kafkaProducer.producer,
                    graphileWorker: graphileWorker,
                    statsd: hub.statsd,
                })
            }
        }
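
        // Main analytics events ingestion, i.e. the pipeline described in the comment
        // on `analyticsEventsIngestionConsumer` above.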
        if (capabilities.ingestion) {
            ;[hub, closeHub] = hub ? [hub, closeHub] : await createHub(serverConfig, null, capabilities)
            serverInstance = serverInstance ? serverInstance : { hub }
            piscina = piscina ?? makePiscina(serverConfig)

            const { queue, isHealthy: isAnalyticsEventsIngestionHealthy } = await startAnalyticsEventsIngestionConsumer(
                {
                    hub: hub,
                    piscina: piscina,
                }
            )
            analyticsEventsIngestionConsumer = queue
            healthChecks['analytics-ingestion'] = isAnalyticsEventsIngestionHealthy

            bufferConsumer = await startAnonymousEventBufferConsumer({
                hub: hub,
                piscina: piscina,
                kafka: hub.kafka,
                producer: hub.kafkaProducer,
                statsd: hub.statsd,
            })
        }
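
        // Same ingestion pipeline as above, but consuming from the overflow variant
        // of the analytics events ingestion topic.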
        if (capabilities.ingestionOverflow) {
            ;[hub, closeHub] = hub ? [hub, closeHub] : await createHub(serverConfig, null, capabilities)
            serverInstance = serverInstance ? serverInstance : { hub }
            piscina = piscina ?? makePiscina(serverConfig)

            analyticsEventsIngestionOverflowConsumer = await startAnalyticsEventsIngestionOverflowConsumer({
                hub: hub,
                piscina: piscina,
            })
        }
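
        // Async handlers: run plugins' onEvent handlers against events that have
        // already been through ingestion.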
        if (capabilities.processAsyncHandlers) {
            ;[hub, closeHub] = hub ? [hub, closeHub] : await createHub(serverConfig, null, capabilities)
            serverInstance = serverInstance ? serverInstance : { hub }
            piscina = piscina ?? makePiscina(serverConfig)

            const { queue: onEventQueue, isHealthy: isOnEventsIngestionHealthy } = await startOnEventHandlerConsumer({
                hub: hub,
                piscina: piscina,
            })
            onEventHandlerConsumer = onEventQueue
            healthChecks['on-event-ingestion'] = isOnEventsIngestionHealthy
        }

        // If we created a hub above, wire up pub/sub reloads, the recurring
        // scheduled jobs, StatsD metrics, and the staleness check.
        if (hub && serverInstance) {
            pubSub = new PubSub(hub, {
                [hub.PLUGINS_RELOAD_PUBSUB_CHANNEL]: async () => {
                    status.info('⚡', 'Reloading plugins!')
                    await piscina?.broadcastTask({ task: 'reloadPlugins' })

                    if (hub?.capabilities.pluginScheduledTasks && piscina) {
                        await piscina.broadcastTask({ task: 'reloadSchedule' })
                        hub.pluginSchedule = await loadPluginSchedule(piscina)
                    }
                },
                'reset-available-features-cache': async (message) => {
                    await piscina?.broadcastTask({ task: 'resetAvailableFeaturesCache', args: JSON.parse(message) })
                },
                ...(capabilities.processAsyncHandlers
                    ? {
                          'reload-action': async (message) =>
                              await piscina?.broadcastTask({ task: 'reloadAction', args: JSON.parse(message) }),
                          'drop-action': async (message) =>
                              await piscina?.broadcastTask({ task: 'dropAction', args: JSON.parse(message) }),
                      }
                    : {}),
            })
            await pubSub.start()

            // every 5 minutes all ActionManager caches are reloaded for eventual consistency
            schedule.scheduleJob('*/5 * * * *', async () => {
                await piscina?.broadcastTask({ task: 'reloadAllActions' })
            })

            // every 5 seconds set Redis keys @posthog-plugin-server/ping and @posthog-plugin-server/version
            schedule.scheduleJob('*/5 * * * * *', async () => {
                await hub!.db!.redisSet('@posthog-plugin-server/ping', new Date().toISOString(), 60, {
                    jsonSerialize: false,
                })
                await hub!.db!.redisSet('@posthog-plugin-server/version', version, undefined, { jsonSerialize: false })
            })

            // every 10 seconds send Piscina worker pool stats to StatsD
            schedule.scheduleJob('*/10 * * * * *', () => {
                if (piscina) {
                    for (const [key, value] of Object.entries(getPiscinaStats(piscina))) {
                        if (value !== undefined) {
                            hub!.statsd?.gauge(`piscina.${key}`, value)
                        }
                    }
                }
            })

            if (hub.statsd) {
                stopEventLoopMetrics = captureEventLoopMetrics(hub.statsd, hub.instanceId)
            }

            if (serverConfig.STALENESS_RESTART_SECONDS > 0) {
                // check every 10 sec how long it has been since the last activity
                let lastFoundActivity: number
                lastActivityCheck = setInterval(() => {
                    const stalenessCheckResult = stalenessCheck(hub, serverConfig.STALENESS_RESTART_SECONDS)
                    if (
                        hub?.lastActivity &&
                        stalenessCheckResult.isServerStale &&
                        lastFoundActivity !== hub?.lastActivity
                    ) {
                        lastFoundActivity = hub?.lastActivity
                        const extra = {
                            piscina: piscina ? JSON.stringify(getPiscinaStats(piscina)) : null,
                            ...stalenessCheckResult,
                        }
                        Sentry.captureMessage(
                            `Plugin Server has not ingested events for over ${serverConfig.STALENESS_RESTART_SECONDS} seconds! Rebooting.`,
                            {
                                extra,
                            }
                        )
                        console.log(
                            `Plugin Server has not ingested events for over ${serverConfig.STALENESS_RESTART_SECONDS} seconds! Rebooting.`,
                            extra
                        )
                        hub?.statsd?.increment(`alerts.stale_plugin_server_restarted`)

                        killProcess()
                    }
                }, Math.min(serverConfig.STALENESS_RESTART_SECONDS, 10000))
            }

            serverInstance.piscina = piscina
            serverInstance.queue = analyticsEventsIngestionConsumer
            serverInstance.stop = closeJobs

            hub.statsd?.timing('total_setup_time', timer)

            status.info('🚀', 'All systems go')

            hub.lastActivity = new Date().valueOf()
            hub.lastActivityType = 'serverStart'
        }
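
        // Session recordings consumer. It only needs team lookups, so when no hub was
        // created above it builds its own Postgres pool and TeamManager.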
        if (capabilities.sessionRecordingIngestion) {
            const postgres = hub?.postgres ?? createPostgresPool(serverConfig.DATABASE_URL)
            const teamManager = hub?.teamManager ?? new TeamManager(postgres, serverConfig)
            const {
                stop,
                isHealthy: isSessionRecordingsHealthy,
                join,
            } = await startSessionRecordingEventsConsumer({
                teamManager: teamManager,
                kafkaConfig: serverConfig as KafkaConfig,
                consumerMaxBytes: serverConfig.KAFKA_CONSUMPTION_MAX_BYTES,
                consumerMaxBytesPerPartition: serverConfig.KAFKA_CONSUMPTION_MAX_BYTES_PER_PARTITION,
                consumerMaxWaitMs: serverConfig.KAFKA_CONSUMPTION_MAX_WAIT_MS,
            })
            stopSessionRecordingEventsConsumer = stop
            joinSessionRecordingEventsConsumer = join
            healthChecks['session-recordings'] = isSessionRecordingsHealthy
        }
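
        // Blob-based session recordings ingestion: recording data is batched and
        // flushed to object storage, hence the hard requirement on S3 configuration.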
        if (capabilities.sessionRecordingBlobIngestion) {
            const postgres = hub?.postgres ?? createPostgresPool(serverConfig.DATABASE_URL)
            const teamManager = hub?.teamManager ?? new TeamManager(postgres, serverConfig)
            const s3 = hub?.objectStorage ?? getObjectStorage(serverConfig)

            if (!s3) {
                throw new Error("Can't start session recording blob ingestion without object storage")
            }

            const ingester = new SessionRecordingBlobIngester(teamManager, serverConfig, s3)
            await ingester.start()

            const batchConsumer = ingester.batchConsumer
            if (batchConsumer) {
                stopSessionRecordingBlobConsumer = () => ingester.stop()
                joinSessionRecordingBlobConsumer = () => batchConsumer.join()
                healthChecks['session-recordings-blob'] = () => batchConsumer.isHealthy() ?? false
            }
        }
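
        // Healthcheck/liveness HTTP server, exposing the health checks collected above.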
        if (capabilities.http) {
            httpServer = createHttpServer(healthChecks, analyticsEventsIngestionConsumer, piscina)
        }

        // If the session recordings consumer is defined, then join it. If join
        // resolves, then the consumer has stopped and we should shut down
        // everything else. Ideally we would also join all the other background
        // tasks as well to ensure we stop the server if we hit any errors and
        // don't end up with zombie instances, but I'll leave that refactoring
        // for another time. Note that we have the liveness health checks
        // already, so in K8s cases zombies should be reaped anyway, albeit not
        // in the most efficient way.
        //
        // When extending to other consumers, we would want to do something like
        //
        // ```
        // try {
        //     await Promise.race([sessionConsumer.join(), analyticsConsumer.join(), ...])
        // } finally {
        //     await closeJobs()
        // }
        // ```
        if (joinSessionRecordingEventsConsumer) {
            joinSessionRecordingEventsConsumer().catch(closeJobs)
        }
        if (joinSessionRecordingBlobConsumer) {
            joinSessionRecordingBlobConsumer().catch(closeJobs)
        }

        return serverInstance ?? { stop: closeJobs }
    } catch (error) {
        Sentry.captureException(error)
        status.error('💥', 'Launchpad failure!', { error: error.stack ?? error })
        void Sentry.flush().catch(() => null) // Flush Sentry in the background
        await closeJobs()
        process.exit(1)
    }
}

export async function stopPiscina(piscina: Piscina): Promise<void> {
    // Wait *up to* 5 seconds to shut down VMs.
    await Promise.race([piscina.broadcastTask({ task: 'teardownPlugins' }), delay(5000)])
    // Wait 2 seconds to flush the last queues and caches
    await Promise.all([piscina.broadcastTask({ task: 'flushKafkaMessages' }), delay(2000)])
    try {
        await piscina.destroy()
    } catch {}
}
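
// Prometheus counter, incremented from the unhandledRejection handler above whenever
// KafkaJS reports a protocol error, labelled by error type and code.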
const kafkaProtocolErrors = new Counter({
    name: 'kafka_protocol_errors_total',
    help: 'Kafka protocol errors encountered, by type',
    labelNames: ['type', 'code'],
})