mirror of
https://github.com/BillyOutlast/posthog.git
synced 2026-02-06 12:11:24 +01:00
Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
134 lines
5.7 KiB
JavaScript
134 lines
5.7 KiB
JavaScript
// Export for use with actions/github-script
|
|
|
|
const DIFF_THRESHOLD = 0.02
|
|
|
|
module.exports = ({ github, context, fs }) => {
|
|
// Read the eval results
|
|
const evalResults = fs
|
|
.readFileSync('eval_results.jsonl', 'utf8')
|
|
.trim()
|
|
.split('\n')
|
|
.filter((line) => line.trim().length > 0)
|
|
.map((line) => JSON.parse(line))
|
|
|
|
if (evalResults.length === 0) {
|
|
console.warn('No eval results found')
|
|
return
|
|
}
|
|
|
|
// Calculate max diff for each experiment and categorize
|
|
const experimentsWithMaxDiff = evalResults.map((result) => {
|
|
const scores = result.scores || {}
|
|
const diffs = Object.values(scores)
|
|
.map((v) => v.diff)
|
|
.filter((d) => typeof d === 'number')
|
|
const minDiff = diffs.length > 0 ? Math.min(...diffs) : 0
|
|
const maxDiff = diffs.length > 0 ? Math.max(...diffs) : 0
|
|
const category = minDiff < -DIFF_THRESHOLD ? 'regression' : maxDiff > DIFF_THRESHOLD ? 'improvement' : 'neutral'
|
|
|
|
return {
|
|
result,
|
|
category,
|
|
maxDiffInCategoryAbs:
|
|
category === 'regression'
|
|
? -minDiff
|
|
: category === 'improvement'
|
|
? maxDiff
|
|
: Math.max(Math.abs(minDiff), Math.abs(maxDiff)),
|
|
}
|
|
})
|
|
|
|
// Sort: regressions first (most negative to least), then improvements (most positive to least), then neutral
|
|
experimentsWithMaxDiff.sort((a, b) => {
|
|
if (a.category === b.category) {
|
|
return b.maxDiffInCategoryAbs - a.maxDiffInCategoryAbs
|
|
}
|
|
const order = { regression: 0, improvement: 1, neutral: 2 }
|
|
return order[a.category] - order[b.category]
|
|
})
|
|
|
|
// Generate experiment summaries
|
|
const experimentSummaries = experimentsWithMaxDiff.map(({ result }) => {
|
|
// Format scores as bullet points with improvements/regressions and baseline comparison
|
|
const scoresList = Object.entries(result.scores || {})
|
|
.map(([key, value]) => {
|
|
const score = typeof value.score === 'number' ? `${(value.score * 100).toFixed(2)}%` : value.score
|
|
let baselineComparison = null
|
|
const diffHighlight = Math.abs(value.diff) > DIFF_THRESHOLD ? '**' : ''
|
|
let diffEmoji = '🆕'
|
|
if (result.comparison_experiment_name?.startsWith('master-')) {
|
|
baselineComparison = `${diffHighlight}${value.diff > 0 ? '+' : value.diff < 0 ? '' : '±'}${(
|
|
value.diff * 100
|
|
).toFixed(
|
|
2
|
|
)}%${diffHighlight} (improvements: ${value.improvements}, regressions: ${value.regressions})`
|
|
diffEmoji = value.diff > DIFF_THRESHOLD ? '🟢' : value.diff < -DIFF_THRESHOLD ? '🔴' : '🔵'
|
|
}
|
|
return `${diffEmoji} **${key}**: **${score}**${baselineComparison ? `, ${baselineComparison}` : ''}`
|
|
})
|
|
.join('\n')
|
|
|
|
// Format key metrics concisely
|
|
const metrics = result.metrics || {}
|
|
const duration = metrics.duration ? `⏱️ ${metrics.duration.metric.toFixed(2)} s` : null
|
|
const totalTokens = metrics.total_tokens ? `🔢 ${Math.floor(metrics.total_tokens.metric)} tokens` : null
|
|
const cost = metrics.estimated_cost ? `💵 $${metrics.estimated_cost.metric.toFixed(4)} in tokens` : null
|
|
const metricsText = [duration, totalTokens, cost].filter(Boolean).join(', ')
|
|
const baselineLink = `[${result.comparison_experiment_name}](${result.project_url}/experiments/${result.comparison_experiment_name})`
|
|
|
|
// Create concise experiment summary with header only showing experiment name
|
|
const experimentName = result.project_name.replace(/^max-ai-/, '')
|
|
|
|
return [
|
|
`### [${experimentName}](${result.experiment_url})`,
|
|
scoresList,
|
|
`Baseline: ${baselineLink} • Avg. case performance: ${metricsText}`,
|
|
].join('\n\n')
|
|
})
|
|
|
|
// Split summaries by category
|
|
const regressions = []
|
|
const improvements = []
|
|
const neutral = []
|
|
experimentsWithMaxDiff.forEach(({ category }, idx) => {
|
|
if (category === 'regression') regressions.push(experimentSummaries[idx])
|
|
else if (category === 'improvement') improvements.push(experimentSummaries[idx])
|
|
else neutral.push(experimentSummaries[idx])
|
|
})
|
|
|
|
const totalExperiments = evalResults.length
|
|
const totalMetrics = evalResults.reduce((acc, result) => acc + Object.keys(result.scores || {}).length, 0)
|
|
|
|
const bodyParts = [
|
|
`## 🧠 AI eval results`,
|
|
`Evaluated **${totalExperiments}** experiments, comprising **${totalMetrics}** metrics. Showing experiments with largest regressions first.`,
|
|
]
|
|
|
|
bodyParts.push(...regressions)
|
|
bodyParts.push(...improvements)
|
|
if (neutral.length > 0) {
|
|
bodyParts.push(
|
|
`<details><summary>${neutral.length} ${neutral.length === 1 ? 'experiment' : 'experiments'} with no significant changes</summary>\n\n${neutral.join('\n\n')}\n\n</details>`
|
|
)
|
|
}
|
|
|
|
bodyParts.push(
|
|
`_Triggered by [this commit](https://github.com/${context.repo.owner}/${context.repo.repo}/pull/${context.payload.pull_request.number}/commits/${context.payload.pull_request.head.sha})._`
|
|
)
|
|
|
|
const body = bodyParts.join('\n\n')
|
|
|
|
// Post comment on PR
|
|
if (context.payload.pull_request) {
|
|
github.rest.issues.createComment({
|
|
issue_number: context.issue.number,
|
|
owner: context.repo.owner,
|
|
repo: context.repo.repo,
|
|
body: body,
|
|
})
|
|
} else {
|
|
// Just log the summary if this is a push to master
|
|
console.info(body)
|
|
}
|
|
}
|