Files
posthog.com/scripts/check-links-post-build.js
2025-09-16 19:00:34 +00:00

729 lines
24 KiB
JavaScript

#!/usr/bin/env node
/* eslint-disable @typescript-eslint/no-var-requires */
/**
* PostHog Link Checker - Post-Build Validation
*
* This script validates internal links and anchor links against the ACTUAL
* built site structure. It must run after `gatsby build` to access the
* generated site data.
*
* This script:
* - Reads the generated sitemap.xml to get actual pages
* - Validates links against real page URLs (not just file paths)
* - Checks anchor links against actual built page content
* - Works with dynamic pages, templates, and plugin-generated content
*
* Usage:
* gatsby build && npm run check-links-post-build
* gatsby build && node scripts/check-links-post-build.js
*
* To output results to a file, provide a directory path as an argument:
* node scripts/check-links-post-build.js link-check-results
* node scripts/check-links-post-build.js .
* node scripts/check-links-post-build.js ../output
*
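* The script always exits with code 0, even when broken links are found, so that
* calling workflows can continue. It exits with code 1 only when the sitemap is missing.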
*/
const fs = require('fs')
const path = require('path')
const { JSDOM } = require('jsdom')
const GithubSlugger = require('github-slugger')
// ============================================================================
// CONFIGURATION
// ============================================================================
// Tunables and fixed locations used throughout the script.
const CONFIG = {
// HTML files larger than this are not scanned for anchors (treated as having none)
MAX_FILE_SIZE: 5 * 1024 * 1024, // 5MB
// Maximum number of HTML files kept in anchorCache before the oldest entry is evicted
CACHE_SIZE_LIMIT: 1000,
// Default number of markdown files handled per batch by processFilesInBatches
BATCH_SIZE: 50,
// Links ending in one of these extensions are skipped; files with these extensions yield no anchors
EXCLUDED_EXTENSIONS: ['.css', '.js', '.json', '.xml', '.svg', '.png', '.jpg', '.jpeg', '.gif', '.woff', '.woff2'],
// Links whose URL contains any of these substrings are skipped
EXCLUDE_PATTERNS: [
'/community/', // powered by Strapi
'/teams/', // powered by Strapi
'/careers/', // powered by Ashby
],
// Sitemap file read by getSitemapPages, resolved against the current working directory
SITEMAP_PATH: path.join(process.cwd(), 'public', 'sitemap', 'sitemap-0.xml'),
// Directory (relative to the current working directory) that is walked for .md/.mdx files
CONTENTS_DIR: 'contents',
// Special case URLs that should be considered valid even if not in sitemap
SPECIAL_CASE_URLS: [
'/startups', // Dynamic route in sitemap as /startups/[...slug]
],
}
// Global cache for anchor links: maps an HTML file path to the Set of anchors found in that file
const anchorCache = new Map()
// ============================================================================
// UTILITY FUNCTIONS
// ============================================================================
/**
 * Build an absolute posthog.com URL from a content file path or a link.
 *
 * - `contents/...` file paths lose the `contents/` prefix and a trailing `.md`/`.mdx`.
 * - Links starting with `/` are appended to the site origin.
 * - Values starting with `http` are returned untouched.
 * - Anything else is treated as a path relative to the site root.
 *
 * @param {string} pathOrUrl - File path, site-relative link, or full URL.
 * @returns {string} The corresponding absolute URL.
 */
function convertToPostHogUrl(pathOrUrl) {
    const origin = 'https://posthog.com'
    const contentsPrefix = 'contents/'

    if (pathOrUrl.startsWith(contentsPrefix)) {
        const pagePath = pathOrUrl.slice(contentsPrefix.length).replace(/\.(md|mdx)$/, '')
        return origin + '/' + pagePath
    }

    if (pathOrUrl.startsWith('/')) {
        return origin + pathOrUrl
    }

    return pathOrUrl.startsWith('http') ? pathOrUrl : origin + '/' + pathOrUrl
}
/**
 * Load the redirect and rewrite rules from `vercel.json` in the current working directory.
 *
 * Rewrites are returned alongside redirects because, for link validation, both
 * mean "this URL is served even though it is not a page in the sitemap".
 *
 * @returns {Array<object>} Redirect rules followed by rewrite rules; an empty
 *   array (after logging a warning) when the file cannot be read or parsed.
 */
function parseVercelRedirects() {
    try {
        const rawConfig = fs.readFileSync('vercel.json', 'utf8')
        const config = JSON.parse(rawConfig)
        const rules = []
        rules.push(...(config.redirects || []))
        rules.push(...(config.rewrites || []))
        return rules
    } catch (error) {
        console.warn('Warning: Could not parse vercel.json:', error.message)
        return []
    }
}
/**
 * Check whether a URL is covered by any redirect/rewrite rule.
 *
 * Matching is deliberately loose: a rule matches when its `source` (or, for
 * sources containing `:path*`, the part before `/:path*`) appears anywhere in
 * the URL. Rules without a string `source` are ignored instead of crashing the
 * whole run.
 *
 * @param {string} url - Link URL to test.
 * @param {Array<object>} redirects - Rules as returned by parseVercelRedirects.
 * @returns {boolean} True when at least one rule matches.
 */
function isRedirectSource(url, redirects) {
    return redirects.some((redirect) => {
        const source = redirect && redirect.source
        // Malformed rule (missing or non-string source): nothing to match against
        if (typeof source !== 'string') {
            return false
        }
        // If source contains :path*, check if URL contains the part before :path*
        if (source.includes(':path*')) {
            const pathPrefix = source.split('/:path*')[0]
            return url.includes(pathPrefix)
        }
        // Otherwise, check if URL contains the full source
        return url.includes(source)
    })
}
// ============================================================================
// SITEMAP FUNCTIONS
// ============================================================================
/**
 * Read the generated sitemap and return every posthog.com page as a site-relative path.
 *
 * Terminates the process with exit code 1 when the sitemap file does not exist,
 * since that means the site has not been built yet.
 *
 * @returns {string[]} Page paths such as `/docs/`, with the `https://posthog.com` origin removed.
 */
function getSitemapPages() {
    const sitemapFile = CONFIG.SITEMAP_PATH
    if (!fs.existsSync(sitemapFile)) {
        console.error('Error: Sitemap not found at', sitemapFile)
        console.error('Please run "gatsby build" first to generate the sitemap.')
        process.exit(1)
    }

    const origin = 'https://posthog.com'
    const xml = fs.readFileSync(sitemapFile, 'utf8')
    const { document } = new JSDOM(xml, { contentType: 'text/xml' }).window

    // Keep only <loc> entries that belong to the site, then drop the origin
    return Array.from(document.querySelectorAll('url loc'), (node) => node.textContent.trim())
        .filter((location) => location.startsWith(origin + '/'))
        .map((location) => location.replace(origin, ''))
}
/**
 * Tell whether a page path is listed in the sitemap, ignoring a single trailing slash.
 *
 * @param {string} url - Site-relative page path, with or without a trailing slash.
 * @param {string[]} pages - Page paths taken from the sitemap.
 * @returns {boolean} True when the path is present in either slash form.
 */
function urlExistsInSitemap(url, pages) {
    const withoutSlash = url.endsWith('/') ? url.slice(0, -1) : url
    return [withoutSlash, withoutSlash + '/'].some((candidate) => pages.includes(candidate))
}
// ============================================================================
// ANCHOR VALIDATION FUNCTIONS
// ============================================================================
/**
 * Map a site-relative URL to the HTML file expected for it under `public/`.
 *
 * Resolution rules, in order:
 * - the root URL maps to `public/index.html`
 * - a URL ending in `/` maps to `<url>/index.html`
 * - a URL containing a `.` is treated as a file path as-is
 * - otherwise `<url>/index.html` is preferred, then `<url>.html` if only that
 *   exists, falling back to the `index.html` form when neither exists
 *
 * @param {string} url - Site-relative URL without query or fragment.
 * @returns {string} Path of the HTML file (which may not exist).
 */
function getHtmlFilePath(url) {
    const publicDir = path.join(process.cwd(), 'public')
    const relativeUrl = url.replace(/^\//, '')

    if (relativeUrl === '') {
        return path.join(publicDir, 'index.html')
    }
    if (relativeUrl.endsWith('/')) {
        return path.join(publicDir, relativeUrl, 'index.html')
    }
    if (relativeUrl.includes('.')) {
        return path.join(publicDir, relativeUrl)
    }

    const indexCandidate = path.join(publicDir, relativeUrl, 'index.html')
    const flatCandidate = path.join(publicDir, relativeUrl + '.html')
    // Only the flat "<page>.html" form wins over the index form when it alone exists
    if (!fs.existsSync(indexCandidate) && fs.existsSync(flatCandidate)) {
        return flatCandidate
    }
    return indexCandidate
}
/**
 * Collect every anchor target found in a built HTML file, caching the result per file.
 *
 * Anchors are element `id`s, element `name`s and, for DOM-parsed files, the
 * GitHub-style slug of each heading's text. Files over CONFIG.MAX_FILE_SIZE,
 * files with an excluded extension, and files that cannot be read all yield
 * (and cache) an empty set.
 *
 * @param {string} htmlPath - Path of the HTML file to inspect.
 * @returns {Set<string>} Anchor names available in the file.
 */
function extractAnchorsFromHtml(htmlPath) {
    if (anchorCache.has(htmlPath)) {
        return anchorCache.get(htmlPath)
    }

    // Make room first: the Map iterates in insertion order, so the first key is the oldest
    if (anchorCache.size >= CONFIG.CACHE_SIZE_LIMIT) {
        const [oldestKey] = anchorCache.keys()
        anchorCache.delete(oldestKey)
    }

    // Store a result for this file and hand it back
    const remember = (found) => {
        anchorCache.set(htmlPath, found)
        return found
    }

    try {
        const { size } = fs.statSync(htmlPath)
        if (size > CONFIG.MAX_FILE_SIZE) {
            console.warn(`Skipping large file: ${htmlPath} (${size} bytes)`)
            return remember(new Set())
        }

        const extension = path.extname(htmlPath).toLowerCase()
        if (CONFIG.EXCLUDED_EXTENSIONS.includes(extension)) {
            return remember(new Set())
        }

        const html = fs.readFileSync(htmlPath, 'utf8')
        const anchors = new Set()

        if (html.length > 500000) {
            // Big documents: scan attribute text with regexes instead of building a DOM
            for (const [, id] of html.matchAll(/id="([^"]+)"/g)) {
                anchors.add(id)
            }
            for (const [, name] of html.matchAll(/name="([^"]+)"/g)) {
                anchors.add(name)
            }
        } else {
            const { document } = new JSDOM(html).window
            for (const element of document.querySelectorAll('[id]')) {
                anchors.add(element.id)
            }
            for (const element of document.querySelectorAll('[name]')) {
                anchors.add(element.name)
            }
            // Headings are also reachable through their slugified text
            const slugger = new GithubSlugger()
            for (const heading of document.querySelectorAll('h1, h2, h3, h4, h5, h6')) {
                anchors.add(slugger.slug(heading.textContent))
            }
        }

        return remember(anchors)
    } catch (error) {
        console.warn(`Warning: Could not extract anchors from ${htmlPath}:`, error.message)
        return remember(new Set())
    }
}
// ============================================================================
// LINK VALIDATION FUNCTIONS
// ============================================================================
/**
 * Decide whether the page part of an internal URL is a known page.
 *
 * The fragment (everything from the first `#`) is ignored. A page is known when
 * it is listed in CONFIG.SPECIAL_CASE_URLS or present in the sitemap.
 *
 * @param {string} url - Site-relative URL, optionally with a fragment.
 * @param {string[]} pages - Page paths taken from the sitemap.
 * @returns {boolean} True when the page is considered valid.
 */
function validateInternalUrl(url, pages) {
    const pagePath = url.split('#')[0]
    return CONFIG.SPECIAL_CASE_URLS.includes(pagePath) || urlExistsInSitemap(pagePath, pages)
}
/**
 * Check that the fragment of an internal URL points at an anchor in the built page.
 *
 * A URL without a fragment is trivially valid. Otherwise the page must be a
 * valid internal URL (sitemap entry or special-case URL, the same rule
 * validateInternalUrl applies), its HTML file must exist, and the file must
 * contain the anchor. The fragment is tried as written first and then
 * percent-decoded, so an encoded link such as `#caf%C3%A9` matches an anchor
 * named `café`.
 *
 * @param {string} url - Site-relative URL, optionally with a fragment.
 * @param {string[]} pages - Page paths taken from the sitemap.
 * @returns {boolean} True when there is no fragment or the anchor exists.
 */
function validateAnchor(url, pages) {
    const [baseUrl, anchor] = url.split('#')
    // No anchor to validate
    if (!anchor) {
        return true
    }
    // First ensure the base URL exists, honoring special-case URLs as well
    if (!validateInternalUrl(baseUrl, pages)) {
        return false
    }
    // Get the HTML file path and check if it exists
    const htmlPath = getHtmlFilePath(baseUrl)
    if (!fs.existsSync(htmlPath)) {
        return false
    }
    // Extract all anchors and check if our anchor exists
    const anchors = extractAnchorsFromHtml(htmlPath)
    if (anchors.has(anchor)) {
        return true
    }
    // Fall back to the percent-decoded fragment
    try {
        const decodedAnchor = decodeURIComponent(anchor)
        return decodedAnchor !== anchor && anchors.has(decodedAnchor)
    } catch (error) {
        // Malformed percent-encoding: the fragment cannot match any anchor
        return false
    }
}
// ============================================================================
// MARKDOWN PROCESSING FUNCTIONS
// ============================================================================
/**
 * Extract every internal markdown link (`[text](/path)`) from a file.
 *
 * Only destinations that start with a single `/` are returned; external and
 * protocol-relative (`//host/...`) links are skipped. An optional link title,
 * as in `[text](/path "Title")`, is not part of the destination and is dropped
 * from the returned URL so that it is not reported as a broken link.
 *
 * @param {string} filePath - Markdown file to read.
 * @returns {Array<{text: string, url: string, line: number, context: string}>}
 *   One entry per internal link: link text, destination URL, 1-based line
 *   number, and up to 75 characters of surrounding text on each side with
 *   newlines replaced by spaces.
 */
function extractInternalLinks(filePath) {
    const content = fs.readFileSync(filePath, 'utf8')
    const links = []
    const linkRegex = /\[([^\]]*)\]\(([^)]+)\)/g
    let lineNumber = 1
    let currentIndex = 0

    for (const match of content.matchAll(linkRegex)) {
        const linkText = match[1]
        // The destination is the first whitespace-delimited token; anything after it is a title
        const linkUrl = match[2].trim().split(/\s+/)[0]

        // Count lines up to this match, continuing from where the previous match stopped
        while (currentIndex < match.index) {
            if (content[currentIndex] === '\n') {
                lineNumber++
            }
            currentIndex++
        }

        // Only check internal links
        if (linkUrl.startsWith('/') && !linkUrl.startsWith('//')) {
            const matchEnd = match.index + match[0].length
            const beforeMatch = content.substring(Math.max(0, match.index - 75), match.index)
            const afterMatch = content.substring(matchEnd, Math.min(content.length, matchEnd + 75))
            const context = beforeMatch + match[0] + afterMatch
            links.push({
                text: linkText,
                url: linkUrl,
                line: lineNumber,
                context: context.replace(/\n/g, ' '),
            })
        }
    }
    return links
}
/**
 * Recursively list the `.md` and `.mdx` files under a directory.
 *
 * Entries are visited depth-first in the order `fs.readdirSync` returns them.
 *
 * @param {string} dir - Directory to walk.
 * @returns {string[]} Paths of the markdown files found, each prefixed with `dir`.
 */
function findMarkdownFiles(dir) {
    const isMarkdown = (fileName) => fileName.endsWith('.md') || fileName.endsWith('.mdx')

    const collect = (currentDir) =>
        fs.readdirSync(currentDir).flatMap((entry) => {
            const entryPath = path.join(currentDir, entry)
            if (fs.statSync(entryPath).isDirectory()) {
                return collect(entryPath)
            }
            return isMarkdown(entry) ? [entryPath] : []
        })

    return collect(dir)
}
/**
 * Extract internal links from markdown files, working through them in fixed-size batches.
 *
 * Progress is logged once per batch. When Node exposes `global.gc` (started
 * with `--expose-gc`), a collection is requested after every batch.
 *
 * @param {string[]} files - Markdown file paths.
 * @param {number} [batchSize=CONFIG.BATCH_SIZE] - Number of files per batch.
 * @returns {Array<{file: string, links: Array<object>}>} One entry per file, in input order.
 */
function processFilesInBatches(files, batchSize = CONFIG.BATCH_SIZE) {
    const totalBatches = Math.ceil(files.length / batchSize)
    const results = []

    for (let offset = 0; offset < files.length; offset += batchSize) {
        const batch = files.slice(offset, offset + batchSize)
        const batchNumber = Math.floor(offset / batchSize) + 1
        console.log(`Processing batch ${batchNumber}/${totalBatches} (${batch.length} files)`)

        for (const file of batch) {
            results.push({ file, links: extractInternalLinks(file) })
        }

        // Give the runtime a chance to reclaim memory before the next batch
        if (global.gc) {
            global.gc()
        }
    }

    return results
}
// ============================================================================
// LINK PROCESSING FUNCTIONS
// ============================================================================
/**
 * Validate one extracted link and classify it.
 *
 * The query string is removed before checking, but a fragment that follows it
 * is kept, so `/page?x=1#section` still has its `#section` anchor validated.
 * The excluded-extension check looks at the path only, ignoring any fragment.
 *
 * @param {{url: string, text: string, line: number, context: string}} link - Link to check.
 * @param {string} file - Markdown file the link was found in.
 * @param {string[]} pages - Page paths taken from the sitemap.
 * @param {Array<object>} redirects - Redirect/rewrite rules.
 * @returns {{type: string, brokenItem?: object}} `type` is one of `excluded`,
 *   `redirected`, `broken` (page missing), `broken_anchor` (page exists, anchor
 *   missing) or `valid`; `brokenItem` is set for the two broken types.
 */
function processLink(link, file, pages, redirects) {
    // Remove "?query" up to (not including) a following "#fragment"
    const urlWithoutQuery = link.url.replace(/\?[^#]*/, '')
    const pathOnly = urlWithoutQuery.split('#')[0]

    // Check if link should be excluded by pattern
    if (CONFIG.EXCLUDE_PATTERNS.some((pattern) => urlWithoutQuery.includes(pattern))) {
        return { type: 'excluded' }
    }
    // Check if link has file extension that should be ignored
    if (CONFIG.EXCLUDED_EXTENSIONS.some((ext) => pathOnly.endsWith(ext))) {
        return { type: 'excluded' }
    }
    // Check if this link is a redirect source
    if (isRedirectSource(urlWithoutQuery, redirects)) {
        return { type: 'redirected' }
    }

    // Report record shared by both kinds of broken result
    const describeBroken = (type) => ({
        file: file,
        link: link.url,
        text: link.text,
        line: link.line,
        context: link.context,
        type,
    })

    // First check if the internal URL is valid
    if (!validateInternalUrl(urlWithoutQuery, pages)) {
        return { type: 'broken', brokenItem: describeBroken('page') }
    }
    // If URL is valid but has anchor, check if anchor exists
    if (urlWithoutQuery.includes('#') && !validateAnchor(urlWithoutQuery, pages)) {
        return { type: 'broken_anchor', brokenItem: describeBroken('anchor') }
    }
    return { type: 'valid' }
}
/**
 * Validate every extracted link and gather broken ones together with counters.
 *
 * `anchorLinksChecked` counts links carrying a `#` (in the part before any `?`)
 * that were neither excluded nor redirected, plus every link reported with a
 * broken anchor.
 *
 * @param {Array<{file: string, links: Array<object>}>} fileResults - Links grouped by file.
 * @param {string[]} pages - Page paths taken from the sitemap.
 * @param {Array<object>} redirects - Redirect/rewrite rules.
 * @returns {{brokenLinks: object[], brokenAnchors: object[], stats: object}}
 *   Broken page links, broken anchor links, and the counters `totalLinks`,
 *   `excludedLinks`, `redirectedLinks` and `anchorLinksChecked`.
 */
function processAllLinks(fileResults, pages, redirects) {
    const brokenLinks = []
    const brokenAnchors = []
    const stats = {
        totalLinks: 0,
        excludedLinks: 0,
        redirectedLinks: 0,
        anchorLinksChecked: 0,
    }

    for (const { file, links } of fileResults) {
        stats.totalLinks += links.length

        for (const link of links) {
            const hasAnchor = link.url.split('?')[0].includes('#')
            const outcome = processLink(link, file, pages, redirects)

            if (outcome.type === 'excluded') {
                stats.excludedLinks++
                continue
            }
            if (outcome.type === 'redirected') {
                stats.redirectedLinks++
                continue
            }
            if (outcome.type === 'broken_anchor') {
                brokenAnchors.push(outcome.brokenItem)
                stats.anchorLinksChecked++
                continue
            }

            if (outcome.type === 'broken') {
                brokenLinks.push(outcome.brokenItem)
            }
            // Broken pages and valid links both count towards the anchor total when they carry one
            if ((outcome.type === 'broken' || outcome.type === 'valid') && hasAnchor) {
                stats.anchorLinksChecked++
            }
        }
    }

    return { brokenLinks, brokenAnchors, stats }
}
// ============================================================================
// RESULTS AND OUTPUT FUNCTIONS
// ============================================================================
/**
 * Assemble the JSON-serializable report for a run.
 *
 * @param {object[]} brokenLinks - Broken page links.
 * @param {object[]} brokenAnchors - Broken anchor links.
 * @param {object} stats - Counters produced by processAllLinks.
 * @param {string[]} markdownFiles - Markdown files that were scanned.
 * @param {Array<object>} redirects - Redirect/rewrite rules that were loaded.
 * @param {string[]} pages - Page paths taken from the sitemap.
 * @returns {object} Report with a timestamp, a summary of counts, the exclusion
 *   settings in effect, and one entry per broken link and per broken anchor.
 */
function createResultsObject(brokenLinks, brokenAnchors, stats, markdownFiles, redirects, pages) {
    // Shape of a single report entry, shared by broken links and broken anchors
    const toReportEntry = (item) => ({
        type: item.type,
        file: path.relative(process.cwd(), item.file),
        page: convertToPostHogUrl(item.file),
        brokenLink: item.link,
        brokenUrl: convertToPostHogUrl(item.link),
        line: item.line,
        text: item.text,
        context: item.context,
    })

    const summary = {
        totalLinks: stats.totalLinks,
        excludedLinks: stats.excludedLinks,
        redirectedLinks: stats.redirectedLinks,
        brokenLinks: brokenLinks.length,
        anchorLinksChecked: stats.anchorLinksChecked,
        brokenAnchors: brokenAnchors.length,
        htmlFilesCached: anchorCache.size,
        markdownFiles: markdownFiles.length,
        redirectPatterns: redirects.length,
        pagesInSitemap: pages.length,
    }

    return {
        timestamp: new Date().toISOString(),
        summary,
        excludePatterns: CONFIG.EXCLUDE_PATTERNS,
        excludeFileExtensions: CONFIG.EXCLUDED_EXTENSIONS,
        brokenLinks: brokenLinks.map((item) => toReportEntry(item)),
        brokenAnchors: brokenAnchors.map((item) => toReportEntry(item)),
    }
}
/**
 * Save the report as `link-check-<timestamp>.json` inside the given directory.
 *
 * Does nothing when no output path is given. A relative path is resolved
 * against the current working directory; an absolute path is used as-is. The
 * directory is created if needed. Writing is best-effort: a failure to create
 * the directory or write the file is logged, never thrown.
 *
 * @param {object} results - Report to serialize as pretty-printed JSON.
 * @param {string|null|undefined} outputPath - Target directory, or a falsy value to skip.
 * @returns {void}
 */
function writeResultsToFile(results, outputPath) {
    if (!outputPath) {
        // No output path provided, skip writing to file
        return
    }
    // resolve (not join) so that an absolute outputPath is not nested under the cwd
    const resultsDir = path.resolve(process.cwd(), outputPath)
    // ':' and '.' are replaced to keep the ISO timestamp filename-safe
    const timestamp = new Date().toISOString().replace(/[:.]/g, '-')
    const resultsFile = path.join(resultsDir, `link-check-${timestamp}.json`)
    try {
        fs.mkdirSync(resultsDir, { recursive: true })
        fs.writeFileSync(resultsFile, JSON.stringify(results, null, 2))
        console.log(`\nResults saved to: ${resultsFile}`)
    } catch (error) {
        console.error(`Error saving results: ${error.message}`)
    }
}
/**
 * Print a human-readable report of broken page links to the console.
 *
 * Prints nothing when the list is empty. Each entry is followed by an
 * 80-character dashed separator.
 *
 * @param {object[]} brokenLinks - Broken link records (`type`, `file`, `link`, `line`, `text`, `context`).
 * @returns {void}
 */
function displayBrokenLinks(brokenLinks) {
    if (brokenLinks.length === 0) return

    console.log('\nBroken links found:\n')
    for (const broken of brokenLinks) {
        const report = [
            `Error type: ${broken.type}`,
            `File: ${path.relative(process.cwd(), broken.file)}`,
            `Page: ${convertToPostHogUrl(broken.file)}`,
            `Broken link: ${broken.link}`,
            `Broken URL: ${convertToPostHogUrl(broken.link)}`,
            `Line #: ${broken.line}`,
        ]
        // The link text line is omitted for links without text
        if (broken.text) {
            report.push(`Hyperlinked text: ${broken.text}`)
        }
        report.push(`Context: ${broken.context}`, '-'.repeat(80))

        for (const entry of report) {
            console.log(entry)
        }
    }
}
/**
 * Print a human-readable report of broken anchor links to the console.
 *
 * Prints nothing when the list is empty. Each entry is followed by an
 * 80-character dashed separator.
 *
 * @param {object[]} brokenAnchors - Broken anchor records (`type`, `file`, `link`, `line`, `text`, `context`).
 * @returns {void}
 */
function displayBrokenAnchors(brokenAnchors) {
    if (brokenAnchors.length === 0) return

    console.log('\nBroken anchor links:\n')
    for (const anchor of brokenAnchors) {
        const report = [
            `Error type: ${anchor.type}`,
            `File: ${path.relative(process.cwd(), anchor.file)}`,
            `Page: ${convertToPostHogUrl(anchor.file)}`,
            `Broken link: ${anchor.link}`,
            `Broken URL: ${convertToPostHogUrl(anchor.link)}`,
            `Line #: ${anchor.line}`,
        ]
        // The link text line is omitted for links without text
        if (anchor.text) {
            report.push(`Hyperlinked text: ${anchor.text}`)
        }
        report.push(`Context: ${anchor.context}`, '-'.repeat(80))

        for (const entry of report) {
            console.log(entry)
        }
    }
}
/**
 * Print the end-of-run totals to the console.
 *
 * @param {object} stats - Counters produced by processAllLinks.
 * @param {object[]} brokenLinks - Broken page links.
 * @param {object[]} brokenAnchors - Broken anchor links.
 * @param {number} markdownFilesCount - Number of markdown files scanned.
 * @returns {void}
 */
function displaySummaryStats(stats, brokenLinks, brokenAnchors, markdownFilesCount) {
    const summaryLines = [
        `\nScanned ${markdownFilesCount} markdown files`,
        `Processed ${stats.totalLinks} internal links`,
        `Found ${stats.excludedLinks} excluded links (skipped)`,
        `Found ${stats.redirectedLinks} redirected links (skipped)`,
        `Checked ${stats.anchorLinksChecked} anchor links`,
        `Found ${brokenLinks.length} broken links`,
        `Found ${brokenAnchors.length} broken anchor links`,
        `Cached ${anchorCache.size} HTML files for anchor checking`,
    ]
    for (const summaryLine of summaryLines) {
        console.log(summaryLine)
    }
}
// ============================================================================
// MAIN FUNCTION
// ============================================================================
/**
 * Run the whole link check: load redirects, sitemap and markdown files,
 * validate every internal link, print the findings and optionally save them.
 *
 * @param {string|null} outputPath - Directory for the JSON report, or null to skip saving.
 * @returns {number} Number of broken page links (broken anchors are not included).
 */
function checkLinks(outputPath) {
    console.log('Starting post-build link validation...')

    // Gather the three inputs: redirect rules, built pages, markdown sources
    const redirects = parseVercelRedirects()
    console.log(`Found ${redirects.length} redirect/rewrite patterns`)

    const pages = getSitemapPages()
    console.log(`Found ${pages.length} pages in sitemap`)

    const markdownFiles = findMarkdownFiles(CONFIG.CONTENTS_DIR)
    console.log(`Found ${markdownFiles.length} markdown files`)

    // Extract, then validate
    const fileResults = processFilesInBatches(markdownFiles)
    const { brokenLinks, brokenAnchors, stats } = processAllLinks(fileResults, pages, redirects)

    // Report in alphabetical order of the source file
    const compareByFile = (left, right) => left.file.localeCompare(right.file)
    brokenLinks.sort(compareByFile)
    brokenAnchors.sort(compareByFile)

    displayBrokenLinks(brokenLinks)
    displayBrokenAnchors(brokenAnchors)

    const nothingBroken = brokenLinks.length === 0 && brokenAnchors.length === 0
    if (nothingBroken) {
        console.log('\nNo broken links found! 🎉')
    }

    // Totals go last so they stay visible at the bottom of the output
    displaySummaryStats(stats, brokenLinks, brokenAnchors, markdownFiles.length)

    const results = createResultsObject(brokenLinks, brokenAnchors, stats, markdownFiles, redirects, pages)
    writeResultsToFile(results, outputPath)

    return brokenLinks.length
}
// ============================================================================
// SCRIPT EXECUTION
// ============================================================================
// The first command line argument, when present, is the directory for the JSON report
const args = process.argv.slice(2)
const outputPath = args.length === 0 ? null : args[0]
console.log(
    outputPath
        ? `Results will be saved to: ${outputPath}`
        : 'No output path provided. Results will only be displayed in console.'
)

// Run the script
const brokenCount = checkLinks(outputPath)

// Broken PAGE links (not anchor links) only trigger a pointer to the report above;
// they do not change the exit code, so the calling workflow can continue
if (brokenCount > 0) {
    console.log('\nCheck the output above ☝️')
}
process.exit(0) // Always exit successfully