posthog.com/scripts/check-links-post-build.js

#!/usr/bin/env node

/* eslint-disable @typescript-eslint/no-var-requires */

/**
 * PostHog Link Checker - Post-Build Validation
 *
 * This script validates internal links and anchor links against the ACTUAL
 * built site structure. It must run after `gatsby build` to access the
 * generated site data.
 *
 * This script:
 * - Reads the generated sitemap.xml to get actual pages
 * - Validates links against real page URLs (not just file paths)
 * - Checks anchor links against actual built page content
 * - Works with dynamic pages, templates, and plugin-generated content
 *
 * Usage:
 *   gatsby build && npm run check-links-post-build
 *   gatsby build && node scripts/check-links-post-build.js
 *
 * To output results to a file, provide a directory path as an argument:
 *   node scripts/check-links-post-build.js link-check-results
 *   node scripts/check-links-post-build.js .
 *   node scripts/check-links-post-build.js ../output
 *
 * The script will exit with code 1 if any broken links are found.
 */

const fs = require('fs')
const path = require('path')
const { JSDOM } = require('jsdom')
const GithubSlugger = require('github-slugger')

// ============================================================================
// CONFIGURATION
// ============================================================================

const CONFIG = {
    MAX_FILE_SIZE: 5 * 1024 * 1024, // 5MB
    CACHE_SIZE_LIMIT: 1000,
    BATCH_SIZE: 50,
    EXCLUDED_EXTENSIONS: ['.css', '.js', '.json', '.xml', '.svg', '.png', '.jpg', '.jpeg', '.gif', '.woff', '.woff2'],
    EXCLUDE_PATTERNS: [
        '/community/', // powered by Strapi
        '/teams/', // powered by Strapi
        '/careers/', // powered by Ashby
    ],
    SITEMAP_PATH: path.join(process.cwd(), 'public', 'sitemap', 'sitemap-0.xml'),
    CONTENTS_DIR: 'contents',
    // Special case URLs that should be considered valid even if not in sitemap
    SPECIAL_CASE_URLS: [
        '/startups', // Dynamic route in sitemap as /startups/[...slug]
    ],
}

// Global cache for anchor links
const anchorCache = new Map()

// ============================================================================
// UTILITY FUNCTIONS
// ============================================================================

// Utility function to convert file paths or internal links to full PostHog URLs
function convertToPostHogUrl(pathOrUrl) {
    const baseUrl = 'https://posthog.com'

    // Case 1: File path (starts with 'contents/')
    if (pathOrUrl.startsWith('contents/')) {
        // Remove 'contents/' prefix and file extension
        let urlPath = pathOrUrl.replace(/^contents\//, '')
        urlPath = urlPath.replace(/\.(md|mdx)$/, '')
        return `${baseUrl}/${urlPath}`
    }

    // Case 2: Internal link (starts with '/')
    if (pathOrUrl.startsWith('/')) {
        return `${baseUrl}${pathOrUrl}`
    }

    // Case 3: Already a full URL or relative path
    if (pathOrUrl.startsWith('http')) {
        return pathOrUrl
    }

    // Default: treat as internal path
    return `${baseUrl}/${pathOrUrl}`
}

// Parse vercel.json to extract redirect patterns
function parseVercelRedirects() {
    try {
        const vercelConfig = JSON.parse(fs.readFileSync('vercel.json', 'utf8'))
        const redirects = vercelConfig.redirects || []
        const rewrites = vercelConfig.rewrites || []

        // Combine redirects and rewrites (rewrites also act as redirects for link validation)
        return [...redirects, ...rewrites]
    } catch (error) {
        console.warn('Warning: Could not parse vercel.json:', error.message)
        return []
    }
}

// Check if a URL contains a redirect source
function isRedirectSource(url, redirects) {
    return redirects.some((redirect) => {
        // If source contains :path*, check if URL contains the part before :path*
        if (redirect.source.includes(':path*')) {
            const pathPrefix = redirect.source.split('/:path*')[0]
            return url.includes(pathPrefix)
        }
        // Otherwise, check if URL contains the full source
        return url.includes(redirect.source)
    })
}

// ============================================================================
// SITEMAP FUNCTIONS
// ============================================================================

// Get all pages from the sitemap
function getSitemapPages() {
    if (!fs.existsSync(CONFIG.SITEMAP_PATH)) {
        console.error('Error: Sitemap not found at', CONFIG.SITEMAP_PATH)
        console.error('Please run "gatsby build" first to generate the sitemap.')
        process.exit(1)
    }

    const sitemapContent = fs.readFileSync(CONFIG.SITEMAP_PATH, 'utf8')
    const dom = new JSDOM(sitemapContent, { contentType: 'text/xml' })
    const urlNodes = dom.window.document.querySelectorAll('url loc')

    const pages = []
    urlNodes.forEach((node) => {
        const url = node.textContent.trim()
        if (url.startsWith('https://posthog.com/')) {
            pages.push(url.replace('https://posthog.com', ''))
        }
    })

    return pages
}

// Check if URL exists in sitemap
function urlExistsInSitemap(url, pages) {
    // Remove trailing slash for comparison
    const normalizedUrl = url.replace(/\/$/, '')
    return pages.includes(normalizedUrl) || pages.includes(normalizedUrl + '/')
}

// ============================================================================
// ANCHOR VALIDATION FUNCTIONS
// ============================================================================

// Better file path resolution for Gatsby's output structure
function getHtmlFilePath(url) {
    const publicDir = path.join(process.cwd(), 'public')
    let htmlPath

    // Remove leading slash
    const cleanUrl = url.replace(/^\//, '')

    if (cleanUrl === '') {
        // Root page
        htmlPath = path.join(publicDir, 'index.html')
    } else if (cleanUrl.endsWith('/')) {
        // Directory with trailing slash
        htmlPath = path.join(publicDir, cleanUrl, 'index.html')
    } else if (cleanUrl.includes('.')) {
        // File with extension
        htmlPath = path.join(publicDir, cleanUrl)
    } else {
        // Directory without trailing slash - try both variants
        const withIndex = path.join(publicDir, cleanUrl, 'index.html')
        const withHtml = path.join(publicDir, cleanUrl + '.html')

        if (fs.existsSync(withIndex)) {
            htmlPath = withIndex
        } else if (fs.existsSync(withHtml)) {
            htmlPath = withHtml
        } else {
            htmlPath = withIndex // Default to index.html variant
        }
    }

    return htmlPath
}

// More efficient function that extracts ALL anchors and caches them
function extractAnchorsFromHtml(htmlPath) {
    // Check cache first
    if (anchorCache.has(htmlPath)) {
        return anchorCache.get(htmlPath)
    }

    // Manage cache size
    if (anchorCache.size >= CONFIG.CACHE_SIZE_LIMIT) {
        // Remove oldest entry
        const firstKey = anchorCache.keys().next().value
        anchorCache.delete(firstKey)
    }

    try {
        const stats = fs.statSync(htmlPath)

        // Skip very large files
        if (stats.size > CONFIG.MAX_FILE_SIZE) {
            console.warn(`Skipping large file: ${htmlPath} (${stats.size} bytes)`)
            const emptySet = new Set()
            anchorCache.set(htmlPath, emptySet)
            return emptySet
        }

        // Skip excluded file extensions
        const ext = path.extname(htmlPath).toLowerCase()
        if (CONFIG.EXCLUDED_EXTENSIONS.includes(ext)) {
            const emptySet = new Set()
            anchorCache.set(htmlPath, emptySet)
            return emptySet
        }

        const anchors = new Set()

        // For all files, process synchronously to avoid Promise issues
        const htmlContent = fs.readFileSync(htmlPath, 'utf8')

        // Simple string matching for very large content
        if (htmlContent.length > 500000) {
            const idMatches = htmlContent.match(/id="([^"]+)"/g) || []
            const nameMatches = htmlContent.match(/name="([^"]+)"/g) || []

            idMatches.forEach((match) => {
                const id = match.match(/id="([^"]+)"/)[1]
                anchors.add(id)
            })

            nameMatches.forEach((match) => {
                const name = match.match(/name="([^"]+)"/)[1]
                anchors.add(name)
            })
        } else {
            // Use DOM parsing for smaller files
            const dom = new JSDOM(htmlContent)
            const document = dom.window.document

            // Get all elements with ID
            const elementsWithId = document.querySelectorAll('[id]')
            elementsWithId.forEach((element) => {
                anchors.add(element.id)
            })

            // Get all elements with name
            const elementsWithName = document.querySelectorAll('[name]')
            elementsWithName.forEach((element) => {
                anchors.add(element.name)
            })

            // Process headings with slugger
            const headings = document.querySelectorAll('h1, h2, h3, h4, h5, h6')
            const slugger = new GithubSlugger()

            headings.forEach((heading) => {
                const slug = slugger.slug(heading.textContent)
                anchors.add(slug)
            })
        }

        // Cache the results
        anchorCache.set(htmlPath, anchors)
        return anchors
    } catch (error) {
        console.warn(`Warning: Could not extract anchors from ${htmlPath}:`, error.message)
        const emptySet = new Set()
        anchorCache.set(htmlPath, emptySet)
        return emptySet
    }
}

// ============================================================================
// LINK VALIDATION FUNCTIONS
// ============================================================================

// Check if internal URL exists in sitemap
function validateInternalUrl(url, pages) {
    const [baseUrl] = url.split('#')

    // Check special case URLs that should be considered valid
    if (CONFIG.SPECIAL_CASE_URLS.includes(baseUrl)) {
        return true
    }

    return urlExistsInSitemap(baseUrl, pages)
}

// Check if anchor exists in HTML file
function validateAnchor(url, pages) {
    const [baseUrl, anchor] = url.split('#')

    // No anchor to validate
    if (!anchor) {
        return true
    }

    // First ensure the base URL exists
    if (!urlExistsInSitemap(baseUrl, pages)) {
        return false
    }

    // Get the HTML file path and check if it exists
    const htmlPath = getHtmlFilePath(baseUrl)
    if (!fs.existsSync(htmlPath)) {
        return false
    }

    // Extract all anchors and check if our anchor exists
    const anchors = extractAnchorsFromHtml(htmlPath)
    return anchors.has(anchor)
}

// ============================================================================
// MARKDOWN PROCESSING FUNCTIONS
// ============================================================================

// Extract all internal links from a markdown file
function extractInternalLinks(filePath) {
    const content = fs.readFileSync(filePath, 'utf8')
    const links = []
    const linkRegex = /\[([^\]]*)\]\(([^)]+)\)/g

    let match
    let lineNumber = 1
    let currentIndex = 0

    while ((match = linkRegex.exec(content)) !== null) {
        const linkText = match[1]
        const linkUrl = match[2]

        // Count lines up to this match
        while (currentIndex < match.index) {
            if (content[currentIndex] === '\n') {
                lineNumber++
            }
            currentIndex++
        }

        // Only check internal links
        if (linkUrl.startsWith('/') && !linkUrl.startsWith('//')) {
            const beforeMatch = content.substring(Math.max(0, match.index - 75), match.index)
            const afterMatch = content.substring(
                match.index + match[0].length,
                Math.min(content.length, match.index + match[0].length + 75)
            )
            const context = beforeMatch + match[0] + afterMatch

            links.push({
                text: linkText,
                url: linkUrl,
                line: lineNumber,
                context: context.replace(/\n/g, ' '),
            })
        }
    }

    return links
}

// Find all markdown files in a directory recursively
function findMarkdownFiles(dir) {
    const markdownFiles = []

    function walkDirectory(currentDir) {
        const files = fs.readdirSync(currentDir)

        for (const file of files) {
            const filePath = path.join(currentDir, file)
            const stat = fs.statSync(filePath)

            if (stat.isDirectory()) {
                walkDirectory(filePath)
            } else if (file.endsWith('.md') || file.endsWith('.mdx')) {
                markdownFiles.push(filePath)
            }
        }
    }

    walkDirectory(dir)
    return markdownFiles
}

// Process files in batches for memory management
function processFilesInBatches(files, batchSize = CONFIG.BATCH_SIZE) {
    const results = []

    for (let i = 0; i < files.length; i += batchSize) {
        const batch = files.slice(i, i + batchSize)
        console.log(
            `Processing batch ${Math.floor(i / batchSize) + 1}/${Math.ceil(files.length / batchSize)} (${
                batch.length
            } files)`
        )

        const batchResults = batch.map((file) => ({
            file,
            links: extractInternalLinks(file),
        }))

        results.push(...batchResults)

        // Clear some memory between batches
        if (global.gc) {
            global.gc()
        }
    }

    return results
}

// ============================================================================
// LINK PROCESSING FUNCTIONS
// ============================================================================

// Process a single link and return validation result
function processLink(link, file, pages, redirects) {
    // Strip query parameters from URL for checking
    const urlWithoutQuery = link.url.split('?')[0]

    // Check if link should be excluded by pattern
    const shouldExclude = CONFIG.EXCLUDE_PATTERNS.some((pattern) => urlWithoutQuery.includes(pattern))
    if (shouldExclude) {
        return { type: 'excluded' }
    }

    // Check if link has file extension that should be ignored
    const hasExcludedExtension = CONFIG.EXCLUDED_EXTENSIONS.some((ext) => urlWithoutQuery.endsWith(ext))
    if (hasExcludedExtension) {
        return { type: 'excluded' }
    }

    // Check if this link is a redirect source
    if (isRedirectSource(urlWithoutQuery, redirects)) {
        return { type: 'redirected' }
    }

    // First check if the internal URL is valid
    if (!validateInternalUrl(urlWithoutQuery, pages)) {
        return {
            type: 'broken',
            brokenItem: {
                file: file,
                link: link.url,
                text: link.text,
                line: link.line,
                context: link.context,
                type: 'page',
            },
        }
    } else if (urlWithoutQuery.includes('#')) {
        // If URL is valid but has anchor, check if anchor exists
        if (!validateAnchor(urlWithoutQuery, pages)) {
            return {
                type: 'broken_anchor',
                brokenItem: {
                    file: file,
                    link: link.url,
                    text: link.text,
                    line: link.line,
                    context: link.context,
                    type: 'anchor',
                },
            }
        }
    }

    return { type: 'valid' }
}

// Process all links and categorize them
function processAllLinks(fileResults, pages, redirects) {
    const brokenLinks = []
    const brokenAnchors = []
    let totalLinks = 0
    let anchorLinksChecked = 0
    let redirectedLinks = 0
    let excludedLinks = 0

    for (const result of fileResults) {
        const { file, links } = result
        totalLinks += links.length

        for (const link of links) {
            // Check for anchor (like original code)
            const urlWithoutQuery = link.url.split('?')[0]
            const hasAnchor = urlWithoutQuery.includes('#')

            const processResult = processLink(link, file, pages, redirects)

            switch (processResult.type) {
                case 'excluded':
                    excludedLinks++
                    break
                case 'redirected':
                    redirectedLinks++
                    break
                case 'broken':
                    brokenLinks.push(processResult.brokenItem)
                    // Count anchor links for ALL non-excluded, non-redirected links (like original)
                    if (hasAnchor) {
                        anchorLinksChecked++
                    }
                    break
                case 'broken_anchor':
                    brokenAnchors.push(processResult.brokenItem)
                    anchorLinksChecked++
                    break
                case 'valid':
                    // Count anchor links for ALL non-excluded, non-redirected links (like original)
                    if (hasAnchor) {
                        anchorLinksChecked++
                    }
                    break
            }
        }
    }

    return {
        brokenLinks,
        brokenAnchors,
        stats: {
            totalLinks,
            excludedLinks,
            redirectedLinks,
            anchorLinksChecked,
        },
    }
}

// ============================================================================
// RESULTS AND OUTPUT FUNCTIONS
// ============================================================================

// Create the results object
function createResultsObject(brokenLinks, brokenAnchors, stats, markdownFiles, redirects, pages) {
    return {
        timestamp: new Date().toISOString(),
        summary: {
            totalLinks: stats.totalLinks,
            excludedLinks: stats.excludedLinks,
            redirectedLinks: stats.redirectedLinks,
            brokenLinks: brokenLinks.length,
            anchorLinksChecked: stats.anchorLinksChecked,
            brokenAnchors: brokenAnchors.length,
            htmlFilesCached: anchorCache.size,
            markdownFiles: markdownFiles.length,
            redirectPatterns: redirects.length,
            pagesInSitemap: pages.length,
        },
        excludePatterns: CONFIG.EXCLUDE_PATTERNS,
        excludeFileExtensions: CONFIG.EXCLUDED_EXTENSIONS,
        brokenLinks: brokenLinks.map((broken) => ({
            type: broken.type,
            file: path.relative(process.cwd(), broken.file),
            page: convertToPostHogUrl(broken.file),
            brokenLink: broken.link,
            brokenUrl: convertToPostHogUrl(broken.link),
            line: broken.line,
            text: broken.text,
            context: broken.context,
        })),
        brokenAnchors: brokenAnchors.map((anchor) => ({
            type: anchor.type,
            file: path.relative(process.cwd(), anchor.file),
            page: convertToPostHogUrl(anchor.file),
            brokenLink: anchor.link,
            brokenUrl: convertToPostHogUrl(anchor.link),
            line: anchor.line,
            text: anchor.text,
            context: anchor.context,
        })),
    }
}

// Write results to JSON file (optional)
function writeResultsToFile(results, outputPath) {
    if (!outputPath) {
        // No output path provided, skip writing to file
        return
    }

    const resultsDir = path.join(process.cwd(), outputPath)
    if (!fs.existsSync(resultsDir)) {
        fs.mkdirSync(resultsDir, { recursive: true })
    }

    const timestamp = new Date().toISOString().replace(/[:.]/g, '-')
    const resultsFile = path.join(resultsDir, `link-check-${timestamp}.json`)

    try {
        fs.writeFileSync(resultsFile, JSON.stringify(results, null, 2))
        console.log(`\nResults saved to: ${resultsFile}`)
    } catch (error) {
        console.error(`Error saving results: ${error.message}`)
    }
}

// Display broken links
function displayBrokenLinks(brokenLinks) {
    if (brokenLinks.length === 0) return

    console.log('\nBroken links found:\n')

    brokenLinks.forEach((broken) => {
        console.log(`Error type: ${broken.type}`)
        console.log(`File: ${path.relative(process.cwd(), broken.file)}`)
        console.log(`Page: ${convertToPostHogUrl(broken.file)}`)
        console.log(`Broken link: ${broken.link}`)
        console.log(`Broken URL: ${convertToPostHogUrl(broken.link)}`)
        console.log(`Line #: ${broken.line}`)
        if (broken.text) {
            console.log(`Hyperlinked text: ${broken.text}`)
        }
        console.log(`Context: ${broken.context}`)
        console.log('-'.repeat(80))
    })
}

// Display broken anchor links
function displayBrokenAnchors(brokenAnchors) {
    if (brokenAnchors.length === 0) return

    console.log('\nBroken anchor links:\n')

    brokenAnchors.forEach((anchor) => {
        console.log(`Error type: ${anchor.type}`)
        console.log(`File: ${path.relative(process.cwd(), anchor.file)}`)
        console.log(`Page: ${convertToPostHogUrl(anchor.file)}`)
        console.log(`Broken link: ${anchor.link}`)
        console.log(`Broken URL: ${convertToPostHogUrl(anchor.link)}`)
        console.log(`Line #: ${anchor.line}`)
        if (anchor.text) {
            console.log(`Hyperlinked text: ${anchor.text}`)
        }
        console.log(`Context: ${anchor.context}`)
        console.log('-'.repeat(80))
    })
}

// Display summary statistics
function displaySummaryStats(stats, brokenLinks, brokenAnchors, markdownFilesCount) {
    console.log(`\nScanned ${markdownFilesCount} markdown files`)
    console.log(`Processed ${stats.totalLinks} internal links`)
    console.log(`Found ${stats.excludedLinks} excluded links (skipped)`)
    console.log(`Found ${stats.redirectedLinks} redirected links (skipped)`)
    console.log(`Checked ${stats.anchorLinksChecked} anchor links`)
    console.log(`Found ${brokenLinks.length} broken links`)
    console.log(`Found ${brokenAnchors.length} broken anchor links`)
    console.log(`Cached ${anchorCache.size} HTML files for anchor checking`)
}

// ============================================================================
// MAIN FUNCTION
// ============================================================================

function checkLinks(outputPath) {
    console.log('Starting post-build link validation...')

    // Initialize data sources
    const redirects = parseVercelRedirects()
    console.log(`Found ${redirects.length} redirect/rewrite patterns`)

    const pages = getSitemapPages()
    console.log(`Found ${pages.length} pages in sitemap`)

    const markdownFiles = findMarkdownFiles(CONFIG.CONTENTS_DIR)
    console.log(`Found ${markdownFiles.length} markdown files`)

    // Process files and extract links
    const fileResults = processFilesInBatches(markdownFiles)

    // Process and validate all links
    const { brokenLinks, brokenAnchors, stats } = processAllLinks(fileResults, pages, redirects)

    // Sort results alphabetically by file
    brokenLinks.sort((a, b) => a.file.localeCompare(b.file))
    brokenAnchors.sort((a, b) => a.file.localeCompare(b.file))

    // Display broken links
    displayBrokenLinks(brokenLinks)
    displayBrokenAnchors(brokenAnchors)

    if (brokenLinks.length === 0 && brokenAnchors.length === 0) {
        console.log('\nNo broken links found! 🎉')
    }

    // Display summary stats at the end
    displaySummaryStats(stats, brokenLinks, brokenAnchors, markdownFiles.length)

    // Create and save results at the end
    const results = createResultsObject(brokenLinks, brokenAnchors, stats, markdownFiles, redirects, pages)
    writeResultsToFile(results, outputPath)

    return brokenLinks.length
}

// ============================================================================
// SCRIPT EXECUTION
// ============================================================================

// Parse command line arguments
const args = process.argv.slice(2)
const outputPath = args.length > 0 ? args[0] : null

if (outputPath) {
    console.log(`Results will be saved to: ${outputPath}`)
} else {
    console.log('No output path provided. Results will only be displayed in console.')
}

// Run the script
const brokenCount = checkLinks(outputPath)

// Only exit with error code if there are broken PAGE links (not just anchor links)
// This allows the workflow to continue while still reporting issues
if (brokenCount > 0) {
    console.log('\nCheck the output above ☝️')
}

process.exit(0) // Always exit successfully