From e7987eacd4fafddd43e4fa115794816d9d1e774a Mon Sep 17 00:00:00 2001 From: Edwin Lim Date: Fri, 11 Jul 2025 16:46:44 -0400 Subject: [PATCH] internal links checker script (post-build) (#12147) --- .github/workflows/internal-links-check.yml | 166 +++++ package.json | 1 + scripts/check-links-post-build.js | 718 +++++++++++++++++++++ 3 files changed, 885 insertions(+) create mode 100644 .github/workflows/internal-links-check.yml create mode 100644 scripts/check-links-post-build.js diff --git a/.github/workflows/internal-links-check.yml b/.github/workflows/internal-links-check.yml new file mode 100644 index 000000000..5deb3b6e1 --- /dev/null +++ b/.github/workflows/internal-links-check.yml @@ -0,0 +1,166 @@ +name: Check internal links + +on: + schedule: + - cron: '0 11 * * 1' # Every Monday at 11am UTC + workflow_dispatch: + inputs: + save_results: + description: 'Save results to file' + required: false + default: 'true' + type: boolean + send_slack_notification: + description: 'Send Slack notification' + required: false + default: 'true' + type: boolean + +jobs: + check-internal-links: + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Setup Node.js + uses: actions/setup-node@v4 + with: + node-version: '18' + cache: 'yarn' + + - name: Install dependencies + run: yarn install --frozen-lockfile + + - name: Build site + run: yarn build + + - name: Check links (console only) + if: ${{ github.event_name == 'workflow_dispatch' && inputs.save_results == false }} + run: | + OUTPUT=$(node scripts/check-links-post-build.js 2>&1) + echo "$OUTPUT" + echo "LINK_CHECK_OUTPUT<> $GITHUB_ENV + echo "$OUTPUT" >> $GITHUB_ENV + echo "EOF" >> $GITHUB_ENV + + - name: Check links (with file output) + if: ${{ github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && inputs.save_results != false) }} + run: | + OUTPUT=$(node scripts/check-links-post-build.js link-check-results 2>&1) + echo "$OUTPUT" + echo 
"LINK_CHECK_OUTPUT<<EOF" >> $GITHUB_ENV
"[0-9]*" | head -1 || echo "0") + BROKEN_LINKS=$(echo "$LINK_CHECK_OUTPUT" | grep -o "Found [0-9]* broken links" | grep -o "[0-9]*" | head -1 || echo "0") + BROKEN_ANCHORS=$(echo "$LINK_CHECK_OUTPUT" | grep -o "Found [0-9]* broken anchor links" | grep -o "[0-9]*" | head -1 || echo "0") + else + MARKDOWN_FILES="0" + TOTAL_INTERNAL_LINKS="0" + EXCLUDED_LINKS="0" + REDIRECTED_LINKS="0" + BROKEN_LINKS="0" + BROKEN_ANCHORS="0" + fi + + # Prepare variables for Slack payload + # Default to true for scheduled runs, use input value for manual runs + SAVE_RESULTS="${{ inputs.save_results }}" + if [ "$SAVE_RESULTS" = "" ]; then SAVE_RESULTS="true"; fi + + RESULTS_DOWNLOAD="" + if [ "$SAVE_RESULTS" = "true" ]; then + ARTIFACT_ID="${{ steps.upload-results.outputs.artifact-id }}" + if [ -n "$ARTIFACT_ID" ]; then + RESULTS_DOWNLOAD="https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}/artifacts/$ARTIFACT_ID" + else + RESULTS_DOWNLOAD="$WORKFLOW_URL" + fi + fi + + curl -s -X POST "${{ secrets.SLACK_LINKS_CHECK_WEBHOOK }}" \ + -H "Content-Type: application/json" \ + -d "{ + \"status\": \"$STATUS\", + \"emoji\": \"$EMOJI\", + \"repository\": \"${{ github.repository }}\", + \"triggered_by\": \"$TRIGGERED_BY\", + \"workflow_url\": \"$WORKFLOW_URL\", + \"markdown_files\": \"$MARKDOWN_FILES\", + \"links_checked\": \"$TOTAL_INTERNAL_LINKS\", + \"excluded_links\": \"$EXCLUDED_LINKS\", + \"redirected_links\": \"$REDIRECTED_LINKS\", + \"broken_links\": \"$BROKEN_LINKS\", + \"broken_anchors\": \"$BROKEN_ANCHORS\", + \"results_download\": \"$RESULTS_DOWNLOAD\", + \"save_results\": \"$SAVE_RESULTS\" + }" > /dev/null 2>&1 + + echo "✅ Slack notification sent successfully" + else + if [ "$SEND_SLACK" = "false" ]; then + echo "Slack notifications disabled by user input" + else + echo "SLACK_LINKS_CHECK_WEBHOOK not configured, skipping Slack notification" + fi + fi + + - name: Report results + if: always() + run: | + # Default to true for scheduled runs, use input value for 
manual runs + SAVE_RESULTS="${{ inputs.save_results }}" + if [ "$SAVE_RESULTS" = "" ]; then SAVE_RESULTS="true"; fi + + echo "Link check completed!" + echo "Check the job output above for detailed results." + if [ "$SAVE_RESULTS" = "true" ]; then + echo "Results files have been uploaded as artifacts." + fi diff --git a/package.json b/package.json index fefbf65a9..eff0907ff 100644 --- a/package.json +++ b/package.json @@ -26,6 +26,7 @@ "typegen": "kea-typegen write .", "update-sprite": "svg-sprite -s --symbol-dest src/components/productFeature/images/icons --symbol-sprite sprited-icons.svg src/components/productFeature/images/icons/*.svg", "test-redirects": "jest scripts", + "check-links-post-build": "node scripts/check-links-post-build.js", "storybook": "start-storybook -s ./static -p 6006", "build-storybook": "build-storybook" }, diff --git a/scripts/check-links-post-build.js b/scripts/check-links-post-build.js new file mode 100644 index 000000000..f5a957570 --- /dev/null +++ b/scripts/check-links-post-build.js @@ -0,0 +1,718 @@ +#!/usr/bin/env node + +/* eslint-disable @typescript-eslint/no-var-requires */ + +/** + * PostHog Link Checker - Post-Build Validation + * + * This script validates internal links and anchor links against the ACTUAL + * built site structure. It must run after `gatsby build` to access the + * generated site data. + * + * This script: + * - Reads the generated sitemap.xml to get actual pages + * - Validates links against real page URLs (not just file paths) + * - Checks anchor links against actual built page content + * - Works with dynamic pages, templates, and plugin-generated content + * + * Usage: + * gatsby build && npm run check-links-post-build + * gatsby build && node scripts/check-links-post-build.js + * + * To output results to a file, provide a directory path as an argument: + * node scripts/check-links-post-build.js link-check-results + * node scripts/check-links-post-build.js . 
+ * node scripts/check-links-post-build.js ../output + * + * The script will exit with code 1 if any broken links are found. + */ + +const fs = require('fs') +const path = require('path') +const { JSDOM } = require('jsdom') +const GithubSlugger = require('github-slugger') + +// ============================================================================ +// CONFIGURATION +// ============================================================================ + +const CONFIG = { + MAX_FILE_SIZE: 5 * 1024 * 1024, // 5MB + CACHE_SIZE_LIMIT: 1000, + BATCH_SIZE: 50, + EXCLUDED_EXTENSIONS: ['.css', '.js', '.json', '.xml', '.svg', '.png', '.jpg', '.jpeg', '.gif', '.woff', '.woff2'], + EXCLUDE_PATTERNS: [ + '/community/', // powered by Strapi + '/teams/', // powered by Strapi + '/careers/', // powered by Ashby + ], + SITEMAP_PATH: path.join(process.cwd(), 'public', 'sitemap', 'sitemap-0.xml'), + CONTENTS_DIR: 'contents', +} + +// Global cache for anchor links +const anchorCache = new Map() + +// ============================================================================ +// UTILITY FUNCTIONS +// ============================================================================ + +// Utility function to convert file paths or internal links to full PostHog URLs +function convertToPostHogUrl(pathOrUrl) { + const baseUrl = 'https://posthog.com' + + // Case 1: File path (starts with 'contents/') + if (pathOrUrl.startsWith('contents/')) { + // Remove 'contents/' prefix and file extension + let urlPath = pathOrUrl.replace(/^contents\//, '') + urlPath = urlPath.replace(/\.(md|mdx)$/, '') + return `${baseUrl}/${urlPath}` + } + + // Case 2: Internal link (starts with '/') + if (pathOrUrl.startsWith('/')) { + return `${baseUrl}${pathOrUrl}` + } + + // Case 3: Already a full URL or relative path + if (pathOrUrl.startsWith('http')) { + return pathOrUrl + } + + // Default: treat as internal path + return `${baseUrl}/${pathOrUrl}` +} + +// Parse vercel.json to extract redirect patterns +function 
parseVercelRedirects() { + try { + const vercelConfig = JSON.parse(fs.readFileSync('vercel.json', 'utf8')) + const redirects = vercelConfig.redirects || [] + const rewrites = vercelConfig.rewrites || [] + + // Combine redirects and rewrites (rewrites also act as redirects for link validation) + return [...redirects, ...rewrites] + } catch (error) { + console.warn('Warning: Could not parse vercel.json:', error.message) + return [] + } +} + +// Check if a URL contains a redirect source +function isRedirectSource(url, redirects) { + return redirects.some((redirect) => { + // If source contains :path*, check if URL contains the part before :path* + if (redirect.source.includes(':path*')) { + const pathPrefix = redirect.source.split('/:path*')[0] + return url.includes(pathPrefix) + } + // Otherwise, check if URL contains the full source + return url.includes(redirect.source) + }) +} + +// ============================================================================ +// SITEMAP FUNCTIONS +// ============================================================================ + +// Get all pages from the sitemap +function getSitemapPages() { + if (!fs.existsSync(CONFIG.SITEMAP_PATH)) { + console.error('Error: Sitemap not found at', CONFIG.SITEMAP_PATH) + console.error('Please run "gatsby build" first to generate the sitemap.') + process.exit(1) + } + + const sitemapContent = fs.readFileSync(CONFIG.SITEMAP_PATH, 'utf8') + const dom = new JSDOM(sitemapContent, { contentType: 'text/xml' }) + const urlNodes = dom.window.document.querySelectorAll('url loc') + + const pages = [] + urlNodes.forEach((node) => { + const url = node.textContent.trim() + if (url.startsWith('https://posthog.com/')) { + pages.push(url.replace('https://posthog.com', '')) + } + }) + + return pages +} + +// Check if URL exists in sitemap +function urlExistsInSitemap(url, pages) { + // Remove trailing slash for comparison + const normalizedUrl = url.replace(/\/$/, '') + return pages.includes(normalizedUrl) || 
pages.includes(normalizedUrl + '/') +} + +// ============================================================================ +// ANCHOR VALIDATION FUNCTIONS +// ============================================================================ + +// Better file path resolution for Gatsby's output structure +function getHtmlFilePath(url) { + const publicDir = path.join(process.cwd(), 'public') + let htmlPath + + // Remove leading slash + const cleanUrl = url.replace(/^\//, '') + + if (cleanUrl === '') { + // Root page + htmlPath = path.join(publicDir, 'index.html') + } else if (cleanUrl.endsWith('/')) { + // Directory with trailing slash + htmlPath = path.join(publicDir, cleanUrl, 'index.html') + } else if (cleanUrl.includes('.')) { + // File with extension + htmlPath = path.join(publicDir, cleanUrl) + } else { + // Directory without trailing slash - try both variants + const withIndex = path.join(publicDir, cleanUrl, 'index.html') + const withHtml = path.join(publicDir, cleanUrl + '.html') + + if (fs.existsSync(withIndex)) { + htmlPath = withIndex + } else if (fs.existsSync(withHtml)) { + htmlPath = withHtml + } else { + htmlPath = withIndex // Default to index.html variant + } + } + + return htmlPath +} + +// More efficient function that extracts ALL anchors and caches them +function extractAnchorsFromHtml(htmlPath) { + // Check cache first + if (anchorCache.has(htmlPath)) { + return anchorCache.get(htmlPath) + } + + // Manage cache size + if (anchorCache.size >= CONFIG.CACHE_SIZE_LIMIT) { + // Remove oldest entry + const firstKey = anchorCache.keys().next().value + anchorCache.delete(firstKey) + } + + try { + const stats = fs.statSync(htmlPath) + + // Skip very large files + if (stats.size > CONFIG.MAX_FILE_SIZE) { + console.warn(`Skipping large file: ${htmlPath} (${stats.size} bytes)`) + const emptySet = new Set() + anchorCache.set(htmlPath, emptySet) + return emptySet + } + + // Skip excluded file extensions + const ext = path.extname(htmlPath).toLowerCase() + if 
(CONFIG.EXCLUDED_EXTENSIONS.includes(ext)) { + const emptySet = new Set() + anchorCache.set(htmlPath, emptySet) + return emptySet + } + + const anchors = new Set() + + // For all files, process synchronously to avoid Promise issues + const htmlContent = fs.readFileSync(htmlPath, 'utf8') + + // Simple string matching for very large content + if (htmlContent.length > 500000) { + const idMatches = htmlContent.match(/id="([^"]+)"/g) || [] + const nameMatches = htmlContent.match(/name="([^"]+)"/g) || [] + + idMatches.forEach((match) => { + const id = match.match(/id="([^"]+)"/)[1] + anchors.add(id) + }) + + nameMatches.forEach((match) => { + const name = match.match(/name="([^"]+)"/)[1] + anchors.add(name) + }) + } else { + // Use DOM parsing for smaller files + const dom = new JSDOM(htmlContent) + const document = dom.window.document + + // Get all elements with ID + const elementsWithId = document.querySelectorAll('[id]') + elementsWithId.forEach((element) => { + anchors.add(element.id) + }) + + // Get all elements with name + const elementsWithName = document.querySelectorAll('[name]') + elementsWithName.forEach((element) => { + anchors.add(element.name) + }) + + // Process headings with slugger + const headings = document.querySelectorAll('h1, h2, h3, h4, h5, h6') + const slugger = new GithubSlugger() + + headings.forEach((heading) => { + const slug = slugger.slug(heading.textContent) + anchors.add(slug) + }) + } + + // Cache the results + anchorCache.set(htmlPath, anchors) + return anchors + } catch (error) { + console.warn(`Warning: Could not extract anchors from ${htmlPath}:`, error.message) + const emptySet = new Set() + anchorCache.set(htmlPath, emptySet) + return emptySet + } +} + +// ============================================================================ +// LINK VALIDATION FUNCTIONS +// ============================================================================ + +// Check if internal URL exists in sitemap +function validateInternalUrl(url, pages) { + 
const [baseUrl] = url.split('#') + return urlExistsInSitemap(baseUrl, pages) +} + +// Check if anchor exists in HTML file +function validateAnchor(url, pages) { + const [baseUrl, anchor] = url.split('#') + + // No anchor to validate + if (!anchor) { + return true + } + + // First ensure the base URL exists + if (!urlExistsInSitemap(baseUrl, pages)) { + return false + } + + // Get the HTML file path and check if it exists + const htmlPath = getHtmlFilePath(baseUrl) + if (!fs.existsSync(htmlPath)) { + return false + } + + // Extract all anchors and check if our anchor exists + const anchors = extractAnchorsFromHtml(htmlPath) + return anchors.has(anchor) +} + +// ============================================================================ +// MARKDOWN PROCESSING FUNCTIONS +// ============================================================================ + +// Extract all internal links from a markdown file +function extractInternalLinks(filePath) { + const content = fs.readFileSync(filePath, 'utf8') + const links = [] + const linkRegex = /\[([^\]]*)\]\(([^)]+)\)/g + + let match + let lineNumber = 1 + let currentIndex = 0 + + while ((match = linkRegex.exec(content)) !== null) { + const linkText = match[1] + const linkUrl = match[2] + + // Count lines up to this match + while (currentIndex < match.index) { + if (content[currentIndex] === '\n') { + lineNumber++ + } + currentIndex++ + } + + // Only check internal links + if (linkUrl.startsWith('/') && !linkUrl.startsWith('//')) { + const beforeMatch = content.substring(Math.max(0, match.index - 75), match.index) + const afterMatch = content.substring( + match.index + match[0].length, + Math.min(content.length, match.index + match[0].length + 75) + ) + const context = beforeMatch + match[0] + afterMatch + + links.push({ + text: linkText, + url: linkUrl, + line: lineNumber, + context: context.replace(/\n/g, ' '), + }) + } + } + + return links +} + +// Find all markdown files in a directory recursively +function 
findMarkdownFiles(dir) { + const markdownFiles = [] + + function walkDirectory(currentDir) { + const files = fs.readdirSync(currentDir) + + for (const file of files) { + const filePath = path.join(currentDir, file) + const stat = fs.statSync(filePath) + + if (stat.isDirectory()) { + walkDirectory(filePath) + } else if (file.endsWith('.md') || file.endsWith('.mdx')) { + markdownFiles.push(filePath) + } + } + } + + walkDirectory(dir) + return markdownFiles +} + +// Process files in batches for memory management +function processFilesInBatches(files, batchSize = CONFIG.BATCH_SIZE) { + const results = [] + + for (let i = 0; i < files.length; i += batchSize) { + const batch = files.slice(i, i + batchSize) + console.log( + `Processing batch ${Math.floor(i / batchSize) + 1}/${Math.ceil(files.length / batchSize)} (${ + batch.length + } files)` + ) + + const batchResults = batch.map((file) => ({ + file, + links: extractInternalLinks(file), + })) + + results.push(...batchResults) + + // Clear some memory between batches + if (global.gc) { + global.gc() + } + } + + return results +} + +// ============================================================================ +// LINK PROCESSING FUNCTIONS +// ============================================================================ + +// Process a single link and return validation result +function processLink(link, file, pages, redirects) { + // Strip query parameters from URL for checking + const urlWithoutQuery = link.url.split('?')[0] + + // Check if link should be excluded by pattern + const shouldExclude = CONFIG.EXCLUDE_PATTERNS.some((pattern) => urlWithoutQuery.includes(pattern)) + if (shouldExclude) { + return { type: 'excluded' } + } + + // Check if link has file extension that should be ignored + const hasExcludedExtension = CONFIG.EXCLUDED_EXTENSIONS.some((ext) => urlWithoutQuery.endsWith(ext)) + if (hasExcludedExtension) { + return { type: 'excluded' } + } + + // Check if this link is a redirect source + if 
(isRedirectSource(urlWithoutQuery, redirects)) { + return { type: 'redirected' } + } + + // First check if the internal URL is valid + if (!validateInternalUrl(urlWithoutQuery, pages)) { + return { + type: 'broken', + brokenItem: { + file: file, + link: link.url, + text: link.text, + line: link.line, + context: link.context, + type: 'page', + }, + } + } else if (urlWithoutQuery.includes('#')) { + // If URL is valid but has anchor, check if anchor exists + if (!validateAnchor(urlWithoutQuery, pages)) { + return { + type: 'broken_anchor', + brokenItem: { + file: file, + link: link.url, + text: link.text, + line: link.line, + context: link.context, + type: 'anchor', + }, + } + } + } + + return { type: 'valid' } +} + +// Process all links and categorize them +function processAllLinks(fileResults, pages, redirects) { + const brokenLinks = [] + const brokenAnchors = [] + let totalLinks = 0 + let anchorLinksChecked = 0 + let redirectedLinks = 0 + let excludedLinks = 0 + + for (const result of fileResults) { + const { file, links } = result + totalLinks += links.length + + for (const link of links) { + // Check for anchor (like original code) + const urlWithoutQuery = link.url.split('?')[0] + const hasAnchor = urlWithoutQuery.includes('#') + + const processResult = processLink(link, file, pages, redirects) + + switch (processResult.type) { + case 'excluded': + excludedLinks++ + break + case 'redirected': + redirectedLinks++ + break + case 'broken': + brokenLinks.push(processResult.brokenItem) + // Count anchor links for ALL non-excluded, non-redirected links (like original) + if (hasAnchor) { + anchorLinksChecked++ + } + break + case 'broken_anchor': + brokenAnchors.push(processResult.brokenItem) + anchorLinksChecked++ + break + case 'valid': + // Count anchor links for ALL non-excluded, non-redirected links (like original) + if (hasAnchor) { + anchorLinksChecked++ + } + break + } + } + } + + return { + brokenLinks, + brokenAnchors, + stats: { + totalLinks, + 
excludedLinks, + redirectedLinks, + anchorLinksChecked, + }, + } +} + +// ============================================================================ +// RESULTS AND OUTPUT FUNCTIONS +// ============================================================================ + +// Create the results object +function createResultsObject(brokenLinks, brokenAnchors, stats, markdownFiles, redirects, pages) { + return { + timestamp: new Date().toISOString(), + summary: { + totalLinks: stats.totalLinks, + excludedLinks: stats.excludedLinks, + redirectedLinks: stats.redirectedLinks, + brokenLinks: brokenLinks.length, + anchorLinksChecked: stats.anchorLinksChecked, + brokenAnchors: brokenAnchors.length, + htmlFilesCached: anchorCache.size, + markdownFiles: markdownFiles.length, + redirectPatterns: redirects.length, + pagesInSitemap: pages.length, + }, + excludePatterns: CONFIG.EXCLUDE_PATTERNS, + excludeFileExtensions: CONFIG.EXCLUDED_EXTENSIONS, + brokenLinks: brokenLinks.map((broken) => ({ + type: broken.type, + file: path.relative(process.cwd(), broken.file), + page: convertToPostHogUrl(broken.file), + brokenLink: broken.link, + brokenUrl: convertToPostHogUrl(broken.link), + line: broken.line, + text: broken.text, + context: broken.context, + })), + brokenAnchors: brokenAnchors.map((anchor) => ({ + type: anchor.type, + file: path.relative(process.cwd(), anchor.file), + page: convertToPostHogUrl(anchor.file), + brokenLink: anchor.link, + brokenUrl: convertToPostHogUrl(anchor.link), + line: anchor.line, + text: anchor.text, + context: anchor.context, + })), + } +} + +// Write results to JSON file (optional) +function writeResultsToFile(results, outputPath) { + if (!outputPath) { + // No output path provided, skip writing to file + return + } + + const resultsDir = path.join(process.cwd(), outputPath) + if (!fs.existsSync(resultsDir)) { + fs.mkdirSync(resultsDir, { recursive: true }) + } + + const timestamp = new Date().toISOString().replace(/[:.]/g, '-') + const resultsFile = 
path.join(resultsDir, `link-check-${timestamp}.json`) + + try { + fs.writeFileSync(resultsFile, JSON.stringify(results, null, 2)) + console.log(`\nResults saved to: ${resultsFile}`) + } catch (error) { + console.error(`Error saving results: ${error.message}`) + } +} + +// Display broken links +function displayBrokenLinks(brokenLinks) { + if (brokenLinks.length === 0) return + + console.log('\nBroken links found:\n') + + brokenLinks.forEach((broken) => { + console.log(`Error type: ${broken.type}`) + console.log(`File: ${path.relative(process.cwd(), broken.file)}`) + console.log(`Page: ${convertToPostHogUrl(broken.file)}`) + console.log(`Broken link: ${broken.link}`) + console.log(`Broken URL: ${convertToPostHogUrl(broken.link)}`) + console.log(`Line #: ${broken.line}`) + if (broken.text) { + console.log(`Hyperlinked text: ${broken.text}`) + } + console.log(`Context: ${broken.context}`) + console.log('-'.repeat(80)) + }) +} + +// Display broken anchor links +function displayBrokenAnchors(brokenAnchors) { + if (brokenAnchors.length === 0) return + + console.log('\nBroken anchor links:\n') + + brokenAnchors.forEach((anchor) => { + console.log(`Error type: ${anchor.type}`) + console.log(`File: ${path.relative(process.cwd(), anchor.file)}`) + console.log(`Page: ${convertToPostHogUrl(anchor.file)}`) + console.log(`Broken link: ${anchor.link}`) + console.log(`Broken URL: ${convertToPostHogUrl(anchor.link)}`) + console.log(`Line #: ${anchor.line}`) + if (anchor.text) { + console.log(`Hyperlinked text: ${anchor.text}`) + } + console.log(`Context: ${anchor.context}`) + console.log('-'.repeat(80)) + }) +} + +// Display summary statistics +function displaySummaryStats(stats, brokenLinks, brokenAnchors, markdownFilesCount) { + console.log(`\nScanned ${markdownFilesCount} markdown files`) + console.log(`Processed ${stats.totalLinks} internal links`) + console.log(`Found ${stats.excludedLinks} excluded links (skipped)`) + console.log(`Found ${stats.redirectedLinks} redirected 
links (skipped)`) + console.log(`Checked ${stats.anchorLinksChecked} anchor links`) + console.log(`Found ${brokenLinks.length} broken links`) + console.log(`Found ${brokenAnchors.length} broken anchor links`) + console.log(`Cached ${anchorCache.size} HTML files for anchor checking`) +} + +// ============================================================================ +// MAIN FUNCTION +// ============================================================================ + +function checkLinks(outputPath) { + console.log('Starting post-build link validation...') + + // Initialize data sources + const redirects = parseVercelRedirects() + console.log(`Found ${redirects.length} redirect/rewrite patterns`) + + const pages = getSitemapPages() + console.log(`Found ${pages.length} pages in sitemap`) + + const markdownFiles = findMarkdownFiles(CONFIG.CONTENTS_DIR) + console.log(`Found ${markdownFiles.length} markdown files`) + + // Process files and extract links + const fileResults = processFilesInBatches(markdownFiles) + + // Process and validate all links + const { brokenLinks, brokenAnchors, stats } = processAllLinks(fileResults, pages, redirects) + + // Sort results alphabetically by file + brokenLinks.sort((a, b) => a.file.localeCompare(b.file)) + brokenAnchors.sort((a, b) => a.file.localeCompare(b.file)) + + // Display broken links + displayBrokenLinks(brokenLinks) + displayBrokenAnchors(brokenAnchors) + + if (brokenLinks.length === 0 && brokenAnchors.length === 0) { + console.log('\nNo broken links found! 
🎉') + } + + // Display summary stats at the end + displaySummaryStats(stats, brokenLinks, brokenAnchors, markdownFiles.length) + + // Create and save results at the end + const results = createResultsObject(brokenLinks, brokenAnchors, stats, markdownFiles, redirects, pages) + writeResultsToFile(results, outputPath) + + return brokenLinks.length +} + +// ============================================================================ +// SCRIPT EXECUTION +// ============================================================================ + +// Parse command line arguments +const args = process.argv.slice(2) +const outputPath = args.length > 0 ? args[0] : null + +if (outputPath) { + console.log(`Results will be saved to: ${outputPath}`) +} else { + console.log('No output path provided. Results will only be displayed in console.') +} + +// Run the script +const brokenCount = checkLinks(outputPath) + +// Only exit with error code if there are broken PAGE links (not just anchor links) +// This allows the workflow to continue while still reporting issues +if (brokenCount > 0) { + console.log('\nCheck the output above ☝️') +} + +process.exit(0) // Always exit successfully