mirror of
https://github.com/BillyOutlast/posthog.com.git
synced 2026-02-04 03:11:21 +01:00
internal links checker script (post-build) (#12147)
This commit is contained in:
166
.github/workflows/internal-links-check.yml
vendored
Normal file
166
.github/workflows/internal-links-check.yml
vendored
Normal file
@@ -0,0 +1,166 @@
|
||||
# Post-build internal links checker (runs scripts/check-links-post-build.js).
# Weekly scheduled run plus manual dispatch; uploads JSON results as an
# artifact and posts a summary to Slack via SLACK_LINKS_CHECK_WEBHOOK.
name: Check internal links

on:
    schedule:
        - cron: '0 11 * * 1' # Every Monday at 11am UTC
    workflow_dispatch:
        inputs:
            save_results:
                description: 'Save results to file'
                required: false
                default: 'true'
                type: boolean
            send_slack_notification:
                description: 'Send Slack notification'
                required: false
                default: 'true'
                type: boolean

jobs:
    check-internal-links:
        runs-on: ubuntu-latest

        steps:
            - name: Checkout code
              uses: actions/checkout@v4

            - name: Setup Node.js
              uses: actions/setup-node@v4
              with:
                  node-version: '18'
                  cache: 'yarn'

            - name: Install dependencies
              run: yarn install --frozen-lockfile

            # The checker validates links against the generated sitemap and
            # HTML, so the site must be built first.
            - name: Build site
              run: yarn build

            # Manual run with save_results=false: console output only.
            # Output is captured into LINK_CHECK_OUTPUT for the Slack step.
            - name: Check links (console only)
              if: ${{ github.event_name == 'workflow_dispatch' && inputs.save_results == false }}
              run: |
                  OUTPUT=$(node scripts/check-links-post-build.js 2>&1)
                  echo "$OUTPUT"
                  echo "LINK_CHECK_OUTPUT<<EOF" >> $GITHUB_ENV
                  echo "$OUTPUT" >> $GITHUB_ENV
                  echo "EOF" >> $GITHUB_ENV

            # Scheduled runs (and manual runs unless disabled) also write JSON
            # results into link-check-results/ for the artifact upload below.
            - name: Check links (with file output)
              if: ${{ github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && inputs.save_results != false) }}
              run: |
                  OUTPUT=$(node scripts/check-links-post-build.js link-check-results 2>&1)
                  echo "$OUTPUT"
                  echo "LINK_CHECK_OUTPUT<<EOF" >> $GITHUB_ENV
                  echo "$OUTPUT" >> $GITHUB_ENV
                  echo "EOF" >> $GITHUB_ENV

            - name: Upload link check results
              if: ${{ always() && (github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && inputs.save_results != false)) }}
              id: upload-results
              uses: actions/upload-artifact@v4
              with:
                  name: link-check-results
                  path: link-check-results/*.json
                  if-no-files-found: warn
                  retention-days: 30

            # Parses the captured console output with grep, so the summary
            # strings printed by the script must stay stable.
            - name: Send Slack notification
              if: always()
              run: |
                  # Default to true for scheduled runs, use input value for manual runs
                  SEND_SLACK="${{ inputs.send_slack_notification }}"
                  if [ "$SEND_SLACK" = "" ]; then SEND_SLACK="true"; fi

                  if [ "$SEND_SLACK" = "true" ] && [ -n "${{ secrets.SLACK_LINKS_CHECK_WEBHOOK }}" ]; then
                      WORKFLOW_URL="https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"

                      if [ "${{ job.status }}" = "success" ]; then
                          COLOR="good"
                          EMOJI=":white_check_mark:"
                          STATUS="completed successfully"
                      else
                          COLOR="danger"
                          EMOJI=":x:"
                          STATUS="failed"
                      fi

                      # Set triggered_by based on event type
                      if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
                          TRIGGERED_BY="manually run by ${{ github.actor }}"
                      elif [ "${{ github.event_name }}" = "schedule" ]; then
                          TRIGGERED_BY="Triggered via schedule"
                      else
                          TRIGGERED_BY="${{ github.actor }}"
                      fi

                      # Extract statistics from the environment variable (suppress extra output)
                      if [ -n "$LINK_CHECK_OUTPUT" ]; then
                          MARKDOWN_FILES=$(echo "$LINK_CHECK_OUTPUT" | grep -o "Scanned [0-9]* markdown files" | grep -o "[0-9]*" | head -1 || echo "0")
                          TOTAL_INTERNAL_LINKS=$(echo "$LINK_CHECK_OUTPUT" | grep -o "Processed [0-9]* internal links" | grep -o "[0-9]*" | head -1 || echo "0")
                          EXCLUDED_LINKS=$(echo "$LINK_CHECK_OUTPUT" | grep -o "Found [0-9]* excluded links (skipped)" | grep -o "[0-9]*" | head -1 || echo "0")
                          REDIRECTED_LINKS=$(echo "$LINK_CHECK_OUTPUT" | grep -o "Found [0-9]* redirected links (skipped)" | grep -o "[0-9]*" | head -1 || echo "0")
                          BROKEN_LINKS=$(echo "$LINK_CHECK_OUTPUT" | grep -o "Found [0-9]* broken links" | grep -o "[0-9]*" | head -1 || echo "0")
                          BROKEN_ANCHORS=$(echo "$LINK_CHECK_OUTPUT" | grep -o "Found [0-9]* broken anchor links" | grep -o "[0-9]*" | head -1 || echo "0")
                      else
                          MARKDOWN_FILES="0"
                          TOTAL_INTERNAL_LINKS="0"
                          EXCLUDED_LINKS="0"
                          REDIRECTED_LINKS="0"
                          BROKEN_LINKS="0"
                          BROKEN_ANCHORS="0"
                      fi

                      # Prepare variables for Slack payload
                      # Default to true for scheduled runs, use input value for manual runs
                      SAVE_RESULTS="${{ inputs.save_results }}"
                      if [ "$SAVE_RESULTS" = "" ]; then SAVE_RESULTS="true"; fi

                      RESULTS_DOWNLOAD=""
                      if [ "$SAVE_RESULTS" = "true" ]; then
                          ARTIFACT_ID="${{ steps.upload-results.outputs.artifact-id }}"
                          if [ -n "$ARTIFACT_ID" ]; then
                              RESULTS_DOWNLOAD="https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}/artifacts/$ARTIFACT_ID"
                          else
                              RESULTS_DOWNLOAD="$WORKFLOW_URL"
                          fi
                      fi

                      curl -s -X POST "${{ secrets.SLACK_LINKS_CHECK_WEBHOOK }}" \
                          -H "Content-Type: application/json" \
                          -d "{
                          \"status\": \"$STATUS\",
                          \"emoji\": \"$EMOJI\",
                          \"repository\": \"${{ github.repository }}\",
                          \"triggered_by\": \"$TRIGGERED_BY\",
                          \"workflow_url\": \"$WORKFLOW_URL\",
                          \"markdown_files\": \"$MARKDOWN_FILES\",
                          \"links_checked\": \"$TOTAL_INTERNAL_LINKS\",
                          \"excluded_links\": \"$EXCLUDED_LINKS\",
                          \"redirected_links\": \"$REDIRECTED_LINKS\",
                          \"broken_links\": \"$BROKEN_LINKS\",
                          \"broken_anchors\": \"$BROKEN_ANCHORS\",
                          \"results_download\": \"$RESULTS_DOWNLOAD\",
                          \"save_results\": \"$SAVE_RESULTS\"
                      }" > /dev/null 2>&1

                      echo "✅ Slack notification sent successfully"
                  else
                      if [ "$SEND_SLACK" = "false" ]; then
                          echo "Slack notifications disabled by user input"
                      else
                          echo "SLACK_LINKS_CHECK_WEBHOOK not configured, skipping Slack notification"
                      fi
                  fi

            - name: Report results
              if: always()
              run: |
                  # Default to true for scheduled runs, use input value for manual runs
                  SAVE_RESULTS="${{ inputs.save_results }}"
                  if [ "$SAVE_RESULTS" = "" ]; then SAVE_RESULTS="true"; fi

                  echo "Link check completed!"
                  echo "Check the job output above for detailed results."
                  if [ "$SAVE_RESULTS" = "true" ]; then
                      echo "Results files have been uploaded as artifacts."
                  fi
|
||||
@@ -26,6 +26,7 @@
|
||||
"typegen": "kea-typegen write .",
|
||||
"update-sprite": "svg-sprite -s --symbol-dest src/components/productFeature/images/icons --symbol-sprite sprited-icons.svg src/components/productFeature/images/icons/*.svg",
|
||||
"test-redirects": "jest scripts",
|
||||
"check-links-post-build": "node scripts/check-links-post-build.js",
|
||||
"storybook": "start-storybook -s ./static -p 6006",
|
||||
"build-storybook": "build-storybook"
|
||||
},
|
||||
|
||||
718
scripts/check-links-post-build.js
Normal file
718
scripts/check-links-post-build.js
Normal file
@@ -0,0 +1,718 @@
|
||||
#!/usr/bin/env node
|
||||
|
||||
/* eslint-disable @typescript-eslint/no-var-requires */
|
||||
|
||||
/**
|
||||
* PostHog Link Checker - Post-Build Validation
|
||||
*
|
||||
* This script validates internal links and anchor links against the ACTUAL
|
||||
* built site structure. It must run after `gatsby build` to access the
|
||||
* generated site data.
|
||||
*
|
||||
* This script:
|
||||
* - Reads the generated sitemap.xml to get actual pages
|
||||
* - Validates links against real page URLs (not just file paths)
|
||||
* - Checks anchor links against actual built page content
|
||||
* - Works with dynamic pages, templates, and plugin-generated content
|
||||
*
|
||||
* Usage:
|
||||
* gatsby build && npm run check-links-post-build
|
||||
* gatsby build && node scripts/check-links-post-build.js
|
||||
*
|
||||
* To output results to a file, provide a directory path as an argument:
|
||||
* node scripts/check-links-post-build.js link-check-results
|
||||
* node scripts/check-links-post-build.js .
|
||||
* node scripts/check-links-post-build.js ../output
|
||||
*
|
||||
 * The script always exits with code 0; broken links are reported in the
 * console output (and in the JSON results file when an output path is given).
|
||||
*/
|
||||
|
||||
const fs = require('fs')
|
||||
const path = require('path')
|
||||
const { JSDOM } = require('jsdom')
|
||||
const GithubSlugger = require('github-slugger')
|
||||
|
||||
// ============================================================================
// CONFIGURATION
// ============================================================================

// Tunables and exclusion lists for the link checker.
const CONFIG = {
    MAX_FILE_SIZE: 5 * 1024 * 1024, // 5MB — HTML files larger than this are skipped during anchor extraction
    CACHE_SIZE_LIMIT: 1000, // max entries kept in anchorCache before the oldest is evicted
    BATCH_SIZE: 50, // markdown files processed per batch (see processFilesInBatches)
    // Links ending in these extensions are static assets, not pages — never validated
    EXCLUDED_EXTENSIONS: ['.css', '.js', '.json', '.xml', '.svg', '.png', '.jpg', '.jpeg', '.gif', '.woff', '.woff2'],
    // Links containing these fragments point at externally-generated pages — skipped
    EXCLUDE_PATTERNS: [
        '/community/', // powered by Strapi
        '/teams/', // powered by Strapi
        '/careers/', // powered by Ashby
    ],
    // Written by `gatsby build`; getSitemapPages() exits if it is missing
    SITEMAP_PATH: path.join(process.cwd(), 'public', 'sitemap', 'sitemap-0.xml'),
    CONTENTS_DIR: 'contents', // root directory of the markdown source files
}

// Global cache for anchor links: HTML file path -> Set of anchor ids found in it.
// Shared by extractAnchorsFromHtml and reported in the summary/results.
const anchorCache = new Map()
|
||||
|
||||
// ============================================================================
|
||||
// UTILITY FUNCTIONS
|
||||
// ============================================================================
|
||||
|
||||
// Utility function to convert file paths or internal links to full PostHog URLs.
//
// Accepts:
//   - content file paths ('contents/docs/foo.md' -> 'https://posthog.com/docs/foo')
//   - absolute internal links ('/pricing' -> 'https://posthog.com/pricing')
//   - full URLs (returned unchanged)
//   - anything else, treated as a site-relative path
function convertToPostHogUrl(pathOrUrl) {
    const baseUrl = 'https://posthog.com'

    // Normalize Windows-style separators so file paths produced by path.join()
    // resolve identically on every platform (no-op for POSIX paths and URLs).
    const normalized = pathOrUrl.replace(/\\/g, '/')

    // Case 1: File path (starts with 'contents/')
    if (normalized.startsWith('contents/')) {
        // Remove 'contents/' prefix and markdown file extension
        let urlPath = normalized.replace(/^contents\//, '')
        urlPath = urlPath.replace(/\.(md|mdx)$/, '')
        return `${baseUrl}/${urlPath}`
    }

    // Case 2: Internal link (starts with '/')
    if (normalized.startsWith('/')) {
        return `${baseUrl}${normalized}`
    }

    // Case 3: Already a full URL
    if (normalized.startsWith('http')) {
        return normalized
    }

    // Default: treat as internal path
    return `${baseUrl}/${normalized}`
}
|
||||
|
||||
// Read vercel.json (from the current working directory) and return its
// redirect and rewrite rules as a single list. Rewrites are included because,
// for link validation, they behave like redirects. A missing or malformed
// vercel.json yields an empty list plus a console warning.
function parseVercelRedirects() {
    try {
        const raw = fs.readFileSync('vercel.json', 'utf8')
        const { redirects = [], rewrites = [] } = JSON.parse(raw)
        return redirects.concat(rewrites)
    } catch (error) {
        console.warn('Warning: Could not parse vercel.json:', error.message)
        return []
    }
}
|
||||
|
||||
// True when `url` matches any redirect/rewrite rule.
// Rules whose source uses the '/:path*' wildcard match on the prefix before
// the wildcard; all other rules match when the URL contains the literal
// source string (substring match, mirroring the original checker).
function isRedirectSource(url, redirects) {
    for (const redirect of redirects) {
        const wildcardIndex = redirect.source.indexOf('/:path*')
        const needle = wildcardIndex === -1 ? redirect.source : redirect.source.slice(0, wildcardIndex)
        if (url.includes(needle)) {
            return true
        }
    }
    return false
}
|
||||
|
||||
// ============================================================================
|
||||
// SITEMAP FUNCTIONS
|
||||
// ============================================================================
|
||||
|
||||
// Load the list of built pages from the generated sitemap.
// Exits the process when the sitemap is missing (i.e. the site was not built).
// Returns page paths relative to https://posthog.com (e.g. '/docs/').
function getSitemapPages() {
    if (!fs.existsSync(CONFIG.SITEMAP_PATH)) {
        console.error('Error: Sitemap not found at', CONFIG.SITEMAP_PATH)
        console.error('Please run "gatsby build" first to generate the sitemap.')
        process.exit(1)
    }

    const xml = fs.readFileSync(CONFIG.SITEMAP_PATH, 'utf8')
    const { document } = new JSDOM(xml, { contentType: 'text/xml' }).window

    const pages = []
    for (const node of document.querySelectorAll('url loc')) {
        const url = node.textContent.trim()
        // Only keep posthog.com URLs, stripped down to their path
        if (url.startsWith('https://posthog.com/')) {
            pages.push(url.replace('https://posthog.com', ''))
        }
    }

    return pages
}
|
||||
|
||||
// True when `url` appears in the sitemap page list, with or without a
// trailing slash — '/docs' and '/docs/' are treated as the same page.
function urlExistsInSitemap(url, pages) {
    const withoutSlash = url.replace(/\/$/, '')
    if (pages.includes(withoutSlash)) {
        return true
    }
    return pages.includes(`${withoutSlash}/`)
}
|
||||
|
||||
// ============================================================================
|
||||
// ANCHOR VALIDATION FUNCTIONS
|
||||
// ============================================================================
|
||||
|
||||
// Map a site URL to the HTML file Gatsby emitted for it under public/:
//   '' or '/'   -> public/index.html
//   '/dir/'     -> public/dir/index.html
//   '/file.ext' -> public/file.ext
//   '/dir'      -> public/dir/index.html or public/dir.html, whichever exists
//                  (falls back to the index.html variant when neither does)
function getHtmlFilePath(url) {
    const publicDir = path.join(process.cwd(), 'public')
    const cleanUrl = url.replace(/^\//, '')

    if (cleanUrl === '') {
        // Root page
        return path.join(publicDir, 'index.html')
    }
    if (cleanUrl.endsWith('/')) {
        // Directory with trailing slash
        return path.join(publicDir, cleanUrl, 'index.html')
    }
    if (cleanUrl.includes('.')) {
        // File with extension
        return path.join(publicDir, cleanUrl)
    }

    // Directory without trailing slash — prefer whichever output exists
    const withIndex = path.join(publicDir, cleanUrl, 'index.html')
    if (fs.existsSync(withIndex)) {
        return withIndex
    }
    const withHtml = path.join(publicDir, cleanUrl + '.html')
    if (fs.existsSync(withHtml)) {
        return withHtml
    }
    return withIndex // Default to the index.html variant
}
|
||||
|
||||
// More efficient function that extracts ALL anchors and caches them.
//
// Returns the Set of anchor targets (element ids, name attributes, and
// heading slugs) found in the HTML file at `htmlPath`. Results are memoized
// in the module-level anchorCache; oversized files, excluded extensions, and
// read/parse failures all yield (and cache) an empty Set so they are only
// attempted once.
function extractAnchorsFromHtml(htmlPath) {
    // Check cache first
    if (anchorCache.has(htmlPath)) {
        return anchorCache.get(htmlPath)
    }

    // Manage cache size — Maps iterate in insertion order, so deleting the
    // first key evicts the oldest entry (simple FIFO, not LRU).
    if (anchorCache.size >= CONFIG.CACHE_SIZE_LIMIT) {
        // Remove oldest entry
        const firstKey = anchorCache.keys().next().value
        anchorCache.delete(firstKey)
    }

    try {
        const stats = fs.statSync(htmlPath)

        // Skip very large files (> CONFIG.MAX_FILE_SIZE bytes)
        if (stats.size > CONFIG.MAX_FILE_SIZE) {
            console.warn(`Skipping large file: ${htmlPath} (${stats.size} bytes)`)
            const emptySet = new Set()
            anchorCache.set(htmlPath, emptySet)
            return emptySet
        }

        // Skip excluded file extensions (assets, not HTML pages)
        const ext = path.extname(htmlPath).toLowerCase()
        if (CONFIG.EXCLUDED_EXTENSIONS.includes(ext)) {
            const emptySet = new Set()
            anchorCache.set(htmlPath, emptySet)
            return emptySet
        }

        const anchors = new Set()

        // For all files, process synchronously to avoid Promise issues
        const htmlContent = fs.readFileSync(htmlPath, 'utf8')

        // Fast path: plain regex matching for very large content (> ~500KB),
        // where building a full JSDOM document would be too expensive.
        // Note this path does not compute heading slugs, so heading anchors
        // may be missed on very large pages.
        if (htmlContent.length > 500000) {
            const idMatches = htmlContent.match(/id="([^"]+)"/g) || []
            const nameMatches = htmlContent.match(/name="([^"]+)"/g) || []

            idMatches.forEach((match) => {
                const id = match.match(/id="([^"]+)"/)[1]
                anchors.add(id)
            })

            nameMatches.forEach((match) => {
                const name = match.match(/name="([^"]+)"/)[1]
                anchors.add(name)
            })
        } else {
            // Use DOM parsing for smaller files
            const dom = new JSDOM(htmlContent)
            const document = dom.window.document

            // Get all elements with ID
            const elementsWithId = document.querySelectorAll('[id]')
            elementsWithId.forEach((element) => {
                anchors.add(element.id)
            })

            // Get all elements with name
            const elementsWithName = document.querySelectorAll('[name]')
            elementsWithName.forEach((element) => {
                anchors.add(element.name)
            })

            // Also add GitHub-style slugs for headings — presumably matching
            // the ids the site generates for heading links (TODO: confirm the
            // build uses the same slugger).
            const headings = document.querySelectorAll('h1, h2, h3, h4, h5, h6')
            const slugger = new GithubSlugger()

            headings.forEach((heading) => {
                const slug = slugger.slug(heading.textContent)
                anchors.add(slug)
            })
        }

        // Cache the results
        anchorCache.set(htmlPath, anchors)
        return anchors
    } catch (error) {
        // Read/parse failure: warn and cache an empty result (best-effort)
        console.warn(`Warning: Could not extract anchors from ${htmlPath}:`, error.message)
        const emptySet = new Set()
        anchorCache.set(htmlPath, emptySet)
        return emptySet
    }
}
|
||||
|
||||
// ============================================================================
|
||||
// LINK VALIDATION FUNCTIONS
|
||||
// ============================================================================
|
||||
|
||||
// An internal URL is valid when its path — ignoring any '#fragment' —
// appears in the sitemap page list.
function validateInternalUrl(url, pages) {
    const pathOnly = url.split('#', 1)[0]
    return urlExistsInSitemap(pathOnly, pages)
}
|
||||
|
||||
// Check if anchor exists in HTML file.
//
// Validates the '#fragment' part of an internal URL against the anchors
// actually present in the built HTML page. Returns true when the URL has no
// fragment; returns false when the page itself is missing from the sitemap
// or its HTML file is absent, so a bad anchor is never "valid by accident".
function validateAnchor(url, pages) {
    const [baseUrl, anchor] = url.split('#')

    // No anchor to validate
    if (!anchor) {
        return true
    }

    // First ensure the base URL exists
    if (!urlExistsInSitemap(baseUrl, pages)) {
        return false
    }

    // Get the HTML file path and check if it exists
    const htmlPath = getHtmlFilePath(baseUrl)
    if (!fs.existsSync(htmlPath)) {
        return false
    }

    // Extract all anchors (cached per file) and check if our anchor exists
    const anchors = extractAnchorsFromHtml(htmlPath)
    return anchors.has(anchor)
}
|
||||
|
||||
// ============================================================================
|
||||
// MARKDOWN PROCESSING FUNCTIONS
|
||||
// ============================================================================
|
||||
|
||||
// Pull every internal markdown link — [text](/path) — out of a file.
// Returns { text, url, line, context } records, where context is up to 75
// characters of surrounding content on each side with newlines flattened
// to spaces. External (http...) and protocol-relative (//host) links are
// ignored.
function extractInternalLinks(filePath) {
    const content = fs.readFileSync(filePath, 'utf8')
    const linkPattern = /\[([^\]]*)\]\(([^)]+)\)/g
    const links = []

    let line = 1
    let scanned = 0

    for (let match = linkPattern.exec(content); match !== null; match = linkPattern.exec(content)) {
        // Advance the line counter over everything up to this match
        while (scanned < match.index) {
            if (content[scanned] === '\n') {
                line++
            }
            scanned++
        }

        const [fullMatch, text, url] = match

        // Internal links only: absolute paths, but not '//host' URLs
        if (url.startsWith('/') && !url.startsWith('//')) {
            const start = Math.max(0, match.index - 75)
            const end = Math.min(content.length, match.index + fullMatch.length + 75)
            const context =
                content.slice(start, match.index) + fullMatch + content.slice(match.index + fullMatch.length, end)

            links.push({
                text,
                url,
                line,
                context: context.replace(/\n/g, ' '),
            })
        }
    }

    return links
}
|
||||
|
||||
// Recursively collect every .md/.mdx file under `dir`.
//
// Uses Dirent entries from readdirSync({ withFileTypes: true }) so each entry
// is classified without an extra stat() syscall per file. Unlike statSync,
// Dirent does not follow symlinks, which also prevents infinite traversal of
// symlink cycles. Throws if `dir` does not exist — a missing contents
// directory is a fatal setup error, same as before.
function findMarkdownFiles(dir) {
    const markdownFiles = []

    // Depth-first walk, accumulating matches into markdownFiles
    function walkDirectory(currentDir) {
        for (const entry of fs.readdirSync(currentDir, { withFileTypes: true })) {
            const filePath = path.join(currentDir, entry.name)

            if (entry.isDirectory()) {
                walkDirectory(filePath)
            } else if (entry.name.endsWith('.md') || entry.name.endsWith('.mdx')) {
                markdownFiles.push(filePath)
            }
        }
    }

    walkDirectory(dir)
    return markdownFiles
}
|
||||
|
||||
// Walk the file list in slices of `batchSize`, extracting the internal links
// from each file. Batching exists so memory can be reclaimed between slices
// on very large sites. Returns [{ file, links }] for every input file.
function processFilesInBatches(files, batchSize = CONFIG.BATCH_SIZE) {
    const results = []
    const totalBatches = Math.ceil(files.length / batchSize)

    for (let start = 0; start < files.length; start += batchSize) {
        const batch = files.slice(start, start + batchSize)
        const batchNumber = Math.floor(start / batchSize) + 1
        console.log(`Processing batch ${batchNumber}/${totalBatches} (${batch.length} files)`)

        for (const file of batch) {
            results.push({ file, links: extractInternalLinks(file) })
        }

        // Encourage collection between batches when node runs with --expose-gc
        if (global.gc) {
            global.gc()
        }
    }

    return results
}
|
||||
|
||||
// ============================================================================
|
||||
// LINK PROCESSING FUNCTIONS
|
||||
// ============================================================================
|
||||
|
||||
// Process a single link and return validation result.
//
// Classifies one extracted link as one of:
//   { type: 'excluded' }                  — matches an exclusion pattern or asset extension
//   { type: 'redirected' }                — covered by a vercel.json redirect/rewrite
//   { type: 'broken', brokenItem }        — page path not found in the sitemap
//   { type: 'broken_anchor', brokenItem } — page exists but its '#anchor' does not
//   { type: 'valid' }
// brokenItem carries file, link, text, line and context for reporting.
function processLink(link, file, pages, redirects) {
    // Strip query parameters from URL for checking.
    // NOTE(review): a '#fragment' appearing AFTER '?' is discarded along
    // with the query, so such anchors are not validated — confirm intended.
    const urlWithoutQuery = link.url.split('?')[0]

    // Check if link should be excluded by pattern
    const shouldExclude = CONFIG.EXCLUDE_PATTERNS.some((pattern) => urlWithoutQuery.includes(pattern))
    if (shouldExclude) {
        return { type: 'excluded' }
    }

    // Check if link has file extension that should be ignored (static assets)
    const hasExcludedExtension = CONFIG.EXCLUDED_EXTENSIONS.some((ext) => urlWithoutQuery.endsWith(ext))
    if (hasExcludedExtension) {
        return { type: 'excluded' }
    }

    // Check if this link is a redirect source — redirected links are skipped,
    // not validated against the sitemap
    if (isRedirectSource(urlWithoutQuery, redirects)) {
        return { type: 'redirected' }
    }

    // First check if the internal URL (ignoring any anchor) is a real page
    if (!validateInternalUrl(urlWithoutQuery, pages)) {
        return {
            type: 'broken',
            brokenItem: {
                file: file,
                link: link.url,
                text: link.text,
                line: link.line,
                context: link.context,
                type: 'page',
            },
        }
    } else if (urlWithoutQuery.includes('#')) {
        // If URL is valid but has anchor, check the anchor against the built HTML
        if (!validateAnchor(urlWithoutQuery, pages)) {
            return {
                type: 'broken_anchor',
                brokenItem: {
                    file: file,
                    link: link.url,
                    text: link.text,
                    line: link.line,
                    context: link.context,
                    type: 'anchor',
                },
            }
        }
    }

    return { type: 'valid' }
}
|
||||
|
||||
// Validate every extracted link and bucket the results.
// Returns { brokenLinks, brokenAnchors, stats } where stats counts total,
// excluded, redirected, and anchor-checked links across all files.
function processAllLinks(fileResults, pages, redirects) {
    const brokenLinks = []
    const brokenAnchors = []
    const stats = {
        totalLinks: 0,
        excludedLinks: 0,
        redirectedLinks: 0,
        anchorLinksChecked: 0,
    }

    for (const { file, links } of fileResults) {
        stats.totalLinks += links.length

        for (const link of links) {
            // Anchor bookkeeping mirrors the original checker: every
            // non-excluded, non-redirected link carrying '#' counts as checked.
            const hasAnchor = link.url.split('?')[0].includes('#')
            const outcome = processLink(link, file, pages, redirects)

            if (outcome.type === 'excluded') {
                stats.excludedLinks++
            } else if (outcome.type === 'redirected') {
                stats.redirectedLinks++
            } else if (outcome.type === 'broken') {
                brokenLinks.push(outcome.brokenItem)
                if (hasAnchor) {
                    stats.anchorLinksChecked++
                }
            } else if (outcome.type === 'broken_anchor') {
                brokenAnchors.push(outcome.brokenItem)
                stats.anchorLinksChecked++
            } else if (hasAnchor) {
                // 'valid' with an anchor
                stats.anchorLinksChecked++
            }
        }
    }

    return {
        brokenLinks,
        brokenAnchors,
        stats,
    }
}
|
||||
|
||||
// ============================================================================
|
||||
// RESULTS AND OUTPUT FUNCTIONS
|
||||
// ============================================================================
|
||||
|
||||
// Assemble the JSON-serializable results document: run timestamp, summary
// counters, the exclusion configuration, and full detail for every broken
// page link and broken anchor link.
function createResultsObject(brokenLinks, brokenAnchors, stats, markdownFiles, redirects, pages) {
    // Both broken-link lists serialize with the same shape
    const describeBroken = (item) => ({
        type: item.type,
        file: path.relative(process.cwd(), item.file),
        page: convertToPostHogUrl(item.file),
        brokenLink: item.link,
        brokenUrl: convertToPostHogUrl(item.link),
        line: item.line,
        text: item.text,
        context: item.context,
    })

    return {
        timestamp: new Date().toISOString(),
        summary: {
            totalLinks: stats.totalLinks,
            excludedLinks: stats.excludedLinks,
            redirectedLinks: stats.redirectedLinks,
            brokenLinks: brokenLinks.length,
            anchorLinksChecked: stats.anchorLinksChecked,
            brokenAnchors: brokenAnchors.length,
            htmlFilesCached: anchorCache.size,
            markdownFiles: markdownFiles.length,
            redirectPatterns: redirects.length,
            pagesInSitemap: pages.length,
        },
        excludePatterns: CONFIG.EXCLUDE_PATTERNS,
        excludeFileExtensions: CONFIG.EXCLUDED_EXTENSIONS,
        brokenLinks: brokenLinks.map(describeBroken),
        brokenAnchors: brokenAnchors.map(describeBroken),
    }
}
|
||||
|
||||
// Write the results object to <outputPath>/link-check-<timestamp>.json.
//
// A falsy outputPath means "console-only run" and is a silent no-op.
// `outputPath` may be relative (resolved against the working directory) or
// absolute — path.resolve handles both, where the previous path.join mangled
// absolute paths. The directory is created if needed; write failures are
// reported but never crash the run.
function writeResultsToFile(results, outputPath) {
    if (!outputPath) {
        // No output path provided, skip writing to file
        return
    }

    const resultsDir = path.resolve(process.cwd(), outputPath)
    if (!fs.existsSync(resultsDir)) {
        fs.mkdirSync(resultsDir, { recursive: true })
    }

    // ISO timestamp with ':' and '.' replaced so the name is filename-safe
    const timestamp = new Date().toISOString().replace(/[:.]/g, '-')
    const resultsFile = path.join(resultsDir, `link-check-${timestamp}.json`)

    try {
        fs.writeFileSync(resultsFile, JSON.stringify(results, null, 2))
        console.log(`\nResults saved to: ${resultsFile}`)
    } catch (error) {
        console.error(`Error saving results: ${error.message}`)
    }
}
|
||||
|
||||
// Print one human-readable record per broken page link, separated by rules.
// No-op when the list is empty.
function displayBrokenLinks(brokenLinks) {
    if (brokenLinks.length === 0) {
        return
    }

    console.log('\nBroken links found:\n')

    const divider = '-'.repeat(80)
    for (const broken of brokenLinks) {
        console.log(`Error type: ${broken.type}`)
        console.log(`File: ${path.relative(process.cwd(), broken.file)}`)
        console.log(`Page: ${convertToPostHogUrl(broken.file)}`)
        console.log(`Broken link: ${broken.link}`)
        console.log(`Broken URL: ${convertToPostHogUrl(broken.link)}`)
        console.log(`Line #: ${broken.line}`)
        if (broken.text) {
            console.log(`Hyperlinked text: ${broken.text}`)
        }
        console.log(`Context: ${broken.context}`)
        console.log(divider)
    }
}
|
||||
|
||||
// Print one human-readable record per broken anchor link, separated by rules.
// No-op when the list is empty.
function displayBrokenAnchors(brokenAnchors) {
    if (brokenAnchors.length === 0) {
        return
    }

    console.log('\nBroken anchor links:\n')

    const divider = '-'.repeat(80)
    for (const anchor of brokenAnchors) {
        console.log(`Error type: ${anchor.type}`)
        console.log(`File: ${path.relative(process.cwd(), anchor.file)}`)
        console.log(`Page: ${convertToPostHogUrl(anchor.file)}`)
        console.log(`Broken link: ${anchor.link}`)
        console.log(`Broken URL: ${convertToPostHogUrl(anchor.link)}`)
        console.log(`Line #: ${anchor.line}`)
        if (anchor.text) {
            console.log(`Hyperlinked text: ${anchor.text}`)
        }
        console.log(`Context: ${anchor.context}`)
        console.log(divider)
    }
}
|
||||
|
||||
// Print the end-of-run counters. The CI workflow greps this output for the
// exact phrases below (e.g. "Scanned N markdown files"), so keep them stable.
function displaySummaryStats(stats, brokenLinks, brokenAnchors, markdownFilesCount) {
    const lines = [
        `\nScanned ${markdownFilesCount} markdown files`,
        `Processed ${stats.totalLinks} internal links`,
        `Found ${stats.excludedLinks} excluded links (skipped)`,
        `Found ${stats.redirectedLinks} redirected links (skipped)`,
        `Checked ${stats.anchorLinksChecked} anchor links`,
        `Found ${brokenLinks.length} broken links`,
        `Found ${brokenAnchors.length} broken anchor links`,
        `Cached ${anchorCache.size} HTML files for anchor checking`,
    ]
    for (const line of lines) {
        console.log(line)
    }
}
|
||||
|
||||
// ============================================================================
|
||||
// MAIN FUNCTION
|
||||
// ============================================================================
|
||||
|
||||
// Orchestrates the whole post-build check:
// load redirects + sitemap + markdown files -> extract links -> validate ->
// print details and summary -> optionally write JSON results to outputPath.
// Returns the number of broken PAGE links; broken anchors are reported but
// not included in the return value.
function checkLinks(outputPath) {
    console.log('Starting post-build link validation...')

    // Initialize data sources
    const redirects = parseVercelRedirects()
    console.log(`Found ${redirects.length} redirect/rewrite patterns`)

    // Exits the process if the site has not been built
    const pages = getSitemapPages()
    console.log(`Found ${pages.length} pages in sitemap`)

    const markdownFiles = findMarkdownFiles(CONFIG.CONTENTS_DIR)
    console.log(`Found ${markdownFiles.length} markdown files`)

    // Process files and extract links
    const fileResults = processFilesInBatches(markdownFiles)

    // Process and validate all links
    const { brokenLinks, brokenAnchors, stats } = processAllLinks(fileResults, pages, redirects)

    // Sort results alphabetically by file for stable, readable output
    brokenLinks.sort((a, b) => a.file.localeCompare(b.file))
    brokenAnchors.sort((a, b) => a.file.localeCompare(b.file))

    // Display broken links
    displayBrokenLinks(brokenLinks)
    displayBrokenAnchors(brokenAnchors)

    if (brokenLinks.length === 0 && brokenAnchors.length === 0) {
        console.log('\nNo broken links found! 🎉')
    }

    // Display summary stats at the end (the CI workflow greps these lines)
    displaySummaryStats(stats, brokenLinks, brokenAnchors, markdownFiles.length)

    // Create and save results at the end
    const results = createResultsObject(brokenLinks, brokenAnchors, stats, markdownFiles, redirects, pages)
    writeResultsToFile(results, outputPath)

    return brokenLinks.length
}
|
||||
|
||||
// ============================================================================
|
||||
// SCRIPT EXECUTION
|
||||
// ============================================================================
|
||||
|
||||
// Parse command line arguments: an optional output directory for JSON results
const [outputPath = null] = process.argv.slice(2)

if (outputPath) {
    console.log(`Results will be saved to: ${outputPath}`)
} else {
    console.log('No output path provided. Results will only be displayed in console.')
}

// Run the checker
const brokenCount = checkLinks(outputPath)

// Broken links are flagged in the console but never fail the process —
// the CI workflow handles reporting, so the run always ends "successfully".
if (brokenCount > 0) {
    console.log('\nCheck the output above ☝️')
}

process.exit(0) // Always exit successfully
|
||||
Reference in New Issue
Block a user