// docs/lib/check-links.js
// 123 lines · 4.6 KiB · JavaScript

const cheerio = require('cheerio')
const findPageInVersion = require('./find-page-in-version')
const renderContent = require('./render-content')
const rewriteLocalLinks = require('./rewrite-local-links')
const nonEnterpriseDefaultVersion = require('./non-enterprise-default-version')
const { getPathWithoutLanguage } = require('./path-utils')
const { getEnterpriseVersionNumber, adminProduct } = require('./patterns')
const { deprecated, latest } = require('./enterprise-server-releases')
// internal links will have a language code by the time we're testing them
// we also want to capture same-page anchors (#foo)
const languageCode = 'en'
// href prefixes that checkLinks selects as "internal": language-prefixed paths and same-page anchors
const internalHrefs = ['/en', '#']
// linked pages that were already rendered and loaded with cheerio, keyed by version + relative path,
// so repeated links to the same page reuse the loaded document; module-level, so it persists across checkLinks calls
const renderedPageCache = {}
// same-page anchors that checkLinks has already examined; module-level, so it also persists across checkLinks calls
const checkedAnchorCache = {}
module.exports = async function checkLinks ($, page, context, version, checkedLinkCache = {}) {
// run rewriteLocalLinks to version links and add language codes
rewriteLocalLinks($, version, languageCode)
const brokenLinks = {
anchors: [],
links: []
}
// internal link check
for (const href of internalHrefs) {
const internalLinks = $(`a[href^="${href}"]`).get()
for (const internalLink of internalLinks) {
const href = $(internalLink).attr('href')
// enable caching so we don't check links more than once
// anchor links are cached locally (within this run) since they are specific to the page
if (checkedLinkCache[href] || checkedAnchorCache[href]) continue
const [link, anchor] = href.split('#')
// if anchor only (e.g., #foo), look for heading on same page
if (anchor && !link) {
// ignore anchors that are autogenerated from headings
if (anchor === $(internalLink).parent().attr('id')) continue
const matchingHeadings = getMatchingHeadings($, anchor)
if (matchingHeadings.length === 0) {
brokenLinks.anchors.push({ 'broken same-page anchor': `#${anchor}`, reason: 'heading not found on page' })
}
checkedAnchorCache[href] = true
continue
}
checkedLinkCache[href] = true
// skip rare hardcoded links to old GHE versions
// these paths will always be in the old versioned form
// example: /enterprise/11.10.340/admin/articles/upgrading-to-the-latest-release
const gheVersionInLink = link.match(getEnterpriseVersionNumber)
if (gheVersionInLink && deprecated.includes(gheVersionInLink[1])) continue
// look for linked page
const isDotcomOnly = $(internalLink).attr('class')
// special case for GHES Admin links on dotcom, which are not broken; they go to the latest GHES version
let versionToCheck = version
if (version === nonEnterpriseDefaultVersion && adminProduct.test(link)) {
versionToCheck = `enterprise-server@${latest}`
}
const linkedPage = findPageInVersion(link, context.pages, context.redirects, languageCode, versionToCheck, isDotcomOnly)
if (!linkedPage) {
brokenLinks.links.push({ 'broken link': link, reason: 'linked page not found' })
continue
}
// don't check anchors on developers content
if (linkedPage.relativePath.match(/^(rest|graphql|developers)/)) continue
// create a unique string for caching purposes
const pathToCache = version + linkedPage.relativePath
const anchorToCheck = anchor
// if link with anchor (e.g., /some/path#foo), look for heading on linked page
if (anchorToCheck) {
// either render page or fetch it from cache if we've already rendered it
let linkedPageObject
if (!renderedPageCache[pathToCache]) {
const linkedPageHtml = await renderContent(linkedPage.markdown, context)
linkedPageObject = cheerio.load(linkedPageHtml, { xmlMode: true })
renderedPageCache[pathToCache] = linkedPageObject
} else {
linkedPageObject = renderedPageCache[pathToCache]
}
const matchingHeadings = getMatchingHeadings(linkedPageObject, anchorToCheck)
if (matchingHeadings.length === 0) {
if (anchor) {
brokenLinks.anchors.push({ 'broken anchor': `#${anchor}`, 'full link': `${getPathWithoutLanguage(link)}#${anchor}`, reason: 'heading not found on linked page', 'linked page': linkedPage.fullPath })
}
continue
}
}
}
}
return { brokenLinks, checkedLinkCache }
}
/**
 * Find the elements on a page that an anchor can point at.
 *
 * article titles are h1s; headings can be any subsequent level, so h2-h6 elements
 * are matched by `id`, and `a` elements are matched by `name`.
 *
 * @param {Function} $ - cheerio-style query function for the page to search; called once with the selector.
 * @param {string} anchor - anchor text without the leading `#`.
 * @returns {*} whatever `$` returns for the selector (callers read its `length`).
 */
function getMatchingHeadings ($, anchor) {
  // escape backslashes and double quotes so the anchor cannot terminate the quoted
  // attribute value early and produce a malformed selector
  const escapedAnchor = String(anchor).replace(/["\\]/g, '\\$&')

  const selector = ['h2', 'h3', 'h4', 'h5', 'h6']
    .map(tag => `${tag}[id="${escapedAnchor}"]`)
    .concat(`a[name="${escapedAnchor}"]`)
    .join(',\n')

  return $(selector)
}