const cheerio = require('cheerio') const findPageInVersion = require('./find-page-in-version') const renderContent = require('./render-content') const rewriteLocalLinks = require('./rewrite-local-links') const nonEnterpriseDefaultVersion = require('./non-enterprise-default-version') const { getPathWithoutLanguage } = require('./path-utils') const { getEnterpriseVersionNumber, adminProduct } = require('./patterns') const { deprecated, latest } = require('./enterprise-server-releases') // internal links will have a language code by the time we're testing them // we also want to capture same-page anchors (#foo) const languageCode = 'en' const internalHrefs = ['/en', '#'] const renderedPageCache = {} const checkedAnchorCache = {} module.exports = async function checkLinks ($, page, context, version, checkedLinkCache = {}) { // run rewriteLocalLinks to version links and add language codes rewriteLocalLinks($, version, languageCode) const brokenLinks = { anchors: [], links: [] } // internal link check for (const href of internalHrefs) { const internalLinks = $(`a[href^="${href}"]`).get() for (const internalLink of internalLinks) { const href = $(internalLink).attr('href') // enable caching so we don't check links more than once // anchor links are cached locally (within this run) since they are specific to the page if (checkedLinkCache[href] || checkedAnchorCache[href]) continue const [link, anchor] = href.split('#') // if anchor only (e.g., #foo), look for heading on same page if (anchor && !link) { // ignore anchors that are autogenerated from headings if (anchor === $(internalLink).parent().attr('id')) continue const matchingHeadings = getMatchingHeadings($, anchor) if (matchingHeadings.length === 0) { brokenLinks.anchors.push({ 'broken same-page anchor': `#${anchor}`, reason: 'heading not found on page' }) } checkedAnchorCache[href] = true continue } checkedLinkCache[href] = true // skip rare hardcoded links to old GHE versions // these paths will always be in the old versioned form // example: /enterprise/11.10.340/admin/articles/upgrading-to-the-latest-release const gheVersionInLink = link.match(getEnterpriseVersionNumber) if (gheVersionInLink && deprecated.includes(gheVersionInLink[1])) continue // look for linked page const isDotcomOnly = $(internalLink).attr('class') // special case for GHES Admin links on dotcom, which are not broken; they go to the latest GHES version let versionToCheck = version if (version === nonEnterpriseDefaultVersion && adminProduct.test(link)) { versionToCheck = `enterprise-server@${latest}` } const linkedPage = findPageInVersion(link, context.pages, context.redirects, languageCode, versionToCheck, isDotcomOnly) if (!linkedPage) { brokenLinks.links.push({ 'broken link': link, reason: 'linked page not found' }) continue } if (linkedPage.relativePath.includes('rest/reference/') && linkedPage.relativePath !== 'rest/reference/index.md') { const linkedPageRelevantPermalink = linkedPage.permalinks.find(permalink => permalink.pageVersion === version) if (!linkedPageRelevantPermalink) continue const docsPath = linkedPageRelevantPermalink.href .split('rest/reference/')[1] .split('#')[0] // do not include #fragments // find all operations that with an operationID that matches the requested docs path context.currentRestOperations = context.operationsForCurrentProduct .filter(operation => operation.operationId.startsWith(docsPath)) } // collect elements of the page that may contain links const linkedPageContent = linkedPage.relativePath.includes('graphql/reference/objects') ? linkedPage.markdown + context.graphql.prerenderedObjectsForCurrentVersion.html : linkedPage.markdown // create a unique string for caching purposes const pathToCache = version + linkedPage.relativePath const anchorToCheck = anchor // if link with anchor (e.g., /some/path#foo), look for heading on linked page if (anchorToCheck) { // either render page or fetch it from cache if we've already rendered it let linkedPageObject if (!renderedPageCache[pathToCache]) { const linkedPageHtml = await renderContent(linkedPageContent, context) linkedPageObject = cheerio.load(linkedPageHtml, { xmlMode: true }) renderedPageCache[pathToCache] = linkedPageObject } else { linkedPageObject = renderedPageCache[pathToCache] } const matchingHeadings = getMatchingHeadings(linkedPageObject, anchorToCheck) if (matchingHeadings.length === 0) { if (anchor) { brokenLinks.anchors.push({ 'broken anchor': `#${anchor}`, 'full link': `${getPathWithoutLanguage(link)}#${anchor}`, reason: 'heading not found on linked page', 'linked page': linkedPage.fullPath }) } continue } } } } return { brokenLinks, checkedLinkCache } } // article titles are h1s; headings can be any subsequent level function getMatchingHeadings ($, anchor) { return $(` h2[id="${anchor}"], h3[id="${anchor}"], h4[id="${anchor}"], h5[id="${anchor}"], h6[id="${anchor}"], a[name="${anchor}"] `).get() }