2020-09-27 15:10:11 +03:00
const cheerio = require ( 'cheerio' )
const findPage = require ( './find-page' )
const renderContent = require ( './render-content' )
const rewriteLocalLinks = require ( './rewrite-local-links' )
const getApplicableVersions = require ( './get-applicable-versions' )
const { getPathWithoutLanguage } = require ( './path-utils' )
const { getEnterpriseVersionNumber } = require ( './patterns' )
const { deprecated } = require ( './enterprise-server-releases' )
// internal links will have a language code by the time we're testing them
// we also want to capture same-page anchors (#foo)
const languageCode = 'en'
const internalHrefs = [ '/en' , '#' ]
const renderedPageCache = { }
const checkedAnchorCache = { }
module . exports = async function checkLinks ( $ , page , context , version , checkedLinkCache = { } ) {
// run rewriteLocalLinks to version links and add language codes
rewriteLocalLinks ( $ , version , languageCode )
const brokenLinks = {
anchors : [ ] ,
links : [ ]
}
// internal link check
for ( const href of internalHrefs ) {
const internalLinks = $ ( ` a[href^=" ${ href } "] ` ) . get ( )
for ( const internalLink of internalLinks ) {
const href = $ ( internalLink ) . attr ( 'href' )
// enable caching so we don't check links more than once
// anchor links are cached locally (within this run) since they are specific to the page
if ( checkedLinkCache [ href ] || checkedAnchorCache [ href ] ) continue
const [ link , anchor ] = href . split ( '#' )
// if anchor only (e.g., #foo), look for heading on same page
if ( anchor && ! link ) {
// ignore anchors that are autogenerated from headings
if ( anchor === $ ( internalLink ) . parent ( ) . attr ( 'id' ) ) continue
const matchingHeadings = getMatchingHeadings ( $ , anchor )
if ( matchingHeadings . length === 0 ) {
brokenLinks . anchors . push ( { 'broken same-page anchor' : ` # ${ anchor } ` , reason : 'heading not found on page' } )
}
checkedAnchorCache [ href ] = true
continue
}
checkedLinkCache [ href ] = true
// skip rare hardcoded links to old GHE versions
// these paths will always be in the old versioned form
// example: /enterprise/11.10.340/admin/articles/upgrading-to-the-latest-release
const gheVersionInLink = link . match ( getEnterpriseVersionNumber )
if ( gheVersionInLink && deprecated . includes ( gheVersionInLink [ 1 ] ) ) continue
// look for linked page
const linkedPage = findPage ( link , context . pages , context . redirects , languageCode )
if ( ! linkedPage ) {
brokenLinks . links . push ( { 'broken link' : link , reason : 'linked page not found' } )
continue
}
// finding the linked page isn't enough if it's a github.com page; also need to check versions
if ( linkedPage . relativePath . startsWith ( 'github' ) ) {
2020-09-29 20:36:07 +03:00
const linkedPageVersions = getApplicableVersions ( linkedPage . versions , linkedPage . relativePath )
2020-09-27 15:10:11 +03:00
if ( ! linkedPageVersions . includes ( version ) && $ ( internalLink ) . attr ( 'class' ) !== 'dotcom-only' ) {
brokenLinks . links . push ( { 'broken link' : link , reason : ` ${ version } not found in linked page versions ` , 'linked page' : linkedPage . fullPath } )
continue
}
}
// don't check anchors on developers content
if ( linkedPage . relativePath . match ( /^(rest|graphql|developers)/ ) ) continue
// create a unique string for caching purposes
const pathToCache = version + linkedPage . relativePath
const anchorToCheck = anchor
// if link with anchor (e.g., /some/path#foo), look for heading on linked page
if ( anchorToCheck ) {
// either render page or fetch it from cache if we've already rendered it
let linkedPageObject
if ( ! renderedPageCache [ pathToCache ] ) {
const linkedPageHtml = await renderContent ( linkedPage . markdown , context )
linkedPageObject = cheerio . load ( linkedPageHtml , { xmlMode : true } )
renderedPageCache [ pathToCache ] = linkedPageObject
} else {
linkedPageObject = renderedPageCache [ pathToCache ]
}
const matchingHeadings = getMatchingHeadings ( linkedPageObject , anchorToCheck )
if ( matchingHeadings . length === 0 ) {
if ( anchor ) {
brokenLinks . anchors . push ( { 'broken anchor' : ` # ${ anchor } ` , 'full link' : ` ${ getPathWithoutLanguage ( link ) } # ${ anchor } ` , reason : 'heading not found on linked page' , 'linked page' : linkedPage . fullPath } )
}
continue
}
}
}
}
return { brokenLinks , checkedLinkCache }
}
/**
 * Select the elements on a page that an anchor can point at.
 *
 * Article titles are h1s, so only h2-h6 headings whose `id` equals the anchor
 * are considered, plus `<a name="...">` elements with that name.
 *
 * @param {Function} $ - query function that accepts a CSS selector string
 * @param {string} anchor - anchor text without the leading `#`
 * @returns {*} whatever `$` returns for the combined selector
 */
function getMatchingHeadings ($, anchor) {
  // escape backslashes and double quotes so the anchor cannot terminate the
  // quoted attribute value and produce a malformed selector
  const value = String(anchor).replace(/["\\]/g, '\\$&')

  // no whitespace between tag and bracket: `h2 [id=...]` would be a descendant
  // combinator and select elements *inside* a heading instead of the heading
  const selectors = ['h2', 'h3', 'h4', 'h5', 'h6'].map(tag => `${tag}[id="${value}"]`)
  selectors.push(`a[name="${value}"]`)

  return $(selectors.join(', '))
}