From b0677b244207fa404774a9f3854aa474d14b64ab Mon Sep 17 00:00:00 2001 From: Sarah Schneider Date: Fri, 22 Jan 2021 09:28:06 -0500 Subject: [PATCH] use script instead of jest, much faster --- jest.config.js | 4 +- package.json | 2 +- script/check-internal-links.js | 40 +++ tests/helpers/links-checker.js | 269 --------------------- tests/links-and-images/links-and-images.js | 37 --- 5 files changed, 42 insertions(+), 310 deletions(-) create mode 100755 script/check-internal-links.js delete mode 100644 tests/helpers/links-checker.js delete mode 100644 tests/links-and-images/links-and-images.js diff --git a/jest.config.js b/jest.config.js index b3524001c8..6b22ede91a 100644 --- a/jest.config.js +++ b/jest.config.js @@ -1,7 +1,6 @@ // https://jestjs.io/docs/en/configuration.html const isBrowser = process.env.BROWSER -const isLinkCheck = process.env.LINKCHECK const isActions = Boolean(process.env.GITHUB_ACTIONS) module.exports = { @@ -28,8 +27,7 @@ module.exports = { 'node_modules/', 'vendor/', 'tests/helpers/', - ...isBrowser ? [] : ['tests/browser/browser.js'], - ...isLinkCheck ? [] : ['tests/links-and-images/links-and-images.js'] + ...isBrowser ? [] : ['tests/browser/browser.js'] ], testMatch: [ '**/tests/**/*.js' diff --git a/package.json b/package.json index 92fc890094..155a2543d5 100644 --- a/package.json +++ b/package.json @@ -173,7 +173,7 @@ "pa11y-test": "start-server-and-test browser-test-server 4001 pa11y-ci", "link-check": "start-server-and-test link-check-server 4002 link-check-test", "link-check-server": "cross-env NODE_ENV=development ENABLED_LANGUAGES='en' PORT=4002 node server.js", - "link-check-test": "cross-env LINKCHECK=1 jest tests/links-and-images/links-and-images.js", + "link-check-test": "cross-env LINKCHECK=1 node script/check-internal-links.js", "heroku-postbuild": "node script/early-access/clone-for-build.js && npm run build" }, "engines": { diff --git a/script/check-internal-links.js b/script/check-internal-links.js new file mode 100755 index 0000000000..55513097fe --- /dev/null +++ b/script/check-internal-links.js @@ -0,0 +1,40 @@ +#!/usr/bin/env node + +const linkinator = require('linkinator') +const checker = new linkinator.LinkChecker() +const { deprecated } = require('../lib/enterprise-server-releases') + +const config = { + path: 'http://localhost:4002/en', + concurrency: 400, + recurse: true, + linksToSkip: [ + // Skip any link that is not an internal link + '^((?!http://localhost:4002/en).)*$', + // Skip dist files + '/dist/index.*', + // Skip deprecated Enterprise content + `enterprise(-server@|/)(${deprecated.join('|')})/?` + ] +} + +main() + +async function main () { + const result = (await checker.check(config)).links + + const brokenLinks = result + .filter(link => link.state === 'BROKEN') + .map(link => { delete link.failureDetails; return link }) + + // Exit successfully if no broken links! + if (!brokenLinks.length) { + console.log('All links are good!') + process.exit(0) + } + + console.log(`Found ${brokenLinks.length} total broken links: ${JSON.stringify([...brokenLinks], null, 2)}`) + + // Exit unsuccessfully if broken links are found. + process.exit(1) +} diff --git a/tests/helpers/links-checker.js b/tests/helpers/links-checker.js deleted file mode 100644 index 33efc17243..0000000000 --- a/tests/helpers/links-checker.js +++ /dev/null @@ -1,269 +0,0 @@ -const cheerio = require('cheerio') -const { union, uniq } = require('lodash') -const fs = require('fs') -const path = require('path') - -const { getProductStringFromPath } = require('../../lib/path-utils') -const patterns = require('../../lib/patterns') -const { deprecated } = require('../../lib/enterprise-server-releases') -const rest = require('../../middleware/contextualizers/rest') -const graphql = require('../../middleware/contextualizers/graphql') -const contextualize = require('../../middleware/context') -const releaseNotes = require('../../middleware/contextualizers/enterprise-release-notes') -const versionSatisfiesRange = require('../../lib/version-satisfies-range') - -class LinksChecker { - constructor (opts = { languageCode: 'en', internalHrefPrefixes: ['/', '#'] }) { - Object.assign(this, { ...opts }) - - // Some caching mechanism so we do not load pages unnecessarily, - // nor check links that have been checked - this.pageCache = new Map() - this.checkedLinksCache = new Set() - - // stores images to check all at once in a Map: - // imageSrc => { - // "usedBy": [version:path, ...] - // } - this.imagesToCheck = new Map() - - // Stores broken images in a Map, formatted the same way as imagesToCheck - this.brokenImages = new Map() - - // Stores broken links in a Map in the format of: - // link => { - // linkedFrom: [ version:filePath, ... ] - // }, ... - this.brokenLinks = new Map() - - // stores anchor links to check all at once in a Map: - // version:filePath => { - // '#anchor-link' : { - // linkedFrom: ['url1', 'url2'] - // }, - // '#anchor-link2': {...} - // } - this.anchorLinksToCheck = new Map() - - // Stores broken anchors in a Map, formatted the same way as anchorLinksToCheck - this.brokenAnchors = new Map() - } - - async setRenderedPageObj (pathCacheKey, context, reRender = false) { - if (this.pageCache.has(pathCacheKey) && !reRender) return - let pageHTML = await context.page.render(context) - - // handle special pre-rendered snowflake - if (context.page.relativePath.endsWith('graphql/reference/objects.md')) { - pageHTML += context.graphql.prerenderedObjectsForCurrentVersion.html - } - - const pageObj = cheerio.load(pageHTML, { xmlMode: true }) - this.pageCache.set(pathCacheKey, pageObj) - } - - async getRenderedPageObj (pathCacheKey, context) { - if (!this.pageCache.has(pathCacheKey)) { - if (context) { - await this.setRenderedPageObj(pathCacheKey, context) - } else { - console.error('cannot find pre-rendered page, and does not have enough context to render one.') - } - } - return this.pageCache.get(pathCacheKey) - } - - addAnchorForLater (pagePath, anchor, linkedFrom) { - const anchorsInPath = this.anchorLinksToCheck.get(pagePath) || {} - const anchorLink = anchorsInPath[anchor] || { linkedFrom: [] } - anchorLink.linkedFrom = union(anchorLink.linkedFrom, [linkedFrom]) - anchorsInPath[anchor] = anchorLink - this.anchorLinksToCheck.set(pagePath, anchorsInPath) - } - - addImagesForLater (images, pagePath) { - uniq(images).forEach(imageSrc => { - const imageUsage = this.imagesToCheck.get(imageSrc) || { usedBy: [] } - imageUsage.usedBy = union(imageUsage.usedBy, [pagePath]) - this.imagesToCheck.set(imageSrc, imageUsage) - }) - } - - async checkPage (context, checkExternalAnchors) { - const path = context.relativePath - const version = context.currentVersion - - const pathCacheKey = `${version}:${path}` - const $ = await this.getRenderedPageObj(pathCacheKey, context) - - const imageSrcs = $('img[src^="/assets"]').map((i, el) => $(el).attr('src')).toArray() - - this.addImagesForLater(imageSrcs, pathCacheKey) - - for (const href of this.internalHrefPrefixes) { - const internalLinks = $(`a[href^="${href}"]`).get() - - for (const internalLink of internalLinks) { - const href = $(internalLink).attr('href') - - let [link, anchor] = href.split('#') - // remove trailing slash - link = link.replace(patterns.trailingSlash, '$1') - - // if it's an external link and has been checked before, skip - if (link && this.checkedLinksCache.has(link)) { - // if it's been determined this link is broken, add to the linkedFrom field - if (this.brokenLinks.has(link)) { - const brokenLink = this.brokenLinks.get(link) - brokenLink.linkedFrom = union(brokenLink.linkedFrom, [pathCacheKey]) - this.brokenLinks.set(link, brokenLink) - } - if (!anchor) continue - } - - // if it's an internal anchor (e.g., #foo), save for later - if (anchor && !link) { - // ignore anchors that are autogenerated from headings - if (anchor === $(internalLink).parent().attr('id')) continue - this.addAnchorForLater(pathCacheKey, anchor, 'same page') - continue - } - - // ------ BEGIN ONEOFF EXCLUSIONS -------/// - // skip GraphQL public schema paths (these are checked by separate tests) - if (link.startsWith('/public/') && link.endsWith('.graphql')) continue - - // skip links that start with /assets/images, as these are not in the pages collection - // and /assets/images paths should be checked during the image check - if (link.startsWith('/assets/images')) continue - - // skip rare hardcoded links to old GHE versions - // these paths will always be in the old versioned format - // example: /enterprise/11.10.340/admin/articles/upgrading-to-the-latest-release - const gheVersionInLink = link.match(patterns.getEnterpriseVersionNumber) - if (gheVersionInLink && deprecated.includes(gheVersionInLink[1])) continue - // ------ END ONEOFF EXCLUSIONS -------/// - - // look for linked page - const linkedPage = context.pages[link] || context.pages[context.redirects[link]] - this.checkedLinksCache.add(link) - - if (!linkedPage) { - this.brokenLinks.set(link, { linkedFrom: [pathCacheKey] }) - continue - } - - // if we're not checking external anchors, we're done - if (!checkExternalAnchors) { - continue - } - - // find the permalink for the current version - const linkedPagePermalink = linkedPage.permalinks.find(permalink => permalink.pageVersion === version) - - if (linkedPagePermalink) { - const linkedPageContext = await buildPathContext(context, linkedPage, linkedPagePermalink) - - if (anchor) { - await this.setRenderedPageObj(`${version}:${linkedPage.relativePath}`, linkedPageContext) - this.addAnchorForLater(`${version}:${linkedPage.relativePath}`, anchor, pathCacheKey) - } - } - } - } - } - - async checkAnchors () { - for await (const [pathCacheKey, anchors] of this.anchorLinksToCheck) { - const $ = await this.getRenderedPageObj(pathCacheKey) - for (const anchorText in anchors) { - const matchingHeadings = $(`[id="${anchorText}"], [name="${anchorText}"]`) - if (matchingHeadings.length === 0) { - const brokenAnchorPath = this.brokenAnchors.get(pathCacheKey) || {} - brokenAnchorPath[anchorText] = anchors[anchorText] - this.brokenAnchors.set(pathCacheKey, brokenAnchorPath) - } - } - } - } - - getBrokenLinks () { - return this.brokenLinks - } - - async getBrokenAnchors () { - await this.checkAnchors() - return this.brokenAnchors - } - - async getBrokenImages () { - for await (const [imageSrc, imageUsage] of this.imagesToCheck) { - try { - await fs.promises.access(path.join(process.cwd(), imageSrc)) - } catch (e) { - this.brokenImages.set(imageSrc, imageUsage) - } - } - return this.brokenImages - } -} - -// this function is async because the middleware functions are likely async -async function applyMiddleware (middleware, req) { - return middleware(req, null, () => {}) -} - -async function buildInitialContext () { - const req = { - path: '/en', - language: 'en', - query: {} - } - await applyMiddleware(contextualize, req) - return req.context -} - -async function buildPathContext (initialContext, page, permalink) { - // Create a new object with path-specific properties. - // Note this is cherry-picking properties currently only needed by the middleware below; - // See middleware/context.js for the rest of the properties we are NOT refreshing per page. - // If we find this causes problems for link checking, we can call `contextualize` on - // every page. For now, this cherry-picking approach is intended to improve performance so - // we don't have to build the expensive `pages`, `redirects`, etc. data on every page we check. - const path = permalink.href - const pathContext = { - page, - currentVersion: permalink.pageVersion, - currentProduct: getProductStringFromPath(path), - relativePath: permalink.relativePath, - currentPath: permalink.href - } - - // Combine it with the initial context object that has pages, redirects, etc. - const combinedContext = Object.assign({}, initialContext, pathContext) - - // Create a new req object using the combined context - const req = { - path, - context: combinedContext, - language: 'en', - query: {} - } - - // Pass the req to the contextualizing middlewares - await applyMiddleware(rest, req) - await applyMiddleware(graphql, req) - // Release notes are available on docs site starting with GHES 3.0 - if (versionSatisfiesRange(permalink.pageVersion, '>=3.0')) { - await applyMiddleware(releaseNotes, req) - } - - // Return the resulting context object with REST, GraphQL, and release notes data now attached - return req.context -} - -module.exports = { - LinksChecker, - buildPathContext, - buildInitialContext -} diff --git a/tests/links-and-images/links-and-images.js b/tests/links-and-images/links-and-images.js deleted file mode 100644 index 6a29909978..0000000000 --- a/tests/links-and-images/links-and-images.js +++ /dev/null @@ -1,37 +0,0 @@ -const linkinator = require('linkinator') -const checker = new linkinator.LinkChecker() -const { deprecated } = require('../../lib/enterprise-server-releases') - -const config = { - path: 'http://localhost:4002/en', - concurrency: 400, - recurse: true, - linksToSkip: [ - // Skip any link that is not an internal link - '^((?!http://localhost:4002/en).)*$', - // Skip dist files - '/dist/index.*', - // Skip deprecated Enterprise content - `enterprise(-server@|/)(${deprecated.join('|')})/?` - ] -} - -describe('page rendering', () => { - jest.setTimeout(1000 * 1000) - - let result - beforeAll(async (done) => { - result = (await checker.check(config)).links - done() - }) - - test('every page has internal links that can be resolved', async () => { - const brokenLinks = result - .filter(link => link.state === 'BROKEN') - .map(link => { - delete link.failureDetails - return link - }) - expect(brokenLinks.length, `Found ${brokenLinks.length} total broken links: ${JSON.stringify([...brokenLinks], null, 2)}`).toBe(0) - }) -})