Merge branch 'main' into repo-sync

This commit is contained in:
Octomerger Bot 2023-03-30 11:06:16 -04:00 committed by GitHub
Parents: 8c729cb2a4 44db141ee3
Commit: cb5e9c19d0
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
5 changed files: 215 additions and 23 deletions

.github/workflows/test.yml (vendored, 9 changes)

@@ -153,6 +153,15 @@ jobs:
- name: Run build script
run: npm run build
- name: Disk cache used by getRemoteJSON function in middleware
uses: actions/cache@9b0c1fce7a93df8e3bb8926b0d6e9d89e92f20a7
with:
path: .remotejson-cache
# Very liberal cache key. Note, for this to become populated
# for other branches, you have to manually run this workflow
# at least once using the "Run workflow" button.
key: ${{ runner.os }}-remotejson
- name: Index fixtures into the local Elasticsearch
# For the sake of saving time, only run this step if the group
# is one that will run tests against an Elasticsearch on localhost.

.gitignore (vendored, 3 changes)

@@ -44,3 +44,6 @@ semmle-code
.installed.package-lock.json
assets/images/help/writing/unordered-list-rendered (1).png
# Used by getRemoteJSON()
.remotejson-cache/

middleware/archived-enterprise-versions.js

@@ -16,6 +16,7 @@ import { readCompressedJsonFileFallbackLazily } from '../lib/read-json-file.js'
import { archivedCacheControl, languageCacheControl } from './cache-control.js'
import { pathLanguagePrefixed, languagePrefixPathRegex } from '../lib/languages.js'
import getRedirect, { splitPathByLanguage } from '../lib/get-redirect.js'
import getRemoteJSON from './get-remote-json.js'
const REMOTE_ENTERPRISE_STORAGE_URL = 'https://githubdocs.azureedge.net/enterprise'
@@ -75,29 +76,6 @@ const retryConfiguration = { limit: 3 }
// unnecessary error reporting.
const timeoutConfiguration = { response: 1500 }
async function getRemoteJSON(url, config) {
let fromCache = true
if (!_getRemoteJSONCache.has(url)) {
fromCache = false
// got will, by default, follow redirects and it will throw if the ultimate
// response is not a 2xx.
// But it's possible that the page is a 200 OK but it's just not a JSON
// page at all. Then we can't assume we can deserialize it.
const res = await got(url, config)
if (!res.headers['content-type'].startsWith('application/json')) {
throw new Error(
`Fetching '${url}' resulted in a non-JSON response (${res.headers['content-type']})`
)
}
_getRemoteJSONCache.set(url, JSON.parse(res.body))
}
const tags = [`url:${url}`, `from_cache:${fromCache}`]
statsd.increment('middleware.archived_get_remote_json', 1, tags)
return _getRemoteJSONCache.get(url)
}
const _getRemoteJSONCache = new Map()
// This module handles requests for deprecated GitHub Enterprise versions
// by routing them to static content in help-docs-archived-enterprise-versions
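Since this hunk deletes the old inline getRemoteJSON in favor of the import above, the middleware now calls the shared helper. A minimal sketch of what such a call could look like, reusing REMOTE_ENTERPRISE_STORAGE_URL, retryConfiguration, and timeoutConfiguration from this file; the call site and the '2.21' version segment are illustrative, not part of this diff:

// Hypothetical call site: fetch an archived redirects.json for a
// deprecated Enterprise version, forwarding the retry/timeout settings
// to got() through the helper's config argument.
const redirects = await getRemoteJSON(
  `${REMOTE_ENTERPRISE_STORAGE_URL}/2.21/redirects.json`,
  { retry: retryConfiguration, timeout: timeoutConfiguration }
)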

middleware/get-remote-json.js

@@ -0,0 +1,87 @@
import path from 'path'
import fs from 'fs'
import crypto from 'crypto'
import got from 'got'
import statsd from '../lib/statsd.js'
// The only reason this is exported is so the unit tests can
// test an in-memory cache miss after purging it with a mutation.
export const cache = new Map()
const inProd = process.env.NODE_ENV === 'production'
// Wrapper on `got()` that is able to both cache in memory and on disk.
// The on-disk caching is in `.remotejson-cache/`.
// We use this for downloading `redirects.json` files from the
// help-docs-archived-enterprise-versions repo as a proxy. A lot of those
// .json files are large and they're also static which makes them
// ideal for caching.
// Note that there are 2 layers of caching here:
// 1. Is it in the in-memory cache?
// 2. No, is it on disk?
// 3. No, download from the internet, then store the response in memory and on disk.
export default async function getRemoteJSON(url, config) {
// We could get fancy and make the cache key depend on the `config` too
// given that this is A) only used for archived enterprise stuff,
// and B) the config is only applicable on cache miss when doing the `got()`.
const cacheKey = url
// Assume it's in the in-memory cache first.
// Later we'll update this if we find we need to.
let fromCache = 'memory'
if (!cache.has(cacheKey)) {
fromCache = 'not'
let foundOnDisk = false
const tempFilename = crypto.createHash('md5').update(url).digest('hex')
// Do this here instead of at the top of the file so that it becomes
// possible to override this in unit tests.
const ROOT = process.env.GET_REMOTE_JSON_DISK_CACHE_ROOT || '.remotejson-cache'
const onDisk = path.join(ROOT, `${tempFilename}.json`)
// Never even try reading from disk in production.
if (!inProd && fs.existsSync(onDisk)) {
const body = fs.readFileSync(onDisk, 'utf-8')
// It might exist on disk, but it could be empty
if (body) {
try {
// It might be corrupted JSON.
cache.set(cacheKey, JSON.parse(body))
fromCache = 'disk'
foundOnDisk = true
} catch (error) {
if (!(error instanceof SyntaxError)) {
throw error
}
}
}
}
if (!foundOnDisk) {
// got will, by default, follow redirects and it will throw if the ultimate
// response is not a 2xx.
// But it's possible that the page is a 200 OK but it's just not a JSON
// page at all. Then we can't assume we can deserialize it.
const res = await got(url, config)
if (!res.headers['content-type'].startsWith('application/json')) {
throw new Error(
`Fetching '${url}' resulted in a non-JSON response (${res.headers['content-type']})`
)
}
cache.set(cacheKey, JSON.parse(res.body))
// Only write to disk for testing and local preview.
// In production, we never write to disk. Only in-memory.
if (!inProd) {
fs.mkdirSync(path.dirname(onDisk), { recursive: true })
fs.writeFileSync(onDisk, res.body, 'utf-8')
}
}
}
const tags = [`url:${url}`, `from_cache:${fromCache}`]
statsd.increment('middleware.get_remote_json', 1, tags)
return cache.get(cacheKey)
}
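Taken together, the lookup order is: in-memory Map, then the disk cache (outside production only), then the network. A minimal usage sketch, assuming an endpoint that responds with application/json; the URL is illustrative:

import getRemoteJSON from './middleware/get-remote-json.js'

// First call: misses memory and disk, fetches over the network, stores the
// parsed body in the in-memory Map and (outside production) writes the raw
// body to .remotejson-cache/<md5-of-url>.json.
const first = await getRemoteJSON('https://example.com/redirects.json', {})

// Second call in the same process: served from the in-memory Map, no
// network request is made.
const second = await getRemoteJSON('https://example.com/redirects.json', {})

// A fresh process (e.g. a restarted dev server) would miss in memory but,
// outside production, could read the on-disk file instead of the network.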

tests/unit/get-remote-json.js

@@ -0,0 +1,115 @@
import fs from 'fs'
import path from 'path'
import os from 'os'
import rimraf from 'rimraf'
import { expect, test, describe, beforeAll, afterAll, afterEach } from '@jest/globals'
import nock from 'nock'
import getRemoteJSON, { cache } from '../../middleware/get-remote-json.js'
/**
 * These unit tests check that the in-memory cache works and that,
 * on an in-memory cache miss, getRemoteJSON can benefit from the disk cache.
 */
describe('getRemoteJSON', () => {
const envVarValueBefore = process.env.GET_REMOTE_JSON_DISK_CACHE_ROOT
const tempDir = path.join(os.tmpdir(), 'remotejson-test')
beforeAll(() => {
process.env.GET_REMOTE_JSON_DISK_CACHE_ROOT = tempDir
})
afterAll(() => {
process.env.GET_REMOTE_JSON_DISK_CACHE_ROOT = envVarValueBefore
rimraf.sync(tempDir)
})
afterEach(() => {
nock.cleanAll()
})
test('simple in-memory caching', async () => {
const url = 'http://example.com/redirects.json'
const { origin, pathname } = new URL(url)
nock(origin).get(pathname).reply(200, { foo: 'bar' })
const data = await getRemoteJSON(url, {})
expect(data.foo).toBe('bar')
expect(cache.get(url)).toBeTruthy()
// Second time, despite not setting up a second nock(), will work
// because it can use memory now.
const data2 = await getRemoteJSON(url, {})
expect(data2.foo).toBe('bar')
expect(cache.get(url)).toBeTruthy()
})
test('benefit from disk-based caching', async () => {
const url = 'http://example.com/cool.json'
const { origin, pathname } = new URL(url)
nock(origin).get(pathname).reply(200, { cool: true })
const data = await getRemoteJSON(url, {})
expect(data.cool).toBe(true)
expect(cache.get(url)).toBeTruthy()
cache.delete(url)
// This time, the nock won't fail despite not using `.persist()`.
// That means it didn't need the network because it was able to
// use the disk cache.
const data2 = await getRemoteJSON(url, {})
expect(data2.cool).toBe(true)
})
test('recover from disk corruption (empty)', async () => {
const tempTempDir = path.join(tempDir, 'empty-files')
process.env.GET_REMOTE_JSON_DISK_CACHE_ROOT = tempTempDir
const url = 'http://example.com/empty.json'
const { origin, pathname } = new URL(url)
nock(origin).get(pathname).reply(200, { cool: true })
await getRemoteJSON(url, {})
// Make every file in the cache directory an empty file
for (const file of fs.readdirSync(tempTempDir)) {
fs.writeFileSync(path.join(tempTempDir, file), '')
}
cache.delete(url)
// If we don't do this, nock will fail because a second network
// request became necessary.
nock(origin).get(pathname).reply(200, { cool: true })
const data = await getRemoteJSON(url, {})
expect(data.cool).toBe(true)
})
test('recover from disk corruption (bad JSON)', async () => {
const tempTempDir = path.join(tempDir, 'corrupt-files')
process.env.GET_REMOTE_JSON_DISK_CACHE_ROOT = tempTempDir
const url = 'http://example.com/corrupt.json'
const { origin, pathname } = new URL(url)
nock(origin).get(pathname).reply(200, { cool: true })
await getRemoteJSON(url, {})
// Make every file in the cache directory contain corrupt JSON
for (const file of fs.readdirSync(tempTempDir)) {
fs.writeFileSync(path.join(tempTempDir, file), '{"not:JSON{')
}
cache.delete(url)
// If we don't do this, nock will fail because a second network
// request became necessary.
nock(origin).get(pathname).reply(200, { cool: true })
const data = await getRemoteJSON(url, {})
expect(data.cool).toBe(true)
})
test('not-actually JSON despite URL', async () => {
const url = 'http://example.com/might-look-like.json'
const { origin, pathname } = new URL(url)
nock(origin).get(pathname).reply(200, '<html>here</html>', {
'Content-Type': 'text/html',
})
await expect(getRemoteJSON(url, {})).rejects.toThrowError(/resulted in a non-JSON response/)
})
})
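When debugging these tests, it can help to know which on-disk file corresponds to a given URL. A minimal sketch that mirrors the tempFilename derivation in middleware/get-remote-json.js; the URL is illustrative:

import path from 'path'
import crypto from 'crypto'

// Same md5-of-URL naming as the implementation uses for its disk cache.
const url = 'http://example.com/redirects.json'
const root = process.env.GET_REMOTE_JSON_DISK_CACHE_ROOT || '.remotejson-cache'
const tempFilename = crypto.createHash('md5').update(url).digest('hex')
console.log(path.join(root, `${tempFilename}.json`))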