Merge branch 'main' into repo-sync

This commit is contained in:
Octomerger Bot 2023-03-30 11:06:16 -04:00 committed by GitHub
Parents: 8c729cb2a4 44db141ee3
Commit: cb5e9c19d0
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
5 changed files: 215 additions and 23 deletions

.github/workflows/test.yml (vendored, 9 changes)

@@ -153,6 +153,15 @@ jobs:
- name: Run build script
run: npm run build
- name: Disk cache used by getRemoteJSON function in middleware
uses: actions/cache@9b0c1fce7a93df8e3bb8926b0d6e9d89e92f20a7
with:
path: .remotejson-cache
# Very liberal cache key. Note, for this to become populated
# for other branches, you have to manually run this workflow
# at least once using the "Run workflow" button.
key: ${{ runner.os }}-remotejson
- name: Index fixtures into the local Elasticsearch
# For the sake of saving time, only run this step if the group
# is one that will run tests against an Elasticsearch on localhost.

.gitignore (vendored, 3 changes)

@@ -44,3 +44,6 @@ semmle-code
.installed.package-lock.json
assets/images/help/writing/unordered-list-rendered (1).png
# Used by getRemoteJSON()
.remotejson-cache/

middleware/archived-enterprise-versions.js

@@ -16,6 +16,7 @@ import { readCompressedJsonFileFallbackLazily } from '../lib/read-json-file.js'
import { archivedCacheControl, languageCacheControl } from './cache-control.js'
import { pathLanguagePrefixed, languagePrefixPathRegex } from '../lib/languages.js'
import getRedirect, { splitPathByLanguage } from '../lib/get-redirect.js'
import getRemoteJSON from './get-remote-json.js'
const REMOTE_ENTERPRISE_STORAGE_URL = 'https://githubdocs.azureedge.net/enterprise'
@@ -75,29 +76,6 @@ const retryConfiguration = { limit: 3 }
// unnecessary error reporting.
const timeoutConfiguration = { response: 1500 }
async function getRemoteJSON(url, config) {
let fromCache = true
if (!_getRemoteJSONCache.has(url)) {
fromCache = false
// got will, by default, follow redirects and it will throw if the ultimate
// response is not a 2xx.
// But it's possible that the page is a 200 OK but it's just not a JSON
// page at all. Then we can't assume we can deserialize it.
const res = await got(url, config)
if (!res.headers['content-type'].startsWith('application/json')) {
throw new Error(
`Fetching '${url}' resulted in a non-JSON response (${res.headers['content-type']})`
)
}
_getRemoteJSONCache.set(url, JSON.parse(res.body))
}
const tags = [`url:${url}`, `from_cache:${fromCache}`]
statsd.increment('middleware.archived_get_remote_json', 1, tags)
return _getRemoteJSONCache.get(url)
}
const _getRemoteJSONCache = new Map()
// This module handles requests for deprecated GitHub Enterprise versions
// by routing them to static content in help-docs-archived-enterprise-versions
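Since this hunk deletes the old inline getRemoteJSON in favor of the import above, the middleware now calls the shared helper. A minimal sketch of what such a call could look like, reusing REMOTE_ENTERPRISE_STORAGE_URL, retryConfiguration, and timeoutConfiguration from this file; the call site and the '2.21' version segment are illustrative, not part of this diff:

// Hypothetical call site: fetch an archived redirects.json for a
// deprecated Enterprise version, forwarding the retry/timeout settings
// to got() through the helper's config argument.
const redirects = await getRemoteJSON(
  `${REMOTE_ENTERPRISE_STORAGE_URL}/2.21/redirects.json`,
  { retry: retryConfiguration, timeout: timeoutConfiguration }
)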

middleware/get-remote-json.js

@@ -0,0 +1,87 @@
import path from 'path'
import fs from 'fs'
import crypto from 'crypto'
import got from 'got'
import statsd from '../lib/statsd.js'
// The only reason this is exported is so the unit tests can
// test an in-memory cache miss after purging it with a mutation.
export const cache = new Map()
const inProd = process.env.NODE_ENV === 'production'
// Wrapper on `got()` that is able to both cache in memory and on disk.
// The on-disk caching is in `.remotejson-cache/`.
// We use this for downloading `redirects.json` files from the
// help-docs-archived-enterprise-versions repo as a proxy. A lot of those
// .json files are large and they're also static which makes them
// ideal for caching.
// Note that there are 2 layers of caching here:
// 1. Is it in the in-memory cache?
// 2. No, is it on disk?
// 3. No, download from the internet, then store the response in memory and on disk.
export default async function getRemoteJSON(url, config) {
// We could get fancy and make the cache key depend on the `config` too
// given that this is A) only used for archived enterprise stuff,
// and B) the config is only applicable on cache miss when doing the `got()`.
const cacheKey = url
// Assume it's in the in-memory cache first.
// Later we'll update this if we find we need to.
let fromCache = 'memory'
if (!cache.has(cacheKey)) {
fromCache = 'not'
let foundOnDisk = false
const tempFilename = crypto.createHash('md5').update(url).digest('hex')
// Do this here instead of at the top of the file so that it becomes
// possible to override this in unit tests.
const ROOT = process.env.GET_REMOTE_JSON_DISK_CACHE_ROOT || '.remotejson-cache'
const onDisk = path.join(ROOT, `${tempFilename}.json`)
// Never even try reading from disk in production.
if (!inProd && fs.existsSync(onDisk)) {
const body = fs.readFileSync(onDisk, 'utf-8')
// It might exist on disk, but it could be empty
if (body) {
try {
// It might be corrupted JSON.
cache.set(cacheKey, JSON.parse(body))
fromCache = 'disk'
foundOnDisk = true
} catch (error) {
if (!(error instanceof SyntaxError)) {
throw error
}
}
}
}
if (!foundOnDisk) {
// got will, by default, follow redirects and it will throw if the ultimate
// response is not a 2xx.
// But it's possible that the page is a 200 OK but it's just not a JSON
// page at all. Then we can't assume we can deserialize it.
const res = await got(url, config)
if (!res.headers['content-type'].startsWith('application/json')) {
throw new Error(
`Fetching '${url}' resulted in a non-JSON response (${res.headers['content-type']})`
)
}
cache.set(cacheKey, JSON.parse(res.body))
// Only write to disk for testing and local preview.
// In production, we never write to disk. Only in-memory.
if (!inProd) {
fs.mkdirSync(path.dirname(onDisk), { recursive: true })
fs.writeFileSync(onDisk, res.body, 'utf-8')
}
}
}
const tags = [`url:${url}`, `from_cache:${fromCache}`]
statsd.increment('middleware.get_remote_json', 1, tags)
return cache.get(cacheKey)
}
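Taken together, the lookup order is: in-memory Map, then the disk cache (outside production only), then the network. A minimal usage sketch, assuming an endpoint that responds with application/json; the URL is illustrative:

import getRemoteJSON from './middleware/get-remote-json.js'

// First call: misses memory and disk, fetches over the network, stores the
// parsed body in the in-memory Map and (outside production) writes the raw
// body to .remotejson-cache/<md5-of-url>.json.
const first = await getRemoteJSON('https://example.com/redirects.json', {})

// Second call in the same process: served from the in-memory Map, no
// network request is made.
const second = await getRemoteJSON('https://example.com/redirects.json', {})

// A fresh process (e.g. a restarted dev server) would miss in memory but,
// outside production, could read the on-disk file instead of the network.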

tests/unit/get-remote-json.js

@@ -0,0 +1,115 @@
import fs from 'fs'
import path from 'path'
import os from 'os'
import rimraf from 'rimraf'
import { expect, test, describe, beforeAll, afterAll, afterEach } from '@jest/globals'
import nock from 'nock'
import getRemoteJSON, { cache } from '../../middleware/get-remote-json.js'
/**
 * These unit tests check that the in-memory cache works and that,
 * on an in-memory cache miss, getRemoteJSON can benefit from the disk cache.
 */
describe('getRemoteJSON', () => {
const envVarValueBefore = process.env.GET_REMOTE_JSON_DISK_CACHE_ROOT
const tempDir = path.join(os.tmpdir(), 'remotejson-test')
beforeAll(() => {
process.env.GET_REMOTE_JSON_DISK_CACHE_ROOT = tempDir
})
afterAll(() => {
process.env.GET_REMOTE_JSON_DISK_CACHE_ROOT = envVarValueBefore
rimraf.sync(tempDir)
})
afterEach(() => {
nock.cleanAll()
})
test('simple in-memory caching', async () => {
const url = 'http://example.com/redirects.json'
const { origin, pathname } = new URL(url)
nock(origin).get(pathname).reply(200, { foo: 'bar' })
const data = await getRemoteJSON(url, {})
expect(data.foo).toBe('bar')
expect(cache.get(url)).toBeTruthy()
// Second time, despite not setting up a second nock(), will work
// because it can use memory now.
const data2 = await getRemoteJSON(url, {})
expect(data2.foo).toBe('bar')
expect(cache.get(url)).toBeTruthy()
})
test('benefit from disk-based caching', async () => {
const url = 'http://example.com/cool.json'
const { origin, pathname } = new URL(url)
nock(origin).get(pathname).reply(200, { cool: true })
const data = await getRemoteJSON(url, {})
expect(data.cool).toBe(true)
expect(cache.get(url)).toBeTruthy()
cache.delete(url)
// This time, the nock won't fail despite not using `.persist()`.
// That means it didn't need the network because it was able to
// use the disk cache.
const data2 = await getRemoteJSON(url, {})
expect(data2.cool).toBe(true)
})
test('recover from disk corruption (empty)', async () => {
const tempTempDir = path.join(tempDir, 'empty-files')
process.env.GET_REMOTE_JSON_DISK_CACHE_ROOT = tempTempDir
const url = 'http://example.com/empty.json'
const { origin, pathname } = new URL(url)
nock(origin).get(pathname).reply(200, { cool: true })
await getRemoteJSON(url, {})
// Make every file in the cache directory an empty file
for (const file of fs.readdirSync(tempTempDir)) {
fs.writeFileSync(path.join(tempTempDir, file), '')
}
cache.delete(url)
// If we don't do this, nock will fail because a second network
// request became necessary.
nock(origin).get(pathname).reply(200, { cool: true })
const data = await getRemoteJSON(url, {})
expect(data.cool).toBe(true)
})
test('recover from disk corruption (bad JSON)', async () => {
const tempTempDir = path.join(tempDir, 'corrupt-files')
process.env.GET_REMOTE_JSON_DISK_CACHE_ROOT = tempTempDir
const url = 'http://example.com/corrupt.json'
const { origin, pathname } = new URL(url)
nock(origin).get(pathname).reply(200, { cool: true })
await getRemoteJSON(url, {})
// Make every file in the cache directory contain corrupt JSON
for (const file of fs.readdirSync(tempTempDir)) {
fs.writeFileSync(path.join(tempTempDir, file), '{"not:JSON{')
}
cache.delete(url)
// If we don't do this, nock will fail because a second network
// request became necessary.
nock(origin).get(pathname).reply(200, { cool: true })
const data = await getRemoteJSON(url, {})
expect(data.cool).toBe(true)
})
test('not-actually JSON despite URL', async () => {
const url = 'http://example.com/might-look-like.json'
const { origin, pathname } = new URL(url)
nock(origin).get(pathname).reply(200, '<html>here</html>', {
'Content-Type': 'text/html',
})
await expect(getRemoteJSON(url, {})).rejects.toThrowError(/resulted in a non-JSON response/)
})
})
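When debugging these tests, it can help to know which on-disk file corresponds to a given URL. A minimal sketch that mirrors the tempFilename derivation in middleware/get-remote-json.js; the URL is illustrative:

import path from 'path'
import crypto from 'crypto'

// Same md5-of-URL naming as the implementation uses for its disk cache.
const url = 'http://example.com/redirects.json'
const root = process.env.GET_REMOTE_JSON_DISK_CACHE_ROOT || '.remotejson-cache'
const tempFilename = crypto.createHash('md5').update(url).digest('hex')
console.log(path.join(root, `${tempFilename}.json`))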