Mirror of https://github.com/github/docs.git
Merge branch 'main' into repo-sync
Commit cb5e9c19d0
@@ -153,6 +153,15 @@ jobs:
      - name: Run build script
        run: npm run build

      - name: Disk cache used by getRemoteJSON function in middleware
        uses: actions/cache@9b0c1fce7a93df8e3bb8926b0d6e9d89e92f20a7
        with:
          path: .remotejson-cache
          # Very liberal cache key. Note, for this to become populated
          # for other branches, you have to manually run this workflow
          # at least once using the "Run workflow" button.
          key: ${{ runner.os }}-remotejson

      - name: Index fixtures into the local Elasticsearch
        # For the sake of saving time, only run this step if the group
        # is one that will run tests against an Elasticsearch on localhost.
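
For context, the directory cached here is written by the new getRemoteJSON module added further down in this commit, which names each on-disk entry after the MD5 hex digest of the fetched URL. A minimal sketch of that filename mapping, mirroring the derivation in the new middleware file (the URL is hypothetical, for illustration only):

import crypto from 'crypto'
import path from 'path'

// Same derivation as in the new get-remote-json.js below:
// MD5 hex digest of the URL, stored under the cache root as <digest>.json
const url = 'https://example.com/enterprise/redirects.json' // hypothetical
const tempFilename = crypto.createHash('md5').update(url).digest('hex')
console.log(path.join('.remotejson-cache', `${tempFilename}.json`))
// -> .remotejson-cache/<32 hex chars>.json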

@@ -44,3 +44,6 @@ semmle-code
.installed.package-lock.json
assets/images/help/writing/unordered-list-rendered (1).png

# Used by getRemoteJSON()
.remotejson-cache/

@@ -16,6 +16,7 @@ import { readCompressedJsonFileFallbackLazily } from '../lib/read-json-file.js'
import { archivedCacheControl, languageCacheControl } from './cache-control.js'
import { pathLanguagePrefixed, languagePrefixPathRegex } from '../lib/languages.js'
import getRedirect, { splitPathByLanguage } from '../lib/get-redirect.js'
import getRemoteJSON from './get-remote-json.js'

const REMOTE_ENTERPRISE_STORAGE_URL = 'https://githubdocs.azureedge.net/enterprise'

@@ -75,29 +76,6 @@ const retryConfiguration = { limit: 3 }
// unnecessary error reporting.
const timeoutConfiguration = { response: 1500 }

async function getRemoteJSON(url, config) {
  let fromCache = true
  if (!_getRemoteJSONCache.has(url)) {
    fromCache = false
    // got will, by default, follow redirects and it will throw if the ultimate
    // response is not a 2xx.
    // But it's possible that the page is a 200 OK but it's just not a JSON
    // page at all. Then we can't assume we can deserialize it.
    const res = await got(url, config)
    if (!res.headers['content-type'].startsWith('application/json')) {
      throw new Error(
        `Fetching '${url}' resulted in a non-JSON response (${res.headers['content-type']})`
      )
    }

    _getRemoteJSONCache.set(url, JSON.parse(res.body))
  }
  const tags = [`url:${url}`, `from_cache:${fromCache}`]
  statsd.increment('middleware.archived_get_remote_json', 1, tags)
  return _getRemoteJSONCache.get(url)
}
const _getRemoteJSONCache = new Map()

// This module handles requests for deprecated GitHub Enterprise versions
// by routing them to static content in help-docs-archived-enterprise-versions
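
The hunk above deletes the module-local implementation in favor of the shared one imported from './get-remote-json.js'. The call site itself isn't shown in this diff, but given the retry and timeout configuration kept in this file, it plausibly looks something like the following sketch (the URL construction and the `version` variable are assumptions, not code from the commit):

// Hypothetical call site, reusing the got() options defined above
const redirects = await getRemoteJSON(
  `${REMOTE_ENTERPRISE_STORAGE_URL}/${version}/redirects.json`, // assumed path
  { retry: retryConfiguration, timeout: timeoutConfiguration }
)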

middleware/get-remote-json.js (new file; the path follows from the imports above and the tests below):

@@ -0,0 +1,87 @@
import path from 'path'
import fs from 'fs'
import crypto from 'crypto'

import got from 'got'
import statsd from '../lib/statsd.js'

// The only reason this is exported is so that the unit tests can
// simulate an in-memory miss by purging it with a mutation.
export const cache = new Map()

const inProd = process.env.NODE_ENV === 'production'

// Wrapper on `got()` that is able to both cache in memory and on disk.
// The on-disk caching is in `.remotejson-cache/`.
// We use this for downloading `redirects.json` files from the
// help-docs-archived-enterprise-versions repo as a proxy. A lot of those
// .json files are large and they're also static, which makes them
// ideal for caching.
// Note that there are 2 layers of caching here:
// 1. Is it in the in-memory cache?
// 2. No, is it on disk?
// 3. No, download it from the internet, then store the response in memory and on disk
export default async function getRemoteJSON(url, config) {
  // We could get fancy and make the cache key depend on the `config` too,
  // given that this is A) only used for archived enterprise stuff,
  // and B) the config is only applicable on cache miss when doing the `got()`.
  const cacheKey = url

  // Assume it's in the in-memory cache first.
  // Later we'll update this if we find we need to.
  let fromCache = 'memory'

  if (!cache.has(cacheKey)) {
    fromCache = 'not'

    let foundOnDisk = false
    const tempFilename = crypto.createHash('md5').update(url).digest('hex')

    // Do this here instead of at the top of the file so that it becomes
    // possible to override this in unit tests.
    const ROOT = process.env.GET_REMOTE_JSON_DISK_CACHE_ROOT || '.remotejson-cache'

    const onDisk = path.join(ROOT, `${tempFilename}.json`)
    // Never even try reading from disk in production.
    if (!inProd && fs.existsSync(onDisk)) {
      const body = fs.readFileSync(onDisk, 'utf-8')
      // It might exist on disk, but it could be empty
      if (body) {
        try {
          // It might be corrupted JSON.
          cache.set(cacheKey, JSON.parse(body))
          fromCache = 'disk'
          foundOnDisk = true
        } catch (error) {
          if (!(error instanceof SyntaxError)) {
            throw error
          }
        }
      }
    }

    if (!foundOnDisk) {
      // got will, by default, follow redirects and it will throw if the ultimate
      // response is not a 2xx.
      // But it's possible that the page is a 200 OK but it's just not a JSON
      // page at all. Then we can't assume we can deserialize it.
      const res = await got(url, config)
      if (!res.headers['content-type'].startsWith('application/json')) {
        throw new Error(
          `Fetching '${url}' resulted in a non-JSON response (${res.headers['content-type']})`
        )
      }
      cache.set(cacheKey, JSON.parse(res.body))

      // Only write to disk for testing and local preview.
      // In production, we never write to disk. Only in-memory.
      if (!inProd) {
        fs.mkdirSync(path.dirname(onDisk), { recursive: true })
        fs.writeFileSync(onDisk, res.body, 'utf-8')
      }
    }
  }
  const tags = [`url:${url}`, `from_cache:${fromCache}`]
  statsd.increment('middleware.get_remote_json', 1, tags)
  return cache.get(cacheKey)
}
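
Since `cache` is exported, the two cache layers are observable from outside the module. A minimal usage sketch of the cascade described in the comment above (the import path and URL are illustrative, not code from the commit; assumes an ESM module where top-level await is available):

import getRemoteJSON, { cache } from './middleware/get-remote-json.js'

const url = 'https://example.com/redirects.json' // hypothetical

const first = await getRemoteJSON(url, {})  // miss: network, then cached in memory and on disk
const second = await getRemoteJSON(url, {}) // hit: served from the in-memory Map

cache.delete(url) // simulate an in-memory miss, like the unit tests below do
const third = await getRemoteJSON(url, {})  // outside production: served from .remotejson-cache/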

The accompanying unit tests (new file; it imports the module via '../../middleware/get-remote-json.js'):

@@ -0,0 +1,115 @@
import fs from 'fs'
import path from 'path'
import os from 'os'

import rimraf from 'rimraf'
import { expect, test, describe, beforeAll, afterAll, afterEach } from '@jest/globals'
import nock from 'nock'
import getRemoteJSON, { cache } from '../../middleware/get-remote-json.js'

/**
 * These unit tests check that the in-memory cache works and that, when a
 * URL is not cached in memory, getRemoteJSON can fall back to the disk cache.
 */

describe('getRemoteJSON', () => {
  const envVarValueBefore = process.env.GET_REMOTE_JSON_DISK_CACHE_ROOT
  const tempDir = path.join(os.tmpdir(), 'remotejson-test')

  beforeAll(() => {
    process.env.GET_REMOTE_JSON_DISK_CACHE_ROOT = tempDir
  })

  afterAll(() => {
    process.env.GET_REMOTE_JSON_DISK_CACHE_ROOT = envVarValueBefore
    rimraf.sync(tempDir)
  })

  afterEach(() => {
    nock.cleanAll()
  })

  test('simple in-memory caching', async () => {
    const url = 'http://example.com/redirects.json'
    const { origin, pathname } = new URL(url)
    nock(origin).get(pathname).reply(200, { foo: 'bar' })
    const data = await getRemoteJSON(url, {})
    expect(data.foo).toBe('bar')
    expect(cache.get(url)).toBeTruthy()
    // The second call works despite not setting up a second nock()
    // because it can use the in-memory cache now.
    const data2 = await getRemoteJSON(url, {})
    expect(data2.foo).toBe('bar')
    expect(cache.get(url)).toBeTruthy()
  })

  test('benefit from disk-based caching', async () => {
    const url = 'http://example.com/cool.json'
    const { origin, pathname } = new URL(url)
    nock(origin).get(pathname).reply(200, { cool: true })
    const data = await getRemoteJSON(url, {})
    expect(data.cool).toBe(true)
    expect(cache.get(url)).toBeTruthy()
    cache.delete(url)

    // This time, the nock won't fail despite not using `.persist()`.
    // That means it didn't need the network because it was able to
    // use the disk cache.
    const data2 = await getRemoteJSON(url, {})
    expect(data2.cool).toBe(true)
  })

  test('recover from disk corruption (empty)', async () => {
    const tempTempDir = path.join(tempDir, 'empty-files')
    process.env.GET_REMOTE_JSON_DISK_CACHE_ROOT = tempTempDir
    const url = 'http://example.com/empty.json'
    const { origin, pathname } = new URL(url)
    nock(origin).get(pathname).reply(200, { cool: true })
    await getRemoteJSON(url, {})

    // Make every file in the cache directory an empty file
    for (const file of fs.readdirSync(tempTempDir)) {
      fs.writeFileSync(path.join(tempTempDir, file), '')
    }

    cache.delete(url)
    // If we don't do this, nock will fail because a second network
    // request became necessary.
    nock(origin).get(pathname).reply(200, { cool: true })

    const data = await getRemoteJSON(url, {})
    expect(data.cool).toBe(true)
  })

  test('recover from disk corruption (bad JSON)', async () => {
    const tempTempDir = path.join(tempDir, 'corrupt-files')
    process.env.GET_REMOTE_JSON_DISK_CACHE_ROOT = tempTempDir
    const url = 'http://example.com/corrupt.json'
    const { origin, pathname } = new URL(url)
    nock(origin).get(pathname).reply(200, { cool: true })
    await getRemoteJSON(url, {})

    // Fill every file in the cache directory with corrupt JSON
    for (const file of fs.readdirSync(tempTempDir)) {
      fs.writeFileSync(path.join(tempTempDir, file), '{"not:JSON{')
    }

    cache.delete(url)
    // If we don't do this, nock will fail because a second network
    // request became necessary.
    nock(origin).get(pathname).reply(200, { cool: true })

    const data = await getRemoteJSON(url, {})
    expect(data.cool).toBe(true)
  })

  test('not-actually JSON despite URL', async () => {
    const url = 'http://example.com/might-look-like.json'
    const { origin, pathname } = new URL(url)
    nock(origin).get(pathname).reply(200, '<html>here</html>', {
      'Content-Type': 'text/html',
    })
    await expect(getRemoteJSON(url, {})).rejects.toThrowError(/resulted in a non-JSON response/)
  })
})