// docs/lib/get-data.js

import fs from 'fs'
import path from 'path'

import yaml from 'js-yaml'
import matter from 'gray-matter'
import { merge, get } from 'lodash-es'

import languages from './languages.js'
import { correctTranslatedContentStrings } from './correct-translation-content.js'

// Debug switch: run `export DEBUG_JIT_DATA_READS=true` in your terminal and
// every file this module reads from disk gets logged.
// The environment value is parsed as JSON and then coerced to a boolean;
// when the variable is unset or empty it is treated as 'false'.
const DEBUG_JIT_DATA_READS = Boolean(
  JSON.parse(process.env.DEBUG_JIT_DATA_READS ? process.env.DEBUG_JIT_DATA_READS : 'false'),
)

// Yaml files for which we always, immediately, fall back to English.
// Keeping this list is safer than trying to wrangle the translations
// into NOT translating these files.
const ALWAYS_ENGLISH_YAML_FILES = new Set()
  .add('data/variables/product.yml')
  .add('data/variables/release_candidate.yml')

// Returns everything found inside the data directory that `dottedPath`
// points at, read from the given language's root directory.
// Throws if `langCode` is not a key of `languages`.
export const getDeepDataByLanguage = memoize((dottedPath, langCode) => {
  const isKnownLanguage = langCode in languages
  if (!isKnownLanguage) {
    throw new Error(`langCode '${langCode}' not a recognized language code`)
  }
  return getDeepDataByDir(dottedPath, languages[langCode].dir)
})

// Doesn't need to be memoized because it's used by getDeepDataByLanguage
// which is already memoized.
// Recursively reads a whole data directory into a nested plain object.
//
//   dottedPath - dotted reference below `data/`; each dot-separated segment
//                is one directory level (e.g. 'release-notes.enterprise-server')
//   dir        - root directory to read from
//
// In the returned object:
//   - a sub-directory becomes a nested object keyed by the directory name
//   - a `.yml` file becomes its parsed content keyed by the name without `.yml`
//   - a `.md` file becomes its Markdown content keyed by the full file name
//   - `README.md` entries are skipped
// Throws on any other kind of file.
function getDeepDataByDir(dottedPath, dir) {
  const startPath = ['data', ...dottedPath.split(/\./g)].join(path.sep)

  // Recurse on the relative path rather than on a re-built dotted path.
  // A sub-directory whose name contains a dot (e.g. `version-3.4`) would
  // otherwise be split into two path segments on the way down and fail
  // to be found on disk.
  const readDirectory = (relPath) => {
    const things = {}
    for (const dirent of getDirents(dir, relPath)) {
      const { name } = dirent
      if (name === 'README.md') continue
      if (dirent.isDirectory()) {
        things[name] = readDirectory([relPath, name].join(path.sep))
      } else if (name.endsWith('.yml')) {
        // e.g. '3-5.yml' => '3-5'  or '0-rc2.yml' => '0-rc2'
        things[name.replace(/\.yml$/, '')] = getYamlContent(dir, path.join(relPath, name))
      } else if (name.endsWith('.md')) {
        things[name] = getMarkdownContent(dir, path.join(relPath, name))
      } else {
        throw new Error(`don't know how to read '${name}'`)
      }
    }
    return things
  }

  return readDirectory(startPath)
}

// Lists the entries (as `fs.Dirent` objects) of the directory `relPath`,
// resolved against `root` when `root` is truthy and used as-is otherwise.
function getDirents(root, relPath) {
  let directory = relPath
  if (root) {
    directory = path.join(root, relPath)
  }
  return fs.readdirSync(directory, { withFileTypes: true })
}

// Returns the UI strings for `langCode` with English filling in anything
// the translation lacks.
export const getUIDataMerged = memoize((langCode) => {
  const englishStrings = getUIData('en')
  if (langCode !== 'en') {
    // Layer the translation on top of a copy of the English strings.
    // E.g.
    //    english  = {food: "Food", drink: "Drink"}
    //    swedish  = {food: "Mat"}
    //    =>
    //    combined = {food: "Mat", drink: "Drink"}
    return merge({}, englishStrings, getUIData(langCode))
  }
  return englishStrings
})

// Reads and parses `data/ui.yml` from the root directory of `langCode`.
// Not memoized itself: its caller is memoized, and so is getYamlContent().
const getUIData = (langCode) => {
  const { dir } = languages[langCode]
  const relPath = ['data', 'ui.yml'].join(path.sep)
  return getYamlContent(dir, relPath)
}

// Resolves a dotted data reference for a language, falling back to English.
//
//   dottedPath - the reference, e.g. 'variables.product.prodname_dotcom'
//   langCode   - a key of `languages`
//
// Returns the value, or `undefined` if the underlying file does not exist.
// Throws if `langCode` is unknown, and always throws on Yaml parse errors
// when the file being parsed is English.
export const getDataByLanguage = memoize((dottedPath, langCode) => {
  if (!(langCode in languages))
    throw new Error(`langCode '${langCode}' not a recognized language code`)
  const { dir } = languages[langCode]
  const englishRoot = languages.en.dir

  try {
    const value = getDataByDir(dottedPath, dir, englishRoot)

    // What could happen is that a new key has only been added to
    // the English data/ui.yml but hasn't been added to Japanese, but
    // there nevertheless exists a Japanese `data/ui.yml`.
    // Since getDataByDir() uses `get(dataObject, 'dotted.path')` it
    // will return `undefined` if it's not present.
    // If this happens, we can't rely on `err.code === 'ENOENT'` to
    // fall back the English one. So we just start over using the English data.
    if (value === undefined && langCode !== 'en') {
      // Pass the English root as `englishRoot` too, so the content is
      // treated as English (not as a translation) and so the file readers
      // are called with the same arguments as a plain English lookup.
      return getDataByDir(dottedPath, englishRoot, englishRoot)
    }
    return value
  } catch (error) {
    if (error instanceof Error && error.mark && error.message) {
      // It's a yaml.load() generated error!
      // Remember, the file that we read might have been a .yml or a .md
      // file. If it was a .md file, with corrupt front-matter that too
      // would have caused a YAMLException
      if (langCode !== 'en') {
        if (DEBUG_JIT_DATA_READS) {
          console.warn(`Unable to parse Yaml in (${langCode}) '${dottedPath}': ${error.message}`)
        }
        // Give it one more chance, but use English this time
        return getDataByDir(dottedPath, englishRoot, englishRoot)
      }
      // Always throw English Yaml reading errors. Staff writers
      // need to know early and explicitly that they are corrupt.
      throw error
    }

    // A missing file is an expected absence, not a failure.
    if (error.code === 'ENOENT') return undefined
    throw error
  }
})

// Resolves a dotted data reference (e.g. `reusables.foo.bar` or
// `variables.product.some_key`) by reading the matching file below `data/`.
//
//   dottedPath  - the reference as written in `{% data ... %}`
//   dir         - root directory of the language to read from
//   englishRoot - root directory of the English content. It is handed to the
//                 file readers and compared against `dir` to decide whether
//                 a reusable is a translation that needs correcting.
//
// Returns the value found, or `undefined` when the file could be read but
// doesn't hold the requested key.
// Throws if the reference doesn't start with one of the known data kinds.
// Errors from the file readers are not caught here.
function getDataByDir(dottedPath, dir, englishRoot) {
  const fullPath = ['data']

  // Using English here because it doesn't matter. We just want to
  // figure out how to turn `foo.version-3.4.deeper.key' into
  // `['foo', 'version-3.4', 'deeper', 'key']` here and we'll need
  // any directory to do that and English is always the most up-to-date.
  // We need the getSmartSplit() as long as there's a chance that a
  // directory or file inside data/ might contain a dot in the name,
  // however the exception is the file names in data/release-notes/**/*.yml
  // because it contains files that are just numbers like 3-7/0.yml and
  // that can cause problems inside getSmartSplit().
  const split = dottedPath.startsWith('release-notes')
    ? dottedPath.split('.')
    : getSmartSplit(dottedPath)

  // For early-access data stuff, they're referred to as...
  //
  //   {% data early-access.reusables.foo.bar %}
  //
  // When we "merge" in the early-access data, we put the whole directory
  // within the root `data/` so it exists, on disk, as
  //
  //   data/early-access/reusables/foo/bar.md
  //
  if (split[0] === 'early-access') {
    fullPath.push(split.shift())
  }
  const first = split[0]

  // E.g. {% data variables.product.some_key %}
  // The last segment is the key, the one before it is the Yaml file.
  if (first === 'variables') {
    const key = split.pop()
    const basename = split.pop()
    fullPath.push(...split)
    fullPath.push(`${basename}.yml`)
    const allData = getYamlContent(dir, fullPath.join(path.sep), englishRoot)
    if (allData) {
      const value = allData[key]
      if (value) {
        return matter(value).content
      }
    } else {
      console.warn(`Unable to find variables Yaml file ${fullPath.join(path.sep)}`)
    }
    return
  }

  // E.g. {% data reusables.foo.bar %}
  // The last segment is the Markdown file name without its extension.
  if (first === 'reusables') {
    const nakedname = split.pop()
    fullPath.push(...split)
    fullPath.push(`${nakedname}.md`)
    const markdown = getMarkdownContent(dir, fullPath.join(path.sep), englishRoot)
    let { content } = matter(markdown)
    if (dir !== englishRoot) {
      // If we're reading a translation, we need to replace the possible
      // corruptions. For example `[AUTOTITLE"을](/foo/bar)`.
      // To do this we'll need the English equivalent
      let englishContent = content
      try {
        englishContent = getMarkdownContent(englishRoot, fullPath.join(path.sep), englishRoot)
      } catch (error) {
        // In some real but rare cases a reusable doesn't exist in English.
        // At all.
        // This can happen when the translation is really out of date.
        // You might have an old `docs-internal.locale/content/**/*.md`
        // file that mentions `{% data reusables.foo.bar %}`. And it's
        // working fine, except none of that exists in English.
        // If this is the case, we still want to execute the
        // correctTranslatedContentStrings() function, but we can't
        // genuinely give it the English equivalent content, which it
        // sometimes uses to correct some Liquid tags. At least other
        // good corrections might happen.
        if (error.code !== 'ENOENT') {
          throw error
        }
      }
      content = correctTranslatedContentStrings(content, englishContent)
    }
    return content
  }

  // E.g. {% data ui.pages.foo.bar %}
  if (first === 'ui') {
    const basename = split.shift() // i.e. 'ui'
    fullPath.push(`${basename}.yml`)
    const allData = getYamlContent(dir, fullPath.join(path.sep), englishRoot)
    return get(allData, split.join('.'))
  }

  // These kinds return the whole parsed Yaml file.
  if (first === 'product-examples' || first === 'glossaries' || first === 'release-notes') {
    const basename = split.pop()
    fullPath.push(...split)
    fullPath.push(`${basename}.yml`)
    return getYamlContent(dir, fullPath.join(path.sep), englishRoot)
  }

  // The last segment is the key, the one before it is the Yaml file.
  if (first === 'learning-tracks') {
    const key = split.pop()
    const basename = split.pop()
    fullPath.push(...split)
    fullPath.push(`${basename}.yml`)
    const allData = getYamlContent(dir, fullPath.join(path.sep), englishRoot)
    // A Yaml file without any content parses to nothing at all. Treat that
    // like a missing key rather than crashing on the property lookup.
    return allData == null ? undefined : allData[key]
  }

  throw new Error(`Can't find the key '${dottedPath}' in the scope.`)
}

// Splits a dotted path on '.', except that a dot sitting between a segment
// that ends in a digit and a segment that starts with a digit is kept.
// E.g. 'foo.version-3.4.deeper.key' => ['foo', 'version-3.4', 'deeper', 'key']
// At most two neighbouring segments are glued together per pairing, so
// '3.4.5' becomes ['3.4', '5'].
function getSmartSplit(dottedPath) {
  const parts = dottedPath.split('.')
  const merged = []
  let index = 0
  while (index < parts.length) {
    const current = parts[index]
    const isLast = index === parts.length - 1
    if (!isLast && /\d$/.test(current) && /^\d/.test(parts[index + 1])) {
      // Consume this segment and the next one as a single bit.
      merged.push(`${current}.${parts[index + 1]}`)
      index += 2
    } else {
      merged.push(current)
      index += 1
    }
  }
  return merged
}

// The reason this is memoized, even though the parent caller function
// (`getDataByLanguage`) is also memoized is because we might read
// the same file for two different keys. E.g.
//
//    getDataByLanguage('variables.product.prodname_ghe_server', 'en')
//    getDataByLanguage('variables.product.company_short', 'en')
//
// ...will actually depend on reading `data/variables/product.yml`. Twice.
// Well, actually not twice because we cache the disk reading. So the outcome
// becomes this:
//
//    1. getDataByLanguage('variables.product.prodname_ghe_server', 'en')
//      -> cache MISS
//        1.1. read and parse data/variables/product.yml
//          -> cache MISS
//    2. getDataByLanguage('variables.product.company_short', 'en')
//      -> cache MISS
//        2.1. read and parse data/variables/product.yml
//          -> cache HIT    (Yay!)
//
const getYamlContent = memoize((root, relPath, englishRoot) => {
  // Certain Yaml files we know we always want the English one
  // no matter what the specified language is.
  // For example, we never want `data/variables/product.yml` translated
  // so we know to immediately fall back to the English one.
  //
  // The entries in ALWAYS_ENGLISH_YAML_FILES are written with forward
  // slashes, whereas `relPath` is assembled with the platform's separator,
  // so normalize before looking it up. Otherwise the rule never matches
  // on platforms where `path.sep` isn't '/'.
  const portableRelPath = relPath.split(path.sep).join('/')
  // An empty root forces it to read from English
  const readRoot = ALWAYS_ENGLISH_YAML_FILES.has(portableRelPath) ? '' : root
  const fileContent = getFileContent(readRoot, relPath, englishRoot)
  // `filename` is only handed to yaml.load() as an option
  return yaml.load(fileContent, { filename: relPath })
})

// Memoized for the same reason as getYamlContent(): several keys can end
// up reading the very same file.
// Returns the file's content without its front-matter and without
// trailing whitespace.
const getMarkdownContent = memoize((root, relPath, englishRoot) => {
  const raw = getFileContent(root, relPath, englishRoot)
  const { content } = matter(raw)
  return content.trimEnd()
})

// Reads `relPath` (resolved against `root` when `root` is truthy) as UTF-8.
// If the file doesn't exist and `root` isn't the English root, the read is
// retried against `englishRoot`. Any other error, or a miss in English
// itself, is re-thrown.
const getFileContent = (root, relPath, englishRoot) => {
  let filePath = relPath
  if (root) {
    filePath = path.join(root, relPath)
  }
  if (DEBUG_JIT_DATA_READS) console.log('READ', filePath)

  try {
    return fs.readFileSync(filePath, 'utf-8')
  } catch (err) {
    // It might fail because that particular data entry doesn't yet
    // exist in a translation. Only in that case is there a second chance.
    const missingInTranslation = err.code === 'ENOENT' && root !== englishRoot
    if (!missingInTranslation) {
      throw err
    }
    // Try again, but this time using the English files
    return getFileContent(englishRoot, relPath, englishRoot)
  }
}

// Wraps `func` so that its result is cached per distinct set of arguments.
//
// The cache key is the arguments joined with ':', so arguments are
// distinguished by their string form only.
// When NODE_ENV is 'development' nothing is cached at all and `func` is
// called every time.
// Cached arrays and objects are handed out as shallow copies; every other
// value (including `null` and `undefined`) is returned as is.
function memoize(func) {
  const cache = new Map()
  return (...args) => {
    if (process.env.NODE_ENV === 'development') {
      // It is very possible that certain files, when caching is disabled,
      // are read multiple times in short succession. E.g. `product.yml`.
      // So how expensive is it to read these files excessively?
      // To answer that, we benchmarked it by sampling 10 files from the
      // most common files that are used from `data/`. In fact, we ran 100
      // runs of 10 *different* files. About 80% of them were `.yml` files.
      // As a median, it takes **0.5ms to read 10 files from disk**
      // all in a sync manner.
      // Since most files coming through here are `.yml` files (e.g.
      // product.yml and ui.yml) if you also do the `yaml.load()` of the
      // read content, that number becomes **2.1ms to read and parse 10 files**.
      // So in conclusion, not a lot of time.
      return func(...args)
    }

    const key = args.join(':')
    if (!cache.has(key)) {
      cache.set(key, func(...args))
    }
    const value = cache.get(key)
    // If what was stored in the cache is a mutable, this time, return
    // a shallow copy.
    // Otherwise, what *might* happen is this:
    //
    //   > const getNames = memoize(() => ["peter", "tucker"])
    //   > var names = getNames()
    //   > names.push("ashley")
    //   > var names2 = getNames()
    //   > names2.push("charlotte")
    //   > console.log(names2)
    //
    //   ["peter", "tucker", "ashley", "charlotte"]
    //
    // Note that these are shallow copies only.
    if (Array.isArray(value)) return [...value]
    // `typeof null` is 'object' too, and spreading `null` yields `{}`.
    // Without the explicit check a cached `null` would come back as an
    // empty object.
    if (value !== null && typeof value === 'object') return { ...value }
    return value
  }
}