docs/script/search/parse-page-sections-into-re...

#!/usr/bin/env node
import { chain } from 'lodash-es'
import { maxContentLength } from '../../lib/search/config.js'
// This module takes cheerio page object and divides it into sections
// using H1,H2 heading elements as section delimiters. The text
// that follows each heading becomes the content of the search record.

const urlPrefix = 'https://docs.github.com'
const ignoredHeadingSlugs = ['in-this-article', 'further-reading']

export default function parsePageSectionsIntoRecords(href, $) {
  const title = $('h1').text().trim()
  const breadcrumbsArray = $('nav.breadcrumbs a')
    .map((i, el) => {
      return $(el).text().trim().replace(/\n/g, ' ').replace(/\s+/g, ' ')
    })
    .get()
    .slice(0, -1)

  const breadcrumbs = breadcrumbsArray.join(' / ') || ''
  const metaKeywords = $('meta[name="keywords"]').attr('content')
  const topics = metaKeywords ? metaKeywords.split(',') : []

  const productName = breadcrumbsArray[0] || ''
  topics.push(productName)
  // Remove "github" to make filter queries shorter
  if (productName.includes('GitHub ')) {
    topics.push(productName.replace('GitHub ', ''))
  }

  let records

  const $sections = $('[data-search=article-content] h2')
    .filter('[id]')
    .filter((i, el) => {
      return !ignoredHeadingSlugs.includes($(el).attr('id'))
    })

  if ($sections.length > 0) {
    records = $sections
      .map((i, el) => {
        const heading = $(el).text().trim()
        const slug = $(el).attr('id')
        const objectID = [href, slug].join('#')
        const url = [urlPrefix, objectID].join('')
        const content = $(el)
          // Platform-specific content is nested in a DIV
          // GraphQL content in nested in two DIVS
          .nextUntil('h2, div > h2, div > div > h2')
          .map((i, el) => $(el).text())
          .get()
          .join(' ')
          .trim()
          .slice(0, maxContentLength)
        return {
          objectID,
          url,
          slug,
          breadcrumbs,
          heading,
          title,
          content,
          topics,
        }
      })
      .get()
  } else {
    // There are no sections. Treat the entire article as the record.
    const objectID = href
    const url = [urlPrefix, objectID].join('')
    const content = $(
      '[data-search=article-body] p, [data-search=article-body] ul, [data-search=article-body] ol, [data-search=article-body] table'
    )
      .map((i, el) => $(el).text())
      .get()
      .join(' ')
      .trim()
      .slice(0, maxContentLength)

    records = [
      {
        objectID,
        url,
        breadcrumbs,
        title,
        content,
        topics,
      },
    ]
  }

  return chain(records).uniqBy('objectID').value()
}