зеркало из https://github.com/github/docs.git
94 строки
2.7 KiB
JavaScript
94 строки
2.7 KiB
JavaScript
#!/usr/bin/env node
|
|
import { chain } from 'lodash-es'
|
|
import { maxContentLength } from '../../lib/search/config.js'
|
|
// This module takes a cheerio page object and divides it into sections,
// using H2 heading elements as section delimiters. The text that follows
// each heading becomes the content of the search record.
|
|
|
|
// Origin that is prepended to a record's objectID to build its absolute URL.
const urlPrefix = 'https://docs.github.com'

// `id` values of H2 headings that never become a search record of their own.
const ignoredHeadingSlugs = [
  'in-this-article',
  'further-reading',
]
|
|
|
|
/**
 * Split a rendered docs page into search records.
 *
 * Every H2 inside `[data-search=article-content]` that has an `id` attribute
 * (and whose id is not listed in `ignoredHeadingSlugs`) starts a section: the
 * text of the sibling elements that follow the heading, up to the next H2,
 * becomes the content of that section's record. A page without any such
 * heading produces a single record built from the paragraphs, lists and
 * tables found in `[data-search=article-body]`.
 *
 * @param {string} href - Path of the page. It is the base of every `objectID`
 *   and is appended to `urlPrefix` to form the record `url`.
 * @param {Function} $ - cheerio instance loaded with the page HTML.
 * @returns {Array<object>} Records with unique `objectID`s. Section records
 *   carry `objectID`, `url`, `slug`, `breadcrumbs`, `heading`, `title`,
 *   `content` and `topics`; the whole-page record has no `slug` or `heading`.
 */
export default function parsePageSectionsIntoRecords(href, $) {
  const title = $('h1').text().trim()

  // The last breadcrumb link is dropped (presumably it is the current page —
  // confirm against the breadcrumb markup).
  const breadcrumbsArray = $('nav.breadcrumbs a')
    .map((i, el) => $(el).text().trim().replace(/\s+/g, ' '))
    .get()
    .slice(0, -1)
  const breadcrumbs = breadcrumbsArray.join(' / ')

  // Keywords come from a comma-separated meta tag; strip surrounding
  // whitespace and drop empty entries so they never end up as topics.
  const metaKeywords = $('meta[name="keywords"]').attr('content')
  const topics = (metaKeywords || '')
    .split(',')
    .map((keyword) => keyword.trim())
    .filter(Boolean)

  // The first breadcrumb is used as the product name. Only add it when the
  // page actually has breadcrumbs, otherwise an empty topic would be indexed.
  const productName = breadcrumbsArray[0] || ''
  if (productName) {
    topics.push(productName)
    // Remove "github" to make filter queries shorter
    if (productName.includes('GitHub ')) {
      topics.push(productName.replace('GitHub ', ''))
    }
  }

  const $sections = $('[data-search=article-content] h2')
    .filter('[id]')
    .filter((i, el) => !ignoredHeadingSlugs.includes($(el).attr('id')))

  if ($sections.length === 0) {
    // There are no sections. Treat the entire article as the record.
    const $body = $(
      '[data-search=article-body] p, [data-search=article-body] ul, [data-search=article-body] ol, [data-search=article-body] table'
    )
    return [
      {
        objectID: href,
        url: `${urlPrefix}${href}`,
        breadcrumbs,
        title,
        content: extractContent($, $body),
        topics,
      },
    ]
  }

  const records = $sections
    .map((i, el) => {
      const $heading = $(el)
      const slug = $heading.attr('id')
      const objectID = `${href}#${slug}`
      // Platform-specific content is nested in a DIV
      // GraphQL content is nested in two DIVs
      const $sectionBody = $heading.nextUntil('h2, div > h2, div > div > h2')

      return {
        objectID,
        url: `${urlPrefix}${objectID}`,
        slug,
        breadcrumbs,
        heading: $heading.text().trim(),
        title,
        content: extractContent($, $sectionBody),
        // Each record gets its own array so that changing one record's
        // topics later cannot leak into the others.
        topics: [...topics],
      }
    })
    .get()

  // Headings that share an id would yield the same objectID; keep the first.
  return chain(records).uniqBy('objectID').value()
}

// Join the text of every element in a cheerio selection with single spaces,
// trim the result and cap it at `maxContentLength` characters.
function extractContent($, $elements) {
  return $elements
    .map((i, el) => $(el).text())
    .get()
    .join(' ')
    .trim()
    .slice(0, maxContentLength)
}
|