Mirror of https://github.com/github/docs.git
Commit 1b5e3de292

.env.example
@@ -0,0 +1,23 @@
# This file is a template for what your untracked .env file might look like for local development.
# Please copy this to a new .env file and fill in the values as needed.

# Requires a running local Elasticsearch service. Can be started via Docker, see https://github.com/github/docs-engineering/blob/main/docs/elasticsearch/elasticsearch-locally.md
# When this value is unset searches will be proxied to the production Elasticsearch endpoint
ELASTICSEARCH_URL=http://localhost:9200

# Set for sending events in local development. See https://github.com/github/docs-engineering/blob/main/docs/analytics/hydro-mock.md
HYDRO_ENDPOINT=
HYDRO_SECRET=

# Localization variables
# See https://github.com/github/docs-internal/tree/main/src/languages#working-with-translated-content-locally
ENABLED_LANGUAGES=
TRANSLATIONS_ROOT=

# For running the src/search/scripts/scrape script
# You may want a lower value depending on your CPU
BUILD_RECORDS_MAX_CONCURRENT=100
BUILD_RECORDS_MIN_TIME=

# Set to true to enable the /fastly-cache-test route for debugging Fastly headers
ENABLE_FASTLY_TESTING=
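The comments in the template above describe the intended local setup. As a minimal sketch (the `cp` step follows the template's own instructions; the Docker command and image tag are illustrative assumptions, not part of this commit):

```bash
# Copy the template to the untracked .env file and fill in values as needed
cp .env.example .env

# Optionally start a local Elasticsearch to back ELASTICSEARCH_URL=http://localhost:9200
# (single-node, security disabled; the image tag here is only an example)
docker run -d -p 9200:9200 \
  -e "discovery.type=single-node" \
  -e "xpack.security.enabled=false" \
  docker.elastic.co/elasticsearch/elasticsearch:8.13.0
```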
@@ -1,7 +1,7 @@
name: Index autocomplete Elasticsearch
name: Index autocomplete search in Elasticsearch

# **What it does**: Indexes autocomplete data into Elasticsearch.
# **Why we have it**: So we can power the API for autocomplete.
# **What it does**: Indexes autocomplete data (general and AI search) into Elasticsearch.
# **Why we have it**: So we can power the APIs for autocomplete.
# **Who does it impact**: docs-engineering

on:
@@ -10,7 +10,7 @@ on:
- cron: '20 16 * * *' # Run every day at 16:20 UTC / 8:20 PST
pull_request:
paths:
- .github/workflows/index-autocomplete-elasticsearch.yml
- .github/workflows/index-autocomplete-search.yml
- 'src/search/scripts/index/**'
- 'package*.json'
@@ -40,10 +40,15 @@ jobs:
if: ${{ github.event_name == 'pull_request' }}
run: curl --fail --retry-connrefused --retry 5 -I http://localhost:9200

- name: Run indexing
- name: Run general auto-complete indexing
env:
ELASTICSEARCH_URL: ${{ github.event_name == 'pull_request' && 'http://localhost:9200' || secrets.ELASTICSEARCH_URL }}
run: npm run index -- autocomplete docs-internal-data
run: npm run index-general-autocomplete -- docs-internal-data

- name: Run AI search auto-complete indexing
env:
ELASTICSEARCH_URL: ${{ github.event_name == 'pull_request' && 'http://localhost:9200' || secrets.ELASTICSEARCH_URL }}
run: npm run index-ai-search-autocomplete -- docs-internal-data

- uses: ./.github/actions/slack-alert
if: ${{ failure() && github.event_name == 'schedule' }}
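For reference, the two renamed indexing steps above reduce to the following commands. Run locally they would point at the Elasticsearch from `.env.example`; this is a sketch only, and assumes a `docs-internal-data` checkout in the working directory as the workflow uses:

```bash
# Point the indexing scripts at the local Elasticsearch (as the PR path of the workflow does)
export ELASTICSEARCH_URL=http://localhost:9200

# Index general autocomplete records, then AI search autocomplete records
npm run index-general-autocomplete -- docs-internal-data
npm run index-ai-search-autocomplete -- docs-internal-data
```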
@@ -1,6 +1,6 @@
name: Sync search - PR
name: Index general search in Elasticsearch on PR

# **What it does**: This does what `sync-sarch-elasticsearch.yml` does but
# **What it does**: This does what `index-general-search-elasticsearch.yml` does but
# with a localhost Elasticsearch and only for English.
# **Why we have it**: To test that the script works and the popular pages json is valid.
# **Who does it impact**: Docs engineering
@@ -11,8 +11,8 @@ on:
paths:
- 'src/search/**'
- 'package*.json'
# Ultimately, for debugging this workflow itself
- .github/workflows/sync-search-pr.yml
# For debugging this workflow
- .github/workflows/index-general-search-pr.yml
# Make sure we run this if the composite action changes
- .github/actions/setup-elasticsearch/action.yml
@@ -25,9 +25,6 @@ concurrency:
cancel-in-progress: true

env:
# Yes, it's hardcoded but it makes all the steps look exactly the same
# as they do in `sync-search-elasticsearch.yml` where it uses
# that `${{ env.ELASTICSEARCH_URL }}`
ELASTICSEARCH_URL: http://localhost:9200
# Since we'll run in NDOE_ENV=production, we need to be explicit that
# we don't want Hydro configured.
@@ -63,7 +60,7 @@
env:
ENABLE_DEV_LOGGING: false
run: |
npm run sync-search-server > /tmp/stdout.log 2> /tmp/stderr.log &
npm run general-search-scrape-server > /tmp/stdout.log 2> /tmp/stderr.log &

# first sleep to give it a chance to start
sleep 6
@@ -88,15 +85,13 @@ jobs:
# let's just accept an empty string instead.
THROW_ON_EMPTY: false

# The sync-search-index recognizes this env var if you don't
# use the `--docs-internal-data <PATH>` option.
DOCS_INTERNAL_DATA: docs-internal-data

run: |
mkdir /tmp/records
npm run sync-search-indices -- /tmp/records \
npm run general-search-scrape -- /tmp/records \
--language en \
--version dotcom
--version fpt

ls -lh /tmp/records
@@ -106,9 +101,9 @@ jobs:

- name: Index into Elasticsearch
run: |
npm run index-elasticsearch -- /tmp/records \
npm run index-general-search -- /tmp/records \
--language en \
--version dotcom
--version fpt

- name: Check created indexes and aliases
run: |
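Taken together, the renamed steps in this workflow form a two-stage pipeline: scrape records into a temp directory, then index them into Elasticsearch. A local approximation, using only the commands and flags shown in the hunks above (a sketch, not a supported entry point):

```bash
mkdir -p /tmp/records

# Stage 1: scrape English records for the fpt version into /tmp/records
npm run general-search-scrape -- /tmp/records \
  --language en \
  --version fpt

# Stage 2: index the scraped records into the local Elasticsearch
ELASTICSEARCH_URL=http://localhost:9200 npm run index-general-search -- /tmp/records \
  --language en \
  --version fpt
```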
@@ -1,4 +1,4 @@
name: Sync search Elasticsearch
name: Index general search in Elasticsearch

# **What it does**: It scrapes the whole site and dumps the records in a
# temp directory. Then it indexes that into Elasticsearch.
@@ -140,7 +140,7 @@ jobs:
env:
ENABLE_DEV_LOGGING: false
run: |
npm run sync-search-server > /tmp/stdout.log 2> /tmp/stderr.log &
npm run general-search-scrape-server > /tmp/stdout.log 2> /tmp/stderr.log &

# first sleep to give it a chance to start
sleep 6
@@ -169,13 +169,11 @@ jobs:
# the same as not set within the script.
VERSION: ${{ inputs.version }}

# The sync-search-index recognizes this env var if you don't
# use the `--docs-internal-data <PATH>` option.
DOCS_INTERNAL_DATA: docs-internal-data

run: |
mkdir /tmp/records
npm run sync-search-indices -- /tmp/records \
npm run general-search-scrape -- /tmp/records \
--language ${{ matrix.language }}

ls -lh /tmp/records
@@ -186,12 +184,12 @@

- name: Index into Elasticsearch
env:
# Must match what we used when scraping (npm run sync-search-indices)
# Must match what we used when scraping (npm run general-search-scrape)
# otherwise the script will seek other versions from disk that might
# not exist.
VERSION: ${{ inputs.version }}
run: |
npm run index-elasticsearch -- /tmp/records \
npm run index-general-search -- /tmp/records \
--language ${{ matrix.language }} \
--stagger-seconds 5 \
--retries 5
@@ -51,3 +51,9 @@ assets/images/help/writing/unordered-list-rendered (1).png

# Used by precompute-pageinfo
.pageinfo-cache.json.br

# Cloned and used for indexing Elasticsearch data
docs-internal-data/

# For intermediate data (like scraping for Elasticsearch indexing)
tmp/
@@ -212,3 +212,30 @@ If your appliance averages more than 70% CPU utilization, {% data variables.prod

As part of upgrading GitHub Enterprise Server to version 3.13 or later, the Elasticsearch service will be upgraded. {% data variables.product.company_short %} strongly recommends following the guidance in "[AUTOTITLE](/admin/upgrading-your-instance/performing-an-upgrade/preparing-for-the-elasticsearch-upgrade)."
{% endif %}

{% ifversion ghes > 3.12 and ghes < 3.15 %}

## Undecryptable records

If you are upgrading from {% data variables.product.prodname_ghe_server %} 3.11 or 3.12 to 3.13, or from 3.12 to 3.14, you may run into an issue with undecryptable records due to missing required keys for decryption. The only solution is to delete the undecryptable records. The type of records impacted by this issue are 2FA records, that means you might need to ask users to re-enable two-factor authentication (2FA).

### Before upgrading

If you are upgrading from {% data variables.product.prodname_ghe_server %} 3.11 or 3.12 to 3.13, or from 3.12 to 3.14, you can run the encryption diagnostics script to identify the undecryptable records ahead of time. This will give you the opportunity to understand the impact and plan for it.

1. Download the [encryption diagnostics script](https://gh.io/ghes-encryption-diagnostics). You can use a command like `curl -L -O https://gh.io/ghes-encryption-diagnostics` to download the script.
1. Save the script to the `/data/user/common` directory on the appliance.
1. Follow the instructions at the top of the script and execute it on the appliance. If there are any undecryptable records, they are logged in `/tmp/column_encryption_records_to_be_deleted.log`. Any records logged here means that the system was not able to find the keys for them and hence was not able to decrypt the data in those records.

At this stage, please note that these records will be deleted as part of the process. The script will warn you about the users who will need to re-enroll into 2FA after the upgrade. The impacted users' handles are logged in `/tmp/column_encryption_users_to_have_2fa_disabled.log`. These users will need to be re-enrolled into 2FA.

If the script runs into unexpected issues, you will be prompted to [contact {% data variables.contact.github_support %}](/support/contacting-github-support). Errors related to these issues will be logged in `/tmp/column_encryption_unexpected_errors.log`. If you are in a dire situation and are unable to have users re-enroll into 2FA, [contact {% data variables.contact.github_support %}](/support/contacting-github-support) for help.
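Condensed into commands, the pre-upgrade check described above looks roughly like this (a sketch; the authoritative invocation is documented at the top of the script itself, and the `mv` and `less` steps are illustrative):

```bash
# Download the encryption diagnostics script and place it on the appliance
curl -L -O https://gh.io/ghes-encryption-diagnostics
mv ghes-encryption-diagnostics /data/user/common/

# After running the script per its embedded instructions, review the output logs
less /tmp/column_encryption_records_to_be_deleted.log       # records that would be deleted
less /tmp/column_encryption_users_to_have_2fa_disabled.log  # users who must re-enroll in 2FA
less /tmp/column_encryption_unexpected_errors.log           # unexpected errors, if any
```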

### During the upgrade

In case you did not have the opportunity to run the encryption diagnostics script ahead of time, there are mechanisms in the product to help you. The pre-flight checks during the upgrade process will detect undecryptable records and log them in `/tmp/column_encryption_records_to_be_deleted.log`. The sequence will warn you of the users who will need to re-enable 2FA after the upgrade. The impacted users records are logged in `/tmp/column_encryption_users_to_have_2fa_disabled.log`.

If undecryptable records are detected, you will be prompted whether you want to proceed with the upgrade or not. If you proceed, the upgrade process deletes the undecryptable records. Otherwise, the upgrade process will exit.

If you have any questions during the upgrade, you can reach out to {% data variables.contact.github_support %}. Once you have had the time and opportunity to understand the impact, you can retrigger the upgrade.
{% endif %}
@@ -80,6 +80,12 @@ For example, you link your Azure subscription to your organization {% ifversion

* You must know your Azure subscription ID. See [Get subscription and tenant IDs in the Azure portal](https://learn.microsoft.com/en-us/azure/azure-portal/get-subscription-tenant-id) in the Microsoft Docs or [contact Azure support](https://azure.microsoft.com/support/).

## Video demonstration of connecting a subscription

To connect an Azure subscription, you'll need appropriate access permissions on both {% data variables.product.product_name %} and the Azure billing portal. This may require coordination between two different people.

To see a demo of the process from beginning to end, see [Billing GitHub consumption through an Azure subscription](https://www.youtube.com/watch?v=Y-f7JKJ4_8Y) on {% data variables.product.company_short %}'s YouTube channel. This video demonstrates the process for an enterprise account. If you're connecting a subscription to an organization account, see "[Connecting your Azure subscription to your organization account](/free-pro-team@latest/billing/managing-the-plan-for-your-github-account/connecting-an-azure-subscription#connecting-your-azure-subscription-to-your-organization-account)."

{% ifversion fpt %}

## Connecting your Azure subscription to your organization account
@@ -35,7 +35,7 @@ Generate end-user query help from .qhelp files.

### Primary Options

#### `<qhelp|mdhelp|query|dir|suite>...`
#### `<qhelpquerysuite>...`

\[Mandatory] Query help files to render. Each argument is one of:
@@ -3,7 +3,7 @@ title: Transcript - "Billing GitHub consumption through an Azure subscription"
intro: Audio and visual transcript.
shortTitle: Billing through Azure
allowTitleToDifferFromFilename: true
product_video: 'https://www.youtube.com/watch?v=DAiIhJKCt8s'
product_video: 'https://www.youtube.com/watch?v=Y-f7JKJ4_8Y'
topics:
- Transcripts
versions:
@@ -27,7 +27,9 @@ And finally, if a Microsoft customer has an Azure discount, it will automaticall

If a Microsoft customer also has a Microsoft Azure Consumption Commitment, or MACC, all future GitHub consumption will decrement their MACC as well.

So what GitHub products are eligible for Azure billing? Any GitHub consumption products are eligible today, meaning products that customers pay for based on actual usage, including Copilot for Business, GitHub-hosted actions, larger hosted runners, GitHub Packages and storage, and GitHub Codespaces. Please note that GitHub Enterprise and GitHub Advanced Security are currently not able to be billed through Azure, but are instead invoiced on an annual basis.
So what GitHub products are eligible for Azure billing? Any GitHub consumption products are eligible today, meaning products that customers pay for based on actual usage, including things like GitHub Copilot, GitHub-hosted actions, larger hosted runners, GitHub Packages and storage, and GitHub Codespaces.

Historically, GitHub Enterprise and Advanced Security were only available through an annual license. However, as of August 1, 2024, they are now also available for metered billing through Azure, for additional flexibility and pay-as-you-go pricing. For existing licensed customers, be sure to connect with your GitHub seller to learn more, as certain restrictions may apply.

[A table shows eligibility for Azure billing and MACCs for the products mentioned. In the table, all products eligible for Azure billing are also eligible for MACCs.]
@@ -5,6 +5,8 @@ sections:
**MEDIUM:** An attacker could steal sensitive information by exploiting a Cross-Site Scripting vulnerability in the repository transfer feature. This exploitation would require social engineering. GitHub has requested CVE ID [CVE-2024-8770](https://www.cve.org/cverecord?id=CVE-2024-8770) for this vulnerability, which was reported via the [GitHub Bug Bounty program](https://bounty.github.com/).
- |
**MEDIUM:** An attacker could push a commit with changes to a workflow using a PAT or OAuth app that lacks the appropriate `workflow` scope by pushing a triple-nested tag pointing at the associated commit. GitHub has requested CVE ID [CVE-2024-8263](https://www.cve.org/cverecord?id=CVE-2024-8263) for this vulnerability, which was reported via the [GitHub Bug Bounty program](https://bounty.github.com/).
- |
**HIGH:** A GitHub App installed in organizations could upgrade some permissions from read to write access without approval from an organization administrator. An attacker would require an account with administrator access to install a malicious GitHub App. GitHub has requested [CVE ID CVE-2024-8810](https://www.cve.org/cverecord?id=CVE-2024-8810) for this vulnerability, which was reported via the [GitHub Bug Bounty Program](https://bounty.github.com/). [Updated: 2024-11-07]
bugs:
- |
For instances deployed on AWS with IMDSv2 enforced, fallback to private IPs was not successful.
@@ -0,0 +1,34 @@
date: '2024-11-07'
sections:
security_fixes:
- |
**HIGH**: An attacker could bypass SAML single sign-on (SSO) authentication with the optional encrypted assertions feature, allowing unauthorized provisioning of users and access to the instance, by exploiting an improper verification of cryptographic signatures vulnerability in GitHub Enterprise Server. This is a follow up fix for [CVE-2024-9487](https://www.cve.org/cverecord?id=CVE-2024-9487) to further harden the encrypted assertions feature against this type of attack. Please note that encrypted assertions are not enabled by default. Instances not utilizing SAML SSO, or utilizing SAML SSO authentication without encrypted assertions, are not impacted. Additionally, an attacker would require direct network access as well as a signed SAML response or metadata document to exploit this vulnerability.
known_issues:
- |
Custom firewall rules are removed during the upgrade process.
- |
During the validation phase of a configuration run, a `No such object` error may occur for the Notebook and Viewscreen services. This error can be ignored as the services should still correctly start.
- |
If the root site administrator is locked out of the Management Console after failed login attempts, the account does not unlock automatically after the defined lockout time. Someone with administrative SSH access to the instance must unlock the account using the administrative shell. For more information, see "[AUTOTITLE](/admin/configuration/administering-your-instance-from-the-management-console/troubleshooting-access-to-the-management-console#unlocking-the-root-site-administrator-account)."
- |
The `mbind: Operation not permitted` error in the `/var/log/mysql/mysql.err` file can be ignored. MySQL 8 does not gracefully handle when the `CAP_SYS_NICE` capability isn't required, and outputs an error instead of a warning.
- |
{% data reusables.release-notes.2023-11-aws-system-time %}
- |
On an instance with the HTTP `X-Forwarded-For` header configured for use behind a load balancer, all client IP addresses in the instance's audit log erroneously appear as 127.0.0.1.
- |
{% data reusables.release-notes.2023-10-git-push-made-but-not-registered %}
- |
{% data reusables.release-notes.large-adoc-files-issue %}
- |
{% data reusables.release-notes.2024-01-haproxy-upgrade-causing-increased-errors %}
- |
The `reply.[HOSTNAME]` subdomain is falsely always displaying as having no SSL and DNS record, when testing the domain settings via the Management Console without subdomain isolation.
- |
Admin stats REST API endpoints may timeout on appliances with many users or repositories. Retrying the request until data is returned is advised.
- |
{% data reusables.release-notes.2024-06-possible-frontend-5-minute-outage-during-hotpatch-upgrade %}
- |
When restoring from a backup snapshot, a large number of `mapper_parsing_exception` errors may be displayed.
- |
Services may respond with a `503` status due to an out of date `haproxy` configuration. This can usually be resolved with a `ghe-config-apply` run.
@@ -5,6 +5,8 @@ sections:
**MEDIUM:** An attacker could steal sensitive information by exploiting a Cross-Site Scripting vulnerability in the repository transfer feature. This exploitation would require social engineering. GitHub has requested CVE ID [CVE-2024-8770](https://www.cve.org/cverecord?id=CVE-2024-8770) for this vulnerability, which was reported via the [GitHub Bug Bounty program](https://bounty.github.com/).
- |
**MEDIUM:** An attacker could push a commit with changes to a workflow using a PAT or OAuth app that lacks the appropriate `workflow` scope by pushing a triple-nested tag pointing at the associated commit. GitHub has requested CVE ID [CVE-2024-8263](https://www.cve.org/cverecord?id=CVE-2024-8263) for this vulnerability, which was reported via the [GitHub Bug Bounty program](https://bounty.github.com/).
- |
**HIGH:** A GitHub App installed in organizations could upgrade some permissions from read to write access without approval from an organization administrator. An attacker would require an account with administrator access to install a malicious GitHub App. GitHub has requested [CVE ID CVE-2024-8810](https://www.cve.org/cverecord?id=CVE-2024-8810) for this vulnerability, which was reported via the [GitHub Bug Bounty Program](https://bounty.github.com/). [Updated: 2024-11-07]
bugs:
- |
For instances deployed on AWS with IMDSv2 enforced, fallback to private IPs was not successful.
@@ -0,0 +1,60 @@
date: '2024-11-07'
sections:
security_fixes:
- |
**HIGH**: An attacker could bypass SAML single sign-on (SSO) authentication with the optional encrypted assertions feature, allowing unauthorized provisioning of users and access to the instance, by exploiting an improper verification of cryptographic signatures vulnerability in GitHub Enterprise Server. This is a follow up fix for [CVE-2024-9487](https://www.cve.org/cverecord?id=CVE-2024-9487) to further harden the encrypted assertions feature against this type of attack. Please note that encrypted assertions are not enabled by default. Instances not utilizing SAML SSO, or utilizing SAML SSO authentication without encrypted assertions, are not impacted. Additionally, an attacker would require direct network access as well as a signed SAML response or metadata document to exploit this vulnerability.
- |
**HIGH**: An attacker could achieve container escape and privilege escalation to root by exploiting a path collision and arbitrary code execution via the `ghe-firejail` path. GitHub has requested CVE ID [CVE-2024-10007](https://www.cve.org/cverecord?id=CVE-2024-10007) for this vulnerability, which was reported via the [GitHub Bug Bounty program](https://bounty.github.com/).
bugs:
- |
This error message `mbind: Operation not permitted` was repeatedly showing in the `/var/log/mysql/mysql.err` MySQL logs.
- |
When saving settings in the Management Console, the configuration run would stop if the `enterprise-manage` process was restarted.
- |
A missing configuration value prevented Dependabot from creating group update pull requests.
- |
On an instance with GitHub Actions enabled, some maintenance tasks could fail due to incomplete upgrade steps during previous upgrades to new releases of GitHub Enterprise Server.
- |
The initial setup certificate generation in AWS took longer than expected due to fallback to private IPs. The time for this fallback has been reduced.
- |
If the primary instance was unreachable, running `ghe-repl-stop --force` on a replica would fail during the config apply run.
- |
When restoring from a backup, repositories that had been deleted in the last 90 days were not completely restored.
- |
Restoring Git repositories using `backup-utils` occasionally failed.
- |
Some customers upgrading from 3.11 to 3.13 may experience issues with undecryptable records during the upgrade. This issue has now been resolved. We recommend you read "[Undecryptable records](/enterprise-server@3.13/admin/upgrading-your-instance/troubleshooting-upgrades/known-issues-with-upgrades-to-your-instance#undecryptable-records)."
changes:
- |
For instances deployed on AWS, the default settings for Chrony NTP synchronization have been aligned with AWS's suggested default configurations.
known_issues:
- |
Custom firewall rules are removed during the upgrade process.
- |
During the validation phase of a configuration run, a `No such object` error may occur for the Notebook and Viewscreen services. This error can be ignored as the services should still correctly start.
- |
If the root site administrator is locked out of the Management Console after failed login attempts, the account does not unlock automatically after the defined lockout time. Someone with administrative SSH access to the instance must unlock the account using the administrative shell. See "[AUTOTITLE](/admin/configuration/administering-your-instance-from-the-management-console/troubleshooting-access-to-the-management-console#unlocking-the-root-site-administrator-account)."
- |
The `mbind: Operation not permitted` error in the `/var/log/mysql/mysql.err` file can be ignored. MySQL 8 does not gracefully handle when the `CAP_SYS_NICE` capability isn't required, and outputs an error instead of a warning.
- |
{% data reusables.release-notes.2023-11-aws-system-time %}
- |
On an instance with the HTTP `X-Forwarded-For` header configured for use behind a load balancer, all client IP addresses in the instance's audit log erroneously appear as 127.0.0.1.
- |
{% data reusables.release-notes.2023-10-git-push-made-but-not-registered %}
- |
{% data reusables.release-notes.large-adoc-files-issue %}
- |
{% data reusables.release-notes.2024-01-haproxy-upgrade-causing-increased-errors %}
- |
Repositories originally imported using `ghe-migrator` will not correctly track GitHub Advanced Security contributions.
- |
The `reply.[HOSTNAME]` subdomain is falsely always displaying as having no SSL and DNS record, when testing the domain settings via the Management Console without subdomain isolation.
- |
Admin stats REST API endpoints may timeout on appliances with many users or repositories. Retrying the request until data is returned is advised.
- |
{% data reusables.release-notes.2024-06-possible-frontend-5-minute-outage-during-hotpatch-upgrade %}
- |
When restoring from a backup snapshot, a large number of `mapper_parsing_exception` errors may be displayed.
- |
Services may respond with a `503` status due to an out of date `haproxy` configuration. This can usually be resolved with a `ghe-config-apply` run.
@@ -0,0 +1,58 @@
date: '2024-11-07'
sections:
security_fixes:
- |
**HIGH**: An attacker could bypass SAML single sign-on (SSO) authentication with the optional encrypted assertions feature, allowing unauthorized provisioning of users and access to the instance, by exploiting an improper verification of cryptographic signatures vulnerability in GitHub Enterprise Server. This is a follow up fix for [CVE-2024-9487](https://www.cve.org/cverecord?id=CVE-2024-9487) to further harden the encrypted assertions feature against this type of attack. Please note that encrypted assertions are not enabled by default. Instances not utilizing SAML SSO, or utilizing SAML SSO authentication without encrypted assertions, are not impacted. Additionally, an attacker would require direct network access as well as a signed SAML response or metadata document to exploit this vulnerability.
- |
**HIGH**: An attacker could achieve container escape and privilege escalation to root by exploiting a path collision and arbitrary code execution via the `ghe-firejail` path. GitHub has requested CVE ID [CVE-2024-10007](https://www.cve.org/cverecord?id=CVE-2024-10007) for this vulnerability, which was reported via the [GitHub Bug Bounty program](https://bounty.github.com/).
bugs:
- |
This error message `mbind: Operation not permitted` was repeatedly showing in the `/var/log/mysql/mysql.err` MySQL logs.
- |
When saving settings in the Management Console, the configuration run would stop if the `enterprise-manage` process was restarted.
- |
A missing configuration value prevented Dependabot from creating group update pull requests.
- |
On an instance with GitHub Actions enabled, some maintenance tasks could fail due to incomplete upgrade steps during previous upgrades to new releases of GitHub Enterprise Server.
- |
The initial setup certificate generation in AWS took longer than expected due to fallback to private IPs. The time for this fallback has been reduced.
- |
If the primary instance was unreachable, running `ghe-repl-stop --force` on a replica would fail during the config apply run.
- |
When restoring from a backup, repositories that had been deleted in the last 90 days were not completely restored.
- |
Restoring Git repositories using backup-utils occasionally failed.
- |
Organizations were limited to using 100 Actions organization variables instead of 1,000.
- |
Some customers upgrading from 3.12 to 3.13 or to 3.14 may experience issues with undecryptable records during the upgrade. This issue has now been resolved. We recommend you read "[Undecryptable records](/enterprise-server@3.14/admin/upgrading-your-instance/troubleshooting-upgrades/known-issues-with-upgrades-to-your-instance#undecryptable-records)."
changes:
- |
For instances deployed on AWS, the default settings for Chrony NTP synchronization have been aligned with AWS's suggested default configurations.
known_issues:
- |
Custom firewall rules are removed during the upgrade process.
- |
During the validation phase of a configuration run, a `No such object` error may occur for the Notebook and Viewscreen services. This error can be ignored as the services should still correctly start.
- |
If the root site administrator is locked out of the Management Console after failed login attempts, the account does not unlock automatically after the defined lockout time. Someone with administrative SSH access to the instance must unlock the account using the administrative shell. See "[AUTOTITLE](/admin/configuration/administering-your-instance-from-the-management-console/troubleshooting-access-to-the-management-console#unlocking-the-root-site-administrator-account)."
- |
The `mbind: Operation not permitted` error in the `/var/log/mysql/mysql.err` file can be ignored. MySQL 8 does not gracefully handle when the `CAP_SYS_NICE` capability isn't required, and outputs an error instead of a warning.
- |
{% data reusables.release-notes.2023-11-aws-system-time %}
- |
On an instance with the HTTP `X-Forwarded-For` header configured for use behind a load balancer, all client IP addresses in the instance's audit log erroneously appear as 127.0.0.1.
- |
{% data reusables.release-notes.large-adoc-files-issue %}
- |
Repositories originally imported using `ghe-migrator` will not correctly track GitHub Advanced Security contributions.
- |
The `reply.[HOSTNAME]` subdomain is falsely always displaying as having no SSL and DNS record, when testing the domain settings via the Management Console without subdomain isolation.
- |
Admin stats REST API endpoints may timeout on appliances with many users or repositories. Retrying the request until data is returned is advised.
- |
{% data reusables.release-notes.2024-06-possible-frontend-5-minute-outage-during-hotpatch-upgrade %}
- |
When restoring from a backup snapshot, a large number of `mapper_parsing_exception` errors may be displayed.
- |
Services may respond with a `503` status due to an out of date `haproxy` configuration. This can usually be resolved with a `ghe-config-apply` run.
@@ -5,6 +5,8 @@ sections:
**MEDIUM:** An attacker could steal sensitive information by exploiting a Cross-Site Scripting vulnerability in the repository transfer feature. This exploitation would require social engineering. GitHub has requested CVE ID [CVE-2024-8770](https://www.cve.org/cverecord?id=CVE-2024-8770) for this vulnerability, which was reported via the [GitHub Bug Bounty program](https://bounty.github.com/).
- |
**MEDIUM:** An attacker could push a commit with changes to a workflow using a PAT or OAuth app that lacks the appropriate `workflow` scope by pushing a triple-nested tag pointing at the associated commit. GitHub has requested CVE ID [CVE-2024-8263](https://www.cve.org/cverecord?id=CVE-2024-8263) for this vulnerability, which was reported via the [GitHub Bug Bounty program](https://bounty.github.com/).
- |
**HIGH:** A GitHub App installed in organizations could upgrade some permissions from read to write access without approval from an organization administrator. An attacker would require an account with administrator access to install a malicious GitHub App. GitHub has requested [CVE ID CVE-2024-8810](https://www.cve.org/cverecord?id=CVE-2024-8810) for this vulnerability, which was reported via the [GitHub Bug Bounty Program](https://bounty.github.com/). [Updated: 2024-11-07]
bugs:
- |
For instances deployed on AWS with IMDSv2 enforced, fallback to private IPs was not successful.
@@ -23,9 +23,9 @@ sections:
- |
**MEDIUM:** An attacker could have unauthorized read access to issue content inside an internal repository via GitHub projects. This attack required attacker access to the corresponding project board. GitHub has requested CVE ID [CVE-2024-5817](https://nvd.nist.gov/vuln/detail/CVE-2024-5817) for this vulnerability, which was reported via the [GitHub Bug Bounty program](https://bounty.github.com/).
- |
An attacker could access previously executed private required workflows by changing the repository visibility from private to public. This occurred despite the repositories with the required workflows remaining private. This vulnerability was reported via the [GitHub Bug Bounty program](https://bounty.github.com/).
**MEDIUM**: An attacker could gain unauthorized access to secret scanning alert data because the [REST API secret scanning endpoint](/rest/secret-scanning/secret-scanning?apiVersion=2022-11-28) did not properly verify whether the user account has the business owner role. Only organization members can exploit this vulnerability, requiring a {% data variables.product.pat_generic %} (PAT) with `repo` or `security_events` scopes, limiting exposure to internal actors. Exploitation also required secret scanning to be enabled on user-owned repositories. GitHub has requested CVE ID [CVE-2024-10824](https://www.cve.org/CVERecord?id=CVE-2024-10824) for this vulnerability. [Updated: 2024-11-07]
- |
A user without the enterprise owner role could view all secret scanning alerts for user-owned repositories using the REST API. Alerts in user-owned repositories are now properly restricted to only be visible to enterprise owners.
An attacker could access previously executed private required workflows by changing the repository visibility from private to public. This occurred despite the repositories with the required workflows remaining private. This vulnerability was reported via the [GitHub Bug Bounty program](https://bounty.github.com/).
- |
Packages have been updated to the latest security versions.
bugs:
@@ -5,6 +5,8 @@ sections:
**MEDIUM:** An attacker could steal sensitive information by exploiting a Cross-Site Scripting vulnerability in the repository transfer feature. This exploitation would require social engineering. GitHub has requested CVE ID [CVE-2024-8770](https://www.cve.org/cverecord?id=CVE-2024-8770) for this vulnerability, which was reported via the [GitHub Bug Bounty program](https://bounty.github.com/).
- |
**MEDIUM:** An attacker could push a commit with changes to a workflow using a PAT or OAuth app that lacks the appropriate `workflow` scope by pushing a triple-nested tag pointing at the associated commit. GitHub has requested CVE ID [CVE-2024-8263](https://www.cve.org/cverecord?id=CVE-2024-8263) for this vulnerability, which was reported via the [GitHub Bug Bounty program](https://bounty.github.com/).
- |
**HIGH:** A GitHub App installed in organizations could upgrade some permissions from read to write access without approval from an organization administrator. An attacker would require an account with administrator access to install a malicious GitHub App. GitHub has requested [CVE ID CVE-2024-8810](https://www.cve.org/cverecord?id=CVE-2024-8810) for this vulnerability, which was reported via the [GitHub Bug Bounty Program](https://bounty.github.com/). [Updated: 2024-11-07]
bugs:
- |
For instances deployed on AWS with IMDSv2 enforced, fallback to private IPs was not successful.
@@ -0,0 +1,62 @@
date: '2024-11-07'
sections:
security_fixes:
- |
Elasticsearch packages have been updated to the latest security versions.
- |
**HIGH**: An attacker could bypass SAML single sign-on (SSO) authentication with the optional encrypted assertions feature, allowing unauthorized provisioning of users and access to the instance, by exploiting an improper verification of cryptographic signatures vulnerability in GitHub Enterprise Server. This is a follow up fix for [CVE-2024-9487](https://www.cve.org/cverecord?id=CVE-2024-9487) to further harden the encrypted assertions feature against this type of attack. Please note that encrypted assertions are not enabled by default. Instances not utilizing SAML SSO, or utilizing SAML SSO authentication without encrypted assertions, are not impacted. Additionally, an attacker would require direct network access as well as a signed SAML response or metadata document to exploit this vulnerability.
- |
**HIGH**: An attacker could achieve container escape and privilege escalation to root by exploiting a path collision and arbitrary code execution via the `ghe-firejail` path. GitHub has requested CVE ID [CVE-2024-10007](https://www.cve.org/cverecord?id=CVE-2024-10007) for this vulnerability, which was reported via the [GitHub Bug Bounty program](https://bounty.github.com/).
bugs:
- |
A missing configuration value prevented Dependabot from creating group update pull requests.
- |
When saving settings in the Management Console, the configuration run would stop if the `enterprise-manage` process was restarted.
- |
On an instance with GitHub Actions enabled, some maintenance tasks could fail due to incomplete upgrade steps during previous upgrades to new releases of GitHub Enterprise Server.
- |
The initial setup certificate generation in AWS took longer than expected due to fallback to private IPs. The time for this fallback has been reduced.
- |
The `ghe-support-bundle` generation would fail when the `aqueduct-lite` service is down.
- |
If the primary instance was unreachable, running `ghe-repl-stop --force` on a replica would fail during the config apply run.
- |
For instances that use the mandatory message feature logging in to certain URLs may have caused a 500 error.
- |
When restoring from a backup, repositories that had been deleted in the last 90 days were not completely restored.
- |
Restoring Git repositories using backup-utils occasionally failed.
- |
Enterprise installations experienced unpredictable repository search results due to the default 4,000 repository limit. A relaxed repository filter mode, which includes all single-tenant organization repositories and bypasses the limit, has been introduced. Administrators can enable this mode using `ghe-config app.github.enterprise-repo-search-filter-enabled true && ghe-config-apply`.
- |
Organizations were limited to using 100 Actions organization variables instead of 1,000.
- |
Running `config-apply` became stuck under certain circumstances due to a misconfiguration with Packages and Elasticsearch.
- |
Some customers upgrading to 3.13 may experience issues with undecryptable records during the upgrade. This issue has now been resolved. We recommend you read "[Undecryptable records](/admin/upgrading-your-instance/troubleshooting-upgrades/known-issues-with-upgrades-to-your-instance#undecryptable-records)."
changes:
- |
When connecting to an appliance via SSH, a notification about upcoming root disk changes displays.
known_issues:
- |
During the validation phase of a configuration run, a `No such object` error may occur for the Notebook and Viewscreen services. This error can be ignored as the services should still correctly start.
- |
If the root site administrator is locked out of the Management Console after failed login attempts, the account does not unlock automatically after the defined lockout time. Someone with administrative SSH access to the instance must unlock the account using the administrative shell. See "[AUTOTITLE](/admin/configuration/administering-your-instance-from-the-management-console/troubleshooting-access-to-the-management-console#unlocking-the-root-site-administrator-account)."
- |
On an instance with the HTTP `X-Forwarded-For` header configured for use behind a load balancer, all client IP addresses in the instance's audit log erroneously appear as 127.0.0.1.
- |
Repositories originally imported using `ghe-migrator` will not correctly track GitHub Advanced Security contributions.
- |
For an instance in a cluster configuration and with GitHub Actions enabled, restoring a cluster from backup requires targeting the primary DB node.
- |
When following the steps for [Replacing the primary MySQL node](/admin/monitoring-managing-and-updating-your-instance/configuring-clustering/replacing-a-cluster-node#replacing-the-primary-mysql-node), step 14 (running `ghe-cluster-config-apply`) might fail with errors. If this occurs, re-running `ghe-cluster-config-apply` is expected to succeed.
- |
Running a `config apply` as part of the steps for [Replacing a node in an emergency](/admin/monitoring-managing-and-updating-your-instance/configuring-clustering/replacing-a-cluster-node#replacing-a-node-in-an-emergency) may fail with errors if the node being replaced is still reachable. If this occurs, shutdown the node and repeat the steps.
- |
{% data reusables.release-notes.2024-06-possible-frontend-5-minute-outage-during-hotpatch-upgrade %}
- |
When restoring data originally backed up from a 3.13 appliance onto a 3.13 appliance, the elasticsearch indices need to be reindexed before some of the data will show up. This happens via a nightly scheduled job. It can also be forced by running `/usr/local/share/enterprise/ghe-es-search-repair`.
- |
When restoring from a backup snapshot, a large number of `mapper_parsing_exception` errors may be displayed.
- |
Services may respond with a `503` status due to an out of date `haproxy` configuration. This can usually be resolved with a `ghe-config-apply` run.
@@ -3,6 +3,8 @@ sections:
security_fixes:
- |
**MEDIUM:** An attacker could steal sensitive information by exploiting a Cross-Site Scripting vulnerability in the repository transfer feature. This exploitation would require social engineering. GitHub has requested CVE ID [CVE-2024-8770](https://www.cve.org/cverecord?id=CVE-2024-8770) for this vulnerability, which was reported via the [GitHub Bug Bounty program](https://bounty.github.com/).
- |
**HIGH:** A GitHub App installed in organizations could upgrade some permissions from read to write access without approval from an organization administrator. An attacker would require an account with administrator access to install a malicious GitHub App. GitHub has requested [CVE ID CVE-2024-8810](https://www.cve.org/cverecord?id=CVE-2024-8810) for this vulnerability, which was reported via the [GitHub Bug Bounty Program](https://bounty.github.com/). [Updated: 2024-11-07]
bugs:
- |
On an instance with GitHub Actions enabled, due to an insufficient wait time, MS SQL and MySQL replication could fail with the error message `Failed to start nomad service!`.
@@ -0,0 +1,76 @@
date: '2024-11-07'
sections:
security_fixes:
- |
Elasticsearch packages have been updated to the latest security versions.
- |
Packages have been updated to the latest security version.
- |
**HIGH**: An attacker could bypass SAML single sign-on (SSO) authentication with the optional encrypted assertions feature, allowing unauthorized provisioning of users and access to the instance, by exploiting an improper verification of cryptographic signatures vulnerability in GitHub Enterprise Server. This is a follow up fix for [CVE-2024-9487](https://www.cve.org/cverecord?id=CVE-2024-9487) to further harden the encrypted assertions feature against this type of attack. Please note that encrypted assertions are not enabled by default. Instances not utilizing SAML SSO, or utilizing SAML SSO authentication without encrypted assertions, are not impacted. Additionally, an attacker would require direct network access as well as a signed SAML response or metadata document to exploit this vulnerability.
- |
**HIGH**: An attacker could achieve container escape and privilege escalation to root by exploiting a path collision and arbitrary code execution via the `ghe-firejail` path. GitHub has requested CVE ID [CVE-2024-10007](https://www.cve.org/cverecord?id=CVE-2024-10007) for this vulnerability, which was reported via the [GitHub Bug Bounty program](https://bounty.github.com/).
bugs:
- |
When saving settings in the Management Console, the configuration run would stop if the `enterprise-manage` process was restarted.
- |
On an instance with GitHub Actions enabled, some maintenance tasks could fail due to incomplete upgrade steps during previous upgrades to new releases of GitHub Enterprise Server.
- |
A repeated error message concerning connectivity to port 6002 was emitted to the system logs when GitHub Actions was enabled.
- |
The initial setup certificate generation in AWS took longer than expected due to fallback to private IPs. The time for this fallback has been reduced.
- |
The `ghe-support-bundle` generation would fail when the `aqueduct-lite` service is down.
- |
If the primary instance was unreachable, running `ghe-repl-stop --force` on a replica would fail during the config apply run.
- |
Administrators in the SCIM private beta (versions < 3.14) that decided to upgrade their private beta appliance see an incorrectly checked box in the "SCIM Configuration" section of the Enterprise settings authentication security page in 3.14.
- |
Certain URLs may have caused a 500 error on instances that use the mandatory message feature logging.
- |
When restoring from a backup, repositories that had been deleted in the last 90 days were not completely restored.
- |
For instances that use secret scanning, custom messages for push protection set by the enterprise did not display to users.
- |
Restoring Git repositories using `backup-utils` occasionally failed.
- |
Enterprise installations experienced unpredictable repository search results due to the default 4,000 repository limit. A relaxed repository filter mode, which includes all single-tenant organization repositories and bypasses the limit, has been introduced. Administrators can enable this mode using `ghe-config app.github.enterprise-repo-search-filter-enabled true && ghe-config-apply`.
- |
Running `config-apply` became stuck under certain circumstances due to a misconfiguration with Packages and Elasticsearch.
- |
Audit log events for secret scanning alerts incorrectly displayed a blank secret type when generated for a custom pattern.
- |
Some customers upgrading to 3.14 may experience issues with undecryptable records during the upgrade. This issue has now been resolved. We recommend you read "[Undecryptable records](/admin/upgrading-your-instance/troubleshooting-upgrades/known-issues-with-upgrades-to-your-instance#undecryptable-records)."
changes:
- |
When connecting to an appliance via SSH, a notification about upcoming root disk changes displays.
known_issues:
- |
During the validation phase of a configuration run, a `No such object` error may occur for the Notebook and Viewscreen services. This error can be ignored as the services should still correctly start.
- |
If the root site administrator is locked out of the Management Console after failed login attempts, the account does not unlock automatically after the defined lockout time. Someone with administrative SSH access to the instance must unlock the account using the administrative shell. See "[AUTOTITLE](/admin/configuration/administering-your-instance-from-the-management-console/troubleshooting-access-to-the-management-console#unlocking-the-root-site-administrator-account)."
- |
On an instance with the HTTP `X-Forwarded-For` header configured for use behind a load balancer, all client IP addresses in the instance's audit log erroneously appear as 127.0.0.1.
- |
{% data reusables.release-notes.large-adoc-files-issue %}
- |
Repositories originally imported using `ghe-migrator` will not correctly track GitHub Advanced Security contributions.
- |
Admin stats REST API endpoints may timeout on appliances with many users or repositories. Retrying the request until data is returned is advised.
- |
When following the steps for [Replacing the primary MySQL node](/admin/monitoring-managing-and-updating-your-instance/configuring-clustering/replacing-a-cluster-node#replacing-the-primary-mysql-node), step 14 (running `ghe-cluster-config-apply`) might fail with errors. If this occurs, re-running `ghe-cluster-config-apply` is expected to succeed.
- |
Running a `config apply` as part of the steps for [Replacing a node in an emergency](/admin/monitoring-managing-and-updating-your-instance/configuring-clustering/replacing-a-cluster-node#replacing-a-node-in-an-emergency) may fail with errors if the node being replaced is still reachable. If this occurs, shutdown the node and repeat the steps.
- |
{% data reusables.release-notes.2024-06-possible-frontend-5-minute-outage-during-hotpatch-upgrade %}
- |
When restoring data originally backed up from a 3.13 appliance onto a 3.13 appliance, the Elasticsearch indices need to be reindexed before some of the data will show up. This happens via a nightly scheduled job. It can also be forced by running `/usr/local/share/enterprise/ghe-es-search-repair`.
- |
An organization-level code scanning configuration page is displayed on instances that do not use GitHub Advanced Security or code scanning.
- |
In the header bar displayed to site administrators, some icons are not available.
- |
When enabling automatic update checks for the first time in the Management Console, the status is not dynamically reflected until the "Updates" page is reloaded.
- |
When restoring from a backup snapshot, a large number of `mapper_parsing_exception` errors may be displayed.
- |
Services may respond with a `503` status due to an out of date `haproxy` configuration. This can usually be resolved with a `ghe-config-apply` run.
@@ -35,7 +35,7 @@
| [Use of `Kernel.open` or `IO.read` or similar sinks with a non-constant value](https://codeql.github.com/codeql-query-help/ruby/rb-non-constant-kernel-open/) | 078, 088, 073 | {% octicon "check" aria-label="Included" %} | {% octicon "check" aria-label="Included" %} | {% octicon "check" aria-label="Included" %} |
| [Use of `Kernel.open`, `IO.read` or similar sinks with user-controlled input](https://codeql.github.com/codeql-query-help/ruby/rb-kernel-open/) | 078, 088, 073 | {% octicon "check" aria-label="Included" %} | {% octicon "check" aria-label="Included" %} | {% octicon "check" aria-label="Included" %} |
| [Use of a broken or weak cryptographic algorithm](https://codeql.github.com/codeql-query-help/ruby/rb-weak-cryptographic-algorithm/) | 327 | {% octicon "check" aria-label="Included" %} | {% octicon "check" aria-label="Included" %} | {% octicon "check" aria-label="Included" %} |
| [Use of a broken or weak cryptographic hashing algorithm on sensitive data](https://codeql.github.com/codeql-query-help/ruby/rb-weak-sensitive-data-hashing/) | 327, 328, 916 | {% octicon "check" aria-label="Included" %} | {% octicon "check" aria-label="Included" %} | {% octicon "x" aria-label="Not included" %} |
| [Use of a broken or weak cryptographic hashing algorithm on sensitive data](https://codeql.github.com/codeql-query-help/ruby/rb-weak-sensitive-data-hashing/) | 327, 328, 916 | {% octicon "check" aria-label="Included" %} | {% octicon "check" aria-label="Included" %} | {% octicon "check" aria-label="Included" %} |
| [Use of externally-controlled format string](https://codeql.github.com/codeql-query-help/ruby/rb-tainted-format-string/) | 134 | {% octicon "check" aria-label="Included" %} | {% octicon "check" aria-label="Included" %} | {% octicon "check" aria-label="Included" %} |
| [Weak cookie configuration](https://codeql.github.com/codeql-query-help/ruby/rb-weak-cookie-configuration/) | 732, 1275 | {% octicon "check" aria-label="Included" %} | {% octicon "check" aria-label="Included" %} | {% octicon "check" aria-label="Included" %} |
| [XML external entity expansion](https://codeql.github.com/codeql-query-help/ruby/rb-xxe/) | 611, 776, 827 | {% octicon "check" aria-label="Included" %} | {% octicon "check" aria-label="Included" %} | {% octicon "check" aria-label="Included" %} |
@@ -64,4 +64,5 @@
| GHD038 | expired-content | Expired content must be remediated. | error | expired |
| GHD039 | expiring-soon | Content that expires soon should be proactively addressed. | warning | expired |
| [GHD040](https://github.com/github/docs/blob/main/src/content-linter/README.md) | table-liquid-versioning | Tables must use the correct liquid versioning format | error | tables |
| GHD041 | third-party-action-pinning | Code examples that use third-party actions must always pin to a full length commit SHA | error | feature, actions |
| GHD041 | third-party-action-pinning | Code examples that use third-party actions must always pin to a full length commit SHA | error | feature, actions |
| GHD042 | liquid-tag-whitespace | Liquid tags should start and end with one whitespace. Liquid tag arguments should be separated by only one whitespace. | error | liquid, format |
@@ -2,3 +2,5 @@ To pay for licenses, you must connect your enterprise to an Azure subscription.

* "[Prerequisites](/billing/managing-the-plan-for-your-github-account/connecting-an-azure-subscription#prerequisites)"
* "[Connecting your Azure subscription to an enterprise account](/billing/managing-the-plan-for-your-github-account/connecting-an-azure-subscription#connecting-your-azure-subscription-to-your-enterprise-account)"

If you prefer a visual overview of the process, watch [Billing {% data variables.product.company_short %} consumption through an Azure subscription](https://www.youtube.com/watch?v=Y-f7JKJ4_8Y) on our YouTube channel.
@ -5,6 +5,6 @@ CA certificate key too weak
```

To resolve this issue, confirm that your certificate complies
with level 2 of the OpenSSL security specification. For more information, see [SSL_CTX_set_security_level](https://www.openssl.org/docs/man1.1.1/man3/SSL_CTX_set_security_level.html#DEFAULT-CALLBACK-BEHAVIOUR) in the OpenSSL docs. For more information about reviewing your instance's logs, see "[AUTOTITLE](/admin/monitoring-and-managing-your-instance/monitoring-your-instance/about-system-logs#system-logs-in-the-systemd-journal)".
with level 2 of the OpenSSL security specification. For more information, see [SSL_CTX_set_security_level](https://www.openssl.org/docs/man1.1.1/man3/SSL_CTX_set_security_level.html#DEFAULT-CALLBACK-BEHAVIOUR) in the OpenSSL docs. For more information about reviewing your instance's logs, see "[AUTOTITLE](/admin/monitoring-and-managing-your-instance/monitoring-your-instance/about-system-logs#system-logs-in-the-systemd-journal)."

If the error appears in `babeld` logs because your TLS certificate does not comply with level 2 of the specification, you must create and upload a new certificate with stronger security before you upgrade to GitHub Enterprise Server 3.10 or later. For more information, see "[AUTOTITLE](/admin/configuration/hardening-security-for-your-enterprise/configuring-tls)."
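One way to check a certificate locally is to inspect its key size and signature algorithm with OpenSSL (the file name here is illustrative):

```shell
openssl x509 -in your-certificate.pem -noout -text | grep -E "Public-Key|Signature Algorithm"
```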
@ -109,10 +109,13 @@
|
|||
"@octokit/rest": "21.0.2",
|
||||
"@playwright/test": "^1.48.1",
|
||||
"@types/accept-language-parser": "1.5.6",
|
||||
"@types/cheerio": "^0.22.35",
|
||||
"@types/connect-datadog": "0.0.10",
|
||||
"@types/connect-timeout": "0.0.39",
|
||||
"@types/cookie": "0.6.0",
|
||||
"@types/cookie-parser": "1.4.7",
|
||||
"@types/elasticsearch": "^5.0.43",
|
||||
"@types/event-to-promise": "^0.7.5",
|
||||
"@types/express": "4.17.21",
|
||||
"@types/imurmurhash": "^0.1.4",
|
||||
"@types/js-cookie": "^3.0.6",
|
||||
|
@ -3165,6 +3168,15 @@
|
|||
"integrity": "sha512-hWtVTC2q7hc7xZ/RLbxapMvDMgUnDvKvMOpKal4DrMyfGBUfB1oKaZlIRr6mJL+If3bAP6sV/QneGzF6tJjZDg==",
|
||||
"dev": true
|
||||
},
|
||||
"node_modules/@types/cheerio": {
|
||||
"version": "0.22.35",
|
||||
"resolved": "https://registry.npmjs.org/@types/cheerio/-/cheerio-0.22.35.tgz",
|
||||
"integrity": "sha512-yD57BchKRvTV+JD53UZ6PD8KWY5g5rvvMLRnZR3EQBCZXiDT/HR+pKpMzFGlWNhFrXlo7VPZXtKvIEwZkAWOIA==",
|
||||
"dev": true,
|
||||
"dependencies": {
|
||||
"@types/node": "*"
|
||||
}
|
||||
},
|
||||
"node_modules/@types/connect": {
|
||||
"version": "3.4.38",
|
||||
"resolved": "https://registry.npmjs.org/@types/connect/-/connect-3.4.38.tgz",
|
||||
|
@ -3228,12 +3240,27 @@
|
|||
"@types/ms": "*"
|
||||
}
|
||||
},
|
||||
"node_modules/@types/elasticsearch": {
|
||||
"version": "5.0.43",
|
||||
"resolved": "https://registry.npmjs.org/@types/elasticsearch/-/elasticsearch-5.0.43.tgz",
|
||||
"integrity": "sha512-N+MpzURpDCWd7zaJ7CE1aU+nBSeAABLhDE0lGodQ0LLftx7ku6hjTXLr9OAFZLSXiWL3Xxx8jts485ynrcm5NA==",
|
||||
"dev": true
|
||||
},
|
||||
"node_modules/@types/estree": {
|
||||
"version": "1.0.5",
|
||||
"resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.5.tgz",
|
||||
"integrity": "sha512-/kYRxGDLWzHOB7q+wtSUQlFrtcdUccpfy+X+9iMBpHK8QLLhx2wIPYuS5DYtR9Wa/YlZAbIovy7qVdB1Aq6Lyw==",
|
||||
"dev": true
|
||||
},
|
||||
"node_modules/@types/event-to-promise": {
|
||||
"version": "0.7.5",
|
||||
"resolved": "https://registry.npmjs.org/@types/event-to-promise/-/event-to-promise-0.7.5.tgz",
|
||||
"integrity": "sha512-h10M3ybTySQFVP4N1uiEgPwbpHExNS8UMpCqRUJFkMhlpgSlWsyYsGMmkrJIKRnhGfYDOb4LD3U+SSPujoMHNA==",
|
||||
"dev": true,
|
||||
"dependencies": {
|
||||
"@types/node": "*"
|
||||
}
|
||||
},
|
||||
"node_modules/@types/express": {
|
||||
"version": "4.17.21",
|
||||
"resolved": "https://registry.npmjs.org/@types/express/-/express-4.17.21.tgz",
|
||||
|
|
20
package.json
20
package.json
|
@ -17,7 +17,7 @@
|
|||
"exports": "./src/frame/server.ts",
|
||||
"scripts": {
|
||||
"all-documents": "tsx src/content-render/scripts/all-documents/cli.ts",
|
||||
"analyze-text": "node src/search/scripts/analyze-text.js",
|
||||
"analyze-text": "tsx src/search/scripts/analyze-text.ts",
|
||||
"analyze-comment": "tsx src/events/scripts/analyze-comment-cli.ts",
|
||||
"archive-version": "tsx --max-old-space-size=16384 src/ghes-releases/scripts/archive-version.ts",
|
||||
"audit-log-sync": "tsx src/audit-logs/scripts/sync.ts",
|
||||
|
@ -39,8 +39,14 @@
|
|||
"find-unused-variables": "tsx src/content-linter/scripts/find-unsed-variables.ts",
|
||||
"fixture-dev": "cross-env ROOT=src/fixtures/fixtures npm start",
|
||||
"fixture-test": "cross-env ROOT=src/fixtures/fixtures npm test -- src/fixtures/tests",
|
||||
"index": "tsx src/search/scripts/index/index.ts",
|
||||
"index-elasticsearch": "node src/search/scripts/index-elasticsearch.js",
|
||||
"general-search-scrape": "tsx src/search/scripts/scrape/scrape-cli.ts",
|
||||
"general-search-scrape-server": "cross-env NODE_ENV=production PORT=4002 MINIMAL_RENDER=true CHANGELOG_DISABLED=true tsx src/frame/server.ts",
|
||||
"ghes-release-scrape-with-server": "cross-env GHES_RELEASE=1 start-server-and-test general-search-scrape-server 4002 general-search-scrape",
|
||||
"general-search-scrape-with-server": "cross-env NODE_OPTIONS='--max_old_space_size=8192' start-server-and-test general-search-scrape-server 4002 general-search-scrape",
|
||||
"index": "tsx src/search/scripts/index/index-cli autocomplete docs-internal-data",
|
||||
"index-ai-search-autocomplete": "tsx src/search/scripts/index/index-cli ai-search-autocomplete",
|
||||
"index-general-autocomplete": "tsx src/search/scripts/index/index-cli general-autocomplete",
|
||||
"index-general-search": "tsx src/search/scripts/index/index-cli general-search",
|
||||
"index-test-fixtures": "./src/search/scripts/index-test-fixtures.sh",
|
||||
"lint": "eslint '**/*.{js,mjs,ts,tsx}'",
|
||||
"lint-content": "node src/content-linter/scripts/lint-content.js",
|
||||
|
@ -70,10 +76,6 @@
|
|||
"start-for-playwright": "cross-env ROOT=src/fixtures/fixtures TRANSLATIONS_FIXTURE_ROOT=src/fixtures/fixtures/translations ENABLED_LANGUAGES=en,ja NODE_ENV=test tsx src/frame/server.ts",
|
||||
"symlink-from-local-repo": "node src/early-access/scripts/symlink-from-local-repo.js",
|
||||
"sync-rest": "tsx src/rest/scripts/update-files.ts",
|
||||
"sync-search": "cross-env NODE_OPTIONS='--max_old_space_size=8192' start-server-and-test sync-search-server 4002 sync-search-indices",
|
||||
"sync-search-ghes-release": "cross-env GHES_RELEASE=1 start-server-and-test sync-search-server 4002 sync-search-indices",
|
||||
"sync-search-indices": "node src/search/scripts/sync-search-indices.js",
|
||||
"sync-search-server": "cross-env NODE_ENV=production PORT=4002 MINIMAL_RENDER=true CHANGELOG_DISABLED=true tsx src/frame/server.ts",
|
||||
"sync-secret-scanning": "tsx src/secret-scanning/scripts/sync.ts",
|
||||
"sync-webhooks": "npx tsx src/rest/scripts/update-files.ts -o webhooks",
|
||||
"test": "vitest",
|
||||
|
@ -222,6 +224,7 @@
|
|||
"src/open-source/scripts/add-pr-links.js",
|
||||
"src/open-source/scripts/pr-link-source.js",
|
||||
"rest-api-description/",
|
||||
"docs-internal-data/",
|
||||
"src/code-scanning/scripts/generate-code-scanning-query-list.ts"
|
||||
]
|
||||
},
|
||||
|
@ -327,10 +330,13 @@
|
|||
"@octokit/rest": "21.0.2",
|
||||
"@playwright/test": "^1.48.1",
|
||||
"@types/accept-language-parser": "1.5.6",
|
||||
"@types/cheerio": "^0.22.35",
|
||||
"@types/connect-datadog": "0.0.10",
|
||||
"@types/connect-timeout": "0.0.39",
|
||||
"@types/cookie": "0.6.0",
|
||||
"@types/cookie-parser": "1.4.7",
|
||||
"@types/elasticsearch": "^5.0.43",
|
||||
"@types/event-to-promise": "^0.7.5",
|
||||
"@types/express": "4.17.21",
|
||||
"@types/imurmurhash": "^0.1.4",
|
||||
"@types/js-cookie": "^3.0.6",
|
||||
|
|
|
@ -3,5 +3,5 @@
|
|||
"apiOnlyEvents": "This event is not available in the web interface, only via the REST API, audit log streaming, or JSON/CSV exports.",
|
||||
"apiRequestEvent": "This event is only available via audit log streaming."
|
||||
},
|
||||
"sha": "548a504f9bbeb14e74a0da48a869f8e6239b6d9f"
|
||||
"sha": "5cdd5d7d8ef0e34ebff6addc8d04b7d3da813589"
|
||||
}
|
|
@ -31,6 +31,7 @@ import { imageNoGif } from './image-no-gif.js'
|
|||
import { expiredContent, expiringSoon } from './expired-content.js'
|
||||
import { tableLiquidVersioning } from './table-liquid-versioning.js'
|
||||
import { thirdPartyActionPinning } from './third-party-action-pinning.js'
|
||||
import { liquidTagWhitespace } from './liquid-tag-whitespace.js'
|
||||
|
||||
const noDefaultAltText = markdownlintGitHub.find((elem) =>
|
||||
elem.names.includes('no-default-alt-text'),
|
||||
|
@ -77,5 +78,6 @@ export const gitHubDocsMarkdownlint = {
|
|||
expiringSoon,
|
||||
tableLiquidVersioning,
|
||||
thirdPartyActionPinning,
|
||||
liquidTagWhitespace,
|
||||
],
|
||||
}
|
||||
|
|
|
@ -0,0 +1,64 @@
|
|||
import { TokenKind } from 'liquidjs'
|
||||
|
||||
import { getLiquidTokens, getPositionData } from '../helpers/liquid-utils.js'
|
||||
import { addFixErrorDetail } from '../helpers/utils.js'
|
||||
|
||||
/*
|
||||
Liquid tags should start and end with one whitespace. For example:
|
||||
|
||||
DO use a single whitespace character
|
||||
{% data <args> %}
|
||||
|
||||
DON'T use 0 or more than 1 whitespace
|
||||
{%data <args> %}
|
||||
|
||||
DON'T use more than 1 whitespace between args
|
||||
{%data arg1 arg2 %}
|
||||
*/
|
||||
|
||||
export const liquidTagWhitespace = {
|
||||
names: ['GHD042', 'liquid-tag-whitespace'],
|
||||
description:
|
||||
'Liquid tags should start and end with one whitespace. Liquid tag arguments should be separated by only one whitespace.',
|
||||
tags: ['liquid', 'format'],
|
||||
function: (params, onError) => {
|
||||
const content = params.lines.join('\n')
|
||||
const tokens = getLiquidTokens(content).filter((token) => token.kind === TokenKind.Tag)
|
||||
for (const token of tokens) {
|
||||
const { lineNumber, column, length } = getPositionData(token, params.lines)
|
||||
|
||||
const range = [column, length]
|
||||
const tag = params.lines[lineNumber - 1].slice(column - 1, column - 1 + length)
|
||||
|
||||
// Get just the opening and closing tags, which includes any whitespace
|
||||
// added before the tag name or any arguments
|
||||
const openTag = tag.slice(0, token.contentRange[0] - token.begin)
|
||||
const closeTag = tag.slice(-(token.end - token.contentRange[1]))
|
||||
|
||||
const isOpenTagOneSpace = openTag !== openTag.trim() + ' '
|
||||
const isCloseTagOneSpace = closeTag !== ' ' + closeTag.trim()
|
||||
|
||||
const moreThanOneSpace = /\s{2,}/
|
||||
const isArgOneSpace = moreThanOneSpace.test(tag)
|
||||
|
||||
const fixedContent =
|
||||
openTag.trim() + ' ' + token.content.replace(moreThanOneSpace, ' ') + ' ' + closeTag.trim()
|
||||
|
||||
if (isOpenTagOneSpace || isCloseTagOneSpace || isArgOneSpace) {
|
||||
addFixErrorDetail(
|
||||
onError,
|
||||
lineNumber,
|
||||
fixedContent,
|
||||
params.lines[lineNumber - 1].slice(column - 1, column - 1 + length),
|
||||
range,
|
||||
{
|
||||
lineNumber,
|
||||
editColumn: column,
|
||||
deleteCount: length,
|
||||
insertText: fixedContent,
|
||||
},
|
||||
)
|
||||
}
|
||||
}
|
||||
},
|
||||
}
|
|
@ -161,6 +161,12 @@ const githubDocsConfig = {
|
|||
'partial-markdown-files': true,
|
||||
'yml-files': true,
|
||||
},
|
||||
'liquid-tag-whitespace': {
|
||||
// GHD042
|
||||
severity: 'error',
|
||||
'partial-markdown-files': true,
|
||||
'yml-files': true,
|
||||
},
|
||||
}
|
||||
|
||||
export const githubDocsFrontmatterConfig = {
|
||||
|
|
|
@ -0,0 +1,71 @@
|
|||
import { describe, expect, test } from 'vitest'
|
||||
|
||||
import { runRule } from '../../lib/init-test.js'
|
||||
import { liquidTagWhitespace } from '../../lib/linting-rules/liquid-tag-whitespace.js'
|
||||
|
||||
describe(liquidTagWhitespace.names.join(' - '), () => {
|
||||
test('liquid tags with correct whitespace pass', async () => {
|
||||
const markdown = [
|
||||
'{% data variables.location.product_location %}',
|
||||
'{% assign my_variable = "value" %}',
|
||||
'{% if user %}Hello, {{ user.name }}{% endif %}',
|
||||
].join('\n')
|
||||
|
||||
const result = await runRule(liquidTagWhitespace, { strings: { markdown } })
|
||||
const errors = result.markdown
|
||||
expect(errors.length).toBe(0)
|
||||
})
|
||||
|
||||
test('liquid tags with incorrect whitespace fail', async () => {
|
||||
const markdown = [
|
||||
'{%data variables.location.product_location %}',
|
||||
'{% assign my_variable = "value"%}',
|
||||
'{% if user %}Hello, {{ user.name }} {%endif %}',
|
||||
'{% data variables.location.product_location %}',
|
||||
'{%-data variables.location.product_location -%}',
|
||||
'{%- assign my_variable = "value"-%}',
|
||||
'{%- if user -%}Hello, {{ user.name }} {%endif %}',
|
||||
'{%- data variables.location.product_location -%}',
|
||||
].join('\n')
|
||||
|
||||
const result = await runRule(liquidTagWhitespace, { strings: { markdown } })
|
||||
const errors = result.markdown
|
||||
expect(errors.length).toBe(8)
|
||||
expect(errors[2].lineNumber).toBe(3)
|
||||
expect(errors[2].fixInfo).toEqual({
|
||||
deleteCount: 10,
|
||||
editColumn: 37,
|
||||
lineNumber: 3,
|
||||
insertText: '{% endif %}',
|
||||
})
|
||||
})
|
||||
|
||||
test('liquid tags with multiple spaces between arguments fail', async () => {
|
||||
const markdown = [
|
||||
'{% assign my_variable = "value" %}',
|
||||
'{% if user %}Hello, {{ user.name }}{% endif %}',
|
||||
].join('\n')
|
||||
|
||||
const result = await runRule(liquidTagWhitespace, { strings: { markdown } })
|
||||
const errors = result.markdown
|
||||
expect(errors.length).toBe(2)
|
||||
expect(errors[1].lineNumber).toBe(2)
|
||||
expect(errors[0].fixInfo).toEqual({
|
||||
deleteCount: 35,
|
||||
editColumn: 1,
|
||||
lineNumber: 1,
|
||||
insertText: '{% assign my_variable = "value" %}',
|
||||
})
|
||||
})
|
||||
|
||||
test('liquid tags with single spaces between arguments pass', async () => {
|
||||
const markdown = [
|
||||
'{% assign my_variable = "value" %}',
|
||||
'{% if user %}Hello, {{ user.name }}{% endif %}',
|
||||
].join('\n')
|
||||
|
||||
const result = await runRule(liquidTagWhitespace, { strings: { markdown } })
|
||||
const errors = result.markdown
|
||||
expect(errors.length).toBe(0)
|
||||
})
|
||||
})
|
|
@ -68,7 +68,7 @@ describe('breadcrumbs', () => {
|
|||
|
||||
expect($breadcrumbTitles.length).toBe(0)
|
||||
expect($breadcrumbLinks.length).toBe(2)
|
||||
expect($breadcrumbLinks[0].attribs.title).toBe('Deeper secrets')
|
||||
expect($breadcrumbLinks[1].attribs.title).toBe('Mariana Trench')
|
||||
expect(($breadcrumbLinks[0] as cheerio.TagElement).attribs.title).toBe('Deeper secrets')
|
||||
expect(($breadcrumbLinks[1] as cheerio.TagElement).attribs.title).toBe('Mariana Trench')
|
||||
})
|
||||
})
|
||||
|
|
|
@ -3,7 +3,7 @@ import { createProxyMiddleware } from 'http-proxy-middleware'
|
|||
|
||||
import events from '@/events/middleware.js'
|
||||
import anchorRedirect from '@/rest/api/anchor-redirect.js'
|
||||
import search from '@/search/middleware/search.js'
|
||||
import search from '@/search/middleware/search-routes.js'
|
||||
import pageInfo from '@/pageinfo/middleware'
|
||||
import pageList from '@/pagelist/middleware'
|
||||
import webhooks from '@/webhooks/middleware/webhooks.js'
|
||||
|
|
|
@ -61,7 +61,7 @@ import fastlyCacheTest from './fastly-cache-test'
|
|||
import trailingSlashes from './trailing-slashes'
|
||||
import mockVaPortal from './mock-va-portal'
|
||||
import dynamicAssets from '@/assets/middleware/dynamic-assets'
|
||||
import contextualizeSearch from '@/search/middleware/contextualize.js'
|
||||
import generalSearchMiddleware from '@/search/middleware/general-search-middleware'
|
||||
import shielding from '@/shielding/middleware'
|
||||
import tracking from '@/tracking/middleware'
|
||||
import { MAX_REQUEST_TIMEOUT } from '@/frame/lib/constants.js'
|
||||
|
@ -275,7 +275,7 @@ export default function (app: Express) {
|
|||
app.use(asyncMiddleware(productExamples))
|
||||
app.use(asyncMiddleware(productGroups))
|
||||
app.use(asyncMiddleware(glossaries))
|
||||
app.use(asyncMiddleware(contextualizeSearch))
|
||||
app.use(asyncMiddleware(generalSearchMiddleware))
|
||||
app.use(asyncMiddleware(featuredLinks))
|
||||
app.use(asyncMiddleware(learningTrack))
|
||||
|
||||
|
|
|
@ -15,7 +15,10 @@ describe('favicon assets', () => {
|
|||
expect(res.headers['cache-control']).toContain('public')
|
||||
expect(res.headers['cache-control']).toContain('immutable')
|
||||
expect(res.headers['cache-control']).toMatch(/max-age=\d+/)
|
||||
const maxAgeSeconds = parseInt(res.headers['cache-control'].match(/max-age=(\d+)/)[1], 10)
|
||||
const maxAgeSeconds = parseInt(
|
||||
(res.headers['cache-control'] || '').match(/max-age=(\d+)/)?.[1] || '',
|
||||
10,
|
||||
)
|
||||
// Let's not be too specific in the tests, just as long as it's testing
|
||||
// that it's a reasonably large number of seconds.
|
||||
expect(maxAgeSeconds).toBeGreaterThanOrEqual(60 * 60)
|
||||
|
@ -25,13 +28,16 @@ describe('favicon assets', () => {
|
|||
test('should serve a valid and aggressively caching /apple-touch-icon.png', async () => {
|
||||
const res = await get('/apple-touch-icon.png')
|
||||
expect(res.statusCode).toBe(200)
|
||||
expect(parseInt(res.headers['content-length'], 10)).toBeGreaterThan(0)
|
||||
expect(parseInt(res.headers['content-length'] || '', 10)).toBeGreaterThan(0)
|
||||
expect(res.headers['content-type']).toBe('image/png')
|
||||
expect(res.headers['set-cookie']).toBeUndefined()
|
||||
expect(res.headers['cache-control']).toContain('public')
|
||||
expect(res.headers['cache-control']).toContain('immutable')
|
||||
expect(res.headers['cache-control']).toMatch(/max-age=\d+/)
|
||||
const maxAgeSeconds = parseInt(res.headers['cache-control'].match(/max-age=(\d+)/)[1], 10)
|
||||
const maxAgeSeconds = parseInt(
|
||||
(res.headers['cache-control'] || '').match(/max-age=(\d+)/)?.[1] || '',
|
||||
10,
|
||||
)
|
||||
// Let's not be too specific in the tests, just as long as it's testing
|
||||
// that it's a reasonably large number of seconds.
|
||||
expect(maxAgeSeconds).toBeGreaterThanOrEqual(60 * 60)
|
||||
|
|
|
@ -20,6 +20,9 @@ describe('manifest', () => {
|
|||
test('download manifest from HTML and check content', async () => {
|
||||
const $ = await getDOM('/')
|
||||
const url = $('link[rel="manifest"]').attr('href')
|
||||
if (!url) {
|
||||
throw new Error('No manifest URL found')
|
||||
}
|
||||
const res = await get(url)
|
||||
expect(res.statusCode).toBe(200)
|
||||
|
||||
|
|
|
@ -17,7 +17,7 @@ labels:
|
|||
- [Prerequisites](#prerequisites)
|
||||
- [Create publication branch for a new version of GHES](#creation)
|
||||
- [Resolve check failures](#check-failures)
|
||||
- [Sync the search indices](#sync-search-indices)
|
||||
- [Scrape the search indices](#scrape-search-indices)
|
||||
- [Maintain the publication branch](#maintenance)
|
||||
- [Complete preparation for the RC and publish the docset](#publication)
|
||||
|
||||
|
@ -110,11 +110,11 @@ For content from the OpenAPI schema, note the affected content with broken links
|
|||
|
||||
<br/>
|
||||
|
||||
<a name="sync-search-indices">
|
||||
<a name="scrape-search-indices">
|
||||
|
||||
### [🔎](#sync-search-indices) Sync the search indices
|
||||
### [🔎](#scrape-search-indices) Scrape the search indices
|
||||
|
||||
1. Go to the [`sync-search-elasticsearch` workflow](https://github.com/github/docs-internal/actions/workflows/sync-search-elasticsearch.yml) ([permalink](https://github.com/github/docs-internal/blob/f8ca45703c48c7d1976a278337bc3391fb14fe9e/.github/workflows/sync-search-elasticsearch.yml) in case it moves)
|
||||
1. Go to the [`index-general-search.yml` workflow](https://github.com/github/docs-internal/actions/workflows/index-general-search.yml)
|
||||
1. Click on the **Run workflow** drop down and set the following parameters:
|
||||
- `Branch:` set to the name of the publication branch
|
||||
- `Version` set to the version you're publishing (e.g., `ghes-3.12` if you're publishing GHES 3.12)
|
||||
|
|
|
@ -1,4 +1,19 @@
|
|||
[
|
||||
{
|
||||
"schemaChanges": [
|
||||
{
|
||||
"title": "The GraphQL schema includes these changes:",
|
||||
"changes": [
|
||||
"<p>Type <code>UpdateEnterpriseDeployKeySettingInput</code> was added</p>",
|
||||
"<p>Type <code>UpdateEnterpriseDeployKeySettingPayload</code> was added</p>",
|
||||
"<p>Field <code>updateEnterpriseDeployKeySetting</code> was added to object type <code>Mutation</code></p>"
|
||||
]
|
||||
}
|
||||
],
|
||||
"previewChanges": [],
|
||||
"upcomingChanges": [],
|
||||
"date": "2024-11-07"
|
||||
},
|
||||
{
|
||||
"schemaChanges": [
|
||||
{
|
||||
|
|
|
@ -24830,6 +24830,16 @@ type Mutation {
|
|||
input: UpdateEnterpriseDefaultRepositoryPermissionSettingInput!
|
||||
): UpdateEnterpriseDefaultRepositoryPermissionSettingPayload
|
||||
|
||||
"""
|
||||
Sets whether deploy keys are allowed to be created and used for an enterprise.
|
||||
"""
|
||||
updateEnterpriseDeployKeySetting(
|
||||
"""
|
||||
Parameters for UpdateEnterpriseDeployKeySetting
|
||||
"""
|
||||
input: UpdateEnterpriseDeployKeySettingInput!
|
||||
): UpdateEnterpriseDeployKeySettingPayload
|
||||
|
||||
"""
|
||||
Sets whether organization members with admin permissions on a repository can change repository visibility.
|
||||
"""
|
||||
|
@ -58729,6 +58739,46 @@ type UpdateEnterpriseDefaultRepositoryPermissionSettingPayload {
|
|||
message: String
|
||||
}
|
||||
|
||||
"""
|
||||
Autogenerated input type of UpdateEnterpriseDeployKeySetting
|
||||
"""
|
||||
input UpdateEnterpriseDeployKeySettingInput {
|
||||
"""
|
||||
A unique identifier for the client performing the mutation.
|
||||
"""
|
||||
clientMutationId: String
|
||||
|
||||
"""
|
||||
The ID of the enterprise on which to set the deploy key setting.
|
||||
"""
|
||||
enterpriseId: ID! @possibleTypes(concreteTypes: ["Enterprise"])
|
||||
|
||||
"""
|
||||
The value for the deploy key setting on the enterprise.
|
||||
"""
|
||||
settingValue: EnterpriseEnabledDisabledSettingValue!
|
||||
}
|
||||
|
||||
"""
|
||||
Autogenerated return type of UpdateEnterpriseDeployKeySetting.
|
||||
"""
|
||||
type UpdateEnterpriseDeployKeySettingPayload {
|
||||
"""
|
||||
A unique identifier for the client performing the mutation.
|
||||
"""
|
||||
clientMutationId: String
|
||||
|
||||
"""
|
||||
The enterprise with the updated deploy key setting.
|
||||
"""
|
||||
enterprise: Enterprise
|
||||
|
||||
"""
|
||||
A message confirming the result of updating the deploy key setting.
|
||||
"""
|
||||
message: String
|
||||
}
|
||||
|
||||
"""
|
||||
Autogenerated input type of UpdateEnterpriseMembersCanChangeRepositoryVisibilitySetting
|
||||
"""
|
||||
|
|
|
@ -7681,6 +7681,48 @@
|
|||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "updateEnterpriseDeployKeySetting",
|
||||
"kind": "mutations",
|
||||
"id": "updateenterprisedeploykeysetting",
|
||||
"href": "/graphql/reference/mutations#updateenterprisedeploykeysetting",
|
||||
"description": "<p>Sets whether deploy keys are allowed to be created and used for an enterprise.</p>",
|
||||
"inputFields": [
|
||||
{
|
||||
"name": "input",
|
||||
"type": "UpdateEnterpriseDeployKeySettingInput!",
|
||||
"id": "updateenterprisedeploykeysettinginput",
|
||||
"kind": "input-objects",
|
||||
"href": "/graphql/reference/input-objects#updateenterprisedeploykeysettinginput"
|
||||
}
|
||||
],
|
||||
"returnFields": [
|
||||
{
|
||||
"name": "clientMutationId",
|
||||
"type": "String",
|
||||
"id": "string",
|
||||
"kind": "scalars",
|
||||
"href": "/graphql/reference/scalars#string",
|
||||
"description": "<p>A unique identifier for the client performing the mutation.</p>"
|
||||
},
|
||||
{
|
||||
"name": "enterprise",
|
||||
"type": "Enterprise",
|
||||
"id": "enterprise",
|
||||
"kind": "objects",
|
||||
"href": "/graphql/reference/objects#enterprise",
|
||||
"description": "<p>The enterprise with the updated deploy key setting.</p>"
|
||||
},
|
||||
{
|
||||
"name": "message",
|
||||
"type": "String",
|
||||
"id": "string",
|
||||
"kind": "scalars",
|
||||
"href": "/graphql/reference/scalars#string",
|
||||
"description": "<p>A message confirming the result of updating the deploy key setting.</p>"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "updateEnterpriseMembersCanChangeRepositoryVisibilitySetting",
|
||||
"kind": "mutations",
|
||||
|
@ -103270,6 +103312,40 @@
|
|||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "UpdateEnterpriseDeployKeySettingInput",
|
||||
"kind": "inputObjects",
|
||||
"id": "updateenterprisedeploykeysettinginput",
|
||||
"href": "/graphql/reference/input-objects#updateenterprisedeploykeysettinginput",
|
||||
"description": "<p>Autogenerated input type of UpdateEnterpriseDeployKeySetting.</p>",
|
||||
"inputFields": [
|
||||
{
|
||||
"name": "clientMutationId",
|
||||
"description": "<p>A unique identifier for the client performing the mutation.</p>",
|
||||
"type": "String",
|
||||
"id": "string",
|
||||
"kind": "scalars",
|
||||
"href": "/graphql/reference/scalars#string"
|
||||
},
|
||||
{
|
||||
"name": "enterpriseId",
|
||||
"description": "<p>The ID of the enterprise on which to set the deploy key setting.</p>",
|
||||
"type": "ID!",
|
||||
"id": "id",
|
||||
"kind": "scalars",
|
||||
"href": "/graphql/reference/scalars#id",
|
||||
"isDeprecated": false
|
||||
},
|
||||
{
|
||||
"name": "settingValue",
|
||||
"description": "<p>The value for the deploy key setting on the enterprise.</p>",
|
||||
"type": "EnterpriseEnabledDisabledSettingValue!",
|
||||
"id": "enterpriseenableddisabledsettingvalue",
|
||||
"kind": "enums",
|
||||
"href": "/graphql/reference/enums#enterpriseenableddisabledsettingvalue"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "UpdateEnterpriseMembersCanChangeRepositoryVisibilitySettingInput",
|
||||
"kind": "inputObjects",
|
||||
|
|
|
@ -24830,6 +24830,16 @@ type Mutation {
|
|||
input: UpdateEnterpriseDefaultRepositoryPermissionSettingInput!
|
||||
): UpdateEnterpriseDefaultRepositoryPermissionSettingPayload
|
||||
|
||||
"""
|
||||
Sets whether deploy keys are allowed to be created and used for an enterprise.
|
||||
"""
|
||||
updateEnterpriseDeployKeySetting(
|
||||
"""
|
||||
Parameters for UpdateEnterpriseDeployKeySetting
|
||||
"""
|
||||
input: UpdateEnterpriseDeployKeySettingInput!
|
||||
): UpdateEnterpriseDeployKeySettingPayload
|
||||
|
||||
"""
|
||||
Sets whether organization members with admin permissions on a repository can change repository visibility.
|
||||
"""
|
||||
|
@ -58729,6 +58739,46 @@ type UpdateEnterpriseDefaultRepositoryPermissionSettingPayload {
|
|||
message: String
|
||||
}
|
||||
|
||||
"""
|
||||
Autogenerated input type of UpdateEnterpriseDeployKeySetting
|
||||
"""
|
||||
input UpdateEnterpriseDeployKeySettingInput {
|
||||
"""
|
||||
A unique identifier for the client performing the mutation.
|
||||
"""
|
||||
clientMutationId: String
|
||||
|
||||
"""
|
||||
The ID of the enterprise on which to set the deploy key setting.
|
||||
"""
|
||||
enterpriseId: ID! @possibleTypes(concreteTypes: ["Enterprise"])
|
||||
|
||||
"""
|
||||
The value for the deploy key setting on the enterprise.
|
||||
"""
|
||||
settingValue: EnterpriseEnabledDisabledSettingValue!
|
||||
}
|
||||
|
||||
"""
|
||||
Autogenerated return type of UpdateEnterpriseDeployKeySetting.
|
||||
"""
|
||||
type UpdateEnterpriseDeployKeySettingPayload {
|
||||
"""
|
||||
A unique identifier for the client performing the mutation.
|
||||
"""
|
||||
clientMutationId: String
|
||||
|
||||
"""
|
||||
The enterprise with the updated deploy key setting.
|
||||
"""
|
||||
enterprise: Enterprise
|
||||
|
||||
"""
|
||||
A message confirming the result of updating the deploy key setting.
|
||||
"""
|
||||
message: String
|
||||
}
|
||||
|
||||
"""
|
||||
Autogenerated input type of UpdateEnterpriseMembersCanChangeRepositoryVisibilitySetting
|
||||
"""
|
||||
|
|
|
@ -7681,6 +7681,48 @@
|
|||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "updateEnterpriseDeployKeySetting",
|
||||
"kind": "mutations",
|
||||
"id": "updateenterprisedeploykeysetting",
|
||||
"href": "/graphql/reference/mutations#updateenterprisedeploykeysetting",
|
||||
"description": "<p>Sets whether deploy keys are allowed to be created and used for an enterprise.</p>",
|
||||
"inputFields": [
|
||||
{
|
||||
"name": "input",
|
||||
"type": "UpdateEnterpriseDeployKeySettingInput!",
|
||||
"id": "updateenterprisedeploykeysettinginput",
|
||||
"kind": "input-objects",
|
||||
"href": "/graphql/reference/input-objects#updateenterprisedeploykeysettinginput"
|
||||
}
|
||||
],
|
||||
"returnFields": [
|
||||
{
|
||||
"name": "clientMutationId",
|
||||
"type": "String",
|
||||
"id": "string",
|
||||
"kind": "scalars",
|
||||
"href": "/graphql/reference/scalars#string",
|
||||
"description": "<p>A unique identifier for the client performing the mutation.</p>"
|
||||
},
|
||||
{
|
||||
"name": "enterprise",
|
||||
"type": "Enterprise",
|
||||
"id": "enterprise",
|
||||
"kind": "objects",
|
||||
"href": "/graphql/reference/objects#enterprise",
|
||||
"description": "<p>The enterprise with the updated deploy key setting.</p>"
|
||||
},
|
||||
{
|
||||
"name": "message",
|
||||
"type": "String",
|
||||
"id": "string",
|
||||
"kind": "scalars",
|
||||
"href": "/graphql/reference/scalars#string",
|
||||
"description": "<p>A message confirming the result of updating the deploy key setting.</p>"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "updateEnterpriseMembersCanChangeRepositoryVisibilitySetting",
|
||||
"kind": "mutations",
|
||||
|
@ -103270,6 +103312,40 @@
|
|||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "UpdateEnterpriseDeployKeySettingInput",
|
||||
"kind": "inputObjects",
|
||||
"id": "updateenterprisedeploykeysettinginput",
|
||||
"href": "/graphql/reference/input-objects#updateenterprisedeploykeysettinginput",
|
||||
"description": "<p>Autogenerated input type of UpdateEnterpriseDeployKeySetting.</p>",
|
||||
"inputFields": [
|
||||
{
|
||||
"name": "clientMutationId",
|
||||
"description": "<p>A unique identifier for the client performing the mutation.</p>",
|
||||
"type": "String",
|
||||
"id": "string",
|
||||
"kind": "scalars",
|
||||
"href": "/graphql/reference/scalars#string"
|
||||
},
|
||||
{
|
||||
"name": "enterpriseId",
|
||||
"description": "<p>The ID of the enterprise on which to set the deploy key setting.</p>",
|
||||
"type": "ID!",
|
||||
"id": "id",
|
||||
"kind": "scalars",
|
||||
"href": "/graphql/reference/scalars#id",
|
||||
"isDeprecated": false
|
||||
},
|
||||
{
|
||||
"name": "settingValue",
|
||||
"description": "<p>The value for the deploy key setting on the enterprise.</p>",
|
||||
"type": "EnterpriseEnabledDisabledSettingValue!",
|
||||
"id": "enterpriseenableddisabledsettingvalue",
|
||||
"kind": "enums",
|
||||
"href": "/graphql/reference/enums#enterpriseenableddisabledsettingvalue"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "UpdateEnterpriseMembersCanChangeRepositoryVisibilitySettingInput",
|
||||
"kind": "inputObjects",
|
||||
|
|
|
@ -17,15 +17,15 @@ describe('frame', () => {
|
|||
test.each(langs)('breadcrumbs link to %s pages', async (lang) => {
|
||||
const $ = await getDOM(`/${lang}/get-started/learning-about-github`)
|
||||
const $breadcrumbs = $('[data-testid=breadcrumbs-in-article] a')
|
||||
expect($breadcrumbs[0].attribs.href).toBe(`/${lang}/get-started`)
|
||||
expect(($breadcrumbs[0] as cheerio.TagElement).attribs.href).toBe(`/${lang}/get-started`)
|
||||
})
|
||||
|
||||
test.each(langs)('homepage links go to %s pages', async (lang) => {
|
||||
const $ = await getDOM(`/${lang}`)
|
||||
const $links = $('[data-testid=bump-link]')
|
||||
$links.each((i: number, el: Element) => {
|
||||
$links.each((i: number, el: cheerio.Element) => {
|
||||
const linkUrl = $(el).attr('href')
|
||||
expect(linkUrl.startsWith(`/${lang}/`)).toBe(true)
|
||||
expect((linkUrl || '').startsWith(`/${lang}/`)).toBe(true)
|
||||
})
|
||||
})
|
||||
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
import fs from 'fs'
|
||||
import path from 'path'
|
||||
|
||||
import cheerio, { type CheerioAPI, type Element } from 'cheerio'
|
||||
import cheerio from 'cheerio'
|
||||
import coreLib from '@actions/core'
|
||||
import got, { RequestError } from 'got'
|
||||
import chalk from 'chalk'
|
||||
|
@ -339,7 +339,15 @@ async function main(
|
|||
const t0 = new Date().getTime()
|
||||
const flawsGroups = await Promise.all(
|
||||
pages.map((page: Page) =>
|
||||
processPage(core, page, pageMap, redirects, opts, externalLinkCheckerDB, versions),
|
||||
processPage(
|
||||
core,
|
||||
page,
|
||||
pageMap,
|
||||
redirects,
|
||||
opts,
|
||||
externalLinkCheckerDB,
|
||||
versions as string[],
|
||||
),
|
||||
),
|
||||
)
|
||||
const t1 = new Date().getTime()
|
||||
|
@ -695,13 +703,13 @@ async function processPermalink(
|
|||
}
|
||||
const $ = cheerio.load(html, { xmlMode: true })
|
||||
const flaws: LinkFlaw[] = []
|
||||
const links: Element[] = []
|
||||
const links: cheerio.Element[] = []
|
||||
$('a[href]').each((i, link) => {
|
||||
links.push(link)
|
||||
})
|
||||
const newFlaws: LinkFlaw[] = await Promise.all(
|
||||
links.map(async (link) => {
|
||||
const { href } = link.attribs
|
||||
const { href } = (link as cheerio.TagElement).attribs
|
||||
|
||||
// The global cache can't be used for anchor links because they
|
||||
// depend on each page it renders
|
||||
|
@ -752,7 +760,7 @@ async function processPermalink(
|
|||
|
||||
if (checkImages) {
|
||||
$('img[src]').each((i, img) => {
|
||||
let { src } = img.attribs
|
||||
let { src } = (img as cheerio.TagElement).attribs
|
||||
|
||||
// Images get a cache-busting prefix injected in the image
|
||||
// E.g. <img src="/assets/cb-123456/foo/bar.png">
|
||||
|
@ -874,7 +882,7 @@ let globalCacheMissCount = 0
|
|||
async function checkHrefLink(
|
||||
core: any,
|
||||
href: string,
|
||||
$: CheerioAPI,
|
||||
$: cheerio.Root,
|
||||
redirects: Redirects,
|
||||
pageMap: PageMap,
|
||||
checkAnchors = false,
|
||||
|
|
|
@ -16,9 +16,36 @@ The site search is part of every version of docs.github.com. This endpoint respo
|
|||
You can also query our search endpoint directly at:
`https://docs.github.com/search?version=<VERSION>&language=<LANGUAGE CODE>&query=<QUERY>`

- The VERSION can be any numbered supported GitHub Enterprise Server version (e.g., `3.12`), Enterprise Cloud (`ghec`), or the Free pro team plan (`dotcom`).
- The LANGUAGE CODE can be one of: `zh`, `es`, `pt`, `ru`, `ja`, `fr`, `de`, `ko`
- Any search QUERY you'd like.
- The `VERSION` can be any numbered supported GitHub Enterprise Server version (e.g., `3.12`), Enterprise Cloud (`ghec`), or the Free pro team plan (`dotcom`).
- The `LANGUAGE CODE` can be one of: `zh`, `es`, `pt`, `ru`, `ja`, `fr`, `de`, `ko`
- The `QUERY` can be any alphanumeric string value.
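For example, a filled-in query might look like this (the values are illustrative):

```
https://docs.github.com/search?version=ghec&language=es&query=clone
```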
## Types of search

Our backend currently supports 3 "types" of searching.

All searches accept a `query` param, e.g. `?query=how` and return results based on their type:

1. **general search**
   - Results: The pages of our sites that match the query, sorted by popularity
   - Example: Query = "clone" -> Results <URLs to Docs Page about cloning>
   - Endpoint: `/api/search/v1`
2. **general autocomplete**
   - Results: Potential terms that can be autocompleted from the query based on previous user searches
   - Example: Query = "cl" -> A Result = "clone"
   - Endpoint: `/api/search/autocomplete/v1`
3. **AI search autocomplete**
   - Results: Human-readable full-sentence questions that best match the query. Questions are based on previous searches and popular pages
   - Example: Query = "How do I clone" -> A Result = "How do I clone a repository?"
   - Endpoint: `/api/search/ai-search-autocomplete/v1`
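As a minimal sketch, the endpoints above can be queried directly; this assumes JSON responses and that `query` is the only required parameter, which may not hold for every version:

```typescript
// Hypothetical example of calling each search endpoint from Node 18+ (global fetch).
const base = 'https://docs.github.com'

const generalSearch = await (await fetch(`${base}/api/search/v1?query=clone`)).json()
const generalAutocomplete = await (await fetch(`${base}/api/search/autocomplete/v1?query=cl`)).json()
const aiSearchAutocomplete = await (
  await fetch(`${base}/api/search/ai-search-autocomplete/v1?query=${encodeURIComponent('How do I clone')}`)
).json()

console.log({ generalSearch, generalAutocomplete, aiSearchAutocomplete })
```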
## Elasticsearch

Elasticsearch is an external service that we use for searching. When a user types a search, our backend queries Elasticsearch for the most relevant results.

### Indexing Elasticsearch

In order to provide relevant results to queries, we prefill Elasticsearch with data via Indexes. See the [Indexing README](./scripts/index/README.md) for how we index on Docs.
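For manual or local runs, the indexing entry points are also exposed as npm scripts in `package.json`, for example (the `docs-internal-data` argument mirrors how the autocomplete indexers are invoked elsewhere in this repo and is illustrative here; see the Indexing README for the real CLI options):

```shell
npm run index-general-autocomplete -- docs-internal-data
npm run index-ai-search-autocomplete -- docs-internal-data
```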
## Production deploys
@ -32,40 +59,25 @@ You can manually run the workflow to generate the indexes after you push your ch
### Build and sync

The preferred way to build and sync the search indices is to do so via the [GitHub Actions workflow](/.github/workflows/sync-search-elasticsearch.yml).
The preferred way to build and sync the search indices is to do so via the [GitHub Actions workflow](/.github/workflows/index-general-search.yml).

## Files

### Actions workflow files

- [`.github/workflows/sync-search-elasticsearch.yml`](/.github/workflows/sync-search-elasticsearch.yml) - Builds and syncs search indices on the `main` branch every four hours. Search indices are stored in an internal-only Elasticsearch instance. To run it manually, click the "Run workflow" button in the Actions tab.
- [`.github/workflows/index-general-search.yml`](/.github/workflows/index-general-search.yml) - Populates search indices for **general search** using the `main` branch every four hours. Search indices are stored in an internal-only Elasticsearch instance. To run it manually, click the "Run workflow" button in the Actions tab.
- [`.github/workflows/index-autocomplete-search.yml`](/.github/workflows/index-autocomplete-search.yml) - Populates search indices for both **general autocomplete** and **AI search autocomplete** using data from an internal repo. Runs daily.
### Notable code files and directories

- [src/search/components/Search.tsx](/src/search/components/Search.tsx) - The browser-side code that enables the search.
- [src/search/components/SearchResults.tsx](/src/search/components/SearchResults.tsx) - The browser-side code that displays search results.
- [src/search/middleware/es-search.js](/src/search/middleware/es-search.js) - A wrapper around the Node.js Elasticsearch module for interacting with the search API.
- [src/search/scripts/](/src/search/scripts/) - Scripts used by Actions workflows or for manual operations.
- [src/search/tests](/src/search/tests) - Tests!
- [src/search/middleware/general-search-middleware.ts](src/search/middleware/general-search-middleware.ts) - Entrypoint to general search when you hit docs.github.com/search
- [src/search/middleware/search-routes.ts](src/search/middleware/search-routes.ts) - Entrypoint to the API endpoints for our search routes
- [src/search/scripts/](/src/search/scripts/) - Scripts used by Actions workflows or for manual operations like scraping data for indexing and performing the indexing.
- [src/search/tests](/src/search/tests) - Tests relevant to searching.
## Records

Each record represents a page. Each record has `breadcrumbs`, `title`, `headings`, `content` (the article content in text, not HTML), `intro` (if one exists in the frontmatter), and a unique `objectID` that is currently just the permalink of the article. Here's an example:

```json
{
  "objectID": "/en/actions/creating-actions/about-custom-actions",
  "breadcrumbs": "GitHub Actions / Creating actions",
  "title": "About custom actions",
  "headings": "About custom actions\nTypes of actions\n[...]",
  "content": "Actions are individual tasks that you can combine to create jobs and customize your workflow. You can create your own actions, [...]",
  "intro": "Actions are individual tasks that you can combine to create jobs and customize your workflow. You can create your own actions, or use and customize actions shared by the GitHub community.",
  "toplevel": "GitHub Actions",
  "popularity": 0
}
```

## Notes
## Miscellaneous Notes

- It's not strictly necessary to set an `objectID` as the search index will create one automatically, but by creating our own we have a guarantee that subsequent invocations of this upload script will overwrite existing records instead of creating numerous duplicate records with differing IDs.
- Our search querying has typo tolerance. Try spelling something wrong and see what you get!
@ -2,9 +2,10 @@ import { CheckboxGroup, Checkbox, FormControl } from '@primer/react'
|
|||
import { useRouter } from 'next/router'
|
||||
import Link from 'next/link'
|
||||
|
||||
import type { SearchResultAggregations } from './types'
|
||||
import { useTranslation } from 'src/languages/components/useTranslation'
|
||||
|
||||
import type { SearchResultAggregations } from 'src/search/types'
|
||||
|
||||
type Props = {
|
||||
aggregations: SearchResultAggregations
|
||||
}
|
||||
|
|
|
@ -4,30 +4,39 @@ import { useRouter } from 'next/router'
|
|||
import { useEffect, useState } from 'react'
|
||||
import cx from 'classnames'
|
||||
|
||||
import type { SearchResultsT, SearchResultHitT, SearchQueryT } from './types'
|
||||
import { useTranslation } from 'src/languages/components/useTranslation'
|
||||
import { Link } from 'src/frame/components/Link'
|
||||
import { sendEvent, EventType } from 'src/events/components/events'
|
||||
|
||||
import styles from './SearchResults.module.scss'
|
||||
|
||||
import type { SearchQueryContentT } from 'src/search/components/types'
|
||||
import type { GeneralSearchHitWithoutIncludes, GeneralSearchResponse } from 'src/search/types'
|
||||
import type { SearchTotalHits } from '@elastic/elasticsearch/lib/api/types'
|
||||
|
||||
type Props = {
|
||||
results: SearchResultsT
|
||||
search: SearchQueryT
|
||||
results: GeneralSearchResponse
|
||||
searchParams: SearchQueryContentT
|
||||
}
|
||||
export function SearchResults({ results, search }: Props) {
|
||||
const pages = Math.ceil(results.meta.found.value / results.meta.size)
|
||||
export function SearchResults({ results, searchParams }: Props) {
|
||||
const pages = Math.ceil((results.meta.found as SearchTotalHits).value / results.meta.size)
|
||||
const { page } = results.meta
|
||||
|
||||
return (
|
||||
<div>
|
||||
<SearchResultHits hits={results.hits} search={search} />
|
||||
<SearchResultHits hits={results.hits} searchParams={searchParams} />
|
||||
{pages > 1 && <ResultsPagination page={page} totalPages={pages} />}
|
||||
</div>
|
||||
)
|
||||
}
|
||||
|
||||
function SearchResultHits({ hits, search }: { hits: SearchResultHitT[]; search: SearchQueryT }) {
|
||||
function SearchResultHits({
|
||||
hits,
|
||||
searchParams,
|
||||
}: {
|
||||
hits: GeneralSearchHitWithoutIncludes[]
|
||||
searchParams: SearchQueryContentT
|
||||
}) {
|
||||
return (
|
||||
<div>
|
||||
{hits.length === 0 && <NoSearchResults />}
|
||||
|
@ -35,10 +44,10 @@ function SearchResultHits({ hits, search }: { hits: SearchResultHitT[]; search:
|
|||
<SearchResultHit
|
||||
key={hit.id}
|
||||
hit={hit}
|
||||
query={search.query}
|
||||
query={searchParams.query}
|
||||
totalHits={hits.length}
|
||||
index={index}
|
||||
debug={search.debug}
|
||||
debug={searchParams.debug}
|
||||
/>
|
||||
))}
|
||||
</div>
|
||||
|
@ -64,7 +73,7 @@ function SearchResultHit({
|
|||
index,
|
||||
debug,
|
||||
}: {
|
||||
hit: SearchResultHitT
|
||||
hit: GeneralSearchHitWithoutIncludes
|
||||
query: string
|
||||
totalHits: number
|
||||
index: number
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
import { Flash } from '@primer/react'
|
||||
|
||||
import { useTranslation } from 'src/languages/components/useTranslation'
|
||||
import type { SearchValidationErrorT } from './types'
|
||||
import type { SearchValidationErrorEntry } from '../types'
|
||||
|
||||
interface Props {
|
||||
errors: SearchValidationErrorT[]
|
||||
errors: SearchValidationErrorEntry[]
|
||||
}
|
||||
|
||||
export function ValidationErrors({ errors }: Props) {
|
||||
|
|
|
@ -1,10 +1,5 @@
|
|||
import { createContext, useContext } from 'react'
|
||||
|
||||
import type { SearchT } from '../types'
|
||||
|
||||
export type SearchContextT = {
|
||||
search: SearchT
|
||||
}
|
||||
import type { SearchContextT } from '../types'
|
||||
|
||||
export const SearchContext = createContext<SearchContextT | null>(null)
|
||||
|
||||
|
|
|
@ -7,8 +7,9 @@ import { useNumberFormatter } from 'src/search/components/useNumberFormatter'
|
|||
import { SearchResults } from 'src/search/components/SearchResults'
|
||||
import { NoQuery } from 'src/search/components/NoQuery'
|
||||
import { useMainContext } from 'src/frame/components/context/MainContext'
|
||||
import { ValidationErrors } from './ValidationErrors'
|
||||
import { useSearchContext } from './context/SearchContext'
|
||||
import { ValidationErrors } from 'src/search/components/ValidationErrors'
|
||||
import { useSearchContext } from 'src/search/components/context/SearchContext'
|
||||
import type { SearchTotalHits } from '@elastic/elasticsearch/lib/api/types'
|
||||
|
||||
export function Search() {
|
||||
const { search } = useSearchContext()
|
||||
|
@ -17,7 +18,7 @@ export function Search() {
|
|||
const { t } = useTranslation('search_results')
|
||||
const { currentVersion } = useVersion()
|
||||
|
||||
const { query } = search.search
|
||||
const { query } = search.searchParams
|
||||
|
||||
// A reference to the `content/search/index.md` Page object.
|
||||
// Not to be confused with the "page" that is for paginating
|
||||
|
@ -37,7 +38,7 @@ export function Search() {
|
|||
pageTitle += ` (${searchVersion})`
|
||||
}
|
||||
if (results) {
|
||||
pageTitle = `${formatInteger(results.meta.found.value)} ${pageTitle}`
|
||||
pageTitle = `${formatInteger((results.meta.found as SearchTotalHits).value)} ${pageTitle}`
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -63,7 +64,7 @@ export function Search() {
|
|||
<ValidationErrors errors={validationErrors} />
|
||||
) : null}
|
||||
|
||||
{results ? <SearchResults results={results} search={search.search} /> : null}
|
||||
{results ? <SearchResults results={results} searchParams={search.searchParams} /> : null}
|
||||
</div>
|
||||
)
|
||||
}
|
||||
|
|
|
@ -1,58 +1,15 @@
|
|||
export type SearchResultHitT = {
|
||||
id: string
|
||||
url: string
|
||||
title: string
|
||||
breadcrumbs: string
|
||||
highlights: {
|
||||
title?: string[]
|
||||
content?: string[]
|
||||
content_explicit?: string[]
|
||||
import { GeneralSearchResponse, SearchValidationErrorEntry } from 'src/search/types'
|
||||
|
||||
export interface SearchContextT {
|
||||
search: {
|
||||
results?: GeneralSearchResponse
|
||||
searchParams: SearchQueryContentT
|
||||
validationErrors: SearchValidationErrorEntry[]
|
||||
}
|
||||
score?: number
|
||||
popularity?: number
|
||||
es_url?: string
|
||||
}
|
||||
|
||||
type SearchResultsMeta = {
|
||||
found: {
|
||||
value: number
|
||||
relation: string
|
||||
}
|
||||
took: {
|
||||
query_msec: number
|
||||
total_msec: number
|
||||
}
|
||||
page: number
|
||||
size: number
|
||||
}
|
||||
|
||||
type Aggregation = {
|
||||
key: string
|
||||
count: number
|
||||
}
|
||||
|
||||
export type SearchResultAggregations = {
|
||||
[key: string]: Aggregation[]
|
||||
}
|
||||
|
||||
export type SearchResultsT = {
|
||||
meta: SearchResultsMeta
|
||||
hits: SearchResultHitT[]
|
||||
aggregations?: SearchResultAggregations
|
||||
}
|
||||
|
||||
export type SearchQueryT = {
|
||||
// Parts of the search query that are set to the search context
|
||||
export type SearchQueryContentT = {
|
||||
query: string
|
||||
debug: boolean
|
||||
}
|
||||
|
||||
export type SearchValidationErrorT = {
|
||||
error: string
|
||||
// key: string
|
||||
}
|
||||
|
||||
export type SearchT = {
|
||||
search: SearchQueryT
|
||||
results?: SearchResultsT
|
||||
validationErrors: SearchValidationErrorT[]
|
||||
}
|
||||
|
|
|
@ -1,5 +0,0 @@
|
|||
export const namePrefix = 'github-docs'
|
||||
|
||||
export default {
|
||||
namePrefix,
|
||||
}
|
|
@ -0,0 +1,91 @@
|
|||
import languages from '@/languages/lib/languages.js'
|
||||
import { utcTimestamp } from '@/search/lib/helpers/time'
|
||||
import { allIndexVersionKeys, versionToIndexVersionMap } from '@/search/lib/elasticsearch-versions'
|
||||
|
||||
import type { SearchTypes } from '@/search/types'
|
||||
|
||||
export type SearchIndexes = {
|
||||
[key in SearchTypes]: SearchIndex
|
||||
}
|
||||
|
||||
export type SearchIndex = {
|
||||
prefix: string
|
||||
type: string
|
||||
}
|
||||
|
||||
/* Elasticsearch uses indexes to group categories of data
|
||||
|
||||
We currently have 3 top-level categories of indexes:
|
||||
1. General search: This is populated using data from all of our Docs pages
|
||||
2. General autocomplete: This is populated using analytics search history in docs-internal-data
|
||||
3. AI autocomplete: This is populated with human-readable questions using a GPT query in docs-internal-data
|
||||
|
||||
This file is intended to be the source of truth for Docs Elasticsearch indexes.
|
||||
|
||||
Indexes are in the form:
|
||||
<test_prefix><prefix>-<type>-<version>-<language>
|
||||
e.g. github-docs-general-search-fpt-en
|
||||
|
||||
<test-prefix> might be "tests_" for tests
|
||||
*/
|
||||
const prefix = 'github-docs'
|
||||
const indexes: SearchIndexes = {
|
||||
generalSearch: {
|
||||
prefix,
|
||||
type: 'general-search',
|
||||
},
|
||||
generalAutocomplete: {
|
||||
prefix,
|
||||
type: 'general-autocomplete',
|
||||
},
|
||||
aiSearchAutocomplete: {
|
||||
prefix,
|
||||
type: 'ai-search-autocomplete',
|
||||
},
|
||||
}
|
||||
|
||||
// Source of truth for determining the index name for the Elastic Search index given a version and language
|
||||
export function getElasticSearchIndex(
|
||||
type: SearchTypes,
|
||||
version: string,
|
||||
language: string,
|
||||
manualPrefix = '',
|
||||
): {
|
||||
indexName: string
|
||||
indexAlias: string
|
||||
} {
|
||||
if (!(type in indexes)) {
|
||||
throw new Error(`Type ${type} not found in indexes for getElasticSearchIndex function.`)
|
||||
}
|
||||
const index = indexes[type] as SearchIndex
|
||||
|
||||
// Validate language
|
||||
if (!(language in languages)) {
|
||||
throw new Error(
|
||||
`Language ${language} not found in languages for getElasticSearchIndex function.`,
|
||||
)
|
||||
}
|
||||
|
||||
// Validate version
|
||||
if (!allIndexVersionKeys.includes(version)) {
|
||||
throw new Error(
|
||||
`Version '${version}' does not map to a valid version for getElasticSearchIndex function.`,
|
||||
)
|
||||
}
|
||||
|
||||
// e.g. free-pro-team becomes fpt for the index name
|
||||
const indexVersion = versionToIndexVersionMap[version]
|
||||
|
||||
// In the index-test-fixtures.sh script, we use the tests_ prefix index for testing
|
||||
const testPrefix = process.env.NODE_ENV === 'test' ? 'tests_' : ''
|
||||
|
||||
// If a manual prefix is provided, append an underscore to it
|
||||
if (manualPrefix && !manualPrefix.endsWith('_')) {
|
||||
manualPrefix += '_'
|
||||
}
|
||||
|
||||
const indexName = `${testPrefix || manualPrefix}${index.prefix}_${index.type}_${indexVersion}_${language}`
|
||||
const indexAlias = `${indexName}__${utcTimestamp()}`
|
||||
|
||||
return { indexName, indexAlias }
|
||||
}
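// Illustrative usage (hypothetical call, following the logic above): with type 'generalSearch',
// version 'free-pro-team@latest' (which maps to 'fpt'), language 'en', and NODE_ENV not set to
// 'test', this yields an indexName of 'github-docs_general-search_fpt_en' and an indexAlias of
// that name suffixed with '__' plus the value of utcTimestamp().
// const { indexName, indexAlias } = getElasticSearchIndex('generalSearch', 'free-pro-team@latest', 'en')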
|
|
@ -0,0 +1,107 @@
|
|||
/*
|
||||
* Source of truth for versioning in the context of Elasticsearch
|
||||
* We have a unique index for each version of the docs
|
||||
* so consistency is important for creating/accessing ES Indexes.
|
||||
*
|
||||
* Example versions (these may not be up to date):
|
||||
*
|
||||
* 1. free-pro-team@latest. Previously known as "dotcom". This is the default version of the docs.
|
||||
* - short name: fpt
|
||||
* 2. enterprise-cloud@latest
|
||||
* - short name: ghec
|
||||
* 3. enterprise-server@X: This is the source of versioning complexity since the version is dynamic
|
||||
* - short name: ghes-X
|
||||
*
|
||||
* However, for (3) someone might enter `&version=3.5` as the version in the request query string.
|
||||
* This would map to `ghes-3.5`
|
||||
*/
|
||||
|
||||
import { allVersions } from '@/versions/lib/all-versions'
|
||||
|
||||
// versionToIndexVersionMap examples:
|
||||
// free-pro-team@latest -> fpt
|
||||
// free-pro-team -> fpt
|
||||
// dotcom -> fpt
|
||||
// enterprise-cloud@latest -> ghec
|
||||
// enterprise-server@3.5 -> ghes-3.5
|
||||
// 3.5 -> ghes-3.5
|
||||
export const versionToIndexVersionMap: { [key: string]: string } = {}
|
||||
|
||||
// For each potential input (from request query string, CLI, etc), map it to the appropriate index version
|
||||
for (const versionSource of Object.values(allVersions)) {
|
||||
if (versionSource.hasNumberedReleases) {
|
||||
versionToIndexVersionMap[versionSource.currentRelease] = versionSource.miscVersionName
|
||||
// Map shortname or plan, e.g. `ghes` or `enterprise-server` to the latest release, e.g. `ghes-3.14`
|
||||
if (versionSource.latestRelease === versionSource.currentRelease) {
|
||||
versionToIndexVersionMap[versionSource.plan] = versionSource.miscVersionName
|
||||
versionToIndexVersionMap[versionSource.shortName] = versionSource.miscVersionName
|
||||
}
|
||||
} else {
|
||||
versionToIndexVersionMap[versionSource.version] = versionSource.shortName
|
||||
versionToIndexVersionMap[versionSource.miscVersionName] = versionSource.shortName
|
||||
// The next two lines map things like `?version=free-pro-team` -> `?version=fpt`
|
||||
versionToIndexVersionMap[versionSource.plan] = versionSource.shortName
|
||||
versionToIndexVersionMap[versionSource.shortName] = versionSource.shortName
|
||||
}
|
||||
}
|
||||
|
||||
// All of the possible keys that can be input to access a version
|
||||
export const allIndexVersionKeys = Array.from(
|
||||
new Set([...Object.keys(versionToIndexVersionMap), ...Object.keys(allVersions)]),
|
||||
)
|
||||
|
||||
// These should be the only possible values that an ES index will use (source of truth)
|
||||
// allIndexVersionOptions example:
|
||||
// fpt, ghec, ghes-3.14, ghes-3.13, ghes-3.12, ghes-3.11, ghes-3.10
|
||||
export const allIndexVersionOptions = Array.from(
|
||||
new Set([...Object.values(versionToIndexVersionMap)]),
|
||||
)
|
||||
|
||||
// Autocomplete only supports 3 "versions": free-pro-team, enterprise-cloud, and enterprise-server
|
||||
// docs-internal-data stores data under directories with these names. It does not account for individual enterprise-server versions
|
||||
// These are the "plan" names on the allVersions object
|
||||
const allVersionPlans: string[] = []
|
||||
for (const version of Object.values(allVersions)) {
|
||||
if (version.plan) {
|
||||
allVersionPlans.push(version.plan)
|
||||
}
|
||||
}
|
||||
// Remove duplicates
|
||||
export const supportedAutocompletePlanVersions = Array.from(new Set(allVersionPlans))
|
||||
|
||||
// Returns the plan name for the given version
|
||||
// Needed because {version} in the docs-internal-data paths use the version's 'plan' name, e.g. `free-pro-team` instead of `fpt`
|
||||
export function getPlanVersionFromIndexVersion(indexVersion: string): string {
|
||||
const planVersion =
|
||||
Object.values(allVersions).find(
|
||||
(info) =>
|
||||
info.shortName === indexVersion ||
|
||||
info.plan === indexVersion ||
|
||||
info.miscVersionName === indexVersion ||
|
||||
info.currentRelease === indexVersion,
|
||||
)?.plan || ''
|
||||
|
||||
if (!planVersion) {
|
||||
throw new Error(`Plan version not found for index version ${indexVersion}`)
|
||||
}
|
||||
|
||||
return planVersion
|
||||
}
|
||||
|
||||
// Gets the matching key from allVersions for the given index version
|
||||
// This is needed for scraping since the pages use the 'allVersions' key as their version
|
||||
export function getAllVersionsKeyFromIndexVersion(indexVersion: string): string {
|
||||
const key = Object.keys(allVersions).find(
|
||||
(key) =>
|
||||
key === indexVersion ||
|
||||
allVersions[key].shortName === indexVersion ||
|
||||
allVersions[key].plan === indexVersion ||
|
||||
allVersions[key].miscVersionName === indexVersion,
|
||||
)
|
||||
|
||||
if (!key) {
|
||||
throw new Error(`No key found for index version ${indexVersion}`)
|
||||
}
|
||||
|
||||
return key
|
||||
}
|
|
@ -0,0 +1,125 @@
|
|||
import { Client } from '@elastic/elasticsearch'
|
||||
import { getElasticsearchClient } from '@/search/lib/helpers/get-client'
|
||||
import { getHighlightConfiguration } from '@/search/lib/get-elasticsearch-results/helpers/elasticsearch-highlight-config'
|
||||
|
||||
import type { AutocompleteSearchResponse } from '@/search/types'
|
||||
import type {
|
||||
AutocompleteMatchQueriesOptions,
|
||||
AutocompleteResultsArgs,
|
||||
} from '@/search/lib/get-elasticsearch-results/types'
|
||||
import type { QueryDslQueryContainer, SearchTotalHits } from '@elastic/elasticsearch/lib/api/types'
|
||||
|
||||
// Query Elasticsearch for AI Search autocomplete results
|
||||
export async function getAISearchAutocompleteResults({
|
||||
indexName,
|
||||
query,
|
||||
size,
|
||||
}: AutocompleteResultsArgs): Promise<AutocompleteSearchResponse> {
|
||||
const t0 = new Date()
|
||||
const client = getElasticsearchClient() as Client
|
||||
|
||||
const matchQueries = getAISearchAutocompleteMatchQueries(query.trim(), {
|
||||
fuzzy: {
|
||||
minLength: 3,
|
||||
maxLength: 20,
|
||||
},
|
||||
})
|
||||
const matchQuery = {
|
||||
bool: {
|
||||
should: matchQueries,
|
||||
},
|
||||
}
|
||||
|
||||
const highlight = getHighlightConfiguration(query, ['term'])
|
||||
|
||||
const searchQuery = {
|
||||
index: indexName,
|
||||
highlight,
|
||||
size,
|
||||
query: matchQuery,
|
||||
_source_includes: ['term'],
|
||||
}
|
||||
|
||||
const result = await client.search<{ term: string }>(searchQuery)
|
||||
|
||||
const hitsAll = result.hits
|
||||
const hits = hitsAll.hits.map((hit) => ({
|
||||
term: hit._source?.term,
|
||||
highlights: (hit.highlight && hit.highlight.term) || [],
|
||||
}))
|
||||
|
||||
return {
|
||||
meta: {
|
||||
found: hitsAll.total as SearchTotalHits,
|
||||
took: { query_msec: result.took, total_msec: new Date().getTime() - t0.getTime() },
|
||||
size,
|
||||
},
|
||||
hits,
|
||||
}
|
||||
}
|
||||
|
||||
function getAISearchAutocompleteMatchQueries(
|
||||
query: string,
|
||||
{ fuzzy }: AutocompleteMatchQueriesOptions,
|
||||
) {
|
||||
const BOOST_PHRASE = 4.0
|
||||
const BOOST_REGULAR = 2.0
|
||||
const BOOST_PREFIX = 1.0
|
||||
const BOOST_FUZZY = 0.1
|
||||
|
||||
const matchQueries: QueryDslQueryContainer[] = []
|
||||
|
||||
// Use match_phrase for exact term matches
|
||||
matchQueries.push({
|
||||
match_phrase: {
|
||||
term: {
|
||||
query,
|
||||
boost: BOOST_PHRASE,
|
||||
slop: 1, // Allows minor word reordering
|
||||
},
|
||||
},
|
||||
})
|
||||
|
||||
// Use match for general matching
|
||||
matchQueries.push({
|
||||
match: {
|
||||
term: {
|
||||
query,
|
||||
boost: BOOST_PREFIX,
|
||||
},
|
||||
},
|
||||
})
|
||||
|
||||
// Match phrase prefix for partial term matches
|
||||
matchQueries.push({
|
||||
match_phrase_prefix: {
|
||||
term: {
|
||||
query,
|
||||
boost: BOOST_PREFIX,
|
||||
},
|
||||
},
|
||||
})
|
||||
matchQueries.push({
|
||||
match_bool_prefix: {
|
||||
term: {
|
||||
query,
|
||||
boost: BOOST_REGULAR,
|
||||
},
|
||||
},
|
||||
})
|
||||
|
||||
// Add fuzzy matching for typos and variations
|
||||
if (query.length > fuzzy.minLength && query.length < fuzzy.maxLength) {
|
||||
matchQueries.push({
|
||||
fuzzy: {
|
||||
term: {
|
||||
value: query,
|
||||
boost: BOOST_FUZZY,
|
||||
fuzziness: 'AUTO',
|
||||
},
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
return matchQueries
|
||||
}
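// Usage sketch (illustrative; the index name and query below are assumptions):
//
//   const response = await getAISearchAutocompleteResults({
//     indexName: 'github-docs-ai-search-autocomplete-en-fpt',
//     query: 'how do i fork',
//     size: 5,
//   })
//   // response.hits -> [{ term: '...', highlights: ['... <mark>fork</mark> ...'] }, ...]
//
// Phrase matches rank first (BOOST_PHRASE), then bool-prefix matches (BOOST_REGULAR),
// then plain and phrase-prefix matches, with fuzzy matches last (BOOST_FUZZY) so
// queries with typos still return something.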
|
|
@ -0,0 +1,100 @@
|
|||
import { Client } from '@elastic/elasticsearch'
|
||||
import { getElasticsearchClient } from '@/search/lib/helpers/get-client'
|
||||
import { getHighlightConfiguration } from '@/search/lib/get-elasticsearch-results/helpers/elasticsearch-highlight-config'
|
||||
|
||||
import type { QueryDslQueryContainer, SearchTotalHits } from '@elastic/elasticsearch/lib/api/types'
|
||||
import type { AutocompleteSearchResponse } from '@/search/types'
|
||||
import type {
|
||||
AutocompleteMatchQueriesOptions,
|
||||
AutocompleteResultsArgs,
|
||||
AutocompleteElasticsearchItem,
|
||||
} from '@/search/lib/get-elasticsearch-results/types'
|
||||
|
||||
// Query Elasticsearch for general autocomplete results
|
||||
export async function getAutocompleteSearchResults({
|
||||
indexName,
|
||||
query,
|
||||
size,
|
||||
}: AutocompleteResultsArgs): Promise<AutocompleteSearchResponse> {
|
||||
const t0 = new Date()
|
||||
const client = getElasticsearchClient() as Client
|
||||
|
||||
const matchQueries = getAutocompleteMatchQueries(query.trim(), {
|
||||
fuzzy: {
|
||||
minLength: 3,
|
||||
maxLength: 20,
|
||||
},
|
||||
})
|
||||
const matchQuery = {
|
||||
bool: {
|
||||
should: matchQueries,
|
||||
},
|
||||
}
|
||||
|
||||
const highlight = getHighlightConfiguration(query, ['term'])
|
||||
|
||||
const searchQuery = {
|
||||
index: indexName,
|
||||
highlight,
|
||||
size,
|
||||
query: matchQuery,
|
||||
// Send absolutely minimal from Elasticsearch to here. Less data => faster.
|
||||
_source_includes: ['term'],
|
||||
}
|
||||
|
||||
const result = await client.search<AutocompleteElasticsearchItem>(searchQuery)
|
||||
|
||||
const hitsAll = result.hits
|
||||
const hits = hitsAll.hits.map((hit) => ({
|
||||
term: hit._source?.term,
|
||||
highlights: (hit.highlight && hit.highlight.term) || [],
|
||||
}))
|
||||
|
||||
return {
|
||||
meta: {
|
||||
found: hitsAll.total as SearchTotalHits,
|
||||
took: { query_msec: result.took, total_msec: new Date().getTime() - t0.getTime() },
|
||||
size,
|
||||
},
|
||||
hits,
|
||||
}
|
||||
}
|
||||
|
||||
function getAutocompleteMatchQueries(query: string, { fuzzy }: AutocompleteMatchQueriesOptions) {
|
||||
const BOOST_PHRASE = 4.0
|
||||
const BOOST_REGULAR = 2.0
|
||||
const BOOST_FUZZY = 0.1
|
||||
|
||||
const matchQueries: QueryDslQueryContainer[] = []
|
||||
const isMultiWordQuery = query.includes(' ') || query.includes('-')
|
||||
|
||||
if (isMultiWordQuery) {
|
||||
matchQueries.push({
|
||||
match_phrase_prefix: {
|
||||
term: {
|
||||
query,
|
||||
boost: BOOST_PHRASE,
|
||||
},
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
matchQueries.push({
|
||||
match_bool_prefix: {
|
||||
term: {
|
||||
query,
|
||||
boost: BOOST_REGULAR,
|
||||
},
|
||||
},
|
||||
})
|
||||
|
||||
if (query.length > fuzzy.minLength && query.length < fuzzy.maxLength) {
|
||||
matchQueries.push({
|
||||
fuzzy: {
|
||||
term: { value: query, boost: BOOST_FUZZY, fuzziness: 'AUTO' },
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
return matchQueries
|
||||
}
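// Sketch (illustrative, not in the original file) of why the fuzzy clause above is gated
// on query length: very short inputs would fuzz into unrelated terms and long inputs are
// expensive to expand, so only mid-length queries get the extra, lowest-boost clause.
function wantsFuzzyClauseExample(
  query: string,
  fuzzy: { minLength: number; maxLength: number },
): boolean {
  return query.length > fuzzy.minLength && query.length < fuzzy.maxLength
}
// wantsFuzzyClauseExample('gh', { minLength: 3, maxLength: 20 })      -> false
// wantsFuzzyClauseExample('actions', { minLength: 3, maxLength: 20 }) -> true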
|
|
@ -1,57 +1,54 @@
|
|||
import { Client } from '@elastic/elasticsearch'
|
||||
import { getElasticsearchClient } from '@/search/lib/helpers/get-client'
|
||||
import { DEFAULT_HIGHLIGHT_FIELDS } from '@/search/lib/search-request-params/search-params-objects'
|
||||
import { getHighlightConfiguration } from '@/search/lib/get-elasticsearch-results/helpers/elasticsearch-highlight-config'
|
||||
|
||||
export const POSSIBLE_HIGHLIGHT_FIELDS = ['title', 'content']
|
||||
// This needs to match what we *use* in the `<SearchResults>` component.
|
||||
// For example, if we don't display "headings" we shouldn't request
|
||||
// highlights for it either.
|
||||
export const DEFAULT_HIGHLIGHT_FIELDS = ['title', 'content']
|
||||
|
||||
const ELASTICSEARCH_URL = process.env.ELASTICSEARCH_URL
|
||||
import type {
|
||||
SearchHit as ElasticsearchHit,
|
||||
QueryDslQueryContainer,
|
||||
SearchRequest,
|
||||
SearchTotalHits,
|
||||
} from '@elastic/elasticsearch/lib/api/types'
|
||||
import type {
|
||||
AdditionalIncludes,
|
||||
ComputedSearchQueryParamsMap,
|
||||
} from '@/search/lib/search-request-params/types'
|
||||
import type { SearchAggregation, GeneralSearchHit, GeneralSearchResponse } from '@/search/types'
|
||||
|
||||
const MAX_AGGREGATE_SIZE = 30
|
||||
|
||||
const isDevMode = process.env.NODE_ENV !== 'production'
|
||||
const isDevMode: boolean = process.env.NODE_ENV !== 'production'
|
||||
|
||||
function getClient() {
|
||||
if (!ELASTICSEARCH_URL) {
|
||||
// If this was mistakenly not set, it will eventually fail
|
||||
// when you use the Client. But `new Client({node: undefined})`
|
||||
// won't throw. And the error you get when you actually do try
|
||||
// to use that Client instance is cryptic compared to this
|
||||
// plain and simple thrown error.
|
||||
throw new Error(`$ELASTICSEARCH_URL is not set`)
|
||||
}
|
||||
return new Client({
|
||||
node: ELASTICSEARCH_URL,
|
||||
// The default is 30,000ms but we noticed that the median time is about
|
||||
// 100-150ms with some occasional searches taking multiple seconds.
|
||||
// The default `maxRetries` is 3 which is a sensible number.
|
||||
// If a query gets stuck, it's better to (relatively) quickly give up
|
||||
// and retry. So if it takes longer than this time here, we're banking on
|
||||
// that it was just bad luck and that it'll work if we simply try again.
|
||||
// See internal issue #2318.
|
||||
requestTimeout: 1900,
|
||||
// It's important that requestTimeout * maxRetries is less than 10 seconds.
|
||||
maxRetries: 5,
|
||||
})
|
||||
type getGeneralSearchResultsParams = {
|
||||
indexName: string
|
||||
searchParams: ComputedSearchQueryParamsMap['generalSearch']
|
||||
topics?: string[]
|
||||
includeTopics?: boolean
|
||||
}
|
||||
|
||||
// The true work horse that actually performs the Elasticsearch query
|
||||
export async function getSearchResults({
|
||||
indexName,
|
||||
query,
|
||||
page,
|
||||
size,
|
||||
debug,
|
||||
sort,
|
||||
topics,
|
||||
includeTopics,
|
||||
usePrefixSearch,
|
||||
highlights,
|
||||
include,
|
||||
toplevel,
|
||||
aggregate,
|
||||
}) {
|
||||
// Query Elasticsearch for general search results
|
||||
export async function getGeneralSearchResults(
|
||||
args: getGeneralSearchResultsParams,
|
||||
): Promise<GeneralSearchResponse> {
|
||||
const {
|
||||
indexName,
|
||||
searchParams: {
|
||||
highlights,
|
||||
include,
|
||||
toplevel,
|
||||
aggregate,
|
||||
autocomplete,
|
||||
query,
|
||||
page,
|
||||
size,
|
||||
debug,
|
||||
sort,
|
||||
},
|
||||
topics,
|
||||
includeTopics,
|
||||
} = args
|
||||
|
||||
const usePrefixSearch = autocomplete
|
||||
|
||||
if (topics && !Array.isArray(topics)) {
|
||||
throw new Error("'topics' has to be an array")
|
||||
}
|
||||
|
@ -71,8 +68,8 @@ export async function getSearchResults({
|
|||
throw new Error("Every entry in the 'toplevel' must be a string")
|
||||
}
|
||||
}
|
||||
const t0 = new Date()
|
||||
const client = getClient()
|
||||
const t0 = Date.now()
|
||||
const client = getElasticsearchClient()
|
||||
const from = size * (page - 1)
|
||||
|
||||
const matchQueries = getMatchQueries(query.trim(), {
|
||||
|
@ -83,7 +80,7 @@ export async function getSearchResults({
|
|||
},
|
||||
})
|
||||
|
||||
const matchQuery = {
|
||||
const matchQuery: Record<string, any> = {
|
||||
bool: {
|
||||
should: matchQueries,
|
||||
// This allows filtering by toplevel later.
|
||||
|
@ -91,7 +88,8 @@ export async function getSearchResults({
|
|||
},
|
||||
}
|
||||
|
||||
const topicsFilter = (topics || []).map((topic) => {
|
||||
const topicsArray = Array.isArray(topics) ? topics : topics ? [topics] : []
|
||||
const topicsFilter = topicsArray.map((topic) => {
|
||||
return {
|
||||
term: {
|
||||
// Remember, 'topics' is a keyword field, meaning you need
|
||||
|
@ -101,15 +99,18 @@ export async function getSearchResults({
|
|||
}
|
||||
})
|
||||
if (topicsFilter.length) {
|
||||
matchQuery.bool.filter = topicsFilter
|
||||
matchQuery.bool.filter = matchQuery.bool.filter || []
|
||||
matchQuery.bool.filter.push(...topicsFilter)
|
||||
}
|
||||
|
||||
if (toplevel && toplevel.length) {
|
||||
matchQuery.bool.filter = {
|
||||
const toplevelArray = toplevel || []
|
||||
if (toplevelArray.length) {
|
||||
matchQuery.bool.filter = matchQuery.bool.filter || []
|
||||
matchQuery.bool.filter.push({
|
||||
terms: {
|
||||
toplevel,
|
||||
toplevel: toplevelArray,
|
||||
},
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
const highlightFields = Array.from(highlights || DEFAULT_HIGHLIGHT_FIELDS)
|
||||
|
@ -121,7 +122,7 @@ export async function getSearchResults({
|
|||
|
||||
const aggs = getAggregations(aggregate)
|
||||
|
||||
const searchQuery = {
|
||||
const searchQuery: SearchRequest = {
|
||||
index: indexName,
|
||||
highlight,
|
||||
from,
|
||||
|
@ -136,13 +137,13 @@ export async function getSearchResults({
|
|||
_source_includes: ['title', 'url', 'breadcrumbs', 'popularity', 'toplevel'],
|
||||
}
|
||||
|
||||
if (includeTopics) {
|
||||
searchQuery._source_includes.push('topics')
|
||||
if (includeTopics && Array.isArray(searchQuery._source_includes)) {
|
||||
searchQuery._source_includes?.push('topics')
|
||||
}
|
||||
|
||||
for (const key of ['intro', 'headings']) {
|
||||
if (include.includes(key)) {
|
||||
searchQuery._source_includes.push(key)
|
||||
for (const key of ['intro', 'headings'] as const) {
|
||||
if (include.includes(key) && Array.isArray(searchQuery._source_includes)) {
|
||||
searchQuery._source_includes?.push(key)
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -193,26 +194,26 @@ export async function getSearchResults({
|
|||
highlightFields,
|
||||
include,
|
||||
})
|
||||
const aggregations = getAggregationsResult(aggregate, result.aggregations)
|
||||
const t1 = new Date()
|
||||
const aggregationsResult = getAggregationsResult(aggregate, result.aggregations)
|
||||
const t1 = Date.now()
|
||||
|
||||
const meta = {
|
||||
found: hitsAll.total,
|
||||
found: hitsAll.total as SearchTotalHits,
|
||||
took: {
|
||||
query_msec: result.took,
|
||||
total_msec: t1.getTime() - t0.getTime(),
|
||||
total_msec: t1 - t0,
|
||||
},
|
||||
page,
|
||||
size,
|
||||
}
|
||||
|
||||
return { meta, hits, aggregations }
|
||||
return { meta, hits, aggregations: aggregationsResult }
|
||||
}
|
||||
|
||||
function getAggregations(aggregate) {
|
||||
function getAggregations(aggregate?: string[]): Record<string, any> | undefined {
|
||||
if (!aggregate || !aggregate.length) return undefined
|
||||
|
||||
const aggs = {}
|
||||
const aggs: Record<string, any> = {}
|
||||
for (const key of aggregate) {
|
||||
aggs[key] = {
|
||||
terms: {
|
||||
|
@ -224,66 +225,37 @@ function getAggregations(aggregate) {
|
|||
return aggs
|
||||
}
|
||||
|
||||
function getAggregationsResult(aggregate, result) {
|
||||
if (!aggregate || !aggregate.length) return
|
||||
return Object.fromEntries(
|
||||
aggregate.map((key) => [
|
||||
key,
|
||||
result[key].buckets
|
||||
.map((bucket) => {
|
||||
return {
|
||||
key: bucket.key,
|
||||
count: bucket.doc_count,
|
||||
}
|
||||
})
|
||||
.sort((a, b) => a.key.localeCompare(b.key)),
|
||||
]),
|
||||
)
|
||||
}
|
||||
|
||||
export async function getAutocompleteSearchResults({ indexName, query, size }) {
|
||||
const client = getClient()
|
||||
|
||||
const matchQueries = getAutocompleteMatchQueries(query.trim(), {
|
||||
fuzzy: {
|
||||
minLength: 3,
|
||||
maxLength: 20,
|
||||
},
|
||||
})
|
||||
const matchQuery = {
|
||||
bool: {
|
||||
should: matchQueries,
|
||||
},
|
||||
}
|
||||
|
||||
const highlight = getHighlightConfiguration(query, ['term'])
|
||||
|
||||
const searchQuery = {
|
||||
index: indexName,
|
||||
highlight,
|
||||
size,
|
||||
query: matchQuery,
|
||||
// Send absolutely minimal from Elasticsearch to here. Less data => faster.
|
||||
_source_includes: ['term'],
|
||||
}
|
||||
const result = await client.search(searchQuery)
|
||||
|
||||
const hitsAll = result.hits
|
||||
const hits = hitsAll.hits.map((hit) => {
|
||||
return {
|
||||
term: hit._source.term,
|
||||
highlights: (hit.highlight && hit.highlight.term) || [],
|
||||
function getAggregationsResult(
|
||||
aggregate?: string[],
|
||||
result?: Record<string, any>,
|
||||
): Record<string, SearchAggregation[]> | undefined {
|
||||
if (!aggregate || !aggregate.length || !result) return undefined
|
||||
const aggregations: Record<string, SearchAggregation[]> = {}
|
||||
for (const key of aggregate) {
|
||||
if (result[key]?.buckets) {
|
||||
aggregations[key] = result[key].buckets
|
||||
.map((bucket: any) => ({
|
||||
key: bucket.key as string,
|
||||
count: bucket.doc_count as number,
|
||||
}))
|
||||
.sort((a: { key: string }, b: { key: string }) => a.key.localeCompare(b.key))
|
||||
}
|
||||
})
|
||||
|
||||
const meta = {
|
||||
found: hitsAll.total,
|
||||
}
|
||||
|
||||
return { meta, hits }
|
||||
return aggregations
|
||||
}
|
||||
|
||||
function getMatchQueries(query, { usePrefixSearch, fuzzy }) {
|
||||
interface GetMatchQueriesOptions {
|
||||
usePrefixSearch: boolean
|
||||
fuzzy: {
|
||||
minLength: number
|
||||
maxLength: number
|
||||
}
|
||||
}
|
||||
|
||||
function getMatchQueries(
|
||||
query: string,
|
||||
{ usePrefixSearch, fuzzy }: GetMatchQueriesOptions,
|
||||
): QueryDslQueryContainer[] {
|
||||
const BOOST_PHRASE = 10.0
|
||||
const BOOST_TITLE = 4.0
|
||||
const BOOST_HEADINGS = 3.0
|
||||
|
@ -296,7 +268,7 @@ function getMatchQueries(query, { usePrefixSearch, fuzzy }) {
|
|||
// which wouldn't find anything else anyway.
|
||||
const BOOST_FUZZY = 0.1
|
||||
|
||||
const matchQueries = []
|
||||
const matchQueries: QueryDslQueryContainer[] = []
|
||||
|
||||
// If the query input is multiple words, it's good to know because you can
|
||||
// make the query do `match_phrase` and you can make `match` query
|
||||
|
@ -453,12 +425,12 @@ function getMatchQueries(query, { usePrefixSearch, fuzzy }) {
|
|||
} else if (query.startsWith('http')) {
|
||||
// E.g. `https://docs.github.com/en/some/page?foo=bar`
|
||||
// will become a search on `{url: '/en/some/page'}`
|
||||
let pathname
|
||||
let pathname: string | undefined
|
||||
try {
|
||||
pathname = new URL(query).pathname
|
||||
} catch {
|
||||
// If it failed, it can't be initialized with the `URL` constructor
|
||||
// we so we can deem it *not* a valid URL.
|
||||
// so we can deem it *not* a valid URL.
|
||||
}
|
||||
if (pathname) {
|
||||
matchQueries.push({
|
||||
|
@ -471,47 +443,18 @@ function getMatchQueries(query, { usePrefixSearch, fuzzy }) {
|
|||
return matchQueries
|
||||
}
|
||||
|
||||
function getAutocompleteMatchQueries(query, { fuzzy }) {
|
||||
const BOOST_PHRASE = 4.0
|
||||
const BOOST_REGULAR = 2.0
|
||||
const BOOST_FUZZY = 0.1 // make it always last in ranking
|
||||
const matchQueries = []
|
||||
|
||||
// If the query input is multiple words, it's good to know because you can
|
||||
// make the query do `match_phrase` and you can make `match` query
|
||||
// with the `AND` operator (`OR` is the default).
|
||||
const isMultiWordQuery = query.includes(' ') || query.includes('-')
|
||||
|
||||
if (isMultiWordQuery) {
|
||||
matchQueries.push({
|
||||
match_phrase_prefix: {
|
||||
term: {
|
||||
query,
|
||||
boost: BOOST_PHRASE,
|
||||
},
|
||||
},
|
||||
})
|
||||
}
|
||||
matchQueries.push({
|
||||
match_bool_prefix: {
|
||||
term: {
|
||||
query,
|
||||
boost: BOOST_REGULAR,
|
||||
},
|
||||
},
|
||||
})
|
||||
if (query.length > fuzzy.minLength && query.length < fuzzy.maxLength) {
|
||||
matchQueries.push({
|
||||
fuzzy: {
|
||||
term: { value: query, boost: BOOST_FUZZY, fuzziness: 'AUTO' },
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
return matchQueries
|
||||
interface GetHitsOptions {
|
||||
indexName: string
|
||||
debug?: boolean
|
||||
includeTopics?: boolean
|
||||
highlightFields: string[]
|
||||
include: AdditionalIncludes[]
|
||||
}
|
||||
|
||||
function getHits(hits, { indexName, debug, includeTopics, highlightFields, include }) {
|
||||
function getHits(
|
||||
hits: ElasticsearchHit<any>[],
|
||||
{ indexName, debug = false, includeTopics = false, highlightFields, include }: GetHitsOptions,
|
||||
): GeneralSearchHit[] {
|
||||
return hits.map((hit) => {
|
||||
// Return `hit.highlights[...]` based on the highlight fields requested.
|
||||
// So if you searched with `&highlights=headings&highlights=content`
|
||||
|
@ -521,11 +464,12 @@ function getHits(hits, { indexName, debug, includeTopics, highlightFields, inclu
|
|||
// headings: [...]
|
||||
// }
|
||||
// even if there was a match on 'title'.
|
||||
const hitHighlights = Object.fromEntries(
|
||||
highlightFields.map((key) => [key, (hit.highlight && hit.highlight[key]) || []]),
|
||||
)
|
||||
const hitHighlights: Record<string, string[]> = {}
|
||||
for (const key of highlightFields) {
|
||||
hitHighlights[key] = (hit.highlight && hit.highlight[key]) || []
|
||||
}
|
||||
|
||||
const result = {
|
||||
const result: GeneralSearchHit = {
|
||||
id: hit._id,
|
||||
url: hit._source.url,
|
||||
title: hit._source.title,
|
||||
|
@ -536,87 +480,15 @@ function getHits(hits, { indexName, debug, includeTopics, highlightFields, inclu
|
|||
result.topics = hit._source.topics || []
|
||||
}
|
||||
if (debug) {
|
||||
result.score = hit._score || 0.0
|
||||
result.popularity = hit._source.popularity || 0.0
|
||||
result.score = hit._score ?? 0.0
|
||||
result.popularity = hit._source.popularity ?? 0.0
|
||||
if (isDevMode) {
|
||||
result.es_url = `http://localhost:9200/${indexName}/_doc/${hit._id}`
|
||||
}
|
||||
}
|
||||
for (const field of include || []) {
|
||||
for (const field of include) {
|
||||
result[field] = hit._source[field]
|
||||
}
|
||||
return result
|
||||
})
|
||||
}
|
||||
|
||||
// The highlight configuration is dependent on how we use the content
|
||||
// in the UI. For example, we feel we need about 3 lines (max)
|
||||
// of highlights of content under each title. If we feel it shows too
|
||||
// many highlights in the search result UI, we can come back here
|
||||
// and change it to something more appropriate.
|
||||
function getHighlightConfiguration(query, highlights) {
|
||||
const fields = {}
|
||||
if (highlights.includes('title')) {
|
||||
fields.title = {
|
||||
// Fast Vector Highlighter
|
||||
// Using this requires that you first index these fields
|
||||
// with {term_vector: 'with_positions_offsets'}
|
||||
type: 'fvh',
|
||||
fragment_size: 200,
|
||||
number_of_fragments: 1,
|
||||
}
|
||||
}
|
||||
if (highlights.includes('content')) {
|
||||
// The 'no_match_size' is so we can display *something* for the
|
||||
// preview if there was no highlight match at all within the content.
|
||||
fields.content = {
|
||||
// Fast Vector Highlighter
|
||||
// Using this requires that you first index these fields
|
||||
// with {term_vector: 'with_positions_offsets'}
|
||||
type: 'fvh',
|
||||
fragment_size: 150,
|
||||
number_of_fragments: 1,
|
||||
no_match_size: 150,
|
||||
|
||||
highlight_query: {
|
||||
match_phrase_prefix: {
|
||||
content: {
|
||||
query,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
fields.content_explicit = {
|
||||
// Fast Vector Highlighter
|
||||
// Using this requires that you first index these fields
|
||||
// with {term_vector: 'with_positions_offsets'}
|
||||
type: 'fvh',
|
||||
fragment_size: 150,
|
||||
number_of_fragments: 1,
|
||||
no_match_size: 0,
|
||||
|
||||
highlight_query: {
|
||||
match_phrase_prefix: {
|
||||
content_explicit: {
|
||||
query,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
if (highlights.includes('term')) {
|
||||
fields.term = {
|
||||
// Fast Vector Highlighter
|
||||
// Using this requires that you first index these fields
|
||||
// with {term_vector: 'with_positions_offsets'}
|
||||
type: 'fvh',
|
||||
// fragment_size: 200,
|
||||
// number_of_fragments: 1,
|
||||
}
|
||||
}
|
||||
return {
|
||||
pre_tags: ['<mark>'],
|
||||
post_tags: ['</mark>'],
|
||||
fields,
|
||||
}
|
||||
}
|
|
@ -0,0 +1,86 @@
|
|||
import { SearchHighlight } from '@elastic/elasticsearch/lib/api/types'
|
||||
|
||||
import type { HighlightOptions } from '@/search/lib/search-request-params/types'
|
||||
|
||||
export interface HighlightConfig {
|
||||
type: string
|
||||
fragment_size?: number
|
||||
number_of_fragments?: number
|
||||
no_match_size?: number
|
||||
highlight_query?: object
|
||||
}
|
||||
|
||||
export type HighlightFields = {
|
||||
[key in HighlightOptions]: HighlightConfig
|
||||
}
|
||||
|
||||
// When we query Elasticsearch, we can specify a highlight configuration
|
||||
export function getHighlightConfiguration(
|
||||
query: string,
|
||||
highlightsFields: HighlightOptions[],
|
||||
): SearchHighlight {
|
||||
const fields = {} as HighlightFields
|
||||
if (highlightsFields.includes('title')) {
|
||||
fields.title = {
|
||||
// Fast Vector Highlighter
|
||||
// Using this requires that you first index these fields
|
||||
// with {term_vector: 'with_positions_offsets'}
|
||||
type: 'fvh',
|
||||
fragment_size: 200,
|
||||
number_of_fragments: 1,
|
||||
}
|
||||
}
|
||||
if (highlightsFields.includes('content')) {
|
||||
fields.content = {
|
||||
// Fast Vector Highlighter
|
||||
// Using this requires that you first index these fields
|
||||
// with {term_vector: 'with_positions_offsets'}
|
||||
type: 'fvh',
|
||||
fragment_size: 150,
|
||||
number_of_fragments: 1,
|
||||
// So we can at least display something if there was no highlight match within the content.
|
||||
no_match_size: 150,
|
||||
|
||||
highlight_query: {
|
||||
match_phrase_prefix: {
|
||||
content: {
|
||||
query,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
fields.content_explicit = {
|
||||
// Fast Vector Highlighter
|
||||
// Using this requires that you first index these fields
|
||||
// with {term_vector: 'with_positions_offsets'}
|
||||
type: 'fvh',
|
||||
fragment_size: 150,
|
||||
number_of_fragments: 1,
|
||||
no_match_size: 0,
|
||||
|
||||
highlight_query: {
|
||||
match_phrase_prefix: {
|
||||
content_explicit: {
|
||||
query,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
if (highlightsFields.includes('term')) {
|
||||
fields.term = {
|
||||
// Fast Vector Highlighter
|
||||
// Using this requires that you first index these fields
|
||||
// with {term_vector: 'with_positions_offsets'}
|
||||
type: 'fvh',
|
||||
}
|
||||
}
|
||||
|
||||
const highlightConfig: SearchHighlight = {
|
||||
pre_tags: ['<mark>'],
|
||||
post_tags: ['</mark>'],
|
||||
fields,
|
||||
}
|
||||
|
||||
return highlightConfig
|
||||
}
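// Shape sketch (for orientation only; values taken from the branches above):
// getHighlightConfiguration('fork', ['title']) returns roughly
//   {
//     pre_tags: ['<mark>'],
//     post_tags: ['</mark>'],
//     fields: { title: { type: 'fvh', fragment_size: 200, number_of_fragments: 1 } },
//   }
// so matched words in the title come back wrapped in <mark> tags.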
|
|
@ -0,0 +1,23 @@
|
|||
export interface AutocompleteResultsArgs {
|
||||
indexName: string
|
||||
query: string
|
||||
size: number
|
||||
}
|
||||
|
||||
export interface FuzzyConfig {
|
||||
minLength: number
|
||||
maxLength: number
|
||||
}
|
||||
|
||||
export interface MatchQueriesOptions {
|
||||
usePrefixSearch?: boolean
|
||||
fuzzy: FuzzyConfig
|
||||
}
|
||||
|
||||
export interface AutocompleteMatchQueriesOptions {
|
||||
fuzzy: FuzzyConfig
|
||||
}
|
||||
|
||||
export interface AutocompleteElasticsearchItem {
|
||||
term: string
|
||||
}
|
|
@ -0,0 +1,31 @@
|
|||
import { Client } from '@elastic/elasticsearch'
|
||||
import { safeUrlDisplay } from '@/search/lib/helpers/strings'
|
||||
|
||||
export function getElasticsearchClient(overrideURL = '', verbose = false): Client {
|
||||
const node = getElasticsearchURL(overrideURL)
|
||||
if (verbose) {
|
||||
console.log('Connecting to Elasticsearch URL:', safeUrlDisplay(node))
|
||||
}
|
||||
const client = new Client({ node })
|
||||
return client
|
||||
}
|
||||
|
||||
function getElasticsearchURL(overrideURL = ''): string {
|
||||
if (!process.env.ELASTICSEARCH_URL && !overrideURL) {
|
||||
throw new Error(
|
||||
'Must pass the elasticsearch URL option or ' +
|
||||
'set the environment variable ELASTICSEARCH_URL',
|
||||
)
|
||||
}
|
||||
let node = overrideURL || process.env.ELASTICSEARCH_URL || ''
|
||||
|
||||
// Allow the user to lazily set it to `localhost:9200` for example.
|
||||
if (!node.startsWith('http') && !node.startsWith('://') && node.split(':').length === 2) {
|
||||
node = `http://${node}`
|
||||
}
|
||||
|
||||
const parsed = new URL(node)
|
||||
if (!parsed.hostname) throw new Error('no valid hostname')
|
||||
|
||||
return node
|
||||
}
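// Usage sketch (assumes a local dev cluster at localhost:9200):
//
//   const client = getElasticsearchClient('localhost:9200', true)
//   // logs: Connecting to Elasticsearch URL: http://localhost:9200/
//   await client.ping()
//
// Both 'localhost:9200' and 'http://localhost:9200' resolve to the same node URL
// because of the lazy prefixing above.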
|
|
@ -0,0 +1,44 @@
|
|||
import { allVersions } from '@/versions/lib/all-versions'
|
||||
|
||||
// TODO: Old version logic
|
||||
type VersionAliases = { [key: string]: string }
|
||||
export const versionAliases: VersionAliases = {}
|
||||
export const prefixVersionAliases: VersionAliases = {}
|
||||
|
||||
Object.values(allVersions).forEach((info) => {
|
||||
if (info.hasNumberedReleases) {
|
||||
versionAliases[info.currentRelease] = info.miscVersionName
|
||||
} else {
|
||||
versionAliases[info.version] = info.miscVersionName
|
||||
versionAliases[info.miscVersionName] = info.miscVersionName
|
||||
}
|
||||
prefixVersionAliases[info.plan] = info.shortName
|
||||
prefixVersionAliases[info.shortName] = info.shortName
|
||||
})
|
||||
|
||||
// Temporary hard-coded switch
|
||||
//
|
||||
// We need to run workflows in production to index the search data
|
||||
// We want the middleware + routes that consume the indexes to consume the old indexes
|
||||
// until the new indexes are ready.
|
||||
|
||||
// Once they are ready we can remove this file & cleanup the places it is used
|
||||
export function isBeforeSearchIndexMigration() {
|
||||
if (process.env.NODE_ENV === 'production') return true
|
||||
return false
|
||||
}
|
||||
|
||||
// Old test prefix helper function
|
||||
export function getGeneralSearchIndexPrefix(): string {
|
||||
if (process.env.NODE_ENV === 'test') return 'tests_'
|
||||
return ''
|
||||
}
|
||||
|
||||
export function getGeneralSearchIndexVersion(paramVersion: string): string {
|
||||
const version =
|
||||
prefixVersionAliases[paramVersion] ||
|
||||
versionAliases[paramVersion] ||
|
||||
allVersions[paramVersion].miscVersionName
|
||||
|
||||
return version
|
||||
}
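// Resolution sketch (illustrative; assumes GHES is a numbered-release plan in allVersions):
//   prefixVersionAliases['enterprise-server'] -> 'ghes'
//   versionAliases['3.14']                    -> the miscVersionName for that GHES release
// so getGeneralSearchIndexVersion('enterprise-server') returns 'ghes' and a legacy
// '?version=3.14' query still resolves to a valid index version.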
|
|
@ -0,0 +1,10 @@
|
|||
export function safeUrlDisplay(url: string): string {
|
||||
const parsed = new URL(url)
|
||||
if (parsed.password) {
|
||||
parsed.password = '***'
|
||||
}
|
||||
if (parsed.username) {
|
||||
parsed.username = parsed.username.slice(0, 4) + '***'
|
||||
}
|
||||
return parsed.toString()
|
||||
}
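// Example (hypothetical host and credentials):
//   safeUrlDisplay('https://admin:hunter2@es.example.com:9200')
//   // -> 'https://admi***:***@es.example.com:9200/'
// so connection strings can be logged without leaking secrets.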
|
|
@ -33,3 +33,28 @@ export function utcTimestamp() {
|
|||
.join('')
|
||||
)
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts a given number of seconds into a formatted time string "HH:mm:ss".
|
||||
*
|
||||
* @param {number} seconds - The total number of seconds to format.
|
||||
* @returns {string} A string representing the time in "hours:minutes:seconds" format.
|
||||
*
|
||||
* @example
|
||||
* // returns "01:30:45"
|
||||
* formatSeconds(5445);
|
||||
*/
|
||||
export function formatSecondsToHHMMSS(seconds: number): string {
|
||||
return new Date(seconds * 1000).toISOString().substr(11, 8)
|
||||
}
|
||||
|
||||
export function readableTimeMinAndSec(ms: number): string {
|
||||
if (ms < 1000) {
|
||||
return `${ms.toFixed(1)}ms`
|
||||
}
|
||||
const seconds = ms / 1000
|
||||
if (seconds > 60) {
|
||||
return `${Math.round(seconds / 60)}m${Math.round(seconds % 60)}s`
|
||||
}
|
||||
return `${seconds.toFixed(1)}s`
|
||||
}
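// Worked examples (for orientation):
//   formatSecondsToHHMMSS(5445)    // -> '01:30:45' (matches the JSDoc example above)
//   readableTimeMinAndSec(950)     // -> '950.0ms'
//   readableTimeMinAndSec(65_000)  // -> '1m5s'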
|
|
@ -0,0 +1,96 @@
|
|||
import type { Request } from 'express'
|
||||
import { format } from 'node:util'
|
||||
|
||||
import { getElasticSearchIndex } from '@/search/lib/elasticsearch-indexes'
|
||||
import {
|
||||
ValidationError,
|
||||
getSearchRequestParamsObject,
|
||||
} from '@/search/lib/search-request-params/search-params-objects'
|
||||
import {
|
||||
getGeneralSearchIndexVersion,
|
||||
getGeneralSearchIndexPrefix,
|
||||
isBeforeSearchIndexMigration,
|
||||
} from '@/search/lib/helpers/old-version-logic'
|
||||
|
||||
import type {
|
||||
ComputedSearchQueryParams,
|
||||
ComputedSearchQueryParamsMap,
|
||||
GetSearchRequestReturn,
|
||||
} from '@/search/lib/search-request-params/types'
|
||||
import type { SearchTypes, SearchValidationErrorEntry } from '@/search/types'
|
||||
|
||||
type ForceParams = {
|
||||
[K in keyof ComputedSearchQueryParams]?: ComputedSearchQueryParams[K]
|
||||
}
|
||||
|
||||
// Fetches the Search Params Object based on the type of request and uses that object to validate the passed in request parameters
|
||||
// For example, if the request is a general search request, the general search params object expects a `page` key, e.g. ?page=1 on the request
|
||||
// If that key is not present, it will be added to the validation errors array which will result in a 400 to the user.
|
||||
export function getSearchFromRequestParams<Type extends SearchTypes>(
|
||||
req: Request,
|
||||
type: Type,
|
||||
forceParams: ForceParams = {} as ForceParams,
|
||||
): GetSearchRequestReturn<Type> {
|
||||
const searchParamsObject = getSearchRequestParamsObject(type)
|
||||
|
||||
const searchParams: ComputedSearchQueryParamsMap[Type] = {} as ComputedSearchQueryParamsMap[Type]
|
||||
const validationErrors: SearchValidationErrorEntry[] = []
|
||||
|
||||
for (const { key, default_, cast, validate, multiple } of searchParamsObject) {
|
||||
if (key in forceParams) {
|
||||
;(searchParams[key] as any) = forceParams[key] as any
|
||||
continue
|
||||
}
|
||||
|
||||
let value = req.query[key]
|
||||
if (!value || (typeof value === 'string' && !value.trim())) {
|
||||
if (default_ === undefined) {
|
||||
validationErrors.push({ error: `No truthy value for key '${key}'`, key })
|
||||
continue
|
||||
}
|
||||
value = default_
|
||||
}
|
||||
if (cast) {
|
||||
value = cast(value)
|
||||
}
|
||||
try {
|
||||
if (validate && !validate(value)) {
|
||||
validationErrors.push({
|
||||
error: format('Not a valid value (%O) for key %O', value, key),
|
||||
key,
|
||||
})
|
||||
}
|
||||
} catch (err) {
|
||||
if (err instanceof ValidationError) {
|
||||
validationErrors.push({ error: err.toString(), field: key })
|
||||
} else {
|
||||
throw err
|
||||
}
|
||||
}
|
||||
if (!multiple && Array.isArray(value)) {
|
||||
validationErrors.push({
|
||||
error: format('Cannot have multiple values (%O) for key %O', value, key),
|
||||
key,
|
||||
})
|
||||
}
|
||||
|
||||
;(searchParams[key] as any) = value
|
||||
}
|
||||
|
||||
let indexName = ''
|
||||
if (!validationErrors.length) {
|
||||
// generalSearch is the only type of search that uses the old index prefix logic, rather than the `getElasticSearchIndex` function logic
|
||||
if (type === 'generalSearch' && isBeforeSearchIndexMigration()) {
|
||||
indexName = `${getGeneralSearchIndexPrefix()}github-docs-${getGeneralSearchIndexVersion(searchParams.version)}-${searchParams.language}`
|
||||
} else {
|
||||
const getIndexResults = getElasticSearchIndex(
|
||||
type,
|
||||
searchParams.version,
|
||||
searchParams.language,
|
||||
)
|
||||
indexName = getIndexResults.indexName
|
||||
}
|
||||
}
|
||||
|
||||
return { indexName, searchParams, validationErrors }
|
||||
}
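// Request sketch (hypothetical query string): for GET /api/search/v1?query=fork&version=fpt
// an Express handler can do
//
//   const { indexName, searchParams, validationErrors } =
//     getSearchFromRequestParams(req, 'generalSearch')
//
// Missing optional keys fall back to their default_ values; a malformed value (for
// example ?page=0) ends up in validationErrors and the route should answer with a 400.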
|
|
@ -0,0 +1,153 @@
|
|||
/*
|
||||
When a request is made to a /search endpoint with query parameters, e.g. ?query=foo&version=free-pro-team,
|
||||
we need to validate and parse the parameters. This file contains the configuration for which parameters
|
||||
to expect based on the type of search request (e.g. general search vs. autocomplete search) and how to validate them.
|
||||
*/
|
||||
import languages from '@/languages/lib/languages'
|
||||
import { allIndexVersionKeys, versionToIndexVersionMap } from '@/search/lib/elasticsearch-versions'
|
||||
import { SearchTypes } from '@/search/types'
|
||||
import { versionAliases } from '@/search/lib/helpers/old-version-logic'
|
||||
import { allVersions } from '@/versions/lib/all-versions'
|
||||
|
||||
import type { SearchRequestQueryParams } from '@/search/lib/search-request-params/types'
|
||||
|
||||
// Entry to this file, returns the query parameters to expect based on the type of search request
|
||||
export function getSearchRequestParamsObject(type: SearchTypes): SearchRequestQueryParams[] {
|
||||
if (type === 'generalAutocomplete') {
|
||||
return AUTOCOMPLETE_PARAMS_OBJ
|
||||
} else if (type === 'aiSearchAutocomplete') {
|
||||
return AI_SEARCH_AUTOCOMPLETE_PARAMS_OBJ
|
||||
}
|
||||
return GENERAL_SEARCH_PARAMS_OBJ
|
||||
}
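// For example (based on the param arrays defined below): 'aiSearchAutocomplete' gets the
// shared query/version params plus the autocomplete size param and an English-only language
// param, while 'generalSearch' gets the full list including page, sort, highlights and aggregate.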
|
||||
|
||||
// - - - Everything below this line is for building the search query param objects - - - //
|
||||
|
||||
// Constants
|
||||
const DEFAULT_AUTOCOMPLETE_SIZE = 5
|
||||
const MAX_AUTOCOMPLETE_SIZE = 10
|
||||
const DEFAULT_SIZE = 10
|
||||
const MAX_SIZE = 50
|
||||
const DEFAULT_PAGE = 1
|
||||
const POSSIBLE_SORTS = ['best', 'relevance'] as const
|
||||
const DEFAULT_SORT = POSSIBLE_SORTS[0]
|
||||
const MAX_PAGE = 10
|
||||
const V1_AGGREGATES = ['toplevel'] as const
|
||||
export const POSSIBLE_HIGHLIGHT_FIELDS = ['title', 'content'] as const
|
||||
// This needs to match what we *use* in the `<SearchResults>` component.
|
||||
// For example, if we don't display "headings" we shouldn't request
|
||||
// highlights for it either.
|
||||
export const DEFAULT_HIGHLIGHT_FIELDS: readonly string[] = ['title', 'content']
|
||||
|
||||
export const V1_ADDITIONAL_INCLUDES = ['intro', 'headings', 'toplevel'] as const
|
||||
|
||||
export class ValidationError extends Error {}
|
||||
|
||||
const SHARED_PARAMS_OBJ: SearchRequestQueryParams[] = [
|
||||
{ key: 'query' },
|
||||
{
|
||||
key: 'version',
|
||||
default_: 'free-pro-team',
|
||||
validate: (version: string) => {
|
||||
if (!versionToIndexVersionMap[version]) {
|
||||
throw new ValidationError(`'${version}' not in ${allIndexVersionKeys.join(', ')}`)
|
||||
}
|
||||
return true
|
||||
},
|
||||
},
|
||||
]
|
||||
|
||||
const GENERAL_SEARCH_PARAMS_OBJ: SearchRequestQueryParams[] = [
|
||||
...SHARED_PARAMS_OBJ,
|
||||
{ key: 'query' },
|
||||
// TODO: Overwrite with old version logic for now
|
||||
{
|
||||
key: 'version',
|
||||
default_: 'dotcom',
|
||||
validate: (v) => {
|
||||
if (versionAliases[v] || allVersions[v]) return true
|
||||
const valid = [...Object.keys(versionAliases), ...Object.keys(allVersions)]
|
||||
throw new ValidationError(`'${v}' not in ${valid}`)
|
||||
},
|
||||
},
|
||||
{ key: 'language', default_: 'en', validate: (v) => v in languages },
|
||||
{
|
||||
key: 'size',
|
||||
default_: DEFAULT_SIZE,
|
||||
cast: (v) => parseInt(v, 10),
|
||||
validate: (v) => v >= 0 && v <= MAX_SIZE,
|
||||
},
|
||||
{
|
||||
key: 'page',
|
||||
default_: DEFAULT_PAGE,
|
||||
cast: (v) => parseInt(v, 10),
|
||||
validate: (v) => v >= 1 && v <= MAX_PAGE,
|
||||
},
|
||||
{ key: 'sort', default_: DEFAULT_SORT, validate: (v) => POSSIBLE_SORTS.includes(v as any) },
|
||||
{
|
||||
key: 'highlights',
|
||||
default_: DEFAULT_HIGHLIGHT_FIELDS,
|
||||
cast: (v) => (Array.isArray(v) ? v : [v]),
|
||||
multiple: true,
|
||||
validate: (v) => {
|
||||
for (const highlight of v) {
|
||||
if (!POSSIBLE_HIGHLIGHT_FIELDS.includes(highlight)) {
|
||||
throw new ValidationError(`highlight value '${highlight}' is not valid`)
|
||||
}
|
||||
}
|
||||
return true
|
||||
},
|
||||
},
|
||||
{ key: 'autocomplete', default_: false, cast: toBoolean },
|
||||
{ key: 'debug', default_: process.env.NODE_ENV === 'development', cast: toBoolean },
|
||||
{
|
||||
key: 'include',
|
||||
default_: [],
|
||||
cast: toArray,
|
||||
multiple: true,
|
||||
validate: (values) =>
|
||||
values.every((value: string) => V1_ADDITIONAL_INCLUDES.includes(value as any)),
|
||||
},
|
||||
{
|
||||
key: 'toplevel',
|
||||
default_: [],
|
||||
cast: toArray,
|
||||
multiple: true,
|
||||
},
|
||||
{
|
||||
key: 'aggregate',
|
||||
default_: [],
|
||||
cast: toArray,
|
||||
multiple: true,
|
||||
validate: (values) => values.every((value: string) => V1_AGGREGATES.includes(value as any)),
|
||||
},
|
||||
]
|
||||
|
||||
const SHARED_AUTOCOMPLETE_PARAMS_OBJ: SearchRequestQueryParams[] = [
|
||||
{
|
||||
key: 'size',
|
||||
default_: DEFAULT_AUTOCOMPLETE_SIZE,
|
||||
cast: (size: string) => parseInt(size, 10),
|
||||
validate: (size: number) => size >= 0 && size <= MAX_AUTOCOMPLETE_SIZE,
|
||||
},
|
||||
]
|
||||
|
||||
const AI_SEARCH_AUTOCOMPLETE_PARAMS_OBJ: SearchRequestQueryParams[] = [
|
||||
...SHARED_PARAMS_OBJ,
|
||||
...SHARED_AUTOCOMPLETE_PARAMS_OBJ,
|
||||
{ key: 'language', default_: 'en', validate: (language: string) => language === 'en' },
|
||||
]
|
||||
|
||||
const AUTOCOMPLETE_PARAMS_OBJ: SearchRequestQueryParams[] = [
|
||||
...SHARED_PARAMS_OBJ,
|
||||
...SHARED_AUTOCOMPLETE_PARAMS_OBJ,
|
||||
{ key: 'language', default_: 'en', validate: (language: string) => language in languages },
|
||||
]
|
||||
|
||||
function toBoolean(value: any): boolean {
|
||||
return value === 'true' || value === '1'
|
||||
}
|
||||
|
||||
function toArray(value: any): any[] {
|
||||
return Array.isArray(value) ? value : [value]
|
||||
}
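// Validation sketch (illustrative): for ?size=25 the 'size' entry above casts and checks
//   cast('25')    -> 25
//   validate(25)  -> true   (0 <= 25 <= MAX_SIZE, i.e. 50)
// whereas ?size=500 fails validation and surfaces as a 400 to the caller.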
|
|
@ -0,0 +1,52 @@
|
|||
import { V1_ADDITIONAL_INCLUDES } from '@/search/lib/search-request-params/search-params-objects'
|
||||
|
||||
import { SearchTypes, SearchValidationErrorEntry } from '@/search/types'
|
||||
|
||||
export type HighlightOptions = 'title' | 'content' | 'content_explicit' | 'term'
|
||||
|
||||
export type AdditionalIncludes = (typeof V1_ADDITIONAL_INCLUDES)[number]
|
||||
|
||||
export interface ComputedSearchQueryParams {
|
||||
query: string
|
||||
size: number
|
||||
version: string
|
||||
language: string
|
||||
// These are optional, so we need to use ComputedSearchQueryParamsMap in functions to get the exact types per Search Type
|
||||
page?: number
|
||||
sort?: string
|
||||
highlights?: HighlightOptions[]
|
||||
autocomplete?: boolean
|
||||
debug?: boolean
|
||||
include?: AdditionalIncludes[]
|
||||
toplevel?: string[]
|
||||
aggregate?: string[]
|
||||
}
|
||||
|
||||
export interface ComputedSearchQueryParamsMap {
|
||||
generalSearch: ComputedSearchQueryParams & {
|
||||
page: number
|
||||
sort: string
|
||||
highlights: HighlightOptions[]
|
||||
autocomplete: boolean
|
||||
debug: boolean
|
||||
include: AdditionalIncludes[]
|
||||
toplevel: string[]
|
||||
aggregate: string[]
|
||||
}
|
||||
generalAutocomplete: ComputedSearchQueryParams
|
||||
aiSearchAutocomplete: ComputedSearchQueryParams
|
||||
}
|
||||
|
||||
export interface SearchRequestQueryParams {
|
||||
key: keyof ComputedSearchQueryParams
|
||||
default_?: any
|
||||
cast?: (value: any) => any
|
||||
validate?: (value: any) => boolean
|
||||
multiple?: boolean
|
||||
}
|
||||
|
||||
export interface GetSearchRequestReturn<Type extends SearchTypes> {
|
||||
indexName: string
|
||||
searchParams: ComputedSearchQueryParamsMap[Type]
|
||||
validationErrors: SearchValidationErrorEntry[]
|
||||
}
|
|
@ -1,153 +0,0 @@
|
|||
import got from 'got'
|
||||
import { errors } from '@elastic/elasticsearch'
|
||||
import statsd from '#src/observability/lib/statsd.js'
|
||||
|
||||
import { getPathWithoutVersion, getPathWithoutLanguage } from '#src/frame/lib/path-utils.js'
|
||||
import { getSearchFromRequest } from './get-search-request.js'
|
||||
import { getSearchResults } from './es-search.js'
|
||||
|
||||
export default async function contextualizeSearch(req, res, next) {
|
||||
// If it's NextJS fetching data or it's a direct request,
|
||||
// the pagePath is the "normalized" version
|
||||
const { pagePath } = req
|
||||
if (getPathWithoutLanguage(getPathWithoutVersion(pagePath)) !== '/search') {
|
||||
return next()
|
||||
}
|
||||
|
||||
// When you use `/api/search/v1?version=foo&language=xy&...`
|
||||
// the language and version comes from the query string.
|
||||
// When you use `/xz/enterprise-cloud@latest/search?query=hello`
|
||||
// the `version` and `language` is implied from the URL pathname.
|
||||
// search.version = req.context.currentVersion
|
||||
// search.language = req.context.currentLanguage
|
||||
|
||||
const { search, validationErrors } = getSearchFromRequest(req, {
|
||||
version: req.context.currentVersion,
|
||||
language: req.context.currentLanguage,
|
||||
})
|
||||
|
||||
if (validationErrors.map((error) => error.key).includes('query')) {
|
||||
// 'query' is such an exception because the search result component
|
||||
// will attempt to display its value even if there was any
|
||||
// validation error. In a sense, it displays:
|
||||
//
|
||||
// You searched for "foo"
|
||||
// But your 'page' parameter is invalid.
|
||||
//
|
||||
// If for example, the search input is an array, we pick the first
|
||||
// value. If it's too long, we truncate it.
|
||||
if (Array.isArray(search.query)) {
|
||||
search.query = search.query[0]
|
||||
} else if (!search.query) {
|
||||
// If the 'query' query string parameter wasn't even present,
|
||||
// it becomes `undefined`. But since `search.query` needs to be
|
||||
// a *string*, we pretend it was provided but empty.
|
||||
search.query = ''
|
||||
}
|
||||
}
|
||||
|
||||
// This enables so that when the search is sent to Elasticsearch
|
||||
// it will request an aggregate by these keyword fields.
|
||||
search.aggregate = ['toplevel']
|
||||
|
||||
req.context.search = { search, validationErrors }
|
||||
|
||||
if (!validationErrors.length && search.query) {
|
||||
if (!process.env.ELASTICSEARCH_URL) {
|
||||
// This is only true in local dev or in Preview environments.
|
||||
// And in local dev, it's usually for content contributors who
|
||||
// want to test a preview locally, but don't want to have to
|
||||
// set up Elasticsearch.
|
||||
// This same proxying logic happens in `middleware/api/index.js`
|
||||
// too for the outwards facing `/api/search/v1` endpoint.
|
||||
if (search.aggregate && search.toplevel && search.toplevel.length > 0) {
|
||||
// Do 2 searches. One without filtering
|
||||
const { toplevel, ...searchWithoutFilter } = search
|
||||
searchWithoutFilter.size = 0
|
||||
const { aggregations } = await getProxySearch(searchWithoutFilter)
|
||||
const { aggregate, ...searchWithoutAggregate } = search
|
||||
req.context.search.results = await getProxySearch(searchWithoutAggregate)
|
||||
req.context.search.results.aggregations = aggregations
|
||||
} else {
|
||||
req.context.search.results = await getProxySearch(search)
|
||||
}
|
||||
} else {
|
||||
// If this throws, so be it. Let it bubble up.
|
||||
// In local dev, you get to see the error. In production,
|
||||
// you get a "Oops! Something went wrong" which involves a Failbot
|
||||
// send.
|
||||
const tags = [`indexName:${search.indexName}`, `toplevels:${search.toplevel.length}`]
|
||||
const timed = statsd.asyncTimer(getSearchResults, 'contextualize.search', tags)
|
||||
try {
|
||||
if (search.aggregate && search.toplevel && search.toplevel.length > 0) {
|
||||
// Do 2 searches. One without filtering
|
||||
const { toplevel, ...searchWithoutFilter } = search
|
||||
searchWithoutFilter.size = 0
|
||||
const { aggregations } = await timed(searchWithoutFilter)
|
||||
req.context.search.results = await timed(search)
|
||||
req.context.search.results.aggregations = aggregations
|
||||
} else {
|
||||
req.context.search.results = await timed(search)
|
||||
}
|
||||
} catch (error) {
|
||||
// If the error coming from the Elasticsearch client is any sort
|
||||
// of 4xx error, it will be bubbled up to the next middleware
|
||||
// which might think something else is wrong with the *client's*
|
||||
// request from the outside. But in reality it's not their fault.
|
||||
// It's our fault in the backend side. So we throw a new error
|
||||
// so that this failure to search ultimately bubbles up to a
|
||||
// proper 500 error (including Failbot reporting).
|
||||
// In particular, this helps platform developers working on the
|
||||
// Elasticsearch searching code.
|
||||
if (error instanceof errors.ElasticsearchClientError) {
|
||||
console.error('Error calling getSearchResults(%s):', search, error)
|
||||
if (error.meta?.body) {
|
||||
console.error(`Meta:`, error.meta.body)
|
||||
}
|
||||
throw new Error(error.message)
|
||||
} else {
|
||||
throw error
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return next()
|
||||
}
|
||||
|
||||
// When you use the proxy to prod, using its API, we need to "convert"
|
||||
// the parameters we have figured out here in the contextualizer.
|
||||
// Thankfully all the names match. For example, we might figure
|
||||
// the page by doing `req.context.search.page = 123` and now we need to
|
||||
// add that to the query string for the `/api/search/v1`.
|
||||
// We inclusion-list all the keys that we want to take from the search
|
||||
// object into the query string URL.
|
||||
const SEARCH_KEYS_TO_QUERY_STRING = [
|
||||
'query',
|
||||
'version',
|
||||
'language',
|
||||
'page',
|
||||
'aggregate',
|
||||
'toplevel',
|
||||
'size',
|
||||
]
|
||||
|
||||
async function getProxySearch(search) {
|
||||
const url = new URL('https://docs.github.com/api/search/v1')
|
||||
for (const key of SEARCH_KEYS_TO_QUERY_STRING) {
|
||||
const value = search[key]
|
||||
if (typeof value === 'boolean') {
|
||||
url.searchParams.set(key, value ? 'true' : 'false')
|
||||
} else if (Array.isArray(value)) {
|
||||
for (const v of value) {
|
||||
url.searchParams.append(key, v)
|
||||
}
|
||||
} else if (typeof value === 'number') {
|
||||
url.searchParams.set(key, `${value}`)
|
||||
} else if (value) {
|
||||
url.searchParams.set(key, value)
|
||||
}
|
||||
}
|
||||
console.log(`Proxying search to ${url}`)
|
||||
return got(url).json()
|
||||
}
|
|
@ -0,0 +1,174 @@
|
|||
/*
|
||||
This file & middleware is for when a user requests our /search page e.g. 'docs.github.com/search?query=foo'
|
||||
We run whatever search is in the ?query= parameter and attach the results to req.search
|
||||
req.search is then consumed by the search component in 'src/search/pages/search.tsx'
|
||||
|
||||
When a user directly hits our API e.g. /api/search/v1?query=foo, they will hit the routes in ./search-routes.ts
|
||||
*/
|
||||
|
||||
import got from 'got'
|
||||
import { Request, Response, NextFunction } from 'express'
|
||||
import { errors } from '@elastic/elasticsearch'
|
||||
import statsd from '@/observability/lib/statsd.js'
|
||||
|
||||
import { getPathWithoutVersion, getPathWithoutLanguage } from '@/frame/lib/path-utils'
|
||||
import { getGeneralSearchResults } from '@/search/lib/get-elasticsearch-results/general-search'
|
||||
import { getSearchFromRequestParams } from '@/search/lib/search-request-params/get-search-from-request-params'
|
||||
|
||||
import type { ComputedSearchQueryParamsMap } from '@/search/lib/search-request-params/types'
|
||||
import type {
|
||||
GeneralSearchResponse,
|
||||
SearchOnReqObject,
|
||||
SearchTypes,
|
||||
SearchValidationErrorEntry,
|
||||
} from '@/search/types.js'
|
||||
|
||||
interface Context<Type extends SearchTypes> {
|
||||
currentVersion: string
|
||||
currentLanguage: string
|
||||
search: SearchOnReqObject<Type>
|
||||
}
|
||||
|
||||
interface CustomRequest<Type extends SearchTypes> extends Request {
|
||||
pagePath: string
|
||||
context: Context<Type>
|
||||
}
|
||||
|
||||
export default async function contextualizeGeneralSearch(
|
||||
req: CustomRequest<'generalSearch'>,
|
||||
res: Response,
|
||||
next: NextFunction,
|
||||
): Promise<void> {
|
||||
const { pagePath } = req
|
||||
if (getPathWithoutLanguage(getPathWithoutVersion(pagePath)) !== '/search') {
|
||||
return next()
|
||||
}
|
||||
|
||||
// Since this is a middleware, language & version are already set in req.context via a prior middleware
|
||||
const { indexName, searchParams, validationErrors } = getSearchFromRequestParams(
|
||||
req,
|
||||
'generalSearch',
|
||||
// Force the version and language keys to be set from the `req.context` object
|
||||
{
|
||||
version: req.context.currentVersion,
|
||||
language: req.context.currentLanguage,
|
||||
},
|
||||
)
|
||||
|
||||
if (validationErrors.map((error: SearchValidationErrorEntry) => error.key).includes('query')) {
|
||||
if (Array.isArray(searchParams.query)) {
|
||||
searchParams.query = searchParams.query[0]
|
||||
} else if (!searchParams.query) {
|
||||
searchParams.query = '' // If 'undefined' we need to cast to string
|
||||
}
|
||||
}
|
||||
|
||||
searchParams.aggregate = ['toplevel']
|
||||
|
||||
req.context.search = {
|
||||
searchParams,
|
||||
validationErrors,
|
||||
}
|
||||
|
||||
if (!validationErrors.length && searchParams.query) {
|
||||
// In local dev ELASTICSEARCH_URL may not be set, so we proxy the search to prod
|
||||
if (!process.env.ELASTICSEARCH_URL) {
|
||||
if (searchParams.aggregate && searchParams.toplevel && searchParams.toplevel.length > 0) {
|
||||
// Do 2 searches. One without filtering to get the aggregations
|
||||
const searchWithoutFilter = Object.fromEntries(
|
||||
Object.entries(searchParams).filter(([key]) => key !== 'topLevel'),
|
||||
)
|
||||
searchWithoutFilter.size = 0
|
||||
const { aggregations } = await getProxySearch(
|
||||
searchWithoutFilter as ComputedSearchQueryParamsMap['generalSearch'],
|
||||
)
|
||||
const searchWithoutAggregate = Object.fromEntries(
|
||||
Object.entries(searchParams).filter(([key]) => key !== 'aggregate'),
|
||||
)
|
||||
req.context.search.results = await getProxySearch(
|
||||
searchWithoutAggregate as ComputedSearchQueryParamsMap['generalSearch'],
|
||||
)
|
||||
req.context.search.results.aggregations = aggregations
|
||||
} else {
|
||||
req.context.search.results = await getProxySearch(searchParams)
|
||||
}
|
||||
} else {
|
||||
const tags: string[] = [`indexName:${indexName}`, `toplevels:${searchParams.toplevel.length}`]
|
||||
const timed = statsd.asyncTimer(getGeneralSearchResults, 'contextualize.search', tags)
|
||||
const getGeneralSearchArgs = {
|
||||
indexName,
|
||||
searchParams,
|
||||
}
|
||||
try {
|
||||
if (searchParams.aggregate && searchParams.toplevel && searchParams.toplevel.length > 0) {
|
||||
// Do 2 searches. One without filtering to get the aggregations
|
||||
const searchWithoutFilter = Object.fromEntries(
|
||||
Object.entries(searchParams).filter(([key]) => key !== 'topLevel'),
|
||||
)
|
||||
searchWithoutFilter.size = 0
|
||||
const { aggregations } = await timed({
|
||||
...getGeneralSearchArgs,
|
||||
searchParams: searchWithoutFilter as ComputedSearchQueryParamsMap['generalSearch'],
|
||||
})
|
||||
req.context.search.results = await timed(getGeneralSearchArgs)
|
||||
req.context.search.results.aggregations = aggregations
|
||||
} else {
|
||||
req.context.search.results = await timed(getGeneralSearchArgs)
|
||||
}
|
||||
} catch (error) {
|
||||
// If the Elasticsearch sends a 4XX we want the user to see a 500
|
||||
if (error instanceof errors.ResponseError) {
|
||||
console.error(
|
||||
'Error calling getSearchResults(%s):',
|
||||
JSON.stringify({
|
||||
indexName,
|
||||
searchParams,
|
||||
}),
|
||||
error,
|
||||
)
|
||||
if (error?.meta?.body) {
|
||||
console.error(`Meta:`, error.meta.body)
|
||||
}
|
||||
throw new Error(error.message)
|
||||
} else {
|
||||
throw error
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return next()
|
||||
}
|
||||
|
||||
const SEARCH_KEYS_TO_QUERY_STRING: (keyof ComputedSearchQueryParamsMap['generalSearch'])[] = [
|
||||
'query',
|
||||
'version',
|
||||
'language',
|
||||
'page',
|
||||
'aggregate',
|
||||
'toplevel',
|
||||
'size',
|
||||
]
|
||||
|
||||
// Proxy the API endpoint with the relevant search params
|
||||
async function getProxySearch(
|
||||
search: ComputedSearchQueryParamsMap['generalSearch'],
|
||||
): Promise<GeneralSearchResponse> {
|
||||
const url = new URL('https://docs.github.com/api/search/v1')
|
||||
for (const key of SEARCH_KEYS_TO_QUERY_STRING) {
|
||||
const value = search[key]
|
||||
if (typeof value === 'boolean') {
|
||||
url.searchParams.set(key, value ? 'true' : 'false')
|
||||
} else if (Array.isArray(value)) {
|
||||
for (const v of value) {
|
||||
url.searchParams.append(key, v)
|
||||
}
|
||||
} else if (typeof value === 'number') {
|
||||
url.searchParams.set(key, `${value}`)
|
||||
} else if (value) {
|
||||
url.searchParams.set(key, value)
|
||||
}
|
||||
}
|
||||
console.log(`Proxying search to ${url}`)
|
||||
return got(url).json<GeneralSearchResponse>()
|
||||
}
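// Proxy URL sketch (hypothetical values): a search for "fork" on enterprise-cloud with a
// toplevel filter serializes to roughly
//   https://docs.github.com/api/search/v1?query=fork&version=enterprise-cloud&language=en&page=1&aggregate=toplevel&toplevel=Get+started&size=10
// Array values are appended once per element and booleans become the strings 'true'/'false'.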
|
|
@ -1,229 +0,0 @@
|
|||
import { format } from 'node:util'
|
||||
|
||||
import languages from '#src/languages/lib/languages.js'
|
||||
import { allVersions } from '#src/versions/lib/all-versions.js'
|
||||
import { POSSIBLE_HIGHLIGHT_FIELDS, DEFAULT_HIGHLIGHT_FIELDS } from './es-search.js'
|
||||
|
||||
const DEFAULT_SIZE = 10
|
||||
const DEFAULT_AUTOCOMPLETE_SIZE = 8
|
||||
const MAX_SIZE = 50 // How much you return has a strong impact on performance
|
||||
const MAX_AUTOCOMPLETE_SIZE = 10
|
||||
const DEFAULT_PAGE = 1
|
||||
const POSSIBLE_SORTS = ['best', 'relevance']
|
||||
const DEFAULT_SORT = POSSIBLE_SORTS[0]
|
||||
const MAX_PAGE = 10
|
||||
|
||||
// There are some fields you can optionally include in the output.
|
||||
// These are fields available in Elasticsearch that we don't include in
|
||||
// the output by default. E.g. `...&include=intro`
|
||||
// Requesting anything that is not in this list will result in
|
||||
// a 400 Bad Request.
|
||||
const V1_ADDITIONAL_INCLUDES = ['intro', 'headings', 'toplevel']
|
||||
|
||||
const V1_AGGREGATES = ['toplevel']
|
||||
|
||||
// If someone searches for `...&version=3.5` what they actually mean
|
||||
// is `ghes-3.5`. This is because of legacy formatting with the old search.
|
||||
// In some distant future we can clean up any client enough that this
|
||||
// aliasing won't be necessary.
|
||||
const versionAliases = {}
|
||||
const prefixVersionAliases = {}
|
||||
Object.values(allVersions).forEach((info) => {
|
||||
if (info.hasNumberedReleases) {
|
||||
versionAliases[info.currentRelease] = info.miscVersionName
|
||||
} else {
|
||||
versionAliases[info.version] = info.miscVersionName
|
||||
versionAliases[info.miscVersionName] = info.miscVersionName
|
||||
}
|
||||
// This makes it so you can search for `?version=enterprise-server`
|
||||
// and that actually means `?version=ghes` because there's an index
|
||||
// called `github-autocomplete-en-ghes`.
|
||||
prefixVersionAliases[info.plan] = info.shortName
|
||||
prefixVersionAliases[info.shortName] = info.shortName
|
||||
})
|
||||
|
||||
function getIndexPrefix() {
|
||||
// This logic is mirrored in the scripts we use before running tests
|
||||
// In particular, see the `index-test-fixtures` npm script.
|
||||
// That's expected to be run before CI and local vitest testing.
|
||||
// The reason we have a deliberately different index name (by prefix)
|
||||
// for testing compared to regular operation is to make it convenient
|
||||
// for engineers working on local manual testing *and* automated
|
||||
// testing without having to re-index different content (e.g. fixtures
|
||||
// vs real content) on the same index name.
|
||||
if (process.env.NODE_ENV === 'test') return 'tests_'
|
||||
|
||||
return ''
|
||||
}
|
||||
|
||||
class ValidationError extends Error {}
|
||||
|
||||
const PARAMS = [
|
||||
{ key: 'query' },
|
||||
{
|
||||
key: 'version',
|
||||
default_: 'dotcom',
|
||||
validate: (v) => {
|
||||
if (versionAliases[v] || allVersions[v]) return true
|
||||
const valid = [...Object.keys(versionAliases), ...Object.keys(allVersions)]
|
||||
throw new ValidationError(`'${v}' not in ${valid}`)
|
||||
},
|
||||
},
|
||||
{ key: 'language', default_: 'en', validate: (v) => v in languages },
|
||||
{
|
||||
key: 'size',
|
||||
default_: DEFAULT_SIZE,
|
||||
cast: (v) => parseInt(v, 10),
|
||||
validate: (v) => v >= 0 && v <= MAX_SIZE,
|
||||
},
|
||||
{
|
||||
key: 'page',
|
||||
default_: DEFAULT_PAGE,
|
||||
cast: (v) => parseInt(v, 10),
|
||||
validate: (v) => v >= 1 && v <= MAX_PAGE,
|
||||
},
|
||||
{ key: 'sort', default_: DEFAULT_SORT, validate: (v) => POSSIBLE_SORTS.includes(v) },
|
||||
{
|
||||
key: 'highlights',
|
||||
default_: DEFAULT_HIGHLIGHT_FIELDS,
|
||||
cast: (v) => (Array.isArray(v) ? v : [v]),
|
||||
multiple: true,
|
||||
validate: (v) => {
|
||||
for (const highlight of v) {
|
||||
if (!POSSIBLE_HIGHLIGHT_FIELDS.includes(highlight)) {
|
||||
throw new ValidationError(`highlight value '${highlight}' is not valid`)
|
||||
}
|
||||
}
|
||||
return true
|
||||
},
|
||||
},
|
||||
{ key: 'autocomplete', default_: false, cast: toBoolean },
|
||||
{ key: 'debug', default_: process.env.NODE_ENV === 'development', cast: toBoolean },
|
||||
{
|
||||
key: 'include',
|
||||
default_: [],
|
||||
cast: toArray,
|
||||
multiple: true,
|
||||
// Note: At the time of writing this general validator middleware
|
||||
// doesn't yet know it's being used by the v1 version.
|
||||
// But we don't have any other versions yet so no need to
|
||||
// over-engineer this more.
|
||||
validate: (values) => values.every((value) => V1_ADDITIONAL_INCLUDES.includes(value)),
|
||||
},
|
||||
{
|
||||
key: 'toplevel',
|
||||
default_: [],
|
||||
cast: toArray,
|
||||
multiple: true,
|
||||
},
|
||||
{
|
||||
key: 'aggregate',
|
||||
default_: [],
|
||||
cast: toArray,
|
||||
multiple: true,
|
||||
validate: (values) => values.every((value) => V1_AGGREGATES.includes(value)),
|
||||
},
|
||||
]
|
||||
|
||||
const AUTOCOMPLETE_PARAMS = [
|
||||
{ key: 'query' },
|
||||
{ key: 'language', default_: 'en', validate: (v) => v in languages },
|
||||
{
|
||||
key: 'version',
|
||||
default_: 'free-pro-team',
|
||||
validate: (v) => {
|
||||
if (prefixVersionAliases[v] || allVersions[v]) return true
|
||||
if (Object.values(prefixVersionAliases).includes(v)) return true
|
||||
const valid = [
|
||||
...Object.keys(prefixVersionAliases),
|
||||
...Object.values(prefixVersionAliases),
|
||||
...Object.keys(allVersions),
|
||||
]
|
||||
throw new ValidationError(`'${v}' not in ${valid.join(', ')}`)
|
||||
},
|
||||
},
|
||||
{
|
||||
key: 'size',
|
||||
default_: DEFAULT_AUTOCOMPLETE_SIZE,
|
||||
cast: (v) => parseInt(v, 10),
|
||||
validate: (v) => v >= 0 && v <= MAX_AUTOCOMPLETE_SIZE,
|
||||
},
|
||||
]
|
||||
export function getAutocompleteSearchFromRequest(req, force = {}) {
|
||||
const { search, validationErrors } = getSearchFromRequest(req, {}, AUTOCOMPLETE_PARAMS)
|
||||
if (validationErrors.length === 0) {
|
||||
const version = prefixVersionAliases[search.version] || allVersions[search.version].shortName
|
||||
search.indexName = `${getIndexPrefix()}github-autocomplete-${search.language}-${version}`
|
||||
}
|
||||
return { search, validationErrors }
|
||||
}
|
||||
|
||||
export function getSearchFromRequest(req, force = {}, params = PARAMS) {
|
||||
const search = {}
|
||||
const validationErrors = []
|
||||
|
||||
for (const { key, default_, cast, validate, multiple } of params) {
|
||||
// This is necessary because when the version or language comes from
|
||||
// the pathname, we don't want to pick these up from the query string.
|
||||
// This function gets used by /$locale/$version/search
|
||||
// *and* /api/search/v1?language=$locale&version=$version
|
||||
if (key in force) {
|
||||
search[key] = force[key]
|
||||
continue
|
||||
}
|
||||
|
||||
let value = req.query[key]
|
||||
if (!value || (typeof value === 'string' && !value.trim())) {
|
||||
if (default_ === undefined) {
|
||||
// no value and no default, bad!
|
||||
validationErrors.push({ error: `No truthy value for key '${key}'`, key })
|
||||
continue
|
||||
}
|
||||
value = default_
|
||||
}
|
||||
if (cast) {
|
||||
value = cast(value)
|
||||
}
|
||||
try {
|
||||
if (validate && !validate(value)) {
|
||||
validationErrors.push({
|
||||
error: format('Not a valid value (%O) for key %O', value, key),
|
||||
key,
|
||||
})
|
||||
}
|
||||
} catch (err) {
|
||||
if (err instanceof ValidationError) {
|
||||
validationErrors.push({ error: err.toString(), field: key })
|
||||
} else {
|
||||
throw err
|
||||
}
|
||||
}
|
||||
if (!multiple && Array.isArray(value)) {
|
||||
validationErrors.push({
|
||||
error: format('Cannot have multiple values (%O) for key %O', value, key),
|
||||
key,
|
||||
})
|
||||
}
|
||||
|
||||
search[key] = value
|
||||
}
|
||||
|
||||
if (!validationErrors.length) {
|
||||
const version =
|
||||
prefixVersionAliases[search.version] ||
|
||||
versionAliases[search.version] ||
|
||||
allVersions[search.version].miscVersionName
|
||||
search.indexName = `${getIndexPrefix()}github-docs-${version}-${search.language}` // github-docs-ghes-3.5-en
|
||||
}
|
||||
|
||||
return { search, validationErrors }
|
||||
}
|
||||
|
||||
function toBoolean(value) {
|
||||
if (value === 'true' || value === '1') return true
|
||||
return false
|
||||
}
|
||||
|
||||
function toArray(value) {
|
||||
return Array.isArray(value) ? value : [value]
|
||||
}
|
|
@ -0,0 +1,150 @@
|
|||
/*
|
||||
This file and the routes included are for the /search endpoint of our API
|
||||
|
||||
For general search (client searches on docs.github.com) we use the middleware in ./general-search-middleware to get the search results
|
||||
*/
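// Illustrative note (an assumption for readers, not part of the change itself):
// a general search request to this router looks roughly like
//   GET /api/search/v1?query=actions&language=en
// and responds with the `{ meta, hits, aggregations }` JSON produced below.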
|
||||
import express, { Request, Response } from 'express'
|
||||
|
||||
import FailBot from '@/observability/lib/failbot.js'
|
||||
import { searchCacheControl } from '@/frame/middleware/cache-control.js'
|
||||
import catchMiddlewareError from '@/observability/middleware/catch-middleware-error.js'
|
||||
import {
|
||||
setFastlySurrogateKey,
|
||||
SURROGATE_ENUMS,
|
||||
} from '@/frame/middleware/set-fastly-surrogate-key.js'
|
||||
import { getAutocompleteSearchResults } from '@/search/lib/get-elasticsearch-results/general-autocomplete'
|
||||
import { getAISearchAutocompleteResults } from '@/search/lib/get-elasticsearch-results/ai-search-autocomplete'
|
||||
import { getSearchFromRequestParams } from '@/search/lib/search-request-params/get-search-from-request-params'
|
||||
import { getGeneralSearchResults } from '@/search/lib/get-elasticsearch-results/general-search'
|
||||
|
||||
const router = express.Router()
|
||||
|
||||
router.get('/legacy', (req: Request, res: Response) => {
|
||||
res.status(410).send('Use /api/search/v1 instead.')
|
||||
})
|
||||
|
||||
router.get(
|
||||
'/v1',
|
||||
catchMiddlewareError(async (req: Request, res: Response) => {
|
||||
const { indexName, searchParams, validationErrors } = getSearchFromRequestParams(
|
||||
req,
|
||||
'generalSearch',
|
||||
)
|
||||
if (validationErrors.length) {
|
||||
// We only send the first validation error to the user
|
||||
return res.status(400).json(validationErrors[0])
|
||||
}
|
||||
|
||||
const getResultOptions = {
|
||||
indexName,
|
||||
searchParams,
|
||||
}
|
||||
try {
|
||||
const { meta, hits, aggregations } = await getGeneralSearchResults(getResultOptions)
|
||||
|
||||
if (process.env.NODE_ENV !== 'development') {
|
||||
searchCacheControl(res)
|
||||
// We can cache this without purging it after every deploy
|
||||
// because the API search is only used as a proxy for local
|
||||
// and preview environments.
|
||||
setFastlySurrogateKey(res, SURROGATE_ENUMS.MANUAL)
|
||||
}
|
||||
|
||||
res.status(200).json({ meta, hits, aggregations })
|
||||
} catch (error) {
|
||||
await handleGetSearchResultsError(req, res, error, getResultOptions)
|
||||
}
|
||||
}),
|
||||
)
|
||||
|
||||
router.get(
|
||||
'/autocomplete/v1',
|
||||
catchMiddlewareError(async (req: Request, res: Response) => {
|
||||
const {
|
||||
indexName,
|
||||
validationErrors,
|
||||
searchParams: { query, size },
|
||||
} = getSearchFromRequestParams(req, 'generalAutocomplete')
|
||||
if (validationErrors.length) {
|
||||
return res.status(400).json(validationErrors[0])
|
||||
}
|
||||
|
||||
const options = {
|
||||
indexName,
|
||||
query,
|
||||
size,
|
||||
}
|
||||
try {
|
||||
const { meta, hits } = await getAutocompleteSearchResults(options)
|
||||
|
||||
if (process.env.NODE_ENV !== 'development') {
|
||||
searchCacheControl(res)
|
||||
setFastlySurrogateKey(res, SURROGATE_ENUMS.MANUAL)
|
||||
}
|
||||
|
||||
res.status(200).json({ meta, hits })
|
||||
} catch (error) {
|
||||
await handleGetSearchResultsError(req, res, error, options)
|
||||
}
|
||||
}),
|
||||
)
|
||||
|
||||
router.get(
|
||||
'/ai-search-autocomplete/v1',
|
||||
catchMiddlewareError(async (req: Request, res: Response) => {
|
||||
const {
|
||||
indexName,
|
||||
validationErrors,
|
||||
searchParams: { query, size },
|
||||
} = getSearchFromRequestParams(req, 'aiSearchAutocomplete')
|
||||
if (validationErrors.length) {
|
||||
return res.status(400).json(validationErrors[0])
|
||||
}
|
||||
|
||||
const getResultOptions = {
|
||||
indexName,
|
||||
query,
|
||||
size,
|
||||
}
|
||||
try {
|
||||
const { meta, hits } = await getAISearchAutocompleteResults(getResultOptions)
|
||||
|
||||
if (process.env.NODE_ENV !== 'development') {
|
||||
searchCacheControl(res)
|
||||
setFastlySurrogateKey(res, SURROGATE_ENUMS.MANUAL)
|
||||
}
|
||||
|
||||
res.status(200).json({ meta, hits })
|
||||
} catch (error) {
|
||||
await handleGetSearchResultsError(req, res, error, getResultOptions)
|
||||
}
|
||||
}),
|
||||
)
|
||||
|
||||
async function handleGetSearchResultsError(req: Request, res: Response, error: any, options: any) {
|
||||
if (process.env.NODE_ENV === 'development') {
|
||||
console.error(`Error calling getSearchResults(${options})`, error)
|
||||
} else {
|
||||
const reports = FailBot.report(error, { url: req.url, ...options })
|
||||
if (reports) await Promise.all(reports)
|
||||
}
|
||||
res.status(500).json({ error: error.message })
|
||||
}
|
||||
|
||||
// Redirects for latest versions
|
||||
router.get('/', (req: Request, res: Response) => {
|
||||
res.redirect(307, req.originalUrl.replace('/search', '/search/v1'))
|
||||
})
|
||||
|
||||
router.get('/autocomplete', (req: Request, res: Response) => {
|
||||
res.redirect(307, req.originalUrl.replace('/search/autocomplete', '/search/autocomplete/v1'))
|
||||
})
|
||||
|
||||
router.get('/ai-search-autocomplete', (req: Request, res: Response) => {
|
||||
res.redirect(
|
||||
307,
|
||||
req.originalUrl.replace('/search/ai-search-autocomplete', '/search/ai-search-autocomplete/v1'),
|
||||
)
|
||||
})
|
||||
|
||||
export default router
|
|
@ -1,160 +0,0 @@
|
|||
import express from 'express'
|
||||
|
||||
import FailBot from '#src/observability/lib/failbot.js'
|
||||
import { searchCacheControl } from '#src/frame/middleware/cache-control.js'
|
||||
import catchMiddlewareError from '#src/observability/middleware/catch-middleware-error.js'
|
||||
import {
|
||||
setFastlySurrogateKey,
|
||||
SURROGATE_ENUMS,
|
||||
} from '#src/frame/middleware/set-fastly-surrogate-key.js'
|
||||
import { getAutocompleteSearchResults, getSearchResults } from './es-search.js'
|
||||
import { getAutocompleteSearchFromRequest, getSearchFromRequest } from './get-search-request.js'
|
||||
|
||||
const router = express.Router()
|
||||
|
||||
router.get('/legacy', (req, res) => {
|
||||
res.status(410).send('Use /api/search/v1 instead.')
|
||||
})
|
||||
|
||||
export const validationMiddleware = (req, res, next) => {
|
||||
const { search, validationErrors } = getSearchFromRequest(req)
|
||||
if (validationErrors.length) {
|
||||
// There might be multiple things bad about the query parameters,
|
||||
// but we send a 400 on the first possible one in the API.
|
||||
return res.status(400).json(validationErrors[0])
|
||||
}
|
||||
|
||||
req.search = search
|
||||
return next()
|
||||
}
|
||||
|
||||
router.get(
|
||||
'/v1',
|
||||
validationMiddleware,
|
||||
catchMiddlewareError(async function search(req, res) {
|
||||
const {
|
||||
indexName,
|
||||
query,
|
||||
autocomplete,
|
||||
page,
|
||||
size,
|
||||
debug,
|
||||
sort,
|
||||
highlights,
|
||||
include,
|
||||
toplevel,
|
||||
aggregate,
|
||||
} = req.search
|
||||
|
||||
const options = {
|
||||
indexName,
|
||||
query,
|
||||
page,
|
||||
size,
|
||||
debug,
|
||||
sort,
|
||||
highlights,
|
||||
usePrefixSearch: autocomplete,
|
||||
include,
|
||||
toplevel,
|
||||
aggregate,
|
||||
}
|
||||
try {
|
||||
const { meta, hits, aggregations } = await getSearchResults(options)
|
||||
|
||||
if (process.env.NODE_ENV !== 'development') {
|
||||
searchCacheControl(res)
|
||||
// We can cache this without purging it after every deploy
|
||||
// because the API search is only used as a proxy for local
|
||||
// and preview environments.
|
||||
setFastlySurrogateKey(res, SURROGATE_ENUMS.MANUAL)
|
||||
}
|
||||
|
||||
// The v1 version of the output matches perfectly what comes out
|
||||
// of the getSearchResults() function.
|
||||
res.status(200).json({ meta, hits, aggregations })
|
||||
} catch (error) {
|
||||
// If getSearchResult() throws an error that might be 404 inside
|
||||
// elasticsearch, and we don't capture that here, it will propagate
|
||||
// to the next middleware.
|
||||
await handleGetSearchResultsError(req, res, error, options)
|
||||
}
|
||||
}),
|
||||
)
|
||||
|
||||
export const autocompleteValidationMiddleware = (req, res, next) => {
|
||||
const { search, validationErrors } = getAutocompleteSearchFromRequest(req)
|
||||
if (validationErrors.length) {
|
||||
// There might be multiple things bad about the query parameters,
|
||||
// but we send a 400 on the first possible one in the API.
|
||||
return res.status(400).json(validationErrors[0])
|
||||
}
|
||||
|
||||
req.search = search
|
||||
return next()
|
||||
}
|
||||
|
||||
router.get(
|
||||
'/autocomplete/v1',
|
||||
autocompleteValidationMiddleware,
|
||||
catchMiddlewareError(async (req, res) => {
|
||||
const { indexName, query, size } = req.search
|
||||
|
||||
const options = {
|
||||
indexName,
|
||||
query,
|
||||
size,
|
||||
}
|
||||
try {
|
||||
const { meta, hits } = await getAutocompleteSearchResults(options)
|
||||
|
||||
if (process.env.NODE_ENV !== 'development') {
|
||||
searchCacheControl(res)
|
||||
// We can cache this without purging it after every deploy
|
||||
// because the API search is only used as a proxy for local
|
||||
// and preview environments.
|
||||
setFastlySurrogateKey(res, SURROGATE_ENUMS.MANUAL)
|
||||
}
|
||||
|
||||
// The v1 version of the output matches perfectly what comes out
|
||||
// of the getSearchResults() function.
|
||||
res.status(200).json({ meta, hits })
|
||||
} catch (error) {
|
||||
// If getSearchResult() throws an error that might be 404 inside
|
||||
// elasticsearch, and we don't capture that here, it will propagate
|
||||
// to the next middleware.
|
||||
await handleGetSearchResultsError(req, res, error, options)
|
||||
}
|
||||
}),
|
||||
)
|
||||
|
||||
// We have more than one place where we do `try{...} catch error( THIS )`
|
||||
// which is slightly different depending on the "sub-version" (e.g. /legacy)
|
||||
// This function is a single place to take care of all of these error handlings
|
||||
async function handleGetSearchResultsError(req, res, error, options) {
|
||||
if (process.env.NODE_ENV === 'development') {
|
||||
console.error(`Error calling getSearchResults(${options})`, error)
|
||||
} else {
|
||||
const reports = FailBot.report(error, Object.assign({ url: req.url }, options))
|
||||
// It might be `undefined` if no backends are configured which
|
||||
// is likely when using production NODE_ENV on your laptop
|
||||
// where you might not have a HAYSTACK_URL configured.
|
||||
if (reports) await Promise.all(reports)
|
||||
}
|
||||
res.status(500).json({ error: error.message })
|
||||
}
|
||||
|
||||
// Alias for the latest version
|
||||
router.get('/', (req, res) => {
|
||||
// At the time of writing, the latest version is v1. (July 2022)
|
||||
// Use `req.originalUrl` because this router is "self contained"
|
||||
// which means that `req.url` will be `/` in this context.
|
||||
res.redirect(307, req.originalUrl.replace('/search', '/search/v1'))
|
||||
})
|
||||
|
||||
// Alias for the latest autocomplete version
|
||||
router.get('/autocomplete', (req, res) => {
|
||||
res.redirect(307, req.originalUrl.replace('/search/autocomplete', '/search/autocomplete/v1'))
|
||||
})
|
||||
|
||||
export default router
|
|
@ -7,9 +7,10 @@ import {
|
|||
addUINamespaces,
|
||||
} from 'src/frame/components/context/MainContext'
|
||||
import { DefaultLayout } from 'src/frame/components/DefaultLayout'
|
||||
import type { SearchT } from 'src/search/components/types'
|
||||
import { SearchContext, SearchContextT } from 'src/search/components/context/SearchContext'
|
||||
import { SearchContext } from 'src/search/components/context/SearchContext'
|
||||
import { Search } from 'src/search/components/index'
|
||||
import { SearchOnReqObject } from 'src/search/types'
|
||||
import type { SearchContextT } from 'src/search/components/types'
|
||||
|
||||
type Props = {
|
||||
mainContext: MainContextT
|
||||
|
@ -40,6 +41,8 @@ export const getServerSideProps: GetServerSideProps<Props> = async (context) =>
|
|||
throw new Error('Expected req.context to be populated with .search')
|
||||
}
|
||||
|
||||
const searchObject = req.context.search as SearchOnReqObject<'generalSearch'>
|
||||
|
||||
// The `req.context.search` is similar to what's needed to render
// the search result page in React.
|
||||
// But it contains information (from the contextualizing) that is
|
||||
|
@ -48,24 +51,24 @@ export const getServerSideProps: GetServerSideProps<Props> = async (context) =>
|
|||
// `page` and `indexName` which was useful when it made the actual
|
||||
// Elasticsearch query. But it's not needed to render the results.
|
||||
// We explicitly pick out the parts that are needed, only.
|
||||
const search: SearchT = {
|
||||
search: {
|
||||
query: req.context.search.search.query,
|
||||
debug: req.context.search.search.debug,
|
||||
const search: SearchContextT['search'] = {
|
||||
searchParams: {
|
||||
query: searchObject.searchParams.query,
|
||||
debug: searchObject.searchParams.debug,
|
||||
},
|
||||
validationErrors: req.context.search.validationErrors,
|
||||
validationErrors: searchObject.validationErrors,
|
||||
}
|
||||
// If there are no results (e.g. /en/search?query=) from the
|
||||
// contextualizing, then `req.context.search.results` will
|
||||
// be `undefined` which can't be serialized as a prop, using JSON.stringify.
|
||||
if (req.context.search.results) {
|
||||
if (searchObject.results) {
|
||||
search.results = {
|
||||
meta: req.context.search.results.meta,
|
||||
hits: req.context.search.results.hits,
|
||||
meta: searchObject.results.meta,
|
||||
hits: searchObject.results.hits,
|
||||
// Use `null` instead of `undefined` for JSON serialization.
|
||||
// The only reason it would ever not be truthy is if the aggregates
|
||||
// functionality is not enabled for this version.
|
||||
aggregations: req.context.search.results.aggregations || null,
|
||||
aggregations: searchObject.results.aggregations || null,
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -1,24 +1,19 @@
|
|||
#!/usr/bin/env node
|
||||
|
||||
// [start-readme]
|
||||
//
|
||||
// See how a piece of text gets turned into tokens by the different
|
||||
// analyzers.
|
||||
// See how a piece of text gets turned into tokens by the different analyzers.
|
||||
// Requires that the index exists in Elasticsearch.
|
||||
//
|
||||
// Example:
|
||||
//
|
||||
// npm run analyze-text "my words" to tokenize
|
||||
//
|
||||
// [end-readme]
|
||||
// npm run analyze-text -- -V dotcom -l en "The name of the wind"
|
||||
|
||||
import { Client } from '@elastic/elasticsearch'
|
||||
import { program, Option } from 'commander'
|
||||
import { Command, Option } from 'commander'
|
||||
import chalk from 'chalk'
|
||||
import dotenv from 'dotenv'
|
||||
|
||||
import { languageKeys } from '#src/languages/lib/languages.js'
|
||||
import { allVersions } from '#src/versions/lib/all-versions.js'
|
||||
import { languageKeys } from '@/languages/lib/languages.js'
|
||||
import { allVersions } from '@/versions/lib/all-versions.js'
|
||||
|
||||
import type { IndicesAnalyzeAnalyzeToken } from '@elastic/elasticsearch/lib/api/types'
|
||||
|
||||
// You can optionally set ELASTICSEARCH_URL in your .env file.
|
||||
dotenv.config()
|
||||
|
@ -38,16 +33,28 @@ dotenv.config()
|
|||
//
|
||||
// We need this later to be able to map CLI arguments to what the
|
||||
// records are called when found on disk.
|
||||
const shortNames = Object.fromEntries(
|
||||
Object.values(allVersions).map((info) => {
|
||||
const shortName = info.hasNumberedReleases
|
||||
? info.miscBaseName + info.currentRelease
|
||||
: info.miscBaseName
|
||||
return [shortName, info]
|
||||
}),
|
||||
)
|
||||
const shortNames: Record<string, (typeof allVersions)[keyof typeof allVersions]> =
|
||||
Object.fromEntries(
|
||||
Object.values(allVersions).map((info) => {
|
||||
const shortName = info.hasNumberedReleases
|
||||
? `${info.miscBaseName}${info.currentRelease}`
|
||||
: info.miscBaseName
|
||||
return [shortName, info]
|
||||
}),
|
||||
)
|
||||
|
||||
const allVersionKeys = Object.keys(shortNames)
|
||||
const allVersionKeys: string[] = Object.keys(shortNames)
|
||||
|
||||
interface Options {
|
||||
verbose?: boolean
|
||||
version?: string
|
||||
language?: string
|
||||
notLanguage?: string
|
||||
elasticsearchUrl?: string
|
||||
indexPrefix?: string
|
||||
}
|
||||
|
||||
const program = new Command()
|
||||
|
||||
program
|
||||
.description('Analyze text into tokens')
|
||||
|
@ -56,21 +63,29 @@ program
|
|||
.addOption(
|
||||
new Option('-l, --language <LANGUAGE>', 'Which language to focus on').choices(languageKeys),
|
||||
)
|
||||
.option('--not-language <LANGUAGE>', 'Exclude a specific language')
|
||||
.option('-u, --elasticsearch-url <url>', 'If different from $ELASTICSEARCH_URL')
|
||||
.option('--index-prefix <PREFIX>', 'Prefix for the index name')
|
||||
.argument('<text>', 'text to tokenize')
|
||||
.parse(process.argv)
|
||||
|
||||
main(program.opts(), program.args)
|
||||
const options = program.opts<Options>()
|
||||
const args: string[] = program.args
|
||||
|
||||
async function main(opts, args) {
|
||||
main(options, args).catch((err) => {
|
||||
console.error(chalk.red('Error:'), err)
|
||||
process.exit(1)
|
||||
})
|
||||
|
||||
async function main(opts: Options, args: string[]): Promise<void> {
|
||||
const texts = [args.join(' ')]
|
||||
if (!opts.elasticsearchUrl && !process.env.ELASTICSEARCH_URL) {
|
||||
throw new Error(
|
||||
'Must passed the elasticsearch URL option or ' +
|
||||
'Must pass the elasticsearch URL option or ' +
|
||||
'set the environment variable ELASTICSEARCH_URL',
|
||||
)
|
||||
}
|
||||
let node = opts.elasticsearchUrl || process.env.ELASTICSEARCH_URL
|
||||
let node = opts.elasticsearchUrl || process.env.ELASTICSEARCH_URL!
|
||||
|
||||
// Allow the user to lazily set it to `localhost:9200` for example.
|
||||
if (!node.startsWith('http') && !node.startsWith('://') && node.split(':').length === 2) {
|
||||
|
@ -79,15 +94,15 @@ async function main(opts, args) {
|
|||
|
||||
try {
|
||||
const parsed = new URL(node)
|
||||
if (!parsed.hostname) throw new Error('no valid hostname')
|
||||
if (!parsed.hostname) throw new Error('No valid hostname')
|
||||
} catch (err) {
|
||||
console.error(chalk.bold('URL for Elasticsearch not a valid URL', err))
|
||||
console.error(chalk.bold('URL for Elasticsearch not a valid URL'), err)
|
||||
return
|
||||
}
|
||||
|
||||
const { verbose, language, notLanguage } = opts
|
||||
|
||||
// The notLanguage is useful you want to, for example, index all languages
|
||||
// The notLanguage is useful if you want to, for example, index all languages
|
||||
// *except* English.
|
||||
if (language && notLanguage) {
|
||||
throw new Error("Can't combine --language and --not-language")
|
||||
|
@ -116,29 +131,32 @@ async function main(opts, args) {
|
|||
|
||||
const indexName = `${prefix}github-docs-${versionKey}-${languageKey}`
|
||||
console.log(chalk.yellow(`Analyzing in ${chalk.bold(indexName)}`))
|
||||
await analyzeVersion(client, texts, indexName, verbose)
|
||||
await analyzeVersion(client, texts, indexName)
|
||||
}
|
||||
|
||||
function safeUrlDisplay(url) {
|
||||
function safeUrlDisplay(url: string): string {
|
||||
const parsed = new URL(url)
|
||||
if (parsed.password) {
|
||||
parsed.password = '***'
|
||||
}
|
||||
if (parsed.username) {
|
||||
parsed.username = parsed.username.slice(0, 4) + '***'
|
||||
parsed.username = `${parsed.username.slice(0, 4)}***`
|
||||
}
|
||||
return parsed.toString()
|
||||
}
|
||||
async function analyzeVersion(client, texts, indexName, verbose = false) {
|
||||
|
||||
async function analyzeVersion(client: Client, texts: string[], indexName: string): Promise<void> {
|
||||
for (const text of texts) {
|
||||
console.log(`RAW TEXT: 〝${chalk.italic(text)}〞`)
|
||||
for (const analyzer of ['text_analyzer_explicit', 'text_analyzer', 'standard']) {
|
||||
console.log('ANALYZER:', chalk.bold(analyzer))
|
||||
const { tokens } = await client.indices.analyze({
|
||||
const response = await client.indices.analyze({
|
||||
index: indexName,
|
||||
body: { analyzer, text },
|
||||
})
|
||||
const tokenWords = tokens.map((token) => token.token)
|
||||
|
||||
const tokens: IndicesAnalyzeAnalyzeToken[] | undefined = response.tokens
|
||||
const tokenWords: string[] = tokens?.map((token) => token.token) || []
|
||||
console.log(tokenWords)
|
||||
}
|
||||
}
|
|
@ -1,575 +0,0 @@
|
|||
#!/usr/bin/env node
|
||||
|
||||
// [start-readme]
|
||||
//
|
||||
// Creates Elasticsearch index, populates from records,
|
||||
// moves the index alias, deletes old indexes.
|
||||
//
|
||||
// [end-readme]
|
||||
|
||||
import fs from 'fs/promises'
|
||||
import path from 'path'
|
||||
|
||||
import { Client, errors } from '@elastic/elasticsearch'
|
||||
import { program, Option, InvalidArgumentError } from 'commander'
|
||||
import chalk from 'chalk'
|
||||
import dotenv from 'dotenv'
|
||||
|
||||
import { retryOnErrorTest } from './retry-on-error-test.js'
|
||||
import { languageKeys } from '#src/languages/lib/languages.js'
|
||||
import { allVersions } from '#src/versions/lib/all-versions.js'
|
||||
|
||||
// You can optionally set ELASTICSEARCH_URL in your .env file.
|
||||
dotenv.config()
|
||||
|
||||
// Create an object that maps the "short name" of a version to
|
||||
// all information about it. E.g.
|
||||
//
|
||||
// {
|
||||
// 'ghes-3.5': {
|
||||
// hasNumberedReleases: true,
|
||||
// currentRelease: '3.5',
|
||||
// version: 'enterprise-server@3.5',
|
||||
// miscBaseName: 'ghes-'
|
||||
// ...
|
||||
// },
|
||||
// ...
|
||||
//
|
||||
// We need this later to be able to map CLI arguments to what the
|
||||
// records are called when found on disk.
|
||||
const shortNames = Object.fromEntries(
|
||||
Object.values(allVersions).map((info) => {
|
||||
const shortName = info.hasNumberedReleases
|
||||
? info.miscBaseName + info.currentRelease
|
||||
: info.miscBaseName
|
||||
return [shortName, info]
|
||||
}),
|
||||
)
|
||||
|
||||
const allVersionKeys = Object.keys(shortNames)
|
||||
|
||||
const DEFAULT_SLEEPTIME_SECONDS = 30
|
||||
|
||||
program
|
||||
.description('Creates Elasticsearch index from records')
|
||||
.option('-v, --verbose', 'Verbose outputs')
|
||||
.addOption(new Option('-V, --version [VERSION...]', 'Specific versions').choices(allVersionKeys))
|
||||
.addOption(
|
||||
new Option('-l, --language <LANGUAGE...>', 'Which languages to focus on').choices(languageKeys),
|
||||
)
|
||||
.addOption(
|
||||
new Option('--not-language <LANGUAGE...>', 'Specific language to omit').choices(languageKeys),
|
||||
)
|
||||
.option('-u, --elasticsearch-url <url>', 'If different from $ELASTICSEARCH_URL')
|
||||
.option('-p, --index-prefix <prefix>', 'Index string to put before index name')
|
||||
.option(
|
||||
'-s, --stagger-seconds <seconds>',
|
||||
'Number of seconds to sleep between each bulk operation',
|
||||
(value) => {
|
||||
const parsed = parseInt(value, 10)
|
||||
if (isNaN(parsed)) {
|
||||
throw new InvalidArgumentError('Not a number.')
|
||||
}
|
||||
return parsed
|
||||
},
|
||||
)
|
||||
.option(
|
||||
'-r, --retries <count>',
|
||||
'Number of retry attempts on recoverable network errors',
|
||||
(value) => {
|
||||
const parsed = parseInt(value, 10)
|
||||
if (isNaN(parsed)) {
|
||||
throw new InvalidArgumentError('Not a number.')
|
||||
}
|
||||
return parsed
|
||||
},
|
||||
)
|
||||
.option(
|
||||
'--sleep-time <seconds>',
|
||||
`Number of seconds to sleep between each retry attempt (defaults to ${DEFAULT_SLEEPTIME_SECONDS})`,
|
||||
(value) => {
|
||||
const parsed = parseInt(value, 10)
|
||||
if (isNaN(parsed)) {
|
||||
throw new InvalidArgumentError('Not a number.')
|
||||
}
|
||||
return parsed
|
||||
},
|
||||
)
|
||||
.argument('<source-directory>', 'where the indexable files are')
|
||||
.parse(process.argv)
|
||||
|
||||
main(program.opts(), program.args)
|
||||
|
||||
async function main(opts, args) {
|
||||
if (!args.length) {
|
||||
throw new Error('Must pass the source as the first argument')
|
||||
}
|
||||
|
||||
const { verbose, language, notLanguage, elasticsearchUrl } = opts
|
||||
|
||||
if (!elasticsearchUrl && !process.env.ELASTICSEARCH_URL) {
|
||||
throw new Error(
|
||||
'Must pass the elasticsearch URL option or ' +
|
||||
'set the environment variable ELASTICSEARCH_URL',
|
||||
)
|
||||
}
|
||||
let node = elasticsearchUrl || process.env.ELASTICSEARCH_URL
|
||||
|
||||
// Allow the user to lazily set it to `localhost:9200` for example.
|
||||
if (!node.startsWith('http') && !node.startsWith('://') && node.split(':').length === 2) {
|
||||
node = `http://${node}`
|
||||
}
|
||||
|
||||
try {
|
||||
const parsed = new URL(node)
|
||||
if (!parsed.hostname) throw new Error('no valid hostname')
|
||||
} catch (err) {
|
||||
console.error(chalk.bold('URL for Elasticsearch not a valid URL', err))
|
||||
throw err
|
||||
}
|
||||
|
||||
// The notLanguage is useful if you want to, for example, index all languages
|
||||
// *except* English.
|
||||
if (language && notLanguage) {
|
||||
throw new Error("Can't combine --language and --not-language")
|
||||
}
|
||||
|
||||
if (verbose) {
|
||||
console.log(`Connecting to ${chalk.bold(safeUrlDisplay(node))}`)
|
||||
}
|
||||
const sourceDirectory = args[0]
|
||||
try {
|
||||
await fs.stat(sourceDirectory)
|
||||
} catch (error) {
|
||||
if (error.code === 'ENOENT') {
|
||||
throw new Error(`The specified directory '${sourceDirectory}' does not exist.`)
|
||||
}
|
||||
throw error
|
||||
}
|
||||
|
||||
try {
|
||||
await indexAll(node, sourceDirectory, opts)
|
||||
} catch (error) {
|
||||
// If any error is thrown from within the SDK, that error object will
|
||||
// contain a `Connection` object which, when printed, can reveal the
|
||||
// username/password or the base64 Basic auth credentials.
|
||||
// So we want to carefully re-throw it so it only contains the minimal
|
||||
// information for debugging without exposing the Connection credentials
|
||||
// in Actions logs.
|
||||
if (error instanceof errors.ElasticsearchClientError) {
|
||||
// All ElasticsearchClientError error subclasses have a `name` and
|
||||
// `message` but only some have a `meta`.
|
||||
if (error.meta) {
|
||||
console.error('Error meta: %O', error.meta)
|
||||
}
|
||||
throw new Error(error.message)
|
||||
}
|
||||
// If any other error happens that isn't from the elasticsearch SDK,
|
||||
// let it bubble up.
|
||||
throw error
|
||||
}
|
||||
}
|
||||
|
||||
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms))
|
||||
|
||||
async function indexAll(node, sourceDirectory, opts) {
|
||||
const client = new Client({ node })
|
||||
|
||||
const { language, verbose, notLanguage, indexPrefix, staggerSeconds } = opts
|
||||
|
||||
let version
|
||||
if ('version' in opts) {
|
||||
version = opts.version
|
||||
if (process.env.VERSION) {
|
||||
console.warn(
|
||||
`'version' specified as argument ('${version}') AND environment variable ('${process.env.VERSION}')`,
|
||||
)
|
||||
}
|
||||
} else {
|
||||
if (process.env.VERSION && process.env.VERSION !== 'all') {
|
||||
version = process.env.VERSION
|
||||
if (!allVersionKeys.includes(version)) {
|
||||
throw new Error(
|
||||
`Environment variable 'VERSION' (${version}) is not recognized. Must be one of ${allVersionKeys}`,
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
let versionKeys = allVersionKeys
|
||||
// If it came from the `--version` argument parsing, it might be a string
|
||||
// or an array of strings because it uses `--version [VERSION...]`.
|
||||
if (version) {
|
||||
if (Array.isArray(version)) {
|
||||
versionKeys = version
|
||||
} else {
|
||||
versionKeys = [version]
|
||||
}
|
||||
}
|
||||
|
||||
// This will throw if it can't ping
|
||||
await client.ping()
|
||||
|
||||
const languages =
|
||||
language || languageKeys.filter((lang) => !notLanguage || !notLanguage.includes(lang))
|
||||
if (verbose) {
|
||||
console.log(`Indexing on languages ${chalk.bold(languages.join(', '))}`)
|
||||
}
|
||||
|
||||
const prefix = indexPrefix ? `${indexPrefix}_` : ''
|
||||
|
||||
for (const language of languages) {
|
||||
let count = 0
|
||||
for (const versionKey of versionKeys) {
|
||||
console.log(chalk.yellow(`Indexing ${chalk.bold(versionKey)} in ${chalk.bold(language)}`))
|
||||
const indexName = `${prefix}github-docs-${versionKey}-${language}`
|
||||
|
||||
const t0 = new Date()
|
||||
await indexVersion(client, indexName, versionKey, language, sourceDirectory, opts)
|
||||
const t1 = new Date()
|
||||
console.log(chalk.green(`Finished indexing ${indexName}. Took ${formatTime(t1 - t0)}`))
|
||||
if (verbose) {
|
||||
console.log(`To view index: ${safeUrlDisplay(node + `/${indexName}`)}`)
|
||||
console.log(`To search index: ${safeUrlDisplay(node + `/${indexName}/_search`)}`)
|
||||
}
|
||||
count++
|
||||
// console.log({ count, versionKeysLength: versionKeys.length })
|
||||
if (staggerSeconds && count < versionKeys.length - 1) {
|
||||
console.log(`Sleeping for ${staggerSeconds} seconds...`)
|
||||
await sleep(1000 * staggerSeconds)
|
||||
}
|
||||
// A bit of visual separation between each version
|
||||
console.log('')
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
function safeUrlDisplay(url) {
|
||||
const parsed = new URL(url)
|
||||
if (parsed.password) {
|
||||
parsed.password = '***'
|
||||
}
|
||||
if (parsed.username) {
|
||||
parsed.username = parsed.username.slice(0, 4) + '***'
|
||||
}
|
||||
return parsed.toString()
|
||||
}
|
||||
|
||||
// Return '20220719012012' if the current date is
|
||||
// 2022-07-19T01:20:12.172Z. Note how the 6th month (July) becomes
|
||||
// '07'. All numbers become 2 character zero-padding strings individually.
|
||||
function utcTimestamp() {
|
||||
const d = new Date()
|
||||
|
||||
return (
|
||||
[
|
||||
`${d.getUTCFullYear()}`,
|
||||
d.getUTCMonth() + 1,
|
||||
d.getUTCDate(),
|
||||
d.getUTCHours(),
|
||||
d.getUTCMinutes(),
|
||||
d.getUTCSeconds(),
|
||||
]
|
||||
// If it's a number make it a zero-padding 2 character string
|
||||
.map((x) => (typeof x === 'number' ? ('0' + x).slice(-2) : x))
|
||||
.join('')
|
||||
)
|
||||
}
|
||||
|
||||
// Consider moving this to lib
|
||||
async function indexVersion(client, indexName, version, language, sourceDirectory, opts) {
|
||||
const { verbose } = opts
|
||||
|
||||
// Note, it's a bit "weird" that numbered release versions are
|
||||
// called the number but that's the convention the previous
|
||||
// search backend used
|
||||
const indexVersionName = shortNames[version].hasNumberedReleases
|
||||
? shortNames[version].currentRelease
|
||||
: shortNames[version].miscBaseName
|
||||
const recordsName = `github-docs-${indexVersionName}-${language}`
|
||||
|
||||
const records = await loadRecords(recordsName, sourceDirectory)
|
||||
|
||||
const thisAlias = `${indexName}__${utcTimestamp()}`
|
||||
|
||||
// CREATE INDEX
|
||||
const settings = {
|
||||
analysis: {
|
||||
char_filter: {
|
||||
// This will turn `runs-on` into `runs_on` so that it can't be
|
||||
// tokenized to `runs` because `on` is a stop word.
|
||||
// It also means that prose terms, in English, like `opt-in`
|
||||
// will not be matched if someone searches for `opt in`. But this
|
||||
// is why we have multiple different analyzers. So it becomes
|
||||
// `opt_in` in the `text_analyzer_explicit` analyzer, but is
|
||||
// left as `opt` in the `text_analyzer` analyzer.
|
||||
hyphenation_filter: {
|
||||
type: 'mapping',
|
||||
mappings: ['- => _'],
|
||||
},
|
||||
},
|
||||
analyzer: {
|
||||
// We define two analyzers, both based on a "common core" with the
// `standard` tokenizer. But the second one adds a Snowball filter.
|
||||
// That means the tokenization of "Dependency naming" becomes
|
||||
// `[dependency, naming]` in the explicit one and `[depend, name]`
|
||||
// in the Snowball one.
|
||||
// We do this to give a chance to boost the more exact spelling a
|
||||
// bit higher with the assumption that if the user knew exactly
|
||||
// what it was called, we should show that higher.
|
||||
// A great use-case of this when users search for keywords that are
|
||||
// code words like `dependency-name`.
|
||||
text_analyzer_explicit: {
|
||||
char_filter: ['hyphenation_filter'],
|
||||
filter: ['lowercase', 'stop', 'asciifolding'],
|
||||
tokenizer: 'standard',
|
||||
type: 'custom',
|
||||
},
|
||||
text_analyzer: {
|
||||
filter: ['lowercase', 'stop', 'asciifolding'],
|
||||
tokenizer: 'standard',
|
||||
type: 'custom',
|
||||
},
|
||||
},
|
||||
filter: {
|
||||
// Will later, conditionally, put the snowball configuration here.
|
||||
},
|
||||
},
|
||||
}
|
||||
const snowballLanguage = getSnowballLanguage(language)
|
||||
if (snowballLanguage) {
|
||||
settings.analysis.analyzer.text_analyzer.filter.push('languaged_snowball')
|
||||
settings.analysis.filter.languaged_snowball = {
|
||||
type: 'snowball',
|
||||
language: snowballLanguage,
|
||||
}
|
||||
} else {
|
||||
if (verbose) {
|
||||
console.warn(`No snowball language for '${language}'`)
|
||||
}
|
||||
}
|
||||
|
||||
await client.indices.create({
|
||||
index: thisAlias,
|
||||
mappings: {
|
||||
properties: {
|
||||
url: { type: 'keyword' },
|
||||
title: {
|
||||
type: 'text',
|
||||
analyzer: 'text_analyzer',
|
||||
norms: false,
|
||||
// This is used for fast highlighting. Uses more space but makes
|
||||
// the searches faster.
|
||||
term_vector: 'with_positions_offsets',
|
||||
},
|
||||
title_explicit: { type: 'text', analyzer: 'text_analyzer_explicit', norms: false },
|
||||
content: {
|
||||
type: 'text',
|
||||
analyzer: 'text_analyzer',
|
||||
// This is used for fast highlighting. Uses more space but makes
|
||||
// the searches faster.
|
||||
term_vector: 'with_positions_offsets',
|
||||
},
|
||||
content_explicit: {
|
||||
type: 'text',
|
||||
analyzer: 'text_analyzer_explicit',
|
||||
// This is used for fast highlighting. Uses more space but makes
|
||||
// the searches faster.
|
||||
term_vector: 'with_positions_offsets',
|
||||
},
|
||||
headings: { type: 'text', analyzer: 'text_analyzer', norms: false },
|
||||
headings_explicit: { type: 'text', analyzer: 'text_analyzer_explicit', norms: false },
|
||||
breadcrumbs: { type: 'text' },
|
||||
popularity: { type: 'float' },
|
||||
intro: { type: 'text' },
|
||||
// Use 'keyword' because it's faster to index and (more importantly)
|
||||
// faster to search on. It would be different if it was something
|
||||
// users could type into a text input.
|
||||
toplevel: { type: 'keyword' },
|
||||
},
|
||||
},
|
||||
settings,
|
||||
})
|
||||
|
||||
// POPULATE
|
||||
const allRecords = Object.values(records).sort((a, b) => b.popularity - a.popularity)
|
||||
const operations = allRecords.flatMap((doc) => {
|
||||
const { title, objectID, content, breadcrumbs, headings, intro, toplevel } = doc
|
||||
const contentEscaped = escapeHTML(content)
|
||||
const headingsEscaped = escapeHTML(headings)
|
||||
const record = {
|
||||
url: objectID,
|
||||
title,
|
||||
title_explicit: title,
|
||||
content: contentEscaped,
|
||||
content_explicit: contentEscaped,
|
||||
breadcrumbs,
|
||||
headings: headingsEscaped,
|
||||
headings_explicit: headingsEscaped,
|
||||
// This makes sure the popularities are always greater than 1.
|
||||
// Generally the 'popularity' is a ratio where the most popular
|
||||
// one of all is 1.0.
|
||||
// By making it >=1.0 when we multiply a relevance score,
|
||||
// you never get a product of 0.0.
|
||||
popularity: doc.popularity + 1,
|
||||
intro,
|
||||
toplevel,
|
||||
}
|
||||
return [{ index: { _index: thisAlias } }, record]
|
||||
})
|
||||
|
||||
const bulkOptions = {
|
||||
// Default is 'false'.
|
||||
// It means that the index is NOT refreshed as documents are inserted.
|
||||
// Which makes sense in our case because we do not intend to search on
|
||||
// this index until after we've pointed the alias to this new index.
|
||||
refresh: false,
|
||||
// Default is '1m' but we have no reason *not* to be patient. It's run
|
||||
// by a bot on a schedule (GitHub Actions).
|
||||
timeout: '5m',
|
||||
}
|
||||
|
||||
const attempts = opts.retries || 0
|
||||
const sleepTime = (opts.sleepTime || DEFAULT_SLEEPTIME_SECONDS) * 1000
|
||||
|
||||
console.log(`About to bulk index ${allRecords.length.toLocaleString()} records with retry %O`, {
|
||||
attempts,
|
||||
sleepTime,
|
||||
})
|
||||
const t0 = new Date()
|
||||
const bulkResponse = await retryOnErrorTest(
|
||||
(error) => {
|
||||
// Rate limiting can happen when you're indexing too much at
|
||||
// same time.
|
||||
return error instanceof errors.ResponseError && error.meta.statusCode === 429
|
||||
},
|
||||
() => client.bulk({ operations, ...bulkOptions }),
|
||||
{
|
||||
attempts,
|
||||
sleepTime,
|
||||
onError: (_, attempts, sleepTime) => {
|
||||
console.warn(
|
||||
chalk.yellow(
|
||||
`Failed to bulk index ${indexName}. Will attempt ${attempts} more times (after ${
|
||||
sleepTime / 1000
|
||||
}s sleep).`,
|
||||
),
|
||||
)
|
||||
},
|
||||
},
|
||||
)
|
||||
|
||||
if (bulkResponse.errors) {
|
||||
// Some day, when we're more confident how and why this might happen
|
||||
// we can rewrite this code to "massage" the errors better.
|
||||
// For now, if it fails, it's "OK". It means we won't be proceeding,
|
||||
// an error is thrown in Actions and we don't have to worry about
|
||||
// an incomplete index.
|
||||
console.error(`Bulk response errors: ${bulkResponse.errors}`)
|
||||
throw new Error('Bulk errors happened.')
|
||||
}
|
||||
const t1 = new Date()
|
||||
console.log(`Bulk indexed ${thisAlias}. Took ${formatTime(t1 - t0)}`)
|
||||
|
||||
// The counting of documents in the index is async and can take a while
|
||||
// to reflect. So send count requests until we get the right number.
|
||||
let documentsInIndex = 0
|
||||
let countAttempts = 3
|
||||
while (documentsInIndex < allRecords.length) {
|
||||
const { count } = await client.count({ index: thisAlias })
|
||||
documentsInIndex = count
|
||||
if (documentsInIndex >= allRecords.length) break
|
||||
countAttempts--
|
||||
if (!countAttempts) {
|
||||
console.log(`After ${countAttempts} attempts still haven't matched the expected number.`)
|
||||
break
|
||||
}
|
||||
await sleep(1000)
|
||||
}
|
||||
|
||||
console.log(
|
||||
`Documents now in ${chalk.bold(thisAlias)}: ${chalk.bold(documentsInIndex.toLocaleString())}`,
|
||||
)
|
||||
|
||||
// To perform an atomic operation that creates the new alias and removes
|
||||
// the old indexes, we can use the updateAliases API with a body that
|
||||
// includes an "actions" array. The array includes the added alias
|
||||
// and the removed indexes. If any of the actions fail, none of the operations
|
||||
// are performed.
|
||||
// https://www.elastic.co/guide/en/elasticsearch/reference/master/indices-aliases.html
|
||||
const aliasUpdates = [
|
||||
{
|
||||
add: {
|
||||
index: thisAlias,
|
||||
alias: indexName,
|
||||
},
|
||||
},
|
||||
]
|
||||
console.log(`Alias ${indexName} -> ${thisAlias}`)
|
||||
|
||||
console.log('About to get indices with retry %O', { attempts, sleepTime })
|
||||
const indices = await retryOnErrorTest(
|
||||
(error) => {
|
||||
// 404 can happen when you're trying to get an index that
|
||||
// doesn't exist. ...yet!
|
||||
return error instanceof errors.ResponseError && error.meta.statusCode === 404
|
||||
},
|
||||
() => client.cat.indices({ format: 'json' }),
|
||||
{
|
||||
attempts,
|
||||
sleepTime,
|
||||
onError: (error, attempts, sleepTime) => {
|
||||
console.warn(
|
||||
chalk.yellow(
|
||||
`Failed to get index ${indexName} (${
|
||||
error.message || error.toString()
|
||||
}). Will attempt ${attempts} more times (after ${formatTime(sleepTime)}s sleep).`,
|
||||
),
|
||||
)
|
||||
},
|
||||
},
|
||||
)
|
||||
for (const index of indices) {
|
||||
if (index.index !== thisAlias && index.index.startsWith(indexName)) {
|
||||
aliasUpdates.push({ remove_index: { index: index.index } })
|
||||
console.log('Deleting index', index.index)
|
||||
}
|
||||
}
|
||||
if (verbose) console.log('Updating alias actions:', aliasUpdates)
|
||||
await client.indices.updateAliases({ body: { actions: aliasUpdates } })
|
||||
}
|
||||
|
||||
function escapeHTML(content) {
|
||||
return content.replace(/</g, '<').replace(/>/g, '>').replace(/"/g, '"')
|
||||
}
|
||||
|
||||
async function loadRecords(indexName, sourceDirectory) {
|
||||
const filePath = path.join(sourceDirectory, `${indexName}-records.json`)
|
||||
const payload = await fs.readFile(filePath)
|
||||
return JSON.parse(payload)
|
||||
}
|
||||
|
||||
function getSnowballLanguage(language) {
|
||||
// Based on https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-snowball-tokenfilter.html
|
||||
// Note, not all languages are supported. So this function might return
|
||||
// undefined. That implies that you can't use snowballing.
|
||||
return {
|
||||
en: 'English',
|
||||
fr: 'French',
|
||||
es: 'Spanish',
|
||||
ru: 'Russian',
|
||||
it: 'Italian',
|
||||
de: 'German',
|
||||
pt: 'Portuguese',
|
||||
}[language]
|
||||
}
|
||||
|
||||
function formatTime(ms) {
|
||||
if (ms < 1000) {
|
||||
return `${ms.toFixed(1)}ms`
|
||||
}
|
||||
const seconds = ms / 1000
|
||||
if (seconds > 60) {
|
||||
return `${Math.round(seconds / 60)}m${Math.round(seconds % 60)}s`
|
||||
}
|
||||
return `${seconds.toFixed(1)}s`
|
||||
}
|
|
@ -6,7 +6,10 @@
|
|||
set -e
|
||||
|
||||
# For general site-search
|
||||
npm run index-elasticsearch -- -l en -l ja -V ghec -V dotcom --index-prefix tests -- src/search/tests/fixtures/search-indexes
|
||||
npm run index-general-search -- src/search/tests/fixtures/search-indexes -l en -l ja -V ghec -V fpt --index-prefix tests
|
||||
|
||||
# For autocomplete search
|
||||
npm run index -- autocomplete src/search/tests/fixtures/data -l en -l ja -v fpt -v ghec --index-prefix tests
|
||||
# For general autocomplete search
|
||||
npm run index-general-autocomplete -- src/search/tests/fixtures/data -l en -l ja -v fpt -v ghec --index-prefix tests
|
||||
|
||||
# For AI search autocomplete
|
||||
npm run index-ai-search-autocomplete -- src/search/tests/fixtures/data -l en -v fpt -v ghec --index-prefix tests
|
||||
|
|
|
@ -0,0 +1,24 @@
|
|||
# Elasticsearch Indexing

Elasticsearch uses indexes to store the data that is used to determine search results.

We use the scripts in this directory to index our Elasticsearch instances.

In production, the indexing happens in the GitHub workflows: `index-autocomplete-search.yml` and `index-general-search.yml`.

## CLI Script

Before running the indexing for **general search**, first run the [scrape](../scrape/README.md) script to scrape page data into files.

Before running the indexing for **general autocomplete** and **AI search autocomplete**, you need to clone [docs-internal-data](https://github.com/github/docs-internal-data) to the root of this directory.

There is a separate run command for indexing each type of search data:

1. **general search**: `npm run index-general-search -- <scrape-directory>`
2. **general autocomplete**: `npm run index-general-autocomplete -- docs-internal-data` (if `docs-internal-data` is cloned to the root directory)
3. **AI search autocomplete**: `npm run index-ai-search-autocomplete -- docs-internal-data` (if `docs-internal-data` is cloned to the root directory)

To see the arguments accepted by any script, pass the `--help` argument, for example:

```bash
npm run index-general-autocomplete -- --help
```
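
As a rough sketch, a local run of the general autocomplete indexing might look like the following. The language, version, and prefix values are only placeholders, and it assumes `docs-internal-data` is cloned as described above and that an Elasticsearch instance is reachable at the given URL:

```bash
# Assumes docs-internal-data is cloned to the repo root and
# Elasticsearch is running locally (see the .env example).
ELASTICSEARCH_URL=http://localhost:9200 \
  npm run index-general-autocomplete -- docs-internal-data -l en -v fpt --index-prefix tests
```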
|
|
@ -1,167 +0,0 @@
|
|||
import fs from 'node:fs'
|
||||
import path from 'node:path'
|
||||
|
||||
import { Client, estypes } from '@elastic/elasticsearch'
|
||||
|
||||
import { getClient } from './lib/get-client'
|
||||
import { utcTimestamp } from './lib/utils'
|
||||
import { populate } from './lib/populate'
|
||||
|
||||
import { type Version, Records } from './types'
|
||||
|
||||
export const shortVersionNames = {
|
||||
'enterprise-server': 'ghes',
|
||||
'enterprise-cloud': 'ghec',
|
||||
'free-pro-team': 'fpt',
|
||||
} as const
|
||||
|
||||
const DEFAULT_SLEEPTIME_SECONDS = 30
|
||||
|
||||
type Options = {
|
||||
dataRepoRoot: string
|
||||
languages: string[]
|
||||
versions: Version[]
|
||||
retries?: number
|
||||
sleepTime?: number
|
||||
verbose?: boolean
|
||||
indexPrefix?: string
|
||||
}
|
||||
|
||||
export async function indexAutocomplete(options: Options) {
|
||||
// The data repo has a predictable structure of
|
||||
// `hydro/rollups/user-searches/$language/$version/rollup.json`
|
||||
// But note that the "version" might be a prefix, like enterprise-server.
|
||||
// const { verbose } = options
|
||||
|
||||
const client = getClient()
|
||||
|
||||
const { dataRepoRoot, versions, languages } = options
|
||||
for (const language of languages) {
|
||||
for (const version of versions) {
|
||||
const records = loadRecords({ version, language, dataRepoRoot })
|
||||
const { alias, name } = await createIndex(
|
||||
client,
|
||||
language,
|
||||
version,
|
||||
options.indexPrefix || '',
|
||||
)
|
||||
await populate(client, records, {
|
||||
alias,
|
||||
name,
|
||||
retries: options.retries || 0,
|
||||
sleepTime: options.sleepTime || DEFAULT_SLEEPTIME_SECONDS,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
type LoadOptions = {
|
||||
dataRepoRoot: string
|
||||
language: string
|
||||
version: string
|
||||
}
|
||||
|
||||
function loadRecords(options: LoadOptions): Records {
|
||||
// First load the rollup records for user-searches
|
||||
const filePath = path.join(
|
||||
options.dataRepoRoot,
|
||||
'hydro/rollups/user-searches',
|
||||
options.language,
|
||||
options.version,
|
||||
'rollup.json',
|
||||
)
|
||||
const terms: Records = {}
|
||||
|
||||
const userSearchTerms: Records = JSON.parse(fs.readFileSync(filePath, 'utf8'))
|
||||
let highestValue = Math.max(...Object.values(userSearchTerms))
|
||||
if (highestValue === 0) {
|
||||
throw new Error(`No records found for ${options.language} ${options.version}`)
|
||||
}
|
||||
for (const [term, value] of Object.entries(userSearchTerms)) {
|
||||
// Why +1?
|
||||
// Because we want these user-searches to always be higher than all the
|
||||
// terms generated from titles.
|
||||
// For example, a common user-search term that users use
|
||||
// is "log forwarding". But there might not be a deconstructed term,
|
||||
// from the document titles, however there might be one called
|
||||
// "log proxy". So when our users search for "log" we want to suggest,
|
||||
// in the autocomplete UI "log forwarding" before "log proxy".
|
||||
terms[term] = value / highestValue + 1
|
||||
}
|
||||
|
||||
const documentTermsFilePath = path.join(
|
||||
options.dataRepoRoot,
|
||||
'all-documents/terms',
|
||||
options.language,
|
||||
options.version,
|
||||
'terms.json',
|
||||
)
|
||||
const documentTerms: Records = JSON.parse(fs.readFileSync(documentTermsFilePath, 'utf8'))
|
||||
highestValue = Math.max(...Object.values(documentTerms))
|
||||
if (highestValue === 0) {
|
||||
throw new Error(`No document title records found for ${options.language} ${options.version}`)
|
||||
}
|
||||
for (const [term, value] of Object.entries(documentTerms)) {
|
||||
if (!(term in terms)) {
|
||||
terms[term] = value / highestValue + 1
|
||||
}
|
||||
}
|
||||
|
||||
return terms
|
||||
}
|
||||
|
||||
type IndexInfo = {
|
||||
alias: string
|
||||
name: string
|
||||
}
|
||||
|
||||
async function createIndex(
|
||||
client: Client,
|
||||
language: string,
|
||||
version: Version,
|
||||
indexPrefix: string,
|
||||
): Promise<IndexInfo> {
|
||||
const settings: estypes.IndicesIndexSettings = {
|
||||
analysis: {
|
||||
analyzer: {
|
||||
text_analyzer: {
|
||||
filter: ['lowercase'],
|
||||
tokenizer: 'standard',
|
||||
type: 'custom',
|
||||
},
|
||||
},
|
||||
},
|
||||
// filter: {
|
||||
// // Will later, conditionally, put the snowball configuration here.
|
||||
// },
|
||||
// XXX SNOWBALL?
|
||||
}
|
||||
|
||||
if (indexPrefix && !indexPrefix.endsWith('_')) {
|
||||
indexPrefix += '_'
|
||||
}
|
||||
|
||||
const indexName = `${indexPrefix}github-autocomplete-${language}-${shortVersionNames[version] || version}`
|
||||
const thisAlias = `${indexName}__${utcTimestamp()}`
|
||||
|
||||
const mappings: estypes.MappingTypeMapping = {
|
||||
properties: {
|
||||
term: {
|
||||
type: 'text',
|
||||
analyzer: 'text_analyzer',
|
||||
// This is used for fast highlighting. Uses more space but makes
|
||||
// the searches faster.
|
||||
term_vector: 'with_positions_offsets',
|
||||
},
|
||||
popularity: { type: 'float' },
|
||||
},
|
||||
}
|
||||
|
||||
await client.indices.create({
|
||||
index: thisAlias,
|
||||
mappings,
|
||||
settings,
|
||||
})
|
||||
|
||||
return { alias: thisAlias, name: indexName }
|
||||
}
|
|
@ -0,0 +1,158 @@
|
|||
import { program, Option, Command, InvalidArgumentError } from 'commander'
|
||||
import { errors } from '@elastic/elasticsearch'
|
||||
import dotenv from 'dotenv'
|
||||
|
||||
import { languageKeys } from '@/languages/lib/languages.js'
|
||||
import { indexGeneralAutocomplete } from './lib/index-general-autocomplete'
|
||||
import { indexGeneralSearch } from './lib/index-general-search'
|
||||
import {
|
||||
allIndexVersionKeys,
|
||||
allIndexVersionOptions,
|
||||
supportedAutocompletePlanVersions,
|
||||
} from '@/search/lib/elasticsearch-versions'
|
||||
import { indexAISearchAutocomplete } from './lib/index-ai-search-autocomplete'
|
||||
|
||||
// You can optionally set ELASTICSEARCH_URL in your .env file.
|
||||
dotenv.config()
|
||||
|
||||
program.name('index').description('CLI scripts for indexing Docs data into Elasticsearch')
|
||||
|
||||
const generalAutoCompleteCommand = new Command('general-autocomplete')
|
||||
.description('Index for general search autocomplete')
|
||||
.addOption(
|
||||
new Option('-l, --language <language...>', 'Specific language(s)').choices(languageKeys),
|
||||
)
|
||||
.addOption(
|
||||
new Option('-v, --version <version...>', 'Specific versions').choices(allIndexVersionKeys),
|
||||
)
|
||||
.option('--verbose', 'Verbose output')
|
||||
.option('--index-prefix <prefix>', 'Prefix for the index names', '')
|
||||
.argument('<data-root>', 'path to the docs-internal-data repo')
|
||||
.action(async (dataRepoRoot: string, options) => {
|
||||
const languages = options.language ? options.language : languageKeys
|
||||
const indexPrefix = options.indexPrefix || ''
|
||||
try {
|
||||
await indexGeneralAutocomplete({
|
||||
dataRepoRoot,
|
||||
languages,
|
||||
versions: options.version || supportedAutocompletePlanVersions,
|
||||
indexPrefix,
|
||||
})
|
||||
} catch (error: any) {
|
||||
if (error instanceof errors.ElasticsearchClientError) {
|
||||
if ((error as any)?.meta) {
|
||||
console.error('Error meta: %O', (error as any).meta)
|
||||
}
|
||||
}
|
||||
console.error('general-autocomplete indexing error:', error.message)
|
||||
process.exit(1)
|
||||
}
|
||||
})
|
||||
|
||||
const generalSearchCommand = new Command('general-search')
|
||||
.description(
|
||||
'Indexes records for general search. Records should be pre-scraped by the scrape script.',
|
||||
)
|
||||
.option('-v, --verbose', 'Verbose outputs')
|
||||
.addOption(
|
||||
new Option('-V, --version [VERSION...]', 'Specific versions').choices(allIndexVersionOptions),
|
||||
)
|
||||
.addOption(
|
||||
new Option('-l, --language <LANGUAGE...>', 'Which languages to focus on').choices(languageKeys),
|
||||
)
|
||||
.addOption(
|
||||
new Option('--not-language <LANGUAGE...>', 'Specific language to omit').choices(languageKeys),
|
||||
)
|
||||
.option('-u, --elasticsearch-url <url>', 'If different from $ELASTICSEARCH_URL')
|
||||
.option('-p, --index-prefix <prefix>', 'Index string to put before index name')
|
||||
.option(
|
||||
'-s, --stagger-seconds <seconds>',
|
||||
'Number of seconds to sleep between each bulk operation',
|
||||
(value) => {
|
||||
const parsed = parseInt(value, 10)
|
||||
if (isNaN(parsed)) {
|
||||
throw new InvalidArgumentError('Not a number.')
|
||||
}
|
||||
return parsed
|
||||
},
|
||||
)
|
||||
.option(
|
||||
'-r, --retries <count>',
|
||||
'Number of retry attempts on recoverable network errors',
|
||||
(value) => {
|
||||
const parsed = parseInt(value, 10)
|
||||
if (isNaN(parsed)) {
|
||||
throw new InvalidArgumentError('Not a number.')
|
||||
}
|
||||
return parsed
|
||||
},
|
||||
)
|
||||
.option(
|
||||
'--sleep-time <seconds>',
|
||||
`Number of seconds to sleep between each retry attempt (defaults to 30)`,
|
||||
(value) => {
|
||||
const parsed = parseInt(value, 10)
|
||||
if (isNaN(parsed)) {
|
||||
throw new InvalidArgumentError('Not a number.')
|
||||
}
|
||||
return parsed
|
||||
},
|
||||
30,
|
||||
)
|
||||
.argument('<source-directory>', 'where the indexable files are')
|
||||
.action(async (sourceDirectory, options) => {
|
||||
try {
|
||||
await indexGeneralSearch(sourceDirectory, options)
|
||||
} catch (error: any) {
|
||||
if (error instanceof errors.ElasticsearchClientError) {
|
||||
if ((error as any)?.meta) {
|
||||
console.error('Error meta: %O', (error as any).meta)
|
||||
}
|
||||
}
|
||||
console.error('general-search indexing error:', error.message)
|
||||
process.exit(1)
|
||||
}
|
||||
})
|
||||
|
||||
const aiSearchAutocompleteCommand = new Command('ai-search-autocomplete')
|
||||
.description('Index for AI search autocomplete')
|
||||
.addOption(
|
||||
new Option(
|
||||
'-l, --language <language...>',
|
||||
'Specific language(s). (NOTE: Only English, "en", is currently supported)',
|
||||
).choices(['en']),
|
||||
)
|
||||
.addOption(
|
||||
new Option('-v, --version <version...>', 'Specific versions').choices(allIndexVersionKeys),
|
||||
)
|
||||
.option('--verbose', 'Verbose output')
|
||||
.option('--index-prefix <prefix>', 'Prefix for the index names', '')
|
||||
.argument('<data-root>', 'path to the docs-internal-data repo')
|
||||
.action(async (dataRepoRoot: string, options) => {
|
||||
// In the future, we may want to support multiple languages
|
||||
// Currently (since this is an experiment), we only support English
|
||||
const languages = ['en']
|
||||
const indexPrefix = options.indexPrefix || ''
|
||||
try {
|
||||
await indexAISearchAutocomplete({
|
||||
dataRepoRoot,
|
||||
languages,
|
||||
versions: options.version || supportedAutocompletePlanVersions,
|
||||
indexPrefix,
|
||||
})
|
||||
} catch (error: any) {
|
||||
if (error instanceof errors.ElasticsearchClientError) {
|
||||
if ((error as any)?.meta) {
|
||||
console.error('Error meta: %O', (error as any).meta)
|
||||
}
|
||||
}
|
||||
console.error('ai-search-autocomplete indexing error:', error.message)
|
||||
process.exit(1)
|
||||
}
|
||||
})
|
||||
|
||||
program.addCommand(generalAutoCompleteCommand)
|
||||
program.addCommand(generalSearchCommand)
|
||||
program.addCommand(aiSearchAutocompleteCommand)
|
||||
|
||||
program.parse(process.argv)
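All three subcommands above repeat the same catch block: print `error.meta` for Elasticsearch client errors, print the message, and exit non-zero. A sketch of how that could be pulled into a shared helper; the helper name is hypothetical and not part of the script above:

```typescript
import { errors } from '@elastic/elasticsearch'

// Hypothetical helper; the CLI above currently inlines this logic in each subcommand.
function reportIndexingErrorAndExit(commandName: string, error: any): never {
  if (error instanceof errors.ElasticsearchClientError && (error as any)?.meta) {
    console.error('Error meta: %O', (error as any).meta)
  }
  console.error(`${commandName} indexing error:`, error.message)
  process.exit(1)
}
```

Each action handler could then end with `catch (error: any) { reportIndexingErrorAndExit('general-search', error) }`.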
|
|
@ -1,44 +0,0 @@
|
|||
import { program, Option } from 'commander'
|
||||
|
||||
import { languageKeys } from '@/languages/lib/languages.js'
|
||||
import { indexAutocomplete } from './index-autocomplete'
|
||||
import { type Version } from './types'
|
||||
|
||||
const defaultVersions: Version[] = ['free-pro-team', 'enterprise-server', 'enterprise-cloud']
|
||||
const shortAlias = new Map<string, Version>()
|
||||
shortAlias.set('ghes', 'enterprise-server')
|
||||
shortAlias.set('fpt', 'free-pro-team')
|
||||
shortAlias.set('ghec', 'enterprise-cloud')
|
||||
|
||||
program.name('index').description('CLI scripts for indexing to Elasticsearch')
|
||||
|
||||
program
|
||||
.command('autocomplete')
|
||||
.description('Index for autocomplete')
|
||||
.addOption(
|
||||
new Option('-l, --language <language...>', 'Specific language(s)').choices(languageKeys),
|
||||
)
|
||||
.addOption(
|
||||
new Option('-v, --version <version...>', 'Specific version prefix(es)').choices([
|
||||
...defaultVersions,
|
||||
...shortAlias.keys(),
|
||||
]),
|
||||
)
|
||||
.option('--verbose', 'Verbose output')
|
||||
.option('--index-prefix <prefix>', 'Prefix for the index names', '')
|
||||
.argument('<data-root>', 'path to the docs-internal-data repo')
|
||||
.action((root: string, options) => {
|
||||
const languages = options.language ? options.language : languageKeys
|
||||
const versions: Version[] = []
|
||||
for (const v of options.version || defaultVersions) {
|
||||
if (shortAlias.has(v)) {
|
||||
versions.push(shortAlias.get(v)!)
|
||||
} else {
|
||||
versions.push(v)
|
||||
}
|
||||
}
|
||||
const indexPrefix = options.indexPrefix || ''
|
||||
return indexAutocomplete({ dataRepoRoot: root, languages, versions, indexPrefix })
|
||||
})
|
||||
|
||||
program.parse(process.argv)
|
|
@ -1,27 +0,0 @@
|
|||
import { Client } from '@elastic/elasticsearch'
|
||||
|
||||
export function getClient(): Client {
|
||||
const node = getElasticsearchURL()
|
||||
const client = new Client({ node })
|
||||
return client
|
||||
}
|
||||
|
||||
function getElasticsearchURL() {
|
||||
if (!process.env.ELASTICSEARCH_URL) {
|
||||
throw new Error(
|
||||
'Must pass the elasticsearch URL option or ' +
|
||||
'set the environment variable ELASTICSEARCH_URL',
|
||||
)
|
||||
}
|
||||
let node = process.env.ELASTICSEARCH_URL
|
||||
|
||||
// Allow the user to lazily set it to `localhost:9200` for example.
|
||||
if (!node.startsWith('http') && !node.startsWith('://') && node.split(':').length === 2) {
|
||||
node = `http://${node}`
|
||||
}
|
||||
|
||||
const parsed = new URL(node)
|
||||
if (!parsed.hostname) throw new Error('no valid hostname')
|
||||
|
||||
return node
|
||||
}
|
|
@ -0,0 +1,112 @@
|
|||
import fs from 'node:fs'
|
||||
import path from 'node:path'
|
||||
|
||||
import { getElasticsearchClient } from '@/search/lib/helpers/get-client'
|
||||
import { getElasticSearchIndex } from '@/search/lib/elasticsearch-indexes'
|
||||
import {
|
||||
createIndex,
|
||||
populateIndex,
|
||||
printSuccess,
|
||||
updateAlias,
|
||||
} from '@/search/scripts/index/utils/indexing-elasticsearch-utils'
|
||||
import { getAISearchAutocompleteSettings } from '@/search/scripts/index/utils/settings'
|
||||
import { aiSearchAutocompleteMappings } from '@/search/scripts/index/utils/mappings'
|
||||
import { getPlanVersionFromIndexVersion } from '@/search/lib/elasticsearch-versions'
|
||||
|
||||
import type { TermsWithFrequency } from '@/search/scripts/index/types'
|
||||
|
||||
type Options = {
|
||||
dataRepoRoot: string
|
||||
languages: string[]
|
||||
versions: string[]
|
||||
retries?: number
|
||||
sleepTime?: number
|
||||
verbose?: boolean
|
||||
indexPrefix?: string
|
||||
}
|
||||
|
||||
export async function indexAISearchAutocomplete(options: Options) {
|
||||
const client = getElasticsearchClient(undefined, options.verbose)
|
||||
await client.ping() // Will throw if not available
|
||||
|
||||
const { dataRepoRoot, languages, versions } = options
|
||||
for (const language of languages) {
|
||||
for (const version of versions) {
|
||||
const startTime = new Date()
|
||||
|
||||
const records = loadQueriesWithPriority({ dataRepoRoot, language, version })
|
||||
const { indexName, indexAlias } = getElasticSearchIndex(
|
||||
'aiSearchAutocomplete',
|
||||
version,
|
||||
language,
|
||||
options.indexPrefix || '',
|
||||
)
|
||||
|
||||
const settings = getAISearchAutocompleteSettings(language, options.verbose)
|
||||
|
||||
await createIndex(client, indexAlias, settings, aiSearchAutocompleteMappings)
|
||||
|
||||
const recordsArray = Object.entries(records).map(([term, popularity]) => ({
|
||||
term,
|
||||
popularity,
|
||||
}))
|
||||
|
||||
await populateIndex(client, indexAlias, indexName, recordsArray, {
|
||||
retries: options.retries,
|
||||
sleepTime: options.sleepTime,
|
||||
verbose: options.verbose,
|
||||
})
|
||||
|
||||
await updateAlias(client, indexName, indexAlias, options)
|
||||
|
||||
printSuccess(indexName, startTime, options.verbose)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
type LoadOptions = {
|
||||
dataRepoRoot: string
|
||||
language: string
|
||||
version: string
|
||||
}
|
||||
|
||||
function loadQueriesWithPriority(options: LoadOptions): TermsWithFrequency {
|
||||
// The {version} in the paths uses the version's 'plan' name, e.g. `free-pro-team` instead of `fpt`
|
||||
const internalDataVersion = getPlanVersionFromIndexVersion(options.version)
|
||||
|
||||
if (!internalDataVersion) {
|
||||
throw new Error(`No rollup version found for version ${options.version}`)
|
||||
}
|
||||
|
||||
const queriesFilePath = path.join(
|
||||
options.dataRepoRoot,
|
||||
'ai/search/queries',
|
||||
options.language,
|
||||
internalDataVersion,
|
||||
'queries.json',
|
||||
)
|
||||
|
||||
const queriesFile = JSON.parse(fs.readFileSync(queriesFilePath, 'utf8'))
|
||||
const { topQueries, allQueries } = queriesFile
|
||||
|
||||
const terms: TermsWithFrequency = {}
|
||||
|
||||
let popularity = topQueries.length + allQueries.length
|
||||
|
||||
// Assign higher popularity to topQueries
|
||||
for (const term of topQueries) {
|
||||
terms[term] = popularity
|
||||
popularity -= 1
|
||||
}
|
||||
|
||||
// Assign remaining popularity to allQueries using the order they have in the JSON
|
||||
for (const term of allQueries) {
|
||||
// Don't read in the topQueries again (duplicates)
|
||||
if (!(term in terms)) {
|
||||
terms[term] = popularity
|
||||
popularity -= 1
|
||||
}
|
||||
}
|
||||
|
||||
return terms
|
||||
}
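To make the priority scheme above concrete, here is the same ranking applied to a few made-up queries (sample data only; the real lists come from queries.json in docs-internal-data):

```typescript
// Sample data only; mirrors the topQueries/allQueries shape read from queries.json.
const topQueries = ['how do I clone a repository']
const allQueries = ['how do I clone a repository', 'delete a branch', 'rename a branch']

const terms: { [term: string]: number } = {}
let popularity = topQueries.length + allQueries.length // 4

// topQueries get the highest values, in order.
for (const term of topQueries) {
  terms[term] = popularity
  popularity -= 1
}
// allQueries fill in the rest, skipping duplicates of topQueries.
for (const term of allQueries) {
  if (!(term in terms)) {
    terms[term] = popularity
    popularity -= 1
  }
}

console.log(terms)
// { 'how do I clone a repository': 4, 'delete a branch': 3, 'rename a branch': 2 }
```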
|
|
@ -0,0 +1,134 @@
|
|||
import fs from 'node:fs'
|
||||
import path from 'node:path'
|
||||
|
||||
import { getElasticsearchClient } from '@/search/lib/helpers/get-client'
|
||||
import { getElasticSearchIndex } from '@/search/lib/elasticsearch-indexes'
|
||||
import {
|
||||
createIndex,
|
||||
populateIndex,
|
||||
printSuccess,
|
||||
updateAlias,
|
||||
} from '@/search/scripts/index/utils/indexing-elasticsearch-utils'
|
||||
import { getGeneralAutocompleteSettings } from '@/search/scripts/index/utils/settings'
|
||||
import { generalAutocompleteMappings } from '@/search/scripts/index/utils/mappings'
|
||||
import { getPlanVersionFromIndexVersion } from '@/search/lib/elasticsearch-versions'
|
||||
|
||||
import type { TermsWithFrequency } from '@/search/scripts/index/types'
|
||||
|
||||
type Options = {
|
||||
dataRepoRoot: string
|
||||
languages: string[]
|
||||
versions: string[]
|
||||
retries?: number
|
||||
sleepTime?: number
|
||||
verbose?: boolean
|
||||
indexPrefix?: string
|
||||
}
|
||||
|
||||
export async function indexGeneralAutocomplete(options: Options) {
|
||||
const client = getElasticsearchClient(undefined, options.verbose)
|
||||
await client.ping() // Will throw if not available
|
||||
|
||||
const { dataRepoRoot, versions, languages } = options
|
||||
for (const language of languages) {
|
||||
for (const version of versions) {
|
||||
const startTime = new Date()
|
||||
|
||||
const records = loadTermsWithFrequency({ version, language, dataRepoRoot })
|
||||
const { indexName, indexAlias } = getElasticSearchIndex(
|
||||
'generalAutocomplete',
|
||||
version,
|
||||
language,
|
||||
options.indexPrefix || '',
|
||||
)
|
||||
|
||||
const settings = getGeneralAutocompleteSettings(language, options.verbose)
|
||||
|
||||
await createIndex(client, indexAlias, settings, generalAutocompleteMappings)
|
||||
|
||||
const recordsArray = Object.entries(records).map(([term, popularity]) => ({
|
||||
term,
|
||||
popularity,
|
||||
}))
|
||||
|
||||
await populateIndex(client, indexAlias, indexName, recordsArray, {
|
||||
retries: options.retries,
|
||||
sleepTime: options.sleepTime,
|
||||
verbose: options.verbose,
|
||||
})
|
||||
|
||||
await updateAlias(client, indexName, indexAlias, options)
|
||||
|
||||
printSuccess(indexName, startTime, options.verbose)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
type LoadOptions = {
|
||||
dataRepoRoot: string
|
||||
language: string
|
||||
version: string
|
||||
}
|
||||
|
||||
/*
|
||||
* Terms are one-word search terms that a user might enter into a search toolbar
|
||||
* We have two sources of "terms":
|
||||
* - Previous user searches (searchTerms)
|
||||
 * - Terms auto-generated by taking each word from the title of each of our articles (documentTerms)
|
||||
*
|
||||
 * Each of these files lives in our docs-internal-data repo, which should be cloned before running this script.
|
||||
* The paths to these files for each type of term are:
|
||||
 * - searchTerms: hydro/rollups/user-searches/{language}/{version}/rollup.json
|
||||
 * - documentTerms: all-documents/terms/{language}/{version}/terms.json
|
||||
*/
|
||||
function loadTermsWithFrequency(options: LoadOptions): TermsWithFrequency {
|
||||
// The {version} in the paths uses the version's 'plan' name, e.g. `free-pro-team` instead of `fpt`
|
||||
const internalDataVersion = getPlanVersionFromIndexVersion(options.version)
|
||||
|
||||
if (!internalDataVersion) {
|
||||
throw new Error(`No rollup version found for version ${options.version}`)
|
||||
}
|
||||
|
||||
const filePath = path.join(
|
||||
options.dataRepoRoot,
|
||||
'hydro/rollups/user-searches',
|
||||
options.language,
|
||||
internalDataVersion,
|
||||
'rollup.json',
|
||||
)
|
||||
const terms: TermsWithFrequency = {}
|
||||
|
||||
const userSearchTerms: TermsWithFrequency = JSON.parse(fs.readFileSync(filePath, 'utf8'))
|
||||
let maxFrequency = Math.max(...Object.values(userSearchTerms))
|
||||
if (maxFrequency === 0) {
|
||||
throw new Error(`No records found for ${options.language} ${options.version}`)
|
||||
}
|
||||
for (const [term, frequency] of Object.entries(userSearchTerms)) {
|
||||
// Normalize the frequency, which will become "popularity" in Elasticsearch
|
||||
// We include +1 here because "userSearchTerms" should have higher priority than "articleTitleTerms"
|
||||
terms[term] = frequency / maxFrequency + 1
|
||||
}
|
||||
|
||||
const articleTitleTermsFilePath = path.join(
|
||||
options.dataRepoRoot,
|
||||
'all-documents/terms',
|
||||
options.language,
|
||||
internalDataVersion,
|
||||
'terms.json',
|
||||
)
|
||||
const articleTitleTerms: TermsWithFrequency = JSON.parse(
|
||||
fs.readFileSync(articleTitleTermsFilePath, 'utf8'),
|
||||
)
|
||||
maxFrequency = Math.max(...Object.values(articleTitleTerms))
|
||||
if (maxFrequency === 0) {
|
||||
throw new Error(`No document title records found for ${options.language} ${options.version}`)
|
||||
}
|
||||
for (const [articleTitleTerm, frequency] of Object.entries(articleTitleTerms)) {
|
||||
if (!(articleTitleTerm in terms)) {
|
||||
// Notice that we don't add 1 here because we want to give more priority to data from user searches
|
||||
terms[articleTitleTerm] = frequency / maxFrequency
|
||||
}
|
||||
}
|
||||
|
||||
return terms
|
||||
}
|
|
@ -0,0 +1,145 @@
|
|||
import { Client } from '@elastic/elasticsearch'
|
||||
import chalk from 'chalk'
|
||||
|
||||
import { languageKeys } from '#src/languages/lib/languages.js'
|
||||
import { allVersions } from '#src/versions/lib/all-versions.js'
|
||||
import { getElasticSearchIndex } from '@/search/lib/elasticsearch-indexes'
|
||||
import { getElasticsearchClient } from '@/search/lib/helpers/get-client'
|
||||
import {
|
||||
createIndex,
|
||||
escapeHTML,
|
||||
loadIndexRecords,
|
||||
populateIndex,
|
||||
printSuccess,
|
||||
updateAlias,
|
||||
} from '@/search/scripts/index/utils/indexing-elasticsearch-utils'
|
||||
import { sleep } from '@/search/lib/helpers/time'
|
||||
import { getGeneralSearchSettings } from '@/search/scripts/index/utils/settings'
|
||||
import { generalSearchMappings } from '@/search/scripts/index/utils/mappings'
|
||||
|
||||
import type { AllVersionInfo } from '@/search/scripts/index/types'
|
||||
|
||||
interface Options {
|
||||
verbose?: boolean
|
||||
version?: string[] | string
|
||||
language?: string[]
|
||||
notLanguage?: string[]
|
||||
elasticsearchUrl?: string
|
||||
indexPrefix?: string
|
||||
staggerSeconds?: number
|
||||
retries?: number
|
||||
sleepTime: number
|
||||
}
|
||||
|
||||
const shortNames: { [key: string]: AllVersionInfo } = Object.fromEntries(
|
||||
Object.values(allVersions).map((info: AllVersionInfo) => {
|
||||
const shortName = info.hasNumberedReleases
|
||||
? info.miscBaseName + info.currentRelease
|
||||
: info.miscBaseName
|
||||
return [shortName, info]
|
||||
}),
|
||||
)
|
||||
|
||||
const allVersionKeys = Object.keys(shortNames)
|
||||
|
||||
export async function indexGeneralSearch(sourceDirectory: string, opts: Options) {
|
||||
if (!sourceDirectory) {
|
||||
throw new Error('Must pass the source directory as the first argument')
|
||||
}
|
||||
|
||||
const { language, notLanguage } = opts
|
||||
|
||||
if (language && notLanguage) {
|
||||
throw new Error("Can't combine --language and --not-language")
|
||||
}
|
||||
|
||||
const client = getElasticsearchClient(opts.elasticsearchUrl, opts.verbose)
|
||||
await client.ping() // Will throw if not available
|
||||
|
||||
let version: string | string[] | undefined = opts.version
|
||||
if (!version && process.env.VERSION && process.env.VERSION !== 'all') {
|
||||
version = process.env.VERSION
|
||||
if (!allVersionKeys.includes(version)) {
|
||||
throw new Error(
|
||||
`Environment variable 'VERSION' (${version}) is not recognized. Must be one of ${allVersionKeys}`,
|
||||
)
|
||||
}
|
||||
}
|
||||
let versionKeys = allVersionKeys
|
||||
if (version) {
|
||||
versionKeys = Array.isArray(version) ? version : [version]
|
||||
}
|
||||
|
||||
const languages =
|
||||
language || languageKeys.filter((lang) => !notLanguage || !notLanguage.includes(lang))
|
||||
if (opts.verbose) {
|
||||
console.log(`Indexing on languages ${chalk.bold(languages.join(', '))}`)
|
||||
}
|
||||
|
||||
for (const language of languages) {
|
||||
let count = 0
|
||||
for (const versionKey of versionKeys) {
|
||||
const startTime = new Date()
|
||||
|
||||
const { indexName, indexAlias } = getElasticSearchIndex(
|
||||
'generalSearch',
|
||||
versionKey,
|
||||
language,
|
||||
opts.indexPrefix || '',
|
||||
)
|
||||
|
||||
await indexVersion(client, indexName, indexAlias, language, sourceDirectory, opts)
|
||||
|
||||
count++
|
||||
if (opts.staggerSeconds && count < versionKeys.length - 1) {
|
||||
console.log(`Sleeping for ${opts.staggerSeconds} seconds...`)
|
||||
await sleep(1000 * opts.staggerSeconds)
|
||||
}
|
||||
|
||||
printSuccess(indexName, startTime, opts.verbose)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async function indexVersion(
|
||||
client: Client,
|
||||
indexName: string,
|
||||
indexAlias: string,
|
||||
language: string,
|
||||
sourceDirectory: string,
|
||||
opts: Options,
|
||||
) {
|
||||
const recordsData = await loadIndexRecords(indexName, sourceDirectory)
|
||||
const allRecords = Object.values(recordsData).sort((a, b) => b.popularity - a.popularity)
|
||||
const records = allRecords.map((doc) => {
|
||||
const { title, objectID, content, breadcrumbs, headings, intro, toplevel } = doc
|
||||
const contentEscaped = escapeHTML(content)
|
||||
const headingsEscaped = escapeHTML(headings)
|
||||
return {
|
||||
url: objectID,
|
||||
title,
|
||||
title_explicit: title,
|
||||
content: contentEscaped,
|
||||
content_explicit: contentEscaped,
|
||||
breadcrumbs,
|
||||
headings: headingsEscaped,
|
||||
headings_explicit: headingsEscaped,
|
||||
popularity: doc.popularity + 1,
|
||||
intro,
|
||||
toplevel,
|
||||
}
|
||||
})
|
||||
|
||||
const settings = getGeneralSearchSettings(language, opts.verbose || false)
|
||||
const mappings = generalSearchMappings
|
||||
|
||||
await createIndex(client, indexAlias, settings, mappings)
|
||||
|
||||
await populateIndex(client, indexAlias, indexName, records, {
|
||||
retries: opts.retries,
|
||||
sleepTime: opts.sleepTime * 1000,
|
||||
verbose: opts.verbose,
|
||||
})
|
||||
|
||||
await updateAlias(client, indexName, indexAlias, opts)
|
||||
}
|
|
@ -1,107 +0,0 @@
|
|||
import chalk from 'chalk'
|
||||
import { Client, errors } from '@elastic/elasticsearch'
|
||||
|
||||
import type { Records, RetryConfig } from '../types'
|
||||
import { retryOnErrorTest } from './retry-on-error-test'
|
||||
import { repointAlias } from './repoint-alias'
|
||||
import { formatTime, sleep } from './utils'
|
||||
|
||||
type PopulateOptions = RetryConfig & {
|
||||
verbose?: boolean
|
||||
alias: string
|
||||
name: string
|
||||
}
|
||||
|
||||
export async function populate(client: Client, records: Records, options: PopulateOptions) {
|
||||
const { alias, name } = options
|
||||
|
||||
const allRecords = Object.entries(records).sort((a, b) => b[1] - a[1])
|
||||
const operations = allRecords.flatMap(([term, count]) => {
|
||||
const popularity = count / allRecords[0][1] // Normalize to 1.0 for the highest count
|
||||
return [
|
||||
{ index: { _index: alias } },
|
||||
{
|
||||
term,
|
||||
popularity,
|
||||
},
|
||||
]
|
||||
})
|
||||
|
||||
const bulkOptions = {
|
||||
// Default is 'false'.
|
||||
// It means that the index is NOT refreshed as documents are inserted.
|
||||
// Which makes sense in our case because we do not intend to search on
|
||||
// this index until after we've pointed the alias to this new index.
|
||||
refresh: false,
|
||||
// Default is '1m' but we have no reason *not* to be patient. It's run
|
||||
// by a bot on a schedule (GitHub Actions).
|
||||
timeout: '5m',
|
||||
}
|
||||
|
||||
const attempts = options.retries
|
||||
const sleepTime = options.sleepTime * 1000
|
||||
|
||||
console.log(`About to bulk index ${allRecords.length.toLocaleString()} records with retry %O`, {
|
||||
attempts,
|
||||
sleepTime,
|
||||
})
|
||||
const t0 = new Date()
|
||||
const bulkResponse = await retryOnErrorTest(
|
||||
(error: Error) => {
|
||||
// Rate limiting can happen when you're indexing too much at
|
||||
// same time.
|
||||
return error instanceof errors.ResponseError && error.meta.statusCode === 429
|
||||
},
|
||||
() => client.bulk({ operations, ...bulkOptions }),
|
||||
{
|
||||
attempts,
|
||||
sleepTime,
|
||||
onError: (_, attempts, sleepTime) => {
|
||||
console.warn(
|
||||
chalk.yellow(
|
||||
`Failed to bulk index ${name}. Will attempt ${attempts} more times (after ${
|
||||
sleepTime / 1000
|
||||
}s sleep).`,
|
||||
),
|
||||
)
|
||||
},
|
||||
},
|
||||
)
|
||||
|
||||
if (bulkResponse.errors) {
|
||||
// Some day, when we're more confident how and why this might happen
|
||||
// we can rewrite this code to "massage" the errors better.
|
||||
// For now, if it fails, it's "OK". It means we won't be proceeding,
|
||||
// an error is thrown in Actions and we don't have to worry about
|
||||
// an incomplete index.
|
||||
console.error(`Bulk response errors: ${bulkResponse.errors}`)
|
||||
throw new Error('Bulk errors happened.')
|
||||
}
|
||||
const t1 = new Date()
|
||||
console.log(`Bulk indexed ${alias}. Took ${formatTime(t1.getTime() - t0.getTime())}`)
|
||||
|
||||
// The counting of documents in the index is async and can take a while
|
||||
// to reflect. So send count requests until we get the right number.
|
||||
let documentsInIndex = 0
|
||||
let countAttempts = 3
|
||||
while (documentsInIndex < allRecords.length) {
|
||||
const { count } = await client.count({ index: alias })
|
||||
documentsInIndex = count
|
||||
if (documentsInIndex >= allRecords.length) break
|
||||
countAttempts--
|
||||
if (!countAttempts) {
|
||||
console.log(`After ${countAttempts} attempts still haven't matched the expected number.`)
|
||||
break
|
||||
}
|
||||
await sleep(1000)
|
||||
}
|
||||
console.log(
|
||||
`Documents now in ${chalk.bold(alias)}: ${chalk.bold(documentsInIndex.toLocaleString())}`,
|
||||
)
|
||||
|
||||
await repointAlias(client, alias, name, {
|
||||
attempts,
|
||||
sleepTime,
|
||||
verbose: Boolean(options.verbose),
|
||||
})
|
||||
}
|
|
@ -1,77 +0,0 @@
|
|||
import chalk from 'chalk'
|
||||
import { Client, errors } from '@elastic/elasticsearch'
|
||||
|
||||
import { retryOnErrorTest } from './retry-on-error-test'
|
||||
import { formatTime } from './utils'
|
||||
|
||||
export async function repointAlias(
|
||||
client: Client,
|
||||
alias: string,
|
||||
name: string,
|
||||
options: {
|
||||
attempts: number
|
||||
sleepTime: number
|
||||
verbose: boolean
|
||||
},
|
||||
) {
|
||||
const { attempts, sleepTime, verbose } = options
|
||||
// To perform an atomic operation that creates the new alias and removes
|
||||
// the old indexes, we can use the updateAliases API with a body that
|
||||
// includes an "actions" array. The array includes the added alias
|
||||
// and the removed indexes. If any of the actions fail, none of the operations
|
||||
// are performed.
|
||||
// https://www.elastic.co/guide/en/elasticsearch/reference/master/indices-aliases.html
|
||||
|
||||
type Update =
|
||||
| {
|
||||
add: {
|
||||
index: string
|
||||
alias: string
|
||||
}
|
||||
}
|
||||
| {
|
||||
remove_index: {
|
||||
index: string
|
||||
}
|
||||
}
|
||||
const aliasUpdates: Update[] = [
|
||||
{
|
||||
add: {
|
||||
index: alias,
|
||||
alias: name,
|
||||
},
|
||||
},
|
||||
]
|
||||
console.log(`Alias ${name} -> ${alias}`)
|
||||
|
||||
console.log('About to get indices with retry %O', { attempts, sleepTime })
|
||||
const indices = await retryOnErrorTest(
|
||||
(error: any) => {
|
||||
// 404 can happen when you're trying to get an index that
|
||||
// doesn't exist. ...yet!
|
||||
return error instanceof errors.ResponseError && error.meta.statusCode === 404
|
||||
},
|
||||
() => client.cat.indices({ format: 'json' }),
|
||||
{
|
||||
attempts,
|
||||
sleepTime,
|
||||
onError: (error, attempts, sleepTime) => {
|
||||
console.warn(
|
||||
chalk.yellow(
|
||||
`Failed to get index ${name} (${
|
||||
error.message || error.toString()
|
||||
}). Will attempt ${attempts} more times (after ${formatTime(sleepTime)}s sleep).`,
|
||||
),
|
||||
)
|
||||
},
|
||||
},
|
||||
)
|
||||
for (const index of indices) {
|
||||
if (index.index !== alias && index.index.startsWith(name)) {
|
||||
aliasUpdates.push({ remove_index: { index: index.index } })
|
||||
console.log('Deleting index', index.index)
|
||||
}
|
||||
}
|
||||
if (verbose) console.log('Updating alias actions:', aliasUpdates)
|
||||
await client.indices.updateAliases({ body: { actions: aliasUpdates } })
|
||||
}
|
|
@ -1,10 +1,55 @@
|
|||
export type Version = 'free-pro-team' | 'enterprise-server' | 'enterprise-cloud'
|
||||
|
||||
export type Records = {
|
||||
[key: string]: number
|
||||
}
|
||||
|
||||
export type RetryConfig = {
|
||||
retries: number
|
||||
sleepTime: number
|
||||
}
|
||||
|
||||
export interface AllVersionInfo {
|
||||
hasNumberedReleases: boolean
|
||||
miscBaseName: string
|
||||
currentRelease: string
|
||||
version: string
|
||||
plan: string
|
||||
}
|
||||
|
||||
export interface AllVersions {
|
||||
[key: string]: AllVersionInfo
|
||||
}
|
||||
|
||||
export interface Options {
|
||||
language?: string
|
||||
notLanguage?: string
|
||||
version?: string
|
||||
docsInternalData?: string
|
||||
markers?: boolean
|
||||
filter?: string
|
||||
}
|
||||
|
||||
export type Args = string[]
|
||||
|
||||
export interface Page {
|
||||
relativePath: string
|
||||
redirect_from?: string[]
|
||||
}
|
||||
|
||||
export interface Config {
|
||||
noMarkers: boolean
|
||||
filter?: string
|
||||
docsInternalDataPath?: string
|
||||
}
|
||||
|
||||
export type TermsWithFrequency = { [term: string]: number }
|
||||
|
||||
export interface Records {
|
||||
[objectID: string]: Record // Here objectId will be identical to the record's objectId
|
||||
}
|
||||
|
||||
export interface Record {
|
||||
objectID: string // e.g. "/en/enterprise-cloud@latest/get-started"
|
||||
breadcrumbs: string // e.g. "Get started / Using GitHub"
|
||||
title: string // e.g. "Get started with GitHub documentation"
|
||||
headings: string
|
||||
content: string
|
||||
intro: string
|
||||
toplevel: string
|
||||
popularity: number
|
||||
}
|
||||
|
|
|
@ -0,0 +1,11 @@
|
|||
export const SNOWBALL_LANGUAGES: { [key: string]: string } = {
|
||||
en: 'English',
|
||||
fr: 'French',
|
||||
es: 'Spanish',
|
||||
ru: 'Russian',
|
||||
it: 'Italian',
|
||||
de: 'German',
|
||||
pt: 'Portuguese',
|
||||
}
|
||||
|
||||
export const DEFAULT_SLEEPTIME_SECONDS = 30
|
|
@ -0,0 +1,178 @@
|
|||
import chalk from 'chalk'
|
||||
import { Client, estypes, errors } from '@elastic/elasticsearch'
|
||||
import fs from 'fs/promises'
|
||||
import path from 'path'
|
||||
|
||||
import { readableTimeMinAndSec, sleep } from '@/search/lib/helpers/time'
|
||||
import { retryOnErrorTest } from '@/search/scripts/index/utils/retry-on-error-test'
|
||||
import {
|
||||
DEFAULT_SLEEPTIME_SECONDS,
|
||||
SNOWBALL_LANGUAGES,
|
||||
} from '@/search/scripts/index/utils/constants'
|
||||
import { safeUrlDisplay } from '@/search/lib/helpers/strings'
|
||||
|
||||
import type { Records } from '@/search/scripts/index/types'
|
||||
|
||||
type Options = {
|
||||
retries?: number
|
||||
sleepTime?: number
|
||||
verbose?: boolean
|
||||
}
|
||||
|
||||
export async function createIndex(
|
||||
client: Client,
|
||||
indexAlias: string,
|
||||
settings: estypes.IndicesIndexSettings,
|
||||
mappings: estypes.MappingTypeMapping,
|
||||
) {
|
||||
await client.indices.create({
|
||||
index: indexAlias,
|
||||
mappings,
|
||||
settings,
|
||||
})
|
||||
}
|
||||
|
||||
export async function populateIndex(
|
||||
client: Client,
|
||||
indexAlias: string,
|
||||
indexName: string,
|
||||
records: any[],
|
||||
options: Options,
|
||||
) {
|
||||
console.log(chalk.yellow(`\nIndexing ${chalk.bold(indexName)}`))
|
||||
const bulkOperations = records.flatMap((doc) => [{ index: { _index: indexAlias } }, doc])
|
||||
|
||||
const bulkOptions = {
|
||||
refresh: false,
|
||||
timeout: '5m',
|
||||
}
|
||||
|
||||
const attempts = options.retries || 0
|
||||
const sleepTime = options.sleepTime || DEFAULT_SLEEPTIME_SECONDS * 1000
|
||||
console.log(`About to bulk index ${records.length.toLocaleString()} records with retry %O`, {
|
||||
attempts,
|
||||
sleepTimeMS: sleepTime,
|
||||
})
|
||||
|
||||
const t0 = new Date()
|
||||
const bulkResponse = await retryOnErrorTest(
|
||||
(error) => error instanceof errors.ResponseError && error.meta.statusCode === 429,
|
||||
() => client.bulk({ operations: bulkOperations, ...bulkOptions }),
|
||||
{
|
||||
attempts,
|
||||
sleepTime,
|
||||
onError: (_, attempts, sleepTime) => {
|
||||
console.warn(
|
||||
chalk.yellow(
|
||||
`Failed to bulk index ${indexName}. Will attempt ${attempts} more times (after ${
|
||||
sleepTime / 1000
|
||||
}s sleep).`,
|
||||
),
|
||||
)
|
||||
},
|
||||
},
|
||||
)
|
||||
|
||||
if (bulkResponse.errors) {
|
||||
console.error(`Bulk response errors: ${bulkResponse.errors}`)
|
||||
throw new Error('Bulk errors happened.')
|
||||
}
|
||||
const t1 = new Date()
|
||||
console.log(
|
||||
`Bulk indexed ${indexAlias}. Took ${readableTimeMinAndSec(t1.getTime() - t0.getTime())}`,
|
||||
)
|
||||
|
||||
let documentsInIndex = 0
|
||||
let countAttempts = 3
|
||||
while (documentsInIndex < records.length) {
|
||||
const { count } = await client.count({ index: indexAlias })
|
||||
documentsInIndex = count
|
||||
if (documentsInIndex >= records.length) break
|
||||
countAttempts--
|
||||
if (!countAttempts) {
|
||||
console.log(`After ${countAttempts} attempts still haven't matched the expected number.`)
|
||||
break
|
||||
}
|
||||
await sleep(1000)
|
||||
}
|
||||
|
||||
console.log(`Documents now in ${indexAlias}: ${documentsInIndex.toLocaleString()}`)
|
||||
}
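The bulkOperations array built above interleaves an action line with each document, which is the shape the Elasticsearch bulk API expects. For two hypothetical autocomplete records and a hypothetical index alias it would look like this:

```typescript
// Hypothetical records and alias, shown only to illustrate the bulk payload shape.
const indexAlias = 'github-docs-general-autocomplete-en-fpt__2024-01-01'
const records = [
  { term: 'log forwarding', popularity: 1.25 },
  { term: 'log proxy', popularity: 0.5 },
]

const bulkOperations = records.flatMap((doc) => [{ index: { _index: indexAlias } }, doc])
console.log(JSON.stringify(bulkOperations))
// [{"index":{"_index":"github-docs-general-autocomplete-en-fpt__2024-01-01"}},
//  {"term":"log forwarding","popularity":1.25},
//  {"index":{"_index":"github-docs-general-autocomplete-en-fpt__2024-01-01"}},
//  {"term":"log proxy","popularity":0.5}]
```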
|
||||
|
||||
export async function updateAlias(
|
||||
client: Client,
|
||||
indexName: string,
|
||||
indexAlias: string,
|
||||
options: Options,
|
||||
) {
|
||||
const aliasUpdates: estypes.IndicesUpdateAliasesAction[] = [
|
||||
{
|
||||
add: {
|
||||
index: indexAlias,
|
||||
alias: indexName,
|
||||
},
|
||||
},
|
||||
]
|
||||
|
||||
const indices = await retryOnErrorTest(
|
||||
(error) => {
|
||||
// 404 can happen when you're trying to get an index that
|
||||
// doesn't exist. ...yet!
|
||||
return error instanceof errors.ResponseError && error.meta.statusCode === 404
|
||||
},
|
||||
() => client.cat.indices({ format: 'json' }),
|
||||
{
|
||||
attempts: options.retries || 0,
|
||||
sleepTime: (options.sleepTime || DEFAULT_SLEEPTIME_SECONDS) * 1000,
|
||||
onError: (error, attempts, sleepTime) => {
|
||||
console.warn(
|
||||
chalk.yellow(
|
||||
`Failed to get index ${indexName} (${
|
||||
error.message || error.toString()
|
||||
}). Will attempt ${attempts} more times (after ${readableTimeMinAndSec(sleepTime)}s sleep).`,
|
||||
),
|
||||
)
|
||||
},
|
||||
},
|
||||
)
|
||||
|
||||
for (const index of indices) {
|
||||
if (index.index !== indexAlias && index.index.startsWith(indexName)) {
|
||||
aliasUpdates.push({ remove_index: { index: index.index } })
|
||||
console.log('Deleting old index', index.index)
|
||||
}
|
||||
}
|
||||
if (options.verbose) console.log('Updating alias actions:', aliasUpdates)
|
||||
await client.indices.updateAliases({ body: { actions: aliasUpdates } })
|
||||
}
|
||||
|
||||
export function printSuccess(indexName: string, startTime: Date, verbose = false) {
|
||||
const endTime = new Date()
|
||||
console.log(
|
||||
chalk.green(
|
||||
`Finished indexing ${indexName}. Took ${readableTimeMinAndSec(endTime.getTime() - startTime.getTime())}`,
|
||||
),
|
||||
)
|
||||
|
||||
if (verbose) {
|
||||
console.log(`To view index: ${safeUrlDisplay(`<elasticsearch-url>/${indexName}`)}`)
|
||||
console.log(`To search index: ${safeUrlDisplay(`<elasticsearch-url>/${indexName}/_search`)}`)
|
||||
}
|
||||
}
|
||||
|
||||
export async function loadIndexRecords(
|
||||
indexName: string,
|
||||
sourceDirectory: string,
|
||||
): Promise<Records> {
|
||||
const filePath = path.join(sourceDirectory, `${indexName}-records.json`)
|
||||
const payload = await fs.readFile(filePath, 'utf8')
|
||||
return JSON.parse(payload)
|
||||
}
|
||||
|
||||
export function escapeHTML(content: string): string {
|
||||
return content.replace(/</g, '&lt;').replace(/>/g, '&gt;').replace(/"/g, '&quot;')
|
||||
}
|
||||
|
||||
export function getSnowballLanguage(language: string): string | undefined {
|
||||
return SNOWBALL_LANGUAGES[language]
|
||||
}
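updateAlias above makes the cut-over atomic: a single updateAliases call both points the stable index name at the freshly populated, timestamp-suffixed index and removes any older ones. With hypothetical index names, the actions payload would be:

```typescript
// Hypothetical names, shown only to illustrate the atomic alias swap built above.
const actions = [
  // Point the stable name at the index that was just populated...
  {
    add: {
      index: 'github-docs-general-search-en-fpt__2024-01-02',
      alias: 'github-docs-general-search-en-fpt',
    },
  },
  // ...and drop the previous timestamped index in the same atomic call.
  { remove_index: { index: 'github-docs-general-search-en-fpt__2024-01-01' } },
]

// await client.indices.updateAliases({ body: { actions } })
console.log(actions)
```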
|
|
@ -0,0 +1,52 @@
|
|||
import type { estypes } from '@elastic/elasticsearch'
|
||||
|
||||
export const generalSearchMappings: estypes.MappingTypeMapping = {
|
||||
properties: {
|
||||
url: { type: 'keyword' },
|
||||
title: {
|
||||
type: 'text',
|
||||
analyzer: 'text_analyzer',
|
||||
norms: false,
|
||||
term_vector: 'with_positions_offsets',
|
||||
},
|
||||
title_explicit: { type: 'text', analyzer: 'text_analyzer_explicit', norms: false },
|
||||
content: {
|
||||
type: 'text',
|
||||
analyzer: 'text_analyzer',
|
||||
term_vector: 'with_positions_offsets',
|
||||
},
|
||||
content_explicit: {
|
||||
type: 'text',
|
||||
analyzer: 'text_analyzer_explicit',
|
||||
term_vector: 'with_positions_offsets',
|
||||
},
|
||||
headings: { type: 'text', analyzer: 'text_analyzer', norms: false },
|
||||
headings_explicit: { type: 'text', analyzer: 'text_analyzer_explicit', norms: false },
|
||||
breadcrumbs: { type: 'text' },
|
||||
popularity: { type: 'float' },
|
||||
intro: { type: 'text' },
|
||||
toplevel: { type: 'keyword' },
|
||||
},
|
||||
}
|
||||
|
||||
export const generalAutocompleteMappings: estypes.MappingTypeMapping = {
|
||||
properties: {
|
||||
term: {
|
||||
type: 'text',
|
||||
analyzer: 'text_analyzer',
|
||||
term_vector: 'with_positions_offsets',
|
||||
},
|
||||
popularity: { type: 'float' },
|
||||
},
|
||||
}
|
||||
|
||||
export const aiSearchAutocompleteMappings: estypes.MappingTypeMapping = {
|
||||
properties: {
|
||||
term: {
|
||||
type: 'text',
|
||||
analyzer: 'text_analyzer',
|
||||
term_vector: 'with_positions_offsets',
|
||||
},
|
||||
popularity: { type: 'float' },
|
||||
},
|
||||
}
|
|
@ -1,5 +1,3 @@
|
|||
// [start-readme]
|
||||
//
|
||||
// Return a function that you can use to run any code within and if it
|
||||
// throws you get a chance to say whether to sleep + retry.
|
||||
// Example:
|
||||
|
@ -20,10 +18,8 @@
|
|||
// Note that, by default, the sleep time is "exponential" by a factor of
|
||||
// 1.5. So the first sleep will, in the above example,
|
||||
// be 800ms, then 1,200ms, then 1,800ms, etc.
|
||||
//
|
||||
// [end-readme]
|
||||
|
||||
import { sleep } from './utils'
|
||||
import { sleep } from '@/search/lib/helpers/time'
|
||||
|
||||
export async function retryOnErrorTest(
|
||||
errorTest: (error: any) => boolean,
|
|
@ -0,0 +1,118 @@
|
|||
import { SNOWBALL_LANGUAGES } from '@/search/scripts/index/utils/constants'
|
||||
|
||||
import type { estypes } from '@elastic/elasticsearch'
|
||||
import type {
|
||||
AnalysisSnowballLanguage,
|
||||
AnalysisCustomAnalyzer,
|
||||
} from '@elastic/elasticsearch/lib/api/types'
|
||||
|
||||
export function getGeneralSearchSettings(
|
||||
language: string,
|
||||
verbose: boolean,
|
||||
): estypes.IndicesIndexSettings {
|
||||
const settings: estypes.IndicesIndexSettings = {
|
||||
analysis: {
|
||||
char_filter: {
|
||||
hyphenation_filter: {
|
||||
type: 'mapping',
|
||||
mappings: ['- => _'],
|
||||
},
|
||||
},
|
||||
analyzer: {
|
||||
text_analyzer_explicit: {
|
||||
char_filter: ['hyphenation_filter'],
|
||||
filter: ['lowercase', 'stop', 'asciifolding'],
|
||||
tokenizer: 'standard',
|
||||
type: 'custom',
|
||||
} as AnalysisCustomAnalyzer,
|
||||
text_analyzer: {
|
||||
filter: ['lowercase', 'stop', 'asciifolding'],
|
||||
tokenizer: 'standard',
|
||||
type: 'custom',
|
||||
} as AnalysisCustomAnalyzer,
|
||||
},
|
||||
filter: {},
|
||||
},
|
||||
}
|
||||
|
||||
const snowballLanguage = SNOWBALL_LANGUAGES[language]
|
||||
if (snowballLanguage) {
|
||||
const textAnalyzer = settings.analysis!.analyzer!.text_analyzer as AnalysisCustomAnalyzer
|
||||
textAnalyzer.filter!.push('languaged_snowball')
|
||||
|
||||
settings.analysis!.filter!['languaged_snowball'] = {
|
||||
type: 'snowball',
|
||||
language: snowballLanguage as AnalysisSnowballLanguage,
|
||||
}
|
||||
} else if (verbose) {
|
||||
console.warn(`No snowball language for '${language}'`)
|
||||
}
|
||||
|
||||
return settings
|
||||
}
|
||||
|
||||
export function getGeneralAutocompleteSettings(
|
||||
language: string,
|
||||
verbose = false,
|
||||
): estypes.IndicesIndexSettings {
|
||||
const settings: estypes.IndicesIndexSettings = {
|
||||
analysis: {
|
||||
analyzer: {
|
||||
text_analyzer: {
|
||||
filter: ['lowercase'],
|
||||
tokenizer: 'standard',
|
||||
type: 'custom',
|
||||
} as AnalysisCustomAnalyzer,
|
||||
},
|
||||
filter: {},
|
||||
},
|
||||
}
|
||||
|
||||
const snowballLanguage = SNOWBALL_LANGUAGES[language]
|
||||
if (snowballLanguage) {
|
||||
const textAnalyzer = settings.analysis!.analyzer!.text_analyzer as AnalysisCustomAnalyzer
|
||||
textAnalyzer.filter!.push('languaged_snowball')
|
||||
|
||||
settings.analysis!.filter!['languaged_snowball'] = {
|
||||
type: 'snowball',
|
||||
language: snowballLanguage as AnalysisSnowballLanguage,
|
||||
}
|
||||
} else if (verbose) {
|
||||
console.warn(`No snowball language for '${language}'`)
|
||||
}
|
||||
|
||||
return settings
|
||||
}
|
||||
|
||||
export function getAISearchAutocompleteSettings(
|
||||
language: string,
|
||||
verbose = false,
|
||||
): estypes.IndicesIndexSettings {
|
||||
const settings: estypes.IndicesIndexSettings = {
|
||||
analysis: {
|
||||
analyzer: {
|
||||
text_analyzer: {
|
||||
filter: ['lowercase'],
|
||||
tokenizer: 'standard',
|
||||
type: 'custom',
|
||||
} as AnalysisCustomAnalyzer,
|
||||
},
|
||||
filter: {},
|
||||
},
|
||||
}
|
||||
|
||||
const snowballLanguage = SNOWBALL_LANGUAGES[language]
|
||||
if (snowballLanguage) {
|
||||
const textAnalyzer = settings.analysis!.analyzer!.text_analyzer as AnalysisCustomAnalyzer
|
||||
textAnalyzer.filter!.push('languaged_snowball')
|
||||
|
||||
settings.analysis!.filter!['languaged_snowball'] = {
|
||||
type: 'snowball',
|
||||
language: snowballLanguage as AnalysisSnowballLanguage,
|
||||
}
|
||||
} else if (verbose) {
|
||||
console.warn(`No snowball language for '${language}'`)
|
||||
}
|
||||
|
||||
return settings
|
||||
}
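Since 'en' is one of the SNOWBALL_LANGUAGES, the three helpers above all attach an English snowball filter to text_analyzer. For the autocomplete case, getGeneralAutocompleteSettings('en') should resolve to roughly the following settings object (reconstructed from the code above, not copied from a live index):

```typescript
// Reconstructed from getGeneralAutocompleteSettings('en'); illustrative only.
const englishAutocompleteSettings = {
  analysis: {
    analyzer: {
      text_analyzer: {
        type: 'custom',
        tokenizer: 'standard',
        filter: ['lowercase', 'languaged_snowball'],
      },
    },
    filter: {
      languaged_snowball: { type: 'snowball', language: 'English' },
    },
  },
}

console.log(JSON.stringify(englishAutocompleteSettings, null, 2))
```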
|
|
@ -1,76 +0,0 @@
|
|||
// [start-readme]
|
||||
//
|
||||
// Return a function that you can use to run any code within and if it
|
||||
// throws you get a chance to say whether to sleep + retry.
|
||||
// Example:
|
||||
//
|
||||
// async function mainFunction() {
|
||||
// if (Math.random() > 0.9) throw new Error('too large')
|
||||
// return 'OK'
|
||||
// }
|
||||
//
|
||||
// const errorTest = (err) => err instanceof Error && err.message.includes('too large')
|
||||
// const config = { // all optional
|
||||
// attempts: 3,
|
||||
// sleepTime: 800,
|
||||
// onError: (err, attempts) => console.warn(`Failed ${attempts} attempts`)
|
||||
// }
|
||||
// const ok = await retry(errorTest, mainFunction, config)
|
||||
//
|
||||
// Note that, by default, the sleep time is "exponential" by a factor of
|
||||
// 1.5. So the first sleep will, in the above example,
|
||||
// be 800ms, then 1,200ms, then 1,800ms, etc.
|
||||
//
|
||||
// [end-readme]
|
||||
|
||||
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms))
|
||||
|
||||
export async function retryOnErrorTest(
|
||||
errorTest,
|
||||
callback,
|
||||
{
|
||||
attempts = 4,
|
||||
sleepTime = 1000,
|
||||
exponential = 1.5,
|
||||
jitterPercent = 25,
|
||||
onError = () => {},
|
||||
} = {},
|
||||
) {
|
||||
while (true) {
|
||||
try {
|
||||
return await callback()
|
||||
} catch (error) {
|
||||
if (error instanceof Error && attempts > 0 && errorTest(error)) {
|
||||
if (onError) onError(error, attempts, sleepTime)
|
||||
attempts--
|
||||
// The reason for the jitter is to avoid a thundering herd problem.
|
||||
// Suppose two independent processes/threads start at the same time.
|
||||
// They both fail, perhaps due to rate limiting. Now, if they both
|
||||
// sleep for 30 seconds in the first retry attempt, it'll just
|
||||
// clash again 30 seconds later. But if you add a bit of jitter, at
|
||||
// the next attempt these independent processes/threads will now
|
||||
// start at slightly different times.
|
||||
|
||||
// According to the Oxford English dictionary, they define "jitter" as:
|
||||
//
|
||||
// slight irregular movement, variation, or unsteadiness,
|
||||
// especially in an electrical signal or electronic device.
|
||||
//
|
||||
await sleep(addJitter(sleepTime, jitterPercent))
|
||||
if (exponential) {
|
||||
sleepTime *= 2
|
||||
}
|
||||
} else {
|
||||
throw error
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
function addJitter(num, percent) {
|
||||
// Return the number plus between 0 and $percent of that number.
|
||||
// For example, for 1,000 with a 20% jitter you might get 1133.4
|
||||
// because you start with 1,000 and 13.4% is a random number between
|
||||
// 0 and 20%.
|
||||
return num + Math.random() * percent * 0.01 * num
|
||||
}
|
|
@ -0,0 +1,40 @@
|
|||
# Scraping for General Search
|
||||
|
||||
We need to scrape each page on the Docs site and use the data we scrape to index Elasticsearch.
|
||||
|
||||
We currently only scrape for **general search** results.
|
||||
|
||||
Autocomplete search data is generated from analytics events and GPT queries.
|
||||
|
||||
## CLI Script
|
||||
|
||||
Before running the scraping script, ensure that the server is running in another terminal with `npm run general-search-scrape-server`
|
||||
|
||||
Run the script with `npm run general-search-scrape -- <scrape-directory>`
|
||||
|
||||
After a successful run, the script generates a series of JSON files containing the page data for every page of the Docs site in the passed directory.
|
||||
|
||||
The `index-general-search.yml` workflow will scrape the records into `/tmp/records`, then run the [general-search indexing script](../index/README.md).
|
||||
|
||||
To see the arguments accepted by the script, pass the `--help` argument, for example:
|
||||
|
||||
```bash
|
||||
npm run general-search-scrape -- --help
|
||||
```
|
||||
|
||||
## Records (scraped pages)
|
||||
|
||||
In the context of an Elasticsearch index, a record represents a page. Each record has `breadcrumbs`, `title`, `headings`, `content` (the article content in text, not HTML), `intro` (if one exists in the frontmatter), and a unique `objectID` that is currently just the permalink of the article. Here's an example:
|
||||
|
||||
```json
|
||||
{
|
||||
"objectID":"/en/actions/creating-actions/about-custom-actions",
|
||||
"breadcrumbs":"GitHub Actions / Creating actions",
|
||||
"title":"About custom actions",
|
||||
"headings":"About custom actions\nTypes of actions\n[...]",
|
||||
"content":"Actions are individual tasks that you can combine to create jobs and customize your workflow. You can create your own actions, [...]",
|
||||
"intro":"Actions are individual tasks that you can combine to create jobs and customize your workflow. You can create your own actions, or use and customize actions shared by the GitHub community.",
|
||||
"toplevel":"GitHub Actions",
|
||||
"popularity":0
|
||||
}
|
||||
```
|
|
@ -1,14 +1,16 @@
|
|||
#!/usr/bin/env node
|
||||
import eventToPromise from 'event-to-promise'
|
||||
import chalk from 'chalk'
|
||||
import dotenv from 'dotenv'
|
||||
import boxen from 'boxen'
|
||||
import { HTTPError } from 'got'
|
||||
|
||||
import parsePageSectionsIntoRecords from './parse-page-sections-into-records.js'
|
||||
import getPopularPages from './popular-pages.js'
|
||||
import languages from '#src/languages/lib/languages.js'
|
||||
import domwaiter from './domwaiter.js'
|
||||
import languages from '@/languages/lib/languages.js'
|
||||
import parsePageSectionsIntoRecords from '@/search/scripts/scrape/lib/parse-page-sections-into-records'
|
||||
import getPopularPages from '@/search/scripts/scrape/lib/popular-pages'
|
||||
import domwaiter from '@/search/scripts/scrape/lib/domwaiter'
|
||||
import { getAllVersionsKeyFromIndexVersion } from '@/search/lib/elasticsearch-versions'
|
||||
|
||||
import type { Page, Permalink, Record, Config, Redirects } from '@/search/scripts/scrape/types'
|
||||
|
||||
const pageMarker = chalk.green('|')
|
||||
const recordMarker = chalk.grey('.')
|
||||
|
@ -31,16 +33,19 @@ const MIN_TIME = parseInt(process.env.BUILD_RECORDS_MIN_TIME || '5', 10)
|
|||
const FORCE_0_POPULARITY_PRODUCTS = new Set(['contributing'])
|
||||
|
||||
export default async function buildRecords(
|
||||
indexName,
|
||||
indexablePages,
|
||||
pageVersion,
|
||||
languageCode,
|
||||
redirects,
|
||||
config = {},
|
||||
) {
|
||||
indexName: string,
|
||||
indexablePages: Page[],
|
||||
indexVersion: string,
|
||||
languageCode: string,
|
||||
redirects: Redirects,
|
||||
config: Config = {} as Config,
|
||||
): Promise<Record[]> {
|
||||
// Determine the page version from the index version
|
||||
const pageVersion = getAllVersionsKeyFromIndexVersion(indexVersion)
|
||||
|
||||
const { noMarkers, docsInternalDataPath } = config
|
||||
console.log(`\n\nBuilding records for index '${indexName}' (${languages[languageCode].name})`)
|
||||
const records = []
|
||||
const records: Record[] = []
|
||||
const pages = indexablePages
|
||||
// exclude pages that are not in the current language
|
||||
.filter((page) => page.languageCode === languageCode)
|
||||
|
@ -55,12 +60,15 @@ export default async function buildRecords(
|
|||
})
|
||||
})
|
||||
.map((permalink) => {
|
||||
permalink.url = `http://localhost:${port}${permalink.href}`
|
||||
if (permalink) {
|
||||
permalink.url = `http://localhost:${port}${permalink.href}`
|
||||
}
|
||||
return permalink
|
||||
})
|
||||
.filter((permalink): permalink is Permalink => permalink !== undefined)
|
||||
|
||||
const popularPages = docsInternalDataPath
|
||||
? await getPopularPages(docsInternalDataPath, redirects, pageVersion, languageCode)
|
||||
? await getPopularPages(docsInternalDataPath, redirects, indexVersion, languageCode)
|
||||
: {}
|
||||
|
||||
console.log('indexable pages', indexablePages.length)
|
||||
|
@ -93,7 +101,7 @@ export default async function buildRecords(
|
|||
if (err instanceof HTTPError && !err.response.ok) {
|
||||
console.log(
|
||||
'\n' +
|
||||
boxen(chalk.bold(err.request.requestUrl.pathname), {
|
||||
boxen(chalk.bold(err.request.requestUrl?.pathname), {
|
||||
title: chalk.red('The URL it failed on was'),
|
||||
padding: 1,
|
||||
borderColor: 'red',
|
|
@ -1,9 +1,18 @@
|
|||
import { EventEmitter } from 'node:events'
|
||||
import { EventEmitter } from 'events'
|
||||
import Bottleneck from 'bottleneck'
|
||||
import got from 'got'
|
||||
import cheerio from 'cheerio'
|
||||
|
||||
export default function domwaiter(pages, opts = {}) {
|
||||
import type { Permalink } from '@/search/scripts/scrape/types'
|
||||
|
||||
interface DomWaiterOptions {
|
||||
parseDOM?: boolean
|
||||
json?: boolean
|
||||
maxConcurrent?: number
|
||||
minTime?: number
|
||||
}
|
||||
|
||||
export default function domwaiter(pages: Permalink[], opts: DomWaiterOptions = {}): EventEmitter {
|
||||
const emitter = new EventEmitter()
|
||||
|
||||
const defaults = {
|
||||
|
@ -17,26 +26,26 @@ export default function domwaiter(pages, opts = {}) {
|
|||
const limiter = new Bottleneck(opts)
|
||||
|
||||
pages.forEach((page) => {
|
||||
limiter.schedule(getPage, page, emitter, opts)
|
||||
limiter.schedule(() => getPage(page, emitter, opts))
|
||||
})
|
||||
|
||||
limiter
|
||||
.on('idle', () => {
|
||||
emitter.emit('done')
|
||||
})
|
||||
.on('error', (err) => {
|
||||
emitter.emit('error', err)
|
||||
})
|
||||
limiter.on('idle', () => {
|
||||
emitter.emit('done')
|
||||
})
|
||||
|
||||
limiter.on('error', (err) => {
|
||||
emitter.emit('error', err)
|
||||
})
|
||||
|
||||
return emitter
|
||||
}
|
||||
|
||||
async function getPage(page, emitter, opts) {
|
||||
async function getPage(page: Permalink, emitter: EventEmitter, opts: DomWaiterOptions) {
|
||||
emitter.emit('beforePageLoad', page)
|
||||
|
||||
if (opts.json) {
|
||||
try {
|
||||
const json = await got(page.url).json()
|
||||
const json = await got(page.url!).json()
|
||||
const pageCopy = Object.assign({}, page, { json })
|
||||
emitter.emit('page', pageCopy)
|
||||
} catch (err) {
|
||||
|
@ -44,9 +53,9 @@ async function getPage(page, emitter, opts) {
|
|||
}
|
||||
} else {
|
||||
try {
|
||||
const body = (await got(page.url)).body
|
||||
const body = (await got(page.url!)).body
|
||||
const pageCopy = Object.assign({}, page, { body })
|
||||
if (opts.parseDOM) pageCopy.$ = cheerio.load(body)
|
||||
if (opts.parseDOM) (pageCopy as any).$ = cheerio.load(body)
|
||||
emitter.emit('page', pageCopy)
|
||||
} catch (err) {
|
||||
emitter.emit('error', err)
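domwaiter is consumed as an event emitter: the scrape script schedules every permalink through Bottleneck and reacts to the events emitted above. A minimal usage sketch; the permalink and its localhost URL are placeholders, not taken from a real run:

```typescript
import domwaiter from '@/search/scripts/scrape/lib/domwaiter'

// Placeholder permalink; the real list is built from the indexable pages and the
// local server port, as in build-records above.
const pages = [{ href: '/en/get-started', url: 'http://localhost:4000/en/get-started' }] as any

const waiter = domwaiter(pages, { parseDOM: true, maxConcurrent: 100, minTime: 5 })

waiter
  .on('page', (page) => console.log('scraped', page.href))
  .on('error', (err) => console.error(err))
  .on('done', () => console.log('all pages scraped'))
```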
|
|
@ -1,8 +1,9 @@
|
|||
#!/usr/bin/env node
|
||||
import { loadPages } from '#src/frame/lib/page-data.js'
|
||||
import { loadPages } from '@/frame/lib/page-data.js'
|
||||
|
||||
export default async function findIndexablePages(match = '') {
|
||||
const allPages = await loadPages()
|
||||
import type { Page } from '@/search/scripts/scrape/types'
|
||||
|
||||
export default async function findIndexablePages(match = ''): Promise<Page[]> {
|
||||
const allPages: Page[] = await loadPages()
|
||||
const indexablePages = allPages
|
||||
// exclude hidden pages
|
||||
.filter((page) => !page.hidden)
|
|
@ -1,17 +1,18 @@
|
|||
#!/usr/bin/env node
|
||||
import { render } from 'cheerio-to-text'
|
||||
|
||||
import type { Record } from '@/search/scripts/scrape/types'
|
||||
|
||||
// This module takes cheerio page object and divides it into sections
|
||||
// using H1,H2 heading elements as section delimiters. The text
|
||||
// that follows each heading becomes the content of the search record.
|
||||
|
||||
const ignoredHeadingSlugs = ['in-this-article', 'further-reading', 'prerequisites']
|
||||
|
||||
export default function parsePageSectionsIntoRecords(page) {
|
||||
export default function parsePageSectionsIntoRecords(page: any): Record {
|
||||
const { href, $ } = page
|
||||
const title = $('h1').first().text().trim()
|
||||
const breadcrumbsArray = $('[data-search=breadcrumbs] nav.breadcrumbs a')
|
||||
.map((i, el) => {
|
||||
.map((i: number, el: any) => {
|
||||
return $(el).text().trim().replace('/', '').replace(/\s+/g, ' ')
|
||||
})
|
||||
.get()
|
||||
|
@ -21,8 +22,7 @@ export default function parsePageSectionsIntoRecords(page) {
|
|||
// page that don't make much sense to find in a site search.
|
||||
$('[data-search=hide]').remove()
|
||||
|
||||
// Only slice off the last one if the length of the array is greater
|
||||
// that 1.
|
||||
// Only slice off the last one if the length of the array is greater than 1
|
||||
// On an article page, the breadcrumbs array will be something
|
||||
// like:
|
||||
//
|
||||
|
@ -51,12 +51,12 @@ export default function parsePageSectionsIntoRecords(page) {
|
|||
|
||||
const $sections = $('h2', $root)
|
||||
.filter('[id]')
|
||||
.filter((i, el) => {
|
||||
.filter((i: number, el: any) => {
|
||||
return !ignoredHeadingSlugs.includes($(el).attr('id'))
|
||||
})
|
||||
|
||||
const headings = $sections
|
||||
.map((i, el) => $(el).text())
|
||||
.map((i: number, el: any) => $(el).text())
|
||||
.get()
|
||||
.join('\n')
|
||||
.trim()
|
|
@@ -2,28 +2,31 @@ import { join } from 'path'
import { existsSync } from 'fs'
import fs from 'fs/promises'

export default async function getPopularPages(dirPath, redirects, version, language) {
  // The dirPath is the path to the github/docs-internal-data repo.
  // We make assumptions about the structure of the repo. In particular,
  // the pageviews rollups live in
  // `hydro/rollups/pageviews/$language/$versionprefix/rollup.json`
  // For example
  // `hydro/rollups/pageviews/en/enterprise-cloud/rollup.json`
  const versionPrefix = version.split('@')[0]
  let filePath = join(dirPath, 'hydro/rollups/pageviews', language, versionPrefix, 'rollup.json')
import { getPlanVersionFromIndexVersion } from '@/search/lib/elasticsearch-versions.js'

import type { Redirects, PopularPages } from '@/search/scripts/scrape/types'

export default async function getPopularPages(
  dirPath: string,
  redirects: Redirects,
  indexVersion: string,
  language: string,
): Promise<PopularPages> {
  const planVersion = getPlanVersionFromIndexVersion(indexVersion)
  let filePath = join(dirPath, 'hydro/rollups/pageviews', language, planVersion, 'rollup.json')
  if (!existsSync(filePath) && language !== 'en') {
    console.warn("Trying the rollup for 'en'")
    language = 'en'
    filePath = join(dirPath, 'hydro/rollups/pageviews', language, versionPrefix, 'rollup.json')
    filePath = join(dirPath, 'hydro/rollups/pageviews', language, planVersion, 'rollup.json')
  }
  if (!existsSync(filePath)) {
    throw new Error(`No rollup found for version '${versionPrefix}'. Tried ${filePath}`)
    throw new Error(`No rollup found for version '${planVersion}'. Tried ${filePath}`)
  }
  const rollupRaw = await fs.readFile(filePath, 'utf-8')

  // Firt iterate through the array of objects, not making an assumption
  // First iterate through the array of objects, not making an assumption
  // that the first one is the biggest one.
  const all = {}
  const all: { [key: string]: number } = {}
  for (const [path, count] of Object.entries(JSON.parse(rollupRaw))) {
    if (!path) {
      // Can happen if the SQL query is, for some unknown reason, finding

@@ -41,11 +44,11 @@ export default async function getPopularPages(dirPath, redirects, version, langu
      // We never index these anyway so their popularity is never relevant.
      continue
    }
    all[path] = count
    all[path] = count as number
  }

  const biggestCount = Math.max(...Object.values(all))
  const popularPages = {}
  const popularPages: PopularPages = {}
  for (const [path, count] of Object.entries(all)) {
    // Don't bother writing massively long floating point numbers
    // because reducing it makes the JSON records smaller and we don't

@@ -55,11 +58,6 @@ export default async function getPopularPages(dirPath, redirects, version, langu
    // The reason we're heeding redirects is because it's possible
    // that the JSON file is older/"staler" than the
    // content itself.
    // Imaging our analytics recorded that `/en/foo` had 1,234 pageviews,
    // and someone goes and... `git mv content/foo content/bar` plus
    // adding `redirect_from: - /foo` into the front-matter.
    // Then, by using the redirects first, we can maintain that popularity
    // by now "pretending" that it's `/en/bar` that has 1,234 pageviews.
    popularPages[redirects[path] || path] = ratio
  }
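The tail of this hunk normalizes raw pageview counts into ratios of the most-viewed page and keys the result by the redirected (current) path so that moved pages keep their old popularity. Here is a minimal sketch of that normalization step, assuming simplified shapes for the `Redirects` and `PopularPages` types imported above and a made-up rounding precision:

```ts
type Redirects = { [fromPath: string]: string }
type PopularPages = { [path: string]: number }

function toPopularityRatios(counts: { [path: string]: number }, redirects: Redirects): PopularPages {
  const biggestCount = Math.max(...Object.values(counts))
  const popularPages: PopularPages = {}
  for (const [path, count] of Object.entries(counts)) {
    // Trim the float so the JSON output stays small, and resolve redirects
    // first so a renamed page inherits the pageviews of its old URL.
    const ratio = Number((count / biggestCount).toFixed(5))
    popularPages[redirects[path] || path] = ratio
  }
  return popularPages
}
```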
@@ -1,22 +1,22 @@
#!/usr/bin/env node
import chalk from 'chalk'

import languages from '#src/languages/lib/languages.js'
import buildRecords from './build-records.js'
import findIndexablePages from './find-indexable-pages.js'
import { allVersions } from '#src/versions/lib/all-versions.js'
import { namePrefix } from '#src/search/lib/config.js'
import { writeIndexRecords } from './search-index-records.js'
import languages from '@/languages/lib/languages.js'
import buildRecords from '@/search/scripts/scrape/lib/build-records'
import findIndexablePages from '@/search/scripts/scrape/lib/find-indexable-pages'
import { writeIndexRecords } from '@/search/scripts/scrape/lib/search-index-records'
import { getElasticSearchIndex } from '@/search/lib/elasticsearch-indexes'

import type { Options, Config, Page, Redirects } from '@/search/scripts/scrape/types'

// Build a search data file for every combination of product version and language
// e.g. `github-docs-dotcom-en.json` and `github-docs-2.14-ja.json`
export default async function syncSearchIndexes({
export default async function scrapeIntoIndexJson({
  language,
  notLanguage,
  outDirectory,
  versionsToBuild,
  config = {},
}) {
  config = {} as Config,
}: Options): Promise<void> {
  const t0 = new Date()

  // build indices for a specific language if provided; otherwise build indices for all languages

@@ -25,14 +25,14 @@ export default async function syncSearchIndexes({
  )

  console.log(
    `Building indices for ${chalk.yellow(language || 'all languages')} and ${chalk.yellow(
    `Building indices for language: ${chalk.yellow(language || 'all languages')} and version: ${chalk.yellow(
      versionsToBuild.length === 1 ? versionsToBuild[0] : 'all versions',
    )}.\n`,
  )

  // Exclude WIP pages, hidden pages, index pages, etc
  const indexablePages = await findIndexablePages(config.filter)
  const redirects = {}
  const indexablePages: Page[] = await findIndexablePages(config.filter)
  const redirects: Redirects = {}
  indexablePages.forEach((page) => {
    const href = page.relativePath.replace('index.md', '').replace('.md', '')
    for (let redirectFrom of page.redirect_from || []) {

@@ -47,22 +47,14 @@ export default async function syncSearchIndexes({
  let countRecordsTotal = 0
  // Build and validate all indices
  for (const languageCode of languagesToBuild) {
    for (const pageVersion of versionsToBuild) {
      // if GHES, resolves to the release number like 2.21, 2.22, etc.
      // if FPT, resolves to 'dotcom'
      const indexVersion =
        allVersions[pageVersion].plan === 'enterprise-server'
          ? allVersions[pageVersion].currentRelease
          : allVersions[pageVersion].miscBaseName

      // github-docs-dotcom-en, github-docs-2.22-en
      const indexName = `${namePrefix}-${indexVersion}-${languageCode}`
    for (const indexVersion of versionsToBuild) {
      const { indexName } = getElasticSearchIndex('generalSearch', indexVersion, languageCode)

      // The page version will be the new version, e.g., free-pro-team@latest, enterprise-server@3.7
      const records = await buildRecords(
        indexName,
        indexablePages,
        pageVersion,
        indexVersion,
        languageCode,
        redirects,
        config,

@@ -81,6 +73,6 @@ export default async function syncSearchIndexes({
  console.log(`Rate ~${chalk.bold(rate)} pages per second.`)
}

function formatSeconds(seconds) {
function formatSeconds(seconds: number): string {
  return new Date(seconds * 1000).toISOString().substr(11, 8)
}
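For orientation, here is a hedged sketch of how the renamed `scrapeIntoIndexJson` entry point might be called. It is based only on the `Options` fields visible in this diff; the module path and output directory are hypothetical, and the version strings are the examples mentioned in the comments above.

```ts
// Hypothetical module path — the diff does not show the new file's name.
import scrapeIntoIndexJson from '@/search/scripts/scrape/lib/scrape-into-index-json'

await scrapeIntoIndexJson({
  language: 'en', // only scrape English; notLanguage could be set instead
  outDirectory: './search-scrape-output', // made-up output directory
  versionsToBuild: ['free-pro-team@latest', 'enterprise-server@3.7'],
})
```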
@@ -1,16 +1,27 @@
#!/usr/bin/env node
import path from 'path'
import fs from 'fs/promises'
import assert from 'assert'
import { isArray, isString } from 'lodash-es'

function countArrayValues(arr) {
  const counter = new Map()
  arr.forEach((value) => counter.set(value, (counter.get(value) || 0) + 1))
  return [...counter.entries()].map(([value, count]) => {
    return { value, count }
  })
import type { Record } from '@/search/scripts/scrape/types'

export async function writeIndexRecords(
  name: string,
  records: Record[],
  outDirectory: string,
): Promise<string> {
  validateRecords(name, records)

  const recordsObject = Object.fromEntries(records.map((record) => [record.objectID, record]))
  const content = JSON.stringify(recordsObject, undefined, 0)

  const filePath = path.join(outDirectory, `${name}-records.json`)
  await fs.writeFile(filePath, content)

  return filePath
}

export default function validateRecords(name, records) {
function validateRecords(name: string, records: Record[]): true {
  assert(isString(name) && name.length, '`name` is required')
  assert(isArray(records) && records.length, '`records` must be a non-empty array')

@@ -35,3 +46,11 @@ export default function validateRecords(name, records) {

  return true
}

function countArrayValues(arr: string[]) {
  const counter = new Map()
  arr.forEach((value) => counter.set(value, (counter.get(value) || 0) + 1))
  return [...counter.entries()].map(([value, count]) => {
    return { value, count }
  })
}
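A short usage sketch for the new `writeIndexRecords` export: the import path matches the one used in the scraper diff above and the index name follows the `github-docs-dotcom-en` pattern mentioned there, but the record fields other than `objectID` (and the `as any` cast) are assumptions for illustration.

```ts
import { writeIndexRecords } from '@/search/scripts/scrape/lib/search-index-records'

// Minimal illustrative records keyed by objectID; real records carry more fields.
const records = [
  { objectID: '/en/get-started/quickstart', title: 'Quickstart', content: '...' },
  { objectID: '/en/get-started/onboarding', title: 'Onboarding', content: '...' },
]

// Writes github-docs-dotcom-en-records.json (an objectID-keyed JSON object)
// into the output directory and returns the full file path.
const filePath = await writeIndexRecords('github-docs-dotcom-en', records as any, './search-scrape-output')
console.log(filePath)
```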