Merge pull request #35219 from github/repo-sync

Repo sync
This commit is contained in:
docs-bot 2024-11-07 13:35:34 -05:00 committed by GitHub
Parents aec2b02ee0 1a99ce6a37
Commit 1b5e3de292
No key found matching this signature
GPG key ID: B5690EEEBB952194
122 changed files with 4476 additions and 2462 deletions

View file

@@ -0,0 +1,23 @@
# This file is a template for what your untracked .env file might look like for local development.
# Please copy this to a new .env file and fill in the values as needed.
# Requires a running local Elasticsearch service. Can be started via Docker, see https://github.com/github/docs-engineering/blob/main/docs/elasticsearch/elasticsearch-locally.md
# When this value is unset searches will be proxied to the production Elasticsearch endpoint
ELASTICSEARCH_URL=http://localhost:9200
# Set for sending events in local development. See https://github.com/github/docs-engineering/blob/main/docs/analytics/hydro-mock.md
HYDRO_ENDPOINT=
HYDRO_SECRET=
# Localization variables
# See https://github.com/github/docs-internal/tree/main/src/languages#working-with-translated-content-locally
ENABLED_LANGUAGES=
TRANSLATIONS_ROOT=
# For running the src/search/scripts/scrape script
# You may want a lower value depending on your CPU
BUILD_RECORDS_MAX_CONCURRENT=100
BUILD_RECORDS_MIN_TIME=
# Set to true to enable the /fastly-cache-test route for debugging Fastly headers
ENABLE_FASTLY_TESTING=
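
As a quick usage sketch (the template's filename is assumed here to be `.env.example`; use whatever this template file is actually named in the repo):

```bash
# Copy the template to an untracked .env file and fill in the values you need
cp .env.example .env

# Point local development at a locally running Elasticsearch service
echo "ELASTICSEARCH_URL=http://localhost:9200" >> .env
```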

View file

@@ -1,7 +1,7 @@
name: Index autocomplete Elasticsearch
name: Index autocomplete search in Elasticsearch
# **What it does**: Indexes autocomplete data into Elasticsearch.
# **Why we have it**: So we can power the API for autocomplete.
# **What it does**: Indexes autocomplete data (general and AI search) into Elasticsearch.
# **Why we have it**: So we can power the APIs for autocomplete.
# **Who does it impact**: docs-engineering
on:
@@ -10,7 +10,7 @@ on:
- cron: '20 16 * * *' # Run every day at 16:20 UTC / 8:20 PST
pull_request:
paths:
- .github/workflows/index-autocomplete-elasticsearch.yml
- .github/workflows/index-autocomplete-search.yml
- 'src/search/scripts/index/**'
- 'package*.json'
@@ -40,10 +40,15 @@ jobs:
if: ${{ github.event_name == 'pull_request' }}
run: curl --fail --retry-connrefused --retry 5 -I http://localhost:9200
- name: Run indexing
- name: Run general auto-complete indexing
env:
ELASTICSEARCH_URL: ${{ github.event_name == 'pull_request' && 'http://localhost:9200' || secrets.ELASTICSEARCH_URL }}
run: npm run index -- autocomplete docs-internal-data
run: npm run index-general-autocomplete -- docs-internal-data
- name: Run AI search auto-complete indexing
env:
ELASTICSEARCH_URL: ${{ github.event_name == 'pull_request' && 'http://localhost:9200' || secrets.ELASTICSEARCH_URL }}
run: npm run index-ai-search-autocomplete -- docs-internal-data
- uses: ./.github/actions/slack-alert
if: ${{ failure() && github.event_name == 'schedule' }}
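
For reference, the same two indexing steps can be exercised locally against a Docker-hosted Elasticsearch, assuming a `docs-internal-data` checkout is present in the working directory (a sketch, not part of the workflow):

```bash
# Match the workflow: index into the local Elasticsearch started via Docker
export ELASTICSEARCH_URL=http://localhost:9200

# General autocomplete index, then the AI search autocomplete index
npm run index-general-autocomplete -- docs-internal-data
npm run index-ai-search-autocomplete -- docs-internal-data
```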

View file

@@ -1,6 +1,6 @@
name: Sync search - PR
name: Index general search in Elasticsearch on PR
# **What it does**: This does what `sync-sarch-elasticsearch.yml` does but
# **What it does**: This does what `index-general-search-elasticsearch.yml` does but
# with a localhost Elasticsearch and only for English.
# **Why we have it**: To test that the script works and the popular pages json is valid.
# **Who does it impact**: Docs engineering
@@ -11,8 +11,8 @@ on:
paths:
- 'src/search/**'
- 'package*.json'
# Ultimately, for debugging this workflow itself
- .github/workflows/sync-search-pr.yml
# For debugging this workflow
- .github/workflows/index-general-search-pr.yml
# Make sure we run this if the composite action changes
- .github/actions/setup-elasticsearch/action.yml
@@ -25,9 +25,6 @@ concurrency:
cancel-in-progress: true
env:
# Yes, it's hardcoded but it makes all the steps look exactly the same
# as they do in `sync-search-elasticsearch.yml` where it uses
# that `${{ env.ELASTICSEARCH_URL }}`
ELASTICSEARCH_URL: http://localhost:9200
# Since we'll run in NDOE_ENV=production, we need to be explicit that
# we don't want Hydro configured.
@@ -63,7 +60,7 @@ jobs:
env:
ENABLE_DEV_LOGGING: false
run: |
npm run sync-search-server > /tmp/stdout.log 2> /tmp/stderr.log &
npm run general-search-scrape-server > /tmp/stdout.log 2> /tmp/stderr.log &
# first sleep to give it a chance to start
sleep 6
@@ -88,15 +85,13 @@ jobs:
# let's just accept an empty string instead.
THROW_ON_EMPTY: false
# The sync-search-index recognizes this env var if you don't
# use the `--docs-internal-data <PATH>` option.
DOCS_INTERNAL_DATA: docs-internal-data
run: |
mkdir /tmp/records
npm run sync-search-indices -- /tmp/records \
npm run general-search-scrape -- /tmp/records \
--language en \
--version dotcom
--version fpt
ls -lh /tmp/records
@@ -106,9 +101,9 @@ jobs:
- name: Index into Elasticsearch
run: |
npm run index-elasticsearch -- /tmp/records \
npm run index-general-search -- /tmp/records \
--language en \
--version dotcom
--version fpt
- name: Check created indexes and aliases
run: |
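
Pieced together from the steps above, the local equivalent of this workflow's scrape-and-index flow is roughly the following (a sketch; assumes `ELASTICSEARCH_URL` and `DOCS_INTERNAL_DATA` are exported as in the job's `env` block):

```bash
# Start the minimal rendering server in the background, as the workflow does
npm run general-search-scrape-server > /tmp/stdout.log 2> /tmp/stderr.log &
sleep 6

# Scrape English fpt pages into a temp directory, then index them into Elasticsearch
mkdir -p /tmp/records
npm run general-search-scrape -- /tmp/records --language en --version fpt
npm run index-general-search -- /tmp/records --language en --version fpt
```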

View file

@@ -1,4 +1,4 @@
name: Sync search Elasticsearch
name: Index general search in Elasticsearch
# **What it does**: It scrapes the whole site and dumps the records in a
# temp directory. Then it indexes that into Elasticsearch.
@@ -140,7 +140,7 @@ jobs:
env:
ENABLE_DEV_LOGGING: false
run: |
npm run sync-search-server > /tmp/stdout.log 2> /tmp/stderr.log &
npm run general-search-scrape-server > /tmp/stdout.log 2> /tmp/stderr.log &
# first sleep to give it a chance to start
sleep 6
@@ -169,13 +169,11 @@ jobs:
# the same as not set within the script.
VERSION: ${{ inputs.version }}
# The sync-search-index recognizes this env var if you don't
# use the `--docs-internal-data <PATH>` option.
DOCS_INTERNAL_DATA: docs-internal-data
run: |
mkdir /tmp/records
npm run sync-search-indices -- /tmp/records \
npm run general-search-scrape -- /tmp/records \
--language ${{ matrix.language }}
ls -lh /tmp/records
@@ -186,12 +184,12 @@ jobs:
- name: Index into Elasticsearch
env:
# Must match what we used when scraping (npm run sync-search-indices)
# Must match what we used when scraping (npm run general-search-scrape)
# otherwise the script will seek other versions from disk that might
# not exist.
VERSION: ${{ inputs.version }}
run: |
npm run index-elasticsearch -- /tmp/records \
npm run index-general-search -- /tmp/records \
--language ${{ matrix.language }} \
--stagger-seconds 5 \
--retries 5

6
.gitignore vendored
View file

@@ -51,3 +51,9 @@ assets/images/help/writing/unordered-list-rendered (1).png
# Used by precompute-pageinfo
.pageinfo-cache.json.br
# Cloned and used for indexing Elasticsearch data
docs-internal-data/
# For intermediate data (like scraping for Elasticsearch indexing)
tmp/

View file

@@ -212,3 +212,30 @@ If your appliance averages more than 70% CPU utilization, {% data variables.prod
As part of upgrading GitHub Enterprise Server to version 3.13 or later, the Elasticsearch service will be upgraded. {% data variables.product.company_short %} strongly recommends following the guidance in "[AUTOTITLE](/admin/upgrading-your-instance/performing-an-upgrade/preparing-for-the-elasticsearch-upgrade)."
{% endif %}
{% ifversion ghes > 3.12 and ghes < 3.15 %}
## Undecryptable records
If you are upgrading from {% data variables.product.prodname_ghe_server %} 3.11 or 3.12 to 3.13, or from 3.12 to 3.14, you may run into an issue with undecryptable records due to missing required keys for decryption. The only solution is to delete the undecryptable records. The records impacted by this issue are 2FA records, which means you might need to ask users to re-enable two-factor authentication (2FA).
### Before upgrading
If you are upgrading from {% data variables.product.prodname_ghe_server %} 3.11 or 3.12 to 3.13, or from 3.12 to 3.14, you can run the encryption diagnostics script to identify the undecryptable records ahead of time. This will give you the opportunity to understand the impact and plan for it.
1. Download the [encryption diagnostics script](https://gh.io/ghes-encryption-diagnostics). You can use a command like `curl -L -O https://gh.io/ghes-encryption-diagnostics` to download the script.
1. Save the script to the `/data/user/common` directory on the appliance.
1. Follow the instructions at the top of the script and execute it on the appliance. If there are any undecryptable records, they are logged in `/tmp/column_encryption_records_to_be_deleted.log`. Any records logged here mean that the system was not able to find the keys for them and hence was not able to decrypt the data in those records.
At this stage, please note that these records will be deleted as part of the process. The script will warn you about the users who will need to re-enroll into 2FA after the upgrade. The impacted users' handles are logged in `/tmp/column_encryption_users_to_have_2fa_disabled.log`. These users will need to be re-enrolled into 2FA.
If the script runs into unexpected issues, you will be prompted to [contact {% data variables.contact.github_support %}](/support/contacting-github-support). Errors related to these issues will be logged in `/tmp/column_encryption_unexpected_errors.log`. If you are in a dire situation and are unable to have users re-enroll into 2FA, [contact {% data variables.contact.github_support %}](/support/contacting-github-support) for help.
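
Condensed into commands, the pre-upgrade check looks roughly like this (a sketch; the exact invocation of the script is an assumption, so follow the instructions at the top of the script itself):

```bash
# Download the encryption diagnostics script and place it in /data/user/common
curl -L -O https://gh.io/ghes-encryption-diagnostics
mv ghes-encryption-diagnostics /data/user/common/

# Execute it on the appliance per the instructions at the top of the script (invocation assumed)
bash /data/user/common/ghes-encryption-diagnostics

# Review the records that would be deleted and the users who must re-enroll in 2FA
cat /tmp/column_encryption_records_to_be_deleted.log
cat /tmp/column_encryption_users_to_have_2fa_disabled.log
```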
### During the upgrade
If you did not have the opportunity to run the encryption diagnostics script ahead of time, there are mechanisms in the product to help you. The pre-flight checks during the upgrade process will detect undecryptable records and log them in `/tmp/column_encryption_records_to_be_deleted.log`. The sequence will warn you of the users who will need to re-enable 2FA after the upgrade. The impacted users' records are logged in `/tmp/column_encryption_users_to_have_2fa_disabled.log`.
If undecryptable records are detected, you will be prompted whether you want to proceed with the upgrade or not. If you proceed, the upgrade process deletes the undecryptable records. Otherwise, the upgrade process will exit.
If you have any questions during the upgrade, you can reach out to {% data variables.contact.github_support %}. Once you have had the time and opportunity to understand the impact, you can retrigger the upgrade.
{% endif %}

View file

@@ -80,6 +80,12 @@ For example, you link your Azure subscription to your organization {% ifversion
* You must know your Azure subscription ID. See [Get subscription and tenant IDs in the Azure portal](https://learn.microsoft.com/en-us/azure/azure-portal/get-subscription-tenant-id) in the Microsoft Docs or [contact Azure support](https://azure.microsoft.com/support/).
## Video demonstration of connecting a subscription
To connect an Azure subscription, you'll need appropriate access permissions on both {% data variables.product.product_name %} and the Azure billing portal. This may require coordination between two different people.
To see a demo of the process from beginning to end, see [Billing GitHub consumption through an Azure subscription](https://www.youtube.com/watch?v=Y-f7JKJ4_8Y) on {% data variables.product.company_short %}'s YouTube channel. This video demonstrates the process for an enterprise account. If you're connecting a subscription to an organization account, see "[Connecting your Azure subscription to your organization account](/free-pro-team@latest/billing/managing-the-plan-for-your-github-account/connecting-an-azure-subscription#connecting-your-azure-subscription-to-your-organization-account)."
{% ifversion fpt %}
## Connecting your Azure subscription to your organization account

View file

@@ -35,7 +35,7 @@ Generate end-user query help from .qhelp files.
### Primary Options
#### `<qhelp|mdhelp|query|dir|suite>...`
#### `<qhelpquerysuite>...`
\[Mandatory] Query help files to render. Each argument is one of:

View file

@@ -3,7 +3,7 @@ title: Transcript - "Billing GitHub consumption through an Azure subscription"
intro: Audio and visual transcript.
shortTitle: Billing through Azure
allowTitleToDifferFromFilename: true
product_video: 'https://www.youtube.com/watch?v=DAiIhJKCt8s'
product_video: 'https://www.youtube.com/watch?v=Y-f7JKJ4_8Y'
topics:
- Transcripts
versions:
@@ -27,7 +27,9 @@ And finally, if a Microsoft customer has an Azure discount, it will automaticall
If a Microsoft customer also has a Microsoft Azure Consumption Commitment, or MACC, all future GitHub consumption will decrement their MACC as well.
So what GitHub products are eligible for Azure billing? Any GitHub consumption products are eligible today, meaning products that customers pay for based on actual usage, including Copilot for Business, GitHub-hosted actions, larger hosted runners, GitHub Packages and storage, and GitHub Codespaces. Please note that GitHub Enterprise and GitHub Advanced Security are currently not able to be billed through Azure, but are instead invoiced on an annual basis.
So what GitHub products are eligible for Azure billing? Any GitHub consumption products are eligible today, meaning products that customers pay for based on actual usage, including things like GitHub Copilot, GitHub-hosted actions, larger hosted runners, GitHub Packages and storage, and GitHub Codespaces.
Historically, GitHub Enterprise and Advanced Security were only available through an annual license. However, as of August 1, 2024, they are now also available for metered billing through Azure, for additional flexibility and pay-as-you-go pricing. For existing licensed customers, be sure to connect with your GitHub seller to learn more, as certain restrictions may apply.
[A table shows eligibility for Azure billing and MACCs for the products mentioned. In the table, all products eligible for Azure billing are also eligible for MACCs.]

View file

@@ -5,6 +5,8 @@ sections:
**MEDIUM:** An attacker could steal sensitive information by exploiting a Cross-Site Scripting vulnerability in the repository transfer feature. This exploitation would require social engineering. GitHub has requested CVE ID [CVE-2024-8770](https://www.cve.org/cverecord?id=CVE-2024-8770) for this vulnerability, which was reported via the [GitHub Bug Bounty program](https://bounty.github.com/).
- |
**MEDIUM:** An attacker could push a commit with changes to a workflow using a PAT or OAuth app that lacks the appropriate `workflow` scope by pushing a triple-nested tag pointing at the associated commit. GitHub has requested CVE ID [CVE-2024-8263](https://www.cve.org/cverecord?id=CVE-2024-8263) for this vulnerability, which was reported via the [GitHub Bug Bounty program](https://bounty.github.com/).
- |
**HIGH:** A GitHub App installed in organizations could upgrade some permissions from read to write access without approval from an organization administrator. An attacker would require an account with administrator access to install a malicious GitHub App. GitHub has requested [CVE ID CVE-2024-8810](https://www.cve.org/cverecord?id=CVE-2024-8810) for this vulnerability, which was reported via the [GitHub Bug Bounty Program](https://bounty.github.com/). [Updated: 2024-11-07]
bugs:
- |
For instances deployed on AWS with IMDSv2 enforced, fallback to private IPs was not successful.

View file

@@ -0,0 +1,34 @@
date: '2024-11-07'
sections:
security_fixes:
- |
**HIGH**: An attacker could bypass SAML single sign-on (SSO) authentication with the optional encrypted assertions feature, allowing unauthorized provisioning of users and access to the instance, by exploiting an improper verification of cryptographic signatures vulnerability in GitHub Enterprise Server. This is a follow up fix for [CVE-2024-9487](https://www.cve.org/cverecord?id=CVE-2024-9487) to further harden the encrypted assertions feature against this type of attack. Please note that encrypted assertions are not enabled by default. Instances not utilizing SAML SSO, or utilizing SAML SSO authentication without encrypted assertions, are not impacted. Additionally, an attacker would require direct network access as well as a signed SAML response or metadata document to exploit this vulnerability.
known_issues:
- |
Custom firewall rules are removed during the upgrade process.
- |
During the validation phase of a configuration run, a `No such object` error may occur for the Notebook and Viewscreen services. This error can be ignored as the services should still correctly start.
- |
If the root site administrator is locked out of the Management Console after failed login attempts, the account does not unlock automatically after the defined lockout time. Someone with administrative SSH access to the instance must unlock the account using the administrative shell. For more information, see "[AUTOTITLE](/admin/configuration/administering-your-instance-from-the-management-console/troubleshooting-access-to-the-management-console#unlocking-the-root-site-administrator-account)."
- |
The `mbind: Operation not permitted` error in the `/var/log/mysql/mysql.err` file can be ignored. MySQL 8 does not gracefully handle when the `CAP_SYS_NICE` capability isn't required, and outputs an error instead of a warning.
- |
{% data reusables.release-notes.2023-11-aws-system-time %}
- |
On an instance with the HTTP `X-Forwarded-For` header configured for use behind a load balancer, all client IP addresses in the instance's audit log erroneously appear as 127.0.0.1.
- |
{% data reusables.release-notes.2023-10-git-push-made-but-not-registered %}
- |
{% data reusables.release-notes.large-adoc-files-issue %}
- |
{% data reusables.release-notes.2024-01-haproxy-upgrade-causing-increased-errors %}
- |
The `reply.[HOSTNAME]` subdomain is falsely always displaying as having no SSL and DNS record, when testing the domain settings via the Management Console without subdomain isolation.
- |
Admin stats REST API endpoints may timeout on appliances with many users or repositories. Retrying the request until data is returned is advised.
- |
{% data reusables.release-notes.2024-06-possible-frontend-5-minute-outage-during-hotpatch-upgrade %}
- |
When restoring from a backup snapshot, a large number of `mapper_parsing_exception` errors may be displayed.
- |
Services may respond with a `503` status due to an out of date `haproxy` configuration. This can usually be resolved with a `ghe-config-apply` run.

View file

@@ -5,6 +5,8 @@ sections:
**MEDIUM:** An attacker could steal sensitive information by exploiting a Cross-Site Scripting vulnerability in the repository transfer feature. This exploitation would require social engineering. GitHub has requested CVE ID [CVE-2024-8770](https://www.cve.org/cverecord?id=CVE-2024-8770) for this vulnerability, which was reported via the [GitHub Bug Bounty program](https://bounty.github.com/).
- |
**MEDIUM:** An attacker could push a commit with changes to a workflow using a PAT or OAuth app that lacks the appropriate `workflow` scope by pushing a triple-nested tag pointing at the associated commit. GitHub has requested CVE ID [CVE-2024-8263](https://www.cve.org/cverecord?id=CVE-2024-8263) for this vulnerability, which was reported via the [GitHub Bug Bounty program](https://bounty.github.com/).
- |
**HIGH:** A GitHub App installed in organizations could upgrade some permissions from read to write access without approval from an organization administrator. An attacker would require an account with administrator access to install a malicious GitHub App. GitHub has requested [CVE ID CVE-2024-8810](https://www.cve.org/cverecord?id=CVE-2024-8810) for this vulnerability, which was reported via the [GitHub Bug Bounty Program](https://bounty.github.com/). [Updated: 2024-11-07]
bugs:
- |
For instances deployed on AWS with IMDSv2 enforced, fallback to private IPs was not successful.

View file

@@ -0,0 +1,60 @@
date: '2024-11-07'
sections:
security_fixes:
- |
**HIGH**: An attacker could bypass SAML single sign-on (SSO) authentication with the optional encrypted assertions feature, allowing unauthorized provisioning of users and access to the instance, by exploiting an improper verification of cryptographic signatures vulnerability in GitHub Enterprise Server. This is a follow up fix for [CVE-2024-9487](https://www.cve.org/cverecord?id=CVE-2024-9487) to further harden the encrypted assertions feature against this type of attack. Please note that encrypted assertions are not enabled by default. Instances not utilizing SAML SSO, or utilizing SAML SSO authentication without encrypted assertions, are not impacted. Additionally, an attacker would require direct network access as well as a signed SAML response or metadata document to exploit this vulnerability.
- |
**HIGH**: An attacker could achieve container escape and privilege escalation to root by exploiting a path collision and arbitrary code execution via the `ghe-firejail` path. GitHub has requested CVE ID [CVE-2024-10007](https://www.cve.org/cverecord?id=CVE-2024-10007) for this vulnerability, which was reported via the [GitHub Bug Bounty program](https://bounty.github.com/).
bugs:
- |
This error message `mbind: Operation not permitted` was repeatedly showing in the `/var/log/mysql/mysql.err` MySQL logs.
- |
When saving settings in the Management Console, the configuration run would stop if the `enterprise-manage` process was restarted.
- |
A missing configuration value prevented Dependabot from creating group update pull requests.
- |
On an instance with GitHub Actions enabled, some maintenance tasks could fail due to incomplete upgrade steps during previous upgrades to new releases of GitHub Enterprise Server.
- |
The initial setup certificate generation in AWS took longer than expected due to fallback to private IPs. The time for this fallback has been reduced.
- |
If the primary instance was unreachable, running `ghe-repl-stop --force` on a replica would fail during the config apply run.
- |
When restoring from a backup, repositories that had been deleted in the last 90 days were not completely restored.
- |
Restoring Git repositories using `backup-utils` occasionally failed.
- |
Some customers upgrading from 3.11 to 3.13 may experience issues with undecryptable records during the upgrade. This issue has now been resolved. We recommend you read "[Undecryptable records](/enterprise-server@3.13/admin/upgrading-your-instance/troubleshooting-upgrades/known-issues-with-upgrades-to-your-instance#undecryptable-records)."
changes:
- |
For instances deployed on AWS, the default settings for Chrony NTP synchronization have been aligned with AWS's suggested default configurations.
known_issues:
- |
Custom firewall rules are removed during the upgrade process.
- |
During the validation phase of a configuration run, a `No such object` error may occur for the Notebook and Viewscreen services. This error can be ignored as the services should still correctly start.
- |
If the root site administrator is locked out of the Management Console after failed login attempts, the account does not unlock automatically after the defined lockout time. Someone with administrative SSH access to the instance must unlock the account using the administrative shell. See "[AUTOTITLE](/admin/configuration/administering-your-instance-from-the-management-console/troubleshooting-access-to-the-management-console#unlocking-the-root-site-administrator-account)."
- |
The `mbind: Operation not permitted` error in the `/var/log/mysql/mysql.err` file can be ignored. MySQL 8 does not gracefully handle when the `CAP_SYS_NICE` capability isn't required, and outputs an error instead of a warning.
- |
{% data reusables.release-notes.2023-11-aws-system-time %}
- |
On an instance with the HTTP `X-Forwarded-For` header configured for use behind a load balancer, all client IP addresses in the instance's audit log erroneously appear as 127.0.0.1.
- |
{% data reusables.release-notes.2023-10-git-push-made-but-not-registered %}
- |
{% data reusables.release-notes.large-adoc-files-issue %}
- |
{% data reusables.release-notes.2024-01-haproxy-upgrade-causing-increased-errors %}
- |
Repositories originally imported using `ghe-migrator` will not correctly track GitHub Advanced Security contributions.
- |
The `reply.[HOSTNAME]` subdomain is falsely always displaying as having no SSL and DNS record, when testing the domain settings via the Management Console without subdomain isolation.
- |
Admin stats REST API endpoints may timeout on appliances with many users or repositories. Retrying the request until data is returned is advised.
- |
{% data reusables.release-notes.2024-06-possible-frontend-5-minute-outage-during-hotpatch-upgrade %}
- |
When restoring from a backup snapshot, a large number of `mapper_parsing_exception` errors may be displayed.
- |
Services may respond with a `503` status due to an out of date `haproxy` configuration. This can usually be resolved with a `ghe-config-apply` run.

View file

@@ -0,0 +1,58 @@
date: '2024-11-07'
sections:
security_fixes:
- |
**HIGH**: An attacker could bypass SAML single sign-on (SSO) authentication with the optional encrypted assertions feature, allowing unauthorized provisioning of users and access to the instance, by exploiting an improper verification of cryptographic signatures vulnerability in GitHub Enterprise Server. This is a follow up fix for [CVE-2024-9487](https://www.cve.org/cverecord?id=CVE-2024-9487) to further harden the encrypted assertions feature against this type of attack. Please note that encrypted assertions are not enabled by default. Instances not utilizing SAML SSO, or utilizing SAML SSO authentication without encrypted assertions, are not impacted. Additionally, an attacker would require direct network access as well as a signed SAML response or metadata document to exploit this vulnerability.
- |
**HIGH**: An attacker could achieve container escape and privilege escalation to root by exploiting a path collision and arbitrary code execution via the `ghe-firejail` path. GitHub has requested CVE ID [CVE-2024-10007](https://www.cve.org/cverecord?id=CVE-2024-10007) for this vulnerability, which was reported via the [GitHub Bug Bounty program](https://bounty.github.com/).
bugs:
- |
This error message `mbind: Operation not permitted` was repeatedly showing in the `/var/log/mysql/mysql.err` MySQL logs.
- |
When saving settings in the Management Console, the configuration run would stop if the `enterprise-manage` process was restarted.
- |
A missing configuration value prevented Dependabot from creating group update pull requests.
- |
On an instance with GitHub Actions enabled, some maintenance tasks could fail due to incomplete upgrade steps during previous upgrades to new releases of GitHub Enterprise Server.
- |
The initial setup certificate generation in AWS took longer than expected due to fallback to private IPs. The time for this fallback has been reduced.
- |
If the primary instance was unreachable, running `ghe-repl-stop --force` on a replica would fail during the config apply run.
- |
When restoring from a backup, repositories that had been deleted in the last 90 days were not completely restored.
- |
Restoring Git repositories using backup-utils occasionally failed.
- |
Organizations were limited to using 100 Actions organization variables instead of 1,000.
- |
Some customers upgrading from 3.12 to 3.13 or to 3.14 may experience issues with undecryptable records during the upgrade. This issue has now been resolved. We recommend you read "[Undecryptable records](/enterprise-server@3.14/admin/upgrading-your-instance/troubleshooting-upgrades/known-issues-with-upgrades-to-your-instance#undecryptable-records)."
changes:
- |
For instances deployed on AWS, the default settings for Chrony NTP synchronization have been aligned with AWS's suggested default configurations.
known_issues:
- |
Custom firewall rules are removed during the upgrade process.
- |
During the validation phase of a configuration run, a `No such object` error may occur for the Notebook and Viewscreen services. This error can be ignored as the services should still correctly start.
- |
If the root site administrator is locked out of the Management Console after failed login attempts, the account does not unlock automatically after the defined lockout time. Someone with administrative SSH access to the instance must unlock the account using the administrative shell. See "[AUTOTITLE](/admin/configuration/administering-your-instance-from-the-management-console/troubleshooting-access-to-the-management-console#unlocking-the-root-site-administrator-account)."
- |
The `mbind: Operation not permitted` error in the `/var/log/mysql/mysql.err` file can be ignored. MySQL 8 does not gracefully handle when the `CAP_SYS_NICE` capability isn't required, and outputs an error instead of a warning.
- |
{% data reusables.release-notes.2023-11-aws-system-time %}
- |
On an instance with the HTTP `X-Forwarded-For` header configured for use behind a load balancer, all client IP addresses in the instance's audit log erroneously appear as 127.0.0.1.
- |
{% data reusables.release-notes.large-adoc-files-issue %}
- |
Repositories originally imported using `ghe-migrator` will not correctly track GitHub Advanced Security contributions.
- |
The `reply.[HOSTNAME]` subdomain is falsely always displaying as having no SSL and DNS record, when testing the domain settings via the Management Console without subdomain isolation.
- |
Admin stats REST API endpoints may timeout on appliances with many users or repositories. Retrying the request until data is returned is advised.
- |
{% data reusables.release-notes.2024-06-possible-frontend-5-minute-outage-during-hotpatch-upgrade %}
- |
When restoring from a backup snapshot, a large number of `mapper_parsing_exception` errors may be displayed.
- |
Services may respond with a `503` status due to an out of date `haproxy` configuration. This can usually be resolved with a `ghe-config-apply` run.

View file

@@ -5,6 +5,8 @@ sections:
**MEDIUM:** An attacker could steal sensitive information by exploiting a Cross-Site Scripting vulnerability in the repository transfer feature. This exploitation would require social engineering. GitHub has requested CVE ID [CVE-2024-8770](https://www.cve.org/cverecord?id=CVE-2024-8770) for this vulnerability, which was reported via the [GitHub Bug Bounty program](https://bounty.github.com/).
- |
**MEDIUM:** An attacker could push a commit with changes to a workflow using a PAT or OAuth app that lacks the appropriate `workflow` scope by pushing a triple-nested tag pointing at the associated commit. GitHub has requested CVE ID [CVE-2024-8263](https://www.cve.org/cverecord?id=CVE-2024-8263) for this vulnerability, which was reported via the [GitHub Bug Bounty program](https://bounty.github.com/).
- |
**HIGH:** A GitHub App installed in organizations could upgrade some permissions from read to write access without approval from an organization administrator. An attacker would require an account with administrator access to install a malicious GitHub App. GitHub has requested [CVE ID CVE-2024-8810](https://www.cve.org/cverecord?id=CVE-2024-8810) for this vulnerability, which was reported via the [GitHub Bug Bounty Program](https://bounty.github.com/). [Updated: 2024-11-07]
bugs:
- |
For instances deployed on AWS with IMDSv2 enforced, fallback to private IPs was not successful.

View file

@@ -23,9 +23,9 @@ sections:
- |
**MEDIUM:** An attacker could have unauthorized read access to issue content inside an internal repository via GitHub projects. This attack required attacker access to the corresponding project board. GitHub has requested CVE ID [CVE-2024-5817](https://nvd.nist.gov/vuln/detail/CVE-2024-5817) for this vulnerability, which was reported via the [GitHub Bug Bounty program](https://bounty.github.com/).
- |
An attacker could access previously executed private required workflows by changing the repository visibility from private to public. This occurred despite the repositories with the required workflows remaining private. This vulnerability was reported via the [GitHub Bug Bounty program](https://bounty.github.com/).
**MEDIUM**: An attacker could gain unauthorized access to secret scanning alert data because the [REST API secret scanning endpoint](/rest/secret-scanning/secret-scanning?apiVersion=2022-11-28) did not properly verify whether the user account has the business owner role. Only organization members can exploit this vulnerability, requiring a {% data variables.product.pat_generic %} (PAT) with `repo` or `security_events` scopes, limiting exposure to internal actors. Exploitation also required secret scanning to be enabled on user-owned repositories. GitHub has requested CVE ID [CVE-2024-10824](https://www.cve.org/CVERecord?id=CVE-2024-10824) for this vulnerability. [Updated: 2024-11-07]
- |
A user without the enterprise owner role could view all secret scanning alerts for user-owned repositories using the REST API. Alerts in user-owned repositories are now properly restricted to only be visible to enterprise owners.
An attacker could access previously executed private required workflows by changing the repository visibility from private to public. This occurred despite the repositories with the required workflows remaining private. This vulnerability was reported via the [GitHub Bug Bounty program](https://bounty.github.com/).
- |
Packages have been updated to the latest security versions.
bugs:

View file

@@ -5,6 +5,8 @@ sections:
**MEDIUM:** An attacker could steal sensitive information by exploiting a Cross-Site Scripting vulnerability in the repository transfer feature. This exploitation would require social engineering. GitHub has requested CVE ID [CVE-2024-8770](https://www.cve.org/cverecord?id=CVE-2024-8770) for this vulnerability, which was reported via the [GitHub Bug Bounty program](https://bounty.github.com/).
- |
**MEDIUM:** An attacker could push a commit with changes to a workflow using a PAT or OAuth app that lacks the appropriate `workflow` scope by pushing a triple-nested tag pointing at the associated commit. GitHub has requested CVE ID [CVE-2024-8263](https://www.cve.org/cverecord?id=CVE-2024-8263) for this vulnerability, which was reported via the [GitHub Bug Bounty program](https://bounty.github.com/).
- |
**HIGH:** A GitHub App installed in organizations could upgrade some permissions from read to write access without approval from an organization administrator. An attacker would require an account with administrator access to install a malicious GitHub App. GitHub has requested [CVE ID CVE-2024-8810](https://www.cve.org/cverecord?id=CVE-2024-8810) for this vulnerability, which was reported via the [GitHub Bug Bounty Program](https://bounty.github.com/). [Updated: 2024-11-07]
bugs:
- |
For instances deployed on AWS with IMDSv2 enforced, fallback to private IPs was not successful.

View file

@@ -0,0 +1,62 @@
date: '2024-11-07'
sections:
security_fixes:
- |
Elasticsearch packages have been updated to the latest security versions.
- |
**HIGH**: An attacker could bypass SAML single sign-on (SSO) authentication with the optional encrypted assertions feature, allowing unauthorized provisioning of users and access to the instance, by exploiting an improper verification of cryptographic signatures vulnerability in GitHub Enterprise Server. This is a follow up fix for [CVE-2024-9487](https://www.cve.org/cverecord?id=CVE-2024-9487) to further harden the encrypted assertions feature against this type of attack. Please note that encrypted assertions are not enabled by default. Instances not utilizing SAML SSO, or utilizing SAML SSO authentication without encrypted assertions, are not impacted. Additionally, an attacker would require direct network access as well as a signed SAML response or metadata document to exploit this vulnerability.
- |
**HIGH**: An attacker could achieve container escape and privilege escalation to root by exploiting a path collision and arbitrary code execution via the `ghe-firejail` path. GitHub has requested CVE ID [CVE-2024-10007](https://www.cve.org/cverecord?id=CVE-2024-10007) for this vulnerability, which was reported via the [GitHub Bug Bounty program](https://bounty.github.com/).
bugs:
- |
A missing configuration value prevented Dependabot from creating group update pull requests.
- |
When saving settings in the Management Console, the configuration run would stop if the `enterprise-manage` process was restarted.
- |
On an instance with GitHub Actions enabled, some maintenance tasks could fail due to incomplete upgrade steps during previous upgrades to new releases of GitHub Enterprise Server.
- |
The initial setup certificate generation in AWS took longer than expected due to fallback to private IPs. The time for this fallback has been reduced.
- |
The `ghe-support-bundle` generation would fail when the `aqueduct-lite` service is down.
- |
If the primary instance was unreachable, running `ghe-repl-stop --force` on a replica would fail during the config apply run.
- |
For instances that use the mandatory message feature, logging in to certain URLs may have caused a 500 error.
- |
When restoring from a backup, repositories that had been deleted in the last 90 days were not completely restored.
- |
Restoring Git repositories using backup-utils occasionally failed.
- |
Enterprise installations experienced unpredictable repository search results due to the default 4,000 repository limit. A relaxed repository filter mode, which includes all single-tenant organization repositories and bypasses the limit, has been introduced. Administrators can enable this mode using `ghe-config app.github.enterprise-repo-search-filter-enabled true && ghe-config-apply`.
- |
Organizations were limited to using 100 Actions organization variables instead of 1,000.
- |
Running `config-apply` became stuck under certain circumstances due to a misconfiguration with Packages and Elasticsearch.
- |
Some customers upgrading to 3.13 may experience issues with undecryptable records during the upgrade. This issue has now been resolved. We recommend you read "[Undecryptable records](/admin/upgrading-your-instance/troubleshooting-upgrades/known-issues-with-upgrades-to-your-instance#undecryptable-records)."
changes:
- |
When connecting to an appliance via SSH, a notification about upcoming root disk changes displays.
known_issues:
- |
During the validation phase of a configuration run, a `No such object` error may occur for the Notebook and Viewscreen services. This error can be ignored as the services should still correctly start.
- |
If the root site administrator is locked out of the Management Console after failed login attempts, the account does not unlock automatically after the defined lockout time. Someone with administrative SSH access to the instance must unlock the account using the administrative shell. See "[AUTOTITLE](/admin/configuration/administering-your-instance-from-the-management-console/troubleshooting-access-to-the-management-console#unlocking-the-root-site-administrator-account)."
- |
On an instance with the HTTP `X-Forwarded-For` header configured for use behind a load balancer, all client IP addresses in the instance's audit log erroneously appear as 127.0.0.1.
- |
Repositories originally imported using `ghe-migrator` will not correctly track GitHub Advanced Security contributions.
- |
For an instance in a cluster configuration and with GitHub Actions enabled, restoring a cluster from backup requires targeting the primary DB node.
- |
When following the steps for [Replacing the primary MySQL node](/admin/monitoring-managing-and-updating-your-instance/configuring-clustering/replacing-a-cluster-node#replacing-the-primary-mysql-node), step 14 (running `ghe-cluster-config-apply`) might fail with errors. If this occurs, re-running `ghe-cluster-config-apply` is expected to succeed.
- |
Running a `config apply` as part of the steps for [Replacing a node in an emergency](/admin/monitoring-managing-and-updating-your-instance/configuring-clustering/replacing-a-cluster-node#replacing-a-node-in-an-emergency) may fail with errors if the node being replaced is still reachable. If this occurs, shutdown the node and repeat the steps.
- |
{% data reusables.release-notes.2024-06-possible-frontend-5-minute-outage-during-hotpatch-upgrade %}
- |
When restoring data originally backed up from a 3.13 appliance onto a 3.13 appliance, the elasticsearch indices need to be reindexed before some of the data will show up. This happens via a nightly scheduled job. It can also be forced by running `/usr/local/share/enterprise/ghe-es-search-repair`.
- |
When restoring from a backup snapshot, a large number of `mapper_parsing_exception` errors may be displayed.
- |
Services may respond with a `503` status due to an out of date `haproxy` configuration. This can usually be resolved with a `ghe-config-apply` run.

View file

@@ -3,6 +3,8 @@ sections:
security_fixes:
- |
**MEDIUM:** An attacker could steal sensitive information by exploiting a Cross-Site Scripting vulnerability in the repository transfer feature. This exploitation would require social engineering. GitHub has requested CVE ID [CVE-2024-8770](https://www.cve.org/cverecord?id=CVE-2024-8770) for this vulnerability, which was reported via the [GitHub Bug Bounty program](https://bounty.github.com/).
- |
**HIGH:** A GitHub App installed in organizations could upgrade some permissions from read to write access without approval from an organization administrator. An attacker would require an account with administrator access to install a malicious GitHub App. GitHub has requested [CVE ID CVE-2024-8810](https://www.cve.org/cverecord?id=CVE-2024-8810) for this vulnerability, which was reported via the [GitHub Bug Bounty Program](https://bounty.github.com/). [Updated: 2024-11-07]
bugs:
- |
On an instance with GitHub Actions enabled, due to an insufficient wait time, MS SQL and MySQL replication could fail with the error message `Failed to start nomad service!`.

View file

@@ -0,0 +1,76 @@
date: '2024-11-07'
sections:
security_fixes:
- |
Elasticsearch packages have been updated to the latest security versions.
- |
Packages have been updated to the latest security version.
- |
**HIGH**: An attacker could bypass SAML single sign-on (SSO) authentication with the optional encrypted assertions feature, allowing unauthorized provisioning of users and access to the instance, by exploiting an improper verification of cryptographic signatures vulnerability in GitHub Enterprise Server. This is a follow up fix for [CVE-2024-9487](https://www.cve.org/cverecord?id=CVE-2024-9487) to further harden the encrypted assertions feature against this type of attack. Please note that encrypted assertions are not enabled by default. Instances not utilizing SAML SSO, or utilizing SAML SSO authentication without encrypted assertions, are not impacted. Additionally, an attacker would require direct network access as well as a signed SAML response or metadata document to exploit this vulnerability.
- |
**HIGH**: An attacker could achieve container escape and privilege escalation to root by exploiting a path collision and arbitrary code execution via the `ghe-firejail` path. GitHub has requested CVE ID [CVE-2024-10007](https://www.cve.org/cverecord?id=CVE-2024-10007) for this vulnerability, which was reported via the [GitHub Bug Bounty program](https://bounty.github.com/).
bugs:
- |
When saving settings in the Management Console, the configuration run would stop if the `enterprise-manage` process was restarted.
- |
On an instance with GitHub Actions enabled, some maintenance tasks could fail due to incomplete upgrade steps during previous upgrades to new releases of GitHub Enterprise Server.
- |
A repeated error message concerning connectivity to port 6002 was emitted to the system logs when GitHub Actions was enabled.
- |
The initial setup certificate generation in AWS took longer than expected due to fallback to private IPs. The time for this fallback has been reduced.
- |
The `ghe-support-bundle` generation would fail when the `aqueduct-lite` service is down.
- |
If the primary instance was unreachable, running `ghe-repl-stop --force` on a replica would fail during the config apply run.
- |
Administrators in the SCIM private beta (versions < 3.14) that decided to upgrade their private beta appliance see an incorrectly checked box in the "SCIM Configuration" section of the Enterprise settings authentication security page in 3.14.
- |
On instances that use the mandatory message feature, logging in to certain URLs may have caused a 500 error.
- |
When restoring from a backup, repositories that had been deleted in the last 90 days were not completely restored.
- |
For instances that use secret scanning, custom messages for push protection set by the enterprise did not display to users.
- |
Restoring Git repositories using `backup-utils` occasionally failed.
- |
Enterprise installations experienced unpredictable repository search results due to the default 4,000 repository limit. A relaxed repository filter mode, which includes all single-tenant organization repositories and bypasses the limit, has been introduced. Administrators can enable this mode using `ghe-config app.github.enterprise-repo-search-filter-enabled true && ghe-config-apply`.
- |
Running `config-apply` became stuck under certain circumstances due to a misconfiguration with Packages and Elasticsearch.
- |
Audit log events for secret scanning alerts incorrectly displayed a blank secret type when generated for a custom pattern.
- |
Some customers upgrading to 3.14 may experience issues with undecryptable records during the upgrade. This issue has now been resolved. We recommend you read "[Undecryptable records](/admin/upgrading-your-instance/troubleshooting-upgrades/known-issues-with-upgrades-to-your-instance#undecryptable-records)."
changes:
- |
When connecting to an appliance via SSH, a notification about upcoming root disk changes displays.
known_issues:
- |
During the validation phase of a configuration run, a `No such object` error may occur for the Notebook and Viewscreen services. This error can be ignored as the services should still correctly start.
- |
If the root site administrator is locked out of the Management Console after failed login attempts, the account does not unlock automatically after the defined lockout time. Someone with administrative SSH access to the instance must unlock the account using the administrative shell. See "[AUTOTITLE](/admin/configuration/administering-your-instance-from-the-management-console/troubleshooting-access-to-the-management-console#unlocking-the-root-site-administrator-account)."
- |
On an instance with the HTTP `X-Forwarded-For` header configured for use behind a load balancer, all client IP addresses in the instance's audit log erroneously appear as 127.0.0.1.
- |
{% data reusables.release-notes.large-adoc-files-issue %}
- |
Repositories originally imported using `ghe-migrator` will not correctly track GitHub Advanced Security contributions.
- |
Admin stats REST API endpoints may timeout on appliances with many users or repositories. Retrying the request until data is returned is advised.
- |
When following the steps for [Replacing the primary MySQL node](/admin/monitoring-managing-and-updating-your-instance/configuring-clustering/replacing-a-cluster-node#replacing-the-primary-mysql-node), step 14 (running `ghe-cluster-config-apply`) might fail with errors. If this occurs, re-running `ghe-cluster-config-apply` is expected to succeed.
- |
Running a `config apply` as part of the steps for [Replacing a node in an emergency](/admin/monitoring-managing-and-updating-your-instance/configuring-clustering/replacing-a-cluster-node#replacing-a-node-in-an-emergency) may fail with errors if the node being replaced is still reachable. If this occurs, shutdown the node and repeat the steps.
- |
{% data reusables.release-notes.2024-06-possible-frontend-5-minute-outage-during-hotpatch-upgrade %}
- |
When restoring data originally backed up from a 3.13 appliance onto a 3.13 appliance, the Elasticsearch indices need to be reindexed before some of the data will show up. This happens via a nightly scheduled job. It can also be forced by running `/usr/local/share/enterprise/ghe-es-search-repair`.
- |
An organization-level code scanning configuration page is displayed on instances that do not use GitHub Advanced Security or code scanning.
- |
In the header bar displayed to site administrators, some icons are not available.
- |
When enabling automatic update checks for the first time in the Management Console, the status is not dynamically reflected until the "Updates" page is reloaded.
- |
When restoring from a backup snapshot, a large number of `mapper_parsing_exception` errors may be displayed.
- |
Services may respond with a `503` status due to an out of date `haproxy` configuration. This can usually be resolved with a `ghe-config-apply` run.

View file

@@ -35,7 +35,7 @@
| [Use of `Kernel.open` or `IO.read` or similar sinks with a non-constant value](https://codeql.github.com/codeql-query-help/ruby/rb-non-constant-kernel-open/) | 078, 088, 073 | {% octicon "check" aria-label="Included" %} | {% octicon "check" aria-label="Included" %} | {% octicon "check" aria-label="Included" %} |
| [Use of `Kernel.open`, `IO.read` or similar sinks with user-controlled input](https://codeql.github.com/codeql-query-help/ruby/rb-kernel-open/) | 078, 088, 073 | {% octicon "check" aria-label="Included" %} | {% octicon "check" aria-label="Included" %} | {% octicon "check" aria-label="Included" %} |
| [Use of a broken or weak cryptographic algorithm](https://codeql.github.com/codeql-query-help/ruby/rb-weak-cryptographic-algorithm/) | 327 | {% octicon "check" aria-label="Included" %} | {% octicon "check" aria-label="Included" %} | {% octicon "check" aria-label="Included" %} |
| [Use of a broken or weak cryptographic hashing algorithm on sensitive data](https://codeql.github.com/codeql-query-help/ruby/rb-weak-sensitive-data-hashing/) | 327, 328, 916 | {% octicon "check" aria-label="Included" %} | {% octicon "check" aria-label="Included" %} | {% octicon "x" aria-label="Not included" %} |
| [Use of a broken or weak cryptographic hashing algorithm on sensitive data](https://codeql.github.com/codeql-query-help/ruby/rb-weak-sensitive-data-hashing/) | 327, 328, 916 | {% octicon "check" aria-label="Included" %} | {% octicon "check" aria-label="Included" %} | {% octicon "check" aria-label="Included" %} |
| [Use of externally-controlled format string](https://codeql.github.com/codeql-query-help/ruby/rb-tainted-format-string/) | 134 | {% octicon "check" aria-label="Included" %} | {% octicon "check" aria-label="Included" %} | {% octicon "check" aria-label="Included" %} |
| [Weak cookie configuration](https://codeql.github.com/codeql-query-help/ruby/rb-weak-cookie-configuration/) | 732, 1275 | {% octicon "check" aria-label="Included" %} | {% octicon "check" aria-label="Included" %} | {% octicon "check" aria-label="Included" %} |
| [XML external entity expansion](https://codeql.github.com/codeql-query-help/ruby/rb-xxe/) | 611, 776, 827 | {% octicon "check" aria-label="Included" %} | {% octicon "check" aria-label="Included" %} | {% octicon "check" aria-label="Included" %} |

View file

@@ -64,4 +64,5 @@
| GHD038 | expired-content | Expired content must be remediated. | error | expired |
| GHD039 | expiring-soon | Content that expires soon should be proactively addressed. | warning | expired |
| [GHD040](https://github.com/github/docs/blob/main/src/content-linter/README.md) | table-liquid-versioning | Tables must use the correct liquid versioning format | error | tables |
| GHD041 | third-party-action-pinning | Code examples that use third-party actions must always pin to a full length commit SHA | error | feature, actions |
| GHD041 | third-party-action-pinning | Code examples that use third-party actions must always pin to a full length commit SHA | error | feature, actions |
| GHD042 | liquid-tag-whitespace | Liquid tags should start and end with one whitespace. Liquid tag arguments should be separated by only one whitespace. | error | liquid, format |

View file

@@ -2,3 +2,5 @@ To pay for licenses, you must connect your enterprise to an Azure subscription.
* "[Prerequisites](/billing/managing-the-plan-for-your-github-account/connecting-an-azure-subscription#prerequisites)"
* "[Connecting your Azure subscription to an enterprise account](/billing/managing-the-plan-for-your-github-account/connecting-an-azure-subscription#connecting-your-azure-subscription-to-your-enterprise-account)"
If you prefer a visual overview of the process, watch [Billing {% data variables.product.company_short %} consumption through an Azure subscription](https://www.youtube.com/watch?v=Y-f7JKJ4_8Y) on our YouTube channel.

View file

@@ -5,6 +5,6 @@ CA certificate key too weak
```
To resolve this issue, confirm that your certificate complies
with level 2 of the OpenSSL security specification. For more information, see [SSL_CTX_set_security_level](https://www.openssl.org/docs/man1.1.1/man3/SSL_CTX_set_security_level.html#DEFAULT-CALLBACK-BEHAVIOUR) in the OpenSSL docs. For more information about reviewing your instance's logs, see "[AUTOTITLE](/admin/monitoring-and-managing-your-instance/monitoring-your-instance/about-system-logs#system-logs-in-the-systemd-journal)".
with level 2 of the OpenSSL security specification. For more information, see [SSL_CTX_set_security_level](https://www.openssl.org/docs/man1.1.1/man3/SSL_CTX_set_security_level.html#DEFAULT-CALLBACK-BEHAVIOUR) in the OpenSSL docs. For more information about reviewing your instance's logs, see "[AUTOTITLE](/admin/monitoring-and-managing-your-instance/monitoring-your-instance/about-system-logs#system-logs-in-the-systemd-journal)."
If the error appears in `babeld` logs because your TLS certificate does not comply with level 2 of the specification, you must create and upload a new certificate with stronger security before you upgrade to GitHub Enterprise Server 3.10 or later. For more information, see "[AUTOTITLE](/admin/configuration/hardening-security-for-your-enterprise/configuring-tls)."
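
One way to check an existing certificate before upgrading is to inspect its key size and signature algorithm with OpenSSL (a sketch; the certificate filename is a placeholder):

```bash
# A 2048-bit or larger RSA key with a SHA-256 signature satisfies security level 2
openssl x509 -in github-enterprise.example.com.crt -noout -text | grep -E "Public-Key|Signature Algorithm"
```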

27
package-lock.json generated
View file

@@ -109,10 +109,13 @@
"@octokit/rest": "21.0.2",
"@playwright/test": "^1.48.1",
"@types/accept-language-parser": "1.5.6",
"@types/cheerio": "^0.22.35",
"@types/connect-datadog": "0.0.10",
"@types/connect-timeout": "0.0.39",
"@types/cookie": "0.6.0",
"@types/cookie-parser": "1.4.7",
"@types/elasticsearch": "^5.0.43",
"@types/event-to-promise": "^0.7.5",
"@types/express": "4.17.21",
"@types/imurmurhash": "^0.1.4",
"@types/js-cookie": "^3.0.6",
@@ -3165,6 +3168,15 @@
"integrity": "sha512-hWtVTC2q7hc7xZ/RLbxapMvDMgUnDvKvMOpKal4DrMyfGBUfB1oKaZlIRr6mJL+If3bAP6sV/QneGzF6tJjZDg==",
"dev": true
},
"node_modules/@types/cheerio": {
"version": "0.22.35",
"resolved": "https://registry.npmjs.org/@types/cheerio/-/cheerio-0.22.35.tgz",
"integrity": "sha512-yD57BchKRvTV+JD53UZ6PD8KWY5g5rvvMLRnZR3EQBCZXiDT/HR+pKpMzFGlWNhFrXlo7VPZXtKvIEwZkAWOIA==",
"dev": true,
"dependencies": {
"@types/node": "*"
}
},
"node_modules/@types/connect": {
"version": "3.4.38",
"resolved": "https://registry.npmjs.org/@types/connect/-/connect-3.4.38.tgz",
@@ -3228,12 +3240,27 @@
"@types/ms": "*"
}
},
"node_modules/@types/elasticsearch": {
"version": "5.0.43",
"resolved": "https://registry.npmjs.org/@types/elasticsearch/-/elasticsearch-5.0.43.tgz",
"integrity": "sha512-N+MpzURpDCWd7zaJ7CE1aU+nBSeAABLhDE0lGodQ0LLftx7ku6hjTXLr9OAFZLSXiWL3Xxx8jts485ynrcm5NA==",
"dev": true
},
"node_modules/@types/estree": {
"version": "1.0.5",
"resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.5.tgz",
"integrity": "sha512-/kYRxGDLWzHOB7q+wtSUQlFrtcdUccpfy+X+9iMBpHK8QLLhx2wIPYuS5DYtR9Wa/YlZAbIovy7qVdB1Aq6Lyw==",
"dev": true
},
"node_modules/@types/event-to-promise": {
"version": "0.7.5",
"resolved": "https://registry.npmjs.org/@types/event-to-promise/-/event-to-promise-0.7.5.tgz",
"integrity": "sha512-h10M3ybTySQFVP4N1uiEgPwbpHExNS8UMpCqRUJFkMhlpgSlWsyYsGMmkrJIKRnhGfYDOb4LD3U+SSPujoMHNA==",
"dev": true,
"dependencies": {
"@types/node": "*"
}
},
"node_modules/@types/express": {
"version": "4.17.21",
"resolved": "https://registry.npmjs.org/@types/express/-/express-4.17.21.tgz",

View file

@@ -17,7 +17,7 @@
"exports": "./src/frame/server.ts",
"scripts": {
"all-documents": "tsx src/content-render/scripts/all-documents/cli.ts",
"analyze-text": "node src/search/scripts/analyze-text.js",
"analyze-text": "tsx src/search/scripts/analyze-text.ts",
"analyze-comment": "tsx src/events/scripts/analyze-comment-cli.ts",
"archive-version": "tsx --max-old-space-size=16384 src/ghes-releases/scripts/archive-version.ts",
"audit-log-sync": "tsx src/audit-logs/scripts/sync.ts",
@ -39,8 +39,14 @@
"find-unused-variables": "tsx src/content-linter/scripts/find-unsed-variables.ts",
"fixture-dev": "cross-env ROOT=src/fixtures/fixtures npm start",
"fixture-test": "cross-env ROOT=src/fixtures/fixtures npm test -- src/fixtures/tests",
"index": "tsx src/search/scripts/index/index.ts",
"index-elasticsearch": "node src/search/scripts/index-elasticsearch.js",
"general-search-scrape": "tsx src/search/scripts/scrape/scrape-cli.ts",
"general-search-scrape-server": "cross-env NODE_ENV=production PORT=4002 MINIMAL_RENDER=true CHANGELOG_DISABLED=true tsx src/frame/server.ts",
"ghes-release-scrape-with-server": "cross-env GHES_RELEASE=1 start-server-and-test general-search-scrape-server 4002 general-search-scrape",
"general-search-scrape-with-server": "cross-env NODE_OPTIONS='--max_old_space_size=8192' start-server-and-test general-search-scrape-server 4002 general-search-scrape",
"index": "tsx src/search/scripts/index/index-cli autocomplete docs-internal-data",
"index-ai-search-autocomplete": "tsx src/search/scripts/index/index-cli ai-search-autocomplete",
"index-general-autocomplete": "tsx src/search/scripts/index/index-cli general-autocomplete",
"index-general-search": "tsx src/search/scripts/index/index-cli general-search",
"index-test-fixtures": "./src/search/scripts/index-test-fixtures.sh",
"lint": "eslint '**/*.{js,mjs,ts,tsx}'",
"lint-content": "node src/content-linter/scripts/lint-content.js",
@ -70,10 +76,6 @@
"start-for-playwright": "cross-env ROOT=src/fixtures/fixtures TRANSLATIONS_FIXTURE_ROOT=src/fixtures/fixtures/translations ENABLED_LANGUAGES=en,ja NODE_ENV=test tsx src/frame/server.ts",
"symlink-from-local-repo": "node src/early-access/scripts/symlink-from-local-repo.js",
"sync-rest": "tsx src/rest/scripts/update-files.ts",
"sync-search": "cross-env NODE_OPTIONS='--max_old_space_size=8192' start-server-and-test sync-search-server 4002 sync-search-indices",
"sync-search-ghes-release": "cross-env GHES_RELEASE=1 start-server-and-test sync-search-server 4002 sync-search-indices",
"sync-search-indices": "node src/search/scripts/sync-search-indices.js",
"sync-search-server": "cross-env NODE_ENV=production PORT=4002 MINIMAL_RENDER=true CHANGELOG_DISABLED=true tsx src/frame/server.ts",
"sync-secret-scanning": "tsx src/secret-scanning/scripts/sync.ts",
"sync-webhooks": "npx tsx src/rest/scripts/update-files.ts -o webhooks",
"test": "vitest",
@ -222,6 +224,7 @@
"src/open-source/scripts/add-pr-links.js",
"src/open-source/scripts/pr-link-source.js",
"rest-api-description/",
"docs-internal-data/",
"src/code-scanning/scripts/generate-code-scanning-query-list.ts"
]
},
@ -327,10 +330,13 @@
"@octokit/rest": "21.0.2",
"@playwright/test": "^1.48.1",
"@types/accept-language-parser": "1.5.6",
"@types/cheerio": "^0.22.35",
"@types/connect-datadog": "0.0.10",
"@types/connect-timeout": "0.0.39",
"@types/cookie": "0.6.0",
"@types/cookie-parser": "1.4.7",
"@types/elasticsearch": "^5.0.43",
"@types/event-to-promise": "^0.7.5",
"@types/express": "4.17.21",
"@types/imurmurhash": "^0.1.4",
"@types/js-cookie": "^3.0.6",

View file

@ -3,5 +3,5 @@
"apiOnlyEvents": "This event is not available in the web interface, only via the REST API, audit log streaming, or JSON/CSV exports.",
"apiRequestEvent": "This event is only available via audit log streaming."
},
"sha": "548a504f9bbeb14e74a0da48a869f8e6239b6d9f"
"sha": "5cdd5d7d8ef0e34ebff6addc8d04b7d3da813589"
}

View file

@ -31,6 +31,7 @@ import { imageNoGif } from './image-no-gif.js'
import { expiredContent, expiringSoon } from './expired-content.js'
import { tableLiquidVersioning } from './table-liquid-versioning.js'
import { thirdPartyActionPinning } from './third-party-action-pinning.js'
import { liquidTagWhitespace } from './liquid-tag-whitespace.js'
const noDefaultAltText = markdownlintGitHub.find((elem) =>
elem.names.includes('no-default-alt-text'),
@ -77,5 +78,6 @@ export const gitHubDocsMarkdownlint = {
expiringSoon,
tableLiquidVersioning,
thirdPartyActionPinning,
liquidTagWhitespace,
],
}

View file

@ -0,0 +1,64 @@
import { TokenKind } from 'liquidjs'
import { getLiquidTokens, getPositionData } from '../helpers/liquid-utils.js'
import { addFixErrorDetail } from '../helpers/utils.js'
/*
Liquid tags should start and end with one whitespace. For example:
DO use a single whitespace character
{% data <args> %}
DON'T use 0 or more than 1 whitespace
{%data <args> %}
DON'T use more than 1 whitespace between args
{%data arg1 arg2 %}
*/
export const liquidTagWhitespace = {
names: ['GHD042', 'liquid-tag-whitespace'],
description:
'Liquid tags should start and end with one whitespace. Liquid tag arguments should be separated by only one whitespace.',
tags: ['liquid', 'format'],
function: (params, onError) => {
const content = params.lines.join('\n')
const tokens = getLiquidTokens(content).filter((token) => token.kind === TokenKind.Tag)
for (const token of tokens) {
const { lineNumber, column, length } = getPositionData(token, params.lines)
const range = [column, length]
const tag = params.lines[lineNumber - 1].slice(column - 1, column - 1 + length)
// Get just the opening and closing tags, which includes any whitespace
// added before the tag name or any arguments
const openTag = tag.slice(0, token.contentRange[0] - token.begin)
const closeTag = tag.slice(-(token.end - token.contentRange[1]))
const isOpenTagOneSpace = openTag !== openTag.trim() + ' '
const isCloseTagOneSpace = closeTag !== ' ' + closeTag.trim()
const moreThanOneSpace = /\s{2,}/
const isArgOneSpace = moreThanOneSpace.test(tag)
const fixedContent =
openTag.trim() + ' ' + token.content.replace(moreThanOneSpace, ' ') + ' ' + closeTag.trim()
if (isOpenTagOneSpace || isCloseTagOneSpace || isArgOneSpace) {
addFixErrorDetail(
onError,
lineNumber,
fixedContent,
params.lines[lineNumber - 1].slice(column - 1, column - 1 + length),
range,
{
lineNumber,
editColumn: column,
deleteCount: length,
insertText: fixedContent,
},
)
}
}
},
}

View file

@ -161,6 +161,12 @@ const githubDocsConfig = {
'partial-markdown-files': true,
'yml-files': true,
},
'liquid-tag-whitespace': {
// GHD042
severity: 'error',
'partial-markdown-files': true,
'yml-files': true,
},
}
export const githubDocsFrontmatterConfig = {

View file

@ -0,0 +1,71 @@
import { describe, expect, test } from 'vitest'
import { runRule } from '../../lib/init-test.js'
import { liquidTagWhitespace } from '../../lib/linting-rules/liquid-tag-whitespace.js'
describe(liquidTagWhitespace.names.join(' - '), () => {
test('liquid tags with correct whitespace pass', async () => {
const markdown = [
'{% data variables.location.product_location %}',
'{% assign my_variable = "value" %}',
'{% if user %}Hello, {{ user.name }}{% endif %}',
].join('\n')
const result = await runRule(liquidTagWhitespace, { strings: { markdown } })
const errors = result.markdown
expect(errors.length).toBe(0)
})
test('liquid tags with incorrect whitespace fail', async () => {
const markdown = [
'{%data variables.location.product_location %}',
'{% assign my_variable = "value"%}',
'{% if user %}Hello, {{ user.name }} {%endif %}',
'{% data variables.location.product_location %}',
'{%-data variables.location.product_location -%}',
'{%- assign my_variable = "value"-%}',
'{%- if user -%}Hello, {{ user.name }} {%endif %}',
'{%- data variables.location.product_location -%}',
].join('\n')
const result = await runRule(liquidTagWhitespace, { strings: { markdown } })
const errors = result.markdown
expect(errors.length).toBe(8)
expect(errors[2].lineNumber).toBe(3)
expect(errors[2].fixInfo).toEqual({
deleteCount: 10,
editColumn: 37,
lineNumber: 3,
insertText: '{% endif %}',
})
})
test('liquid tags with multiple spaces between arguments fail', async () => {
const markdown = [
'{% assign my_variable = "value" %}',
'{% if user %}Hello, {{ user.name }}{% endif %}',
].join('\n')
const result = await runRule(liquidTagWhitespace, { strings: { markdown } })
const errors = result.markdown
expect(errors.length).toBe(2)
expect(errors[1].lineNumber).toBe(2)
expect(errors[0].fixInfo).toEqual({
deleteCount: 35,
editColumn: 1,
lineNumber: 1,
insertText: '{% assign my_variable = "value" %}',
})
})
test('liquid tags with single spaces between arguments pass', async () => {
const markdown = [
'{% assign my_variable = "value" %}',
'{% if user %}Hello, {{ user.name }}{% endif %}',
].join('\n')
const result = await runRule(liquidTagWhitespace, { strings: { markdown } })
const errors = result.markdown
expect(errors.length).toBe(0)
})
})

View file

@ -68,7 +68,7 @@ describe('breadcrumbs', () => {
expect($breadcrumbTitles.length).toBe(0)
expect($breadcrumbLinks.length).toBe(2)
expect($breadcrumbLinks[0].attribs.title).toBe('Deeper secrets')
expect($breadcrumbLinks[1].attribs.title).toBe('Mariana Trench')
expect(($breadcrumbLinks[0] as cheerio.TagElement).attribs.title).toBe('Deeper secrets')
expect(($breadcrumbLinks[1] as cheerio.TagElement).attribs.title).toBe('Mariana Trench')
})
})

View file

@ -3,7 +3,7 @@ import { createProxyMiddleware } from 'http-proxy-middleware'
import events from '@/events/middleware.js'
import anchorRedirect from '@/rest/api/anchor-redirect.js'
import search from '@/search/middleware/search.js'
import search from '@/search/middleware/search-routes.js'
import pageInfo from '@/pageinfo/middleware'
import pageList from '@/pagelist/middleware'
import webhooks from '@/webhooks/middleware/webhooks.js'

View file

@ -61,7 +61,7 @@ import fastlyCacheTest from './fastly-cache-test'
import trailingSlashes from './trailing-slashes'
import mockVaPortal from './mock-va-portal'
import dynamicAssets from '@/assets/middleware/dynamic-assets'
import contextualizeSearch from '@/search/middleware/contextualize.js'
import generalSearchMiddleware from '@/search/middleware/general-search-middleware'
import shielding from '@/shielding/middleware'
import tracking from '@/tracking/middleware'
import { MAX_REQUEST_TIMEOUT } from '@/frame/lib/constants.js'
@ -275,7 +275,7 @@ export default function (app: Express) {
app.use(asyncMiddleware(productExamples))
app.use(asyncMiddleware(productGroups))
app.use(asyncMiddleware(glossaries))
app.use(asyncMiddleware(contextualizeSearch))
app.use(asyncMiddleware(generalSearchMiddleware))
app.use(asyncMiddleware(featuredLinks))
app.use(asyncMiddleware(learningTrack))

View file

@ -15,7 +15,10 @@ describe('favicon assets', () => {
expect(res.headers['cache-control']).toContain('public')
expect(res.headers['cache-control']).toContain('immutable')
expect(res.headers['cache-control']).toMatch(/max-age=\d+/)
const maxAgeSeconds = parseInt(res.headers['cache-control'].match(/max-age=(\d+)/)[1], 10)
const maxAgeSeconds = parseInt(
(res.headers['cache-control'] || '').match(/max-age=(\d+)/)?.[1] || '',
10,
)
// Let's not be too specific in the tests, just as long as it's testing
// that it's a reasonably large number of seconds.
expect(maxAgeSeconds).toBeGreaterThanOrEqual(60 * 60)
@ -25,13 +28,16 @@ describe('favicon assets', () => {
test('should serve a valid and aggressively caching /apple-touch-icon.png', async () => {
const res = await get('/apple-touch-icon.png')
expect(res.statusCode).toBe(200)
expect(parseInt(res.headers['content-length'], 10)).toBeGreaterThan(0)
expect(parseInt(res.headers['content-length'] || '', 10)).toBeGreaterThan(0)
expect(res.headers['content-type']).toBe('image/png')
expect(res.headers['set-cookie']).toBeUndefined()
expect(res.headers['cache-control']).toContain('public')
expect(res.headers['cache-control']).toContain('immutable')
expect(res.headers['cache-control']).toMatch(/max-age=\d+/)
const maxAgeSeconds = parseInt(res.headers['cache-control'].match(/max-age=(\d+)/)[1], 10)
const maxAgeSeconds = parseInt(
(res.headers['cache-control'] || '').match(/max-age=(\d+)/)?.[1] || '',
10,
)
// Let's not be too specific in the tests, just as long as it's testing
// that it's a reasonably large number of seconds.
expect(maxAgeSeconds).toBeGreaterThanOrEqual(60 * 60)

View file

@ -20,6 +20,9 @@ describe('manifest', () => {
test('download manifest from HTML and check content', async () => {
const $ = await getDOM('/')
const url = $('link[rel="manifest"]').attr('href')
if (!url) {
throw new Error('No manifest URL found')
}
const res = await get(url)
expect(res.statusCode).toBe(200)

View file

@ -17,7 +17,7 @@ labels:
- [Prerequisites](#prerequisites)
- [Create publication branch for a new version of GHES](#creation)
- [Resolve check failures](#check-failures)
- [Sync the search indices](#sync-search-indices)
- [Scrape the search indices](#scrape-search-indices)
- [Maintain the publication branch](#maintenance)
- [Complete preparation for the RC and publish the docset](#publication)
@ -110,11 +110,11 @@ For content from the OpenAPI schema, note the affected content with broken links
<br/>
<a name="sync-search-indices">
<a name="scrape-search-indices">
### [🔎](#sync-search-indices) Sync the search indices
### [🔎](#scrape-search-indices) Scrape the search indices
1. Go to the [`sync-search-elasticsearch` workflow](https://github.com/github/docs-internal/actions/workflows/sync-search-elasticsearch.yml) ([permalink](https://github.com/github/docs-internal/blob/f8ca45703c48c7d1976a278337bc3391fb14fe9e/.github/workflows/sync-search-elasticsearch.yml) in case it moves)
1. Go to the [`index-general-search.yml` workflow](https://github.com/github/docs-internal/actions/workflows/index-general-search.yml)
1. Click on the **Run workflow** drop down and set the following parameters:
- `Branch:` set to the name of the publication branch
- `Version` set to the version you're publishing (e.g., `ghes-3.12` if you're publishing GHES 3.12)

View file

@ -1,4 +1,19 @@
[
{
"schemaChanges": [
{
"title": "The GraphQL schema includes these changes:",
"changes": [
"<p>Type <code>UpdateEnterpriseDeployKeySettingInput</code> was added</p>",
"<p>Type <code>UpdateEnterpriseDeployKeySettingPayload</code> was added</p>",
"<p>Field <code>updateEnterpriseDeployKeySetting</code> was added to object type <code>Mutation</code></p>"
]
}
],
"previewChanges": [],
"upcomingChanges": [],
"date": "2024-11-07"
},
{
"schemaChanges": [
{

View file

@ -24830,6 +24830,16 @@ type Mutation {
input: UpdateEnterpriseDefaultRepositoryPermissionSettingInput!
): UpdateEnterpriseDefaultRepositoryPermissionSettingPayload
"""
Sets whether deploy keys are allowed to be created and used for an enterprise.
"""
updateEnterpriseDeployKeySetting(
"""
Parameters for UpdateEnterpriseDeployKeySetting
"""
input: UpdateEnterpriseDeployKeySettingInput!
): UpdateEnterpriseDeployKeySettingPayload
"""
Sets whether organization members with admin permissions on a repository can change repository visibility.
"""
@ -58729,6 +58739,46 @@ type UpdateEnterpriseDefaultRepositoryPermissionSettingPayload {
message: String
}
"""
Autogenerated input type of UpdateEnterpriseDeployKeySetting
"""
input UpdateEnterpriseDeployKeySettingInput {
"""
A unique identifier for the client performing the mutation.
"""
clientMutationId: String
"""
The ID of the enterprise on which to set the deploy key setting.
"""
enterpriseId: ID! @possibleTypes(concreteTypes: ["Enterprise"])
"""
The value for the deploy key setting on the enterprise.
"""
settingValue: EnterpriseEnabledDisabledSettingValue!
}
"""
Autogenerated return type of UpdateEnterpriseDeployKeySetting.
"""
type UpdateEnterpriseDeployKeySettingPayload {
"""
A unique identifier for the client performing the mutation.
"""
clientMutationId: String
"""
The enterprise with the updated deploy key setting.
"""
enterprise: Enterprise
"""
A message confirming the result of updating the deploy key setting.
"""
message: String
}
"""
Autogenerated input type of UpdateEnterpriseMembersCanChangeRepositoryVisibilitySetting
"""

View file

@ -7681,6 +7681,48 @@
}
]
},
{
"name": "updateEnterpriseDeployKeySetting",
"kind": "mutations",
"id": "updateenterprisedeploykeysetting",
"href": "/graphql/reference/mutations#updateenterprisedeploykeysetting",
"description": "<p>Sets whether deploy keys are allowed to be created and used for an enterprise.</p>",
"inputFields": [
{
"name": "input",
"type": "UpdateEnterpriseDeployKeySettingInput!",
"id": "updateenterprisedeploykeysettinginput",
"kind": "input-objects",
"href": "/graphql/reference/input-objects#updateenterprisedeploykeysettinginput"
}
],
"returnFields": [
{
"name": "clientMutationId",
"type": "String",
"id": "string",
"kind": "scalars",
"href": "/graphql/reference/scalars#string",
"description": "<p>A unique identifier for the client performing the mutation.</p>"
},
{
"name": "enterprise",
"type": "Enterprise",
"id": "enterprise",
"kind": "objects",
"href": "/graphql/reference/objects#enterprise",
"description": "<p>The enterprise with the updated deploy key setting.</p>"
},
{
"name": "message",
"type": "String",
"id": "string",
"kind": "scalars",
"href": "/graphql/reference/scalars#string",
"description": "<p>A message confirming the result of updating the deploy key setting.</p>"
}
]
},
{
"name": "updateEnterpriseMembersCanChangeRepositoryVisibilitySetting",
"kind": "mutations",
@ -103270,6 +103312,40 @@
}
]
},
{
"name": "UpdateEnterpriseDeployKeySettingInput",
"kind": "inputObjects",
"id": "updateenterprisedeploykeysettinginput",
"href": "/graphql/reference/input-objects#updateenterprisedeploykeysettinginput",
"description": "<p>Autogenerated input type of UpdateEnterpriseDeployKeySetting.</p>",
"inputFields": [
{
"name": "clientMutationId",
"description": "<p>A unique identifier for the client performing the mutation.</p>",
"type": "String",
"id": "string",
"kind": "scalars",
"href": "/graphql/reference/scalars#string"
},
{
"name": "enterpriseId",
"description": "<p>The ID of the enterprise on which to set the deploy key setting.</p>",
"type": "ID!",
"id": "id",
"kind": "scalars",
"href": "/graphql/reference/scalars#id",
"isDeprecated": false
},
{
"name": "settingValue",
"description": "<p>The value for the deploy key setting on the enterprise.</p>",
"type": "EnterpriseEnabledDisabledSettingValue!",
"id": "enterpriseenableddisabledsettingvalue",
"kind": "enums",
"href": "/graphql/reference/enums#enterpriseenableddisabledsettingvalue"
}
]
},
{
"name": "UpdateEnterpriseMembersCanChangeRepositoryVisibilitySettingInput",
"kind": "inputObjects",

View file

@ -24830,6 +24830,16 @@ type Mutation {
input: UpdateEnterpriseDefaultRepositoryPermissionSettingInput!
): UpdateEnterpriseDefaultRepositoryPermissionSettingPayload
"""
Sets whether deploy keys are allowed to be created and used for an enterprise.
"""
updateEnterpriseDeployKeySetting(
"""
Parameters for UpdateEnterpriseDeployKeySetting
"""
input: UpdateEnterpriseDeployKeySettingInput!
): UpdateEnterpriseDeployKeySettingPayload
"""
Sets whether organization members with admin permissions on a repository can change repository visibility.
"""
@ -58729,6 +58739,46 @@ type UpdateEnterpriseDefaultRepositoryPermissionSettingPayload {
message: String
}
"""
Autogenerated input type of UpdateEnterpriseDeployKeySetting
"""
input UpdateEnterpriseDeployKeySettingInput {
"""
A unique identifier for the client performing the mutation.
"""
clientMutationId: String
"""
The ID of the enterprise on which to set the deploy key setting.
"""
enterpriseId: ID! @possibleTypes(concreteTypes: ["Enterprise"])
"""
The value for the deploy key setting on the enterprise.
"""
settingValue: EnterpriseEnabledDisabledSettingValue!
}
"""
Autogenerated return type of UpdateEnterpriseDeployKeySetting.
"""
type UpdateEnterpriseDeployKeySettingPayload {
"""
A unique identifier for the client performing the mutation.
"""
clientMutationId: String
"""
The enterprise with the updated deploy key setting.
"""
enterprise: Enterprise
"""
A message confirming the result of updating the deploy key setting.
"""
message: String
}
"""
Autogenerated input type of UpdateEnterpriseMembersCanChangeRepositoryVisibilitySetting
"""

View file

@ -7681,6 +7681,48 @@
}
]
},
{
"name": "updateEnterpriseDeployKeySetting",
"kind": "mutations",
"id": "updateenterprisedeploykeysetting",
"href": "/graphql/reference/mutations#updateenterprisedeploykeysetting",
"description": "<p>Sets whether deploy keys are allowed to be created and used for an enterprise.</p>",
"inputFields": [
{
"name": "input",
"type": "UpdateEnterpriseDeployKeySettingInput!",
"id": "updateenterprisedeploykeysettinginput",
"kind": "input-objects",
"href": "/graphql/reference/input-objects#updateenterprisedeploykeysettinginput"
}
],
"returnFields": [
{
"name": "clientMutationId",
"type": "String",
"id": "string",
"kind": "scalars",
"href": "/graphql/reference/scalars#string",
"description": "<p>A unique identifier for the client performing the mutation.</p>"
},
{
"name": "enterprise",
"type": "Enterprise",
"id": "enterprise",
"kind": "objects",
"href": "/graphql/reference/objects#enterprise",
"description": "<p>The enterprise with the updated deploy key setting.</p>"
},
{
"name": "message",
"type": "String",
"id": "string",
"kind": "scalars",
"href": "/graphql/reference/scalars#string",
"description": "<p>A message confirming the result of updating the deploy key setting.</p>"
}
]
},
{
"name": "updateEnterpriseMembersCanChangeRepositoryVisibilitySetting",
"kind": "mutations",
@ -103270,6 +103312,40 @@
}
]
},
{
"name": "UpdateEnterpriseDeployKeySettingInput",
"kind": "inputObjects",
"id": "updateenterprisedeploykeysettinginput",
"href": "/graphql/reference/input-objects#updateenterprisedeploykeysettinginput",
"description": "<p>Autogenerated input type of UpdateEnterpriseDeployKeySetting.</p>",
"inputFields": [
{
"name": "clientMutationId",
"description": "<p>A unique identifier for the client performing the mutation.</p>",
"type": "String",
"id": "string",
"kind": "scalars",
"href": "/graphql/reference/scalars#string"
},
{
"name": "enterpriseId",
"description": "<p>The ID of the enterprise on which to set the deploy key setting.</p>",
"type": "ID!",
"id": "id",
"kind": "scalars",
"href": "/graphql/reference/scalars#id",
"isDeprecated": false
},
{
"name": "settingValue",
"description": "<p>The value for the deploy key setting on the enterprise.</p>",
"type": "EnterpriseEnabledDisabledSettingValue!",
"id": "enterpriseenableddisabledsettingvalue",
"kind": "enums",
"href": "/graphql/reference/enums#enterpriseenableddisabledsettingvalue"
}
]
},
{
"name": "UpdateEnterpriseMembersCanChangeRepositoryVisibilitySettingInput",
"kind": "inputObjects",

View file

@ -17,15 +17,15 @@ describe('frame', () => {
test.each(langs)('breadcrumbs link to %s pages', async (lang) => {
const $ = await getDOM(`/${lang}/get-started/learning-about-github`)
const $breadcrumbs = $('[data-testid=breadcrumbs-in-article] a')
expect($breadcrumbs[0].attribs.href).toBe(`/${lang}/get-started`)
expect(($breadcrumbs[0] as cheerio.TagElement).attribs.href).toBe(`/${lang}/get-started`)
})
test.each(langs)('homepage links go to %s pages', async (lang) => {
const $ = await getDOM(`/${lang}`)
const $links = $('[data-testid=bump-link]')
$links.each((i: number, el: Element) => {
$links.each((i: number, el: cheerio.Element) => {
const linkUrl = $(el).attr('href')
expect(linkUrl.startsWith(`/${lang}/`)).toBe(true)
expect((linkUrl || '').startsWith(`/${lang}/`)).toBe(true)
})
})

View file

@ -3,7 +3,7 @@
import fs from 'fs'
import path from 'path'
import cheerio, { type CheerioAPI, type Element } from 'cheerio'
import cheerio from 'cheerio'
import coreLib from '@actions/core'
import got, { RequestError } from 'got'
import chalk from 'chalk'
@ -339,7 +339,15 @@ async function main(
const t0 = new Date().getTime()
const flawsGroups = await Promise.all(
pages.map((page: Page) =>
processPage(core, page, pageMap, redirects, opts, externalLinkCheckerDB, versions),
processPage(
core,
page,
pageMap,
redirects,
opts,
externalLinkCheckerDB,
versions as string[],
),
),
)
const t1 = new Date().getTime()
@ -695,13 +703,13 @@ async function processPermalink(
}
const $ = cheerio.load(html, { xmlMode: true })
const flaws: LinkFlaw[] = []
const links: Element[] = []
const links: cheerio.Element[] = []
$('a[href]').each((i, link) => {
links.push(link)
})
const newFlaws: LinkFlaw[] = await Promise.all(
links.map(async (link) => {
const { href } = link.attribs
const { href } = (link as cheerio.TagElement).attribs
// The global cache can't be used for anchor links because they
// depend on each page it renders
@ -752,7 +760,7 @@ async function processPermalink(
if (checkImages) {
$('img[src]').each((i, img) => {
let { src } = img.attribs
let { src } = (img as cheerio.TagElement).attribs
// Images get a cache-busting prefix injected in the image
// E.g. <img src="/assets/cb-123456/foo/bar.png">
@ -874,7 +882,7 @@ let globalCacheMissCount = 0
async function checkHrefLink(
core: any,
href: string,
$: CheerioAPI,
$: cheerio.Root,
redirects: Redirects,
pageMap: PageMap,
checkAnchors = false,

View file

@ -16,9 +16,36 @@ The site search is part of every version of docs.github.com. This endpoint respo
You can also query our search endpoint directly at:
`https://docs.github.com/search?version=<VERSION>&language=<LANGUAGE CODE>&query=<QUERY>`
- The VERSION can be any numbered supported GitHub Enterprise Server version (e.g., `3.12`), Enterprise Cloud (`ghec`), or the Free pro team plan (`dotcom`).
- The LANGUAGE CODE can be one of: `zh`, `es`, `pt`, `ru`, `ja`, `fr`, `de`, `ko`
- Any search QUERY you'd like.
- The `VERSION` can be any numbered supported GitHub Enterprise Server version (e.g., `3.12`), Enterprise Cloud (`ghec`), or the Free pro team plan (`dotcom`).
- The `LANGUAGE CODE` can be one of: `zh`, `es`, `pt`, `ru`, `ja`, `fr`, `de`, `ko`
- The `QUERY` can be any alphanumeric string value.
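For example, here is a minimal sketch (not part of the repo) of building that URL in TypeScript; the parameter values below are placeholders, and nothing is assumed here about the shape of the response:

```typescript
// Hypothetical snippet: construct the site-search URL described above.
const url = new URL('https://docs.github.com/search')
url.searchParams.set('version', '3.12') // a numbered GHES version, 'ghec', or 'dotcom'
url.searchParams.set('language', 'en') // one of the supported language codes
url.searchParams.set('query', 'clone a repository')
console.log(url.toString())
// -> https://docs.github.com/search?version=3.12&language=en&query=clone+a+repository
```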
## Types of search
Our backend currently supports 3 "types" of searching.
All searches accept a `query` param, e.g. `?query=how`, and return results based on their type:
1. **general search**
- Results: The pages of our sites that match the query, sorted by popularity
- Example: Query = "clone" -> Results <URLs to Docs Page about cloning>
- Endpoint: `/api/search/v1`
2. **general autocomplete**
- Results: Potential terms that can be autocompleted from the query based on previous user searches
- Example: Query = "cl" -> A Result = "clone"
- Endpoint: `/api/search/autocomplete/v1`
3. **AI search autocomplete**
- Results: Human-readable full-sentence questions that best match the query. Questions are based on previous searches and popular pages
- Example: Query = "How do I clone" -> A Result = "How do I clone a repository?"
- Endpoint: `/api/search/ai-search-autocomplete/v1`
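As a rough, unofficial sketch, the three endpoints above can be called with a plain `fetch`. Only the `query` parameter is documented here; the response shapes differ per search type, so they are left untyped:

```typescript
// Hypothetical client: maps each search type to the endpoint listed above.
const SEARCH_ENDPOINTS = {
  generalSearch: '/api/search/v1',
  generalAutocomplete: '/api/search/autocomplete/v1',
  aiSearchAutocomplete: '/api/search/ai-search-autocomplete/v1',
} as const

async function search(type: keyof typeof SEARCH_ENDPOINTS, query: string): Promise<unknown> {
  const url = new URL(SEARCH_ENDPOINTS[type], 'https://docs.github.com')
  url.searchParams.set('query', query)
  const res = await fetch(url)
  if (!res.ok) throw new Error(`${type} request failed: ${res.status}`)
  return res.json()
}
```

For example, `await search('generalAutocomplete', 'cl')` would return terms such as "clone", per the example above.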
## Elasticsearch
Elasticsearch is an external service that we use for searching. When a user types a search, our backend queries Elasticsearch for the most relevant results.
### Indexing Elasticsearch
In order to provide relevant results to queries, we prefill Elasticsearch with data via Indexes. See the [Indexing README](./scripts/index/README.md) for how we index on Docs.
## Production deploys
@ -32,40 +59,25 @@ You can manually run the workflow to generate the indexes after you push your ch
### Build and sync
The preferred way to build and sync the search indices is to do so via the [GitHub Actions workflow](/.github/workflows/sync-search-elasticsearch.yml).
The preferred way to build and sync the search indices is to do so via the [GitHub Actions workflow](/.github/workflows/index-general-search.yml).
## Files
### Actions workflow files
- [`.github/workflows/sync-search-elasticsearch.yml`](/.github/workflows/sync-search-elasticsearch.yml) - Builds and syncs search indices on the `main` branch every four hours. Search indices are stored in an internal-only Elasticsearch instance. To run it manually, click "Run workflow" button in the Actions tab.
- [`.github/workflows/index-general-search.yml`](/.github/workflows/index-general-search.yml) - Populates search indices for **general search** using the `main` branch every four hours. Search indices are stored in an internal-only Elasticsearch instance. To run it manually, click the "Run workflow" button in the Actions tab.
- [`.github/workflows/index-autocomplete-search.yml`](/.github/workflows/index-autocomplete-search.yml) - Populates search indices for both **general autocomplete** and **AI search autocomplete** using data from an internal repo. Runs daily.
### Notable code files and directories
- [src/search/components/Search.tsx](/src/search/components/Search.tsx) - The browser-side code that enables the search.
- [src/search/components/SearchResults.tsx](/src/search/components/SearchResults.tsx) - The browser-side code that displays search results.
- [src/search/middleware/es-search.js](/src/search/middleware/es-search.js) - A wrapper around the Node.js Elasticsearch module for interacting with the search API.
- [src/search/scripts/](/src/search/scripts/) - Scripts used by Actions workflows or for manual operations.
- [src/search/tests](/src/search/tests) - Tests!
- [src/search/middleware/general-search-middleware.ts](src/search/middleware/general-search-middleware.ts) - Entrypoint to general search when you hit docs.github.com/search
- [src/search/middleware/search-routes.ts](src/search/middleware/search-routes.ts) - Entrypoint to the API endpoints for our search routes
- [src/search/scripts/](/src/search/scripts/) - Scripts used by Actions workflows or for manual operations like scraping data for indexing and performing the indexing.
- [src/search/tests](/src/search/tests) - Tests relevant to searching.
## Records
Each record represents a page. Each record has `breadcrumbs`, `title`, `headings`, `content` (the article content in text, not HTML), `intro` (if one exists in the frontmatter), and a unique `objectID` that is currently just the permalink of the article. Here's an example:
```json
{
"objectID":"/en/actions/creating-actions/about-custom-actions",
"breadcrumbs":"GitHub Actions / Creating actions",
"title":"About custom actions",
"headings":"About custom actions\nTypes of actions\n[...]",
"content":"Actions are individual tasks that you can combine to create jobs and customize your workflow. You can create your own actions, [...]",
"intro":"Actions are individual tasks that you can combine to create jobs and customize your workflow. You can create your own actions, or use and customize actions shared by the GitHub community.",
"toplevel":"GitHub Actions",
"popularity":0
}
```
## Notes
## Miscellaneous Notes
- It's not strictly necessary to set an `objectID` as the search index will create one automatically, but by creating our own we have a guarantee that subsequent invocations of this upload script will overwrite existing records instead of creating numerous duplicate records with differing IDs.
- Our search querying has typo tolerance. Try spelling something wrong and see what you get!

View file

@ -2,9 +2,10 @@ import { CheckboxGroup, Checkbox, FormControl } from '@primer/react'
import { useRouter } from 'next/router'
import Link from 'next/link'
import type { SearchResultAggregations } from './types'
import { useTranslation } from 'src/languages/components/useTranslation'
import type { SearchResultAggregations } from 'src/search/types'
type Props = {
aggregations: SearchResultAggregations
}

View file

@ -4,30 +4,39 @@ import { useRouter } from 'next/router'
import { useEffect, useState } from 'react'
import cx from 'classnames'
import type { SearchResultsT, SearchResultHitT, SearchQueryT } from './types'
import { useTranslation } from 'src/languages/components/useTranslation'
import { Link } from 'src/frame/components/Link'
import { sendEvent, EventType } from 'src/events/components/events'
import styles from './SearchResults.module.scss'
import type { SearchQueryContentT } from 'src/search/components/types'
import type { GeneralSearchHitWithoutIncludes, GeneralSearchResponse } from 'src/search/types'
import type { SearchTotalHits } from '@elastic/elasticsearch/lib/api/types'
type Props = {
results: SearchResultsT
search: SearchQueryT
results: GeneralSearchResponse
searchParams: SearchQueryContentT
}
export function SearchResults({ results, search }: Props) {
const pages = Math.ceil(results.meta.found.value / results.meta.size)
export function SearchResults({ results, searchParams }: Props) {
const pages = Math.ceil((results.meta.found as SearchTotalHits).value / results.meta.size)
const { page } = results.meta
return (
<div>
<SearchResultHits hits={results.hits} search={search} />
<SearchResultHits hits={results.hits} searchParams={searchParams} />
{pages > 1 && <ResultsPagination page={page} totalPages={pages} />}
</div>
)
}
function SearchResultHits({ hits, search }: { hits: SearchResultHitT[]; search: SearchQueryT }) {
function SearchResultHits({
hits,
searchParams,
}: {
hits: GeneralSearchHitWithoutIncludes[]
searchParams: SearchQueryContentT
}) {
return (
<div>
{hits.length === 0 && <NoSearchResults />}
@ -35,10 +44,10 @@ function SearchResultHits({ hits, search }: { hits: SearchResultHitT[]; search:
<SearchResultHit
key={hit.id}
hit={hit}
query={search.query}
query={searchParams.query}
totalHits={hits.length}
index={index}
debug={search.debug}
debug={searchParams.debug}
/>
))}
</div>
@ -64,7 +73,7 @@ function SearchResultHit({
index,
debug,
}: {
hit: SearchResultHitT
hit: GeneralSearchHitWithoutIncludes
query: string
totalHits: number
index: number

View file

@ -1,10 +1,10 @@
import { Flash } from '@primer/react'
import { useTranslation } from 'src/languages/components/useTranslation'
import type { SearchValidationErrorT } from './types'
import type { SearchValidationErrorEntry } from '../types'
interface Props {
errors: SearchValidationErrorT[]
errors: SearchValidationErrorEntry[]
}
export function ValidationErrors({ errors }: Props) {

View file

@ -1,10 +1,5 @@
import { createContext, useContext } from 'react'
import type { SearchT } from '../types'
export type SearchContextT = {
search: SearchT
}
import type { SearchContextT } from '../types'
export const SearchContext = createContext<SearchContextT | null>(null)

View file

@ -7,8 +7,9 @@ import { useNumberFormatter } from 'src/search/components/useNumberFormatter'
import { SearchResults } from 'src/search/components/SearchResults'
import { NoQuery } from 'src/search/components/NoQuery'
import { useMainContext } from 'src/frame/components/context/MainContext'
import { ValidationErrors } from './ValidationErrors'
import { useSearchContext } from './context/SearchContext'
import { ValidationErrors } from 'src/search/components/ValidationErrors'
import { useSearchContext } from 'src/search/components/context/SearchContext'
import type { SearchTotalHits } from '@elastic/elasticsearch/lib/api/types'
export function Search() {
const { search } = useSearchContext()
@ -17,7 +18,7 @@ export function Search() {
const { t } = useTranslation('search_results')
const { currentVersion } = useVersion()
const { query } = search.search
const { query } = search.searchParams
// A reference to the `content/search/index.md` Page object.
// Not to be confused with the "page" that is for paginating
@ -37,7 +38,7 @@ export function Search() {
pageTitle += ` (${searchVersion})`
}
if (results) {
pageTitle = `${formatInteger(results.meta.found.value)} ${pageTitle}`
pageTitle = `${formatInteger((results.meta.found as SearchTotalHits).value)} ${pageTitle}`
}
}
@ -63,7 +64,7 @@ export function Search() {
<ValidationErrors errors={validationErrors} />
) : null}
{results ? <SearchResults results={results} search={search.search} /> : null}
{results ? <SearchResults results={results} searchParams={search.searchParams} /> : null}
</div>
)
}

View file

@ -1,58 +1,15 @@
export type SearchResultHitT = {
id: string
url: string
title: string
breadcrumbs: string
highlights: {
title?: string[]
content?: string[]
content_explicit?: string[]
import { GeneralSearchResponse, SearchValidationErrorEntry } from 'src/search/types'
export interface SearchContextT {
search: {
results?: GeneralSearchResponse
searchParams: SearchQueryContentT
validationErrors: SearchValidationErrorEntry[]
}
score?: number
popularity?: number
es_url?: string
}
type SearchResultsMeta = {
found: {
value: number
relation: string
}
took: {
query_msec: number
total_msec: number
}
page: number
size: number
}
type Aggregation = {
key: string
count: number
}
export type SearchResultAggregations = {
[key: string]: Aggregation[]
}
export type SearchResultsT = {
meta: SearchResultsMeta
hits: SearchResultHitT[]
aggregations?: SearchResultAggregations
}
export type SearchQueryT = {
// Parts of the search query that are set to the search context
export type SearchQueryContentT = {
query: string
debug: boolean
}
export type SearchValidationErrorT = {
error: string
// key: string
}
export type SearchT = {
search: SearchQueryT
results?: SearchResultsT
validationErrors: SearchValidationErrorT[]
}

View file

@ -1,5 +0,0 @@
export const namePrefix = 'github-docs'
export default {
namePrefix,
}

View file

@ -0,0 +1,91 @@
import languages from '@/languages/lib/languages.js'
import { utcTimestamp } from '@/search/lib/helpers/time'
import { allIndexVersionKeys, versionToIndexVersionMap } from '@/search/lib/elasticsearch-versions'
import type { SearchTypes } from '@/search/types'
export type SearchIndexes = {
[key in SearchTypes]: SearchIndex
}
export type SearchIndex = {
prefix: string
type: string
}
/* Elasticsearch uses indexes to group categories of data
We currently have 3 top-level categories of indexes:
1. General search: This is populated using data from all of our Docs pages
2. General autocomplete: This is populated using analytics search history in docs-internal-data
3. AI autocomplete: This is populated with human-readable questions using a GPT query in docs-internal-data
This file is intended to be the source of truth for Docs Elasticsearch indexes.
Indexes are in the form:
<test_prefix><prefix>-<type>-<version>-<language>
e.g. github-docs-general-search-fpt-en
<test-prefix> might be "tests_" for tests
*/
const prefix = 'github-docs'
const indexes: SearchIndexes = {
generalSearch: {
prefix,
type: 'general-search',
},
generalAutocomplete: {
prefix,
type: 'general-autocomplete',
},
aiSearchAutocomplete: {
prefix,
type: 'ai-search-autocomplete',
},
}
// Source of truth for determining the index name for the Elastic Search index given a version and language
export function getElasticSearchIndex(
type: SearchTypes,
version: string,
language: string,
manualPrefix = '',
): {
indexName: string
indexAlias: string
} {
if (!(type in indexes)) {
throw new Error(`Type ${type} not found in indexes for getElasticSearchIndex function.`)
}
const index = indexes[type] as SearchIndex
// Validate language
if (!(language in languages)) {
throw new Error(
`Language ${language} not found in languages for getElasticSearchIndex function.`,
)
}
// Validate version
if (!allIndexVersionKeys.includes(version)) {
throw new Error(
`Version '${version}' does not map to a valid version for getElasticSearchIndex function.`,
)
}
// e.g. free-pro-team becomes fpt for the index name
const indexVersion = versionToIndexVersionMap[version]
// In the index-test-fixtures.sh script, we use the tests_ prefix index for testing
const testPrefix = process.env.NODE_ENV === 'test' ? 'tests_' : ''
// If a manual prefix is provided, append an underscore to it
if (manualPrefix && !manualPrefix.endsWith('_')) {
manualPrefix += '_'
}
const indexName = `${testPrefix || manualPrefix}${index.prefix}_${index.type}_${indexVersion}_${language}`
const indexAlias = `${indexName}__${utcTimestamp()}`
return { indexName, indexAlias }
}

View file

@ -0,0 +1,107 @@
/*
* Source of truth for versioning in the context of Elasticsearch
* We have a unique index for each version of the docs
* so consistency is important for creating/accessing ES Indexes.
*
* Example versions (these may not be up to date):
*
* 1. free-pro-team@latest. Previously known as "dotcom". This is the default version of the docs.
* - short name: fpt
* 2. enterprise-cloud@latest
* - short name: ghec
* 3. enterprise-server@X: This is the source of versioning complexity since the version is dynamic
* - short name: ghes-X
*
* However, for (3) someone might enter `&version=3.5` as the version in the request query string.
* This would map to `ghes-3.5`
*/
import { allVersions } from '@/versions/lib/all-versions'
// versionToIndexVersionMap examples:
// free-pro-team@latest -> fpt
// free-pro-team -> fpt
// dotcom -> fpt
// enterprise-cloud@latest -> ghec
// enterprise-server@3.5 -> ghes-3.5
// 3.5 -> ghes-3.5
export const versionToIndexVersionMap: { [key: string]: string } = {}
// For each potential input (from request query string, CLI, etc), map it to the appropriate index version
for (const versionSource of Object.values(allVersions)) {
if (versionSource.hasNumberedReleases) {
versionToIndexVersionMap[versionSource.currentRelease] = versionSource.miscVersionName
// Map shortname or plan, e.g. `ghes` or `enterprise-server` to the latest release, e.g. `ghes-3.14`
if (versionSource.latestRelease === versionSource.currentRelease) {
versionToIndexVersionMap[versionSource.plan] = versionSource.miscVersionName
versionToIndexVersionMap[versionSource.shortName] = versionSource.miscVersionName
}
} else {
versionToIndexVersionMap[versionSource.version] = versionSource.shortName
versionToIndexVersionMap[versionSource.miscVersionName] = versionSource.shortName
// The next two lines map things like `?version=free-pro-team` -> `?version=fpt`
versionToIndexVersionMap[versionSource.plan] = versionSource.shortName
versionToIndexVersionMap[versionSource.shortName] = versionSource.shortName
}
}
// All of the possible keys that can be input to access a version
export const allIndexVersionKeys = Array.from(
new Set([...Object.keys(versionToIndexVersionMap), ...Object.keys(allVersions)]),
)
// These should be the only possible values that an ES index will use (source of truth)
// allIndexVersionOptions example:
// fpt, ghec, ghes-3.14, ghes-3.13, ghes-3.12, ghes-3.11, ghes-3.10
export const allIndexVersionOptions = Array.from(
new Set([...Object.values(versionToIndexVersionMap)]),
)
// Autocomplete only supports 3 "versions": free-pro-team, enterprise-cloud, and enterprise-server
// docs-internal-data stores data under directories with these names. It does not account for individual enterprise-server versions
// These are the "plan" names on the allVersions object
const allVersionPlans: string[] = []
for (const version of Object.values(allVersions)) {
if (version.plan) {
allVersionPlans.push(version.plan)
}
}
// Remove duplicates
export const supportedAutocompletePlanVersions = Array.from(new Set(allVersionPlans))
// Returns the plan name for the given version
// Needed because {version} in the docs-internal-data paths use the version's 'plan' name, e.g. `free-pro-team` instead of `fpt`
export function getPlanVersionFromIndexVersion(indexVersion: string): string {
const planVersion =
Object.values(allVersions).find(
(info) =>
info.shortName === indexVersion ||
info.plan === indexVersion ||
info.miscVersionName === indexVersion ||
info.currentRelease === indexVersion,
)?.plan || ''
if (!planVersion) {
throw new Error(`Plan version not found for index version ${indexVersion}`)
}
return planVersion
}
// Gets the matching key from allVersions for the given index version
// This is needed for scraping since the pages use the 'allVersions' key as their version
export function getAllVersionsKeyFromIndexVersion(indexVersion: string): string {
const key = Object.keys(allVersions).find(
(key) =>
key === indexVersion ||
allVersions[key].shortName === indexVersion ||
allVersions[key].plan === indexVersion ||
allVersions[key].miscVersionName === indexVersion,
)
if (!key) {
throw new Error(`No key found for index version ${indexVersion}`)
}
return key
}

View file

@ -0,0 +1,125 @@
import { Client } from '@elastic/elasticsearch'
import { getElasticsearchClient } from '@/search/lib/helpers/get-client'
import { getHighlightConfiguration } from '@/search/lib/get-elasticsearch-results/helpers/elasticsearch-highlight-config'
import type { AutocompleteSearchResponse } from '@/search/types'
import type {
AutocompleteMatchQueriesOptions,
AutocompleteResultsArgs,
} from '@/search/lib/get-elasticsearch-results/types'
import type { QueryDslQueryContainer, SearchTotalHits } from '@elastic/elasticsearch/lib/api/types'
// Query Elasticsearch for AI Search autocomplete results
export async function getAISearchAutocompleteResults({
indexName,
query,
size,
}: AutocompleteResultsArgs): Promise<AutocompleteSearchResponse> {
const t0 = new Date()
const client = getElasticsearchClient() as Client
const matchQueries = getAISearchAutocompleteMatchQueries(query.trim(), {
fuzzy: {
minLength: 3,
maxLength: 20,
},
})
const matchQuery = {
bool: {
should: matchQueries,
},
}
const highlight = getHighlightConfiguration(query, ['term'])
const searchQuery = {
index: indexName,
highlight,
size,
query: matchQuery,
_source_includes: ['term'],
}
const result = await client.search<{ term: string }>(searchQuery)
const hitsAll = result.hits
const hits = hitsAll.hits.map((hit) => ({
term: hit._source?.term,
highlights: (hit.highlight && hit.highlight.term) || [],
}))
return {
meta: {
found: hitsAll.total as SearchTotalHits,
took: { query_msec: result.took, total_msec: new Date().getTime() - t0.getTime() },
size,
},
hits,
}
}
function getAISearchAutocompleteMatchQueries(
query: string,
{ fuzzy }: AutocompleteMatchQueriesOptions,
) {
const BOOST_PHRASE = 4.0
const BOOST_REGULAR = 2.0
const BOOST_PREFIX = 1.0
const BOOST_FUZZY = 0.1
const matchQueries: QueryDslQueryContainer[] = []
// Use match_phrase for exact term matches
matchQueries.push({
match_phrase: {
term: {
query,
boost: BOOST_PHRASE,
slop: 1, // Allows minor word reordering
},
},
})
// Use match for general matching
matchQueries.push({
match: {
term: {
query,
boost: BOOST_PREFIX,
},
},
})
// Match phrase prefix for partial term matches
matchQueries.push({
match_phrase_prefix: {
term: {
query,
boost: BOOST_PREFIX,
},
},
})
matchQueries.push({
match_bool_prefix: {
term: {
query,
boost: BOOST_REGULAR,
},
},
})
// Add fuzzy matching for typos and variations
if (query.length > fuzzy.minLength && query.length < fuzzy.maxLength) {
matchQueries.push({
fuzzy: {
term: {
value: query,
boost: BOOST_FUZZY,
fuzziness: 'AUTO',
},
},
})
}
return matchQueries
}

View file

@ -0,0 +1,100 @@
import { Client } from '@elastic/elasticsearch'
import { getElasticsearchClient } from '@/search/lib/helpers/get-client'
import { getHighlightConfiguration } from '@/search/lib/get-elasticsearch-results/helpers/elasticsearch-highlight-config'
import type { QueryDslQueryContainer, SearchTotalHits } from '@elastic/elasticsearch/lib/api/types'
import type { AutocompleteSearchResponse } from '@/search/types'
import type {
AutocompleteMatchQueriesOptions,
AutocompleteResultsArgs,
AutocompleteElasticsearchItem,
} from '@/search/lib/get-elasticsearch-results/types'
// Query Elasticsearch for general autocomplete results
export async function getAutocompleteSearchResults({
indexName,
query,
size,
}: AutocompleteResultsArgs): Promise<AutocompleteSearchResponse> {
const t0 = new Date()
const client = getElasticsearchClient() as Client
const matchQueries = getAutocompleteMatchQueries(query.trim(), {
fuzzy: {
minLength: 3,
maxLength: 20,
},
})
const matchQuery = {
bool: {
should: matchQueries,
},
}
const highlight = getHighlightConfiguration(query, ['term'])
const searchQuery = {
index: indexName,
highlight,
size,
query: matchQuery,
// Send absolutely minimal from Elasticsearch to here. Less data => faster.
_source_includes: ['term'],
}
const result = await client.search<AutocompleteElasticsearchItem>(searchQuery)
const hitsAll = result.hits
const hits = hitsAll.hits.map((hit) => ({
term: hit._source?.term,
highlights: (hit.highlight && hit.highlight.term) || [],
}))
return {
meta: {
found: hitsAll.total as SearchTotalHits,
took: { query_msec: result.took, total_msec: new Date().getTime() - t0.getTime() },
size,
},
hits,
}
}
function getAutocompleteMatchQueries(query: string, { fuzzy }: AutocompleteMatchQueriesOptions) {
const BOOST_PHRASE = 4.0
const BOOST_REGULAR = 2.0
const BOOST_FUZZY = 0.1
const matchQueries: QueryDslQueryContainer[] = []
const isMultiWordQuery = query.includes(' ') || query.includes('-')
if (isMultiWordQuery) {
matchQueries.push({
match_phrase_prefix: {
term: {
query,
boost: BOOST_PHRASE,
},
},
})
}
matchQueries.push({
match_bool_prefix: {
term: {
query,
boost: BOOST_REGULAR,
},
},
})
if (query.length > fuzzy.minLength && query.length < fuzzy.maxLength) {
matchQueries.push({
fuzzy: {
term: { value: query, boost: BOOST_FUZZY, fuzziness: 'AUTO' },
},
})
}
return matchQueries
}

View file

@ -1,57 +1,54 @@
import { Client } from '@elastic/elasticsearch'
import { getElasticsearchClient } from '@/search/lib/helpers/get-client'
import { DEFAULT_HIGHLIGHT_FIELDS } from '@/search/lib/search-request-params/search-params-objects'
import { getHighlightConfiguration } from '@/search/lib/get-elasticsearch-results/helpers/elasticsearch-highlight-config'
export const POSSIBLE_HIGHLIGHT_FIELDS = ['title', 'content']
// This needs to match what we *use* in the `<SearchResults>` component.
// For example, if we don't display "headings" we shouldn't request
// highlights for it either.
export const DEFAULT_HIGHLIGHT_FIELDS = ['title', 'content']
const ELASTICSEARCH_URL = process.env.ELASTICSEARCH_URL
import type {
SearchHit as ElasticsearchHit,
QueryDslQueryContainer,
SearchRequest,
SearchTotalHits,
} from '@elastic/elasticsearch/lib/api/types'
import type {
AdditionalIncludes,
ComputedSearchQueryParamsMap,
} from '@/search/lib/search-request-params/types'
import type { SearchAggregation, GeneralSearchHit, GeneralSearchResponse } from '@/search/types'
const MAX_AGGREGATE_SIZE = 30
const isDevMode = process.env.NODE_ENV !== 'production'
const isDevMode: boolean = process.env.NODE_ENV !== 'production'
function getClient() {
if (!ELASTICSEARCH_URL) {
// If this was mistakenly not set, it will eventually fail
// when you use the Client. But `new Client({node: undefined})`
// won't throw. And the error you get when you actually do try
// to use that Client instance is cryptic compared to this
// plain and simple thrown error.
throw new Error(`$ELASTICSEARCH_URL is not set`)
}
return new Client({
node: ELASTICSEARCH_URL,
// The default is 30,000ms but we noticed that the median time is about
// 100-150ms with some occasional searches taking multiple seconds.
// The default `maxRetries` is 3 which is a sensible number.
// If a query gets stuck, it's better to (relatively) quickly give up
// and retry. So if it takes longer than this time here, we're banking on
// that it was just bad luck and that it'll work if we simply try again.
// See internal issue #2318.
requestTimeout: 1900,
// It's important that requestTimeout * maxRetries is less than 10 seconds.
maxRetries: 5,
})
type getGeneralSearchResultsParams = {
indexName: string
searchParams: ComputedSearchQueryParamsMap['generalSearch']
topics?: string[]
includeTopics?: boolean
}
// The true work horse that actually performs the Elasticsearch query
export async function getSearchResults({
indexName,
query,
page,
size,
debug,
sort,
topics,
includeTopics,
usePrefixSearch,
highlights,
include,
toplevel,
aggregate,
}) {
// Query Elasticsearch for general search results
export async function getGeneralSearchResults(
args: getGeneralSearchResultsParams,
): Promise<GeneralSearchResponse> {
const {
indexName,
searchParams: {
highlights,
include,
toplevel,
aggregate,
autocomplete,
query,
page,
size,
debug,
sort,
},
topics,
includeTopics,
} = args
const usePrefixSearch = autocomplete
if (topics && !Array.isArray(topics)) {
throw new Error("'topics' has to be an array")
}
@ -71,8 +68,8 @@ export async function getSearchResults({
throw new Error("Every entry in the 'toplevel' must be a string")
}
}
const t0 = new Date()
const client = getClient()
const t0 = Date.now()
const client = getElasticsearchClient()
const from = size * (page - 1)
const matchQueries = getMatchQueries(query.trim(), {
@ -83,7 +80,7 @@ export async function getSearchResults({
},
})
const matchQuery = {
const matchQuery: Record<string, any> = {
bool: {
should: matchQueries,
// This allows filtering by toplevel later.
@ -91,7 +88,8 @@ export async function getSearchResults({
},
}
const topicsFilter = (topics || []).map((topic) => {
const topicsArray = Array.isArray(topics) ? topics : topics ? [topics] : []
const topicsFilter = topicsArray.map((topic) => {
return {
term: {
// Remember, 'topics' is a keyword field, meaning you need
@ -101,15 +99,18 @@ export async function getSearchResults({
}
})
if (topicsFilter.length) {
matchQuery.bool.filter = topicsFilter
matchQuery.bool.filter = matchQuery.bool.filter || []
matchQuery.bool.filter.push(...topicsFilter)
}
if (toplevel && toplevel.length) {
matchQuery.bool.filter = {
const toplevelArray = toplevel || []
if (toplevelArray.length) {
matchQuery.bool.filter = matchQuery.bool.filter || []
matchQuery.bool.filter.push({
terms: {
toplevel,
toplevel: toplevelArray,
},
}
})
}
const highlightFields = Array.from(highlights || DEFAULT_HIGHLIGHT_FIELDS)
@ -121,7 +122,7 @@ export async function getSearchResults({
const aggs = getAggregations(aggregate)
const searchQuery = {
const searchQuery: SearchRequest = {
index: indexName,
highlight,
from,
@ -136,13 +137,13 @@ export async function getSearchResults({
_source_includes: ['title', 'url', 'breadcrumbs', 'popularity', 'toplevel'],
}
if (includeTopics) {
searchQuery._source_includes.push('topics')
if (includeTopics && Array.isArray(searchQuery._source_includes)) {
searchQuery._source_includes?.push('topics')
}
for (const key of ['intro', 'headings']) {
if (include.includes(key)) {
searchQuery._source_includes.push(key)
for (const key of ['intro', 'headings'] as const) {
if (include.includes(key) && Array.isArray(searchQuery._source_includes)) {
searchQuery._source_includes?.push(key)
}
}
@ -193,26 +194,26 @@ export async function getSearchResults({
highlightFields,
include,
})
const aggregations = getAggregationsResult(aggregate, result.aggregations)
const t1 = new Date()
const aggregationsResult = getAggregationsResult(aggregate, result.aggregations)
const t1 = Date.now()
const meta = {
found: hitsAll.total,
found: hitsAll.total as SearchTotalHits,
took: {
query_msec: result.took,
total_msec: t1.getTime() - t0.getTime(),
total_msec: t1 - t0,
},
page,
size,
}
return { meta, hits, aggregations }
return { meta, hits, aggregations: aggregationsResult }
}
function getAggregations(aggregate) {
function getAggregations(aggregate?: string[]): Record<string, any> | undefined {
if (!aggregate || !aggregate.length) return undefined
const aggs = {}
const aggs: Record<string, any> = {}
for (const key of aggregate) {
aggs[key] = {
terms: {
@ -224,66 +225,37 @@ function getAggregations(aggregate) {
return aggs
}
function getAggregationsResult(aggregate, result) {
if (!aggregate || !aggregate.length) return
return Object.fromEntries(
aggregate.map((key) => [
key,
result[key].buckets
.map((bucket) => {
return {
key: bucket.key,
count: bucket.doc_count,
}
})
.sort((a, b) => a.key.localeCompare(b.key)),
]),
)
}
export async function getAutocompleteSearchResults({ indexName, query, size }) {
const client = getClient()
const matchQueries = getAutocompleteMatchQueries(query.trim(), {
fuzzy: {
minLength: 3,
maxLength: 20,
},
})
const matchQuery = {
bool: {
should: matchQueries,
},
}
const highlight = getHighlightConfiguration(query, ['term'])
const searchQuery = {
index: indexName,
highlight,
size,
query: matchQuery,
// Send absolutely minimal from Elasticsearch to here. Less data => faster.
_source_includes: ['term'],
}
const result = await client.search(searchQuery)
const hitsAll = result.hits
const hits = hitsAll.hits.map((hit) => {
return {
term: hit._source.term,
highlights: (hit.highlight && hit.highlight.term) || [],
function getAggregationsResult(
aggregate?: string[],
result?: Record<string, any>,
): Record<string, SearchAggregation[]> | undefined {
if (!aggregate || !aggregate.length || !result) return undefined
const aggregations: Record<string, SearchAggregation[]> = {}
for (const key of aggregate) {
if (result[key]?.buckets) {
aggregations[key] = result[key].buckets
.map((bucket: any) => ({
key: bucket.key as string,
count: bucket.doc_count as number,
}))
.sort((a: { key: string }, b: { key: string }) => a.key.localeCompare(b.key))
}
})
const meta = {
found: hitsAll.total,
}
return { meta, hits }
return aggregations
}
function getMatchQueries(query, { usePrefixSearch, fuzzy }) {
interface GetMatchQueriesOptions {
usePrefixSearch: boolean
fuzzy: {
minLength: number
maxLength: number
}
}
function getMatchQueries(
query: string,
{ usePrefixSearch, fuzzy }: GetMatchQueriesOptions,
): QueryDslQueryContainer[] {
const BOOST_PHRASE = 10.0
const BOOST_TITLE = 4.0
const BOOST_HEADINGS = 3.0
@ -296,7 +268,7 @@ function getMatchQueries(query, { usePrefixSearch, fuzzy }) {
// which wouldn't find anything else anyway.
const BOOST_FUZZY = 0.1
const matchQueries = []
const matchQueries: QueryDslQueryContainer[] = []
// If the query input is multiple words, it's good to know because you can
// make the query do `match_phrase` and you can make `match` query
@ -453,12 +425,12 @@ function getMatchQueries(query, { usePrefixSearch, fuzzy }) {
} else if (query.startsWith('http')) {
// E.g. `https://docs.github.com/en/some/page?foo=bar`
// will become a search on `{url: '/en/some/page'}`
let pathname
let pathname: string | undefined
try {
pathname = new URL(query).pathname
} catch {
// If it failed, it can't be initialized with the `URL` constructor
// we so we can deem it *not* a valid URL.
// so we can deem it *not* a valid URL.
}
if (pathname) {
matchQueries.push({
@ -471,47 +443,18 @@ function getMatchQueries(query, { usePrefixSearch, fuzzy }) {
return matchQueries
}
function getAutocompleteMatchQueries(query, { fuzzy }) {
const BOOST_PHRASE = 4.0
const BOOST_REGULAR = 2.0
const BOOST_FUZZY = 0.1 // make it always last in ranking
const matchQueries = []
// If the query input is multiple words, it's good to know because you can
// make the query do `match_phrase` and you can make `match` query
// with the `AND` operator (`OR` is the default).
const isMultiWordQuery = query.includes(' ') || query.includes('-')
if (isMultiWordQuery) {
matchQueries.push({
match_phrase_prefix: {
term: {
query,
boost: BOOST_PHRASE,
},
},
})
}
matchQueries.push({
match_bool_prefix: {
term: {
query,
boost: BOOST_REGULAR,
},
},
})
if (query.length > fuzzy.minLength && query.length < fuzzy.maxLength) {
matchQueries.push({
fuzzy: {
term: { value: query, boost: BOOST_FUZZY, fuzziness: 'AUTO' },
},
})
}
return matchQueries
interface GetHitsOptions {
indexName: string
debug?: boolean
includeTopics?: boolean
highlightFields: string[]
include: AdditionalIncludes[]
}
function getHits(hits, { indexName, debug, includeTopics, highlightFields, include }) {
function getHits(
hits: ElasticsearchHit<any>[],
{ indexName, debug = false, includeTopics = false, highlightFields, include }: GetHitsOptions,
): GeneralSearchHit[] {
return hits.map((hit) => {
// Return `hit.highlights[...]` based on the highlight fields requested.
// So if you searched with `&highlights=headings&highlights=content`
@ -521,11 +464,12 @@ function getHits(hits, { indexName, debug, includeTopics, highlightFields, inclu
// headings: [...]
// }
// even if there was a match on 'title'.
const hitHighlights = Object.fromEntries(
highlightFields.map((key) => [key, (hit.highlight && hit.highlight[key]) || []]),
)
const hitHighlights: Record<string, string[]> = {}
for (const key of highlightFields) {
hitHighlights[key] = (hit.highlight && hit.highlight[key]) || []
}
const result = {
const result: GeneralSearchHit = {
id: hit._id,
url: hit._source.url,
title: hit._source.title,
@ -536,87 +480,15 @@ function getHits(hits, { indexName, debug, includeTopics, highlightFields, inclu
result.topics = hit._source.topics || []
}
if (debug) {
result.score = hit._score || 0.0
result.popularity = hit._source.popularity || 0.0
result.score = hit._score ?? 0.0
result.popularity = hit._source.popularity ?? 0.0
if (isDevMode) {
result.es_url = `http://localhost:9200/${indexName}/_doc/${hit._id}`
}
}
for (const field of include || []) {
for (const field of include) {
result[field] = hit._source[field]
}
return result
})
}
// The highlight configuration is dependent on how we use the content
// in the UI. For example, we feel we need about 3 lines (max)
// of highlights of content under each title. If we feel it shows too
// many highlights in the search result UI, we can come back here
// and change it to something more appropriate.
function getHighlightConfiguration(query, highlights) {
const fields = {}
if (highlights.includes('title')) {
fields.title = {
// Fast Vector Highlighter
// Using this requires that you first index these fields
// with {term_vector: 'with_positions_offsets'}
type: 'fvh',
fragment_size: 200,
number_of_fragments: 1,
}
}
if (highlights.includes('content')) {
// The 'no_match_size' is so we can display *something* for the
// preview if there was no highlight match at all within the content.
fields.content = {
// Fast Vector Highlighter
// Using this requires that you first index these fields
// with {term_vector: 'with_positions_offsets'}
type: 'fvh',
fragment_size: 150,
number_of_fragments: 1,
no_match_size: 150,
highlight_query: {
match_phrase_prefix: {
content: {
query,
},
},
},
}
fields.content_explicit = {
// Fast Vector Highlighter
// Using this requires that you first index these fields
// with {term_vector: 'with_positions_offsets'}
type: 'fvh',
fragment_size: 150,
number_of_fragments: 1,
no_match_size: 0,
highlight_query: {
match_phrase_prefix: {
content_explicit: {
query,
},
},
},
}
}
if (highlights.includes('term')) {
fields.term = {
// Fast Vector Highlighter
// Using this requires that you first index these fields
// with {term_vector: 'with_positions_offsets'}
type: 'fvh',
// fragment_size: 200,
// number_of_fragments: 1,
}
}
return {
pre_tags: ['<mark>'],
post_tags: ['</mark>'],
fields,
}
}

View file

@ -0,0 +1,86 @@
import { SearchHighlight } from '@elastic/elasticsearch/lib/api/types'
import type { HighlightOptions } from '@/search/lib/search-request-params/types'
export interface HighlightConfig {
type: string
fragment_size?: number
number_of_fragments?: number
no_match_size?: number
highlight_query?: object
}
export type HighlightFields = {
[key in HighlightOptions]: HighlightConfig
}
// When we query Elasticsearch, we can specify a highlight configuration
export function getHighlightConfiguration(
query: string,
highlightsFields: HighlightOptions[],
): SearchHighlight {
const fields = {} as HighlightFields
if (highlightsFields.includes('title')) {
fields.title = {
// Fast Vector Highlighter
// Using this requires that you first index these fields
// with {term_vector: 'with_positions_offsets'}
type: 'fvh',
fragment_size: 200,
number_of_fragments: 1,
}
}
if (highlightsFields.includes('content')) {
fields.content = {
// Fast Vector Highlighter
// Using this requires that you first index these fields
// with {term_vector: 'with_positions_offsets'}
type: 'fvh',
fragment_size: 150,
number_of_fragments: 1,
// So we can at least display something if there was no highlight match within the content.
no_match_size: 150,
highlight_query: {
match_phrase_prefix: {
content: {
query,
},
},
},
}
fields.content_explicit = {
// Fast Vector Highlighter
// Using this requires that you first index these fields
// with {term_vector: 'with_positions_offsets'}
type: 'fvh',
fragment_size: 150,
number_of_fragments: 1,
no_match_size: 0,
highlight_query: {
match_phrase_prefix: {
content_explicit: {
query,
},
},
},
}
}
if (highlightsFields.includes('term')) {
fields.term = {
// Fast Vector Highlighter
// Using this requires that you first index these fields
// with {term_vector: 'with_positions_offsets'}
type: 'fvh',
}
}
const highlightConfig: SearchHighlight = {
pre_tags: ['<mark>'],
post_tags: ['</mark>'],
fields,
}
return highlightConfig
}
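// Illustrative usage sketch (not part of this PR): what the helper above returns
// for a two-field request. The import path is an assumption, since the new
// file's name is not shown in this diff.
import { getHighlightConfiguration } from '@/search/lib/get-elasticsearch-results/helpers/elasticsearch-highlight-config'

const highlight = getHighlightConfiguration('actions workflow', ['title', 'content'])
// pre_tags/post_tags wrap matches in <mark>…</mark>; `title` uses a single
// 200-char FVH fragment, and `content` falls back to a 150-char snippet when
// nothing matches (no_match_size).
console.log(JSON.stringify(highlight, null, 2))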

View file

@ -0,0 +1,23 @@
export interface AutocompleteResultsArgs {
indexName: string
query: string
size: number
}
export interface FuzzyConfig {
minLength: number
maxLength: number
}
export interface MatchQueriesOptions {
usePrefixSearch?: boolean
fuzzy: FuzzyConfig
}
export interface AutocompleteMatchQueriesOptions {
fuzzy: FuzzyConfig
}
export interface AutocompleteElasticsearchItem {
term: string
}

View file

@ -0,0 +1,31 @@
import { Client } from '@elastic/elasticsearch'
import { safeUrlDisplay } from '@/search/lib/helpers/strings'
export function getElasticsearchClient(overrideURL = '', verbose = false): Client {
const node = getElasticsearchURL(overrideURL)
if (verbose) {
console.log('Connecting to Elasticsearch URL:', safeUrlDisplay(node))
}
const client = new Client({ node })
return client
}
function getElasticsearchURL(overrideURL = ''): string {
if (!process.env.ELASTICSEARCH_URL && !overrideURL) {
throw new Error(
'Must pass the elasticsearch URL option or ' +
'set the environment variable ELASTICSEARCH_URL',
)
}
let node = overrideURL || process.env.ELASTICSEARCH_URL || ''
// Allow the user to lazily set it to `localhost:9200` for example.
if (!node.startsWith('http') && !node.startsWith('://') && node.split(':').length === 2) {
node = `http://${node}`
}
const parsed = new URL(node)
if (!parsed.hostname) throw new Error('no valid hostname')
return node
}
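// Illustrative usage sketch (not part of this PR): connecting with a lazily set
// URL. The import path is an assumption, since the new file's name is not shown
// in this diff.
import { getElasticsearchClient } from '@/search/lib/helpers/get-client'

// 'localhost:9200' is normalized to 'http://localhost:9200' before the client
// is constructed; verbose=true logs the URL with credentials masked.
const client = getElasticsearchClient('localhost:9200', true)
const health = await client.cluster.health()
console.log(health.status)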

View file

@ -0,0 +1,44 @@
import { allVersions } from '@/versions/lib/all-versions'
// TODO: Old version logic
type VersionAliases = { [key: string]: string }
export const versionAliases: VersionAliases = {}
export const prefixVersionAliases: VersionAliases = {}
Object.values(allVersions).forEach((info) => {
if (info.hasNumberedReleases) {
versionAliases[info.currentRelease] = info.miscVersionName
} else {
versionAliases[info.version] = info.miscVersionName
versionAliases[info.miscVersionName] = info.miscVersionName
}
prefixVersionAliases[info.plan] = info.shortName
prefixVersionAliases[info.shortName] = info.shortName
})
// Temporary hard-coded switch
//
// We need to run workflows in production to index the search data
// We want the middleware + routes that consume the indexes to consume the old indexes
// until the new indexes are ready.
// Once they are ready we can remove this file & clean up the places it is used
export function isBeforeSearchIndexMigration() {
if (process.env.NODE_ENV === 'production') return true
return false
}
// Old test prefix helper function
export function getGeneralSearchIndexPrefix(): string {
if (process.env.NODE_ENV === 'test') return 'tests_'
return ''
}
export function getGeneralSearchIndexVersion(paramVersion: string): string {
const version =
prefixVersionAliases[paramVersion] ||
versionAliases[paramVersion] ||
allVersions[paramVersion].miscVersionName
return version
}
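// Illustrative sketch (not part of this PR): how these legacy helpers compose a
// general-search index name while isBeforeSearchIndexMigration() is still true.
// Numbered releases resolve via versionAliases (a bare release number maps to
// its 'ghes-…' misc name); everything else resolves via the prefix aliases or
// allVersions.
import {
  getGeneralSearchIndexPrefix,
  getGeneralSearchIndexVersion,
} from '@/search/lib/helpers/old-version-logic'

function legacyGeneralSearchIndexName(paramVersion: string, language: string): string {
  // In tests the prefix is 'tests_', otherwise it is empty.
  return `${getGeneralSearchIndexPrefix()}github-docs-${getGeneralSearchIndexVersion(paramVersion)}-${language}`
}

console.log(legacyGeneralSearchIndexName('dotcom', 'en')) // github-docs-dotcom-en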

View file

@ -0,0 +1,10 @@
export function safeUrlDisplay(url: string): string {
const parsed = new URL(url)
if (parsed.password) {
parsed.password = '***'
}
if (parsed.username) {
parsed.username = parsed.username.slice(0, 4) + '***'
}
return parsed.toString()
}
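// Illustrative sketch (not part of this PR): credentials are masked before a
// URL is ever logged.
import { safeUrlDisplay } from '@/search/lib/helpers/strings'

console.log(safeUrlDisplay('https://elastic:s3cret@es.example.com:9200'))
// -> https://elas***:***@es.example.com:9200/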

View file

@ -33,3 +33,28 @@ export function utcTimestamp() {
.join('')
)
}
/**
* Converts a given number of seconds into a formatted time string "HH:mm:ss".
*
* @param {number} seconds - The total number of seconds to format.
* @returns {string} A string representing the time in "hours:minutes:seconds" format.
*
* @example
* // returns "01:30:45"
* formatSeconds(5445);
*/
export function formatSecondsToHHMMSS(seconds: number): string {
return new Date(seconds * 1000).toISOString().substr(11, 8)
}
export function readableTimeMinAndSec(ms: number): string {
if (ms < 1000) {
return `${ms.toFixed(1)}ms`
}
const seconds = ms / 1000
if (seconds > 60) {
return `${Math.round(seconds / 60)}m${Math.round(seconds % 60)}s`
}
return `${seconds.toFixed(1)}s`
}
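// Illustrative sketch (not part of this PR): sample outputs for the helpers
// above. The import path is an assumption; the values follow directly from the
// implementations.
import { formatSecondsToHHMMSS, readableTimeMinAndSec } from '@/search/lib/helpers/time'

console.log(readableTimeMinAndSec(850)) // '850.0ms'
console.log(readableTimeMinAndSec(2500)) // '2.5s'
console.log(readableTimeMinAndSec(95_000)) // '2m35s'
console.log(formatSecondsToHHMMSS(5445)) // '01:30:45'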

View file

@ -0,0 +1,96 @@
import type { Request } from 'express'
import { format } from 'node:util'
import { getElasticSearchIndex } from '@/search/lib/elasticsearch-indexes'
import {
ValidationError,
getSearchRequestParamsObject,
} from '@/search/lib/search-request-params/search-params-objects'
import {
getGeneralSearchIndexVersion,
getGeneralSearchIndexPrefix,
isBeforeSearchIndexMigration,
} from '@/search/lib/helpers/old-version-logic'
import type {
ComputedSearchQueryParams,
ComputedSearchQueryParamsMap,
GetSearchRequestReturn,
} from '@/search/lib/search-request-params/types'
import type { SearchTypes, SearchValidationErrorEntry } from '@/search/types'
type ForceParams = {
[K in keyof ComputedSearchQueryParams]?: ComputedSearchQueryParams[K]
}
// Fetches the Search Params Object based on the type of request and uses that object to validate the passed in request parameters
// For example, if the request is a general search request, the general search params object expects a `page` key, e.g. ?page=1 on the request
// If that key is not present, it will be added to the validation errors array which will result in a 400 to the user.
export function getSearchFromRequestParams<Type extends SearchTypes>(
req: Request,
type: Type,
forceParams: ForceParams = {} as ForceParams,
): GetSearchRequestReturn<Type> {
const searchParamsObject = getSearchRequestParamsObject(type)
const searchParams: ComputedSearchQueryParamsMap[Type] = {} as ComputedSearchQueryParamsMap[Type]
const validationErrors: SearchValidationErrorEntry[] = []
for (const { key, default_, cast, validate, multiple } of searchParamsObject) {
if (key in forceParams) {
;(searchParams[key] as any) = forceParams[key] as any
continue
}
let value = req.query[key]
if (!value || (typeof value === 'string' && !value.trim())) {
if (default_ === undefined) {
validationErrors.push({ error: `No truthy value for key '${key}'`, key })
continue
}
value = default_
}
if (cast) {
value = cast(value)
}
try {
if (validate && !validate(value)) {
validationErrors.push({
error: format('Not a valid value (%O) for key %O', value, key),
key,
})
}
} catch (err) {
if (err instanceof ValidationError) {
validationErrors.push({ error: err.toString(), field: key })
} else {
throw err
}
}
if (!multiple && Array.isArray(value)) {
validationErrors.push({
error: format('Cannot have multiple values (%O) for key %O', value, key),
key,
})
}
;(searchParams[key] as any) = value
}
let indexName = ''
if (!validationErrors.length) {
// generalSearch is the only type of search that uses the old index prefix logic, rather than the `getElasticSearchIndex` function logic
if (type === 'generalSearch' && isBeforeSearchIndexMigration()) {
indexName = `${getGeneralSearchIndexPrefix()}github-docs-${getGeneralSearchIndexVersion(searchParams.version)}-${searchParams.language}`
} else {
const getIndexResults = getElasticSearchIndex(
type,
searchParams.version,
searchParams.language,
)
indexName = getIndexResults.indexName
}
}
return { indexName, searchParams, validationErrors }
}
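// Illustrative sketch (not part of this PR): exercising the validation path
// described above with a minimal fake request. `query` has no default, so
// omitting it produces a validation error; the route handlers turn the first
// entry into a 400 response.
import type { Request } from 'express'
import { getSearchFromRequestParams } from '@/search/lib/search-request-params/get-search-from-request-params'

const badReq = { query: { size: '5' } } as unknown as Request
const { validationErrors } = getSearchFromRequestParams(badReq, 'generalSearch')
console.log(validationErrors[0]) // { error: "No truthy value for key 'query'", key: 'query' }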

View file

@ -0,0 +1,153 @@
/*
When a request is made to a /search endpoint with query parameters, e.g. ?query=foo&version=free-pro-team,
we need to validate and parse the parameters. This file contains the configuration for which parameters
to expect based on the type of search request "e.g. general search vs autocomplete search" and how to validate them.
*/
import languages from '@/languages/lib/languages'
import { allIndexVersionKeys, versionToIndexVersionMap } from '@/search/lib/elasticsearch-versions'
import { SearchTypes } from '@/search/types'
import { versionAliases } from '@/search/lib/helpers/old-version-logic'
import { allVersions } from '@/versions/lib/all-versions'
import type { SearchRequestQueryParams } from '@/search/lib/search-request-params/types'
// Entry to this file, returns the query parameters to expect based on the type of search request
export function getSearchRequestParamsObject(type: SearchTypes): SearchRequestQueryParams[] {
if (type === 'generalAutocomplete') {
return AUTOCOMPLETE_PARAMS_OBJ
} else if (type === 'aiSearchAutocomplete') {
return AI_SEARCH_AUTOCOMPLETE_PARAMS_OBJ
}
return GENERAL_SEARCH_PARAMS_OBJ
}
// - - - Everything below this line is for building the search query param objects - - - //
// Constants
const DEFAULT_AUTOCOMPLETE_SIZE = 5
const MAX_AUTOCOMPLETE_SIZE = 10
const DEFAULT_SIZE = 10
const MAX_SIZE = 50
const DEFAULT_PAGE = 1
const POSSIBLE_SORTS = ['best', 'relevance'] as const
const DEFAULT_SORT = POSSIBLE_SORTS[0]
const MAX_PAGE = 10
const V1_AGGREGATES = ['toplevel'] as const
export const POSSIBLE_HIGHLIGHT_FIELDS = ['title', 'content'] as const
// This needs to match what we *use* in the `<SearchResults>` component.
// For example, if we don't display "headings" we shouldn't request
// highlights for it either.
export const DEFAULT_HIGHLIGHT_FIELDS: readonly string[] = ['title', 'content']
export const V1_ADDITIONAL_INCLUDES = ['intro', 'headings', 'toplevel'] as const
export class ValidationError extends Error {}
const SHARED_PARAMS_OBJ: SearchRequestQueryParams[] = [
{ key: 'query' },
{
key: 'version',
default_: 'free-pro-team',
validate: (version: string) => {
if (!versionToIndexVersionMap[version]) {
throw new ValidationError(`'${version}' not in ${allIndexVersionKeys.join(', ')}`)
}
return true
},
},
]
const GENERAL_SEARCH_PARAMS_OBJ: SearchRequestQueryParams[] = [
...SHARED_PARAMS_OBJ,
{ key: 'query' },
// TODO: Overwrite with old version logic for now
{
key: 'version',
default_: 'dotcom',
validate: (v) => {
if (versionAliases[v] || allVersions[v]) return true
const valid = [...Object.keys(versionAliases), ...Object.keys(allVersions)]
throw new ValidationError(`'${v}' not in ${valid}`)
},
},
{ key: 'language', default_: 'en', validate: (v) => v in languages },
{
key: 'size',
default_: DEFAULT_SIZE,
cast: (v) => parseInt(v, 10),
validate: (v) => v >= 0 && v <= MAX_SIZE,
},
{
key: 'page',
default_: DEFAULT_PAGE,
cast: (v) => parseInt(v, 10),
validate: (v) => v >= 1 && v <= MAX_PAGE,
},
{ key: 'sort', default_: DEFAULT_SORT, validate: (v) => POSSIBLE_SORTS.includes(v as any) },
{
key: 'highlights',
default_: DEFAULT_HIGHLIGHT_FIELDS,
cast: (v) => (Array.isArray(v) ? v : [v]),
multiple: true,
validate: (v) => {
for (const highlight of v) {
if (!POSSIBLE_HIGHLIGHT_FIELDS.includes(highlight)) {
throw new ValidationError(`highlight value '${highlight}' is not valid`)
}
}
return true
},
},
{ key: 'autocomplete', default_: false, cast: toBoolean },
{ key: 'debug', default_: process.env.NODE_ENV === 'development', cast: toBoolean },
{
key: 'include',
default_: [],
cast: toArray,
multiple: true,
validate: (values) =>
values.every((value: string) => V1_ADDITIONAL_INCLUDES.includes(value as any)),
},
{
key: 'toplevel',
default_: [],
cast: toArray,
multiple: true,
},
{
key: 'aggregate',
default_: [],
cast: toArray,
multiple: true,
validate: (values) => values.every((value: string) => V1_AGGREGATES.includes(value as any)),
},
]
const SHARED_AUTOCOMPLETE_PARAMS_OBJ: SearchRequestQueryParams[] = [
{
key: 'size',
default_: DEFAULT_AUTOCOMPLETE_SIZE,
cast: (size: string) => parseInt(size, 10),
validate: (size: number) => size >= 0 && size <= MAX_AUTOCOMPLETE_SIZE,
},
]
const AI_SEARCH_AUTOCOMPLETE_PARAMS_OBJ: SearchRequestQueryParams[] = [
...SHARED_PARAMS_OBJ,
...SHARED_AUTOCOMPLETE_PARAMS_OBJ,
{ key: 'language', default_: 'en', validate: (language: string) => language === 'en' },
]
const AUTOCOMPLETE_PARAMS_OBJ: SearchRequestQueryParams[] = [
...SHARED_PARAMS_OBJ,
...SHARED_AUTOCOMPLETE_PARAMS_OBJ,
{ key: 'language', default_: 'en', validate: (language: string) => language in languages },
]
function toBoolean(value: any): boolean {
return value === 'true' || value === '1'
}
function toArray(value: any): any[] {
return Array.isArray(value) ? value : [value]
}
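// Illustrative sketch (not part of this PR): the descriptor pattern each entry
// in the *_PARAMS_OBJ arrays above follows, and how `cast` and `validate`
// interact for a query-string value. The standalone values are hypothetical.
import type { SearchRequestQueryParams } from '@/search/lib/search-request-params/types'

const sizeParam: SearchRequestQueryParams = {
  key: 'size',
  default_: 10, // used when ?size= is absent or blank
  cast: (v) => parseInt(v, 10), // query-string values arrive as strings
  validate: (v) => v >= 0 && v <= 50,
}

const value = sizeParam.cast!('25') // -> 25
console.log(sizeParam.validate!(value)) // -> true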

View file

@ -0,0 +1,52 @@
import { V1_ADDITIONAL_INCLUDES } from '@/search/lib/search-request-params/search-params-objects'
import { SearchTypes, SearchValidationErrorEntry } from '@/search/types'
export type HighlightOptions = 'title' | 'content' | 'content_explicit' | 'term'
export type AdditionalIncludes = (typeof V1_ADDITIONAL_INCLUDES)[number]
export interface ComputedSearchQueryParams {
query: string
size: number
version: string
language: string
// These are optional, so we need to use ComputedSearchQueryParamsMap in functions to get the exact types per Search Type
page?: number
sort?: string
highlights?: HighlightOptions[]
autocomplete?: boolean
debug?: boolean
include?: AdditionalIncludes[]
toplevel?: string[]
aggregate?: string[]
}
export interface ComputedSearchQueryParamsMap {
generalSearch: ComputedSearchQueryParams & {
page: number
sort: string
highlights: HighlightOptions[]
autocomplete: boolean
debug: boolean
include: AdditionalIncludes[]
toplevel: string[]
aggregate: string[]
}
generalAutocomplete: ComputedSearchQueryParams
aiSearchAutocomplete: ComputedSearchQueryParams
}
export interface SearchRequestQueryParams {
key: keyof ComputedSearchQueryParams
default_?: any
cast?: (value: any) => any
validate?: (value: any) => boolean
multiple?: boolean
}
export interface GetSearchRequestReturn<Type extends SearchTypes> {
indexName: string
searchParams: ComputedSearchQueryParamsMap[Type]
validationErrors: SearchValidationErrorEntry[]
}

View file

@ -1,153 +0,0 @@
import got from 'got'
import { errors } from '@elastic/elasticsearch'
import statsd from '#src/observability/lib/statsd.js'
import { getPathWithoutVersion, getPathWithoutLanguage } from '#src/frame/lib/path-utils.js'
import { getSearchFromRequest } from './get-search-request.js'
import { getSearchResults } from './es-search.js'
export default async function contextualizeSearch(req, res, next) {
// If it's NextJS fetching or data or it's a direct request,
// the pagePath is the "normalized" version
const { pagePath } = req
if (getPathWithoutLanguage(getPathWithoutVersion(pagePath)) !== '/search') {
return next()
}
// When you use `/api/search/v1?version=foo&language=xy&...`
// the language and version comes from the query string.
// When you use `/xz/enterprise-cloud@latest/search?query=hello`
// the `version` and `language` is implied from the URL pathname.
// search.version = req.context.currentVersion
// search.language = req.context.currentLanguage
const { search, validationErrors } = getSearchFromRequest(req, {
version: req.context.currentVersion,
language: req.context.currentLanguage,
})
if (validationErrors.map((error) => error.key).includes('query')) {
// 'query' is such an exception because the search result component
// will attempt to display its value even if there was any
// validation error. In a sense, it displays:
//
// You searched for "foo"
// But your 'page' parameter is invalid.
//
// If for example, the search input is an array, we pick the first
// value. If it's too long, we truncate it.
if (Array.isArray(search.query)) {
search.query = search.query[0]
} else if (!search.query) {
// If the 'query' query string parameter wasn't even present,
// it becomes `undefined`. But since `search.query` needs to be
// a *string*, we pretend it was provided but empty.
search.query = ''
}
}
// This enables so that when the search is sent to Elasticsearch
// it will request an aggregate by these keyword fields.
search.aggregate = ['toplevel']
req.context.search = { search, validationErrors }
if (!validationErrors.length && search.query) {
if (!process.env.ELASTICSEARCH_URL) {
// This is only true in local dev or in Preview environments.
// And in local dev, it's usually for content contributors who
// want to test a preview locally, but don't want to have to
// set up Elasticsearch.
// This same proxying logic happens in `middleware/api/index.js`
// too for the outwards facing `/api/search/v1` endpoint.
if (search.aggregate && search.toplevel && search.toplevel.length > 0) {
// Do 2 searches. One without filtering
const { toplevel, ...searchWithoutFilter } = search
searchWithoutFilter.size = 0
const { aggregations } = await getProxySearch(searchWithoutFilter)
const { aggregate, ...searchWithoutAggregate } = search
req.context.search.results = await getProxySearch(searchWithoutAggregate)
req.context.search.results.aggregations = aggregations
} else {
req.context.search.results = await getProxySearch(search)
}
} else {
// If this throws, so be it. Let it bubble up.
// In local dev, you get to see the error. In production,
// you get a "Oops! Something went wrong" which involves a Failbot
// send.
const tags = [`indexName:${search.indexName}`, `toplevels:${search.toplevel.length}`]
const timed = statsd.asyncTimer(getSearchResults, 'contextualize.search', tags)
try {
if (search.aggregate && search.toplevel && search.toplevel.length > 0) {
// Do 2 searches. One without filtering
const { toplevel, ...searchWithoutFilter } = search
searchWithoutFilter.size = 0
const { aggregations } = await timed(searchWithoutFilter)
req.context.search.results = await timed(search)
req.context.search.results.aggregations = aggregations
} else {
req.context.search.results = await timed(search)
}
} catch (error) {
// If the error coming from the Elasticsearch client is any sort
// of 4xx error, it will be bubbled up to the next middleware
// which might think something else is wrong with the *client's*
// request from the outside. But in reality it's not their fault.
// It's our fault in the backend side. So we throw a new error
// so that this failure to seach ultimately bubbles up to a
// proper 500 error (including Failbot reporting).
// In particular, this helps platform developers working on the
// Elasticsearch searching code.
if (error instanceof errors.ElasticsearchClientError) {
console.error('Error calling getSearchResults(%s):', search, error)
if (error.meta?.body) {
console.error(`Meta:`, error.meta.body)
}
throw new Error(error.message)
} else {
throw error
}
}
}
}
return next()
}
// When you use the proxy to prod, using its API, we need to "convert"
// the parameters we have figured out here in the contextualizer.
// Thankfully all the names match. For example, we might figure
// the page by doing `req.context.search.page = 123` and now we need to
// add that to the query string for the `/api/search/v1`.
// We inclusion-list all the keys that we want to take from the search
// object into the query string URL.
const SEARCH_KEYS_TO_QUERY_STRING = [
'query',
'version',
'language',
'page',
'aggregate',
'toplevel',
'size',
]
async function getProxySearch(search) {
const url = new URL('https://docs.github.com/api/search/v1')
for (const key of SEARCH_KEYS_TO_QUERY_STRING) {
const value = search[key]
if (typeof value === 'boolean') {
url.searchParams.set(key, value ? 'true' : 'false')
} else if (Array.isArray(value)) {
for (const v of value) {
url.searchParams.append(key, v)
}
} else if (typeof value === 'number') {
url.searchParams.set(key, `${value}`)
} else if (value) {
url.searchParams.set(key, value)
}
}
console.log(`Proxying search to ${url}`)
return got(url).json()
}

View file

@ -0,0 +1,174 @@
/*
This file & middleware is for when a user requests our /search page e.g. 'docs.github.com/search?query=foo'
We make whatever search is in the ?query= parameter and attach it to req.search
req.search is then consumed by the search component in 'src/search/pages/search.tsx'
When a user directly hits our API e.g. /api/search/v1?query=foo, they will hit the routes in ./search-routes.ts
*/
import got from 'got'
import { Request, Response, NextFunction } from 'express'
import { errors } from '@elastic/elasticsearch'
import statsd from '@/observability/lib/statsd.js'
import { getPathWithoutVersion, getPathWithoutLanguage } from '@/frame/lib/path-utils'
import { getGeneralSearchResults } from '@/search/lib/get-elasticsearch-results/general-search'
import { getSearchFromRequestParams } from '@/search/lib/search-request-params/get-search-from-request-params'
import type { ComputedSearchQueryParamsMap } from '@/search/lib/search-request-params/types'
import type {
GeneralSearchResponse,
SearchOnReqObject,
SearchTypes,
SearchValidationErrorEntry,
} from '@/search/types.js'
interface Context<Type extends SearchTypes> {
currentVersion: string
currentLanguage: string
search: SearchOnReqObject<Type>
}
interface CustomRequest<Type extends SearchTypes> extends Request {
pagePath: string
context: Context<Type>
}
export default async function contextualizeGeneralSearch(
req: CustomRequest<'generalSearch'>,
res: Response,
next: NextFunction,
): Promise<void> {
const { pagePath } = req
if (getPathWithoutLanguage(getPathWithoutVersion(pagePath)) !== '/search') {
return next()
}
// Since this is a middleware language & version are already set in req.context via a prior middleware
const { indexName, searchParams, validationErrors } = getSearchFromRequestParams(
req,
'generalSearch',
// Force the version and language keys to be set from the `req.context` object
{
version: req.context.currentVersion,
language: req.context.currentLanguage,
},
)
if (validationErrors.map((error: SearchValidationErrorEntry) => error.key).includes('query')) {
if (Array.isArray(searchParams.query)) {
searchParams.query = searchParams.query[0]
} else if (!searchParams.query) {
searchParams.query = '' // If 'undefined' we need to cast to string
}
}
searchParams.aggregate = ['toplevel']
req.context.search = {
searchParams,
validationErrors,
}
if (!validationErrors.length && searchParams.query) {
// In local dev ELASTICSEARCH_URL may not be set, so we proxy the search to prod
if (!process.env.ELASTICSEARCH_URL) {
if (searchParams.aggregate && searchParams.toplevel && searchParams.toplevel.length > 0) {
// Do 2 searches. One without filtering to get the aggregations
const searchWithoutFilter = Object.fromEntries(
Object.entries(searchParams).filter(([key]) => key !== 'toplevel'),
)
searchWithoutFilter.size = 0
const { aggregations } = await getProxySearch(
searchWithoutFilter as ComputedSearchQueryParamsMap['generalSearch'],
)
const searchWithoutAggregate = Object.fromEntries(
Object.entries(searchParams).filter(([key]) => key !== 'aggregate'),
)
req.context.search.results = await getProxySearch(
searchWithoutAggregate as ComputedSearchQueryParamsMap['generalSearch'],
)
req.context.search.results.aggregations = aggregations
} else {
req.context.search.results = await getProxySearch(searchParams)
}
} else {
const tags: string[] = [`indexName:${indexName}`, `toplevels:${searchParams.toplevel.length}`]
const timed = statsd.asyncTimer(getGeneralSearchResults, 'contextualize.search', tags)
const getGeneralSearchArgs = {
indexName,
searchParams,
}
try {
if (searchParams.aggregate && searchParams.toplevel && searchParams.toplevel.length > 0) {
// Do 2 searches. One without filtering to get the aggregations
const searchWithoutFilter = Object.fromEntries(
Object.entries(searchParams).filter(([key]) => key !== 'toplevel'),
)
searchWithoutFilter.size = 0
const { aggregations } = await timed({
...getGeneralSearchArgs,
searchParams: searchWithoutFilter as ComputedSearchQueryParamsMap['generalSearch'],
})
req.context.search.results = await timed(getGeneralSearchArgs)
req.context.search.results.aggregations = aggregations
} else {
req.context.search.results = await timed(getGeneralSearchArgs)
}
} catch (error) {
// If the Elasticsearch sends a 4XX we want the user to see a 500
if (error instanceof errors.ResponseError) {
console.error(
'Error calling getSearchResults(%s):',
JSON.stringify({
indexName,
searchParams,
}),
error,
)
if (error?.meta?.body) {
console.error(`Meta:`, error.meta.body)
}
throw new Error(error.message)
} else {
throw error
}
}
}
}
return next()
}
const SEARCH_KEYS_TO_QUERY_STRING: (keyof ComputedSearchQueryParamsMap['generalSearch'])[] = [
'query',
'version',
'language',
'page',
'aggregate',
'toplevel',
'size',
]
// Proxy the API endpoint with the relevant search params
async function getProxySearch(
search: ComputedSearchQueryParamsMap['generalSearch'],
): Promise<GeneralSearchResponse> {
const url = new URL('https://docs.github.com/api/search/v1')
for (const key of SEARCH_KEYS_TO_QUERY_STRING) {
const value = search[key]
if (typeof value === 'boolean') {
url.searchParams.set(key, value ? 'true' : 'false')
} else if (Array.isArray(value)) {
for (const v of value) {
url.searchParams.append(key, v)
}
} else if (typeof value === 'number') {
url.searchParams.set(key, `${value}`)
} else if (value) {
url.searchParams.set(key, value)
}
}
console.log(`Proxying search to ${url}`)
return got(url).json<GeneralSearchResponse>()
}
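// Illustrative sketch (not part of this PR): the URL getProxySearch builds for
// a typical contextualized search. The sample object is hypothetical; the loop
// mirrors the key handling above (arrays appended per value, numbers and
// booleans stringified).
const sample = {
  query: 'actions',
  version: 'free-pro-team@latest',
  language: 'en',
  page: 1,
  aggregate: ['toplevel'],
  size: 10,
}
const proxyUrl = new URL('https://docs.github.com/api/search/v1')
for (const [key, value] of Object.entries(sample)) {
  if (Array.isArray(value)) {
    for (const v of value) proxyUrl.searchParams.append(key, v)
  } else {
    proxyUrl.searchParams.set(key, String(value))
  }
}
console.log(proxyUrl.toString())
// https://docs.github.com/api/search/v1?query=actions&version=free-pro-team%40latest&language=en&page=1&aggregate=toplevel&size=10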

View file

@ -1,229 +0,0 @@
import { format } from 'node:util'
import languages from '#src/languages/lib/languages.js'
import { allVersions } from '#src/versions/lib/all-versions.js'
import { POSSIBLE_HIGHLIGHT_FIELDS, DEFAULT_HIGHLIGHT_FIELDS } from './es-search.js'
const DEFAULT_SIZE = 10
const DEFAULT_AUTOCOMPLETE_SIZE = 8
const MAX_SIZE = 50 // How much you return has a strong impact on performance
const MAX_AUTOCOMPLETE_SIZE = 10
const DEFAULT_PAGE = 1
const POSSIBLE_SORTS = ['best', 'relevance']
const DEFAULT_SORT = POSSIBLE_SORTS[0]
const MAX_PAGE = 10
// There are some fields you can optionally include in the output.
// These are fields available in Elasticsearch that we don't include in
// the output by default. E.g. `...&include=intro`
// Requesting anything that is not in this list will result in
// a 400 Bad Request.
const V1_ADDITIONAL_INCLUDES = ['intro', 'headings', 'toplevel']
const V1_AGGREGATES = ['toplevel']
// If someone searches for `...&version=3.5` what they actually mean
// is `ghes-3.5`. This is because of legacy formatting with the old search.
// In some distant future we can clean up any client enough that this
// aliasing won't be necessary.
const versionAliases = {}
const prefixVersionAliases = {}
Object.values(allVersions).forEach((info) => {
if (info.hasNumberedReleases) {
versionAliases[info.currentRelease] = info.miscVersionName
} else {
versionAliases[info.version] = info.miscVersionName
versionAliases[info.miscVersionName] = info.miscVersionName
}
// This makes it so you can search for `?version=enterprise-server`
// and that actually means `?version=ghes` because there's an index
// called `github-autocomplete-en-ghes`.
prefixVersionAliases[info.plan] = info.shortName
prefixVersionAliases[info.shortName] = info.shortName
})
function getIndexPrefix() {
// This logic is mirrored in the scripts we use before running tests
// In particular, see the `index-test-fixtures` npm script.
// That's expected to be run before CI and local vitest testing.
// The reason we have a deliberately different index name (by prefix)
// for testing compared to regular operation is to make it convenient
// for engineers working on local manual testing *and* automated
// testing without have to re-index different content (e.g. fixtures
// vs real content) on the same index name.
if (process.env.NODE_ENV === 'test') return 'tests_'
return ''
}
class ValidationError extends Error {}
const PARAMS = [
{ key: 'query' },
{
key: 'version',
default_: 'dotcom',
validate: (v) => {
if (versionAliases[v] || allVersions[v]) return true
const valid = [...Object.keys(versionAliases), ...Object.keys(allVersions)]
throw new ValidationError(`'${v}' not in ${valid}`)
},
},
{ key: 'language', default_: 'en', validate: (v) => v in languages },
{
key: 'size',
default_: DEFAULT_SIZE,
cast: (v) => parseInt(v, 10),
validate: (v) => v >= 0 && v <= MAX_SIZE,
},
{
key: 'page',
default_: DEFAULT_PAGE,
cast: (v) => parseInt(v, 10),
validate: (v) => v >= 1 && v <= MAX_PAGE,
},
{ key: 'sort', default_: DEFAULT_SORT, validate: (v) => POSSIBLE_SORTS.includes(v) },
{
key: 'highlights',
default_: DEFAULT_HIGHLIGHT_FIELDS,
cast: (v) => (Array.isArray(v) ? v : [v]),
multiple: true,
validate: (v) => {
for (const highlight of v) {
if (!POSSIBLE_HIGHLIGHT_FIELDS.includes(highlight)) {
throw new ValidationError(`highlight value '${highlight}' is not valid`)
}
}
return true
},
},
{ key: 'autocomplete', default_: false, cast: toBoolean },
{ key: 'debug', default_: process.env.NODE_ENV === 'development', cast: toBoolean },
{
key: 'include',
default_: [],
cast: toArray,
multiple: true,
// Note: At the time of writing this general validator middleware
// doesn't yet know it's being used by the v1 version.
// But we don't have any other versions yet so no need to
// over-engineer this more.
validate: (values) => values.every((value) => V1_ADDITIONAL_INCLUDES.includes(value)),
},
{
key: 'toplevel',
default_: [],
cast: toArray,
multiple: true,
},
{
key: 'aggregate',
default_: [],
cast: toArray,
multiple: true,
validate: (values) => values.every((value) => V1_AGGREGATES.includes(value)),
},
]
const AUTOCOMPLETE_PARAMS = [
{ key: 'query' },
{ key: 'language', default_: 'en', validate: (v) => v in languages },
{
key: 'version',
default_: 'free-pro-team',
validate: (v) => {
if (prefixVersionAliases[v] || allVersions[v]) return true
if (Object.values(prefixVersionAliases).includes(v)) return true
const valid = [
...Object.keys(prefixVersionAliases),
...Object.values(prefixVersionAliases),
...Object.keys(allVersions),
]
throw new ValidationError(`'${v}' not in ${valid.join(', ')}`)
},
},
{
key: 'size',
default_: DEFAULT_AUTOCOMPLETE_SIZE,
cast: (v) => parseInt(v, 10),
validate: (v) => v >= 0 && v <= MAX_AUTOCOMPLETE_SIZE,
},
]
export function getAutocompleteSearchFromRequest(req, force = {}) {
const { search, validationErrors } = getSearchFromRequest(req, {}, AUTOCOMPLETE_PARAMS)
if (validationErrors.length === 0) {
const version = prefixVersionAliases[search.version] || allVersions[search.version].shortName
search.indexName = `${getIndexPrefix()}github-autocomplete-${search.language}-${version}`
}
return { search, validationErrors }
}
export function getSearchFromRequest(req, force = {}, params = PARAMS) {
const search = {}
const validationErrors = []
for (const { key, default_, cast, validate, multiple } of params) {
// This is necessary because when the version or language comes from
// the pathname, we don't want pick these up from the query string.
// This function gets used by /$locale/$version/search
// *and* /api/search/v1?language=$locale&version=$version
if (key in force) {
search[key] = force[key]
continue
}
let value = req.query[key]
if (!value || (typeof value === 'string' && !value.trim())) {
if (default_ === undefined) {
// no value and no default, bad!
validationErrors.push({ error: `No truthy value for key '${key}'`, key })
continue
}
value = default_
}
if (cast) {
value = cast(value)
}
try {
if (validate && !validate(value)) {
validationErrors.push({
error: format('Not a valid value (%O) for key %O', value, key),
key,
})
}
} catch (err) {
if (err instanceof ValidationError) {
validationErrors.push({ error: err.toString(), field: key })
} else {
throw err
}
}
if (!multiple && Array.isArray(value)) {
validationErrors.push({
error: format('Cannot have multiple values (%O) for key %O', value, key),
key,
})
}
search[key] = value
}
if (!validationErrors.length) {
const version =
prefixVersionAliases[search.version] ||
versionAliases[search.version] ||
allVersions[search.version].miscVersionName
search.indexName = `${getIndexPrefix()}github-docs-${version}-${search.language}` // github-docs-ghes-3.5-en
}
return { search, validationErrors }
}
function toBoolean(value) {
if (value === 'true' || value === '1') return true
return false
}
function toArray(value) {
return Array.isArray(value) ? value : [value]
}

View file

@ -0,0 +1,150 @@
/*
This file and the routes included are for the /search endpoint of our API
For general search (client searches on docs.github.com) we use the middleware in ./general-search-middleware to get the search results
*/
import express, { Request, Response } from 'express'
import FailBot from '@/observability/lib/failbot.js'
import { searchCacheControl } from '@/frame/middleware/cache-control.js'
import catchMiddlewareError from '@/observability/middleware/catch-middleware-error.js'
import {
setFastlySurrogateKey,
SURROGATE_ENUMS,
} from '@/frame/middleware/set-fastly-surrogate-key.js'
import { getAutocompleteSearchResults } from '@/search/lib/get-elasticsearch-results/general-autocomplete'
import { getAISearchAutocompleteResults } from '@/search/lib/get-elasticsearch-results/ai-search-autocomplete'
import { getSearchFromRequestParams } from '@/search/lib/search-request-params/get-search-from-request-params'
import { getGeneralSearchResults } from '@/search/lib/get-elasticsearch-results/general-search'
const router = express.Router()
router.get('/legacy', (req: Request, res: Response) => {
res.status(410).send('Use /api/search/v1 instead.')
})
router.get(
'/v1',
catchMiddlewareError(async (req: Request, res: Response) => {
const { indexName, searchParams, validationErrors } = getSearchFromRequestParams(
req,
'generalSearch',
)
if (validationErrors.length) {
// We only send the first validation error to the user
return res.status(400).json(validationErrors[0])
}
const getResultOptions = {
indexName,
searchParams,
}
try {
const { meta, hits, aggregations } = await getGeneralSearchResults(getResultOptions)
if (process.env.NODE_ENV !== 'development') {
searchCacheControl(res)
// We can cache this without purging it after every deploy
// because the API search is only used as a proxy for local
// and preview environments.
setFastlySurrogateKey(res, SURROGATE_ENUMS.MANUAL)
}
res.status(200).json({ meta, hits, aggregations })
} catch (error) {
await handleGetSearchResultsError(req, res, error, getResultOptions)
}
}),
)
router.get(
'/autocomplete/v1',
catchMiddlewareError(async (req: Request, res: Response) => {
const {
indexName,
validationErrors,
searchParams: { query, size },
} = getSearchFromRequestParams(req, 'generalAutocomplete')
if (validationErrors.length) {
return res.status(400).json(validationErrors[0])
}
const options = {
indexName,
query,
size,
}
try {
const { meta, hits } = await getAutocompleteSearchResults(options)
if (process.env.NODE_ENV !== 'development') {
searchCacheControl(res)
setFastlySurrogateKey(res, SURROGATE_ENUMS.MANUAL)
}
res.status(200).json({ meta, hits })
} catch (error) {
await handleGetSearchResultsError(req, res, error, options)
}
}),
)
router.get(
'/ai-search-autocomplete/v1',
catchMiddlewareError(async (req: Request, res: Response) => {
const {
indexName,
validationErrors,
searchParams: { query, size },
} = getSearchFromRequestParams(req, 'aiSearchAutocomplete')
if (validationErrors.length) {
return res.status(400).json(validationErrors[0])
}
const getResultOptions = {
indexName,
query,
size,
}
try {
const { meta, hits } = await getAISearchAutocompleteResults(getResultOptions)
if (process.env.NODE_ENV !== 'development') {
searchCacheControl(res)
setFastlySurrogateKey(res, SURROGATE_ENUMS.MANUAL)
}
res.status(200).json({ meta, hits })
} catch (error) {
await handleGetSearchResultsError(req, res, error, getResultOptions)
}
}),
)
async function handleGetSearchResultsError(req: Request, res: Response, error: any, options: any) {
if (process.env.NODE_ENV === 'development') {
console.error(`Error calling getSearchResults(${options})`, error)
} else {
const reports = FailBot.report(error, { url: req.url, ...options })
if (reports) await Promise.all(reports)
}
res.status(500).json({ error: error.message })
}
// Redirects for latest versions
router.get('/', (req: Request, res: Response) => {
res.redirect(307, req.originalUrl.replace('/search', '/search/v1'))
})
router.get('/autocomplete', (req: Request, res: Response) => {
res.redirect(307, req.originalUrl.replace('/search/autocomplete', '/search/autocomplete/v1'))
})
router.get('/ai-search-autocomplete', (req: Request, res: Response) => {
res.redirect(
307,
req.originalUrl.replace('/search/ai-search-autocomplete', '/search/ai-search-autocomplete/v1'),
)
})
export default router
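// Illustrative sketch (not part of this PR): how this router might be mounted.
// The '/api/search' mount point is an assumption based on the /api/search/v1
// paths referenced above, and the import path is hypothetical.
import express from 'express'
import searchRoutes from '@/search/middleware/search-routes'

const app = express()
app.use('/api/search', searchRoutes)
// GET /api/search/v1?query=foo                       -> general search
// GET /api/search/autocomplete/v1?query=fo           -> general autocomplete
// GET /api/search/ai-search-autocomplete/v1?query=fo -> AI search autocomplete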

View file

@ -1,160 +0,0 @@
import express from 'express'
import FailBot from '#src/observability/lib/failbot.js'
import { searchCacheControl } from '#src/frame/middleware/cache-control.js'
import catchMiddlewareError from '#src/observability/middleware/catch-middleware-error.js'
import {
setFastlySurrogateKey,
SURROGATE_ENUMS,
} from '#src/frame/middleware/set-fastly-surrogate-key.js'
import { getAutocompleteSearchResults, getSearchResults } from './es-search.js'
import { getAutocompleteSearchFromRequest, getSearchFromRequest } from './get-search-request.js'
const router = express.Router()
router.get('/legacy', (req, res) => {
res.status(410).send('Use /api/search/v1 instead.')
})
export const validationMiddleware = (req, res, next) => {
const { search, validationErrors } = getSearchFromRequest(req)
if (validationErrors.length) {
// There might be multiple things bad about the query parameters,
// but we send a 400 on the first possible one in the API.
return res.status(400).json(validationErrors[0])
}
req.search = search
return next()
}
router.get(
'/v1',
validationMiddleware,
catchMiddlewareError(async function search(req, res) {
const {
indexName,
query,
autocomplete,
page,
size,
debug,
sort,
highlights,
include,
toplevel,
aggregate,
} = req.search
const options = {
indexName,
query,
page,
size,
debug,
sort,
highlights,
usePrefixSearch: autocomplete,
include,
toplevel,
aggregate,
}
try {
const { meta, hits, aggregations } = await getSearchResults(options)
if (process.env.NODE_ENV !== 'development') {
searchCacheControl(res)
// We can cache this without purging it after every deploy
// because the API search is only used as a proxy for local
// and preview environments.
setFastlySurrogateKey(res, SURROGATE_ENUMS.MANUAL)
}
// The v1 version of the output matches perfectly what comes out
// of the getSearchResults() function.
res.status(200).json({ meta, hits, aggregations })
} catch (error) {
// If getSearchResult() throws an error that might be 404 inside
// elasticsearch, if we don't capture that here, it will propagate
// to the next middleware.
await handleGetSearchResultsError(req, res, error, options)
}
}),
)
export const autocompleteValidationMiddleware = (req, res, next) => {
const { search, validationErrors } = getAutocompleteSearchFromRequest(req)
if (validationErrors.length) {
// There might be multiple things bad about the query parameters,
// but we send a 400 on the first possible one in the API.
return res.status(400).json(validationErrors[0])
}
req.search = search
return next()
}
router.get(
'/autocomplete/v1',
autocompleteValidationMiddleware,
catchMiddlewareError(async (req, res) => {
const { indexName, query, size } = req.search
const options = {
indexName,
query,
size,
}
try {
const { meta, hits } = await getAutocompleteSearchResults(options)
if (process.env.NODE_ENV !== 'development') {
searchCacheControl(res)
// We can cache this without purging it after every deploy
// because the API search is only used as a proxy for local
// and preview environments.
setFastlySurrogateKey(res, SURROGATE_ENUMS.MANUAL)
}
// The v1 version of the output matches perfectly what comes out
// of the getSearchResults() function.
res.status(200).json({ meta, hits })
} catch (error) {
// If getSearchResult() throws an error that might be 404 inside
// elasticsearch, if we don't capture that here, it will propagate
// to the next middleware.
await handleGetSearchResultsError(req, res, error, options)
}
}),
)
// We have more than one place where we do `try{...} catch error( THIS )`
// which is slightly different depending on the "sub-version" (e.g. /legacy)
// This function is a single place to take care of all of these error handlings
async function handleGetSearchResultsError(req, res, error, options) {
if (process.env.NODE_ENV === 'development') {
console.error(`Error calling getSearchResults(${options})`, error)
} else {
const reports = FailBot.report(error, Object.assign({ url: req.url }, options))
// It might be `undefined` if no backends are configured which
// is likely when using production NODE_ENV on your laptop
// where you might not have a HAYSTACK_URL configured.
if (reports) await Promise.all(reports)
}
res.status(500).json({ error: error.message })
}
// Alias for the latest version
router.get('/', (req, res) => {
// At the time of writing, the latest version is v1. (July 2022)
// Use `req.originalUrl` because this router is "self contained"
// which means that `req.url` will be `/` in this context.
res.redirect(307, req.originalUrl.replace('/search', '/search/v1'))
})
// Alias for the latest autocomplete version
router.get('/autocomplete', (req, res) => {
res.redirect(307, req.originalUrl.replace('/search/autocomplete', '/search/autocomplete/v1'))
})
export default router

View file

@ -7,9 +7,10 @@ import {
addUINamespaces,
} from 'src/frame/components/context/MainContext'
import { DefaultLayout } from 'src/frame/components/DefaultLayout'
import type { SearchT } from 'src/search/components/types'
import { SearchContext, SearchContextT } from 'src/search/components/context/SearchContext'
import { SearchContext } from 'src/search/components/context/SearchContext'
import { Search } from 'src/search/components/index'
import { SearchOnReqObject } from 'src/search/types'
import type { SearchContextT } from 'src/search/components/types'
type Props = {
mainContext: MainContextT
@ -40,6 +41,8 @@ export const getServerSideProps: GetServerSideProps<Props> = async (context) =>
throw new Error('Expected req.context to be populated with .search')
}
const searchObject = req.context.search as SearchOnReqObject<'generalSearch'>
// The `req.context.search` is similar to what's needed to React
// render the search result page.
// But it contains information (from the contextualizing) that is
@ -48,24 +51,24 @@ export const getServerSideProps: GetServerSideProps<Props> = async (context) =>
// `page` and `indexName` which was useful when it made the actual
// Elasticsearch query. But it's not needed to render the results.
// We explicitly pick out the parts that are needed, only.
const search: SearchT = {
search: {
query: req.context.search.search.query,
debug: req.context.search.search.debug,
const search: SearchContextT['search'] = {
searchParams: {
query: searchObject.searchParams.query,
debug: searchObject.searchParams.debug,
},
validationErrors: req.context.search.validationErrors,
validationErrors: searchObject.validationErrors,
}
// If there are no results (e.g. /en/search?query=) from the
// contextualizing, then `req.context.search.results` will
// be `undefined` which can't be serialized as a prop, using JSON.stringify.
if (req.context.search.results) {
if (searchObject.results) {
search.results = {
meta: req.context.search.results.meta,
hits: req.context.search.results.hits,
meta: searchObject.results.meta,
hits: searchObject.results.hits,
// Use `null` instead of `undefined` for JSON serialization.
// The only reason it would ever not be truthy is if the aggregates
// functionality is not enabled for this version.
aggregations: req.context.search.results.aggregations || null,
aggregations: searchObject.results.aggregations || null,
}
}

View file

@ -1,24 +1,19 @@
#!/usr/bin/env node
// [start-readme]
//
// See how a piece of text gets turned into tokens by the different
// analyzers.
// See how a piece of text gets turned into tokens by the different analyzers.
// Requires that the index exists in Elasticsearch.
//
// Example:
//
// npm run analyze-text "my words" to tokenize
//
// [end-readme]
// npm run analyze-text -- -V dotcom -l en "The name of the wind"
import { Client } from '@elastic/elasticsearch'
import { program, Option } from 'commander'
import { Command, Option } from 'commander'
import chalk from 'chalk'
import dotenv from 'dotenv'
import { languageKeys } from '#src/languages/lib/languages.js'
import { allVersions } from '#src/versions/lib/all-versions.js'
import { languageKeys } from '@/languages/lib/languages.js'
import { allVersions } from '@/versions/lib/all-versions.js'
import type { IndicesAnalyzeAnalyzeToken } from '@elastic/elasticsearch/lib/api/types'
// Now you can optionally have set the ELASTICSEARCH_URL in your .env file.
dotenv.config()
@ -38,16 +33,28 @@ dotenv.config()
//
// We need this later to be able to map CLI arguments to what the
// records are called when found on disk.
const shortNames = Object.fromEntries(
Object.values(allVersions).map((info) => {
const shortName = info.hasNumberedReleases
? info.miscBaseName + info.currentRelease
: info.miscBaseName
return [shortName, info]
}),
)
const shortNames: Record<string, (typeof allVersions)[keyof typeof allVersions]> =
Object.fromEntries(
Object.values(allVersions).map((info) => {
const shortName = info.hasNumberedReleases
? `${info.miscBaseName}${info.currentRelease}`
: info.miscBaseName
return [shortName, info]
}),
)
const allVersionKeys = Object.keys(shortNames)
const allVersionKeys: string[] = Object.keys(shortNames)
interface Options {
verbose?: boolean
version?: string
language?: string
notLanguage?: string
elasticsearchUrl?: string
indexPrefix?: string
}
const program = new Command()
program
.description('Analyze text into tokens')
@ -56,21 +63,29 @@ program
.addOption(
new Option('-l, --language <LANGUAGE>', 'Which language to focus on').choices(languageKeys),
)
.option('--not-language <LANGUAGE>', 'Exclude a specific language')
.option('-u, --elasticsearch-url <url>', 'If different from $ELASTICSEARCH_URL')
.option('--index-prefix <PREFIX>', 'Prefix for the index name')
.argument('<text>', 'text to tokenize')
.parse(process.argv)
main(program.opts(), program.args)
const options = program.opts<Options>()
const args: string[] = program.args
async function main(opts, args) {
main(options, args).catch((err) => {
console.error(chalk.red('Error:'), err)
process.exit(1)
})
async function main(opts: Options, args: string[]): Promise<void> {
const texts = [args.join(' ')]
if (!opts.elasticsearchUrl && !process.env.ELASTICSEARCH_URL) {
throw new Error(
'Must passed the elasticsearch URL option or ' +
'Must pass the elasticsearch URL option or ' +
'set the environment variable ELASTICSEARCH_URL',
)
}
let node = opts.elasticsearchUrl || process.env.ELASTICSEARCH_URL
let node = opts.elasticsearchUrl || process.env.ELASTICSEARCH_URL!
// Allow the user to lazily set it to `localhost:9200` for example.
if (!node.startsWith('http') && !node.startsWith('://') && node.split(':').length === 2) {
@ -79,15 +94,15 @@ async function main(opts, args) {
try {
const parsed = new URL(node)
if (!parsed.hostname) throw new Error('no valid hostname')
if (!parsed.hostname) throw new Error('No valid hostname')
} catch (err) {
console.error(chalk.bold('URL for Elasticsearch not a valid URL', err))
console.error(chalk.bold('URL for Elasticsearch not a valid URL'), err)
return
}
const { verbose, language, notLanguage } = opts
// The notLanguage is useful you want to, for example, index all languages
// The notLanguage is useful if you want to, for example, index all languages
// *except* English.
if (language && notLanguage) {
throw new Error("Can't combine --language and --not-language")
@ -116,29 +131,32 @@ async function main(opts, args) {
const indexName = `${prefix}github-docs-${versionKey}-${languageKey}`
console.log(chalk.yellow(`Analyzing in ${chalk.bold(indexName)}`))
await analyzeVersion(client, texts, indexName, verbose)
await analyzeVersion(client, texts, indexName)
}
function safeUrlDisplay(url) {
function safeUrlDisplay(url: string): string {
const parsed = new URL(url)
if (parsed.password) {
parsed.password = '***'
}
if (parsed.username) {
parsed.username = parsed.username.slice(0, 4) + '***'
parsed.username = `${parsed.username.slice(0, 4)}***`
}
return parsed.toString()
}
async function analyzeVersion(client, texts, indexName, verbose = false) {
async function analyzeVersion(client: Client, texts: string[], indexName: string): Promise<void> {
for (const text of texts) {
console.log(`RAW TEXT: 〝${chalk.italic(text)}〞`)
for (const analyzer of ['text_analyzer_explicit', 'text_analyzer', 'standard']) {
console.log('ANALYZER:', chalk.bold(analyzer))
const { tokens } = await client.indices.analyze({
const response = await client.indices.analyze({
index: indexName,
body: { analyzer, text },
})
const tokenWords = tokens.map((token) => token.token)
const tokens: IndicesAnalyzeAnalyzeToken[] | undefined = response.tokens
const tokenWords: string[] = tokens?.map((token) => token.token) || []
console.log(tokenWords)
}
}

View file

@ -1,575 +0,0 @@
#!/usr/bin/env node
// [start-readme]
//
// Creates Elasticsearch index, populates from records,
// moves the index alias, deletes old indexes.
//
// [end-readme]
import fs from 'fs/promises'
import path from 'path'
import { Client, errors } from '@elastic/elasticsearch'
import { program, Option, InvalidArgumentError } from 'commander'
import chalk from 'chalk'
import dotenv from 'dotenv'
import { retryOnErrorTest } from './retry-on-error-test.js'
import { languageKeys } from '#src/languages/lib/languages.js'
import { allVersions } from '#src/versions/lib/all-versions.js'
// Now you can optionally have set the ELASTICSEARCH_URL in your .env file.
dotenv.config()
// Create an object that maps the "short name" of a version to
// all information about it. E.g.
//
// {
// 'ghes-3.5': {
// hasNumberedReleases: true,
// currentRelease: '3.5',
// version: 'enterprise-server@3.5',
// miscBaseName: 'ghes-'
// ...
// },
// ...
//
// We need this later to be able to map CLI arguments to what the
// records are called when found on disk.
const shortNames = Object.fromEntries(
Object.values(allVersions).map((info) => {
const shortName = info.hasNumberedReleases
? info.miscBaseName + info.currentRelease
: info.miscBaseName
return [shortName, info]
}),
)
const allVersionKeys = Object.keys(shortNames)
const DEFAULT_SLEEPTIME_SECONDS = 30
program
.description('Creates Elasticsearch index from records')
.option('-v, --verbose', 'Verbose outputs')
.addOption(new Option('-V, --version [VERSION...]', 'Specific versions').choices(allVersionKeys))
.addOption(
new Option('-l, --language <LANGUAGE...>', 'Which languages to focus on').choices(languageKeys),
)
.addOption(
new Option('--not-language <LANGUAGE...>', 'Specific language to omit').choices(languageKeys),
)
.option('-u, --elasticsearch-url <url>', 'If different from $ELASTICSEARCH_URL')
.option('-p, --index-prefix <prefix>', 'Index string to put before index name')
.option(
'-s, --stagger-seconds <seconds>',
'Number of seconds to sleep between each bulk operation',
(value) => {
const parsed = parseInt(value, 10)
if (isNaN(parsed)) {
throw new InvalidArgumentError('Not a number.')
}
return parsed
},
)
.option(
'-r, --retries <count>',
'Number of retry attempts on recoverable network errors',
(value) => {
const parsed = parseInt(value, 10)
if (isNaN(parsed)) {
throw new InvalidArgumentError('Not a number.')
}
return parsed
},
)
.option(
'--sleep-time <seconds>',
`Number of seconds to sleep between each retry attempt (defaults to ${DEFAULT_SLEEPTIME_SECONDS})`,
(value) => {
const parsed = parseInt(value, 10)
if (isNaN(parsed)) {
throw new InvalidArgumentError('Not a number.')
}
return parsed
},
)
.argument('<source-directory>', 'where the indexable files are')
.parse(process.argv)
main(program.opts(), program.args)
async function main(opts, args) {
if (!args.length) {
throw new Error('Must pass the source as the first argument')
}
const { verbose, language, notLanguage, elasticsearchUrl } = opts
if (!elasticsearchUrl && !process.env.ELASTICSEARCH_URL) {
throw new Error(
'Must pass the elasticsearch URL option or ' +
'set the environment variable ELASTICSEARCH_URL',
)
}
let node = elasticsearchUrl || process.env.ELASTICSEARCH_URL
// Allow the user to lazily set it to `localhost:9200` for example.
if (!node.startsWith('http') && !node.startsWith('://') && node.split(':').length === 2) {
node = `http://${node}`
}
try {
const parsed = new URL(node)
if (!parsed.hostname) throw new Error('no valid hostname')
} catch (err) {
console.error(chalk.bold('URL for Elasticsearch not a valid URL', err))
throw err
}
// The notLanguage is useful if you want to, for example, index all languages
// *except* English.
if (language && notLanguage) {
throw new Error("Can't combine --language and --not-language")
}
if (verbose) {
console.log(`Connecting to ${chalk.bold(safeUrlDisplay(node))}`)
}
const sourceDirectory = args[0]
try {
await fs.stat(sourceDirectory)
} catch (error) {
if (error.code === 'ENOENT') {
throw new Error(`The specified directory '${sourceDirectory}' does not exist.`)
}
throw error
}
try {
await indexAll(node, sourceDirectory, opts)
} catch (error) {
// If any error is thrown from within the SDK, that error object will
// contain a `Connection` object which, when printed, can reveal the
// username/password or the base64 Basic auth credentials.
// So we want to carefully re-throw it so it only contains the minimal
// information for debugging without exposing the Connection credentials
// in Actions logs.
if (error instanceof errors.ElasticsearchClientError) {
// All ElasticsearchClientError error subclasses have a `name` and
// `message` but only some have a `meta`.
if (error.meta) {
console.error('Error meta: %O', error.meta)
}
throw new Error(error.message)
}
// If any other error happens that isn't from the elasticsearch SDK,
// let it bubble up.
throw error
}
}
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms))
async function indexAll(node, sourceDirectory, opts) {
const client = new Client({ node })
const { language, verbose, notLanguage, indexPrefix, staggerSeconds } = opts
let version
if ('version' in opts) {
version = opts.version
if (process.env.VERSION) {
console.warn(
`'version' specified as argument ('${version}') AND environment variable ('${process.env.VERSION}')`,
)
}
} else {
if (process.env.VERSION && process.env.VERSION !== 'all') {
version = process.env.VERSION
if (!allVersionKeys.includes(version)) {
throw new Error(
`Environment variable 'VERSION' (${version}) is not recognized. Must be one of ${allVersionKeys}`,
)
}
}
}
let versionKeys = allVersionKeys
// If it came from the `--version` argument parsing, it might be a string
// or an array of strings because it uses `--version [VERSION...]`.
if (version) {
if (Array.isArray(version)) {
versionKeys = version
} else {
versionKeys = [version]
}
}
// This will throw if it can't ping
await client.ping()
const languages =
language || languageKeys.filter((lang) => !notLanguage || !notLanguage.includes(lang))
if (verbose) {
console.log(`Indexing on languages ${chalk.bold(languages.join(', '))}`)
}
const prefix = indexPrefix ? `${indexPrefix}_` : ''
for (const language of languages) {
let count = 0
for (const versionKey of versionKeys) {
console.log(chalk.yellow(`Indexing ${chalk.bold(versionKey)} in ${chalk.bold(language)}`))
const indexName = `${prefix}github-docs-${versionKey}-${language}`
const t0 = new Date()
await indexVersion(client, indexName, versionKey, language, sourceDirectory, opts)
const t1 = new Date()
console.log(chalk.green(`Finished indexing ${indexName}. Took ${formatTime(t1 - t0)}`))
if (verbose) {
console.log(`To view index: ${safeUrlDisplay(node + `/${indexName}`)}`)
console.log(`To search index: ${safeUrlDisplay(node + `/${indexName}/_search`)}`)
}
count++
// console.log({ count, versionKeysLength: versionKeys.length })
if (staggerSeconds && count < versionKeys.length - 1) {
console.log(`Sleeping for ${staggerSeconds} seconds...`)
await sleep(1000 * staggerSeconds)
}
// A bit of visual separation between each version
console.log('')
}
}
}
function safeUrlDisplay(url) {
const parsed = new URL(url)
if (parsed.password) {
parsed.password = '***'
}
if (parsed.username) {
parsed.username = parsed.username.slice(0, 4) + '***'
}
return parsed.toString()
}
// Return '20220719012012' if the current date is
// 2022-07-19T01:20:12.172Z. Note how the 6th month (July) becomes
// '07'. All numbers become 2 character zero-padding strings individually.
function utcTimestamp() {
const d = new Date()
return (
[
`${d.getUTCFullYear()}`,
d.getUTCMonth() + 1,
d.getUTCDate(),
d.getUTCHours(),
d.getUTCMinutes(),
d.getUTCSeconds(),
]
// If it's a number make it a zero-padding 2 character string
.map((x) => (typeof x === 'number' ? ('0' + x).slice(-2) : x))
.join('')
)
}
// Consider moving this to lib
async function indexVersion(client, indexName, version, language, sourceDirectory, opts) {
const { verbose } = opts
// Note: it's a bit "weird" that numbered-release versions are
// named by their number, but that's the convention the previous
// search backend used
const indexVersionName = shortNames[version].hasNumberedReleases
? shortNames[version].currentRelease
: shortNames[version].miscBaseName
const recordsName = `github-docs-${indexVersionName}-${language}`
const records = await loadRecords(recordsName, sourceDirectory)
const thisAlias = `${indexName}__${utcTimestamp()}`
// CREATE INDEX
const settings = {
analysis: {
char_filter: {
// This will turn `runs-on` into `runs_on` so that it can't be
// tokenized to `runs` because `on` is a stop word.
// It also means that prose terms, in English, like `opt-in`
// will not be matched if someone searches for `opt in`. But this
// is why we have multiple different analyzers. So it becomes
// `opt_in` in the `text_analyzer_explicit` analyzer, but is
// left as `opt` in the `text_analyzer` analyzer.
hyphenation_filter: {
type: 'mapping',
mappings: ['- => _'],
},
},
analyzer: {
// We define two analyzers, both based on a "common core" with the
// `standard` tokenizer. But the second one adds a Snowball filter.
// That means the tokenization of "Dependency naming" becomes
// `[dependency, naming]` in the explicit one and `[depend, name]`
// in the Snowball one.
// We do this to give a chance to boost the more exact spelling a
// bit higher with the assumption that if the user knew exactly
// what it was called, we should show that higher.
// A great use-case of this is when users search for keywords that are
// code words like `dependency-name`.
text_analyzer_explicit: {
char_filter: ['hyphenation_filter'],
filter: ['lowercase', 'stop', 'asciifolding'],
tokenizer: 'standard',
type: 'custom',
},
text_analyzer: {
filter: ['lowercase', 'stop', 'asciifolding'],
tokenizer: 'standard',
type: 'custom',
},
},
filter: {
// Will later, conditionally, put the snowball configuration here.
},
},
}
const snowballLanguage = getSnowballLanguage(language)
if (snowballLanguage) {
settings.analysis.analyzer.text_analyzer.filter.push('languaged_snowball')
settings.analysis.filter.languaged_snowball = {
type: 'snowball',
language: snowballLanguage,
}
} else {
if (verbose) {
console.warn(`No snowball language for '${language}'`)
}
}
await client.indices.create({
index: thisAlias,
mappings: {
properties: {
url: { type: 'keyword' },
title: {
type: 'text',
analyzer: 'text_analyzer',
norms: false,
// This is used for fast highlighting. Uses more space but makes
// the searches faster.
term_vector: 'with_positions_offsets',
},
title_explicit: { type: 'text', analyzer: 'text_analyzer_explicit', norms: false },
content: {
type: 'text',
analyzer: 'text_analyzer',
// This is used for fast highlighting. Uses more space but makes
// the searches faster.
term_vector: 'with_positions_offsets',
},
content_explicit: {
type: 'text',
analyzer: 'text_analyzer_explicit',
// This is used for fast highlighting. Uses more space but makes
// the searches faster.
term_vector: 'with_positions_offsets',
},
headings: { type: 'text', analyzer: 'text_analyzer', norms: false },
headings_explicit: { type: 'text', analyzer: 'text_analyzer_explicit', norms: false },
breadcrumbs: { type: 'text' },
popularity: { type: 'float' },
intro: { type: 'text' },
// Use 'keyword' because it's faster to index and (more importantly)
// faster to search on. It would be different if it was something
// users could type in into a text input.
toplevel: { type: 'keyword' },
},
},
settings,
})
// POPULATE
const allRecords = Object.values(records).sort((a, b) => b.popularity - a.popularity)
const operations = allRecords.flatMap((doc) => {
const { title, objectID, content, breadcrumbs, headings, intro, toplevel } = doc
const contentEscaped = escapeHTML(content)
const headingsEscaped = escapeHTML(headings)
const record = {
url: objectID,
title,
title_explicit: title,
content: contentEscaped,
content_explicit: contentEscaped,
breadcrumbs,
headings: headingsEscaped,
headings_explicit: headingsEscaped,
// This makes sure the popularities are always greater than 1.
// Generally the 'popularity' is a ratio where the most popular
// one of all is 1.0.
// By making it >=1.0 when we multiply a relevance score,
// you never get a product of 0.0.
popularity: doc.popularity + 1,
intro,
toplevel,
}
return [{ index: { _index: thisAlias } }, record]
})
const bulkOptions = {
// Default is 'false'.
// It means that the index is NOT refreshed as documents are inserted.
// Which makes sense in our case because we do not intend to search on
// this index until after we've pointed the alias to this new index.
refresh: false,
// Default is '1m' but we have no reason *not* to be patient. It's run
// by a bot on a schedule (GitHub Actions).
timeout: '5m',
}
const attempts = opts.retries || 0
const sleepTime = (opts.sleepTime || DEFAULT_SLEEPTIME_SECONDS) * 1000
console.log(`About to bulk index ${allRecords.length.toLocaleString()} records with retry %O`, {
attempts,
sleepTime,
})
const t0 = new Date()
const bulkResponse = await retryOnErrorTest(
(error) => {
// Rate limiting can happen when you're indexing too much at
// same time.
return error instanceof errors.ResponseError && error.meta.statusCode === 429
},
() => client.bulk({ operations, ...bulkOptions }),
{
attempts,
sleepTime,
onError: (_, attempts, sleepTime) => {
console.warn(
chalk.yellow(
`Failed to bulk index ${indexName}. Will attempt ${attempts} more times (after ${
sleepTime / 1000
}s sleep).`,
),
)
},
},
)
if (bulkResponse.errors) {
// Some day, when we're more confident how and why this might happen
// we can rewrite this code to "massage" the errors better.
// For now, if it fails, it's "OK". It means we won't be proceeding,
// an error is thrown in Actions and we don't have to worry about
// an incomplete index.
console.error(`Bulk response errors: ${bulkResponse.errors}`)
throw new Error('Bulk errors happened.')
}
const t1 = new Date()
console.log(`Bulk indexed ${thisAlias}. Took ${formatTime(t1 - t0)}`)
// The counting of documents in the index is async and can take a while
// to reflect. So send count requests until we get the right number.
let documentsInIndex = 0
let countAttempts = 3
while (documentsInIndex < allRecords.length) {
const { count } = await client.count({ index: thisAlias })
documentsInIndex = count
if (documentsInIndex >= allRecords.length) break
countAttempts--
if (!countAttempts) {
console.log(`After ${countAttempts} attempts still haven't matched the expected number.`)
break
}
await sleep(1000)
}
console.log(
`Documents now in ${chalk.bold(thisAlias)}: ${chalk.bold(documentsInIndex.toLocaleString())}`,
)
// To perform an atomic operation that creates the new alias and removes
// the old indexes, we can use the updateAliases API with a body that
// includes an "actions" array. The array includes the added alias
// and the removed indexes. If any of the actions fail, none of the operations
// are performed.
// https://www.elastic.co/guide/en/elasticsearch/reference/master/indices-aliases.html
const aliasUpdates = [
{
add: {
index: thisAlias,
alias: indexName,
},
},
]
console.log(`Alias ${indexName} -> ${thisAlias}`)
console.log('About to get indices with retry %O', { attempts, sleepTime })
const indices = await retryOnErrorTest(
(error) => {
// 404 can happen when you're trying to get an index that
// doesn't exist. ...yet!
return error instanceof errors.ResponseError && error.meta.statusCode === 404
},
() => client.cat.indices({ format: 'json' }),
{
attempts,
sleepTime,
onError: (error, attempts, sleepTime) => {
console.warn(
chalk.yellow(
`Failed to get index ${indexName} (${
error.message || error.toString()
}). Will attempt ${attempts} more times (after ${formatTime(sleepTime)}s sleep).`,
),
)
},
},
)
for (const index of indices) {
if (index.index !== thisAlias && index.index.startsWith(indexName)) {
aliasUpdates.push({ remove_index: { index: index.index } })
console.log('Deleting index', index.index)
}
}
if (verbose) console.log('Updating alias actions:', aliasUpdates)
await client.indices.updateAliases({ body: { actions: aliasUpdates } })
}
function escapeHTML(content) {
return content.replace(/</g, '&lt;').replace(/>/g, '&gt;').replace(/"/g, '&quot;')
}
async function loadRecords(indexName, sourceDirectory) {
const filePath = path.join(sourceDirectory, `${indexName}-records.json`)
const payload = await fs.readFile(filePath)
return JSON.parse(payload)
}
function getSnowballLanguage(language) {
// Based on https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-snowball-tokenfilter.html
// Note, not all languages are supported. So this function might return
// undefined. That implies that you can't use snowballing.
return {
en: 'English',
fr: 'French',
es: 'Spanish',
ru: 'Russian',
it: 'Italian',
de: 'German',
pt: 'Portuguese',
}[language]
}
function formatTime(ms) {
if (ms < 1000) {
return `${ms.toFixed(1)}ms`
}
const seconds = ms / 1000
if (seconds > 60) {
return `${Math.round(seconds / 60)}m${Math.round(seconds % 60)}s`
}
return `${seconds.toFixed(1)}s`
}


@ -6,7 +6,10 @@
set -e
# For general site-search
npm run index-elasticsearch -- -l en -l ja -V ghec -V dotcom --index-prefix tests -- src/search/tests/fixtures/search-indexes
npm run index-general-search -- src/search/tests/fixtures/search-indexes -l en -l ja -V ghec -V fpt --index-prefix tests
# For autocomplete search
npm run index -- autocomplete src/search/tests/fixtures/data -l en -l ja -v fpt -v ghec --index-prefix tests
# For general autocomplete search
npm run index-general-autocomplete -- src/search/tests/fixtures/data -l en -l ja -v fpt -v ghec --index-prefix tests
# For AI search autocomplete
npm run index-ai-search-autocomplete -- src/search/tests/fixtures/data -l en -v fpt -v ghec --index-prefix tests


@ -0,0 +1,24 @@
# Elasticsearch Indexing
Elasticsearch uses indexes to store the data that is used to determine search results.
We use the scripts in this directory to index our Elasticsearch instances.
In production, the indexing happens in the GitHub workflows `index-autocomplete-search.yml` and `index-general-search.yml`.
## CLI Script
Before running the indexing for **general search**, first run the [scrape](../scrape/README.md) script to scrape page data into files.
Before running the indexing for **general autocomplete** and **AI search autocomplete**, you need to clone [docs-internal-data](https://github.com/github/docs-internal-data) to the root of this directory.
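For example, a typical local setup (assuming you have access to the private repo and want the clone at the repository root) might be:

```bash
git clone https://github.com/github/docs-internal-data.git docs-internal-data
```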
There is a separate run command for indexing each type of search data:
1. **general search**: `npm run index-general-search -- <scrape-directory>`
2. **general autocomplete**: `npm run index-general-autocomplete -- docs-internal-data` (if `docs-internal-data` is cloned to the root directory)
3. **AI search autocomplete**: `npm run index-ai-search-autocomplete -- docs-internal-data` (if `docs-internal-data` is cloned to the root directory)
To see the arguments accepted by any script, pass the `--help` argument, for example
```bash
npm run index-general-autocomplete -- --help
```
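As a concrete (hypothetical) example, a local run that indexes only English `fpt` autocomplete data into test-prefixed indexes, assuming `docs-internal-data` is cloned to the repo root and `ELASTICSEARCH_URL` points at a running instance:

```bash
npm run index-general-autocomplete -- docs-internal-data -l en -v fpt --index-prefix tests
```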


@ -1,167 +0,0 @@
import fs from 'node:fs'
import path from 'node:path'
import { Client, estypes } from '@elastic/elasticsearch'
import { getClient } from './lib/get-client'
import { utcTimestamp } from './lib/utils'
import { populate } from './lib/populate'
import { type Version, Records } from './types'
export const shortVersionNames = {
'enterprise-server': 'ghes',
'enterprise-cloud': 'ghec',
'free-pro-team': 'fpt',
} as const
const DEFAULT_SLEEPTIME_SECONDS = 30
type Options = {
dataRepoRoot: string
languages: string[]
versions: Version[]
retries?: number
sleepTime?: number
verbose?: boolean
indexPrefix?: string
}
export async function indexAutocomplete(options: Options) {
// The data repo has a predictable structure of
// `hydro/rollups/user-searches/$language/$version/rollup.json`
// But note that the "version" might be a prefix, like enterprise-server.
// const { verbose } = options
const client = getClient()
const { dataRepoRoot, versions, languages } = options
for (const language of languages) {
for (const version of versions) {
const records = loadRecords({ version, language, dataRepoRoot })
const { alias, name } = await createIndex(
client,
language,
version,
options.indexPrefix || '',
)
await populate(client, records, {
alias,
name,
retries: options.retries || 0,
sleepTime: options.sleepTime || DEFAULT_SLEEPTIME_SECONDS,
})
}
}
}
type LoadOptions = {
dataRepoRoot: string
language: string
version: string
}
function loadRecords(options: LoadOptions): Records {
// First load the rollup records for user-searches
const filePath = path.join(
options.dataRepoRoot,
'hydro/rollups/user-searches',
options.language,
options.version,
'rollup.json',
)
const terms: Records = {}
const userSearchTerms: Records = JSON.parse(fs.readFileSync(filePath, 'utf8'))
let highestValue = Math.max(...Object.values(userSearchTerms))
if (highestValue === 0) {
throw new Error(`No records found for ${options.language} ${options.version}`)
}
for (const [term, value] of Object.entries(userSearchTerms)) {
// Why +1?
// Because we want these user-searches to always be higher than all the
// terms generated from titles.
// For example, a common user-search term that users use
// is "log forwarding". But there might not be a deconstructed term,
// from the document titles; however, there might be one called
// "log proxy". So when our users search for "log" we want to suggest,
// in the autocomplete UI "log forwarding" before "log proxy".
terms[term] = value / highestValue + 1
}
const documentTermsFilePath = path.join(
options.dataRepoRoot,
'all-documents/terms',
options.language,
options.version,
'terms.json',
)
const documentTerms: Records = JSON.parse(fs.readFileSync(documentTermsFilePath, 'utf8'))
highestValue = Math.max(...Object.values(documentTerms))
if (highestValue === 0) {
throw new Error(`No document title records found for ${options.language} ${options.version}`)
}
for (const [term, value] of Object.entries(documentTerms)) {
if (!(term in terms)) {
terms[term] = value / highestValue + 1
}
}
return terms
}
type IndexInfo = {
alias: string
name: string
}
async function createIndex(
client: Client,
language: string,
version: Version,
indexPrefix: string,
): Promise<IndexInfo> {
const settings: estypes.IndicesIndexSettings = {
analysis: {
analyzer: {
text_analyzer: {
filter: ['lowercase'],
tokenizer: 'standard',
type: 'custom',
},
},
},
// filter: {
// // Will later, conditionally, put the snowball configuration here.
// },
// XXX SNOWBALL?
}
if (indexPrefix && !indexPrefix.endsWith('_')) {
indexPrefix += '_'
}
const indexName = `${indexPrefix}github-autocomplete-${language}-${shortVersionNames[version] || version}`
const thisAlias = `${indexName}__${utcTimestamp()}`
const mappings: estypes.MappingTypeMapping = {
properties: {
term: {
type: 'text',
analyzer: 'text_analyzer',
// This is used for fast highlighting. Uses more space but makes
// the searches faster.
term_vector: 'with_positions_offsets',
},
popularity: { type: 'float' },
},
}
await client.indices.create({
index: thisAlias,
mappings,
settings,
})
return { alias: thisAlias, name: indexName }
}


@ -0,0 +1,158 @@
import { program, Option, Command, InvalidArgumentError } from 'commander'
import { errors } from '@elastic/elasticsearch'
import dotenv from 'dotenv'
import { languageKeys } from '@/languages/lib/languages.js'
import { indexGeneralAutocomplete } from './lib/index-general-autocomplete'
import { indexGeneralSearch } from './lib/index-general-search'
import {
allIndexVersionKeys,
allIndexVersionOptions,
supportedAutocompletePlanVersions,
} from '@/search/lib/elasticsearch-versions'
import { indexAISearchAutocomplete } from './lib/index-ai-search-autocomplete'
// Optionally, ELASTICSEARCH_URL can be set in your .env file.
dotenv.config()
program.name('index').description('CLI scripts for indexing Docs data into Elasticsearch')
const generalAutoCompleteCommand = new Command('general-autocomplete')
.description('Index for general search autocomplete')
.addOption(
new Option('-l, --language <language...>', 'Specific language(s)').choices(languageKeys),
)
.addOption(
new Option('-v, --version <version...>', 'Specific versions').choices(allIndexVersionKeys),
)
.option('--verbose', 'Verbose output')
.option('--index-prefix <prefix>', 'Prefix for the index names', '')
.argument('<data-root>', 'path to the docs-internal-data repo')
.action(async (dataRepoRoot: string, options) => {
const languages = options.language ? options.language : languageKeys
const indexPrefix = options.indexPrefix || ''
try {
await indexGeneralAutocomplete({
dataRepoRoot,
languages,
versions: options.version || supportedAutocompletePlanVersions,
indexPrefix,
})
} catch (error: any) {
if (error instanceof errors.ElasticsearchClientError) {
if ((error as any)?.meta) {
console.error('Error meta: %O', (error as any).meta)
}
}
console.error('general-autocomplete indexing error:', error.message)
process.exit(1)
}
})
const generalSearchCommand = new Command('general-search')
.description(
'Indexes records for general search. Records should be pre-scraped by the scrape script.',
)
.option('-v, --verbose', 'Verbose outputs')
.addOption(
new Option('-V, --version [VERSION...]', 'Specific versions').choices(allIndexVersionOptions),
)
.addOption(
new Option('-l, --language <LANGUAGE...>', 'Which languages to focus on').choices(languageKeys),
)
.addOption(
new Option('--not-language <LANGUAGE...>', 'Specific language to omit').choices(languageKeys),
)
.option('-u, --elasticsearch-url <url>', 'If different from $ELASTICSEARCH_URL')
.option('-p, --index-prefix <prefix>', 'Index string to put before index name')
.option(
'-s, --stagger-seconds <seconds>',
'Number of seconds to sleep between each bulk operation',
(value) => {
const parsed = parseInt(value, 10)
if (isNaN(parsed)) {
throw new InvalidArgumentError('Not a number.')
}
return parsed
},
)
.option(
'-r, --retries <count>',
'Number of retry attempts on recoverable network errors',
(value) => {
const parsed = parseInt(value, 10)
if (isNaN(parsed)) {
throw new InvalidArgumentError('Not a number.')
}
return parsed
},
)
.option(
'--sleep-time <seconds>',
`Number of seconds to sleep between each retry attempt (defaults to 30)`,
(value) => {
const parsed = parseInt(value, 10)
if (isNaN(parsed)) {
throw new InvalidArgumentError('Not a number.')
}
return parsed
},
30,
)
.argument('<source-directory>', 'where the indexable files are')
.action(async (sourceDirectory, options) => {
try {
await indexGeneralSearch(sourceDirectory, options)
} catch (error: any) {
if (error instanceof errors.ElasticsearchClientError) {
if ((error as any)?.meta) {
console.error('Error meta: %O', (error as any).meta)
}
}
console.error('general-search indexing error:', error.message)
process.exit(1)
}
})
const aiSearchAutocompleteCommand = new Command('ai-search-autocomplete')
.description('Index for AI search autocomplete')
.addOption(
new Option(
'-l, --language <language...>',
'Specific language(s). (NOTE: Only English, "en", is currently supported)',
).choices(['en']),
)
.addOption(
new Option('-v, --version <version...>', 'Specific versions').choices(allIndexVersionKeys),
)
.option('--verbose', 'Verbose output')
.option('--index-prefix <prefix>', 'Prefix for the index names', '')
.argument('<data-root>', 'path to the docs-internal-data repo')
.action(async (dataRepoRoot: string, options) => {
// In the future, we may want to support multiple languages
// Currently (since this is an experiment), we only support English
const languages = ['en']
const indexPrefix = options.indexPrefix || ''
try {
await indexAISearchAutocomplete({
dataRepoRoot,
languages,
versions: options.version || supportedAutocompletePlanVersions,
indexPrefix,
})
} catch (error: any) {
if (error instanceof errors.ElasticsearchClientError) {
if ((error as any)?.meta) {
console.error('Error meta: %O', (error as any).meta)
}
}
console.error('ai-search-autocomplete indexing error:', error.message)
process.exit(1)
}
})
program.addCommand(generalAutoCompleteCommand)
program.addCommand(generalSearchCommand)
program.addCommand(aiSearchAutocompleteCommand)
program.parse(process.argv)


@ -1,44 +0,0 @@
import { program, Option } from 'commander'
import { languageKeys } from '@/languages/lib/languages.js'
import { indexAutocomplete } from './index-autocomplete'
import { type Version } from './types'
const defaultVersions: Version[] = ['free-pro-team', 'enterprise-server', 'enterprise-cloud']
const shortAlias = new Map<string, Version>()
shortAlias.set('ghes', 'enterprise-server')
shortAlias.set('fpt', 'free-pro-team')
shortAlias.set('ghec', 'enterprise-cloud')
program.name('index').description('CLI scripts for indexing to Elasticsearch')
program
.command('autocomplete')
.description('Index for autocomplete')
.addOption(
new Option('-l, --language <language...>', 'Specific language(s)').choices(languageKeys),
)
.addOption(
new Option('-v, --version <version...>', 'Specific version prefix(es)').choices([
...defaultVersions,
...shortAlias.keys(),
]),
)
.option('--verbose', 'Verbose output')
.option('--index-prefix <prefix>', 'Prefix for the index names', '')
.argument('<data-root>', 'path to the docs-internal-data repo')
.action((root: string, options) => {
const languages = options.language ? options.language : languageKeys
const versions: Version[] = []
for (const v of options.version || defaultVersions) {
if (shortAlias.has(v)) {
versions.push(shortAlias.get(v)!)
} else {
versions.push(v)
}
}
const indexPrefix = options.indexPrefix || ''
return indexAutocomplete({ dataRepoRoot: root, languages, versions, indexPrefix })
})
program.parse(process.argv)


@ -1,27 +0,0 @@
import { Client } from '@elastic/elasticsearch'
export function getClient(): Client {
const node = getElasticsearchURL()
const client = new Client({ node })
return client
}
function getElasticsearchURL() {
if (!process.env.ELASTICSEARCH_URL) {
throw new Error(
'Must pass the elasticsearch URL option or ' +
'set the environment variable ELASTICSEARCH_URL',
)
}
let node = process.env.ELASTICSEARCH_URL
// Allow the user to lazily set it to `localhost:9200` for example.
if (!node.startsWith('http') && !node.startsWith('://') && node.split(':').length === 2) {
node = `http://${node}`
}
const parsed = new URL(node)
if (!parsed.hostname) throw new Error('no valid hostname')
return node
}


@ -0,0 +1,112 @@
import fs from 'node:fs'
import path from 'node:path'
import { getElasticsearchClient } from '@/search/lib/helpers/get-client'
import { getElasticSearchIndex } from '@/search/lib/elasticsearch-indexes'
import {
createIndex,
populateIndex,
printSuccess,
updateAlias,
} from '@/search/scripts/index/utils/indexing-elasticsearch-utils'
import { getAISearchAutocompleteSettings } from '@/search/scripts/index/utils/settings'
import { aiSearchAutocompleteMappings } from '@/search/scripts/index/utils/mappings'
import { getPlanVersionFromIndexVersion } from '@/search/lib/elasticsearch-versions'
import type { TermsWithFrequency } from '@/search/scripts/index/types'
type Options = {
dataRepoRoot: string
languages: string[]
versions: string[]
retries?: number
sleepTime?: number
verbose?: boolean
indexPrefix?: string
}
export async function indexAISearchAutocomplete(options: Options) {
const client = getElasticsearchClient(undefined, options.verbose)
await client.ping() // Will throw if not available
const { dataRepoRoot, languages, versions } = options
for (const language of languages) {
for (const version of versions) {
const startTime = new Date()
const records = loadQueriesWithPriority({ dataRepoRoot, language, version })
const { indexName, indexAlias } = getElasticSearchIndex(
'aiSearchAutocomplete',
version,
language,
options.indexPrefix || '',
)
const settings = getAISearchAutocompleteSettings(language, options.verbose)
await createIndex(client, indexAlias, settings, aiSearchAutocompleteMappings)
const recordsArray = Object.entries(records).map(([term, popularity]) => ({
term,
popularity,
}))
await populateIndex(client, indexAlias, indexName, recordsArray, {
retries: options.retries,
sleepTime: options.sleepTime,
verbose: options.verbose,
})
await updateAlias(client, indexName, indexAlias, options)
printSuccess(indexName, startTime, options.verbose)
}
}
}
type LoadOptions = {
dataRepoRoot: string
language: string
version: string
}
function loadQueriesWithPriority(options: LoadOptions): TermsWithFrequency {
// The {version} in the paths uses the version's 'plan' name, e.g. `free-pro-team` instead of `fpt`
const internalDataVersion = getPlanVersionFromIndexVersion(options.version)
if (!internalDataVersion) {
throw new Error(`No rollup version found for version ${options.version}`)
}
const queriesFilePath = path.join(
options.dataRepoRoot,
'ai/search/queries',
options.language,
internalDataVersion,
'queries.json',
)
const queriesFile = JSON.parse(fs.readFileSync(queriesFilePath, 'utf8'))
const { topQueries, allQueries } = queriesFile
const terms: TermsWithFrequency = {}
let popularity = topQueries.length + allQueries.length
// Assign higher popularity to topQueries
for (const term of topQueries) {
terms[term] = popularity
popularity -= 1
}
// Assign remaining popularity to allQueries using the order they have in the JSON
for (const term of allQueries) {
// Don't read in the topQueries again (duplicates)
if (!(term in terms)) {
terms[term] = popularity
popularity -= 1
}
}
return terms
}


@ -0,0 +1,134 @@
import fs from 'node:fs'
import path from 'node:path'
import { getElasticsearchClient } from '@/search/lib/helpers/get-client'
import { getElasticSearchIndex } from '@/search/lib/elasticsearch-indexes'
import {
createIndex,
populateIndex,
printSuccess,
updateAlias,
} from '@/search/scripts/index/utils/indexing-elasticsearch-utils'
import { getGeneralAutocompleteSettings } from '@/search/scripts/index/utils/settings'
import { generalAutocompleteMappings } from '@/search/scripts/index/utils/mappings'
import { getPlanVersionFromIndexVersion } from '@/search/lib/elasticsearch-versions'
import type { TermsWithFrequency } from '@/search/scripts/index/types'
type Options = {
dataRepoRoot: string
languages: string[]
versions: string[]
retries?: number
sleepTime?: number
verbose?: boolean
indexPrefix?: string
}
export async function indexGeneralAutocomplete(options: Options) {
const client = getElasticsearchClient(undefined, options.verbose)
await client.ping() // Will throw if not available
const { dataRepoRoot, versions, languages } = options
for (const language of languages) {
for (const version of versions) {
const startTime = new Date()
const records = loadTermsWithFrequency({ version, language, dataRepoRoot })
const { indexName, indexAlias } = getElasticSearchIndex(
'generalAutocomplete',
version,
language,
options.indexPrefix || '',
)
const settings = getGeneralAutocompleteSettings(language, options.verbose)
await createIndex(client, indexAlias, settings, generalAutocompleteMappings)
const recordsArray = Object.entries(records).map(([term, popularity]) => ({
term,
popularity,
}))
await populateIndex(client, indexAlias, indexName, recordsArray, {
retries: options.retries,
sleepTime: options.sleepTime,
verbose: options.verbose,
})
await updateAlias(client, indexName, indexAlias, options)
printSuccess(indexName, startTime, options.verbose)
}
}
}
type LoadOptions = {
dataRepoRoot: string
language: string
version: string
}
/*
* Terms are one-word search terms that a user might enter into a search toolbar.
* We have two sources of "terms":
* - Previous user searches (searchTerms)
* - Terms auto-generated by taking each word from each title of all of our articles (documentTerms)
*
* Each of these files lives in our docs-internal-data repo, which should be cloned before running this script.
* The paths to these files for each type of term are:
* - searchTerms: hydro/rollups/user-searches/{language}/{version}/rollup.json
* - documentTerms: all-documents/terms/{language}/{version}/terms.json
*/
function loadTermsWithFrequency(options: LoadOptions): TermsWithFrequency {
// The {version} in the paths uses the version's 'plan' name, e.g. `free-pro-team` instead of `fpt`
const internalDataVersion = getPlanVersionFromIndexVersion(options.version)
if (!internalDataVersion) {
throw new Error(`No rollup version found for version ${options.version}`)
}
const filePath = path.join(
options.dataRepoRoot,
'hydro/rollups/user-searches',
options.language,
internalDataVersion,
'rollup.json',
)
const terms: TermsWithFrequency = {}
const userSearchTerms: TermsWithFrequency = JSON.parse(fs.readFileSync(filePath, 'utf8'))
let maxFrequency = Math.max(...Object.values(userSearchTerms))
if (maxFrequency === 0) {
throw new Error(`No records found for ${options.language} ${options.version}`)
}
for (const [term, frequency] of Object.entries(userSearchTerms)) {
// Normalize the frequency which will turn into "popularity" in ElasticSearch
// We include +1 here because "userSearchTerms" should have higher priority than "articleTitleTerms"
terms[term] = frequency / maxFrequency + 1
}
const articleTitleTermsFilePath = path.join(
options.dataRepoRoot,
'all-documents/terms',
options.language,
internalDataVersion,
'terms.json',
)
const articleTitleTerms: TermsWithFrequency = JSON.parse(
fs.readFileSync(articleTitleTermsFilePath, 'utf8'),
)
maxFrequency = Math.max(...Object.values(articleTitleTerms))
if (maxFrequency === 0) {
throw new Error(`No document title records found for ${options.language} ${options.version}`)
}
for (const [articleTitleTerm, frequency] of Object.entries(articleTitleTerms)) {
if (!(articleTitleTerm in terms)) {
// Notice that we don't + 1 here because we want to give more priority to data from user searches
terms[articleTitleTerm] = frequency / maxFrequency
}
}
return terms
}


@ -0,0 +1,145 @@
import { Client } from '@elastic/elasticsearch'
import chalk from 'chalk'
import { languageKeys } from '#src/languages/lib/languages.js'
import { allVersions } from '#src/versions/lib/all-versions.js'
import { getElasticSearchIndex } from '@/search/lib/elasticsearch-indexes'
import { getElasticsearchClient } from '@/search/lib/helpers/get-client'
import {
createIndex,
escapeHTML,
loadIndexRecords,
populateIndex,
printSuccess,
updateAlias,
} from '@/search/scripts/index/utils/indexing-elasticsearch-utils'
import { sleep } from '@/search/lib/helpers/time'
import { getGeneralSearchSettings } from '@/search/scripts/index/utils/settings'
import { generalSearchMappings } from '@/search/scripts/index/utils/mappings'
import type { AllVersionInfo } from '@/search/scripts/index/types'
interface Options {
verbose?: boolean
version?: string[] | string
language?: string[]
notLanguage?: string[]
elasticsearchUrl?: string
indexPrefix?: string
staggerSeconds?: number
retries?: number
sleepTime: number
}
const shortNames: { [key: string]: AllVersionInfo } = Object.fromEntries(
Object.values(allVersions).map((info: AllVersionInfo) => {
const shortName = info.hasNumberedReleases
? info.miscBaseName + info.currentRelease
: info.miscBaseName
return [shortName, info]
}),
)
const allVersionKeys = Object.keys(shortNames)
export async function indexGeneralSearch(sourceDirectory: string, opts: Options) {
if (!sourceDirectory) {
throw new Error('Must pass the source directory as the first argument')
}
const { language, notLanguage } = opts
if (language && notLanguage) {
throw new Error("Can't combine --language and --not-language")
}
const client = getElasticsearchClient(opts.elasticsearchUrl, opts.verbose)
await client.ping() // Will throw if not available
let version: string | string[] | undefined = opts.version
if (!version && process.env.VERSION && process.env.VERSION !== 'all') {
version = process.env.VERSION
if (!allVersionKeys.includes(version)) {
throw new Error(
`Environment variable 'VERSION' (${version}) is not recognized. Must be one of ${allVersionKeys}`,
)
}
}
let versionKeys = allVersionKeys
if (version) {
versionKeys = Array.isArray(version) ? version : [version]
}
const languages =
language || languageKeys.filter((lang) => !notLanguage || !notLanguage.includes(lang))
if (opts.verbose) {
console.log(`Indexing on languages ${chalk.bold(languages.join(', '))}`)
}
for (const language of languages) {
let count = 0
for (const versionKey of versionKeys) {
const startTime = new Date()
const { indexName, indexAlias } = getElasticSearchIndex(
'generalSearch',
versionKey,
language,
opts.indexPrefix || '',
)
await indexVersion(client, indexName, indexAlias, language, sourceDirectory, opts)
count++
if (opts.staggerSeconds && count < versionKeys.length - 1) {
console.log(`Sleeping for ${opts.staggerSeconds} seconds...`)
await sleep(1000 * opts.staggerSeconds)
}
printSuccess(indexName, startTime, opts.verbose)
}
}
}
async function indexVersion(
client: Client,
indexName: string,
indexAlias: string,
language: string,
sourceDirectory: string,
opts: Options,
) {
const recordsData = await loadIndexRecords(indexName, sourceDirectory)
const allRecords = Object.values(recordsData).sort((a, b) => b.popularity - a.popularity)
const records = allRecords.map((doc) => {
const { title, objectID, content, breadcrumbs, headings, intro, toplevel } = doc
const contentEscaped = escapeHTML(content)
const headingsEscaped = escapeHTML(headings)
return {
url: objectID,
title,
title_explicit: title,
content: contentEscaped,
content_explicit: contentEscaped,
breadcrumbs,
headings: headingsEscaped,
headings_explicit: headingsEscaped,
popularity: doc.popularity + 1,
intro,
toplevel,
}
})
const settings = getGeneralSearchSettings(language, opts.verbose || false)
const mappings = generalSearchMappings
await createIndex(client, indexAlias, settings, mappings)
await populateIndex(client, indexAlias, indexName, records, {
retries: opts.retries,
sleepTime: opts.sleepTime * 1000,
verbose: opts.verbose,
})
await updateAlias(client, indexName, indexAlias, opts)
}


@ -1,107 +0,0 @@
import chalk from 'chalk'
import { Client, errors } from '@elastic/elasticsearch'
import type { Records, RetryConfig } from '../types'
import { retryOnErrorTest } from './retry-on-error-test'
import { repointAlias } from './repoint-alias'
import { formatTime, sleep } from './utils'
type PopulateOptions = RetryConfig & {
verbose?: boolean
alias: string
name: string
}
export async function populate(client: Client, records: Records, options: PopulateOptions) {
const { alias, name } = options
const allRecords = Object.entries(records).sort((a, b) => b[1] - a[1])
const operations = allRecords.flatMap(([term, count]) => {
const popularity = count / allRecords[0][1] // Normalize to 1.0 for the highest count
return [
{ index: { _index: alias } },
{
term,
popularity,
},
]
})
const bulkOptions = {
// Default is 'false'.
// It means that the index is NOT refreshed as documents are inserted.
// Which makes sense in our case because we do not intend to search on
// this index until after we've pointed the alias to this new index.
refresh: false,
// Default is '1m' but we have no reason *not* to be patient. It's run
// by a bot on a schedule (GitHub Actions).
timeout: '5m',
}
const attempts = options.retries
const sleepTime = options.sleepTime * 1000
console.log(`About to bulk index ${allRecords.length.toLocaleString()} records with retry %O`, {
attempts,
sleepTime,
})
const t0 = new Date()
const bulkResponse = await retryOnErrorTest(
(error: Error) => {
// Rate limiting can happen when you're indexing too much at
// same time.
return error instanceof errors.ResponseError && error.meta.statusCode === 429
},
() => client.bulk({ operations, ...bulkOptions }),
{
attempts,
sleepTime,
onError: (_, attempts, sleepTime) => {
console.warn(
chalk.yellow(
`Failed to bulk index ${name}. Will attempt ${attempts} more times (after ${
sleepTime / 1000
}s sleep).`,
),
)
},
},
)
if (bulkResponse.errors) {
// Some day, when we're more confident how and why this might happen
// we can rewrite this code to "massage" the errors better.
// For now, if it fails, it's "OK". It means we won't be proceeding,
// an error is thrown in Actions and we don't have to worry about
// an incomplete index.
console.error(`Bulk response errors: ${bulkResponse.errors}`)
throw new Error('Bulk errors happened.')
}
const t1 = new Date()
console.log(`Bulk indexed ${alias}. Took ${formatTime(t1.getTime() - t0.getTime())}`)
// The counting of documents in the index is async and can take a while
// to reflect. So send count requests until we get the right number.
let documentsInIndex = 0
let countAttempts = 3
while (documentsInIndex < allRecords.length) {
const { count } = await client.count({ index: alias })
documentsInIndex = count
if (documentsInIndex >= allRecords.length) break
countAttempts--
if (!countAttempts) {
console.log(`After ${countAttempts} attempts still haven't matched the expected number.`)
break
}
await sleep(1000)
}
console.log(
`Documents now in ${chalk.bold(alias)}: ${chalk.bold(documentsInIndex.toLocaleString())}`,
)
await repointAlias(client, alias, name, {
attempts,
sleepTime,
verbose: Boolean(options.verbose),
})
}


@ -1,77 +0,0 @@
import chalk from 'chalk'
import { Client, errors } from '@elastic/elasticsearch'
import { retryOnErrorTest } from './retry-on-error-test'
import { formatTime } from './utils'
export async function repointAlias(
client: Client,
alias: string,
name: string,
options: {
attempts: number
sleepTime: number
verbose: boolean
},
) {
const { attempts, sleepTime, verbose } = options
// To perform an atomic operation that creates the new alias and removes
// the old indexes, we can use the updateAliases API with a body that
// includes an "actions" array. The array includes the added alias
// and the removed indexes. If any of the actions fail, none of the operations
// are performed.
// https://www.elastic.co/guide/en/elasticsearch/reference/master/indices-aliases.html
type Update =
| {
add: {
index: string
alias: string
}
}
| {
remove_index: {
index: string
}
}
const aliasUpdates: Update[] = [
{
add: {
index: alias,
alias: name,
},
},
]
console.log(`Alias ${name} -> ${alias}`)
console.log('About to get indices with retry %O', { attempts, sleepTime })
const indices = await retryOnErrorTest(
(error: any) => {
// 404 can happen when you're trying to get an index that
// doesn't exist. ...yet!
return error instanceof errors.ResponseError && error.meta.statusCode === 404
},
() => client.cat.indices({ format: 'json' }),
{
attempts,
sleepTime,
onError: (error, attempts, sleepTime) => {
console.warn(
chalk.yellow(
`Failed to get index ${name} (${
error.message || error.toString()
}). Will attempt ${attempts} more times (after ${formatTime(sleepTime)}s sleep).`,
),
)
},
},
)
for (const index of indices) {
if (index.index !== alias && index.index.startsWith(name)) {
aliasUpdates.push({ remove_index: { index: index.index } })
console.log('Deleting index', index.index)
}
}
if (verbose) console.log('Updating alias actions:', aliasUpdates)
await client.indices.updateAliases({ body: { actions: aliasUpdates } })
}


@ -1,10 +1,55 @@
export type Version = 'free-pro-team' | 'enterprise-server' | 'enterprise-cloud'
export type Records = {
[key: string]: number
}
export type RetryConfig = {
retries: number
sleepTime: number
}
export interface AllVersionInfo {
hasNumberedReleases: boolean
miscBaseName: string
currentRelease: string
version: string
plan: string
}
export interface AllVersions {
[key: string]: AllVersionInfo
}
export interface Options {
language?: string
notLanguage?: string
version?: string
docsInternalData?: string
markers?: boolean
filter?: string
}
export type Args = string[]
export interface Page {
relativePath: string
redirect_from?: string[]
}
export interface Config {
noMarkers: boolean
filter?: string
docsInternalDataPath?: string
}
export type TermsWithFrequency = { [term: string]: number }
export interface Records {
[objectID: string]: Record // The key is identical to the record's objectID
}
export interface Record {
objectID: string // e.g. "/en/enterprise-cloud@latest/get-started"
breadcrumbs: string // e.g. "Get started / Using GitHub"
title: string // e.g. "Get started with GitHub documentation"
headings: string
content: string
intro: string
toplevel: string
popularity: number
}


@ -0,0 +1,11 @@
export const SNOWBALL_LANGUAGES: { [key: string]: string } = {
en: 'English',
fr: 'French',
es: 'Spanish',
ru: 'Russian',
it: 'Italian',
de: 'German',
pt: 'Portuguese',
}
export const DEFAULT_SLEEPTIME_SECONDS = 30


@ -0,0 +1,178 @@
import chalk from 'chalk'
import { Client, estypes, errors } from '@elastic/elasticsearch'
import fs from 'fs/promises'
import path from 'path'
import { readableTimeMinAndSec, sleep } from '@/search/lib/helpers/time'
import { retryOnErrorTest } from '@/search/scripts/index/utils/retry-on-error-test'
import {
DEFAULT_SLEEPTIME_SECONDS,
SNOWBALL_LANGUAGES,
} from '@/search/scripts/index/utils/constants'
import { safeUrlDisplay } from '@/search/lib/helpers/strings'
import type { Records } from '@/search/scripts/index/types'
type Options = {
retries?: number
sleepTime?: number
verbose?: boolean
}
export async function createIndex(
client: Client,
indexAlias: string,
settings: estypes.IndicesIndexSettings,
mappings: estypes.MappingTypeMapping,
) {
await client.indices.create({
index: indexAlias,
mappings,
settings,
})
}
export async function populateIndex(
client: Client,
indexAlias: string,
indexName: string,
records: any[],
options: Options,
) {
console.log(chalk.yellow(`\nIndexing ${chalk.bold(indexName)}`))
const bulkOperations = records.flatMap((doc) => [{ index: { _index: indexAlias } }, doc])
const bulkOptions = {
refresh: false,
timeout: '5m',
}
const attempts = options.retries || 0
const sleepTime = options.sleepTime || DEFAULT_SLEEPTIME_SECONDS * 1000
console.log(`About to bulk index ${records.length.toLocaleString()} records with retry %O`, {
attempts,
sleepTimeMS: sleepTime,
})
const t0 = new Date()
const bulkResponse = await retryOnErrorTest(
(error) => error instanceof errors.ResponseError && error.meta.statusCode === 429,
() => client.bulk({ operations: bulkOperations, ...bulkOptions }),
{
attempts,
sleepTime,
onError: (_, attempts, sleepTime) => {
console.warn(
chalk.yellow(
`Failed to bulk index ${indexName}. Will attempt ${attempts} more times (after ${
sleepTime / 1000
}s sleep).`,
),
)
},
},
)
if (bulkResponse.errors) {
console.error(`Bulk response errors: ${bulkResponse.errors}`)
throw new Error('Bulk errors happened.')
}
const t1 = new Date()
console.log(
`Bulk indexed ${indexAlias}. Took ${readableTimeMinAndSec(t1.getTime() - t0.getTime())}`,
)
let documentsInIndex = 0
let countAttempts = 3
while (documentsInIndex < records.length) {
const { count } = await client.count({ index: indexAlias })
documentsInIndex = count
if (documentsInIndex >= records.length) break
countAttempts--
if (!countAttempts) {
console.log(`After ${countAttempts} attempts still haven't matched the expected number.`)
break
}
await sleep(1000)
}
console.log(`Documents now in ${indexAlias}: ${documentsInIndex.toLocaleString()}`)
}
export async function updateAlias(
client: Client,
indexName: string,
indexAlias: string,
options: Options,
) {
const aliasUpdates: estypes.IndicesUpdateAliasesAction[] = [
{
add: {
index: indexAlias,
alias: indexName,
},
},
]
const indices = await retryOnErrorTest(
(error) => {
// 404 can happen when you're trying to get an index that
// doesn't exist. ...yet!
return error instanceof errors.ResponseError && error.meta.statusCode === 404
},
() => client.cat.indices({ format: 'json' }),
{
attempts: options.retries || 0,
sleepTime: (options.sleepTime || DEFAULT_SLEEPTIME_SECONDS) * 1000,
onError: (error, attempts, sleepTime) => {
console.warn(
chalk.yellow(
`Failed to get index ${indexName} (${
error.message || error.toString()
}). Will attempt ${attempts} more times (after ${readableTimeMinAndSec(sleepTime)}s sleep).`,
),
)
},
},
)
for (const index of indices) {
if (index.index !== indexAlias && index.index.startsWith(indexName)) {
aliasUpdates.push({ remove_index: { index: index.index } })
console.log('Deleting old index', index.index)
}
}
if (options.verbose) console.log('Updating alias actions:', aliasUpdates)
await client.indices.updateAliases({ body: { actions: aliasUpdates } })
}
export function printSuccess(indexName: string, startTime: Date, verbose = false) {
const endTime = new Date()
console.log(
chalk.green(
`Finished indexing ${indexName}. Took ${readableTimeMinAndSec(endTime.getTime() - startTime.getTime())}`,
),
)
if (verbose) {
console.log(`To view index: ${safeUrlDisplay(`<elasticsearch-url>/${indexName}`)}`)
console.log(`To search index: ${safeUrlDisplay(`<elasticsearch-url>/${indexName}/_search`)}`)
}
}
export async function loadIndexRecords(
indexName: string,
sourceDirectory: string,
): Promise<Records> {
const filePath = path.join(sourceDirectory, `${indexName}-records.json`)
const payload = await fs.readFile(filePath, 'utf8')
return JSON.parse(payload)
}
export function escapeHTML(content: string): string {
return content.replace(/</g, '&lt;').replace(/>/g, '&gt;').replace(/"/g, '&quot;')
}
export function getSnowballLanguage(language: string): string | undefined {
return SNOWBALL_LANGUAGES[language]
}


@ -0,0 +1,52 @@
import type { estypes } from '@elastic/elasticsearch'
export const generalSearchMappings: estypes.MappingTypeMapping = {
properties: {
url: { type: 'keyword' },
title: {
type: 'text',
analyzer: 'text_analyzer',
norms: false,
term_vector: 'with_positions_offsets',
},
title_explicit: { type: 'text', analyzer: 'text_analyzer_explicit', norms: false },
content: {
type: 'text',
analyzer: 'text_analyzer',
term_vector: 'with_positions_offsets',
},
content_explicit: {
type: 'text',
analyzer: 'text_analyzer_explicit',
term_vector: 'with_positions_offsets',
},
headings: { type: 'text', analyzer: 'text_analyzer', norms: false },
headings_explicit: { type: 'text', analyzer: 'text_analyzer_explicit', norms: false },
breadcrumbs: { type: 'text' },
popularity: { type: 'float' },
intro: { type: 'text' },
toplevel: { type: 'keyword' },
},
}
export const generalAutocompleteMappings: estypes.MappingTypeMapping = {
properties: {
term: {
type: 'text',
analyzer: 'text_analyzer',
term_vector: 'with_positions_offsets',
},
popularity: { type: 'float' },
},
}
export const aiSearchAutocompleteMappings: estypes.MappingTypeMapping = {
properties: {
term: {
type: 'text',
analyzer: 'text_analyzer',
term_vector: 'with_positions_offsets',
},
popularity: { type: 'float' },
},
}


@ -1,5 +1,3 @@
// [start-readme]
//
// Return a function that you can use to run any code within and if it
// throws you get a chance to say whether to sleep + retry.
// Example:
@ -20,10 +18,8 @@
// Note that, by default, the sleep time is "exponential" by a factor of
// 1.5. So the first sleep will, in the above example,
// be 800ms, then 1,200ms, then 1,800ms, etc.
//
// [end-readme]
import { sleep } from './utils'
import { sleep } from '@/search/lib/helpers/time'
export async function retryOnErrorTest(
errorTest: (error: any) => boolean,


@ -0,0 +1,118 @@
import { SNOWBALL_LANGUAGES } from '@/search/scripts/index/utils/constants'
import type { estypes } from '@elastic/elasticsearch'
import type {
AnalysisSnowballLanguage,
AnalysisCustomAnalyzer,
} from '@elastic/elasticsearch/lib/api/types'
export function getGeneralSearchSettings(
language: string,
verbose: boolean,
): estypes.IndicesIndexSettings {
const settings: estypes.IndicesIndexSettings = {
analysis: {
char_filter: {
hyphenation_filter: {
type: 'mapping',
mappings: ['- => _'],
},
},
analyzer: {
text_analyzer_explicit: {
char_filter: ['hyphenation_filter'],
filter: ['lowercase', 'stop', 'asciifolding'],
tokenizer: 'standard',
type: 'custom',
} as AnalysisCustomAnalyzer,
text_analyzer: {
filter: ['lowercase', 'stop', 'asciifolding'],
tokenizer: 'standard',
type: 'custom',
} as AnalysisCustomAnalyzer,
},
filter: {},
},
}
const snowballLanguage = SNOWBALL_LANGUAGES[language]
if (snowballLanguage) {
const textAnalyzer = settings.analysis!.analyzer!.text_analyzer as AnalysisCustomAnalyzer
textAnalyzer.filter!.push('languaged_snowball')
settings.analysis!.filter!['languaged_snowball'] = {
type: 'snowball',
language: snowballLanguage as AnalysisSnowballLanguage,
}
} else if (verbose) {
console.warn(`No snowball language for '${language}'`)
}
return settings
}
export function getGeneralAutocompleteSettings(
language: string,
verbose = false,
): estypes.IndicesIndexSettings {
const settings: estypes.IndicesIndexSettings = {
analysis: {
analyzer: {
text_analyzer: {
filter: ['lowercase'],
tokenizer: 'standard',
type: 'custom',
} as AnalysisCustomAnalyzer,
},
filter: {},
},
}
const snowballLanguage = SNOWBALL_LANGUAGES[language]
if (snowballLanguage) {
const textAnalyzer = settings.analysis!.analyzer!.text_analyzer as AnalysisCustomAnalyzer
textAnalyzer.filter!.push('languaged_snowball')
settings.analysis!.filter!['languaged_snowball'] = {
type: 'snowball',
language: snowballLanguage as AnalysisSnowballLanguage,
}
} else if (verbose) {
console.warn(`No snowball language for '${language}'`)
}
return settings
}
export function getAISearchAutocompleteSettings(
language: string,
verbose = false,
): estypes.IndicesIndexSettings {
const settings: estypes.IndicesIndexSettings = {
analysis: {
analyzer: {
text_analyzer: {
filter: ['lowercase'],
tokenizer: 'standard',
type: 'custom',
} as AnalysisCustomAnalyzer,
},
filter: {},
},
}
const snowballLanguage = SNOWBALL_LANGUAGES[language]
if (snowballLanguage) {
const textAnalyzer = settings.analysis!.analyzer!.text_analyzer as AnalysisCustomAnalyzer
textAnalyzer.filter!.push('languaged_snowball')
settings.analysis!.filter!['languaged_snowball'] = {
type: 'snowball',
language: snowballLanguage as AnalysisSnowballLanguage,
}
} else if (verbose) {
console.warn(`No snowball language for '${language}'`)
}
return settings
}
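
The snowball-augmentation step is repeated verbatim in all three builders above. A helper along these lines could remove the duplication (a sketch only, not part of the diff; the helper name is made up):

```typescript
// Hypothetical helper: mutate a settings object to add the language-specific
// snowball filter, mirroring the repeated block in the three builders above.
import type { estypes } from '@elastic/elasticsearch'
import type {
  AnalysisSnowballLanguage,
  AnalysisCustomAnalyzer,
} from '@elastic/elasticsearch/lib/api/types'
import { SNOWBALL_LANGUAGES } from '@/search/scripts/index/utils/constants'

function addSnowballFilter(
  settings: estypes.IndicesIndexSettings,
  language: string,
  verbose = false,
): void {
  const snowballLanguage = SNOWBALL_LANGUAGES[language]
  if (!snowballLanguage) {
    if (verbose) console.warn(`No snowball language for '${language}'`)
    return
  }
  const textAnalyzer = settings.analysis!.analyzer!.text_analyzer as AnalysisCustomAnalyzer
  textAnalyzer.filter!.push('languaged_snowball')
  settings.analysis!.filter!['languaged_snowball'] = {
    type: 'snowball',
    language: snowballLanguage as AnalysisSnowballLanguage,
  }
}
```

Each builder could then call `addSnowballFilter(settings, language, verbose)` instead of repeating the block.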

View file

@ -1,76 +0,0 @@
// [start-readme]
//
// Return a function that you can use to run any code within and if it
// throws you get a chance to say whether to sleep + retry.
// Example:
//
// async function mainFunction() {
// if (Math.random() > 0.9) throw new Error('too large')
// return 'OK'
// }
//
// const errorTest = (err) => err instanceof Error && err.message.includes('too large')
// const config = { // all optional
// attempts: 3,
// sleepTime: 800,
// onError: (err, attempts) => console.warn(`Failed ${attempts} attempts`)
// }
// const ok = await retry(errorTest, mainFunction, config)
//
// Note that, by default, the sleep time is "exponential" by a factor of
// 1.5. So the first sleep will, in the above example,
// be 800ms. Then 1,200ms, Then 1,800ms. etc.
//
// [end-readme]
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms))
export async function retryOnErrorTest(
errorTest,
callback,
{
attempts = 4,
sleepTime = 1000,
exponential = 1.5,
jitterPercent = 25,
onError = () => {},
} = {},
) {
while (true) {
try {
return await callback()
} catch (error) {
if (error instanceof Error && attempts > 0 && errorTest(error)) {
if (onError) onError(error, attempts, sleepTime)
attempts--
// The reason for the jitter is to avoid a thundering herd problem.
// Suppose two independent processes/threads start at the same time.
// They both fail, perhaps due to rate limiting. Now, if they both
// sleep for 30 seconds in the first retry attempt, it'll just
// clash again 30 seconds later. But if you add a bit of jitter, at
// the next attempt these independent processes/threads will now
// start at slightly different times.
// According to the Oxford English dictionary, they define "jitter" as:
//
// slight irregular movement, variation, or unsteadiness,
// especially in an electrical signal or electronic device.
//
await sleep(addJitter(sleepTime, jitterPercent))
if (exponential) {
sleepTime *= 2
}
} else {
throw error
}
}
}
}
function addJitter(num, percent) {
// Return the number plus between 0 and $percent of that number.
// For example, for 1,000 with a 20% jitter you might get 1133.4
// because you start with 1,000 and 13.4% is a random number between
// 0 and 20%.
return num + Math.random() * percent * 0.01 * num
}

View file

@ -0,0 +1,40 @@
# Scraping for General Search
We need to scrape each page on the Docs site and use the data we scrape to index Elasticsearch.
We currently only scrape for **general search** results.
Autocomplete search data is generated from analytics events and GPT queries.
## CLI Script
Before running the scraping script, ensure that the server is running in another terminal with `npm run general-search-scrape-server`.
Run the script with `npm run general-search-scrape -- <scrape-directory>`.
After a successful run, the script generates a series of JSON files containing the page data for every page of the Docs site in the directory you passed.
The `index-general-search.yml` workflow scrapes the records into `/tmp/records` and then runs the [general-search indexing script](../index/README.md).
To see the arguments accepted by the script, pass the `--help` argument, for example:
```bash
npm run general-search-scrape -- --help
```
## Records (scraped pages)
In the context of an Elasticsearch index, a record represents a page. Each record has `breadcrumbs`, `title`, `headings`, `content` (the article content in text, not HTML), `intro` (if one exists in the frontmatter), and a unique `objectID` that is currently just the permalink of the article. Here's an example:
```json
{
"objectID":"/en/actions/creating-actions/about-custom-actions",
"breadcrumbs":"GitHub Actions / Creating actions",
"title":"About custom actions",
"headings":"About custom actions\nTypes of actions\n[...]",
"content":"Actions are individual tasks that you can combine to create jobs and customize your workflow. You can create your own actions, [...]",
"intro":"Actions are individual tasks that you can combine to create jobs and customize your workflow. You can create your own actions, or use and customize actions shared by the GitHub community.",
"toplevel":"GitHub Actions",
"popularity":0
}
```
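Expressed as a type, a record has roughly this shape (a sketch inferred from the example above; the actual `Record` type lives in the scrape `types` module and may differ):

```typescript
// Rough shape of a scraped record, inferred from the JSON example above.
// Field optionality is an assumption; the real type may differ.
interface ScrapedRecord {
  objectID: string // permalink of the article, used as the document ID
  breadcrumbs: string // e.g. "GitHub Actions / Creating actions"
  title: string
  headings: string // newline-joined section headings
  content: string // article content as text, not HTML
  intro?: string // only present when the frontmatter has an intro
  toplevel: string
  popularity: number // normalized pageview score
}
```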

View file

@ -1,14 +1,16 @@
#!/usr/bin/env node
import eventToPromise from 'event-to-promise'
import chalk from 'chalk'
import dotenv from 'dotenv'
import boxen from 'boxen'
import { HTTPError } from 'got'
import parsePageSectionsIntoRecords from './parse-page-sections-into-records.js'
import getPopularPages from './popular-pages.js'
import languages from '#src/languages/lib/languages.js'
import domwaiter from './domwaiter.js'
import languages from '@/languages/lib/languages.js'
import parsePageSectionsIntoRecords from '@/search/scripts/scrape/lib/parse-page-sections-into-records'
import getPopularPages from '@/search/scripts/scrape/lib/popular-pages'
import domwaiter from '@/search/scripts/scrape/lib/domwaiter'
import { getAllVersionsKeyFromIndexVersion } from '@/search/lib/elasticsearch-versions'
import type { Page, Permalink, Record, Config, Redirects } from '@/search/scripts/scrape/types'
const pageMarker = chalk.green('|')
const recordMarker = chalk.grey('.')
@ -31,16 +33,19 @@ const MIN_TIME = parseInt(process.env.BUILD_RECORDS_MIN_TIME || '5', 10)
const FORCE_0_POPULARITY_PRODUCTS = new Set(['contributing'])
export default async function buildRecords(
indexName,
indexablePages,
pageVersion,
languageCode,
redirects,
config = {},
) {
indexName: string,
indexablePages: Page[],
indexVersion: string,
languageCode: string,
redirects: Redirects,
config: Config = {} as Config,
): Promise<Record[]> {
// Determine the page version from the index version
const pageVersion = getAllVersionsKeyFromIndexVersion(indexVersion)
const { noMarkers, docsInternalDataPath } = config
console.log(`\n\nBuilding records for index '${indexName}' (${languages[languageCode].name})`)
const records = []
const records: Record[] = []
const pages = indexablePages
// exclude pages that are not in the current language
.filter((page) => page.languageCode === languageCode)
@ -55,12 +60,15 @@ export default async function buildRecords(
})
})
.map((permalink) => {
permalink.url = `http://localhost:${port}${permalink.href}`
if (permalink) {
permalink.url = `http://localhost:${port}${permalink.href}`
}
return permalink
})
.filter((permalink): permalink is Permalink => permalink !== undefined)
const popularPages = docsInternalDataPath
? await getPopularPages(docsInternalDataPath, redirects, pageVersion, languageCode)
? await getPopularPages(docsInternalDataPath, redirects, indexVersion, languageCode)
: {}
console.log('indexable pages', indexablePages.length)
@ -93,7 +101,7 @@ export default async function buildRecords(
if (err instanceof HTTPError && !err.response.ok) {
console.log(
'\n' +
boxen(chalk.bold(err.request.requestUrl.pathname), {
boxen(chalk.bold(err.request.requestUrl?.pathname), {
title: chalk.red('The URL it failed on was'),
padding: 1,
borderColor: 'red',

View file

@ -1,9 +1,18 @@
import { EventEmitter } from 'node:events'
import { EventEmitter } from 'events'
import Bottleneck from 'bottleneck'
import got from 'got'
import cheerio from 'cheerio'
export default function domwaiter(pages, opts = {}) {
import type { Permalink } from '@/search/scripts/scrape/types'
interface DomWaiterOptions {
parseDOM?: boolean
json?: boolean
maxConcurrent?: number
minTime?: number
}
export default function domwaiter(pages: Permalink[], opts: DomWaiterOptions = {}): EventEmitter {
const emitter = new EventEmitter()
const defaults = {
@ -17,26 +26,26 @@ export default function domwaiter(pages, opts = {}) {
const limiter = new Bottleneck(opts)
pages.forEach((page) => {
limiter.schedule(getPage, page, emitter, opts)
limiter.schedule(() => getPage(page, emitter, opts))
})
limiter
.on('idle', () => {
emitter.emit('done')
})
.on('error', (err) => {
emitter.emit('error', err)
})
limiter.on('idle', () => {
emitter.emit('done')
})
limiter.on('error', (err) => {
emitter.emit('error', err)
})
return emitter
}
async function getPage(page, emitter, opts) {
async function getPage(page: Permalink, emitter: EventEmitter, opts: DomWaiterOptions) {
emitter.emit('beforePageLoad', page)
if (opts.json) {
try {
const json = await got(page.url).json()
const json = await got(page.url!).json()
const pageCopy = Object.assign({}, page, { json })
emitter.emit('page', pageCopy)
} catch (err) {
@ -44,9 +53,9 @@ async function getPage(page, emitter, opts) {
}
} else {
try {
const body = (await got(page.url)).body
const body = (await got(page.url!)).body
const pageCopy = Object.assign({}, page, { body })
if (opts.parseDOM) pageCopy.$ = cheerio.load(body)
if (opts.parseDOM) (pageCopy as any).$ = cheerio.load(body)
emitter.emit('page', pageCopy)
} catch (err) {
emitter.emit('error', err)
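
For orientation, a caller might consume the emitter like this (a sketch based on the events above, not part of the diff; the option values are illustrative):

```typescript
// Sketch: waiting for domwaiter to scrape a list of permalinks.
// 'page', 'error', and 'done' are the events emitted by the code above.
import domwaiter from '@/search/scripts/scrape/lib/domwaiter'
import type { Permalink } from '@/search/scripts/scrape/types'

function scrapeAll(permalinks: Permalink[]): Promise<void> {
  return new Promise((resolve, reject) => {
    domwaiter(permalinks, { parseDOM: true, maxConcurrent: 10 })
      .on('page', (page) => {
        // page.$ is a cheerio instance when parseDOM is true
        console.log('scraped', page.href)
      })
      .on('error', reject)
      .on('done', resolve)
  })
}
```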

View file

@ -1,8 +1,9 @@
#!/usr/bin/env node
import { loadPages } from '#src/frame/lib/page-data.js'
import { loadPages } from '@/frame/lib/page-data.js'
export default async function findIndexablePages(match = '') {
const allPages = await loadPages()
import type { Page } from '@/search/scripts/scrape/types'
export default async function findIndexablePages(match = ''): Promise<Page[]> {
const allPages: Page[] = await loadPages()
const indexablePages = allPages
// exclude hidden pages
.filter((page) => !page.hidden)

View file

@ -1,17 +1,18 @@
#!/usr/bin/env node
import { render } from 'cheerio-to-text'
import type { Record } from '@/search/scripts/scrape/types'
// This module takes cheerio page object and divides it into sections
// using H1,H2 heading elements as section delimiters. The text
// that follows each heading becomes the content of the search record.
const ignoredHeadingSlugs = ['in-this-article', 'further-reading', 'prerequisites']
export default function parsePageSectionsIntoRecords(page) {
export default function parsePageSectionsIntoRecords(page: any): Record {
const { href, $ } = page
const title = $('h1').first().text().trim()
const breadcrumbsArray = $('[data-search=breadcrumbs] nav.breadcrumbs a')
.map((i, el) => {
.map((i: number, el: any) => {
return $(el).text().trim().replace('/', '').replace(/\s+/g, ' ')
})
.get()
@ -21,8 +22,7 @@ export default function parsePageSectionsIntoRecords(page) {
// page that don't make much sense to find in a site search.
$('[data-search=hide]').remove()
// Only slice off the last one if the length of the array is greater
// that 1.
// Only slice off the last one if the length of the array is greater than 1
// On an article page, the breadcrumbs array will be something
// like:
//
@ -51,12 +51,12 @@ export default function parsePageSectionsIntoRecords(page) {
const $sections = $('h2', $root)
.filter('[id]')
.filter((i, el) => {
.filter((i: number, el: any) => {
return !ignoredHeadingSlugs.includes($(el).attr('id'))
})
const headings = $sections
.map((i, el) => $(el).text())
.map((i: number, el: any) => $(el).text())
.get()
.join('\n')
.trim()

View file

@ -2,28 +2,31 @@ import { join } from 'path'
import { existsSync } from 'fs'
import fs from 'fs/promises'
export default async function getPopularPages(dirPath, redirects, version, language) {
// The dirPath is the path to the github/docs-internal-data repo.
// We make assumptions about the structure of the repo. In particular,
// the pageviews rollups live in
// `hydro/rollups/pageviews/$language/$versionprefix/rollup.json`
// For example
// `hydro/rollups/pageviews/en/enterprise-cloud/rollup.json`
const versionPrefix = version.split('@')[0]
let filePath = join(dirPath, 'hydro/rollups/pageviews', language, versionPrefix, 'rollup.json')
import { getPlanVersionFromIndexVersion } from '@/search/lib/elasticsearch-versions.js'
import type { Redirects, PopularPages } from '@/search/scripts/scrape/types'
export default async function getPopularPages(
dirPath: string,
redirects: Redirects,
indexVersion: string,
language: string,
): Promise<PopularPages> {
const planVersion = getPlanVersionFromIndexVersion(indexVersion)
let filePath = join(dirPath, 'hydro/rollups/pageviews', language, planVersion, 'rollup.json')
if (!existsSync(filePath) && language !== 'en') {
console.warn("Trying the rollup for 'en'")
language = 'en'
filePath = join(dirPath, 'hydro/rollups/pageviews', language, versionPrefix, 'rollup.json')
filePath = join(dirPath, 'hydro/rollups/pageviews', language, planVersion, 'rollup.json')
}
if (!existsSync(filePath)) {
throw new Error(`No rollup found for version '${versionPrefix}'. Tried ${filePath}`)
throw new Error(`No rollup found for version '${planVersion}'. Tried ${filePath}`)
}
const rollupRaw = await fs.readFile(filePath, 'utf-8')
// Firt iterate through the array of objects, not making an assumption
// First iterate through the array of objects, not making an assumption
// that the first one is the biggest one.
const all = {}
const all: { [key: string]: number } = {}
for (const [path, count] of Object.entries(JSON.parse(rollupRaw))) {
if (!path) {
// Can happen if the SQL query is, for some unknown reason, finding
@ -41,11 +44,11 @@ export default async function getPopularPages(dirPath, redirects, version, langu
// We never index these anyway so their popularity is never relevant.
continue
}
all[path] = count
all[path] = count as number
}
const biggestCount = Math.max(...Object.values(all))
const popularPages = {}
const popularPages: PopularPages = {}
for (const [path, count] of Object.entries(all)) {
// Don't bother writing massively long floating point numbers
// because reducing it makes the JSON records smaller and we don't
@ -55,11 +58,6 @@ export default async function getPopularPages(dirPath, redirects, version, langu
// The reason we're heeding redirects is because it's possible
// that the JSON file is older/"staler" than the
// content itself.
// Imagine our analytics recorded that `/en/foo` had 1,234 pageviews,
// and someone goes and... `git mv content/foo content/bar` plus
// adding `redirect_from: - /foo` into the front-matter.
// Then, by using the redirects first, we can maintain that popularity
// by now "pretending" that it's `/en/bar` that has 1,234 pageviews.
popularPages[redirects[path] || path] = ratio
}
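
A small worked example of the normalization and redirect handling above (the values are illustrative, and the exact precision-trimming of `ratio` is elided in this hunk):

```typescript
// Illustrative only: with these pageview counts and one redirect,
// the most popular page gets ratio 1 and the rest are scaled against it.
const counts: { [path: string]: number } = { '/en/foo': 1000, '/en/baz': 500 }
const redirects: { [from: string]: string } = { '/en/foo': '/en/bar' }

const biggestCount = Math.max(...Object.values(counts))
const popularPages: { [path: string]: number } = {}
for (const [path, count] of Object.entries(counts)) {
  const ratio = count / biggestCount // the real script also trims the precision
  popularPages[redirects[path] || path] = ratio
}
// popularPages => { '/en/bar': 1, '/en/baz': 0.5 }
```

Because the redirect is applied first, the 1,000 pageviews recorded for `/en/foo` are credited to its new path `/en/bar`, which is the situation described in the comment above.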

View file

@ -1,22 +1,22 @@
#!/usr/bin/env node
import chalk from 'chalk'
import languages from '#src/languages/lib/languages.js'
import buildRecords from './build-records.js'
import findIndexablePages from './find-indexable-pages.js'
import { allVersions } from '#src/versions/lib/all-versions.js'
import { namePrefix } from '#src/search/lib/config.js'
import { writeIndexRecords } from './search-index-records.js'
import languages from '@/languages/lib/languages.js'
import buildRecords from '@/search/scripts/scrape/lib/build-records'
import findIndexablePages from '@/search/scripts/scrape/lib/find-indexable-pages'
import { writeIndexRecords } from '@/search/scripts/scrape/lib/search-index-records'
import { getElasticSearchIndex } from '@/search/lib/elasticsearch-indexes'
import type { Options, Config, Page, Redirects } from '@/search/scripts/scrape/types'
// Build a search data file for every combination of product version and language
// e.g. `github-docs-dotcom-en.json` and `github-docs-2.14-ja.json`
export default async function syncSearchIndexes({
export default async function scrapeIntoIndexJson({
language,
notLanguage,
outDirectory,
versionsToBuild,
config = {},
}) {
config = {} as Config,
}: Options): Promise<void> {
const t0 = new Date()
// build indices for a specific language if provided; otherwise build indices for all languages
@ -25,14 +25,14 @@ export default async function syncSearchIndexes({
)
console.log(
`Building indices for ${chalk.yellow(language || 'all languages')} and ${chalk.yellow(
`Building indices for language: ${chalk.yellow(language || 'all languages')} and version: ${chalk.yellow(
versionsToBuild.length === 1 ? versionsToBuild[0] : 'all versions',
)}.\n`,
)
// Exclude WIP pages, hidden pages, index pages, etc
const indexablePages = await findIndexablePages(config.filter)
const redirects = {}
const indexablePages: Page[] = await findIndexablePages(config.filter)
const redirects: Redirects = {}
indexablePages.forEach((page) => {
const href = page.relativePath.replace('index.md', '').replace('.md', '')
for (let redirectFrom of page.redirect_from || []) {
@ -47,22 +47,14 @@ export default async function syncSearchIndexes({
let countRecordsTotal = 0
// Build and validate all indices
for (const languageCode of languagesToBuild) {
for (const pageVersion of versionsToBuild) {
// if GHES, resolves to the release number like 2.21, 2.22, etc.
// if FPT, resolves to 'dotcom'
const indexVersion =
allVersions[pageVersion].plan === 'enterprise-server'
? allVersions[pageVersion].currentRelease
: allVersions[pageVersion].miscBaseName
// github-docs-dotcom-en, github-docs-2.22-en
const indexName = `${namePrefix}-${indexVersion}-${languageCode}`
for (const indexVersion of versionsToBuild) {
const { indexName } = getElasticSearchIndex('generalSearch', indexVersion, languageCode)
// The page version will be the new version, e.g., free-pro-team@latest, enterprise-server@3.7
const records = await buildRecords(
indexName,
indexablePages,
pageVersion,
indexVersion,
languageCode,
redirects,
config,
@ -81,6 +73,6 @@ export default async function syncSearchIndexes({
console.log(`Rate ~${chalk.bold(rate)} pages per second.`)
}
function formatSeconds(seconds) {
function formatSeconds(seconds: number): string {
return new Date(seconds * 1000).toISOString().substr(11, 8)
}
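
A possible programmatic invocation of this entry point (a sketch; the import path, version key, and output directory are illustrative, and the `Options` type may require additional fields):

```typescript
// Sketch: scraping English records for one version into /tmp/records,
// roughly mirroring what the CLI wrapper and the workflow do.
import scrapeIntoIndexJson from '@/search/scripts/scrape/lib/scrape-into-index-json' // path is an assumption

async function main() {
  await scrapeIntoIndexJson({
    language: 'en',
    outDirectory: '/tmp/records',
    versionsToBuild: ['free-pro-team@latest'],
  })
}
```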

View file

@ -1,16 +1,27 @@
#!/usr/bin/env node
import path from 'path'
import fs from 'fs/promises'
import assert from 'assert'
import { isArray, isString } from 'lodash-es'
function countArrayValues(arr) {
const counter = new Map()
arr.forEach((value) => counter.set(value, (counter.get(value) || 0) + 1))
return [...counter.entries()].map(([value, count]) => {
return { value, count }
})
import type { Record } from '@/search/scripts/scrape/types'
export async function writeIndexRecords(
name: string,
records: Record[],
outDirectory: string,
): Promise<string> {
validateRecords(name, records)
const recordsObject = Object.fromEntries(records.map((record) => [record.objectID, record]))
const content = JSON.stringify(recordsObject, undefined, 0)
const filePath = path.join(outDirectory, `${name}-records.json`)
await fs.writeFile(filePath, content)
return filePath
}
export default function validateRecords(name, records) {
function validateRecords(name: string, records: Record[]): true {
assert(isString(name) && name.length, '`name` is required')
assert(isArray(records) && records.length, '`records` must be a non-empty array')
@ -35,3 +46,11 @@ export default function validateRecords(name, records) {
return true
}
function countArrayValues(arr: string[]) {
const counter = new Map()
arr.forEach((value) => counter.set(value, (counter.get(value) || 0) + 1))
return [...counter.entries()].map(([value, count]) => {
return { value, count }
})
}
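
A usage sketch (not part of the diff; the index name and directory are illustrative and follow the naming described earlier):

```typescript
// Sketch: validating and writing scraped records to disk. The resulting JSON
// object is keyed by each record's objectID.
import { writeIndexRecords } from '@/search/scripts/scrape/lib/search-index-records'
import type { Record } from '@/search/scripts/scrape/types'

async function persist(records: Record[]) {
  // e.g. produces /tmp/records/github-docs-dotcom-en-records.json;
  // the real index name comes from getElasticSearchIndex
  const filePath = await writeIndexRecords('github-docs-dotcom-en', records, '/tmp/records')
  console.log('Wrote', filePath)
}
```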

Some files were not shown because too many files have changed.