azure-sdk-tools/tools/sdk-ai-bots/Scripts/Build-EngHubDocEmbeddings.ps1

114 строки
4.7 KiB
PowerShell

<#
.SYNOPSIS
Builds the embeddings for the documents under https://eng.ms/docs/products/azure-developer-experience.
.DESCRIPTION
Refreshes the embeddings for the documents of the engineering hub site.
The script expects the current location to be the root of the repository and performs these steps:
  1. Clones the azure-sdk-docs-eng.ms repository when it is not already present.
  2. Builds metadata.json for the markdown documents via Markdown-BuildIndexMetadata.ps1.
  3. Optionally downloads the previously saved last_rag_chunks_enghub_docs.json from Azure Blob Storage.
  4. Calls Build-Embeddings, then uploads the resulting chunk file back to Azure Blob Storage.
The storage account and container are read from the AZURE_STORAGE_ACCOUNT_NAME and
AZURE_STORAGE_ACCOUNT_CONTAINER environment variables; the script exits with code 1 when the
container variable is not set.
.PARAMETER IncrementalEmbedding
Controls the incremental building behavior for the embeddings. When the value compares equal to
$true, the previously saved embeddings are downloaded before the build. The value is also exported
as the INCREMENTAL_EMBEDDING environment variable before Build-Embeddings is called.
Defaults to $true.
.EXAMPLE
Build-EngHubDocEmbeddings.ps1 -IncrementalEmbedding $true
Downloads the previously saved embeddings and then builds.
.EXAMPLE
Build-EngHubDocEmbeddings.ps1 -IncrementalEmbedding $false
Skips the download of the previously saved embeddings.
#>
[CmdletBinding()]
param (
# Declared as a string (not a switch/bool); the script later compares it with -eq $true.
[Parameter(Position = 0)]
[ValidateNotNullOrEmpty()]
[string] $IncrementalEmbedding = $true
)
# Resolve the sdk-ai-bots working directory. The current location is expected to be the root of the repository.
$buildSourceDirectory = Get-Location
# When running in Azure DevOps (AGENT_ID is set) the pipeline checks out two repositories,
# azure-sdk-tools and the engineering hub repository, so the tool folder lives under azure-sdk-tools.
$toolRelativePath = if ($env:AGENT_ID) { "azure-sdk-tools\tools\sdk-ai-bots" } else { "tools\sdk-ai-bots" }
$workingDirectory = Join-Path -Path $buildSourceDirectory -ChildPath $toolRelativePath

$scriptsRoot = Join-Path -Path $workingDirectory -ChildPath "Scripts"
$embeddingToolFolder = Join-Path -Path $workingDirectory -ChildPath "Embeddings"
Write-Host "scriptsRoot: $scriptsRoot"
Write-Host "embeddingToolFolder: $embeddingToolFolder"

# Dot-source Common.ps1 into the current scope (presumably the source of Clone-Repository,
# Download-AzureBlob, Build-Embeddings and Upload-AzureBlob used below -- confirm).
$commonScript = Join-Path -Path $scriptsRoot -ChildPath "Common.ps1"
. $commonScript
# Folder layout under the working directory:
#   embeddingSource             - inputs/outputs of the embedding build
#   embeddingSource/enghub-docs - destination for the enghub documents
$embeddingSourceFolder = Join-Path -Path $workingDirectory -ChildPath "embeddingSource"
$enghubDocsDestFolder = Join-Path -Path $embeddingSourceFolder -ChildPath "enghub-docs"

# Create each folder only when it does not exist yet (parent first, then child).
foreach ($requiredFolder in @($embeddingSourceFolder, $enghubDocsDestFolder)) {
    if (-not (Test-Path -Path $requiredFolder)) {
        New-Item -ItemType Directory -Path $requiredFolder
    }
}
# Make sure the engineering hub repository is available next to the build source directory;
# clone it only when the folder is not there yet.
$reposFolder = Join-Path -Path $buildSourceDirectory -ChildPath "azure-sdk-docs-eng.ms"
$repoIsPresent = Test-Path $reposFolder
if (-not $repoIsPresent) {
    Write-Host "Cloning azure-sdk-docs-eng.ms repository at $buildSourceDirectory"
    $cloneSucceeded = Clone-Repository -RepoUrl "https://azure-sdk@dev.azure.com/azure-sdk/internal/_git/azure-sdk-docs-eng.ms" -RootFolder $buildSourceDirectory
    if (-not $cloneSucceeded) {
        exit 1
    }
}

# The markdown documents are expected in the repository's docs folder; stop when it is missing.
$enghubDocsSrcFolder = Join-Path -Path $buildSourceDirectory -ChildPath "azure-sdk-docs-eng.ms/docs"
$docsFolderFound = Test-Path $enghubDocsSrcFolder
if (-not $docsFolderFound) {
    Write-Error "Failed to find the enghub documents folder at $enghubDocsSrcFolder"
    exit 1
}
# Generate metadata.json for the enghub documents with the helper script.
Write-Host "Building metadata.json file for enghub documents"
$buildMetadataScript = Join-Path -Path $scriptsRoot -ChildPath "Markdown-BuildIndexMetadata.ps1"
& $buildMetadataScript -MarkdownDirectory $enghubDocsSrcFolder -OutputDirectory $enghubDocsDestFolder

# The presence of metadata.json in the output directory is the success signal; bail out otherwise.
$generatedMetadataFile = "$enghubDocsDestFolder/metadata.json"
if (-not (Test-Path $generatedMetadataFile)) {
    Write-Error "Failed to build metadata.json file for enghub documents"
    exit 1
}

# Keep a copy under the name later exported through METADATA_PATH.
Copy-Item -Path $generatedMetadataFile -Destination "$embeddingSourceFolder/metadata_enghub_docs.json"
# Configure Azure Blob Storage access and, for incremental runs, download the previously saved
# embeddings (last_rag_chunks_enghub_docs.json).
# AzCopy uses the Azure PowerShell login type.
# When running this script locally, first run 'Connect-AzAccount' and then 'Set-AzContext' to switch to the correct subscription.
$env:AZCOPY_AUTO_LOGIN_TYPE = "PSCRED"
$blobName = "last_rag_chunks_enghub_docs.json"
$destinationPath = $embeddingSourceFolder
$ragChunkPath = Join-Path -Path $embeddingSourceFolder -ChildPath $blobName
$storageAccountName = $env:AZURE_STORAGE_ACCOUNT_NAME
$containerName = $env:AZURE_STORAGE_ACCOUNT_CONTAINER

# Fail fast on missing storage settings. Both values are needed by the download below and by the
# upload at the end of the script; without this check a non-incremental run would only discover
# a missing account name after the whole embedding build has finished.
if (-not $storageAccountName) {
    Write-Error "Please set the environment variable 'AZURE_STORAGE_ACCOUNT_NAME'."
    exit 1
}
if (-not $containerName) {
    Write-Error "Please set the environment variable 'AZURE_STORAGE_ACCOUNT_CONTAINER'."
    exit 1
}

# $IncrementalEmbedding is a string parameter; -eq converts $true to "True" and compares
# case-insensitively, so "True"/"true"/$true all enable the download.
if ($IncrementalEmbedding -eq $true) {
    Write-Host "Downloading previous saved embeddings $blobName from Azure Blob Storage"
    if (-not (Download-AzureBlob -StorageAccountName $storageAccountName -ContainerName $containerName -BlobName $blobName -DestinationPath $destinationPath)) {
        exit 1
    }
}
# Build the embeddings. The inputs are exported as process environment variables
# before Build-Embeddings is invoked with the embedding tool folder.
Write-Host "Building embeddings for enghub documents"
$embeddingEnvironment = [ordered]@{
    RAG_CHUNK_PATH                    = $ragChunkPath
    METADATA_PATH                     = "$embeddingSourceFolder/metadata_enghub_docs.json"
    DOCUMENT_PATH                     = $enghubDocsDestFolder
    INCREMENTAL_EMBEDDING             = $IncrementalEmbedding
    AZURESEARCH_FIELDS_CONTENT        = "Text"
    AZURESEARCH_FIELDS_CONTENT_VECTOR = "Embedding"
    AZURESEARCH_FIELDS_TAG            = "AdditionalMetadata"
    AZURESEARCH_FIELDS_ID             = "Id"
}
foreach ($setting in $embeddingEnvironment.GetEnumerator()) {
    Set-Item -LiteralPath "Env:$($setting.Key)" -Value $setting.Value
}

# A falsy result from Build-Embeddings aborts the script.
$embeddingsBuilt = Build-Embeddings -EmbeddingToolFolder $embeddingToolFolder
if (-not $embeddingsBuilt) {
    exit 1
}
# Publish the embeddings output to Azure Blob Storage under the same blob name
# that incremental runs download; a falsy result aborts the script.
Write-Host "Uploading embeddings output $ragChunkPath to Azure Blob Storage"
$uploadSucceeded = Upload-AzureBlob -StorageAccountName $storageAccountName -ContainerName $containerName -BlobName $blobName -SourceFile $ragChunkPath
if (-not $uploadSucceeded) {
    exit 1
}