Shared resources, search update, some other nits

This commit is contained in:
Liudmila Molkova 2024-04-11 16:15:15 -07:00
Родитель ac553163d4
Коммит c4ea7a8c68
7 изменённых файлов: 95 добавлений и 55 удалений

Просмотреть файл

@ -16,7 +16,7 @@
<PackageReference Include="Newtonsoft.Json" Version="13.0.3" />
<!-- TODO: Switch to Azure.Monitor.Query once https://github.com/Azure/azure-sdk-for-net/issues/37475 is resolved -->
<PackageReference Include="Microsoft.Azure.Kusto.Data" Version="11.3.2" />
<PackageReference Include="Azure.Search.Documents" Version="11.5.0-beta.4" />
<PackageReference Include="Azure.Search.Documents" Version="11.6.0-beta.3" />
<PackageReference Include="Microsoft.Extensions.Configuration" Version="6.0.0" />
<PackageReference Include="Microsoft.Extensions.Configuration.Json" Version="6.0.0" />
<PackageReference Include="Microsoft.Extensions.Configuration.EnvironmentVariables" Version="6.0.0" />

Просмотреть файл

@ -8,17 +8,21 @@ namespace Azure.Sdk.Tools.AI.Helper.KnowledgeBase;
public class CloudMineIndexer : BaseIndexer
{
private readonly ICslQueryProvider _kustoClient;
public CloudMineIndexer(CloudMineConfig cloudMineConfig, SearchConfig searchConfig, OpenAiConfig openAiConfig, string repoName, ILoggerFactory loggerFactory) :
private readonly ILogger<CloudMineIndexer> _logger;
public CloudMineIndexer(CloudMineConfig cloudMineConfig, SearchConfig searchConfig, OpenAiConfig openAiConfig, string repoName, ILoggerFactory loggerFactory) :
base(searchConfig, openAiConfig, repoName, loggerFactory)
{
_kustoClient = KustoClientFactory.CreateCslQueryProvider(new KustoConnectionStringBuilder(cloudMineConfig.Endpoint)
.WithAadAzCliAuthentication(true));
_logger = loggerFactory.CreateLogger<CloudMineIndexer>();
}
public async Task Index(string? language, string? mode)
public async Task Index(string? language, IndexMode mode)
{
using var activity = _activitySource.StartActivity("Index");
string query = GetQuery(language, mode);
_logger.LogInformation("executing {query}", query);
using var reader = await _kustoClient.ExecuteQueryAsync("GitHub", query, new ClientRequestProperties());
var issues = new List<Issue>();
while (reader.Read())
@ -26,37 +30,58 @@ public class CloudMineIndexer : BaseIndexer
issues.Add(Issue.Read(reader));
}
_logger.LogInformation("Indexing: {count}", issues.Count);
await _documentIndexer.CreateIndexIfNotExists();
await _documentIndexer.Index(issues);
}
private static string GetQuery(string? langauge, string? mode)
private static string GetLabelFilter(IndexMode mode)
{
string repo;
string labelFilter = "";
if (langauge == "dotnet")
switch (mode)
{
repo = "azure-sdk-for-net";
case IndexMode.AddressedIssues:
return "set_has_element(labelset, \"customer-reported\") and set_has_element(labelset, \"issue-addressed\")";
case IndexMode.ReferenceIssues:
return "set_has_element(labelset, \"reference-issue\")";
default:
return "set_has_element(labelset, \"customer-reported\")";
}
else if (langauge == "java")
}
private static string GetQuery(string? langauge, IndexMode mode)
{
string repo = GetRepo(langauge);
switch(mode)
{
repo = "azure-sdk-for-java";
case IndexMode.ReferenceIssues:
return GetReferenceIssueQuery(repo);
case IndexMode.AddressedIssues:
case IndexMode.ClosedIssues:
return GetClosedOrAddressedIssues(repo, GetLabelFilter(mode));
default:
throw new NotImplementedException("Mode not supported");
}
}
private static string GetRepo(string? language)
{
if (language == "dotnet")
{
return "azure-sdk-for-net";
}
else if (language == "java")
{
return "azure-sdk-for-java";
}
else
{
throw new NotImplementedException("Language not supported");
}
if (mode == "reference-issue")
{
labelFilter = " and set_has_element(labelset, \"reference-issue\")";
}
else if (langauge == "dotnet")
{
labelFilter = " and set_has_element(labelset, \"issue-addressed\")";
throw new NotImplementedException("language not supported");
}
}
private static string GetClosedOrAddressedIssues(string repo, string labelFilter)
{
return $@"
let repo = ""{repo}"";
let issues = Issue
@ -65,7 +90,7 @@ let issues = Issue
| project Title, Description = Body, OwnerId, IssueNumber = Number, ClosedAt, Labels, OwnerLogin
| mv-expand labels = parse_json(Labels)
| summarize labelset = make_set(tostring(labels[""name""])) by IssueNumber, ClosedAt, Description, Title, OwnerId, OwnerLogin
| where set_has_element(labelset, ""customer-reported""){labelFilter};
| where {labelFilter};
let users = User | project UserId;
let AzureOrgMembers = Member
| where OrganizationLogin == ""Azure"" // todo: it includes all people who belonged to the org at some point, but might no longer belong to it
@ -84,4 +109,17 @@ issues
| extend reply = bag_pack(""Comment"", Comment, ""User"", UserId, ""Timestamp"", CommentTs, ""AuthorIsInAzureOrg"", AuthorIsInAzureOrg)
| summarize Comments = make_list(reply) by Title, Description, IssueNumber, OwnerId, ClosedAt, OwnerLogin";
}
/// <summary>
/// Builds the Kusto query text that selects issues labeled <c>reference-issue</c>.
/// </summary>
/// <remarks>
/// The query keeps issues created within the last 365 days that have a non-empty
/// <c>ClosedAt</c>, belong to the <c>Azure</c> organization and the given repository,
/// and carry the <c>reference-issue</c> label. It projects Title, Description,
/// IssueNumber, OwnerId, ClosedAt and OwnerLogin.
/// </remarks>
/// <param name="repo">Repository name inserted verbatim into the query's <c>repo</c> variable.</param>
/// <returns>The query text.</returns>
private static string GetReferenceIssueQuery(string repo)
{
    // The verbatim literal keeps its line breaks exactly as written, so the query lines stay unindented.
    string referenceIssueQuery = $@"
let repo = ""{repo}"";
Issue
| where CreatedAt >= ago(365d) and isnotempty(ClosedAt)
| where OrganizationLogin == ""Azure"" and RepositoryName == repo
| project Title, Description = Body, OwnerId, IssueNumber = Number, ClosedAt, Labels, OwnerLogin
| mv-expand labels = parse_json(Labels)
| summarize labelset = make_set(tostring(labels[""name""])) by IssueNumber, ClosedAt, Description, Title, OwnerId, OwnerLogin
| where set_has_element(labelset, ""reference-issue"")
| project Title, Description, IssueNumber, OwnerId, ClosedAt, OwnerLogin";

    return referenceIssueQuery;
}
}

Просмотреть файл

@ -52,26 +52,26 @@ public class DocumentIndexer
public async Task CreateIndex()
{
string vectorSearchConfigName = "my-vector-config";
string vectorSearchHnswConfig = "my-hsnw-vector-config";
var index = new SearchIndex(_indexName)
{
Fields =
{
new SimpleField("Id", SearchFieldDataType.String) { IsKey = true, IsFilterable = true },
new SearchableField("Content") { IsFilterable = true },
new SearchField("ContentVector", SearchFieldDataType.Collection(SearchFieldDataType.Single))
{
IsSearchable = true,
VectorSearchDimensions = 1536,
VectorSearchConfiguration = vectorSearchConfigName
},
new VectorSearchField("ContentVector", 1536, vectorSearchConfigName),
new SearchableField("Source") { IsFilterable = true, IsSortable = true, IsFacetable = true},
new SearchableField("Title") { IsFilterable = true, IsSortable = true, IsFacetable = true},
},
VectorSearch = new()
{
AlgorithmConfigurations =
Profiles =
{
new HnswVectorSearchAlgorithmConfiguration(vectorSearchConfigName)
new VectorSearchProfile(vectorSearchConfigName, vectorSearchHnswConfig)
},
Algorithms =
{
new HnswAlgorithmConfiguration(vectorSearchHnswConfig)
}
}
};
@ -116,7 +116,9 @@ public class DocumentIndexer
private async Task<IReadOnlyList<float>> Vectorize(string text)
{
var embeddings = await _openAi.GetEmbeddingsAsync(_embeddingsModel, new EmbeddingsOptions(text));
text = text.Replace('\n', ' ').Replace('\r', ' ');
var embeddings = await _openAi.GetEmbeddingsAsync(_embeddingsModel, new EmbeddingsOptions(new[] { text }));
return embeddings.Value.Data[0].Embedding;
}
@ -192,7 +194,6 @@ public class DocumentIndexer
public class Document
{
internal const string Separator = "\n~~~END~~~\n";
internal static readonly string RepoUrl = $"https://github.com/Azure/";
internal static readonly char[] EndOfLine = new[] { '\n', '\r' };
@ -238,7 +239,7 @@ public class DocumentIndexer
var content = new StringBuilder()
.AppendLine(issue.Title)
.AppendFormat("Customer: {0} ", description.Trim())
.Append(Separator);
.Append("\n");
foreach (IssueComment comment in issue.Comments!)
{
@ -251,7 +252,7 @@ public class DocumentIndexer
content.Append("Customer: ");
}
content.Append(comment.Comment?.Trim()).Append(Separator);
content.Append(comment.Comment?.Trim()).Append("\n");
}
return content.ToString();

Просмотреть файл

@ -1,3 +1,4 @@
using System.ComponentModel.DataAnnotations;
using Azure.Monitor.OpenTelemetry.Exporter;
using Azure.Sdk.Tools.AI.Helper.KnowledgeBase;
using CommandLine;
@ -22,14 +23,14 @@ internal class Program
[Option('c', "command", HelpText = "Select command, one of: `index`, `query`", Required = true)]
public string? Command { get; set; }
[Option('m', "mode", HelpText = "Select indexing data subset: one of `issues`, `docs`, `reference-issues`")]
public string? Mode { get; set; }
[Option('m', "mode", HelpText = "Select indexing data subset: one of `Docs`, `ReferenceIssues`, `AddressedIssues`, `ClosedIssues`", Default = IndexMode.ClosedIssues)]
public IndexMode Mode { get; set; }
[Option('p', "path", HelpText = "When indexing markdown documents, path to the root - all md files except changelogs, swaggers, contributing will be indexed recursively")]
public string? Path { get; set; }
[Option('k', "keep-index", HelpText = "Keep existing index. Defaults to false", Default = false)]
public bool KeepIndex { get; set; }
[Option('k', "drop-index", HelpText = "Drop existing index. Defaults to false", Default = false)]
public bool DropIndex { get; set; }
[Option('q', "question", HelpText = "Issue description to get bot suggestion for")]
public string? Question { get; set; }
@ -64,13 +65,13 @@ internal class Program
}
else if (options.Command == "index")
{
if (options.Mode == "issues" || options.Mode == "reference-issues")
{
await IndexIssues(cloudMineConfig, searchConfig, openAiConfig, loggerFactory, options);
}
else if (options.Mode == "docs")
if (options.Mode == IndexMode.Docs)
{
await IndexDocs(searchConfig, openAiConfig, loggerFactory, options);
}
else
{
await IndexIssues(cloudMineConfig, searchConfig, openAiConfig, loggerFactory, options);
}
}
else
@ -82,7 +83,7 @@ internal class Program
private static async Task IndexIssues(CloudMineConfig cloudMineConfig, SearchConfig searchConfig, OpenAiConfig openAiConfig, ILoggerFactory loggerFactory, ConsoleOptions options)
{
var cloudMineIndexer = new CloudMineIndexer(cloudMineConfig, searchConfig, openAiConfig, GetRepoName(options), loggerFactory);
if (!options.KeepIndex)
if (options.DropIndex)
{
await cloudMineIndexer.DeleteIndex();
}
@ -92,7 +93,7 @@ internal class Program
private static async Task IndexDocs(SearchConfig searchConfig, OpenAiConfig openAiConfig, ILoggerFactory loggerFactory, ConsoleOptions options)
{
var mdIndexer = new MarkdownFileIndexer(searchConfig, openAiConfig, GetRepoName(options), loggerFactory);
if (!options.KeepIndex)
if (options.DropIndex)
{
await mdIndexer.DeleteIndex();
}

Просмотреть файл

@ -28,24 +28,24 @@ There's a default issue text inside `Program.cs`, so `--question` is optional.
2. To index existing issues (created by customers, closed, marked with `issue-addressed`) in the dotnet repo, run:
```bash
dotnet run --command index --language dotnet --mode issues
dotnet run --command index --language dotnet --mode AddressedIssues
```
Only `dotnet` and `java` are supported.
3. As a variation, you can index all issues marked with `reference-issue` with
3. Or you can index all issues marked with `reference-issue` with
```bash
dotnet run --command index --language java --mode reference-issues
dotnet run --command index --language java --mode ReferenceIssues
```
4. To index all markdown docs in the path (except CHANGELOG, CONTRIBUTING, swaggers)
```bash
dotnet run --command index --language java --mode docs --path c:\repo\azure-sdk-for-java\sdk
dotnet run --command index --language java --mode Docs --path c:\repo\azure-sdk-for-java\sdk
```
> Note: when indexing data, existing index with provided name is deleted first. If you want to preserve the index, pass `--keep-index`
> Note: newly indexed data is added to the existing index. To delete the existing index before indexing, pass `--drop-index`.
You can always get cli help with `dotnet run -- -h`

Просмотреть файл

@ -6,15 +6,15 @@
}
},
"OpenAI": {
"Endpoint": "https://support-bot-playground.openai.azure.com/",
"EmbeddingModel": "knowledge-base-embeddings",
"InferenceModel": "knowledge-base-gpt4"
"Endpoint": "https://openai-shared.openai.azure.com",
"EmbeddingModel": "text-embedding-ada-002",
"InferenceModel": "gpt-4"
},
"CloudMine": {
"Endpoint": "https://1es.kusto.windows.net"
},
"Search": {
"Endpoint": "https://lim-knowledge-base-search.search.windows.net",
"Endpoint": "https://openai-shared.search.windows.net",
"IndexName": "java-vector"
}
}

Просмотреть файл

@ -8,7 +8,7 @@ param location string = resourceGroup().location
param embeddingsCapacity int = 20
param inferenceCapacity int = 10
var embeddingModelName = 'text-embedding-3-small'
var embeddingModelName = 'text-embedding-ada-002'
var inferenceModelName = 'gpt-4'
var openAiDeployments = [