From c4ea7a8c689f33227a415a1d8c58180f450bacdc Mon Sep 17 00:00:00 2001 From: Liudmila Molkova Date: Thu, 11 Apr 2024 16:15:15 -0700 Subject: [PATCH] Shared resources, search update, some other nits --- .../Azure.Sdk.Tools.AI.Helper.csproj | 2 +- .../KnowledgeBase/CloudMineIndexer.cs | 80 ++++++++++++++----- .../KnowledgeBase/DocumentIndexer.cs | 25 +++--- .../Azure.Sdk.Tools.AI.Helper/Program.cs | 23 +++--- .../Azure.Sdk.Tools.AI.Helper/README.md | 10 +-- .../appsettings.json | 8 +- .../bot-resources.bicep | 2 +- 7 files changed, 95 insertions(+), 55 deletions(-) diff --git a/tools/github-event-processor/Azure.Sdk.Tools.AI.Helper/Azure.Sdk.Tools.AI.Helper.csproj b/tools/github-event-processor/Azure.Sdk.Tools.AI.Helper/Azure.Sdk.Tools.AI.Helper.csproj index 8c7c3f16c..6885ad812 100644 --- a/tools/github-event-processor/Azure.Sdk.Tools.AI.Helper/Azure.Sdk.Tools.AI.Helper.csproj +++ b/tools/github-event-processor/Azure.Sdk.Tools.AI.Helper/Azure.Sdk.Tools.AI.Helper.csproj @@ -16,7 +16,7 @@ - + diff --git a/tools/github-event-processor/Azure.Sdk.Tools.AI.Helper/KnowledgeBase/CloudMineIndexer.cs b/tools/github-event-processor/Azure.Sdk.Tools.AI.Helper/KnowledgeBase/CloudMineIndexer.cs index d5499ed53..302e0b8c7 100644 --- a/tools/github-event-processor/Azure.Sdk.Tools.AI.Helper/KnowledgeBase/CloudMineIndexer.cs +++ b/tools/github-event-processor/Azure.Sdk.Tools.AI.Helper/KnowledgeBase/CloudMineIndexer.cs @@ -8,17 +8,21 @@ namespace Azure.Sdk.Tools.AI.Helper.KnowledgeBase; public class CloudMineIndexer : BaseIndexer { private readonly ICslQueryProvider _kustoClient; - public CloudMineIndexer(CloudMineConfig cloudMineConfig, SearchConfig searchConfig, OpenAiConfig openAiConfig, string repoName, ILoggerFactory loggerFactory) : + private readonly ILogger _logger; + public CloudMineIndexer(CloudMineConfig cloudMineConfig, SearchConfig searchConfig, OpenAiConfig openAiConfig, string repoName, ILoggerFactory loggerFactory) : base(searchConfig, openAiConfig, repoName, loggerFactory) { _kustoClient = KustoClientFactory.CreateCslQueryProvider(new KustoConnectionStringBuilder(cloudMineConfig.Endpoint) .WithAadAzCliAuthentication(true)); + _logger = loggerFactory.CreateLogger(); } - public async Task Index(string? language, string? mode) + public async Task Index(string? language, IndexMode mode) { using var activity = _activitySource.StartActivity("Index"); string query = GetQuery(language, mode); + + _logger.LogInformation("executing {query}", query); using var reader = await _kustoClient.ExecuteQueryAsync("GitHub", query, new ClientRequestProperties()); var issues = new List(); while (reader.Read()) @@ -26,37 +30,58 @@ public class CloudMineIndexer : BaseIndexer issues.Add(Issue.Read(reader)); } + _logger.LogInformation("Indexing: {count}", issues.Count); await _documentIndexer.CreateIndexIfNotExists(); await _documentIndexer.Index(issues); } - private static string GetQuery(string? langauge, string? mode) + private static string GetLabelFilter(IndexMode mode) { - string repo; - string labelFilter = ""; - - if (langauge == "dotnet") + switch (mode) { - repo = "azure-sdk-for-net"; + case IndexMode.AddressedIssues: + return "set_has_element(labelset, \"customer-reported\") and set_has_element(labelset, \"issue-addressed\")"; + case IndexMode.ReferenceIssues: + return "set_has_element(labelset, \"reference-issue\")"; + default: + return "set_has_element(labelset, \"customer-reported\")"; } - else if (langauge == "java") + } + + private static string GetQuery(string? langauge, IndexMode mode) + { + string repo = GetRepo(langauge); + + switch(mode) { - repo = "azure-sdk-for-java"; + case IndexMode.ReferenceIssues: + return GetReferenceIssueQuery(repo); + case IndexMode.AddressedIssues: + case IndexMode.ClosedIssues: + return GetClosedOrAddressedIssues(repo, GetLabelFilter(mode)); + default: + throw new NotImplementedException("Mode not supported"); + } + } + + private static string GetRepo(string? language) + { + if (language == "dotnet") + { + return "azure-sdk-for-net"; + } + else if (language == "java") + { + return "azure-sdk-for-java"; } else { - throw new NotImplementedException("Language not supported"); - } - - if (mode == "reference-issue") - { - labelFilter = " and set_has_element(labelset, \"reference-issue\")"; - } - else if (langauge == "dotnet") - { - labelFilter = " and set_has_element(labelset, \"issue-addressed\")"; + throw new NotImplementedException("language not supported"); } + } + private static string GetClosedOrAddressedIssues(string repo, string labelFilter) + { return $@" let repo = ""{repo}""; let issues = Issue @@ -65,7 +90,7 @@ let issues = Issue | project Title, Description = Body, OwnerId, IssueNumber = Number, ClosedAt, Labels, OwnerLogin | mv-expand labels = parse_json(Labels) | summarize labelset = make_set(tostring(labels[""name""])) by IssueNumber, ClosedAt, Description, Title, OwnerId, OwnerLogin -| where set_has_element(labelset, ""customer-reported""){labelFilter}; +| where {labelFilter}; let users = User | project UserId; let AzureOrgMembers = Member | where OrganizationLogin == ""Azure"" // todo: it includes all people who belonged to the org at some point, but might no longer belong to it @@ -84,4 +109,17 @@ issues | extend reply = bag_pack(""Comment"", Comment, ""User"", UserId, ""Timestamp"", CommentTs, ""AuthorIsInAzureOrg"", AuthorIsInAzureOrg) | summarize Comments = make_list(reply) by Title, Description, IssueNumber, OwnerId, ClosedAt, OwnerLogin"; } + + private static string GetReferenceIssueQuery(string repo) { + return $@" +let repo = ""{repo}""; +Issue +| where CreatedAt >= ago(365d) and isnotempty(ClosedAt) +| where OrganizationLogin == ""Azure"" and RepositoryName == repo +| project Title, Description = Body, OwnerId, IssueNumber = Number, ClosedAt, Labels, OwnerLogin +| mv-expand labels = parse_json(Labels) +| summarize labelset = make_set(tostring(labels[""name""])) by IssueNumber, ClosedAt, Description, Title, OwnerId, OwnerLogin +| where set_has_element(labelset, ""reference-issue"") +| project Title, Description, IssueNumber, OwnerId, ClosedAt, OwnerLogin"; + } } diff --git a/tools/github-event-processor/Azure.Sdk.Tools.AI.Helper/KnowledgeBase/DocumentIndexer.cs b/tools/github-event-processor/Azure.Sdk.Tools.AI.Helper/KnowledgeBase/DocumentIndexer.cs index 7c1982c33..90e5d8509 100644 --- a/tools/github-event-processor/Azure.Sdk.Tools.AI.Helper/KnowledgeBase/DocumentIndexer.cs +++ b/tools/github-event-processor/Azure.Sdk.Tools.AI.Helper/KnowledgeBase/DocumentIndexer.cs @@ -52,26 +52,26 @@ public class DocumentIndexer public async Task CreateIndex() { string vectorSearchConfigName = "my-vector-config"; + string vectorSearchHnswConfig = "my-hsnw-vector-config"; var index = new SearchIndex(_indexName) { Fields = { new SimpleField("Id", SearchFieldDataType.String) { IsKey = true, IsFilterable = true }, new SearchableField("Content") { IsFilterable = true }, - new SearchField("ContentVector", SearchFieldDataType.Collection(SearchFieldDataType.Single)) - { - IsSearchable = true, - VectorSearchDimensions = 1536, - VectorSearchConfiguration = vectorSearchConfigName - }, + new VectorSearchField("ContentVector", 1536, vectorSearchConfigName), new SearchableField("Source") { IsFilterable = true, IsSortable = true, IsFacetable = true}, new SearchableField("Title") { IsFilterable = true, IsSortable = true, IsFacetable = true}, }, VectorSearch = new() { - AlgorithmConfigurations = + Profiles = { - new HnswVectorSearchAlgorithmConfiguration(vectorSearchConfigName) + new VectorSearchProfile(vectorSearchConfigName, vectorSearchHnswConfig) + }, + Algorithms = + { + new HnswAlgorithmConfiguration(vectorSearchHnswConfig) } } }; @@ -116,7 +116,9 @@ public class DocumentIndexer private async Task> Vectorize(string text) { - var embeddings = await _openAi.GetEmbeddingsAsync(_embeddingsModel, new EmbeddingsOptions(text)); + text = text.Replace('\n', ' ').Replace('\r', ' '); + + var embeddings = await _openAi.GetEmbeddingsAsync(_embeddingsModel, new EmbeddingsOptions(new[] { text })); return embeddings.Value.Data[0].Embedding; } @@ -192,7 +194,6 @@ public class DocumentIndexer public class Document { - internal const string Separator = "\n~~~END~~~\n"; internal static readonly string RepoUrl = $"https://github.com/Azure/"; internal static readonly char[] EndOfLine = new[] { '\n', '\r' }; @@ -238,7 +239,7 @@ public class DocumentIndexer var content = new StringBuilder() .AppendLine(issue.Title) .AppendFormat("Customer: {0} ", description.Trim()) - .Append(Separator); + .Append("\n"); foreach (IssueComment comment in issue.Comments!) { @@ -251,7 +252,7 @@ public class DocumentIndexer content.Append("Customer: "); } - content.Append(comment.Comment?.Trim()).Append(Separator); + content.Append(comment.Comment?.Trim()).Append("\n"); } return content.ToString(); diff --git a/tools/github-event-processor/Azure.Sdk.Tools.AI.Helper/Program.cs b/tools/github-event-processor/Azure.Sdk.Tools.AI.Helper/Program.cs index 12b3996f7..e9c77b97f 100644 --- a/tools/github-event-processor/Azure.Sdk.Tools.AI.Helper/Program.cs +++ b/tools/github-event-processor/Azure.Sdk.Tools.AI.Helper/Program.cs @@ -1,3 +1,4 @@ +using System.ComponentModel.DataAnnotations; using Azure.Monitor.OpenTelemetry.Exporter; using Azure.Sdk.Tools.AI.Helper.KnowledgeBase; using CommandLine; @@ -22,14 +23,14 @@ internal class Program [Option('c', "command", HelpText = "Select command, one of: `index`, `query`", Required = true)] public string? Command { get; set; } - [Option('m', "mode", HelpText = "Select indexing data subset: one of `issues`, `docs`, `reference-issues`")] - public string? Mode { get; set; } + [Option('m', "mode", HelpText = "Select indexing data subset: one of `Docs`, `ReferenceIssues`, `AddressedIssues`, `ClosedIssues`", Default = IndexMode.ClosedIssues)] + public IndexMode Mode { get; set; } [Option('p', "path", HelpText = "When indexing markdown documents, path to the root - all md files except changelogs, swaggers, contributing will be indexed recursively")] public string? Path { get; set; } - [Option('k', "keep-index", HelpText = "Keep existing index. Defaults to false", Default = false)] - public bool KeepIndex { get; set; } + [Option('k', "drop-index", HelpText = "Drop existing index. Defaults to false", Default = false)] + public bool DropIndex { get; set; } [Option('q', "question", HelpText = "Issue description to get bot suggestion for")] public string? Question { get; set; } @@ -64,13 +65,13 @@ internal class Program } else if (options.Command == "index") { - if (options.Mode == "issues" || options.Mode == "reference-issues") - { - await IndexIssues(cloudMineConfig, searchConfig, openAiConfig, loggerFactory, options); - } - else if (options.Mode == "docs") + if (options.Mode == IndexMode.Docs) { await IndexDocs(searchConfig, openAiConfig, loggerFactory, options); + } + else + { + await IndexIssues(cloudMineConfig, searchConfig, openAiConfig, loggerFactory, options); } } else @@ -82,7 +83,7 @@ internal class Program private static async Task IndexIssues(CloudMineConfig cloudMineConfig, SearchConfig searchConfig, OpenAiConfig openAiConfig, ILoggerFactory loggerFactory, ConsoleOptions options) { var cloudMineIndexer = new CloudMineIndexer(cloudMineConfig, searchConfig, openAiConfig, GetRepoName(options), loggerFactory); - if (!options.KeepIndex) + if (options.DropIndex) { await cloudMineIndexer.DeleteIndex(); } @@ -92,7 +93,7 @@ internal class Program private static async Task IndexDocs(SearchConfig searchConfig, OpenAiConfig openAiConfig, ILoggerFactory loggerFactory, ConsoleOptions options) { var mdIndexer = new MarkdownFileIndexer(searchConfig, openAiConfig, GetRepoName(options), loggerFactory); - if (!options.KeepIndex) + if (options.DropIndex) { await mdIndexer.DeleteIndex(); } diff --git a/tools/github-event-processor/Azure.Sdk.Tools.AI.Helper/README.md b/tools/github-event-processor/Azure.Sdk.Tools.AI.Helper/README.md index b618a0df9..25586e980 100644 --- a/tools/github-event-processor/Azure.Sdk.Tools.AI.Helper/README.md +++ b/tools/github-event-processor/Azure.Sdk.Tools.AI.Helper/README.md @@ -28,24 +28,24 @@ There's a default issue text inside `Program.cs`, so `--question` is optional. 2. To index existing issues (created by customers, closed, marked with `issue-addressed`) in the dotnet repo, do ```bash -dotnet run --command index --language dotnet --mode issues +dotnet run --command index --language dotnet --mode AddressedIssues ``` Only `dotnet` and `java` are supported. -3. As a variation, you can index all issues marked with `reference-issue` with +3. Or you can index all issues marked with `reference-issue` with ```bash -dotnet run --command index --language java --mode reference-issues +dotnet run --command index --language java --mode ReferenceIssues ``` 4. To index all markdown docs in the path (except CHANGELOG, CONTRIBUTING, swaggers) ```bash -dotnet run --command index --language java --mode docs --path c:\repo\azure-sdk-for-java\sdk +dotnet run --command index --language java --mode Docs --path c:\repo\azure-sdk-for-java\sdk ``` -> Note: when indexing data, existing index with provided name is deleted first. If you want to preserve the index, pass `--keep-index` +> Note: when indexing data, it's added to the existing index. If you want to clean the index first, pass `--drop-index`. You can always get cli help with `dotnet run -- -h` diff --git a/tools/github-event-processor/Azure.Sdk.Tools.AI.Helper/appsettings.json b/tools/github-event-processor/Azure.Sdk.Tools.AI.Helper/appsettings.json index 4be6cd752..d08f3f83e 100644 --- a/tools/github-event-processor/Azure.Sdk.Tools.AI.Helper/appsettings.json +++ b/tools/github-event-processor/Azure.Sdk.Tools.AI.Helper/appsettings.json @@ -6,15 +6,15 @@ } }, "OpenAI": { - "Endpoint": "https://support-bot-playground.openai.azure.com/", - "EmbeddingModel": "knowledge-base-embeddings", - "InferenceModel": "knowledge-base-gpt4" + "Endpoint": "https://openai-shared.openai.azure.com", + "EmbeddingModel": "text-embedding-ada-002", + "InferenceModel": "gpt-4" }, "CloudMine": { "Endpoint": "https://1es.kusto.windows.net" }, "Search": { - "Endpoint": "https://lim-knowledge-base-search.search.windows.net", + "Endpoint": "https://openai-shared.search.windows.net", "IndexName": "java-vector" } } diff --git a/tools/github-event-processor/Azure.Sdk.Tools.AI.Helper/bot-resources.bicep b/tools/github-event-processor/Azure.Sdk.Tools.AI.Helper/bot-resources.bicep index 7bb785a85..214bd77e0 100644 --- a/tools/github-event-processor/Azure.Sdk.Tools.AI.Helper/bot-resources.bicep +++ b/tools/github-event-processor/Azure.Sdk.Tools.AI.Helper/bot-resources.bicep @@ -8,7 +8,7 @@ param location string = resourceGroup().location param embeddingsCapacity int = 20 param inferenceCapacity int = 10 -var embeddingModelName = 'text-embedding-3-small' +var embeddingModelName = 'text-embedding-ada-002' var inferenceModelName = 'gpt-4' var openAiDeployments = [