Shared resources, search update, some other nits
This commit is contained in:
Родитель
ac553163d4
Коммит
c4ea7a8c68
|
@ -16,7 +16,7 @@
|
|||
<PackageReference Include="Newtonsoft.Json" Version="13.0.3" />
|
||||
<!-- TODO: Switch to Azure.Monitor.Query once https://github.com/Azure/azure-sdk-for-net/issues/37475 is resolved -->
|
||||
<PackageReference Include="Microsoft.Azure.Kusto.Data" Version="11.3.2" />
|
||||
<PackageReference Include="Azure.Search.Documents" Version="11.5.0-beta.4" />
|
||||
<PackageReference Include="Azure.Search.Documents" Version="11.6.0-beta.3" />
|
||||
<PackageReference Include="Microsoft.Extensions.Configuration" Version="6.0.0" />
|
||||
<PackageReference Include="Microsoft.Extensions.Configuration.Json" Version="6.0.0" />
|
||||
<PackageReference Include="Microsoft.Extensions.Configuration.EnvironmentVariables" Version="6.0.0" />
|
||||
|
|
|
@ -8,17 +8,21 @@ namespace Azure.Sdk.Tools.AI.Helper.KnowledgeBase;
|
|||
public class CloudMineIndexer : BaseIndexer
|
||||
{
|
||||
private readonly ICslQueryProvider _kustoClient;
|
||||
public CloudMineIndexer(CloudMineConfig cloudMineConfig, SearchConfig searchConfig, OpenAiConfig openAiConfig, string repoName, ILoggerFactory loggerFactory) :
|
||||
private readonly ILogger<CloudMineIndexer> _logger;
|
||||
public CloudMineIndexer(CloudMineConfig cloudMineConfig, SearchConfig searchConfig, OpenAiConfig openAiConfig, string repoName, ILoggerFactory loggerFactory) :
|
||||
base(searchConfig, openAiConfig, repoName, loggerFactory)
|
||||
{
|
||||
_kustoClient = KustoClientFactory.CreateCslQueryProvider(new KustoConnectionStringBuilder(cloudMineConfig.Endpoint)
|
||||
.WithAadAzCliAuthentication(true));
|
||||
_logger = loggerFactory.CreateLogger<CloudMineIndexer>();
|
||||
}
|
||||
|
||||
public async Task Index(string? language, string? mode)
|
||||
public async Task Index(string? language, IndexMode mode)
|
||||
{
|
||||
using var activity = _activitySource.StartActivity("Index");
|
||||
string query = GetQuery(language, mode);
|
||||
|
||||
_logger.LogInformation("executing {query}", query);
|
||||
using var reader = await _kustoClient.ExecuteQueryAsync("GitHub", query, new ClientRequestProperties());
|
||||
var issues = new List<Issue>();
|
||||
while (reader.Read())
|
||||
|
@ -26,37 +30,58 @@ public class CloudMineIndexer : BaseIndexer
|
|||
issues.Add(Issue.Read(reader));
|
||||
}
|
||||
|
||||
_logger.LogInformation("Indexing: {count}", issues.Count);
|
||||
await _documentIndexer.CreateIndexIfNotExists();
|
||||
await _documentIndexer.Index(issues);
|
||||
}
|
||||
|
||||
private static string GetQuery(string? langauge, string? mode)
|
||||
private static string GetLabelFilter(IndexMode mode)
|
||||
{
|
||||
string repo;
|
||||
string labelFilter = "";
|
||||
|
||||
if (langauge == "dotnet")
|
||||
switch (mode)
|
||||
{
|
||||
repo = "azure-sdk-for-net";
|
||||
case IndexMode.AddressedIssues:
|
||||
return "set_has_element(labelset, \"customer-reported\") and set_has_element(labelset, \"issue-addressed\")";
|
||||
case IndexMode.ReferenceIssues:
|
||||
return "set_has_element(labelset, \"reference-issue\")";
|
||||
default:
|
||||
return "set_has_element(labelset, \"customer-reported\")";
|
||||
}
|
||||
else if (langauge == "java")
|
||||
}
|
||||
|
||||
private static string GetQuery(string? langauge, IndexMode mode)
|
||||
{
|
||||
string repo = GetRepo(langauge);
|
||||
|
||||
switch(mode)
|
||||
{
|
||||
repo = "azure-sdk-for-java";
|
||||
case IndexMode.ReferenceIssues:
|
||||
return GetReferenceIssueQuery(repo);
|
||||
case IndexMode.AddressedIssues:
|
||||
case IndexMode.ClosedIssues:
|
||||
return GetClosedOrAddressedIssues(repo, GetLabelFilter(mode));
|
||||
default:
|
||||
throw new NotImplementedException("Mode not supported");
|
||||
}
|
||||
}
|
||||
|
||||
private static string GetRepo(string? language)
|
||||
{
|
||||
if (language == "dotnet")
|
||||
{
|
||||
return "azure-sdk-for-net";
|
||||
}
|
||||
else if (language == "java")
|
||||
{
|
||||
return "azure-sdk-for-java";
|
||||
}
|
||||
else
|
||||
{
|
||||
throw new NotImplementedException("Language not supported");
|
||||
}
|
||||
|
||||
if (mode == "reference-issue")
|
||||
{
|
||||
labelFilter = " and set_has_element(labelset, \"reference-issue\")";
|
||||
}
|
||||
else if (langauge == "dotnet")
|
||||
{
|
||||
labelFilter = " and set_has_element(labelset, \"issue-addressed\")";
|
||||
throw new NotImplementedException("language not supported");
|
||||
}
|
||||
}
|
||||
|
||||
private static string GetClosedOrAddressedIssues(string repo, string labelFilter)
|
||||
{
|
||||
return $@"
|
||||
let repo = ""{repo}"";
|
||||
let issues = Issue
|
||||
|
@ -65,7 +90,7 @@ let issues = Issue
|
|||
| project Title, Description = Body, OwnerId, IssueNumber = Number, ClosedAt, Labels, OwnerLogin
|
||||
| mv-expand labels = parse_json(Labels)
|
||||
| summarize labelset = make_set(tostring(labels[""name""])) by IssueNumber, ClosedAt, Description, Title, OwnerId, OwnerLogin
|
||||
| where set_has_element(labelset, ""customer-reported""){labelFilter};
|
||||
| where {labelFilter};
|
||||
let users = User | project UserId;
|
||||
let AzureOrgMembers = Member
|
||||
| where OrganizationLogin == ""Azure"" // todo: it includes all people who belonged to the org at some point, but might no longer belong to it
|
||||
|
@ -84,4 +109,17 @@ issues
|
|||
| extend reply = bag_pack(""Comment"", Comment, ""User"", UserId, ""Timestamp"", CommentTs, ""AuthorIsInAzureOrg"", AuthorIsInAzureOrg)
|
||||
| summarize Comments = make_list(reply) by Title, Description, IssueNumber, OwnerId, ClosedAt, OwnerLogin";
|
||||
}
|
||||
|
||||
/// <summary>
/// Builds the query text used for the <c>ReferenceIssues</c> mode: issues in the given repository of the
/// "Azure" organization that were created within the last 365 days, have a non-empty <c>ClosedAt</c>,
/// and carry the <c>reference-issue</c> label. Unlike the closed/addressed query, no comments are joined in.
/// </summary>
/// <param name="repo">Repository name compared against <c>RepositoryName</c>; it is interpolated into the query text as-is.</param>
/// <returns>Query string projecting Title, Description, IssueNumber, OwnerId, ClosedAt and OwnerLogin.</returns>
private static string GetReferenceIssueQuery(string repo) {
|
||||
return $@"
|
||||
let repo = ""{repo}"";
|
||||
Issue
|
||||
| where CreatedAt >= ago(365d) and isnotempty(ClosedAt)
|
||||
| where OrganizationLogin == ""Azure"" and RepositoryName == repo
|
||||
| project Title, Description = Body, OwnerId, IssueNumber = Number, ClosedAt, Labels, OwnerLogin
|
||||
| mv-expand labels = parse_json(Labels)
|
||||
| summarize labelset = make_set(tostring(labels[""name""])) by IssueNumber, ClosedAt, Description, Title, OwnerId, OwnerLogin
|
||||
| where set_has_element(labelset, ""reference-issue"")
|
||||
| project Title, Description, IssueNumber, OwnerId, ClosedAt, OwnerLogin";
|
||||
}
|
||||
}
|
||||
|
|
|
@ -52,26 +52,26 @@ public class DocumentIndexer
|
|||
public async Task CreateIndex()
|
||||
{
|
||||
string vectorSearchConfigName = "my-vector-config";
|
||||
string vectorSearchHnswConfig = "my-hsnw-vector-config";
|
||||
var index = new SearchIndex(_indexName)
|
||||
{
|
||||
Fields =
|
||||
{
|
||||
new SimpleField("Id", SearchFieldDataType.String) { IsKey = true, IsFilterable = true },
|
||||
new SearchableField("Content") { IsFilterable = true },
|
||||
new SearchField("ContentVector", SearchFieldDataType.Collection(SearchFieldDataType.Single))
|
||||
{
|
||||
IsSearchable = true,
|
||||
VectorSearchDimensions = 1536,
|
||||
VectorSearchConfiguration = vectorSearchConfigName
|
||||
},
|
||||
new VectorSearchField("ContentVector", 1536, vectorSearchConfigName),
|
||||
new SearchableField("Source") { IsFilterable = true, IsSortable = true, IsFacetable = true},
|
||||
new SearchableField("Title") { IsFilterable = true, IsSortable = true, IsFacetable = true},
|
||||
},
|
||||
VectorSearch = new()
|
||||
{
|
||||
AlgorithmConfigurations =
|
||||
Profiles =
|
||||
{
|
||||
new HnswVectorSearchAlgorithmConfiguration(vectorSearchConfigName)
|
||||
new VectorSearchProfile(vectorSearchConfigName, vectorSearchHnswConfig)
|
||||
},
|
||||
Algorithms =
|
||||
{
|
||||
new HnswAlgorithmConfiguration(vectorSearchHnswConfig)
|
||||
}
|
||||
}
|
||||
};
|
||||
|
@ -116,7 +116,9 @@ public class DocumentIndexer
|
|||
|
||||
private async Task<IReadOnlyList<float>> Vectorize(string text)
|
||||
{
|
||||
var embeddings = await _openAi.GetEmbeddingsAsync(_embeddingsModel, new EmbeddingsOptions(text));
|
||||
text = text.Replace('\n', ' ').Replace('\r', ' ');
|
||||
|
||||
var embeddings = await _openAi.GetEmbeddingsAsync(_embeddingsModel, new EmbeddingsOptions(new[] { text }));
|
||||
|
||||
return embeddings.Value.Data[0].Embedding;
|
||||
}
|
||||
|
@ -192,7 +194,6 @@ public class DocumentIndexer
|
|||
|
||||
public class Document
|
||||
{
|
||||
internal const string Separator = "\n~~~END~~~\n";
|
||||
internal static readonly string RepoUrl = $"https://github.com/Azure/";
|
||||
internal static readonly char[] EndOfLine = new[] { '\n', '\r' };
|
||||
|
||||
|
@ -238,7 +239,7 @@ public class DocumentIndexer
|
|||
var content = new StringBuilder()
|
||||
.AppendLine(issue.Title)
|
||||
.AppendFormat("Customer: {0} ", description.Trim())
|
||||
.Append(Separator);
|
||||
.Append("\n");
|
||||
|
||||
foreach (IssueComment comment in issue.Comments!)
|
||||
{
|
||||
|
@ -251,7 +252,7 @@ public class DocumentIndexer
|
|||
content.Append("Customer: ");
|
||||
}
|
||||
|
||||
content.Append(comment.Comment?.Trim()).Append(Separator);
|
||||
content.Append(comment.Comment?.Trim()).Append("\n");
|
||||
}
|
||||
|
||||
return content.ToString();
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
using System.ComponentModel.DataAnnotations;
|
||||
using Azure.Monitor.OpenTelemetry.Exporter;
|
||||
using Azure.Sdk.Tools.AI.Helper.KnowledgeBase;
|
||||
using CommandLine;
|
||||
|
@ -22,14 +23,14 @@ internal class Program
|
|||
[Option('c', "command", HelpText = "Select command, one of: `index`, `query`", Required = true)]
|
||||
public string? Command { get; set; }
|
||||
|
||||
[Option('m', "mode", HelpText = "Select indexing data subset: one of `issues`, `docs`, `reference-issues`")]
|
||||
public string? Mode { get; set; }
|
||||
[Option('m', "mode", HelpText = "Select indexing data subset: one of `Docs`, `ReferenceIssues`, `AddressedIssues`, `ClosedIssues`", Default = IndexMode.ClosedIssues)]
|
||||
public IndexMode Mode { get; set; }
|
||||
|
||||
[Option('p', "path", HelpText = "When indexing markdown documents, path to the root - all md files except changelogs, swaggers, contributing will be indexed recursively")]
|
||||
public string? Path { get; set; }
|
||||
|
||||
[Option('k', "keep-index", HelpText = "Keep existing index. Defaults to false", Default = false)]
|
||||
public bool KeepIndex { get; set; }
|
||||
[Option('k', "drop-index", HelpText = "Drop existing index. Defaults to false", Default = false)]
|
||||
public bool DropIndex { get; set; }
|
||||
|
||||
[Option('q', "question", HelpText = "Issue description to get bot suggestion for")]
|
||||
public string? Question { get; set; }
|
||||
|
@ -64,13 +65,13 @@ internal class Program
|
|||
}
|
||||
else if (options.Command == "index")
|
||||
{
|
||||
if (options.Mode == "issues" || options.Mode == "reference-issues")
|
||||
{
|
||||
await IndexIssues(cloudMineConfig, searchConfig, openAiConfig, loggerFactory, options);
|
||||
}
|
||||
else if (options.Mode == "docs")
|
||||
if (options.Mode == IndexMode.Docs)
|
||||
{
|
||||
await IndexDocs(searchConfig, openAiConfig, loggerFactory, options);
|
||||
}
|
||||
else
|
||||
{
|
||||
await IndexIssues(cloudMineConfig, searchConfig, openAiConfig, loggerFactory, options);
|
||||
}
|
||||
}
|
||||
else
|
||||
|
@ -82,7 +83,7 @@ internal class Program
|
|||
private static async Task IndexIssues(CloudMineConfig cloudMineConfig, SearchConfig searchConfig, OpenAiConfig openAiConfig, ILoggerFactory loggerFactory, ConsoleOptions options)
|
||||
{
|
||||
var cloudMineIndexer = new CloudMineIndexer(cloudMineConfig, searchConfig, openAiConfig, GetRepoName(options), loggerFactory);
|
||||
if (!options.KeepIndex)
|
||||
if (options.DropIndex)
|
||||
{
|
||||
await cloudMineIndexer.DeleteIndex();
|
||||
}
|
||||
|
@ -92,7 +93,7 @@ internal class Program
|
|||
private static async Task IndexDocs(SearchConfig searchConfig, OpenAiConfig openAiConfig, ILoggerFactory loggerFactory, ConsoleOptions options)
|
||||
{
|
||||
var mdIndexer = new MarkdownFileIndexer(searchConfig, openAiConfig, GetRepoName(options), loggerFactory);
|
||||
if (!options.KeepIndex)
|
||||
if (options.DropIndex)
|
||||
{
|
||||
await mdIndexer.DeleteIndex();
|
||||
}
|
||||
|
|
|
@ -28,24 +28,24 @@ There's a default issue text inside `Program.cs`, so `--question` is optional.
|
|||
2. To index existing issues (created by customers, closed, and marked with `issue-addressed`) in the dotnet repo, run:
|
||||
|
||||
```bash
|
||||
dotnet run --command index --language dotnet --mode issues
|
||||
dotnet run --command index --language dotnet --mode AddressedIssues
|
||||
```
|
||||
|
||||
Only `dotnet` and `java` are supported.
|
||||
|
||||
3. As a variation, you can index all issues marked with `reference-issue` with
|
||||
3. Or you can index all issues marked with `reference-issue` with
|
||||
|
||||
```bash
|
||||
dotnet run --command index --language java --mode reference-issues
|
||||
dotnet run --command index --language java --mode ReferenceIssues
|
||||
```
|
||||
|
||||
4. To index all markdown docs under the given path (except CHANGELOG, CONTRIBUTING, and swagger files), run:
|
||||
|
||||
```bash
|
||||
dotnet run --command index --language java --mode docs --path c:\repo\azure-sdk-for-java\sdk
|
||||
dotnet run --command index --language java --mode Docs --path c:\repo\azure-sdk-for-java\sdk
|
||||
```
|
||||
|
||||
> Note: when indexing data, existing index with provided name is deleted first. If you want to preserve the index, pass `--keep-index`
|
||||
> Note: newly indexed data is added to the existing index. To drop the existing index before indexing, pass `--drop-index`.
|
||||
|
||||
You can always get CLI help with `dotnet run -- -h`.
|
||||
|
||||
|
|
|
@ -6,15 +6,15 @@
|
|||
}
|
||||
},
|
||||
"OpenAI": {
|
||||
"Endpoint": "https://support-bot-playground.openai.azure.com/",
|
||||
"EmbeddingModel": "knowledge-base-embeddings",
|
||||
"InferenceModel": "knowledge-base-gpt4"
|
||||
"Endpoint": "https://openai-shared.openai.azure.com",
|
||||
"EmbeddingModel": "text-embedding-ada-002",
|
||||
"InferenceModel": "gpt-4"
|
||||
},
|
||||
"CloudMine": {
|
||||
"Endpoint": "https://1es.kusto.windows.net"
|
||||
},
|
||||
"Search": {
|
||||
"Endpoint": "https://lim-knowledge-base-search.search.windows.net",
|
||||
"Endpoint": "https://openai-shared.search.windows.net",
|
||||
"IndexName": "java-vector"
|
||||
}
|
||||
}
|
||||
|
|
|
@ -8,7 +8,7 @@ param location string = resourceGroup().location
|
|||
param embeddingsCapacity int = 20
|
||||
param inferenceCapacity int = 10
|
||||
|
||||
var embeddingModelName = 'text-embedding-3-small'
|
||||
var embeddingModelName = 'text-embedding-ada-002'
|
||||
var inferenceModelName = 'gpt-4'
|
||||
|
||||
var openAiDeployments = [
|
||||
|
|
Загрузка…
Ссылка в новой задаче