fix keeping index and some md indexing issues
This commit is contained in:
Родитель
c2413f175b
Коммит
0007da19f0
|
@ -7,9 +7,9 @@ public abstract class BaseIndexer
|
|||
{
|
||||
protected readonly DocumentIndexer _documentIndexer;
|
||||
protected readonly ActivitySource _activitySource;
|
||||
public BaseIndexer(SearchConfig searchConfig, OpenAiConfig openAiConfig, ILoggerFactory loggerFactory)
|
||||
public BaseIndexer(SearchConfig searchConfig, OpenAiConfig openAiConfig, string repoName, ILoggerFactory loggerFactory)
|
||||
{
|
||||
_documentIndexer = new DocumentIndexer(searchConfig, openAiConfig, loggerFactory?.CreateLogger<DocumentIndexer>());
|
||||
_documentIndexer = new DocumentIndexer(searchConfig, openAiConfig, repoName, loggerFactory?.CreateLogger<DocumentIndexer>());
|
||||
_activitySource = new ActivitySource(GetType().FullName ?? throw new ArgumentNullException("GetType().FullName is null"));
|
||||
}
|
||||
|
||||
|
|
|
@ -8,7 +8,8 @@ namespace Azure.Sdk.Tools.AI.Helper.KnowledgeBase;
|
|||
public class CloudMineIndexer : BaseIndexer
|
||||
{
|
||||
private readonly ICslQueryProvider _kustoClient;
|
||||
public CloudMineIndexer(CloudMineConfig cloudMineConfig, SearchConfig searchConfig, OpenAiConfig openAiConfig, ILoggerFactory loggerFactory) : base(searchConfig, openAiConfig, loggerFactory)
|
||||
public CloudMineIndexer(CloudMineConfig cloudMineConfig, SearchConfig searchConfig, OpenAiConfig openAiConfig, string repoName, ILoggerFactory loggerFactory) :
|
||||
base(searchConfig, openAiConfig, repoName, loggerFactory)
|
||||
{
|
||||
_kustoClient = KustoClientFactory.CreateCslQueryProvider(new KustoConnectionStringBuilder(cloudMineConfig.Endpoint)
|
||||
.WithAadAzCliAuthentication(true));
|
||||
|
|
|
@ -15,12 +15,13 @@ public class DocumentIndexer
|
|||
private readonly ILogger<DocumentIndexer>? _logger;
|
||||
private readonly string _indexName;
|
||||
private readonly string _embeddingsModel;
|
||||
private readonly string _repoName;
|
||||
|
||||
public DocumentIndexer(SearchConfig searchConfig, OpenAiConfig openAiConfig, ILogger<DocumentIndexer>? logger)
|
||||
public DocumentIndexer(SearchConfig searchConfig, OpenAiConfig openAiConfig, string repoName, ILogger<DocumentIndexer>? logger)
|
||||
{
|
||||
_indexName = searchConfig.IndexName
|
||||
?? throw new ArgumentNullException("IndexName");
|
||||
|
||||
_repoName = repoName ?? throw new ArgumentNullException(nameof(repoName));
|
||||
_logger = logger;
|
||||
|
||||
var endpointUrl = new Uri(searchConfig.Endpoint ?? throw new ArgumentNullException("Endpoint"));
|
||||
|
@ -88,6 +89,7 @@ public class DocumentIndexer
|
|||
var documents = issues
|
||||
.Select(async i => await DocumentsFromIssue(i))
|
||||
.Select(t => t.Result)
|
||||
.SelectMany(d => d)
|
||||
.Where(d => d != null)
|
||||
.Cast<Document>();
|
||||
|
||||
|
@ -97,9 +99,14 @@ public class DocumentIndexer
|
|||
|
||||
public async Task Index(IEnumerable<string> mdFiles, string azSdkRepoPath)
|
||||
{
|
||||
// TODO: fix path, currently proper links are generated only if
|
||||
// we pass path/to/azure-sdk-for-*/
|
||||
// but we don't care about ./eng, so we want to index ./sdk only
|
||||
// but generated links are not right - https://github.com/Azure/azure-sdk-for-net/tree/main/monitor\Azure.Monitor.OpenTelemetry.Exporter\README.md
|
||||
var documents = mdFiles
|
||||
.Select(async f => await FromMdFile(f, Path.GetRelativePath(azSdkRepoPath, f)))
|
||||
.Select(t => t.Result)
|
||||
.SelectMany(d => d)
|
||||
.Where(d => d != null)
|
||||
.Cast<Document>();
|
||||
|
||||
|
@ -114,59 +121,79 @@ public class DocumentIndexer
|
|||
return embeddings.Value.Data[0].Embedding;
|
||||
}
|
||||
|
||||
private async Task<Document?> DocumentsFromIssue(Issue issue)
|
||||
private async Task<IEnumerable<Document?>> DocumentsFromIssue(Issue issue)
|
||||
{
|
||||
var content = Document.GetFullContent(issue);
|
||||
|
||||
if (content.Length > 8000)
|
||||
var docs = new List<Document>();
|
||||
var chunks = Chunk(content);
|
||||
|
||||
for (var i = 0; i < chunks.Count(); i++)
|
||||
{
|
||||
_logger?.LogWarning("Issue {issue} content length is too long - {length}", issue.IssueNumber, content.Length);
|
||||
Console.WriteLine($"Issue {issue.IssueNumber} content length is too long - {content.Length}");
|
||||
return null;
|
||||
docs.Add(new Document()
|
||||
{
|
||||
Id = string.Concat(_repoName, "_", issue.IssueNumber + "_" + i),
|
||||
Source = string.Concat(Document.RepoUrl, _repoName, "/issues/", issue.IssueNumber),
|
||||
Title = issue.Title,
|
||||
Content = chunks[i],
|
||||
ContentVector = await Vectorize(chunks[i])
|
||||
});
|
||||
}
|
||||
return new Document()
|
||||
{
|
||||
Id = string.Concat(Document.RepoName, "_", issue.IssueNumber),
|
||||
Source = string.Concat(Document.RepoUrl, "issues/", issue.IssueNumber),
|
||||
Title = issue.Title,
|
||||
Content = content,
|
||||
ContentVector = await Vectorize(content)
|
||||
};
|
||||
|
||||
return docs;
|
||||
}
|
||||
|
||||
public async Task<Document?> FromMdFile(string absolutePath, string relativePath)
|
||||
public async Task<IEnumerable<Document?>> FromMdFile(string absolutePath, string relativePath)
|
||||
{
|
||||
var text = File.ReadAllText(absolutePath);
|
||||
var firstLineEnd = text.Trim().IndexOfAny(Document.EndOfLine);
|
||||
|
||||
if (firstLineEnd <= 0)
|
||||
{
|
||||
return null;
|
||||
return Enumerable.Empty<Document?>();
|
||||
}
|
||||
|
||||
var firstLine = text[..firstLineEnd];
|
||||
|
||||
if (text.Length > 8000)
|
||||
var docs = new List<Document>();
|
||||
var chunks = Chunk(text);
|
||||
for (var i = 0; i < chunks.Count(); i ++)
|
||||
{
|
||||
_logger?.LogWarning("Document {path} content length is too long - {length}", absolutePath, text.Length);
|
||||
return null;
|
||||
docs.Add(new Document()
|
||||
{
|
||||
Id = string.Concat(_repoName, "_", Document.GetId(relativePath) + "_" + i),
|
||||
Source = string.Concat(Document.RepoUrl, _repoName, "/tree/main/", relativePath),
|
||||
Title = firstLine,
|
||||
Content = chunks[i],
|
||||
ContentVector = await Vectorize(chunks[i])
|
||||
});
|
||||
}
|
||||
|
||||
return new Document()
|
||||
return docs;
|
||||
}
|
||||
|
||||
|
||||
private List<string> Chunk(string text)
|
||||
{
|
||||
var chunks = new List<string>();
|
||||
for (int i = 0; i < text.Length; i += 8000)
|
||||
{
|
||||
Id = string.Concat(Document.RepoName, "_", Document.GetId(relativePath)),
|
||||
Source = string.Concat(Document.RepoUrl, "tree/main/", relativePath),
|
||||
Title = firstLine,
|
||||
Content = text,
|
||||
ContentVector = await Vectorize(text)
|
||||
};
|
||||
if (i + 8000 >= text.Length)
|
||||
{
|
||||
chunks.Add(text.Substring(i));
|
||||
}
|
||||
else
|
||||
{
|
||||
chunks.Add(text.Substring(i, 8000));
|
||||
}
|
||||
}
|
||||
return chunks;
|
||||
}
|
||||
|
||||
public class Document
|
||||
{
|
||||
internal const string Separator = "\n~~~END~~~\n";
|
||||
internal const string RepoName = "azure-sdk-for-net"; //TODO
|
||||
internal static readonly string RepoUrl = $"https://github.com/Azure/{RepoName}/";
|
||||
internal static readonly string RepoUrl = $"https://github.com/Azure/";
|
||||
internal static readonly char[] EndOfLine = new[] { '\n', '\r' };
|
||||
|
||||
internal static string GetId(string url)
|
||||
|
|
|
@ -4,7 +4,7 @@ namespace Azure.Sdk.Tools.AI.Helper.KnowledgeBase;
|
|||
|
||||
public class MarkdownFileIndexer : BaseIndexer
|
||||
{
|
||||
public MarkdownFileIndexer(SearchConfig searchConfig, OpenAiConfig openAiConfig, ILoggerFactory loggerFactory) : base(searchConfig, openAiConfig, loggerFactory)
|
||||
public MarkdownFileIndexer(SearchConfig searchConfig, OpenAiConfig openAiConfig, string repoName, ILoggerFactory loggerFactory) : base(searchConfig, openAiConfig, repoName, loggerFactory)
|
||||
{
|
||||
}
|
||||
|
||||
|
|
|
@ -4,7 +4,7 @@ namespace Azure.Sdk.Tools.AI.Helper.KnowledgeBase;
|
|||
|
||||
public class ReferenceIssueIndexer : BaseIndexer
|
||||
{
|
||||
public ReferenceIssueIndexer(SearchConfig searchConfig, OpenAiConfig openAiConfig, ILoggerFactory loggerFactory) : base(searchConfig, openAiConfig, loggerFactory)
|
||||
public ReferenceIssueIndexer(SearchConfig searchConfig, OpenAiConfig openAiConfig, string repoName, ILoggerFactory loggerFactory) : base(searchConfig, openAiConfig, repoName, loggerFactory)
|
||||
{
|
||||
}
|
||||
|
||||
|
|
|
@ -28,8 +28,8 @@ internal class Program
|
|||
[Option('p', "path", HelpText = "When indexing markdown documents, path to the root - all md files except changelogs, swaggers, contributing will be indexed recursively")]
|
||||
public string? Path { get; set; }
|
||||
|
||||
[Option('d', "delete_index", HelpText = "Delete existing index before indexing. Defaults to true", Default = true)]
|
||||
public bool Delete { get; set; }
|
||||
[Option('k', "keep-index", HelpText = "Keep existing index. Defaults to false", Default = false)]
|
||||
public bool KeepIndex { get; set; }
|
||||
|
||||
[Option('q', "question", HelpText = "Issue description to get bot suggestion for")]
|
||||
public string? Question { get; set; }
|
||||
|
@ -79,11 +79,10 @@ internal class Program
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
private static async Task IndexIssues(CloudMineConfig cloudMineConfig, SearchConfig searchConfig, OpenAiConfig openAiConfig, ILoggerFactory loggerFactory, ConsoleOptions options)
|
||||
{
|
||||
var cloudMineIndexer = new CloudMineIndexer(cloudMineConfig, searchConfig, openAiConfig, loggerFactory);
|
||||
if (options.Delete)
|
||||
var cloudMineIndexer = new CloudMineIndexer(cloudMineConfig, searchConfig, openAiConfig, GetRepoName(options), loggerFactory);
|
||||
if (!options.KeepIndex)
|
||||
{
|
||||
await cloudMineIndexer.DeleteIndex();
|
||||
}
|
||||
|
@ -92,8 +91,8 @@ internal class Program
|
|||
|
||||
private static async Task IndexDocs(SearchConfig searchConfig, OpenAiConfig openAiConfig, ILoggerFactory loggerFactory, ConsoleOptions options)
|
||||
{
|
||||
var mdIndexer = new MarkdownFileIndexer(searchConfig, openAiConfig, loggerFactory);
|
||||
if (options.Delete)
|
||||
var mdIndexer = new MarkdownFileIndexer(searchConfig, openAiConfig, GetRepoName(options), loggerFactory);
|
||||
if (!options.KeepIndex)
|
||||
{
|
||||
await mdIndexer.DeleteIndex();
|
||||
}
|
||||
|
@ -111,7 +110,7 @@ internal class Program
|
|||
//{
|
||||
Console.WriteLine(suggestion.Solution);
|
||||
Console.WriteLine();
|
||||
Console.WriteLine($"Related issues and documents:\r\n - {string.Join("\r\n - ", suggestion.References)}");
|
||||
Console.WriteLine($"Related issues and documents:\r\n - {string.Join("\r\n - ", suggestion.References)}");
|
||||
Console.WriteLine($"<!-- Confidence Level: {suggestion.ConfidenceLevel} -->\r\n");
|
||||
//}
|
||||
}
|
||||
|
@ -119,9 +118,16 @@ internal class Program
|
|||
{
|
||||
loggerFactory.CreateLogger<Program>().LogError(ex, "Failed to get suggestion");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/// <summary>
/// Maps <c>options.Language</c> to a repository name:
/// "dotnet" yields "azure-sdk-for-net" and "java" yields "azure-sdk-for-java".
/// The comparison uses <c>==</c>, so the language value must match exactly.
/// </summary>
/// <param name="options">Console options whose <c>Language</c> value selects the repository.</param>
/// <returns>The repository name for the selected language.</returns>
/// <exception cref="NotSupportedException">Thrown when <c>options.Language</c> is neither "dotnet" nor "java".</exception>
private static string GetRepoName(ConsoleOptions options)
|
||||
{
|
||||
if (options.Language == "dotnet") return "azure-sdk-for-net";
|
||||
if (options.Language == "java") return "azure-sdk-for-java";
|
||||
throw new NotSupportedException("Language is not supported");
|
||||
}
|
||||
|
||||
|
||||
private const string SampleQuery = @"
|
||||
[BUG] Azure.Security.KeyVault.Secrets GetSecret tries to spawn a child process even though it's a synchronous function.
|
||||
|
||||
|
@ -204,8 +210,8 @@ Run the application.
|
|||
string connectionString = configuration.GetSection("AzureMonitor").GetValue<string>("ConnectionString");
|
||||
var tpb = OpenTelemetry.Sdk.CreateTracerProviderBuilder()
|
||||
.SetResourceBuilder(CreateResource())
|
||||
.AddSource("Azure.*")
|
||||
.AddHttpClientInstrumentation();
|
||||
.AddSource("Azure.*");
|
||||
//.AddHttpClientInstrumentation();
|
||||
|
||||
if (connectionString == null)
|
||||
{
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
}
|
||||
},
|
||||
"OpenAI": {
|
||||
"Endpoint": "https://....openai.azure.com/",
|
||||
"Endpoint": "https://support-bot-playground.openai.azure.com/",
|
||||
"EmbeddingModel": "knowledge-base-embeddings",
|
||||
"InferenceModel": "knowledge-base-gpt4"
|
||||
},
|
||||
|
@ -14,7 +14,7 @@
|
|||
"Endpoint": "https://1es.kusto.windows.net"
|
||||
},
|
||||
"Search": {
|
||||
"Endpoint": "https://....search.windows.net",
|
||||
"Endpoint": "https://lim-knowledge-base-search.search.windows.net",
|
||||
"IndexName": "dotnet-issues-vector"
|
||||
}
|
||||
}
|
||||
|
|
|
@ -268,7 +268,7 @@ namespace Azure.Sdk.Tools.GitHubEventProcessor.EventProcessing
|
|||
{
|
||||
if (issueEventPayload.Action == ActionConstants.Labeled && issueEventPayload.Issue.State == ItemState.Open && issueEventPayload.Label.Name == TriageLabelConstants.ReferenceIssue)
|
||||
{
|
||||
var indexer = new AI.Helper.KnowledgeBase.ReferenceIssueIndexer(new SearchConfig(), new OpenAiConfig(), null);
|
||||
var indexer = new AI.Helper.KnowledgeBase.ReferenceIssueIndexer(new SearchConfig(), new OpenAiConfig(), issueEventPayload.Repository.Name, null);
|
||||
var issue = new AI.Helper.KnowledgeBase.Issue()
|
||||
{
|
||||
Title = issueEventPayload.Issue.Title,
|
||||
|
|
Загрузка…
Ссылка в новой задаче