fix keeping index and some md indexing issues

This commit is contained in:
Liudmila Molkova 2024-03-19 11:41:07 -07:00
Родитель c2413f175b
Коммит 0007da19f0
8 изменённых файлов: 82 добавлений и 48 удалений

Просмотреть файл

@ -7,9 +7,9 @@ public abstract class BaseIndexer
{
protected readonly DocumentIndexer _documentIndexer;
protected readonly ActivitySource _activitySource;
public BaseIndexer(SearchConfig searchConfig, OpenAiConfig openAiConfig, ILoggerFactory loggerFactory)
public BaseIndexer(SearchConfig searchConfig, OpenAiConfig openAiConfig, string repoName, ILoggerFactory loggerFactory)
{
_documentIndexer = new DocumentIndexer(searchConfig, openAiConfig, loggerFactory?.CreateLogger<DocumentIndexer>());
_documentIndexer = new DocumentIndexer(searchConfig, openAiConfig, repoName, loggerFactory?.CreateLogger<DocumentIndexer>());
_activitySource = new ActivitySource(GetType().FullName ?? throw new ArgumentNullException("GetType().FullName is null"));
}

Просмотреть файл

@ -8,7 +8,8 @@ namespace Azure.Sdk.Tools.AI.Helper.KnowledgeBase;
public class CloudMineIndexer : BaseIndexer
{
private readonly ICslQueryProvider _kustoClient;
public CloudMineIndexer(CloudMineConfig cloudMineConfig, SearchConfig searchConfig, OpenAiConfig openAiConfig, ILoggerFactory loggerFactory) : base(searchConfig, openAiConfig, loggerFactory)
public CloudMineIndexer(CloudMineConfig cloudMineConfig, SearchConfig searchConfig, OpenAiConfig openAiConfig, string repoName, ILoggerFactory loggerFactory) :
base(searchConfig, openAiConfig, repoName, loggerFactory)
{
_kustoClient = KustoClientFactory.CreateCslQueryProvider(new KustoConnectionStringBuilder(cloudMineConfig.Endpoint)
.WithAadAzCliAuthentication(true));

Просмотреть файл

@ -15,12 +15,13 @@ public class DocumentIndexer
private readonly ILogger<DocumentIndexer>? _logger;
private readonly string _indexName;
private readonly string _embeddingsModel;
private readonly string _repoName;
public DocumentIndexer(SearchConfig searchConfig, OpenAiConfig openAiConfig, ILogger<DocumentIndexer>? logger)
public DocumentIndexer(SearchConfig searchConfig, OpenAiConfig openAiConfig, string repoName, ILogger<DocumentIndexer>? logger)
{
_indexName = searchConfig.IndexName
?? throw new ArgumentNullException("IndexName");
_repoName = repoName ?? throw new ArgumentNullException(nameof(repoName));
_logger = logger;
var endpointUrl = new Uri(searchConfig.Endpoint ?? throw new ArgumentNullException("Endpoint"));
@ -88,6 +89,7 @@ public class DocumentIndexer
var documents = issues
.Select(async i => await DocumentsFromIssue(i))
.Select(t => t.Result)
.SelectMany(d => d)
.Where(d => d != null)
.Cast<Document>();
@ -97,9 +99,14 @@ public class DocumentIndexer
public async Task Index(IEnumerable<string> mdFiles, string azSdkRepoPath)
{
// TODO: fix path handling - currently proper links are generated only if
// we pass path/to/azure-sdk-for-*/ as the root,
// but we don't care about ./eng, so we want to index ./sdk only,
// and then the generated links are not right, e.g. https://github.com/Azure/azure-sdk-for-net/tree/main/monitor\Azure.Monitor.OpenTelemetry.Exporter\README.md
var documents = mdFiles
.Select(async f => await FromMdFile(f, Path.GetRelativePath(azSdkRepoPath, f)))
.Select(t => t.Result)
.SelectMany(d => d)
.Where(d => d != null)
.Cast<Document>();
@ -114,59 +121,79 @@ public class DocumentIndexer
return embeddings.Value.Data[0].Embedding;
}
private async Task<Document?> DocumentsFromIssue(Issue issue)
private async Task<IEnumerable<Document?>> DocumentsFromIssue(Issue issue)
{
var content = Document.GetFullContent(issue);
if (content.Length > 8000)
var docs = new List<Document>();
var chunks = Chunk(content);
for (var i = 0; i < chunks.Count(); i++)
{
_logger?.LogWarning("Issue {issue} content length is too long - {length}", issue.IssueNumber, content.Length);
Console.WriteLine($"Issue {issue.IssueNumber} content length is too long - {content.Length}");
return null;
docs.Add(new Document()
{
Id = string.Concat(_repoName, "_", issue.IssueNumber + "_" + i),
Source = string.Concat(Document.RepoUrl, _repoName, "/issues/", issue.IssueNumber),
Title = issue.Title,
Content = chunks[i],
ContentVector = await Vectorize(chunks[i])
});
}
return new Document()
{
Id = string.Concat(Document.RepoName, "_", issue.IssueNumber),
Source = string.Concat(Document.RepoUrl, "issues/", issue.IssueNumber),
Title = issue.Title,
Content = content,
ContentVector = await Vectorize(content)
};
return docs;
}
public async Task<Document?> FromMdFile(string absolutePath, string relativePath)
public async Task<IEnumerable<Document?>> FromMdFile(string absolutePath, string relativePath)
{
var text = File.ReadAllText(absolutePath);
var firstLineEnd = text.Trim().IndexOfAny(Document.EndOfLine);
if (firstLineEnd <= 0)
{
return null;
return Enumerable.Empty<Document?>();
}
var firstLine = text[..firstLineEnd];
if (text.Length > 8000)
var docs = new List<Document>();
var chunks = Chunk(text);
for (var i = 0; i < chunks.Count(); i ++)
{
_logger?.LogWarning("Document {path} content length is too long - {length}", absolutePath, text.Length);
return null;
docs.Add(new Document()
{
Id = string.Concat(_repoName, "_", Document.GetId(relativePath) + "_" + i),
Source = string.Concat(Document.RepoUrl, _repoName, "/tree/main/", relativePath),
Title = firstLine,
Content = chunks[i],
ContentVector = await Vectorize(chunks[i])
});
}
return new Document()
return docs;
}
private List<string> Chunk(string text)
{
var chunks = new List<string>();
for (int i = 0; i < text.Length; i += 8000)
{
Id = string.Concat(Document.RepoName, "_", Document.GetId(relativePath)),
Source = string.Concat(Document.RepoUrl, "tree/main/", relativePath),
Title = firstLine,
Content = text,
ContentVector = await Vectorize(text)
};
if (i + 8000 >= text.Length)
{
chunks.Add(text.Substring(i));
}
else
{
chunks.Add(text.Substring(i, 8000));
}
}
return chunks;
}
public class Document
{
internal const string Separator = "\n~~~END~~~\n";
internal const string RepoName = "azure-sdk-for-net"; //TODO
internal static readonly string RepoUrl = $"https://github.com/Azure/{RepoName}/";
internal static readonly string RepoUrl = $"https://github.com/Azure/";
internal static readonly char[] EndOfLine = new[] { '\n', '\r' };
internal static string GetId(string url)

Просмотреть файл

@ -4,7 +4,7 @@ namespace Azure.Sdk.Tools.AI.Helper.KnowledgeBase;
public class MarkdownFileIndexer : BaseIndexer
{
public MarkdownFileIndexer(SearchConfig searchConfig, OpenAiConfig openAiConfig, ILoggerFactory loggerFactory) : base(searchConfig, openAiConfig, loggerFactory)
public MarkdownFileIndexer(SearchConfig searchConfig, OpenAiConfig openAiConfig, string repoName, ILoggerFactory loggerFactory) : base(searchConfig, openAiConfig, repoName, loggerFactory)
{
}

Просмотреть файл

@ -4,7 +4,7 @@ namespace Azure.Sdk.Tools.AI.Helper.KnowledgeBase;
public class ReferenceIssueIndexer : BaseIndexer
{
public ReferenceIssueIndexer(SearchConfig searchConfig, OpenAiConfig openAiConfig, ILoggerFactory loggerFactory) : base(searchConfig, openAiConfig, loggerFactory)
public ReferenceIssueIndexer(SearchConfig searchConfig, OpenAiConfig openAiConfig, string repoName, ILoggerFactory loggerFactory) : base(searchConfig, openAiConfig, repoName, loggerFactory)
{
}

Просмотреть файл

@ -28,8 +28,8 @@ internal class Program
[Option('p', "path", HelpText = "When indexing markdown documents, path to the root - all md files except changelogs, swaggers, contributing will be indexed recursively")]
public string? Path { get; set; }
[Option('d', "delete_index", HelpText = "Delete existing index before indexing. Defaults to true", Default = true)]
public bool Delete { get; set; }
[Option('k', "keep-index", HelpText = "Keep existing index. Defaults to false", Default = false)]
public bool KeepIndex { get; set; }
[Option('q', "question", HelpText = "Issue description to get bot suggestion for")]
public string? Question { get; set; }
@ -79,11 +79,10 @@ internal class Program
}
}
private static async Task IndexIssues(CloudMineConfig cloudMineConfig, SearchConfig searchConfig, OpenAiConfig openAiConfig, ILoggerFactory loggerFactory, ConsoleOptions options)
{
var cloudMineIndexer = new CloudMineIndexer(cloudMineConfig, searchConfig, openAiConfig, loggerFactory);
if (options.Delete)
var cloudMineIndexer = new CloudMineIndexer(cloudMineConfig, searchConfig, openAiConfig, GetRepoName(options), loggerFactory);
if (!options.KeepIndex)
{
await cloudMineIndexer.DeleteIndex();
}
@ -92,8 +91,8 @@ internal class Program
private static async Task IndexDocs(SearchConfig searchConfig, OpenAiConfig openAiConfig, ILoggerFactory loggerFactory, ConsoleOptions options)
{
var mdIndexer = new MarkdownFileIndexer(searchConfig, openAiConfig, loggerFactory);
if (options.Delete)
var mdIndexer = new MarkdownFileIndexer(searchConfig, openAiConfig, GetRepoName(options), loggerFactory);
if (!options.KeepIndex)
{
await mdIndexer.DeleteIndex();
}
@ -111,7 +110,7 @@ internal class Program
//{
Console.WriteLine(suggestion.Solution);
Console.WriteLine();
Console.WriteLine($"Related issues and documents:\r\n - {string.Join("\r\n - ", suggestion.References)}");
Console.WriteLine($"Related issues and documents:\r\n - {string.Join("\r\n - ", suggestion.References)}");
Console.WriteLine($"<!-- Confidence Level: {suggestion.ConfidenceLevel} -->\r\n");
//}
}
@ -119,9 +118,16 @@ internal class Program
{
loggerFactory.CreateLogger<Program>().LogError(ex, "Failed to get suggestion");
}
}
/// <summary>
/// Maps the language chosen in the console options to the repository name
/// passed to the indexers.
/// </summary>
/// <param name="options">Console options; only <c>Language</c> is read.</param>
/// <returns>
/// <c>azure-sdk-for-net</c> for <c>dotnet</c>, <c>azure-sdk-for-java</c> for <c>java</c>.
/// </returns>
/// <exception cref="NotSupportedException">
/// Thrown when <c>Language</c> is any other value (including null).
/// </exception>
private static string GetRepoName(ConsoleOptions options) => options.Language switch
{
    "dotnet" => "azure-sdk-for-net",
    "java" => "azure-sdk-for-java",
    // Anything else has no known repository mapping.
    _ => throw new NotSupportedException("Language is not supported"),
};
private const string SampleQuery = @"
[BUG] Azure.Security.KeyVault.Secrets GetSecret tries to spawn a child process even though it's a synchronous function.
@ -204,8 +210,8 @@ Run the application.
string connectionString = configuration.GetSection("AzureMonitor").GetValue<string>("ConnectionString");
var tpb = OpenTelemetry.Sdk.CreateTracerProviderBuilder()
.SetResourceBuilder(CreateResource())
.AddSource("Azure.*")
.AddHttpClientInstrumentation();
.AddSource("Azure.*");
//.AddHttpClientInstrumentation();
if (connectionString == null)
{

Просмотреть файл

@ -6,7 +6,7 @@
}
},
"OpenAI": {
"Endpoint": "https://....openai.azure.com/",
"Endpoint": "https://support-bot-playground.openai.azure.com/",
"EmbeddingModel": "knowledge-base-embeddings",
"InferenceModel": "knowledge-base-gpt4"
},
@ -14,7 +14,7 @@
"Endpoint": "https://1es.kusto.windows.net"
},
"Search": {
"Endpoint": "https://....search.windows.net",
"Endpoint": "https://lim-knowledge-base-search.search.windows.net",
"IndexName": "dotnet-issues-vector"
}
}

Просмотреть файл

@ -268,7 +268,7 @@ namespace Azure.Sdk.Tools.GitHubEventProcessor.EventProcessing
{
if (issueEventPayload.Action == ActionConstants.Labeled && issueEventPayload.Issue.State == ItemState.Open && issueEventPayload.Label.Name == TriageLabelConstants.ReferenceIssue)
{
var indexer = new AI.Helper.KnowledgeBase.ReferenceIssueIndexer(new SearchConfig(), new OpenAiConfig(), null);
var indexer = new AI.Helper.KnowledgeBase.ReferenceIssueIndexer(new SearchConfig(), new OpenAiConfig(), issueEventPayload.Repository.Name, null);
var issue = new AI.Helper.KnowledgeBase.Issue()
{
Title = issueEventPayload.Issue.Title,