зеркало из https://github.com/Azure/logicapps.git
Logic Apps sample for RAG patterns - Ingestion and Retrieval with SQL as the data source (#1081)
* rndecodetemplate * namefix * revert * jsonfix * remove_res * blob * blobfix * api1 * icon * addhttpicon * RNencode * RNencode1 * RNencode2 * simplifying * urifix * decode_update * decode_update1 * decode_update2 * newupdates * RAG sample with ingestion and retrieval workflows * Updating to remove app settings and instead add parameters. Also update the readme * Adding app settings for all connection parameters. Updated Readme as well removing secrets updating app settings
This commit is contained in:
Родитель
c44ba7665b
Коммит
42ad494bac
|
@ -0,0 +1,123 @@
|
|||
{
|
||||
"name": "chat-index",
|
||||
"fields": [
|
||||
{
|
||||
"name": "id",
|
||||
"type": "Edm.String",
|
||||
"searchable": false,
|
||||
"filterable": false,
|
||||
"retrievable": true,
|
||||
"sortable": false,
|
||||
"facetable": false,
|
||||
"key": true,
|
||||
"indexAnalyzer": null,
|
||||
"searchAnalyzer": null,
|
||||
"analyzer": null,
|
||||
"normalizer": null,
|
||||
"dimensions": null,
|
||||
"vectorSearchProfile": null,
|
||||
"synonymMaps": []
|
||||
},
|
||||
{
|
||||
"name": "documentName",
|
||||
"type": "Edm.String",
|
||||
"searchable": false,
|
||||
"filterable": true,
|
||||
"retrievable": true,
|
||||
"sortable": false,
|
||||
"facetable": false,
|
||||
"key": false,
|
||||
"indexAnalyzer": null,
|
||||
"searchAnalyzer": null,
|
||||
"analyzer": null,
|
||||
"normalizer": null,
|
||||
"dimensions": null,
|
||||
"vectorSearchProfile": null,
|
||||
"synonymMaps": []
|
||||
},
|
||||
{
|
||||
"name": "documentUrl",
|
||||
"type": "Edm.String",
|
||||
"searchable": false,
|
||||
"filterable": true,
|
||||
"retrievable": true,
|
||||
"sortable": false,
|
||||
"facetable": false,
|
||||
"key": false,
|
||||
"indexAnalyzer": null,
|
||||
"searchAnalyzer": null,
|
||||
"analyzer": null,
|
||||
"normalizer": null,
|
||||
"dimensions": null,
|
||||
"vectorSearchProfile": null,
|
||||
"synonymMaps": []
|
||||
},
|
||||
{
|
||||
"name": "content",
|
||||
"type": "Edm.String",
|
||||
"searchable": true,
|
||||
"filterable": true,
|
||||
"retrievable": true,
|
||||
"sortable": false,
|
||||
"facetable": false,
|
||||
"key": false,
|
||||
"indexAnalyzer": null,
|
||||
"searchAnalyzer": null,
|
||||
"analyzer": "standard.lucene",
|
||||
"normalizer": null,
|
||||
"dimensions": null,
|
||||
"vectorSearchProfile": null,
|
||||
"synonymMaps": []
|
||||
},
|
||||
{
|
||||
"name": "embeddings",
|
||||
"type": "Collection(Edm.Single)",
|
||||
"searchable": true,
|
||||
"filterable": false,
|
||||
"retrievable": true,
|
||||
"sortable": false,
|
||||
"facetable": false,
|
||||
"key": false,
|
||||
"indexAnalyzer": null,
|
||||
"searchAnalyzer": null,
|
||||
"analyzer": null,
|
||||
"normalizer": null,
|
||||
"dimensions": 1536,
|
||||
"vectorSearchProfile": "vector-profile",
|
||||
"synonymMaps": []
|
||||
}
|
||||
],
|
||||
"scoringProfiles": [],
|
||||
"corsOptions": null,
|
||||
"suggesters": [],
|
||||
"analyzers": [],
|
||||
"normalizers": [],
|
||||
"tokenizers": [],
|
||||
"tokenFilters": [],
|
||||
"charFilters": [],
|
||||
"encryptionKey": null,
|
||||
"semantic": null,
|
||||
"vectorSearch": {
|
||||
"algorithms": [
|
||||
{
|
||||
"name": "vector-config",
|
||||
"kind": "hnsw",
|
||||
"hnswParameters": {
|
||||
"metric": "cosine",
|
||||
"m": 4,
|
||||
"efConstruction": 400,
|
||||
"efSearch": 500
|
||||
},
|
||||
"exhaustiveKnnParameters": null
|
||||
}
|
||||
],
|
||||
"profiles": [
|
||||
{
|
||||
"name": "vector-profile",
|
||||
"algorithm": "vector-config",
|
||||
"vectorizer": null
|
||||
}
|
||||
],
|
||||
"vectorizers": []
|
||||
}
|
||||
}
|
|
@ -0,0 +1,179 @@
|
|||
{
|
||||
"definition": {
|
||||
"$schema": "https://schema.management.azure.com/providers/Microsoft.Logic/schemas/2016-06-01/workflowdefinition.json#",
|
||||
"actions": {
|
||||
"Azure_Open_AI_-_Get_multiple_embeddings": {
|
||||
"type": "ServiceProvider",
|
||||
"inputs": {
|
||||
"parameters": {
|
||||
"deploymentId": "@parameters('openai_embedding_deployment_model')",
|
||||
"input": "@body('Select_JSON_array_items')"
|
||||
},
|
||||
"serviceProviderConfiguration": {
|
||||
"connectionName": "openai",
|
||||
"operationId": "getArrayEmbeddings",
|
||||
"serviceProviderId": "/serviceProviders/openai"
|
||||
}
|
||||
},
|
||||
"runAfter": {
|
||||
"Select_JSON_array_items": [
|
||||
"SUCCEEDED"
|
||||
]
|
||||
}
|
||||
},
|
||||
"Chunk_and_Tokenize": {
|
||||
"type": "Http",
|
||||
"inputs": {
|
||||
"uri": "@parameters('tokenize_function_url')",
|
||||
"method": "POST",
|
||||
"body": "@outputs('Parameters_for_Tokenize')"
|
||||
},
|
||||
"runAfter": {
|
||||
"Parameters_for_Tokenize": [
|
||||
"SUCCEEDED"
|
||||
]
|
||||
},
|
||||
"runtimeConfiguration": {
|
||||
"contentTransfer": {
|
||||
"transferMode": "Chunked"
|
||||
}
|
||||
}
|
||||
},
|
||||
"Convert_tokenized_output_to_JSON": {
|
||||
"type": "ParseJson",
|
||||
"inputs": {
|
||||
"content": "@body('Chunk_and_Tokenize')",
|
||||
"schema": {
|
||||
"items": {
|
||||
"properties": {
|
||||
"content": {
|
||||
"type": "string"
|
||||
},
|
||||
"metadata": {
|
||||
"properties": {
|
||||
"page": {
|
||||
"type": "integer"
|
||||
}
|
||||
},
|
||||
"type": "object"
|
||||
},
|
||||
"tokenLength": {
|
||||
"type": "integer"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"content",
|
||||
"tokenLength",
|
||||
"metadata"
|
||||
],
|
||||
"type": "object"
|
||||
},
|
||||
"type": "array"
|
||||
}
|
||||
},
|
||||
"runAfter": {
|
||||
"Chunk_and_Tokenize": [
|
||||
"SUCCEEDED"
|
||||
]
|
||||
}
|
||||
},
|
||||
"Get_rows": {
|
||||
"type": "ServiceProvider",
|
||||
"inputs": {
|
||||
"parameters": {
|
||||
"tableName": "@parameters('sql_table_name')"
|
||||
},
|
||||
"serviceProviderConfiguration": {
|
||||
"connectionName": "sql",
|
||||
"operationId": "getRows",
|
||||
"serviceProviderId": "/serviceProviders/sql"
|
||||
}
|
||||
},
|
||||
"runAfter": {}
|
||||
},
|
||||
"Index_multiple_documents": {
|
||||
"type": "ServiceProvider",
|
||||
"inputs": {
|
||||
"parameters": {
|
||||
"indexName": "@parameters('aisearch_index_name')",
|
||||
"documents": "@body('Select_embeddings')"
|
||||
},
|
||||
"serviceProviderConfiguration": {
|
||||
"connectionName": "azureaisearch",
|
||||
"operationId": "indexDocuments",
|
||||
"serviceProviderId": "/serviceProviders/azureaisearch"
|
||||
}
|
||||
},
|
||||
"runAfter": {
|
||||
"Select_embeddings": [
|
||||
"SUCCEEDED"
|
||||
]
|
||||
}
|
||||
},
|
||||
"Parameters_for_Tokenize": {
|
||||
"type": "Compose",
|
||||
"inputs": {
|
||||
"base64Content": "@base64(body('Get_rows'))",
|
||||
"documentType": "txt",
|
||||
"splittingStrategy": "recursive",
|
||||
"tokenLength": 512
|
||||
},
|
||||
"runAfter": {
|
||||
"Get_rows": [
|
||||
"SUCCEEDED"
|
||||
]
|
||||
}
|
||||
},
|
||||
"Response": {
|
||||
"type": "Response",
|
||||
"kind": "Http",
|
||||
"inputs": {
|
||||
"statusCode": 200
|
||||
},
|
||||
"runAfter": {
|
||||
"Index_multiple_documents": [
|
||||
"SUCCEEDED"
|
||||
]
|
||||
}
|
||||
},
|
||||
"Select_JSON_array_items": {
|
||||
"type": "Select",
|
||||
"inputs": {
|
||||
"from": "@range(0, length(body('Convert_tokenized_output_to_JSON')))",
|
||||
"select": "@body('Convert_tokenized_output_to_JSON')[item()]['content']"
|
||||
},
|
||||
"runAfter": {
|
||||
"Convert_tokenized_output_to_JSON": [
|
||||
"SUCCEEDED"
|
||||
]
|
||||
}
|
||||
},
|
||||
"Select_embeddings": {
|
||||
"type": "Select",
|
||||
"inputs": {
|
||||
"from": "@range(0, length(body('Convert_tokenized_output_to_JSON')))",
|
||||
"select": {
|
||||
"content": "@body('Convert_tokenized_output_to_JSON')[item()]['content']",
|
||||
"documentName": "music-sales-data.txt",
|
||||
"embeddings": "@body('Azure_Open_AI_-_Get_multiple_embeddings')['embeddings'][item()]",
|
||||
"id": "@concat('music_sales_data_txt', item())"
|
||||
}
|
||||
},
|
||||
"runAfter": {
|
||||
"Azure_Open_AI_-_Get_multiple_embeddings": [
|
||||
"SUCCEEDED"
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"contentVersion": "1.0.0.0",
|
||||
"outputs": {},
|
||||
"triggers": {
|
||||
"When_a_HTTP_request_is_received": {
|
||||
"type": "Request",
|
||||
"kind": "Http"
|
||||
}
|
||||
}
|
||||
},
|
||||
"kind": "Stateful"
|
||||
}
|
|
@ -0,0 +1,192 @@
|
|||
{
|
||||
"definition": {
|
||||
"$schema": "https://schema.management.azure.com/providers/Microsoft.Logic/schemas/2016-06-01/workflowdefinition.json#",
|
||||
"actions": {
|
||||
"system_message": {
|
||||
"type": "Compose",
|
||||
"inputs": "You are an intelligent assistant helping employees generate sales and marketing insights based on the sales data for music albums provided to you through sales report. Use the sales data provides to you and offer detailed insights and actionable recommendations for improving the music album sales. Also answer questions on the sales of albums based on genre or artist name. Use 'you' to refer to the individual asking the questions even if they ask with 'I'. Answer the following question using only the data provided in the sources below. Each source has a name followed by colon and the actual information, always include the source name for each fact you use in the response. If you cannot answer using the sources below, say you don't know. Generate well formatted response in HTML that can be read easily in an email or chat",
|
||||
"runAfter": {}
|
||||
},
|
||||
"sample_responses": {
|
||||
"type": "Compose",
|
||||
"inputs": [
|
||||
{
|
||||
"message": "How were the sales in North America?",
|
||||
"role": "user"
|
||||
},
|
||||
{
|
||||
"message": "How were the overall sales in all regions",
|
||||
"role": "assistant"
|
||||
},
|
||||
{
|
||||
"message": "Generate sales insights from this data to share with leadership",
|
||||
"role": "user"
|
||||
},
|
||||
{
|
||||
"message": "Sales report for pop music",
|
||||
"role": "assistant"
|
||||
}
|
||||
],
|
||||
"runAfter": {
|
||||
"system_message": [
|
||||
"SUCCEEDED"
|
||||
]
|
||||
}
|
||||
},
|
||||
"system_message_for_search_query": {
|
||||
"type": "Compose",
|
||||
"inputs": "Below is a history of the conversation so far, and a new question asked by the user that needs to be answered by searching in the sales data of music albums globally.\nYou have access to Azure Cognitive Search index with 100's of documents.\nGenerate a search query based on the question.\nDo not include cited source filenames and document names e.g info.txt or doc.pdf in the search query terms.\nDo not include any text inside [] or <<>> in the search query terms.\nDo not include any special characters like '+'.\nIf the question is not in English, translate the question to English before generating the search query.\nIf you cannot generate a search query, return just the number 0",
|
||||
"runAfter": {
|
||||
"sample_responses": [
|
||||
"SUCCEEDED"
|
||||
]
|
||||
}
|
||||
},
|
||||
"Execute_JavaScript_Code": {
|
||||
"type": "JavaScriptCode",
|
||||
"inputs": {
|
||||
"code": "var system_message=workflowContext.actions.system_message_for_search_query.outputs;\r\nvar samples=workflowContext.actions.sample_responses.outputs\r\nvar original_user_query = workflowContext.trigger.outputs.body.prompt\r\n\r\nvar messages = [];\r\nmessages.push({role: \"system\", message: system_message});\r\nfor (var i=0; i<samples.length; i++)\r\n{\r\n messages.push(samples[i]);\r\n}\r\n\r\nvar user_message = \"Generate search query for: \" + original_user_query;\r\nmessages.push({ role: \"user\", message: user_message});\r\n\r\nreturn messages;\r\n"
|
||||
},
|
||||
"runAfter": {
|
||||
"system_message_for_search_query": [
|
||||
"SUCCEEDED"
|
||||
]
|
||||
}
|
||||
},
|
||||
"get_search_query": {
|
||||
"type": "ServiceProvider",
|
||||
"inputs": {
|
||||
"parameters": {
|
||||
"deploymentId": "@parameters('openai_chat_deployment_model')",
|
||||
"messages": "@outputs('Execute_JavaScript_Code')",
|
||||
"temperature": 1
|
||||
},
|
||||
"serviceProviderConfiguration": {
|
||||
"connectionName": "openai",
|
||||
"operationId": "getChatCompletions",
|
||||
"serviceProviderId": "/serviceProviders/openai"
|
||||
}
|
||||
},
|
||||
"runAfter": {
|
||||
"Execute_JavaScript_Code": [
|
||||
"SUCCEEDED"
|
||||
]
|
||||
}
|
||||
},
|
||||
"Get_an_embedding": {
|
||||
"type": "ServiceProvider",
|
||||
"inputs": {
|
||||
"parameters": {
|
||||
"deploymentId": "@parameters('openai_embedding_deployment_model')",
|
||||
"input": "@body('get_search_query')?['content']"
|
||||
},
|
||||
"serviceProviderConfiguration": {
|
||||
"connectionName": "openai",
|
||||
"operationId": "getSingleEmbedding",
|
||||
"serviceProviderId": "/serviceProviders/openai"
|
||||
}
|
||||
},
|
||||
"runAfter": {
|
||||
"get_search_query": [
|
||||
"SUCCEEDED"
|
||||
]
|
||||
}
|
||||
},
|
||||
"Vector_search": {
|
||||
"type": "ServiceProvider",
|
||||
"inputs": {
|
||||
"parameters": {
|
||||
"indexName": "@parameters('aisearch_index_name')",
|
||||
"searchVector": {
|
||||
"fieldName": "embeddings",
|
||||
"vector": "@body('Get_an_embedding')['embedding']"
|
||||
},
|
||||
"kNearestNeighbors": 3
|
||||
},
|
||||
"serviceProviderConfiguration": {
|
||||
"connectionName": "azureaisearch",
|
||||
"operationId": "vectorSearch",
|
||||
"serviceProviderId": "/serviceProviders/azureaisearch"
|
||||
}
|
||||
},
|
||||
"runAfter": {
|
||||
"Get_an_embedding": [
|
||||
"SUCCEEDED"
|
||||
]
|
||||
}
|
||||
},
|
||||
"create_prompt": {
|
||||
"type": "JavaScriptCode",
|
||||
"inputs": {
|
||||
"code": "var search_results = workflowContext.actions.Vector_Search.outputs.body;\r\nvar system_message = workflowContext.actions.system_message.outputs;\r\n\r\nvar sources = \"\"\r\nfor (let i=0;i<search_results.length;i++)\r\n{\r\n sources = sources + \"\\n\" + search_results[i]['id'] +\":\" + search_results[i]['content']\r\n}\r\n\r\nvar system_message = system_message + \"\\n\" + \"Sources: \\n\" + sources\r\n\r\nreturn system_message"
|
||||
},
|
||||
"runAfter": {
|
||||
"Vector_search": [
|
||||
"SUCCEEDED"
|
||||
]
|
||||
}
|
||||
},
|
||||
"Get_chat_completions": {
|
||||
"type": "ServiceProvider",
|
||||
"inputs": {
|
||||
"parameters": {
|
||||
"deploymentId": "@parameters('openai_chat_deployment_model')",
|
||||
"messages": [
|
||||
{
|
||||
"message": "@outputs('create_prompt')",
|
||||
"role": "system"
|
||||
},
|
||||
{
|
||||
"message": "@triggerBody()?['prompt']",
|
||||
"role": "user"
|
||||
}
|
||||
],
|
||||
"temperature": 1
|
||||
},
|
||||
"serviceProviderConfiguration": {
|
||||
"connectionName": "openai",
|
||||
"operationId": "getChatCompletions",
|
||||
"serviceProviderId": "/serviceProviders/openai"
|
||||
}
|
||||
},
|
||||
"runAfter": {
|
||||
"create_prompt": [
|
||||
"SUCCEEDED"
|
||||
]
|
||||
}
|
||||
},
|
||||
"Response": {
|
||||
"type": "Response",
|
||||
"kind": "Http",
|
||||
"inputs": {
|
||||
"statusCode": 200,
|
||||
"body": "@body('Get_chat_completions')?['content']"
|
||||
},
|
||||
"runAfter": {
|
||||
"Get_chat_completions": [
|
||||
"SUCCEEDED"
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"contentVersion": "1.0.0.0",
|
||||
"outputs": {},
|
||||
"triggers": {
|
||||
"When_a_HTTP_request_is_received": {
|
||||
"type": "Request",
|
||||
"kind": "Http",
|
||||
"inputs": {
|
||||
"schema": {
|
||||
"properties": {
|
||||
"prompt": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"type": "object"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"kind": "Stateful"
|
||||
}
|
|
@ -0,0 +1,37 @@
|
|||
{
|
||||
"serviceProviderConnections": {
|
||||
"sql": {
|
||||
"parameterValues": {
|
||||
"connectionString": "@parameters('sql_connectionString')"
|
||||
},
|
||||
"parameterSetName": "connectionString",
|
||||
"serviceProvider": {
|
||||
"id": "/serviceProviders/sql"
|
||||
},
|
||||
"displayName": "sql-con"
|
||||
},
|
||||
"openai": {
|
||||
"parameterValues": {
|
||||
"openAIEndpoint": "@parameters('openai_openAIEndpoint')",
|
||||
"openAIKey": "@parameters('openai_openAIKey')"
|
||||
},
|
||||
"parameterSetName": "KeyAndEndpointConnection",
|
||||
"serviceProvider": {
|
||||
"id": "/serviceProviders/openai"
|
||||
},
|
||||
"displayName": "openai-con"
|
||||
},
|
||||
"azureaisearch": {
|
||||
"parameterValues": {
|
||||
"searchServiceEndpoint": "@parameters('azureaisearch_searchServiceEndpoint')",
|
||||
"searchServiceAdminKey": "@parameters('azureaisearch_searchServiceAdminKey')"
|
||||
},
|
||||
"parameterSetName": "ConnectionString",
|
||||
"serviceProvider": {
|
||||
"id": "/serviceProviders/azureaisearch"
|
||||
},
|
||||
"displayName": "aisearch-con"
|
||||
}
|
||||
},
|
||||
"managedApiConnections": {}
|
||||
}
|
|
@ -0,0 +1,7 @@
|
|||
{
|
||||
"version": "2.0",
|
||||
"extensionBundle": {
|
||||
"id": "Microsoft.Azure.Functions.ExtensionBundle.Workflows",
|
||||
"version": "[1.*, 2.0.0)"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,10 @@
|
|||
{
|
||||
"IsEncrypted": false,
|
||||
"Values": {
|
||||
"openai_openAIKey": "",
|
||||
"openai_openAIEndpoint": "",
|
||||
"sql_connectionString": "",
|
||||
"azureaisearch_searchServiceEndpoint": "",
|
||||
"azureaisearch_searchServiceAdminKey": ""
|
||||
}
|
||||
}
|
|
@ -0,0 +1,22 @@
|
|||
{
|
||||
"tokenize_function_url": {
|
||||
"type": "String",
|
||||
"value": ""
|
||||
},
|
||||
"openai_embedding_deployment_model": {
|
||||
"type": "String",
|
||||
"value": ""
|
||||
},
|
||||
"aisearch_index_name": {
|
||||
"type": "string",
|
||||
"value": ""
|
||||
},
|
||||
"sql_table_name": {
|
||||
"type": "String",
|
||||
"value": ""
|
||||
},
|
||||
"openai_chat_deployment_model": {
|
||||
"type": "String",
|
||||
"value": ""
|
||||
}
|
||||
}
|
|
@ -0,0 +1,215 @@
|
|||
import azure.functions as func
|
||||
import logging
|
||||
import base64
|
||||
from io import BytesIO
|
||||
from langchain.document_loaders import PyPDFLoader, TextLoader, UnstructuredHTMLLoader, UnstructuredPowerPointLoader, UnstructuredMarkdownLoader
|
||||
from langchain.document_loaders.word_document import Docx2txtLoader
|
||||
from langchain.text_splitter import RecursiveCharacterTextSplitter, TokenTextSplitter, MarkdownHeaderTextSplitter, HTMLHeaderTextSplitter
|
||||
import tiktoken
|
||||
import json
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
app = func.FunctionApp(http_auth_level=func.AuthLevel.ANONYMOUS)
|
||||
|
||||
@app.route(route="tokenize_trigger")
def tokenize_trigger(req: func.HttpRequest) -> func.HttpResponse:
    """HTTP entry point: chunk and tokenize the document described in the request body.

    The parsed JSON body is handed to ``main``; the JSON string it returns is
    sent back with status 200.

    A request that cannot be processed because of its content (a body that is
    not valid JSON, or input rejected with ``ValueError`` further down) is
    answered with status 400 and the error message as the body, instead of
    escaping as an unhandled exception.
    """
    logging.info('Python HTTP trigger function processed a request.')

    try:
        # get_json() is expected to raise ValueError on a malformed body
        # (see azure-functions docs); the validation helpers raise it too.
        output = main(req.get_json())
    except ValueError as exc:
        logging.warning("Rejected tokenize request: %s", exc)
        return func.HttpResponse(
            body=str(exc),
            status_code=400)

    return func.HttpResponse(
        body=output,
        status_code=200)
|
||||
|
||||
def load_file(req):
    """Decode the base64 document carried in ``req`` and load it.

    Args:
        req: dict with ``base64Content`` (base64-encoded file bytes),
            ``documentType`` (case-insensitive key of the loader table below)
            and ``splittingStrategy``.

    Returns:
        The decoded text as a ``str`` when a MARKUP or HTML document is paired
        with the splitting strategy of the same name (the raw text is what is
        returned for those splitters); otherwise whatever the selected
        loader's ``load()`` returns, with the ``source`` entry removed from
        each document's metadata.

    Raises:
        ValueError: if ``documentType`` has no entry in the loader table.
    """
    loader_mapping = {
        "PDF": PyPDFLoader,
        "DOCUMENT": Docx2txtLoader,
        "MARKUP": UnstructuredMarkdownLoader,
        "TXT": TextLoader,
        "PPTX": UnstructuredPowerPointLoader,
        "HTML": UnstructuredHTMLLoader,
    }

    file_bytes = base64.b64decode(req["base64Content"])

    # The loaders are constructed from a file path, so the bytes are written
    # to a temporary file that the ``finally`` below always removes.
    fd, path = tempfile.mkstemp()
    try:
        with os.fdopen(fd, "wb") as f:
            f.write(file_bytes)

        document_type = req["documentType"].upper()
        splitting_strategy = req["splittingStrategy"].upper()

        if document_type not in loader_mapping:
            raise ValueError("File type not supported")

        if document_type in ("MARKUP", "HTML") and splitting_strategy == document_type:
            # return raw data for md and html splitters
            return file_bytes.decode()

        documents = loader_mapping[document_type](path).load()

        # Remove the source (presumably the throwaway temp path). A default is
        # passed so a document without that key does not raise KeyError.
        for doc in documents:
            doc.metadata.pop("source", None)

        return documents
    finally:
        os.remove(path)
|
||||
|
||||
|
||||
def num_tokens_from_string(string: str, encoding_name="cl100k_base") -> int:
    """Return how many tokens ``string`` encodes to under the named tiktoken encoding.

    Falsy input (empty string or ``None``) counts as 0 tokens and never
    touches the encoder.
    """
    if string:
        return len(tiktoken.get_encoding(encoding_name).encode(string))
    return 0
|
||||
|
||||
|
||||
def _chunk_item(text, metadata):
    """Build one output record for a chunk: its text, token count and metadata."""
    return {
        "content": text,
        "tokenLength": num_tokens_from_string(text),
        "metadata": metadata,
    }


def split_document_by_splitter_type(
    documents,
    document_type,
    splitter="RECURSIVE",
    secondary_splitter="RECURSIVE",
    headers_to_split_on=None,
    chunk_size=4000,
    chunk_overlap=200,
    length_function=len,
):
    """Split a loaded document into chunks and describe each chunk.

    Args:
        documents: for the RECURSIVE/TOKEN splitters, the documents passed to
            ``split_documents``; for the MARKUP/HTML splitters, the raw text
            passed to ``split_text``.
        document_type: upper-case document type; must equal ``splitter`` when
            a header-based splitter (MARKUP/HTML) is chosen.
        splitter: "RECURSIVE", "TOKEN", "MARKUP" or "HTML".
        secondary_splitter: "RECURSIVE" or "TOKEN"; only used to re-split the
            sections produced by a header-based splitter.
        headers_to_split_on: header definitions for the header-based
            splitters; defaults to the built-in tables below.
        chunk_size, chunk_overlap, length_function: forwarded to the
            size-based splitter.

    Returns:
        A non-empty list of dicts with ``content`` (chunk text with newlines
        replaced by spaces), ``tokenLength`` and ``metadata``.

    Raises:
        ValueError: on an unknown splitter, a header-based splitter paired
            with a different document type, an unsupported secondary
            splitter, or when no chunks are produced.
    """
    size_based = ("RECURSIVE", "TOKEN")
    header_based = ("MARKUP", "HTML")

    # Validate up front so a bad combination fails with a clear message
    # instead of returning None or failing inside a splitter. The type check
    # applies whether or not custom headers were supplied.
    if splitter not in size_based + header_based:
        raise ValueError("Invalid primary splitter value.")
    if splitter in header_based:
        if document_type != splitter:
            raise ValueError("The MARKUP and HTML splitter can only be used with MARKUP and HTML documents respectively.")
        if secondary_splitter not in size_based:
            raise ValueError("Invalid secondary splitter value.")

    MARKUP_HEADERS = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
        ("####", "Header 4"),
    ]

    HTML_HEADERS = [
        ("h1", "Header 1"),
        ("h2", "Header 2"),
        ("h3", "Header 3"),
        ("h4", "Header 4"),
        ("h5", "Header 5"),
        ("h6", "Header 6"),
    ]

    splitter_mapping = {
        "RECURSIVE": RecursiveCharacterTextSplitter,
        "TOKEN": TokenTextSplitter,
        "MARKUP": MarkdownHeaderTextSplitter,
        "HTML": HTMLHeaderTextSplitter,
    }

    def make_size_splitter(kind):
        return splitter_mapping[kind](
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=length_function,
        )

    if splitter in size_based:
        new_list = [
            _chunk_item(chunk.page_content.replace("\n", " "), chunk.metadata)
            for chunk in make_size_splitter(splitter).split_documents(documents)
        ]
    else:
        if headers_to_split_on is None:
            headers_to_split_on = HTML_HEADERS if splitter == "HTML" else MARKUP_HEADERS

        chosen_splitter = splitter_mapping[splitter](
            headers_to_split_on=headers_to_split_on
        )
        second_splitter = make_size_splitter(secondary_splitter)

        new_list = []
        # First split on headers, then re-split every section by size; each
        # piece keeps the metadata of the header section it came from.
        for section in chosen_splitter.split_text(documents):
            content = section.page_content.replace("\n", " ")
            for piece in second_splitter.split_text(content.strip()):
                new_list.append(_chunk_item(piece.replace("\n", " "), section.metadata))

    if not new_list:
        raise ValueError("There is no content in this document.")
    return new_list
|
||||
|
||||
|
||||
def validate_json_data(json_data):
    """Fill in defaults for the splitting options and reject invalid values.

    Mutates ``json_data`` in place, setting ``chunkSize`` (default 4000),
    ``chunkOverlap`` (default 200), ``splittingStrategy`` and
    ``secondarySplittingStrategy`` (both default "RECURSIVE") when absent.
    Strategy values are checked case-insensitively and stored as given.

    Raises:
        ValueError: if a chunk option is not a number, ``chunkSize`` is not
            greater than 1, ``chunkOverlap`` is negative, or a strategy is
            not a string naming a supported splitter.
    """

    def _is_number(value):
        # bool is a subclass of int; treat True/False as invalid here.
        return isinstance(value, (int, float)) and not isinstance(value, bool)

    json_data["chunkSize"] = json_data.get("chunkSize", 4000)
    if not _is_number(json_data["chunkSize"]):
        # Without this check a string such as "512" fails with TypeError below.
        raise ValueError("Chunk size should be a number.")
    if json_data["chunkSize"] <= 1:
        raise ValueError("Chunk size should be greater than 1.")

    json_data["chunkOverlap"] = json_data.get("chunkOverlap", 200)
    if not _is_number(json_data["chunkOverlap"]):
        raise ValueError("Chunk overlap should be a number.")
    if json_data["chunkOverlap"] < 0:
        raise ValueError("Chunk overlap should be 0 or greater.")

    valid_primary_splitters = {"RECURSIVE", "TOKEN", "MARKUP", "HTML"}
    json_data["splittingStrategy"] = json_data.get("splittingStrategy", "RECURSIVE")
    primary = json_data["splittingStrategy"]
    if not isinstance(primary, str) or primary.upper() not in valid_primary_splitters:
        raise ValueError("Invalid primary splitter value.")

    valid_secondary_splitters = {"RECURSIVE", "TOKEN"}
    json_data["secondarySplittingStrategy"] = json_data.get("secondarySplittingStrategy", "RECURSIVE")
    secondary = json_data["secondarySplittingStrategy"]
    if not isinstance(secondary, str) or secondary.upper() not in valid_secondary_splitters:
        raise ValueError("Invalid secondary splitter value.")
|
||||
|
||||
|
||||
def split_document(json_data, document):
    """Translate the request options into a ``split_document_by_splitter_type`` call.

    Reads the splitting strategies (upper-cased), optional
    ``headersToSplitOn``, ``chunkSize``, ``chunkOverlap`` and the upper-cased
    ``documentType`` from ``json_data`` and returns the result of splitting
    ``document`` with them.
    """
    options = {
        "splitter": json_data.get("splittingStrategy").upper(),
        "secondary_splitter": json_data.get("secondarySplittingStrategy").upper(),
        "headers_to_split_on": json_data.get("headersToSplitOn", None),
        "chunk_size": json_data.get("chunkSize"),
        "chunk_overlap": json_data.get("chunkOverlap"),
    }
    return split_document_by_splitter_type(
        document,
        json_data["documentType"].upper(),
        **options,
    )
|
||||
|
||||
|
||||
def main(req):
    """Validate the request, load the document, split it and return the chunks as JSON.

    Args:
        req: the request payload, either an already-parsed dict or a JSON
            document as ``str``/``bytes`` that decodes to an object.

    Returns:
        A JSON string holding the chunks returned by ``split_document``.

    Raises:
        ValueError: "Invalid JSON data." if a text payload cannot be parsed
            or the payload is not a JSON object; validation and loading
            errors from the helpers propagate unchanged.
    """
    json_data = req
    if isinstance(json_data, (str, bytes, bytearray)):
        try:
            json_data = json.loads(json_data)
        except ValueError:
            # json.JSONDecodeError and the UnicodeDecodeError raised for
            # undecodable bytes are both ValueError subclasses.
            raise ValueError("Invalid JSON data.") from None
    if not isinstance(json_data, dict):
        raise ValueError("Invalid JSON data.")

    validate_json_data(json_data)
    document = load_file(json_data)
    chunks = split_document(json_data, document)
    return json.dumps(chunks)
|
|
@ -0,0 +1,15 @@
|
|||
{
|
||||
"version": "2.0",
|
||||
"logging": {
|
||||
"applicationInsights": {
|
||||
"samplingSettings": {
|
||||
"isEnabled": true,
|
||||
"excludedTypes": "Request"
|
||||
}
|
||||
}
|
||||
},
|
||||
"extensionBundle": {
|
||||
"id": "Microsoft.Azure.Functions.ExtensionBundle",
|
||||
"version": "[3.*, 4.0.0)"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,8 @@
|
|||
{
|
||||
"IsEncrypted": false,
|
||||
"Values": {
|
||||
"AzureWebJobsStorage": "UseDevelopmentStorage=true",
|
||||
"FUNCTIONS_WORKER_RUNTIME": "python",
|
||||
"AzureWebJobsFeatureFlags": "EnableWorkerIndexing"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,8 @@
|
|||
# DO NOT include azure-functions-worker in this file
|
||||
# The Python Worker is managed by Azure Functions platform
|
||||
# Manually managing azure-functions-worker may cause unexpected issues
|
||||
|
||||
azure-functions
|
||||
langchain==0.0.326
|
||||
tiktoken==0.5.1
|
||||
pypdf==3.15.5
|
|
@ -0,0 +1,180 @@
|
|||
# Create a **Chat with Your Data** Logic App Project
|
||||
|
||||
This readme document provides step-by-step instructions on how to enable a **Chat with your Data** Logic Apps project.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
### Required installations
|
||||
|
||||
- [Visual Studio Code](https://code.visualstudio.com/)
|
||||
- [Azure Logic Apps extension for Visual Studio Code](https://marketplace.visualstudio.com/items?itemName=ms-azuretools.vscode-logicapps)
|
||||
- [Azure Functions extension for Visual Studio Code](https://marketplace.visualstudio.com/items?itemName=ms-azuretools.vscode-azurefunctions)
|
||||
- [Azurite extension for Visual Studio Code](https://marketplace.visualstudio.com/items?itemName=Azurite.azurite)
|
||||
- This guide also assumes you have pulled this repo down to your local machine.
|
||||
|
||||
### Required AI Services
|
||||
|
||||
#### Access to an Azure OpenAI Service
|
||||
If you already have an existing OpenAI Service and model you can skip these steps.
|
||||
|
||||
1. Go to the Azure portal
|
||||
|
||||
2. Click `Create a resource`
|
||||
|
||||
3. In the search box type: `OpenAI`.
|
||||
|
||||
4. In the search results list, click `Create` on `Azure OpenAI`.
|
||||
|
||||
5. Follow the prompts to create the service in your chosen subscription and resource group.
|
||||
|
||||
6. Once your OpenAI service is created you will need to create deployments for generating embeddings and chat completions.
|
||||
- Go to your OpenAI service, under the `Resource Management` menu pane, click `Model deployments`
|
||||
- Click `Manage Deployments`
|
||||
- On the `Deployments` page click `Create new deployment`
|
||||
- Select an available embedding `model` e.g. `text-embedding-ada-002`, `model version`, and `deployment name`. Keep track of the `deployment name`, it will be used in later steps.
|
||||
- Ensure your model is successfully deployed by viewing it on the `Deployments` page
|
||||
- On the `Deployments` page click `Create new deployment`
|
||||
- Select an available chat `model` e.g. `gpt-35-turbo`, `model version`, and `deployment name`. Keep track of the `deployment name`, it will be used in later steps.
|
||||
- Ensure your model is successfully deployed by viewing it on the `Deployments` page
|
||||
|
||||
|
||||
|
||||
#### Access to an Azure AI Search Service
|
||||
If you already have an existing AI Search Service you can skip to step 6.
|
||||
|
||||
1. Go to the Azure portal.
|
||||
|
||||
2. Click `Create a resource`.
|
||||
|
||||
3. In the search box type: `Azure AI Search`.
|
||||
|
||||
4. In the search results list, click `Create` on `Azure AI Search`.
|
||||
|
||||
5. Follow the prompts to create the service in your chosen subscription and resource group.
|
||||
|
||||
6. Once your AI Search service is created you will need to create an index to store your document content and embeddings.
|
||||
- Go to your search service on the `Overview` page, at the top click `Add index (JSON)`
|
||||
- Go up one level to the root folder `ai-sample` and open the `Deployment` folder. Copy the entire contents of the file `aisearch_index.json` and paste them into the index window. You can change the name of the index in the `name` field if you choose. This name will be used in later steps.
|
||||
- Ensure your index is created by viewing it on the `Indexes` page
|
||||
|
||||
|
||||
## Function App and Workflows Creation
|
||||
There are 2 projects that need to be created and published to Azure:
|
||||
- Azure Functions project located in the `TokenizeDocFunction` folder
|
||||
- Azure Standard Logic Apps project located in `SampleAIWorkflows` folder
|
||||
|
||||
### Follow these steps to create the Azure Functions project and deploy it to Azure:
|
||||
|
||||
1. Open Visual Studio Code.
|
||||
|
||||
2. Go to the Azure Function extension.
|
||||
|
||||
3. Under Azure Function option, click `Create New Project` then navigate to and select the `TokenizeDocFunction` folder.
|
||||
|
||||
4. Follow the setup prompts:
|
||||
- Choose `Python` language
|
||||
- Choose Python programming model V1 or V2
|
||||
- Skip `Trigger Type` selection
|
||||
- Select `Yes` if asked to overwrite any existing files except the `requirements.txt` file
|
||||
|
||||
6. Deploy your Function App:
|
||||
- Go to the Azure Function extension.
|
||||
- Under the Azure Function option, click `Create Function App in Azure`
|
||||
- Select a Subscription and Resource Group to deploy your Function App.
|
||||
|
||||
7. Go to the Azure portal to verify your app is up and running.
|
||||
|
||||
8. Make note of the URL generated by your Function App, it will be used in later steps.
|
||||
|
||||
|
||||
### Follow these steps to create the Azure Standard Logic Apps project and deploy it to Azure:
|
||||
|
||||
1. Open Visual Studio Code.
|
||||
|
||||
2. Go to the Azure Logic Apps extension.
|
||||
|
||||
3. Click `Create New Project` then navigate to and select the `SampleAIWorkflows` folder.
|
||||
|
||||
4. Follow the setup prompts:
|
||||
- Choose Stateful Workflow
|
||||
- Press Enter to use the default `Stateful` name. This can be deleted later
|
||||
- Select `Yes` if asked to overwrite any existing files
|
||||
|
||||
5. Update your `local.settings.json` file:
|
||||
- Open the `local.settings.json` file
|
||||
- Go to your Azure OpenAI service in the portal
|
||||
- Under the `Resource Management` menu click `Keys and Endpoint`
|
||||
- Copy the `KEY 1` value and place its value into the `value` field of the `openai_openAIKey` property
|
||||
- Copy the `Endpoint` value and place its value into the `value` field of the `openai_openAIEndpoint` property
|
||||
- Go to your Azure AI Search service in the portal
|
||||
- On the `Overview` page copy the `Url` value. Place its value in the `value` field of the `azureaisearch_searchServiceEndpoint` property
|
||||
- Under the `Settings` menu click `Keys`. Copy either the `Primary` or `Secondary` admin key and place its value into the `value` field of the `azureaisearch_searchServiceAdminKey` property
|
||||
- Go to your SQL Server in Azure Portal or in SQL Management Studio
|
||||
- Copy the `Connection strings` value for the SQL database. Place the value into the `sql_connectionString` property
|
||||
|
||||
6. Update your `parameters.json` file:
|
||||
- Open the `parameters.json` file
|
||||
- Go to your Azure OpenAI service in the portal
|
||||
- Under the `Resource Management` menu click `Model deployments`
|
||||
- Click `Manage Deployments`
|
||||
- Copy the `Deployment name` of the embeddings model you want to use and place its value into the `value` field of the `openai_embeddings_deployment_model` property
|
||||
- Copy the `Deployment name` of the chat model you want to use and place its value into the `value` field of the `openai_chat_deployment_model` property
|
||||
|
||||
- Go to your Azure AI Search service in the portal
|
||||
- Under the `Resource Management` menu click `Indexes`
|
||||
- Copy the name of the index that you want to use and place its value into the `value` field of the `oaisearch_index_name` property
|
||||
|
||||
|
||||
- Go to your Tokenize Function App
|
||||
- On the `Overview` page, copy the `URL` value and place its value into the `value` field of the `tokenize_function_url` property. Then append `/api/tokenize_trigger` to the end of the URL.
|
||||
|
||||
- Go to your SQL Server in Azure Portal or in SQL Management Studio
|
||||
- Copy the name of the table which would be the source of data. Place the value of table name into the `value` field of the `sql_table_name` property.
|
||||
|
||||
|
||||
7. Deploy your Logic App:
|
||||
- Go to the Azure Logic Apps extension
|
||||
- Click `Deploy to Azure`
|
||||
- Select a Subscription and Resource Group to deploy your Logic App
|
||||
|
||||
7. Go to the Azure portal to verify your app is up and running.
|
||||
|
||||
8. Verify your Logic Apps contains two workflows. They will be named: `RAG-Ingestion-Workflow` and `RAG-Retrieval-Workflow`.
|
||||
|
||||
## Run your workflows
|
||||
|
||||
Now that the Azure Function and Azure Logic App workflows are live in Azure, you are ready to ingest your data and chat with it.
|
||||
|
||||
### Ingest Workflow
|
||||
1. Go to your Logic App in the Azure portal.
|
||||
|
||||
2. Go to your `RAG-Ingestion-Workflow` workflow.
|
||||
|
||||
3. On the `Overview` tab select `Run` to trigger the workflow.
|
||||
|
||||
4. On the `Overview` tab, click `Run`. This will trigger the `RAG-Ingestion-Workflow` workflow, which pulls in your data from the SQL database and stores it in your Azure AI Search Service.
|
||||
|
||||
5. View the `Run History` to ensure a successful run.
|
||||
|
||||
### Chat Workflow
|
||||
1. Go to your Logic App in the Azure portal.
|
||||
|
||||
2. Go to your `RAG-Retrieval-Workflows` workflow.
|
||||
|
||||
3. On the `Overview` tab click the drop down `Run` then select `Run with payload`.
|
||||
|
||||
4. Fill in the JSON `Body` section with your `prompt`. For example: `{ "prompt": "Provide insights and recommendations using sales data on how to improve album sales" }`
|
||||
|
||||
5. Click `Run`. This will trigger the `RAG-Retrieval-Workflows` workflow, which queries your data stored in your Azure AI Search Service and responds with an answer.
|
||||
|
||||
6. View the `Run History` to see the Response from your query.
|
||||
|
||||
|
||||
## Conclusion
|
||||
|
||||
In this readme document, you learned how to:
|
||||
- Create an Azure OpenAI Service
|
||||
- Create an Azure AI Search Service
|
||||
- Create and deploy an Azure Function and multiple Logic Apps workflows using Visual Studio Code and their respective extensions.
|
||||
|
||||
For more information and advanced usage, refer to the official documentation of Azure Logic Apps and Azure Functions.
|
Загрузка…
Ссылка в новой задаче