From 4841ddb70cae5e6b8cf76d77bd2fed9d486b5471 Mon Sep 17 00:00:00 2001 From: aydan-at-microsoft <51974608+aydan-at-microsoft@users.noreply.github.com> Date: Mon, 24 Jul 2023 07:56:16 -0700 Subject: [PATCH] docs: add QandA notebook. (#2029) * add QandA notebook. Co-authored-by: Kartavya Neema Co-authored-by: Amir Jafari * address comments. Co-authored-by: Kartavya Neema Co-authored-by: Amir Jafari * rename cog services to ai services * add test to exclude, wait for synapse fix --------- Co-authored-by: Kartavya Neema Co-authored-by: Amir Jafari Co-authored-by: Mark Hamilton --- .../synapse/ml/nbtest/SynapseTests.scala | 5 +- ...ent Question and Answering with PDFs.ipynb | 1016 +++++++++++++++++ website/sidebars.js | 1 + 3 files changed, 1020 insertions(+), 2 deletions(-) create mode 100644 docs/Explore Algorithms/AI Services/Quickstart - Document Question and Answering with PDFs.ipynb diff --git a/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseTests.scala b/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseTests.scala index d7916b8eaf..195cd97800 100644 --- a/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseTests.scala +++ b/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseTests.scala @@ -46,8 +46,9 @@ class SynapseTests extends TestBase { .filter(_.getAbsolutePath.endsWith(".py")) .filterNot(_.getAbsolutePath.contains("Finetune")) // Excluded by design task 1829306 .filterNot(_.getAbsolutePath.contains("VWnativeFormat")) - .filterNot(_.getAbsolutePath.contains("VowpalWabbitMulticlassclassification")) // Wait for Synpase fix - .filterNot(_.getAbsolutePath.contains("Langchain")) // Wait for Synpase fix + .filterNot(_.getAbsolutePath.contains("VowpalWabbitMulticlassclassification")) // Wait for Synapse fix + .filterNot(_.getAbsolutePath.contains("Langchain")) // Wait for Synapse fix + .filterNot(_.getAbsolutePath.contains("DocumentQuestionandAnsweringwithPDFs")) // Wait for Synapse fix .filterNot(_.getAbsolutePath.contains("SetupCognitive")) // No code to run .filterNot(_.getAbsolutePath.contains("CreateaSparkCluster")) // No code to run .sortBy(_.getAbsolutePath) diff --git a/docs/Explore Algorithms/AI Services/Quickstart - Document Question and Answering with PDFs.ipynb b/docs/Explore Algorithms/AI Services/Quickstart - Document Question and Answering with PDFs.ipynb new file mode 100644 index 0000000000..e8e4a938e2 --- /dev/null +++ b/docs/Explore Algorithms/AI Services/Quickstart - Document Question and Answering with PDFs.ipynb @@ -0,0 +1,1016 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "6b31dee8-67e3-4bb7-a501-269c69c80d3f", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "# A Guide to Q&A on PDF Documents" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "b4000620-9ea1-45aa-be4f-ddb971cc708e", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Introduction\n", + "In this notebook, we'll demonstrate how to develop a context-aware question answering framework for any form of a document using [OpenAI models](https://azure.microsoft.com/en-us/products/ai-services/openai-service), [SynapseML](https://microsoft.github.io/SynapseML/docs/about/) and [Azure AI Services](https://azure.microsoft.com/en-us/products/cognitive-services/). In this notebook, we assume that PDF documents are the source of data, however, the same framework can be easiy extended to other document formats too. \n", + "\n", + "We’ll cover the following key steps:\n", + "\n", + "1. Preprocessing PDF Documents: Learn how to load the PDF documents into a Spark DataFrame, read the documents using the [Form Recognizer Service](https://azure.microsoft.com/en-us/products/form-recognizer/) in Azure AI Services, and use SynapseML to split the documents into chunks.\n", + "2. Embedding Generation and Storage: Learn how to generate embeddings for the chunks using SynapseML and [Azure OpenAI Services](https://azure.microsoft.com/en-us/products/cognitive-services/openai-service), store the embeddings in a vector store using [Azure Cognitive Search](https://azure.microsoft.com/en-us/products/search), and search the vector store to answer the user’s question.\n", + "3. Question Answering Pipeline: Learn how to retrieve relevant document based on the user’s question and provide the answer using [Langchain](https://python.langchain.com/en/latest/index.html#)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "b7c8afaa-8298-4d48-9db0-867b6307963a", + "showTitle": false, + "title": "" + } + }, + "source": [ + "We start by installing the necessary python libraries." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install langchain openai" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "be4f7d31-48e0-4d71-af5c-645883891567", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "### Step 1: Provide the keys for Azure AI Services and Azure OpenAI to authenticate the applications." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "87b58b64-49a4-4a78-a915-7c2478c22c7d", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "To authenticate Azure AI Services and Azure OpenAI applications, you need to provide the respective API keys. Here is an example of how you can provide the keys in Python code. `find_secret()` function uses Azure Keyvault to get the API keys, however you can directly paste your own keys there." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.sql import SparkSession\n", + "from synapse.ml.core.platform import find_secret\n", + "\n", + "ai_services_key = find_secret(\"cognitive-api-key\")\n", + "ai_services_location = \"eastus\"\n", + "\n", + "# Fill in the following lines with your Azure service information\n", + "aoai_service_name = \"synapseml-openai\"\n", + "aoai_endpoint = f\"https://{aoai_service_name}.openai.azure.com/\"\n", + "aoai_key = find_secret(\"openai-api-key\")\n", + "aoai_deployment_name_embeddings = \"text-embedding-ada-002\"\n", + "aoai_deployment_name_query = \"text-davinci-003\"\n", + "aoai_model_name_query = \"text-davinci-003\"\n", + "\n", + "# Azure Cognitive Search\n", + "cogsearch_name = \"mmlspark-azure-search\"\n", + "cogsearch_index_name = \"exampleindex\"\n", + "cogsearch_api_key = find_secret(\"azure-search-key\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "97f056e7-9f88-45b9-b6b2-95be8c7fccac", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "### Step 2: Load the PDF documents into a Spark DataFrame." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "eb6519d4-f03a-4359-8a6f-4922bfeedbf5", + "showTitle": false, + "title": "" + } + }, + "source": [ + "For this tutorial, we will be using NASA's [Earth](https://www.nasa.gov/sites/default/files/atoms/files/earth_book_2019_tagged.pdf) and [Earth at Night](https://www.nasa.gov/sites/default/files/atoms/files/earth_at_night_508.pdf) e-books. To load PDF documents into a Spark DataFrame, you can use the ```spark.read.format(\"binaryFile\")``` method provided by Apache Spark." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.sql.functions import udf\n", + "from pyspark.sql.types import StringType\n", + "\n", + "document_path = \"wasbs://public@synapseaisolutionsa.blob.core.windows.net/NASAEarth\" # path to your document\n", + "df = spark.read.format(\"binaryFile\").load(document_path).limit(10).cache()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "189a84ca-ac81-4130-9143-75883b2633ba", + "showTitle": false, + "title": "" + } + }, + "source": [ + "This code will read the PDF documents and create a Spark DataFrame named df with the contents of the PDFs. The DataFrame will have a schema that represents the structure of the PDF documents, including their textual content." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "90f708b9-9ef2-4de5-b555-a2aa32fd0cfc", + "showTitle": false, + "title": "" + } + }, + "source": [ + "Let's take a glimpse at the contents of the e-books we are working with. Below are some screenshots that showcase the essence of the books; as you can see they contain information about the Earth.\n", + "\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "8119ea95-aa60-4f81-8189-04009fb4aac0", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "##### Display the raw data from the PDF documents" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Show the dataframe without the content\n", + "display(df.drop(\"content\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "34e06daf-e9e7-4144-b956-e57bde8fab77", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "### Step 3: Read the documents using Azure AI Services Form Recognizer." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "304ed77d-a032-4620-a74d-65a277caeaf7", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "We utilize [SynapseML](https://microsoft.github.io/SynapseML/), an ecosystem of tools designed to enhance the distributed computing framework [Apache Spark](https://github.com/apache/spark). SynapseML introduces advanced networking capabilities to the Spark ecosystem and offers user-friendly SparkML transformers for various [Azure AI Services](https://azure.microsoft.com/en-us/products/ai-services).\n", + "\n", + "Additionally, we employ AnalyzeDocument from Azure AI Services to extract the complete document content and present it in the designated columns called \"output_content\" and \"paragraph.\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from synapse.ml.cognitive import AnalyzeDocument\n", + "from pyspark.sql.functions import col\n", + "\n", + "analyzeDocument = (\n", + " AnalyzeDocument()\n", + " .setPrebuiltModelId(\"prebuilt-layout\")\n", + " .setSubscriptionKey(ai_services_key)\n", + " .setLocation(ai_services_location)\n", + " .setImageBytesCol(\"content\")\n", + " .setOutputCol(\"result\")\n", + " .setPages(\n", + " \"1-15\"\n", + " ) # Here we are reading the first 15 pages of the documents for demo purposes\n", + ")\n", + "\n", + "analyzed_df = (\n", + " analyzeDocument.transform(df)\n", + " .withColumn(\"output_content\", col(\"result.analyzeResult.content\"))\n", + " .withColumn(\"paragraphs\", col(\"result.analyzeResult.paragraphs\"))\n", + ").cache()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "d26e4217-ac87-4583-9500-af65d969c199", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "We can observe the analayzed Spark DataFrame named ```analyzed_df``` using the following code. Note that we drop the \"content\" column as it is not needed anymore." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "analyzed_df = analyzed_df.drop(\"content\")\n", + "display(analyzed_df)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "59188b7a-32fa-406d-8562-09ad69400b28", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "### Step 4: Split the documents into chunks." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "d682af37-faa8-4830-acd0-96aa348815d3", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "After analyzing the document, we leverage SynapseML’s PageSplitter to divide the documents into smaller sections, which are subsequently stored in the “chunks” column. This allows for more granular representation and processing of the document content." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from synapse.ml.featurize.text import PageSplitter\n", + "\n", + "ps = (\n", + " PageSplitter()\n", + " .setInputCol(\"output_content\")\n", + " .setMaximumPageLength(4000)\n", + " .setMinimumPageLength(3000)\n", + " .setOutputCol(\"chunks\")\n", + ")\n", + "\n", + "splitted_df = ps.transform(analyzed_df)\n", + "display(splitted_df)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "ce75e0fc-c036-488f-acba-57a44924d55e", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "Note that the chunks for each document are presented in a single row inside an array. In order to embed all the chunks in the following cells, we need to have each chunk in a separate row. To accomplish that, we first explode these arrays so there is only one chunk in each row, then filter the Spark DataFrame in order to only keep the path to the document and the chunk in a single row." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Each column contains many chunks for the same document as a vector.\n", + "# Explode will distribute and replicate the content of a vecor across multple rows\n", + "from pyspark.sql.functions import explode, col\n", + "\n", + "exploded_df = splitted_df.select(\"path\", explode(col(\"chunks\")).alias(\"chunk\")).select(\n", + " \"path\", \"chunk\"\n", + ")\n", + "display(exploded_df)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "1e5b0f56-0a64-4e4a-86f2-b647e82b41ce", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "### Step 5: Generate Embeddings." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "ebba439c-9503-46d7-bafb-f7fa790974a8", + "showTitle": false, + "title": "" + } + }, + "source": [ + "To produce embeddings for each chunk, we utilize both SynapseML and Azure OpenAI Service. By integrating the Azure OpenAI service with SynapseML, we can leverage the power of the Apache Spark distributed computing framework to process numerous prompts using the OpenAI service. This integration enables the SynapseML embedding client to generate embeddings in a distributed manner, enabling efficient processing of large volumes of data. If you're interested in applying large language models at a distributed scale using Azure OpenAI and Azure Synapse Analytics, you can refer to [this approach](https://microsoft.github.io/SynapseML/docs/features/cognitive_services/CognitiveServices%20-%20OpenAI/). For more detailed information on generating embeddings with Azure OpenAI, you can look [here]( https://learn.microsoft.com/en-us/azure/cognitive-services/openai/how-to/embeddings?tabs=console)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from synapse.ml.cognitive import OpenAIEmbedding\n", + "\n", + "embedding = (\n", + " OpenAIEmbedding()\n", + " .setSubscriptionKey(aoai_key)\n", + " .setDeploymentName(aoai_deployment_name_embeddings)\n", + " .setCustomServiceName(aoai_service_name)\n", + " .setTextCol(\"chunk\")\n", + " .setErrorCol(\"error\")\n", + " .setOutputCol(\"embeddings\")\n", + ")\n", + "\n", + "df_embeddings = embedding.transform(exploded_df)\n", + "\n", + "display(df_embeddings)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "e7d8e559-92bb-44bc-aee0-93b2490f38e2", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "### Step 6: Store the embeddings in Azure Cognitive Search Vector Store." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "6d3aaa47-818c-4eb2-b131-8d316380a0ab", + "showTitle": false, + "title": "" + } + }, + "source": [ + "[Azure Cognitive Search](https://learn.microsoft.com/en-us/azure/search/search-what-is-azure-search) offers a user-friendly interface for creating a vector database, as well as storing and retrieving data using vector search. If you're interested in learning more about vector search, you can look [here](https://github.com/Azure/cognitive-search-vector-pr/tree/main).\n", + "\n", + "\n", + "Storing data in the AzureCogSearch vector database involves two main steps:\n", + "\n", + "Creating the Index: The first step is to establish the index or schema of the vector database. This entails defining the structure and properties of the data that will be stored and indexed in the vector database.\n", + "\n", + "Adding Chunked Documents and Embeddings: The second step involves adding the chunked documents, along with their corresponding embeddings, to the vector datastore. This allows for efficient storage and retrieval of the data using vector search capabilities.\n", + "\n", + "By following these steps, you can effectively store your chunked documents and their associated embeddings in the AzureCogSearch vector database, enabling seamless retrieval of relevant information through vector search functionality." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Import necessary packages\n", + "import requests\n", + "import json\n", + "\n", + "EMBEDDING_LENGTH = (\n", + " 1536 # length of the embedding vector (OpenAI generates embeddings of length 1536)\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create Index for Cog Search with fields as id, content, and contentVector\n", + "# Note the datatypes for each field below\n", + "\n", + "url = f\"https://{cogsearch_name}.search.windows.net/indexes/{cogsearch_index_name}?api-version=2023-07-01-Preview\"\n", + "payload = json.dumps(\n", + " {\n", + " \"name\": cogsearch_index_name,\n", + " \"fields\": [\n", + " {\"name\": \"id\", \"type\": \"Edm.String\", \"key\": True, \"filterable\": True},\n", + " {\n", + " \"name\": \"content\",\n", + " \"type\": \"Edm.String\",\n", + " \"searchable\": True,\n", + " \"retrievable\": True,\n", + " },\n", + " {\n", + " \"name\": \"contentVector\",\n", + " \"type\": \"Collection(Edm.Single)\",\n", + " \"searchable\": True,\n", + " \"retrievable\": True,\n", + " \"dimensions\": EMBEDDING_LENGTH,\n", + " \"vectorSearchConfiguration\": \"vectorConfig\",\n", + " },\n", + " ],\n", + " \"vectorSearch\": {\n", + " \"algorithmConfigurations\": [\n", + " {\n", + " \"name\": \"vectorConfig\",\n", + " \"kind\": \"hnsw\",\n", + " }\n", + " ]\n", + " },\n", + " }\n", + ")\n", + "headers = {\"Content-Type\": \"application/json\", \"api-key\": cogsearch_api_key}\n", + "\n", + "response = requests.request(\"PUT\", url, headers=headers, data=payload)\n", + "print(response.status_code)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "07396763-74c3-4299-8976-e15e6d510d47", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "We need to use User Defined Function (UDF) through the udf() method in order to apply functions directly to the DataFrames and SQL databases in Python, without any need to individually register them." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Use Spark's UDF to insert entries to Cognitive Search\n", + "# This allows to run the code in a distributed fashion\n", + "\n", + "# Define a UDF using the @udf decorator\n", + "@udf(returnType=StringType())\n", + "def insertToCogSearch(idx, content, contentVector):\n", + " url = f\"https://{cogsearch_name}.search.windows.net/indexes/{cogsearch_index_name}/docs/index?api-version=2023-07-01-Preview\"\n", + "\n", + " payload = json.dumps(\n", + " {\n", + " \"value\": [\n", + " {\n", + " \"id\": str(idx),\n", + " \"content\": content,\n", + " \"contentVector\": contentVector.tolist(),\n", + " \"@search.action\": \"upload\",\n", + " },\n", + " ]\n", + " }\n", + " )\n", + " headers = {\n", + " \"Content-Type\": \"application/json\",\n", + " \"api-key\": cogsearch_api_key,\n", + " }\n", + "\n", + " response = requests.request(\"POST\", url, headers=headers, data=payload)\n", + " # response.text\n", + "\n", + " if response.status_code == 200 or response.status_code == 201:\n", + " return \"Success\"\n", + " else:\n", + " return \"Failure\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "42688e00-98fb-406e-9f19-c89fed3248ef", + "showTitle": false, + "title": "" + } + }, + "source": [ + "In the following, we apply UDF to different columns. Note that UDF also helps to add new columns to the DataFrame." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Apply the UDF on the different columns\n", + "from pyspark.sql.functions import monotonically_increasing_id\n", + "\n", + "df_embeddings = df_embeddings.withColumn(\n", + " \"idx\", monotonically_increasing_id()\n", + ") ## adding a column with id\n", + "df_embeddings = df_embeddings.withColumn(\n", + " \"errorCogSearch\",\n", + " insertToCogSearch(\n", + " df_embeddings[\"idx\"], df_embeddings[\"chunk\"], df_embeddings[\"embeddings\"]\n", + " ),\n", + ")\n", + "\n", + "# Show the transformed DataFrame\n", + "df_embeddings.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "17b3890f-4163-443c-929b-252d62a6c736", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "### Step 7: Ask a Question" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "749a6ec7-d6c9-4945-bc72-2deed94e712b", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "After processing the document, we can proceed to pose a question. We will use [SynapseML](https://microsoft.github.io/SynapseML/docs/features/cognitive_services/CognitiveServices%20-%20OpenAI%20Embedding/) to convert the user's question into an embedding and then utilize cosine similarity to retrieve the top K document chunks that closely match the user's question. It's worth mentioning that alternative similarity metrics can also be employed." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "userQuestion = \"What did the astronaut Edgar Mitchell call Earth?\"\n", + "retrieve_k = 2 # Retrieve the top 2 documents from vector database" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Ask a question and convert to embeddings\n", + "\n", + "\n", + "def genQuestionEmbedding(userQuestion):\n", + " # Convert question to embedding using synapseML\n", + " from synapse.ml.cognitive import OpenAIEmbedding\n", + "\n", + " df_ques = spark.createDataFrame([(userQuestion, 1)], [\"questions\", \"dummy\"])\n", + " embedding = (\n", + " OpenAIEmbedding()\n", + " .setSubscriptionKey(aoai_key)\n", + " .setDeploymentName(aoai_deployment_name_embeddings)\n", + " .setCustomServiceName(aoai_service_name)\n", + " .setTextCol(\"questions\")\n", + " .setErrorCol(\"errorQ\")\n", + " .setOutputCol(\"embeddings\")\n", + " )\n", + " df_ques_embeddings = embedding.transform(df_ques)\n", + " row = df_ques_embeddings.collect()[0]\n", + " questionEmbedding = row.embeddings.tolist()\n", + " return questionEmbedding\n", + "\n", + "\n", + "def retrieve_k_chunk(k, questionEmbedding):\n", + " # Retrieve the top K entries\n", + " url = f\"https://{cogsearch_name}.search.windows.net/indexes/{cogsearch_index_name}/docs/search?api-version=2023-07-01-Preview\"\n", + "\n", + " payload = json.dumps(\n", + " {\"vector\": {\"value\": questionEmbedding, \"fields\": \"contentVector\", \"k\": 2}}\n", + " )\n", + " headers = {\n", + " \"Content-Type\": \"application/json\",\n", + " \"api-key\": cogsearch_api_key,\n", + " }\n", + "\n", + " response = requests.request(\"POST\", url, headers=headers, data=payload)\n", + " output = json.loads(response.text)\n", + " print(response.status_code)\n", + " return output\n", + "\n", + "\n", + "# Generate embeddings for the question and retrieve the top k document chunks\n", + "questionEmbedding = genQuestionEmbedding(userQuestion)\n", + "output = retrieve_k_chunk(retrieve_k, questionEmbedding)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "79356cff-a236-4ef3-91f7-a601ee38d5f9", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "### Step 8: Respond to a User’s Question" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "06778fa1-303f-4a3b-814b-c0375df855c2", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "To provide a response to the user's question, we will utilize the [LangChain](https://python.langchain.com/en/latest/index.html) framework. With the LangChain framework we will augment the retrieved documents with respect to the user's question. Following this, we can request a response to the user's question from our framework." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Import necenssary libraries and setting up OpenAI\n", + "from langchain.llms import AzureOpenAI\n", + "from langchain import PromptTemplate\n", + "from langchain.chains import LLMChain\n", + "import openai\n", + "\n", + "openai.api_type = \"azure\"\n", + "openai.api_base = aoai_endpoint\n", + "openai.api_version = \"2022-12-01\"\n", + "openai.api_key = aoai_key" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "412d83cc-4fe9-455e-ad3d-7780ed262dac", + "showTitle": false, + "title": "" + } + }, + "source": [ + "We can now wrap up the Q&A journey by asking a question and checking the answer. You will see that Edgar Mitchell called Earth \"a sparkling blue and white jewel\"!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Define a Question Answering chain function using LangChain\n", + "def QA_chain_func():\n", + "\n", + " # Define llm model\n", + " llm = AzureOpenAI(\n", + " deployment_name=aoai_deployment_name_query,\n", + " model_name=aoai_model_name_query,\n", + " openai_api_key=aoai_key,\n", + " openai_api_version=\"2022-12-01\",\n", + " )\n", + "\n", + " # Write a preprompt with context and query as variables\n", + " template = \"\"\"\n", + " context :{context}\n", + " Answer the question based on the context above. If the\n", + " information to answer the question is not present in the given context then reply \"I don't know\".\n", + " Question: {query}\n", + " Answer: \"\"\"\n", + "\n", + " # Define a prompt template\n", + " prompt_template = PromptTemplate(\n", + " input_variables=[\"context\", \"query\"], template=template\n", + " )\n", + " # Define a chain\n", + " qa_chain = LLMChain(llm=llm, prompt=prompt_template)\n", + " return qa_chain\n", + "\n", + "\n", + "# Concatenate the content of retrieved documents\n", + "context = [i[\"content\"] for i in output[\"value\"]]\n", + "\n", + "# Make a Quesion Answer chain function and pass\n", + "qa_chain = QA_chain_func()\n", + "answer = qa_chain.run({\"context\": context, \"query\": userQuestion})\n", + "\n", + "print(answer)" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/website/sidebars.js b/website/sidebars.js index 22bbbab243..c3ea247681 100644 --- a/website/sidebars.js +++ b/website/sidebars.js @@ -39,6 +39,7 @@ module.exports = { "Explore Algorithms/AI Services/Quickstart - Analyze Text", "Explore Algorithms/AI Services/Quickstart - Creare a Visual Search Engine", "Explore Algorithms/AI Services/Quickstart - Create Audiobooks", + "Explore Algorithms/AI Services/Quickstart - Document Question and Answering with PDFs", "Explore Algorithms/AI Services/Quickstart - Flooding Risk", "Explore Algorithms/AI Services/Quickstart - Predictive Maintenance", ],