docs: fix variable formatting for QandA nb (#2033)

aydan-at-microsoft 2023-07-24 13:58:51 -07:00 committed by GitHub
Parent 072c9c9631
Commit 8be8fe3e61
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
1 changed file with 20 additions and 20 deletions


@@ -48,7 +48,7 @@
"\n",
"Well cover the following key steps:\n",
"\n",
"1. Preprocessing PDF Documents: Learn how to load the PDF documents into a Spark DataFrame, read the documents using the [Form Recognizer Service](https://azure.microsoft.com/en-us/products/form-recognizer/) in Azure AI Services, and use SynapseML to split the documents into chunks.\n",
"1. Preprocessing PDF Documents: Learn how to load the PDF documents into a Spark DataFrame, read the documents using the [Azure AI Document Intelligence](https://azure.microsoft.com/en-us/products/ai-services/ai-document-intelligence) in Azure AI Services, and use SynapseML to split the documents into chunks.\n",
"2. Embedding Generation and Storage: Learn how to generate embeddings for the chunks using SynapseML and [Azure OpenAI Services](https://azure.microsoft.com/en-us/products/cognitive-services/openai-service), store the embeddings in a vector store using [Azure Cognitive Search](https://azure.microsoft.com/en-us/products/search), and search the vector store to answer the users question.\n",
"3. Question Answering Pipeline: Learn how to retrieve relevant document based on the users question and provide the answer using [Langchain](https://python.langchain.com/en/latest/index.html#)."
]
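For context on step 1, the chunk-splitting that follows the document analysis could look like the sketch below. It assumes SynapseML's PageSplitter transformer and the "output_content" column that appears later in this diff; the length bounds are illustrative, not taken from the notebook.

from synapse.ml.featurize.text import PageSplitter

# Split each document's extracted text into chunks bounded by a minimum
# and maximum character length (values illustrative).
ps = (
    PageSplitter()
    .setInputCol("output_content")
    .setMaximumPageLength(4000)
    .setMinimumPageLength(3000)
    .setOutputCol("chunks")
)
splitted_df = ps.transform(analyzed_df)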
@@ -298,7 +298,7 @@
}
},
"source": [
"### Step 3: Read the documents using Azure AI Services Form Recognizer."
"### Step 3: Read the documents using Azure AI Document Intelligence."
]
},
{
@@ -335,7 +335,7 @@
"from synapse.ml.cognitive import AnalyzeDocument\n",
"from pyspark.sql.functions import col\n",
"\n",
"analyzeDocument = (\n",
"analyze_document = (\n",
" AnalyzeDocument()\n",
" .setPrebuiltModelId(\"prebuilt-layout\")\n",
" .setSubscriptionKey(ai_services_key)\n",
@@ -348,7 +348,7 @@
")\n",
"\n",
"analyzed_df = (\n",
" analyzeDocument.transform(df)\n",
" analyze_document.transform(df)\n",
" .withColumn(\"output_content\", col(\"result.analyzeResult.content\"))\n",
" .withColumn(\"paragraphs\", col(\"result.analyzeResult.paragraphs\"))\n",
").cache()"
@@ -701,7 +701,7 @@
"\n",
"# Define a UDF using the @udf decorator\n",
"@udf(returnType=StringType())\n",
"def insertToCogSearch(idx, content, contentVector):\n",
"def insert_to_cog_search(idx, content, contentVector):\n",
" url = f\"https://{cogsearch_name}.search.windows.net/indexes/{cogsearch_index_name}/docs/index?api-version=2023-07-01-Preview\"\n",
"\n",
" payload = json.dumps(\n",
@@ -762,7 +762,7 @@
") ## adding a column with id\n",
"df_embeddings = df_embeddings.withColumn(\n",
" \"errorCogSearch\",\n",
" insertToCogSearch(\n",
" insert_to_cog_search(\n",
" df_embeddings[\"idx\"], df_embeddings[\"chunk\"], df_embeddings[\"embeddings\"]\n",
" ),\n",
")\n",
@@ -791,7 +791,7 @@
}
},
"source": [
"### Step 7: Ask a Question"
"### Step 7: Ask a Question."
]
},
{
@@ -823,7 +823,7 @@
"metadata": {},
"outputs": [],
"source": [
"userQuestion = \"What did the astronaut Edgar Mitchell call Earth?\"\n",
"user_question = \"What did the astronaut Edgar Mitchell call Earth?\"\n",
"retrieve_k = 2 # Retrieve the top 2 documents from vector database"
]
},
@@ -836,11 +836,11 @@
"# Ask a question and convert to embeddings\n",
"\n",
"\n",
"def genQuestionEmbedding(userQuestion):\n",
"def gen_question_embedding(user_question):\n",
" # Convert question to embedding using synapseML\n",
" from synapse.ml.cognitive import OpenAIEmbedding\n",
"\n",
" df_ques = spark.createDataFrame([(userQuestion, 1)], [\"questions\", \"dummy\"])\n",
" df_ques = spark.createDataFrame([(user_question, 1)], [\"questions\", \"dummy\"])\n",
" embedding = (\n",
" OpenAIEmbedding()\n",
" .setSubscriptionKey(aoai_key)\n",
@@ -852,16 +852,16 @@
" )\n",
" df_ques_embeddings = embedding.transform(df_ques)\n",
" row = df_ques_embeddings.collect()[0]\n",
" questionEmbedding = row.embeddings.tolist()\n",
" return questionEmbedding\n",
" question_embedding = row.embeddings.tolist()\n",
" return question_embedding\n",
"\n",
"\n",
"def retrieve_k_chunk(k, questionEmbedding):\n",
"def retrieve_k_chunk(k, question_embedding):\n",
" # Retrieve the top K entries\n",
" url = f\"https://{cogsearch_name}.search.windows.net/indexes/{cogsearch_index_name}/docs/search?api-version=2023-07-01-Preview\"\n",
"\n",
" payload = json.dumps(\n",
" {\"vector\": {\"value\": questionEmbedding, \"fields\": \"contentVector\", \"k\": 2}}\n",
" {\"vector\": {\"value\": question_embedding, \"fields\": \"contentVector\", \"k\": 2}}\n",
" )\n",
" headers = {\n",
" \"Content-Type\": \"application/json\",\n",
@@ -875,8 +875,8 @@
"\n",
"\n",
"# Generate embeddings for the question and retrieve the top k document chunks\n",
"questionEmbedding = genQuestionEmbedding(userQuestion)\n",
"output = retrieve_k_chunk(retrieve_k, questionEmbedding)"
"question_embedding = gen_question_embedding(user_question)\n",
"output = retrieve_k_chunk(retrieve_k, question_embedding)"
]
},
{
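Since retrieve_k_chunk returns the parsed Cognitive Search REST response, the retrieved chunks can be inspected directly. This sketch relies only on the "value"/"content" layout that the context-building cell later in this diff also assumes:

# Print a short preview of each retrieved document chunk.
for doc in output["value"]:
    print(doc["content"][:100])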
@@ -899,7 +899,7 @@
}
},
"source": [
"### Step 8: Respond to a Users Question"
"### Step 8: Respond to a Users Question."
]
},
{
@@ -968,7 +968,7 @@
"outputs": [],
"source": [
"# Define a Question Answering chain function using LangChain\n",
"def QA_chain_func():\n",
"def qa_chain_func():\n",
"\n",
" # Define llm model\n",
" llm = AzureOpenAI(\n",
@@ -999,8 +999,8 @@
"context = [i[\"content\"] for i in output[\"value\"]]\n",
"\n",
"# Make a Quesion Answer chain function and pass\n",
"qa_chain = QA_chain_func()\n",
"answer = qa_chain.run({\"context\": context, \"query\": userQuestion})\n",
"qa_chain = qa_chain_func()\n",
"answer = qa_chain.run({\"context\": context, \"query\": user_question})\n",
"\n",
"print(answer)"
]
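The body of qa_chain_func is mostly elided in this diff. Under the LangChain APIs current at the time of this commit, a minimal sketch of such a chain might look like the following; the prompt text, deployment name, API version, and the aoai_service_name variable are all illustrative assumptions, not taken from the notebook:

from langchain import PromptTemplate
from langchain.chains import LLMChain
from langchain.llms import AzureOpenAI

def qa_chain_func():
    # Illustrative Azure OpenAI completion model; the real deployment
    # name, endpoint, and key come from the notebook's setup cells.
    llm = AzureOpenAI(
        deployment_name="text-davinci-003",  # assumed deployment
        openai_api_key=aoai_key,
        openai_api_base=f"https://{aoai_service_name}.openai.azure.com/",
        openai_api_version="2022-12-01",  # assumed API version
    )

    # Prompt that grounds the answer in the retrieved context.
    prompt = PromptTemplate(
        input_variables=["context", "query"],
        template="Answer the question using only the context below.\n"
        "Context: {context}\nQuestion: {query}\nAnswer:",
    )
    return LLMChain(llm=llm, prompt=prompt)

This matches the call pattern shown above: qa_chain.run({"context": context, "query": user_question}).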