зеркало из https://github.com/microsoft/SynapseML.git
docs: fix variable formatting for QandA nb (#2033)
This commit is contained in:
Родитель
072c9c9631
Коммит
8be8fe3e61
|
@ -48,7 +48,7 @@
|
|||
"\n",
|
||||
"We’ll cover the following key steps:\n",
|
||||
"\n",
|
||||
"1. Preprocessing PDF Documents: Learn how to load the PDF documents into a Spark DataFrame, read the documents using the [Form Recognizer Service](https://azure.microsoft.com/en-us/products/form-recognizer/) in Azure AI Services, and use SynapseML to split the documents into chunks.\n",
|
||||
"1. Preprocessing PDF Documents: Learn how to load the PDF documents into a Spark DataFrame, read the documents using the [Azure AI Document Intelligence](https://azure.microsoft.com/en-us/products/ai-services/ai-document-intelligence) in Azure AI Services, and use SynapseML to split the documents into chunks.\n",
|
||||
"2. Embedding Generation and Storage: Learn how to generate embeddings for the chunks using SynapseML and [Azure OpenAI Services](https://azure.microsoft.com/en-us/products/cognitive-services/openai-service), store the embeddings in a vector store using [Azure Cognitive Search](https://azure.microsoft.com/en-us/products/search), and search the vector store to answer the user’s question.\n",
|
||||
"3. Question Answering Pipeline: Learn how to retrieve relevant documents based on the user’s question and provide the answer using [LangChain](https://python.langchain.com/en/latest/index.html#)."
|
||||
]
|
||||
|
@ -298,7 +298,7 @@
|
|||
}
|
||||
},
|
||||
"source": [
|
||||
"### Step 3: Read the documents using Azure AI Services Form Recognizer."
|
||||
"### Step 3: Read the documents using Azure AI Document Intelligence."
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -335,7 +335,7 @@
|
|||
"from synapse.ml.cognitive import AnalyzeDocument\n",
|
||||
"from pyspark.sql.functions import col\n",
|
||||
"\n",
|
||||
"analyzeDocument = (\n",
|
||||
"analyze_document = (\n",
|
||||
" AnalyzeDocument()\n",
|
||||
" .setPrebuiltModelId(\"prebuilt-layout\")\n",
|
||||
" .setSubscriptionKey(ai_services_key)\n",
|
||||
|
@ -348,7 +348,7 @@
|
|||
")\n",
|
||||
"\n",
|
||||
"analyzed_df = (\n",
|
||||
" analyzeDocument.transform(df)\n",
|
||||
" analyze_document.transform(df)\n",
|
||||
" .withColumn(\"output_content\", col(\"result.analyzeResult.content\"))\n",
|
||||
" .withColumn(\"paragraphs\", col(\"result.analyzeResult.paragraphs\"))\n",
|
||||
").cache()"
|
||||
|
@ -701,7 +701,7 @@
|
|||
"\n",
|
||||
"# Define a UDF using the @udf decorator\n",
|
||||
"@udf(returnType=StringType())\n",
|
||||
"def insertToCogSearch(idx, content, contentVector):\n",
|
||||
"def insert_to_cog_search(idx, content, contentVector):\n",
|
||||
" url = f\"https://{cogsearch_name}.search.windows.net/indexes/{cogsearch_index_name}/docs/index?api-version=2023-07-01-Preview\"\n",
|
||||
"\n",
|
||||
" payload = json.dumps(\n",
|
||||
|
@ -762,7 +762,7 @@
|
|||
") ## adding a column with id\n",
|
||||
"df_embeddings = df_embeddings.withColumn(\n",
|
||||
" \"errorCogSearch\",\n",
|
||||
" insertToCogSearch(\n",
|
||||
" insert_to_cog_search(\n",
|
||||
" df_embeddings[\"idx\"], df_embeddings[\"chunk\"], df_embeddings[\"embeddings\"]\n",
|
||||
" ),\n",
|
||||
")\n",
|
||||
|
@ -791,7 +791,7 @@
|
|||
}
|
||||
},
|
||||
"source": [
|
||||
"### Step 7: Ask a Question"
|
||||
"### Step 7: Ask a Question."
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -823,7 +823,7 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"userQuestion = \"What did the astronaut Edgar Mitchell call Earth?\"\n",
|
||||
"user_question = \"What did the astronaut Edgar Mitchell call Earth?\"\n",
|
||||
"retrieve_k = 2 # Retrieve the top 2 documents from vector database"
|
||||
]
|
||||
},
|
||||
|
@ -836,11 +836,11 @@
|
|||
"# Ask a question and convert to embeddings\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def genQuestionEmbedding(userQuestion):\n",
|
||||
"def gen_question_embedding(user_question):\n",
|
||||
" # Convert question to embedding using synapseML\n",
|
||||
" from synapse.ml.cognitive import OpenAIEmbedding\n",
|
||||
"\n",
|
||||
" df_ques = spark.createDataFrame([(userQuestion, 1)], [\"questions\", \"dummy\"])\n",
|
||||
" df_ques = spark.createDataFrame([(user_question, 1)], [\"questions\", \"dummy\"])\n",
|
||||
" embedding = (\n",
|
||||
" OpenAIEmbedding()\n",
|
||||
" .setSubscriptionKey(aoai_key)\n",
|
||||
|
@ -852,16 +852,16 @@
|
|||
" )\n",
|
||||
" df_ques_embeddings = embedding.transform(df_ques)\n",
|
||||
" row = df_ques_embeddings.collect()[0]\n",
|
||||
" questionEmbedding = row.embeddings.tolist()\n",
|
||||
" return questionEmbedding\n",
|
||||
" question_embedding = row.embeddings.tolist()\n",
|
||||
" return question_embedding\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def retrieve_k_chunk(k, questionEmbedding):\n",
|
||||
"def retrieve_k_chunk(k, question_embedding):\n",
|
||||
" # Retrieve the top K entries\n",
|
||||
" url = f\"https://{cogsearch_name}.search.windows.net/indexes/{cogsearch_index_name}/docs/search?api-version=2023-07-01-Preview\"\n",
|
||||
"\n",
|
||||
" payload = json.dumps(\n",
|
||||
" {\"vector\": {\"value\": questionEmbedding, \"fields\": \"contentVector\", \"k\": 2}}\n",
|
||||
" {\"vector\": {\"value\": question_embedding, \"fields\": \"contentVector\", \"k\": 2}}\n",
|
||||
" )\n",
|
||||
" headers = {\n",
|
||||
" \"Content-Type\": \"application/json\",\n",
|
||||
|
@ -875,8 +875,8 @@
|
|||
"\n",
|
||||
"\n",
|
||||
"# Generate embeddings for the question and retrieve the top k document chunks\n",
|
||||
"questionEmbedding = genQuestionEmbedding(userQuestion)\n",
|
||||
"output = retrieve_k_chunk(retrieve_k, questionEmbedding)"
|
||||
"question_embedding = gen_question_embedding(user_question)\n",
|
||||
"output = retrieve_k_chunk(retrieve_k, question_embedding)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -899,7 +899,7 @@
|
|||
}
|
||||
},
|
||||
"source": [
|
||||
"### Step 8: Respond to a User’s Question"
|
||||
"### Step 8: Respond to a User’s Question."
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -968,7 +968,7 @@
|
|||
"outputs": [],
|
||||
"source": [
|
||||
"# Define a Question Answering chain function using LangChain\n",
|
||||
"def QA_chain_func():\n",
|
||||
"def qa_chain_func():\n",
|
||||
"\n",
|
||||
" # Define llm model\n",
|
||||
" llm = AzureOpenAI(\n",
|
||||
|
@ -999,8 +999,8 @@
|
|||
"context = [i[\"content\"] for i in output[\"value\"]]\n",
|
||||
"\n",
|
||||
"# Make a Question Answering chain and pass it the context and the user's question\n",
|
||||
"qa_chain = QA_chain_func()\n",
|
||||
"answer = qa_chain.run({\"context\": context, \"query\": userQuestion})\n",
|
||||
"qa_chain = qa_chain_func()\n",
|
||||
"answer = qa_chain.run({\"context\": context, \"query\": user_question})\n",
|
||||
"\n",
|
||||
"print(answer)"
|
||||
]
|
||||
|
|
Загрузка…
Ссылка в новой задаче