added code, docu and assets, initial commit

This commit is contained in:
nonstoptimm 2020-11-24 13:09:57 +01:00
Parent 630a76cbd5
Commit 3a028d069b
16 changed files: 930 additions and 10 deletions

6
.funcignore Normal file

@@ -0,0 +1,6 @@
.vscode/
MRC/__pycache__/
assets/
config.ini
config.sample.ini
README.md

46
.gitignore vendored Normal file

@@ -0,0 +1,46 @@
bin
obj
csx
.vs
edge
Publish
*.user
*.suo
*.cscfg
*.Cache
project.lock.json
/packages
/TestResults
/tools/NuGet.exe
/App_Data
/secrets
/data
.secrets
appsettings.json
local.settings.json
node_modules
dist
# Local python packages
.python_packages/
# Python Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# Models
models/

43
MRC/__init__.py Normal file

@@ -0,0 +1,43 @@
import logging
import json
import azure.functions as func
from . import bm25
from . import retriever
from . import reader
from . import helper
def main(req: func.HttpRequest) -> func.HttpResponse:
logging.info('Python HTTP trigger function processed a request.')
# Collect parameters
question, n_doc, treshold, tokenize, bm_n_doc = helper.get_config(req)
if question:
# Fetch relevant documents, if any
documents, sources = retriever.main(question, n_doc, treshold, tokenize)
# Apply BM25, if more than n documents
if documents and len(documents) > bm_n_doc:
documents, sources = bm25.main(question, documents, sources, bm_n_doc)
# Extract relevant answers, if any
if documents:
answers = reader.main(question, documents, sources)
else:
answers = []
# Format response
res = json.dumps(dict(
answers = answers,
counts = dict(
documents = len(documents),
answers = len(answers)
)
))
return func.HttpResponse(res, mimetype='application/json')
else:
return func.HttpResponse(
"This HTTP triggered function executed successfully. Pass a question in the query string or in the request body for a personalized response.",
status_code=200
)

183
MRC/bm25.py Normal file

@@ -0,0 +1,183 @@
import math
import logging
stopwords = set(["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"])
class BM25:
"""
Best Match 25.
Parameters
----------
k1 : float, default 1.5
b : float, default 0.75
Attributes
----------
tf_ : list[dict[str, int]]
Term Frequency per document. So [{'hi': 1}] means
the first document contains the term 'hi' 1 time.
df_ : dict[str, int]
Document Frequency per term. i.e. Number of documents in the
corpus that contains the term.
idf_ : dict[str, float]
Inverse Document Frequency per term.
doc_len_ : list[int]
Number of terms per document. So [3] means the first
document contains 3 terms.
corpus_ : list[list[str]]
The input corpus.
corpus_size_ : int
Number of documents in the corpus.
avg_doc_len_ : float
Average number of terms for documents in the corpus.
"""
def __init__(self, k1=1.5, b=0.75):
self.b = b
self.k1 = k1
def fit(self, corpus):
"""
Fit the various statistics that are required to calculate BM25 ranking
score using the corpus given.
Parameters
----------
corpus : list[list[str]]
Each element in the list represents a document, and each document
is a list of the terms.
Returns
-------
self
"""
tf = []
df = {}
idf = {}
doc_len = []
corpus_size = 0
for document in corpus:
corpus_size += 1
doc_len.append(len(document))
# compute tf (term frequency) per document
frequencies = {}
for term in document:
term_count = frequencies.get(term, 0) + 1
frequencies[term] = term_count
tf.append(frequencies)
# compute df (document frequency) per term
for term, _ in frequencies.items():
df_count = df.get(term, 0) + 1
df[term] = df_count
for term, freq in df.items():
idf[term] = math.log(1 + (corpus_size - freq + 0.5) / (freq + 0.5))
self.tf_ = tf
self.df_ = df
self.idf_ = idf
self.doc_len_ = doc_len
self.corpus_ = corpus
self.corpus_size_ = corpus_size
self.avg_doc_len_ = sum(doc_len) / corpus_size
return self
def search(self, query):
scores = [self._score(query, index) for index in range(self.corpus_size_)]
return scores
def _score(self, query, index):
score = 0.0
doc_len = self.doc_len_[index]
frequencies = self.tf_[index]
for term in query:
if term not in frequencies:
continue
freq = frequencies[term]
numerator = self.idf_[term] * freq * (self.k1 + 1)
denominator = freq + self.k1 * (1 - self.b + self.b * doc_len / self.avg_doc_len_)
score += (numerator / denominator)
return score
def preprocess_text(corpus):
''' Prepare text for BM25-compatibility '''
texts = [
[word for word in document.lower().split() if word not in stopwords]
for document in corpus
]
# Build a word count dictionary so we can remove words that appear only once
word_count_dict = {}
for text in texts:
for token in text:
word_count = word_count_dict.get(token, 0) + 1
word_count_dict[token] = word_count
texts = [[token for token in text if word_count_dict[token] > 1] for text in texts]
return texts
def main(query, corpus, sources, bm_n_doc):
logging.warning('Applying BM25 algorithm ...')
# Query our corpus to see which document is more relevant
query = [word for word in query.lower().split() if word not in stopwords]
# Preprocess text to remove stopwords and stuff
texts = preprocess_text(corpus)
# Fit and score texts on BM25
bm25 = BM25()
bm25.fit(texts)
scores = bm25.search(query)
# Sort by relevance score
bm25_ranked = sorted(zip(scores, corpus, sources), key = lambda x: x[0], reverse=True)
# Unpack the zipped lists
scores, corpus, sources = zip(*bm25_ranked[:bm_n_doc])
return list(corpus), list(sources)
if __name__ == '__main__':
# Test question + context
corpus = [
'Human machine interface for lab abc computer applications',
'A survey of user opinion of computer system response time',
'The EPS user interface management system',
'System and human system engineering testing of EPS',
'Relation of user perceived response time to error measurement',
'The generation of random binary unordered trees',
'The intersection graph of paths in trees',
'Graph minors IV Widths of trees and well quasi ordering',
'Graph minors A survey'
]
sources = [
{'metadata_storage_name': 'bla',
'document_id': 'bla',
'document_uri': 'bla',
'title': 'bla'},
{'metadata_storage_name': 'bla',
'document_id': 'bla',
'document_uri': 'bla',
'title': 'bla'},
{'metadata_storage_name': 'bla',
'document_id': 'bla',
'document_uri': 'bla',
'title': 'bla'},
{'metadata_storage_name': 'bla',
'document_id': 'bla',
'document_uri': 'bla',
'title': 'bla'},
{'metadata_storage_name': 'bla',
'document_id': 'bla',
'document_uri': 'bla',
'title': 'bla'}]
main("The intersection of graph survey and trees", corpus, sources)

20
MRC/function.json Normal file

@@ -0,0 +1,20 @@
{
"scriptFile": "__init__.py",
"bindings": [
{
"authLevel": "function",
"type": "httpTrigger",
"direction": "in",
"name": "req",
"methods": [
"get",
"post"
]
},
{
"type": "http",
"direction": "out",
"name": "$return"
}
]
}

101
MRC/helper.py Normal file

@@ -0,0 +1,101 @@
import logging
import sys
def main():
return None
def get_config(req):
'''
Get config and set parameters
'''
# Assuming that we receive the params via url
question = req.params.get('question')
n_doc = req.params.get('az_documents')
treshold = req.params.get('az_treshold')
tokenize = req.params.get('az_tokenize')
bm_n_doc = req.params.get('bm_ndoc')
    # If none of the parameters came via the URL, try the request body instead
    if not any([question, n_doc, treshold, tokenize, bm_n_doc]):
try:
req_body = req.get_json()
except ValueError:
pass
else:
question = req_body.get('question')
n_doc = req_body.get('az_documents')
treshold = req_body.get('az_treshold')
tokenize = req_body.get('az_tokenize')
bm_n_doc = req_body.get('bm_ndoc')
    # Now check whether we have everything - if some values are missing, set the defaults
if not all([question, n_doc, treshold, tokenize, bm_n_doc]):
logging.warning('Received one or multiple empty parameters, filling up with default values')
if not n_doc:
n_doc = 5
if not treshold:
treshold = 5
if not bm_n_doc:
bm_n_doc = 3
if str(tokenize).lower() == "false":
tokenize = False
else:
tokenize = True
    logging.warning(f'[INFO] - Working with the following parameters: n_doc: {n_doc}, treshold: {treshold}, bm_n_doc: {bm_n_doc}, tokenize: {tokenize}.')
return question, int(n_doc), int(treshold), tokenize, int(bm_n_doc)
def load_models(model_name_or_path):
'''
Load models, tokenizers and configuration
    Pick one from the models below and set it in reader.py:
- model_name_or_path = "deepset/bert-large-uncased-whole-word-masking-squad2"
- model_name_or_path = "deepset/roberta-base-squad2"
- model_name_or_path = "twmkn9/albert-base-v2-squad2"
- model_name_or_path = "distilbert-base-cased-distilled-squad"
'''
#
if model_name_or_path == "twmkn9/albert-base-v2-squad2":
from transformers import (
AlbertConfig,
AlbertForQuestionAnswering,
AlbertTokenizer,
squad_convert_examples_to_features
)
config_class, model_class, tokenizer_class = (
AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer)
elif model_name_or_path == "deepset/bert-large-uncased-whole-word-masking-squad2":
from transformers import (
BertConfig,
BertForQuestionAnswering,
BertTokenizer,
squad_convert_examples_to_features
)
config_class, model_class, tokenizer_class = (
BertConfig, BertForQuestionAnswering, BertTokenizer)
elif model_name_or_path == "distilbert-base-cased-distilled-squad":
from transformers import (
DistilBertConfig,
DistilBertForQuestionAnswering,
DistilBertTokenizer,
squad_convert_examples_to_features
)
config_class, model_class, tokenizer_class = (
DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)
elif model_name_or_path == "deepset/roberta-base-squad2":
from transformers import (
RobertaConfig,
RobertaForQuestionAnswering,
RobertaTokenizer,
squad_convert_examples_to_features
)
config_class, model_class, tokenizer_class = (
RobertaConfig, RobertaForQuestionAnswering, RobertaTokenizer)
else:
logging.error(f'Model {model_name_or_path} is not available!')
sys.exit()
logging.info(f'Loaded {model_name_or_path} ...')
return config_class, model_class, tokenizer_class, squad_convert_examples_to_features
if __name__ == "__main__":
main()

151
MRC/reader.py Normal file

@@ -0,0 +1,151 @@
import os
import logging
import torch
import time
from . import helper as he
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers.data.processors.squad import SquadResult, SquadV2Processor, SquadExample
from transformers.data.metrics.squad_metrics import compute_predictions_logits
# Define the model to use - see helper.py for the supported options
model_type = "bert" # may be bert, roberta, albert, distilbert, see helper.py
model_name_or_path = "deepset/bert-large-uncased-whole-word-masking-squad2"
# Set the flag for deployment
if os.path.exists(f'./models/{model_type}/config.json'):
# Load supported models
config_class, model_class, tokenizer_class, squad_convert_examples_to_features = he.load_models(model_name_or_path)
model_name_or_path = f'./models/{model_type}/'
logging.warning(f'[INFO] - Loading local model {model_type}.')
else:
# Load supported models
config_class, model_class, tokenizer_class, squad_convert_examples_to_features = he.load_models(model_name_or_path)
logging.warning(f'[INFO] - Loading remote model {model_type}.')
# Config
n_best_size = 1
max_answer_length = 50
do_lower_case = True
null_score_diff_threshold = 0.0
def to_list(tensor):
return tensor.detach().cpu().tolist()
# Setup model
config = config_class.from_pretrained(model_name_or_path)
tokenizer = tokenizer_class.from_pretrained(
model_name_or_path, do_lower_case=do_lower_case)
model = model_class.from_pretrained(model_name_or_path, config=config)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
processor = SquadV2Processor()
def run_prediction(question_text, context_texts):
"""
Setup function to compute predictions
"""
examples = []
for i, context_text in enumerate(context_texts):
example = SquadExample(
qas_id=str(i),
question_text=question_text,
context_text=context_text,
answer_text=None,
start_position_character=None,
title="Predict",
is_impossible=False,
answers=None,
)
examples.append(example)
features, dataset = squad_convert_examples_to_features(
examples=examples,
tokenizer=tokenizer,
max_seq_length=384,
doc_stride=128,
max_query_length=64,
is_training=False,
return_dataset="pt",
threads=1,
)
eval_sampler = SequentialSampler(dataset)
eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=10)
all_results = []
for batch in eval_dataloader:
model.eval()
batch = tuple(t.to(device) for t in batch)
with torch.no_grad():
inputs = {
"input_ids": batch[0],
"attention_mask": batch[1],
#"token_type_ids": batch[2], #TODO: had to comment this?
}
example_indices = batch[3]
outputs = model(**inputs)
for i, example_index in enumerate(example_indices):
eval_feature = features[example_index.item()]
unique_id = int(eval_feature.unique_id)
output = [to_list(output[i]) for output in outputs]
start_logits, end_logits = output
result = SquadResult(unique_id, start_logits, end_logits)
all_results.append(result)
# Output files are optional, may help debugging/learning locally
# output_prediction_file = "/tmp/predictions.json"
# output_nbest_file = "/tmp/nbest_predictions.json"
# output_null_log_odds_file = "/tmp/null_predictions.json"
predictions = compute_predictions_logits(
examples,
features,
all_results,
n_best_size,
max_answer_length, # TODO
do_lower_case,
None,
None,
None,
False, # verbose_logging
True, # version_2_with_negative
null_score_diff_threshold,
tokenizer,
)
return predictions
def main(question, documents, meta):
# Run method
_predictions = run_prediction(question, documents)
logging.info(_predictions)
predictions = []
for prediction, m in zip(_predictions, meta):
_prediction = _predictions[prediction]
if _prediction != "" and _prediction != "empty":
predictions.append(dict(
answer = _prediction,
title = m['title'],
metadata_storage_name = m['metadata_storage_name'],
document_id = m['document_id'],
document_uri = m['document_uri']
))
return predictions
if __name__ == '__main__':
# Test question + context
contexts = ["New Zealand (Māori: Aotearoa) is a sovereign island country in the southwestern Pacific Ocean. It has a total land area of 268,000 square kilometres (103,500 sq mi), and a population of 4.9 million. New Zealand's capital city is Wellington, and its most populous city is Auckland."]
question = "How many people live in New Zealand?"
    # meta must provide the keys that main() accesses
    print(main(question, contexts, [dict(title='bla', metadata_storage_name='bla', document_id='bla', document_uri='bla')]))

97
MRC/retriever.py Normal file

@@ -0,0 +1,97 @@
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
import logging
from nltk import sent_tokenize
import nltk.data
import nltk.downloader
# Prefer the punkt tokenizer bundled in ./models/nltk/, download it otherwise
nltk.data.path.append("./models/nltk/")
try:
    nltk.data.find('tokenizers/punkt')
    logging.info('Using punkt tokenizer from the local models folder.')
except LookupError:
    nltk.download('punkt')
# Local debugging
try:
import sys
import os
sys.path.append('./')
import configparser
config = configparser.ConfigParser()
config.read('config.ini')
service_name = config['search']['service_name']
index_name = config['search']['index_name']
api_key = config['search']['key']
logging.info('extracted keys from config')
except Exception as e:
logging.warning(f'{e}')
service_name = os.environ.get('service_name')
index_name = os.environ.get('index_name')
api_key = os.environ.get('api_key')
logging.info('extracted keys from environment')
# Extract relevant text parts
def extract_relevant_text(highlight, paragraphs, tokenize):
''' Extracts relevant sentences around highlights to get optimized text and content
This ensures that we have better data for the MRC scoring, as not only partial sentences are extracted'''
if not isinstance(paragraphs, list):
paragraphs = [paragraphs]
# For every highlight, look up the paragraphs
for paragraph in paragraphs:
# If the highlight is in the respective paragraph, tokenize the sentences
if tokenize:
highlight = sent_tokenize(highlight)[0]
if highlight in paragraph:
sentences = sent_tokenize(paragraph)
# Loop through every sentence and detect the highlight
for sentence in sentences:
# As soon as we found the highlight, extract the sentences around it
if highlight in sentence:
                    index = sentences.index(sentence)
                    # sent_tokenize keeps the sentence punctuation, so join with a plain space
                    if index == 0:
                        return " ".join(sentences[:2])
                    elif index == len(sentences) - 1:
                        return " ".join(sentences[-2:])
                    else:
                        # take the previous, the matching, and the following sentence
                        return " ".join(sentences[index-1:index+2])
# Request to Cognitive Search
def main(question, n=5, threshold=5, tokenize=True):
# Create a SearchClient to send queries
endpoint = f"https://{service_name}.search.windows.net/"
credential = AzureKeyCredential(api_key)
client = SearchClient(endpoint=endpoint,
index_name=index_name,
credential=credential)
# Get top n results
results = client.search(search_text=question, search_fields='paragraphs', highlight_fields='paragraphs-3', select='paragraphs,metadata_storage_name,document_id,document_uri,title', top=n)
documents = []
meta = []
for result in results:
if result['@search.score'] > threshold:
if len(result['paragraphs']) == 0:
continue
search_highlights = list(dict.fromkeys(result['@search.highlights']['paragraphs']))
for highlight in search_highlights:
h = highlight.replace('<em>', '').replace('</em>', '')
relevant_text = extract_relevant_text(h, result['paragraphs'], tokenize)
if relevant_text is None:
logging.info("Text is none, continue")
continue
# We only proceed with the first 500 characters due to MRC limitations
elif relevant_text[:500] in documents:
logging.info("Text already exists, continue")
continue
documents.append(relevant_text[:500])
meta.append(dict(
metadata_storage_name = result['metadata_storage_name'],
document_id = result['document_id'],
document_uri = result['document_uri'],
title = result['title']
))
return documents, meta
if __name__ == "__main__":
main("Who is the Boss of Microsoft?")

178
README.md

@@ -1,17 +1,176 @@
# Machine Reading Comprehension with Azure Cognitive Search
This documentation describes the technical setup of a Machine Reading Comprehension (MRC) component in an Azure Function, how to deploy it, and how to use it. The setup requires the combination with Azure Cognitive Search, so that documents retrieved from the index for an input question can be processed using an MRC model. The component has been implemented in Python 3.7.
## Definition of Machine Reading Comprehension (MRC)
- Machine Reading Comprehension (MRC), the ability to read and understand unstructured text and then answer questions about it, remains a challenging task for computers
- MRC is a growing field of research due to its potential in various enterprise applications, as well as the availability of MRC benchmarking datasets (MS MARCO, SQuAD, NewsQA, etc.)
## Tech Stack
The technical components of the MRC function look as follows:
![Tech Stack](assets/tech_stack.png)
### Azure Functions
The basis of the MRC is an [Azure Functions](https://docs.microsoft.com/en-us/azure/azure-functions/functions-overview) component, a serverless infrastructure type offered on Microsoft Azure. It acts as a web service and can be triggered as a REST API. It is available for multiple runtimes such as C#, JavaScript, and Python; in this case we use Python and recommend the Python 3.7 runtime. The minimum scale level should be an App Service plan, or ideally a Premium plan; the respective plans are described [here](https://docs.microsoft.com/en-us/azure/azure-functions/functions-scale). Depending on your scale, a Function, a storage account, and an App Service plan are deployed in your subscription when the resource is initially created.
### Python
The additionally required Python packages are `azure-functions`, `torch`, `torchvision`, `transformers`, `azure-search-documents`, and `nltk`. They are listed with their respective version numbers in the `requirements.txt`, which is used automatically to install the dependencies when the service is deployed.
### Azure Search
- [Azure Cognitive Search](https://azure.microsoft.com/en-us/services/search/) is the only cloud search service with built-in AI capabilities that enrich all types of information to easily identify and explore relevant content at scale. Formerly known as Azure Search, it uses the same integrated Microsoft natural language stack that Bing and Office have used for more than a decade, as well as AI services across vision, language, and speech. It lets you spend more time innovating and less time maintaining a complex cloud search solution.
- In this setup, it is used as the document retriever to look up documents that may be related to an incoming question
### Transformers
- Transformers provides thousands of pretrained models to perform tasks on text such as classification, information extraction, question answering, summarization, translation, and text generation in 100+ languages. Its aim is to make cutting-edge NLP easier to use for everyone.
- Transformers provides APIs to quickly download and use those pretrained models on a given text, fine-tune them on your own datasets, and then share them with the community on the Hugging Face model hub. At the same time, each Python module defining an architecture can be used standalone and modified to enable quick research experiments.
- Transformers is backed by the two most popular deep learning libraries, PyTorch and TensorFlow, with a seamless integration between them, allowing you to train your models with one and then load them for inference with the other.
- In our setup, Transformers models are used to read the text returned by Azure Cognitive Search after the filtering mechanisms of our pipeline.
- The pre-trained models are available on [huggingface.co](https://huggingface.co/models). Usually they are downloaded directly by the script; however, this may be restricted depending on the company policy, as the models are hosted on AWS.
- We have configured the function so that the model is uploaded during deployment, so no model download has to be performed at runtime (in case access to AWS resources is restricted)
## Architecture
The MRC component receives a request from the API backend after a user types in a specific question they want to look up. The Function sends a request to Azure Cognitive Search, asking for a collection of documents matching the incoming question. After receiving the results, the documents are processed and checked for potential answers. The Function responds with a JSON containing the potential answers.
## Code and Process Flow
We followed a modular approach for the code, so that every script can be executed separately to try out changes autonomously. The files in the root folder are predominantly skeleton files for the Azure Function. The main function itself is stored in the `MRC` subfolder. When a request is received by the endpoint of the Azure Function, it is handled as illustrated below:
![MRC Flow](assets/mrc_flow.PNG)
## `__init__.py`
- Orchestration script of the `MRC` function
- Gets triggered when the function receives a request, accepts the request body, and orchestrates the calls of all other submodules
- After document processing, the response JSON is packed by `__init__.py`
## `helper.py`
- The helper provides functions that are required at runtime without being executable standalone
- It also helps to keep `__init__.py` clean
- The `get_config()` function of the helper is called by `__init__.py` to collect the parameters and fill up missing ones with default values (a sketch of the resulting parameter set follows below)
- The `load_models()` function is called by `reader.py` to load the pre-configured model. It has been separated from the reader to allow extensions for further models without affecting the reader itself
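For illustration, a request that carries only a question would be filled up with the defaults as follows (a minimal sketch of the tuple returned by `get_config()`; the question string is made up):
```python
# Hypothetical request body: {"question": "Who is the CEO of Microsoft?"}
# get_config(req) returns the question plus the defaulted optional parameters:
question, n_doc, treshold, tokenize, bm_n_doc = (
    "Who is the CEO of Microsoft?", 5, 5, True, 3
)
```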
## `retriever.py`
- The document retriever is the first processing component that gets triggered
- Based on the input question, Azure Search is queried and returns `n` (optional query parameter, default == 5) documents that match the respective string
- The results are filtered based on the `az_treshold` (optional input parameter; a TF-IDF relevance score, default == 5)
- Afterwards, the highlighted text is extracted, and with the help of the `extract_relevant_text()` function the relevant text from the paragraph around the highlight is extracted
- For the extraction of relevant text, a sentence tokenizer (NLTK's punkt) is used, which is activated by default via the parameter `tokenize` (optional parameter, default == True) and can be set to `False` (not recommended)
- We always cut at 500 characters, as this is the maximum length of a document to be processed by the MRC
- If that part of the document does not exist in our list yet, we add it as a potential candidate to be "read" by the MRC component later on
- The collection of documents with their respective metadata is returned to the orchestrator (a call sketch follows below)
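For local debugging, a direct call of the retriever might look as follows (a minimal sketch; it assumes a valid `config.ini` with Azure Search credentials, and the question string is made up):
```python
from MRC import retriever

# Returns up to n text snippets (cut at 500 characters) plus their metadata
documents, meta = retriever.main("Who is the CEO of Microsoft?", n=5, threshold=5, tokenize=True)
for document, m in zip(documents, meta):
    print(m['title'], '->', document[:80])
```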
## `bm25.py`
- In the next step, we run an [Okapi Best Match 25 (BM25)](https://en.wikipedia.org/wiki/Okapi_BM25) ranking on the collection of documents, if the document collection is larger than `bm_n_doc` (optional input parameter, default == 3)
- BM25 is a ranking algorithm that filters the documents again for the best match to an input question, in order to reduce the processing time for a large number of documents. This way we only process the most relevant documents; however, relevant documents might also be filtered out. If you no longer want to use it as a preprocessing step, just comment it out in `__init__.py`, as below, and redeploy:
```python
# Apply BM25, if more than n documents
# if documents and len(documents) > bm_n_doc:
# documents, sources = bm25.main(question, documents, sources, bm_n_doc)
```
- The results are scored, and the `bm_n_doc` documents with the highest scores are returned and passed back to the orchestrator (a usage sketch of the `BM25` class follows below)
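For illustration, the `BM25` class in `bm25.py` can also be used on its own (a minimal sketch; the toy corpus is assumed to be lowercased and stopword-filtered already, as `preprocess_text()` would do):
```python
from MRC.bm25 import BM25

# Toy corpus: each document is a list of terms
texts = [
    ['graph', 'minors', 'survey'],
    ['graph', 'paths', 'trees'],
    ['random', 'binary', 'trees'],
]
ranker = BM25(k1=1.5, b=0.75)
ranker.fit(texts)
scores = ranker.search(['graph', 'survey'])  # one relevance score per document
print(scores)
```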
## `reader.py`
- This is the stage where the actual MRC happens. The pre-selected documents are run through a pre-trained transformer model that is specialized in MRC
- The models are loaded as described in `helper.py`, as there are multiple options for which model to put into production. We recommend the active `deepset/bert-large-uncased-whole-word-masking-squad2`. You can expect significantly shorter processing times with the other models; however, they do not perform as well at reading documents and also show difficulties with case-sensitive documents
- The reader checks whether a model is available in the `models/bert` subfolder. If no model can be found, a download is initiated from the Hugging Face API, provided the security policies of the VNet allow it
- With the help of tokenization and some parameters set in `reader.py`, all pre-selected documents are processed and checked for potential answers to a question
- If there is a match, the reader returns the respective answers with their document metadata to the orchestrator (a local call sketch follows below)
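A local call of the reader, mirroring the test block at the bottom of `reader.py`, might look as follows (a sketch with placeholder metadata; note that importing `reader` already loads the model):
```python
from MRC import reader

contexts = ["New Zealand's capital city is Wellington, and its most populous city is Auckland."]
meta = [dict(title='bla', metadata_storage_name='bla',
             document_id='bla', document_uri='bla')]
print(reader.main("What is the capital of New Zealand?", contexts, meta))
```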
The files and folders listed in `.funcignore` are not deployed to the function, as they are either not needed or not wanted in the infrastructure component, e.g. because they only serve local development.
## Request and Responses
The section below shows example requests and responses: how to trigger the function, how to pass values to the processing pipeline, and what the return looks like.
### Request
The request below shows the minimal setup with the only mandatory parameter.
The link to the function usually looks as follows:<br>
`https://just-mrc-test.azurewebsites.net/api/MRC?code=abcde312321hnIeA812345556blablablablablabla==`<br>
It follows the structure:<br>
`https://{function resource name}.azurewebsites.net/api/{function name}?code={access code}==`
You can either send a request body or append the parameters to the query string; however, we recommend using the request body as shown below:
```json
{
"question": "Who is the CEO of Microsoft?"
}
```
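A minimal Python client for this request might look as follows (a sketch; the URL and code are placeholders matching the structure shown above):
```python
import requests

# Placeholder endpoint - insert your function name and access code
url = "https://{function resource name}.azurewebsites.net/api/MRC"
params = {"code": "{access code}"}
body = {"question": "Who is the CEO of Microsoft?"}

response = requests.post(url, params=params, json=body)
print(response.json())
```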
If you want to pass further optional parameters to tweak the results, the body looks as follows:
```json
{
"question": "Who is the CEO of Microsoft?",
"az_documents": 8,
"az_treshold": 10,
"az_tokenize": False,
"bm_ndoc": 8
}
```
As described above, their meaning and default values are as follows:
```json
{
"question": "Who is the CEO of Microsoft?",
"az_documents": 5, # documents to be requested from the Azure Search
"az_treshold": 5, # minimum TF-IDF relevance score to accept the document from Azure Search
"az_tokenize": True, # tokenize sentences to extract paragraphs
"bm_ndoc": 3 # Amount of documents to be returned by BM25
}
```
Increasing `az_documents` and `bm_ndoc` may lead to a larger document corpus with a greater likelihood of finding a match; however, it may also increase the processing time significantly. Decreasing `az_treshold` amplifies this effect. Deactivating the sentence tokenizer via `az_tokenize` may lead to smaller text chunks, which may reduce the recognition quality.
### Response
The service returns a JSON as below in case a match has been found. There may be multiple entries in the `answers` section. The service also returns references to the document `title`, the `metadata_storage_name`, as well as the `document_id` and `document_uri`.
```json
{
"answers": [
{
"answer": "Satya Nadella",
"title": "Leadership - Stories",
"metadata_storage_name": "senior-leaders.html",
"document_id": "https://news.microsoft.com/leadership/?section=senior-leaders",
"document_uri": "https://news.microsoft.com/leadership/?section=senior-leaders"
}
],
"counts": {
"documents": 3,
"answers": 1
}
}
```
## Operations
The section below describes the frameworks to be installed locally before you can get started with testing, debugging, and deploying the service.
### Local Installation
First, you have to install/set up the following components:
1. PowerShell
- [Azure Command Line Interface (CLI)](https://docs.microsoft.com/de-de/cli/azure/install-azure-cli), command line tools for Azure using PowerShell
- [Azure Functions Core Tools](https://docs.microsoft.com/de-de/azure/azure-functions/functions-run-local?tabs=windows%2Ccsharp%2Cbash#v2), download for your local runtime environment, e.g. as `.exe` -> _v3.x: Windows 64-Bit_
- A restart is highly recommended, or even required, after installing these components; otherwise you will run into issues!
2. Python 3.7
- From the [Windows Store](https://www.microsoft.com/en-us/p/python-37/9nj46sx7x90p?activetab=pivot:overviewtab) or from [Python](https://www.python.org/downloads/release/python-379/); make sure you install `pip` and add Python to the `PATH` variable during the installation
3. Postman
- Framework to test endpoints, download it [here](https://www.postman.com/downloads/)
### Testing and Debugging
1. Get your code from Azure DevOps or the respective repository (07 - SCIO MRC)
1. Create a virtual environment <u>outside</u> of the project folder: `python -m venv venv`
1. Activate the virtual environment: `source venv/bin/activate` (Linux) or `venv/Scripts/activate` (Windows); use `deactivate` to deactivate it
1. Install the requirements: `pip install -r requirements.txt`
1. Set your keys (only for local development and debugging) in the `config.ini` (they are needed for the Azure Search request)
1. After the initial installation, run `func init` in the root folder (verifies that all necessary skeleton files for the function are available)
1. For debugging and local testing, open a separate PowerShell window and execute `func host start --verbose` in the root folder of the function. This enables you to make code changes during runtime without the function shutting down completely when there is an issue
1. Use [Postman](https://www.postman.com/downloads/) for testing the endpoints using the localhost request of this [collection](assets/MRC-Requests.postman_collection.json), or use a short script as sketched below
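Instead of Postman, a quick test against the locally running function host can also be scripted (a sketch; the port is the `func host start` default used in the Postman collection):
```python
import requests

# Local endpoint exposed by `func host start`
response = requests.post("http://localhost:7071/api/MRC",
                         json={"question": "Who is the CEO of Microsoft?"})
print(response.status_code, response.json())
```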
### Deployment to Azure
1. Open your PowerShell
1. Activate your environment, if you haven't before: <br>
`source venv/bin/activate` (Linux) or `venv/Scripts/activate` (Windows)
1. Log in to your Azure account: `az login` (a browser window opens where you may have to sign in to Azure)
1. Execute the command:<br>
`func azure functionapp publish [insert your function name] --remote build`
1. Wait until the deployment is finished
1. Use [Postman](https://www.postman.com/downloads/) for testing the endpoints using this [collection](assets/MRC-Requests.postman_collection.json)
## Contributing
This project welcomes contributions and suggestions. Most contributions require you to agree to a
Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com.
@@ -25,7 +184,6 @@ For more information see the [Code of Conduct FAQ](https://opensource.microsoft.
contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
## Trademarks
This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft
trademarks or logos is subject to and must follow
[Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general).

81
assets/MRC-Requests.postman_collection.json Normal file

@@ -0,0 +1,81 @@
{
"info": {
"_postman_id": "95608c8f-13b1-44e2-b2ca-c3fc68ba30c1",
"name": "MRC Requests",
"schema": "https://schema.getpostman.com/json/collection/v2.1.0/collection.json"
},
"item": [
{
"name": "MRC API",
"protocolProfileBehavior": {
"disableBodyPruning": true
},
"request": {
"method": "GET",
"header": [],
"body": {
"mode": "raw",
"raw": "{\r\n \"question\": \"Who is the CEO if Microsoft?\"\r\n}",
"options": {
"raw": {
"language": "json"
}
}
},
"url": {
"raw": "http://localhost:7071/api/MRC",
"protocol": "http",
"host": [
"localhost"
],
"port": "7071",
"path": [
"api",
"MRC"
]
}
},
"response": []
},
{
"name": "MRC API Deployed",
"protocolProfileBehavior": {
"disableBodyPruning": true
},
"request": {
"method": "GET",
"header": [],
"body": {
"mode": "raw",
"raw": "{\r\n \"question\": \"Who is the CEO if Microsoft?\"\r\n}",
"options": {
"raw": {
"language": "json"
}
}
},
"url": {
"raw": "https://mrc-test.azurewebsites.net/api/MRC?code=insert function code here",
"protocol": "https",
"host": [
"mrc-test",
"azurewebsites",
"net"
],
"path": [
"api",
"MRC"
],
"query": [
{
"key": "code",
"value": "insert function code here"
}
]
}
},
"response": []
}
],
"protocolProfileBehavior": {}
}

Binary data
assets/mrc_flow.PNG Normal file

Binary file not shown. Size: 102 KiB

Binary data
assets/tech_stack.png Normal file

Binary file not shown. Size: 61 KiB

4
config.sample.ini Normal file

@@ -0,0 +1,4 @@
[search]
service_name=
index_name=
key=

15
host.json Normal file

@@ -0,0 +1,15 @@
{
"version": "2.0",
"logging": {
"applicationInsights": {
"samplingSettings": {
"isEnabled": true,
"excludedTypes": "Request"
}
}
},
"extensionBundle": {
"id": "Microsoft.Azure.Functions.ExtensionBundle",
"version": "[1.*, 2.0.0)"
}
}

4
proxies.json Normal file

@@ -0,0 +1,4 @@
{
"$schema": "http://json.schemastore.org/proxies",
"proxies": {}
}

11
requirements.txt Normal file

@@ -0,0 +1,11 @@
# DO NOT include azure-functions-worker in this file
# The Python Worker is managed by Azure Functions platform
# Manually managing azure-functions-worker may cause unexpected issues
azure-functions==1.4.0
#torch===1.6.0 -f https://download.pytorch.org/whl/torch_stable.html
torch===1.5.1
torchvision===0.6.1
#torchvision===0.7.0 -f https://download.pytorch.org/whl/torch_stable.html
transformers==3.4.0
azure-search-documents==11.0.0
nltk==3.5