added code, docu and assets, initial commit
This commit is contained in:
Родитель
630a76cbd5
Коммит
3a028d069b
|
@ -0,0 +1,6 @@
|
|||
.vscode/
|
||||
MRC/__pycache__/
|
||||
assets/
|
||||
config.ini
|
||||
config.sample.ini
|
||||
README.md
|
|
@ -0,0 +1,46 @@
|
|||
bin
|
||||
obj
|
||||
csx
|
||||
.vs
|
||||
edge
|
||||
Publish
|
||||
|
||||
*.user
|
||||
*.suo
|
||||
*.cscfg
|
||||
*.Cache
|
||||
project.lock.json
|
||||
|
||||
/packages
|
||||
/TestResults
|
||||
|
||||
/tools/NuGet.exe
|
||||
/App_Data
|
||||
/secrets
|
||||
/data
|
||||
.secrets
|
||||
appsettings.json
|
||||
local.settings.json
|
||||
|
||||
node_modules
|
||||
dist
|
||||
|
||||
# Local python packages
|
||||
.python_packages/
|
||||
|
||||
# Python Environments
|
||||
.env
|
||||
.venv
|
||||
env/
|
||||
venv/
|
||||
ENV/
|
||||
env.bak/
|
||||
venv.bak/
|
||||
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
# Models
|
||||
models/
|
|
@ -0,0 +1,43 @@
|
|||
import logging
|
||||
import json
|
||||
import azure.functions as func
|
||||
|
||||
from . import bm25
|
||||
from . import retriever
|
||||
from . import reader
|
||||
from . import helper
|
||||
|
||||
def main(req: func.HttpRequest) -> func.HttpResponse:
    """HTTP entry point: retrieve documents for a question and extract answers.

    The request parameters are parsed by ``helper.get_config``. When a question
    is present, documents are fetched via ``retriever.main``, optionally reduced
    with ``bm25.main`` and passed to ``reader.main``. The JSON response carries
    the ``answers`` list and a ``counts`` object (``documents``, ``answers``).
    Without a question, a plain-text usage hint is returned.
    """
    logging.info('Python HTTP trigger function processed a request.')

    # Collect parameters
    question, n_doc, treshold, tokenize, bm_n_doc = helper.get_config(req)

    # No question, nothing to answer: reply with the usage hint
    if not question:
        return func.HttpResponse(
            "This HTTP triggered function executed successfully. Pass a question in the query string or in the request body for a personalized response.",
            status_code=200
        )

    # Fetch relevant documents, if any
    documents, sources = retriever.main(question, n_doc, treshold, tokenize)

    # Apply BM25 only when more documents came back than should be kept
    if documents and len(documents) > bm_n_doc:
        documents, sources = bm25.main(question, documents, sources, bm_n_doc)

    # Extract relevant answers, if any
    answers = reader.main(question, documents, sources) if documents else []

    # Format response
    payload = {
        "answers": answers,
        "counts": {
            "documents": len(documents),
            "answers": len(answers),
        },
    }
    return func.HttpResponse(json.dumps(payload), mimetype='application/json')
|
|
@ -0,0 +1,183 @@
|
|||
import math
|
||||
import logging
|
||||
|
||||
# English stop words that are dropped from queries and documents before BM25 scoring
stopwords = {
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves",
    "you", "your", "yours", "yourself", "yourselves",
    "he", "him", "his", "himself", "she", "her", "hers", "herself",
    "it", "its", "itself", "they", "them", "their", "theirs", "themselves",
    "what", "which", "who", "whom", "this", "that", "these", "those",
    "am", "is", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "having", "do", "does", "did", "doing",
    "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while",
    "of", "at", "by", "for", "with", "about", "against", "between", "into",
    "through", "during", "before", "after", "above", "below",
    "to", "from", "up", "down", "in", "out", "on", "off", "over", "under",
    "again", "further", "then", "once", "here", "there", "when", "where", "why", "how",
    "all", "any", "both", "each", "few", "more", "most", "other", "some", "such",
    "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very",
    "s", "t", "can", "will", "just", "don", "should", "now",
}
|
||||
|
||||
class BM25:
    """
    Best Match 25.

    Parameters
    ----------
    k1 : float, default 1.5
        Weight of the term-frequency part in the score denominator.

    b : float, default 0.75
        Weight of the document-length normalisation (document length
        relative to the average document length).

    Attributes
    ----------
    tf_ : list[dict[str, int]]
        Term Frequency per document. So [{'hi': 1}] means
        the first document contains the term 'hi' 1 time.

    df_ : dict[str, int]
        Document Frequency per term. i.e. Number of documents in the
        corpus that contains the term.

    idf_ : dict[str, float]
        Inverse Document Frequency per term.

    doc_len_ : list[int]
        Number of terms per document. So [3] means the first
        document contains 3 terms.

    corpus_ : list[list[str]]
        The input corpus.

    corpus_size_ : int
        Number of documents in the corpus.

    avg_doc_len_ : float
        Average number of terms for documents in the corpus
        (0.0 for an empty corpus).
    """

    def __init__(self, k1=1.5, b=0.75):
        self.b = b
        self.k1 = k1

    def fit(self, corpus):
        """
        Fit the various statistics that are required to calculate BM25 ranking
        score using the corpus given.

        Parameters
        ----------
        corpus : list[list[str]]
            Each element in the list represents a document, and each document
            is a list of the terms. An empty corpus is accepted.

        Returns
        -------
        self
        """
        tf = []
        df = {}
        doc_len = []
        for document in corpus:
            doc_len.append(len(document))

            # compute tf (term frequency) per document
            frequencies = {}
            for term in document:
                frequencies[term] = frequencies.get(term, 0) + 1
            tf.append(frequencies)

            # compute df (document frequency) per term: each document counts once
            for term in frequencies:
                df[term] = df.get(term, 0) + 1

        corpus_size = len(tf)
        idf = {
            term: math.log(1 + (corpus_size - freq + 0.5) / (freq + 0.5))
            for term, freq in df.items()
        }

        self.tf_ = tf
        self.df_ = df
        self.idf_ = idf
        self.doc_len_ = doc_len
        self.corpus_ = corpus
        self.corpus_size_ = corpus_size
        # An empty corpus has no average length; avoid dividing by zero
        self.avg_doc_len_ = sum(doc_len) / corpus_size if corpus_size else 0.0
        return self

    def search(self, query):
        """
        Score every fitted document against the query.

        Parameters
        ----------
        query : list[str]
            The query terms.

        Returns
        -------
        list[float]
            One score per document, in corpus order.
        """
        return [self._score(query, index) for index in range(self.corpus_size_)]

    def _score(self, query, index):
        """Return the BM25 score of the document at ``index`` for ``query``."""
        score = 0.0

        doc_len = self.doc_len_[index]
        frequencies = self.tf_[index]
        for term in query:
            # Terms that do not occur in the document contribute nothing
            if term not in frequencies:
                continue

            freq = frequencies[term]
            numerator = self.idf_[term] * freq * (self.k1 + 1)
            denominator = freq + self.k1 * (1 - self.b + self.b * doc_len / self.avg_doc_len_)
            score += (numerator / denominator)

        return score
|
||||
|
||||
def preprocess_text(corpus):
    ''' Prepare text for BM25-compatibility

    Every document is lower-cased, split on whitespace and stripped of stop words.
    Tokens that occur only once across the whole corpus are removed afterwards.
    Returns one token list per input document.
    '''
    tokenized = []
    occurrences = {}
    for document in corpus:
        tokens = [word for word in document.lower().split() if word not in stopwords]
        tokenized.append(tokens)
        # Corpus-wide count, needed to drop tokens that appear only once
        for token in tokens:
            occurrences[token] = occurrences.get(token, 0) + 1

    return [
        [token for token in tokens if occurrences[token] > 1]
        for tokens in tokenized
    ]
|
||||
|
||||
def main(query, corpus, sources, bm_n_doc):
    ''' Rank the documents with BM25 and keep the best ones

    query: question as plain string
    corpus: list of document texts
    sources: list of metadata entries, paired positionally with corpus
    bm_n_doc: number of top ranked documents to keep

    Returns two lists (documents, sources), ordered by descending BM25 score.
    Both are empty when there is nothing to rank or bm_n_doc is 0.
    '''
    logging.warning('Applying BM25 algorithm ...')
    # Nothing to rank
    if not corpus or not sources:
        return [], []
    # Query our corpus to see which document is more relevant
    query = [word for word in query.lower().split() if word not in stopwords]
    # Preprocess text to remove stopwords and stuff
    texts = preprocess_text(corpus)
    # Fit and score texts on BM25
    bm25 = BM25()
    bm25.fit(texts)
    scores = bm25.search(query)
    # Sort by relevance score
    bm25_ranked = sorted(zip(scores, corpus, sources), key=lambda x: x[0], reverse=True)
    # Split the top entries back into separate lists; unlike zip(*...) unpacking
    # this also works when the selection is empty
    top_ranked = bm25_ranked[:bm_n_doc]
    documents = [document for _, document, _ in top_ranked]
    top_sources = [source for _, _, source in top_ranked]
    return documents, top_sources
|
||||
|
||||
if __name__ == '__main__':
    # Test question + context
    corpus = [
        'Human machine interface for lab abc computer applications',
        'A survey of user opinion of computer system response time',
        'The EPS user interface management system',
        'System and human system engineering testing of EPS',
        'Relation of user perceived response time to error measurement',
        'The generation of random binary unordered trees',
        'The intersection graph of paths in trees',
        'Graph minors IV Widths of trees and well quasi ordering',
        'Graph minors A survey'
    ]
    # One metadata record per document: main() pairs documents and sources by
    # position, so a shorter list would silently drop documents
    sources = [
        {'metadata_storage_name': 'bla',
         'document_id': 'bla',
         'document_uri': 'bla',
         'title': 'bla'}
        for _ in corpus
    ]
    # main() requires the number of documents to keep as fourth argument
    print(main("The intersection of graph survey and trees", corpus, sources, 3))
|
|
@ -0,0 +1,20 @@
|
|||
{
|
||||
"scriptFile": "__init__.py",
|
||||
"bindings": [
|
||||
{
|
||||
"authLevel": "function",
|
||||
"type": "httpTrigger",
|
||||
"direction": "in",
|
||||
"name": "req",
|
||||
"methods": [
|
||||
"get",
|
||||
"post"
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "http",
|
||||
"direction": "out",
|
||||
"name": "$return"
|
||||
}
|
||||
]
|
||||
}
|
|
@ -0,0 +1,101 @@
|
|||
import logging
|
||||
import sys
|
||||
|
||||
def main():
    ''' Placeholder entry point, does nothing and returns None '''
    return
|
||||
|
||||
def get_config(req):
    '''
    Get config and set parameters

    Reads question, az_documents, az_treshold, az_tokenize and bm_ndoc from the
    query string. Only if none of them is present there, the JSON request body
    is read instead. Missing numeric values fall back to defaults
    (az_documents: 5, az_treshold: 5, bm_ndoc: 3).

    Returns the tuple (question, n_doc, treshold, tokenize, bm_n_doc) where the
    three numeric values are converted with int() and tokenize is a bool that is
    False only if "false" (any casing) or the boolean False was passed.
    Raises ValueError if a numeric parameter cannot be converted to int.
    '''
    names = ('question', 'az_documents', 'az_treshold', 'az_tokenize', 'bm_ndoc')
    # Assuming that we receive the params via url
    question, n_doc, treshold, tokenize, bm_n_doc = [req.params.get(name) for name in names]
    # If the url carried none of the parameters, we try the request body
    if not any([question, n_doc, treshold, tokenize, bm_n_doc]):
        try:
            req_body = req.get_json()
        except ValueError:
            pass
        else:
            # Only a JSON object can carry named parameters
            if isinstance(req_body, dict):
                question, n_doc, treshold, tokenize, bm_n_doc = [req_body.get(name) for name in names]
    # Now we check whether we have everything - if we have some missings, we just set the defaults
    if not all([question, n_doc, treshold, tokenize, bm_n_doc]):
        logging.warning('Received one or multiple empty parameters, filling up with default values')
    if not n_doc:
        n_doc = 5
    if not treshold:
        treshold = 5
    if not bm_n_doc:
        bm_n_doc = 3
    # Always normalise the flag to a bool: a raw "false" string is truthy and
    # would otherwise switch tokenization on when all parameters are supplied
    tokenize = str(tokenize).lower() != "false"
    logging.warning(f'[INFO] - Working with following parameters: n_doc: {n_doc}, treshold: {treshold}, bm_n_doc: {bm_n_doc}, tokenize: {tokenize}.')
    return question, int(n_doc), int(treshold), tokenize, int(bm_n_doc)
|
||||
|
||||
def load_models(model_name_or_path):
    '''
    Load models, tokenizers and configuration
    Pick one from the models below and set it in reader.py:
    - model_name_or_path = "deepset/bert-large-uncased-whole-word-masking-squad2"
    - model_name_or_path = "deepset/roberta-base-squad2"
    - model_name_or_path = "twmkn9/albert-base-v2-squad2"
    - model_name_or_path = "distilbert-base-cased-distilled-squad"

    Returns the tuple (config_class, model_class, tokenizer_class,
    squad_convert_examples_to_features) taken from the transformers package.
    Logs an error and calls sys.exit() for any other model name.
    '''
    # Supported model name -> names of its (config, model, tokenizer) classes
    supported = {
        "twmkn9/albert-base-v2-squad2": (
            "AlbertConfig", "AlbertForQuestionAnswering", "AlbertTokenizer"),
        "deepset/bert-large-uncased-whole-word-masking-squad2": (
            "BertConfig", "BertForQuestionAnswering", "BertTokenizer"),
        "distilbert-base-cased-distilled-squad": (
            "DistilBertConfig", "DistilBertForQuestionAnswering", "DistilBertTokenizer"),
        "deepset/roberta-base-squad2": (
            "RobertaConfig", "RobertaForQuestionAnswering", "RobertaTokenizer"),
    }
    if model_name_or_path not in supported:
        logging.error(f'Model {model_name_or_path} is not available!')
        sys.exit()

    # transformers is only imported once a supported model was requested
    import transformers
    from transformers import squad_convert_examples_to_features

    config_name, model_name, tokenizer_name = supported[model_name_or_path]
    config_class = getattr(transformers, config_name)
    model_class = getattr(transformers, model_name)
    tokenizer_class = getattr(transformers, tokenizer_name)
    logging.info(f'Loaded {model_name_or_path} ...')
    return config_class, model_class, tokenizer_class, squad_convert_examples_to_features
|
||||
|
||||
# Running the helper module directly only invokes the placeholder main()
if __name__ == "__main__":
    main()
|
|
@ -0,0 +1,151 @@
|
|||
import os
|
||||
import logging
|
||||
import torch
|
||||
import time
|
||||
from . import helper as he
|
||||
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
|
||||
|
||||
from transformers.data.processors.squad import SquadResult, SquadV2Processor, SquadExample
|
||||
from transformers.data.metrics.squad_metrics import compute_predictions_logits
|
||||
|
||||
# Model selection - change both values to switch models, see helper.load_models
# for the supported names
model_type = "bert" # may be bert, roberta, albert, distilbert, see helper.py
model_name_or_path = "deepset/bert-large-uncased-whole-word-masking-squad2"

# Load supported models (the classes are resolved from the model name, no matter
# where the weights are read from afterwards)
config_class, model_class, tokenizer_class, squad_convert_examples_to_features = he.load_models(model_name_or_path)

# Set the flag for deployment: use the local copy under ./models/ if it exists
if os.path.exists(f'./models/{model_type}/config.json'):
    model_name_or_path = f'./models/{model_type}/'
    logging.warning(f'[INFO] - Loading local model {model_type}.')
else:
    logging.warning(f'[INFO] - Loading remote model {model_type}.')

# Config
n_best_size = 1
max_answer_length = 50
do_lower_case = True
null_score_diff_threshold = 0.0
|
||||
|
||||
def to_list(tensor):
    """Return the tensor's values as a plain (nested) Python list."""
    on_cpu = tensor.detach().cpu()
    return on_cpu.tolist()
|
||||
|
||||
# Setup model: configuration, tokenizer and weights all come from model_name_or_path
config = config_class.from_pretrained(model_name_or_path)
tokenizer = tokenizer_class.from_pretrained(model_name_or_path, do_lower_case=do_lower_case)
model = model_class.from_pretrained(model_name_or_path, config=config)

# Run on the GPU when one is available, otherwise on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

processor = SquadV2Processor()
|
||||
|
||||
def run_prediction(question_text, context_texts):
    """
    Compute one answer prediction per context for the given question.

    question_text: the question as plain string
    context_texts: list of context strings; the position of each context
        (as string) becomes the id of its prediction

    Returns the result of compute_predictions_logits, a mapping from that id
    to the predicted answer text. An empty dict is returned for no contexts.
    """
    # Nothing to predict on
    if not context_texts:
        return {}

    examples = [
        SquadExample(
            qas_id=str(i),
            question_text=question_text,
            context_text=context_text,
            answer_text=None,
            start_position_character=None,
            title="Predict",
            is_impossible=False,
            answers=None,
        )
        for i, context_text in enumerate(context_texts)
    ]

    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=384,
        doc_stride=128,
        max_query_length=64,
        is_training=False,
        return_dataset="pt",
        threads=1,
    )

    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=10)

    # Segment ids separate question and context for models such as BERT and
    # ALBERT. The model types listed here do not take them, so they are only
    # passed for the others (same rule as the transformers run_squad example).
    use_token_type_ids = model_type not in ("xlm", "roberta", "distilbert", "camembert", "bart")

    all_results = []

    # Inference mode only has to be switched on once, not per batch
    model.eval()

    for batch in eval_dataloader:
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
            }
            if use_token_type_ids:
                inputs["token_type_ids"] = batch[2]

            example_indices = batch[3]

            outputs = model(**inputs)

            for i, example_index in enumerate(example_indices):
                eval_feature = features[example_index.item()]
                unique_id = int(eval_feature.unique_id)

                # Pick the i-th row of every output tensor (start and end logits)
                output = [to_list(output[i]) for output in outputs]

                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)
                all_results.append(result)

    # Output files are optional, may help debugging/learning locally
    # output_prediction_file = "/tmp/predictions.json"
    # output_nbest_file = "/tmp/nbest_predictions.json"
    # output_null_log_odds_file = "/tmp/null_predictions.json"

    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        n_best_size,
        max_answer_length,
        do_lower_case,
        None,  # output_prediction_file
        None,  # output_nbest_file
        None,  # output_null_log_odds_file
        False,  # verbose_logging
        True,  # version_2_with_negative
        null_score_diff_threshold,
        tokenizer,
    )

    return predictions
|
||||
|
||||
def main(question, documents, meta):
    """
    Predict an answer per document and attach the document metadata.

    question: the question as plain string
    documents: list of context texts
    meta: list of dicts with the keys title, metadata_storage_name, document_id
        and document_uri, paired positionally with the predictions

    Returns a list of dicts (answer plus metadata); predictions that are ""
    or "empty" are left out.
    """
    # Run method
    _predictions = run_prediction(question, documents)
    logging.info(_predictions)

    predictions = []
    for prediction_id, m in zip(_predictions, meta):
        answer = _predictions[prediction_id]
        # Skip documents for which no answer was found
        if answer in ("", "empty"):
            continue
        predictions.append({
            "answer": answer,
            "title": m['title'],
            "metadata_storage_name": m['metadata_storage_name'],
            "document_id": m['document_id'],
            "document_uri": m['document_uri'],
        })
    return predictions
|
||||
|
||||
if __name__ == '__main__':
    # Test question + context
    contexts = ["New Zealand (Māori: Aotearoa) is a sovereign island country in the southwestern Pacific Ocean. It has a total land area of 268,000 square kilometres (103,500 sq mi), and a population of 4.9 million. New Zealand's capital city is Wellington, and its most populous city is Auckland."]
    question = "How many people live in New Zealand?"
    # main() reads these four keys from every metadata entry, so a plain string
    # would fail as soon as an answer is found
    meta = [{
        'title': 'New Zealand',
        'metadata_storage_name': 'test',
        'document_id': 'test',
        'document_uri': 'test',
    }]
    print(main(question, contexts, meta))
|
|
@ -0,0 +1,97 @@
|
|||
from azure.core.credentials import AzureKeyCredential
|
||||
from azure.search.documents import SearchClient
|
||||
import logging
|
||||
from nltk import sent_tokenize
|
||||
import nltk.data
|
||||
import nltk.downloader
|
||||
|
||||
# Make a copy of the NLTK data shipped in ./models/nltk/ discoverable
nltk.data.path.append("./models/nltk/")
try:
    # Appending the path never fails, so we have to look the tokenizer up to
    # know whether it is really available
    nltk.data.find('tokenizers/punkt')
    logging.info('Importing nltk from local, specified folder.')
except LookupError:
    # Not available yet: download it and verify that it can be found now
    nltk.download('punkt')
    nltk.data.find('tokenizers/punkt')

# Local debugging
import sys
import os
import configparser
sys.path.append('./')
try:
    # Read the search settings from config.ini, if present
    config = configparser.ConfigParser()
    config.read('config.ini')
    service_name = config['search']['service_name']
    index_name = config['search']['index_name']
    api_key = config['search']['key']
    logging.info('extracted keys from config')
except Exception as e:
    # No usable config.ini (e.g. missing file or section): use the environment
    logging.warning(f'{e}')
    service_name = os.environ.get('service_name')
    index_name = os.environ.get('index_name')
    api_key = os.environ.get('api_key')
    logging.info('extracted keys from environment')
|
||||
|
||||
# Extract relevant text parts
|
||||
def extract_relevant_text(highlight, paragraphs, tokenize):
    ''' Extracts relevant sentences around highlights to get optimized text and content
    This ensures that we have better data for the MRC scoring, as not only partial sentences are extracted

    highlight: highlighted text snippet (without markup)
    paragraphs: a paragraph or a list of paragraphs to look the highlight up in
    tokenize: if True, only the first sentence of the highlight is searched for

    Returns two neighbouring sentences (or the only one) as a single string,
    or None if the highlight is not found in any sentence.
    '''
    if not isinstance(paragraphs, list):
        paragraphs = [paragraphs]
    # Reduce the highlight to its first sentence once, before any lookup
    if tokenize:
        highlight_sentences = sent_tokenize(highlight)
        if not highlight_sentences:
            # Empty highlight, nothing to look for
            return None
        highlight = highlight_sentences[0]
    # For every highlight, look up the paragraphs
    for paragraph in paragraphs:
        # Only tokenize paragraphs that contain the highlight
        if highlight not in paragraph:
            continue
        sentences = sent_tokenize(paragraph)
        # Loop through every sentence and detect the highlight
        for position, sentence in enumerate(sentences):
            if highlight not in sentence:
                continue
            # As soon as we found the highlight, extract the sentences around it
            if position == 0:
                selected = sentences[:2]
            elif position == len(sentences) - 1:
                selected = sentences[-2:]
            else:
                # NOTE(review): takes the previous and the matching sentence, the
                # following one is not included - confirm that this is intended
                selected = sentences[position - 1:position + 1]
            # The sentences keep their own punctuation, so a blank is enough to join them
            return " ".join(selected)
    return None
|
||||
|
||||
# Request to Cognitive Search
|
||||
def main(question, n=5, threshold=5, tokenize=True):
    ''' Query the search index and collect text passages for the question

    question: search text
    n: number of results requested from the index
    threshold: results with a search score not above this value are skipped
    tokenize: passed on to extract_relevant_text

    Returns (documents, meta): the unique passages (cut to 500 characters) and,
    per passage, a dict with metadata_storage_name, document_id, document_uri
    and title of the result it was taken from.
    '''
    # Create a SearchClient to send queries
    endpoint = f"https://{service_name}.search.windows.net/"
    credential = AzureKeyCredential(api_key)
    client = SearchClient(endpoint=endpoint,
                          index_name=index_name,
                          credential=credential)

    # Get top n results
    results = client.search(search_text=question, search_fields='paragraphs', highlight_fields='paragraphs-3', select='paragraphs,metadata_storage_name,document_id,document_uri,title', top=n)
    documents = []
    meta = []
    for result in results:
        if not result['@search.score'] > threshold:
            continue
        # Skip results without any paragraph text (empty or missing)
        if not result['paragraphs']:
            continue
        # A result may come without highlights; treat that as "no highlights"
        highlights = (result.get('@search.highlights') or {}).get('paragraphs') or []
        # Drop duplicate highlights while keeping their order
        search_highlights = list(dict.fromkeys(highlights))
        for highlight in search_highlights:
            h = highlight.replace('<em>', '').replace('</em>', '')
            relevant_text = extract_relevant_text(h, result['paragraphs'], tokenize)
            if relevant_text is None:
                logging.info("Text is none, continue")
                continue
            # We only proceed with the first 500 characters due to MRC limitations
            passage = relevant_text[:500]
            if passage in documents:
                logging.info("Text already exists, continue")
                continue
            documents.append(passage)
            meta.append(dict(
                metadata_storage_name = result['metadata_storage_name'],
                document_id = result['document_id'],
                document_uri = result['document_uri'],
                title = result['title']
            ))
    return documents, meta
|
||||
|
||||
# Manual smoke test: sends a sample question to the configured search index
if __name__ == "__main__":
    main("Who is the Boss of Microsoft?")
|
178
README.md
178
README.md
|
@ -1,17 +1,176 @@
|
|||
# Project
|
||||
# Machine Reading Comprehension with Azure Cognitive Search
|
||||
This documentation describes the technical setup of a Machine Reading Comprehension (MRC) component in an Azure Function, how to deploy it, and how to use it. In this setup, we require the combination with Azure Cognitive Search, so that documents retrieved from the index given an input question can be processed using an MRC model. The component has been implemented in Python 3.7.
|
||||
|
||||
> This repo has been populated by an initial template to help get you started. Please
|
||||
> make sure to update the content to build a great experience for community-building.
|
||||
## Definition of Machine Reading Comprehension (MRC)
|
||||
- Machine Reading Comprehension (MRC), or the ability to read and understand unstructured text and then answer questions about it remains a challenging task for computers
|
||||
- MRC is a growing field of research due to its potential in various enterprise applications, as well as the availability of MRC benchmarking datasets (MSMARCO, SQuAD, NewsQA, etc.)
|
||||
|
||||
As the maintainer of this project, please make a few updates:
|
||||
## Tech Stack
|
||||
The technical components of the MRC function look as follows:
|
||||
![Tech Stack](assets/tech_stack.png)
|
||||
|
||||
- Improving this README.MD file to provide a great experience
|
||||
- Updating SUPPORT.MD with content about this project's support experience
|
||||
- Understanding the security reporting process in SECURITY.MD
|
||||
- Remove this section from the README
|
||||
### Azure Functions
|
||||
The basis of the MRC is an [Azure Functions](https://docs.microsoft.com/en-us/azure/azure-functions/functions-overview) component, which is a serverless infrastructure type offered on Microsoft Azure. It acts as a web service and can be triggered via a REST API. It is available for multiple languages such as C#, JavaScript and Python - in this case we use Python and recommend using the Python 3.7 runtime. The minimum scale level should be either an App Service plan or, ideally, a Premium plan. The description of the respective plans can be found [here](https://docs.microsoft.com/en-us/azure/azure-functions/functions-scale). Depending on your scale, a Function, a storage account and an App Service Plan are deployed in your subscription when creating the resource initially.
|
||||
|
||||
### Python
|
||||
The Python packages that are additionally required are `azure-functions`, `torch`, `torchvision`, `transformers`, `azure-search-documents` and `nltk`. Further, they are listed in the `requirements.txt` with the respective version numbers. When deploying the service, it will automatically be used for transferring and installing it.
|
||||
|
||||
### Azure Search
|
||||
- [Azure Cognitive Search](https://azure.microsoft.com/en-us/services/search/) is the only cloud search service with built-in AI capabilities that enrich all types of information to easily identify and explore relevant content at scale. Formerly known as Azure Search, it uses the same integrated Microsoft natural language stack that Bing and Office have used for more than a decade, and AI services across vision, language, and speech. Spend more time innovating and less time maintaining a complex cloud search solution.
|
||||
- In this setup, it is used as document retriever to look up for documents which may be related to an incoming question
|
||||
|
||||
### Transformers
|
||||
- Transformers provides thousands of pretrained models to perform tasks on texts such as classification, information extraction, question answering, summarization, translation, text generation, etc in 100+ languages. Its aim is to make cutting-edge NLP easier to use for everyone.
|
||||
- Transformers provides APIs to quickly download and use those pretrained models on a given text, fine-tune them on your own datasets then share them with the community on our model hub. At the same time, each Python module defining an architecture can be used as a standalone and modified to enable quick research experiments.
|
||||
- Transformers is backed by the two most popular deep learning libraries, PyTorch and TensorFlow, with a seamless integration between them, allowing you to train your models with one then load it for inference with the other.
|
||||
- In our setup, Transformers are used to search the text returned by Azure Functions and the filtering mechanisms of our pipeline.
|
||||
- The pre-trained models are available on [huggingface.co](https://huggingface.co/models). Usually, they are downloaded directly by the script, however this may be restricted depending on the company policy, as the models are hosted on AWS.
|
||||
- We have configured the function the way that we upload the model during the deployment, so no further model download has to be done during the deployment (limitation of accessing AWS resources exist)
|
||||
|
||||
## Architecture
|
||||
The MRC component receives a request from the API backend after a user types in a specific question he or she wants to look up. The Function sends a request to the Azure Cognitive Search, asking for a collection of documents matching the incoming question. After receiving the results, the documents are processed and checked for potential answers. The Function responds with a JSON containing the potential answers.
|
||||
|
||||
## Code and Process Flow
|
||||
We followed a modular approach for the code, so that every script may be executed separately to try out changes autonomously. The files in the root folder are predominantly skeleton files for the Azure Function. The main function itself is stored in the `MRC` subfolder. When a request is received by the endpoint of the Azure Function, it is handled as illustrated below:
|
||||
|
||||
![MRC Flow](assets/mrc_flow.PNG)
|
||||
|
||||
## `__init__.py`
|
||||
- Orchestration script of the `MRC` function
|
||||
- Gets triggered after the function receives a request, accepts the request body and orchestrates the call of all other submodules
|
||||
- After document processing, a response json is packed by the `__init__.py`
|
||||
|
||||
## `helper.py`
|
||||
- The helper provides functions that are required for the runtime without being executable standalone
|
||||
- Further, it allows keeping the `__init__.py` clean
|
||||
- The `get_config()`-function of the helper gets called by the `__init__.py` to collect the parameters and eventually fill up missing ones with standard values
|
||||
- The `load_model()`-function is called by the `reader.py` to load the pre-configured model. It has been separated from the reader to allow extensions for further models without affecting the reader itself
|
||||
|
||||
## `retriever.py`
|
||||
- The document retriever is the first processing component that gets triggered
|
||||
- Based on the input question, the Azure Search gets queried and returns `n` (optional query parameter, default == 5) documents that match to the respective string
|
||||
- The results are filtered based on the `treshold` (optional input parameter, treshold is TF/IDF-score, default == 5)
|
||||
- Afterwards, the highlighted text is extracted and with help of `extract_relevant_text()`-function, the relevant text from the paragraph around the highlight is extracted
|
||||
- For the extraction of relevant text, a custom sentence tokenizer is used, which is activated by default based on the parameter `tokenize` (optional parameter, default == True), which can be set to `False` (not recommended)
|
||||
- We always have to make a cut at 500 characters, as it is the maximum length of a document to be processed by the MRC
|
||||
- If the part of the document does not exist in our list yet, we add it as potential candidate to be "read" by the MRC component later on
|
||||
- The collection of documents with their respective metadata is returned to the orchestrator
|
||||
|
||||
## `bm25.py`
|
||||
- In the next step, we run a [Okapi Best Match 25](https://en.wikipedia.org/wiki/Okapi_BM25) on the collection of documents, if the document collection is larger than `bm_n_doc` (optional input parameter, default == 3)
|
||||
- It is a ranking algorithm that filters the documents again for the best match to an input question in order to reduce the processing time of a large amount of documents. This way, we only process the most relevant documents. However, relevant documents might also be filtered out. If you do not want to use it as a preprocessing step any more, just comment it out in `__init__.py`, as below, and redeploy:
|
||||
```python
|
||||
# Apply BM25, if more than n documents
|
||||
# if documents and len(documents) > bm_n_doc:
|
||||
# documents, sources = bm25.main(question, documents, sources, bm_n_doc)
|
||||
```
|
||||
- The results are scored and the `bm_n_doc` documents with the highest scores get returned and passed to the orchestrator again
|
||||
|
||||
## `reader.py`
|
||||
- This is the stage where the actual MRC happens. The pre-selected documents get applied on a pre-trained transformer model, which is specialized in MRC
|
||||
- The models get loaded as described in `helper.py`, as there are multiple choice options which model to put into production. We recommend going for the active `deepset/bert-large-uncased-whole-word-masking-squad2`. You can expect significantly shorter processing times with the other models, however they do not perform so well in reading documents and also show difficulties with case-sensitive documents
|
||||
- The reader checks, whether a model is available in the `models/bert`-subfolder. If no model can be found, a download is initiated from the huggingface API, given the security policies from the VNET allow it
|
||||
- With help of tokenization and some parameters, which are set in `reader.py`, all the pre-selected documents get processed and checked for potential answers to a question
|
||||
- If there is a match, the reader returns the respective documents to the orchestrator
|
||||
|
||||
The files and folders listed in `.funcignore` are not deployed to the function as they are either not needed or not wanted in the infrastructure component, e.g. as they are just for local development.
|
||||
|
||||
## Request and Responses
|
||||
The section below shows example inputs and outputs how to trigger the function and pass values to the processing pipeline as well as the return.
|
||||
|
||||
### Request
|
||||
The request below shows the minimal setup of a request with the mandatory parameters.
|
||||
|
||||
The link to the function usually looks as following:<br>
|
||||
`https://just-mrc-test.azurewebsites.net/api/MRC?code=abcde312321hnIeA812345556blablablablablabla==`<br>
|
||||
It is based on the structure:<br>
|
||||
`https://{function resource name}.azurewebsites.net/api/{function name}?code={access code}==`
|
||||
|
||||
You can either send a request body or the address string, however we recommend using the request body as shown below:
|
||||
```json
|
||||
{
|
||||
"question": "Who is the CEO of Microsoft?"
|
||||
}
|
||||
```
|
||||
|
||||
If you want to pass further optional parameters to tweak the results a bit, look at the following body:
|
||||
```json
|
||||
{
|
||||
"question": "Who is the CEO of Microsoft?",
|
||||
"az_documents": 8,
|
||||
"az_treshold": 10,
|
||||
"az_tokenize": false,
|
||||
"bm_ndoc": 8
|
||||
}
|
||||
```
|
||||
|
||||
As already described above, their meaning and default values would look as following:
|
||||
```json
|
||||
{
|
||||
"question": "Who is the CEO of Microsoft?",
|
||||
"az_documents": 5, # documents to be requested from the Azure Search
|
||||
"az_treshold": 5, # minimum TF-IDF relevance score to accept the document from Azure Search
|
||||
"az_tokenize": True, # tokenize sentences to extract paragraphs
|
||||
"bm_ndoc": 3 # Amount of documents to be returned by BM25
|
||||
}
|
||||
```
|
||||
An increase of `az_documents` and `bm_ndoc` may lead to a larger document corpus with a greater likeliness to find a match, however they may increase the processing time significantly. Decreasing `az_treshold` may also increase this effect. Deactivating the sentence tokenizer `az_tokenize` may lead to smaller text chunks, which may reduce the recognition quality.
|
||||
|
||||
### Response
|
||||
The service will return a json as below, in case a match has been found. There may be multiple entries in the `answers` section. The service also returns references to the document `title`, the `metadata_storage_name` as well as `document_id` and `document_uri`.
|
||||
```json
|
||||
{
|
||||
"answers": [
|
||||
{
|
||||
"answer": "Satya Nadella",
|
||||
"title": "Leadership - Stories",
|
||||
"metadata_storage_name": "senior-leaders.html",
|
||||
"document_id": "https://news.microsoft.com/leadership/?section=senior-leaders",
|
||||
"document_uri": "https://news.microsoft.com/leadership/?section=senior-leaders"
|
||||
}
|
||||
],
|
||||
"counts": {
|
||||
"documents": 3,
|
||||
"answers": 1
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Operations
|
||||
The section below describes the frameworks to be installed locally before you can get started testing, debugging and deploying the service.
|
||||
|
||||
### Local Installation
|
||||
First, you have to install/set up following components:
|
||||
1. PowerShell
|
||||
- [Azure Command Line Interface (CLI)](https://docs.microsoft.com/de-de/cli/azure/install-azure-cli), command line tools for Azure using PowerShell
|
||||
- [Azure Functions Core Tools](https://docs.microsoft.com/de-de/azure/azure-functions/functions-run-local?tabs=windows%2Ccsharp%2Cbash#v2), download for your local runtime environment, e.g. as `.exe` -> _v3.x: Windows 64-Bit_
|
||||
- A restart is highly recommended or even required after installing these components, otherwise you will run into issues!
|
||||
2. Python 3.7
|
||||
- From the [Windows Store](https://www.microsoft.com/en-us/p/python-37/9nj46sx7x90p?activetab=pivot:overviewtab) or from [Python](https://www.python.org/downloads/release/python-379/), make sure you install `pip` and set Python as `path` variable during the installation
|
||||
3. Postman
|
||||
- Framework to test endpoints, download it [here](https://www.postman.com/downloads/)
|
||||
|
||||
### Testing and Debugging
|
||||
1. Get your code from Azure DevOps or the respective repository (07 - SCIO MRC)
|
||||
1. Create a virtual environment <u>outside</u> of the project folder: `python -m venv venv`
|
||||
1. Activate the virtual environment: `source venv/bin/activate` (Linux) or `venv/Scripts/activate` (Windows); use `deactivate` to deactivate it again
|
||||
1. Install the requirements: `pip install -r requirements.txt`
|
||||
1. Set your keys (only for local development and debugging) in the `config.ini` (they are needed for the Azure Search request)
|
||||
1. After the initial installation, run `func init` in the root folder (verifies if all necessary skeleton files for the function are available)
|
||||
1. For debugging and local testing, open a separate PowerShell window and execute `func host start --verbose` in the root folder of the function. This enables you to do code changes during runtime without shutting down the function completely when there is an issue
|
||||
1. Use [Postman](https://www.postman.com/downloads/) for testing the endpoints using the localhost request of this [collection](assets/MRC-Requests.postman_collection.json)
|
||||
|
||||
### Deployment to Azure
|
||||
1. Open your PowerShell
|
||||
1. Activate your environment, if you haven't before: <br>
|
||||
`source venv/bin/activate` (Linux) or `venv/Scripts/activate` (Windows)
|
||||
1. Login to your Azure Account: `az login` (a browser window will open, where you may have to log on Azure)
|
||||
1. Execute the command:<br>
|
||||
`func azure functionapp publish [insert your function name] --remote build`
|
||||
1. Wait until the deployment is finished
|
||||
1. Use [Postman](https://www.postman.com/downloads/) for testing the endpoints using this [collection](assets/MRC-Requests.postman_collection.json)
|
||||
|
||||
## Contributing
|
||||
|
||||
This project welcomes contributions and suggestions. Most contributions require you to agree to a
|
||||
Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
|
||||
the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com.
|
||||
|
@ -25,7 +184,6 @@ For more information see the [Code of Conduct FAQ](https://opensource.microsoft.
|
|||
contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
|
||||
|
||||
## Trademarks
|
||||
|
||||
This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft
|
||||
trademarks or logos is subject to and must follow
|
||||
[Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general).
|
||||
|
|
|
@ -0,0 +1,81 @@
|
|||
{
|
||||
"info": {
|
||||
"_postman_id": "95608c8f-13b1-44e2-b2ca-c3fc68ba30c1",
|
||||
"name": "MRC Requests",
|
||||
"schema": "https://schema.getpostman.com/json/collection/v2.1.0/collection.json"
|
||||
},
|
||||
"item": [
|
||||
{
|
||||
"name": "MRC API",
|
||||
"protocolProfileBehavior": {
|
||||
"disableBodyPruning": true
|
||||
},
|
||||
"request": {
|
||||
"method": "GET",
|
||||
"header": [],
|
||||
"body": {
|
||||
"mode": "raw",
|
||||
"raw": "{\r\n \"question\": \"Who is the CEO of Microsoft?\"\r\n}",
|
||||
"options": {
|
||||
"raw": {
|
||||
"language": "json"
|
||||
}
|
||||
}
|
||||
},
|
||||
"url": {
|
||||
"raw": "http://localhost:7071/api/MRC",
|
||||
"protocol": "http",
|
||||
"host": [
|
||||
"localhost"
|
||||
],
|
||||
"port": "7071",
|
||||
"path": [
|
||||
"api",
|
||||
"MRC"
|
||||
]
|
||||
}
|
||||
},
|
||||
"response": []
|
||||
},
|
||||
{
|
||||
"name": "MRC API Deployed",
|
||||
"protocolProfileBehavior": {
|
||||
"disableBodyPruning": true
|
||||
},
|
||||
"request": {
|
||||
"method": "GET",
|
||||
"header": [],
|
||||
"body": {
|
||||
"mode": "raw",
|
||||
"raw": "{\r\n \"question\": \"Who is the CEO of Microsoft?\"\r\n}",
|
||||
"options": {
|
||||
"raw": {
|
||||
"language": "json"
|
||||
}
|
||||
}
|
||||
},
|
||||
"url": {
|
||||
"raw": "https://mrc-test.azurewebsites.net/api/MRC?code=insert function code here",
|
||||
"protocol": "https",
|
||||
"host": [
|
||||
"mrc-test",
|
||||
"azurewebsites",
|
||||
"net"
|
||||
],
|
||||
"path": [
|
||||
"api",
|
||||
"MRC"
|
||||
],
|
||||
"query": [
|
||||
{
|
||||
"key": "code",
|
||||
"value": "insert function code here"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"response": []
|
||||
}
|
||||
],
|
||||
"protocolProfileBehavior": {}
|
||||
}
|
Двоичный файл не отображается.
После Ширина: | Высота: | Размер: 102 KiB |
Двоичный файл не отображается.
После Ширина: | Высота: | Размер: 61 KiB |
|
@ -0,0 +1,4 @@
|
|||
[search]
|
||||
service_name=
|
||||
index_name=
|
||||
key=
|
|
@ -0,0 +1,15 @@
|
|||
{
|
||||
"version": "2.0",
|
||||
"logging": {
|
||||
"applicationInsights": {
|
||||
"samplingSettings": {
|
||||
"isEnabled": true,
|
||||
"excludedTypes": "Request"
|
||||
}
|
||||
}
|
||||
},
|
||||
"extensionBundle": {
|
||||
"id": "Microsoft.Azure.Functions.ExtensionBundle",
|
||||
"version": "[1.*, 2.0.0)"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,4 @@
|
|||
{
|
||||
"$schema": "http://json.schemastore.org/proxies",
|
||||
"proxies": {}
|
||||
}
|
|
@ -0,0 +1,11 @@
|
|||
# DO NOT include azure-functions-worker in this file
|
||||
# The Python Worker is managed by Azure Functions platform
|
||||
# Manually managing azure-functions-worker may cause unexpected issues
|
||||
azure-functions==1.4.0
|
||||
#torch===1.6.0 -f https://download.pytorch.org/whl/torch_stable.html
|
||||
torch===1.5.1
|
||||
torchvision===0.6.1
|
||||
#torchvision===0.7.0 -f https://download.pytorch.org/whl/torch_stable.html
|
||||
transformers==3.4.0
|
||||
azure-search-documents==11.0.0
|
||||
nltk==3.5
|
Загрузка…
Ссылка в новой задаче