Merge pull request #14 from microsoft/omri/reintroduce_analyzer

reintroduced analyzer
This commit is contained in:
Omri Mendels 2020-05-27 15:35:14 +03:00 committed by GitHub
Parents 2a635427ea a271a1825f
Commit 0d5c4ba3fe
No known key found for this signature
GPG key ID: 4AEE18F83AFDEB23
9 changed files with 580 additions and 540 deletions

View file

@@ -1,33 +1,31 @@
'''
Presidio Analyzer not yet on PyPI, cannot explicitly reference it
'''
from typing import List
from presidio_analyzer import AnalyzerEngine
from typing import List, Dict
#
from presidio_evaluator import ModelEvaluator, InputSample, span_to_tag
#
from presidio_evaluator.data_generator import read_synth_dataset
#
#
class PresidioAnalyzer(ModelEvaluator):
def __init__(self, analyzer,
entities_to_keep: List[str] = None,
verbose: bool = False,
labeling_scheme="BIO",
compare_by_io=True,
score_threshold=0.4
):
def __init__(
self,
analyzer=AnalyzerEngine(),
entities_to_keep: List[str] = None,
verbose: bool = False,
labeling_scheme="BIO",
compare_by_io=True,
score_threshold=0.4,
):
"""
Evaluation wrapper for the Presidio Analyzer
:param analyzer: object of type AnalyzerEngine (from presidio-analyzer)
"""
super().__init__(entities_to_keep=entities_to_keep,
verbose=verbose,
labeling_scheme=labeling_scheme,
compare_by_io=compare_by_io)
super().__init__(
entities_to_keep=entities_to_keep,
verbose=verbose,
labeling_scheme=labeling_scheme,
compare_by_io=compare_by_io,
)
self.analyzer = analyzer
self.score_threshold = score_threshold
@@ -37,8 +35,12 @@ class PresidioAnalyzer(ModelEvaluator):
all_fields = True
else:
all_fields = None
results = self.analyzer.analyze(sample.full_text, self.entities,
language='en', all_fields=all_fields)
results = self.analyzer.analyze(
text=sample.full_text,
entities=self.entities,
language="en",
all_fields=all_fields,
)
starts = []
ends = []
scores = []
@@ -52,13 +54,15 @@ class PresidioAnalyzer(ModelEvaluator):
tags.append(res.entity_type)
scores.append(res.score)
#
response_tags = span_to_tag(scheme=self.labeling_scheme,
text=sample.full_text,
start=starts,
end=ends,
tokens=sample.tokens,
scores=scores,
tag=tags)
response_tags = span_to_tag(
scheme=self.labeling_scheme,
text=sample.full_text,
start=starts,
end=ends,
tokens=sample.tokens,
scores=scores,
tag=tags,
)
return response_tags
@@ -70,41 +74,53 @@ if __name__ == "__main__":
# Mapping between dataset entities and Presidio entities. Key: Dataset entity, Value: Presidio entity
entities_mapping = {
'PERSON': 'PERSON',
'EMAIL': 'EMAIL_ADDRESS',
'CREDIT_CARD': 'CREDIT_CARD',
'FIRST_NAME': 'PERSON',
'PHONE_NUMBER': 'PHONE_NUMBER',
'BIRTHDAY': 'DATE_TIME',
'DATE': 'DATE_TIME',
'DOMAIN': 'DOMAIN',
'CITY': 'LOCATION',
'ADDRESS': 'LOCATION',
'IBAN': 'IBAN_CODE',
'URL': 'DOMAIN_NAME',
'US_SSN': 'US_SSN',
'IP_ADDRESS': 'IP_ADDRESS',
'ORGANIZATION': 'ORG',
'O': 'O'
"PERSON": "PERSON",
"EMAIL": "EMAIL_ADDRESS",
"CREDIT_CARD": "CREDIT_CARD",
"FIRST_NAME": "PERSON",
"PHONE_NUMBER": "PHONE_NUMBER",
"BIRTHDAY": "DATE_TIME",
"DATE": "DATE_TIME",
"DOMAIN": "DOMAIN",
"CITY": "LOCATION",
"ADDRESS": "LOCATION",
"IBAN": "IBAN_CODE",
"URL": "DOMAIN_NAME",
"US_SSN": "US_SSN",
"IP_ADDRESS": "IP_ADDRESS",
"ORGANIZATION": "ORG",
"O": "O",
}
updated_samples = ModelEvaluator.align_input_samples_to_presidio_analyzer(input_samples,
entities_mapping)
updated_samples = ModelEvaluator.align_input_samples_to_presidio_analyzer(
input_samples, entities_mapping
)
flatten = lambda l: [item for sublist in l for item in sublist]
from collections import Counter
count_per_entity = Counter(
[span.entity_type for span in flatten([input_sample.spans for input_sample in updated_samples])])
[
span.entity_type
for span in flatten(
[input_sample.spans for input_sample in updated_samples]
)
]
)
print("Evaluating samples")
analyzer = PresidioAnalyzer(entities_to_keep=count_per_entity.keys())
evaluated_samples = analyzer.evaluate_all(updated_samples)
#
print("Estimating metrics")
precision, recall, \
entity_recall, entity_precision, \
f, errors = analyzer.calculate_score(evaluation_results=evaluated_samples, beta=2.5)
(
precision,
recall,
entity_recall,
entity_precision,
f,
errors,
) = analyzer.calculate_score(evaluation_results=evaluated_samples, beta=2.5)
#
print("precision: {}".format(precision))
print("Recall: {}".format(recall))
@@ -112,22 +128,24 @@ if __name__ == "__main__":
print("Precision per entity: {}".format(entity_precision))
print("Recall per entity: {}".format(entity_recall))
#
FN_mistakes = [mistake for mistake in flatten(errors) if mistake[0:2] == 'FN']
FP_mistakes = [mistake for mistake in flatten(errors) if mistake[0:2] == 'FP']
other_mistakes = [mistake for mistake in flatten(errors) if "Wrong entity" in mistake]
FN_mistakes = [mistake for mistake in flatten(errors) if mistake[0:2] == "FN"]
FP_mistakes = [mistake for mistake in flatten(errors) if mistake[0:2] == "FP"]
other_mistakes = [
mistake for mistake in flatten(errors) if "Wrong entity" in mistake
]
fn = open('../data/fn_30000.txt', 'w+', encoding='utf-8')
fn1 = '\n'.join(FN_mistakes)
fn = open("../data/fn_30000.txt", "w+", encoding="utf-8")
fn1 = "\n".join(FN_mistakes)
fn.write(fn1)
fn.close()
fp = open('../data/fp_30000.txt', 'w+', encoding='utf-8')
fp1 = '\n'.join(FP_mistakes)
fp = open("../data/fp_30000.txt", "w+", encoding="utf-8")
fp1 = "\n".join(FP_mistakes)
fp.write(fp1)
fp.close()
mistakes_file = open('../data/mistakes_30000.txt', 'w+', encoding='utf-8')
mistakes1 = '\n'.join(other_mistakes)
mistakes_file = open("../data/mistakes_30000.txt", "w+", encoding="utf-8")
mistakes1 = "\n".join(other_mistakes)
mistakes_file.write(mistakes1)
mistakes_file.close()
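
For context, the wrapper's predict step reduces to a direct AnalyzerEngine call with the keyword arguments shown above. A minimal sketch, assuming presidio-analyzer is installed and that each result exposes entity_type, start, end, and score as the wrapper expects; the sample text and entity list are illustrative only:

from presidio_analyzer import AnalyzerEngine

analyzer = AnalyzerEngine()
# Mirrors the wrapper's call when specific entities are kept (all_fields stays None)
results = analyzer.analyze(
    text="My name is Mike and my number is 212-555-5555",
    entities=["PERSON", "PHONE_NUMBER"],
    language="en",
    all_fields=None,
)
for res in results:
    # These are the fields predict() reads when building the span lists
    print(res.entity_type, res.start, res.end, res.score)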

View file

@@ -1,40 +1,50 @@
'''
"""
Presidio Analyzer not yet on PyPI, therefore it cannot be referenced explicitly
'''
"""
import math
from typing import List, Tuple, Dict
from presidio_evaluator import ModelEvaluator, InputSample
from presidio_analyzer.nlp_engine import SpacyNlpEngine
from presidio_evaluator import ModelEvaluator, InputSample, EvaluationResult
from presidio_evaluator.span_to_tag import span_to_tag
class PresidioRecognizerEvaluator(ModelEvaluator):
def __init__(self, recognizer, nlp_engine, entities_to_keep=None,
with_nlp_artifacts=False, verbose=False, compare_by_io=True,
):
def __init__(
self,
recognizer,
nlp_engine,
entities_to_keep=None,
with_nlp_artifacts=False,
verbose=False,
compare_by_io=True,
):
"""
Evaluator for one recognizer
:param recognizer: An object of type EntityRecognizer (in presidio-analyzer)
:param nlp_engine: An object of type NlpEngine, e.g. SpacyNlpEngine (in presidio-analyzer)
"""
super().__init__(entities_to_keep=entities_to_keep,
verbose=verbose, compare_by_io=compare_by_io)
super().__init__(
entities_to_keep=entities_to_keep,
verbose=verbose,
compare_by_io=compare_by_io,
)
self.withNlpArtifacts = with_nlp_artifacts
self.recognizer = recognizer
self.nlp_engine = nlp_engine
#
def __make_nlp_artifacts(self, text: str):
return self.nlp_engine.process_text(text, 'en')
return self.nlp_engine.process_text(text, "en")
#
def predict(self, sample: InputSample) -> List[str]:
nlpArtifacts = None
if self.withNlpArtifacts:
nlpArtifacts = self.__make_nlp_artifacts(sample.full_text)
results = self.recognizer.analyze(sample.full_text, self.entities,
nlpArtifacts)
results = self.recognizer.analyze(sample.full_text, self.entities, nlpArtifacts)
starts = []
ends = []
tags = []
@@ -46,37 +56,33 @@ class PresidioRecognizerEvaluator(ModelEvaluator):
ends.append(res.end)
tags.append(res.entity_type)
scores.append(res.score)
response_tags = span_to_tag(scheme=self.labeling_scheme,
text=sample.full_text,
start=starts,
end=ends,
tag=tags,
tokens=sample.tokens,
scores=scores,
io_tags_only=self.compare_by_io)
response_tags = span_to_tag(
scheme=self.labeling_scheme,
text=sample.full_text,
start=starts,
end=ends,
tag=tags,
tokens=sample.tokens,
scores=scores,
io_tags_only=self.compare_by_io,
)
if len(sample.tags) == 0:
sample.tags = ['0' for word in response_tags]
sample.tags = ["0" for word in response_tags]
return response_tags
def score_presidio_recognizer(recognizer, entities_to_keep, input_samples,
withNlpArtifacts=False) \
-> Tuple[Dict[str, float], Dict[str, float], Dict[str, float], Dict[
str, float], Dict[str, float], List[str]]:
model = PresidioRecognizerEvaluator(recognizer=recognizer,
entities_to_keep=entities_to_keep,
with_nlp_artifacts=withNlpArtifacts)
def score_presidio_recognizer(
recognizer, entities_to_keep, input_samples, withNlpArtifacts=False
) -> EvaluationResult:
model = PresidioRecognizerEvaluator(
recognizer=recognizer,
entities_to_keep=entities_to_keep,
nlp_engine=SpacyNlpEngine(),
with_nlp_artifacts=withNlpArtifacts,
)
evaluated_samples = model.evaluate_all(input_samples[:])
precision, recall, ent_recall, \
ent_precision, fscore, mistakes = model.calculate_score(
evaluated_samples, beta=2.5)
print("p={precision}, r={recall},f={f},"
"entity recall={ent},entity precision={prec}".format(
precision=precision,
recall=recall,
f=fscore,
ent=ent_recall,
prec=ent_precision))
if math.isnan(precision):
precision = 0
return precision, recall, ent_recall, ent_precision, fscore, mistakes
evaluation_result = model.calculate_score(evaluated_samples, beta=2.5)
evaluation_result.print()
if math.isnan(evaluation_result.pii_precision):
evaluation_result.pii_precision = 0
return evaluation_result
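
A usage sketch for score_presidio_recognizer, following the pattern the recognizer tests below use. It assumes presidio-analyzer ships the predefined CreditCardRecognizer and that a synthetic dataset file exists at the illustrative path:

from presidio_analyzer.predefined_recognizers.credit_card_recognizer import CreditCardRecognizer
from presidio_evaluator.data_generator import read_synth_dataset

# Illustrative path; any file produced by the data generator works here
input_samples = read_synth_dataset("data/generated_small.txt")
evaluation_result = score_presidio_recognizer(
    recognizer=CreditCardRecognizer(),
    entities_to_keep=["CREDIT_CARD"],
    input_samples=input_samples,
)
# calculate_score returns an EvaluationResult; pii_f is the F-beta score checked by the tests
print(evaluation_result.pii_f)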

View file

@@ -15,3 +15,4 @@ regex
#flair
sklearn_crfsuite
pytest
presidio_analyzer

View file

@@ -1,7 +1,9 @@
import pytest
try:
from flair.models import SequenceTagger
except ImportError:
print("Flair is not installed by default")
except ImportError:
print("Flair is not installed by default")
from presidio_evaluator.data_generator import read_synth_dataset
from presidio_evaluator.flair_evaluator import FlairEvaluator
@@ -9,18 +11,26 @@ from presidio_evaluator.flair_evaluator import FlairEvaluator
import numpy as np
# no-unit because flair is not a dependency by default
def no_unit_test_flair_simple():
@pytest.mark.skip(reason="Flair not installed by default")
def test_flair_simple():
import os
dir_path = os.path.dirname(os.path.realpath(__file__))
input_samples = read_synth_dataset(os.path.join(dir_path, "data/generated_small.txt"))
input_samples = read_synth_dataset(
os.path.join(dir_path, "data/generated_small.txt")
)
model = SequenceTagger.load('ner-ontonotes-fast') # .load('ner')
model = SequenceTagger.load("ner-ontonotes-fast") # .load('ner')
flair_evaluator = FlairEvaluator(model=model, entities_to_keep=['PERSON'])
flair_evaluator = FlairEvaluator(model=model, entities_to_keep=["PERSON"])
evaluation_results = flair_evaluator.evaluate_all(input_samples)
scores = flair_evaluator.calculate_score(evaluation_results)
np.testing.assert_almost_equal(scores.pii_precision, scores.entity_precision_dict['PERSON'])
np.testing.assert_almost_equal(scores.pii_recall, scores.entity_recall_dict['PERSON'])
np.testing.assert_almost_equal(
scores.pii_precision, scores.entity_precision_dict["PERSON"]
)
np.testing.assert_almost_equal(
scores.pii_recall, scores.entity_recall_dict["PERSON"]
)
assert scores.pii_recall > 0
assert scores.pii_precision > 0

View file

@@ -1,80 +1,101 @@
'''
Presidio Analyzer not yet on PyPI, ignoring temporarily
'''
#
# import pytest
#
# from presidio_evaluator import InputSample, Span
# from presidio_evaluator.data_generator import read_synth_dataset
# from presidio_evaluator.presidio_analyzer import PresidioAnalyzer
#
#
# class GeneratedTextTestCase:
# def __init__(self, test_name, test_input, acceptance_threshold, marks):
# self.test_name = test_name
# self.test_input = test_input
# self.acceptance_threshold = acceptance_threshold
# self.marks = marks
#
# def to_pytest_param(self):
# return pytest.param(self.test_input, self.acceptance_threshold,
# id=self.test_name, marks=self.marks)
#
#
# # generated-text test cases
# analyzer_test_generate_text_testdata = [
# # small set fixture which expects all results.
# GeneratedTextTestCase(
# test_name="small-set",
# test_input="{}/data/generated_small.txt",
# acceptance_threshold=0.3,
# marks=pytest.mark.none
# )
# ]
#
#
# @pytest.mark.skip(reason="Presidio analyzer not on PyPi")
# def test_analyzer_simple_input():
# model = PresidioAnalyzer(entities_to_keep=['PERSON'])
#
# sample = InputSample(full_text="My name is Mike",
# masked="My name is [PERSON]",
# spans=[Span('PERSON', 'Mike', 10, 14)],
# create_tags_from_span=True)
#
# evaluated = model.evaluate_sample(sample)
# metrics = model.calculate_score(
# [evaluated])
#
# assert metrics.pii_precision == 1
# assert metrics.pii_recall == 1
#
#
# # analyzer tests on generated data
# @pytest.mark.skip(reason="Presidio analyzer not on PyPi")
# @pytest.mark.parametrize("test_input,acceptance_threshold",
# [testcase.to_pytest_param() for testcase in
# analyzer_test_generate_text_testdata])
# def test_analyzer_with_generated_text(test_input, acceptance_threshold):
# """
# Test analyzer with a generated dataset text file
# :param test_input: input text file location
#    :param acceptance_threshold: minimum precision/recall
# allowed for tests to pass
# """
# # read test input from generated file
#
# import os
# dir_path = os.path.dirname(os.path.realpath(__file__))
# input_samples = read_synth_dataset(
# test_input.format(dir_path))
#
# updated_samples = PresidioAnalyzer. \
# align_input_samples_to_presidio_analyzer(input_samples)
#
# analyzer = PresidioAnalyzer()
# evaluated_samples = analyzer.evaluate_all(updated_samples)
# scores = analyzer.calculate_score(evaluation_results=evaluated_samples)
#
# assert acceptance_threshold <= scores.pii_precision
# assert acceptance_threshold <= scores.pii_recall
import pytest
from presidio_evaluator import InputSample, Span
from presidio_evaluator.data_generator import read_synth_dataset
from presidio_evaluator.presidio_analyzer import PresidioAnalyzer
# Mapping between dataset entities and Presidio entities. Key: Dataset entity, Value: Presidio entity
entities_mapping = {
"PERSON": "PERSON",
"EMAIL": "EMAIL_ADDRESS",
"CREDIT_CARD": "CREDIT_CARD",
"FIRST_NAME": "PERSON",
"PHONE_NUMBER": "PHONE_NUMBER",
"BIRTHDAY": "DATE_TIME",
"DATE": "DATE_TIME",
"DOMAIN": "DOMAIN",
"CITY": "LOCATION",
"ADDRESS": "LOCATION",
"IBAN": "IBAN_CODE",
"URL": "DOMAIN_NAME",
"US_SSN": "US_SSN",
"IP_ADDRESS": "IP_ADDRESS",
"ORGANIZATION": "ORG",
"O": "O",
}
class GeneratedTextTestCase:
def __init__(self, test_name, test_input, acceptance_threshold, marks):
self.test_name = test_name
self.test_input = test_input
self.acceptance_threshold = acceptance_threshold
self.marks = marks
def to_pytest_param(self):
return pytest.param(
self.test_input,
self.acceptance_threshold,
id=self.test_name,
marks=self.marks,
)
# generated-text test cases
analyzer_test_generate_text_testdata = [
# small set fixture which expects all results.
GeneratedTextTestCase(
test_name="small-set",
test_input="{}/data/generated_small.txt",
acceptance_threshold=0.3,
marks=pytest.mark.none,
)
]
def test_analyzer_simple_input():
model = PresidioAnalyzer(entities_to_keep=["PERSON"])
sample = InputSample(
full_text="My name is Mike",
masked="My name is [PERSON]",
spans=[Span("PERSON", "Mike", 10, 14)],
create_tags_from_span=True,
)
evaluated = model.evaluate_sample(sample)
metrics = model.calculate_score([evaluated])
assert metrics.pii_precision == 1
assert metrics.pii_recall == 1
# analyzer tests on generated data
@pytest.mark.parametrize(
"test_input,acceptance_threshold",
[testcase.to_pytest_param() for testcase in analyzer_test_generate_text_testdata],
)
def test_analyzer_with_generated_text(test_input, acceptance_threshold):
"""
Test analyzer with a generated dataset text file
:param test_input: input text file location
:param acceptance_threshold: minimum precision/recall
allowed for tests to pass
"""
# read test input from generated file
import os
dir_path = os.path.dirname(os.path.realpath(__file__))
input_samples = read_synth_dataset(test_input.format(dir_path))
updated_samples = PresidioAnalyzer.align_input_samples_to_presidio_analyzer(
input_samples=input_samples, entities_mapping=entities_mapping
)
analyzer = PresidioAnalyzer()
evaluated_samples = analyzer.evaluate_all(updated_samples)
scores = analyzer.calculate_score(evaluation_results=evaluated_samples)
assert acceptance_threshold <= scores.pii_precision
assert acceptance_threshold <= scores.pii_recall

View file

@@ -1,62 +1,58 @@
'''
Presidio Analyzer not yet on PyPI, ignoring temporarily
'''
from presidio_evaluator.data_generator import read_synth_dataset
from presidio_evaluator.presidio_recognizer_evaluator import score_presidio_recognizer
import pytest
# from presidio_evaluator.data_generator import read_synth_dataset
# from presidio_evaluator.presidio_recognizer_evaluator import score_presidio_recognizer
# import pytest
#
# from analyzer.predefined_recognizers.credit_card_recognizer import CreditCardRecognizer
#
# # test case parameters for tests with dataset which was previously generated.
# class GeneratedTextTestCase:
# def __init__(self, test_name, test_input, acceptance_threshold, marks):
# self.test_name = test_name
# self.test_input = test_input
# self.acceptance_threshold = acceptance_threshold
# self.marks = marks
#
# def to_pytest_param(self):
# return pytest.param(self.test_input, self.acceptance_threshold,
# id=self.test_name, marks=self.marks)
#
#
# # generated-text test cases
# cc_test_generate_text_testdata = [
# # small set fixture which expects all type results.
# GeneratedTextTestCase(
# test_name="small-set",
# test_input="{}/data/generated_small.txt",
# acceptance_threshold=1,
# marks=pytest.mark.none
# ),
# # large set fixture which expects all type results. marked as "slow"
# GeneratedTextTestCase(
# test_name="large_set",
# test_input="{}/data/generated_large.txt",
# acceptance_threshold=1,
# marks=pytest.mark.slow
# )
# ]
#
#
# # credit card recognizer tests on generated data
# @pytest.mark.parametrize("test_input,acceptance_threshold",
# [testcase.to_pytest_param()
# for testcase in cc_test_generate_text_testdata])
# def test_credit_card_recognizer_with_generated_text(test_input, acceptance_threshold):
# """
# Test credit card recognizer with a generated dataset text file
# :param test_input: input text file location
#    :param acceptance_threshold: minimum precision/recall
# allowed for tests to pass
# """
#
# # read test input from generated file
# import os
# dir_path = os.path.dirname(os.path.realpath(__file__))
# input_samples = read_synth_dataset(
# test_input.format(dir_path))
# scores = score_presidio_recognizer(
# CreditCardRecognizer(), 'CREDIT_CARD', input_samples)
# assert acceptance_threshold <= scores.pii_f
from presidio_analyzer.predefined_recognizers.credit_card_recognizer import CreditCardRecognizer
# test case parameters for tests with dataset which was previously generated.
class GeneratedTextTestCase:
def __init__(self, test_name, test_input, acceptance_threshold, marks):
self.test_name = test_name
self.test_input = test_input
self.acceptance_threshold = acceptance_threshold
self.marks = marks
def to_pytest_param(self):
return pytest.param(self.test_input, self.acceptance_threshold,
id=self.test_name, marks=self.marks)
# generated-text test cases
cc_test_generate_text_testdata = [
# small set fixture which expects all type results.
GeneratedTextTestCase(
test_name="small-set",
test_input="{}/data/generated_small.txt",
acceptance_threshold=1,
marks=pytest.mark.none
),
# large set fixture which expects all type results. marked as "slow"
GeneratedTextTestCase(
test_name="large_set",
test_input="{}/data/generated_large.txt",
acceptance_threshold=1,
marks=pytest.mark.slow
)
]
# credit card recognizer tests on generated data
@pytest.mark.parametrize("test_input,acceptance_threshold",
[testcase.to_pytest_param()
for testcase in cc_test_generate_text_testdata])
def test_credit_card_recognizer_with_generated_text(test_input, acceptance_threshold):
"""
Test credit card recognizer with a generated dataset text file
:param test_input: input text file location
:param acceptance_threshold: minimum precision/recall
allowed for tests to pass
"""
# read test input from generated file
import os
dir_path = os.path.dirname(os.path.realpath(__file__))
input_samples = read_synth_dataset(
test_input.format(dir_path))
scores = score_presidio_recognizer(
CreditCardRecognizer(), 'CREDIT_CARD', input_samples)
assert acceptance_threshold <= scores.pii_f

View file

@@ -1,83 +1,79 @@
'''
Presidio Analyzer not yet on PyPI, ignoring temporarily
'''
from presidio_evaluator.data_generator import generate
from presidio_evaluator.presidio_recognizer_evaluator import \
score_presidio_recognizer
import pytest
import numpy as np
# from presidio_evaluator.data_generator import generate
# from presidio_evaluator.presidio_recognizer_evaluator import \
# score_presidio_recognizer
# import pytest
# import numpy as np
#
# from analyzer.predefined_recognizers.credit_card_recognizer import CreditCardRecognizer
#
# # test case parameters for tests with dataset generated from a template and csv values
# class TemplateTextTestCase:
# def __init__(self, test_name, pii_csv, utterances, dictionary_path,
# num_of_examples, acceptance_threshold, marks):
# self.test_name = test_name
# self.pii_csv = pii_csv
# self.utterances = utterances
# self.dictionary_path = dictionary_path
# self.num_of_examples = num_of_examples
# self.acceptance_threshold = acceptance_threshold
# self.marks = marks
#
# def to_pytest_param(self):
# return pytest.param(self.pii_csv, self.utterances, self.dictionary_path,
# self.num_of_examples, self.acceptance_threshold,
# id=self.test_name, marks=self.marks)
#
#
# # template-dataset test cases
# cc_test_template_testdata = [
# # large dataset fixture. marked as slow
# TemplateTextTestCase(
# test_name="fake-names-100",
# pii_csv="{}/data/FakeNameGenerator.com_100.csv",
# utterances="{}/data/templates.txt",
# dictionary_path="{}/data/Dictionary_test.csv",
# num_of_examples=100,
# acceptance_threshold=0.9,
# marks=pytest.mark.slow
# )
# ]
#
#
# # credit card recognizer tests on template-generated data
# @pytest.mark.parametrize("pii_csv, "
# "utterances, "
# "dictionary_path, "
# "num_of_examples, "
# "acceptance_threshold",
# [testcase.to_pytest_param()
# for testcase in cc_test_template_testdata])
# def test_credit_card_recognizer_with_template(pii_csv, utterances,
# dictionary_path,
# num_of_examples,
# acceptance_threshold):
# """
# Test credit card recognizer with a dataset generated from
# template and a CSV values file
# :param pii_csv: input csv file location
# :param utterances: template file location
# :param dictionary_path: dictionary/vocabulary file location
# :param num_of_examples: number of samples to be used from dataset
# to test
#    :param acceptance_threshold: minimum precision/recall
# allowed for tests to pass
# """
#
# # read template and CSV files
# import os
# dir_path = os.path.dirname(os.path.realpath(__file__))
#
# input_samples = generate(fake_pii_csv=pii_csv.format(dir_path),
# utterances_file=utterances.format(dir_path),
# dictionary_path=dictionary_path.format(dir_path),
# lower_case_ratio=0.5,
# num_of_examples=num_of_examples)
#
# scores = score_presidio_recognizer(
# CreditCardRecognizer(), 'CREDIT_CARD', input_samples)
# if not np.isnan(scores.pii_f):
# assert acceptance_threshold <= scores.pii_f
from presidio_analyzer.predefined_recognizers.credit_card_recognizer import CreditCardRecognizer
# test case parameters for tests with dataset generated from a template and csv values
class TemplateTextTestCase:
def __init__(self, test_name, pii_csv, utterances, dictionary_path,
num_of_examples, acceptance_threshold, marks):
self.test_name = test_name
self.pii_csv = pii_csv
self.utterances = utterances
self.dictionary_path = dictionary_path
self.num_of_examples = num_of_examples
self.acceptance_threshold = acceptance_threshold
self.marks = marks
def to_pytest_param(self):
return pytest.param(self.pii_csv, self.utterances, self.dictionary_path,
self.num_of_examples, self.acceptance_threshold,
id=self.test_name, marks=self.marks)
# template-dataset test cases
cc_test_template_testdata = [
# large dataset fixture. marked as slow
TemplateTextTestCase(
test_name="fake-names-100",
pii_csv="{}/data/FakeNameGenerator.com_100.csv",
utterances="{}/data/templates.txt",
dictionary_path="{}/data/Dictionary_test.csv",
num_of_examples=100,
acceptance_threshold=0.9,
marks=pytest.mark.slow
)
]
# credit card recognizer tests on template-generated data
@pytest.mark.parametrize("pii_csv, "
"utterances, "
"dictionary_path, "
"num_of_examples, "
"acceptance_threshold",
[testcase.to_pytest_param()
for testcase in cc_test_template_testdata])
def test_credit_card_recognizer_with_template(pii_csv, utterances,
dictionary_path,
num_of_examples,
acceptance_threshold):
"""
Test credit card recognizer with a dataset generated from
template and a CSV values file
:param pii_csv: input csv file location
:param utterances: template file location
:param dictionary_path: dictionary/vocabulary file location
:param num_of_examples: number of samples to be used from dataset
to test
:param acceptance_threshold: minimum precision/recall
allowed for tests to pass
"""
# read template and CSV files
import os
dir_path = os.path.dirname(os.path.realpath(__file__))
input_samples = generate(fake_pii_csv=pii_csv.format(dir_path),
utterances_file=utterances.format(dir_path),
dictionary_path=dictionary_path.format(dir_path),
lower_case_ratio=0.5,
num_of_examples=num_of_examples)
scores = score_presidio_recognizer(
CreditCardRecognizer(), 'CREDIT_CARD', input_samples)
if not np.isnan(scores.pii_f):
assert acceptance_threshold <= scores.pii_f

View file

@@ -1,148 +1,144 @@
'''
Presidio Analyzer not yet on PyPI, ignoring temporarily
'''
from presidio_evaluator.data_generator import FakeDataGenerator
from presidio_evaluator.presidio_recognizer_evaluator import \
score_presidio_recognizer
import pandas as pd
import pytest
import numpy as np
# from presidio_evaluator.data_generator import FakeDataGenerator
# from presidio_evaluator.presidio_recognizer_evaluator import \
# score_presidio_recognizer
# import pandas as pd
# import pytest
# import numpy as np
#
# from analyzer import Pattern, PatternRecognizer
#
# # test case parameters for tests with dataset generated from a template and
# # two csv value files, one containing the common-entities and another one with custom entities
# class PatternRecognizerTestCase:
# def __init__(self, test_name, entity_name, pattern, score, pii_csv, ext_csv,
# utterances, dictionary_path, num_of_examples, acceptance_threshold,
# max_mistakes_number, marks):
# self.test_name = test_name
# self.entity_name = entity_name
# self.pattern = pattern
# self.score = score
# self.pii_csv = pii_csv
# self.ext_csv = ext_csv
# self.utterances = utterances
# self.dictionary_path = dictionary_path
# self.num_of_examples = num_of_examples
# self.acceptance_threshold = acceptance_threshold
# self.max_mistakes_number = max_mistakes_number
# self.marks = marks
#
# def to_pytest_param(self):
# return pytest.param(self.pii_csv, self.ext_csv, self.utterances,
# self.dictionary_path,
# self.entity_name, self.pattern, self.score,
# self.num_of_examples, self.acceptance_threshold,
# self.max_mistakes_number, id=self.test_name,
# marks=self.marks)
#
#
# # template-dataset test cases
# rocket_test_template_testdata = [
# # large dataset fixture. marked as slow.
# # all input is correct, test is conclusive
# PatternRecognizerTestCase(
# test_name="rocket-no-errors",
# entity_name="ROCKET",
# pattern=r'\W*(rocket)\W*',
# score=0.8,
# pii_csv="{}/data/FakeNameGenerator.com_100.csv",
# ext_csv="{}/data/FakeRocketGenerator.csv",
# utterances="{}/data/rocket_example_sentences.txt",
# dictionary_path="{}/data/Dictionary_test.csv",
# num_of_examples=100,
# acceptance_threshold=1,
# max_mistakes_number=0,
# marks=pytest.mark.slow
# ),
# # large dataset fixture. marked as slow
# # all input is correct, test is conclusive
# PatternRecognizerTestCase(
# test_name="rocket-all-errors",
# entity_name="ROCKET",
# pattern=r'\W*(rocket)\W*',
# score=0.8,
# pii_csv="{}/data/FakeNameGenerator.com_100.csv",
# ext_csv="{}/data/FakeRocketErrorsGenerator.csv",
# utterances="{}/data/rocket_example_sentences.txt",
# dictionary_path="{}/data/Dictionary_test.csv",
# num_of_examples=100,
# acceptance_threshold=0,
# max_mistakes_number=100,
# marks=pytest.mark.slow
# ),
# # large dataset fixture. marked as slow
# # some input is correct some is not, test is inconclusive
# PatternRecognizerTestCase(
# test_name="rocket-some-errors",
# entity_name="ROCKET",
# pattern=r'\W*(rocket)\W*',
# score=0.8,
# pii_csv="{}/data/FakeNameGenerator.com_100.csv",
# ext_csv="{}/data/FakeRocket50PercentErrorsGenerator.csv",
# utterances="{}/data/rocket_example_sentences.txt",
# dictionary_path="{}/data/Dictionary_test.csv",
# num_of_examples=100,
# acceptance_threshold=0.3,
# max_mistakes_number=70,
# marks=[pytest.mark.slow, pytest.mark.inconclusive]
# )
# ]
#
#
# @pytest.mark.parametrize(
# "pii_csv, ext_csv, utterances, dictionary_path, "
# "entity_name, pattern, score, num_of_examples, "
# "acceptance_threshold, max_mistakes_number",
# [testcase.to_pytest_param()
# for testcase in rocket_test_template_testdata])
# def test_pattern_recognizer(pii_csv, ext_csv, utterances, dictionary_path,
# entity_name, pattern,
# score, num_of_examples, acceptance_threshold,
# max_mistakes_number):
# """
# Test generic pattern recognizer with a dataset generated from template, a CSV values file with common entities
# and another CSV values file with a custom entity
# :param pii_csv: input csv file location with the common entities
# :param ext_csv: input csv file location with custom entities
# :param utterances: template file location
# :param dictionary_path: vocabulary/dictionary file location
# :param entity_name: custom entity name
# :param pattern: recognizer pattern
# :param num_of_examples: number of samples to be used from dataset to test
#    :param acceptance_threshold: minimum precision/recall
# allowed for tests to pass
# """
#
# import os
# dir_path = os.path.dirname(os.path.realpath(__file__))
# dfpii = pd.read_csv(pii_csv.format(dir_path), encoding='utf-8')
# dfext = pd.read_csv(ext_csv.format(dir_path), encoding='utf-8')
# dictionary_path = dictionary_path.format(dir_path)
# ext_column_name = dfext.columns[0]
#
# def get_from_ext(i):
# index = i % dfext.shape[0]
# return dfext.iat[index, 0]
#
# # extend pii with ext data
# dfpii[ext_column_name] = [get_from_ext(i) for i in range(0, dfpii.shape[0])]
#
# # generate examples
# generator = FakeDataGenerator(fake_pii_csv_file=dfpii,
# utterances_file=utterances.format(dir_path),
# dictionary_path=dictionary_path)
# examples = generator.sample_examples(num_of_examples)
#
# pattern = Pattern("test pattern", pattern, score)
# pattern_recognizer = PatternRecognizer(entity_name,
# name="test recognizer",
# patterns=[pattern])
#
# scores = score_presidio_recognizer(
# pattern_recognizer, [entity_name], examples)
# if not np.isnan(scores.pii_f):
# assert acceptance_threshold <= scores.pii_f
# assert max_mistakes_number >= len(scores.model_errors)
from presidio_analyzer import Pattern, PatternRecognizer
# test case parameters for tests with dataset generated from a template and
# two csv value files, one containing the common-entities and another one with custom entities
class PatternRecognizerTestCase:
def __init__(self, test_name, entity_name, pattern, score, pii_csv, ext_csv,
utterances, dictionary_path, num_of_examples, acceptance_threshold,
max_mistakes_number, marks):
self.test_name = test_name
self.entity_name = entity_name
self.pattern = pattern
self.score = score
self.pii_csv = pii_csv
self.ext_csv = ext_csv
self.utterances = utterances
self.dictionary_path = dictionary_path
self.num_of_examples = num_of_examples
self.acceptance_threshold = acceptance_threshold
self.max_mistakes_number = max_mistakes_number
self.marks = marks
def to_pytest_param(self):
return pytest.param(self.pii_csv, self.ext_csv, self.utterances,
self.dictionary_path,
self.entity_name, self.pattern, self.score,
self.num_of_examples, self.acceptance_threshold,
self.max_mistakes_number, id=self.test_name,
marks=self.marks)
# template-dataset test cases
rocket_test_template_testdata = [
# large dataset fixture. marked as slow.
# all input is correct, test is conclusive
PatternRecognizerTestCase(
test_name="rocket-no-errors",
entity_name="ROCKET",
pattern=r'\W*(rocket)\W*',
score=0.8,
pii_csv="{}/data/FakeNameGenerator.com_100.csv",
ext_csv="{}/data/FakeRocketGenerator.csv",
utterances="{}/data/rocket_example_sentences.txt",
dictionary_path="{}/data/Dictionary_test.csv",
num_of_examples=100,
acceptance_threshold=1,
max_mistakes_number=0,
marks=pytest.mark.slow
),
# large dataset fixture. marked as slow
# all input is correct, test is conclusive
PatternRecognizerTestCase(
test_name="rocket-all-errors",
entity_name="ROCKET",
pattern=r'\W*(rocket)\W*',
score=0.8,
pii_csv="{}/data/FakeNameGenerator.com_100.csv",
ext_csv="{}/data/FakeRocketErrorsGenerator.csv",
utterances="{}/data/rocket_example_sentences.txt",
dictionary_path="{}/data/Dictionary_test.csv",
num_of_examples=100,
acceptance_threshold=0,
max_mistakes_number=100,
marks=pytest.mark.slow
),
# large dataset fixture. marked as slow
# some input is correct some is not, test is inconclusive
PatternRecognizerTestCase(
test_name="rocket-some-errors",
entity_name="ROCKET",
pattern=r'\W*(rocket)\W*',
score=0.8,
pii_csv="{}/data/FakeNameGenerator.com_100.csv",
ext_csv="{}/data/FakeRocket50PercentErrorsGenerator.csv",
utterances="{}/data/rocket_example_sentences.txt",
dictionary_path="{}/data/Dictionary_test.csv",
num_of_examples=100,
acceptance_threshold=0.3,
max_mistakes_number=70,
marks=[pytest.mark.slow, pytest.mark.inconclusive]
)
]
@pytest.mark.parametrize(
"pii_csv, ext_csv, utterances, dictionary_path, "
"entity_name, pattern, score, num_of_examples, "
"acceptance_threshold, max_mistakes_number",
[testcase.to_pytest_param()
for testcase in rocket_test_template_testdata])
def test_pattern_recognizer(pii_csv, ext_csv, utterances, dictionary_path,
entity_name, pattern,
score, num_of_examples, acceptance_threshold,
max_mistakes_number):
"""
Test generic pattern recognizer with a dataset generated from a template, a CSV values file with common entities
and another CSV values file with a custom entity
:param pii_csv: input csv file location with the common entities
:param ext_csv: input csv file location with custom entities
:param utterances: template file location
:param dictionary_path: vocabulary/dictionary file location
:param entity_name: custom entity name
:param pattern: recognizer pattern
:param num_of_examples: number of samples to be used from dataset to test
:param acceptance_threshold: minimum precision/recall
allowed for tests to pass
"""
import os
dir_path = os.path.dirname(os.path.realpath(__file__))
dfpii = pd.read_csv(pii_csv.format(dir_path), encoding='utf-8')
dfext = pd.read_csv(ext_csv.format(dir_path), encoding='utf-8')
dictionary_path = dictionary_path.format(dir_path)
ext_column_name = dfext.columns[0]
def get_from_ext(i):
index = i % dfext.shape[0]
return dfext.iat[index, 0]
# extend pii with ext data
dfpii[ext_column_name] = [get_from_ext(i) for i in range(0, dfpii.shape[0])]
# generate examples
generator = FakeDataGenerator(fake_pii_csv_file=dfpii,
utterances_file=utterances.format(dir_path),
dictionary_path=dictionary_path)
examples = generator.sample_examples(num_of_examples)
pattern = Pattern("test pattern", pattern, score)
pattern_recognizer = PatternRecognizer(entity_name,
name="test recognizer",
patterns=[pattern])
scores = score_presidio_recognizer(
pattern_recognizer, [entity_name], examples)
if not np.isnan(scores.pii_f):
assert acceptance_threshold <= scores.pii_f
assert max_mistakes_number >= len(scores.model_errors)
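
As a standalone sketch of the building blocks this test exercises, assuming Pattern and PatternRecognizer behave as constructed above; the sentence is illustrative only:

from presidio_analyzer import Pattern, PatternRecognizer

rocket_pattern = Pattern("test pattern", r"\W*(rocket)\W*", 0.8)
rocket_recognizer = PatternRecognizer(
    "ROCKET", name="test recognizer", patterns=[rocket_pattern]
)
# Pattern recognizers need no NLP artifacts, so None is passed for that argument
results = rocket_recognizer.analyze("They fueled the rocket at dawn", ["ROCKET"], None)
for res in results:
    print(res.entity_type, res.start, res.end, res.score)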

View file

@@ -1,63 +1,59 @@
'''
Presidio Analyzer not yet on PyPI, ignoring temporarily
'''
from presidio_evaluator.data_generator import read_synth_dataset
from presidio_evaluator.presidio_recognizer_evaluator import \
score_presidio_recognizer
# from presidio_evaluator.data_generator import read_synth_dataset
# from presidio_evaluator.presidio_recognizer_evaluator import \
# score_presidio_recognizer
#
# import pytest
# from analyzer.predefined_recognizers.spacy_recognizer import SpacyRecognizer
#
# # test case parameters for tests with dataset which was previously generated.
# class GeneratedTextTestCase:
# def __init__(self, test_name, test_input, acceptance_threshold, marks):
# self.test_name = test_name
# self.test_input = test_input
# self.acceptance_threshold = acceptance_threshold
# self.marks = marks
#
# def to_pytest_param(self):
# return pytest.param(self.test_input, self.acceptance_threshold,
# id=self.test_name, marks=self.marks)
#
#
# # generated-text test cases
# cc_test_generate_text_testdata = [
# # small dataset, inconclusive results
# GeneratedTextTestCase(
# test_name="small-set",
# test_input="{}/data/generated_small.txt",
# acceptance_threshold=0.5,
# marks=pytest.mark.inconclusive
# ),
# # large dataset - test is slow and inconclusive
# GeneratedTextTestCase(
# test_name="large-set",
# test_input="{}/data/generated_large.txt",
# acceptance_threshold=0.5,
# marks=pytest.mark.slow
# )
# ]
#
#
# # credit card recognizer tests on generated data
# @pytest.mark.parametrize("test_input,acceptance_threshold",
# [testcase.to_pytest_param() for testcase in
# cc_test_generate_text_testdata])
# def test_spacy_recognizer_with_generated_text(test_input, acceptance_threshold):
# """
# Test spacy recognizer with a generated dataset text file
# :param test_input: input text file location
#    :param acceptance_threshold: minimum precision/recall
# allowed for tests to pass
# """
#
# # read test input from generated file
# import os
# dir_path = os.path.dirname(os.path.realpath(__file__))
# input_samples = read_synth_dataset(
# test_input.format(dir_path))
# scores = score_presidio_recognizer(
# SpacyRecognizer(), ['PERSON'], input_samples, True)
# assert acceptance_threshold <= scores.pii_f
import pytest
from presidio_analyzer.predefined_recognizers.spacy_recognizer import SpacyRecognizer
# test case parameters for tests with dataset which was previously generated.
class GeneratedTextTestCase:
def __init__(self, test_name, test_input, acceptance_threshold, marks):
self.test_name = test_name
self.test_input = test_input
self.acceptance_threshold = acceptance_threshold
self.marks = marks
def to_pytest_param(self):
return pytest.param(self.test_input, self.acceptance_threshold,
id=self.test_name, marks=self.marks)
# generated-text test cases
cc_test_generate_text_testdata = [
# small dataset, inconclusive results
GeneratedTextTestCase(
test_name="small-set",
test_input="{}/data/generated_small.txt",
acceptance_threshold=0.5,
marks=pytest.mark.inconclusive
),
# large dataset - test is slow and inconclusive
GeneratedTextTestCase(
test_name="large-set",
test_input="{}/data/generated_large.txt",
acceptance_threshold=0.5,
marks=pytest.mark.slow
)
]
# spacy recognizer tests on generated data
@pytest.mark.parametrize("test_input,acceptance_threshold",
[testcase.to_pytest_param() for testcase in
cc_test_generate_text_testdata])
def test_spacy_recognizer_with_generated_text(test_input, acceptance_threshold):
"""
Test spacy recognizer with a generated dataset text file
:param test_input: input text file location
:param acceptance_threshold: minimum precision/recall
allowed for tests to pass
"""
# read test input from generated file
import os
dir_path = os.path.dirname(os.path.realpath(__file__))
input_samples = read_synth_dataset(
test_input.format(dir_path))
scores = score_presidio_recognizer(
SpacyRecognizer(), ['PERSON'], input_samples, True)
assert acceptance_threshold <= scores.pii_f
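
Unlike the pattern-based recognizers, SpacyRecognizer needs NLP artifacts, which is why the call above passes True for withNlpArtifacts. A minimal sketch of that direct path, assuming the spaCy model behind SpacyNlpEngine is installed; the sentence is illustrative only:

from presidio_analyzer.nlp_engine import SpacyNlpEngine
from presidio_analyzer.predefined_recognizers.spacy_recognizer import SpacyRecognizer

nlp_engine = SpacyNlpEngine()
text = "My name is Mike"
# process_text mirrors PresidioRecognizerEvaluator.__make_nlp_artifacts above
nlp_artifacts = nlp_engine.process_text(text, "en")
results = SpacyRecognizer().analyze(text, ["PERSON"], nlp_artifacts)
for res in results:
    print(res.entity_type, res.start, res.end, res.score)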