Merge pull request #14 from microsoft/omri/reintroduce_analyzer
reintroduced analyzer
This commit is contained in:
Commit 0d5c4ba3fe
@@ -1,33 +1,31 @@
'''
Presidio Analyzer not yet on PyPI, cannot explicitly reference it
'''
from typing import List

from presidio_analyzer import AnalyzerEngine

from typing import List, Dict
#
from presidio_evaluator import ModelEvaluator, InputSample, span_to_tag
#
from presidio_evaluator.data_generator import read_synth_dataset


#
#
class PresidioAnalyzer(ModelEvaluator):

    def __init__(self, analyzer,
                 entities_to_keep: List[str] = None,
                 verbose: bool = False,
                 labeling_scheme="BIO",
                 compare_by_io=True,
                 score_threshold=0.4
                 ):
    def __init__(
        self,
        analyzer=AnalyzerEngine(),
        entities_to_keep: List[str] = None,
        verbose: bool = False,
        labeling_scheme="BIO",
        compare_by_io=True,
        score_threshold=0.4,
    ):
        """
        Evaluation wrapper for the Presidio Analyzer
        :param analyzer: object of type AnalyzerEngine (from presidio-analyzer)
        """
        super().__init__(entities_to_keep=entities_to_keep,
                         verbose=verbose,
                         labeling_scheme=labeling_scheme,
                         compare_by_io=compare_by_io)
        super().__init__(
            entities_to_keep=entities_to_keep,
            verbose=verbose,
            labeling_scheme=labeling_scheme,
            compare_by_io=compare_by_io,
        )
        self.analyzer = analyzer

        self.score_threshold = score_threshold

@@ -37,8 +35,12 @@ class PresidioAnalyzer(ModelEvaluator):
            all_fields = True
        else:
            all_fields = None
        results = self.analyzer.analyze(sample.full_text, self.entities,
                                        language='en', all_fields=all_fields)
        results = self.analyzer.analyze(
            text=sample.full_text,
            entities=self.entities,
            language="en",
            all_fields=all_fields,
        )
        starts = []
        ends = []
        scores = []

@@ -52,13 +54,15 @@ class PresidioAnalyzer(ModelEvaluator):
            tags.append(res.entity_type)
            scores.append(res.score)
        #
        response_tags = span_to_tag(scheme=self.labeling_scheme,
                                    text=sample.full_text,
                                    start=starts,
                                    end=ends,
                                    tokens=sample.tokens,
                                    scores=scores,
                                    tag=tags)
        response_tags = span_to_tag(
            scheme=self.labeling_scheme,
            text=sample.full_text,
            start=starts,
            end=ends,
            tokens=sample.tokens,
            scores=scores,
            tag=tags,
        )
        return response_tags

@@ -70,41 +74,53 @@ if __name__ == "__main__":

    # Mapping between dataset entities and Presidio entities. Key: Dataset entity, Value: Presidio entity
    entities_mapping = {
        'PERSON': 'PERSON',
        'EMAIL': 'EMAIL_ADDRESS',
        'CREDIT_CARD': 'CREDIT_CARD',
        'FIRST_NAME': 'PERSON',
        'PHONE_NUMBER': 'PHONE_NUMBER',
        'BIRTHDAY': 'DATE_TIME',
        'DATE': 'DATE_TIME',
        'DOMAIN': 'DOMAIN',
        'CITY': 'LOCATION',
        'ADDRESS': 'LOCATION',
        'IBAN': 'IBAN_CODE',
        'URL': 'DOMAIN_NAME',
        'US_SSN': 'US_SSN',
        'IP_ADDRESS': 'IP_ADDRESS',
        'ORGANIZATION': 'ORG',
        'O': 'O'
        "PERSON": "PERSON",
        "EMAIL": "EMAIL_ADDRESS",
        "CREDIT_CARD": "CREDIT_CARD",
        "FIRST_NAME": "PERSON",
        "PHONE_NUMBER": "PHONE_NUMBER",
        "BIRTHDAY": "DATE_TIME",
        "DATE": "DATE_TIME",
        "DOMAIN": "DOMAIN",
        "CITY": "LOCATION",
        "ADDRESS": "LOCATION",
        "IBAN": "IBAN_CODE",
        "URL": "DOMAIN_NAME",
        "US_SSN": "US_SSN",
        "IP_ADDRESS": "IP_ADDRESS",
        "ORGANIZATION": "ORG",
        "O": "O",
    }

    updated_samples = ModelEvaluator.align_input_samples_to_presidio_analyzer(input_samples,
                                                                              entities_mapping)
    updated_samples = ModelEvaluator.align_input_samples_to_presidio_analyzer(
        input_samples, entities_mapping
    )

    flatten = lambda l: [item for sublist in l for item in sublist]
    from collections import Counter

    count_per_entity = Counter(
        [span.entity_type for span in flatten([input_sample.spans for input_sample in updated_samples])])
        [
            span.entity_type
            for span in flatten(
                [input_sample.spans for input_sample in updated_samples]
            )
        ]
    )

    print("Evaluating samples")
    analyzer = PresidioAnalyzer(entities_to_keep=count_per_entity.keys())
    evaluated_samples = analyzer.evaluate_all(updated_samples)
    #
    print("Estimating metrics")
    precision, recall, \
    entity_recall, entity_precision, \
    f, errors = analyzer.calculate_score(evaluation_results=evaluated_samples, beta=2.5)
    (
        precision,
        recall,
        entity_recall,
        entity_precision,
        f,
        errors,
    ) = analyzer.calculate_score(evaluation_results=evaluated_samples, beta=2.5)
    #
    print("precision: {}".format(precision))
    print("Recall: {}".format(recall))

@@ -112,22 +128,24 @@ if __name__ == "__main__":
    print("Precision per entity: {}".format(entity_precision))
    print("Recall per entity: {}".format(entity_recall))
    #
    FN_mistakes = [mistake for mistake in flatten(errors) if mistake[0:2] == 'FN']
    FP_mistakes = [mistake for mistake in flatten(errors) if mistake[0:2] == 'FP']
    other_mistakes = [mistake for mistake in flatten(errors) if "Wrong entity" in mistake]
    FN_mistakes = [mistake for mistake in flatten(errors) if mistake[0:2] == "FN"]
    FP_mistakes = [mistake for mistake in flatten(errors) if mistake[0:2] == "FP"]
    other_mistakes = [
        mistake for mistake in flatten(errors) if "Wrong entity" in mistake
    ]

    fn = open('../data/fn_30000.txt', 'w+', encoding='utf-8')
    fn1 = '\n'.join(FN_mistakes)
    fn = open("../data/fn_30000.txt", "w+", encoding="utf-8")
    fn1 = "\n".join(FN_mistakes)
    fn.write(fn1)
    fn.close()

    fp = open('../data/fp_30000.txt', 'w+', encoding='utf-8')
    fp1 = '\n'.join(FP_mistakes)
    fp = open("../data/fp_30000.txt", "w+", encoding="utf-8")
    fp1 = "\n".join(FP_mistakes)
    fp.write(fp1)
    fp.close()

    mistakes_file = open('../data/mistakes_30000.txt', 'w+', encoding='utf-8')
    mistakes1 = '\n'.join(other_mistakes)
    mistakes_file = open("../data/mistakes_30000.txt", "w+", encoding="utf-8")
    mistakes1 = "\n".join(other_mistakes)
    mistakes_file.write(mistakes1)
    mistakes_file.close()
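For orientation, the following is a minimal, hypothetical sketch of driving the evaluation wrapper above, mirroring the `__main__` block in this file; the dataset path and the entity list passed to `entities_to_keep` are assumptions, and `presidio_analyzer` must be installed.

# Hypothetical usage sketch of the PresidioAnalyzer wrapper shown above.
from presidio_evaluator import ModelEvaluator
from presidio_evaluator.data_generator import read_synth_dataset
from presidio_evaluator.presidio_analyzer import PresidioAnalyzer

# Read a synthetic dataset (the path is an assumption).
input_samples = read_synth_dataset("data/generated_small.txt")

# Map dataset entity names to Presidio entity names (subset of the mapping above).
entities_mapping = {"PERSON": "PERSON", "EMAIL": "EMAIL_ADDRESS", "O": "O"}
updated_samples = ModelEvaluator.align_input_samples_to_presidio_analyzer(
    input_samples, entities_mapping
)

# Evaluate and unpack scores exactly as the __main__ block does.
analyzer = PresidioAnalyzer(entities_to_keep=["PERSON", "EMAIL_ADDRESS"])
evaluated_samples = analyzer.evaluate_all(updated_samples)
(
    precision,
    recall,
    entity_recall,
    entity_precision,
    f,
    errors,
) = analyzer.calculate_score(evaluation_results=evaluated_samples, beta=2.5)
print("precision: {}, recall: {}".format(precision, recall))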
@@ -1,40 +1,50 @@
'''
"""
Presidio Analyzer not yet on PyPI, therefore it cannot be referenced explicitly
'''
"""

import math
from typing import List, Tuple, Dict

from presidio_evaluator import ModelEvaluator, InputSample
from presidio_analyzer.nlp_engine import SpacyNlpEngine

from presidio_evaluator import ModelEvaluator, InputSample, EvaluationResult
from presidio_evaluator.span_to_tag import span_to_tag


class PresidioRecognizerEvaluator(ModelEvaluator):
    def __init__(self, recognizer, nlp_engine, entities_to_keep=None,
                 with_nlp_artifacts=False, verbose=False, compare_by_io=True,
                 ):
    def __init__(
        self,
        recognizer,
        nlp_engine,
        entities_to_keep=None,
        with_nlp_artifacts=False,
        verbose=False,
        compare_by_io=True,
    ):
        """
        Evaluator for one recognizer
        :param recognizer: An object of type EntityRecognizer (in presidion-analyzer)
        :param nlp_engine: An object of type NlpEngine, e.g. SpacyNlpEngine (in presidio-analyzer)
        """
        super().__init__(entities_to_keep=entities_to_keep,
                         verbose=verbose, compare_by_io=compare_by_io)
        super().__init__(
            entities_to_keep=entities_to_keep,
            verbose=verbose,
            compare_by_io=compare_by_io,
        )
        self.withNlpArtifacts = with_nlp_artifacts
        self.recognizer = recognizer
        self.nlp_engine = nlp_engine

    #
    def __make_nlp_artifacts(self, text: str):
        return self.nlp_engine.process_text(text, 'en')
        return self.nlp_engine.process_text(text, "en")

    #
    def predict(self, sample: InputSample) -> List[str]:
        nlpArtifacts = None
        if self.withNlpArtifacts:
            nlpArtifacts = self.__make_nlp_artifacts(sample.full_text)
        results = self.recognizer.analyze(sample.full_text, self.entities,
                                          nlpArtifacts)
        results = self.recognizer.analyze(sample.full_text, self.entities, nlpArtifacts)
        starts = []
        ends = []
        tags = []

@@ -46,37 +56,33 @@ class PresidioRecognizerEvaluator(ModelEvaluator):
            ends.append(res.end)
            tags.append(res.entity_type)
            scores.append(res.score)
        response_tags = span_to_tag(scheme=self.labeling_scheme,
                                    text=sample.full_text,
                                    start=starts,
                                    end=ends,
                                    tag=tags,
                                    tokens=sample.tokens,
                                    scores=scores,
                                    io_tags_only=self.compare_by_io)
        response_tags = span_to_tag(
            scheme=self.labeling_scheme,
            text=sample.full_text,
            start=starts,
            end=ends,
            tag=tags,
            tokens=sample.tokens,
            scores=scores,
            io_tags_only=self.compare_by_io,
        )
        if len(sample.tags) == 0:
            sample.tags = ['0' for word in response_tags]
            sample.tags = ["0" for word in response_tags]
        return response_tags


def score_presidio_recognizer(recognizer, entities_to_keep, input_samples,
                              withNlpArtifacts=False) \
        -> Tuple[Dict[str, float], Dict[str, float], Dict[str, float], Dict[
            str, float], Dict[str, float], List[str]]:
    model = PresidioRecognizerEvaluator(recognizer=recognizer,
                                        entities_to_keep=entities_to_keep,
                                        with_nlp_artifacts=withNlpArtifacts)
def score_presidio_recognizer(
    recognizer, entities_to_keep, input_samples, withNlpArtifacts=False
) -> EvaluationResult:
    model = PresidioRecognizerEvaluator(
        recognizer=recognizer,
        entities_to_keep=entities_to_keep,
        nlp_engine=SpacyNlpEngine(),
        with_nlp_artifacts=withNlpArtifacts,
    )
    evaluated_samples = model.evaluate_all(input_samples[:])
    precision, recall, ent_recall, \
    ent_precision, fscore, mistakes = model.calculate_score(
        evaluated_samples, beta=2.5)
    print("p={precision}, r={recall},f={f},"
          "entity recall={ent},entity precision={prec}".format(
              precision=precision,
              recall=recall,
              f=fscore,
              ent=ent_recall,
              prec=ent_precision))
    if math.isnan(precision):
        precision = 0
    return precision, recall, ent_recall, ent_precision, fscore, mistakes
    evaluation_result = model.calculate_score(evaluated_samples, beta=2.5)
    evaluation_result.print()
    if math.isnan(evaluation_result.pii_precision):
        evaluation_result.pii_precision = 0
    return evaluation_result
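As a quick illustration of the helper above, here is a hypothetical sketch of scoring a single predefined recognizer against a synthetic dataset, in the same spirit as the tests further down; the dataset path is an assumption, and the spaCy NLP engine is constructed inside the helper.

# Hypothetical sketch: score one Presidio recognizer on synthetic samples.
from presidio_analyzer.predefined_recognizers.credit_card_recognizer import CreditCardRecognizer

from presidio_evaluator.data_generator import read_synth_dataset
from presidio_evaluator.presidio_recognizer_evaluator import score_presidio_recognizer

samples = read_synth_dataset("data/generated_small.txt")  # path is an assumption
result = score_presidio_recognizer(CreditCardRecognizer(), "CREDIT_CARD", samples)
# score_presidio_recognizer returns an EvaluationResult, as in the code above.
print(result.pii_precision, result.pii_recall, result.pii_f)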
@@ -15,3 +15,4 @@ regex
#flair
sklearn_crfsuite
pytest
presidio_analyzer
@@ -1,7 +1,9 @@
import pytest

try:
    from flair.models import SequenceTagger
except ImportError:
    print("Flair is not installed by default")
except:
    ImportError("Flair is not installed by default")

from presidio_evaluator.data_generator import read_synth_dataset
from presidio_evaluator.flair_evaluator import FlairEvaluator

@@ -9,18 +11,26 @@ from presidio_evaluator.flair_evaluator import FlairEvaluator
import numpy as np

# no-unit because flair is not a dependency by default
def no_unit_test_flair_simple():
@pytest.mark.skip(reason="Flair not installed by default")
def test_flair_simple():
    import os

    dir_path = os.path.dirname(os.path.realpath(__file__))
    input_samples = read_synth_dataset(os.path.join(dir_path, "data/generated_small.txt"))
    input_samples = read_synth_dataset(
        os.path.join(dir_path, "data/generated_small.txt")
    )

    model = SequenceTagger.load('ner-ontonotes-fast')  # .load('ner')
    model = SequenceTagger.load("ner-ontonotes-fast")  # .load('ner')

    flair_evaluator = FlairEvaluator(model=model, entities_to_keep=['PERSON'])
    flair_evaluator = FlairEvaluator(model=model, entities_to_keep=["PERSON"])
    evaluation_results = flair_evaluator.evaluate_all(input_samples)
    scores = flair_evaluator.calculate_score(evaluation_results)

    np.testing.assert_almost_equal(scores.pii_precision, scores.entity_precision_dict['PERSON'])
    np.testing.assert_almost_equal(scores.pii_recall, scores.entity_recall_dict['PERSON'])
    np.testing.assert_almost_equal(
        scores.pii_precision, scores.entity_precision_dict["PERSON"]
    )
    np.testing.assert_almost_equal(
        scores.pii_recall, scores.entity_recall_dict["PERSON"]
    )
    assert scores.pii_recall > 0
    assert scores.pii_precision > 0
@@ -1,80 +1,101 @@
'''
Presidio Analyzer not yet on PyPI, ignoring temporarily
'''
#
# import pytest
#
# from presidio_evaluator import InputSample, Span
# from presidio_evaluator.data_generator import read_synth_dataset
# from presidio_evaluator.presidio_analyzer import PresidioAnalyzer
#
#
# class GeneratedTextTestCase:
#     def __init__(self, test_name, test_input, acceptance_threshold, marks):
#         self.test_name = test_name
#         self.test_input = test_input
#         self.acceptance_threshold = acceptance_threshold
#         self.marks = marks
#
#     def to_pytest_param(self):
#         return pytest.param(self.test_input, self.acceptance_threshold,
#                             id=self.test_name, marks=self.marks)
#
#
# # generated-text test cases
# analyzer_test_generate_text_testdata = [
#     # small set fixture which expects all results.
#     GeneratedTextTestCase(
#         test_name="small-set",
#         test_input="{}/data/generated_small.txt",
#         acceptance_threshold=0.3,
#         marks=pytest.mark.none
#     )
# ]
#
#
# @pytest.mark.skip(reason="Presidio analyzer not on PyPi")
# def test_analyzer_simple_input():
#     model = PresidioAnalyzer(entities_to_keep=['PERSON'])
#
#     sample = InputSample(full_text="My name is Mike",
#                          masked="My name is [PERSON]",
#                          spans=[Span('PERSON', 'Mike', 10, 14)],
#                          create_tags_from_span=True)
#
#     evaluated = model.evaluate_sample(sample)
#     metrics = model.calculate_score(
#         [evaluated])
#
#     assert metrics.pii_precision == 1
#     assert metrics.pii_recall == 1
#
#
# # analyzer tests on generated data
# @pytest.mark.skip(reason="Presidio analyzer not on PyPi")
# @pytest.mark.parametrize("test_input,acceptance_threshold",
#                          [testcase.to_pytest_param() for testcase in
#                           analyzer_test_generate_text_testdata])
# def test_analyzer_with_generated_text(test_input, acceptance_threshold):
#     """
#     Test analyzer with a generated dataset text file
#     :param test_input: input text file location
#     :param acceptance_threshold: minimim precision/recall
#            allowed for tests to pass
#     """
#     # read test input from generated file
#
#     import os
#     dir_path = os.path.dirname(os.path.realpath(__file__))
#     input_samples = read_synth_dataset(
#         test_input.format(dir_path))
#
#     updated_samples = PresidioAnalyzer. \
#         align_input_samples_to_presidio_analyzer(input_samples)
#
#     analyzer = PresidioAnalyzer()
#     evaluated_samples = analyzer.evaluate_all(updated_samples)
#     scores = analyzer.calculate_score(evaluation_results=evaluated_samples)
#
#     assert acceptance_threshold <= scores.pii_precision
#     assert acceptance_threshold <= scores.pii_recall
import pytest

from presidio_evaluator import InputSample, Span
from presidio_evaluator.data_generator import read_synth_dataset
from presidio_evaluator.presidio_analyzer import PresidioAnalyzer

# Mapping between dataset entities and Presidio entities. Key: Dataset entity, Value: Presidio entity
entities_mapping = {
    "PERSON": "PERSON",
    "EMAIL": "EMAIL_ADDRESS",
    "CREDIT_CARD": "CREDIT_CARD",
    "FIRST_NAME": "PERSON",
    "PHONE_NUMBER": "PHONE_NUMBER",
    "BIRTHDAY": "DATE_TIME",
    "DATE": "DATE_TIME",
    "DOMAIN": "DOMAIN",
    "CITY": "LOCATION",
    "ADDRESS": "LOCATION",
    "IBAN": "IBAN_CODE",
    "URL": "DOMAIN_NAME",
    "US_SSN": "US_SSN",
    "IP_ADDRESS": "IP_ADDRESS",
    "ORGANIZATION": "ORG",
    "O": "O",
}


class GeneratedTextTestCase:
    def __init__(self, test_name, test_input, acceptance_threshold, marks):
        self.test_name = test_name
        self.test_input = test_input
        self.acceptance_threshold = acceptance_threshold
        self.marks = marks

    def to_pytest_param(self):
        return pytest.param(
            self.test_input,
            self.acceptance_threshold,
            id=self.test_name,
            marks=self.marks,
        )


# generated-text test cases
analyzer_test_generate_text_testdata = [
    # small set fixture which expects all results.
    GeneratedTextTestCase(
        test_name="small-set",
        test_input="{}/data/generated_small.txt",
        acceptance_threshold=0.3,
        marks=pytest.mark.none,
    )
]


def test_analyzer_simple_input():
    model = PresidioAnalyzer(entities_to_keep=["PERSON"])

    sample = InputSample(
        full_text="My name is Mike",
        masked="My name is [PERSON]",
        spans=[Span("PERSON", "Mike", 10, 14)],
        create_tags_from_span=True,
    )

    evaluated = model.evaluate_sample(sample)
    metrics = model.calculate_score([evaluated])

    assert metrics.pii_precision == 1
    assert metrics.pii_recall == 1


# analyzer tests on generated data
@pytest.mark.parametrize(
    "test_input,acceptance_threshold",
    [testcase.to_pytest_param() for testcase in analyzer_test_generate_text_testdata],
)
def test_analyzer_with_generated_text(test_input, acceptance_threshold):
    """
    Test analyzer with a generated dataset text file
    :param test_input: input text file location
    :param acceptance_threshold: minimim precision/recall
           allowed for tests to pass
    """
    # read test input from generated file

    import os

    dir_path = os.path.dirname(os.path.realpath(__file__))
    input_samples = read_synth_dataset(test_input.format(dir_path))

    updated_samples = PresidioAnalyzer.align_input_samples_to_presidio_analyzer(
        input_samples=input_samples, entities_mapping=entities_mapping
    )

    analyzer = PresidioAnalyzer()
    evaluated_samples = analyzer.evaluate_all(updated_samples)
    scores = analyzer.calculate_score(evaluation_results=evaluated_samples)

    assert acceptance_threshold <= scores.pii_precision
    assert acceptance_threshold <= scores.pii_recall
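The test data above and below relies on custom pytest marks (none, slow, inconclusive). Registering them is not part of this change; the snippet below is only a hedged sketch of one way a hypothetical conftest.py could declare the marks so pytest does not warn about them and slow tests can be deselected with -m "not slow".

# Hypothetical conftest.py sketch (not part of this diff): register the custom
# marks used in these tests so `pytest -m "not slow"` can deselect the slow ones.
def pytest_configure(config):
    for mark in ("none", "slow", "inconclusive"):
        config.addinivalue_line("markers", "{}: custom mark used by these tests".format(mark))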
@@ -1,62 +1,58 @@
'''
Presidio Analyzer not yet on PyPI, ignoring temporarily
'''
from presidio_evaluator.data_generator import read_synth_dataset
from presidio_evaluator.presidio_recognizer_evaluator import score_presidio_recognizer
import pytest

# from presidio_evaluator.data_generator import read_synth_dataset
# from presidio_evaluator.presidio_recognizer_evaluator import score_presidio_recognizer
# import pytest
#
# from analyzer.predefined_recognizers.credit_card_recognizer import CreditCardRecognizer
#
# # test case parameters for tests with dataset which was previously generated.
# class GeneratedTextTestCase:
#     def __init__(self, test_name, test_input, acceptance_threshold, marks):
#         self.test_name = test_name
#         self.test_input = test_input
#         self.acceptance_threshold = acceptance_threshold
#         self.marks = marks
#
#     def to_pytest_param(self):
#         return pytest.param(self.test_input, self.acceptance_threshold,
#                             id=self.test_name, marks=self.marks)
#
#
# # generated-text test cases
# cc_test_generate_text_testdata = [
#     # small set fixture which expects all type results.
#     GeneratedTextTestCase(
#         test_name="small-set",
#         test_input="{}/data/generated_small.txt",
#         acceptance_threshold=1,
#         marks=pytest.mark.none
#     ),
#     # large set fixture which expects all type results. marked as "slow"
#     GeneratedTextTestCase(
#         test_name="large_set",
#         test_input="{}/data/generated_large.txt",
#         acceptance_threshold=1,
#         marks=pytest.mark.slow
#     )
# ]
#
#
# # credit card recognizer tests on generated data
# @pytest.mark.parametrize("test_input,acceptance_threshold",
#                          [testcase.to_pytest_param()
#                           for testcase in cc_test_generate_text_testdata])
# def test_credit_card_recognizer_with_generated_text(test_input, acceptance_threshold):
#     """
#     Test credit card recognizer with a generated dataset text file
#     :param test_input: input text file location
#     :param acceptance_threshold: minimim precision/recall
#            allowed for tests to pass
#     """
#
#     # read test input from generated file
#     import os
#     dir_path = os.path.dirname(os.path.realpath(__file__))
#     input_samples = read_synth_dataset(
#         test_input.format(dir_path))
#     scores = score_presidio_recognizer(
#         CreditCardRecognizer(), 'CREDIT_CARD', input_samples)
#     assert acceptance_threshold <= scores.pii_f
from presidio_analyzer.predefined_recognizers.credit_card_recognizer import CreditCardRecognizer

# test case parameters for tests with dataset which was previously generated.
class GeneratedTextTestCase:
    def __init__(self, test_name, test_input, acceptance_threshold, marks):
        self.test_name = test_name
        self.test_input = test_input
        self.acceptance_threshold = acceptance_threshold
        self.marks = marks

    def to_pytest_param(self):
        return pytest.param(self.test_input, self.acceptance_threshold,
                            id=self.test_name, marks=self.marks)


# generated-text test cases
cc_test_generate_text_testdata = [
    # small set fixture which expects all type results.
    GeneratedTextTestCase(
        test_name="small-set",
        test_input="{}/data/generated_small.txt",
        acceptance_threshold=1,
        marks=pytest.mark.none
    ),
    # large set fixture which expects all type results. marked as "slow"
    GeneratedTextTestCase(
        test_name="large_set",
        test_input="{}/data/generated_large.txt",
        acceptance_threshold=1,
        marks=pytest.mark.slow
    )
]


# credit card recognizer tests on generated data
@pytest.mark.parametrize("test_input,acceptance_threshold",
                         [testcase.to_pytest_param()
                          for testcase in cc_test_generate_text_testdata])
def test_credit_card_recognizer_with_generated_text(test_input, acceptance_threshold):
    """
    Test credit card recognizer with a generated dataset text file
    :param test_input: input text file location
    :param acceptance_threshold: minimim precision/recall
           allowed for tests to pass
    """

    # read test input from generated file
    import os
    dir_path = os.path.dirname(os.path.realpath(__file__))
    input_samples = read_synth_dataset(
        test_input.format(dir_path))
    scores = score_presidio_recognizer(
        CreditCardRecognizer(), 'CREDIT_CARD', input_samples)
    assert acceptance_threshold <= scores.pii_f
@@ -1,83 +1,79 @@
'''
Presidio Analyzer not yet on PyPI, ignoring temporarily
'''
from presidio_evaluator.data_generator import generate
from presidio_evaluator.presidio_recognizer_evaluator import \
    score_presidio_recognizer
import pytest
import numpy as np

# from presidio_evaluator.data_generator import generate
# from presidio_evaluator.presidio_recognizer_evaluator import \
#     score_presidio_recognizer
# import pytest
# import numpy as np
#
# from analyzer.predefined_recognizers.credit_card_recognizer import CreditCardRecognizer
#
# # test case parameters for tests with dataset generated from a template and csv values
# class TemplateTextTestCase:
#     def __init__(self, test_name, pii_csv, utterances, dictionary_path,
#                  num_of_examples, acceptance_threshold, marks):
#         self.test_name = test_name
#         self.pii_csv = pii_csv
#         self.utterances = utterances
#         self.dictionary_path = dictionary_path
#         self.num_of_examples = num_of_examples
#         self.acceptance_threshold = acceptance_threshold
#         self.marks = marks
#
#     def to_pytest_param(self):
#         return pytest.param(self.pii_csv, self.utterances, self.dictionary_path,
#                             self.num_of_examples, self.acceptance_threshold,
#                             id=self.test_name, marks=self.marks)
#
#
# # template-dataset test cases
# cc_test_template_testdata = [
#     # large dataset fixture. marked as slow
#     TemplateTextTestCase(
#         test_name="fake-names-100",
#         pii_csv="{}/data/FakeNameGenerator.com_100.csv",
#         utterances="{}/data/templates.txt",
#         dictionary_path="{}/data/Dictionary_test.csv",
#         num_of_examples=100,
#         acceptance_threshold=0.9,
#         marks=pytest.mark.slow
#     )
# ]
#
#
# # credit card recognizer tests on template-generates data
# @pytest.mark.parametrize("pii_csv, "
#                          "utterances, "
#                          "dictionary_path, "
#                          "num_of_examples, "
#                          "acceptance_threshold",
#                          [testcase.to_pytest_param()
#                           for testcase in cc_test_template_testdata])
# def test_credit_card_recognizer_with_template(pii_csv, utterances,
#                                               dictionary_path,
#                                               num_of_examples,
#                                               acceptance_threshold):
#     """
#     Test credit card recognizer with a dataset generated from
#     template and a CSV values file
#     :param pii_csv: input csv file location
#     :param utterances: template file location
#     :param dictionary_path: dictionary/vocabulary file location
#     :param num_of_examples: number of samples to be used from dataset
#            to test
#     :param acceptance_threshold: minimim precision/recall
#            allowed for tests to pass
#     """
#
#     # read template and CSV files
#     import os
#     dir_path = os.path.dirname(os.path.realpath(__file__))
#
#     input_samples = generate(fake_pii_csv=pii_csv.format(dir_path),
#                              utterances_file=utterances.format(dir_path),
#                              dictionary_path=dictionary_path.format(dir_path),
#                              lower_case_ratio=0.5,
#                              num_of_examples=num_of_examples)
#
#     scores = score_presidio_recognizer(
#         CreditCardRecognizer(), 'CREDIT_CARD', input_samples)
#     if not np.isnan(scores.pii_f):
#         assert acceptance_threshold <= scores.pii_f
from presidio_analyzer.predefined_recognizers.credit_card_recognizer import CreditCardRecognizer

# test case parameters for tests with dataset generated from a template and csv values
class TemplateTextTestCase:
    def __init__(self, test_name, pii_csv, utterances, dictionary_path,
                 num_of_examples, acceptance_threshold, marks):
        self.test_name = test_name
        self.pii_csv = pii_csv
        self.utterances = utterances
        self.dictionary_path = dictionary_path
        self.num_of_examples = num_of_examples
        self.acceptance_threshold = acceptance_threshold
        self.marks = marks

    def to_pytest_param(self):
        return pytest.param(self.pii_csv, self.utterances, self.dictionary_path,
                            self.num_of_examples, self.acceptance_threshold,
                            id=self.test_name, marks=self.marks)


# template-dataset test cases
cc_test_template_testdata = [
    # large dataset fixture. marked as slow
    TemplateTextTestCase(
        test_name="fake-names-100",
        pii_csv="{}/data/FakeNameGenerator.com_100.csv",
        utterances="{}/data/templates.txt",
        dictionary_path="{}/data/Dictionary_test.csv",
        num_of_examples=100,
        acceptance_threshold=0.9,
        marks=pytest.mark.slow
    )
]


# credit card recognizer tests on template-generates data
@pytest.mark.parametrize("pii_csv, "
                         "utterances, "
                         "dictionary_path, "
                         "num_of_examples, "
                         "acceptance_threshold",
                         [testcase.to_pytest_param()
                          for testcase in cc_test_template_testdata])
def test_credit_card_recognizer_with_template(pii_csv, utterances,
                                              dictionary_path,
                                              num_of_examples,
                                              acceptance_threshold):
    """
    Test credit card recognizer with a dataset generated from
    template and a CSV values file
    :param pii_csv: input csv file location
    :param utterances: template file location
    :param dictionary_path: dictionary/vocabulary file location
    :param num_of_examples: number of samples to be used from dataset
           to test
    :param acceptance_threshold: minimim precision/recall
           allowed for tests to pass
    """

    # read template and CSV files
    import os
    dir_path = os.path.dirname(os.path.realpath(__file__))

    input_samples = generate(fake_pii_csv=pii_csv.format(dir_path),
                             utterances_file=utterances.format(dir_path),
                             dictionary_path=dictionary_path.format(dir_path),
                             lower_case_ratio=0.5,
                             num_of_examples=num_of_examples)

    scores = score_presidio_recognizer(
        CreditCardRecognizer(), 'CREDIT_CARD', input_samples)
    if not np.isnan(scores.pii_f):
        assert acceptance_threshold <= scores.pii_f
@@ -1,148 +1,144 @@
'''
Presidio Analyzer not yet on PyPI, ignoring temporarily
'''
from presidio_evaluator.data_generator import FakeDataGenerator
from presidio_evaluator.presidio_recognizer_evaluator import \
    score_presidio_recognizer
import pandas as pd
import pytest
import numpy as np

# from presidio_evaluator.data_generator import FakeDataGenerator
# from presidio_evaluator.presidio_recognizer_evaluator import \
#     score_presidio_recognizer
# import pandas as pd
# import pytest
# import numpy as np
#
# from analyzer import Pattern, PatternRecognizer
#
# # test case parameters for tests with dataset generated from a template and
# # two csv value files, one containing the common-entities and another one with custom entities
# class PatternRecognizerTestCase:
#     def __init__(self, test_name, entity_name, pattern, score, pii_csv, ext_csv,
#                  utterances, dictionary_path, num_of_examples, acceptance_threshold,
#                  max_mistakes_number, marks):
#         self.test_name = test_name
#         self.entity_name = entity_name
#         self.pattern = pattern
#         self.score = score
#         self.pii_csv = pii_csv
#         self.ext_csv = ext_csv
#         self.utterances = utterances
#         self.dictionary_path = dictionary_path
#         self.num_of_examples = num_of_examples
#         self.acceptance_threshold = acceptance_threshold
#         self.max_mistakes_number = max_mistakes_number
#         self.marks = marks
#
#     def to_pytest_param(self):
#         return pytest.param(self.pii_csv, self.ext_csv, self.utterances,
#                             self.dictionary_path,
#                             self.entity_name, self.pattern, self.score,
#                             self.num_of_examples, self.acceptance_threshold,
#                             self.max_mistakes_number, id=self.test_name,
#                             marks=self.marks)
#
#
# # template-dataset test cases
# rocket_test_template_testdata = [
#     # large dataset fixture. marked as slow.
#     # all input is correct, test is conclusive
#     PatternRecognizerTestCase(
#         test_name="rocket-no-errors",
#         entity_name="ROCKET",
#         pattern=r'\W*(rocket)\W*',
#         score=0.8,
#         pii_csv="{}/data/FakeNameGenerator.com_100.csv",
#         ext_csv="{}/data/FakeRocketGenerator.csv",
#         utterances="{}/data/rocket_example_sentences.txt",
#         dictionary_path="{}/data/Dictionary_test.csv",
#         num_of_examples=100,
#         acceptance_threshold=1,
#         max_mistakes_number=0,
#         marks=pytest.mark.slow
#     ),
#     # large dataset fixture. marked as slow
#     # all input is correct, test is conclusive
#     PatternRecognizerTestCase(
#         test_name="rocket-all-errors",
#         entity_name="ROCKET",
#         pattern=r'\W*(rocket)\W*',
#         score=0.8,
#         pii_csv="{}/data/FakeNameGenerator.com_100.csv",
#         ext_csv="{}/data/FakeRocketErrorsGenerator.csv",
#         utterances="{}/data/rocket_example_sentences.txt",
#         dictionary_path="{}/data/Dictionary_test.csv",
#         num_of_examples=100,
#         acceptance_threshold=0,
#         max_mistakes_number=100,
#         marks=pytest.mark.slow
#     ),
#     # large dataset fixture. marked as slow
#     # some input is correct some is not, test is inconclusive
#     PatternRecognizerTestCase(
#         test_name="rocket-some-errors",
#         entity_name="ROCKET",
#         pattern=r'\W*(rocket)\W*',
#         score=0.8,
#         pii_csv="{}/data/FakeNameGenerator.com_100.csv",
#         ext_csv="{}/data/FakeRocket50PercentErrorsGenerator.csv",
#         utterances="{}/data/rocket_example_sentences.txt",
#         dictionary_path="{}/data/Dictionary_test.csv",
#         num_of_examples=100,
#         acceptance_threshold=0.3,
#         max_mistakes_number=70,
#         marks=[pytest.mark.slow, pytest.mark.inconclusive]
#     )
# ]
#
#
# @pytest.mark.parametrize(
#     "pii_csv, ext_csv, utterances, dictionary_path, "
#     "entity_name, pattern, score, num_of_examples, "
#     "acceptance_threshold, max_mistakes_number",
#     [testcase.to_pytest_param()
#      for testcase in rocket_test_template_testdata])
# def test_pattern_recognizer(pii_csv, ext_csv, utterances, dictionary_path,
#                             entity_name, pattern,
#                             score, num_of_examples, acceptance_threshold,
#                             max_mistakes_number):
#     """
#     Test generic pattern recognizer with a dataset generated from template, a CSV values file with common entities
#     and another CSV values file with a custom entity
#     :param pii_csv: input csv file location with the common entities
#     :param ext_csv: input csv file location with custom entities
#     :param utterances: template file location
#     :param dictionary_path: vocabulary/dictionary file location
#     :param entity_name: custom entity name
#     :param pattern: recognizer pattern
#     :param num_of_examples: number of samples to be used from dataset to test
#     :param acceptance_threshold: minimim precision/recall
#            allowed for tests to pass
#     """
#
#     import os
#     dir_path = os.path.dirname(os.path.realpath(__file__))
#     dfpii = pd.read_csv(pii_csv.format(dir_path), encoding='utf-8')
#     dfext = pd.read_csv(ext_csv.format(dir_path), encoding='utf-8')
#     dictionary_path = dictionary_path.format(dir_path)
#     ext_column_name = dfext.columns[0]
#
#     def get_from_ext(i):
#         index = i % dfext.shape[0]
#         return dfext.iat[index, 0]
#
#     # extend pii with ext data
#     dfpii[ext_column_name] = [get_from_ext(i) for i in range(0, dfpii.shape[0])]
#
#     # generate examples
#     generator = FakeDataGenerator(fake_pii_csv_file=dfpii,
#                                   utterances_file=utterances.format(dir_path),
#                                   dictionary_path=dictionary_path)
#     examples = generator.sample_examples(num_of_examples)
#
#     pattern = Pattern("test pattern", pattern, score)
#     pattern_recognizer = PatternRecognizer(entity_name,
#                                            name="test recognizer",
#                                            patterns=[pattern])
#
#     scores = score_presidio_recognizer(
#         pattern_recognizer, [entity_name], examples)
#     if not np.isnan(scores.pii_f):
#         assert acceptance_threshold <= scores.pii_f
#         assert max_mistakes_number >= len(scores.model_errors)
from presidio_analyzer import Pattern, PatternRecognizer

# test case parameters for tests with dataset generated from a template and
# two csv value files, one containing the common-entities and another one with custom entities
class PatternRecognizerTestCase:
    def __init__(self, test_name, entity_name, pattern, score, pii_csv, ext_csv,
                 utterances, dictionary_path, num_of_examples, acceptance_threshold,
                 max_mistakes_number, marks):
        self.test_name = test_name
        self.entity_name = entity_name
        self.pattern = pattern
        self.score = score
        self.pii_csv = pii_csv
        self.ext_csv = ext_csv
        self.utterances = utterances
        self.dictionary_path = dictionary_path
        self.num_of_examples = num_of_examples
        self.acceptance_threshold = acceptance_threshold
        self.max_mistakes_number = max_mistakes_number
        self.marks = marks

    def to_pytest_param(self):
        return pytest.param(self.pii_csv, self.ext_csv, self.utterances,
                            self.dictionary_path,
                            self.entity_name, self.pattern, self.score,
                            self.num_of_examples, self.acceptance_threshold,
                            self.max_mistakes_number, id=self.test_name,
                            marks=self.marks)


# template-dataset test cases
rocket_test_template_testdata = [
    # large dataset fixture. marked as slow.
    # all input is correct, test is conclusive
    PatternRecognizerTestCase(
        test_name="rocket-no-errors",
        entity_name="ROCKET",
        pattern=r'\W*(rocket)\W*',
        score=0.8,
        pii_csv="{}/data/FakeNameGenerator.com_100.csv",
        ext_csv="{}/data/FakeRocketGenerator.csv",
        utterances="{}/data/rocket_example_sentences.txt",
        dictionary_path="{}/data/Dictionary_test.csv",
        num_of_examples=100,
        acceptance_threshold=1,
        max_mistakes_number=0,
        marks=pytest.mark.slow
    ),
    # large dataset fixture. marked as slow
    # all input is correct, test is conclusive
    PatternRecognizerTestCase(
        test_name="rocket-all-errors",
        entity_name="ROCKET",
        pattern=r'\W*(rocket)\W*',
        score=0.8,
        pii_csv="{}/data/FakeNameGenerator.com_100.csv",
        ext_csv="{}/data/FakeRocketErrorsGenerator.csv",
        utterances="{}/data/rocket_example_sentences.txt",
        dictionary_path="{}/data/Dictionary_test.csv",
        num_of_examples=100,
        acceptance_threshold=0,
        max_mistakes_number=100,
        marks=pytest.mark.slow
    ),
    # large dataset fixture. marked as slow
    # some input is correct some is not, test is inconclusive
    PatternRecognizerTestCase(
        test_name="rocket-some-errors",
        entity_name="ROCKET",
        pattern=r'\W*(rocket)\W*',
        score=0.8,
        pii_csv="{}/data/FakeNameGenerator.com_100.csv",
        ext_csv="{}/data/FakeRocket50PercentErrorsGenerator.csv",
        utterances="{}/data/rocket_example_sentences.txt",
        dictionary_path="{}/data/Dictionary_test.csv",
        num_of_examples=100,
        acceptance_threshold=0.3,
        max_mistakes_number=70,
        marks=[pytest.mark.slow, pytest.mark.inconclusive]
    )
]


@pytest.mark.parametrize(
    "pii_csv, ext_csv, utterances, dictionary_path, "
    "entity_name, pattern, score, num_of_examples, "
    "acceptance_threshold, max_mistakes_number",
    [testcase.to_pytest_param()
     for testcase in rocket_test_template_testdata])
def test_pattern_recognizer(pii_csv, ext_csv, utterances, dictionary_path,
                            entity_name, pattern,
                            score, num_of_examples, acceptance_threshold,
                            max_mistakes_number):
    """
    Test generic pattern recognizer with a dataset generated from template, a CSV values file with common entities
    and another CSV values file with a custom entity
    :param pii_csv: input csv file location with the common entities
    :param ext_csv: input csv file location with custom entities
    :param utterances: template file location
    :param dictionary_path: vocabulary/dictionary file location
    :param entity_name: custom entity name
    :param pattern: recognizer pattern
    :param num_of_examples: number of samples to be used from dataset to test
    :param acceptance_threshold: minimim precision/recall
           allowed for tests to pass
    """

    import os
    dir_path = os.path.dirname(os.path.realpath(__file__))
    dfpii = pd.read_csv(pii_csv.format(dir_path), encoding='utf-8')
    dfext = pd.read_csv(ext_csv.format(dir_path), encoding='utf-8')
    dictionary_path = dictionary_path.format(dir_path)
    ext_column_name = dfext.columns[0]

    def get_from_ext(i):
        index = i % dfext.shape[0]
        return dfext.iat[index, 0]

    # extend pii with ext data
    dfpii[ext_column_name] = [get_from_ext(i) for i in range(0, dfpii.shape[0])]

    # generate examples
    generator = FakeDataGenerator(fake_pii_csv_file=dfpii,
                                  utterances_file=utterances.format(dir_path),
                                  dictionary_path=dictionary_path)
    examples = generator.sample_examples(num_of_examples)

    pattern = Pattern("test pattern", pattern, score)
    pattern_recognizer = PatternRecognizer(entity_name,
                                           name="test recognizer",
                                           patterns=[pattern])

    scores = score_presidio_recognizer(
        pattern_recognizer, [entity_name], examples)
    if not np.isnan(scores.pii_f):
        assert acceptance_threshold <= scores.pii_f
        assert max_mistakes_number >= len(scores.model_errors)
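For reference, a condensed, hypothetical sketch of the custom-pattern flow this test exercises, using the same Pattern and PatternRecognizer calls; the sample list is assumed to come from the data generator as in the test body.

# Hypothetical sketch: build a throwaway pattern recognizer and score it,
# mirroring the Pattern / PatternRecognizer calls in the test above.
from presidio_analyzer import Pattern, PatternRecognizer

from presidio_evaluator.presidio_recognizer_evaluator import score_presidio_recognizer

rocket_pattern = Pattern("test pattern", r"\W*(rocket)\W*", 0.8)
rocket_recognizer = PatternRecognizer(
    "ROCKET", name="test recognizer", patterns=[rocket_pattern]
)

# `examples` would come from FakeDataGenerator.sample_examples(...) as in the test:
# scores = score_presidio_recognizer(rocket_recognizer, ["ROCKET"], examples)
# print(scores.pii_f)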
@@ -1,63 +1,59 @@
'''
Presidio Analyzer not yet on PyPI, ignoring temporarily
'''
from presidio_evaluator.data_generator import read_synth_dataset
from presidio_evaluator.presidio_recognizer_evaluator import \
    score_presidio_recognizer

# from presidio_evaluator.data_generator import read_synth_dataset
# from presidio_evaluator.presidio_recognizer_evaluator import \
#     score_presidio_recognizer
#
# import pytest
# from analyzer.predefined_recognizers.spacy_recognizer import SpacyRecognizer
#
# # test case parameters for tests with dataset which was previously generated.
# class GeneratedTextTestCase:
#     def __init__(self, test_name, test_input, acceptance_threshold, marks):
#         self.test_name = test_name
#         self.test_input = test_input
#         self.acceptance_threshold = acceptance_threshold
#         self.marks = marks
#
#     def to_pytest_param(self):
#         return pytest.param(self.test_input, self.acceptance_threshold,
#                             id=self.test_name, marks=self.marks)
#
#
# # generated-text test cases
# cc_test_generate_text_testdata = [
#     # small dataset, inconclusive results
#     GeneratedTextTestCase(
#         test_name="small-set",
#         test_input="{}/data/generated_small.txt",
#         acceptance_threshold=0.5,
#         marks=pytest.mark.inconclusive
#     ),
#     # large dataset - test is slow and inconclusive
#     GeneratedTextTestCase(
#         test_name="large-set",
#         test_input="{}/data/generated_large.txt",
#         acceptance_threshold=0.5,
#         marks=pytest.mark.slow
#     )
# ]
#
#
# # credit card recognizer tests on generated data
# @pytest.mark.parametrize("test_input,acceptance_threshold",
#                          [testcase.to_pytest_param() for testcase in
#                           cc_test_generate_text_testdata])
# def test_spacy_recognizer_with_generated_text(test_input, acceptance_threshold):
#     """
#     Test spacy recognizer with a generated dataset text file
#     :param test_input: input text file location
#     :param acceptance_threshold: minimim precision/recall
#            allowed for tests to pass
#     """
#
#     # read test input from generated file
#     import os
#     dir_path = os.path.dirname(os.path.realpath(__file__))
#     input_samples = read_synth_dataset(
#         test_input.format(dir_path))
#     scores = score_presidio_recognizer(
#         SpacyRecognizer(), ['PERSON'], input_samples, True)
#     assert acceptance_threshold <= scores.pii_f
import pytest
from presidio_analyzer.predefined_recognizers.spacy_recognizer import SpacyRecognizer

# test case parameters for tests with dataset which was previously generated.
class GeneratedTextTestCase:
    def __init__(self, test_name, test_input, acceptance_threshold, marks):
        self.test_name = test_name
        self.test_input = test_input
        self.acceptance_threshold = acceptance_threshold
        self.marks = marks

    def to_pytest_param(self):
        return pytest.param(self.test_input, self.acceptance_threshold,
                            id=self.test_name, marks=self.marks)


# generated-text test cases
cc_test_generate_text_testdata = [
    # small dataset, inconclusive results
    GeneratedTextTestCase(
        test_name="small-set",
        test_input="{}/data/generated_small.txt",
        acceptance_threshold=0.5,
        marks=pytest.mark.inconclusive
    ),
    # large dataset - test is slow and inconclusive
    GeneratedTextTestCase(
        test_name="large-set",
        test_input="{}/data/generated_large.txt",
        acceptance_threshold=0.5,
        marks=pytest.mark.slow
    )
]


# credit card recognizer tests on generated data
@pytest.mark.parametrize("test_input,acceptance_threshold",
                         [testcase.to_pytest_param() for testcase in
                          cc_test_generate_text_testdata])
def test_spacy_recognizer_with_generated_text(test_input, acceptance_threshold):
    """
    Test spacy recognizer with a generated dataset text file
    :param test_input: input text file location
    :param acceptance_threshold: minimim precision/recall
           allowed for tests to pass
    """

    # read test input from generated file
    import os
    dir_path = os.path.dirname(os.path.realpath(__file__))
    input_samples = read_synth_dataset(
        test_input.format(dir_path))
    scores = score_presidio_recognizer(
        SpacyRecognizer(), ['PERSON'], input_samples, True)
    assert acceptance_threshold <= scores.pii_f