diff --git a/presidio_evaluator/presidio_analyzer.py b/presidio_evaluator/presidio_analyzer.py index 162be17..2fb73fb 100644 --- a/presidio_evaluator/presidio_analyzer.py +++ b/presidio_evaluator/presidio_analyzer.py @@ -1,33 +1,31 @@ -''' -Presidio Analyzer not yet on PyPI, cannot explicitly reference it -''' +from typing import List + +from presidio_analyzer import AnalyzerEngine -from typing import List, Dict -# from presidio_evaluator import ModelEvaluator, InputSample, span_to_tag -# from presidio_evaluator.data_generator import read_synth_dataset -# -# class PresidioAnalyzer(ModelEvaluator): - - def __init__(self, analyzer, - entities_to_keep: List[str] = None, - verbose: bool = False, - labeling_scheme="BIO", - compare_by_io=True, - score_threshold=0.4 - ): + def __init__( + self, + analyzer=AnalyzerEngine(), + entities_to_keep: List[str] = None, + verbose: bool = False, + labeling_scheme="BIO", + compare_by_io=True, + score_threshold=0.4, + ): """ Evaluation wrapper for the Presidio Analyzer :param analyzer: object of type AnalyzerEngine (from presidio-analyzer) """ - super().__init__(entities_to_keep=entities_to_keep, - verbose=verbose, - labeling_scheme=labeling_scheme, - compare_by_io=compare_by_io) + super().__init__( + entities_to_keep=entities_to_keep, + verbose=verbose, + labeling_scheme=labeling_scheme, + compare_by_io=compare_by_io, + ) self.analyzer = analyzer self.score_threshold = score_threshold @@ -37,8 +35,12 @@ class PresidioAnalyzer(ModelEvaluator): all_fields = True else: all_fields = None - results = self.analyzer.analyze(sample.full_text, self.entities, - language='en', all_fields=all_fields) + results = self.analyzer.analyze( + text=sample.full_text, + entities=self.entities, + language="en", + all_fields=all_fields, + ) starts = [] ends = [] scores = [] @@ -52,13 +54,15 @@ class PresidioAnalyzer(ModelEvaluator): tags.append(res.entity_type) scores.append(res.score) # - response_tags = span_to_tag(scheme=self.labeling_scheme, - text=sample.full_text, - start=starts, - end=ends, - tokens=sample.tokens, - scores=scores, - tag=tags) + response_tags = span_to_tag( + scheme=self.labeling_scheme, + text=sample.full_text, + start=starts, + end=ends, + tokens=sample.tokens, + scores=scores, + tag=tags, + ) return response_tags @@ -70,41 +74,53 @@ if __name__ == "__main__": # Mapping between dataset entities and Presidio entities. 
Key: Dataset entity, Value: Presidio entity entities_mapping = { - 'PERSON': 'PERSON', - 'EMAIL': 'EMAIL_ADDRESS', - 'CREDIT_CARD': 'CREDIT_CARD', - 'FIRST_NAME': 'PERSON', - 'PHONE_NUMBER': 'PHONE_NUMBER', - 'BIRTHDAY': 'DATE_TIME', - 'DATE': 'DATE_TIME', - 'DOMAIN': 'DOMAIN', - 'CITY': 'LOCATION', - 'ADDRESS': 'LOCATION', - 'IBAN': 'IBAN_CODE', - 'URL': 'DOMAIN_NAME', - 'US_SSN': 'US_SSN', - 'IP_ADDRESS': 'IP_ADDRESS', - 'ORGANIZATION': 'ORG', - 'O': 'O' + "PERSON": "PERSON", + "EMAIL": "EMAIL_ADDRESS", + "CREDIT_CARD": "CREDIT_CARD", + "FIRST_NAME": "PERSON", + "PHONE_NUMBER": "PHONE_NUMBER", + "BIRTHDAY": "DATE_TIME", + "DATE": "DATE_TIME", + "DOMAIN": "DOMAIN", + "CITY": "LOCATION", + "ADDRESS": "LOCATION", + "IBAN": "IBAN_CODE", + "URL": "DOMAIN_NAME", + "US_SSN": "US_SSN", + "IP_ADDRESS": "IP_ADDRESS", + "ORGANIZATION": "ORG", + "O": "O", } - updated_samples = ModelEvaluator.align_input_samples_to_presidio_analyzer(input_samples, - entities_mapping) + updated_samples = ModelEvaluator.align_input_samples_to_presidio_analyzer( + input_samples, entities_mapping + ) flatten = lambda l: [item for sublist in l for item in sublist] from collections import Counter count_per_entity = Counter( - [span.entity_type for span in flatten([input_sample.spans for input_sample in updated_samples])]) + [ + span.entity_type + for span in flatten( + [input_sample.spans for input_sample in updated_samples] + ) + ] + ) print("Evaluating samples") analyzer = PresidioAnalyzer(entities_to_keep=count_per_entity.keys()) evaluated_samples = analyzer.evaluate_all(updated_samples) # print("Estimating metrics") - precision, recall, \ - entity_recall, entity_precision, \ - f, errors = analyzer.calculate_score(evaluation_results=evaluated_samples, beta=2.5) + ( + precision, + recall, + entity_recall, + entity_precision, + f, + errors, + ) = analyzer.calculate_score(evaluation_results=evaluated_samples, beta=2.5) # print("precision: {}".format(precision)) print("Recall: {}".format(recall)) @@ -112,22 +128,24 @@ if __name__ == "__main__": print("Precision per entity: {}".format(entity_precision)) print("Recall per entity: {}".format(entity_recall)) # - FN_mistakes = [mistake for mistake in flatten(errors) if mistake[0:2] == 'FN'] - FP_mistakes = [mistake for mistake in flatten(errors) if mistake[0:2] == 'FP'] - other_mistakes = [mistake for mistake in flatten(errors) if "Wrong entity" in mistake] + FN_mistakes = [mistake for mistake in flatten(errors) if mistake[0:2] == "FN"] + FP_mistakes = [mistake for mistake in flatten(errors) if mistake[0:2] == "FP"] + other_mistakes = [ + mistake for mistake in flatten(errors) if "Wrong entity" in mistake + ] - fn = open('../data/fn_30000.txt', 'w+', encoding='utf-8') - fn1 = '\n'.join(FN_mistakes) + fn = open("../data/fn_30000.txt", "w+", encoding="utf-8") + fn1 = "\n".join(FN_mistakes) fn.write(fn1) fn.close() - fp = open('../data/fp_30000.txt', 'w+', encoding='utf-8') - fp1 = '\n'.join(FP_mistakes) + fp = open("../data/fp_30000.txt", "w+", encoding="utf-8") + fp1 = "\n".join(FP_mistakes) fp.write(fp1) fp.close() - mistakes_file = open('../data/mistakes_30000.txt', 'w+', encoding='utf-8') - mistakes1 = '\n'.join(other_mistakes) + mistakes_file = open("../data/mistakes_30000.txt", "w+", encoding="utf-8") + mistakes1 = "\n".join(other_mistakes) mistakes_file.write(mistakes1) mistakes_file.close() diff --git a/presidio_evaluator/presidio_recognizer_evaluator.py b/presidio_evaluator/presidio_recognizer_evaluator.py index d27e21d..3d781bb 100644 --- 
a/presidio_evaluator/presidio_recognizer_evaluator.py +++ b/presidio_evaluator/presidio_recognizer_evaluator.py @@ -1,40 +1,50 @@ -''' +""" Presidio Analyzer not yet on PyPI, therefore it cannot be referenced explicitly -''' +""" import math from typing import List, Tuple, Dict -from presidio_evaluator import ModelEvaluator, InputSample +from presidio_analyzer.nlp_engine import SpacyNlpEngine + +from presidio_evaluator import ModelEvaluator, InputSample, EvaluationResult from presidio_evaluator.span_to_tag import span_to_tag class PresidioRecognizerEvaluator(ModelEvaluator): - def __init__(self, recognizer, nlp_engine, entities_to_keep=None, - with_nlp_artifacts=False, verbose=False, compare_by_io=True, - ): + def __init__( + self, + recognizer, + nlp_engine, + entities_to_keep=None, + with_nlp_artifacts=False, + verbose=False, + compare_by_io=True, + ): """ Evaluator for one recognizer :param recognizer: An object of type EntityRecognizer (in presidion-analyzer) :param nlp_engine: An object of type NlpEngine, e.g. SpacyNlpEngine (in presidio-analyzer) """ - super().__init__(entities_to_keep=entities_to_keep, - verbose=verbose, compare_by_io=compare_by_io) + super().__init__( + entities_to_keep=entities_to_keep, + verbose=verbose, + compare_by_io=compare_by_io, + ) self.withNlpArtifacts = with_nlp_artifacts self.recognizer = recognizer self.nlp_engine = nlp_engine # def __make_nlp_artifacts(self, text: str): - return self.nlp_engine.process_text(text, 'en') + return self.nlp_engine.process_text(text, "en") # def predict(self, sample: InputSample) -> List[str]: nlpArtifacts = None if self.withNlpArtifacts: nlpArtifacts = self.__make_nlp_artifacts(sample.full_text) - results = self.recognizer.analyze(sample.full_text, self.entities, - nlpArtifacts) + results = self.recognizer.analyze(sample.full_text, self.entities, nlpArtifacts) starts = [] ends = [] tags = [] @@ -46,37 +56,33 @@ class PresidioRecognizerEvaluator(ModelEvaluator): ends.append(res.end) tags.append(res.entity_type) scores.append(res.score) - response_tags = span_to_tag(scheme=self.labeling_scheme, - text=sample.full_text, - start=starts, - end=ends, - tag=tags, - tokens=sample.tokens, - scores=scores, - io_tags_only=self.compare_by_io) + response_tags = span_to_tag( + scheme=self.labeling_scheme, + text=sample.full_text, + start=starts, + end=ends, + tag=tags, + tokens=sample.tokens, + scores=scores, + io_tags_only=self.compare_by_io, + ) if len(sample.tags) == 0: - sample.tags = ['0' for word in response_tags] + sample.tags = ["0" for word in response_tags] return response_tags -def score_presidio_recognizer(recognizer, entities_to_keep, input_samples, - withNlpArtifacts=False) \ - -> Tuple[Dict[str, float], Dict[str, float], Dict[str, float], Dict[ - str, float], Dict[str, float], List[str]]: - model = PresidioRecognizerEvaluator(recognizer=recognizer, - entities_to_keep=entities_to_keep, - with_nlp_artifacts=withNlpArtifacts) +def score_presidio_recognizer( + recognizer, entities_to_keep, input_samples, withNlpArtifacts=False +) -> EvaluationResult: + model = PresidioRecognizerEvaluator( + recognizer=recognizer, + entities_to_keep=entities_to_keep, + nlp_engine=SpacyNlpEngine(), + with_nlp_artifacts=withNlpArtifacts, + ) evaluated_samples = model.evaluate_all(input_samples[:]) - precision, recall, ent_recall, \ - ent_precision, fscore, mistakes = model.calculate_score( - evaluated_samples, beta=2.5) - print("p={precision}, r={recall},f={f}," - "entity recall={ent},entity precision={prec}".format( - precision=precision, - 
recall=recall, - f=fscore, - ent=ent_recall, - prec=ent_precision)) - if math.isnan(precision): - precision = 0 - return precision, recall, ent_recall, ent_precision, fscore, mistakes + evaluation_result = model.calculate_score(evaluated_samples, beta=2.5) + evaluation_result.print() + if math.isnan(evaluation_result.pii_precision): + evaluation_result.pii_precision = 0 + return evaluation_result diff --git a/requirements.txt b/requirements.txt index 337213e..9b8c15c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,3 +15,4 @@ regex #flair sklearn_crfsuite pytest +presidio_analyzer \ No newline at end of file diff --git a/tests/test_flair_evaluator.py b/tests/test_flair_evaluator.py index 5cd0403..e883ef6 100644 --- a/tests/test_flair_evaluator.py +++ b/tests/test_flair_evaluator.py @@ -1,7 +1,9 @@ +import pytest + try: from flair.models import SequenceTagger except ImportError: print("Flair is not installed by default") from presidio_evaluator.data_generator import read_synth_dataset from presidio_evaluator.flair_evaluator import FlairEvaluator @@ -9,18 +11,26 @@ from presidio_evaluator.flair_evaluator import FlairEvaluator import numpy as np # no-unit because flair is not a dependency by default -def no_unit_test_flair_simple(): +@pytest.mark.skip(reason="Flair not installed by default") +def test_flair_simple(): import os + dir_path = os.path.dirname(os.path.realpath(__file__)) - input_samples = read_synth_dataset(os.path.join(dir_path, "data/generated_small.txt")) + input_samples = read_synth_dataset( + os.path.join(dir_path, "data/generated_small.txt") + ) - model = SequenceTagger.load('ner-ontonotes-fast') # .load('ner') + model = SequenceTagger.load("ner-ontonotes-fast") # .load('ner') - flair_evaluator = FlairEvaluator(model=model, entities_to_keep=['PERSON']) + flair_evaluator = FlairEvaluator(model=model, entities_to_keep=["PERSON"]) evaluation_results = flair_evaluator.evaluate_all(input_samples) scores = flair_evaluator.calculate_score(evaluation_results) - np.testing.assert_almost_equal(scores.pii_precision, scores.entity_precision_dict['PERSON']) - np.testing.assert_almost_equal(scores.pii_recall, scores.entity_recall_dict['PERSON']) + np.testing.assert_almost_equal( + scores.pii_precision, scores.entity_precision_dict["PERSON"] + ) + np.testing.assert_almost_equal( + scores.pii_recall, scores.entity_recall_dict["PERSON"] + ) assert scores.pii_recall > 0 assert scores.pii_precision > 0 diff --git a/tests/test_presidio_analyzer.py b/tests/test_presidio_analyzer.py index 70c961b..9635e2b 100644 --- a/tests/test_presidio_analyzer.py +++ b/tests/test_presidio_analyzer.py @@ -1,80 +1,101 @@ -''' -Presidio Analyzer not yet on PyPI, ignoring temporarily -''' -# -# import pytest -# -# from presidio_evaluator import InputSample, Span -# from presidio_evaluator.data_generator import read_synth_dataset -# from presidio_evaluator.presidio_analyzer import PresidioAnalyzer -# -# -# class GeneratedTextTestCase: -# def __init__(self, test_name, test_input, acceptance_threshold, marks): -# self.test_name = test_name -# self.test_input = test_input -# self.acceptance_threshold = acceptance_threshold -# self.marks = marks -# -# def to_pytest_param(self): -# return pytest.param(self.test_input, self.acceptance_threshold, -# id=self.test_name, marks=self.marks) -# -# -# # generated-text test cases -# analyzer_test_generate_text_testdata = [ -# # small set fixture which expects all results.
-# GeneratedTextTestCase( -# test_name="small-set", -# test_input="{}/data/generated_small.txt", -# acceptance_threshold=0.3, -# marks=pytest.mark.none -# ) -# ] -# -# -# @pytest.mark.skip(reason="Presidio analyzer not on PyPi") -# def test_analyzer_simple_input(): -# model = PresidioAnalyzer(entities_to_keep=['PERSON']) -# -# sample = InputSample(full_text="My name is Mike", -# masked="My name is [PERSON]", -# spans=[Span('PERSON', 'Mike', 10, 14)], -# create_tags_from_span=True) -# -# evaluated = model.evaluate_sample(sample) -# metrics = model.calculate_score( -# [evaluated]) -# -# assert metrics.pii_precision == 1 -# assert metrics.pii_recall == 1 -# -# -# # analyzer tests on generated data -# @pytest.mark.skip(reason="Presidio analyzer not on PyPi") -# @pytest.mark.parametrize("test_input,acceptance_threshold", -# [testcase.to_pytest_param() for testcase in -# analyzer_test_generate_text_testdata]) -# def test_analyzer_with_generated_text(test_input, acceptance_threshold): -# """ -# Test analyzer with a generated dataset text file -# :param test_input: input text file location -# :param acceptance_threshold: minimim precision/recall -# allowed for tests to pass -# """ -# # read test input from generated file -# -# import os -# dir_path = os.path.dirname(os.path.realpath(__file__)) -# input_samples = read_synth_dataset( -# test_input.format(dir_path)) -# -# updated_samples = PresidioAnalyzer. \ -# align_input_samples_to_presidio_analyzer(input_samples) -# -# analyzer = PresidioAnalyzer() -# evaluated_samples = analyzer.evaluate_all(updated_samples) -# scores = analyzer.calculate_score(evaluation_results=evaluated_samples) -# -# assert acceptance_threshold <= scores.pii_precision -# assert acceptance_threshold <= scores.pii_recall +import pytest + +from presidio_evaluator import InputSample, Span +from presidio_evaluator.data_generator import read_synth_dataset +from presidio_evaluator.presidio_analyzer import PresidioAnalyzer + +# Mapping between dataset entities and Presidio entities. Key: Dataset entity, Value: Presidio entity +entities_mapping = { + "PERSON": "PERSON", + "EMAIL": "EMAIL_ADDRESS", + "CREDIT_CARD": "CREDIT_CARD", + "FIRST_NAME": "PERSON", + "PHONE_NUMBER": "PHONE_NUMBER", + "BIRTHDAY": "DATE_TIME", + "DATE": "DATE_TIME", + "DOMAIN": "DOMAIN", + "CITY": "LOCATION", + "ADDRESS": "LOCATION", + "IBAN": "IBAN_CODE", + "URL": "DOMAIN_NAME", + "US_SSN": "US_SSN", + "IP_ADDRESS": "IP_ADDRESS", + "ORGANIZATION": "ORG", + "O": "O", +} + + +class GeneratedTextTestCase: + def __init__(self, test_name, test_input, acceptance_threshold, marks): + self.test_name = test_name + self.test_input = test_input + self.acceptance_threshold = acceptance_threshold + self.marks = marks + + def to_pytest_param(self): + return pytest.param( + self.test_input, + self.acceptance_threshold, + id=self.test_name, + marks=self.marks, + ) + + +# generated-text test cases +analyzer_test_generate_text_testdata = [ + # small set fixture which expects all results. 
+ GeneratedTextTestCase( + test_name="small-set", + test_input="{}/data/generated_small.txt", + acceptance_threshold=0.3, + marks=pytest.mark.none, + ) +] + + +def test_analyzer_simple_input(): + model = PresidioAnalyzer(entities_to_keep=["PERSON"]) + + sample = InputSample( + full_text="My name is Mike", + masked="My name is [PERSON]", + spans=[Span("PERSON", "Mike", 10, 14)], + create_tags_from_span=True, + ) + + evaluated = model.evaluate_sample(sample) + metrics = model.calculate_score([evaluated]) + + assert metrics.pii_precision == 1 + assert metrics.pii_recall == 1 + + +# analyzer tests on generated data +@pytest.mark.parametrize( + "test_input,acceptance_threshold", + [testcase.to_pytest_param() for testcase in analyzer_test_generate_text_testdata], +) +def test_analyzer_with_generated_text(test_input, acceptance_threshold): + """ + Test analyzer with a generated dataset text file + :param test_input: input text file location + :param acceptance_threshold: minimim precision/recall + allowed for tests to pass + """ + # read test input from generated file + + import os + + dir_path = os.path.dirname(os.path.realpath(__file__)) + input_samples = read_synth_dataset(test_input.format(dir_path)) + + updated_samples = PresidioAnalyzer.align_input_samples_to_presidio_analyzer( + input_samples=input_samples, entities_mapping=entities_mapping + ) + + analyzer = PresidioAnalyzer() + evaluated_samples = analyzer.evaluate_all(updated_samples) + scores = analyzer.calculate_score(evaluation_results=evaluated_samples) + + assert acceptance_threshold <= scores.pii_precision + assert acceptance_threshold <= scores.pii_recall diff --git a/tests/test_recognizers_generated_text.py b/tests/test_recognizers_generated_text.py index 9144e65..31dbe30 100644 --- a/tests/test_recognizers_generated_text.py +++ b/tests/test_recognizers_generated_text.py @@ -1,62 +1,58 @@ -''' -Presidio Analyzer not yet on PyPI, ignoring temporarily -''' +from presidio_evaluator.data_generator import read_synth_dataset +from presidio_evaluator.presidio_recognizer_evaluator import score_presidio_recognizer +import pytest -# from presidio_evaluator.data_generator import read_synth_dataset -# from presidio_evaluator.presidio_recognizer_evaluator import score_presidio_recognizer -# import pytest -# -# from analyzer.predefined_recognizers.credit_card_recognizer import CreditCardRecognizer -# -# # test case parameters for tests with dataset which was previously generated. -# class GeneratedTextTestCase: -# def __init__(self, test_name, test_input, acceptance_threshold, marks): -# self.test_name = test_name -# self.test_input = test_input -# self.acceptance_threshold = acceptance_threshold -# self.marks = marks -# -# def to_pytest_param(self): -# return pytest.param(self.test_input, self.acceptance_threshold, -# id=self.test_name, marks=self.marks) -# -# -# # generated-text test cases -# cc_test_generate_text_testdata = [ -# # small set fixture which expects all type results. -# GeneratedTextTestCase( -# test_name="small-set", -# test_input="{}/data/generated_small.txt", -# acceptance_threshold=1, -# marks=pytest.mark.none -# ), -# # large set fixture which expects all type results. 
marked as "slow" -# GeneratedTextTestCase( -# test_name="large_set", -# test_input="{}/data/generated_large.txt", -# acceptance_threshold=1, -# marks=pytest.mark.slow -# ) -# ] -# -# -# # credit card recognizer tests on generated data -# @pytest.mark.parametrize("test_input,acceptance_threshold", -# [testcase.to_pytest_param() -# for testcase in cc_test_generate_text_testdata]) -# def test_credit_card_recognizer_with_generated_text(test_input, acceptance_threshold): -# """ -# Test credit card recognizer with a generated dataset text file -# :param test_input: input text file location -# :param acceptance_threshold: minimim precision/recall -# allowed for tests to pass -# """ -# -# # read test input from generated file -# import os -# dir_path = os.path.dirname(os.path.realpath(__file__)) -# input_samples = read_synth_dataset( -# test_input.format(dir_path)) -# scores = score_presidio_recognizer( -# CreditCardRecognizer(), 'CREDIT_CARD', input_samples) -# assert acceptance_threshold <= scores.pii_f +from presidio_analyzer.predefined_recognizers.credit_card_recognizer import CreditCardRecognizer + +# test case parameters for tests with dataset which was previously generated. +class GeneratedTextTestCase: + def __init__(self, test_name, test_input, acceptance_threshold, marks): + self.test_name = test_name + self.test_input = test_input + self.acceptance_threshold = acceptance_threshold + self.marks = marks + + def to_pytest_param(self): + return pytest.param(self.test_input, self.acceptance_threshold, + id=self.test_name, marks=self.marks) + + +# generated-text test cases +cc_test_generate_text_testdata = [ + # small set fixture which expects all type results. + GeneratedTextTestCase( + test_name="small-set", + test_input="{}/data/generated_small.txt", + acceptance_threshold=1, + marks=pytest.mark.none + ), + # large set fixture which expects all type results. 
marked as "slow" + GeneratedTextTestCase( + test_name="large_set", + test_input="{}/data/generated_large.txt", + acceptance_threshold=1, + marks=pytest.mark.slow + ) +] + + +# credit card recognizer tests on generated data +@pytest.mark.parametrize("test_input,acceptance_threshold", + [testcase.to_pytest_param() + for testcase in cc_test_generate_text_testdata]) +def test_credit_card_recognizer_with_generated_text(test_input, acceptance_threshold): + """ + Test credit card recognizer with a generated dataset text file + :param test_input: input text file location + :param acceptance_threshold: minimim precision/recall + allowed for tests to pass + """ + + # read test input from generated file + import os + dir_path = os.path.dirname(os.path.realpath(__file__)) + input_samples = read_synth_dataset( + test_input.format(dir_path)) + scores = score_presidio_recognizer( + CreditCardRecognizer(), 'CREDIT_CARD', input_samples) + assert acceptance_threshold <= scores.pii_f diff --git a/tests/test_recognizers_template_csv.py b/tests/test_recognizers_template_csv.py index 3bb17b6..de655a2 100644 --- a/tests/test_recognizers_template_csv.py +++ b/tests/test_recognizers_template_csv.py @@ -1,83 +1,79 @@ -''' -Presidio Analyzer not yet on PyPI, ignoring temporarily -''' +from presidio_evaluator.data_generator import generate +from presidio_evaluator.presidio_recognizer_evaluator import \ + score_presidio_recognizer +import pytest +import numpy as np -# from presidio_evaluator.data_generator import generate -# from presidio_evaluator.presidio_recognizer_evaluator import \ -# score_presidio_recognizer -# import pytest -# import numpy as np -# -# from analyzer.predefined_recognizers.credit_card_recognizer import CreditCardRecognizer -# -# # test case parameters for tests with dataset generated from a template and csv values -# class TemplateTextTestCase: -# def __init__(self, test_name, pii_csv, utterances, dictionary_path, -# num_of_examples, acceptance_threshold, marks): -# self.test_name = test_name -# self.pii_csv = pii_csv -# self.utterances = utterances -# self.dictionary_path = dictionary_path -# self.num_of_examples = num_of_examples -# self.acceptance_threshold = acceptance_threshold -# self.marks = marks -# -# def to_pytest_param(self): -# return pytest.param(self.pii_csv, self.utterances, self.dictionary_path, -# self.num_of_examples, self.acceptance_threshold, -# id=self.test_name, marks=self.marks) -# -# -# # template-dataset test cases -# cc_test_template_testdata = [ -# # large dataset fixture. 
marked as slow -# TemplateTextTestCase( -# test_name="fake-names-100", -# pii_csv="{}/data/FakeNameGenerator.com_100.csv", -# utterances="{}/data/templates.txt", -# dictionary_path="{}/data/Dictionary_test.csv", -# num_of_examples=100, -# acceptance_threshold=0.9, -# marks=pytest.mark.slow -# ) -# ] -# -# -# # credit card recognizer tests on template-generates data -# @pytest.mark.parametrize("pii_csv, " -# "utterances, " -# "dictionary_path, " -# "num_of_examples, " -# "acceptance_threshold", -# [testcase.to_pytest_param() -# for testcase in cc_test_template_testdata]) -# def test_credit_card_recognizer_with_template(pii_csv, utterances, -# dictionary_path, -# num_of_examples, -# acceptance_threshold): -# """ -# Test credit card recognizer with a dataset generated from -# template and a CSV values file -# :param pii_csv: input csv file location -# :param utterances: template file location -# :param dictionary_path: dictionary/vocabulary file location -# :param num_of_examples: number of samples to be used from dataset -# to test -# :param acceptance_threshold: minimim precision/recall -# allowed for tests to pass -# """ -# -# # read template and CSV files -# import os -# dir_path = os.path.dirname(os.path.realpath(__file__)) -# -# input_samples = generate(fake_pii_csv=pii_csv.format(dir_path), -# utterances_file=utterances.format(dir_path), -# dictionary_path=dictionary_path.format(dir_path), -# lower_case_ratio=0.5, -# num_of_examples=num_of_examples) -# -# scores = score_presidio_recognizer( -# CreditCardRecognizer(), 'CREDIT_CARD', input_samples) -# if not np.isnan(scores.pii_f): -# assert acceptance_threshold <= scores.pii_f +from presidio_analyzer.predefined_recognizers.credit_card_recognizer import CreditCardRecognizer + +# test case parameters for tests with dataset generated from a template and csv values +class TemplateTextTestCase: + def __init__(self, test_name, pii_csv, utterances, dictionary_path, + num_of_examples, acceptance_threshold, marks): + self.test_name = test_name + self.pii_csv = pii_csv + self.utterances = utterances + self.dictionary_path = dictionary_path + self.num_of_examples = num_of_examples + self.acceptance_threshold = acceptance_threshold + self.marks = marks + + def to_pytest_param(self): + return pytest.param(self.pii_csv, self.utterances, self.dictionary_path, + self.num_of_examples, self.acceptance_threshold, + id=self.test_name, marks=self.marks) + + +# template-dataset test cases +cc_test_template_testdata = [ + # large dataset fixture. 
marked as slow + TemplateTextTestCase( + test_name="fake-names-100", + pii_csv="{}/data/FakeNameGenerator.com_100.csv", + utterances="{}/data/templates.txt", + dictionary_path="{}/data/Dictionary_test.csv", + num_of_examples=100, + acceptance_threshold=0.9, + marks=pytest.mark.slow + ) +] + + +# credit card recognizer tests on template-generates data +@pytest.mark.parametrize("pii_csv, " + "utterances, " + "dictionary_path, " + "num_of_examples, " + "acceptance_threshold", + [testcase.to_pytest_param() + for testcase in cc_test_template_testdata]) +def test_credit_card_recognizer_with_template(pii_csv, utterances, + dictionary_path, + num_of_examples, + acceptance_threshold): + """ + Test credit card recognizer with a dataset generated from + template and a CSV values file + :param pii_csv: input csv file location + :param utterances: template file location + :param dictionary_path: dictionary/vocabulary file location + :param num_of_examples: number of samples to be used from dataset + to test + :param acceptance_threshold: minimim precision/recall + allowed for tests to pass + """ + + # read template and CSV files + import os + dir_path = os.path.dirname(os.path.realpath(__file__)) + + input_samples = generate(fake_pii_csv=pii_csv.format(dir_path), + utterances_file=utterances.format(dir_path), + dictionary_path=dictionary_path.format(dir_path), + lower_case_ratio=0.5, + num_of_examples=num_of_examples) + + scores = score_presidio_recognizer( + CreditCardRecognizer(), 'CREDIT_CARD', input_samples) + if not np.isnan(scores.pii_f): + assert acceptance_threshold <= scores.pii_f diff --git a/tests/test_recognizers_template_join_csv.py b/tests/test_recognizers_template_join_csv.py index 30eb7a7..bc996c8 100644 --- a/tests/test_recognizers_template_join_csv.py +++ b/tests/test_recognizers_template_join_csv.py @@ -1,148 +1,144 @@ -''' -Presidio Analyzer not yet on PyPI, ignoring temporarily -''' +from presidio_evaluator.data_generator import FakeDataGenerator +from presidio_evaluator.presidio_recognizer_evaluator import \ + score_presidio_recognizer +import pandas as pd +import pytest +import numpy as np -# from presidio_evaluator.data_generator import FakeDataGenerator -# from presidio_evaluator.presidio_recognizer_evaluator import \ -# score_presidio_recognizer -# import pandas as pd -# import pytest -# import numpy as np -# -# from analyzer import Pattern, PatternRecognizer -# -# # test case parameters for tests with dataset generated from a template and -# # two csv value files, one containing the common-entities and another one with custom entities -# class PatternRecognizerTestCase: -# def __init__(self, test_name, entity_name, pattern, score, pii_csv, ext_csv, -# utterances, dictionary_path, num_of_examples, acceptance_threshold, -# max_mistakes_number, marks): -# self.test_name = test_name -# self.entity_name = entity_name -# self.pattern = pattern -# self.score = score -# self.pii_csv = pii_csv -# self.ext_csv = ext_csv -# self.utterances = utterances -# self.dictionary_path = dictionary_path -# self.num_of_examples = num_of_examples -# self.acceptance_threshold = acceptance_threshold -# self.max_mistakes_number = max_mistakes_number -# self.marks = marks -# -# def to_pytest_param(self): -# return pytest.param(self.pii_csv, self.ext_csv, self.utterances, -# self.dictionary_path, -# self.entity_name, self.pattern, self.score, -# self.num_of_examples, self.acceptance_threshold, -# self.max_mistakes_number, id=self.test_name, -# marks=self.marks) -# -# -# # template-dataset test cases -# 
rocket_test_template_testdata = [ -# # large dataset fixture. marked as slow. -# # all input is correct, test is conclusive -# PatternRecognizerTestCase( -# test_name="rocket-no-errors", -# entity_name="ROCKET", -# pattern=r'\W*(rocket)\W*', -# score=0.8, -# pii_csv="{}/data/FakeNameGenerator.com_100.csv", -# ext_csv="{}/data/FakeRocketGenerator.csv", -# utterances="{}/data/rocket_example_sentences.txt", -# dictionary_path="{}/data/Dictionary_test.csv", -# num_of_examples=100, -# acceptance_threshold=1, -# max_mistakes_number=0, -# marks=pytest.mark.slow -# ), -# # large dataset fixture. marked as slow -# # all input is correct, test is conclusive -# PatternRecognizerTestCase( -# test_name="rocket-all-errors", -# entity_name="ROCKET", -# pattern=r'\W*(rocket)\W*', -# score=0.8, -# pii_csv="{}/data/FakeNameGenerator.com_100.csv", -# ext_csv="{}/data/FakeRocketErrorsGenerator.csv", -# utterances="{}/data/rocket_example_sentences.txt", -# dictionary_path="{}/data/Dictionary_test.csv", -# num_of_examples=100, -# acceptance_threshold=0, -# max_mistakes_number=100, -# marks=pytest.mark.slow -# ), -# # large dataset fixture. marked as slow -# # some input is correct some is not, test is inconclusive -# PatternRecognizerTestCase( -# test_name="rocket-some-errors", -# entity_name="ROCKET", -# pattern=r'\W*(rocket)\W*', -# score=0.8, -# pii_csv="{}/data/FakeNameGenerator.com_100.csv", -# ext_csv="{}/data/FakeRocket50PercentErrorsGenerator.csv", -# utterances="{}/data/rocket_example_sentences.txt", -# dictionary_path="{}/data/Dictionary_test.csv", -# num_of_examples=100, -# acceptance_threshold=0.3, -# max_mistakes_number=70, -# marks=[pytest.mark.slow, pytest.mark.inconclusive] -# ) -# ] -# -# -# @pytest.mark.parametrize( -# "pii_csv, ext_csv, utterances, dictionary_path, " -# "entity_name, pattern, score, num_of_examples, " -# "acceptance_threshold, max_mistakes_number", -# [testcase.to_pytest_param() -# for testcase in rocket_test_template_testdata]) -# def test_pattern_recognizer(pii_csv, ext_csv, utterances, dictionary_path, -# entity_name, pattern, -# score, num_of_examples, acceptance_threshold, -# max_mistakes_number): -# """ -# Test generic pattern recognizer with a dataset generated from template, a CSV values file with common entities -# and another CSV values file with a custom entity -# :param pii_csv: input csv file location with the common entities -# :param ext_csv: input csv file location with custom entities -# :param utterances: template file location -# :param dictionary_path: vocabulary/dictionary file location -# :param entity_name: custom entity name -# :param pattern: recognizer pattern -# :param num_of_examples: number of samples to be used from dataset to test -# :param acceptance_threshold: minimim precision/recall -# allowed for tests to pass -# """ -# -# import os -# dir_path = os.path.dirname(os.path.realpath(__file__)) -# dfpii = pd.read_csv(pii_csv.format(dir_path), encoding='utf-8') -# dfext = pd.read_csv(ext_csv.format(dir_path), encoding='utf-8') -# dictionary_path = dictionary_path.format(dir_path) -# ext_column_name = dfext.columns[0] -# -# def get_from_ext(i): -# index = i % dfext.shape[0] -# return dfext.iat[index, 0] -# -# # extend pii with ext data -# dfpii[ext_column_name] = [get_from_ext(i) for i in range(0, dfpii.shape[0])] -# -# # generate examples -# generator = FakeDataGenerator(fake_pii_csv_file=dfpii, -# utterances_file=utterances.format(dir_path), -# dictionary_path=dictionary_path) -# examples = generator.sample_examples(num_of_examples) -# -# 
pattern = Pattern("test pattern", pattern, score) -# pattern_recognizer = PatternRecognizer(entity_name, -# name="test recognizer", -# patterns=[pattern]) -# -# scores = score_presidio_recognizer( -# pattern_recognizer, [entity_name], examples) -# if not np.isnan(scores.pii_f): -# assert acceptance_threshold <= scores.pii_f -# assert max_mistakes_number >= len(scores.model_errors) +from presidio_analyzer import Pattern, PatternRecognizer + +# test case parameters for tests with dataset generated from a template and +# two csv value files, one containing the common-entities and another one with custom entities +class PatternRecognizerTestCase: + def __init__(self, test_name, entity_name, pattern, score, pii_csv, ext_csv, + utterances, dictionary_path, num_of_examples, acceptance_threshold, + max_mistakes_number, marks): + self.test_name = test_name + self.entity_name = entity_name + self.pattern = pattern + self.score = score + self.pii_csv = pii_csv + self.ext_csv = ext_csv + self.utterances = utterances + self.dictionary_path = dictionary_path + self.num_of_examples = num_of_examples + self.acceptance_threshold = acceptance_threshold + self.max_mistakes_number = max_mistakes_number + self.marks = marks + + def to_pytest_param(self): + return pytest.param(self.pii_csv, self.ext_csv, self.utterances, + self.dictionary_path, + self.entity_name, self.pattern, self.score, + self.num_of_examples, self.acceptance_threshold, + self.max_mistakes_number, id=self.test_name, + marks=self.marks) + + +# template-dataset test cases +rocket_test_template_testdata = [ + # large dataset fixture. marked as slow. + # all input is correct, test is conclusive + PatternRecognizerTestCase( + test_name="rocket-no-errors", + entity_name="ROCKET", + pattern=r'\W*(rocket)\W*', + score=0.8, + pii_csv="{}/data/FakeNameGenerator.com_100.csv", + ext_csv="{}/data/FakeRocketGenerator.csv", + utterances="{}/data/rocket_example_sentences.txt", + dictionary_path="{}/data/Dictionary_test.csv", + num_of_examples=100, + acceptance_threshold=1, + max_mistakes_number=0, + marks=pytest.mark.slow + ), + # large dataset fixture. marked as slow + # all input is correct, test is conclusive + PatternRecognizerTestCase( + test_name="rocket-all-errors", + entity_name="ROCKET", + pattern=r'\W*(rocket)\W*', + score=0.8, + pii_csv="{}/data/FakeNameGenerator.com_100.csv", + ext_csv="{}/data/FakeRocketErrorsGenerator.csv", + utterances="{}/data/rocket_example_sentences.txt", + dictionary_path="{}/data/Dictionary_test.csv", + num_of_examples=100, + acceptance_threshold=0, + max_mistakes_number=100, + marks=pytest.mark.slow + ), + # large dataset fixture. 
marked as slow + # some input is correct some is not, test is inconclusive + PatternRecognizerTestCase( + test_name="rocket-some-errors", + entity_name="ROCKET", + pattern=r'\W*(rocket)\W*', + score=0.8, + pii_csv="{}/data/FakeNameGenerator.com_100.csv", + ext_csv="{}/data/FakeRocket50PercentErrorsGenerator.csv", + utterances="{}/data/rocket_example_sentences.txt", + dictionary_path="{}/data/Dictionary_test.csv", + num_of_examples=100, + acceptance_threshold=0.3, + max_mistakes_number=70, + marks=[pytest.mark.slow, pytest.mark.inconclusive] + ) +] + + +@pytest.mark.parametrize( + "pii_csv, ext_csv, utterances, dictionary_path, " + "entity_name, pattern, score, num_of_examples, " + "acceptance_threshold, max_mistakes_number", + [testcase.to_pytest_param() + for testcase in rocket_test_template_testdata]) +def test_pattern_recognizer(pii_csv, ext_csv, utterances, dictionary_path, + entity_name, pattern, + score, num_of_examples, acceptance_threshold, + max_mistakes_number): + """ + Test generic pattern recognizer with a dataset generated from template, a CSV values file with common entities + and another CSV values file with a custom entity + :param pii_csv: input csv file location with the common entities + :param ext_csv: input csv file location with custom entities + :param utterances: template file location + :param dictionary_path: vocabulary/dictionary file location + :param entity_name: custom entity name + :param pattern: recognizer pattern + :param num_of_examples: number of samples to be used from dataset to test + :param acceptance_threshold: minimim precision/recall + allowed for tests to pass + """ + + import os + dir_path = os.path.dirname(os.path.realpath(__file__)) + dfpii = pd.read_csv(pii_csv.format(dir_path), encoding='utf-8') + dfext = pd.read_csv(ext_csv.format(dir_path), encoding='utf-8') + dictionary_path = dictionary_path.format(dir_path) + ext_column_name = dfext.columns[0] + + def get_from_ext(i): + index = i % dfext.shape[0] + return dfext.iat[index, 0] + + # extend pii with ext data + dfpii[ext_column_name] = [get_from_ext(i) for i in range(0, dfpii.shape[0])] + + # generate examples + generator = FakeDataGenerator(fake_pii_csv_file=dfpii, + utterances_file=utterances.format(dir_path), + dictionary_path=dictionary_path) + examples = generator.sample_examples(num_of_examples) + + pattern = Pattern("test pattern", pattern, score) + pattern_recognizer = PatternRecognizer(entity_name, + name="test recognizer", + patterns=[pattern]) + + scores = score_presidio_recognizer( + pattern_recognizer, [entity_name], examples) + if not np.isnan(scores.pii_f): + assert acceptance_threshold <= scores.pii_f + assert max_mistakes_number >= len(scores.model_errors) diff --git a/tests/test_spacy_recognizer_generated_text.py b/tests/test_spacy_recognizer_generated_text.py index 9e2c88c..3063afb 100644 --- a/tests/test_spacy_recognizer_generated_text.py +++ b/tests/test_spacy_recognizer_generated_text.py @@ -1,63 +1,59 @@ -''' -Presidio Analyzer not yet on PyPI, ignoring temporarily -''' +from presidio_evaluator.data_generator import read_synth_dataset +from presidio_evaluator.presidio_recognizer_evaluator import \ + score_presidio_recognizer -# from presidio_evaluator.data_generator import read_synth_dataset -# from presidio_evaluator.presidio_recognizer_evaluator import \ -# score_presidio_recognizer -# -# import pytest -# from analyzer.predefined_recognizers.spacy_recognizer import SpacyRecognizer -# -# # test case parameters for tests with dataset which was previously generated. 
-# class GeneratedTextTestCase: -# def __init__(self, test_name, test_input, acceptance_threshold, marks): -# self.test_name = test_name -# self.test_input = test_input -# self.acceptance_threshold = acceptance_threshold -# self.marks = marks -# -# def to_pytest_param(self): -# return pytest.param(self.test_input, self.acceptance_threshold, -# id=self.test_name, marks=self.marks) -# -# -# # generated-text test cases -# cc_test_generate_text_testdata = [ -# # small dataset, inconclusive results -# GeneratedTextTestCase( -# test_name="small-set", -# test_input="{}/data/generated_small.txt", -# acceptance_threshold=0.5, -# marks=pytest.mark.inconclusive -# ), -# # large dataset - test is slow and inconclusive -# GeneratedTextTestCase( -# test_name="large-set", -# test_input="{}/data/generated_large.txt", -# acceptance_threshold=0.5, -# marks=pytest.mark.slow -# ) -# ] -# -# -# # credit card recognizer tests on generated data -# @pytest.mark.parametrize("test_input,acceptance_threshold", -# [testcase.to_pytest_param() for testcase in -# cc_test_generate_text_testdata]) -# def test_spacy_recognizer_with_generated_text(test_input, acceptance_threshold): -# """ -# Test spacy recognizer with a generated dataset text file -# :param test_input: input text file location -# :param acceptance_threshold: minimim precision/recall -# allowed for tests to pass -# """ -# -# # read test input from generated file -# import os -# dir_path = os.path.dirname(os.path.realpath(__file__)) -# input_samples = read_synth_dataset( -# test_input.format(dir_path)) -# scores = score_presidio_recognizer( -# SpacyRecognizer(), ['PERSON'], input_samples, True) -# assert acceptance_threshold <= scores.pii_f +import pytest +from presidio_analyzer.predefined_recognizers.spacy_recognizer import SpacyRecognizer + +# test case parameters for tests with dataset which was previously generated. +class GeneratedTextTestCase: + def __init__(self, test_name, test_input, acceptance_threshold, marks): + self.test_name = test_name + self.test_input = test_input + self.acceptance_threshold = acceptance_threshold + self.marks = marks + + def to_pytest_param(self): + return pytest.param(self.test_input, self.acceptance_threshold, + id=self.test_name, marks=self.marks) + + +# generated-text test cases +cc_test_generate_text_testdata = [ + # small dataset, inconclusive results + GeneratedTextTestCase( + test_name="small-set", + test_input="{}/data/generated_small.txt", + acceptance_threshold=0.5, + marks=pytest.mark.inconclusive + ), + # large dataset - test is slow and inconclusive + GeneratedTextTestCase( + test_name="large-set", + test_input="{}/data/generated_large.txt", + acceptance_threshold=0.5, + marks=pytest.mark.slow + ) +] + + +# credit card recognizer tests on generated data +@pytest.mark.parametrize("test_input,acceptance_threshold", + [testcase.to_pytest_param() for testcase in + cc_test_generate_text_testdata]) +def test_spacy_recognizer_with_generated_text(test_input, acceptance_threshold): + """ + Test spacy recognizer with a generated dataset text file + :param test_input: input text file location + :param acceptance_threshold: minimim precision/recall + allowed for tests to pass + """ + + # read test input from generated file + import os + dir_path = os.path.dirname(os.path.realpath(__file__)) + input_samples = read_synth_dataset( + test_input.format(dir_path)) + scores = score_presidio_recognizer( + SpacyRecognizer(), ['PERSON'], input_samples, True) + assert acceptance_threshold <= scores.pii_f
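For reference, a minimal sketch of how the refactored score_presidio_recognizer helper is driven end-to-end, mirroring the tests in this change; the dataset path and the ZIP_CODE pattern recognizer are illustrative assumptions, not part of this diff:

import os

from presidio_analyzer import Pattern, PatternRecognizer

from presidio_evaluator.data_generator import read_synth_dataset
from presidio_evaluator.presidio_recognizer_evaluator import score_presidio_recognizer

# Read a previously generated synthetic dataset (path is an assumption).
dir_path = os.path.dirname(os.path.realpath(__file__))
input_samples = read_synth_dataset(os.path.join(dir_path, "data/generated_small.txt"))

# A simple custom recognizer to score; the entity name and regex are illustrative.
zip_pattern = Pattern("zip pattern", r"\b[0-9]{5}\b", 0.5)
zip_recognizer = PatternRecognizer(
    "ZIP_CODE", name="zip recognizer", patterns=[zip_pattern]
)

# score_presidio_recognizer now returns a single EvaluationResult
# (pii_precision, pii_recall, pii_f, ...) instead of a tuple of metrics.
scores = score_presidio_recognizer(
    recognizer=zip_recognizer,
    entities_to_keep=["ZIP_CODE"],
    input_samples=input_samples,
)
print(scores.pii_precision, scores.pii_recall, scores.pii_f)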