From e68b6b0c2bf626081dcf7fd478bee5f8890053af Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Sun, 5 Feb 2023 16:51:46 +0200 Subject: [PATCH] added additional entities to mapping --- .../models/presidio_analyzer_wrapper.py | 7 +- tests/test_recognizers_template_join_csv.py | 156 ------------------ 2 files changed, 6 insertions(+), 157 deletions(-) delete mode 100644 tests/test_recognizers_template_join_csv.py diff --git a/presidio_evaluator/models/presidio_analyzer_wrapper.py b/presidio_evaluator/models/presidio_analyzer_wrapper.py index b9a554d..f49f75a 100644 --- a/presidio_evaluator/models/presidio_analyzer_wrapper.py +++ b/presidio_evaluator/models/presidio_analyzer_wrapper.py @@ -69,14 +69,18 @@ class PresidioAnalyzerWrapper(BaseModel): # Mapping between dataset entities and Presidio entities. Key: Dataset entity, Value: Presidio entity presidio_entities_map = dict(PERSON="PERSON", + NAME="PERSON", GPE="LOCATION", EMAIL_ADDRESS="EMAIL_ADDRESS", + EMAIL="EMAIL_ADDRESS", CREDIT_CARD="CREDIT_CARD", + CREDIT_CARD_NUMBER="CREDIT_CARD", FIRST_NAME="PERSON", LAST_NAME="PERSON", PHONE_NUMBER="PHONE_NUMBER", BIRTHDAY="DATE_TIME", DATE_TIME="DATE_TIME", + DATE_OF_BIRTH="DATE_TIME", DOMAIN_NAME="URL", CITY="LOCATION", ADDRESS="LOCATION", @@ -90,8 +94,9 @@ class PresidioAnalyzerWrapper(BaseModel): NRP="NRP", TITLE="O", # not supported PREFIX="O", # not supported - STREET_ADDRESS="O", # not supported + STREET_ADDRESS="LOCATION", ZIP_CODE="O", # not supported + ZIPCODE="O", # not supported AGE="O", # not supported O="O") diff --git a/tests/test_recognizers_template_join_csv.py b/tests/test_recognizers_template_join_csv.py deleted file mode 100644 index 47df9f4..0000000 --- a/tests/test_recognizers_template_join_csv.py +++ /dev/null @@ -1,156 +0,0 @@ -from presidio_evaluator.data_generator import PresidioSentenceFaker -from presidio_evaluator.evaluation.scorers import score_presidio_recognizer -import pandas as pd -import pytest -import numpy as np - -from presidio_analyzer import Pattern, PatternRecognizer - - -class PatternRecognizerTestCase: - """ - Test case parameters for tests with dataset generated from a template and - two csv value files, one containing the common-entities and another one with custom entities. - """ - - def __init__( - self, - test_name, - entity_name, - pattern, - score, - pii_csv, - ext_csv, - utterances, - num_of_examples, - acceptance_threshold, - max_mistakes_number, - ): - self.test_name = test_name - self.entity_name = entity_name - self.pattern = pattern - self.score = score - self.pii_csv = pii_csv - self.ext_csv = ext_csv - self.utterances = utterances - self.num_of_examples = num_of_examples - self.acceptance_threshold = acceptance_threshold - self.max_mistakes_number = max_mistakes_number - - def to_pytest_param(self): - return pytest.param( - self.pii_csv, - self.ext_csv, - self.utterances, - self.entity_name, - self.pattern, - self.score, - self.num_of_examples, - self.acceptance_threshold, - self.max_mistakes_number, - id=self.test_name, - ) - - -# template-dataset test cases -rocket_test_template_testdata = [ - PatternRecognizerTestCase( - test_name="rocket-no-errors", - entity_name="ROCKET", - pattern=r"\W*(rocket)\W*", - score=0.8, - pii_csv="{}/data/FakeNameGenerator.com_100.csv", - ext_csv="{}/data/FakeRocketGenerator.csv", - utterances="{}/data/rocket_example_sentences.txt", - num_of_examples=100, - acceptance_threshold=1, - max_mistakes_number=0, - ), - PatternRecognizerTestCase( - test_name="rocket-all-errors", - entity_name="ROCKET", - pattern=r"\W*(rocket)\W*", - score=0.8, - pii_csv="{}/data/FakeNameGenerator.com_100.csv", - ext_csv="{}/data/FakeRocketErrorsGenerator.csv", - utterances="{}/data/rocket_example_sentences.txt", - num_of_examples=100, - acceptance_threshold=0, - max_mistakes_number=100, - ), - PatternRecognizerTestCase( - test_name="rocket-some-errors", - entity_name="ROCKET", - pattern=r"\W*(rocket)\W*", - score=0.8, - pii_csv="{}/data/FakeNameGenerator.com_100.csv", - ext_csv="{}/data/FakeRocket50PercentErrorsGenerator.csv", - utterances="{}/data/rocket_example_sentences.txt", - num_of_examples=100, - acceptance_threshold=0.3, - max_mistakes_number=70, - ), -] - - -@pytest.mark.parametrize( - "pii_csv, ext_csv, utterances, " - "entity_name, pattern, score, num_of_examples, " - "acceptance_threshold, max_mistakes_number", - [testcase.to_pytest_param() for testcase in rocket_test_template_testdata], -) -def test_pattern_recognizer( - pii_csv, - ext_csv, - utterances, - entity_name, - pattern, - score, - num_of_examples, - acceptance_threshold, - max_mistakes_number, -): - """ - Test generic pattern recognizer with a dataset generated from template, a CSV values file with common entities - and another CSV values file with a custom entity - :param pii_csv: input csv file location with the common entities - :param ext_csv: input csv file location with custom entities - :param utterances: template file location - :param entity_name: custom entity name - :param pattern: recognizer pattern - :param num_of_examples: number of samples to be used from dataset to test - :param acceptance_threshold: minimum precision/recall - allowed for tests to pass - """ - - import os - - dir_path = os.path.dirname(os.path.realpath(__file__)) - dfpii = pd.read_csv(pii_csv.format(dir_path), encoding="utf-8") - dfext = pd.read_csv(ext_csv.format(dir_path), encoding="utf-8") - ext_column_name = dfext.columns[0] - - def get_from_ext(i): - index = i % dfext.shape[0] - return dfext.iat[index, 0] - - # extend pii with ext data - dfpii[ext_column_name] = [get_from_ext(i) for i in range(0, dfpii.shape[0])] - - templates = utterances.format(dir_path) - sentence_faker = PresidioSentenceFaker('en_US', lower_case_ratio=0.05, sentence_templates=templates) - input_samples = sentence_faker.generate_new_fake_sentences(num_of_examples) - - pattern = Pattern("test pattern", pattern, score) - pattern_recognizer = PatternRecognizer( - entity_name, name="test recognizer", patterns=[pattern] - ) - - scores = score_presidio_recognizer( - recognizer=pattern_recognizer, - entities_to_keep=[entity_name], - input_samples=input_samples, - ) - if not np.isnan(scores.pii_f): - assert acceptance_threshold <= scores.pii_f - assert max_mistakes_number >= len(scores.model_errors)