added additional entities to mapping

This commit is contained in:
Omri Mendels 2023-02-05 16:51:46 +02:00
Родитель 9aef7635ce
Коммит e68b6b0c2b
2 изменённых файлов: 6 добавлений и 157 удалений

Просмотреть файл

@ -69,14 +69,18 @@ class PresidioAnalyzerWrapper(BaseModel):
# Mapping between dataset entities and Presidio entities. Key: Dataset entity, Value: Presidio entity
presidio_entities_map = dict(PERSON="PERSON",
NAME="PERSON",
GPE="LOCATION",
EMAIL_ADDRESS="EMAIL_ADDRESS",
EMAIL="EMAIL_ADDRESS",
CREDIT_CARD="CREDIT_CARD",
CREDIT_CARD_NUMBER="CREDIT_CARD",
FIRST_NAME="PERSON",
LAST_NAME="PERSON",
PHONE_NUMBER="PHONE_NUMBER",
BIRTHDAY="DATE_TIME",
DATE_TIME="DATE_TIME",
DATE_OF_BIRTH="DATE_TIME",
DOMAIN_NAME="URL",
CITY="LOCATION",
ADDRESS="LOCATION",
@ -90,8 +94,9 @@ class PresidioAnalyzerWrapper(BaseModel):
NRP="NRP",
TITLE="O", # not supported
PREFIX="O", # not supported
STREET_ADDRESS="O", # not supported
STREET_ADDRESS="LOCATION",
ZIP_CODE="O", # not supported
ZIPCODE="O", # not supported
AGE="O", # not supported
O="O")

Просмотреть файл

@ -1,156 +0,0 @@
from presidio_evaluator.data_generator import PresidioSentenceFaker
from presidio_evaluator.evaluation.scorers import score_presidio_recognizer
import pandas as pd
import pytest
import numpy as np
from presidio_analyzer import Pattern, PatternRecognizer
class PatternRecognizerTestCase:
    """Parameters for a pattern-recognizer test whose dataset is generated
    from sentence templates plus two CSV value files: one holding the
    common entities and one holding the custom (extension) entities.
    """

    def __init__(
        self,
        test_name,
        entity_name,
        pattern,
        score,
        pii_csv,
        ext_csv,
        utterances,
        num_of_examples,
        acceptance_threshold,
        max_mistakes_number,
    ):
        # Test identity and recognizer definition.
        self.test_name = test_name
        self.entity_name = entity_name
        self.pattern = pattern
        self.score = score
        # Input data locations.
        self.pii_csv = pii_csv
        self.ext_csv = ext_csv
        self.utterances = utterances
        # Evaluation knobs.
        self.num_of_examples = num_of_examples
        self.acceptance_threshold = acceptance_threshold
        self.max_mistakes_number = max_mistakes_number

    def to_pytest_param(self):
        """Pack the fields into a ``pytest.param``, keyed by the test name."""
        args = (
            self.pii_csv,
            self.ext_csv,
            self.utterances,
            self.entity_name,
            self.pattern,
            self.score,
            self.num_of_examples,
            self.acceptance_threshold,
            self.max_mistakes_number,
        )
        return pytest.param(*args, id=self.test_name)
# Template-dataset test cases: one ROCKET pattern recognizer evaluated
# against value files with no errors, all errors, and ~50% errors.
_ROCKET_COMMON = dict(
    entity_name="ROCKET",
    pattern=r"\W*(rocket)\W*",
    score=0.8,
    pii_csv="{}/data/FakeNameGenerator.com_100.csv",
    utterances="{}/data/rocket_example_sentences.txt",
    num_of_examples=100,
)

rocket_test_template_testdata = [
    PatternRecognizerTestCase(
        test_name=case_name,
        ext_csv="{}/data/" + values_file,
        acceptance_threshold=threshold,
        max_mistakes_number=max_mistakes,
        **_ROCKET_COMMON,
    )
    for case_name, values_file, threshold, max_mistakes in [
        ("rocket-no-errors", "FakeRocketGenerator.csv", 1, 0),
        ("rocket-all-errors", "FakeRocketErrorsGenerator.csv", 0, 100),
        ("rocket-some-errors", "FakeRocket50PercentErrorsGenerator.csv", 0.3, 70),
    ]
]
@pytest.mark.parametrize(
    "pii_csv, ext_csv, utterances, "
    "entity_name, pattern, score, num_of_examples, "
    "acceptance_threshold, max_mistakes_number",
    [testcase.to_pytest_param() for testcase in rocket_test_template_testdata],
)
def test_pattern_recognizer(
    pii_csv,
    ext_csv,
    utterances,
    entity_name,
    pattern,
    score,
    num_of_examples,
    acceptance_threshold,
    max_mistakes_number,
):
    """
    Test a generic pattern recognizer with a dataset generated from templates,
    a CSV values file with common entities and a CSV values file with a
    custom entity.

    :param pii_csv: input csv file location with the common entities
    :param ext_csv: input csv file location with custom entities
    :param utterances: template file location
    :param entity_name: custom entity name
    :param pattern: recognizer regex pattern
    :param score: confidence score the pattern recognizer assigns to matches
    :param num_of_examples: number of samples to generate from the templates
    :param acceptance_threshold: minimum PII F-measure allowed for the test to pass
    :param max_mistakes_number: maximum number of model errors allowed
    """
    import os

    dir_path = os.path.dirname(os.path.realpath(__file__))
    dfpii = pd.read_csv(pii_csv.format(dir_path), encoding="utf-8")
    dfext = pd.read_csv(ext_csv.format(dir_path), encoding="utf-8")
    ext_column_name = dfext.columns[0]

    def get_from_ext(i):
        # Cycle through the custom-entity values so every PII row gets one.
        return dfext.iat[i % dfext.shape[0], 0]

    # Extend the PII frame with a column of custom-entity values.
    dfpii[ext_column_name] = [get_from_ext(i) for i in range(dfpii.shape[0])]

    templates = utterances.format(dir_path)
    sentence_faker = PresidioSentenceFaker(
        'en_US', lower_case_ratio=0.05, sentence_templates=templates
    )
    input_samples = sentence_faker.generate_new_fake_sentences(num_of_examples)

    pattern = Pattern("test pattern", pattern, score)
    pattern_recognizer = PatternRecognizer(
        entity_name, name="test recognizer", patterns=[pattern]
    )

    scores = score_presidio_recognizer(
        recognizer=pattern_recognizer,
        entities_to_keep=[entity_name],
        input_samples=input_samples,
    )
    # NOTE(review): both assertions are skipped when pii_f is NaN (e.g. no
    # comparable predictions were produced) — confirm that is intended.
    if not np.isnan(scores.pii_f):
        assert acceptance_threshold <= scores.pii_f
        assert max_mistakes_number >= len(scores.model_errors)