This commit is contained in:
omri374 2021-04-28 15:37:16 +03:00
Parent 3df6e1aca8
Commit 3e335a6a3a
28 changed files: 396 additions and 335 deletions

View file

@ -30,7 +30,7 @@ steps:
- script: |
python -m pip install --upgrade pip
pip install -r requirements.txt
python m spacy download en_core_web_lg
python -m spacy download en_core_web_lg
displayName: 'Install dependencies'

View file

@ -1,2 +0,0 @@
from .spacy_retrain import SpacyRetrainer
from .flair_train import FlairTrainer

View file

@ -3,28 +3,35 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"pycharm": {
"is_executing": true
}
},
"metadata": {},
"outputs": [],
"source": [
"from presidio_evaluator.data_generator import read_synth_dataset\n",
"from presidio_evaluator import ModelEvaluator\n",
"from presidio_evaluator.evaluation import ModelError, Evaluator\n",
"from presidio_evaluator.models import BaseModel, PresidioAnalyzerWrapper\n",
"from collections import Counter\n",
"\n",
"import pandas as pd\n",
"\n",
"%load_ext autoreload\n",
"%autoreload 2\n",
"\n",
"MY_PRESIDIO_ENDPOINT = \"http://presidio-api.westeurope.cloudapp.azure.com/api/v1/projects/test/analyze\""
"pd.options.display.max_columns = None\n",
"pd.options.display.width=None"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Evaluate your Presidio instance via the Presidio API"
"# Evaluate Presidio Analyzer\n",
"This notebook runs the PresidioAnalyzerEvaluator class on top of synthetic data.\n",
"\n",
"One can perform the following changes:\n",
"1. Replace the synthetic data creation with real data or with other type of synthetic data\n",
"2. Adapt the Presidio `AnalyzerEngine` to a specific engine with a different set of recognizers or configured to be used on different languages\n",
"\n",
"\n"
]
},
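For point 2 above, a minimal sketch of building a customized AnalyzerEngine (this wiring is an illustrative assumption, not part of this commit):

from presidio_analyzer import AnalyzerEngine, RecognizerRegistry

# Load only the predefined English recognizers; a custom registry
# could hold a hand-picked subset or recognizers for other languages.
registry = RecognizerRegistry()
registry.load_predefined_recognizers(languages=["en"])
analyzer_engine = AnalyzerEngine(registry=registry, supported_languages=["en"])
# Assumption: a PresidioAnalyzerWrapper could then be built around this engine.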
{
@ -37,15 +44,12 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"pycharm": {
"is_executing": true
}
},
"metadata": {},
"outputs": [],
"source": [
"input_samples = read_synth_dataset(\"../data/synth_dataset.txt\")\n",
"print(\"Read {} samples\".format(len(input_samples)))"
"print(\"Read {} samples\".format(len(input_samples)))\n",
"input_samples[0]"
]
},
{
@ -58,11 +62,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"pycharm": {
"is_executing": true
}
},
"metadata": {},
"outputs": [],
"source": [
"flatten = lambda l: [item for sublist in l for item in sublist]\n",
@ -84,32 +84,29 @@
"metadata": {},
"outputs": [],
"source": [
"# Mapping between dataset entities and Presidio entities. Key: Dataset entity, Value: Presidio entity\n",
"entities_mapping = {\n",
" 'PERSON': 'PERSON',\n",
" 'EMAIL': 'EMAIL_ADDRESS',\n",
" 'CREDIT_CARD': 'CREDIT_CARD',\n",
" 'FIRST_NAME': 'PERSON',\n",
" 'PHONE_NUMBER': 'PHONE_NUMBER',\n",
" 'LOCATION':'LOCATION',\n",
" # 'BIRTHDAY': 'DATE_TIME',\n",
" # 'DATE': 'DATE_TIME',\n",
" 'DOMAIN': 'DOMAIN',\n",
" # 'CITY': 'LOCATION',\n",
" # 'ADDRESS': 'LOCATION',\n",
" 'IBAN': 'IBAN_CODE',\n",
" # 'URL': 'DOMAIN_NAME',\n",
" 'US_SSN': 'US_SSN',\n",
" 'IP_ADDRESS': 'IP_ADDRESS',\n",
" # 'ORGANIZATION':'ORG'\n",
" 'O': 'O'\n",
"presidio_entities_map = {\n",
" \"PERSON\": \"PERSON\",\n",
" \"EMAIL_ADDRESS\": \"EMAIL_ADDRESS\",\n",
" \"CREDIT_CARD\": \"CREDIT_CARD\",\n",
" \"FIRST_NAME\": \"PERSON\",\n",
" \"PHONE_NUMBER\": \"PHONE_NUMBER\",\n",
" \"BIRTHDAY\": \"DATE_TIME\",\n",
" \"DATE_TIME\": \"DATE_TIME\",\n",
" \"DOMAIN\": \"DOMAIN\",\n",
" \"CITY\": \"LOCATION\",\n",
" \"ADDRESS\": \"LOCATION\",\n",
" \"NATIONALITY\": \"LOCATION\",\n",
" \"LOCATION\": \"LOCATION\",\n",
" \"IBAN\": \"IBAN_CODE\",\n",
" \"URL\": \"DOMAIN_NAME\",\n",
" \"US_SSN\": \"US_SSN\",\n",
" \"IP_ADDRESS\": \"IP_ADDRESS\",\n",
" \"ORGANIZATION\": \"ORG\",\n",
" \"TITLE\" : \"O\", # skipping evaluation of titles\n",
" \"O\": \"O\",\n",
"}\n",
"presidio_fields = ['CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'DOMAIN_NAME', 'EMAIL_ADDRESS', 'IBAN_CODE',\n",
" 'IP_ADDRESS', 'NRP', 'LOCATION', 'PERSON', 'PHONE_NUMBER', 'US_SSN']\n",
"\n",
"new_list = ModelEvaluator.align_input_samples_to_presidio_analyzer(input_samples,\n",
" entities_mapping,\n",
" presidio_fields)"
"new_list = Evaluator.align_entity_types(input_samples, presidio_entities_map)"
]
},
{
@ -143,9 +140,9 @@
"metadata": {},
"outputs": [],
"source": [
"from presidio_evaluator import PresidioAPIEvaluator\n",
"presidio = PresidioAPIEvaluator(entities_to_keep=list(count_per_entity_new.keys()),endpoint=MY_PRESIDIO_ENDPOINT)\n",
"evaluted_samples = presidio.evaluate_all(new_list[:100])"
"presidio = PresidioAnalyzerWrapper(entities_to_keep=list(count_per_entity_new.keys()))\n",
"evaluator = Evaluator(model=presidio)\n",
"evaluted_samples = evaluator.evaluate_all(new_list[:100])"
]
},
{
@ -163,7 +160,7 @@
"metadata": {},
"outputs": [],
"source": [
"evaluation_result = presidio.calculate_score(evaluted_samples)"
"evaluation_result = evaluator.calculate_score(evaluted_samples)"
]
},
{
@ -197,7 +194,7 @@
"metadata": {},
"outputs": [],
"source": [
"ModelEvaluator.most_common_fp_tokens(errors,n=5)"
"ModelError.most_common_fp_tokens(errors,n=5)"
]
},
{
@ -210,8 +207,9 @@
},
"outputs": [],
"source": [
"fps_df = ModelEvaluator.get_fps_dataframe(errors,entity='PERSON')\n",
"fps_df[['full_text','token','prediction']]"
"fps_df = ModelError.get_fps_dataframe(errors,entity='PERSON')\n",
"if fps_df is not None:\n",
" fps_df[['full_text','token','prediction']]"
]
},
{
@ -220,16 +218,30 @@
"metadata": {},
"outputs": [],
"source": [
"fns_df = ModelEvaluator.get_fns_dataframe(errors,entity='PERSON')\n",
"fns_df = ModelError.get_fns_dataframe(errors,entity='PERSON')\n",
"fns_df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"name": "pycharm-c8930cf3",
"display_name": "presidio-research",
"language": "python",
"display_name": "PyCharm (presidio-research)"
"name": "presidio-research"
},
"language_info": {
"codemirror_mode": {
@ -241,16 +253,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
},
"pycharm": {
"stem_cell": {
"cell_type": "raw",
"source": [],
"metadata": {
"collapsed": false
}
}
"version": "3.8.8"
}
},
"nbformat": 4,

View file

@ -204,9 +204,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "presidio-research",
"language": "python",
"name": "python3"
"name": "presidio-research"
},
"language_info": {
"codemirror_mode": {
@ -218,18 +218,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
},
"pycharm": {
"stem_cell": {
"cell_type": "raw",
"source": [],
"metadata": {
"collapsed": false
}
}
"version": "3.8.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
}

View file

@ -11,7 +11,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -38,7 +38,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -47,7 +47,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -59,7 +59,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -68,7 +68,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -81,10 +81,11 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#!pip install wordcloud\n",
"from wordcloud import WordCloud\n",
"\n",
"def series_to_wordcloud(series):\n",
@ -98,7 +99,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -107,7 +108,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -116,7 +117,7 @@
},
{
"cell_type": "code",
"execution_count": 96,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -125,7 +126,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -134,7 +135,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -150,7 +151,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -159,7 +160,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -168,7 +169,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -177,7 +178,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -187,7 +188,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -197,7 +198,7 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -207,7 +208,7 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -222,7 +223,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -233,7 +234,7 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -252,9 +253,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "presidio-research",
"language": "python",
"name": "python3"
"name": "presidio-research"
},
"language_info": {
"codemirror_mode": {
@ -266,18 +267,18 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
"version": "3.8.8"
},
"pycharm": {
"stem_cell": {
"cell_type": "raw",
"source": [],
"metadata": {
"collapsed": false
}
},
"source": []
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
}

View file

@ -21,9 +21,10 @@
},
"outputs": [],
"source": [
"# install presidio via pip\n",
"# install presidio via pip if not yet installed\n",
"\n",
"#!pip install presidio-analyzer"
"#!pip install presidio-analyzer\n",
"#!pip install presidio-anonymizer"
]
},
{
@ -100,7 +101,7 @@
"source": [
"original_text = \"Hi my name is Doug Funny and this is my website: https://www.dougf.io/\"\n",
"\n",
"presidio_response = analyzer.analyze(original_text,language='en',all_fields=True)\n",
"presidio_response = analyzer.analyze(original_text,language='en')\n",
"presidio_response\n"
]
},
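The removal of all_fields=True reflects the Presidio V2 API, where analyze covers all supported entities unless an explicit entities list is passed. A minimal sketch (assumes presidio-analyzer 2.x):

from presidio_analyzer import AnalyzerEngine

analyzer = AnalyzerEngine()
# Detect all supported entity types:
results = analyzer.analyze(text="Hi my name is Doug Funny", language="en")
# Or restrict detection to a subset:
person_only = analyzer.analyze(text="Hi my name is Doug Funny", language="en", entities=["PERSON"])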
@ -165,7 +166,7 @@
"\n",
"text = \"Our son asdfhlk used to work in Germany\"\n",
"\n",
"response = analyzer.analyze(text=text,language='en',all_fields=True)\n",
"response = analyzer.analyze(text=text,language='en')\n",
"print(f\"Presidio' response: {response}\")\n",
"\n",
"\n",
@ -188,9 +189,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "PyCharm (presidio-research)",
"display_name": "presidio-research",
"language": "python",
"name": "pycharm-c8930cf3"
"name": "presidio-research"
},
"language_info": {
"codemirror_mode": {
@ -202,7 +203,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
"version": "3.8.8"
}
},
"nbformat": 4,

View file

@ -19,6 +19,7 @@
"from presidio_evaluator import InputSample\n",
"from presidio_evaluator.data_generator import read_synth_dataset\n",
"from presidio_evaluator.validation import split_dataset, save_to_json\n",
"from datetime import date\n",
"\n",
"%reload_ext autoreload"
]
@ -36,7 +37,7 @@
"metadata": {},
"outputs": [],
"source": [
"all_samples = read_synth_dataset(\"../presidio_evaluator/data_generator/synth_dataset.txt\")\n",
"all_samples = read_synth_dataset(\"../data/synth_dataset.txt\")\n",
"print(len(all_samples))"
]
},
@ -90,9 +91,11 @@
"metadata": {},
"outputs": [],
"source": [
"DATE_DATE = date.today().strftime(\"%b-%d-%Y\")\n",
"\n",
"save_to_json(train,\"../data/train_{}.json\".format(DATE_DATE))\n",
"save_to_json(test,\"../data/test_{}.json\".format(DATE_DATE))\n",
"save_to_json(validation,\"../data/1validation_{}.json\".format(DATE_DATE))\n"
"save_to_json(validation,\"../data/validation_{}.json\".format(DATE_DATE))\n"
]
},
{
@ -130,27 +133,22 @@
"source": [
"assert len(train) + len(test) + len(validation) == len(all_samples)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "presidio-research",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
"name": "presidio-research"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
}

View file

@ -18,7 +18,7 @@
"import pandas as pd\n",
"pd.options.display.max_rows = 4000\n",
"pd.set_option('display.max_colwidth', -1)\n",
"from deeppavlov.dataset_readers.conll2003_reader import Conll2003DatasetReader"
"#TODO: fix CONLL2003 download and usage"
]
},
{
@ -31,8 +31,8 @@
},
"outputs": [],
"source": [
"reader = Conll2003DatasetReader()\n",
"dataset = reader.read(data_path =\"../../data\",dataset_name='conll2003')\n",
"#reader = Conll2003DatasetReader()\n",
"#dataset = reader.read(data_path =\"../../data\",dataset_name='conll2003')\n",
"#Note: make sure you haven't downloaded something else with this function before, \n",
"# as it will not download a new dataset (even if your previous download was for a different dataset)"
]
@ -645,15 +645,6 @@
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
},
"pycharm": {
"stem_cell": {
"cell_type": "raw",
"metadata": {
"collapsed": false
},
"source": []
}
}
},
"nbformat": 4,

View file

@ -13,7 +13,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -23,7 +23,7 @@
"from sklearn_crfsuite import metrics\n",
"\n",
"from presidio_evaluator import InputSample\n",
"from presidio_evaluator.crf_evaluator import CRFEvaluator\n",
"from presidio_evaluator.models.crf_model import CRFModel\n",
"from presidio_evaluator.data_generator import read_synth_dataset"
]
},
@ -48,7 +48,6 @@
"execution_count": null,
"metadata": {
"pycharm": {
"is_executing": true,
"name": "#%%\n"
}
},
@ -100,7 +99,7 @@
"metadata": {},
"outputs": [],
"source": [
"CRFEvaluator.sent2features(train_sents[0])[0]"
"CRFModel.sent2features(train_sents[0])[0]"
]
},
{
@ -110,11 +109,11 @@
"outputs": [],
"source": [
"%%time\n",
"X_train = [CRFEvaluator.sent2features(s) for s in train_sents]\n",
"y_train = [CRFEvaluator.sent2labels(s) for s in train_sents]\n",
"X_train = [CRFModel.sent2features(s) for s in train_sents]\n",
"y_train = [CRFModel.sent2labels(s) for s in train_sents]\n",
"\n",
"X_test = [CRFEvaluator.sent2features(s) for s in test_sents]\n",
"y_test = [CRFEvaluator.sent2labels(s) for s in test_sents]"
"X_test = [CRFModel.sent2features(s) for s in test_sents]\n",
"y_test = [CRFModel.sent2labels(s) for s in test_sents]"
]
},
{
@ -149,7 +148,7 @@
"source": [
"import pickle\n",
"with open(\"../../models/crf.pickle\",'wb') as f:\n",
" data = pickle.dump(crf, f,protocol=pickle.HIGHEST_PROTOCOL)\n",
" pickle.dump(crf, f,protocol=pickle.HIGHEST_PROTOCOL)\n",
" "
]
},
@ -277,9 +276,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "PyCharm (presidio-research)",
"language": "python",
"name": "python3"
"name": "pycharm-c8930cf3"
},
"language_info": {
"codemirror_mode": {
@ -291,18 +290,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
},
"pycharm": {
"stem_cell": {
"cell_type": "raw",
"source": [],
"metadata": {
"collapsed": false
}
}
"version": "3.8.8"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
}

View file

@ -15,24 +15,22 @@
}
},
"source": [
"This notebook takes train and test datasets (of type `List[InputSample]`)\n",
"This notebook takes train and test datasets (of type `List[InputSample]`)\n",
"and transforms them into two structures consumed by Spacy:\n",
"1. Spacy JSON (see https://spacy.io/api/annotation#json-input)\n",
"2. Spacy Pickle files (of structure `[(full_text,\"entities\":[(start, end, type),(...))]`. \n",
"See more details here: https://spacy.io/api/annotation#json-input)\n",
"\n",
"JSON is used for Spacy's CLI trainer. \n",
"Pickle is used for fine-tuning using the logic in [../models/spacy_retrain.py](../models/spacy_retrain.py)"
">> Note that this notebook uses the old spaCy 2.0 structure. In order to train spaCy 3 models,\n",
"> the spacy `convert` CLI call should be used:\n",
"\n",
">>`python -m spacy convert ./data.json ./output.spacy`"
]
},
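As an alternative to the convert CLI, a minimal sketch for writing a spaCy 3 .spacy file directly (assumes spaCy 3.x and the [(full_text, {"entities": ...})] structure built later in this notebook):

import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
doc_bin = DocBin()
for full_text, annotations in spacy_train:  # spacy_train is created below in this notebook
    doc = nlp.make_doc(full_text)
    spans = [doc.char_span(start, end, label=label) for start, end, label in annotations["entities"]]
    doc.ents = [span for span in spans if span is not None]  # drop misaligned spans
    doc_bin.add(doc)
doc_bin.to_disk("./train.spacy")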
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"pycharm": {
"is_executing": false
}
},
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from presidio_evaluator.data_generator import read_synth_dataset\n",
@ -62,7 +60,7 @@
},
"outputs": [],
"source": [
"data_path = \"../data/generated_{}_{}.json\"\n",
"data_path = \"../../data/generated_{}_{}.json\"\n",
"\n",
"train_samples = read_synth_dataset(data_path.format(\"train\",DATA_DATE))\n",
"print(\"Read {} samples\".format(len(train_samples)))"
@ -81,7 +79,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -146,12 +144,11 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"entities_spacy = [x[1]['entities'] for x in spacy_train]\n",
"entities_spacy\n",
"entities_spacy_flat = []\n",
"for samp in entities_spacy:\n",
" for ent in samp:\n",
@ -168,7 +165,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -185,7 +182,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -194,7 +191,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -210,16 +207,16 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pickle\n",
"import json\n",
"with open(\"../data/train.pickle\", 'wb') as handle:\n",
"with open(\"../../data/train.pickle\", 'wb') as handle:\n",
" pickle.dump(spacy_train,handle, protocol=pickle.HIGHEST_PROTOCOL)\n",
"\n",
"with open(\"../data/train.json\",\"w\") as f:\n",
"with open(\"../../data/train.json\",\"w\") as f:\n",
" json.dump(spacy_train_json,f)\n",
" "
]
@ -233,7 +230,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -243,7 +240,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -261,15 +258,15 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pickle\n",
"with open(\"../data/test.pickle\", 'wb') as handle:\n",
"with open(\"../../data/test.pickle\", 'wb') as handle:\n",
" pickle.dump(spacy_test,handle, protocol=pickle.HIGHEST_PROTOCOL)\n",
" \n",
"with open(\"../data/test.json\",\"w\") as f:\n",
"with open(\"../../data/test.json\",\"w\") as f:\n",
" json.dump(spacy_test_json,f)\n",
" "
]
@ -284,9 +281,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "PyCharm (presidio-research)",
"language": "python",
"name": "python3"
"name": "pycharm-c8930cf3"
},
"language_info": {
"codemirror_mode": {
@ -298,16 +295,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
},
"pycharm": {
"stem_cell": {
"cell_type": "raw",
"metadata": {
"collapsed": false
},
"source": []
}
"version": "3.8.8"
}
},
"nbformat": 4,

View file

@ -342,18 +342,18 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
"version": "3.7.9"
},
"pycharm": {
"stem_cell": {
"cell_type": "raw",
"source": [],
"metadata": {
"collapsed": false
}
},
"source": []
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
}

View file

@ -20,6 +20,7 @@
"outputs": [],
"source": [
"from presidio_evaluator.data_generator import read_synth_dataset\n",
"from presidio_evaluator.evaluation import ModelError, Evaluator\n",
"%reload_ext autoreload\n",
"%autoreload 2"
]
@ -42,8 +43,49 @@
"outputs": [],
"source": [
"synth_samples = read_synth_dataset(\"../../data/synth_dataset.txt\")\n",
"print(len(synth_samples))\n",
"DATASET = synth_samples"
"print(len(synth_samples))\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"is_executing": false
}
},
"source": [
"Map entity types"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"presidio_entities_map = {\n",
" \"PERSON\": \"PER\",\n",
" \"EMAIL_ADDRESS\": \"O\",\n",
" \"CREDIT_CARD\": \"O\",\n",
" \"FIRST_NAME\": \"PER\",\n",
" \"PHONE_NUMBER\": \"O\",\n",
" \"BIRTHDAY\": \"O\",\n",
" \"DATE_TIME\": \"O\",\n",
" \"DOMAIN\": \"O\",\n",
" \"CITY\": \"LOC\",\n",
" \"ADDRESS\": \"LOC\",\n",
" \"NATIONALITY\": \"LOC\",\n",
" \"LOCATION\": \"LOC\",\n",
" \"IBAN\": \"O\",\n",
" \"URL\": \"O\",\n",
" \"US_SSN\": \"O\",\n",
" \"IP_ADDRESS\": \"O\",\n",
" \"ORGANIZATION\": \"ORG\",\n",
" \"TITLE\" : \"O\", # skipping evaluation of titles\n",
" \"O\": \"O\",\n",
"}\n",
"\n",
"synth_samples = Evaluator.align_entity_types(synth_samples, presidio_entities_map)"
]
},
{
@ -58,7 +100,7 @@
"source": [
"from collections import Counter\n",
"entity_counter = Counter()\n",
"for sample in DATASET:\n",
"for sample in synth_samples:\n",
" for tag in sample.tags:\n",
" entity_counter[tag]+=1"
]
@ -79,15 +121,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"pycharm": {
"is_executing": false
}
},
"metadata": {},
"outputs": [],
"source": [
"DATASET[1]"
]
"source": []
},
{
"cell_type": "code",
@ -100,7 +136,7 @@
"outputs": [],
"source": [
"#max length sentence\n",
"max([len(sample.tokens) for sample in DATASET])"
"max([len(sample.tokens) for sample in synth_samples])"
]
},
{
@ -123,9 +159,7 @@
"flair_ner = 'ner'\n",
"flair_ner_fast = 'ner-fast'\n",
"flair_ontonotes = 'ner-ontonotes-fast'\n",
"flair_bert_embeddings = '../../models/presidio-ner/flair-bert-embeddings.pt'\n",
"glove_flair_embeddings = '../../models/presidio-ner/flair-embeddings.pt'\n",
"models = [flair_bert_embeddings, glove_flair_embeddings, flair_ner,flair_ner_fast,flair_ontonotes]"
"models = [flair_ner, flair_ner_fast]"
]
},
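For reference, a minimal sketch of how these model names are resolved by Flair (assumes the flair package is installed):

from flair.models import SequenceTagger

# Named models such as 'ner' or 'ner-fast' are downloaded on first use;
# a local path to a trained model file can be passed the same way.
tagger = SequenceTagger.load("ner-fast")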
{
@ -138,14 +172,15 @@
},
"outputs": [],
"source": [
"from presidio_evaluator.flair_evaluator import FlairEvaluator\n",
"from presidio_evaluator.models import FlairModel\n",
"\n",
"for model in models:\n",
" print(\"-----------------------------------\")\n",
" print(\"Evaluating model {}\".format(model))\n",
" flair_evaluator = FlairEvaluator(model_path=model)\n",
" evaluation_results = flair_evaluator.evaluate_all(DATASET)\n",
" scores = flair_evaluator.calculate_score(evaluation_results)\n",
" flair_model = FlairModel(model_path=model)\n",
" evaluator = Evaluator(model=flair_model)\n",
" evaluation_results = evaluator.evaluate_all(synth_samples)\n",
" scores = evaluator.calculate_score(evaluation_results)\n",
" \n",
" \n",
" print(\"Confusion matrix:\")\n",
@ -189,8 +224,7 @@
"source": [
"errors = scores.model_errors\n",
"\n",
"from presidio_evaluator import ModelEvaluator\n",
"ModelEvaluator.most_common_fp_tokens(errors)#[model_error for model_error in errors if model_error.error_type =='FP']\n"
"ModelError.most_common_fp_tokens(errors)"
]
},
{
@ -203,7 +237,7 @@
},
"outputs": [],
"source": [
"fps_df = ModelEvaluator.get_fps_dataframe(errors,entity=['PERSON'])\n",
"fps_df = ModelError.get_fps_dataframe(errors,entity=['PERSON'])\n",
"fps_df[['full_text','token','prediction']]"
]
},
@ -224,7 +258,7 @@
},
"outputs": [],
"source": [
"ModelEvaluator.most_common_fn_tokens(errors,n=50, entity=['PERSON'])"
"ModelError.most_common_fn_tokens(errors,n=50, entity=['PER'])"
]
},
{
@ -244,7 +278,7 @@
},
"outputs": [],
"source": [
"fns_df = ModelEvaluator.get_fns_dataframe(errors,entity=['PERSON'])"
"fns_df = ModelError.get_fns_dataframe(errors,entity=['PERSON'])"
]
},
{
@ -264,9 +298,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "presidio-research",
"language": "python",
"name": "python3"
"name": "presidio-research"
},
"language_info": {
"codemirror_mode": {
@ -278,18 +312,18 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
"version": "3.8.8"
},
"pycharm": {
"stem_cell": {
"cell_type": "raw",
"source": [],
"metadata": {
"collapsed": false
}
},
"source": []
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
}

View file

@ -21,7 +21,7 @@
"source": [
"import spacy\n",
"\n",
"from presidio_evaluator import ModelEvaluator\n",
"from presidio_evaluator.evaluation import Evaluator, ModelError\n",
"from presidio_evaluator.data_generator import read_synth_dataset\n",
"%reload_ext autoreload\n",
"%autoreload 2\n",
@ -129,14 +129,7 @@
},
"outputs": [],
"source": [
"models = []\n",
"\n",
"en_core_web_lg = r\"en_core_web_lg\"\n",
"spacy_new_ontonotes28 = r\"C:\\Users\\ommendel\\OneDrive - Microsoft\\Projects\\presidio\\Presidio-internal\\presidio-evaluator\\models\\spacy_new_ontonotes28\"\n",
"\n",
"spacy_ft_100 = r\"C:\\Users\\ommendel\\OneDrive - Microsoft\\Projects\\presidio\\Presidio-internal\\presidio-evaluator\\models\\spacy_ft_100\\model-final\"\n",
"\n",
"models = [en_core_web_lg, spacy_new_ontonotes28, spacy_ft_100]"
"models = [\"en_core_web_lg\", \"en_core_web_trf\"]"
]
},
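Note that en_core_web_trf is a spaCy 3 transformer pipeline; a minimal sketch of the setup it assumes:

# pip install spacy[transformers]
# python -m spacy download en_core_web_trf
import spacy

nlp = spacy.load("en_core_web_trf")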
{
@ -156,15 +149,17 @@
},
"outputs": [],
"source": [
"from presidio_evaluator.spacy_evaluator import SpacyEvaluator\n",
"from presidio_evaluator.models import SpacyModel\n",
"\n",
"\n",
"for model in models:\n",
" print(\"-----------------------------------\")\n",
" print(\"Evaluating model {}\".format(model))\n",
" nlp = spacy.load(model)\n",
" spacy_evaluator = SpacyEvaluator(model=nlp,entities_to_keep=['PERSON','GPE','ORG'])\n",
" evaluation_results = spacy_evaluator.evaluate_all(DATASET)\n",
" scores = spacy_evaluator.calculate_score(evaluation_results)\n",
" spacy_model = SpacyModel(model=nlp,entities_to_keep=['PERSON','GPE','ORG'])\n",
" evaluator = Evaluator(model=spacy_model)\n",
" evaluation_results = evaluator.evaluate_all(DATASET)\n",
" scores = evaluator.calculate_score(evaluation_results)\n",
" \n",
" print(\"Confusion matrix:\")\n",
" print(scores.results)\n",
@ -243,7 +238,7 @@
},
"outputs": [],
"source": [
"ModelEvaluator.most_common_fp_tokens(errors)#[model_error for model_error in errors if model_error.error_type =='FP']"
"ModelError.most_common_fp_tokens(errors)"
]
},
{
@ -256,7 +251,7 @@
},
"outputs": [],
"source": [
"fps_df = ModelEvaluator.get_fps_dataframe(errors,entity=['LOCATION'])\n",
"fps_df = ModelError.get_fps_dataframe(errors,entity=['GPE'])\n",
"fps_df[['full_text','token','prediction']]"
]
},
@ -278,7 +273,7 @@
"outputs": [],
"source": [
"errors = scores.model_errors\n",
"ModelEvaluator.most_common_fn_tokens(errors,n=50, entity=['PERSON'])"
"ModelError.most_common_fn_tokens(errors,n=50, entity=['PERSON'])"
]
},
{
@ -298,7 +293,7 @@
},
"outputs": [],
"source": [
"fns_df = ModelEvaluator.get_fns_dataframe(errors,entity=['GPE'])"
"fns_df = ModelError.get_fns_dataframe(errors,entity=['GPE'])"
]
},
{
@ -320,15 +315,23 @@
"metadata": {},
"outputs": [],
"source": [
"print(\"All errors:\\n\")\n",
"[print(error,\"\\n\") for error in errors]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "PyCharm (presidio-research)",
"language": "python",
"name": "python3"
"name": "pycharm-c8930cf3"
},
"language_info": {
"codemirror_mode": {
@ -340,18 +343,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
},
"pycharm": {
"stem_cell": {
"cell_type": "raw",
"source": [],
"metadata": {
"collapsed": false
}
}
"version": "3.8.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
}

View file

@ -3,7 +3,7 @@ from typing import List, Optional
import spacy
import srsly
from spacy.tokens import Token
from spacy.training import docs_to_json
from spacy.training import docs_to_json, iob_to_biluo
from tqdm import tqdm
from presidio_evaluator import span_to_tag, tokenize
@ -106,7 +106,7 @@ class Span:
return cls(**data)
class SimpleSpacyExtensions(object):
class SimpleSpacyExtensions:
def __init__(self, **kwargs):
"""
Serialization of Spacy Token extensions.
@ -119,7 +119,7 @@ class SimpleSpacyExtensions(object):
return self.__dict__
class SimpleToken(object):
class SimpleToken:
"""
A class mimicking the Spacy Token class, for serialization purposes
"""
@ -359,19 +359,38 @@ class InputSample(object):
return self.full_text, {"entities": new_entities}
@classmethod
def from_spacy(cls, text, annotations, translate_from_spacy=True):
def from_spacy_doc(cls, doc, map_spacy_entities_to_presidio=True, scheme="BILUO"):
if scheme not in ("BILUO","BILOU","BIO","IOB"):
raise ValueError("scheme should be one of \"BILUO\",\"BILOU\",\"BIO\",\"IOB\"")
spans = []
for annotation in annotations:
tag = (
cls.rename_from_spacy_tags([annotation[2]])[0]
if translate_from_spacy
else annotation[2]
for ent in doc.ents:
entity_type = (
cls.rename_from_spacy_tags(ent.label_)
if map_spacy_entities_to_presidio
else ent.label_
)
span = Span(
tag, text[annotation[0] : annotation[1]], annotation[0], annotation[1]
entity_type=entity_type,
entity_value=ent.text,
start_position=ent.start_char,
end_position=ent.end_char,
)
spans.append(span)
return cls(full_text=text, masked=None, spans=spans)
tags = [f"{token.ent_iob_}-{token.ent_type_}" if token.ent_iob_ != "O" else "O" for token in doc]
if scheme in ("BILUO", "BILOU"):
tags = iob_to_biluo(tags)
return cls(
full_text=doc.text,
masked=None,
spans=spans,
tokens=doc,
tags=tags,
create_tags_from_span=False,
scheme=scheme
)
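A minimal usage sketch for the new from_spacy_doc constructor (the model name is an assumption):

import spacy

nlp = spacy.load("en_core_web_lg")
doc = nlp("My name is Doug Funny and I live in Seattle")
sample = InputSample.from_spacy_doc(doc, map_spacy_entities_to_presidio=True, scheme="BIO")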
@staticmethod
def create_spacy_dataset(

View file

@ -2,7 +2,8 @@ from pathlib import Path
from typing import List, Optional
import requests
from spacy.training import converters
from spacy.training.converters import conll_ner_to_docs
from tqdm import tqdm
from presidio_evaluator import InputSample
from presidio_evaluator.dataset_formatters import DatasetFormatter
@ -11,15 +12,15 @@ from presidio_evaluator.dataset_formatters import DatasetFormatter
class CONLL2003Formatter(DatasetFormatter):
def __init__(
self,
files_path=Path("../data/conll2003").resolve(),
glob_pattern: str = "*.iob",
files_path=Path("../../data/conll2003").resolve(),
glob_pattern: str = "*.*",
):
self.files_path = files_path
self.glob_pattern = glob_pattern
@staticmethod
def download(
local_data_path=Path("../data/conll2003").resolve(),
local_data_path=Path("../../data/conll2003").resolve(),
conll_gh_path="https://raw.githubusercontent.com/glample/tagger/master/dataset/",
):
@ -43,6 +44,7 @@ class CONLL2003Formatter(DatasetFormatter):
def to_input_samples(self, fold: Optional[str] = None) -> List[InputSample]:
files_found = False
input_samples = []
for i, file_path in enumerate(self.files_path.glob(self.glob_pattern)):
if fold and fold not in file_path.name:
continue
@ -53,10 +55,19 @@ class CONLL2003Formatter(DatasetFormatter):
text = "".join(text)
output_docs = converters.conll_ner2json(
output_docs = conll_ner_to_docs(
input_data=text, n_sents=None, no_print=True
)
for doc in tqdm(output_docs, f"Processing doc for file {file_path.name}"):
input_samples.append(InputSample.from_spacy_doc(doc=doc))
# TODO: Translate to InputSample
if not files_found:
raise FileNotFoundError(f"No files found for pattern {self.glob_pattern}")
raise FileNotFoundError(f"No files found for pattern {self.glob_pattern} and fold {fold}")
return input_samples
if __name__ == "__main__":
conll_formatter = CONLL2003Formatter()
train_samples = conll_formatter.to_input_samples(fold="train")
print(train_samples[:5])

View file

@ -150,7 +150,7 @@ class Evaluator:
return evaluation_results
@staticmethod
def align_input_samples_to_presidio_analyzer(
def align_entity_types(
input_samples: List[InputSample],
entities_mapping: Dict[
str, str
@ -166,24 +166,23 @@ class Evaluator:
# A list that will contain updated input samples,
new_list = []
# Iterate on all samples
for input_sample in new_input_samples:
contains_presidio_field = False
contains_field_in_mapping = False
new_spans = []
# Update spans to match Presidio's entity name
# Update spans to match the entity types in the values of entities_mapping
for span in input_sample.spans:
in_presidio_field = False
if span.entity_type in entities_mapping.keys():
new_name = entities_mapping.get(span.entity_type)
span.entity_type = new_name
contains_presidio_field = True
contains_field_in_mapping = True
# Add to new span list, if the span contains an entity relevant to Presidio
new_spans.append(span)
else:
raise ValueError(f"Key {span.entity_type} cannot be found in the provided entities_mapping")
input_sample.spans = new_spans
# Update tags in case this sample has relevant entities for evaluation
if contains_presidio_field:
if contains_field_in_mapping:
for i, tag in enumerate(input_sample.tags):
has_prefix = "-" in tag
if has_prefix:
@ -200,7 +199,9 @@ class Evaluator:
input_sample.tags[i] = "O"
new_list.append(input_sample)
return new_list
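A minimal usage sketch for the renamed method (mapping values are illustrative); note that every span entity type present in the samples must appear as a key, otherwise the ValueError above is raised:

# samples: List[InputSample], assumed to exist
mapping = {"FIRST_NAME": "PERSON", "CITY": "LOCATION", "O": "O"}
aligned_samples = Evaluator.align_entity_types(input_samples=samples, entities_mapping=mapping)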
def calculate_score(
self,

View file

@ -95,7 +95,7 @@ def score_presidio_recognizer(
print("Preparing dataset by aligning entity names to Presidio's entity names")
updated_samples = Evaluator.align_input_samples_to_presidio_analyzer(input_samples)
updated_samples = Evaluator.align_entity_types(input_samples)
model = PresidioRecognizerWrapper(
recognizer=recognizer,
@ -127,7 +127,7 @@ def score_presidio_analyzer(
print("Preparing dataset by aligning entity names to Presidio's entity names")
updated_samples = Evaluator.align_input_samples_to_presidio_analyzer(input_samples)
updated_samples = Evaluator.align_entity_types(input_samples)
flatten = lambda l: [item for sublist in l for item in sublist]
from collections import Counter

View file

@ -1,9 +1,9 @@
from typing import List
from typing import List, Optional, Dict
import spacy
try:
from flair.data import Sentence, build_spacy_tokenizer
from flair.data import Sentence
from flair.models import SequenceTagger
from flair.tokenization import SpacyTokenizer
except ImportError:
@ -21,7 +21,6 @@ class FlairModel(BaseModel):
model_path: str = None,
entities_to_keep: List[str] = None,
verbose: bool = False,
translate_to_spacy_entities=True,
):
"""
Evaluator for Flair models
@ -29,7 +28,7 @@ class FlairModel(BaseModel):
:param model_path:
:param entities_to_keep:
:param verbose:
:param translate_to_spacy_entities:
and model expected entity types
"""
super().__init__(
entities_to_keep=entities_to_keep,
@ -43,18 +42,9 @@ class FlairModel(BaseModel):
self.model = model
self.spacy_tokenizer = SpacyTokenizer(model=spacy.load("en_core_web_lg"))
self.translate_to_spacy_entities = translate_to_spacy_entities
if self.translate_to_spacy_entities:
print(
"Translating entities using this dictionary: {}".format(
PRESIDIO_SPACY_ENTITIES
)
)
def predict(self, sample: InputSample) -> List[str]:
if self.translate_to_spacy_entities:
sample.translate_input_sample_tags()
sentence = Sentence(text=sample.full_text, use_tokenizer=self.spacy_tokenizer)
self.model.predict(sentence)

View file

View file

@ -71,10 +71,12 @@ class PresidioAnalyzerWrapper(BaseModel):
"CITY": "LOCATION",
"ADDRESS": "LOCATION",
"NATIONALITY": "LOCATION",
"LOCATION": "LOCATION",
"IBAN": "IBAN_CODE",
"URL": "DOMAIN_NAME",
"US_SSN": "US_SSN",
"IP_ADDRESS": "IP_ADDRESS",
"ORGANIZATION": "ORG",
"TITLE": "O",
"O": "O",
}

View file

@ -5481,7 +5481,7 @@
"masked": null,
"spans": [
{
"entity_type": "EMAIL",
"entity_type": "EMAIL_ADDRESS",
"entity_value": "SvenZimmer@fleckens.hu",
"start_position": 39,
"end_position": 61
@ -5585,7 +5585,7 @@
"O",
"O",
"O",
"U-EMAIL"
"U-EMAIL_ADDRESS"
],
"template_id": null,
"metadata": {
@ -9288,7 +9288,7 @@
"masked": null,
"spans": [
{
"entity_type": "EMAIL",
"entity_type": "EMAIL_ADDRESS",
"entity_value": "EmilySanderson@jourrapide.com",
"start_position": 59,
"end_position": 88
@ -9440,7 +9440,7 @@
"O",
"O",
"O",
"U-EMAIL"
"U-EMAIL_ADDRESS"
],
"template_id": null,
"metadata": {
@ -20492,7 +20492,7 @@
"masked": null,
"spans": [
{
"entity_type": "EMAIL",
"entity_type": "EMAIL_ADDRESS",
"entity_value": "NatalinaLucchese@superrito.com",
"start_position": 59,
"end_position": 89
@ -20644,7 +20644,7 @@
"O",
"O",
"O",
"U-EMAIL"
"U-EMAIL_ADDRESS"
],
"template_id": null,
"metadata": {
@ -25723,7 +25723,7 @@
"masked": null,
"spans": [
{
"entity_type": "EMAIL",
"entity_type": "EMAIL_ADDRESS",
"entity_value": "HannaUkkonen@dayrep.com",
"start_position": 39,
"end_position": 62
@ -25827,7 +25827,7 @@
"O",
"O",
"O",
"U-EMAIL"
"U-EMAIL_ADDRESS"
],
"template_id": null,
"metadata": {
@ -32783,7 +32783,7 @@
"masked": null,
"spans": [
{
"entity_type": "EMAIL",
"entity_type": "EMAIL_ADDRESS",
"entity_value": "yahyaeriksson@gustr.com",
"start_position": 23,
"end_position": 46
@ -32918,7 +32918,7 @@
"O",
"O",
"O",
"U-EMAIL",
"U-EMAIL_ADDRESS",
"O",
"O",
"O",
@ -40833,7 +40833,7 @@
"masked": null,
"spans": [
{
"entity_type": "EMAIL",
"entity_type": "EMAIL_ADDRESS",
"entity_value": "VictorAndreyev@cuvox.de",
"start_position": 23,
"end_position": 46
@ -40968,7 +40968,7 @@
"O",
"O",
"O",
"U-EMAIL",
"U-EMAIL_ADDRESS",
"O",
"O",
"O",
@ -44468,7 +44468,7 @@
"masked": null,
"spans": [
{
"entity_type": "EMAIL",
"entity_type": "EMAIL_ADDRESS",
"entity_value": "HarrisonBarnes@fleckens.hu",
"start_position": 59,
"end_position": 85
@ -44620,7 +44620,7 @@
"O",
"O",
"O",
"U-EMAIL"
"U-EMAIL_ADDRESS"
],
"template_id": null,
"metadata": {
@ -49165,7 +49165,7 @@
"masked": null,
"spans": [
{
"entity_type": "EMAIL",
"entity_type": "EMAIL_ADDRESS",
"entity_value": "MathiasEJespersen@armyspy.com",
"start_position": 23,
"end_position": 52
@ -49300,7 +49300,7 @@
"O",
"O",
"O",
"U-EMAIL",
"U-EMAIL_ADDRESS",
"O",
"O",
"O",
@ -62644,7 +62644,7 @@
"masked": null,
"spans": [
{
"entity_type": "EMAIL",
"entity_type": "EMAIL_ADDRESS",
"entity_value": "ElishaFedorov@fleckens.hu",
"start_position": 39,
"end_position": 64
@ -62748,7 +62748,7 @@
"O",
"O",
"O",
"U-EMAIL"
"U-EMAIL_ADDRESS"
],
"template_id": null,
"metadata": {
@ -68659,7 +68659,7 @@
"masked": null,
"spans": [
{
"entity_type": "EMAIL",
"entity_type": "EMAIL_ADDRESS",
"entity_value": "HartmannAntonsson@jourrapide.com",
"start_position": 59,
"end_position": 91
@ -68811,7 +68811,7 @@
"O",
"O",
"O",
"U-EMAIL"
"U-EMAIL_ADDRESS"
],
"template_id": null,
"metadata": {
@ -72669,7 +72669,7 @@
"masked": null,
"spans": [
{
"entity_type": "EMAIL",
"entity_type": "EMAIL_ADDRESS",
"entity_value": "MakarMaslow@teleworm.us",
"start_position": 39,
"end_position": 62
@ -72773,7 +72773,7 @@
"O",
"O",
"O",
"U-EMAIL"
"U-EMAIL_ADDRESS"
],
"template_id": null,
"metadata": {

View file

@ -4,7 +4,7 @@
"masked": null,
"spans": [
{
"entity_type": "EMAIL",
"entity_type": "EMAIL_ADDRESS",
"entity_value": "syedsimensen@cuvox.de",
"start_position": 59,
"end_position": 80
@ -156,7 +156,7 @@
"O",
"O",
"O",
"U-EMAIL"
"U-EMAIL_ADDRESS"
],
"template_id": null,
"metadata": {
@ -2140,7 +2140,7 @@
"masked": null,
"spans": [
{
"entity_type": "EMAIL",
"entity_type": "EMAIL_ADDRESS",
"entity_value": "AkahoYokoi@cuvox.de",
"start_position": 59,
"end_position": 78
@ -2292,7 +2292,7 @@
"O",
"O",
"O",
"U-EMAIL"
"U-EMAIL_ADDRESS"
],
"template_id": null,
"metadata": {

View file

@ -1,4 +1,4 @@
My email is [EMAIL]
My email is [EMAIL_ADDRESS]
My address is [ADDRESS]
My first name is [FIRST_NAME] and my last is [LAST_NAME]
My name is [PERSON]

View file

@ -1,6 +1,9 @@
import numpy as np
from collections import Counter
from presidio_evaluator import InputSample
import numpy as np
import pytest
from presidio_evaluator import InputSample, Span
from presidio_evaluator.data_generator import read_synth_dataset
from presidio_evaluator.evaluation import EvaluationResult, Evaluator
from tests.mocks import (
@ -296,3 +299,49 @@ def test_dataset_to_metric_50_50_model():
assert metrics.pii_precision == 1
assert metrics.pii_recall < 0.75
assert metrics.pii_recall > 0.25
def test_align_entity_types_correct_output():
sample1 = InputSample(
"I live in ABC",
spans=[Span("A", "a", 0, 1), Span("A", "a", 10, 11), Span("B", "b", 100, 101)],
create_tags_from_span=False,
)
sample2 = InputSample(
"I live in ABC",
spans=[Span("A", "a", 0, 1), Span("A", "a", 10, 11), Span("C", "c", 100, 101)],
create_tags_from_span=False,
)
samples = [sample1, sample2]
mapping = {
"A": "1",
"B": "2",
"C": "1",
}
new_samples = Evaluator.align_entity_types(samples, mapping)
count_per_entity = Counter()
for sample in new_samples:
for span in sample.spans:
count_per_entity[span.entity_type] += 1
assert count_per_entity["1"] == 5
assert count_per_entity["2"] == 1
def test_align_entity_types_wrong_mapping_exception():
sample1 = InputSample(
"I live in ABC",
spans=[Span("A", "a", 0, 1), Span("A", "a", 10, 11), Span("B", "b", 100, 101)],
create_tags_from_span=False,
)
entities_mapping = {"Z": "z"}
with pytest.raises(ValueError):
Evaluator.align_entity_types(
input_samples=[sample1], entities_mapping=entities_mapping
)

View file

@ -72,7 +72,7 @@ def test_analyzer_with_generated_text(test_input, acceptance_threshold):
dir_path = os.path.dirname(os.path.realpath(__file__))
input_samples = read_synth_dataset(test_input.format(dir_path))
updated_samples = Evaluator.align_input_samples_to_presidio_analyzer(
updated_samples = Evaluator.align_entity_types(
input_samples=input_samples, entities_mapping=PresidioAnalyzerWrapper.presidio_entities_map
)