notebooks update
This commit is contained in:
Parent: 3df6e1aca8
Commit: 3e335a6a3a
@ -30,7 +30,7 @@ steps:
|
|||
- script: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install -r requirements.txt
|
||||
python m spacy download en_core_web_lg
|
||||
python -m spacy download en_core_web_lg
|
||||
|
||||
displayName: 'Install dependencies'
|
||||
|
||||
|
|
|
@ -1,2 +0,0 @@
|
|||
from .spacy_retrain import SpacyRetrainer
|
||||
from .flair_train import FlairTrainer
|
|
@ -3,28 +3,35 @@
|
|||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"is_executing": true
|
||||
}
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from presidio_evaluator.data_generator import read_synth_dataset\n",
|
||||
"from presidio_evaluator import ModelEvaluator\n",
|
||||
"from presidio_evaluator.evaluation import ModelError, Evaluator\n",
|
||||
"from presidio_evaluator.models import BaseModel, PresidioAnalyzerWrapper\n",
|
||||
"from collections import Counter\n",
|
||||
"\n",
|
||||
"import pandas as pd\n",
|
||||
"\n",
|
||||
"%load_ext autoreload\n",
|
||||
"%autoreload 2\n",
|
||||
"\n",
|
||||
"MY_PRESIDIO_ENDPOINT = \"http://presidio-api.westeurope.cloudapp.azure.com/api/v1/projects/test/analyze\""
|
||||
"pd.options.display.max_columns = None\n",
|
||||
"pd.options.display.width=None"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Evaluate your Presidio instance via the Presidio API"
|
||||
"# Evaluate Presidio Analyzer\n",
|
||||
"This notebook runs the PresidioAnalyzerEvaluator class on top of synthetic data.\n",
|
||||
"\n",
|
||||
"One can perform the following changes:\n",
|
||||
"1. Replace the synthetic data creation with real data or with other type of synthetic data\n",
|
||||
"2. Adapt the Presidio `AnalyzerEngine` to a specific engine with a different set of recognizers or configured to be used on different languages\n",
|
||||
"\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
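The markdown cell above outlines the evaluation flow. As a minimal sketch of that flow (not part of this commit; it assumes the dataset path and entity names used elsewhere in this repository), one could run:

```python
from presidio_evaluator.data_generator import read_synth_dataset
from presidio_evaluator.evaluation import Evaluator
from presidio_evaluator.models import PresidioAnalyzerWrapper

# Read synthetic samples and evaluate the default Presidio analyzer on them.
samples = read_synth_dataset("../data/synth_dataset.txt")
model = PresidioAnalyzerWrapper(entities_to_keep=["PERSON", "EMAIL_ADDRESS", "PHONE_NUMBER"])
evaluator = Evaluator(model=model)
results = evaluator.evaluate_all(samples[:100])
scores = evaluator.calculate_score(results)
print(scores.pii_precision, scores.pii_recall)
```

To adapt the evaluation to a customized `AnalyzerEngine` (different recognizers or languages), the wrapper would be built around that engine instead of the default one; the exact constructor argument for this is not shown in this commit.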
@ -37,15 +44,12 @@
|
|||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"is_executing": true
|
||||
}
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"input_samples = read_synth_dataset(\"../data/synth_dataset.txt\")\n",
|
||||
"print(\"Read {} samples\".format(len(input_samples)))"
|
||||
"print(\"Read {} samples\".format(len(input_samples)))\n",
|
||||
"input_samples[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -58,11 +62,7 @@
|
|||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"is_executing": true
|
||||
}
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"flatten = lambda l: [item for sublist in l for item in sublist]\n",
|
||||
|
@ -84,32 +84,29 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Mapping between dataset entities and Presidio entities. Key: Dataset entity, Value: Presidio entity\n",
|
||||
"entities_mapping = {\n",
|
||||
" 'PERSON': 'PERSON',\n",
|
||||
" 'EMAIL': 'EMAIL_ADDRESS',\n",
|
||||
" 'CREDIT_CARD': 'CREDIT_CARD',\n",
|
||||
" 'FIRST_NAME': 'PERSON',\n",
|
||||
" 'PHONE_NUMBER': 'PHONE_NUMBER',\n",
|
||||
" 'LOCATION':'LOCATION',\n",
|
||||
" # 'BIRTHDAY': 'DATE_TIME',\n",
|
||||
" # 'DATE': 'DATE_TIME',\n",
|
||||
" 'DOMAIN': 'DOMAIN',\n",
|
||||
" # 'CITY': 'LOCATION',\n",
|
||||
" # 'ADDRESS': 'LOCATION',\n",
|
||||
" 'IBAN': 'IBAN_CODE',\n",
|
||||
" # 'URL': 'DOMAIN_NAME',\n",
|
||||
" 'US_SSN': 'US_SSN',\n",
|
||||
" 'IP_ADDRESS': 'IP_ADDRESS',\n",
|
||||
" # 'ORGANIZATION':'ORG'\n",
|
||||
" 'O': 'O'\n",
|
||||
"presidio_entities_map = {\n",
|
||||
" \"PERSON\": \"PERSON\",\n",
|
||||
" \"EMAIL_ADDRESS\": \"EMAIL_ADDRESS\",\n",
|
||||
" \"CREDIT_CARD\": \"CREDIT_CARD\",\n",
|
||||
" \"FIRST_NAME\": \"PERSON\",\n",
|
||||
" \"PHONE_NUMBER\": \"PHONE_NUMBER\",\n",
|
||||
" \"BIRTHDAY\": \"DATE_TIME\",\n",
|
||||
" \"DATE_TIME\": \"DATE_TIME\",\n",
|
||||
" \"DOMAIN\": \"DOMAIN\",\n",
|
||||
" \"CITY\": \"LOCATION\",\n",
|
||||
" \"ADDRESS\": \"LOCATION\",\n",
|
||||
" \"NATIONALITY\": \"LOCATION\",\n",
|
||||
" \"LOCATION\": \"LOCATION\",\n",
|
||||
" \"IBAN\": \"IBAN_CODE\",\n",
|
||||
" \"URL\": \"DOMAIN_NAME\",\n",
|
||||
" \"US_SSN\": \"US_SSN\",\n",
|
||||
" \"IP_ADDRESS\": \"IP_ADDRESS\",\n",
|
||||
" \"ORGANIZATION\": \"ORG\",\n",
|
||||
" \"TITLE\" : \"O\", # skipping evaluation of titles\n",
|
||||
" \"O\": \"O\",\n",
|
||||
"}\n",
|
||||
"presidio_fields = ['CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'DOMAIN_NAME', 'EMAIL_ADDRESS', 'IBAN_CODE',\n",
|
||||
" 'IP_ADDRESS', 'NRP', 'LOCATION', 'PERSON', 'PHONE_NUMBER', 'US_SSN']\n",
|
||||
"\n",
|
||||
"new_list = ModelEvaluator.align_input_samples_to_presidio_analyzer(input_samples,\n",
|
||||
" entities_mapping,\n",
|
||||
" presidio_fields)"
|
||||
"new_list = Evaluator.align_entity_types(input_samples, presidio_entities_map)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
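For readers unfamiliar with the new `Evaluator.align_entity_types` call used above, here is a hedged illustration of its behavior (not part of the commit):

```python
from presidio_evaluator.evaluation import Evaluator

# Keys are the dataset's entity types, values are the types the evaluated model emits.
mapping = {"FIRST_NAME": "PERSON", "EMAIL_ADDRESS": "EMAIL_ADDRESS", "O": "O"}

# Every span in every sample is renamed according to the mapping. A span whose
# type has no key in the mapping raises a ValueError, so the mapping must cover
# all entity types that appear in the dataset.
aligned_samples = Evaluator.align_entity_types(input_samples, mapping)
```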
@ -143,9 +140,9 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from presidio_evaluator import PresidioAPIEvaluator\n",
|
||||
"presidio = PresidioAPIEvaluator(entities_to_keep=list(count_per_entity_new.keys()),endpoint=MY_PRESIDIO_ENDPOINT)\n",
|
||||
"evaluted_samples = presidio.evaluate_all(new_list[:100])"
|
||||
"presidio = PresidioAnalyzerWrapper(entities_to_keep=list(count_per_entity_new.keys()))\n",
|
||||
"evaluator = Evaluator(model=presidio)\n",
|
||||
"evaluted_samples = evaluator.evaluate_all(new_list[:100])"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -163,7 +160,7 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"evaluation_result = presidio.calculate_score(evaluted_samples)"
|
||||
"evaluation_result = evaluator.calculate_score(evaluted_samples)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -197,7 +194,7 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ModelEvaluator.most_common_fp_tokens(errors,n=5)"
|
||||
"ModelError.most_common_fp_tokens(errors,n=5)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -210,8 +207,9 @@
|
|||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"fps_df = ModelEvaluator.get_fps_dataframe(errors,entity='PERSON')\n",
|
||||
"fps_df[['full_text','token','prediction']]"
|
||||
"fps_df = ModelError.get_fps_dataframe(errors,entity='PERSON')\n",
|
||||
"if fps_df is not None:\n",
|
||||
" fps_df[['full_text','token','prediction']]"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -220,16 +218,30 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"fns_df = ModelEvaluator.get_fns_dataframe(errors,entity='PERSON')\n",
|
||||
"fns_df = ModelError.get_fns_dataframe(errors,entity='PERSON')\n",
|
||||
"fns_df"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"name": "pycharm-c8930cf3",
|
||||
"display_name": "presidio-research",
|
||||
"language": "python",
|
||||
"display_name": "PyCharm (presidio-research)"
|
||||
"name": "presidio-research"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
@ -241,16 +253,7 @@
|
|||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.4"
|
||||
},
|
||||
"pycharm": {
|
||||
"stem_cell": {
|
||||
"cell_type": "raw",
|
||||
"source": [],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
}
|
||||
}
|
||||
"version": "3.8.8"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
|
@ -204,9 +204,9 @@
|
|||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"display_name": "presidio-research",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
"name": "presidio-research"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
@ -218,18 +218,9 @@
|
|||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.4"
|
||||
},
|
||||
"pycharm": {
|
||||
"stem_cell": {
|
||||
"cell_type": "raw",
|
||||
"source": [],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
}
|
||||
}
|
||||
"version": "3.8.8"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
}
|
||||
|
|
|
@ -11,7 +11,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -38,7 +38,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -47,7 +47,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -59,7 +59,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -68,7 +68,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -81,10 +81,11 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#!pip install wordcloud\n",
|
||||
"from wordcloud import WordCloud\n",
|
||||
"\n",
|
||||
"def series_to_wordcloud(series):\n",
|
||||
|
@ -98,7 +99,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -107,7 +108,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -116,7 +117,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 96,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -125,7 +126,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -134,7 +135,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -150,7 +151,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -159,7 +160,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -168,7 +169,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -177,7 +178,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -187,7 +188,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -197,7 +198,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -207,7 +208,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -222,7 +223,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 21,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -233,7 +234,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 22,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -252,9 +253,9 @@
|
|||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"display_name": "presidio-research",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
"name": "presidio-research"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
@ -266,18 +267,18 @@
|
|||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.4"
|
||||
"version": "3.8.8"
|
||||
},
|
||||
"pycharm": {
|
||||
"stem_cell": {
|
||||
"cell_type": "raw",
|
||||
"source": [],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
}
|
||||
},
|
||||
"source": []
|
||||
}
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
}
|
||||
|
|
|
@ -21,9 +21,10 @@
|
|||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# install presidio via pip\n",
|
||||
"# install presidio via pip if not yet installed\n",
|
||||
"\n",
|
||||
"#!pip install presidio-analyzer"
|
||||
"#!pip install presidio-analyzer\n",
|
||||
"#!pip install presidio-anonymizer"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -100,7 +101,7 @@
|
|||
"source": [
|
||||
"original_text = \"Hi my name is Doug Funny and this is my website: https://www.dougf.io/\"\n",
|
||||
"\n",
|
||||
"presidio_response = analyzer.analyze(original_text,language='en',all_fields=True)\n",
|
||||
"presidio_response = analyzer.analyze(original_text,language='en')\n",
|
||||
"presidio_response\n"
|
||||
]
|
||||
},
|
||||
|
@ -165,7 +166,7 @@
|
|||
"\n",
|
||||
"text = \"Our son asdfhlk used to work in Germany\"\n",
|
||||
"\n",
|
||||
"response = analyzer.analyze(text=text,language='en',all_fields=True)\n",
|
||||
"response = analyzer.analyze(text=text,language='en')\n",
|
||||
"print(f\"Presidio' response: {response}\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
|
@ -188,9 +189,9 @@
|
|||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "PyCharm (presidio-research)",
|
||||
"display_name": "presidio-research",
|
||||
"language": "python",
|
||||
"name": "pycharm-c8930cf3"
|
||||
"name": "presidio-research"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
@ -202,7 +203,7 @@
|
|||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.6"
|
||||
"version": "3.8.8"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
|
|
@ -19,6 +19,7 @@
|
|||
"from presidio_evaluator import InputSample\n",
|
||||
"from presidio_evaluator.data_generator import read_synth_dataset\n",
|
||||
"from presidio_evaluator.validation import split_dataset, save_to_json\n",
|
||||
"from datetime import date\n",
|
||||
"\n",
|
||||
"%reload_ext autoreload"
|
||||
]
|
||||
|
@ -36,7 +37,7 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"all_samples = read_synth_dataset(\"../presidio_evaluator/data_generator/synth_dataset.txt\")\n",
|
||||
"all_samples = read_synth_dataset(\"../data/synth_dataset.txt\")\n",
|
||||
"print(len(all_samples))"
|
||||
]
|
||||
},
|
||||
|
@ -90,9 +91,11 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"DATE_DATE = date.today().strftime(\"%b-%d-%Y\")\n",
|
||||
"\n",
|
||||
"save_to_json(train,\"../data/train_{}.json\".format(DATE_DATE))\n",
|
||||
"save_to_json(test,\"../data/test_{}.json\".format(DATE_DATE))\n",
|
||||
"save_to_json(validation,\"../data/1validation_{}.json\".format(DATE_DATE))\n"
|
||||
"save_to_json(validation,\"../data/validation_{}.json\".format(DATE_DATE))\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -130,27 +133,22 @@
|
|||
"source": [
|
||||
"assert len(train) + len(test) + len(validation) == len(all_samples)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"display_name": "presidio-research",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.4"
|
||||
"name": "presidio-research"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
}
|
|
@ -18,7 +18,7 @@
|
|||
"import pandas as pd\n",
|
||||
"pd.options.display.max_rows = 4000\n",
|
||||
"pd.set_option('display.max_colwidth', -1)\n",
|
||||
"from deeppavlov.dataset_readers.conll2003_reader import Conll2003DatasetReader"
|
||||
"#TODO: fix CONLL2003 download and usage"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -31,8 +31,8 @@
|
|||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"reader = Conll2003DatasetReader()\n",
|
||||
"dataset = reader.read(data_path =\"../../data\",dataset_name='conll2003')\n",
|
||||
"#reader = Conll2003DatasetReader()\n",
|
||||
"#dataset = reader.read(data_path =\"../../data\",dataset_name='conll2003')\n",
|
||||
"#Note: make sure you haven't downloaded something else with this function before, \n",
|
||||
"# as it will not download a new dataset (even if your previous download was for a different dataset)"
|
||||
]
|
||||
|
@ -645,15 +645,6 @@
|
|||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.4"
|
||||
},
|
||||
"pycharm": {
|
||||
"stem_cell": {
|
||||
"cell_type": "raw",
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"source": []
|
||||
}
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
|
@ -13,7 +13,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -23,7 +23,7 @@
|
|||
"from sklearn_crfsuite import metrics\n",
|
||||
"\n",
|
||||
"from presidio_evaluator import InputSample\n",
|
||||
"from presidio_evaluator.crf_evaluator import CRFEvaluator\n",
|
||||
"from presidio_evaluator.models.crf_model import CRFModel\n",
|
||||
"from presidio_evaluator.data_generator import read_synth_dataset"
|
||||
]
|
||||
},
|
||||
|
@ -48,7 +48,6 @@
|
|||
"execution_count": null,
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"is_executing": true,
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
|
@ -100,7 +99,7 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"CRFEvaluator.sent2features(train_sents[0])[0]"
|
||||
"CRFModel.sent2features(train_sents[0])[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -110,11 +109,11 @@
|
|||
"outputs": [],
|
||||
"source": [
|
||||
"%%time\n",
|
||||
"X_train = [CRFEvaluator.sent2features(s) for s in train_sents]\n",
|
||||
"y_train = [CRFEvaluator.sent2labels(s) for s in train_sents]\n",
|
||||
"X_train = [CRFModel.sent2features(s) for s in train_sents]\n",
|
||||
"y_train = [CRFModel.sent2labels(s) for s in train_sents]\n",
|
||||
"\n",
|
||||
"X_test = [CRFEvaluator.sent2features(s) for s in test_sents]\n",
|
||||
"y_test = [CRFEvaluator.sent2labels(s) for s in test_sents]"
|
||||
"X_test = [CRFModel.sent2features(s) for s in test_sents]\n",
|
||||
"y_test = [CRFModel.sent2labels(s) for s in test_sents]"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -149,7 +148,7 @@
|
|||
"source": [
|
||||
"import pickle\n",
|
||||
"with open(\"../../models/crf.pickle\",'wb') as f:\n",
|
||||
" data = pickle.dump(crf, f,protocol=pickle.HIGHEST_PROTOCOL)\n",
|
||||
" pickle.dump(crf, f,protocol=pickle.HIGHEST_PROTOCOL)\n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
|
@ -277,9 +276,9 @@
|
|||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"display_name": "PyCharm (presidio-research)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
"name": "pycharm-c8930cf3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
@ -291,18 +290,9 @@
|
|||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.4"
|
||||
},
|
||||
"pycharm": {
|
||||
"stem_cell": {
|
||||
"cell_type": "raw",
|
||||
"source": [],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
}
|
||||
}
|
||||
"version": "3.8.8"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 1
|
||||
}
|
||||
}
|
||||
|
|
|
@ -15,24 +15,22 @@
|
|||
}
|
||||
},
|
||||
"source": [
|
||||
"This notebook takes train and test datasets (of type `List[InputSample]`)\n",
|
||||
"This notebook takes train and test datasets (of type `List[InputSample]`)\n",
|
||||
"and transforms them into two structures consumed by Spacy:\n",
|
||||
"1. Spacy JSON (see https://spacy.io/api/annotation#json-input)\n",
|
||||
"2. Spacy Pickle files (of structure `[(full_text,\"entities\":[(start, end, type),(...))]`. \n",
|
||||
"See more details here: https://spacy.io/api/annotation#json-input)\n",
|
||||
"\n",
|
||||
"JSON is used for Spacy's CLI trainer. \n",
|
||||
"Pickle is used for fine-tuning using the logic in [../models/spacy_retrain.py](../models/spacy_retrain.py)"
|
||||
">> Note that this notebook uses the old spaCy 2.0 structure. In order to train spaCy 3 models,\n",
|
||||
"> the spacy `convert` CLI call should be used:\n",
|
||||
"\n",
|
||||
">>`python -m spacy convert ./data.json ./output.spacy`"
|
||||
]
|
||||
},
|
||||
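Besides the CLI route noted above, a hedged Python sketch (not part of this commit; assumes spaCy 3.x and the `spacy_train` list of `(text, {"entities": [...]})` tuples built later in this notebook) of packing the data into the binary format that `spacy train` consumes:

```python
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")  # a tokenizer-only pipeline is enough for conversion
db = DocBin()
for text, annotations in spacy_train:
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annotations["entities"]:
        span = doc.char_span(start, end, label=label)
        if span is not None:  # skip spans that don't align with token boundaries
            ents.append(span)
    doc.ents = ents
    db.add(doc)
db.to_disk("./train.spacy")  # then train with: python -m spacy train, pointing paths.train here
```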
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"is_executing": false
|
||||
}
|
||||
},
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from presidio_evaluator.data_generator import read_synth_dataset\n",
|
||||
|
@ -62,7 +60,7 @@
|
|||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data_path = \"../data/generated_{}_{}.json\"\n",
|
||||
"data_path = \"../../data/generated_{}_{}.json\"\n",
|
||||
"\n",
|
||||
"train_samples = read_synth_dataset(data_path.format(\"train\",DATA_DATE))\n",
|
||||
"print(\"Read {} samples\".format(len(train_samples)))"
|
||||
|
@ -81,7 +79,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -146,12 +144,11 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"entities_spacy = [x[1]['entities'] for x in spacy_train]\n",
|
||||
"entities_spacy\n",
|
||||
"entities_spacy_flat = []\n",
|
||||
"for samp in entities_spacy:\n",
|
||||
" for ent in samp:\n",
|
||||
|
@ -168,7 +165,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -185,7 +182,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -194,7 +191,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -210,16 +207,16 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pickle\n",
|
||||
"import json\n",
|
||||
"with open(\"../data/train.pickle\", 'wb') as handle:\n",
|
||||
"with open(\"../../data/train.pickle\", 'wb') as handle:\n",
|
||||
" pickle.dump(spacy_train,handle, protocol=pickle.HIGHEST_PROTOCOL)\n",
|
||||
"\n",
|
||||
"with open(\"../data/train.json\",\"w\") as f:\n",
|
||||
"with open(\"../../data/train.json\",\"w\") as f:\n",
|
||||
" json.dump(spacy_train_json,f)\n",
|
||||
" "
|
||||
]
|
||||
|
@ -233,7 +230,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -243,7 +240,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -261,15 +258,15 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pickle\n",
|
||||
"with open(\"../data/test.pickle\", 'wb') as handle:\n",
|
||||
"with open(\"../../data/test.pickle\", 'wb') as handle:\n",
|
||||
" pickle.dump(spacy_test,handle, protocol=pickle.HIGHEST_PROTOCOL)\n",
|
||||
" \n",
|
||||
"with open(\"../data/test.json\",\"w\") as f:\n",
|
||||
"with open(\"../../data/test.json\",\"w\") as f:\n",
|
||||
" json.dump(spacy_test_json,f)\n",
|
||||
" "
|
||||
]
|
||||
|
@ -284,9 +281,9 @@
|
|||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"display_name": "PyCharm (presidio-research)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
"name": "pycharm-c8930cf3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
@ -298,16 +295,7 @@
|
|||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.4"
|
||||
},
|
||||
"pycharm": {
|
||||
"stem_cell": {
|
||||
"cell_type": "raw",
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"source": []
|
||||
}
|
||||
"version": "3.8.8"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
|
|
@ -342,18 +342,18 @@
|
|||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.4"
|
||||
"version": "3.7.9"
|
||||
},
|
||||
"pycharm": {
|
||||
"stem_cell": {
|
||||
"cell_type": "raw",
|
||||
"source": [],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
}
|
||||
},
|
||||
"source": []
|
||||
}
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
}
|
||||
|
|
|
@ -20,6 +20,7 @@
|
|||
"outputs": [],
|
||||
"source": [
|
||||
"from presidio_evaluator.data_generator import read_synth_dataset\n",
|
||||
"from presidio_evaluator.evaluation import ModelError, Evaluator\n",
|
||||
"%reload_ext autoreload\n",
|
||||
"%autoreload 2"
|
||||
]
|
||||
|
@ -42,8 +43,49 @@
|
|||
"outputs": [],
|
||||
"source": [
|
||||
"synth_samples = read_synth_dataset(\"../../data/synth_dataset.txt\")\n",
|
||||
"print(len(synth_samples))\n",
|
||||
"DATASET = synth_samples"
|
||||
"print(len(synth_samples))\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"is_executing": false
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"Map entity types"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"presidio_entities_map = {\n",
|
||||
" \"PERSON\": \"PER\",\n",
|
||||
" \"EMAIL_ADDRESS\": \"O\",\n",
|
||||
" \"CREDIT_CARD\": \"O\",\n",
|
||||
" \"FIRST_NAME\": \"PER\",\n",
|
||||
" \"PHONE_NUMBER\": \"O\",\n",
|
||||
" \"BIRTHDAY\": \"O\",\n",
|
||||
" \"DATE_TIME\": \"O\",\n",
|
||||
" \"DOMAIN\": \"O\",\n",
|
||||
" \"CITY\": \"LOC\",\n",
|
||||
" \"ADDRESS\": \"LOC\",\n",
|
||||
" \"NATIONALITY\": \"LOC\",\n",
|
||||
" \"LOCATION\": \"LOC\",\n",
|
||||
" \"IBAN\": \"O\",\n",
|
||||
" \"URL\": \"O\",\n",
|
||||
" \"US_SSN\": \"O\",\n",
|
||||
" \"IP_ADDRESS\": \"O\",\n",
|
||||
" \"ORGANIZATION\": \"ORG\",\n",
|
||||
" \"TITLE\" : \"O\", # skipping evaluation of titles\n",
|
||||
" \"O\": \"O\",\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"synth_samples = Evaluator.align_entity_types(synth_samples, presidio_entities_map)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -58,7 +100,7 @@
|
|||
"source": [
|
||||
"from collections import Counter\n",
|
||||
"entity_counter = Counter()\n",
|
||||
"for sample in DATASET:\n",
|
||||
"for sample in synth_samples:\n",
|
||||
" for tag in sample.tags:\n",
|
||||
" entity_counter[tag]+=1"
|
||||
]
|
||||
|
@ -79,15 +121,9 @@
|
|||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"is_executing": false
|
||||
}
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"DATASET[1]"
|
||||
]
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
|
@ -100,7 +136,7 @@
|
|||
"outputs": [],
|
||||
"source": [
|
||||
"#max length sentence\n",
|
||||
"max([len(sample.tokens) for sample in DATASET])"
|
||||
"max([len(sample.tokens) for sample in synth_samples])"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -123,9 +159,7 @@
|
|||
"flair_ner = 'ner'\n",
|
||||
"flair_ner_fast = 'ner-fast'\n",
|
||||
"flair_ontonotes = 'ner-ontonotes-fast'\n",
|
||||
"flair_bert_embeddings = '../../models/presidio-ner/flair-bert-embeddings.pt'\n",
|
||||
"glove_flair_embeddings = '../../models/presidio-ner/flair-embeddings.pt'\n",
|
||||
"models = [flair_bert_embeddings, glove_flair_embeddings, flair_ner,flair_ner_fast,flair_ontonotes]"
|
||||
"models = [flair_ner, flair_ner_fast]"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -138,14 +172,15 @@
|
|||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from presidio_evaluator.flair_evaluator import FlairEvaluator\n",
|
||||
"from presidio_evaluator.models import FlairModel\n",
|
||||
"\n",
|
||||
"for model in models:\n",
|
||||
" print(\"-----------------------------------\")\n",
|
||||
" print(\"Evaluating model {}\".format(model))\n",
|
||||
" flair_evaluator = FlairEvaluator(model_path=model)\n",
|
||||
" evaluation_results = flair_evaluator.evaluate_all(DATASET)\n",
|
||||
" scores = flair_evaluator.calculate_score(evaluation_results)\n",
|
||||
" flair_model = FlairModel(model_path=model)\n",
|
||||
" evaluator = Evaluator(model=flair_model)\n",
|
||||
" evaluation_results = evaluator.evaluate_all(synth_samples)\n",
|
||||
" scores = evaluator.calculate_score(evaluation_results)\n",
|
||||
" \n",
|
||||
" \n",
|
||||
" print(\"Confusion matrix:\")\n",
|
||||
|
@ -189,8 +224,7 @@
|
|||
"source": [
|
||||
"errors = scores.model_errors\n",
|
||||
"\n",
|
||||
"from presidio_evaluator import ModelEvaluator\n",
|
||||
"ModelEvaluator.most_common_fp_tokens(errors)#[model_error for model_error in errors if model_error.error_type =='FP']\n"
|
||||
"ModelError.most_common_fp_tokens(errors)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -203,7 +237,7 @@
|
|||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"fps_df = ModelEvaluator.get_fps_dataframe(errors,entity=['PERSON'])\n",
|
||||
"fps_df = ModelError.get_fps_dataframe(errors,entity=['PERSON'])\n",
|
||||
"fps_df[['full_text','token','prediction']]"
|
||||
]
|
||||
},
|
||||
|
@ -224,7 +258,7 @@
|
|||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ModelEvaluator.most_common_fn_tokens(errors,n=50, entity=['PERSON'])"
|
||||
"ModelError.most_common_fn_tokens(errors,n=50, entity=['PER'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -244,7 +278,7 @@
|
|||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"fns_df = ModelEvaluator.get_fns_dataframe(errors,entity=['PERSON'])"
|
||||
"fns_df = ModelError.get_fns_dataframe(errors,entity=['PERSON'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -264,9 +298,9 @@
|
|||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"display_name": "presidio-research",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
"name": "presidio-research"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
@ -278,18 +312,18 @@
|
|||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.4"
|
||||
"version": "3.8.8"
|
||||
},
|
||||
"pycharm": {
|
||||
"stem_cell": {
|
||||
"cell_type": "raw",
|
||||
"source": [],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
}
|
||||
},
|
||||
"source": []
|
||||
}
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
}
|
||||
|
|
|
@ -21,7 +21,7 @@
|
|||
"source": [
|
||||
"import spacy\n",
|
||||
"\n",
|
||||
"from presidio_evaluator import ModelEvaluator\n",
|
||||
"from presidio_evaluator.evaluation import Evaluator, ModelError\n",
|
||||
"from presidio_evaluator.data_generator import read_synth_dataset\n",
|
||||
"%reload_ext autoreload\n",
|
||||
"%autoreload 2\n",
|
||||
|
@ -129,14 +129,7 @@
|
|||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"models = []\n",
|
||||
"\n",
|
||||
"en_core_web_lg = r\"en_core_web_lg\"\n",
|
||||
"spacy_new_ontonotes28 = r\"C:\\Users\\ommendel\\OneDrive - Microsoft\\Projects\\presidio\\Presidio-internal\\presidio-evaluator\\models\\spacy_new_ontonotes28\"\n",
|
||||
"\n",
|
||||
"spacy_ft_100 = r\"C:\\Users\\ommendel\\OneDrive - Microsoft\\Projects\\presidio\\Presidio-internal\\presidio-evaluator\\models\\spacy_ft_100\\model-final\"\n",
|
||||
"\n",
|
||||
"models = [en_core_web_lg, spacy_new_ontonotes28, spacy_ft_100]"
|
||||
"models = [\"en_core_web_lg\", \"en_core_web_trf\"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -156,15 +149,17 @@
|
|||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from presidio_evaluator.spacy_evaluator import SpacyEvaluator\n",
|
||||
"from presidio_evaluator.models import SpacyModel\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"for model in models:\n",
|
||||
" print(\"-----------------------------------\")\n",
|
||||
" print(\"Evaluating model {}\".format(model))\n",
|
||||
" nlp = spacy.load(model)\n",
|
||||
" spacy_evaluator = SpacyEvaluator(model=nlp,entities_to_keep=['PERSON','GPE','ORG'])\n",
|
||||
" evaluation_results = spacy_evaluator.evaluate_all(DATASET)\n",
|
||||
" scores = spacy_evaluator.calculate_score(evaluation_results)\n",
|
||||
" spacy_model = SpacyModel(model=nlp,entities_to_keep=['PERSON','GPE','ORG'])\n",
|
||||
" evaluator = Evaluator(model=spacy_model)\n",
|
||||
" evaluation_results = evaluator.evaluate_all(DATASET)\n",
|
||||
" scores = evaluator.calculate_score(evaluation_results)\n",
|
||||
" \n",
|
||||
" print(\"Confusion matrix:\")\n",
|
||||
" print(scores.results)\n",
|
||||
|
@ -243,7 +238,7 @@
|
|||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ModelEvaluator.most_common_fp_tokens(errors)#[model_error for model_error in errors if model_error.error_type =='FP']"
|
||||
"ModelError.most_common_fp_tokens(errors)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -256,7 +251,7 @@
|
|||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"fps_df = ModelEvaluator.get_fps_dataframe(errors,entity=['LOCATION'])\n",
|
||||
"fps_df = ModelError.get_fps_dataframe(errors,entity=['GPE'])\n",
|
||||
"fps_df[['full_text','token','prediction']]"
|
||||
]
|
||||
},
|
||||
|
@ -278,7 +273,7 @@
|
|||
"outputs": [],
|
||||
"source": [
|
||||
"errors = scores.model_errors\n",
|
||||
"ModelEvaluator.most_common_fn_tokens(errors,n=50, entity=['PERSON'])"
|
||||
"ModelError.most_common_fn_tokens(errors,n=50, entity=['PERSON'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -298,7 +293,7 @@
|
|||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"fns_df = ModelEvaluator.get_fns_dataframe(errors,entity=['GPE'])"
|
||||
"fns_df = ModelError.get_fns_dataframe(errors,entity=['GPE'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -320,15 +315,23 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"All errors:\\n\")\n",
|
||||
"[print(error,\"\\n\") for error in errors]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"display_name": "PyCharm (presidio-research)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
"name": "pycharm-c8930cf3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
@ -340,18 +343,9 @@
|
|||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.4"
|
||||
},
|
||||
"pycharm": {
|
||||
"stem_cell": {
|
||||
"cell_type": "raw",
|
||||
"source": [],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
}
|
||||
}
|
||||
"version": "3.8.8"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
}
|
||||
|
|
|
@ -3,7 +3,7 @@ from typing import List, Optional
|
|||
import spacy
|
||||
import srsly
|
||||
from spacy.tokens import Token
|
||||
from spacy.training import docs_to_json
|
||||
from spacy.training import docs_to_json, iob_to_biluo
|
||||
from tqdm import tqdm
|
||||
|
||||
from presidio_evaluator import span_to_tag, tokenize
|
||||
|
@ -106,7 +106,7 @@ class Span:
|
|||
return cls(**data)
|
||||
|
||||
|
||||
class SimpleSpacyExtensions(object):
|
||||
class SimpleSpacyExtensions:
|
||||
def __init__(self, **kwargs):
|
||||
"""
|
||||
Serialization of Spacy Token extensions.
|
||||
|
@ -119,7 +119,7 @@ class SimpleSpacyExtensions(object):
|
|||
return self.__dict__
|
||||
|
||||
|
||||
class SimpleToken(object):
|
||||
class SimpleToken:
|
||||
"""
|
||||
A class mimicking the Spacy Token class, for serialization purposes
|
||||
"""
|
||||
|
@ -359,19 +359,38 @@ class InputSample(object):
|
|||
return self.full_text, {"entities": new_entities}
|
||||
|
||||
@classmethod
|
||||
def from_spacy(cls, text, annotations, translate_from_spacy=True):
|
||||
def from_spacy_doc(cls, doc, map_spacy_entities_to_presidio=True, scheme="BILUO"):
|
||||
if scheme not in ("BILUO","BILOU","BIO","IOB"):
|
||||
raise ValueError("scheme should be one of \"BILUO\",\"BILOU\",\"BIO\",\"IOB\"")
|
||||
|
||||
spans = []
|
||||
for annotation in annotations:
|
||||
tag = (
|
||||
cls.rename_from_spacy_tags([annotation[2]])[0]
|
||||
if translate_from_spacy
|
||||
else annotation[2]
|
||||
for ent in doc.ents:
|
||||
entity_type = (
|
||||
cls.rename_from_spacy_tags(ent.label_)
|
||||
if map_spacy_entities_to_presidio
|
||||
else ent.label_
|
||||
)
|
||||
span = Span(
|
||||
tag, text[annotation[0] : annotation[1]], annotation[0], annotation[1]
|
||||
entity_type=entity_type,
|
||||
entity_value=ent.text,
|
||||
start_position=ent.start_char,
|
||||
end_position=ent.end_char,
|
||||
)
|
||||
spans.append(span)
|
||||
return cls(full_text=text, masked=None, spans=spans)
|
||||
|
||||
tags = [f"{token.ent_iob_}-{token.ent_type_}" if token.ent_iob_ != "O" else "O" for token in doc]
|
||||
if scheme in ("BILUO", "BILOU"):
|
||||
tags = iob_to_biluo(tags)
|
||||
|
||||
return cls(
|
||||
full_text=doc.text,
|
||||
masked=None,
|
||||
spans=spans,
|
||||
tokens=doc,
|
||||
tags=tags,
|
||||
create_tags_from_span=False,
|
||||
scheme=scheme
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def create_spacy_dataset(
|
||||
|
|
|
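A hedged usage sketch for the new `InputSample.from_spacy_doc` classmethod introduced above (not part of this commit; assumes a spaCy pipeline with an NER component is installed):

```python
import spacy
from presidio_evaluator import InputSample

nlp = spacy.load("en_core_web_lg")
doc = nlp("My name is Sven Zimmer and I live in Berlin")

# Converts the doc's entities into Span objects and BILUO tags on the InputSample.
sample = InputSample.from_spacy_doc(doc, map_spacy_entities_to_presidio=True, scheme="BILUO")
print(sample.spans)
print(sample.tags)
```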
@ -2,7 +2,8 @@ from pathlib import Path
|
|||
from typing import List, Optional
|
||||
|
||||
import requests
|
||||
from spacy.training import converters
|
||||
from spacy.training.converters import conll_ner_to_docs
|
||||
from tqdm import tqdm
|
||||
|
||||
from presidio_evaluator import InputSample
|
||||
from presidio_evaluator.dataset_formatters import DatasetFormatter
|
||||
|
@ -11,15 +12,15 @@ from presidio_evaluator.dataset_formatters import DatasetFormatter
|
|||
class CONLL2003Formatter(DatasetFormatter):
|
||||
def __init__(
|
||||
self,
|
||||
files_path=Path("../data/conll2003").resolve(),
|
||||
glob_pattern: str = "*.iob",
|
||||
files_path=Path("../../data/conll2003").resolve(),
|
||||
glob_pattern: str = "*.*",
|
||||
):
|
||||
self.files_path = files_path
|
||||
self.glob_pattern = glob_pattern
|
||||
|
||||
@staticmethod
|
||||
def download(
|
||||
local_data_path=Path("../data/conll2003").resolve(),
|
||||
local_data_path=Path("../../data/conll2003").resolve(),
|
||||
conll_gh_path="https://raw.githubusercontent.com/glample/tagger/master/dataset/",
|
||||
):
|
||||
|
||||
|
@ -43,6 +44,7 @@ class CONLL2003Formatter(DatasetFormatter):
|
|||
|
||||
def to_input_samples(self, fold: Optional[str] = None) -> List[InputSample]:
|
||||
files_found = False
|
||||
input_samples = []
|
||||
for i, file_path in enumerate(self.files_path.glob(self.glob_pattern)):
|
||||
if fold and fold not in file_path.name:
|
||||
continue
|
||||
|
@ -53,10 +55,19 @@ class CONLL2003Formatter(DatasetFormatter):
|
|||
|
||||
text = "".join(text)
|
||||
|
||||
output_docs = converters.conll_ner2json(
|
||||
output_docs = conll_ner_to_docs(
|
||||
input_data=text, n_sents=None, no_print=True
|
||||
)
|
||||
for doc in tqdm(output_docs, f"Processing doc for file {file_path.name}"):
|
||||
input_samples.append(InputSample.from_spacy_doc(doc=doc))
|
||||
|
||||
# TODO: Translate to InputSample
|
||||
if not files_found:
|
||||
raise FileNotFoundError(f"No files found for pattern {self.glob_pattern}")
|
||||
raise FileNotFoundError(f"No files found for pattern {self.glob_pattern} and fold {fold}")
|
||||
|
||||
return input_samples
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
conll_formatter = CONLL2003Formatter()
|
||||
train_samples = conll_formatter.to_input_samples(fold="train")
|
||||
print(train_samples[:5])
|
||||
|
|
|
@ -150,7 +150,7 @@ class Evaluator:
|
|||
return evaluation_results
|
||||
|
||||
@staticmethod
|
||||
def align_input_samples_to_presidio_analyzer(
|
||||
def align_entity_types(
|
||||
input_samples: List[InputSample],
|
||||
entities_mapping: Dict[
|
||||
str, str
|
||||
|
@ -166,24 +166,23 @@ class Evaluator:
|
|||
# A list that will contain updated input samples,
|
||||
new_list = []
|
||||
|
||||
# Iterate on all samples
|
||||
for input_sample in new_input_samples:
|
||||
contains_presidio_field = False
|
||||
contains_field_in_mapping = False
|
||||
new_spans = []
|
||||
# Update spans to match Presidio's entity name
|
||||
# Update spans to match the entity types in the values of entities_mapping
|
||||
for span in input_sample.spans:
|
||||
in_presidio_field = False
|
||||
if span.entity_type in entities_mapping.keys():
|
||||
new_name = entities_mapping.get(span.entity_type)
|
||||
span.entity_type = new_name
|
||||
contains_presidio_field = True
|
||||
contains_field_in_mapping = True
|
||||
|
||||
# Add to new span list, if the span contains an entity relevant to Presidio
|
||||
new_spans.append(span)
|
||||
else:
|
||||
raise ValueError(f"Key {span.entity_type} cannot be found in the provided entities_mapping")
|
||||
input_sample.spans = new_spans
|
||||
|
||||
# Update tags in case this sample has relevant entities for evaluation
|
||||
if contains_presidio_field:
|
||||
if contains_field_in_mapping:
|
||||
for i, tag in enumerate(input_sample.tags):
|
||||
has_prefix = "-" in tag
|
||||
if has_prefix:
|
||||
|
@ -200,7 +199,9 @@ class Evaluator:
|
|||
input_sample.tags[i] = "O"
|
||||
|
||||
new_list.append(input_sample)
|
||||
|
||||
return new_list
|
||||
# Iterate on all samples
|
||||
|
||||
def calculate_score(
|
||||
self,
|
||||
|
|
|
@ -95,7 +95,7 @@ def score_presidio_recognizer(
|
|||
|
||||
print("Preparing dataset by aligning entity names to Presidio's entity names")
|
||||
|
||||
updated_samples = Evaluator.align_input_samples_to_presidio_analyzer(input_samples)
|
||||
updated_samples = Evaluator.align_entity_types(input_samples)
|
||||
|
||||
model = PresidioRecognizerWrapper(
|
||||
recognizer=recognizer,
|
||||
|
@ -127,7 +127,7 @@ def score_presidio_analyzer(
|
|||
|
||||
print("Preparing dataset by aligning entity names to Presidio's entity names")
|
||||
|
||||
updated_samples = Evaluator.align_input_samples_to_presidio_analyzer(input_samples)
|
||||
updated_samples = Evaluator.align_entity_types(input_samples)
|
||||
|
||||
flatten = lambda l: [item for sublist in l for item in sublist]
|
||||
from collections import Counter
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
from typing import List
|
||||
from typing import List, Optional, Dict
|
||||
|
||||
import spacy
|
||||
|
||||
try:
|
||||
from flair.data import Sentence, build_spacy_tokenizer
|
||||
from flair.data import Sentence
|
||||
from flair.models import SequenceTagger
|
||||
from flair.tokenization import SpacyTokenizer
|
||||
except ImportError:
|
||||
|
@ -21,7 +21,6 @@ class FlairModel(BaseModel):
|
|||
model_path: str = None,
|
||||
entities_to_keep: List[str] = None,
|
||||
verbose: bool = False,
|
||||
translate_to_spacy_entities=True,
|
||||
):
|
||||
"""
|
||||
Evaluator for Flair models
|
||||
|
@ -29,7 +28,7 @@ class FlairModel(BaseModel):
|
|||
:param model_path:
|
||||
:param entities_to_keep:
|
||||
:param verbose:
|
||||
:param translate_to_spacy_entities:
|
||||
and model expected entity types
|
||||
"""
|
||||
super().__init__(
|
||||
entities_to_keep=entities_to_keep,
|
||||
|
@ -43,18 +42,9 @@ class FlairModel(BaseModel):
|
|||
self.model = model
|
||||
|
||||
self.spacy_tokenizer = SpacyTokenizer(model=spacy.load("en_core_web_lg"))
|
||||
self.translate_to_spacy_entities = translate_to_spacy_entities
|
||||
|
||||
if self.translate_to_spacy_entities:
|
||||
print(
|
||||
"Translating entities using this dictionary: {}".format(
|
||||
PRESIDIO_SPACY_ENTITIES
|
||||
)
|
||||
)
|
||||
|
||||
def predict(self, sample: InputSample) -> List[str]:
|
||||
if self.translate_to_spacy_entities:
|
||||
sample.translate_input_sample_tags()
|
||||
|
||||
sentence = Sentence(text=sample.full_text, use_tokenizer=self.spacy_tokenizer)
|
||||
self.model.predict(sentence)
|
||||
|
||||
|
|
|
@ -71,10 +71,12 @@ class PresidioAnalyzerWrapper(BaseModel):
|
|||
"CITY": "LOCATION",
|
||||
"ADDRESS": "LOCATION",
|
||||
"NATIONALITY": "LOCATION",
|
||||
"LOCATION": "LOCATION",
|
||||
"IBAN": "IBAN_CODE",
|
||||
"URL": "DOMAIN_NAME",
|
||||
"US_SSN": "US_SSN",
|
||||
"IP_ADDRESS": "IP_ADDRESS",
|
||||
"ORGANIZATION": "ORG",
|
||||
"TITLE": "O",
|
||||
"O": "O",
|
||||
}
|
||||
|
|
|
@ -5481,7 +5481,7 @@
|
|||
"masked": null,
|
||||
"spans": [
|
||||
{
|
||||
"entity_type": "EMAIL",
|
||||
"entity_type": "EMAIL_ADDRESS",
|
||||
"entity_value": "SvenZimmer@fleckens.hu",
|
||||
"start_position": 39,
|
||||
"end_position": 61
|
||||
|
@ -5585,7 +5585,7 @@
|
|||
"O",
|
||||
"O",
|
||||
"O",
|
||||
"U-EMAIL"
|
||||
"U-EMAIL_ADDRESS"
|
||||
],
|
||||
"template_id": null,
|
||||
"metadata": {
|
||||
|
@ -9288,7 +9288,7 @@
|
|||
"masked": null,
|
||||
"spans": [
|
||||
{
|
||||
"entity_type": "EMAIL",
|
||||
"entity_type": "EMAIL_ADDRESS",
|
||||
"entity_value": "EmilySanderson@jourrapide.com",
|
||||
"start_position": 59,
|
||||
"end_position": 88
|
||||
|
@ -9440,7 +9440,7 @@
|
|||
"O",
|
||||
"O",
|
||||
"O",
|
||||
"U-EMAIL"
|
||||
"U-EMAIL_ADDRESS"
|
||||
],
|
||||
"template_id": null,
|
||||
"metadata": {
|
||||
|
@ -20492,7 +20492,7 @@
|
|||
"masked": null,
|
||||
"spans": [
|
||||
{
|
||||
"entity_type": "EMAIL",
|
||||
"entity_type": "EMAIL_ADDRESS",
|
||||
"entity_value": "NatalinaLucchese@superrito.com",
|
||||
"start_position": 59,
|
||||
"end_position": 89
|
||||
|
@ -20644,7 +20644,7 @@
|
|||
"O",
|
||||
"O",
|
||||
"O",
|
||||
"U-EMAIL"
|
||||
"U-EMAIL_ADDRESS"
|
||||
],
|
||||
"template_id": null,
|
||||
"metadata": {
|
||||
|
@ -25723,7 +25723,7 @@
|
|||
"masked": null,
|
||||
"spans": [
|
||||
{
|
||||
"entity_type": "EMAIL",
|
||||
"entity_type": "EMAIL_ADDRESS",
|
||||
"entity_value": "HannaUkkonen@dayrep.com",
|
||||
"start_position": 39,
|
||||
"end_position": 62
|
||||
|
@ -25827,7 +25827,7 @@
|
|||
"O",
|
||||
"O",
|
||||
"O",
|
||||
"U-EMAIL"
|
||||
"U-EMAIL_ADDRESS"
|
||||
],
|
||||
"template_id": null,
|
||||
"metadata": {
|
||||
|
@ -32783,7 +32783,7 @@
|
|||
"masked": null,
|
||||
"spans": [
|
||||
{
|
||||
"entity_type": "EMAIL",
|
||||
"entity_type": "EMAIL_ADDRESS",
|
||||
"entity_value": "yahyaeriksson@gustr.com",
|
||||
"start_position": 23,
|
||||
"end_position": 46
|
||||
|
@ -32918,7 +32918,7 @@
|
|||
"O",
|
||||
"O",
|
||||
"O",
|
||||
"U-EMAIL",
|
||||
"U-EMAIL_ADDRESS",
|
||||
"O",
|
||||
"O",
|
||||
"O",
|
||||
|
@ -40833,7 +40833,7 @@
|
|||
"masked": null,
|
||||
"spans": [
|
||||
{
|
||||
"entity_type": "EMAIL",
|
||||
"entity_type": "EMAIL_ADDRESS",
|
||||
"entity_value": "VictorAndreyev@cuvox.de",
|
||||
"start_position": 23,
|
||||
"end_position": 46
|
||||
|
@ -40968,7 +40968,7 @@
|
|||
"O",
|
||||
"O",
|
||||
"O",
|
||||
"U-EMAIL",
|
||||
"U-EMAIL_ADDRESS",
|
||||
"O",
|
||||
"O",
|
||||
"O",
|
||||
|
@ -44468,7 +44468,7 @@
|
|||
"masked": null,
|
||||
"spans": [
|
||||
{
|
||||
"entity_type": "EMAIL",
|
||||
"entity_type": "EMAIL_ADDRESS",
|
||||
"entity_value": "HarrisonBarnes@fleckens.hu",
|
||||
"start_position": 59,
|
||||
"end_position": 85
|
||||
|
@ -44620,7 +44620,7 @@
|
|||
"O",
|
||||
"O",
|
||||
"O",
|
||||
"U-EMAIL"
|
||||
"U-EMAIL_ADDRESS"
|
||||
],
|
||||
"template_id": null,
|
||||
"metadata": {
|
||||
|
@ -49165,7 +49165,7 @@
|
|||
"masked": null,
|
||||
"spans": [
|
||||
{
|
||||
"entity_type": "EMAIL",
|
||||
"entity_type": "EMAIL_ADDRESS",
|
||||
"entity_value": "MathiasEJespersen@armyspy.com",
|
||||
"start_position": 23,
|
||||
"end_position": 52
|
||||
|
@ -49300,7 +49300,7 @@
|
|||
"O",
|
||||
"O",
|
||||
"O",
|
||||
"U-EMAIL",
|
||||
"U-EMAIL_ADDRESS",
|
||||
"O",
|
||||
"O",
|
||||
"O",
|
||||
|
@ -62644,7 +62644,7 @@
|
|||
"masked": null,
|
||||
"spans": [
|
||||
{
|
||||
"entity_type": "EMAIL",
|
||||
"entity_type": "EMAIL_ADDRESS",
|
||||
"entity_value": "ElishaFedorov@fleckens.hu",
|
||||
"start_position": 39,
|
||||
"end_position": 64
|
||||
|
@ -62748,7 +62748,7 @@
|
|||
"O",
|
||||
"O",
|
||||
"O",
|
||||
"U-EMAIL"
|
||||
"U-EMAIL_ADDRESS"
|
||||
],
|
||||
"template_id": null,
|
||||
"metadata": {
|
||||
|
@ -68659,7 +68659,7 @@
|
|||
"masked": null,
|
||||
"spans": [
|
||||
{
|
||||
"entity_type": "EMAIL",
|
||||
"entity_type": "EMAIL_ADDRESS",
|
||||
"entity_value": "HartmannAntonsson@jourrapide.com",
|
||||
"start_position": 59,
|
||||
"end_position": 91
|
||||
|
@ -68811,7 +68811,7 @@
|
|||
"O",
|
||||
"O",
|
||||
"O",
|
||||
"U-EMAIL"
|
||||
"U-EMAIL_ADDRESS"
|
||||
],
|
||||
"template_id": null,
|
||||
"metadata": {
|
||||
|
@ -72669,7 +72669,7 @@
|
|||
"masked": null,
|
||||
"spans": [
|
||||
{
|
||||
"entity_type": "EMAIL",
|
||||
"entity_type": "EMAIL_ADDRESS",
|
||||
"entity_value": "MakarMaslow@teleworm.us",
|
||||
"start_position": 39,
|
||||
"end_position": 62
|
||||
|
@ -72773,7 +72773,7 @@
|
|||
"O",
|
||||
"O",
|
||||
"O",
|
||||
"U-EMAIL"
|
||||
"U-EMAIL_ADDRESS"
|
||||
],
|
||||
"template_id": null,
|
||||
"metadata": {
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
"masked": null,
|
||||
"spans": [
|
||||
{
|
||||
"entity_type": "EMAIL",
|
||||
"entity_type": "EMAIL_ADDRESS",
|
||||
"entity_value": "syedsimensen@cuvox.de",
|
||||
"start_position": 59,
|
||||
"end_position": 80
|
||||
|
@ -156,7 +156,7 @@
|
|||
"O",
|
||||
"O",
|
||||
"O",
|
||||
"U-EMAIL"
|
||||
"U-EMAIL_ADDRESS"
|
||||
],
|
||||
"template_id": null,
|
||||
"metadata": {
|
||||
|
@ -2140,7 +2140,7 @@
|
|||
"masked": null,
|
||||
"spans": [
|
||||
{
|
||||
"entity_type": "EMAIL",
|
||||
"entity_type": "EMAIL_ADDRESS",
|
||||
"entity_value": "AkahoYokoi@cuvox.de",
|
||||
"start_position": 59,
|
||||
"end_position": 78
|
||||
|
@ -2292,7 +2292,7 @@
|
|||
"O",
|
||||
"O",
|
||||
"O",
|
||||
"U-EMAIL"
|
||||
"U-EMAIL_ADDRESS"
|
||||
],
|
||||
"template_id": null,
|
||||
"metadata": {
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
My email is [EMAIL]
|
||||
My email is [EMAIL_ADDRESS]
|
||||
My address is [ADDRESS]
|
||||
My first name is [FIRST_NAME] and my last is [LAST_NAME]
|
||||
My name is [PERSON]
|
||||
|
|
|
@ -1,6 +1,9 @@
|
|||
import numpy as np
|
||||
from collections import Counter
|
||||
|
||||
from presidio_evaluator import InputSample
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from presidio_evaluator import InputSample, Span
|
||||
from presidio_evaluator.data_generator import read_synth_dataset
|
||||
from presidio_evaluator.evaluation import EvaluationResult, Evaluator
|
||||
from tests.mocks import (
|
||||
|
@ -296,3 +299,49 @@ def test_dataset_to_metric_50_50_model():
|
|||
assert metrics.pii_precision == 1
|
||||
assert metrics.pii_recall < 0.75
|
||||
assert metrics.pii_recall > 0.25
|
||||
|
||||
|
||||
def test_align_entity_types_correct_output():
|
||||
|
||||
sample1 = InputSample(
|
||||
"I live in ABC",
|
||||
spans=[Span("A", "a", 0, 1), Span("A", "a", 10, 11), Span("B", "b", 100, 101)],
|
||||
create_tags_from_span=False,
|
||||
)
|
||||
sample2 = InputSample(
|
||||
"I live in ABC",
|
||||
spans=[Span("A", "a", 0, 1), Span("A", "a", 10, 11), Span("C", "c", 100, 101)],
|
||||
create_tags_from_span=False,
|
||||
)
|
||||
samples = [sample1, sample2]
|
||||
mapping = {
|
||||
"A": "1",
|
||||
"B": "2",
|
||||
"C": "1",
|
||||
}
|
||||
|
||||
new_samples = Evaluator.align_entity_types(samples, mapping)
|
||||
|
||||
count_per_entity = Counter()
|
||||
for sample in new_samples:
|
||||
for span in sample.spans:
|
||||
count_per_entity[span.entity_type] += 1
|
||||
|
||||
assert count_per_entity["1"] == 5
|
||||
assert count_per_entity["2"] == 1
|
||||
|
||||
|
||||
def test_align_entity_types_wrong_mapping_exception():
|
||||
|
||||
sample1 = InputSample(
|
||||
"I live in ABC",
|
||||
spans=[Span("A", "a", 0, 1), Span("A", "a", 10, 11), Span("B", "b", 100, 101)],
|
||||
create_tags_from_span=False,
|
||||
)
|
||||
|
||||
entities_mapping = {"Z": "z"}
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
Evaluator.align_entity_types(
|
||||
input_samples=[sample1], entities_mapping=entities_mapping
|
||||
)
|
||||
|
|
|
@ -72,7 +72,7 @@ def test_analyzer_with_generated_text(test_input, acceptance_threshold):
|
|||
dir_path = os.path.dirname(os.path.realpath(__file__))
|
||||
input_samples = read_synth_dataset(test_input.format(dir_path))
|
||||
|
||||
updated_samples = Evaluator.align_input_samples_to_presidio_analyzer(
|
||||
updated_samples = Evaluator.align_entity_types(
|
||||
input_samples=input_samples, entities_mapping=PresidioAnalyzerWrapper.presidio_entities_map
|
||||
)
|
||||
|
||||
|
|