This commit is contained in:
Omri Mendels 2023-12-27 21:45:04 +02:00
Родитель 793fb99b77
Коммит cffb49e7ce
26 изменённых файлов: 969 добавлений и 348 удалений

Просмотреть файл

@ -72,7 +72,7 @@ Furthermore, it tokenizes the data, creates tags (either IO/BIO/BILUO) and spans
Once data is generated, it could be split into train/test/validation sets
while ensuring that each template only exists in one set.
See [this notebook for more details](notebooks/3_Split_by_pattern_%23.ipynb).
See [this notebook for more details](notebooks/3_Split_by_pattern_number.ipynb).
## 2. Data representation

Просмотреть файл

@ -1 +1 @@
0.1.2
0.1.3

Просмотреть файл

@ -2,8 +2,10 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"execution_count": 1,
"metadata": {
"is_executing": true
},
"outputs": [],
"source": [
"# install presidio via pip if not yet installed\n",
@ -14,8 +16,9 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"metadata": {
"is_executing": true,
"scrolled": true
},
"outputs": [],
@ -81,9 +84,34 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"execution_count": 3,
"metadata": {
"is_executing": true
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Sampling: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 9149.88it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"My name is Joshua Jackson\n",
"[{\"value\": \"Joshua Jackson\", \"start\": 11, \"end\": 25, \"type\": \"name\"}]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"sentence_templates = [\n",
" \"My name is {{name}}\",\n",
@ -126,8 +154,9 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {
"is_executing": true,
"scrolled": true
},
"outputs": [],
@ -165,13 +194,228 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"metadata": {
"is_executing": true,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>number</th>\n",
" <th>gender</th>\n",
" <th>nationality</th>\n",
" <th>prefix</th>\n",
" <th>first_name</th>\n",
" <th>middle_initial</th>\n",
" <th>last_name</th>\n",
" <th>street_name</th>\n",
" <th>city</th>\n",
" <th>state_abbr</th>\n",
" <th>...</th>\n",
" <th>company</th>\n",
" <th>domain_name</th>\n",
" <th>person</th>\n",
" <th>name</th>\n",
" <th>first_name_female</th>\n",
" <th>first_name_male</th>\n",
" <th>prefix_female</th>\n",
" <th>prefix_male</th>\n",
" <th>last_name_female</th>\n",
" <th>last_name_male</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>female</td>\n",
" <td>Czech</td>\n",
" <td>Mrs.</td>\n",
" <td>Marie</td>\n",
" <td>J</td>\n",
" <td>Hamanová</td>\n",
" <td>P.O. Box 255</td>\n",
" <td>Kangerlussuaq</td>\n",
" <td>QE</td>\n",
" <td>...</td>\n",
" <td>Simple Solutions</td>\n",
" <td>MarathonDancing.gl</td>\n",
" <td>Marie J Hamanová</td>\n",
" <td>Marie J Hamanová</td>\n",
" <td>Marie</td>\n",
" <td></td>\n",
" <td>Mrs.</td>\n",
" <td></td>\n",
" <td>Hamanová</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>female</td>\n",
" <td>French</td>\n",
" <td>Ms.</td>\n",
" <td>Patricia</td>\n",
" <td>G</td>\n",
" <td>Desrosiers</td>\n",
" <td>Avenida Noruega 42</td>\n",
" <td>Vila Real</td>\n",
" <td>VR</td>\n",
" <td>...</td>\n",
" <td>Formula Gray</td>\n",
" <td>LostMillions.com.pt</td>\n",
" <td>Patricia Desrosiers</td>\n",
" <td>Patricia Desrosiers</td>\n",
" <td>Patricia</td>\n",
" <td></td>\n",
" <td>Ms.</td>\n",
" <td></td>\n",
" <td>Desrosiers</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>female</td>\n",
" <td>American</td>\n",
" <td>Ms.</td>\n",
" <td>Debra</td>\n",
" <td>O</td>\n",
" <td>Neal</td>\n",
" <td>1659 Hoog St</td>\n",
" <td>Brakpan</td>\n",
" <td>GA</td>\n",
" <td>...</td>\n",
" <td>Dahlkemper's</td>\n",
" <td>MediumTube.co.za</td>\n",
" <td>Debra O Neal</td>\n",
" <td>Debra O Neal</td>\n",
" <td>Debra</td>\n",
" <td></td>\n",
" <td>Ms.</td>\n",
" <td></td>\n",
" <td>Neal</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>male</td>\n",
" <td>French</td>\n",
" <td>Mr.</td>\n",
" <td>Peverell</td>\n",
" <td>C</td>\n",
" <td>Racine</td>\n",
" <td>183 Epimenidou Street</td>\n",
" <td>Limassol</td>\n",
" <td>LI</td>\n",
" <td>...</td>\n",
" <td>Quickbiz</td>\n",
" <td>ImproveLook.com.cy</td>\n",
" <td>Peverell Racine</td>\n",
" <td>Peverell Racine</td>\n",
" <td></td>\n",
" <td>Peverell</td>\n",
" <td></td>\n",
" <td>Mr.</td>\n",
" <td></td>\n",
" <td>Racine</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>female</td>\n",
" <td>Slovenian</td>\n",
" <td>Mrs.</td>\n",
" <td>Iolanda</td>\n",
" <td>S</td>\n",
" <td>Tratnik</td>\n",
" <td>Karu põik 61</td>\n",
" <td>Pärnu</td>\n",
" <td>PR</td>\n",
" <td>...</td>\n",
" <td>Dubrow's Cafeteria</td>\n",
" <td>PostTan.com.ee</td>\n",
" <td>Iolanda Tratnik</td>\n",
" <td>Iolanda Tratnik</td>\n",
" <td>Iolanda</td>\n",
" <td></td>\n",
" <td>Mrs.</td>\n",
" <td></td>\n",
" <td>Tratnik</td>\n",
" <td></td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 37 columns</p>\n",
"</div>"
],
"text/plain": [
" number gender nationality prefix first_name middle_initial last_name \\\n",
"0 1 female Czech Mrs. Marie J Hamanová \n",
"1 2 female French Ms. Patricia G Desrosiers \n",
"2 3 female American Ms. Debra O Neal \n",
"3 4 male French Mr. Peverell C Racine \n",
"4 5 female Slovenian Mrs. Iolanda S Tratnik \n",
"\n",
" street_name city state_abbr ... company \\\n",
"0 P.O. Box 255 Kangerlussuaq QE ... Simple Solutions \n",
"1 Avenida Noruega 42 Vila Real VR ... Formula Gray \n",
"2 1659 Hoog St Brakpan GA ... Dahlkemper's \n",
"3 183 Epimenidou Street Limassol LI ... Quickbiz \n",
"4 Karu põik 61 Pärnu PR ... Dubrow's Cafeteria \n",
"\n",
" domain_name person name \\\n",
"0 MarathonDancing.gl Marie J Hamanová Marie J Hamanová \n",
"1 LostMillions.com.pt Patricia Desrosiers Patricia Desrosiers \n",
"2 MediumTube.co.za Debra O Neal Debra O Neal \n",
"3 ImproveLook.com.cy Peverell Racine Peverell Racine \n",
"4 PostTan.com.ee Iolanda Tratnik Iolanda Tratnik \n",
"\n",
" first_name_female first_name_male prefix_female prefix_male \\\n",
"0 Marie Mrs. \n",
"1 Patricia Ms. \n",
"2 Debra Ms. \n",
"3 Peverell Mr. \n",
"4 Iolanda Mrs. \n",
"\n",
" last_name_female last_name_male \n",
"0 Hamanová \n",
"1 Desrosiers \n",
"2 Neal \n",
"3 Racine \n",
"4 Tratnik \n",
"\n",
"[5 rows x 37 columns]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Read FakeNameGenerator CSV\n",
"fake_name_generator_df = pd.read_csv(fake_name_generator_file)\n",
@ -190,8 +434,9 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"metadata": {
"is_executing": true,
"scrolled": true
},
"outputs": [],
@ -209,8 +454,10 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"execution_count": 7,
"metadata": {
"is_executing": true
},
"outputs": [],
"source": [
"fake.add_provider(IpAddressProvider) # Both Ipv4 and IPv6 IP addresses\n",
@ -235,8 +482,9 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 8,
"metadata": {
"is_executing": true,
"pycharm": {
"name": "#%%\n"
}
@ -270,13 +518,36 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 9,
"metadata": {
"is_executing": true,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Sampling: 100%|███████████████████████████████████████████████████████████████████████████████████████████| 1500/1500 [00:00<00:00, 17987.56it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\"fake\": \"Title VII of the Civil Rights Act of 2005 protects individuals against employment discrimination on the basis of race and color as well as national origin, sex, or religion.\\n\", \"spans\": [{\"value\": \"2005\", \"start\": 37, \"end\": 41, \"type\": \"year\"}], \"template\": \"Title VII of the Civil Rights Act of {{year}} protects individuals against employment discrimination on the basis of race and color as well as national origin, sex, or religion.\\n\", \"template_id\": 190}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"sentence_templates = PresidioDataGenerator.read_template_file(templates_file_path)\n",
"fake_records = data_generator.generate_fake_data(\n",
@ -296,11 +567,23 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 10,
"metadata": {
"is_executing": true,
"scrolled": true
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Total: 1500\n",
"Avg # of records per template: 7.142857142857143\n",
"Median # of records per template: 7.0\n",
"Std: 2.5872528966106905\n"
]
}
],
"source": [
"count_per_template_id = Counter([sample.template_id for sample in fake_records])\n",
"\n",
@ -323,13 +606,65 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 11,
"metadata": {
"is_executing": true,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"Counter({'organization': 257,\n",
" 'first_name': 244,\n",
" 'person': 238,\n",
" 'city': 235,\n",
" 'address': 209,\n",
" 'street_name': 164,\n",
" 'name': 162,\n",
" 'country': 154,\n",
" 'credit_card_number': 152,\n",
" 'phone_number': 121,\n",
" 'last_name': 119,\n",
" 'building_number': 110,\n",
" 'age': 72,\n",
" 'secondary_address': 64,\n",
" 'year': 58,\n",
" 'nationality': 55,\n",
" 'postcode': 49,\n",
" 'zipcode': 45,\n",
" 'url': 39,\n",
" 'email': 39,\n",
" 'name_female': 37,\n",
" 'job': 33,\n",
" 'first_name_male': 31,\n",
" 'name_male': 29,\n",
" 'prefix_male': 28,\n",
" 'date_of_birth': 24,\n",
" 'iban': 22,\n",
" 'date_time': 21,\n",
" 'prefix_female': 21,\n",
" 'day_of_week': 16,\n",
" 'state_abbr': 15,\n",
" 'last_name_male': 15,\n",
" 'prefix': 12,\n",
" 'ip_address': 11,\n",
" 'ssn': 11,\n",
" 'nation_plural': 9,\n",
" 'nation_woman': 8,\n",
" 'first_name_nonbinary': 6,\n",
" 'us_driver_license': 6,\n",
" 'first_name_female': 3,\n",
" 'last_name_female': 3})"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"count_per_entity = Counter()\n",
"for record in fake_records:\n",
@ -351,8 +686,9 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 12,
"metadata": {
"is_executing": true,
"pycharm": {
"name": "#%%\n"
}
@ -421,9 +757,22 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"execution_count": 13,
"metadata": {
"is_executing": true
},
"outputs": [
{
"data": {
"text/plain": [
"{\"fake\": \"Title VII of the Civil Rights Act of 2005 protects individuals against employment discrimination on the basis of race and color as well as national origin, sex, or religion.\\n\", \"spans\": [{\"value\": \"2005\", \"start\": 37, \"end\": 41, \"type\": \"DATE_TIME\"}], \"template\": \"Title VII of the Civil Rights Act of {{DATE_TIME}} protects individuals against employment discrimination on the basis of race and color as well as national origin, sex, or religion.\\n\", \"template_id\": 190}"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"fake_records[0]"
]
@ -437,13 +786,41 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 14,
"metadata": {
"is_executing": true,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"[('PERSON', 887),\n",
" ('STREET_ADDRESS', 596),\n",
" ('GPE', 404),\n",
" ('ORGANIZATION', 257),\n",
" ('CREDIT_CARD', 152),\n",
" ('PHONE_NUMBER', 121),\n",
" ('DATE_TIME', 119),\n",
" ('TITLE', 94),\n",
" ('NRP', 72),\n",
" ('AGE', 72),\n",
" ('ZIP_CODE', 45),\n",
" ('DOMAIN_NAME', 39),\n",
" ('EMAIL_ADDRESS', 39),\n",
" ('IBAN_CODE', 22),\n",
" ('IP_ADDRESS', 11),\n",
" ('US_SSN', 11),\n",
" ('US_DRIVER_LICENSE', 6)]"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\n",
"count_per_entity_new = Counter()\n",
@ -463,13 +840,51 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 15,
"metadata": {
"is_executing": true,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 0/1500 [00:00<?, ?it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"loading model en_core_web_sm\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 1500/1500 [00:06<00:00, 215.70it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 6.76 s, sys: 33.8 ms, total: 6.8 s\n",
"Wall time: 6.96 s\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"%%time\n",
"input_samples = [\n",
@ -491,8 +906,9 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 16,
"metadata": {
"is_executing": true,
"pycharm": {
"name": "#%%\n"
}
@ -515,21 +931,31 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 17,
"metadata": {
"is_executing": true,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 1500/1500 [00:00<00:00, 76888.23it/s]\n"
]
}
],
"source": [
"conll = InputSample.create_conll_dataset(input_samples)"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 18,
"metadata": {
"is_executing": true,
"pycharm": {
"name": "#%%\n"
}
@ -546,7 +972,7 @@
"### Next steps\n",
"\n",
"- Evaluate Presidio using this fake data. [Sample](4_Evaluate_Presidio_Analyzer.ipynb)\n",
"- Split to train/test/validation while ensuring sentences originiating from the same template are all on the same subset. [Sample](3_Split_by_pattern_#.ipynb)\n",
"- Split to train/test/validation while ensuring sentences originiating from the same template are all on the same subset. [Sample](3_Split_by_pattern_number.ipynb)\n",
"- Conduct a small exploratory data analysis on the generated data. [Sample](2_PII_EDA.ipynb)"
]
},
@ -569,9 +995,9 @@
"hash": "2509fbe9adc3579fd0ef23e6a2c6fb50cb745caa174aafdf017283479e60bc43"
},
"kernelspec": {
"display_name": "presidio",
"display_name": "presidio-evaluator",
"language": "python",
"name": "presidio"
"name": "presidio-evaluator"
},
"language_info": {
"codemirror_mode": {
@ -583,9 +1009,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
"version": "3.9.18"
}
},
"nbformat": 4,
"nbformat_minor": 2
"nbformat_minor": 4
}

Просмотреть файл

@ -72,7 +72,7 @@
"metadata": {},
"outputs": [],
"source": [
"for (name, series) in pii_df.iteritems():\n",
"for (name, series) in pii_df.items():\n",
" print(name)\n",
" print(\"Unique values: {}\".format(len(series.unique())))\n",
" print(series.value_counts())\n",
@ -123,7 +123,7 @@
"metadata": {},
"outputs": [],
"source": [
"series_to_wordcloud(pii_df.country_full)"
"series_to_wordcloud(pii_df.country)"
]
},
{
@ -187,9 +187,9 @@
"metadata": {},
"outputs": [],
"source": [
"countries = [get_entity_values_from_sample(sample, [\"LOCATION\"]) for sample in synth]\n",
"countries = [get_entity_values_from_sample(sample, [\"TITLE\"]) for sample in synth]\n",
"countries = [item for sublist in countries for item in sublist]\n",
"series_to_wordcloud(pd.Series(countries, name=\"LOCATION\"))"
"series_to_wordcloud(pd.Series(countries, name=\"TITLE\"))"
]
},
{
@ -213,9 +213,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "presidio",
"display_name": "presidio-evaluator",
"language": "python",
"name": "presidio"
"name": "presidio-evaluator"
},
"language_info": {
"codemirror_mode": {
@ -227,9 +227,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
"version": "3.9.18"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
"nbformat_minor": 4
}

Просмотреть файл

@ -143,13 +143,6 @@
"assert len(train) + len(test) + len(validation) == len(all_samples)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
@ -160,9 +153,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "presidio",
"display_name": "presidio-evaluator",
"language": "python",
"name": "presidio"
"name": "presidio-evaluator"
},
"language_info": {
"codemirror_mode": {
@ -174,9 +167,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
"version": "3.9.18"
}
},
"nbformat": 4,
"nbformat_minor": 2
"nbformat_minor": 4
}

Просмотреть файл

@ -5,7 +5,7 @@
"id": "847acd88",
"metadata": {},
"source": [
"Evaluate Presidio Analyzer using the Presidio Evaluator framework"
"# Evaluate Presidio Analyzer using the Presidio Evaluator framework"
]
},
{
@ -17,7 +17,8 @@
"source": [
"# install presidio via pip if not yet installed\n",
"\n",
"#!pip install presidio-analyzer\n",
"#!pip install presidio-evaluator\n",
"#!pip install \"presidio-analyzer[transformers]\"\n",
"#!pip install presidio-evaluator"
]
},
@ -32,6 +33,10 @@
"from copy import deepcopy\n",
"from pprint import pprint\n",
"from collections import Counter\n",
"from typing import List\n",
"\n",
"import warnings\n",
"warnings.filterwarnings('ignore')\n",
"\n",
"from presidio_evaluator import InputSample\n",
"from presidio_evaluator.evaluation import Evaluator, ModelError\n",
@ -45,7 +50,8 @@
"pd.set_option(\"display.max_colwidth\", None)\n",
"\n",
"%reload_ext autoreload\n",
"%autoreload 2"
"%autoreload 2\n",
"%matplotlib inline"
]
},
{
@ -65,6 +71,9 @@
"source": [
"dataset_name = \"synth_dataset_v2.json\"\n",
"dataset = InputSample.read_dataset_json(Path(Path.cwd().parent, \"data\", dataset_name))\n",
"\n",
"dataset = dataset[:300] # top 300 samples\n",
"\n",
"print(len(dataset))"
]
},
@ -75,10 +84,12 @@
"metadata": {},
"outputs": [],
"source": [
"entity_counter = Counter()\n",
"for sample in dataset:\n",
" for tag in sample.tags:\n",
" entity_counter[tag] += 1"
"def get_entity_counts(dataset:List[InputSample]):\n",
" entity_counter = Counter()\n",
" for sample in dataset:\n",
" for tag in sample.tags:\n",
" entity_counter[tag] += 1\n",
" return entity_counter\n"
]
},
{
@ -89,7 +100,7 @@
"outputs": [],
"source": [
"print(\"Count per entity:\")\n",
"pprint(entity_counter.most_common())\n",
"pprint(get_entity_counts(dataset).most_common())\n",
"\n",
"print(\"\\nExample sentence:\")\n",
"print(dataset[1])\n",
@ -107,12 +118,121 @@
")"
]
},
{
"cell_type": "markdown",
"id": "9c5e16cb-bee8-4f0a-a543-4879daa35b9e",
"metadata": {},
"source": [
"### Define the AnalyzerEngine object \n",
"In this case, using a huggingface model: obi/deid_roberta_i2b2"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "313b508f-e901-40b9-b575-c7fb8a794652",
"metadata": {},
"outputs": [],
"source": [
"from presidio_analyzer import AnalyzerEngine\n",
"from presidio_analyzer.nlp_engine import TransformersNlpEngine, NerModelConfiguration\n",
"\n",
"\n",
"# Here we define a transformers based NLP engine, \n",
"# but you can use this cell to customize your Presidio Analyzer instance\n",
"\n",
"# Define which model to use\n",
"model_config = [{\"lang_code\": \"en\", \"model_name\": {\n",
" \"spacy\": \"en_core_web_sm\", # use a small spaCy model for lemmas, tokens etc.\n",
" \"transformers\": \"obi/deid_roberta_i2b2\"\n",
" }\n",
"}]\n",
"\n",
"# Map transformers model labels to Presidio's\n",
"model_to_presidio_entity_mapping = dict(\n",
" PER=\"PERSON\",\n",
" PERSON=\"PERSON\",\n",
" LOC= \"LOCATION\",\n",
" LOCATION= \"LOCATION\",\n",
" GPE=\"LOCATION\",\n",
" ORG=\"ORGANIZATION\",\n",
" ORGANIZATION=\"ORGANIZATION\",\n",
" NORP=\"NRP\",\n",
" AGE=\"AGE\",\n",
" ID=\"ID\",\n",
" EMAIL=\"EMAIL\",\n",
" PATIENT=\"PERSON\",\n",
" STAFF=\"PERSON\",\n",
" HOSP=\"ORGANIZATION\",\n",
" PATORG=\"ORGANIZATION\",\n",
" DATE=\"DATE_TIME\",\n",
" TIME=\"DATE_TIME\",\n",
" PHONE=\"PHONE_NUMBER\",\n",
" HCW=\"PERSON\",\n",
" HOSPITAL=\"ORGANIZATION\",\n",
" FACILITY=\"LOCATION\",\n",
")\n",
"\n",
"ner_model_configuration = NerModelConfiguration(labels_to_ignore = [\"O\"], \n",
" model_to_presidio_entity_mapping=model_to_presidio_entity_mapping)\n",
"\n",
"nlp_engine = TransformersNlpEngine(models=model_config,\n",
" ner_model_configuration=ner_model_configuration)\n",
"\n",
"# Set up the engine, loads the NLP module (spaCy model by default) \n",
"# and other PII recognizers\n",
"analyzer_engine = AnalyzerEngine(nlp_engine=nlp_engine)"
]
},
{
"cell_type": "markdown",
"id": "aae4c379",
"metadata": {},
"source": [
"Run evaluation:"
"### Run evaluation"
]
},
{
"cell_type": "markdown",
"id": "16dbf6d6-a554-4602-8907-589786d47a12",
"metadata": {},
"source": [
"#### Define experiment"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "29d39ff1-4f14-4e32-ae84-ecc6c739f829",
"metadata": {},
"outputs": [],
"source": [
"experiment = get_experiment_tracker()\n",
"model = PresidioAnalyzerWrapper(analyzer_engine)\n",
"\n",
"# Define evaluator and experiment tracking\n",
"\n",
"evaluator = Evaluator(model=model)\n",
"dataset = Evaluator.align_entity_types(\n",
" deepcopy(dataset), entities_mapping=PresidioAnalyzerWrapper.presidio_entities_map\n",
")\n",
"\n",
"print(\"Count per entity after alignment:\")\n",
"pprint(get_entity_counts(dataset).most_common())\n",
"\n",
"# Track model and dataset params\n",
"params = {\"dataset_name\": dataset_name, \"model_name\": model.name}\n",
"params.update(model.to_log())\n",
"experiment.log_parameters(params)\n",
"experiment.log_dataset_hash(dataset)"
]
},
{
"cell_type": "markdown",
"id": "2a7d6626-d094-4dfd-8f37-c0443edf00dc",
"metadata": {},
"source": [
"#### Run experiment"
]
},
{
@ -122,39 +242,37 @@
"metadata": {},
"outputs": [],
"source": [
"print(\"Evaluating Presidio Analyzer\")\n",
"\n",
"experiment = get_experiment_tracker()\n",
"model_name = \"Presidio Analyzer\"\n",
"model = PresidioAnalyzerWrapper()\n",
"\n",
"evaluator = Evaluator(model=model)\n",
"dataset = Evaluator.align_entity_types(\n",
" deepcopy(dataset), entities_mapping=PresidioAnalyzerWrapper.presidio_entities_map\n",
")\n",
"\n",
"# Run experiment\n",
"evaluation_results = evaluator.evaluate_all(dataset)\n",
"results = evaluator.calculate_score(evaluation_results)\n",
"\n",
"# update params tracking\n",
"params = {\"dataset_name\": dataset_name, \"model_name\": model_name}\n",
"params.update(model.to_log())\n",
"experiment.log_parameters(params)\n",
"experiment.log_dataset_hash(dataset)\n",
"# Track experiment results\n",
"experiment.log_metrics(results.to_log())\n",
"entities, confmatrix = results.to_confusion_matrix()\n",
"experiment.log_confusion_matrix(matrix=confmatrix, labels=entities)\n",
"experiment.log_confusion_matrix(matrix=confmatrix, \n",
" labels=entities)\n",
"\n",
"print(\"Confusion matrix:\")\n",
"print(pd.DataFrame(confmatrix, columns=entities, index=entities))\n",
"\n",
"print(\"Precision and recall\")\n",
"print(results)\n",
"# Plot output\n",
"plotter = evaluator.Plotter(model=model, \n",
" results=results, \n",
" output_folder = \".\", \n",
" model_name = model.name, \n",
" beta = 2)\n",
"\n",
"# end experiment\n",
"experiment.end()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5b4d662d-596c-4a69-b3c9-1edcda20cc5b",
"metadata": {},
"outputs": [],
"source": [
"plotter.plot_scores()"
]
},
{
"cell_type": "markdown",
"id": "070f8287",
@ -198,7 +316,7 @@
"id": "98f4802e",
"metadata": {},
"source": [
"1. Most false positive tokens:"
"1. Most common false positive tokens:"
]
},
{
@ -219,7 +337,7 @@
"outputs": [],
"source": [
"fps_df = ModelError.get_fps_dataframe(errors, entity=[\"LOCATION\"])\n",
"fps_df[[\"full_text\", \"token\", \"prediction\"]]"
"fps_df[[\"full_text\", \"token\", \"annotation\", \"prediction\"]]"
]
},
{
@ -227,7 +345,7 @@
"id": "d0852513",
"metadata": {},
"source": [
"2. False negative examples"
"2. Most common false negative examples"
]
},
{
@ -237,7 +355,7 @@
"metadata": {},
"outputs": [],
"source": [
"ModelError.most_common_fn_tokens(errors, n=50, entity=[\"PERSON\"])"
"ModelError.most_common_fn_tokens(errors, n=50)"
]
},
{
@ -255,7 +373,7 @@
"metadata": {},
"outputs": [],
"source": [
"fns_df = ModelError.get_fns_dataframe(errors, entity=[\"PHONE_NUMBER\"])"
"fns_df = ModelError.get_fns_dataframe(errors, entity=[\"IP_ADDRESS\"])"
]
},
{
@ -278,13 +396,21 @@
"print(\"All errors:\\n\")\n",
"[print(error, \"\\n\") for error in errors]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a67ff38d-0817-4864-9991-b3eb1f80eecc",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "presidio",
"display_name": "presidio-evaluator",
"language": "python",
"name": "presidio"
"name": "presidio-evaluator"
},
"language_info": {
"codemirror_mode": {
@ -296,7 +422,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
"version": "3.9.18"
}
},
"nbformat": 4,

Просмотреть файл

@ -88,7 +88,7 @@
{
"data": {
"text/plain": [
"[type: DOMAIN_NAME, start: 57, end: 69, score: 1.0,\n",
"[type: URL, start: 49, end: 69, score: 0.95,\n",
" type: PERSON, start: 14, end: 24, score: 0.85]"
]
},
@ -116,11 +116,11 @@
{
"data": {
"text/plain": [
"['Hi my name is Albert Cohen and this is my website: https://http://chapman-downs.info/',\n",
" 'Hi my name is Lisa Miller and this is my website: https://http://benson.org/',\n",
" 'Hi my name is Kathleen Hale and this is my website: https://http://www.garcia.com/',\n",
" 'Hi my name is Michelle Frederick and this is my website: https://https://robinson.com/',\n",
" 'Hi my name is Alicia Santana and this is my website: https://https://www.ray.org/']"
"['Hi my name is Tammy Ryan and this is my website: https://www.cardenas.info/',\n",
" 'Hi my name is Jessica Smith and this is my website: http://jones-hunt.info/',\n",
" 'Hi my name is Michele Marsh and this is my website: https://guerrero.com/',\n",
" 'Hi my name is Kathleen Miller and this is my website: https://lopez.com/',\n",
" 'Hi my name is Paul Brown and this is my website: http://www.banks-evans.info/']"
]
},
"execution_count": 6,
@ -153,11 +153,11 @@
"-------------\n",
"Fake examples:\n",
"\n",
"Our son R2D2 used to work in Botswana\n",
"Our son R2D2 used to work in American Samoa\n",
"Our son R2D2 used to work in Malawi\n",
"Our son R2D2 used to work in Montenegro\n",
"our son r2d2 used to work in lebanon\n"
"Our son R2D2 used to work in Nigeria\n",
"Our son R2D2 used to work in Guam\n",
"Our son R2D2 used to work in Reunion\n",
"Our son R2D2 used to work in Vanuatu\n",
"Our son R2D2 used to work in Malaysia\n"
]
}
],
@ -176,13 +176,20 @@
"print(f\"-------------\\nFake examples:\\n\")\n",
"print(*fake_samples, sep=\"\\n\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "presidio",
"display_name": "presidio-evaluator",
"language": "python",
"name": "presidio"
"name": "presidio-evaluator"
},
"language_info": {
"codemirror_mode": {
@ -194,9 +201,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
"version": "3.9.18"
}
},
"nbformat": 4,
"nbformat_minor": 1
"nbformat_minor": 4
}

Просмотреть файл

@ -23,7 +23,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -34,7 +34,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
@ -42,55 +42,18 @@
},
"outputs": [],
"source": [
"DATA_DATE = \"Dec-19-2021\""
"DATA_DATE = \"Dec-27-2023\" # Change to the date when notebook 3 (split to train/test) was ran"
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"tokenizing input: 0%| | 0/2122 [00:00<?, ?it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"loading model en_core_web_sm\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"tokenizing input: 100%|███████████████████████████████████████████████████████████| 2122/2122 [00:19<00:00, 109.66it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Read 2122 samples\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"outputs": [],
"source": [
"data_path = \"../../data/{}_{}.json\"\n",
"\n",
@ -111,17 +74,9 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Kept 1940 samples after removal of non-tagged samples\n"
]
}
],
"outputs": [],
"source": [
"train_tagged = [sample for sample in train_samples if len(sample.spans) > 0]\n",
"print(\"Kept {} samples after removal of non-tagged samples\".format(len(train_tagged)))"
@ -140,45 +95,13 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Entities found in training set:\n"
]
},
{
"data": {
"text/plain": [
"{'ADDRESS',\n",
" 'CREDIT_CARD',\n",
" 'DATE_TIME',\n",
" 'DOMAIN_NAME',\n",
" 'EMAIL_ADDRESS',\n",
" 'IBAN_CODE',\n",
" 'IP_ADDRESS',\n",
" 'LOCATION',\n",
" 'O',\n",
" 'ORGANIZATION',\n",
" 'PERSON',\n",
" 'PHONE_NUMBER',\n",
" 'PREFIX',\n",
" 'TITLE',\n",
" 'US_SSN'}"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"print(\"Entities found in training set:\")\n",
"entities = []\n",
@ -206,16 +129,7 @@
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Skipping illegal span None, text=ΜΟΝΗ ΑΓΙΩΝ ΑΝΑΡΓΥΡΩΝ\n",
"Skipping illegal span None, text=U.N\n"
]
}
],
"outputs": [],
"source": [
"spacy_train = InputSample.create_spacy_dataset(\n",
" dataset=train_tagged, output_path=\"train.spacy\"\n",
@ -281,9 +195,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "presidio",
"display_name": "presidio-evaluator",
"language": "python",
"name": "presidio"
"name": "presidio-evaluator"
},
"language_info": {
"codemirror_mode": {
@ -295,9 +209,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
"version": "3.9.18"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
"nbformat_minor": 4
}

Просмотреть файл

@ -39,6 +39,16 @@
"%autoreload 2"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aee00770-a972-4a19-b423-1724214cc88c",
"metadata": {},
"outputs": [],
"source": [
"#!pip install sklearn_crfsuite"
]
},
{
"cell_type": "markdown",
"id": "a0d2d772",
@ -58,8 +68,9 @@
},
"outputs": [],
"source": [
"DATA_DATE = \"Jan-15-2022\"\n",
"dataset = InputSample.read_dataset_json(\"../../data/test_{}.json\".format(DATA_DATE))\n",
"DATA_DATE = \"Dec-27-2023\" # Date when the split to train/test notebook was ran\n",
"dataset_name = \"../../data/test_{}.json\".format(DATA_DATE)\n",
"dataset = InputSample.read_dataset_json(dataset_name)\n",
"print(len(dataset))"
]
},
@ -76,7 +87,7 @@
"source": [
"entity_counter = Counter()\n",
"for sample in dataset:\n",
" for t>ag in sample.tags:\n",
" for tag in sample.tags:\n",
" entity_counter[tag] += 1"
]
},
@ -257,7 +268,7 @@
"metadata": {},
"outputs": [],
"source": [
"fps_df = ModelError.get_fps_dataframe(errors, entity=[\"GPE\"])\n",
"fps_df = ModelError.get_fps_dataframe(errors, entity=[\"PERSON\"])\n",
"fps_df[[\"full_text\", \"token\", \"prediction\"]]"
]
},
@ -276,7 +287,7 @@
"metadata": {},
"outputs": [],
"source": [
"ModelError.most_common_fn_tokens(errors, n=50, entity=[\"PERSON\"])"
"ModelError.most_common_fn_tokens(errors, n=50, entity=[\"ORGANIZATION\"])"
]
},
{
@ -325,13 +336,21 @@
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "cf3e4646-ca93-44c5-a998-cd77f4bf2708",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "presidio",
"display_name": "presidio-evaluator",
"language": "python",
"name": "presidio"
"name": "presidio-evaluator"
},
"language_info": {
"codemirror_mode": {
@ -343,9 +362,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
"version": "3.9.18"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
}

Просмотреть файл

@ -205,7 +205,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.9.13 ('presidio')",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@ -219,9 +219,8 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
"version": "3.9.18"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "371968787ec79dd50357533864944a85029366968470cac36beb694745c2f7d6"
@ -229,5 +228,5 @@
}
},
"nbformat": 4,
"nbformat_minor": 2
"nbformat_minor": 4
}

Просмотреть файл

@ -35,6 +35,16 @@
"%autoreload 2"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a0c3285c-06a2-4361-aec2-8375496f75b3",
"metadata": {},
"outputs": [],
"source": [
"#!pip install flair"
]
},
{
"cell_type": "markdown",
"id": "f036de59",
@ -111,15 +121,14 @@
"metadata": {},
"outputs": [],
"source": [
"flair_ner = \"ner-english\"\n",
"flair_ner_fast = \"ner-english-fast\"\n",
"flair_ontonotes_fast = \"ner-english-ontonotes-fast\"\n",
"flair_ontonotes_large = \"ner-english-ontonotes-large\"\n",
"flair_ner = \"flair/ner-english\"\n",
"flair_ner_fast = \"flair/ner-english-fast\"\n",
"flair_ontonotes_fast = \"flair/ner-english-ontonotes-fast\"\n",
"flair_ontonotes_large = \"flair/ner-english-ontonotes-large\"\n",
"models = [\n",
" flair_ner,\n",
" flair_ner_fast,\n",
" flair_ontonotes_fast,\n",
" flair_ner_fast,\n",
" flair_ontonotes_large,\n",
"]"
]
@ -312,9 +321,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "presidio",
"display_name": "presidio-evaluator",
"language": "python",
"name": "presidio"
"name": "presidio-evaluator"
},
"language_info": {
"codemirror_mode": {
@ -326,7 +335,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
"version": "3.9.18"
}
},
"nbformat": 4,

Просмотреть файл

@ -109,7 +109,10 @@
"metadata": {},
"outputs": [],
"source": [
"models = [\"en_core_web_sm\", \"en_core_web_lg\", \"en_core_web_trf\"]"
"models = [\"en_core_web_sm\", \"en_core_web_lg\", \"en_core_web_trf\"]\n",
"\n",
"# If needed, install models using `python -m spacy download X` where x is the model name, or use spacy.cli.download:\n",
"#spacy.cli.download(\"en_core_web_trf\")"
]
},
{
@ -334,9 +337,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "presidio",
"display_name": "presidio-evaluator",
"language": "python",
"name": "presidio"
"name": "presidio-evaluator"
},
"language_info": {
"codemirror_mode": {
@ -348,9 +351,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
"version": "3.9.18"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
"nbformat_minor": 4
}

Просмотреть файл

@ -170,6 +170,8 @@ class PresidioDataGenerator:
new_provider = BaseProvider(self.faker)
setattr(new_provider, new_name, original)
setattr(new_provider, new_name.lower(), original) # avoid case sensitivity
setattr(new_provider, new_name.upper(), original) # avoid case sensitivity
self.faker.add_provider(new_provider)
@staticmethod

Просмотреть файл

@ -25,7 +25,7 @@ class PresidioPseudonymization(PresidioDataGenerator):
self.add_provider_alias("credit_card_number", "CREDIT_CARD")
self.add_provider_alias("iban", "IBAN_CODE")
self.add_provider_alias("phone_number", "PHONE_NUMBER")
self.add_provider_alias("url", "DOMAIN_NAME")
self.add_provider_alias("url", "URL")
self.add_provider_alias("ssn", "US_SSN")
self.add_provider_alias("email", "EMAIL_ADDRESS")
self.add_provider_alias("date_time", "DATE_TIME")

Просмотреть файл

@ -1,6 +1,8 @@
import copy
from collections import Counter
from typing import List, Optional, Dict
from pathlib import Path
import string
import numpy as np
from tqdm import tqdm
@ -39,7 +41,6 @@ class Evaluator:
self.entities_to_keep = self.model.entities
def compare(self, input_sample: InputSample, prediction: List[str]):
"""
Compares ground truth tags (annotation) and predicted (prediction)
:param input_sample: input sample containing list of tags with scheme
@ -71,6 +72,9 @@ class Evaluator:
if self.entities_to_keep:
prediction = self._adjust_per_entities(prediction)
new_annotation = self._adjust_per_entities(new_annotation)
skip_words = self.get_skip_words()
for i in range(0, len(new_annotation)):
results[(new_annotation[i], prediction[i])] += 1
@ -81,6 +85,10 @@ class Evaluator:
# check if there was an error
is_error = new_annotation[i] != prediction[i]
if str(tokens[i]).lower().strip() in skip_words:
is_error = False
results[(new_annotation[i], prediction[i])] -= 1
if is_error:
if prediction[i] == "O":
mistakes.append(
@ -151,7 +159,6 @@ class Evaluator:
f"Mapping entity values using this dictionary: {self.model.entity_mapping}"
)
for sample in tqdm(dataset, desc=f"Evaluating {self.model.__class__}"):
# Align tag values to the ones expected by the model
self.model.align_entity_types(sample)
@ -345,13 +352,13 @@ class Evaluator:
if np.isnan(precision) or np.isnan(recall) or (precision == 0 and recall == 0):
return np.nan
return ((1 + beta ** 2) * precision * recall) / (
((beta ** 2) * precision) + recall
return ((1 + beta**2) * precision * recall) / (
((beta**2) * precision) + recall
)
class Plotter:
"""
Plot scores (f2, precision, recall) and errors (false-positives, false-negatives)
Plot scores (f2, precision, recall) and errors (false-positives, false-negatives)
for a PII detection model evaluated via Evaluator
:param model: Instance of a fitted model (of base type BaseModel)
@ -362,7 +369,9 @@ class Evaluator:
which gives more or less weight to precision vs. recall
"""
def __init__(self, model, results, output_folder: Path, model_name: str, beta: float):
def __init__(
self, model, results, output_folder: Path, model_name: str, beta: float
):
self.model = model
self.results = results
self.output_folder = output_folder
@ -372,41 +381,66 @@ class Evaluator:
def plot_scores(self) -> None:
"""
Plots per-entity recall, precision, or F2 score for evaluated model.
:param plot_type: which metric to graph (default is F2 score)
Plots per-entity recall, precision, or F2 score for evaluated model.
"""
scores = {}
scores['entity'] = list(self.results.entity_recall_dict.keys())
scores['recall'] = list(self.results.entity_recall_dict.values())
scores['precision'] = list(self.results.entity_precision_dict.values())
scores['count'] = list(self.results.n_dict.values())
scores[f"f{self.beta}_score"] = [Evaluator.f_beta(precision=precision, recall=recall, beta=self.beta)
for recall, precision in zip(scores['recall'], scores['precision'])]
entity_recall_dict = copy.deepcopy(self.results.entity_recall_dict)
entity_precision_dict = copy.deepcopy(self.results.entity_precision_dict)
scores["entity"] = list(entity_recall_dict.keys())
scores["recall"] = list(entity_recall_dict.values())
scores["precision"] = list(entity_precision_dict.values())
scores["count"] = list(self.results.n_dict.values())
scores[f"f{self.beta}_score"] = [
Evaluator.f_beta(precision=precision, recall=recall, beta=self.beta)
for recall, precision in zip(scores["recall"], scores["precision"])
]
# Add PII detection rates
scores["entity"].append("PII")
scores["recall"].append(self.results.pii_recall)
scores["precision"].append(self.results.pii_precision)
scores["count"].append(self.results.n)
scores[f"f{self.beta}_score"].append(self.results.pii_f)
df = pd.DataFrame(scores)
df['model'] = self.model_name
df["model"] = self.model_name
self._plot(df, plot_type="f2_score")
self._plot(df, plot_type="precision")
self._plot(df, plot_type="recall")
def _plot(self, df, plot_type) -> None:
fig = px.bar(df, text_auto=".2", y='entity', orientation="h",
x=plot_type, color='count', barmode='group', title=f"Per-entity {plot_type} for {self.model_name}")
fig.update_layout(barmode='group', yaxis={
'categoryorder': 'total ascending'})
fig = px.bar(
df,
text_auto=".2",
y="entity",
orientation="h",
x=plot_type,
color="count",
barmode="group",
height=30*len(set(df["entity"])),
title=f"Per-entity {plot_type} for {self.model_name}",
)
fig.update_layout(
barmode="group", yaxis={"categoryorder": "total ascending"}
)
fig.update_layout(yaxis_title=f"{plot_type}", xaxis_title="PII Entity")
fig.update_traces(textfont_size=12, textangle=0,
textposition="outside", cliponaxis=False)
fig.update_traces(
textfont_size=12, textangle=0, textposition="outside", cliponaxis=False
)
fig.update_layout(
plot_bgcolor="#FFF",
xaxis=dict(
title="PII entity",
linecolor="#BCCCDC", # Sets color of X-axis line
showgrid=False # Removes X-axis grid lines
showgrid=False, # Removes X-axis grid lines
),
yaxis=dict(
title=f"{plot_type}",
linecolor="#BCCCDC", # Sets color of X-axis line
showgrid=False # Removes X-axis grid lines
showgrid=False, # Removes X-axis grid lines
),
)
fig.show()
@ -419,47 +453,100 @@ class Evaluator:
for entity in self.model.entity_mapping.values():
fps_df = ModelError.get_fps_dataframe(self.errors, entity=[entity])
if fps_df is not None:
fps_path = self.output_folder / \
f"{self.model_name}-{entity}-fps.csv"
fps_path = (
self.output_folder / f"{self.model_name}-{entity}-fps.csv"
)
fps_df.to_csv(fps_path)
fps_frames.append(fps_path)
fns_df = ModelError.get_fns_dataframe(self.errors, entity=[entity])
if fns_df is not None:
fns_path = self.output_folder / \
f"{self.model_name}-{entity}-fns.csv"
fns_path = (
self.output_folder / f"{self.model_name}-{entity}-fns.csv"
)
fns_df.to_csv(fns_path)
fns_frames.append(fns_path)
def group_tokens(df):
return df.groupby(['token', 'annotation']).size().to_frame(
).sort_values([0], ascending=False).head(3).reset_index()
return (
df.groupby(["token", "annotation"])
.size()
.to_frame()
.sort_values([0], ascending=False)
.head(3)
.reset_index()
)
fps_tokens_df = pd.concat(
[group_tokens(pd.read_csv(df_path)) for df_path in fps_frames])
[group_tokens(pd.read_csv(df_path)) for df_path in fps_frames]
)
fns_tokens_df = pd.concat(
[group_tokens(pd.read_csv(df_path)) for df_path in fns_frames])
[group_tokens(pd.read_csv(df_path)) for df_path in fns_frames]
)
def generate_graph(title, tokens_df):
fig = px.histogram(tokens_df, x=0, y="token", orientation='h', color='annotation',
title=f"Most common {title} for {self.model_name}")
fig = px.histogram(
tokens_df,
x=0,
y="token",
orientation="h",
color="annotation",
title=f"Most common {title} for {self.model_name}",
)
fig.update_layout(yaxis_title=f"count", xaxis_title="PII Entity")
fig.update_traces(textfont_size=12, textangle=0,
textposition="outside", cliponaxis=False)
fig.update_traces(
textfont_size=12,
textangle=0,
textposition="outside",
cliponaxis=False,
)
fig.update_layout(
plot_bgcolor="#FFF",
xaxis=dict(
title="Count",
linecolor="#BCCCDC", # Sets color of X-axis line
showgrid=False # Removes X-axis grid lines
showgrid=False, # Removes X-axis grid lines
),
yaxis=dict(
title=f"Tokens",
linecolor="#BCCCDC", # Sets color of X-axis line
showgrid=False # Removes X-axis grid lines
showgrid=False, # Removes X-axis grid lines
),
)
fig.update_layout(yaxis={'categoryorder': 'total ascending'})
fig.update_layout(yaxis={"categoryorder": "total ascending"})
fig.show()
generate_graph(title="false-negatives", tokens_df=fns_tokens_df)
generate_graph(title="false-positives", tokens_df=fps_tokens_df)
@staticmethod
def get_skip_words():
skip_words = [x for x in string.punctuation]
skip_words.extend(
[
"\n",
"\n\n",
"\n\n\n",
">>",
">>>",
">>>>",
"street",
"st.",
"st",
"de",
"rue",
"via",
"and",
"or",
"do",
"as",
"of",
"day",
"address",
"country",
"state",
"city",
]
)
return skip_words

Просмотреть файл

@ -31,6 +31,7 @@ class BaseModel(ABC):
self.labeling_scheme = labeling_scheme
self.entity_mapping = entity_mapping
self.verbose = verbose
self.name = self.__class__.__name__
@abstractmethod
def predict(self, sample: InputSample, **kwargs) -> List[str]:

Просмотреть файл

@ -85,7 +85,7 @@ class CRFModel(BaseModel):
y_train = [self.sent2labels(s) for s in sentences]
return X_train, y_train
def predict(self, sample: InputSample) -> List[str]:
def predict(self, sample: InputSample, **kwargs) -> List[str]:
tags = CRFModel.crf_predict(sample, self.model)
if len(tags) != len(sample.tokens):

Просмотреть файл

@ -48,7 +48,7 @@ class FlairModel(BaseModel):
self.spacy_tokenizer = SpacyTokenizer(model=spacy.load("en_core_web_sm"))
def predict(self, sample: InputSample) -> List[str]:
def predict(self, sample: InputSample, **kwargs) -> List[str]:
sentence = Sentence(text=sample.full_text, use_tokenizer=self.spacy_tokenizer)
self.model.predict(sentence)

Просмотреть файл

@ -91,23 +91,28 @@ class PresidioAnalyzerWrapper(BaseModel):
"PHONE_NUMBER": "PHONE_NUMBER",
"BIRTHDAY": "DATE_TIME",
"DATE_TIME": "DATE_TIME",
"DOMAIN_NAME": "DOMAIN_NAME",
"DOMAIN_NAME": "URL",
"TIME" : "DATE_TIME",
"DATE" : "DATE_TIME",
"CITY": "LOCATION",
"ADDRESS": "LOCATION",
"STREET_ADDRESS": "LOCATION",
"NATIONALITY": "LOCATION",
"LOCATION": "LOCATION",
"IBAN_CODE": "IBAN_CODE",
"URL": "DOMAIN_NAME",
"URL": "URL",
"US_SSN": "US_SSN",
"IP_ADDRESS": "IP_ADDRESS",
"ORGANIZATION": "ORG",
"ORGANIZATION": "ORGANIZATION",
"ORG": "ORGANIZATION",
"US_DRIVER_LICENSE": "US_DRIVER_LICENSE",
"NRP": "NRP",
"TITLE": "O", # not supported
"PREFIX": "O", # not supported
"STREET_ADDRESS": "O", # not supported
"ZIP_CODE": "O", # not supported
"AGE": "O", # not supported
"NRP": "LOCATION",
"NORP": "LOCATION",
"ID": "ID",
"TITLE": "O", # not supported through spaCy
"PREFIX": "O", # not supported through spaCy
"ZIP_CODE": "O", # not supported through spaCy
"AGE": "O", # not supported through spaCy
"O": "O",
}

Просмотреть файл

@ -31,7 +31,7 @@ class SpacyModel(BaseModel):
else:
self.model = model
def predict(self, sample: InputSample) -> List[str]:
def predict(self, sample: InputSample, **kwargs) -> List[str]:
"""
Predict a list of tags for an input sample.
:param sample: InputSample

Просмотреть файл

@ -51,7 +51,7 @@ class StanzaModel(SpacyModel):
entity_mapping=entity_mapping,
)
def predict(self, sample: InputSample) -> List[str]:
def predict(self, sample: InputSample, **kwargs) -> List[str]:
"""
Predict the tags using a stanza model.

Просмотреть файл

@ -48,8 +48,7 @@ class TextAnalyticsWrapper(BaseModel):
)
return text_analytics_client
def predict(self, sample: InputSample) -> List[str]:
def predict(self, sample: InputSample, **kwargs) -> List[str]:
documents = [sample.full_text]
response = self.ta_client.recognize_pii_entities(documents,
language="en")

Просмотреть файл

@ -4,26 +4,23 @@ version = "0.1.0"
description = ""
authors = ["Omri Mendels <omri374@users.noreply.github.com>"]
readme = "README.md"
include = [{ path= "presidio_evaluator/data_generator/raw_data/*"}]
[tool.poetry.dependencies]
python = "^3.9"
spacy = ">=3.2.0, <4.0.0"
numpy = ">=1.20.2,<2.0.0"
jupyter = ">=1"
pandas = ">=1.2.4,<2.0.0"
tqdm = ">=4.60.0,<5.0.0"
haikunator = ">=2.1.0,<3.0.0"
schwifty = ">=2023.11.2,<2024.0.0"
faker = ">=9.6.0,<10.0.0"
scikit-learn = ">1.3.2,<2.0.0"
pytest = ">=6.2.3"
spacy = "^3.5.0"
numpy = "^1.22"
pandas = "^2.1.4"
tqdm = "^4.60.0"
faker = "^21.0"
scikit-learn = "^1.3.2"
presidio-analyzer = "^2.2.351"
presidio-anonymizer = "^2.2.351"
requests = ">=2.25.1"
xmltodict = ">=0.12.0"
requests = "^2.25"
xmltodict = "^0.12.0"
python-dotenv = "^1.0.0"
plotly = "^5.18.0"
azure-ai-textanalytics = ">=5.3.0"
azure-ai-textanalytics = "^5.3.0"
en_core_web_sm = {url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz"}
en_core_web_lg = {url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1.tar.gz"}

Просмотреть файл

@ -1,54 +1,57 @@
from setuptools import setup, find_packages
import os.path
# read the contents of the README file
# -*- coding: utf-8 -*-
from setuptools import setup
import os
from os import path
this_directory = path.abspath(path.dirname(__file__))
with open(path.join(this_directory, "README.md"), encoding="utf-8") as f:
long_description = f.read()
# print(long_description)
with open(os.path.join(this_directory, "VERSION")) as version_file:
__version__ = version_file.read().strip()
version = version_file.read().strip()
packages = [
"presidio_evaluator",
"presidio_evaluator.data_generator",
"presidio_evaluator.data_generator.faker_extensions",
"presidio_evaluator.dataset_formatters",
"presidio_evaluator.evaluation",
"presidio_evaluator.experiment_tracking",
"presidio_evaluator.models",
]
package_data = {"": ["*"], "presidio_evaluator.data_generator": ["raw_data/*"]}
install_requires = [
"azure-ai-textanalytics>=5.3.0,<6.0.0",
"en_core_web_lg @ "
"https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1.tar.gz",
"en_core_web_sm @ "
"https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz",
"faker>=21.0,<22.0",
"numpy>=1.22,<2.0",
"pandas>=2.1.4,<3.0.0",
"plotly>=5.18.0,<6.0.0",
"presidio-analyzer>=2.2.351,<3.0.0",
"presidio-anonymizer>=2.2.351,<3.0.0",
"python-dotenv>=1.0.0,<2.0.0",
"requests>=2.25,<3.0",
"scikit-learn>=1.3.2,<2.0.0",
"spacy>=3.5.0,<4.0.0",
"tqdm>=4.60.0,<5.0.0",
"xmltodict>=0.12.0,<0.13.0",
]
setup(
name="presidio-evaluator",
long_description=long_description,
long_description_content_type="text/markdown",
version=__version__,
packages=find_packages(exclude=["tests"]),
url="https://www.github.com/microsoft/presidio-research",
version=version,
license="MIT",
description="PII dataset generator, model evaluator for Presidio and PII data in general", # noqa
data_files=[
(
"presidio_evaluator/data_generator/raw_data",
[
"presidio_evaluator/data_generator/raw_data/FakeNameGenerator.com_3000.csv", # noqa
"presidio_evaluator/data_generator/raw_data/templates.txt",
"presidio_evaluator/data_generator/raw_data/companies_and_organizations.csv",
"presidio_evaluator/data_generator/raw_data/nationalities.csv",
"presidio_evaluator/data_generator/raw_data/us_driver_licenses.csv",
],
)
],
include_package_data=True,
install_requires=[
"presidio_analyzer",
"presidio_anonymizer",
"spacy>=3.0.0",
"requests",
"numpy",
"pandas",
"tqdm>=4.32.1",
"jupyter>=1.0.0",
"pytest>=4.6.2",
"haikunator",
"schwifty",
"faker",
"sklearn_crfsuite",
"python-dotenv",
"azure-ai-textanalytics==5.2.0"
],
)
packages=packages,
package_data=package_data,
install_requires=install_requires,
python_requires=">=3.8,<4.0",
)

Просмотреть файл

@ -345,3 +345,22 @@ def test_align_entity_types_wrong_mapping_exception():
Evaluator.align_entity_types(
input_samples=[sample1], entities_mapping=entities_mapping
)
def test_skip_words_are_not_counted_as_errors():
    """A skip-word ('street') falsely predicted as LOCATION must not hurt scores."""
    predicted_tags = ["U-PERSON", "O", "O", "O", "U-LOCATION"]
    mock_model = MockTokensModel(
        prediction=predicted_tags, entities_to_keep=["LOCATION", "PERSON"]
    )
    evaluator = Evaluator(model=mock_model)

    sample = InputSample(
        full_text="John is on the street", masked="I am the street", spans=None
    )
    sample.tokens = ["John", "is", "on", "the", "street"]
    sample.tags = ["U-PERSON", "O", "O", "O", "O"]

    final_evaluation = evaluator.calculate_score(
        [evaluator.evaluate_sample(sample, predicted_tags)]
    )

    # The mismatch on "street" is ignored, so both metrics stay perfect.
    assert final_evaluation.pii_precision == 1
    assert final_evaluation.pii_recall == 1

Просмотреть файл

@ -30,7 +30,7 @@ def fake_faker():
],
# fmt: on
)
def test_presidio_psudonymize_two_entities(
def test_presidio_pseudonymize_two_entities(
text, entity1, entity2, start1, end1, start2, end2, value1, value2, fake_faker
):
@ -51,3 +51,15 @@ def test_presidio_psudonymize_two_entities(
assert value2 in pseudonym
assert text[:start1].lower() in pseudonym.lower()
assert text[end1:start2].lower() in pseudonym.lower()
def test_simple_scenario():
    """Smoke test: pseudonymizing a text with PERSON and URL entities runs cleanly."""
    original_text = "Hi my name is Doug Funny and this is my website: https://www.dougf.io"  # noqa
    detected_entities = [
        RecognizerResult(entity_type="PERSON", start=14, end=24, score=0.85),
        RecognizerResult(entity_type="URL", start=49, end=69, score=0.95),
    ]
    pseudonymizer = PresidioPseudonymization()
    pseudonymizer.pseudonymize(
        original_text=original_text,
        presidio_response=detected_entities,
        count=5,
    )