refresh to various things
This commit is contained in:
Родитель
793fb99b77
Коммит
cffb49e7ce
|
@ -72,7 +72,7 @@ Furthermore, it tokenizes the data, creates tags (either IO/BIO/BILUO) and spans
|
|||
|
||||
Once data is generated, it can be split into train/test/validation sets
|
||||
while ensuring that each template only exists in one set.
|
||||
See [this notebook for more details](notebooks/3_Split_by_pattern_%23.ipynb).
|
||||
See [this notebook for more details](notebooks/3_Split_by_pattern_number.ipynb).
|
||||
|
||||
## 2. Data representation
|
||||
|
||||
|
|
2
VERSION
2
VERSION
|
@ -1 +1 @@
|
|||
0.1.2
|
||||
0.1.3
|
||||
|
|
|
@ -2,8 +2,10 @@
|
|||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"execution_count": 1,
|
||||
"metadata": {
|
||||
"is_executing": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# install presidio via pip if not yet installed\n",
|
||||
|
@ -14,8 +16,9 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 2,
|
||||
"metadata": {
|
||||
"is_executing": true,
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
|
@ -81,9 +84,34 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"execution_count": 3,
|
||||
"metadata": {
|
||||
"is_executing": true
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Sampling: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 9149.88it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"My name is Joshua Jackson\n",
|
||||
"[{\"value\": \"Joshua Jackson\", \"start\": 11, \"end\": 25, \"type\": \"name\"}]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"sentence_templates = [\n",
|
||||
" \"My name is {{name}}\",\n",
|
||||
|
@ -126,8 +154,9 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 4,
|
||||
"metadata": {
|
||||
"is_executing": true,
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
|
@ -165,13 +194,228 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 5,
|
||||
"metadata": {
|
||||
"is_executing": true,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>number</th>\n",
|
||||
" <th>gender</th>\n",
|
||||
" <th>nationality</th>\n",
|
||||
" <th>prefix</th>\n",
|
||||
" <th>first_name</th>\n",
|
||||
" <th>middle_initial</th>\n",
|
||||
" <th>last_name</th>\n",
|
||||
" <th>street_name</th>\n",
|
||||
" <th>city</th>\n",
|
||||
" <th>state_abbr</th>\n",
|
||||
" <th>...</th>\n",
|
||||
" <th>company</th>\n",
|
||||
" <th>domain_name</th>\n",
|
||||
" <th>person</th>\n",
|
||||
" <th>name</th>\n",
|
||||
" <th>first_name_female</th>\n",
|
||||
" <th>first_name_male</th>\n",
|
||||
" <th>prefix_female</th>\n",
|
||||
" <th>prefix_male</th>\n",
|
||||
" <th>last_name_female</th>\n",
|
||||
" <th>last_name_male</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>female</td>\n",
|
||||
" <td>Czech</td>\n",
|
||||
" <td>Mrs.</td>\n",
|
||||
" <td>Marie</td>\n",
|
||||
" <td>J</td>\n",
|
||||
" <td>Hamanová</td>\n",
|
||||
" <td>P.O. Box 255</td>\n",
|
||||
" <td>Kangerlussuaq</td>\n",
|
||||
" <td>QE</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>Simple Solutions</td>\n",
|
||||
" <td>MarathonDancing.gl</td>\n",
|
||||
" <td>Marie J Hamanová</td>\n",
|
||||
" <td>Marie J Hamanová</td>\n",
|
||||
" <td>Marie</td>\n",
|
||||
" <td></td>\n",
|
||||
" <td>Mrs.</td>\n",
|
||||
" <td></td>\n",
|
||||
" <td>Hamanová</td>\n",
|
||||
" <td></td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>female</td>\n",
|
||||
" <td>French</td>\n",
|
||||
" <td>Ms.</td>\n",
|
||||
" <td>Patricia</td>\n",
|
||||
" <td>G</td>\n",
|
||||
" <td>Desrosiers</td>\n",
|
||||
" <td>Avenida Noruega 42</td>\n",
|
||||
" <td>Vila Real</td>\n",
|
||||
" <td>VR</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>Formula Gray</td>\n",
|
||||
" <td>LostMillions.com.pt</td>\n",
|
||||
" <td>Patricia Desrosiers</td>\n",
|
||||
" <td>Patricia Desrosiers</td>\n",
|
||||
" <td>Patricia</td>\n",
|
||||
" <td></td>\n",
|
||||
" <td>Ms.</td>\n",
|
||||
" <td></td>\n",
|
||||
" <td>Desrosiers</td>\n",
|
||||
" <td></td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>female</td>\n",
|
||||
" <td>American</td>\n",
|
||||
" <td>Ms.</td>\n",
|
||||
" <td>Debra</td>\n",
|
||||
" <td>O</td>\n",
|
||||
" <td>Neal</td>\n",
|
||||
" <td>1659 Hoog St</td>\n",
|
||||
" <td>Brakpan</td>\n",
|
||||
" <td>GA</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>Dahlkemper's</td>\n",
|
||||
" <td>MediumTube.co.za</td>\n",
|
||||
" <td>Debra O Neal</td>\n",
|
||||
" <td>Debra O Neal</td>\n",
|
||||
" <td>Debra</td>\n",
|
||||
" <td></td>\n",
|
||||
" <td>Ms.</td>\n",
|
||||
" <td></td>\n",
|
||||
" <td>Neal</td>\n",
|
||||
" <td></td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>4</td>\n",
|
||||
" <td>male</td>\n",
|
||||
" <td>French</td>\n",
|
||||
" <td>Mr.</td>\n",
|
||||
" <td>Peverell</td>\n",
|
||||
" <td>C</td>\n",
|
||||
" <td>Racine</td>\n",
|
||||
" <td>183 Epimenidou Street</td>\n",
|
||||
" <td>Limassol</td>\n",
|
||||
" <td>LI</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>Quickbiz</td>\n",
|
||||
" <td>ImproveLook.com.cy</td>\n",
|
||||
" <td>Peverell Racine</td>\n",
|
||||
" <td>Peverell Racine</td>\n",
|
||||
" <td></td>\n",
|
||||
" <td>Peverell</td>\n",
|
||||
" <td></td>\n",
|
||||
" <td>Mr.</td>\n",
|
||||
" <td></td>\n",
|
||||
" <td>Racine</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>5</td>\n",
|
||||
" <td>female</td>\n",
|
||||
" <td>Slovenian</td>\n",
|
||||
" <td>Mrs.</td>\n",
|
||||
" <td>Iolanda</td>\n",
|
||||
" <td>S</td>\n",
|
||||
" <td>Tratnik</td>\n",
|
||||
" <td>Karu põik 61</td>\n",
|
||||
" <td>Pärnu</td>\n",
|
||||
" <td>PR</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>Dubrow's Cafeteria</td>\n",
|
||||
" <td>PostTan.com.ee</td>\n",
|
||||
" <td>Iolanda Tratnik</td>\n",
|
||||
" <td>Iolanda Tratnik</td>\n",
|
||||
" <td>Iolanda</td>\n",
|
||||
" <td></td>\n",
|
||||
" <td>Mrs.</td>\n",
|
||||
" <td></td>\n",
|
||||
" <td>Tratnik</td>\n",
|
||||
" <td></td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<p>5 rows × 37 columns</p>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" number gender nationality prefix first_name middle_initial last_name \\\n",
|
||||
"0 1 female Czech Mrs. Marie J Hamanová \n",
|
||||
"1 2 female French Ms. Patricia G Desrosiers \n",
|
||||
"2 3 female American Ms. Debra O Neal \n",
|
||||
"3 4 male French Mr. Peverell C Racine \n",
|
||||
"4 5 female Slovenian Mrs. Iolanda S Tratnik \n",
|
||||
"\n",
|
||||
" street_name city state_abbr ... company \\\n",
|
||||
"0 P.O. Box 255 Kangerlussuaq QE ... Simple Solutions \n",
|
||||
"1 Avenida Noruega 42 Vila Real VR ... Formula Gray \n",
|
||||
"2 1659 Hoog St Brakpan GA ... Dahlkemper's \n",
|
||||
"3 183 Epimenidou Street Limassol LI ... Quickbiz \n",
|
||||
"4 Karu põik 61 Pärnu PR ... Dubrow's Cafeteria \n",
|
||||
"\n",
|
||||
" domain_name person name \\\n",
|
||||
"0 MarathonDancing.gl Marie J Hamanová Marie J Hamanová \n",
|
||||
"1 LostMillions.com.pt Patricia Desrosiers Patricia Desrosiers \n",
|
||||
"2 MediumTube.co.za Debra O Neal Debra O Neal \n",
|
||||
"3 ImproveLook.com.cy Peverell Racine Peverell Racine \n",
|
||||
"4 PostTan.com.ee Iolanda Tratnik Iolanda Tratnik \n",
|
||||
"\n",
|
||||
" first_name_female first_name_male prefix_female prefix_male \\\n",
|
||||
"0 Marie Mrs. \n",
|
||||
"1 Patricia Ms. \n",
|
||||
"2 Debra Ms. \n",
|
||||
"3 Peverell Mr. \n",
|
||||
"4 Iolanda Mrs. \n",
|
||||
"\n",
|
||||
" last_name_female last_name_male \n",
|
||||
"0 Hamanová \n",
|
||||
"1 Desrosiers \n",
|
||||
"2 Neal \n",
|
||||
"3 Racine \n",
|
||||
"4 Tratnik \n",
|
||||
"\n",
|
||||
"[5 rows x 37 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Read FakeNameGenerator CSV\n",
|
||||
"fake_name_generator_df = pd.read_csv(fake_name_generator_file)\n",
|
||||
|
@ -190,8 +434,9 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 6,
|
||||
"metadata": {
|
||||
"is_executing": true,
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
|
@ -209,8 +454,10 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"execution_count": 7,
|
||||
"metadata": {
|
||||
"is_executing": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"fake.add_provider(IpAddressProvider) # Both IPv4 and IPv6 IP addresses\n",
|
||||
|
@ -235,8 +482,9 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 8,
|
||||
"metadata": {
|
||||
"is_executing": true,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
|
@ -270,13 +518,36 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 9,
|
||||
"metadata": {
|
||||
"is_executing": true,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Sampling: 100%|███████████████████████████████████████████████████████████████████████████████████████████| 1500/1500 [00:00<00:00, 17987.56it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{\"fake\": \"Title VII of the Civil Rights Act of 2005 protects individuals against employment discrimination on the basis of race and color as well as national origin, sex, or religion.\\n\", \"spans\": [{\"value\": \"2005\", \"start\": 37, \"end\": 41, \"type\": \"year\"}], \"template\": \"Title VII of the Civil Rights Act of {{year}} protects individuals against employment discrimination on the basis of race and color as well as national origin, sex, or religion.\\n\", \"template_id\": 190}\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"sentence_templates = PresidioDataGenerator.read_template_file(templates_file_path)\n",
|
||||
"fake_records = data_generator.generate_fake_data(\n",
|
||||
|
@ -296,11 +567,23 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 10,
|
||||
"metadata": {
|
||||
"is_executing": true,
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Total: 1500\n",
|
||||
"Avg # of records per template: 7.142857142857143\n",
|
||||
"Median # of records per template: 7.0\n",
|
||||
"Std: 2.5872528966106905\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"count_per_template_id = Counter([sample.template_id for sample in fake_records])\n",
|
||||
"\n",
|
||||
|
@ -323,13 +606,65 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 11,
|
||||
"metadata": {
|
||||
"is_executing": true,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Counter({'organization': 257,\n",
|
||||
" 'first_name': 244,\n",
|
||||
" 'person': 238,\n",
|
||||
" 'city': 235,\n",
|
||||
" 'address': 209,\n",
|
||||
" 'street_name': 164,\n",
|
||||
" 'name': 162,\n",
|
||||
" 'country': 154,\n",
|
||||
" 'credit_card_number': 152,\n",
|
||||
" 'phone_number': 121,\n",
|
||||
" 'last_name': 119,\n",
|
||||
" 'building_number': 110,\n",
|
||||
" 'age': 72,\n",
|
||||
" 'secondary_address': 64,\n",
|
||||
" 'year': 58,\n",
|
||||
" 'nationality': 55,\n",
|
||||
" 'postcode': 49,\n",
|
||||
" 'zipcode': 45,\n",
|
||||
" 'url': 39,\n",
|
||||
" 'email': 39,\n",
|
||||
" 'name_female': 37,\n",
|
||||
" 'job': 33,\n",
|
||||
" 'first_name_male': 31,\n",
|
||||
" 'name_male': 29,\n",
|
||||
" 'prefix_male': 28,\n",
|
||||
" 'date_of_birth': 24,\n",
|
||||
" 'iban': 22,\n",
|
||||
" 'date_time': 21,\n",
|
||||
" 'prefix_female': 21,\n",
|
||||
" 'day_of_week': 16,\n",
|
||||
" 'state_abbr': 15,\n",
|
||||
" 'last_name_male': 15,\n",
|
||||
" 'prefix': 12,\n",
|
||||
" 'ip_address': 11,\n",
|
||||
" 'ssn': 11,\n",
|
||||
" 'nation_plural': 9,\n",
|
||||
" 'nation_woman': 8,\n",
|
||||
" 'first_name_nonbinary': 6,\n",
|
||||
" 'us_driver_license': 6,\n",
|
||||
" 'first_name_female': 3,\n",
|
||||
" 'last_name_female': 3})"
|
||||
]
|
||||
},
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"count_per_entity = Counter()\n",
|
||||
"for record in fake_records:\n",
|
||||
|
@ -351,8 +686,9 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 12,
|
||||
"metadata": {
|
||||
"is_executing": true,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
|
@ -421,9 +757,22 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"execution_count": 13,
|
||||
"metadata": {
|
||||
"is_executing": true
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{\"fake\": \"Title VII of the Civil Rights Act of 2005 protects individuals against employment discrimination on the basis of race and color as well as national origin, sex, or religion.\\n\", \"spans\": [{\"value\": \"2005\", \"start\": 37, \"end\": 41, \"type\": \"DATE_TIME\"}], \"template\": \"Title VII of the Civil Rights Act of {{DATE_TIME}} protects individuals against employment discrimination on the basis of race and color as well as national origin, sex, or religion.\\n\", \"template_id\": 190}"
|
||||
]
|
||||
},
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"fake_records[0]"
|
||||
]
|
||||
|
@ -437,13 +786,41 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 14,
|
||||
"metadata": {
|
||||
"is_executing": true,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[('PERSON', 887),\n",
|
||||
" ('STREET_ADDRESS', 596),\n",
|
||||
" ('GPE', 404),\n",
|
||||
" ('ORGANIZATION', 257),\n",
|
||||
" ('CREDIT_CARD', 152),\n",
|
||||
" ('PHONE_NUMBER', 121),\n",
|
||||
" ('DATE_TIME', 119),\n",
|
||||
" ('TITLE', 94),\n",
|
||||
" ('NRP', 72),\n",
|
||||
" ('AGE', 72),\n",
|
||||
" ('ZIP_CODE', 45),\n",
|
||||
" ('DOMAIN_NAME', 39),\n",
|
||||
" ('EMAIL_ADDRESS', 39),\n",
|
||||
" ('IBAN_CODE', 22),\n",
|
||||
" ('IP_ADDRESS', 11),\n",
|
||||
" ('US_SSN', 11),\n",
|
||||
" ('US_DRIVER_LICENSE', 6)]"
|
||||
]
|
||||
},
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"\n",
|
||||
"count_per_entity_new = Counter()\n",
|
||||
|
@ -463,13 +840,51 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 15,
|
||||
"metadata": {
|
||||
"is_executing": true,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" 0%| | 0/1500 [00:00<?, ?it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"loading model en_core_web_sm\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 1500/1500 [00:06<00:00, 215.70it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"CPU times: user 6.76 s, sys: 33.8 ms, total: 6.8 s\n",
|
||||
"Wall time: 6.96 s\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"%%time\n",
|
||||
"input_samples = [\n",
|
||||
|
@ -491,8 +906,9 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 16,
|
||||
"metadata": {
|
||||
"is_executing": true,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
|
@ -515,21 +931,31 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 17,
|
||||
"metadata": {
|
||||
"is_executing": true,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 1500/1500 [00:00<00:00, 76888.23it/s]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"conll = InputSample.create_conll_dataset(input_samples)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 18,
|
||||
"metadata": {
|
||||
"is_executing": true,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
|
@ -546,7 +972,7 @@
|
|||
"### Next steps\n",
|
||||
"\n",
|
||||
"- Evaluate Presidio using this fake data. [Sample](4_Evaluate_Presidio_Analyzer.ipynb)\n",
|
||||
"- Split to train/test/validation while ensuring sentences originiating from the same template are all on the same subset. [Sample](3_Split_by_pattern_#.ipynb)\n",
|
||||
"- Split to train/test/validation while ensuring sentences originating from the same template are all on the same subset. [Sample](3_Split_by_pattern_number.ipynb)\n",
|
||||
"- Conduct a small exploratory data analysis on the generated data. [Sample](2_PII_EDA.ipynb)"
|
||||
]
|
||||
},
|
||||
|
@ -569,9 +995,9 @@
|
|||
"hash": "2509fbe9adc3579fd0ef23e6a2c6fb50cb745caa174aafdf017283479e60bc43"
|
||||
},
|
||||
"kernelspec": {
|
||||
"display_name": "presidio",
|
||||
"display_name": "presidio-evaluator",
|
||||
"language": "python",
|
||||
"name": "presidio"
|
||||
"name": "presidio-evaluator"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
@ -583,9 +1009,9 @@
|
|||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.12"
|
||||
"version": "3.9.18"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
|
|
|
@ -72,7 +72,7 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"for (name, series) in pii_df.iteritems():\n",
|
||||
"for (name, series) in pii_df.items():\n",
|
||||
" print(name)\n",
|
||||
" print(\"Unique values: {}\".format(len(series.unique())))\n",
|
||||
" print(series.value_counts())\n",
|
||||
|
@ -123,7 +123,7 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"series_to_wordcloud(pii_df.country_full)"
|
||||
"series_to_wordcloud(pii_df.country)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -187,9 +187,9 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"countries = [get_entity_values_from_sample(sample, [\"LOCATION\"]) for sample in synth]\n",
|
||||
"countries = [get_entity_values_from_sample(sample, [\"TITLE\"]) for sample in synth]\n",
|
||||
"countries = [item for sublist in countries for item in sublist]\n",
|
||||
"series_to_wordcloud(pd.Series(countries, name=\"LOCATION\"))"
|
||||
"series_to_wordcloud(pd.Series(countries, name=\"TITLE\"))"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -213,9 +213,9 @@
|
|||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "presidio",
|
||||
"display_name": "presidio-evaluator",
|
||||
"language": "python",
|
||||
"name": "presidio"
|
||||
"name": "presidio-evaluator"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
@ -227,9 +227,9 @@
|
|||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.12"
|
||||
"version": "3.9.18"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
|
|
|
@ -143,13 +143,6 @@
|
|||
"assert len(train) + len(test) + len(validation) == len(all_samples)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
|
@ -160,9 +153,9 @@
|
|||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "presidio",
|
||||
"display_name": "presidio-evaluator",
|
||||
"language": "python",
|
||||
"name": "presidio"
|
||||
"name": "presidio-evaluator"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
@ -174,9 +167,9 @@
|
|||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.12"
|
||||
"version": "3.9.18"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
"nbformat_minor": 4
|
||||
}
|
|
@ -5,7 +5,7 @@
|
|||
"id": "847acd88",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Evaluate Presidio Analyzer using the Presidio Evaluator framework"
|
||||
"# Evaluate Presidio Analyzer using the Presidio Evaluator framework"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -17,7 +17,8 @@
|
|||
"source": [
|
||||
"# install presidio via pip if not yet installed\n",
|
||||
"\n",
|
||||
"#!pip install presidio-analyzer\n",
|
||||
"#!pip install presidio-evaluator\n",
|
||||
"#!pip install \"presidio-analyzer[transformers]\"\n",
|
||||
"#!pip install presidio-evaluator"
|
||||
]
|
||||
},
|
||||
|
@ -32,6 +33,10 @@
|
|||
"from copy import deepcopy\n",
|
||||
"from pprint import pprint\n",
|
||||
"from collections import Counter\n",
|
||||
"from typing import List\n",
|
||||
"\n",
|
||||
"import warnings\n",
|
||||
"warnings.filterwarnings('ignore')\n",
|
||||
"\n",
|
||||
"from presidio_evaluator import InputSample\n",
|
||||
"from presidio_evaluator.evaluation import Evaluator, ModelError\n",
|
||||
|
@ -45,7 +50,8 @@
|
|||
"pd.set_option(\"display.max_colwidth\", None)\n",
|
||||
"\n",
|
||||
"%reload_ext autoreload\n",
|
||||
"%autoreload 2"
|
||||
"%autoreload 2\n",
|
||||
"%matplotlib inline"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -65,6 +71,9 @@
|
|||
"source": [
|
||||
"dataset_name = \"synth_dataset_v2.json\"\n",
|
||||
"dataset = InputSample.read_dataset_json(Path(Path.cwd().parent, \"data\", dataset_name))\n",
|
||||
"\n",
|
||||
"dataset = dataset[:300] # top 300 samples\n",
|
||||
"\n",
|
||||
"print(len(dataset))"
|
||||
]
|
||||
},
|
||||
|
@ -75,10 +84,12 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"entity_counter = Counter()\n",
|
||||
"for sample in dataset:\n",
|
||||
" for tag in sample.tags:\n",
|
||||
" entity_counter[tag] += 1"
|
||||
"def get_entity_counts(dataset:List[InputSample]):\n",
|
||||
" entity_counter = Counter()\n",
|
||||
" for sample in dataset:\n",
|
||||
" for tag in sample.tags:\n",
|
||||
" entity_counter[tag] += 1\n",
|
||||
" return entity_counter\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -89,7 +100,7 @@
|
|||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"Count per entity:\")\n",
|
||||
"pprint(entity_counter.most_common())\n",
|
||||
"pprint(get_entity_counts(dataset).most_common())\n",
|
||||
"\n",
|
||||
"print(\"\\nExample sentence:\")\n",
|
||||
"print(dataset[1])\n",
|
||||
|
@ -107,12 +118,121 @@
|
|||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "9c5e16cb-bee8-4f0a-a543-4879daa35b9e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Define the AnalyzerEngine object \n",
|
||||
"In this case, using a huggingface model: obi/deid_roberta_i2b2"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "313b508f-e901-40b9-b575-c7fb8a794652",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from presidio_analyzer import AnalyzerEngine\n",
|
||||
"from presidio_analyzer.nlp_engine import TransformersNlpEngine, NerModelConfiguration\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Here we define a transformers based NLP engine, \n",
|
||||
"# but you can use this cell to customize your Presidio Analyzer instance\n",
|
||||
"\n",
|
||||
"# Define which model to use\n",
|
||||
"model_config = [{\"lang_code\": \"en\", \"model_name\": {\n",
|
||||
" \"spacy\": \"en_core_web_sm\", # use a small spaCy model for lemmas, tokens etc.\n",
|
||||
" \"transformers\": \"obi/deid_roberta_i2b2\"\n",
|
||||
" }\n",
|
||||
"}]\n",
|
||||
"\n",
|
||||
"# Map transformers model labels to Presidio's\n",
|
||||
"model_to_presidio_entity_mapping = dict(\n",
|
||||
" PER=\"PERSON\",\n",
|
||||
" PERSON=\"PERSON\",\n",
|
||||
" LOC= \"LOCATION\",\n",
|
||||
" LOCATION= \"LOCATION\",\n",
|
||||
" GPE=\"LOCATION\",\n",
|
||||
" ORG=\"ORGANIZATION\",\n",
|
||||
" ORGANIZATION=\"ORGANIZATION\",\n",
|
||||
" NORP=\"NRP\",\n",
|
||||
" AGE=\"AGE\",\n",
|
||||
" ID=\"ID\",\n",
|
||||
" EMAIL=\"EMAIL\",\n",
|
||||
" PATIENT=\"PERSON\",\n",
|
||||
" STAFF=\"PERSON\",\n",
|
||||
" HOSP=\"ORGANIZATION\",\n",
|
||||
" PATORG=\"ORGANIZATION\",\n",
|
||||
" DATE=\"DATE_TIME\",\n",
|
||||
" TIME=\"DATE_TIME\",\n",
|
||||
" PHONE=\"PHONE_NUMBER\",\n",
|
||||
" HCW=\"PERSON\",\n",
|
||||
" HOSPITAL=\"ORGANIZATION\",\n",
|
||||
" FACILITY=\"LOCATION\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"ner_model_configuration = NerModelConfiguration(labels_to_ignore = [\"O\"], \n",
|
||||
" model_to_presidio_entity_mapping=model_to_presidio_entity_mapping)\n",
|
||||
"\n",
|
||||
"nlp_engine = TransformersNlpEngine(models=model_config,\n",
|
||||
" ner_model_configuration=ner_model_configuration)\n",
|
||||
"\n",
|
||||
"# Set up the engine, loads the NLP module (spaCy model by default) \n",
|
||||
"# and other PII recognizers\n",
|
||||
"analyzer_engine = AnalyzerEngine(nlp_engine=nlp_engine)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "aae4c379",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Run evaluation:"
|
||||
"### Run evaluation"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "16dbf6d6-a554-4602-8907-589786d47a12",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Define experiment"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "29d39ff1-4f14-4e32-ae84-ecc6c739f829",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"experiment = get_experiment_tracker()\n",
|
||||
"model = PresidioAnalyzerWrapper(analyzer_engine)\n",
|
||||
"\n",
|
||||
"# Define evaluator and experiment tracking\n",
|
||||
"\n",
|
||||
"evaluator = Evaluator(model=model)\n",
|
||||
"dataset = Evaluator.align_entity_types(\n",
|
||||
" deepcopy(dataset), entities_mapping=PresidioAnalyzerWrapper.presidio_entities_map\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print(\"Count per entity after alignment:\")\n",
|
||||
"pprint(get_entity_counts(dataset).most_common())\n",
|
||||
"\n",
|
||||
"# Track model and dataset params\n",
|
||||
"params = {\"dataset_name\": dataset_name, \"model_name\": model.name}\n",
|
||||
"params.update(model.to_log())\n",
|
||||
"experiment.log_parameters(params)\n",
|
||||
"experiment.log_dataset_hash(dataset)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2a7d6626-d094-4dfd-8f37-c0443edf00dc",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Run experiment"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -122,39 +242,37 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"Evaluating Presidio Analyzer\")\n",
|
||||
"\n",
|
||||
"experiment = get_experiment_tracker()\n",
|
||||
"model_name = \"Presidio Analyzer\"\n",
|
||||
"model = PresidioAnalyzerWrapper()\n",
|
||||
"\n",
|
||||
"evaluator = Evaluator(model=model)\n",
|
||||
"dataset = Evaluator.align_entity_types(\n",
|
||||
" deepcopy(dataset), entities_mapping=PresidioAnalyzerWrapper.presidio_entities_map\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# Run experiment\n",
|
||||
"evaluation_results = evaluator.evaluate_all(dataset)\n",
|
||||
"results = evaluator.calculate_score(evaluation_results)\n",
|
||||
"\n",
|
||||
"# update params tracking\n",
|
||||
"params = {\"dataset_name\": dataset_name, \"model_name\": model_name}\n",
|
||||
"params.update(model.to_log())\n",
|
||||
"experiment.log_parameters(params)\n",
|
||||
"experiment.log_dataset_hash(dataset)\n",
|
||||
"# Track experiment results\n",
|
||||
"experiment.log_metrics(results.to_log())\n",
|
||||
"entities, confmatrix = results.to_confusion_matrix()\n",
|
||||
"experiment.log_confusion_matrix(matrix=confmatrix, labels=entities)\n",
|
||||
"experiment.log_confusion_matrix(matrix=confmatrix, \n",
|
||||
" labels=entities)\n",
|
||||
"\n",
|
||||
"print(\"Confusion matrix:\")\n",
|
||||
"print(pd.DataFrame(confmatrix, columns=entities, index=entities))\n",
|
||||
"\n",
|
||||
"print(\"Precision and recall\")\n",
|
||||
"print(results)\n",
|
||||
"# Plot output\n",
|
||||
"plotter = evaluator.Plotter(model=model, \n",
|
||||
" results=results, \n",
|
||||
" output_folder = \".\", \n",
|
||||
" model_name = model.name, \n",
|
||||
" beta = 2)\n",
|
||||
"\n",
|
||||
"# end experiment\n",
|
||||
"experiment.end()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "5b4d662d-596c-4a69-b3c9-1edcda20cc5b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"plotter.plot_scores()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "070f8287",
|
||||
|
@ -198,7 +316,7 @@
|
|||
"id": "98f4802e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"1. Most false positive tokens:"
|
||||
"1. Most common false positive tokens:"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -219,7 +337,7 @@
|
|||
"outputs": [],
|
||||
"source": [
|
||||
"fps_df = ModelError.get_fps_dataframe(errors, entity=[\"LOCATION\"])\n",
|
||||
"fps_df[[\"full_text\", \"token\", \"prediction\"]]"
|
||||
"fps_df[[\"full_text\", \"token\", \"annotation\", \"prediction\"]]"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -227,7 +345,7 @@
|
|||
"id": "d0852513",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"2. False negative examples"
|
||||
"2. Most common false negative examples"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -237,7 +355,7 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ModelError.most_common_fn_tokens(errors, n=50, entity=[\"PERSON\"])"
|
||||
"ModelError.most_common_fn_tokens(errors, n=50)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -255,7 +373,7 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"fns_df = ModelError.get_fns_dataframe(errors, entity=[\"PHONE_NUMBER\"])"
|
||||
"fns_df = ModelError.get_fns_dataframe(errors, entity=[\"IP_ADDRESS\"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -278,13 +396,21 @@
|
|||
"print(\"All errors:\\n\")\n",
|
||||
"[print(error, \"\\n\") for error in errors]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a67ff38d-0817-4864-9991-b3eb1f80eecc",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "presidio",
|
||||
"display_name": "presidio-evaluator",
|
||||
"language": "python",
|
||||
"name": "presidio"
|
||||
"name": "presidio-evaluator"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
@ -296,7 +422,7 @@
|
|||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.12"
|
||||
"version": "3.9.18"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
|
|
@ -88,7 +88,7 @@
|
|||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[type: DOMAIN_NAME, start: 57, end: 69, score: 1.0,\n",
|
||||
"[type: URL, start: 49, end: 69, score: 0.95,\n",
|
||||
" type: PERSON, start: 14, end: 24, score: 0.85]"
|
||||
]
|
||||
},
|
||||
|
@ -116,11 +116,11 @@
|
|||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['Hi my name is Albert Cohen and this is my website: https://http://chapman-downs.info/',\n",
|
||||
" 'Hi my name is Lisa Miller and this is my website: https://http://benson.org/',\n",
|
||||
" 'Hi my name is Kathleen Hale and this is my website: https://http://www.garcia.com/',\n",
|
||||
" 'Hi my name is Michelle Frederick and this is my website: https://https://robinson.com/',\n",
|
||||
" 'Hi my name is Alicia Santana and this is my website: https://https://www.ray.org/']"
|
||||
"['Hi my name is Tammy Ryan and this is my website: https://www.cardenas.info/',\n",
|
||||
" 'Hi my name is Jessica Smith and this is my website: http://jones-hunt.info/',\n",
|
||||
" 'Hi my name is Michele Marsh and this is my website: https://guerrero.com/',\n",
|
||||
" 'Hi my name is Kathleen Miller and this is my website: https://lopez.com/',\n",
|
||||
" 'Hi my name is Paul Brown and this is my website: http://www.banks-evans.info/']"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
|
@ -153,11 +153,11 @@
|
|||
"-------------\n",
|
||||
"Fake examples:\n",
|
||||
"\n",
|
||||
"Our son R2D2 used to work in Botswana\n",
|
||||
"Our son R2D2 used to work in American Samoa\n",
|
||||
"Our son R2D2 used to work in Malawi\n",
|
||||
"Our son R2D2 used to work in Montenegro\n",
|
||||
"our son r2d2 used to work in lebanon\n"
|
||||
"Our son R2D2 used to work in Nigeria\n",
|
||||
"Our son R2D2 used to work in Guam\n",
|
||||
"Our son R2D2 used to work in Reunion\n",
|
||||
"Our son R2D2 used to work in Vanuatu\n",
|
||||
"Our son R2D2 used to work in Malaysia\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
@ -176,13 +176,20 @@
|
|||
"print(f\"-------------\\nFake examples:\\n\")\n",
|
||||
"print(*fake_samples, sep=\"\\n\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "presidio",
|
||||
"display_name": "presidio-evaluator",
|
||||
"language": "python",
|
||||
"name": "presidio"
|
||||
"name": "presidio-evaluator"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
@ -194,9 +201,9 @@
|
|||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.12"
|
||||
"version": "3.9.18"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 1
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
|
|
|
@ -23,7 +23,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -34,7 +34,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
|
@ -42,55 +42,18 @@
|
|||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"DATA_DATE = \"Dec-19-2021\""
|
||||
"DATA_DATE = \"Dec-27-2023\" # Change to the date when notebook 3 (split to train/test) was ran"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\r",
|
||||
"tokenizing input: 0%| | 0/2122 [00:00<?, ?it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"loading model en_core_web_sm\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"tokenizing input: 100%|███████████████████████████████████████████████████████████| 2122/2122 [00:19<00:00, 109.66it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Read 2122 samples\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data_path = \"../../data/{}_{}.json\"\n",
|
||||
"\n",
|
||||
|
@ -111,17 +74,9 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Kept 1940 samples after removal of non-tagged samples\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"train_tagged = [sample for sample in train_samples if len(sample.spans) > 0]\n",
|
||||
"print(\"Kept {} samples after removal of non-tagged samples\".format(len(train_tagged)))"
|
||||
|
@ -140,45 +95,13 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Entities found in training set:\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'ADDRESS',\n",
|
||||
" 'CREDIT_CARD',\n",
|
||||
" 'DATE_TIME',\n",
|
||||
" 'DOMAIN_NAME',\n",
|
||||
" 'EMAIL_ADDRESS',\n",
|
||||
" 'IBAN_CODE',\n",
|
||||
" 'IP_ADDRESS',\n",
|
||||
" 'LOCATION',\n",
|
||||
" 'O',\n",
|
||||
" 'ORGANIZATION',\n",
|
||||
" 'PERSON',\n",
|
||||
" 'PHONE_NUMBER',\n",
|
||||
" 'PREFIX',\n",
|
||||
" 'TITLE',\n",
|
||||
" 'US_SSN'}"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"Entities found in training set:\")\n",
|
||||
"entities = []\n",
|
||||
|
@ -206,16 +129,7 @@
|
|||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Skipping illegal span None, text=ΜΟΝΗ ΑΓΙΩΝ ΑΝΑΡΓΥΡΩΝ\n",
|
||||
"Skipping illegal span None, text=U.N\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"spacy_train = InputSample.create_spacy_dataset(\n",
|
||||
" dataset=train_tagged, output_path=\"train.spacy\"\n",
|
||||
|
@ -281,9 +195,9 @@
|
|||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "presidio",
|
||||
"display_name": "presidio-evaluator",
|
||||
"language": "python",
|
||||
"name": "presidio"
|
||||
"name": "presidio-evaluator"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
@ -295,9 +209,9 @@
|
|||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.12"
|
||||
"version": "3.9.18"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
|
|
|
@ -39,6 +39,16 @@
|
|||
"%autoreload 2"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "aee00770-a972-4a19-b423-1724214cc88c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#!pip install sklearn_crfsuite"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a0d2d772",
|
||||
|
@ -58,8 +68,9 @@
|
|||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"DATA_DATE = \"Jan-15-2022\"\n",
|
||||
"dataset = InputSample.read_dataset_json(\"../../data/test_{}.json\".format(DATA_DATE))\n",
|
||||
"DATA_DATE = \"Dec-27-2023\" # Date when the split to train/test notebook was ran\n",
|
||||
"dataset_name = \"../../data/test_{}.json\".format(DATA_DATE)\n",
|
||||
"dataset = InputSample.read_dataset_json(dataset_name)\n",
|
||||
"print(len(dataset))"
|
||||
]
|
||||
},
|
||||
|
@ -76,7 +87,7 @@
|
|||
"source": [
|
||||
"entity_counter = Counter()\n",
|
||||
"for sample in dataset:\n",
|
||||
" for t>ag in sample.tags:\n",
|
||||
" for tag in sample.tags:\n",
|
||||
" entity_counter[tag] += 1"
|
||||
]
|
||||
},
|
||||
|
@ -257,7 +268,7 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"fps_df = ModelError.get_fps_dataframe(errors, entity=[\"GPE\"])\n",
|
||||
"fps_df = ModelError.get_fps_dataframe(errors, entity=[\"PERSON\"])\n",
|
||||
"fps_df[[\"full_text\", \"token\", \"prediction\"]]"
|
||||
]
|
||||
},
|
||||
|
@ -276,7 +287,7 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ModelError.most_common_fn_tokens(errors, n=50, entity=[\"PERSON\"])"
|
||||
"ModelError.most_common_fn_tokens(errors, n=50, entity=[\"ORGANIZATION\"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -325,13 +336,21 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "cf3e4646-ca93-44c5-a998-cd77f4bf2708",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "presidio",
|
||||
"display_name": "presidio-evaluator",
|
||||
"language": "python",
|
||||
"name": "presidio"
|
||||
"name": "presidio-evaluator"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
@ -343,9 +362,9 @@
|
|||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.12"
|
||||
"version": "3.9.18"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
}
|
||||
|
|
|
@ -205,7 +205,7 @@
|
|||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.9.13 ('presidio')",
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
|
@ -219,9 +219,8 @@
|
|||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.13"
|
||||
"version": "3.9.18"
|
||||
},
|
||||
"orig_nbformat": 4,
|
||||
"vscode": {
|
||||
"interpreter": {
|
||||
"hash": "371968787ec79dd50357533864944a85029366968470cac36beb694745c2f7d6"
|
||||
|
@ -229,5 +228,5 @@
|
|||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
|
|
|
@ -35,6 +35,16 @@
|
|||
"%autoreload 2"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a0c3285c-06a2-4361-aec2-8375496f75b3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#!pip install flair"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "f036de59",
|
||||
|
@ -111,15 +121,14 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"flair_ner = \"ner-english\"\n",
|
||||
"flair_ner_fast = \"ner-english-fast\"\n",
|
||||
"flair_ontonotes_fast = \"ner-english-ontonotes-fast\"\n",
|
||||
"flair_ontonotes_large = \"ner-english-ontonotes-large\"\n",
|
||||
"flair_ner = \"flair/ner-english\"\n",
|
||||
"flair_ner_fast = \"flair/ner-english-fast\"\n",
|
||||
"flair_ontonotes_fast = \"flair/ner-english-ontonotes-fast\"\n",
|
||||
"flair_ontonotes_large = \"flair/ner-english-ontonotes-large\"\n",
|
||||
"models = [\n",
|
||||
" flair_ner,\n",
|
||||
" flair_ner_fast,\n",
|
||||
" flair_ontonotes_fast,\n",
|
||||
" flair_ner_fast,\n",
|
||||
" flair_ontonotes_large,\n",
|
||||
"]"
|
||||
]
|
||||
|
@ -312,9 +321,9 @@
|
|||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "presidio",
|
||||
"display_name": "presidio-evaluator",
|
||||
"language": "python",
|
||||
"name": "presidio"
|
||||
"name": "presidio-evaluator"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
@ -326,7 +335,7 @@
|
|||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.12"
|
||||
"version": "3.9.18"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
|
|
@ -109,7 +109,10 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"models = [\"en_core_web_sm\", \"en_core_web_lg\", \"en_core_web_trf\"]"
|
||||
"models = [\"en_core_web_sm\", \"en_core_web_lg\", \"en_core_web_trf\"]\n",
|
||||
"\n",
|
||||
"# If needed, install models using `python -m spacy download X` where x is the model name, or use spacy.cli.download:\n",
|
||||
"#spacy.cli.download(\"en_core_web_trf\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -334,9 +337,9 @@
|
|||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "presidio",
|
||||
"display_name": "presidio-evaluator",
|
||||
"language": "python",
|
||||
"name": "presidio"
|
||||
"name": "presidio-evaluator"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
@ -348,9 +351,9 @@
|
|||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.12"
|
||||
"version": "3.9.18"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
|
|
|
@ -170,6 +170,8 @@ class PresidioDataGenerator:
|
|||
|
||||
new_provider = BaseProvider(self.faker)
|
||||
setattr(new_provider, new_name, original)
|
||||
setattr(new_provider, new_name.lower(), original) # avoid case sensitivity
|
||||
setattr(new_provider, new_name.upper(), original) # avoid case sensitivity
|
||||
self.faker.add_provider(new_provider)
|
||||
|
||||
@staticmethod
|
||||
|
|
|
@ -25,7 +25,7 @@ class PresidioPseudonymization(PresidioDataGenerator):
|
|||
self.add_provider_alias("credit_card_number", "CREDIT_CARD")
|
||||
self.add_provider_alias("iban", "IBAN_CODE")
|
||||
self.add_provider_alias("phone_number", "PHONE_NUMBER")
|
||||
self.add_provider_alias("url", "DOMAIN_NAME")
|
||||
self.add_provider_alias("url", "URL")
|
||||
self.add_provider_alias("ssn", "US_SSN")
|
||||
self.add_provider_alias("email", "EMAIL_ADDRESS")
|
||||
self.add_provider_alias("date_time", "DATE_TIME")
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
import copy
|
||||
from collections import Counter
|
||||
from typing import List, Optional, Dict
|
||||
from pathlib import Path
|
||||
import string
|
||||
|
||||
import numpy as np
|
||||
from tqdm import tqdm
|
||||
|
@ -39,7 +41,6 @@ class Evaluator:
|
|||
self.entities_to_keep = self.model.entities
|
||||
|
||||
def compare(self, input_sample: InputSample, prediction: List[str]):
|
||||
|
||||
"""
|
||||
Compares ground truth tags (annotation) and predicted (prediction)
|
||||
:param input_sample: input sample containing list of tags with scheme
|
||||
|
@ -71,6 +72,9 @@ class Evaluator:
|
|||
if self.entities_to_keep:
|
||||
prediction = self._adjust_per_entities(prediction)
|
||||
new_annotation = self._adjust_per_entities(new_annotation)
|
||||
|
||||
skip_words = self.get_skip_words()
|
||||
|
||||
for i in range(0, len(new_annotation)):
|
||||
results[(new_annotation[i], prediction[i])] += 1
|
||||
|
||||
|
@ -81,6 +85,10 @@ class Evaluator:
|
|||
|
||||
# check if there was an error
|
||||
is_error = new_annotation[i] != prediction[i]
|
||||
if str(tokens[i]).lower().strip() in skip_words:
|
||||
is_error = False
|
||||
results[(new_annotation[i], prediction[i])] -= 1
|
||||
|
||||
if is_error:
|
||||
if prediction[i] == "O":
|
||||
mistakes.append(
|
||||
|
@ -151,7 +159,6 @@ class Evaluator:
|
|||
f"Mapping entity values using this dictionary: {self.model.entity_mapping}"
|
||||
)
|
||||
for sample in tqdm(dataset, desc=f"Evaluating {self.model.__class__}"):
|
||||
|
||||
# Align tag values to the ones expected by the model
|
||||
self.model.align_entity_types(sample)
|
||||
|
||||
|
@ -345,13 +352,13 @@ class Evaluator:
|
|||
if np.isnan(precision) or np.isnan(recall) or (precision == 0 and recall == 0):
|
||||
return np.nan
|
||||
|
||||
return ((1 + beta ** 2) * precision * recall) / (
|
||||
((beta ** 2) * precision) + recall
|
||||
return ((1 + beta**2) * precision * recall) / (
|
||||
((beta**2) * precision) + recall
|
||||
)
|
||||
|
||||
class Plotter:
|
||||
"""
|
||||
Plot scores (f2, precision, recall) and errors (false-positivies, false-negatives)
|
||||
Plot scores (f2, precision, recall) and errors (false-positives, false-negatives)
|
||||
for a PII detection model evaluated via Evaluator
|
||||
|
||||
:param model: Instance of a fitted model (of base type BaseModel)
|
||||
|
@ -362,7 +369,9 @@ class Evaluator:
|
|||
which gives more or less weight to precision vs. recall
|
||||
"""
|
||||
|
||||
def __init__(self, model, results, output_folder: Path, model_name: str, beta: float):
|
||||
def __init__(
|
||||
self, model, results, output_folder: Path, model_name: str, beta: float
|
||||
):
|
||||
self.model = model
|
||||
self.results = results
|
||||
self.output_folder = output_folder
|
||||
|
@ -372,41 +381,66 @@ class Evaluator:
|
|||
|
||||
def plot_scores(self) -> None:
|
||||
"""
|
||||
Plots per-entity recall, precision, or F2 score for evaluated model.
|
||||
:param plot_type: which metric to graph (default is F2 score)
|
||||
Plots per-entity recall, precision, or F2 score for evaluated model.
|
||||
"""
|
||||
scores = {}
|
||||
scores['entity'] = list(self.results.entity_recall_dict.keys())
|
||||
scores['recall'] = list(self.results.entity_recall_dict.values())
|
||||
scores['precision'] = list(self.results.entity_precision_dict.values())
|
||||
scores['count'] = list(self.results.n_dict.values())
|
||||
scores[f"f{self.beta}_score"] = [Evaluator.f_beta(precision=precision, recall=recall, beta=self.beta)
|
||||
for recall, precision in zip(scores['recall'], scores['precision'])]
|
||||
|
||||
entity_recall_dict = copy.deepcopy(self.results.entity_recall_dict)
|
||||
entity_precision_dict = copy.deepcopy(self.results.entity_precision_dict)
|
||||
|
||||
scores["entity"] = list(entity_recall_dict.keys())
|
||||
scores["recall"] = list(entity_recall_dict.values())
|
||||
scores["precision"] = list(entity_precision_dict.values())
|
||||
scores["count"] = list(self.results.n_dict.values())
|
||||
|
||||
scores[f"f{self.beta}_score"] = [
|
||||
Evaluator.f_beta(precision=precision, recall=recall, beta=self.beta)
|
||||
for recall, precision in zip(scores["recall"], scores["precision"])
|
||||
]
|
||||
|
||||
# Add PII detection rates
|
||||
scores["entity"].append("PII")
|
||||
scores["recall"].append(self.results.pii_recall)
|
||||
scores["precision"].append(self.results.pii_precision)
|
||||
scores["count"].append(self.results.n)
|
||||
scores[f"f{self.beta}_score"].append(self.results.pii_f)
|
||||
|
||||
df = pd.DataFrame(scores)
|
||||
df['model'] = self.model_name
|
||||
df["model"] = self.model_name
|
||||
self._plot(df, plot_type="f2_score")
|
||||
self._plot(df, plot_type="precision")
|
||||
self._plot(df, plot_type="recall")
|
||||
|
||||
def _plot(self, df, plot_type) -> None:
|
||||
fig = px.bar(df, text_auto=".2", y='entity', orientation="h",
|
||||
x=plot_type, color='count', barmode='group', title=f"Per-entity {plot_type} for {self.model_name}")
|
||||
fig.update_layout(barmode='group', yaxis={
|
||||
'categoryorder': 'total ascending'})
|
||||
fig = px.bar(
|
||||
df,
|
||||
text_auto=".2",
|
||||
y="entity",
|
||||
orientation="h",
|
||||
x=plot_type,
|
||||
color="count",
|
||||
barmode="group",
|
||||
height=30*len(set(df["entity"])),
|
||||
title=f"Per-entity {plot_type} for {self.model_name}",
|
||||
)
|
||||
fig.update_layout(
|
||||
barmode="group", yaxis={"categoryorder": "total ascending"}
|
||||
)
|
||||
fig.update_layout(yaxis_title=f"{plot_type}", xaxis_title="PII Entity")
|
||||
fig.update_traces(textfont_size=12, textangle=0,
|
||||
textposition="outside", cliponaxis=False)
|
||||
fig.update_traces(
|
||||
textfont_size=12, textangle=0, textposition="outside", cliponaxis=False
|
||||
)
|
||||
fig.update_layout(
|
||||
plot_bgcolor="#FFF",
|
||||
xaxis=dict(
|
||||
title="PII entity",
|
||||
linecolor="#BCCCDC", # Sets color of X-axis line
|
||||
showgrid=False # Removes X-axis grid lines
|
||||
showgrid=False, # Removes X-axis grid lines
|
||||
),
|
||||
yaxis=dict(
|
||||
title=f"{plot_type}",
|
||||
linecolor="#BCCCDC", # Sets color of Y-axis line
|
||||
showgrid=False # Removes X-axis grid lines
|
||||
showgrid=False, # Removes Y-axis grid lines
|
||||
),
|
||||
)
|
||||
fig.show()
|
||||
|
@ -419,47 +453,100 @@ class Evaluator:
|
|||
for entity in self.model.entity_mapping.values():
|
||||
fps_df = ModelError.get_fps_dataframe(self.errors, entity=[entity])
|
||||
if fps_df is not None:
|
||||
fps_path = self.output_folder / \
|
||||
f"{self.model_name}-{entity}-fps.csv"
|
||||
fps_path = (
|
||||
self.output_folder / f"{self.model_name}-{entity}-fps.csv"
|
||||
)
|
||||
fps_df.to_csv(fps_path)
|
||||
fps_frames.append(fps_path)
|
||||
fns_df = ModelError.get_fns_dataframe(self.errors, entity=[entity])
|
||||
if fns_df is not None:
|
||||
fns_path = self.output_folder / \
|
||||
f"{self.model_name}-{entity}-fns.csv"
|
||||
fns_path = (
|
||||
self.output_folder / f"{self.model_name}-{entity}-fns.csv"
|
||||
)
|
||||
fns_df.to_csv(fns_path)
|
||||
fns_frames.append(fns_path)
|
||||
|
||||
def group_tokens(df):
|
||||
return df.groupby(['token', 'annotation']).size().to_frame(
|
||||
).sort_values([0], ascending=False).head(3).reset_index()
|
||||
return (
|
||||
df.groupby(["token", "annotation"])
|
||||
.size()
|
||||
.to_frame()
|
||||
.sort_values([0], ascending=False)
|
||||
.head(3)
|
||||
.reset_index()
|
||||
)
|
||||
|
||||
fps_tokens_df = pd.concat(
|
||||
[group_tokens(pd.read_csv(df_path)) for df_path in fps_frames])
|
||||
[group_tokens(pd.read_csv(df_path)) for df_path in fps_frames]
|
||||
)
|
||||
fns_tokens_df = pd.concat(
|
||||
[group_tokens(pd.read_csv(df_path)) for df_path in fns_frames])
|
||||
[group_tokens(pd.read_csv(df_path)) for df_path in fns_frames]
|
||||
)
|
||||
|
||||
def generate_graph(title, tokens_df):
|
||||
fig = px.histogram(tokens_df, x=0, y="token", orientation='h', color='annotation',
|
||||
title=f"Most common {title} for {self.model_name}")
|
||||
fig = px.histogram(
|
||||
tokens_df,
|
||||
x=0,
|
||||
y="token",
|
||||
orientation="h",
|
||||
color="annotation",
|
||||
title=f"Most common {title} for {self.model_name}",
|
||||
)
|
||||
|
||||
fig.update_layout(yaxis_title=f"count", xaxis_title="PII Entity")
|
||||
fig.update_traces(textfont_size=12, textangle=0,
|
||||
textposition="outside", cliponaxis=False)
|
||||
fig.update_traces(
|
||||
textfont_size=12,
|
||||
textangle=0,
|
||||
textposition="outside",
|
||||
cliponaxis=False,
|
||||
)
|
||||
fig.update_layout(
|
||||
plot_bgcolor="#FFF",
|
||||
xaxis=dict(
|
||||
title="Count",
|
||||
linecolor="#BCCCDC", # Sets color of X-axis line
|
||||
showgrid=False # Removes X-axis grid lines
|
||||
showgrid=False, # Removes X-axis grid lines
|
||||
),
|
||||
yaxis=dict(
|
||||
title=f"Tokens",
|
||||
linecolor="#BCCCDC", # Sets color of Y-axis line
|
||||
showgrid=False # Removes X-axis grid lines
|
||||
showgrid=False, # Removes Y-axis grid lines
|
||||
),
|
||||
)
|
||||
fig.update_layout(yaxis={'categoryorder': 'total ascending'})
|
||||
fig.update_layout(yaxis={"categoryorder": "total ascending"})
|
||||
fig.show()
|
||||
|
||||
generate_graph(title="false-negatives", tokens_df=fns_tokens_df)
|
||||
generate_graph(title="false-positives", tokens_df=fps_tokens_df)
|
||||
|
||||
@staticmethod
|
||||
def get_skip_words():
|
||||
skip_words = [x for x in string.punctuation]
|
||||
skip_words.extend(
|
||||
[
|
||||
"\n",
|
||||
"\n\n",
|
||||
"\n\n\n",
|
||||
">>",
|
||||
">>>",
|
||||
">>>>",
|
||||
"street",
|
||||
"st.",
|
||||
"st",
|
||||
"de",
|
||||
"rue",
|
||||
"via",
|
||||
"and",
|
||||
"or",
|
||||
"do",
|
||||
"as",
|
||||
"of",
|
||||
"day",
|
||||
"address",
|
||||
"country",
|
||||
"state",
|
||||
"city",
|
||||
]
|
||||
)
|
||||
|
||||
return skip_words
|
||||
|
|
|
@ -31,6 +31,7 @@ class BaseModel(ABC):
|
|||
self.labeling_scheme = labeling_scheme
|
||||
self.entity_mapping = entity_mapping
|
||||
self.verbose = verbose
|
||||
self.name = self.__class__.__name__
|
||||
|
||||
@abstractmethod
|
||||
def predict(self, sample: InputSample, **kwargs) -> List[str]:
|
||||
|
|
|
@ -85,7 +85,7 @@ class CRFModel(BaseModel):
|
|||
y_train = [self.sent2labels(s) for s in sentences]
|
||||
return X_train, y_train
|
||||
|
||||
def predict(self, sample: InputSample) -> List[str]:
|
||||
def predict(self, sample: InputSample, **kwargs) -> List[str]:
|
||||
tags = CRFModel.crf_predict(sample, self.model)
|
||||
|
||||
if len(tags) != len(sample.tokens):
|
||||
|
|
|
@ -48,7 +48,7 @@ class FlairModel(BaseModel):
|
|||
|
||||
self.spacy_tokenizer = SpacyTokenizer(model=spacy.load("en_core_web_sm"))
|
||||
|
||||
def predict(self, sample: InputSample) -> List[str]:
|
||||
def predict(self, sample: InputSample, **kwargs) -> List[str]:
|
||||
|
||||
sentence = Sentence(text=sample.full_text, use_tokenizer=self.spacy_tokenizer)
|
||||
self.model.predict(sentence)
|
||||
|
|
|
@ -91,23 +91,28 @@ class PresidioAnalyzerWrapper(BaseModel):
|
|||
"PHONE_NUMBER": "PHONE_NUMBER",
|
||||
"BIRTHDAY": "DATE_TIME",
|
||||
"DATE_TIME": "DATE_TIME",
|
||||
"DOMAIN_NAME": "DOMAIN_NAME",
|
||||
"DOMAIN_NAME": "URL",
|
||||
"TIME" : "DATE_TIME",
|
||||
"DATE" : "DATE_TIME",
|
||||
"CITY": "LOCATION",
|
||||
"ADDRESS": "LOCATION",
|
||||
"STREET_ADDRESS": "LOCATION",
|
||||
"NATIONALITY": "LOCATION",
|
||||
"LOCATION": "LOCATION",
|
||||
"IBAN_CODE": "IBAN_CODE",
|
||||
"URL": "DOMAIN_NAME",
|
||||
"URL": "URL",
|
||||
"US_SSN": "US_SSN",
|
||||
"IP_ADDRESS": "IP_ADDRESS",
|
||||
"ORGANIZATION": "ORG",
|
||||
"ORGANIZATION": "ORGANIZATION",
|
||||
"ORG": "ORGANIZATION",
|
||||
"US_DRIVER_LICENSE": "US_DRIVER_LICENSE",
|
||||
"NRP": "NRP",
|
||||
"TITLE": "O", # not supported
|
||||
"PREFIX": "O", # not supported
|
||||
"STREET_ADDRESS": "O", # not supported
|
||||
"ZIP_CODE": "O", # not supported
|
||||
"AGE": "O", # not supported
|
||||
"NRP": "LOCATION",
|
||||
"NORP": "LOCATION",
|
||||
"ID": "ID",
|
||||
"TITLE": "O", # not supported through spaCy
|
||||
"PREFIX": "O", # not supported through spaCy
|
||||
"ZIP_CODE": "O", # not supported through spaCy
|
||||
"AGE": "O", # not supported through spaCy
|
||||
"O": "O",
|
||||
}
|
||||
|
||||
|
|
|
@ -31,7 +31,7 @@ class SpacyModel(BaseModel):
|
|||
else:
|
||||
self.model = model
|
||||
|
||||
def predict(self, sample: InputSample) -> List[str]:
|
||||
def predict(self, sample: InputSample, **kwargs) -> List[str]:
|
||||
"""
|
||||
Predict a list of tags for an input sample.
|
||||
:param sample: InputSample
|
||||
|
|
|
@ -51,7 +51,7 @@ class StanzaModel(SpacyModel):
|
|||
entity_mapping=entity_mapping,
|
||||
)
|
||||
|
||||
def predict(self, sample: InputSample) -> List[str]:
|
||||
def predict(self, sample: InputSample, **kwargs) -> List[str]:
|
||||
"""
|
||||
Predict the tags using a stanza model.
|
||||
|
||||
|
|
|
@ -48,8 +48,7 @@ class TextAnalyticsWrapper(BaseModel):
|
|||
)
|
||||
return text_analytics_client
|
||||
|
||||
|
||||
def predict(self, sample: InputSample) -> List[str]:
|
||||
def predict(self, sample: InputSample, **kwargs) -> List[str]:
|
||||
documents = [sample.full_text]
|
||||
response = self.ta_client.recognize_pii_entities(documents,
|
||||
language="en")
|
||||
|
|
|
@ -4,26 +4,23 @@ version = "0.1.0"
|
|||
description = ""
|
||||
authors = ["Omri Mendels <omri374@users.noreply.github.com>"]
|
||||
readme = "README.md"
|
||||
include = [{ path= "presidio_evaluator/data_generator/raw_data/*"}]
|
||||
|
||||
[tool.poetry.dependencies]
|
||||
python = "^3.9"
|
||||
spacy = ">=3.2.0, <4.0.0"
|
||||
numpy = ">=1.20.2,<2.0.0"
|
||||
jupyter = ">=1"
|
||||
pandas = ">=1.2.4,<2.0.0"
|
||||
tqdm = ">=4.60.0,<5.0.0"
|
||||
haikunator = ">=2.1.0,<3.0.0"
|
||||
schwifty = ">=2023.11.2,<2024.0.0"
|
||||
faker = ">=9.6.0,<10.0.0"
|
||||
scikit-learn = ">1.3.2,<2.0.0"
|
||||
pytest = ">=6.2.3"
|
||||
spacy = "^3.5.0"
|
||||
numpy = "^1.22"
|
||||
pandas = "^2.1.4"
|
||||
tqdm = "^4.60.0"
|
||||
faker = "^21.0"
|
||||
scikit-learn = "^1.3.2"
|
||||
presidio-analyzer = "^2.2.351"
|
||||
presidio-anonymizer = "^2.2.351"
|
||||
requests = ">=2.25.1"
|
||||
xmltodict = ">=0.12.0"
|
||||
requests = "^2.25"
|
||||
xmltodict = "^0.12.0"
|
||||
python-dotenv = "^1.0.0"
|
||||
plotly = "^5.18.0"
|
||||
azure-ai-textanalytics = ">=5.3.0"
|
||||
azure-ai-textanalytics = "^5.3.0"
|
||||
en_core_web_sm = {url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz"}
|
||||
en_core_web_lg = {url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1.tar.gz"}
|
||||
|
||||
|
|
83
setup.py
83
setup.py
|
@ -1,54 +1,57 @@
|
|||
# -*- coding: utf-8 -*-
"""Packaging script for presidio-evaluator.

The long description is read from README.md and the version from the VERSION
file, both located next to this script, and passed to setuptools together with
the explicit package list and pinned dependencies below.
"""
import os

from setuptools import setup

this_directory = os.path.abspath(os.path.dirname(__file__))

with open(os.path.join(this_directory, "README.md"), encoding="utf-8") as f:
    long_description = f.read()

# Read with an explicit encoding so the result does not depend on the locale.
with open(os.path.join(this_directory, "VERSION"), encoding="utf-8") as version_file:
    version = version_file.read().strip()

packages = [
    "presidio_evaluator",
    "presidio_evaluator.data_generator",
    "presidio_evaluator.data_generator.faker_extensions",
    "presidio_evaluator.dataset_formatters",
    "presidio_evaluator.evaluation",
    "presidio_evaluator.experiment_tracking",
    "presidio_evaluator.models",
]

package_data = {"": ["*"], "presidio_evaluator.data_generator": ["raw_data/*"]}

install_requires = [
    "azure-ai-textanalytics>=5.3.0,<6.0.0",
    "en_core_web_lg @ "
    "https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1.tar.gz",
    "en_core_web_sm @ "
    "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz",
    "faker>=21.0,<22.0",
    "numpy>=1.22,<2.0",
    "pandas>=2.1.4,<3.0.0",
    "plotly>=5.18.0,<6.0.0",
    "presidio-analyzer>=2.2.351,<3.0.0",
    "presidio-anonymizer>=2.2.351,<3.0.0",
    "python-dotenv>=1.0.0,<2.0.0",
    "requests>=2.25,<3.0",
    "scikit-learn>=1.3.2,<2.0.0",
    "spacy>=3.5.0,<4.0.0",
    "tqdm>=4.60.0,<5.0.0",
    "xmltodict>=0.12.0,<0.13.0",
]

setup(
    name="presidio-evaluator",
    version=version,
    description="PII dataset generator, model evaluator for Presidio and PII data in general",  # noqa
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://www.github.com/microsoft/presidio-research",
    license="MIT",
    packages=packages,
    package_data=package_data,
    install_requires=install_requires,
    # Keep in sync with `python = "^3.9"` in pyproject.toml; pandas>=2.1.4
    # cannot be installed on Python 3.8, so advertising 3.8 support would only
    # defer the failure to dependency resolution.
    python_requires=">=3.9,<4.0",
)
|
|
@ -345,3 +345,22 @@ def test_align_entity_types_wrong_mapping_exception():
|
|||
Evaluator.align_entity_types(
|
||||
input_samples=[sample1], entities_mapping=entities_mapping
|
||||
)
|
||||
|
||||
|
||||
def test_skip_words_are_not_counted_as_errors():
    """A spurious prediction on a skip word must not hurt precision or recall.

    The mocked model tags the last token ("street") as LOCATION although its
    gold tag is "O". The test expects PII precision and recall to both be 1
    (presumably because "street" is one of the evaluator's skip words).
    """
    predicted_tags = ["U-PERSON", "O", "O", "O", "U-LOCATION"]

    evaluator = Evaluator(
        model=MockTokensModel(
            prediction=predicted_tags, entities_to_keep=["LOCATION", "PERSON"]
        )
    )

    sample = InputSample(
        full_text="John is on the street", masked="I am the street", spans=None
    )
    # Tokens and gold tags are set by hand, so no tokenizer is involved.
    sample.tokens = ["John", "is", "on", "the", "street"]
    sample.tags = ["U-PERSON", "O", "O", "O", "O"]

    sample_result = evaluator.evaluate_sample(sample, predicted_tags)
    scores = evaluator.calculate_score([sample_result])

    assert scores.pii_precision == 1
    assert scores.pii_recall == 1
|
||||
|
|
|
@ -30,7 +30,7 @@ def fake_faker():
|
|||
],
|
||||
# fmt: on
|
||||
)
|
||||
def test_presidio_psudonymize_two_entities(
|
||||
def test_presidio_pseudonymize_two_entities(
|
||||
text, entity1, entity2, start1, end1, start2, end2, value1, value2, fake_faker
|
||||
):
|
||||
|
||||
|
@ -51,3 +51,15 @@ def test_presidio_psudonymize_two_entities(
|
|||
assert value2 in pseudonym
|
||||
assert text[:start1].lower() in pseudonym.lower()
|
||||
assert text[end1:start2].lower() in pseudonym.lower()
|
||||
|
||||
|
||||
def test_simple_scenario():
    """Smoke test: pseudonymizing a text with a PERSON and a URL span runs cleanly.

    The result is not inspected; the test only fails if `pseudonymize` raises.
    """
    text = "Hi my name is Doug Funny and this is my website: https://www.dougf.io"  # noqa

    # Offsets 14-24 cover "Doug Funny"; 49-69 cover "https://www.dougf.io".
    person_result = RecognizerResult(entity_type="PERSON", start=14, end=24, score=0.85)
    url_result = RecognizerResult(entity_type="URL", start=49, end=69, score=0.95)

    pseudonymizer = PresidioPseudonymization()
    pseudonymizer.pseudonymize(
        original_text=text,
        presidio_response=[person_result, url_result],
        count=5,
    )
|
||||
|
|
Загрузка…
Ссылка в новой задаче