Update azure text analytics evaluation notebook

2022-11-25 16:39:27 +01:00 · 2022-11-25 16:39:27 +01:00 · 40b84d42dd
--- a/notebooks/models/Evaluate
+++ b/notebooks/models/Evaluate
@ -0,0 +1,449 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Evaluate Azure Text Analytics for PII detection using the Presidio Evaluator framework"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "stanza and spacy_stanza are not installed\n",
+      "Flair is not installed by default\n",
+      "Flair is not installed\n"
+     ]
+    }
+   ],
+   "source": [
+    "import os\n",
+    "from collections import Counter\n",
+    "from copy import deepcopy\n",
+    "from pathlib import Path\n",
+    "from pprint import pprint\n",
+    "\n",
+    "import pandas as pd\n",
+    "\n",
+    "from presidio_evaluator import InputSample\n",
+    "from presidio_evaluator.evaluation import Evaluator, ModelError\n",
+    "from presidio_evaluator.models import TextAnalyticsWrapper\n",
+    "from presidio_evaluator.experiment_tracking import get_experiment_tracker\n",
+    "\n",
+    "pd.set_option(\"display.max_columns\", None)\n",
+    "pd.set_option(\"display.max_rows\", None)\n",
+    "pd.set_option(\"display.max_colwidth\", None)\n",
+    "\n",
+    "%reload_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Select data for evaluation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "tokenizing input: 100%|██████████| 1500/1500 [00:09<00:00, 153.03it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1500\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Read the evaluation samples from the \"data\" directory two levels above the working directory.\n",
+    "dataset_name = \"synth_dataset_v2.json\"\n",
+    "dataset_path = Path.cwd().parent.parent / \"data\" / dataset_name\n",
+    "dataset = InputSample.read_dataset_json(dataset_path)\n",
+    "print(len(dataset))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Count how often each tag occurs across all samples in the dataset.\n",
+    "entity_counter = Counter(tag for sample in dataset for tag in sample.tags)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Dataset exploration"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Count per entity:\n",
+      "[('O', 19626),\n",
+      " ('STREET_ADDRESS', 3071),\n",
+      " ('PERSON', 1369),\n",
+      " ('GPE', 521),\n",
+      " ('ORGANIZATION', 504),\n",
+      " ('PHONE_NUMBER', 350),\n",
+      " ('DATE_TIME', 219),\n",
+      " ('TITLE', 142),\n",
+      " ('CREDIT_CARD', 136),\n",
+      " ('US_SSN', 80),\n",
+      " ('AGE', 74),\n",
+      " ('NRP', 55),\n",
+      " ('ZIP_CODE', 50),\n",
+      " ('EMAIL_ADDRESS', 49),\n",
+      " ('DOMAIN_NAME', 37),\n",
+      " ('IP_ADDRESS', 22),\n",
+      " ('IBAN_CODE', 21),\n",
+      " ('US_DRIVER_LICENSE', 9)]\n",
+      "\n",
+      "Example sentence:\n",
+      "Full text: What are my options?\n",
+      "Spans: []\n",
+      "Tokens: What are my options?\n",
+      "Tags: ['O', 'O', 'O', 'O', 'O']\n",
+      "\n",
+      "\n",
+      "Min and max number of tokens in dataset:\n",
+      "Min: 3, Max: 78\n",
+      "\n",
+      "Min and max sentence length in dataset:\n",
+      "Min: 9, Max: 407\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Tag frequencies, most frequent first.\n",
+    "print(\"Count per entity:\")\n",
+    "pprint(entity_counter.most_common())\n",
+    "\n",
+    "print(\"\\nExample sentence:\")\n",
+    "print(dataset[1])\n",
+    "\n",
+    "# Token count per sample, computed once and reused for both extremes.\n",
+    "print(\"\\nMin and max number of tokens in dataset:\")\n",
+    "token_counts = [len(sample.tokens) for sample in dataset]\n",
+    "print(f\"Min: {min(token_counts)}, Max: {max(token_counts)}\")\n",
+    "\n",
+    "# Character length of each sample's full text.\n",
+    "print(\"\\nMin and max sentence length in dataset:\")\n",
+    "text_lengths = [len(sample.full_text) for sample in dataset]\n",
+    "print(f\"Min: {min(text_lengths)}, Max: {max(text_lengths)}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Run evaluation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Credentials are read from the environment so that no secret is stored in the notebook.\n",
+    "# Set AZURE_TEXT_ANALYTICS_KEY and AZURE_TEXT_ANALYTICS_ENDPOINT before starting the kernel;\n",
+    "# a missing variable raises KeyError in this cell.\n",
+    "# NOTE(review): the key that used to be hardcoded here is exposed in version control and\n",
+    "# should be rotated in the Azure portal.\n",
+    "model_name = \"Text analytics Analyzer\"\n",
+    "key = os.environ[\"AZURE_TEXT_ANALYTICS_KEY\"]\n",
+    "endpoint = os.environ[\"AZURE_TEXT_ANALYTICS_ENDPOINT\"]\n",
+    "model = TextAnalyticsWrapper(ta_key=key, ta_endpoint=endpoint)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Evaluating Azure Text Analytics.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Evaluating <class 'presidio_evaluator.models.text_analytics_wrapper.TextAnalyticsWrapper'>: 100%|██████████| 1500/1500 [01:36<00:00, 15.61it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "saving experiment data to experiment_20221125-162355.json\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"Evaluating Azure Text Analytics.\")\n",
+    "\n",
+    "experiment = get_experiment_tracker()\n",
+    "\n",
+    "# Mapping from the dataset's entity labels to Text Analytics entity categories.\n",
+    "# The PII entity categories supported by Text Analytics are listed here:\n",
+    "# https://learn.microsoft.com/en-us/azure/cognitive-services/language-service/personally-identifiable-information/concepts/conversations-entity-categories\n",
+    "# GPE, TITLE, NRP and ZIP_CODE are mapped to \"O\" rather than to a Text Analytics category.\n",
+    "dataset_to_text_analytics_entities = {\n",
+    "    \"PERSON\": \"Person\",\n",
+    "    \"STREET_ADDRESS\": \"Address\",\n",
+    "    \"GPE\": \"O\",\n",
+    "    \"PHONE_NUMBER\": \"PhoneNumber\",\n",
+    "    \"ORGANIZATION\": \"Organization\",\n",
+    "    \"DATE_TIME\": \"DateTime\",\n",
+    "    \"TITLE\": \"O\",\n",
+    "    \"CREDIT_CARD\": \"CreditCardNumber\",\n",
+    "    \"US_SSN\": \"USSocialSecurityNumber\",\n",
+    "    \"AGE\": \"Age\",\n",
+    "    \"NRP\": \"O\",\n",
+    "    \"ZIP_CODE\": \"O\",\n",
+    "    \"EMAIL_ADDRESS\": \"Email\",\n",
+    "    \"DOMAIN_NAME\": \"URL\",\n",
+    "    \"IP_ADDRESS\": \"IPAddress\",\n",
+    "    \"IBAN_CODE\": \"InternationalBankingAccountNumber\",\n",
+    "    \"US_DRIVER_LICENSE\": \"USDriversLicenseNumber\",\n",
+    "}\n",
+    "\n",
+    "evaluator = Evaluator(model=model)\n",
+    "# The mapping is applied to a deep copy; `dataset` itself is what gets hashed below.\n",
+    "aligned_dataset = Evaluator.align_entity_types(\n",
+    "    deepcopy(dataset),\n",
+    "    entities_mapping=dataset_to_text_analytics_entities,\n",
+    ")\n",
+    "\n",
+    "evaluation_results = evaluator.evaluate_all(aligned_dataset)\n",
+    "results = evaluator.calculate_score(evaluation_results)\n",
+    "\n",
+    "# Record run parameters, dataset hash, metrics and confusion matrix in the experiment tracker.\n",
+    "params = {\"dataset_name\": dataset_name, \"model_name\": model_name}\n",
+    "params.update(model.to_log())\n",
+    "experiment.log_parameters(params)\n",
+    "experiment.log_dataset_hash(dataset)\n",
+    "experiment.log_metrics(results.to_log())\n",
+    "entities, confmatrix = results.to_confusion_matrix()\n",
+    "experiment.log_confusion_matrix(matrix=confmatrix, labels=entities)\n",
+    "\n",
+    "# Close the experiment.\n",
+    "experiment.end()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Confusion matrix:\n",
+      "                                   Address  Age  CreditCardNumber  DateTime  \\\n",
+      "Address                               1522    0                 0         9   \n",
+      "Age                                      0    0                 0         0   \n",
+      "CreditCardNumber                         0    0                70         0   \n",
+      "DateTime                                 0    0                 0       219   \n",
+      "Email                                    0    0                 0         0   \n",
+      "IPAddress                                0    0                 0         0   \n",
+      "InternationalBankingAccountNumber        0    0                 0         0   \n",
+      "O                                      110    0                 0       395   \n",
+      "Organization                             1    0                 0         0   \n",
+      "Person                                   0    0                 0         0   \n",
+      "PhoneNumber                              0    0                 0         3   \n",
+      "URL                                      0    0                 0         0   \n",
+      "USDriversLicenseNumber                   0    0                 0         0   \n",
+      "USSocialSecurityNumber                   0    0                 0         0   \n",
+      "\n",
+      "                                   Email  IPAddress  \\\n",
+      "Address                                0          0   \n",
+      "Age                                    0          0   \n",
+      "CreditCardNumber                       0          0   \n",
+      "DateTime                               0          0   \n",
+      "Email                                 28          0   \n",
+      "IPAddress                              0         22   \n",
+      "InternationalBankingAccountNumber      0          0   \n",
+      "O                                      0          0   \n",
+      "Organization                           0          0   \n",
+      "Person                                 0          0   \n",
+      "PhoneNumber                            0          2   \n",
+      "URL                                    0          0   \n",
+      "USDriversLicenseNumber                 0          0   \n",
+      "USSocialSecurityNumber                 0          0   \n",
+      "\n",
+      "                                   InternationalBankingAccountNumber      O  \\\n",
+      "Address                                                            0   1406   \n",
+      "Age                                                                0     42   \n",
+      "CreditCardNumber                                                   0     38   \n",
+      "DateTime                                                           0      0   \n",
+      "Email                                                              0      0   \n",
+      "IPAddress                                                          0      0   \n",
+      "InternationalBankingAccountNumber                                 21      0   \n",
+      "O                                                                  0  19331   \n",
+      "Organization                                                       0     58   \n",
+      "Person                                                             0     24   \n",
+      "PhoneNumber                                                        0     67   \n",
+      "URL                                                                0      0   \n",
+      "USDriversLicenseNumber                                             0      2   \n",
+      "USSocialSecurityNumber                                             0      0   \n",
+      "\n",
+      "                                   Organization  Person  PhoneNumber  URL  \\\n",
+      "Address                                      47      72           14    0   \n",
+      "Age                                           0       0            0    0   \n",
+      "CreditCardNumber                              0       0           10    0   \n",
+      "DateTime                                      0       0            0    0   \n",
+      "Email                                         0      21            0    0   \n",
+      "IPAddress                                     0       0            0    0   \n",
+      "InternationalBankingAccountNumber             0       0            0    0   \n",
+      "O                                           136      39            9    0   \n",
+      "Organization                                391      54            0    0   \n",
+      "Person                                        4    1340            0    0   \n",
+      "PhoneNumber                                   0       0          278    0   \n",
+      "URL                                           0       0            0   37   \n",
+      "USDriversLicenseNumber                        0       0            7    0   \n",
+      "USSocialSecurityNumber                        0       0            0    0   \n",
+      "\n",
+      "                                   USDriversLicenseNumber  \\\n",
+      "Address                                                 0   \n",
+      "Age                                                     0   \n",
+      "CreditCardNumber                                        0   \n",
+      "DateTime                                                0   \n",
+      "Email                                                   0   \n",
+      "IPAddress                                               0   \n",
+      "InternationalBankingAccountNumber                       0   \n",
+      "O                                                       0   \n",
+      "Organization                                            0   \n",
+      "Person                                                  0   \n",
+      "PhoneNumber                                             0   \n",
+      "URL                                                     0   \n",
+      "USDriversLicenseNumber                                  0   \n",
+      "USSocialSecurityNumber                                  0   \n",
+      "\n",
+      "                                   USSocialSecurityNumber  \n",
+      "Address                                                 0  \n",
+      "Age                                                     0  \n",
+      "CreditCardNumber                                        0  \n",
+      "DateTime                                                0  \n",
+      "Email                                                   0  \n",
+      "IPAddress                                               0  \n",
+      "InternationalBankingAccountNumber                       0  \n",
+      "O                                                       0  \n",
+      "Organization                                            0  \n",
+      "Person                                                  0  \n",
+      "PhoneNumber                                             0  \n",
+      "URL                                                     0  \n",
+      "USDriversLicenseNumber                                  0  \n",
+      "USSocialSecurityNumber                                 80  \n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"Confusion matrix:\")\n",
+    "# Label both axes of the matrix with the entity names before printing it.\n",
+    "confusion_df = pd.DataFrame(confmatrix, index=entities, columns=entities)\n",
+    "print(confusion_df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Precision and recall\n",
+      "              Entity           Precision              Recall   Number of samples\n",
+      "              Person              87.81%              97.88%                1369\n",
+      "                 Age                nan%               0.00%                  74\n",
+      "               Email             100.00%              57.14%                  49\n",
+      "                 URL             100.00%             100.00%                  37\n",
+      "InternationalBankingAccountNumber             100.00%             100.00%                  21\n",
+      "        Organization              67.65%              77.58%                 504\n",
+      "             Address              93.20%              49.56%                3071\n",
+      "USSocialSecurityNumber             100.00%             100.00%                  80\n",
+      "    CreditCardNumber             100.00%              51.47%                 136\n",
+      "           IPAddress              91.67%             100.00%                  22\n",
+      "            DateTime              34.98%             100.00%                 219\n",
+      "USDriversLicenseNumber                nan%               0.00%                   9\n",
+      "         PhoneNumber              87.42%              79.43%                 350\n",
+      "                 PII              80.19%              72.45%                5941\n",
+      "PII F measure: 73.42%\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Print the heading followed by the string form of the results object.\n",
+    "print(\"Precision and recall\", results, sep=\"\\n\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.9.13 ('presidio')",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.13"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "371968787ec79dd50357533864944a85029366968470cac36beb694745c2f7d6"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/notebooks/models/experiment_20221125-162355.json
+++ b/notebooks/models/experiment_20221125-162355.json
@ -0,0 +1 @@
+{"parameters": {"dataset_name": "synth_dataset_v2.json", "model_name": "Text analytics Analyzer", "labeling_scheme": "BIO", "entities_to_keep": null}, "metrics": {"pii_f": 0.7342419981999261, "Person_precision": 0.8781127129750983, "Age_precision": NaN, "Email_precision": 1.0, "URL_precision": 1.0, "InternationalBankingAccountNumber_precision": 1.0, "Organization_precision": 0.6764705882352942, "Address_precision": 0.9320269442743417, "USSocialSecurityNumber_precision": 1.0, "CreditCardNumber_precision": 1.0, "IPAddress_precision": 0.9166666666666666, "DateTime_precision": 0.3498402555910543, "USDriversLicenseNumber_precision": NaN, "PhoneNumber_precision": 0.8742138364779874, "Person_recall": 0.9788166544923301, "Age_recall": 0.0, "Email_recall": 0.5714285714285714, "URL_recall": 1.0, "InternationalBankingAccountNumber_recall": 1.0, "Organization_recall": 0.7757936507936508, "Address_recall": 0.49560403777271245, "USSocialSecurityNumber_recall": 1.0, "CreditCardNumber_recall": 0.5147058823529411, "IPAddress_recall": 1.0, "DateTime_recall": 1.0, "USDriversLicenseNumber_recall": 0.0, "PhoneNumber_recall": 0.7942857142857143, "Person": 1369, "Age": 74, "Email": 49, "URL": 37, "InternationalBankingAccountNumber": 21, "Organization": 504, "Address": 3071, "USSocialSecurityNumber": 80, "CreditCardNumber": 136, "IPAddress": 22, "DateTime": 219, "USDriversLicenseNumber": 9, "PhoneNumber": 350}, "dataset_info": null, "confusion_matrix": [[1522, 0, 0, 9, 0, 0, 0, 1406, 47, 72, 14, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 42, 0, 0, 0, 0, 0, 0], [0, 0, 70, 0, 0, 0, 0, 38, 0, 0, 10, 0, 0, 0], [0, 0, 0, 219, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 28, 0, 0, 0, 0, 21, 0, 0, 0, 0], [0, 0, 0, 0, 0, 22, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 21, 0, 0, 0, 0, 0, 0, 0], [110, 0, 0, 395, 0, 0, 0, 19331, 136, 39, 9, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 58, 391, 54, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 24, 4, 1340, 0, 0, 0, 0], [0, 0, 0, 3, 0, 2, 0, 67, 0, 0, 278, 0, 0, 0], [0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 37, 0, 0], [0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 7, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 80]], "labels": ["Address", "Age", "CreditCardNumber", "DateTime", "Email", "IPAddress", "InternationalBankingAccountNumber", "O", "Organization", "Person", "PhoneNumber", "URL", "USDriversLicenseNumber", "USSocialSecurityNumber"]}
--- a/presidio_evaluator/models/init.py
+++ b/presidio_evaluator/models/init.py
@ -3,6 +3,7 @@ from .base_model import BaseModel
 from .crf_model import CRFModel
 from .presidio_analyzer_wrapper import PresidioAnalyzerWrapper
 from .presidio_recognizer_wrapper import PresidioRecognizerWrapper
+from .text_analytics_wrapper import TextAnalyticsWrapper
 from .spacy_model import SpacyModel
 from .stanza_model import StanzaModel
 from .flair_model import FlairModel
				`@ -0,0 +1 @@`
				{"parameters": {"dataset_name": "synth_dataset_v2.json", "model_name": "Text analytics Analyzer", "labeling_scheme": "BIO", "entities_to_keep": null}, "metrics": {"pii_f": 0.7342419981999261, "Person_precision": 0.8781127129750983, "Age_precision": NaN, "Email_precision": 1.0, "URL_precision": 1.0, "InternationalBankingAccountNumber_precision": 1.0, "Organization_precision": 0.6764705882352942, "Address_precision": 0.9320269442743417, "USSocialSecurityNumber_precision": 1.0, "CreditCardNumber_precision": 1.0, "IPAddress_precision": 0.9166666666666666, "DateTime_precision": 0.3498402555910543, "USDriversLicenseNumber_precision": NaN, "PhoneNumber_precision": 0.8742138364779874, "Person_recall": 0.9788166544923301, "Age_recall": 0.0, "Email_recall": 0.5714285714285714, "URL_recall": 1.0, "InternationalBankingAccountNumber_recall": 1.0, "Organization_recall": 0.7757936507936508, "Address_recall": 0.49560403777271245, "USSocialSecurityNumber_recall": 1.0, "CreditCardNumber_recall": 0.5147058823529411, "IPAddress_recall": 1.0, "DateTime_recall": 1.0, "USDriversLicenseNumber_recall": 0.0, "PhoneNumber_recall": 0.7942857142857143, "Person": 1369, "Age": 74, "Email": 49, "URL": 37, "InternationalBankingAccountNumber": 21, "Organization": 504, "Address": 3071, "USSocialSecurityNumber": 80, "CreditCardNumber": 136, "IPAddress": 22, "DateTime": 219, "USDriversLicenseNumber": 9, "PhoneNumber": 350}, "dataset_info": null, "confusion_matrix": [[1522, 0, 0, 9, 0, 0, 0, 1406, 47, 72, 14, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 42, 0, 0, 0, 0, 0, 0], [0, 0, 70, 0, 0, 0, 0, 38, 0, 0, 10, 0, 0, 0], [0, 0, 0, 219, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 28, 0, 0, 0, 0, 21, 0, 0, 0, 0], [0, 0, 0, 0, 0, 22, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 21, 0, 0, 0, 0, 0, 0, 0], [110, 0, 0, 395, 0, 0, 0, 19331, 136, 39, 9, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 58, 391, 54, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 24, 4, 1340, 0, 0, 0, 0], [0, 0, 0, 3, 0, 2, 0, 67, 0, 0, 278, 0, 0, 0], [0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 37, 0, 0], [0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 7, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 80]], "labels": ["Address", "Age", "CreditCardNumber", "DateTime", "Email", "IPAddress", "InternationalBankingAccountNumber", "O", "Organization", "Person", "PhoneNumber", "URL", "USDriversLicenseNumber", "USSocialSecurityNumber"]}