Update azure text analytics evaluation notebook
This commit is contained in:
Родитель
75ba71514c
Коммит
40b84d42dd
|
@ -0,0 +1,449 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Evaluate Azure Text Analytics for PII detection using the Presidio Evaluator framework"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"stanza and spacy_stanza are not installed\n",
|
||||
"Flair is not installed by default\n",
|
||||
"Flair is not installed\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"from collections import Counter\n",
|
||||
"from copy import deepcopy\n",
|
||||
"from pathlib import Path\n",
|
||||
"from pprint import pprint\n",
|
||||
"\n",
|
||||
"import pandas as pd\n",
|
||||
"\n",
|
||||
"from presidio_evaluator import InputSample\n",
|
||||
"from presidio_evaluator.evaluation import Evaluator, ModelError\n",
|
||||
"from presidio_evaluator.experiment_tracking import get_experiment_tracker\n",
|
||||
"from presidio_evaluator.models import TextAnalyticsWrapper\n",
|
||||
"\n",
|
||||
"# Lift pandas' display limits (None = no limit) so printed DataFrames are not truncated.\n",
|
||||
"pd.set_option(\"display.max_columns\", None)\n",
|
||||
"pd.set_option(\"display.max_rows\", None)\n",
|
||||
"pd.set_option(\"display.max_colwidth\", None)\n",
|
||||
"\n",
|
||||
"# Load (or reload) the autoreload extension; mode 2 re-imports modules before each cell runs.\n",
|
||||
"%reload_ext autoreload\n",
|
||||
"%autoreload 2"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Select data for evaluation"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"tokenizing input: 100%|██████████| 1500/1500 [00:09<00:00, 153.03it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"1500\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# The dataset file is looked up in the `data` folder two directory levels above the working directory.\n",
|
||||
"dataset_name = \"synth_dataset_v2.json\"\n",
|
||||
"dataset_path = Path.cwd().parent.parent / \"data\" / dataset_name\n",
|
||||
"\n",
|
||||
"dataset = InputSample.read_dataset_json(dataset_path)\n",
|
||||
"print(len(dataset))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Tally how many times each tag appears across every sample in the dataset.\n",
|
||||
"entity_counter = Counter(tag for sample in dataset for tag in sample.tags)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Dataset exploration"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Count per entity:\n",
|
||||
"[('O', 19626),\n",
|
||||
" ('STREET_ADDRESS', 3071),\n",
|
||||
" ('PERSON', 1369),\n",
|
||||
" ('GPE', 521),\n",
|
||||
" ('ORGANIZATION', 504),\n",
|
||||
" ('PHONE_NUMBER', 350),\n",
|
||||
" ('DATE_TIME', 219),\n",
|
||||
" ('TITLE', 142),\n",
|
||||
" ('CREDIT_CARD', 136),\n",
|
||||
" ('US_SSN', 80),\n",
|
||||
" ('AGE', 74),\n",
|
||||
" ('NRP', 55),\n",
|
||||
" ('ZIP_CODE', 50),\n",
|
||||
" ('EMAIL_ADDRESS', 49),\n",
|
||||
" ('DOMAIN_NAME', 37),\n",
|
||||
" ('IP_ADDRESS', 22),\n",
|
||||
" ('IBAN_CODE', 21),\n",
|
||||
" ('US_DRIVER_LICENSE', 9)]\n",
|
||||
"\n",
|
||||
"Example sentence:\n",
|
||||
"Full text: What are my options?\n",
|
||||
"Spans: []\n",
|
||||
"Tokens: What are my options?\n",
|
||||
"Tags: ['O', 'O', 'O', 'O', 'O']\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"Min and max number of tokens in dataset:\n",
|
||||
"Min: 3, Max: 78\n",
|
||||
"\n",
|
||||
"Min and max sentence length in dataset:\n",
|
||||
"Min: 9, Max: 407\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Entity frequencies, most common first.\n",
|
||||
"print(\"Count per entity:\")\n",
|
||||
"pprint(entity_counter.most_common())\n",
|
||||
"\n",
|
||||
"# Show one sample (index 1) as an example.\n",
|
||||
"print(\"\\nExample sentence:\")\n",
|
||||
"print(dataset[1])\n",
|
||||
"\n",
|
||||
"# Build each list of lengths once and reuse it for both the minimum and the maximum.\n",
|
||||
"token_counts = [len(sample.tokens) for sample in dataset]\n",
|
||||
"print(\"\\nMin and max number of tokens in dataset:\")\n",
|
||||
"print(f\"Min: {min(token_counts)}, Max: {max(token_counts)}\")\n",
|
||||
"\n",
|
||||
"text_lengths = [len(sample.full_text) for sample in dataset]\n",
|
||||
"print(\"\\nMin and max sentence length in dataset:\")\n",
|
||||
"print(f\"Min: {min(text_lengths)}, Max: {max(text_lengths)}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Run evaluation"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model_name = \"Text analytics Analyzer\"\n",
|
||||
"\n",
|
||||
"# Credentials are read from the environment so they are never stored in the notebook\n",
|
||||
"# or in version control. A missing variable raises KeyError before any request is made.\n",
|
||||
"# Any key that was ever committed to a repository must be treated as leaked and rotated.\n",
|
||||
"key = os.environ[\"AZURE_TEXT_ANALYTICS_KEY\"]\n",
|
||||
"endpoint = os.environ[\"AZURE_TEXT_ANALYTICS_ENDPOINT\"]\n",
|
||||
"\n",
|
||||
"model = TextAnalyticsWrapper(ta_key=key, ta_endpoint=endpoint)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Evaluating Azure Text Analytics.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Evaluating <class 'presidio_evaluator.models.text_analytics_wrapper.TextAnalyticsWrapper'>: 100%|██████████| 1500/1500 [01:36<00:00, 15.61it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"saving experiment data to experiment_20221125-162355.json\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(\"Evaluating Azure Text Analytics.\")\n",
|
||||
"\n",
|
||||
"experiment = get_experiment_tracker()\n",
|
||||
"\n",
|
||||
"# Translate the dataset's entity names into Text Analytics entity categories.\n",
|
||||
"# Entities mapped to \"O\" receive the same tag the dataset uses for non-entity tokens.\n",
|
||||
"# Supported Text Analytics PII categories: https://learn.microsoft.com/en-us/azure/cognitive-services/language-service/personally-identifiable-information/concepts/conversations-entity-categories\n",
|
||||
"i2b2_entities_to_text_analytics = {\n",
|
||||
"    \"PERSON\": \"Person\",\n",
|
||||
"    \"STREET_ADDRESS\": \"Address\",\n",
|
||||
"    \"GPE\": \"O\",\n",
|
||||
"    \"PHONE_NUMBER\": \"PhoneNumber\",\n",
|
||||
"    \"ORGANIZATION\": \"Organization\",\n",
|
||||
"    \"DATE_TIME\": \"DateTime\",\n",
|
||||
"    \"TITLE\": \"O\",\n",
|
||||
"    \"CREDIT_CARD\": \"CreditCardNumber\",\n",
|
||||
"    \"US_SSN\": \"USSocialSecurityNumber\",\n",
|
||||
"    \"AGE\": \"Age\",\n",
|
||||
"    \"NRP\": \"O\",\n",
|
||||
"    \"ZIP_CODE\": \"O\",\n",
|
||||
"    \"EMAIL_ADDRESS\": \"Email\",\n",
|
||||
"    \"DOMAIN_NAME\": \"URL\",\n",
|
||||
"    \"IP_ADDRESS\": \"IPAddress\",\n",
|
||||
"    \"IBAN_CODE\": \"InternationalBankingAccountNumber\",\n",
|
||||
"    \"US_DRIVER_LICENSE\": \"USDriversLicenseNumber\",\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"# Align a deep copy so the originally loaded `dataset` keeps its own entity names.\n",
|
||||
"evaluator = Evaluator(model=model)\n",
|
||||
"aligned_dataset = Evaluator.align_entity_types(\n",
|
||||
"    deepcopy(dataset),\n",
|
||||
"    entities_mapping=i2b2_entities_to_text_analytics,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"evaluation_results = evaluator.evaluate_all(aligned_dataset)\n",
|
||||
"results = evaluator.calculate_score(evaluation_results)\n",
|
||||
"\n",
|
||||
"# Record run parameters: dataset and model names plus whatever the model reports via to_log().\n",
|
||||
"params = {\"dataset_name\": dataset_name, \"model_name\": model_name}\n",
|
||||
"params.update(model.to_log())\n",
|
||||
"experiment.log_parameters(params)\n",
|
||||
"\n",
|
||||
"# The hash is taken over the original dataset, not the entity-aligned copy.\n",
|
||||
"experiment.log_dataset_hash(dataset)\n",
|
||||
"experiment.log_metrics(results.to_log())\n",
|
||||
"\n",
|
||||
"# `entities` and `confmatrix` are kept as notebook variables for the cells that follow.\n",
|
||||
"entities, confmatrix = results.to_confusion_matrix()\n",
|
||||
"experiment.log_confusion_matrix(matrix=confmatrix, labels=entities)\n",
|
||||
"\n",
|
||||
"# Close the experiment.\n",
|
||||
"experiment.end()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Confusion matrix:\n",
|
||||
" Address Age CreditCardNumber DateTime \\\n",
|
||||
"Address 1522 0 0 9 \n",
|
||||
"Age 0 0 0 0 \n",
|
||||
"CreditCardNumber 0 0 70 0 \n",
|
||||
"DateTime 0 0 0 219 \n",
|
||||
"Email 0 0 0 0 \n",
|
||||
"IPAddress 0 0 0 0 \n",
|
||||
"InternationalBankingAccountNumber 0 0 0 0 \n",
|
||||
"O 110 0 0 395 \n",
|
||||
"Organization 1 0 0 0 \n",
|
||||
"Person 0 0 0 0 \n",
|
||||
"PhoneNumber 0 0 0 3 \n",
|
||||
"URL 0 0 0 0 \n",
|
||||
"USDriversLicenseNumber 0 0 0 0 \n",
|
||||
"USSocialSecurityNumber 0 0 0 0 \n",
|
||||
"\n",
|
||||
" Email IPAddress \\\n",
|
||||
"Address 0 0 \n",
|
||||
"Age 0 0 \n",
|
||||
"CreditCardNumber 0 0 \n",
|
||||
"DateTime 0 0 \n",
|
||||
"Email 28 0 \n",
|
||||
"IPAddress 0 22 \n",
|
||||
"InternationalBankingAccountNumber 0 0 \n",
|
||||
"O 0 0 \n",
|
||||
"Organization 0 0 \n",
|
||||
"Person 0 0 \n",
|
||||
"PhoneNumber 0 2 \n",
|
||||
"URL 0 0 \n",
|
||||
"USDriversLicenseNumber 0 0 \n",
|
||||
"USSocialSecurityNumber 0 0 \n",
|
||||
"\n",
|
||||
" InternationalBankingAccountNumber O \\\n",
|
||||
"Address 0 1406 \n",
|
||||
"Age 0 42 \n",
|
||||
"CreditCardNumber 0 38 \n",
|
||||
"DateTime 0 0 \n",
|
||||
"Email 0 0 \n",
|
||||
"IPAddress 0 0 \n",
|
||||
"InternationalBankingAccountNumber 21 0 \n",
|
||||
"O 0 19331 \n",
|
||||
"Organization 0 58 \n",
|
||||
"Person 0 24 \n",
|
||||
"PhoneNumber 0 67 \n",
|
||||
"URL 0 0 \n",
|
||||
"USDriversLicenseNumber 0 2 \n",
|
||||
"USSocialSecurityNumber 0 0 \n",
|
||||
"\n",
|
||||
" Organization Person PhoneNumber URL \\\n",
|
||||
"Address 47 72 14 0 \n",
|
||||
"Age 0 0 0 0 \n",
|
||||
"CreditCardNumber 0 0 10 0 \n",
|
||||
"DateTime 0 0 0 0 \n",
|
||||
"Email 0 21 0 0 \n",
|
||||
"IPAddress 0 0 0 0 \n",
|
||||
"InternationalBankingAccountNumber 0 0 0 0 \n",
|
||||
"O 136 39 9 0 \n",
|
||||
"Organization 391 54 0 0 \n",
|
||||
"Person 4 1340 0 0 \n",
|
||||
"PhoneNumber 0 0 278 0 \n",
|
||||
"URL 0 0 0 37 \n",
|
||||
"USDriversLicenseNumber 0 0 7 0 \n",
|
||||
"USSocialSecurityNumber 0 0 0 0 \n",
|
||||
"\n",
|
||||
" USDriversLicenseNumber \\\n",
|
||||
"Address 0 \n",
|
||||
"Age 0 \n",
|
||||
"CreditCardNumber 0 \n",
|
||||
"DateTime 0 \n",
|
||||
"Email 0 \n",
|
||||
"IPAddress 0 \n",
|
||||
"InternationalBankingAccountNumber 0 \n",
|
||||
"O 0 \n",
|
||||
"Organization 0 \n",
|
||||
"Person 0 \n",
|
||||
"PhoneNumber 0 \n",
|
||||
"URL 0 \n",
|
||||
"USDriversLicenseNumber 0 \n",
|
||||
"USSocialSecurityNumber 0 \n",
|
||||
"\n",
|
||||
" USSocialSecurityNumber \n",
|
||||
"Address 0 \n",
|
||||
"Age 0 \n",
|
||||
"CreditCardNumber 0 \n",
|
||||
"DateTime 0 \n",
|
||||
"Email 0 \n",
|
||||
"IPAddress 0 \n",
|
||||
"InternationalBankingAccountNumber 0 \n",
|
||||
"O 0 \n",
|
||||
"Organization 0 \n",
|
||||
"Person 0 \n",
|
||||
"PhoneNumber 0 \n",
|
||||
"URL 0 \n",
|
||||
"USDriversLicenseNumber 0 \n",
|
||||
"USSocialSecurityNumber 80 \n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Label both axes of the confusion matrix with the entity names before printing it.\n",
|
||||
"confusion_df = pd.DataFrame(confmatrix, index=entities, columns=entities)\n",
|
||||
"\n",
|
||||
"print(\"Confusion matrix:\")\n",
|
||||
"print(confusion_df)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Precision and recall\n",
|
||||
" Entity Precision Recall Number of samples\n",
|
||||
" Person 87.81% 97.88% 1369\n",
|
||||
" Age nan% 0.00% 74\n",
|
||||
" Email 100.00% 57.14% 49\n",
|
||||
" URL 100.00% 100.00% 37\n",
|
||||
"InternationalBankingAccountNumber 100.00% 100.00% 21\n",
|
||||
" Organization 67.65% 77.58% 504\n",
|
||||
" Address 93.20% 49.56% 3071\n",
|
||||
"USSocialSecurityNumber 100.00% 100.00% 80\n",
|
||||
" CreditCardNumber 100.00% 51.47% 136\n",
|
||||
" IPAddress 91.67% 100.00% 22\n",
|
||||
" DateTime 34.98% 100.00% 219\n",
|
||||
"USDriversLicenseNumber nan% 0.00% 9\n",
|
||||
" PhoneNumber 87.42% 79.43% 350\n",
|
||||
" PII 80.19% 72.45% 5941\n",
|
||||
"PII F measure: 73.42%\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Print the heading and the results summary on separate lines.\n",
|
||||
"print(\"Precision and recall\", results, sep=\"\\n\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.9.13 ('presidio')",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.13"
|
||||
},
|
||||
"orig_nbformat": 4,
|
||||
"vscode": {
|
||||
"interpreter": {
|
||||
"hash": "371968787ec79dd50357533864944a85029366968470cac36beb694745c2f7d6"
|
||||
}
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
|
@ -0,0 +1 @@
|
|||
{"parameters": {"dataset_name": "synth_dataset_v2.json", "model_name": "Text analytics Analyzer", "labeling_scheme": "BIO", "entities_to_keep": null}, "metrics": {"pii_f": 0.7342419981999261, "Person_precision": 0.8781127129750983, "Age_precision": NaN, "Email_precision": 1.0, "URL_precision": 1.0, "InternationalBankingAccountNumber_precision": 1.0, "Organization_precision": 0.6764705882352942, "Address_precision": 0.9320269442743417, "USSocialSecurityNumber_precision": 1.0, "CreditCardNumber_precision": 1.0, "IPAddress_precision": 0.9166666666666666, "DateTime_precision": 0.3498402555910543, "USDriversLicenseNumber_precision": NaN, "PhoneNumber_precision": 0.8742138364779874, "Person_recall": 0.9788166544923301, "Age_recall": 0.0, "Email_recall": 0.5714285714285714, "URL_recall": 1.0, "InternationalBankingAccountNumber_recall": 1.0, "Organization_recall": 0.7757936507936508, "Address_recall": 0.49560403777271245, "USSocialSecurityNumber_recall": 1.0, "CreditCardNumber_recall": 0.5147058823529411, "IPAddress_recall": 1.0, "DateTime_recall": 1.0, "USDriversLicenseNumber_recall": 0.0, "PhoneNumber_recall": 0.7942857142857143, "Person": 1369, "Age": 74, "Email": 49, "URL": 37, "InternationalBankingAccountNumber": 21, "Organization": 504, "Address": 3071, "USSocialSecurityNumber": 80, "CreditCardNumber": 136, "IPAddress": 22, "DateTime": 219, "USDriversLicenseNumber": 9, "PhoneNumber": 350}, "dataset_info": null, "confusion_matrix": [[1522, 0, 0, 9, 0, 0, 0, 1406, 47, 72, 14, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 42, 0, 0, 0, 0, 0, 0], [0, 0, 70, 0, 0, 0, 0, 38, 0, 0, 10, 0, 0, 0], [0, 0, 0, 219, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 28, 0, 0, 0, 0, 21, 0, 0, 0, 0], [0, 0, 0, 0, 0, 22, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 21, 0, 0, 0, 0, 0, 0, 0], [110, 0, 0, 395, 0, 0, 0, 19331, 136, 39, 9, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 58, 391, 54, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 24, 4, 1340, 0, 0, 0, 0], [0, 0, 0, 3, 0, 2, 0, 67, 0, 0, 278, 0, 0, 0], [0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 37, 0, 0], [0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 7, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 80]], "labels": ["Address", "Age", "CreditCardNumber", "DateTime", "Email", "IPAddress", "InternationalBankingAccountNumber", "O", "Organization", "Person", "PhoneNumber", "URL", "USDriversLicenseNumber", "USSocialSecurityNumber"]}
|
|
@ -3,6 +3,7 @@ from .base_model import BaseModel
|
|||
from .crf_model import CRFModel
|
||||
from .presidio_analyzer_wrapper import PresidioAnalyzerWrapper
|
||||
from .presidio_recognizer_wrapper import PresidioRecognizerWrapper
|
||||
from .text_analytics_wrapper import TextAnalyticsWrapper
|
||||
from .spacy_model import SpacyModel
|
||||
from .stanza_model import StanzaModel
|
||||
from .flair_model import FlairModel
|
||||
|
|
Загрузка…
Ссылка в новой задаче