Update azure text analytics evaluation notebook

This commit is contained in:
Trang Nguyen 2022-11-25 16:39:27 +01:00
Родитель 75ba71514c
Коммит 40b84d42dd
3 изменённых файлов: 451 добавлений и 0 удалений

Просмотреть файл

@ -0,0 +1,449 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Evaluate Azure Text Analytics for PII detection using the Presidio Evaluator framework"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"stanza and spacy_stanza are not installed\n",
"Flair is not installed by default\n",
"Flair is not installed\n"
]
}
],
"source": [
"from pathlib import Path\n",
"from copy import deepcopy\n",
"from pprint import pprint\n",
"from collections import Counter\n",
"\n",
"from presidio_evaluator import InputSample\n",
"from presidio_evaluator.evaluation import Evaluator, ModelError\n",
"from presidio_evaluator.models import TextAnalyticsWrapper\n",
"from presidio_evaluator.experiment_tracking import get_experiment_tracker\n",
"import pandas as pd\n",
"\n",
"pd.set_option(\"display.max_columns\", None)\n",
"pd.set_option(\"display.max_rows\", None)\n",
"pd.set_option(\"display.max_colwidth\", None)\n",
"\n",
"%reload_ext autoreload\n",
"%autoreload 2"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Select data for evaluation"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"tokenizing input: 100%|██████████| 1500/1500 [00:09<00:00, 153.03it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"1500\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"dataset_name = \"synth_dataset_v2.json\"\n",
"dataset = InputSample.read_dataset_json(Path(Path.cwd().parent.parent, \"data\", dataset_name))\n",
"print(len(dataset))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"entity_counter = Counter()\n",
"for sample in dataset:\n",
" for tag in sample.tags:\n",
" entity_counter[tag] += 1"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Dataset exploration"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Count per entity:\n",
"[('O', 19626),\n",
" ('STREET_ADDRESS', 3071),\n",
" ('PERSON', 1369),\n",
" ('GPE', 521),\n",
" ('ORGANIZATION', 504),\n",
" ('PHONE_NUMBER', 350),\n",
" ('DATE_TIME', 219),\n",
" ('TITLE', 142),\n",
" ('CREDIT_CARD', 136),\n",
" ('US_SSN', 80),\n",
" ('AGE', 74),\n",
" ('NRP', 55),\n",
" ('ZIP_CODE', 50),\n",
" ('EMAIL_ADDRESS', 49),\n",
" ('DOMAIN_NAME', 37),\n",
" ('IP_ADDRESS', 22),\n",
" ('IBAN_CODE', 21),\n",
" ('US_DRIVER_LICENSE', 9)]\n",
"\n",
"Example sentence:\n",
"Full text: What are my options?\n",
"Spans: []\n",
"Tokens: What are my options?\n",
"Tags: ['O', 'O', 'O', 'O', 'O']\n",
"\n",
"\n",
"Min and max number of tokens in dataset:\n",
"Min: 3, Max: 78\n",
"\n",
"Min and max sentence length in dataset:\n",
"Min: 9, Max: 407\n"
]
}
],
"source": [
"print(\"Count per entity:\")\n",
"pprint(entity_counter.most_common())\n",
"\n",
"print(\"\\nExample sentence:\")\n",
"print(dataset[1])\n",
"\n",
"print(\"\\nMin and max number of tokens in dataset:\")\n",
"print(\n",
" f\"Min: {min([len(sample.tokens) for sample in dataset])}, \"\n",
" f\"Max: {max([len(sample.tokens) for sample in dataset])}\"\n",
")\n",
"\n",
"print(\"\\nMin and max sentence length in dataset:\")\n",
"print(\n",
" f\"Min: {min([len(sample.full_text) for sample in dataset])}, \"\n",
" f\"Max: {max([len(sample.full_text) for sample in dataset])}\"\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Run evaluation"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"model_name = \"Text analytics Analyzer\"\n",
"key = \"3f96f95c33394578be4a67a8b17cb809\"\n",
"endpoint = \"https://taclinicalpii556702.cognitiveservices.azure.com/\"\n",
"model = TextAnalyticsWrapper(ta_key=key, ta_endpoint=endpoint)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluating Azure Text Analytics.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating <class 'presidio_evaluator.models.text_analytics_wrapper.TextAnalyticsWrapper'>: 100%|██████████| 1500/1500 [01:36<00:00, 15.61it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"saving experiment data to experiment_20221125-162355.json\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"print(\"Evaluating Azure Text Analytics.\")\n",
"\n",
"experiment = get_experiment_tracker()\n",
"\n",
"# Mapping from dataset Entities to Text Analytics Entities. \n",
"# All supported PII entity categories in Text Analytics are listed in this link: https://learn.microsoft.com/en-us/azure/cognitive-services/language-service/personally-identifiable-information/concepts/conversations-entity-categories\n",
"i2b2_entities_to_text_analytics = {\"PERSON\":\"Person\",\n",
" \"STREET_ADDRESS\":\"Address\",\n",
" \"GPE\": \"O\",\n",
" \"PHONE_NUMBER\":\"PhoneNumber\",\n",
" \"ORGANIZATION\":\"Organization\",\n",
" \"DATE_TIME\": \"DateTime\",\n",
" \"TITLE\":\"O\",\n",
" \"CREDIT_CARD\":\"CreditCardNumber\",\n",
" \"US_SSN\":\"USSocialSecurityNumber\",\n",
" \"AGE\": \"Age\",\n",
" \"NRP\":\"O\",\n",
" \"ZIP_CODE\":\"O\",\n",
" \"EMAIL_ADDRESS\":\"Email\",\n",
" \"DOMAIN_NAME\":\"URL\",\n",
" \"IP_ADDRESS\":\"IPAddress\",\n",
" \"IBAN_CODE\":\"InternationalBankingAccountNumber\", \n",
" \"US_DRIVER_LICENSE\":\"USDriversLicenseNumber\"\n",
" }\n",
"evaluator = Evaluator(model=model)\n",
"dataset_ = Evaluator.align_entity_types(\n",
" deepcopy(dataset), entities_mapping=i2b2_entities_to_text_analytics\n",
")\n",
"\n",
"evaluation_results = evaluator.evaluate_all(dataset_)\n",
"results = evaluator.calculate_score(evaluation_results)\n",
"\n",
"# update params tracking\n",
"params = {\"dataset_name\": dataset_name, \"model_name\": model_name}\n",
"params.update(model.to_log())\n",
"experiment.log_parameters(params)\n",
"experiment.log_dataset_hash(dataset)\n",
"experiment.log_metrics(results.to_log())\n",
"entities, confmatrix = results.to_confusion_matrix()\n",
"experiment.log_confusion_matrix(matrix=confmatrix, labels=entities)\n",
"\n",
"# end experiment\n",
"experiment.end()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Confusion matrix:\n",
" Address Age CreditCardNumber DateTime \\\n",
"Address 1522 0 0 9 \n",
"Age 0 0 0 0 \n",
"CreditCardNumber 0 0 70 0 \n",
"DateTime 0 0 0 219 \n",
"Email 0 0 0 0 \n",
"IPAddress 0 0 0 0 \n",
"InternationalBankingAccountNumber 0 0 0 0 \n",
"O 110 0 0 395 \n",
"Organization 1 0 0 0 \n",
"Person 0 0 0 0 \n",
"PhoneNumber 0 0 0 3 \n",
"URL 0 0 0 0 \n",
"USDriversLicenseNumber 0 0 0 0 \n",
"USSocialSecurityNumber 0 0 0 0 \n",
"\n",
" Email IPAddress \\\n",
"Address 0 0 \n",
"Age 0 0 \n",
"CreditCardNumber 0 0 \n",
"DateTime 0 0 \n",
"Email 28 0 \n",
"IPAddress 0 22 \n",
"InternationalBankingAccountNumber 0 0 \n",
"O 0 0 \n",
"Organization 0 0 \n",
"Person 0 0 \n",
"PhoneNumber 0 2 \n",
"URL 0 0 \n",
"USDriversLicenseNumber 0 0 \n",
"USSocialSecurityNumber 0 0 \n",
"\n",
" InternationalBankingAccountNumber O \\\n",
"Address 0 1406 \n",
"Age 0 42 \n",
"CreditCardNumber 0 38 \n",
"DateTime 0 0 \n",
"Email 0 0 \n",
"IPAddress 0 0 \n",
"InternationalBankingAccountNumber 21 0 \n",
"O 0 19331 \n",
"Organization 0 58 \n",
"Person 0 24 \n",
"PhoneNumber 0 67 \n",
"URL 0 0 \n",
"USDriversLicenseNumber 0 2 \n",
"USSocialSecurityNumber 0 0 \n",
"\n",
" Organization Person PhoneNumber URL \\\n",
"Address 47 72 14 0 \n",
"Age 0 0 0 0 \n",
"CreditCardNumber 0 0 10 0 \n",
"DateTime 0 0 0 0 \n",
"Email 0 21 0 0 \n",
"IPAddress 0 0 0 0 \n",
"InternationalBankingAccountNumber 0 0 0 0 \n",
"O 136 39 9 0 \n",
"Organization 391 54 0 0 \n",
"Person 4 1340 0 0 \n",
"PhoneNumber 0 0 278 0 \n",
"URL 0 0 0 37 \n",
"USDriversLicenseNumber 0 0 7 0 \n",
"USSocialSecurityNumber 0 0 0 0 \n",
"\n",
" USDriversLicenseNumber \\\n",
"Address 0 \n",
"Age 0 \n",
"CreditCardNumber 0 \n",
"DateTime 0 \n",
"Email 0 \n",
"IPAddress 0 \n",
"InternationalBankingAccountNumber 0 \n",
"O 0 \n",
"Organization 0 \n",
"Person 0 \n",
"PhoneNumber 0 \n",
"URL 0 \n",
"USDriversLicenseNumber 0 \n",
"USSocialSecurityNumber 0 \n",
"\n",
" USSocialSecurityNumber \n",
"Address 0 \n",
"Age 0 \n",
"CreditCardNumber 0 \n",
"DateTime 0 \n",
"Email 0 \n",
"IPAddress 0 \n",
"InternationalBankingAccountNumber 0 \n",
"O 0 \n",
"Organization 0 \n",
"Person 0 \n",
"PhoneNumber 0 \n",
"URL 0 \n",
"USDriversLicenseNumber 0 \n",
"USSocialSecurityNumber 80 \n"
]
}
],
"source": [
"print(\"Confusion matrix:\")\n",
"print(pd.DataFrame(confmatrix, columns=entities, index=entities))"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Precision and recall\n",
" Entity Precision Recall Number of samples\n",
" Person 87.81% 97.88% 1369\n",
" Age nan% 0.00% 74\n",
" Email 100.00% 57.14% 49\n",
" URL 100.00% 100.00% 37\n",
"InternationalBankingAccountNumber 100.00% 100.00% 21\n",
" Organization 67.65% 77.58% 504\n",
" Address 93.20% 49.56% 3071\n",
"USSocialSecurityNumber 100.00% 100.00% 80\n",
" CreditCardNumber 100.00% 51.47% 136\n",
" IPAddress 91.67% 100.00% 22\n",
" DateTime 34.98% 100.00% 219\n",
"USDriversLicenseNumber nan% 0.00% 9\n",
" PhoneNumber 87.42% 79.43% 350\n",
" PII 80.19% 72.45% 5941\n",
"PII F measure: 73.42%\n"
]
}
],
"source": [
"print(\"Precision and recall\")\n",
"print(results)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.9.13 ('presidio')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "371968787ec79dd50357533864944a85029366968470cac36beb694745c2f7d6"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}

Просмотреть файл

@ -0,0 +1 @@
{"parameters": {"dataset_name": "synth_dataset_v2.json", "model_name": "Text analytics Analyzer", "labeling_scheme": "BIO", "entities_to_keep": null}, "metrics": {"pii_f": 0.7342419981999261, "Person_precision": 0.8781127129750983, "Age_precision": NaN, "Email_precision": 1.0, "URL_precision": 1.0, "InternationalBankingAccountNumber_precision": 1.0, "Organization_precision": 0.6764705882352942, "Address_precision": 0.9320269442743417, "USSocialSecurityNumber_precision": 1.0, "CreditCardNumber_precision": 1.0, "IPAddress_precision": 0.9166666666666666, "DateTime_precision": 0.3498402555910543, "USDriversLicenseNumber_precision": NaN, "PhoneNumber_precision": 0.8742138364779874, "Person_recall": 0.9788166544923301, "Age_recall": 0.0, "Email_recall": 0.5714285714285714, "URL_recall": 1.0, "InternationalBankingAccountNumber_recall": 1.0, "Organization_recall": 0.7757936507936508, "Address_recall": 0.49560403777271245, "USSocialSecurityNumber_recall": 1.0, "CreditCardNumber_recall": 0.5147058823529411, "IPAddress_recall": 1.0, "DateTime_recall": 1.0, "USDriversLicenseNumber_recall": 0.0, "PhoneNumber_recall": 0.7942857142857143, "Person": 1369, "Age": 74, "Email": 49, "URL": 37, "InternationalBankingAccountNumber": 21, "Organization": 504, "Address": 3071, "USSocialSecurityNumber": 80, "CreditCardNumber": 136, "IPAddress": 22, "DateTime": 219, "USDriversLicenseNumber": 9, "PhoneNumber": 350}, "dataset_info": null, "confusion_matrix": [[1522, 0, 0, 9, 0, 0, 0, 1406, 47, 72, 14, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 42, 0, 0, 0, 0, 0, 0], [0, 0, 70, 0, 0, 0, 0, 38, 0, 0, 10, 0, 0, 0], [0, 0, 0, 219, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 28, 0, 0, 0, 0, 21, 0, 0, 0, 0], [0, 0, 0, 0, 0, 22, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 21, 0, 0, 0, 0, 0, 0, 0], [110, 0, 0, 395, 0, 0, 0, 19331, 136, 39, 9, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 58, 391, 54, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 24, 4, 1340, 0, 0, 0, 0], [0, 0, 0, 3, 0, 2, 0, 67, 0, 0, 278, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 37, 0, 0], [0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 7, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 80]], "labels": ["Address", "Age", "CreditCardNumber", "DateTime", "Email", "IPAddress", "InternationalBankingAccountNumber", "O", "Organization", "Person", "PhoneNumber", "URL", "USDriversLicenseNumber", "USSocialSecurityNumber"]}

Просмотреть файл

@ -3,6 +3,7 @@ from .base_model import BaseModel
from .crf_model import CRFModel
from .presidio_analyzer_wrapper import PresidioAnalyzerWrapper
from .presidio_recognizer_wrapper import PresidioRecognizerWrapper
from .text_analytics_wrapper import TextAnalyticsWrapper
from .spacy_model import SpacyModel
from .stanza_model import StanzaModel
from .flair_model import FlairModel