diff --git a/README.md b/README.md index 295447f..15d7fb4 100644 --- a/README.md +++ b/README.md @@ -72,7 +72,7 @@ Furthermore, it tokenizes the data, creates tags (either IO/BIO/BILUO) and spans Once data is generated, it could be split into train/test/validation sets while ensuring that each template only exists in one set. -See [this notebook for more details](notebooks/3_Split_by_pattern_%23.ipynb). +See [this notebook for more details](notebooks/3_Split_by_pattern_number.ipynb). ## 2. Data representation diff --git a/VERSION b/VERSION index d917d3e..b1e80bb 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.1.2 +0.1.3 diff --git a/notebooks/1_Generate_data.ipynb b/notebooks/1_Generate_data.ipynb index 98b5686..7272802 100644 --- a/notebooks/1_Generate_data.ipynb +++ b/notebooks/1_Generate_data.ipynb @@ -2,8 +2,10 @@ "cells": [ { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 1, + "metadata": { + "is_executing": true + }, "outputs": [], "source": [ "# install presidio via pip if not yet installed\n", @@ -14,8 +16,9 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": { + "is_executing": true, "scrolled": true }, "outputs": [], @@ -81,9 +84,34 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 3, + "metadata": { + "is_executing": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Sampling: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 9149.88it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "My name is Joshua Jackson\n", + "[{\"value\": \"Joshua Jackson\", \"start\": 11, \"end\": 25, \"type\": \"name\"}]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], "source": [ "sentence_templates = [\n", " \"My name is {{name}}\",\n", @@ -126,8 +154,9 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": { + "is_executing": true, "scrolled": true }, "outputs": [], @@ -165,13 +194,228 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": { + "is_executing": true, "pycharm": { "name": "#%%\n" } }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
numbergendernationalityprefixfirst_namemiddle_initiallast_namestreet_namecitystate_abbr...companydomain_namepersonnamefirst_name_femalefirst_name_maleprefix_femaleprefix_malelast_name_femalelast_name_male
01femaleCzechMrs.MarieJHamanováP.O. Box 255KangerlussuaqQE...Simple SolutionsMarathonDancing.glMarie J HamanováMarie J HamanováMarieMrs.Hamanová
12femaleFrenchMs.PatriciaGDesrosiersAvenida Noruega 42Vila RealVR...Formula GrayLostMillions.com.ptPatricia DesrosiersPatricia DesrosiersPatriciaMs.Desrosiers
23femaleAmericanMs.DebraONeal1659 Hoog StBrakpanGA...Dahlkemper'sMediumTube.co.zaDebra O NealDebra O NealDebraMs.Neal
34maleFrenchMr.PeverellCRacine183 Epimenidou StreetLimassolLI...QuickbizImproveLook.com.cyPeverell RacinePeverell RacinePeverellMr.Racine
45femaleSlovenianMrs.IolandaSTratnikKaru põik 61PärnuPR...Dubrow's CafeteriaPostTan.com.eeIolanda TratnikIolanda TratnikIolandaMrs.Tratnik
\n", + "

5 rows × 37 columns

\n", + "
" + ], + "text/plain": [ + " number gender nationality prefix first_name middle_initial last_name \\\n", + "0 1 female Czech Mrs. Marie J Hamanová \n", + "1 2 female French Ms. Patricia G Desrosiers \n", + "2 3 female American Ms. Debra O Neal \n", + "3 4 male French Mr. Peverell C Racine \n", + "4 5 female Slovenian Mrs. Iolanda S Tratnik \n", + "\n", + " street_name city state_abbr ... company \\\n", + "0 P.O. Box 255 Kangerlussuaq QE ... Simple Solutions \n", + "1 Avenida Noruega 42 Vila Real VR ... Formula Gray \n", + "2 1659 Hoog St Brakpan GA ... Dahlkemper's \n", + "3 183 Epimenidou Street Limassol LI ... Quickbiz \n", + "4 Karu põik 61 Pärnu PR ... Dubrow's Cafeteria \n", + "\n", + " domain_name person name \\\n", + "0 MarathonDancing.gl Marie J Hamanová Marie J Hamanová \n", + "1 LostMillions.com.pt Patricia Desrosiers Patricia Desrosiers \n", + "2 MediumTube.co.za Debra O Neal Debra O Neal \n", + "3 ImproveLook.com.cy Peverell Racine Peverell Racine \n", + "4 PostTan.com.ee Iolanda Tratnik Iolanda Tratnik \n", + "\n", + " first_name_female first_name_male prefix_female prefix_male \\\n", + "0 Marie Mrs. \n", + "1 Patricia Ms. \n", + "2 Debra Ms. \n", + "3 Peverell Mr. \n", + "4 Iolanda Mrs. \n", + "\n", + " last_name_female last_name_male \n", + "0 Hamanová \n", + "1 Desrosiers \n", + "2 Neal \n", + "3 Racine \n", + "4 Tratnik \n", + "\n", + "[5 rows x 37 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Read FakeNameGenerator CSV\n", "fake_name_generator_df = pd.read_csv(fake_name_generator_file)\n", @@ -190,8 +434,9 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": { + "is_executing": true, "scrolled": true }, "outputs": [], @@ -209,8 +454,10 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 7, + "metadata": { + "is_executing": true + }, "outputs": [], "source": [ "fake.add_provider(IpAddressProvider) # Both Ipv4 and IPv6 IP addresses\n", @@ -235,8 +482,9 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": { + "is_executing": true, "pycharm": { "name": "#%%\n" } @@ -270,13 +518,36 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": { + "is_executing": true, "pycharm": { "name": "#%%\n" } }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Sampling: 100%|███████████████████████████████████████████████████████████████████████████████████████████| 1500/1500 [00:00<00:00, 17987.56it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\"fake\": \"Title VII of the Civil Rights Act of 2005 protects individuals against employment discrimination on the basis of race and color as well as national origin, sex, or religion.\\n\", \"spans\": [{\"value\": \"2005\", \"start\": 37, \"end\": 41, \"type\": \"year\"}], \"template\": \"Title VII of the Civil Rights Act of {{year}} protects individuals against employment discrimination on the basis of race and color as well as national origin, sex, or religion.\\n\", \"template_id\": 190}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], "source": [ "sentence_templates = PresidioDataGenerator.read_template_file(templates_file_path)\n", "fake_records = data_generator.generate_fake_data(\n", @@ -296,11 +567,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": { + "is_executing": true, "scrolled": true }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total: 1500\n", + "Avg # of records per template: 7.142857142857143\n", + "Median # of records per template: 7.0\n", + "Std: 2.5872528966106905\n" + ] + } + ], "source": [ "count_per_template_id = Counter([sample.template_id for sample in fake_records])\n", "\n", @@ -323,13 +606,65 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": { + "is_executing": true, "pycharm": { "name": "#%%\n" } }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "Counter({'organization': 257,\n", + " 'first_name': 244,\n", + " 'person': 238,\n", + " 'city': 235,\n", + " 'address': 209,\n", + " 'street_name': 164,\n", + " 'name': 162,\n", + " 'country': 154,\n", + " 'credit_card_number': 152,\n", + " 'phone_number': 121,\n", + " 'last_name': 119,\n", + " 'building_number': 110,\n", + " 'age': 72,\n", + " 'secondary_address': 64,\n", + " 'year': 58,\n", + " 'nationality': 55,\n", + " 'postcode': 49,\n", + " 'zipcode': 45,\n", + " 'url': 39,\n", + " 'email': 39,\n", + " 'name_female': 37,\n", + " 'job': 33,\n", + " 'first_name_male': 31,\n", + " 'name_male': 29,\n", + " 'prefix_male': 28,\n", + " 'date_of_birth': 24,\n", + " 'iban': 22,\n", + " 'date_time': 21,\n", + " 'prefix_female': 21,\n", + " 'day_of_week': 16,\n", + " 'state_abbr': 15,\n", + " 'last_name_male': 15,\n", + " 'prefix': 12,\n", + " 'ip_address': 11,\n", + " 'ssn': 11,\n", + " 'nation_plural': 9,\n", + " 'nation_woman': 8,\n", + " 'first_name_nonbinary': 6,\n", + " 'us_driver_license': 6,\n", + " 'first_name_female': 3,\n", + " 'last_name_female': 3})" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "count_per_entity = Counter()\n", "for record in fake_records:\n", @@ -351,8 +686,9 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": { + "is_executing": true, "pycharm": { "name": "#%%\n" } @@ -421,9 +757,22 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 13, + "metadata": { + "is_executing": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{\"fake\": \"Title VII of the Civil Rights Act of 2005 protects individuals against employment discrimination on the basis of race and color as well as national origin, sex, or religion.\\n\", \"spans\": [{\"value\": \"2005\", \"start\": 37, \"end\": 41, \"type\": \"DATE_TIME\"}], \"template\": \"Title VII of the Civil Rights Act of {{DATE_TIME}} protects individuals against employment discrimination on the basis of race and color as well as national origin, sex, or religion.\\n\", \"template_id\": 190}" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "fake_records[0]" ] @@ -437,13 +786,41 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": { + "is_executing": true, "pycharm": { "name": "#%%\n" } }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[('PERSON', 887),\n", + " ('STREET_ADDRESS', 596),\n", + " ('GPE', 404),\n", + " ('ORGANIZATION', 257),\n", + " ('CREDIT_CARD', 152),\n", + " ('PHONE_NUMBER', 121),\n", + " ('DATE_TIME', 119),\n", + " ('TITLE', 94),\n", + " ('NRP', 72),\n", + " ('AGE', 72),\n", + " ('ZIP_CODE', 45),\n", + " ('DOMAIN_NAME', 39),\n", + " ('EMAIL_ADDRESS', 39),\n", + " ('IBAN_CODE', 22),\n", + " ('IP_ADDRESS', 11),\n", + " ('US_SSN', 11),\n", + " ('US_DRIVER_LICENSE', 6)]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "\n", "count_per_entity_new = Counter()\n", @@ -463,13 +840,51 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": { + "is_executing": true, "pycharm": { "name": "#%%\n" } }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/1500 [00:00 0]\n", "print(\"Kept {} samples after removal of non-tagged samples\".format(len(train_tagged)))" @@ -140,45 +95,13 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": { "pycharm": { "name": "#%%\n" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Entities found in training set:\n" - ] - }, - { - "data": { - "text/plain": [ - "{'ADDRESS',\n", - " 'CREDIT_CARD',\n", - " 'DATE_TIME',\n", - " 'DOMAIN_NAME',\n", - " 'EMAIL_ADDRESS',\n", - " 'IBAN_CODE',\n", - " 'IP_ADDRESS',\n", - " 'LOCATION',\n", - " 'O',\n", - " 'ORGANIZATION',\n", - " 'PERSON',\n", - " 'PHONE_NUMBER',\n", - " 'PREFIX',\n", - " 'TITLE',\n", - " 'US_SSN'}" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "print(\"Entities found in training set:\")\n", "entities = []\n", @@ -206,16 +129,7 @@ "name": "#%%\n" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Skipping illegal span None, text=ΜΟΝΗ ΑΓΙΩΝ ΑΝΑΡΓΥΡΩΝ\n", - "Skipping illegal span None, text=U.N\n" - ] - } - ], + "outputs": [], "source": [ "spacy_train = InputSample.create_spacy_dataset(\n", " dataset=train_tagged, output_path=\"train.spacy\"\n", @@ -281,9 +195,9 @@ ], "metadata": { "kernelspec": { - "display_name": "presidio", + "display_name": "presidio-evaluator", "language": "python", - "name": "presidio" + "name": "presidio-evaluator" }, "language_info": { "codemirror_mode": { @@ -295,9 +209,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.12" + "version": "3.9.18" } }, "nbformat": 4, - "nbformat_minor": 2 -} \ No newline at end of file + "nbformat_minor": 4 +} diff --git a/notebooks/models/Evaluate CRF models.ipynb b/notebooks/models/Evaluate CRF models.ipynb index fecf820..6e10f36 100644 --- a/notebooks/models/Evaluate CRF models.ipynb +++ b/notebooks/models/Evaluate CRF models.ipynb @@ -39,6 +39,16 @@ "%autoreload 2" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "aee00770-a972-4a19-b423-1724214cc88c", + "metadata": {}, + "outputs": [], + "source": [ + "#!pip install sklearn_crfsuite" + ] + }, { "cell_type": "markdown", "id": "a0d2d772", @@ -58,8 +68,9 @@ }, "outputs": [], "source": [ - "DATA_DATE = \"Jan-15-2022\"\n", - "dataset = InputSample.read_dataset_json(\"../../data/test_{}.json\".format(DATA_DATE))\n", + "DATA_DATE = \"Dec-27-2023\" # Date when the split to train/test notebook was ran\n", + "dataset_name = \"../../data/test_{}.json\".format(DATA_DATE)\n", + "dataset = InputSample.read_dataset_json(dataset_name)\n", "print(len(dataset))" ] }, @@ -76,7 +87,7 @@ "source": [ "entity_counter = Counter()\n", "for sample in dataset:\n", - " for t>ag in sample.tags:\n", + " for tag in sample.tags:\n", " entity_counter[tag] += 1" ] }, @@ -257,7 +268,7 @@ "metadata": {}, "outputs": [], "source": [ - "fps_df = ModelError.get_fps_dataframe(errors, entity=[\"GPE\"])\n", + "fps_df = ModelError.get_fps_dataframe(errors, entity=[\"PERSON\"])\n", "fps_df[[\"full_text\", \"token\", \"prediction\"]]" ] }, @@ -276,7 +287,7 @@ "metadata": {}, "outputs": [], "source": [ - "ModelError.most_common_fn_tokens(errors, n=50, entity=[\"PERSON\"])" + "ModelError.most_common_fn_tokens(errors, n=50, entity=[\"ORGANIZATION\"])" ] }, { @@ -325,13 +336,21 @@ "metadata": {}, "outputs": [], "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cf3e4646-ca93-44c5-a998-cd77f4bf2708", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "presidio", + "display_name": "presidio-evaluator", "language": "python", - "name": "presidio" + "name": "presidio-evaluator" }, "language_info": { "codemirror_mode": { @@ -343,9 +362,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.12" + "version": "3.9.18" } }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} diff --git a/notebooks/models/Evaluate azure text analytics.ipynb b/notebooks/models/Evaluate azure text analytics.ipynb index f7f122d..5e11369 100644 --- a/notebooks/models/Evaluate azure text analytics.ipynb +++ b/notebooks/models/Evaluate azure text analytics.ipynb @@ -205,7 +205,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3.9.13 ('presidio')", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -219,9 +219,8 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.9.18" }, - "orig_nbformat": 4, "vscode": { "interpreter": { "hash": "371968787ec79dd50357533864944a85029366968470cac36beb694745c2f7d6" @@ -229,5 +228,5 @@ } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/notebooks/models/Evaluate flair models.ipynb b/notebooks/models/Evaluate flair models.ipynb index 949906f..22b6d39 100644 --- a/notebooks/models/Evaluate flair models.ipynb +++ b/notebooks/models/Evaluate flair models.ipynb @@ -35,6 +35,16 @@ "%autoreload 2" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "a0c3285c-06a2-4361-aec2-8375496f75b3", + "metadata": {}, + "outputs": [], + "source": [ + "#!pip install flair" + ] + }, { "cell_type": "markdown", "id": "f036de59", @@ -111,15 +121,14 @@ "metadata": {}, "outputs": [], "source": [ - "flair_ner = \"ner-english\"\n", - "flair_ner_fast = \"ner-english-fast\"\n", - "flair_ontonotes_fast = \"ner-english-ontonotes-fast\"\n", - "flair_ontonotes_large = \"ner-english-ontonotes-large\"\n", + "flair_ner = \"flair/ner-english\"\n", + "flair_ner_fast = \"flair/ner-english-fast\"\n", + "flair_ontonotes_fast = \"flair/ner-english-ontonotes-fast\"\n", + "flair_ontonotes_large = \"flair/ner-english-ontonotes-large\"\n", "models = [\n", " flair_ner,\n", " flair_ner_fast,\n", " flair_ontonotes_fast,\n", - " flair_ner_fast,\n", " flair_ontonotes_large,\n", "]" ] @@ -312,9 +321,9 @@ ], "metadata": { "kernelspec": { - "display_name": "presidio", + "display_name": "presidio-evaluator", "language": "python", - "name": "presidio" + "name": "presidio-evaluator" }, "language_info": { "codemirror_mode": { @@ -326,7 +335,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.12" + "version": "3.9.18" } }, "nbformat": 4, diff --git a/notebooks/models/Evaluate spacy models.ipynb b/notebooks/models/Evaluate spacy models.ipynb index 1f69f59..d9a7047 100644 --- a/notebooks/models/Evaluate spacy models.ipynb +++ b/notebooks/models/Evaluate spacy models.ipynb @@ -109,7 +109,10 @@ "metadata": {}, "outputs": [], "source": [ - "models = [\"en_core_web_sm\", \"en_core_web_lg\", \"en_core_web_trf\"]" + "models = [\"en_core_web_sm\", \"en_core_web_lg\", \"en_core_web_trf\"]\n", + "\n", + "# If needed, install models using `python -m spacy download X` where x is the model name, or use spacy.cli.download:\n", + "#spacy.cli.download(\"en_core_web_trf\")" ] }, { @@ -334,9 +337,9 @@ ], "metadata": { "kernelspec": { - "display_name": "presidio", + "display_name": "presidio-evaluator", "language": "python", - "name": "presidio" + "name": "presidio-evaluator" }, "language_info": { "codemirror_mode": { @@ -348,9 +351,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.12" + "version": "3.9.18" } }, "nbformat": 4, - "nbformat_minor": 2 -} \ No newline at end of file + "nbformat_minor": 4 +} diff --git a/presidio_evaluator/data_generator/presidio_data_generator.py b/presidio_evaluator/data_generator/presidio_data_generator.py index 80633db..fe1a9c3 100644 --- a/presidio_evaluator/data_generator/presidio_data_generator.py +++ b/presidio_evaluator/data_generator/presidio_data_generator.py @@ -170,6 +170,8 @@ class PresidioDataGenerator: new_provider = BaseProvider(self.faker) setattr(new_provider, new_name, original) + setattr(new_provider, new_name.lower(), original) # avoid case sensitivity + setattr(new_provider, new_name.upper(), original) # avoid case sensitivity self.faker.add_provider(new_provider) @staticmethod diff --git a/presidio_evaluator/data_generator/presidio_pseudonymize.py b/presidio_evaluator/data_generator/presidio_pseudonymize.py index 9e859cf..334bce7 100644 --- a/presidio_evaluator/data_generator/presidio_pseudonymize.py +++ b/presidio_evaluator/data_generator/presidio_pseudonymize.py @@ -25,7 +25,7 @@ class PresidioPseudonymization(PresidioDataGenerator): self.add_provider_alias("credit_card_number", "CREDIT_CARD") self.add_provider_alias("iban", "IBAN_CODE") self.add_provider_alias("phone_number", "PHONE_NUMBER") - self.add_provider_alias("url", "DOMAIN_NAME") + self.add_provider_alias("url", "URL") self.add_provider_alias("ssn", "US_SSN") self.add_provider_alias("email", "EMAIL_ADDRESS") self.add_provider_alias("date_time", "DATE_TIME") diff --git a/presidio_evaluator/evaluation/evaluator.py b/presidio_evaluator/evaluation/evaluator.py index 6532657..0799198 100644 --- a/presidio_evaluator/evaluation/evaluator.py +++ b/presidio_evaluator/evaluation/evaluator.py @@ -1,6 +1,8 @@ +import copy from collections import Counter from typing import List, Optional, Dict from pathlib import Path +import string import numpy as np from tqdm import tqdm @@ -39,7 +41,6 @@ class Evaluator: self.entities_to_keep = self.model.entities def compare(self, input_sample: InputSample, prediction: List[str]): - """ Compares ground truth tags (annotation) and predicted (prediction) :param input_sample: input sample containing list of tags with scheme @@ -71,6 +72,9 @@ class Evaluator: if self.entities_to_keep: prediction = self._adjust_per_entities(prediction) new_annotation = self._adjust_per_entities(new_annotation) + + skip_words = self.get_skip_words() + for i in range(0, len(new_annotation)): results[(new_annotation[i], prediction[i])] += 1 @@ -81,6 +85,10 @@ class Evaluator: # check if there was an error is_error = new_annotation[i] != prediction[i] + if str(tokens[i]).lower().strip() in skip_words: + is_error = False + results[(new_annotation[i], prediction[i])] -= 1 + if is_error: if prediction[i] == "O": mistakes.append( @@ -151,7 +159,6 @@ class Evaluator: f"Mapping entity values using this dictionary: {self.model.entity_mapping}" ) for sample in tqdm(dataset, desc=f"Evaluating {self.model.__class__}"): - # Align tag values to the ones expected by the model self.model.align_entity_types(sample) @@ -345,13 +352,13 @@ class Evaluator: if np.isnan(precision) or np.isnan(recall) or (precision == 0 and recall == 0): return np.nan - return ((1 + beta ** 2) * precision * recall) / ( - ((beta ** 2) * precision) + recall + return ((1 + beta**2) * precision * recall) / ( + ((beta**2) * precision) + recall ) class Plotter: """ - Plot scores (f2, precision, recall) and errors (false-positivies, false-negatives) + Plot scores (f2, precision, recall) and errors (false-positivies, false-negatives) for a PII detection model evaluated via Evaluator :param model: Instance of a fitted model (of base type BaseModel) @@ -362,7 +369,9 @@ class Evaluator: which gives more or less weight to precision vs. recall """ - def __init__(self, model, results, output_folder: Path, model_name: str, beta: float): + def __init__( + self, model, results, output_folder: Path, model_name: str, beta: float + ): self.model = model self.results = results self.output_folder = output_folder @@ -372,41 +381,66 @@ class Evaluator: def plot_scores(self) -> None: """ - Plots per-entity recall, precision, or F2 score for evaluated model. - :param plot_type: which metric to graph (default is F2 score) + Plots per-entity recall, precision, or F2 score for evaluated model. """ scores = {} - scores['entity'] = list(self.results.entity_recall_dict.keys()) - scores['recall'] = list(self.results.entity_recall_dict.values()) - scores['precision'] = list(self.results.entity_precision_dict.values()) - scores['count'] = list(self.results.n_dict.values()) - scores[f"f{self.beta}_score"] = [Evaluator.f_beta(precision=precision, recall=recall, beta=self.beta) - for recall, precision in zip(scores['recall'], scores['precision'])] + + entity_recall_dict = copy.deepcopy(self.results.entity_recall_dict) + entity_precision_dict = copy.deepcopy(self.results.entity_precision_dict) + + scores["entity"] = list(entity_recall_dict.keys()) + scores["recall"] = list(entity_recall_dict.values()) + scores["precision"] = list(entity_precision_dict.values()) + scores["count"] = list(self.results.n_dict.values()) + + scores[f"f{self.beta}_score"] = [ + Evaluator.f_beta(precision=precision, recall=recall, beta=self.beta) + for recall, precision in zip(scores["recall"], scores["precision"]) + ] + + # Add PII detection rates + scores["entity"].append("PII") + scores["recall"].append(self.results.pii_recall) + scores["precision"].append(self.results.pii_precision) + scores["count"].append(self.results.n) + scores[f"f{self.beta}_score"].append(self.results.pii_f) + df = pd.DataFrame(scores) - df['model'] = self.model_name + df["model"] = self.model_name self._plot(df, plot_type="f2_score") self._plot(df, plot_type="precision") self._plot(df, plot_type="recall") def _plot(self, df, plot_type) -> None: - fig = px.bar(df, text_auto=".2", y='entity', orientation="h", - x=plot_type, color='count', barmode='group', title=f"Per-entity {plot_type} for {self.model_name}") - fig.update_layout(barmode='group', yaxis={ - 'categoryorder': 'total ascending'}) + fig = px.bar( + df, + text_auto=".2", + y="entity", + orientation="h", + x=plot_type, + color="count", + barmode="group", + height=30*len(set(df["entity"])), + title=f"Per-entity {plot_type} for {self.model_name}", + ) + fig.update_layout( + barmode="group", yaxis={"categoryorder": "total ascending"} + ) fig.update_layout(yaxis_title=f"{plot_type}", xaxis_title="PII Entity") - fig.update_traces(textfont_size=12, textangle=0, - textposition="outside", cliponaxis=False) + fig.update_traces( + textfont_size=12, textangle=0, textposition="outside", cliponaxis=False + ) fig.update_layout( plot_bgcolor="#FFF", xaxis=dict( title="PII entity", linecolor="#BCCCDC", # Sets color of X-axis line - showgrid=False # Removes X-axis grid lines + showgrid=False, # Removes X-axis grid lines ), yaxis=dict( title=f"{plot_type}", linecolor="#BCCCDC", # Sets color of X-axis line - showgrid=False # Removes X-axis grid lines + showgrid=False, # Removes X-axis grid lines ), ) fig.show() @@ -419,47 +453,100 @@ class Evaluator: for entity in self.model.entity_mapping.values(): fps_df = ModelError.get_fps_dataframe(self.errors, entity=[entity]) if fps_df is not None: - fps_path = self.output_folder / \ - f"{self.model_name}-{entity}-fps.csv" + fps_path = ( + self.output_folder / f"{self.model_name}-{entity}-fps.csv" + ) fps_df.to_csv(fps_path) fps_frames.append(fps_path) fns_df = ModelError.get_fns_dataframe(self.errors, entity=[entity]) if fns_df is not None: - fns_path = self.output_folder / \ - f"{self.model_name}-{entity}-fns.csv" + fns_path = ( + self.output_folder / f"{self.model_name}-{entity}-fns.csv" + ) fns_df.to_csv(fns_path) fns_frames.append(fns_path) def group_tokens(df): - return df.groupby(['token', 'annotation']).size().to_frame( - ).sort_values([0], ascending=False).head(3).reset_index() + return ( + df.groupby(["token", "annotation"]) + .size() + .to_frame() + .sort_values([0], ascending=False) + .head(3) + .reset_index() + ) fps_tokens_df = pd.concat( - [group_tokens(pd.read_csv(df_path)) for df_path in fps_frames]) + [group_tokens(pd.read_csv(df_path)) for df_path in fps_frames] + ) fns_tokens_df = pd.concat( - [group_tokens(pd.read_csv(df_path)) for df_path in fns_frames]) + [group_tokens(pd.read_csv(df_path)) for df_path in fns_frames] + ) def generate_graph(title, tokens_df): - fig = px.histogram(tokens_df, x=0, y="token", orientation='h', color='annotation', - title=f"Most common {title} for {self.model_name}") + fig = px.histogram( + tokens_df, + x=0, + y="token", + orientation="h", + color="annotation", + title=f"Most common {title} for {self.model_name}", + ) fig.update_layout(yaxis_title=f"count", xaxis_title="PII Entity") - fig.update_traces(textfont_size=12, textangle=0, - textposition="outside", cliponaxis=False) + fig.update_traces( + textfont_size=12, + textangle=0, + textposition="outside", + cliponaxis=False, + ) fig.update_layout( plot_bgcolor="#FFF", xaxis=dict( title="Count", linecolor="#BCCCDC", # Sets color of X-axis line - showgrid=False # Removes X-axis grid lines + showgrid=False, # Removes X-axis grid lines ), yaxis=dict( title=f"Tokens", linecolor="#BCCCDC", # Sets color of X-axis line - showgrid=False # Removes X-axis grid lines + showgrid=False, # Removes X-axis grid lines ), ) - fig.update_layout(yaxis={'categoryorder': 'total ascending'}) + fig.update_layout(yaxis={"categoryorder": "total ascending"}) fig.show() + generate_graph(title="false-negatives", tokens_df=fns_tokens_df) generate_graph(title="false-positives", tokens_df=fps_tokens_df) + + @staticmethod + def get_skip_words(): + skip_words = [x for x in string.punctuation] + skip_words.extend( + [ + "\n", + "\n\n", + "\n\n\n", + ">>", + ">>>", + ">>>>", + "street", + "st.", + "st", + "de", + "rue", + "via", + "and", + "or", + "do", + "as", + "of", + "day", + "address", + "country", + "state", + "city", + ] + ) + + return skip_words diff --git a/presidio_evaluator/models/base_model.py b/presidio_evaluator/models/base_model.py index bd07658..6f27d96 100644 --- a/presidio_evaluator/models/base_model.py +++ b/presidio_evaluator/models/base_model.py @@ -31,6 +31,7 @@ class BaseModel(ABC): self.labeling_scheme = labeling_scheme self.entity_mapping = entity_mapping self.verbose = verbose + self.name = self.__class__.__name__ @abstractmethod def predict(self, sample: InputSample, **kwargs) -> List[str]: diff --git a/presidio_evaluator/models/crf_model.py b/presidio_evaluator/models/crf_model.py index 5a25011..5a4462c 100644 --- a/presidio_evaluator/models/crf_model.py +++ b/presidio_evaluator/models/crf_model.py @@ -85,7 +85,7 @@ class CRFModel(BaseModel): y_train = [self.sent2labels(s) for s in sentences] return X_train, y_train - def predict(self, sample: InputSample) -> List[str]: + def predict(self, sample: InputSample, **kwargs) -> List[str]: tags = CRFModel.crf_predict(sample, self.model) if len(tags) != len(sample.tokens): diff --git a/presidio_evaluator/models/flair_model.py b/presidio_evaluator/models/flair_model.py index bf25b4e..da382a3 100644 --- a/presidio_evaluator/models/flair_model.py +++ b/presidio_evaluator/models/flair_model.py @@ -48,7 +48,7 @@ class FlairModel(BaseModel): self.spacy_tokenizer = SpacyTokenizer(model=spacy.load("en_core_web_sm")) - def predict(self, sample: InputSample) -> List[str]: + def predict(self, sample: InputSample, **kwargs) -> List[str]: sentence = Sentence(text=sample.full_text, use_tokenizer=self.spacy_tokenizer) self.model.predict(sentence) diff --git a/presidio_evaluator/models/presidio_analyzer_wrapper.py b/presidio_evaluator/models/presidio_analyzer_wrapper.py index 9598b2b..6372ad2 100644 --- a/presidio_evaluator/models/presidio_analyzer_wrapper.py +++ b/presidio_evaluator/models/presidio_analyzer_wrapper.py @@ -91,23 +91,28 @@ class PresidioAnalyzerWrapper(BaseModel): "PHONE_NUMBER": "PHONE_NUMBER", "BIRTHDAY": "DATE_TIME", "DATE_TIME": "DATE_TIME", - "DOMAIN_NAME": "DOMAIN_NAME", + "DOMAIN_NAME": "URL", + "TIME" : "DATE_TIME", + "DATE" : "DATE_TIME", "CITY": "LOCATION", "ADDRESS": "LOCATION", + "STREET_ADDRESS": "LOCATION", "NATIONALITY": "LOCATION", "LOCATION": "LOCATION", "IBAN_CODE": "IBAN_CODE", - "URL": "DOMAIN_NAME", + "URL": "URL", "US_SSN": "US_SSN", "IP_ADDRESS": "IP_ADDRESS", - "ORGANIZATION": "ORG", + "ORGANIZATION": "ORGANIZATION", + "ORG": "ORGANIZATION", "US_DRIVER_LICENSE": "US_DRIVER_LICENSE", - "NRP": "NRP", - "TITLE": "O", # not supported - "PREFIX": "O", # not supported - "STREET_ADDRESS": "O", # not supported - "ZIP_CODE": "O", # not supported - "AGE": "O", # not supported + "NRP": "LOCATION", + "NORP": "LOCATION", + "ID": "ID", + "TITLE": "O", # not supported through spaCy + "PREFIX": "O", # not supported through spaCy + "ZIP_CODE": "O", # not supported through spaCy + "AGE": "O", # not supported through spaCy "O": "O", } diff --git a/presidio_evaluator/models/spacy_model.py b/presidio_evaluator/models/spacy_model.py index e919ccf..0ed30ea 100644 --- a/presidio_evaluator/models/spacy_model.py +++ b/presidio_evaluator/models/spacy_model.py @@ -31,7 +31,7 @@ class SpacyModel(BaseModel): else: self.model = model - def predict(self, sample: InputSample) -> List[str]: + def predict(self, sample: InputSample, **kwargs) -> List[str]: """ Predict a list of tags for an inpuit sample. :param sample: InputSample diff --git a/presidio_evaluator/models/stanza_model.py b/presidio_evaluator/models/stanza_model.py index 9dd6a01..2d0d1cb 100644 --- a/presidio_evaluator/models/stanza_model.py +++ b/presidio_evaluator/models/stanza_model.py @@ -51,7 +51,7 @@ class StanzaModel(SpacyModel): entity_mapping=entity_mapping, ) - def predict(self, sample: InputSample) -> List[str]: + def predict(self, sample: InputSample, **kwargs) -> List[str]: """ Predict the tags using a stanza model. diff --git a/presidio_evaluator/models/text_analytics_wrapper.py b/presidio_evaluator/models/text_analytics_wrapper.py index b353c13..42fd308 100644 --- a/presidio_evaluator/models/text_analytics_wrapper.py +++ b/presidio_evaluator/models/text_analytics_wrapper.py @@ -48,8 +48,7 @@ class TextAnalyticsWrapper(BaseModel): ) return text_analytics_client - - def predict(self, sample: InputSample) -> List[str]: + def predict(self, sample: InputSample, **kwargs) -> List[str]: documents = [sample.full_text] response = self.ta_client.recognize_pii_entities(documents, language="en") diff --git a/pyproject.toml b/pyproject.toml index 94beda4..a0ec05e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,26 +4,23 @@ version = "0.1.0" description = "" authors = ["Omri Mendels "] readme = "README.md" +include = [{ path= "presidio_evaluator/data_generator/raw_data/*"}] [tool.poetry.dependencies] python = "^3.9" -spacy = ">=3.2.0, <4.0.0" -numpy = ">=1.20.2,<2.0.0" -jupyter = ">=1" -pandas = ">=1.2.4,<2.0.0" -tqdm = ">=4.60.0,<5.0.0" -haikunator = ">=2.1.0,<3.0.0" -schwifty = ">=2023.11.2,<2024.0.0" -faker = ">=9.6.0,<10.0.0" -scikit-learn = ">1.3.2,<2.0.0" -pytest = ">=6.2.3" +spacy = "^3.5.0" +numpy = "^1.22" +pandas = "^2.1.4" +tqdm = "^4.60.0" +faker = "^21.0" +scikit-learn = "^1.3.2" presidio-analyzer = "^2.2.351" presidio-anonymizer = "^2.2.351" -requests = ">=2.25.1" -xmltodict = ">=0.12.0" +requests = "^2.25" +xmltodict = "^0.12.0" python-dotenv = "^1.0.0" plotly = "^5.18.0" -azure-ai-textanalytics = ">=5.3.0" +azure-ai-textanalytics = "^5.3.0" en_core_web_sm = {url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz"} en_core_web_lg = {url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1.tar.gz"} diff --git a/setup.py b/setup.py index e7005b0..4566045 100644 --- a/setup.py +++ b/setup.py @@ -1,54 +1,57 @@ -from setuptools import setup, find_packages -import os.path - -# read the contents of the README file +# -*- coding: utf-8 -*- +from setuptools import setup +import os from os import path this_directory = path.abspath(path.dirname(__file__)) with open(path.join(this_directory, "README.md"), encoding="utf-8") as f: long_description = f.read() - # print(long_description) with open(os.path.join(this_directory, "VERSION")) as version_file: - __version__ = version_file.read().strip() + version = version_file.read().strip() + + +packages = [ + "presidio_evaluator", + "presidio_evaluator.data_generator", + "presidio_evaluator.data_generator.faker_extensions", + "presidio_evaluator.dataset_formatters", + "presidio_evaluator.evaluation", + "presidio_evaluator.experiment_tracking", + "presidio_evaluator.models", +] + +package_data = {"": ["*"], "presidio_evaluator.data_generator": ["raw_data/*"]} + +install_requires = [ + "azure-ai-textanalytics>=5.3.0,<6.0.0", + "en_core_web_lg @ " + "https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1.tar.gz", + "en_core_web_sm @ " + "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz", + "faker>=21.0,<22.0", + "numpy>=1.22,<2.0", + "pandas>=2.1.4,<3.0.0", + "plotly>=5.18.0,<6.0.0", + "presidio-analyzer>=2.2.351,<3.0.0", + "presidio-anonymizer>=2.2.351,<3.0.0", + "python-dotenv>=1.0.0,<2.0.0", + "requests>=2.25,<3.0", + "scikit-learn>=1.3.2,<2.0.0", + "spacy>=3.5.0,<4.0.0", + "tqdm>=4.60.0,<5.0.0", + "xmltodict>=0.12.0,<0.13.0", +] setup( name="presidio-evaluator", long_description=long_description, long_description_content_type="text/markdown", - version=__version__, - packages=find_packages(exclude=["tests"]), url="https://www.github.com/microsoft/presidio-research", + version=version, license="MIT", - description="PII dataset generator, model evaluator for Presidio and PII data in general", # noqa - data_files=[ - ( - "presidio_evaluator/data_generator/raw_data", - [ - "presidio_evaluator/data_generator/raw_data/FakeNameGenerator.com_3000.csv", # noqa - "presidio_evaluator/data_generator/raw_data/templates.txt", - "presidio_evaluator/data_generator/raw_data/companies_and_organizations.csv", - "presidio_evaluator/data_generator/raw_data/nationalities.csv", - "presidio_evaluator/data_generator/raw_data/us_driver_licenses.csv", - ], - ) - ], - include_package_data=True, - install_requires=[ - "presidio_analyzer", - "presidio_anonymizer", - "spacy>=3.0.0", - "requests", - "numpy", - "pandas", - "tqdm>=4.32.1", - "jupyter>=1.0.0", - "pytest>=4.6.2", - "haikunator", - "schwifty", - "faker", - "sklearn_crfsuite", - "python-dotenv", - "azure-ai-textanalytics==5.2.0" - ], -) + packages=packages, + package_data=package_data, + install_requires=install_requires, + python_requires=">=3.8,<4.0", +) \ No newline at end of file diff --git a/tests/test_evaluator.py b/tests/test_evaluator.py index 8319e05..a5e4cec 100644 --- a/tests/test_evaluator.py +++ b/tests/test_evaluator.py @@ -345,3 +345,22 @@ def test_align_entity_types_wrong_mapping_exception(): Evaluator.align_entity_types( input_samples=[sample1], entities_mapping=entities_mapping ) + + +def test_skip_words_are_not_counted_as_errors(): + prediction = ["U-PERSON", "O", "O", "O", "U-LOCATION"] + model = MockTokensModel(prediction=prediction, + entities_to_keep=["LOCATION", "PERSON"]) + + evaluator = Evaluator(model=model) + sample = InputSample( + full_text="John is on the street", masked="I am the street", spans=None + ) + sample.tokens = ["John", "is", "on", "the", "street"] + sample.tags = ["U-PERSON", "O", "O", "O", "O"] + + evaluated = evaluator.evaluate_sample(sample, prediction) + final_evaluation = evaluator.calculate_score([evaluated]) + + assert final_evaluation.pii_precision == 1 + assert final_evaluation.pii_recall == 1 diff --git a/tests/test_presidio_pseudonymize.py b/tests/test_presidio_pseudonymize.py index a6d28ed..a756548 100644 --- a/tests/test_presidio_pseudonymize.py +++ b/tests/test_presidio_pseudonymize.py @@ -30,7 +30,7 @@ def fake_faker(): ], # fmt: on ) -def test_presidio_psudonymize_two_entities( +def test_presidio_pseudonymize_two_entities( text, entity1, entity2, start1, end1, start2, end2, value1, value2, fake_faker ): @@ -51,3 +51,15 @@ def test_presidio_psudonymize_two_entities( assert value2 in pseudonym assert text[:start1].lower() in pseudonym.lower() assert text[end1:start2].lower() in pseudonym.lower() + + +def test_simple_scenario(): + original_text = "Hi my name is Doug Funny and this is my website: https://www.dougf.io" # noqa + presidio_response = [ + RecognizerResult(entity_type="PERSON", start=14, end=24, score=0.85), + RecognizerResult(entity_type="URL", start=49, end=69, score=0.95), + ] + + PresidioPseudonymization().pseudonymize(original_text=original_text, + presidio_response=presidio_response, + count=5)