refresh to various things

2023-12-27 21:45:04 +02:00 · 2023-12-27 21:45:04 +02:00 · cffb49e7ce
--- a/README.md
+++ b/README.md
@ -72,7 +72,7 @@ Furthermore, it tokenizes the data, creates tags (either IO/BIO/BILUO) and spans

 Once data is generated, it could be split into train/test/validation sets 
 while ensuring that each template only exists in one set. 
-See [this notebook for more details](notebooks/3_Split_by_pattern_%23.ipynb).
+See [this notebook for more details](notebooks/3_Split_by_pattern_number.ipynb).

 ## 2. Data representation

--- a/2
+++ b/2
@ -1 +1 @@
-0.1.2
+0.1.3
--- a/notebooks/1_Generate_data.ipynb
+++ b/notebooks/1_Generate_data.ipynb
@ -2,8 +2,10 @@
 "cells": [
  {
   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
+   "execution_count": 1,
+   "metadata": {
+    "is_executing": true
+   },
   "outputs": [],
   "source": [
    "# install presidio via pip if not yet installed\n",
@ -14,8 +16,9 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
   "metadata": {
+    "is_executing": true,
    "scrolled": true
   },
   "outputs": [],
@ -81,9 +84,34 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
+   "execution_count": 3,
+   "metadata": {
+    "is_executing": true
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Sampling: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 9149.88it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "My name is Joshua Jackson\n",
+      "[{\"value\": \"Joshua Jackson\", \"start\": 11, \"end\": 25, \"type\": \"name\"}]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    }
+   ],
   "source": [
    "sentence_templates = [\n",
    "    \"My name is {{name}}\",\n",
@ -126,8 +154,9 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
   "metadata": {
+    "is_executing": true,
    "scrolled": true
   },
   "outputs": [],
@ -165,13 +194,228 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
   "metadata": {
+    "is_executing": true,
    "pycharm": {
     "name": "#%%\n"
    }
   },
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>number</th>\n",
+       "      <th>gender</th>\n",
+       "      <th>nationality</th>\n",
+       "      <th>prefix</th>\n",
+       "      <th>first_name</th>\n",
+       "      <th>middle_initial</th>\n",
+       "      <th>last_name</th>\n",
+       "      <th>street_name</th>\n",
+       "      <th>city</th>\n",
+       "      <th>state_abbr</th>\n",
+       "      <th>...</th>\n",
+       "      <th>company</th>\n",
+       "      <th>domain_name</th>\n",
+       "      <th>person</th>\n",
+       "      <th>name</th>\n",
+       "      <th>first_name_female</th>\n",
+       "      <th>first_name_male</th>\n",
+       "      <th>prefix_female</th>\n",
+       "      <th>prefix_male</th>\n",
+       "      <th>last_name_female</th>\n",
+       "      <th>last_name_male</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>1</td>\n",
+       "      <td>female</td>\n",
+       "      <td>Czech</td>\n",
+       "      <td>Mrs.</td>\n",
+       "      <td>Marie</td>\n",
+       "      <td>J</td>\n",
+       "      <td>Hamanová</td>\n",
+       "      <td>P.O. Box 255</td>\n",
+       "      <td>Kangerlussuaq</td>\n",
+       "      <td>QE</td>\n",
+       "      <td>...</td>\n",
+       "      <td>Simple Solutions</td>\n",
+       "      <td>MarathonDancing.gl</td>\n",
+       "      <td>Marie J Hamanová</td>\n",
+       "      <td>Marie J Hamanová</td>\n",
+       "      <td>Marie</td>\n",
+       "      <td></td>\n",
+       "      <td>Mrs.</td>\n",
+       "      <td></td>\n",
+       "      <td>Hamanová</td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>2</td>\n",
+       "      <td>female</td>\n",
+       "      <td>French</td>\n",
+       "      <td>Ms.</td>\n",
+       "      <td>Patricia</td>\n",
+       "      <td>G</td>\n",
+       "      <td>Desrosiers</td>\n",
+       "      <td>Avenida Noruega 42</td>\n",
+       "      <td>Vila Real</td>\n",
+       "      <td>VR</td>\n",
+       "      <td>...</td>\n",
+       "      <td>Formula Gray</td>\n",
+       "      <td>LostMillions.com.pt</td>\n",
+       "      <td>Patricia Desrosiers</td>\n",
+       "      <td>Patricia Desrosiers</td>\n",
+       "      <td>Patricia</td>\n",
+       "      <td></td>\n",
+       "      <td>Ms.</td>\n",
+       "      <td></td>\n",
+       "      <td>Desrosiers</td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>3</td>\n",
+       "      <td>female</td>\n",
+       "      <td>American</td>\n",
+       "      <td>Ms.</td>\n",
+       "      <td>Debra</td>\n",
+       "      <td>O</td>\n",
+       "      <td>Neal</td>\n",
+       "      <td>1659 Hoog St</td>\n",
+       "      <td>Brakpan</td>\n",
+       "      <td>GA</td>\n",
+       "      <td>...</td>\n",
+       "      <td>Dahlkemper's</td>\n",
+       "      <td>MediumTube.co.za</td>\n",
+       "      <td>Debra O Neal</td>\n",
+       "      <td>Debra O Neal</td>\n",
+       "      <td>Debra</td>\n",
+       "      <td></td>\n",
+       "      <td>Ms.</td>\n",
+       "      <td></td>\n",
+       "      <td>Neal</td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>4</td>\n",
+       "      <td>male</td>\n",
+       "      <td>French</td>\n",
+       "      <td>Mr.</td>\n",
+       "      <td>Peverell</td>\n",
+       "      <td>C</td>\n",
+       "      <td>Racine</td>\n",
+       "      <td>183 Epimenidou Street</td>\n",
+       "      <td>Limassol</td>\n",
+       "      <td>LI</td>\n",
+       "      <td>...</td>\n",
+       "      <td>Quickbiz</td>\n",
+       "      <td>ImproveLook.com.cy</td>\n",
+       "      <td>Peverell Racine</td>\n",
+       "      <td>Peverell Racine</td>\n",
+       "      <td></td>\n",
+       "      <td>Peverell</td>\n",
+       "      <td></td>\n",
+       "      <td>Mr.</td>\n",
+       "      <td></td>\n",
+       "      <td>Racine</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>5</td>\n",
+       "      <td>female</td>\n",
+       "      <td>Slovenian</td>\n",
+       "      <td>Mrs.</td>\n",
+       "      <td>Iolanda</td>\n",
+       "      <td>S</td>\n",
+       "      <td>Tratnik</td>\n",
+       "      <td>Karu põik 61</td>\n",
+       "      <td>Pärnu</td>\n",
+       "      <td>PR</td>\n",
+       "      <td>...</td>\n",
+       "      <td>Dubrow's Cafeteria</td>\n",
+       "      <td>PostTan.com.ee</td>\n",
+       "      <td>Iolanda Tratnik</td>\n",
+       "      <td>Iolanda Tratnik</td>\n",
+       "      <td>Iolanda</td>\n",
+       "      <td></td>\n",
+       "      <td>Mrs.</td>\n",
+       "      <td></td>\n",
+       "      <td>Tratnik</td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>5 rows × 37 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   number  gender nationality prefix first_name middle_initial   last_name  \\\n",
+       "0       1  female       Czech   Mrs.      Marie              J    Hamanová   \n",
+       "1       2  female      French    Ms.   Patricia              G  Desrosiers   \n",
+       "2       3  female    American    Ms.      Debra              O        Neal   \n",
+       "3       4    male      French    Mr.   Peverell              C      Racine   \n",
+       "4       5  female   Slovenian   Mrs.    Iolanda              S     Tratnik   \n",
+       "\n",
+       "             street_name           city state_abbr  ...             company  \\\n",
+       "0           P.O. Box 255  Kangerlussuaq         QE  ...    Simple Solutions   \n",
+       "1     Avenida Noruega 42      Vila Real         VR  ...        Formula Gray   \n",
+       "2           1659 Hoog St        Brakpan         GA  ...        Dahlkemper's   \n",
+       "3  183 Epimenidou Street       Limassol         LI  ...            Quickbiz   \n",
+       "4           Karu põik 61          Pärnu         PR  ...  Dubrow's Cafeteria   \n",
+       "\n",
+       "           domain_name               person                 name  \\\n",
+       "0   MarathonDancing.gl     Marie J Hamanová     Marie J Hamanová   \n",
+       "1  LostMillions.com.pt  Patricia Desrosiers  Patricia Desrosiers   \n",
+       "2     MediumTube.co.za         Debra O Neal         Debra O Neal   \n",
+       "3   ImproveLook.com.cy      Peverell Racine      Peverell Racine   \n",
+       "4       PostTan.com.ee      Iolanda Tratnik      Iolanda Tratnik   \n",
+       "\n",
+       "  first_name_female first_name_male prefix_female prefix_male  \\\n",
+       "0             Marie                          Mrs.               \n",
+       "1          Patricia                           Ms.               \n",
+       "2             Debra                           Ms.               \n",
+       "3                          Peverell                       Mr.   \n",
+       "4           Iolanda                          Mrs.               \n",
+       "\n",
+       "   last_name_female last_name_male  \n",
+       "0          Hamanová                 \n",
+       "1        Desrosiers                 \n",
+       "2              Neal                 \n",
+       "3                           Racine  \n",
+       "4           Tratnik                 \n",
+       "\n",
+       "[5 rows x 37 columns]"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
   "source": [
    "# Read FakeNameGenerator CSV\n",
    "fake_name_generator_df = pd.read_csv(fake_name_generator_file)\n",
@ -190,8 +434,9 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
   "metadata": {
+    "is_executing": true,
    "scrolled": true
   },
   "outputs": [],
@ -209,8 +454,10 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
+   "execution_count": 7,
+   "metadata": {
+    "is_executing": true
+   },
   "outputs": [],
   "source": [
    "fake.add_provider(IpAddressProvider)  # Both Ipv4 and IPv6 IP addresses\n",
@ -235,8 +482,9 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 8,
   "metadata": {
+    "is_executing": true,
    "pycharm": {
     "name": "#%%\n"
    }
@ -270,13 +518,36 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 9,
   "metadata": {
+    "is_executing": true,
    "pycharm": {
     "name": "#%%\n"
    }
   },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Sampling: 100%|███████████████████████████████████████████████████████████████████████████████████████████| 1500/1500 [00:00<00:00, 17987.56it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{\"fake\": \"Title VII of the Civil Rights Act of 2005 protects individuals against employment discrimination on the basis of race and color as well as national origin, sex, or religion.\\n\", \"spans\": [{\"value\": \"2005\", \"start\": 37, \"end\": 41, \"type\": \"year\"}], \"template\": \"Title VII of the Civil Rights Act of {{year}} protects individuals against employment discrimination on the basis of race and color as well as national origin, sex, or religion.\\n\", \"template_id\": 190}\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    }
+   ],
   "source": [
    "sentence_templates = PresidioDataGenerator.read_template_file(templates_file_path)\n",
    "fake_records = data_generator.generate_fake_data(\n",
@ -296,11 +567,23 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 10,
   "metadata": {
+    "is_executing": true,
    "scrolled": true
   },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Total: 1500\n",
+      "Avg # of records per template: 7.142857142857143\n",
+      "Median # of records per template: 7.0\n",
+      "Std: 2.5872528966106905\n"
+     ]
+    }
+   ],
   "source": [
    "count_per_template_id = Counter([sample.template_id for sample in fake_records])\n",
    "\n",
@ -323,13 +606,65 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 11,
   "metadata": {
+    "is_executing": true,
    "pycharm": {
     "name": "#%%\n"
    }
   },
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Counter({'organization': 257,\n",
+       "         'first_name': 244,\n",
+       "         'person': 238,\n",
+       "         'city': 235,\n",
+       "         'address': 209,\n",
+       "         'street_name': 164,\n",
+       "         'name': 162,\n",
+       "         'country': 154,\n",
+       "         'credit_card_number': 152,\n",
+       "         'phone_number': 121,\n",
+       "         'last_name': 119,\n",
+       "         'building_number': 110,\n",
+       "         'age': 72,\n",
+       "         'secondary_address': 64,\n",
+       "         'year': 58,\n",
+       "         'nationality': 55,\n",
+       "         'postcode': 49,\n",
+       "         'zipcode': 45,\n",
+       "         'url': 39,\n",
+       "         'email': 39,\n",
+       "         'name_female': 37,\n",
+       "         'job': 33,\n",
+       "         'first_name_male': 31,\n",
+       "         'name_male': 29,\n",
+       "         'prefix_male': 28,\n",
+       "         'date_of_birth': 24,\n",
+       "         'iban': 22,\n",
+       "         'date_time': 21,\n",
+       "         'prefix_female': 21,\n",
+       "         'day_of_week': 16,\n",
+       "         'state_abbr': 15,\n",
+       "         'last_name_male': 15,\n",
+       "         'prefix': 12,\n",
+       "         'ip_address': 11,\n",
+       "         'ssn': 11,\n",
+       "         'nation_plural': 9,\n",
+       "         'nation_woman': 8,\n",
+       "         'first_name_nonbinary': 6,\n",
+       "         'us_driver_license': 6,\n",
+       "         'first_name_female': 3,\n",
+       "         'last_name_female': 3})"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
   "source": [
    "count_per_entity = Counter()\n",
    "for record in fake_records:\n",
@ -351,8 +686,9 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 12,
   "metadata": {
+    "is_executing": true,
    "pycharm": {
     "name": "#%%\n"
    }
@ -421,9 +757,22 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
+   "execution_count": 13,
+   "metadata": {
+    "is_executing": true
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{\"fake\": \"Title VII of the Civil Rights Act of 2005 protects individuals against employment discrimination on the basis of race and color as well as national origin, sex, or religion.\\n\", \"spans\": [{\"value\": \"2005\", \"start\": 37, \"end\": 41, \"type\": \"DATE_TIME\"}], \"template\": \"Title VII of the Civil Rights Act of {{DATE_TIME}} protects individuals against employment discrimination on the basis of race and color as well as national origin, sex, or religion.\\n\", \"template_id\": 190}"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
   "source": [
    "fake_records[0]"
   ]
@ -437,13 +786,41 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 14,
   "metadata": {
+    "is_executing": true,
    "pycharm": {
     "name": "#%%\n"
    }
   },
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[('PERSON', 887),\n",
+       " ('STREET_ADDRESS', 596),\n",
+       " ('GPE', 404),\n",
+       " ('ORGANIZATION', 257),\n",
+       " ('CREDIT_CARD', 152),\n",
+       " ('PHONE_NUMBER', 121),\n",
+       " ('DATE_TIME', 119),\n",
+       " ('TITLE', 94),\n",
+       " ('NRP', 72),\n",
+       " ('AGE', 72),\n",
+       " ('ZIP_CODE', 45),\n",
+       " ('DOMAIN_NAME', 39),\n",
+       " ('EMAIL_ADDRESS', 39),\n",
+       " ('IBAN_CODE', 22),\n",
+       " ('IP_ADDRESS', 11),\n",
+       " ('US_SSN', 11),\n",
+       " ('US_DRIVER_LICENSE', 6)]"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
   "source": [
    "\n",
    "count_per_entity_new = Counter()\n",
@ -463,13 +840,51 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 15,
   "metadata": {
+    "is_executing": true,
    "pycharm": {
     "name": "#%%\n"
    }
   },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "  0%|                                                                                                                   | 0/1500 [00:00<?, ?it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "loading model en_core_web_sm\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 1500/1500 [00:06<00:00, 215.70it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 6.76 s, sys: 33.8 ms, total: 6.8 s\n",
+      "Wall time: 6.96 s\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    }
+   ],
   "source": [
    "%%time\n",
    "input_samples = [\n",
@ -491,8 +906,9 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 16,
   "metadata": {
+    "is_executing": true,
    "pycharm": {
     "name": "#%%\n"
    }
@ -515,21 +931,31 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 17,
   "metadata": {
+    "is_executing": true,
    "pycharm": {
     "name": "#%%\n"
    }
   },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 1500/1500 [00:00<00:00, 76888.23it/s]\n"
+     ]
+    }
+   ],
   "source": [
    "conll = InputSample.create_conll_dataset(input_samples)"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 18,
   "metadata": {
+    "is_executing": true,
    "pycharm": {
     "name": "#%%\n"
    }
@ -546,7 +972,7 @@
    "### Next steps\n",
    "\n",
    "- Evaluate Presidio using this fake data. [Sample](4_Evaluate_Presidio_Analyzer.ipynb)\n",
-    "- Split to train/test/validation while ensuring sentences originiating from the same template are all on the same subset. [Sample](3_Split_by_pattern_#.ipynb)\n",
+    "- Split to train/test/validation while ensuring sentences originating from the same template are all in the same subset. [Sample](3_Split_by_pattern_number.ipynb)\n",
    "- Conduct a small exploratory data analysis on the generated data. [Sample](2_PII_EDA.ipynb)"
   ]
  },
@ -569,9 +995,9 @@
   "hash": "2509fbe9adc3579fd0ef23e6a2c6fb50cb745caa174aafdf017283479e60bc43"
  },
  "kernelspec": {
-   "display_name": "presidio",
+   "display_name": "presidio-evaluator",
   "language": "python",
-   "name": "presidio"
+   "name": "presidio-evaluator"
  },
  "language_info": {
   "codemirror_mode": {
@ -583,9 +1009,9 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.8.12"
+   "version": "3.9.18"
  }
 },
 "nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
 }
--- a/notebooks/2_PII_EDA.ipynb
+++ b/notebooks/2_PII_EDA.ipynb
@ -72,7 +72,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "for (name, series) in pii_df.iteritems():\n",
+    "for (name, series) in pii_df.items():\n",
    "    print(name)\n",
    "    print(\"Unique values: {}\".format(len(series.unique())))\n",
    "    print(series.value_counts())\n",
@ -123,7 +123,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "series_to_wordcloud(pii_df.country_full)"
+    "series_to_wordcloud(pii_df.country)"
   ]
  },
  {
@ -187,9 +187,9 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "countries = [get_entity_values_from_sample(sample, [\"LOCATION\"]) for sample in synth]\n",
+    "countries = [get_entity_values_from_sample(sample, [\"TITLE\"]) for sample in synth]\n",
    "countries = [item for sublist in countries for item in sublist]\n",
-    "series_to_wordcloud(pd.Series(countries, name=\"LOCATION\"))"
+    "series_to_wordcloud(pd.Series(countries, name=\"TITLE\"))"
   ]
  },
  {
@ -213,9 +213,9 @@
 ],
 "metadata": {
  "kernelspec": {
-   "display_name": "presidio",
+   "display_name": "presidio-evaluator",
   "language": "python",
-   "name": "presidio"
+   "name": "presidio-evaluator"
  },
  "language_info": {
   "codemirror_mode": {
@ -227,9 +227,9 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.8.12"
+   "version": "3.9.18"
  }
 },
 "nbformat": 4,
- "nbformat_minor": 2
-}
+ "nbformat_minor": 4
+}
--- a/notebooks/3_Split_by_pattern_number.ipynb
+++ b/notebooks/3_Split_by_pattern_number.ipynb
@ -143,13 +143,6 @@
    "assert len(train) + len(test) + len(validation) == len(all_samples)"
   ]
  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
  {
   "cell_type": "code",
   "execution_count": null,
@ -160,9 +153,9 @@
 ],
 "metadata": {
  "kernelspec": {
-   "display_name": "presidio",
+   "display_name": "presidio-evaluator",
   "language": "python",
-   "name": "presidio"
+   "name": "presidio-evaluator"
  },
  "language_info": {
   "codemirror_mode": {
@ -174,9 +167,9 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.8.12"
+   "version": "3.9.18"
  }
 },
 "nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
 }
--- a/notebooks/4_Evaluate_Presidio_Analyzer.ipynb
+++ b/notebooks/4_Evaluate_Presidio_Analyzer.ipynb
@ -5,7 +5,7 @@
   "id": "847acd88",
   "metadata": {},
   "source": [
-    "Evaluate Presidio Analyzer using the Presidio Evaluator framework"
+    "# Evaluate Presidio Analyzer using the Presidio Evaluator framework"
   ]
  },
  {
@ -17,7 +17,8 @@
   "source": [
    "# install presidio via pip if not yet installed\n",
    "\n",
-    "#!pip install presidio-analyzer\n",
+    "#!pip install presidio-evaluator\n",
+    "#!pip install \"presidio-analyzer[transformers]\"\n",
    "#!pip install presidio-evaluator"
   ]
  },
@ -32,6 +33,10 @@
    "from copy import deepcopy\n",
    "from pprint import pprint\n",
    "from collections import Counter\n",
+    "from typing import List\n",
+    "\n",
+    "import warnings\n",
+    "warnings.filterwarnings('ignore')\n",
    "\n",
    "from presidio_evaluator import InputSample\n",
    "from presidio_evaluator.evaluation import Evaluator, ModelError\n",
@ -45,7 +50,8 @@
    "pd.set_option(\"display.max_colwidth\", None)\n",
    "\n",
    "%reload_ext autoreload\n",
-    "%autoreload 2"
+    "%autoreload 2\n",
+    "%matplotlib inline"
   ]
  },
  {
@ -65,6 +71,9 @@
   "source": [
    "dataset_name = \"synth_dataset_v2.json\"\n",
    "dataset = InputSample.read_dataset_json(Path(Path.cwd().parent, \"data\", dataset_name))\n",
+    "\n",
+    "dataset = dataset[:300] # keep only the first 300 samples\n",
+    "\n",
    "print(len(dataset))"
   ]
  },
@ -75,10 +84,12 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "entity_counter = Counter()\n",
-    "for sample in dataset:\n",
-    "    for tag in sample.tags:\n",
-    "        entity_counter[tag] += 1"
+    "def get_entity_counts(dataset:List[InputSample]):\n",
+    "    entity_counter = Counter()\n",
+    "    for sample in dataset:\n",
+    "        for tag in sample.tags:\n",
+    "            entity_counter[tag] += 1\n",
+    "    return entity_counter\n"
   ]
  },
  {
@ -89,7 +100,7 @@
   "outputs": [],
   "source": [
    "print(\"Count per entity:\")\n",
-    "pprint(entity_counter.most_common())\n",
+    "pprint(get_entity_counts(dataset).most_common())\n",
    "\n",
    "print(\"\\nExample sentence:\")\n",
    "print(dataset[1])\n",
@ -107,12 +118,121 @@
    ")"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "id": "9c5e16cb-bee8-4f0a-a543-4879daa35b9e",
+   "metadata": {},
+   "source": [
+    "### Define the AnalyzerEngine object \n",
+    "In this case, we use a Hugging Face model: obi/deid_roberta_i2b2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "313b508f-e901-40b9-b575-c7fb8a794652",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from presidio_analyzer import AnalyzerEngine\n",
+    "from presidio_analyzer.nlp_engine import TransformersNlpEngine, NerModelConfiguration\n",
+    "\n",
+    "\n",
+    "# Here we define a transformers based NLP engine, \n",
+    "# but you can use this cell to customize your Presidio Analyzer instance\n",
+    "\n",
+    "# Define which model to use\n",
+    "model_config = [{\"lang_code\": \"en\", \"model_name\": {\n",
+    "    \"spacy\": \"en_core_web_sm\",  # use a small spaCy model for lemmas, tokens etc.\n",
+    "    \"transformers\": \"obi/deid_roberta_i2b2\"\n",
+    "    }\n",
+    "}]\n",
+    "\n",
+    "# Map transformers model labels to Presidio's\n",
+    "model_to_presidio_entity_mapping = dict(\n",
+    "    PER=\"PERSON\",\n",
+    "    PERSON=\"PERSON\",\n",
+    "    LOC= \"LOCATION\",\n",
+    "    LOCATION= \"LOCATION\",\n",
+    "    GPE=\"LOCATION\",\n",
+    "    ORG=\"ORGANIZATION\",\n",
+    "    ORGANIZATION=\"ORGANIZATION\",\n",
+    "    NORP=\"NRP\",\n",
+    "    AGE=\"AGE\",\n",
+    "    ID=\"ID\",\n",
+    "    EMAIL=\"EMAIL\",\n",
+    "    PATIENT=\"PERSON\",\n",
+    "    STAFF=\"PERSON\",\n",
+    "    HOSP=\"ORGANIZATION\",\n",
+    "    PATORG=\"ORGANIZATION\",\n",
+    "    DATE=\"DATE_TIME\",\n",
+    "    TIME=\"DATE_TIME\",\n",
+    "    PHONE=\"PHONE_NUMBER\",\n",
+    "    HCW=\"PERSON\",\n",
+    "    HOSPITAL=\"ORGANIZATION\",\n",
+    "    FACILITY=\"LOCATION\",\n",
+    ")\n",
+    "\n",
+    "ner_model_configuration = NerModelConfiguration(labels_to_ignore = [\"O\"], \n",
+    "                                                model_to_presidio_entity_mapping=model_to_presidio_entity_mapping)\n",
+    "\n",
+    "nlp_engine = TransformersNlpEngine(models=model_config,\n",
+    "                                   ner_model_configuration=ner_model_configuration)\n",
+    "\n",
+    "# Set up the engine, loads the NLP module (spaCy model by default) \n",
+    "# and other PII recognizers\n",
+    "analyzer_engine = AnalyzerEngine(nlp_engine=nlp_engine)"
+   ]
+  },
  {
   "cell_type": "markdown",
   "id": "aae4c379",
   "metadata": {},
   "source": [
-    "Run evaluation:"
+    "### Run evaluation"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "16dbf6d6-a554-4602-8907-589786d47a12",
+   "metadata": {},
+   "source": [
+    "#### Define experiment"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "29d39ff1-4f14-4e32-ae84-ecc6c739f829",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "experiment = get_experiment_tracker()\n",
+    "model = PresidioAnalyzerWrapper(analyzer_engine)\n",
+    "\n",
+    "# Define evaluator and experiment tracking\n",
+    "\n",
+    "evaluator = Evaluator(model=model)\n",
+    "dataset = Evaluator.align_entity_types(\n",
+    "    deepcopy(dataset), entities_mapping=PresidioAnalyzerWrapper.presidio_entities_map\n",
+    ")\n",
+    "\n",
+    "print(\"Count per entity after alignment:\")\n",
+    "pprint(get_entity_counts(dataset).most_common())\n",
+    "\n",
+    "# Track model and dataset params\n",
+    "params = {\"dataset_name\": dataset_name, \"model_name\": model.name}\n",
+    "params.update(model.to_log())\n",
+    "experiment.log_parameters(params)\n",
+    "experiment.log_dataset_hash(dataset)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2a7d6626-d094-4dfd-8f37-c0443edf00dc",
+   "metadata": {},
+   "source": [
+    "#### Run experiment"
   ]
  },
  {
@ -122,39 +242,37 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "print(\"Evaluating Presidio Analyzer\")\n",
-    "\n",
-    "experiment = get_experiment_tracker()\n",
-    "model_name = \"Presidio Analyzer\"\n",
-    "model = PresidioAnalyzerWrapper()\n",
-    "\n",
-    "evaluator = Evaluator(model=model)\n",
-    "dataset = Evaluator.align_entity_types(\n",
-    "    deepcopy(dataset), entities_mapping=PresidioAnalyzerWrapper.presidio_entities_map\n",
-    ")\n",
-    "\n",
+    "# Run experiment\n",
    "evaluation_results = evaluator.evaluate_all(dataset)\n",
    "results = evaluator.calculate_score(evaluation_results)\n",
    "\n",
-    "# update params tracking\n",
-    "params = {\"dataset_name\": dataset_name, \"model_name\": model_name}\n",
-    "params.update(model.to_log())\n",
-    "experiment.log_parameters(params)\n",
-    "experiment.log_dataset_hash(dataset)\n",
+    "# Track experiment results\n",
    "experiment.log_metrics(results.to_log())\n",
    "entities, confmatrix = results.to_confusion_matrix()\n",
-    "experiment.log_confusion_matrix(matrix=confmatrix, labels=entities)\n",
+    "experiment.log_confusion_matrix(matrix=confmatrix, \n",
+    "                                labels=entities)\n",
    "\n",
-    "print(\"Confusion matrix:\")\n",
-    "print(pd.DataFrame(confmatrix, columns=entities, index=entities))\n",
-    "\n",
-    "print(\"Precision and recall\")\n",
-    "print(results)\n",
+    "# Plot output\n",
+    "plotter = evaluator.Plotter(model=model, \n",
+    "                            results=results, \n",
+    "                            output_folder = \".\", \n",
+    "                            model_name = model.name, \n",
+    "                            beta = 2)\n",
    "\n",
    "# end experiment\n",
    "experiment.end()"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5b4d662d-596c-4a69-b3c9-1edcda20cc5b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plotter.plot_scores()"
+   ]
+  },
  {
   "cell_type": "markdown",
   "id": "070f8287",
@ -198,7 +316,7 @@
   "id": "98f4802e",
   "metadata": {},
   "source": [
-    "1. Most false positive tokens:"
+    "1. Most common false positive tokens:"
   ]
  },
  {
@ -219,7 +337,7 @@
   "outputs": [],
   "source": [
    "fps_df = ModelError.get_fps_dataframe(errors, entity=[\"LOCATION\"])\n",
-    "fps_df[[\"full_text\", \"token\", \"prediction\"]]"
+    "fps_df[[\"full_text\", \"token\", \"annotation\", \"prediction\"]]"
   ]
  },
  {
@ -227,7 +345,7 @@
   "id": "d0852513",
   "metadata": {},
   "source": [
-    "2. False negative examples"
+    "2. Most common false negative examples"
   ]
  },
  {
@ -237,7 +355,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "ModelError.most_common_fn_tokens(errors, n=50, entity=[\"PERSON\"])"
+    "ModelError.most_common_fn_tokens(errors, n=50)"
   ]
  },
  {
@ -255,7 +373,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "fns_df = ModelError.get_fns_dataframe(errors, entity=[\"PHONE_NUMBER\"])"
+    "fns_df = ModelError.get_fns_dataframe(errors, entity=[\"IP_ADDRESS\"])"
   ]
  },
  {
@ -278,13 +396,21 @@
    "print(\"All errors:\\n\")\n",
    "[print(error, \"\\n\") for error in errors]"
   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a67ff38d-0817-4864-9991-b3eb1f80eecc",
+   "metadata": {},
+   "outputs": [],
+   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
-   "display_name": "presidio",
+   "display_name": "presidio-evaluator",
   "language": "python",
-   "name": "presidio"
+   "name": "presidio-evaluator"
  },
  "language_info": {
   "codemirror_mode": {
@ -296,7 +422,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.8.12"
+   "version": "3.9.18"
  }
 },
 "nbformat": 4,
--- a/notebooks/5_Pseudonymization_demo.ipynb
+++ b/notebooks/5_Pseudonymization_demo.ipynb
@ -88,7 +88,7 @@
    {
     "data": {
      "text/plain": [
-       "[type: DOMAIN_NAME, start: 57, end: 69, score: 1.0,\n",
+       "[type: URL, start: 49, end: 69, score: 0.95,\n",
       " type: PERSON, start: 14, end: 24, score: 0.85]"
      ]
     },
@ -116,11 +116,11 @@
    {
     "data": {
      "text/plain": [
-       "['Hi my name is Albert Cohen and this is my website: https://http://chapman-downs.info/',\n",
-       " 'Hi my name is Lisa Miller and this is my website: https://http://benson.org/',\n",
-       " 'Hi my name is Kathleen Hale and this is my website: https://http://www.garcia.com/',\n",
-       " 'Hi my name is Michelle Frederick and this is my website: https://https://robinson.com/',\n",
-       " 'Hi my name is Alicia Santana and this is my website: https://https://www.ray.org/']"
+       "['Hi my name is Tammy Ryan and this is my website: https://www.cardenas.info/',\n",
+       " 'Hi my name is Jessica Smith and this is my website: http://jones-hunt.info/',\n",
+       " 'Hi my name is Michele Marsh and this is my website: https://guerrero.com/',\n",
+       " 'Hi my name is Kathleen Miller and this is my website: https://lopez.com/',\n",
+       " 'Hi my name is Paul Brown and this is my website: http://www.banks-evans.info/']"
      ]
     },
     "execution_count": 6,
@ -153,11 +153,11 @@
      "-------------\n",
      "Fake examples:\n",
      "\n",
-      "Our son R2D2 used to work in Botswana\n",
-      "Our son R2D2 used to work in American Samoa\n",
-      "Our son R2D2 used to work in Malawi\n",
-      "Our son R2D2 used to work in Montenegro\n",
-      "our son r2d2 used to work in lebanon\n"
+      "Our son R2D2 used to work in Nigeria\n",
+      "Our son R2D2 used to work in Guam\n",
+      "Our son R2D2 used to work in Reunion\n",
+      "Our son R2D2 used to work in Vanuatu\n",
+      "Our son R2D2 used to work in Malaysia\n"
     ]
    }
   ],
@ -176,13 +176,20 @@
    "print(f\"-------------\\nFake examples:\\n\")\n",
    "print(*fake_samples, sep=\"\\n\")"
   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
-   "display_name": "presidio",
+   "display_name": "presidio-evaluator",
   "language": "python",
-   "name": "presidio"
+   "name": "presidio-evaluator"
  },
  "language_info": {
   "codemirror_mode": {
@ -194,9 +201,9 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.8.12"
+   "version": "3.9.18"
  }
 },
 "nbformat": 4,
- "nbformat_minor": 1
+ "nbformat_minor": 4
 }
--- a/notebooks/models/Create
+++ b/notebooks/models/Create
@ -23,7 +23,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@ -34,7 +34,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
@ -42,55 +42,18 @@
   },
   "outputs": [],
   "source": [
-    "DATA_DATE = \"Dec-19-2021\""
+    "DATA_DATE = \"Dec-27-2023\" # Change to the date when notebook 3 (split to train/test) was run"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "\r",
-      "tokenizing input:   0%|                                                                       | 0/2122 [00:00<?, ?it/s]"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "loading model en_core_web_sm\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "tokenizing input: 100%|███████████████████████████████████████████████████████████| 2122/2122 [00:19<00:00, 109.66it/s]"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Read 2122 samples\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "data_path = \"../../data/{}_{}.json\"\n",
    "\n",
@ -111,17 +74,9 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Kept 1940 samples after removal of non-tagged samples\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "train_tagged = [sample for sample in train_samples if len(sample.spans) > 0]\n",
    "print(\"Kept {} samples after removal of non-tagged samples\".format(len(train_tagged)))"
@ -140,45 +95,13 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Entities found in training set:\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "{'ADDRESS',\n",
-       " 'CREDIT_CARD',\n",
-       " 'DATE_TIME',\n",
-       " 'DOMAIN_NAME',\n",
-       " 'EMAIL_ADDRESS',\n",
-       " 'IBAN_CODE',\n",
-       " 'IP_ADDRESS',\n",
-       " 'LOCATION',\n",
-       " 'O',\n",
-       " 'ORGANIZATION',\n",
-       " 'PERSON',\n",
-       " 'PHONE_NUMBER',\n",
-       " 'PREFIX',\n",
-       " 'TITLE',\n",
-       " 'US_SSN'}"
-      ]
-     },
-     "execution_count": 5,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
   "source": [
    "print(\"Entities found in training set:\")\n",
    "entities = []\n",
@ -206,16 +129,7 @@
     "name": "#%%\n"
    }
   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Skipping illegal span None, text=ΜΟΝΗ ΑΓΙΩΝ ΑΝΑΡΓΥΡΩΝ\n",
-      "Skipping illegal span None, text=U.N\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "spacy_train = InputSample.create_spacy_dataset(\n",
    "    dataset=train_tagged, output_path=\"train.spacy\"\n",
@ -281,9 +195,9 @@
 ],
 "metadata": {
  "kernelspec": {
-   "display_name": "presidio",
+   "display_name": "presidio-evaluator",
   "language": "python",
-   "name": "presidio"
+   "name": "presidio-evaluator"
  },
  "language_info": {
   "codemirror_mode": {
@ -295,9 +209,9 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.8.12"
+   "version": "3.9.18"
  }
 },
 "nbformat": 4,
- "nbformat_minor": 2
-}
+ "nbformat_minor": 4
+}
--- a/notebooks/models/Evaluate
+++ b/notebooks/models/Evaluate
@ -39,6 +39,16 @@
    "%autoreload 2"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "aee00770-a972-4a19-b423-1724214cc88c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#!pip install sklearn_crfsuite"
+   ]
+  },
  {
   "cell_type": "markdown",
   "id": "a0d2d772",
@ -58,8 +68,9 @@
   },
   "outputs": [],
   "source": [
-    "DATA_DATE = \"Jan-15-2022\"\n",
-    "dataset = InputSample.read_dataset_json(\"../../data/test_{}.json\".format(DATA_DATE))\n",
+    "DATA_DATE = \"Dec-27-2023\" # Date when the split to train/test notebook was run\n",
+    "dataset_name = \"../../data/test_{}.json\".format(DATA_DATE)\n",
+    "dataset = InputSample.read_dataset_json(dataset_name)\n",
    "print(len(dataset))"
   ]
  },
@ -76,7 +87,7 @@
   "source": [
    "entity_counter = Counter()\n",
    "for sample in dataset:\n",
-    "    for t>ag in sample.tags:\n",
+    "    for tag in sample.tags:\n",
    "        entity_counter[tag] += 1"
   ]
  },
@ -257,7 +268,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "fps_df = ModelError.get_fps_dataframe(errors, entity=[\"GPE\"])\n",
+    "fps_df = ModelError.get_fps_dataframe(errors, entity=[\"PERSON\"])\n",
    "fps_df[[\"full_text\", \"token\", \"prediction\"]]"
   ]
  },
@ -276,7 +287,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "ModelError.most_common_fn_tokens(errors, n=50, entity=[\"PERSON\"])"
+    "ModelError.most_common_fn_tokens(errors, n=50, entity=[\"ORGANIZATION\"])"
   ]
  },
  {
@ -325,13 +336,21 @@
   "metadata": {},
   "outputs": [],
   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cf3e4646-ca93-44c5-a998-cd77f4bf2708",
+   "metadata": {},
+   "outputs": [],
+   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
-   "display_name": "presidio",
+   "display_name": "presidio-evaluator",
   "language": "python",
-   "name": "presidio"
+   "name": "presidio-evaluator"
  },
  "language_info": {
   "codemirror_mode": {
@ -343,9 +362,9 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.8.12"
+   "version": "3.9.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
-}
+}
--- a/notebooks/models/Evaluate
+++ b/notebooks/models/Evaluate
@ -205,7 +205,7 @@
 ],
 "metadata": {
  "kernelspec": {
-   "display_name": "Python 3.9.13 ('presidio')",
+   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
@ -219,9 +219,8 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.9.13"
+   "version": "3.9.18"
  },
-  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "371968787ec79dd50357533864944a85029366968470cac36beb694745c2f7d6"
@ -229,5 +228,5 @@
  }
 },
 "nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
 }
--- a/notebooks/models/Evaluate
+++ b/notebooks/models/Evaluate
@ -35,6 +35,16 @@
    "%autoreload 2"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a0c3285c-06a2-4361-aec2-8375496f75b3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#!pip install flair"
+   ]
+  },
  {
   "cell_type": "markdown",
   "id": "f036de59",
@ -111,15 +121,14 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "flair_ner = \"ner-english\"\n",
-    "flair_ner_fast = \"ner-english-fast\"\n",
-    "flair_ontonotes_fast = \"ner-english-ontonotes-fast\"\n",
-    "flair_ontonotes_large = \"ner-english-ontonotes-large\"\n",
+    "flair_ner = \"flair/ner-english\"\n",
+    "flair_ner_fast = \"flair/ner-english-fast\"\n",
+    "flair_ontonotes_fast = \"flair/ner-english-ontonotes-fast\"\n",
+    "flair_ontonotes_large = \"flair/ner-english-ontonotes-large\"\n",
    "models = [\n",
    "    flair_ner,\n",
    "    flair_ner_fast,\n",
    "    flair_ontonotes_fast,\n",
-    "    flair_ner_fast,\n",
    "    flair_ontonotes_large,\n",
    "]"
   ]
@ -312,9 +321,9 @@
 ],
 "metadata": {
  "kernelspec": {
-   "display_name": "presidio",
+   "display_name": "presidio-evaluator",
   "language": "python",
-   "name": "presidio"
+   "name": "presidio-evaluator"
  },
  "language_info": {
   "codemirror_mode": {
@ -326,7 +335,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.8.12"
+   "version": "3.9.18"
  }
 },
 "nbformat": 4,
--- a/notebooks/models/Evaluate
+++ b/notebooks/models/Evaluate
@ -109,7 +109,10 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "models = [\"en_core_web_sm\", \"en_core_web_lg\", \"en_core_web_trf\"]"
+    "models = [\"en_core_web_sm\", \"en_core_web_lg\", \"en_core_web_trf\"]\n",
+    "\n",
+    "# If needed, install models using `python -m spacy download X` where X is the model name, or use spacy.cli.download:\n",
+    "#spacy.cli.download(\"en_core_web_trf\")"
   ]
  },
  {
@ -334,9 +337,9 @@
 ],
 "metadata": {
  "kernelspec": {
-   "display_name": "presidio",
+   "display_name": "presidio-evaluator",
   "language": "python",
-   "name": "presidio"
+   "name": "presidio-evaluator"
  },
  "language_info": {
   "codemirror_mode": {
@ -348,9 +351,9 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.8.12"
+   "version": "3.9.18"
  }
 },
 "nbformat": 4,
- "nbformat_minor": 2
-}
+ "nbformat_minor": 4
+}
--- a/presidio_evaluator/data_generator/presidio_data_generator.py
+++ b/presidio_evaluator/data_generator/presidio_data_generator.py
@ -170,6 +170,8 @@ class PresidioDataGenerator:

        new_provider = BaseProvider(self.faker)
        setattr(new_provider, new_name, original)
+        setattr(new_provider, new_name.lower(), original)  # avoid case sensitivity
+        setattr(new_provider, new_name.upper(), original)  # avoid case sensitivity
        self.faker.add_provider(new_provider)

    @staticmethod
--- a/presidio_evaluator/data_generator/presidio_pseudonymize.py
+++ b/presidio_evaluator/data_generator/presidio_pseudonymize.py
@ -25,7 +25,7 @@ class PresidioPseudonymization(PresidioDataGenerator):
            self.add_provider_alias("credit_card_number", "CREDIT_CARD")
            self.add_provider_alias("iban", "IBAN_CODE")
            self.add_provider_alias("phone_number", "PHONE_NUMBER")
-            self.add_provider_alias("url", "DOMAIN_NAME")
+            self.add_provider_alias("url", "URL")
            self.add_provider_alias("ssn", "US_SSN")
            self.add_provider_alias("email", "EMAIL_ADDRESS")
            self.add_provider_alias("date_time", "DATE_TIME")
--- a/presidio_evaluator/evaluation/evaluator.py
+++ b/presidio_evaluator/evaluation/evaluator.py
@ -1,6 +1,8 @@
+import copy
 from collections import Counter
 from typing import List, Optional, Dict
 from pathlib import Path
+import string

 import numpy as np
 from tqdm import tqdm
@ -39,7 +41,6 @@ class Evaluator:
            self.entities_to_keep = self.model.entities

    def compare(self, input_sample: InputSample, prediction: List[str]):
-
        """
        Compares ground truth tags (annotation) and predicted (prediction)
        :param input_sample: input sample containing list of tags with scheme
@ -71,6 +72,9 @@ class Evaluator:
        if self.entities_to_keep:
            prediction = self._adjust_per_entities(prediction)
            new_annotation = self._adjust_per_entities(new_annotation)
+
+        skip_words = self.get_skip_words()
+
        for i in range(0, len(new_annotation)):
            results[(new_annotation[i], prediction[i])] += 1

@ -81,6 +85,10 @@ class Evaluator:

            # check if there was an error
            is_error = new_annotation[i] != prediction[i]
+            if str(tokens[i]).lower().strip() in skip_words:
+                is_error = False
+                results[(new_annotation[i], prediction[i])] -= 1
+
            if is_error:
                if prediction[i] == "O":
                    mistakes.append(
@ -151,7 +159,6 @@ class Evaluator:
                f"Mapping entity values using this dictionary: {self.model.entity_mapping}"
            )
        for sample in tqdm(dataset, desc=f"Evaluating {self.model.__class__}"):
-
            # Align tag values to the ones expected by the model
            self.model.align_entity_types(sample)

@ -345,13 +352,13 @@ class Evaluator:
        if np.isnan(precision) or np.isnan(recall) or (precision == 0 and recall == 0):
            return np.nan

-        return ((1 + beta ** 2) * precision * recall) / (
-            ((beta ** 2) * precision) + recall
+        return ((1 + beta**2) * precision * recall) / (
+            ((beta**2) * precision) + recall
        )

    class Plotter:
        """
-        Plot scores (f2, precision, recall) and errors (false-positivies, false-negatives) 
+        Plot scores (f2, precision, recall) and errors (false-positives, false-negatives)
        for a PII detection model evaluated via Evaluator

        :param model: Instance of a fitted model (of base type BaseModel)
@ -362,7 +369,9 @@ class Evaluator:
        which gives more or less weight to precision vs. recall
        """

-        def __init__(self, model, results, output_folder: Path, model_name: str, beta: float):
+        def __init__(
+            self, model, results, output_folder: Path, model_name: str, beta: float
+        ):
            self.model = model
            self.results = results
            self.output_folder = output_folder
@ -372,41 +381,66 @@ class Evaluator:

        def plot_scores(self) -> None:
            """
-            Plots per-entity recall, precision, or F2 score for evaluated model. 
-            :param plot_type: which metric to graph (default is F2 score)
+            Plots per-entity recall, precision, and F2 score for the evaluated model.
            """
            scores = {}
-            scores['entity'] = list(self.results.entity_recall_dict.keys())
-            scores['recall'] = list(self.results.entity_recall_dict.values())
-            scores['precision'] = list(self.results.entity_precision_dict.values())
-            scores['count'] = list(self.results.n_dict.values())
-            scores[f"f{self.beta}_score"] = [Evaluator.f_beta(precision=precision, recall=recall, beta=self.beta)
-                                  for recall, precision in zip(scores['recall'], scores['precision'])]
+
+            entity_recall_dict = copy.deepcopy(self.results.entity_recall_dict)
+            entity_precision_dict = copy.deepcopy(self.results.entity_precision_dict)
+
+            scores["entity"] = list(entity_recall_dict.keys())
+            scores["recall"] = list(entity_recall_dict.values())
+            scores["precision"] = list(entity_precision_dict.values())
+            scores["count"] = list(self.results.n_dict.values())
+
+            scores[f"f{self.beta}_score"] = [
+                Evaluator.f_beta(precision=precision, recall=recall, beta=self.beta)
+                for recall, precision in zip(scores["recall"], scores["precision"])
+            ]
+
+            # Add PII detection rates
+            scores["entity"].append("PII")
+            scores["recall"].append(self.results.pii_recall)
+            scores["precision"].append(self.results.pii_precision)
+            scores["count"].append(self.results.n)
+            scores[f"f{self.beta}_score"].append(self.results.pii_f)
+
            df = pd.DataFrame(scores)
-            df['model'] = self.model_name
+            df["model"] = self.model_name
            self._plot(df, plot_type="f2_score")
            self._plot(df, plot_type="precision")
            self._plot(df, plot_type="recall")

        def _plot(self, df, plot_type) -> None:
-            fig = px.bar(df, text_auto=".2", y='entity', orientation="h",
-                         x=plot_type, color='count', barmode='group', title=f"Per-entity {plot_type} for {self.model_name}")
-            fig.update_layout(barmode='group', yaxis={
-                'categoryorder': 'total ascending'})
+            fig = px.bar(
+                df,
+                text_auto=".2",
+                y="entity",
+                orientation="h",
+                x=plot_type,
+                color="count",
+                barmode="group",
+                height=30*len(set(df["entity"])),
+                title=f"Per-entity {plot_type} for {self.model_name}",
+            )
+            fig.update_layout(
+                barmode="group", yaxis={"categoryorder": "total ascending"}
+            )
            fig.update_layout(yaxis_title=f"{plot_type}", xaxis_title="PII Entity")
-            fig.update_traces(textfont_size=12, textangle=0,
-                              textposition="outside", cliponaxis=False)
+            fig.update_traces(
+                textfont_size=12, textangle=0, textposition="outside", cliponaxis=False
+            )
            fig.update_layout(
                plot_bgcolor="#FFF",
                xaxis=dict(
                    title="PII entity",
                    linecolor="#BCCCDC",  # Sets color of X-axis line
-                    showgrid=False  # Removes X-axis grid lines
+                    showgrid=False,  # Removes X-axis grid lines
                ),
                yaxis=dict(
                    title=f"{plot_type}",
                    linecolor="#BCCCDC",  # Sets color of X-axis line
+                    showgrid=False,  # Removes Y-axis grid lines
+                    showgrid=False,  # Removes X-axis grid lines
                ),
            )
            fig.show()
@ -419,47 +453,100 @@ class Evaluator:
            for entity in self.model.entity_mapping.values():
                fps_df = ModelError.get_fps_dataframe(self.errors, entity=[entity])
                if fps_df is not None:
-                    fps_path = self.output_folder / \
-                        f"{self.model_name}-{entity}-fps.csv"
+                    fps_path = (
+                        self.output_folder / f"{self.model_name}-{entity}-fps.csv"
+                    )
                    fps_df.to_csv(fps_path)
                    fps_frames.append(fps_path)
                fns_df = ModelError.get_fns_dataframe(self.errors, entity=[entity])
                if fns_df is not None:
-                    fns_path = self.output_folder / \
-                        f"{self.model_name}-{entity}-fns.csv"
+                    fns_path = (
+                        self.output_folder / f"{self.model_name}-{entity}-fns.csv"
+                    )
                    fns_df.to_csv(fns_path)
                    fns_frames.append(fns_path)

            def group_tokens(df):
-                return df.groupby(['token', 'annotation']).size().to_frame(
-                ).sort_values([0], ascending=False).head(3).reset_index()
+                return (
+                    df.groupby(["token", "annotation"])
+                    .size()
+                    .to_frame()
+                    .sort_values([0], ascending=False)
+                    .head(3)
+                    .reset_index()
+                )

            fps_tokens_df = pd.concat(
-                [group_tokens(pd.read_csv(df_path)) for df_path in fps_frames])
+                [group_tokens(pd.read_csv(df_path)) for df_path in fps_frames]
+            )
            fns_tokens_df = pd.concat(
-                [group_tokens(pd.read_csv(df_path)) for df_path in fns_frames])
+                [group_tokens(pd.read_csv(df_path)) for df_path in fns_frames]
+            )

            def generate_graph(title, tokens_df):
-                fig = px.histogram(tokens_df, x=0, y="token", orientation='h', color='annotation',
-                                   title=f"Most common {title} for {self.model_name}")
+                fig = px.histogram(
+                    tokens_df,
+                    x=0,
+                    y="token",
+                    orientation="h",
+                    color="annotation",
+                    title=f"Most common {title} for {self.model_name}",
+                )

                fig.update_layout(yaxis_title=f"count", xaxis_title="PII Entity")
-                fig.update_traces(textfont_size=12, textangle=0,
-                                  textposition="outside", cliponaxis=False)
+                fig.update_traces(
+                    textfont_size=12,
+                    textangle=0,
+                    textposition="outside",
+                    cliponaxis=False,
+                )
                fig.update_layout(
                    plot_bgcolor="#FFF",
                    xaxis=dict(
                        title="Count",
                        linecolor="#BCCCDC",  # Sets color of X-axis line
-                        showgrid=False  # Removes X-axis grid lines
+                        showgrid=False,  # Removes X-axis grid lines
                    ),
                    yaxis=dict(
                        title=f"Tokens",
                        linecolor="#BCCCDC",  # Sets color of X-axis line
-                        showgrid=False  # Removes X-axis grid lines
+                        showgrid=False,  # Removes Y-axis grid lines
                    ),
                )
-                fig.update_layout(yaxis={'categoryorder': 'total ascending'})
+                fig.update_layout(yaxis={"categoryorder": "total ascending"})
                fig.show()
+
            generate_graph(title="false-negatives", tokens_df=fns_tokens_df)
            generate_graph(title="false-positives", tokens_df=fps_tokens_df)
+
+    @staticmethod
+    def get_skip_words():
+        skip_words = [x for x in string.punctuation]
+        skip_words.extend(
+            [
+                "\n",
+                "\n\n",
+                "\n\n\n",
+                ">>",
+                ">>>",
+                ">>>>",
+                "street",
+                "st.",
+                "st",
+                "de",
+                "rue",
+                "via",
+                "and",
+                "or",
+                "do",
+                "as",
+                "of",
+                "day",
+                "address",
+                "country",
+                "state",
+                "city",
+            ]
+        )
+
+        return skip_words
--- a/presidio_evaluator/models/base_model.py
+++ b/presidio_evaluator/models/base_model.py
@ -31,6 +31,7 @@ class BaseModel(ABC):
        self.labeling_scheme = labeling_scheme
        self.entity_mapping = entity_mapping
        self.verbose = verbose
+        self.name = self.__class__.__name__

    @abstractmethod
    def predict(self, sample: InputSample, **kwargs) -> List[str]:
--- a/presidio_evaluator/models/crf_model.py
+++ b/presidio_evaluator/models/crf_model.py
@ -85,7 +85,7 @@ class CRFModel(BaseModel):
        y_train = [self.sent2labels(s) for s in sentences]
        return X_train, y_train

-    def predict(self, sample: InputSample) -> List[str]:
+    def predict(self, sample: InputSample, **kwargs) -> List[str]:
        tags = CRFModel.crf_predict(sample, self.model)

        if len(tags) != len(sample.tokens):
--- a/presidio_evaluator/models/flair_model.py
+++ b/presidio_evaluator/models/flair_model.py
@ -48,7 +48,7 @@ class FlairModel(BaseModel):

        self.spacy_tokenizer = SpacyTokenizer(model=spacy.load("en_core_web_sm"))

-    def predict(self, sample: InputSample) -> List[str]:
+    def predict(self, sample: InputSample, **kwargs) -> List[str]:

        sentence = Sentence(text=sample.full_text, use_tokenizer=self.spacy_tokenizer)
        self.model.predict(sentence)
--- a/presidio_evaluator/models/presidio_analyzer_wrapper.py
+++ b/presidio_evaluator/models/presidio_analyzer_wrapper.py
@ -91,23 +91,28 @@ class PresidioAnalyzerWrapper(BaseModel):
        "PHONE_NUMBER": "PHONE_NUMBER",
        "BIRTHDAY": "DATE_TIME",
        "DATE_TIME": "DATE_TIME",
-        "DOMAIN_NAME": "DOMAIN_NAME",
+        "DOMAIN_NAME": "URL",
+        "TIME" : "DATE_TIME",
+        "DATE" : "DATE_TIME",
        "CITY": "LOCATION",
        "ADDRESS": "LOCATION",
+        "STREET_ADDRESS": "LOCATION",
        "NATIONALITY": "LOCATION",
        "LOCATION": "LOCATION",
        "IBAN_CODE": "IBAN_CODE",
-        "URL": "DOMAIN_NAME",
+        "URL": "URL",
        "US_SSN": "US_SSN",
        "IP_ADDRESS": "IP_ADDRESS",
-        "ORGANIZATION": "ORG",
+        "ORGANIZATION": "ORGANIZATION",
+        "ORG": "ORGANIZATION",
        "US_DRIVER_LICENSE": "US_DRIVER_LICENSE",
-        "NRP": "NRP",
-        "TITLE": "O",  # not supported
-        "PREFIX": "O",  # not supported
-        "STREET_ADDRESS": "O",  # not supported
-        "ZIP_CODE": "O",  # not supported
-        "AGE": "O",  # not supported
+        "NRP": "LOCATION",
+        "NORP": "LOCATION",
+        "ID": "ID",
+        "TITLE": "O",  # not supported through spaCy
+        "PREFIX": "O",  # not supported through spaCy
+        "ZIP_CODE": "O",  # not supported through spaCy
+        "AGE": "O",  # not supported through spaCy
        "O": "O",
    }

--- a/presidio_evaluator/models/spacy_model.py
+++ b/presidio_evaluator/models/spacy_model.py
@ -31,7 +31,7 @@ class SpacyModel(BaseModel):
        else:
            self.model = model

-    def predict(self, sample: InputSample) -> List[str]:
+    def predict(self, sample: InputSample, **kwargs) -> List[str]:
        """
        Predict a list of tags for an inpuit sample.
        :param sample: InputSample
--- a/presidio_evaluator/models/stanza_model.py
+++ b/presidio_evaluator/models/stanza_model.py
@ -51,7 +51,7 @@ class StanzaModel(SpacyModel):
            entity_mapping=entity_mapping,
        )

-    def predict(self, sample: InputSample) -> List[str]:
+    def predict(self, sample: InputSample, **kwargs) -> List[str]:
        """
        Predict the tags using a stanza model.

--- a/presidio_evaluator/models/text_analytics_wrapper.py
+++ b/presidio_evaluator/models/text_analytics_wrapper.py
@ -48,8 +48,7 @@ class TextAnalyticsWrapper(BaseModel):
        )
        return text_analytics_client

-
-    def predict(self, sample: InputSample) -> List[str]:
+    def predict(self, sample: InputSample, **kwargs) -> List[str]:
        documents = [sample.full_text]
        response = self.ta_client.recognize_pii_entities(documents, 
                                                        language="en")
--- a/pyproject.toml
+++ b/pyproject.toml
@ -4,26 +4,23 @@ version = "0.1.0"
 description = ""
 authors = ["Omri Mendels <omri374@users.noreply.github.com>"]
 readme = "README.md"
+include = [{ path= "presidio_evaluator/data_generator/raw_data/*"}]

 [tool.poetry.dependencies]
 python = "^3.9"
-spacy = ">=3.2.0, <4.0.0"
-numpy = ">=1.20.2,<2.0.0"
-jupyter = ">=1"
-pandas = ">=1.2.4,<2.0.0"
-tqdm = ">=4.60.0,<5.0.0"
-haikunator = ">=2.1.0,<3.0.0"
-schwifty = ">=2023.11.2,<2024.0.0"
-faker = ">=9.6.0,<10.0.0"
-scikit-learn = ">1.3.2,<2.0.0"
-pytest = ">=6.2.3"
+spacy = "^3.5.0"
+numpy = "^1.22"
+pandas = "^2.1.4"
+tqdm = "^4.60.0"
+faker = "^21.0"
+scikit-learn = "^1.3.2"
 presidio-analyzer = "^2.2.351"
 presidio-anonymizer = "^2.2.351"
-requests = ">=2.25.1"
-xmltodict = ">=0.12.0"
+requests = "^2.25"
+xmltodict = "^0.12.0"
 python-dotenv = "^1.0.0"
 plotly = "^5.18.0"
-azure-ai-textanalytics = ">=5.3.0"
+azure-ai-textanalytics = "^5.3.0"
 en_core_web_sm = {url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz"}
 en_core_web_lg = {url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1.tar.gz"}

--- a/setup.py
+++ b/setup.py
@ -1,54 +1,57 @@
-from setuptools import setup, find_packages
-import os.path
-
-# read the contents of the README file
+# -*- coding: utf-8 -*-
+from setuptools import setup
+import os
 from os import path

 this_directory = path.abspath(path.dirname(__file__))
 with open(path.join(this_directory, "README.md"), encoding="utf-8") as f:
    long_description = f.read()
-    # print(long_description)

 with open(os.path.join(this_directory, "VERSION")) as version_file:
-    __version__ = version_file.read().strip()
+    version = version_file.read().strip()
+
+
+packages = [
+    "presidio_evaluator",
+    "presidio_evaluator.data_generator",
+    "presidio_evaluator.data_generator.faker_extensions",
+    "presidio_evaluator.dataset_formatters",
+    "presidio_evaluator.evaluation",
+    "presidio_evaluator.experiment_tracking",
+    "presidio_evaluator.models",
+]
+
+package_data = {"": ["*"], "presidio_evaluator.data_generator": ["raw_data/*"]}
+
+install_requires = [
+    "azure-ai-textanalytics>=5.3.0,<6.0.0",
+    "en_core_web_lg @ "
+    "https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1.tar.gz",
+    "en_core_web_sm @ "
+    "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz",
+    "faker>=21.0,<22.0",
+    "numpy>=1.22,<2.0",
+    "pandas>=2.1.4,<3.0.0",
+    "plotly>=5.18.0,<6.0.0",
+    "presidio-analyzer>=2.2.351,<3.0.0",
+    "presidio-anonymizer>=2.2.351,<3.0.0",
+    "python-dotenv>=1.0.0,<2.0.0",
+    "requests>=2.25,<3.0",
+    "scikit-learn>=1.3.2,<2.0.0",
+    "spacy>=3.5.0,<4.0.0",
+    "tqdm>=4.60.0,<5.0.0",
+    "xmltodict>=0.12.0,<0.13.0",
+]

 setup(
    name="presidio-evaluator",
    long_description=long_description,
    long_description_content_type="text/markdown",
-    version=__version__,
-    packages=find_packages(exclude=["tests"]),
    url="https://www.github.com/microsoft/presidio-research",
+    version=version,
    license="MIT",
-    description="PII dataset generator, model evaluator for Presidio and PII data in general",  # noqa
-    data_files=[
-        (
-            "presidio_evaluator/data_generator/raw_data",
-            [
-                "presidio_evaluator/data_generator/raw_data/FakeNameGenerator.com_3000.csv",  # noqa
-                "presidio_evaluator/data_generator/raw_data/templates.txt",
-                "presidio_evaluator/data_generator/raw_data/companies_and_organizations.csv",
-                "presidio_evaluator/data_generator/raw_data/nationalities.csv",
-                "presidio_evaluator/data_generator/raw_data/us_driver_licenses.csv",
-            ],
-        )
-    ],
-    include_package_data=True,
-    install_requires=[
-        "presidio_analyzer",
-        "presidio_anonymizer",
-        "spacy>=3.0.0",
-        "requests",
-        "numpy",
-        "pandas",
-        "tqdm>=4.32.1",
-        "jupyter>=1.0.0",
-        "pytest>=4.6.2",
-        "haikunator",
-        "schwifty",
-        "faker",
-        "sklearn_crfsuite",
-        "python-dotenv",
-        "azure-ai-textanalytics==5.2.0"
-    ],
-)
+    packages=packages,
+    package_data=package_data,
+    install_requires=install_requires,
+    python_requires=">=3.8,<4.0",
+)
--- a/tests/test_evaluator.py
+++ b/tests/test_evaluator.py
@ -345,3 +345,22 @@ def test_align_entity_types_wrong_mapping_exception():
        Evaluator.align_entity_types(
            input_samples=[sample1], entities_mapping=entities_mapping
        )
+
+
+def test_skip_words_are_not_counted_as_errors():
+    prediction = ["U-PERSON", "O", "O", "O", "U-LOCATION"]
+    model = MockTokensModel(prediction=prediction,
+                            entities_to_keep=["LOCATION", "PERSON"])
+
+    evaluator = Evaluator(model=model)
+    sample = InputSample(
+        full_text="John is on the street", masked="I am the street", spans=None
+    )
+    sample.tokens = ["John", "is", "on", "the", "street"]
+    sample.tags = ["U-PERSON", "O", "O", "O", "O"]
+
+    evaluated = evaluator.evaluate_sample(sample, prediction)
+    final_evaluation = evaluator.calculate_score([evaluated])
+
+    assert final_evaluation.pii_precision == 1
+    assert final_evaluation.pii_recall == 1
--- a/tests/test_presidio_pseudonymize.py
+++ b/tests/test_presidio_pseudonymize.py
@ -30,7 +30,7 @@ def fake_faker():
    ],
    # fmt: on
 )
-def test_presidio_psudonymize_two_entities(
+def test_presidio_pseudonymize_two_entities(
    text, entity1, entity2, start1, end1, start2, end2, value1, value2, fake_faker
 ):

@ -51,3 +51,15 @@ def test_presidio_psudonymize_two_entities(
        assert value2 in pseudonym
        assert text[:start1].lower() in pseudonym.lower()
        assert text[end1:start2].lower() in pseudonym.lower()
+
+
+def test_simple_scenario():
+    original_text = "Hi my name is Doug Funny and this is my website: https://www.dougf.io" # noqa
+    presidio_response = [
+        RecognizerResult(entity_type="PERSON", start=14, end=24, score=0.85),
+        RecognizerResult(entity_type="URL", start=49, end=69, score=0.95),
+    ]
+
+    PresidioPseudonymization().pseudonymize(original_text=original_text,
+                                            presidio_response=presidio_response,
+                                            count=5)
@ -1 +1 @@
-0.1.2
+0.1.3