Merge branch 'master' into omri/add_spacy_to_nb

This commit is contained in:
Omri Mendels 2023-12-31 11:33:31 +02:00 коммит произвёл GitHub
Родитель d8b44bfeab 26e472e433
Коммит 48f0b359c5
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
35 изменённых файлов: 1109 добавлений и 475 удалений

44
.github/workflows/ci.yml поставляемый
Просмотреть файл

@ -1,44 +0,0 @@
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
name: Python package
on:
push:
branches: [ "master", "feature/*" ]
pull_request:
branches:
- main
- 'feature/**'
jobs:
build:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
python-version: ["3.7", "3.8", "3.9", "3.10"]
steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v3
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
python -m pip install flake8 pytest
pip install -r requirements.txt
python -m spacy download en_core_web_sm
python -m spacy download en_core_web_lg
- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
- name: Test with pytest
run: |
pytest

3
.gitignore поставляемый
Просмотреть файл

@ -189,4 +189,5 @@ datasets/
/data
*.spacy
*.pickle
*.pickle
/poetry.lock

Просмотреть файл

@ -1,45 +0,0 @@
# Python package
# Create and test a Python package on multiple Python versions.
# Add steps that analyze code, save the dist with the build record, publish to a PyPI-compatible index, and more:
# https://docs.microsoft.com/azure/devops/pipelines/languages/python
trigger:
- master
pr:
branches:
include:
- master
- feature/*
pool:
vmImage: 'ubuntu-latest'
strategy:
matrix:
Python37:
python.version: '3.7'
Python38:
python.version: '3.8'
Python39:
python.version: '3.9'
Python310:
python.version: '3.10'
steps:
- task: UsePythonVersion@0
inputs:
versionSpec: '$(python.version)'
displayName: 'Use Python $(python.version)'
- script: |
python -m pip install --upgrade pip
pip install wheel
pip install -r requirements.txt
python -m spacy download en_core_web_lg
python -m spacy download en_core_web_sm
displayName: 'Install base dependencies'
- script: |
pip install pytest pytest-azurepipelines
pytest
displayName: 'pytest'

Просмотреть файл

@ -13,6 +13,8 @@ In addition, it contains a fake data generator which creates fake sentences base
## Getting started
>Note: Presidio evaluator requires Python>=3.9
### From PyPI
``` sh
@ -36,8 +38,10 @@ conda create --name presidio python=3.9
conda activate presidio
# Install package+dependencies
pip install -r requirements.txt
python setup.py install
pip install poetry
poetry install
# To install with all additional NER dependencies (e.g. Flair, Stanza, CRF), run:
# poetry install --with ner
# Download a spaCy model used by presidio-analyzer
python -m spacy download en_core_web_lg
@ -70,7 +74,7 @@ Furthermore, it tokenizes the data, creates tags (either IO/BIO/BILUO) and spans
Once data is generated, it can be split into train/test/validation sets
while ensuring that each template only exists in one set.
See [this notebook for more details](notebooks/3_Split_by_pattern_%23.ipynb).
See [this notebook for more details](notebooks/3_Split_by_pattern_number.ipynb).
## 2. Data representation

Просмотреть файл

@ -1,2 +1 @@
0.1.1
0.1.3

35
azure-pipelines.yml Normal file
Просмотреть файл

@ -0,0 +1,35 @@
pr:
branches:
include:
- master
- feature/*
pool:
vmImage: 'ubuntu-latest'
strategy:
matrix:
Python38:
python.version: '3.8'
Python39:
python.version: '3.9'
Python310:
python.version: '3.10'
Python311:
python.version: '3.11'
steps:
- task: UsePythonVersion@0
inputs:
versionSpec: '$(python.version)'
displayName: 'Use Python $(python.version)'
- script: |
python -m pip install --upgrade pip
pip install poetry
poetry install --with dev,ner
displayName: 'Install dependencies'
- script: |
poetry add pytest-azurepipelines
poetry run pytest --runslow
displayName: 'pytest'

Просмотреть файл

@ -2,8 +2,23 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {
"is_executing": true
},
"outputs": [],
"source": [
"# install presidio via pip if not yet installed\n",
"\n",
"#!pip install presidio-analyzer\n",
"#!pip install presidio-evaluator"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"is_executing": true,
"scrolled": true
},
"outputs": [],
@ -69,9 +84,34 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"execution_count": 3,
"metadata": {
"is_executing": true
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Sampling: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 9149.88it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"My name is Joshua Jackson\n",
"[{\"value\": \"Joshua Jackson\", \"start\": 11, \"end\": 25, \"type\": \"name\"}]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"sentence_templates = [\n",
" \"My name is {{name}}\",\n",
@ -114,8 +154,9 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {
"is_executing": true,
"scrolled": true
},
"outputs": [],
@ -153,13 +194,228 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"metadata": {
"is_executing": true,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>number</th>\n",
" <th>gender</th>\n",
" <th>nationality</th>\n",
" <th>prefix</th>\n",
" <th>first_name</th>\n",
" <th>middle_initial</th>\n",
" <th>last_name</th>\n",
" <th>street_name</th>\n",
" <th>city</th>\n",
" <th>state_abbr</th>\n",
" <th>...</th>\n",
" <th>company</th>\n",
" <th>domain_name</th>\n",
" <th>person</th>\n",
" <th>name</th>\n",
" <th>first_name_female</th>\n",
" <th>first_name_male</th>\n",
" <th>prefix_female</th>\n",
" <th>prefix_male</th>\n",
" <th>last_name_female</th>\n",
" <th>last_name_male</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>female</td>\n",
" <td>Czech</td>\n",
" <td>Mrs.</td>\n",
" <td>Marie</td>\n",
" <td>J</td>\n",
" <td>Hamanová</td>\n",
" <td>P.O. Box 255</td>\n",
" <td>Kangerlussuaq</td>\n",
" <td>QE</td>\n",
" <td>...</td>\n",
" <td>Simple Solutions</td>\n",
" <td>MarathonDancing.gl</td>\n",
" <td>Marie J Hamanová</td>\n",
" <td>Marie J Hamanová</td>\n",
" <td>Marie</td>\n",
" <td></td>\n",
" <td>Mrs.</td>\n",
" <td></td>\n",
" <td>Hamanová</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>female</td>\n",
" <td>French</td>\n",
" <td>Ms.</td>\n",
" <td>Patricia</td>\n",
" <td>G</td>\n",
" <td>Desrosiers</td>\n",
" <td>Avenida Noruega 42</td>\n",
" <td>Vila Real</td>\n",
" <td>VR</td>\n",
" <td>...</td>\n",
" <td>Formula Gray</td>\n",
" <td>LostMillions.com.pt</td>\n",
" <td>Patricia Desrosiers</td>\n",
" <td>Patricia Desrosiers</td>\n",
" <td>Patricia</td>\n",
" <td></td>\n",
" <td>Ms.</td>\n",
" <td></td>\n",
" <td>Desrosiers</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>female</td>\n",
" <td>American</td>\n",
" <td>Ms.</td>\n",
" <td>Debra</td>\n",
" <td>O</td>\n",
" <td>Neal</td>\n",
" <td>1659 Hoog St</td>\n",
" <td>Brakpan</td>\n",
" <td>GA</td>\n",
" <td>...</td>\n",
" <td>Dahlkemper's</td>\n",
" <td>MediumTube.co.za</td>\n",
" <td>Debra O Neal</td>\n",
" <td>Debra O Neal</td>\n",
" <td>Debra</td>\n",
" <td></td>\n",
" <td>Ms.</td>\n",
" <td></td>\n",
" <td>Neal</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>male</td>\n",
" <td>French</td>\n",
" <td>Mr.</td>\n",
" <td>Peverell</td>\n",
" <td>C</td>\n",
" <td>Racine</td>\n",
" <td>183 Epimenidou Street</td>\n",
" <td>Limassol</td>\n",
" <td>LI</td>\n",
" <td>...</td>\n",
" <td>Quickbiz</td>\n",
" <td>ImproveLook.com.cy</td>\n",
" <td>Peverell Racine</td>\n",
" <td>Peverell Racine</td>\n",
" <td></td>\n",
" <td>Peverell</td>\n",
" <td></td>\n",
" <td>Mr.</td>\n",
" <td></td>\n",
" <td>Racine</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>female</td>\n",
" <td>Slovenian</td>\n",
" <td>Mrs.</td>\n",
" <td>Iolanda</td>\n",
" <td>S</td>\n",
" <td>Tratnik</td>\n",
" <td>Karu põik 61</td>\n",
" <td>Pärnu</td>\n",
" <td>PR</td>\n",
" <td>...</td>\n",
" <td>Dubrow's Cafeteria</td>\n",
" <td>PostTan.com.ee</td>\n",
" <td>Iolanda Tratnik</td>\n",
" <td>Iolanda Tratnik</td>\n",
" <td>Iolanda</td>\n",
" <td></td>\n",
" <td>Mrs.</td>\n",
" <td></td>\n",
" <td>Tratnik</td>\n",
" <td></td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 37 columns</p>\n",
"</div>"
],
"text/plain": [
" number gender nationality prefix first_name middle_initial last_name \\\n",
"0 1 female Czech Mrs. Marie J Hamanová \n",
"1 2 female French Ms. Patricia G Desrosiers \n",
"2 3 female American Ms. Debra O Neal \n",
"3 4 male French Mr. Peverell C Racine \n",
"4 5 female Slovenian Mrs. Iolanda S Tratnik \n",
"\n",
" street_name city state_abbr ... company \\\n",
"0 P.O. Box 255 Kangerlussuaq QE ... Simple Solutions \n",
"1 Avenida Noruega 42 Vila Real VR ... Formula Gray \n",
"2 1659 Hoog St Brakpan GA ... Dahlkemper's \n",
"3 183 Epimenidou Street Limassol LI ... Quickbiz \n",
"4 Karu põik 61 Pärnu PR ... Dubrow's Cafeteria \n",
"\n",
" domain_name person name \\\n",
"0 MarathonDancing.gl Marie J Hamanová Marie J Hamanová \n",
"1 LostMillions.com.pt Patricia Desrosiers Patricia Desrosiers \n",
"2 MediumTube.co.za Debra O Neal Debra O Neal \n",
"3 ImproveLook.com.cy Peverell Racine Peverell Racine \n",
"4 PostTan.com.ee Iolanda Tratnik Iolanda Tratnik \n",
"\n",
" first_name_female first_name_male prefix_female prefix_male \\\n",
"0 Marie Mrs. \n",
"1 Patricia Ms. \n",
"2 Debra Ms. \n",
"3 Peverell Mr. \n",
"4 Iolanda Mrs. \n",
"\n",
" last_name_female last_name_male \n",
"0 Hamanová \n",
"1 Desrosiers \n",
"2 Neal \n",
"3 Racine \n",
"4 Tratnik \n",
"\n",
"[5 rows x 37 columns]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Read FakeNameGenerator CSV\n",
"fake_name_generator_df = pd.read_csv(fake_name_generator_file)\n",
@ -178,8 +434,9 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"metadata": {
"is_executing": true,
"scrolled": true
},
"outputs": [],
@ -197,8 +454,10 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"execution_count": 7,
"metadata": {
"is_executing": true
},
"outputs": [],
"source": [
"fake.add_provider(IpAddressProvider) # Both Ipv4 and IPv6 IP addresses\n",
@ -223,8 +482,9 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 8,
"metadata": {
"is_executing": true,
"pycharm": {
"name": "#%%\n"
}
@ -258,13 +518,36 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 9,
"metadata": {
"is_executing": true,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Sampling: 100%|███████████████████████████████████████████████████████████████████████████████████████████| 1500/1500 [00:00<00:00, 17987.56it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\"fake\": \"Title VII of the Civil Rights Act of 2005 protects individuals against employment discrimination on the basis of race and color as well as national origin, sex, or religion.\\n\", \"spans\": [{\"value\": \"2005\", \"start\": 37, \"end\": 41, \"type\": \"year\"}], \"template\": \"Title VII of the Civil Rights Act of {{year}} protects individuals against employment discrimination on the basis of race and color as well as national origin, sex, or religion.\\n\", \"template_id\": 190}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"sentence_templates = PresidioDataGenerator.read_template_file(templates_file_path)\n",
"fake_records = data_generator.generate_fake_data(\n",
@ -284,11 +567,23 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 10,
"metadata": {
"is_executing": true,
"scrolled": true
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Total: 1500\n",
"Avg # of records per template: 7.142857142857143\n",
"Median # of records per template: 7.0\n",
"Std: 2.5872528966106905\n"
]
}
],
"source": [
"count_per_template_id = Counter([sample.template_id for sample in fake_records])\n",
"\n",
@ -311,13 +606,65 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 11,
"metadata": {
"is_executing": true,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"Counter({'organization': 257,\n",
" 'first_name': 244,\n",
" 'person': 238,\n",
" 'city': 235,\n",
" 'address': 209,\n",
" 'street_name': 164,\n",
" 'name': 162,\n",
" 'country': 154,\n",
" 'credit_card_number': 152,\n",
" 'phone_number': 121,\n",
" 'last_name': 119,\n",
" 'building_number': 110,\n",
" 'age': 72,\n",
" 'secondary_address': 64,\n",
" 'year': 58,\n",
" 'nationality': 55,\n",
" 'postcode': 49,\n",
" 'zipcode': 45,\n",
" 'url': 39,\n",
" 'email': 39,\n",
" 'name_female': 37,\n",
" 'job': 33,\n",
" 'first_name_male': 31,\n",
" 'name_male': 29,\n",
" 'prefix_male': 28,\n",
" 'date_of_birth': 24,\n",
" 'iban': 22,\n",
" 'date_time': 21,\n",
" 'prefix_female': 21,\n",
" 'day_of_week': 16,\n",
" 'state_abbr': 15,\n",
" 'last_name_male': 15,\n",
" 'prefix': 12,\n",
" 'ip_address': 11,\n",
" 'ssn': 11,\n",
" 'nation_plural': 9,\n",
" 'nation_woman': 8,\n",
" 'first_name_nonbinary': 6,\n",
" 'us_driver_license': 6,\n",
" 'first_name_female': 3,\n",
" 'last_name_female': 3})"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"count_per_entity = Counter()\n",
"for record in fake_records:\n",
@ -339,8 +686,9 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 12,
"metadata": {
"is_executing": true,
"pycharm": {
"name": "#%%\n"
}
@ -409,9 +757,22 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"execution_count": 13,
"metadata": {
"is_executing": true
},
"outputs": [
{
"data": {
"text/plain": [
"{\"fake\": \"Title VII of the Civil Rights Act of 2005 protects individuals against employment discrimination on the basis of race and color as well as national origin, sex, or religion.\\n\", \"spans\": [{\"value\": \"2005\", \"start\": 37, \"end\": 41, \"type\": \"DATE_TIME\"}], \"template\": \"Title VII of the Civil Rights Act of {{DATE_TIME}} protects individuals against employment discrimination on the basis of race and color as well as national origin, sex, or religion.\\n\", \"template_id\": 190}"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"fake_records[0]"
]
@ -425,13 +786,41 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 14,
"metadata": {
"is_executing": true,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"[('PERSON', 887),\n",
" ('STREET_ADDRESS', 596),\n",
" ('GPE', 404),\n",
" ('ORGANIZATION', 257),\n",
" ('CREDIT_CARD', 152),\n",
" ('PHONE_NUMBER', 121),\n",
" ('DATE_TIME', 119),\n",
" ('TITLE', 94),\n",
" ('NRP', 72),\n",
" ('AGE', 72),\n",
" ('ZIP_CODE', 45),\n",
" ('DOMAIN_NAME', 39),\n",
" ('EMAIL_ADDRESS', 39),\n",
" ('IBAN_CODE', 22),\n",
" ('IP_ADDRESS', 11),\n",
" ('US_SSN', 11),\n",
" ('US_DRIVER_LICENSE', 6)]"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\n",
"count_per_entity_new = Counter()\n",
@ -467,13 +856,51 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 15,
"metadata": {
"is_executing": true,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 0/1500 [00:00<?, ?it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"loading model en_core_web_sm\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 1500/1500 [00:06<00:00, 215.70it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 6.76 s, sys: 33.8 ms, total: 6.8 s\n",
"Wall time: 6.96 s\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"%%time\n",
"input_samples = [\n",
@ -495,8 +922,9 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 16,
"metadata": {
"is_executing": true,
"pycharm": {
"name": "#%%\n"
}
@ -519,21 +947,31 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 17,
"metadata": {
"is_executing": true,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 1500/1500 [00:00<00:00, 76888.23it/s]\n"
]
}
],
"source": [
"conll = InputSample.create_conll_dataset(input_samples)"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 18,
"metadata": {
"is_executing": true,
"pycharm": {
"name": "#%%\n"
}
@ -550,7 +988,7 @@
"### Next steps\n",
"\n",
"- Evaluate Presidio using this fake data. [Sample](4_Evaluate_Presidio_Analyzer.ipynb)\n",
"- Split to train/test/validation while ensuring sentences originiating from the same template are all on the same subset. [Sample](3_Split_by_pattern_#.ipynb)\n",
"- Split to train/test/validation while ensuring sentences originiating from the same template are all on the same subset. [Sample](3_Split_by_pattern_number.ipynb)\n",
"- Conduct a small exploratory data analysis on the generated data. [Sample](2_PII_EDA.ipynb)"
]
},

Просмотреть файл

@ -72,7 +72,7 @@
"metadata": {},
"outputs": [],
"source": [
"for (name, series) in pii_df.iteritems():\n",
"for (name, series) in pii_df.items():\n",
" print(name)\n",
" print(\"Unique values: {}\".format(len(series.unique())))\n",
" print(series.value_counts())\n",
@ -123,7 +123,7 @@
"metadata": {},
"outputs": [],
"source": [
"series_to_wordcloud(pii_df.country_full)"
"series_to_wordcloud(pii_df.country)"
]
},
{
@ -187,9 +187,9 @@
"metadata": {},
"outputs": [],
"source": [
"countries = [get_entity_values_from_sample(sample, [\"LOCATION\"]) for sample in synth]\n",
"countries = [get_entity_values_from_sample(sample, [\"TITLE\"]) for sample in synth]\n",
"countries = [item for sublist in countries for item in sublist]\n",
"series_to_wordcloud(pd.Series(countries, name=\"LOCATION\"))"
"series_to_wordcloud(pd.Series(countries, name=\"TITLE\"))"
]
},
{
@ -213,9 +213,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "presidio",
"display_name": "presidio-evaluator",
"language": "python",
"name": "presidio"
"name": "presidio-evaluator"
},
"language_info": {
"codemirror_mode": {
@ -227,9 +227,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
"version": "3.9.18"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
"nbformat_minor": 4
}

Просмотреть файл

@ -10,6 +10,18 @@
"This is different from the normal split since we don't want sentences generated from the same pattern to be in more than one set. (Applicable only if the dataset was generated from templates)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# install presidio via pip if not yet installed\n",
"\n",
"#!pip install presidio-analyzer\n",
"#!pip install presidio-evaluator"
]
},
{
"cell_type": "code",
"execution_count": null,
@ -131,13 +143,6 @@
"assert len(train) + len(test) + len(validation) == len(all_samples)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
@ -148,9 +153,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "presidio",
"display_name": "presidio-evaluator",
"language": "python",
"name": "presidio"
"name": "presidio-evaluator"
},
"language_info": {
"codemirror_mode": {
@ -162,9 +167,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
"version": "3.9.18"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
"nbformat_minor": 4
}

Просмотреть файл

@ -5,7 +5,21 @@
"id": "847acd88",
"metadata": {},
"source": [
"Evaluate Presidio Analyzer using the Presidio Evaluator framework"
"# Evaluate Presidio Analyzer using the Presidio Evaluator framework"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b946feda",
"metadata": {},
"outputs": [],
"source": [
"# install presidio via pip if not yet installed\n",
"\n",
"#!pip install presidio-evaluator\n",
"#!pip install \"presidio-analyzer[transformers]\"\n",
"#!pip install presidio-evaluator"
]
},
{
@ -19,6 +33,10 @@
"from copy import deepcopy\n",
"from pprint import pprint\n",
"from collections import Counter\n",
"from typing import List\n",
"\n",
"import warnings\n",
"warnings.filterwarnings('ignore')\n",
"\n",
"from presidio_evaluator import InputSample\n",
"from presidio_evaluator.evaluation import Evaluator, ModelError\n",
@ -32,7 +50,8 @@
"pd.set_option(\"display.max_colwidth\", None)\n",
"\n",
"%reload_ext autoreload\n",
"%autoreload 2"
"%autoreload 2\n",
"%matplotlib inline"
]
},
{
@ -52,6 +71,9 @@
"source": [
"dataset_name = \"synth_dataset_v2.json\"\n",
"dataset = InputSample.read_dataset_json(Path(Path.cwd().parent, \"data\", dataset_name))\n",
"\n",
"dataset = dataset[:300] # top 300 samples\n",
"\n",
"print(len(dataset))"
]
},
@ -62,10 +84,12 @@
"metadata": {},
"outputs": [],
"source": [
"entity_counter = Counter()\n",
"for sample in dataset:\n",
" for tag in sample.tags:\n",
" entity_counter[tag] += 1"
"def get_entity_counts(dataset:List[InputSample]):\n",
" entity_counter = Counter()\n",
" for sample in dataset:\n",
" for tag in sample.tags:\n",
" entity_counter[tag] += 1\n",
" return entity_counter\n"
]
},
{
@ -76,7 +100,7 @@
"outputs": [],
"source": [
"print(\"Count per entity:\")\n",
"pprint(entity_counter.most_common())\n",
"pprint(get_entity_counts(dataset).most_common())\n",
"\n",
"print(\"\\nExample sentence:\")\n",
"print(dataset[1])\n",
@ -94,12 +118,121 @@
")"
]
},
{
"cell_type": "markdown",
"id": "9c5e16cb-bee8-4f0a-a543-4879daa35b9e",
"metadata": {},
"source": [
"### Define the AnalyzerEngine object \n",
"In this case, using a huggingface model: obi/deid_roberta_i2b2"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "313b508f-e901-40b9-b575-c7fb8a794652",
"metadata": {},
"outputs": [],
"source": [
"from presidio_analyzer import AnalyzerEngine\n",
"from presidio_analyzer.nlp_engine import TransformersNlpEngine, NerModelConfiguration\n",
"\n",
"\n",
"# Here we define a transformers based NLP engine, \n",
"# but you can use this cell to customize your Presidio Analyzer instance\n",
"\n",
"# Define which model to use\n",
"model_config = [{\"lang_code\": \"en\", \"model_name\": {\n",
" \"spacy\": \"en_core_web_sm\", # use a small spaCy model for lemmas, tokens etc.\n",
" \"transformers\": \"obi/deid_roberta_i2b2\"\n",
" }\n",
"}]\n",
"\n",
"# Map transformers model labels to Presidio's\n",
"model_to_presidio_entity_mapping = dict(\n",
" PER=\"PERSON\",\n",
" PERSON=\"PERSON\",\n",
" LOC= \"LOCATION\",\n",
" LOCATION= \"LOCATION\",\n",
" GPE=\"LOCATION\",\n",
" ORG=\"ORGANIZATION\",\n",
" ORGANIZATION=\"ORGANIZATION\",\n",
" NORP=\"NRP\",\n",
" AGE=\"AGE\",\n",
" ID=\"ID\",\n",
" EMAIL=\"EMAIL\",\n",
" PATIENT=\"PERSON\",\n",
" STAFF=\"PERSON\",\n",
" HOSP=\"ORGANIZATION\",\n",
" PATORG=\"ORGANIZATION\",\n",
" DATE=\"DATE_TIME\",\n",
" TIME=\"DATE_TIME\",\n",
" PHONE=\"PHONE_NUMBER\",\n",
" HCW=\"PERSON\",\n",
" HOSPITAL=\"ORGANIZATION\",\n",
" FACILITY=\"LOCATION\",\n",
")\n",
"\n",
"ner_model_configuration = NerModelConfiguration(labels_to_ignore = [\"O\"], \n",
" model_to_presidio_entity_mapping=model_to_presidio_entity_mapping)\n",
"\n",
"nlp_engine = TransformersNlpEngine(models=model_config,\n",
" ner_model_configuration=ner_model_configuration)\n",
"\n",
"# Set up the engine, loads the NLP module (spaCy model by default) \n",
"# and other PII recognizers\n",
"analyzer_engine = AnalyzerEngine(nlp_engine=nlp_engine)"
]
},
{
"cell_type": "markdown",
"id": "aae4c379",
"metadata": {},
"source": [
"Run evaluation:"
"### Run evaluation"
]
},
{
"cell_type": "markdown",
"id": "16dbf6d6-a554-4602-8907-589786d47a12",
"metadata": {},
"source": [
"#### Define experiment"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "29d39ff1-4f14-4e32-ae84-ecc6c739f829",
"metadata": {},
"outputs": [],
"source": [
"experiment = get_experiment_tracker()\n",
"model = PresidioAnalyzerWrapper(analyzer_engine)\n",
"\n",
"# Define evaluator and experiment tracking\n",
"\n",
"evaluator = Evaluator(model=model)\n",
"dataset = Evaluator.align_entity_types(\n",
" deepcopy(dataset), entities_mapping=PresidioAnalyzerWrapper.presidio_entities_map\n",
")\n",
"\n",
"print(\"Count per entity after alignment:\")\n",
"pprint(get_entity_counts(dataset).most_common())\n",
"\n",
"# Track model and dataset params\n",
"params = {\"dataset_name\": dataset_name, \"model_name\": model.name}\n",
"params.update(model.to_log())\n",
"experiment.log_parameters(params)\n",
"experiment.log_dataset_hash(dataset)"
]
},
{
"cell_type": "markdown",
"id": "2a7d6626-d094-4dfd-8f37-c0443edf00dc",
"metadata": {},
"source": [
"#### Run experiment"
]
},
{
@ -109,39 +242,37 @@
"metadata": {},
"outputs": [],
"source": [
"print(\"Evaluating Presidio Analyzer\")\n",
"\n",
"experiment = get_experiment_tracker()\n",
"model_name = \"Presidio Analyzer\"\n",
"model = PresidioAnalyzerWrapper()\n",
"\n",
"evaluator = Evaluator(model=model)\n",
"dataset = Evaluator.align_entity_types(\n",
" deepcopy(dataset), entities_mapping=PresidioAnalyzerWrapper.presidio_entities_map\n",
")\n",
"\n",
"# Run experiment\n",
"evaluation_results = evaluator.evaluate_all(dataset)\n",
"results = evaluator.calculate_score(evaluation_results)\n",
"\n",
"# update params tracking\n",
"params = {\"dataset_name\": dataset_name, \"model_name\": model_name}\n",
"params.update(model.to_log())\n",
"experiment.log_parameters(params)\n",
"experiment.log_dataset_hash(dataset)\n",
"# Track experiment results\n",
"experiment.log_metrics(results.to_log())\n",
"entities, confmatrix = results.to_confusion_matrix()\n",
"experiment.log_confusion_matrix(matrix=confmatrix, labels=entities)\n",
"experiment.log_confusion_matrix(matrix=confmatrix, \n",
" labels=entities)\n",
"\n",
"print(\"Confusion matrix:\")\n",
"print(pd.DataFrame(confmatrix, columns=entities, index=entities))\n",
"\n",
"print(\"Precision and recall\")\n",
"print(results)\n",
"# Plot output\n",
"plotter = evaluator.Plotter(model=model, \n",
" results=results, \n",
" output_folder = \".\", \n",
" model_name = model.name, \n",
" beta = 2)\n",
"\n",
"# end experiment\n",
"experiment.end()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5b4d662d-596c-4a69-b3c9-1edcda20cc5b",
"metadata": {},
"outputs": [],
"source": [
"plotter.plot_scores()"
]
},
{
"cell_type": "markdown",
"id": "070f8287",
@ -185,7 +316,7 @@
"id": "98f4802e",
"metadata": {},
"source": [
"1. Most false positive tokens:"
"1. Most common false positive tokens:"
]
},
{
@ -206,7 +337,7 @@
"outputs": [],
"source": [
"fps_df = ModelError.get_fps_dataframe(errors, entity=[\"LOCATION\"])\n",
"fps_df[[\"full_text\", \"token\", \"prediction\"]]"
"fps_df[[\"full_text\", \"token\", \"annotation\", \"prediction\"]]"
]
},
{
@ -214,7 +345,7 @@
"id": "d0852513",
"metadata": {},
"source": [
"2. False negative examples"
"2. Most common false negative examples"
]
},
{
@ -224,7 +355,7 @@
"metadata": {},
"outputs": [],
"source": [
"ModelError.most_common_fn_tokens(errors, n=50, entity=[\"PERSON\"])"
"ModelError.most_common_fn_tokens(errors, n=50)"
]
},
{
@ -242,7 +373,7 @@
"metadata": {},
"outputs": [],
"source": [
"fns_df = ModelError.get_fns_dataframe(errors, entity=[\"PHONE_NUMBER\"])"
"fns_df = ModelError.get_fns_dataframe(errors, entity=[\"IP_ADDRESS\"])"
]
},
{
@ -265,13 +396,21 @@
"print(\"All errors:\\n\")\n",
"[print(error, \"\\n\") for error in errors]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a67ff38d-0817-4864-9991-b3eb1f80eecc",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "presidio",
"display_name": "presidio-evaluator",
"language": "python",
"name": "presidio"
"name": "presidio-evaluator"
},
"language_info": {
"codemirror_mode": {
@ -283,9 +422,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
"version": "3.9.18"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
}

Просмотреть файл

@ -24,7 +24,12 @@
"# install presidio via pip if not yet installed\n",
"\n",
"#!pip install presidio-analyzer\n",
"#!pip install presidio-anonymizer"
"#!pip install presidio-anonymizer\n",
"#!pip install presidio-evaluator\n",
"\n",
"# install trained model for pipeline\n",
"\n",
"#!python -m spacy download en_core_web_sm"
]
},
{
@ -83,7 +88,7 @@
{
"data": {
"text/plain": [
"[type: DOMAIN_NAME, start: 57, end: 69, score: 1.0,\n",
"[type: URL, start: 49, end: 69, score: 0.95,\n",
" type: PERSON, start: 14, end: 24, score: 0.85]"
]
},
@ -111,11 +116,11 @@
{
"data": {
"text/plain": [
"['Hi my name is Albert Cohen and this is my website: https://http://chapman-downs.info/',\n",
" 'Hi my name is Lisa Miller and this is my website: https://http://benson.org/',\n",
" 'Hi my name is Kathleen Hale and this is my website: https://http://www.garcia.com/',\n",
" 'Hi my name is Michelle Frederick and this is my website: https://https://robinson.com/',\n",
" 'Hi my name is Alicia Santana and this is my website: https://https://www.ray.org/']"
"['Hi my name is Tammy Ryan and this is my website: https://www.cardenas.info/',\n",
" 'Hi my name is Jessica Smith and this is my website: http://jones-hunt.info/',\n",
" 'Hi my name is Michele Marsh and this is my website: https://guerrero.com/',\n",
" 'Hi my name is Kathleen Miller and this is my website: https://lopez.com/',\n",
" 'Hi my name is Paul Brown and this is my website: http://www.banks-evans.info/']"
]
},
"execution_count": 6,
@ -148,11 +153,11 @@
"-------------\n",
"Fake examples:\n",
"\n",
"Our son R2D2 used to work in Botswana\n",
"Our son R2D2 used to work in American Samoa\n",
"Our son R2D2 used to work in Malawi\n",
"Our son R2D2 used to work in Montenegro\n",
"our son r2d2 used to work in lebanon\n"
"Our son R2D2 used to work in Nigeria\n",
"Our son R2D2 used to work in Guam\n",
"Our son R2D2 used to work in Reunion\n",
"Our son R2D2 used to work in Vanuatu\n",
"Our son R2D2 used to work in Malaysia\n"
]
}
],
@ -171,13 +176,20 @@
"print(f\"-------------\\nFake examples:\\n\")\n",
"print(*fake_samples, sep=\"\\n\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "presidio",
"display_name": "presidio-evaluator",
"language": "python",
"name": "presidio"
"name": "presidio-evaluator"
},
"language_info": {
"codemirror_mode": {
@ -189,9 +201,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
"version": "3.9.18"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
"nbformat_minor": 4
}

Просмотреть файл

@ -23,7 +23,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -34,7 +34,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
@ -42,55 +42,18 @@
},
"outputs": [],
"source": [
"DATA_DATE = \"Dec-19-2021\""
"DATA_DATE = \"Dec-27-2023\" # Change to the date when notebook 3 (split to train/test) was ran"
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"tokenizing input: 0%| | 0/2122 [00:00<?, ?it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"loading model en_core_web_sm\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"tokenizing input: 100%|███████████████████████████████████████████████████████████| 2122/2122 [00:19<00:00, 109.66it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Read 2122 samples\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"outputs": [],
"source": [
"data_path = \"../../data/{}_{}.json\"\n",
"\n",
@ -111,17 +74,9 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Kept 1940 samples after removal of non-tagged samples\n"
]
}
],
"outputs": [],
"source": [
"train_tagged = [sample for sample in train_samples if len(sample.spans) > 0]\n",
"print(\"Kept {} samples after removal of non-tagged samples\".format(len(train_tagged)))"
@ -140,45 +95,13 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Entities found in training set:\n"
]
},
{
"data": {
"text/plain": [
"{'ADDRESS',\n",
" 'CREDIT_CARD',\n",
" 'DATE_TIME',\n",
" 'DOMAIN_NAME',\n",
" 'EMAIL_ADDRESS',\n",
" 'IBAN_CODE',\n",
" 'IP_ADDRESS',\n",
" 'LOCATION',\n",
" 'O',\n",
" 'ORGANIZATION',\n",
" 'PERSON',\n",
" 'PHONE_NUMBER',\n",
" 'PREFIX',\n",
" 'TITLE',\n",
" 'US_SSN'}"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"print(\"Entities found in training set:\")\n",
"entities = []\n",
@ -206,16 +129,7 @@
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Skipping illegal span None, text=ΜΟΝΗ ΑΓΙΩΝ ΑΝΑΡΓΥΡΩΝ\n",
"Skipping illegal span None, text=U.N\n"
]
}
],
"outputs": [],
"source": [
"spacy_train = InputSample.create_spacy_dataset(\n",
" dataset=train_tagged, output_path=\"train.spacy\"\n",
@ -281,9 +195,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "presidio",
"display_name": "presidio-evaluator",
"language": "python",
"name": "presidio"
"name": "presidio-evaluator"
},
"language_info": {
"codemirror_mode": {
@ -295,9 +209,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
"version": "3.9.18"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
"nbformat_minor": 4
}

Просмотреть файл

@ -39,6 +39,16 @@
"%autoreload 2"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aee00770-a972-4a19-b423-1724214cc88c",
"metadata": {},
"outputs": [],
"source": [
"#!pip install sklearn_crfsuite"
]
},
{
"cell_type": "markdown",
"id": "a0d2d772",
@ -58,8 +68,9 @@
},
"outputs": [],
"source": [
"DATA_DATE = \"Jan-15-2022\"\n",
"dataset = InputSample.read_dataset_json(\"../../data/test_{}.json\".format(DATA_DATE))\n",
"DATA_DATE = \"Dec-27-2023\" # Date when the split to train/test notebook was ran\n",
"dataset_name = \"../../data/test_{}.json\".format(DATA_DATE)\n",
"dataset = InputSample.read_dataset_json(dataset_name)\n",
"print(len(dataset))"
]
},
@ -76,7 +87,7 @@
"source": [
"entity_counter = Counter()\n",
"for sample in dataset:\n",
" for t>ag in sample.tags:\n",
" for tag in sample.tags:\n",
" entity_counter[tag] += 1"
]
},
@ -257,7 +268,7 @@
"metadata": {},
"outputs": [],
"source": [
"fps_df = ModelError.get_fps_dataframe(errors, entity=[\"GPE\"])\n",
"fps_df = ModelError.get_fps_dataframe(errors, entity=[\"PERSON\"])\n",
"fps_df[[\"full_text\", \"token\", \"prediction\"]]"
]
},
@ -276,7 +287,7 @@
"metadata": {},
"outputs": [],
"source": [
"ModelError.most_common_fn_tokens(errors, n=50, entity=[\"PERSON\"])"
"ModelError.most_common_fn_tokens(errors, n=50, entity=[\"ORGANIZATION\"])"
]
},
{
@ -325,13 +336,21 @@
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "cf3e4646-ca93-44c5-a998-cd77f4bf2708",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "presidio",
"display_name": "presidio-evaluator",
"language": "python",
"name": "presidio"
"name": "presidio-evaluator"
},
"language_info": {
"codemirror_mode": {
@ -343,9 +362,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
"version": "3.9.18"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
}

Просмотреть файл

@ -205,7 +205,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.9.13 ('presidio')",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@ -219,9 +219,8 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
"version": "3.9.18"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "371968787ec79dd50357533864944a85029366968470cac36beb694745c2f7d6"
@ -229,5 +228,5 @@
}
},
"nbformat": 4,
"nbformat_minor": 2
"nbformat_minor": 4
}

Просмотреть файл

@ -35,6 +35,16 @@
"%autoreload 2"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a0c3285c-06a2-4361-aec2-8375496f75b3",
"metadata": {},
"outputs": [],
"source": [
"#!pip install flair"
]
},
{
"cell_type": "markdown",
"id": "f036de59",
@ -111,15 +121,14 @@
"metadata": {},
"outputs": [],
"source": [
"flair_ner = \"ner-english\"\n",
"flair_ner_fast = \"ner-english-fast\"\n",
"flair_ontonotes_fast = \"ner-english-ontonotes-fast\"\n",
"flair_ontonotes_large = \"ner-english-ontonotes-large\"\n",
"flair_ner = \"flair/ner-english\"\n",
"flair_ner_fast = \"flair/ner-english-fast\"\n",
"flair_ontonotes_fast = \"flair/ner-english-ontonotes-fast\"\n",
"flair_ontonotes_large = \"flair/ner-english-ontonotes-large\"\n",
"models = [\n",
" flair_ner,\n",
" flair_ner_fast,\n",
" flair_ontonotes_fast,\n",
" flair_ner_fast,\n",
" flair_ontonotes_large,\n",
"]"
]
@ -312,9 +321,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "presidio",
"display_name": "presidio-evaluator",
"language": "python",
"name": "presidio"
"name": "presidio-evaluator"
},
"language_info": {
"codemirror_mode": {
@ -326,7 +335,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
"version": "3.9.18"
}
},
"nbformat": 4,

Просмотреть файл

@ -109,7 +109,10 @@
"metadata": {},
"outputs": [],
"source": [
"models = [\"en_core_web_sm\", \"en_core_web_lg\", \"en_core_web_trf\"]"
"models = [\"en_core_web_sm\", \"en_core_web_lg\", \"en_core_web_trf\"]\n",
"\n",
"# If needed, install models using `python -m spacy download X` where x is the model name, or use spacy.cli.download:\n",
"#spacy.cli.download(\"en_core_web_trf\")"
]
},
{
@ -334,9 +337,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "presidio",
"display_name": "presidio-evaluator",
"language": "python",
"name": "presidio"
"name": "presidio-evaluator"
},
"language_info": {
"codemirror_mode": {
@ -348,9 +351,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
"version": "3.9.18"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
"nbformat_minor": 4
}

Просмотреть файл

@ -77,7 +77,7 @@ class UsDriverLicenseProvider(BaseProvider):
formats = yaml.safe_load(open(us_driver_license_file))
self.formats = formats['en']['faker']['driving_license']['usa']
def driver_license(self) -> str:
def us_driver_license(self) -> str:
# US driver's licenses patterns vary by state. Here we sample a random state and format
us_state = random.choice(list(self.formats))
us_state_format = random.choice(self.formats[us_state])

Просмотреть файл

@ -170,6 +170,8 @@ class PresidioDataGenerator:
new_provider = BaseProvider(self.faker)
setattr(new_provider, new_name, original)
setattr(new_provider, new_name.lower(), original) # avoid case sensitivity
setattr(new_provider, new_name.upper(), original) # avoid case sensitivity
self.faker.add_provider(new_provider)
@staticmethod

Просмотреть файл

@ -25,7 +25,7 @@ class PresidioPseudonymization(PresidioDataGenerator):
self.add_provider_alias("credit_card_number", "CREDIT_CARD")
self.add_provider_alias("iban", "IBAN_CODE")
self.add_provider_alias("phone_number", "PHONE_NUMBER")
self.add_provider_alias("url", "DOMAIN_NAME")
self.add_provider_alias("url", "URL")
self.add_provider_alias("ssn", "US_SSN")
self.add_provider_alias("email", "EMAIL_ADDRESS")
self.add_provider_alias("date_time", "DATE_TIME")

Просмотреть файл

@ -537,7 +537,7 @@ class InputSample(object):
if span.entity_type in dictionary:
span.entity_type = dictionary[span.entity_type]
elif ignore_unknown:
span.entity_value = "O"
span.entity_type = "O"
# Remove spans if they were changed to "O"
self.spans = [span for span in self.spans if span.entity_type != "O"]

Просмотреть файл

@ -1,6 +1,8 @@
import copy
from collections import Counter
from typing import List, Optional, Dict
from pathlib import Path
import string
import numpy as np
from tqdm import tqdm
@ -39,7 +41,6 @@ class Evaluator:
self.entities_to_keep = self.model.entities
def compare(self, input_sample: InputSample, prediction: List[str]):
"""
Compares ground truth tags (annotation) and predicted (prediction)
:param input_sample: input sample containing list of tags with scheme
@ -71,6 +72,9 @@ class Evaluator:
if self.entities_to_keep:
prediction = self._adjust_per_entities(prediction)
new_annotation = self._adjust_per_entities(new_annotation)
skip_words = self.get_skip_words()
for i in range(0, len(new_annotation)):
results[(new_annotation[i], prediction[i])] += 1
@ -81,6 +85,10 @@ class Evaluator:
# check if there was an error
is_error = new_annotation[i] != prediction[i]
if str(tokens[i]).lower().strip() in skip_words:
is_error = False
results[(new_annotation[i], prediction[i])] -= 1
if is_error:
if prediction[i] == "O":
mistakes.append(
@ -151,7 +159,6 @@ class Evaluator:
f"Mapping entity values using this dictionary: {self.model.entity_mapping}"
)
for sample in tqdm(dataset, desc=f"Evaluating {self.model.__class__}"):
# Align tag values to the ones expected by the model
self.model.align_entity_types(sample)
@ -345,13 +352,13 @@ class Evaluator:
if np.isnan(precision) or np.isnan(recall) or (precision == 0 and recall == 0):
return np.nan
return ((1 + beta ** 2) * precision * recall) / (
((beta ** 2) * precision) + recall
return ((1 + beta**2) * precision * recall) / (
((beta**2) * precision) + recall
)
class Plotter:
"""
Plot scores (f2, precision, recall) and errors (false-positivies, false-negatives)
Plot scores (f2, precision, recall) and errors (false-positivies, false-negatives)
for a PII detection model evaluated via Evaluator
:param model: Instance of a fitted model (of base type BaseModel)
@ -362,7 +369,9 @@ class Evaluator:
which gives more or less weight to precision vs. recall
"""
def __init__(self, model, results, output_folder: Path, model_name: str, beta: float):
def __init__(
self, model, results, output_folder: Path, model_name: str, beta: float
):
self.model = model
self.results = results
self.output_folder = output_folder
@ -372,41 +381,66 @@ class Evaluator:
def plot_scores(self) -> None:
"""
Plots per-entity recall, precision, or F2 score for evaluated model.
:param plot_type: which metric to graph (default is F2 score)
Plots per-entity recall, precision, or F2 score for evaluated model.
"""
scores = {}
scores['entity'] = list(self.results.entity_recall_dict.keys())
scores['recall'] = list(self.results.entity_recall_dict.values())
scores['precision'] = list(self.results.entity_precision_dict.values())
scores['count'] = list(self.results.n_dict.values())
scores[f"f{self.beta}_score"] = [Evaluator.f_beta(precision=precision, recall=recall, beta=self.beta)
for recall, precision in zip(scores['recall'], scores['precision'])]
entity_recall_dict = copy.deepcopy(self.results.entity_recall_dict)
entity_precision_dict = copy.deepcopy(self.results.entity_precision_dict)
scores["entity"] = list(entity_recall_dict.keys())
scores["recall"] = list(entity_recall_dict.values())
scores["precision"] = list(entity_precision_dict.values())
scores["count"] = list(self.results.n_dict.values())
scores[f"f{self.beta}_score"] = [
Evaluator.f_beta(precision=precision, recall=recall, beta=self.beta)
for recall, precision in zip(scores["recall"], scores["precision"])
]
# Add PII detection rates
scores["entity"].append("PII")
scores["recall"].append(self.results.pii_recall)
scores["precision"].append(self.results.pii_precision)
scores["count"].append(self.results.n)
scores[f"f{self.beta}_score"].append(self.results.pii_f)
df = pd.DataFrame(scores)
df['model'] = self.model_name
df["model"] = self.model_name
self._plot(df, plot_type="f2_score")
self._plot(df, plot_type="precision")
self._plot(df, plot_type="recall")
def _plot(self, df, plot_type) -> None:
fig = px.bar(df, text_auto=".2", y='entity', orientation="h",
x=plot_type, color='count', barmode='group', title=f"Per-entity {plot_type} for {self.model_name}")
fig.update_layout(barmode='group', yaxis={
'categoryorder': 'total ascending'})
fig = px.bar(
df,
text_auto=".2",
y="entity",
orientation="h",
x=plot_type,
color="count",
barmode="group",
height=30*len(set(df["entity"])),
title=f"Per-entity {plot_type} for {self.model_name}",
)
fig.update_layout(
barmode="group", yaxis={"categoryorder": "total ascending"}
)
fig.update_layout(yaxis_title=f"{plot_type}", xaxis_title="PII Entity")
fig.update_traces(textfont_size=12, textangle=0,
textposition="outside", cliponaxis=False)
fig.update_traces(
textfont_size=12, textangle=0, textposition="outside", cliponaxis=False
)
fig.update_layout(
plot_bgcolor="#FFF",
xaxis=dict(
title="PII entity",
linecolor="#BCCCDC", # Sets color of X-axis line
showgrid=False # Removes X-axis grid lines
showgrid=False, # Removes X-axis grid lines
),
yaxis=dict(
title=f"{plot_type}",
linecolor="#BCCCDC", # Sets color of X-axis line
showgrid=False # Removes X-axis grid lines
showgrid=False, # Removes X-axis grid lines
),
)
fig.show()
@ -419,47 +453,100 @@ class Evaluator:
for entity in self.model.entity_mapping.values():
fps_df = ModelError.get_fps_dataframe(self.errors, entity=[entity])
if fps_df is not None:
fps_path = self.output_folder / \
f"{self.model_name}-{entity}-fps.csv"
fps_path = (
self.output_folder / f"{self.model_name}-{entity}-fps.csv"
)
fps_df.to_csv(fps_path)
fps_frames.append(fps_path)
fns_df = ModelError.get_fns_dataframe(self.errors, entity=[entity])
if fns_df is not None:
fns_path = self.output_folder / \
f"{self.model_name}-{entity}-fns.csv"
fns_path = (
self.output_folder / f"{self.model_name}-{entity}-fns.csv"
)
fns_df.to_csv(fns_path)
fns_frames.append(fns_path)
def group_tokens(df):
return df.groupby(['token', 'annotation']).size().to_frame(
).sort_values([0], ascending=False).head(3).reset_index()
return (
df.groupby(["token", "annotation"])
.size()
.to_frame()
.sort_values([0], ascending=False)
.head(3)
.reset_index()
)
fps_tokens_df = pd.concat(
[group_tokens(pd.read_csv(df_path)) for df_path in fps_frames])
[group_tokens(pd.read_csv(df_path)) for df_path in fps_frames]
)
fns_tokens_df = pd.concat(
[group_tokens(pd.read_csv(df_path)) for df_path in fns_frames])
[group_tokens(pd.read_csv(df_path)) for df_path in fns_frames]
)
def generate_graph(title, tokens_df):
fig = px.histogram(tokens_df, x=0, y="token", orientation='h', color='annotation',
title=f"Most common {title} for {self.model_name}")
fig = px.histogram(
tokens_df,
x=0,
y="token",
orientation="h",
color="annotation",
title=f"Most common {title} for {self.model_name}",
)
fig.update_layout(yaxis_title=f"count", xaxis_title="PII Entity")
fig.update_traces(textfont_size=12, textangle=0,
textposition="outside", cliponaxis=False)
fig.update_traces(
textfont_size=12,
textangle=0,
textposition="outside",
cliponaxis=False,
)
fig.update_layout(
plot_bgcolor="#FFF",
xaxis=dict(
title="Count",
linecolor="#BCCCDC", # Sets color of X-axis line
showgrid=False # Removes X-axis grid lines
showgrid=False, # Removes X-axis grid lines
),
yaxis=dict(
title=f"Tokens",
linecolor="#BCCCDC", # Sets color of X-axis line
showgrid=False # Removes X-axis grid lines
showgrid=False, # Removes X-axis grid lines
),
)
fig.update_layout(yaxis={'categoryorder': 'total ascending'})
fig.update_layout(yaxis={"categoryorder": "total ascending"})
fig.show()
generate_graph(title="false-negatives", tokens_df=fns_tokens_df)
generate_graph(title="false-positives", tokens_df=fps_tokens_df)
@staticmethod
def get_skip_words():
    """
    Return the tokens that are ignored when counting prediction errors.

    The list holds every character of ``string.punctuation``, a few
    whitespace / quote markers, and common address or filler words.
    ``compare`` looks up ``str(token).lower().strip()`` in this list, so
    entries have to be lowercase. A whitespace-only token is reduced to
    the empty string by that ``strip()``, which is why ``""`` is listed:
    without it the newline entries below could never match.

    :return: A new list of strings, built on every call
    """
    skip_words = list(string.punctuation)
    skip_words.extend(
        [
            # What "\n", "\n\n", " " ... become after the caller's strip()
            "",
            # Raw newline markers, kept for callers that do not strip
            "\n",
            "\n\n",
            "\n\n\n",
            ">>",
            ">>>",
            ">>>>",
            "street",
            "st.",
            "st",
            "de",
            "rue",
            "via",
            "and",
            "or",
            "do",
            "as",
            "of",
            "day",
            "address",
            "country",
            "state",
            "city",
        ]
    )
    return skip_words

Просмотреть файл

@ -31,9 +31,10 @@ class BaseModel(ABC):
self.labeling_scheme = labeling_scheme
self.entity_mapping = entity_mapping
self.verbose = verbose
self.name = self.__class__.__name__
@abstractmethod
def predict(self, sample: InputSample) -> List[str]:
def predict(self, sample: InputSample, **kwargs) -> List[str]:
"""
Abstract. Returns the predicted tokens/spans from the evaluated model
:param sample: Sample to be evaluated

Просмотреть файл

@ -85,7 +85,7 @@ class CRFModel(BaseModel):
y_train = [self.sent2labels(s) for s in sentences]
return X_train, y_train
def predict(self, sample: InputSample) -> List[str]:
def predict(self, sample: InputSample, **kwargs) -> List[str]:
tags = CRFModel.crf_predict(sample, self.model)
if len(tags) != len(sample.tokens):

Просмотреть файл

@ -48,7 +48,7 @@ class FlairModel(BaseModel):
self.spacy_tokenizer = SpacyTokenizer(model=spacy.load("en_core_web_sm"))
def predict(self, sample: InputSample) -> List[str]:
def predict(self, sample: InputSample, **kwargs) -> List[str]:
sentence = Sentence(text=sample.full_text, use_tokenizer=self.spacy_tokenizer)
self.model.predict(sentence)

Просмотреть файл

@ -1,6 +1,6 @@
from typing import List, Optional, Dict
from presidio_analyzer import AnalyzerEngine
from presidio_analyzer import AnalyzerEngine, EntityRecognizer
from presidio_evaluator import InputSample, span_to_tag
from presidio_evaluator.models import BaseModel
@ -16,6 +16,9 @@ class PresidioAnalyzerWrapper(BaseModel):
score_threshold: float = 0.4,
language: str = "en",
entity_mapping: Optional[Dict[str, str]] = None,
ad_hoc_recognizers: Optional[List[EntityRecognizer]] = None,
context: Optional[List[str]] = None,
allow_list: Optional[List[str]] = None,
):
"""
Evaluation wrapper for the Presidio Analyzer
@ -29,25 +32,37 @@ class PresidioAnalyzerWrapper(BaseModel):
)
self.score_threshold = score_threshold
self.language = language
self.ad_hoc_recognizers = ad_hoc_recognizers
self.context = context
self.allow_list = allow_list
if not analyzer_engine:
analyzer_engine = AnalyzerEngine()
self._update_recognizers_based_on_entities_to_keep(analyzer_engine)
self.analyzer_engine = analyzer_engine
def predict(self, sample: InputSample) -> List[str]:
def predict(self, sample: InputSample, **kwargs) -> List[str]:
language = kwargs.get("language", self.language)
score_threshold = kwargs.get("score_threshold", self.score_threshold)
ad_hoc_recognizers = kwargs.get("ad_hoc_recognizers", self.ad_hoc_recognizers)
context = kwargs.get("context", self.context)
allow_list = kwargs.get("allow_list", self.allow_list)
results = self.analyzer_engine.analyze(
text=sample.full_text,
entities=self.entities,
language=self.language,
score_threshold=self.score_threshold,
language=language,
score_threshold=score_threshold,
ad_hoc_recognizers=ad_hoc_recognizers,
context=context,
allow_list=allow_list,
**kwargs,
)
starts = []
ends = []
scores = []
tags = []
#
for res in results:
starts.append(res.start)
ends.append(res.end)
@ -76,23 +91,28 @@ class PresidioAnalyzerWrapper(BaseModel):
"PHONE_NUMBER": "PHONE_NUMBER",
"BIRTHDAY": "DATE_TIME",
"DATE_TIME": "DATE_TIME",
"DOMAIN_NAME": "DOMAIN_NAME",
"DOMAIN_NAME": "URL",
"TIME" : "DATE_TIME",
"DATE" : "DATE_TIME",
"CITY": "LOCATION",
"ADDRESS": "LOCATION",
"STREET_ADDRESS": "LOCATION",
"NATIONALITY": "LOCATION",
"LOCATION": "LOCATION",
"IBAN_CODE": "IBAN_CODE",
"URL": "DOMAIN_NAME",
"URL": "URL",
"US_SSN": "US_SSN",
"IP_ADDRESS": "IP_ADDRESS",
"ORGANIZATION": "ORG",
"ORGANIZATION": "ORGANIZATION",
"ORG": "ORGANIZATION",
"US_DRIVER_LICENSE": "US_DRIVER_LICENSE",
"NRP": "NRP",
"TITLE": "O", # not supported
"PREFIX": "O", # not supported
"STREET_ADDRESS": "O", # not supported
"ZIP_CODE": "O", # not supported
"AGE": "O", # not supported
"NRP": "LOCATION",
"NORP": "LOCATION",
"ID": "ID",
"TITLE": "O", # not supported through spaCy
"PREFIX": "O", # not supported through spaCy
"ZIP_CODE": "O", # not supported through spaCy
"AGE": "O", # not supported through spaCy
"O": "O",
}

Просмотреть файл

@ -41,12 +41,15 @@ class PresidioRecognizerWrapper(BaseModel):
self.recognizer = recognizer
self.nlp_engine = nlp_engine
if not self.nlp_engine.is_loaded():
self.nlp_engine.load()
#
def __make_nlp_artifacts(self, text: str):
return self.nlp_engine.process_text(text, "en")
#
def predict(self, sample: InputSample) -> List[str]:
def predict(self, sample: InputSample, **kwargs) -> List[str]:
nlp_artifacts = None
if self.with_nlp_artifacts:
nlp_artifacts = self.__make_nlp_artifacts(sample.full_text)

Просмотреть файл

@ -31,7 +31,7 @@ class SpacyModel(BaseModel):
else:
self.model = model
def predict(self, sample: InputSample) -> List[str]:
def predict(self, sample: InputSample, **kwargs) -> List[str]:
"""
Predict a list of tags for an inpuit sample.
:param sample: InputSample

Просмотреть файл

@ -51,7 +51,7 @@ class StanzaModel(SpacyModel):
entity_mapping=entity_mapping,
)
def predict(self, sample: InputSample) -> List[str]:
def predict(self, sample: InputSample, **kwargs) -> List[str]:
"""
Predict the tags using a stanza model.

Просмотреть файл

@ -48,8 +48,7 @@ class TextAnalyticsWrapper(BaseModel):
)
return text_analytics_client
def predict(self, sample: InputSample) -> List[str]:
def predict(self, sample: InputSample, **kwargs) -> List[str]:
documents = [sample.full_text]
response = self.ta_client.recognize_pii_entities(documents,
language="en")

45
pyproject.toml Normal file
Просмотреть файл

@ -0,0 +1,45 @@
# Package metadata
[tool.poetry]
name = "presidio_evaluator"
version = "0.1.0"
description = ""
authors = ["Omri Mendels <omri374@users.noreply.github.com>"]
readme = "README.md"
# Ship the raw data files used by the data generator with the package
include = [{ path= "presidio_evaluator/data_generator/raw_data/*"}]

# Runtime dependencies
[tool.poetry.dependencies]
python = "^3.9"
spacy = "^3.5.0"
numpy = "^1.22"
pandas = "^2.1.4"
tqdm = "^4.60.0"
faker = "^21.0"
scikit-learn = "^1.3.2"
presidio-analyzer = "^2.2.351"
presidio-anonymizer = "^2.2.351"
requests = "^2.25"
xmltodict = "^0.12.0"
python-dotenv = "^1.0.0"
plotly = "^5.18.0"
azure-ai-textanalytics = "^5.3.0"
# spaCy English models, installed directly from their release archives
en_core_web_sm = {url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz"}
en_core_web_lg = {url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1.tar.gz"}

# optional dependencies for the different NLP approaches
[tool.poetry.group.ner]
optional=true

[tool.poetry.group.ner.dependencies]
flair = "^0.13.0"
spacy_stanza = "^1.0.0"
sklearn_crfsuite = "^0.3.6"
spacy_huggingface_pipelines = "^0.0.4"

# Development and test tooling.
# Lower bounds are written without a trailing ".*": PEP 440 only permits
# wildcards together with the "==" and "!=" operators.
[tool.poetry.group.dev.dependencies]
pytest = ">=6.0"
flake8 = ">=3.0"
pytest-azurepipelines = "^1.0.5"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

Просмотреть файл

@ -1,21 +0,0 @@
spacy>=3.2.0
numpy>=1.20.2
jupyter>=1
pandas>=1.2.4
tqdm>=4.60.0
haikunator>=2.1.0
schwifty
faker>=9.6.0
scikit_learn
#flair
#stanza
#spacy_stanza
#sklearn_crfsuite
pytest>=6.2.3
presidio_analyzer
presidio_anonymizer
requests>=2.25.1
xmltodict>=0.12.0
python-dotenv
plotly
azure-ai-textanalytics==5.2.0

Просмотреть файл

@ -1,20 +0,0 @@
spacy>=3.2.0
numpy>=1.12.4
jupyter>=1
pandas>=1.3.4
tqdm>=4.60.0
haikunator>=2.1.0
schwifty
faker>=9.6.0
scikit_learn<0.24
pytest>=6.2.3
presidio_analyzer
presidio_anonymizer
requests>=2.25.1
xmltodict>=0.12.0
torch>=1.10.1
python-dotenv
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz#egg=en_core_web_sm
flair>=0.10
stanza>=1.3.0
spacy-stanza>=1.0.1

Просмотреть файл

@ -1,54 +1,53 @@
from setuptools import setup, find_packages
import os.path
# read the contents of the README file
# -*- coding: utf-8 -*-
from setuptools import setup
import os
from os import path
this_directory = path.abspath(path.dirname(__file__))
with open(path.join(this_directory, "README.md"), encoding="utf-8") as f:
long_description = f.read()
# print(long_description)
with open(os.path.join(this_directory, "VERSION")) as version_file:
__version__ = version_file.read().strip()
version = version_file.read().strip()
packages = [
"presidio_evaluator",
"presidio_evaluator.data_generator",
"presidio_evaluator.data_generator.faker_extensions",
"presidio_evaluator.dataset_formatters",
"presidio_evaluator.evaluation",
"presidio_evaluator.experiment_tracking",
"presidio_evaluator.models",
]
package_data = {"": ["*"], "presidio_evaluator.data_generator": ["raw_data/*"]}
install_requires = [
"azure-ai-textanalytics>=5.3.0,<6.0.0",
"faker>=21.0,<22.0",
"numpy>=1.22,<2.0",
"pandas>=2.1.4,<3.0.0",
"plotly>=5.18.0,<6.0.0",
"presidio-analyzer>=2.2.351,<3.0.0",
"presidio-anonymizer>=2.2.351,<3.0.0",
"python-dotenv>=1.0.0,<2.0.0",
"requests>=2.25,<3.0",
"scikit-learn>=1.3.2,<2.0.0",
"spacy>=3.5.0,<4.0.0",
"tqdm>=4.60.0,<5.0.0",
"xmltodict>=0.12.0,<0.13.0",
]
setup(
name="presidio-evaluator",
long_description=long_description,
long_description_content_type="text/markdown",
version=__version__,
packages=find_packages(exclude=["tests"]),
url="https://www.github.com/microsoft/presidio-research",
version=version,
license="MIT",
description="PII dataset generator, model evaluator for Presidio and PII data in general", # noqa
data_files=[
(
"presidio_evaluator/data_generator/raw_data",
[
"presidio_evaluator/data_generator/raw_data/FakeNameGenerator.com_3000.csv", # noqa
"presidio_evaluator/data_generator/raw_data/templates.txt",
"presidio_evaluator/data_generator/raw_data/companies_and_organizations.csv",
"presidio_evaluator/data_generator/raw_data/nationalities.csv",
"presidio_evaluator/data_generator/raw_data/us_driver_licenses.csv",
],
)
],
include_package_data=True,
install_requires=[
"presidio_analyzer",
"presidio_anonymizer",
"spacy>=3.0.0",
"requests",
"numpy",
"pandas",
"tqdm>=4.32.1",
"jupyter>=1.0.0",
"pytest>=4.6.2",
"haikunator",
"schwifty",
"faker",
"sklearn_crfsuite",
"python-dotenv",
"azure-ai-textanalytics==5.2.0"
],
)
packages=packages,
package_data=package_data,
install_requires=install_requires,
python_requires=">=3.8,<4.0",
)

Просмотреть файл

@ -345,3 +345,22 @@ def test_align_entity_types_wrong_mapping_exception():
Evaluator.align_entity_types(
input_samples=[sample1], entities_mapping=entities_mapping
)
def test_skip_words_are_not_counted_as_errors():
    """
    A skip word ("street") that the model tags as LOCATION while the
    annotation says "O" must not lower PII precision or recall.
    """
    predicted_tags = ["U-PERSON", "O", "O", "O", "U-LOCATION"]
    mock_model = MockTokensModel(
        prediction=predicted_tags, entities_to_keep=["LOCATION", "PERSON"]
    )
    scorer = Evaluator(model=mock_model)

    input_sample = InputSample(
        full_text="John is on the street",
        masked="I am the street",
        spans=None,
    )
    # Tokens and gold tags are set by hand; only the last token differs
    # from the prediction, and that token is a skip word.
    input_sample.tokens = ["John", "is", "on", "the", "street"]
    input_sample.tags = ["U-PERSON", "O", "O", "O", "O"]

    sample_result = scorer.evaluate_sample(input_sample, predicted_tags)
    scores = scorer.calculate_score([sample_result])

    assert scores.pii_precision == 1
    assert scores.pii_recall == 1

Просмотреть файл

@ -30,7 +30,7 @@ def fake_faker():
],
# fmt: on
)
def test_presidio_psudonymize_two_entities(
def test_presidio_pseudonymize_two_entities(
text, entity1, entity2, start1, end1, start2, end2, value1, value2, fake_faker
):
@ -51,3 +51,15 @@ def test_presidio_psudonymize_two_entities(
assert value2 in pseudonym
assert text[:start1].lower() in pseudonym.lower()
assert text[end1:start2].lower() in pseudonym.lower()
def test_simple_scenario():
    """Smoke test: pseudonymizing a PERSON span and a URL span must not raise."""
    text = "Hi my name is Doug Funny and this is my website: https://www.dougf.io"  # noqa
    person_span = RecognizerResult(entity_type="PERSON", start=14, end=24, score=0.85)
    url_span = RecognizerResult(entity_type="URL", start=49, end=69, score=0.95)
    analyzer_results = [person_span, url_span]

    pseudonymizer = PresidioPseudonymization()
    pseudonymizer.pseudonymize(
        original_text=text,
        presidio_response=analyzer_results,
        count=5,
    )