Merge branch 'master' into omri/add_spacy_to_nb
This commit is contained in:
Commit 48f0b359c5
@@ -1,44 +0,0 @@
-# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
-# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
-
-name: Python package
-
-on:
-  push:
-    branches: [ "master", "feature/*" ]
-  pull_request:
-    branches:
-      - main
-      - 'feature/**'
-
-jobs:
-  build:
-
-    runs-on: ubuntu-latest
-    strategy:
-      fail-fast: false
-      matrix:
-        python-version: ["3.7", "3.8", "3.9", "3.10"]
-
-    steps:
-    - uses: actions/checkout@v3
-    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v3
-      with:
-        python-version: ${{ matrix.python-version }}
-    - name: Install dependencies
-      run: |
-        python -m pip install --upgrade pip
-        python -m pip install flake8 pytest
-        pip install -r requirements.txt
-        python -m spacy download en_core_web_sm
-        python -m spacy download en_core_web_lg
-    - name: Lint with flake8
-      run: |
-        # stop the build if there are Python syntax errors or undefined names
-        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
-        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
-        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
-    - name: Test with pytest
-      run: |
-        pytest
@@ -189,4 +189,5 @@ datasets/
 /data
 
 *.spacy
-*.pickle
+*.pickle
+/poetry.lock
@@ -1,45 +0,0 @@
-# Python package
-# Create and test a Python package on multiple Python versions.
-# Add steps that analyze code, save the dist with the build record, publish to a PyPI-compatible index, and more:
-# https://docs.microsoft.com/azure/devops/pipelines/languages/python
-
-trigger:
-- master
-
-pr:
-  branches:
-    include:
-    - master
-    - feature/*
-
-pool:
-  vmImage: 'ubuntu-latest'
-strategy:
-  matrix:
-    Python37:
-      python.version: '3.7'
-    Python38:
-      python.version: '3.8'
-    Python39:
-      python.version: '3.9'
-    Python310:
-      python.version: '3.10'
-steps:
-- task: UsePythonVersion@0
-  inputs:
-    versionSpec: '$(python.version)'
-  displayName: 'Use Python $(python.version)'
-
-- script: |
-    python -m pip install --upgrade pip
-    pip install wheel
-    pip install -r requirements.txt
-    python -m spacy download en_core_web_lg
-    python -m spacy download en_core_web_sm
-
-  displayName: 'Install base dependencies'
-
-- script: |
-    pip install pytest pytest-azurepipelines
-    pytest
-  displayName: 'pytest'
README.md

@@ -13,6 +13,8 @@ In addition, it contains a fake data generator which creates fake sentences base
 
 ## Getting started
 
+>Note: Presidio evaluator requires Python>=3.9
+
 ### From PyPI
 
 ``` sh
@@ -36,8 +38,10 @@ conda create --name presidio python=3.9
 conda activate presidio
 
 # Install package+dependencies
-pip install -r requirements.txt
-python setup.py install
+pip install poetry
+poetry install
+# To install with all additional NER dependencies (e.g. Flair, Stanza, CRF), run:
+# poetry install --with ner
 
 # Download a spaCy model used by presidio-analyzer
 python -m spacy download en_core_web_lg
@@ -70,7 +74,7 @@ Furthermore, it tokenizes the data, creates tags (either IO/BIO/BILUO) and spans
 
 Once data is generated, it could be split into train/test/validation sets
 while ensuring that each template only exists in one set.
-See [this notebook for more details](notebooks/3_Split_by_pattern_%23.ipynb).
+See [this notebook for more details](notebooks/3_Split_by_pattern_number.ipynb).
 
 ## 2. Data representation
 
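For orientation, a minimal sketch of the template-based generation flow the README describes, using the class and method names that appear in the notebooks changed by this commit (PresidioDataGenerator, read_template_file, generate_fake_data). The constructor call and parameter names are assumptions, not a verified API reference:

```python
from presidio_evaluator.data_generator import PresidioDataGenerator  # module path assumed

# Templates use {{entity}} placeholders that get filled with fake values
sentence_templates = ["My name is {{name}}"]

data_generator = PresidioDataGenerator()  # default construction assumed
fake_records = data_generator.generate_fake_data(
    templates=sentence_templates,  # parameter names assumed
    n_samples=10,
)
for record in fake_records:
    print(record)  # each record carries the fake text plus its entity spans
```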
VERSION

@@ -1,2 +1 @@
-0.1.1
-
+0.1.3
@@ -0,0 +1,35 @@
+pr:
+  branches:
+    include:
+    - master
+    - feature/*
+
+pool:
+  vmImage: 'ubuntu-latest'
+strategy:
+  matrix:
+    Python38:
+      python.version: '3.8'
+    Python39:
+      python.version: '3.9'
+    Python310:
+      python.version: '3.10'
+    Python311:
+      python.version: '3.11'
+steps:
+- task: UsePythonVersion@0
+  inputs:
+    versionSpec: '$(python.version)'
+  displayName: 'Use Python $(python.version)'
+
+- script: |
+    python -m pip install --upgrade pip
+    pip install poetry
+    poetry install --with dev,ner
+
+  displayName: 'Install dependencies'
+
+- script: |
+    poetry add pytest-azurepipelines
+    poetry run pytest --runslow
+  displayName: 'pytest'
@@ -2,8 +2,23 @@
 "cells": [
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 1,
 "metadata": {
 "is_executing": true
 },
 "outputs": [],
+"source": [
+"# install presidio via pip if not yet installed\n",
+"\n",
+"#!pip install presidio-analyzer\n",
+"#!pip install presidio-evaluator"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 2,
+"metadata": {
+"is_executing": true,
+"scrolled": true
+},
+"outputs": [],
@@ -69,9 +84,34 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"outputs": [],
+"execution_count": 3,
+"metadata": {
+"is_executing": true
+},
+"outputs": [
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"Sampling: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 9149.88it/s]"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"My name is Joshua Jackson\n",
+"[{\"value\": \"Joshua Jackson\", \"start\": 11, \"end\": 25, \"type\": \"name\"}]\n"
+]
+},
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"\n"
+]
+}
+],
 "source": [
 "sentence_templates = [\n",
 "    \"My name is {{name}}\",\n",
@@ -114,8 +154,9 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 4,
 "metadata": {
+"is_executing": true,
 "scrolled": true
 },
 "outputs": [],
@@ -153,13 +194,228 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 5,
 "metadata": {
+"is_executing": true,
 "pycharm": {
 "name": "#%%\n"
 }
 },
-"outputs": [],
+"outputs": [
+{
+"data": {
+"text/html": [
+"<div>\n",
+"<style scoped>\n",
+" .dataframe tbody tr th:only-of-type {\n",
+" vertical-align: middle;\n",
+" }\n",
+"\n",
+" .dataframe tbody tr th {\n",
+" vertical-align: top;\n",
+" }\n",
+"\n",
+" .dataframe thead th {\n",
+" text-align: right;\n",
+" }\n",
+"</style>\n",
+"<table border=\"1\" class=\"dataframe\">\n",
+" <thead>\n",
+" <tr style=\"text-align: right;\">\n",
+" <th></th>\n",
+" <th>number</th>\n",
+" <th>gender</th>\n",
+" <th>nationality</th>\n",
+" <th>prefix</th>\n",
+" <th>first_name</th>\n",
+" <th>middle_initial</th>\n",
+" <th>last_name</th>\n",
+" <th>street_name</th>\n",
+" <th>city</th>\n",
+" <th>state_abbr</th>\n",
+" <th>...</th>\n",
+" <th>company</th>\n",
+" <th>domain_name</th>\n",
+" <th>person</th>\n",
+" <th>name</th>\n",
+" <th>first_name_female</th>\n",
+" <th>first_name_male</th>\n",
+" <th>prefix_female</th>\n",
+" <th>prefix_male</th>\n",
+" <th>last_name_female</th>\n",
+" <th>last_name_male</th>\n",
+" </tr>\n",
+" </thead>\n",
+" <tbody>\n",
+" <tr>\n",
+" <th>0</th>\n",
+" <td>1</td>\n",
+" <td>female</td>\n",
+" <td>Czech</td>\n",
+" <td>Mrs.</td>\n",
+" <td>Marie</td>\n",
+" <td>J</td>\n",
+" <td>Hamanová</td>\n",
+" <td>P.O. Box 255</td>\n",
+" <td>Kangerlussuaq</td>\n",
+" <td>QE</td>\n",
+" <td>...</td>\n",
+" <td>Simple Solutions</td>\n",
+" <td>MarathonDancing.gl</td>\n",
+" <td>Marie J Hamanová</td>\n",
+" <td>Marie J Hamanová</td>\n",
+" <td>Marie</td>\n",
+" <td></td>\n",
+" <td>Mrs.</td>\n",
+" <td></td>\n",
+" <td>Hamanová</td>\n",
+" <td></td>\n",
+" </tr>\n",
+" <tr>\n",
+" <th>1</th>\n",
+" <td>2</td>\n",
+" <td>female</td>\n",
+" <td>French</td>\n",
+" <td>Ms.</td>\n",
+" <td>Patricia</td>\n",
+" <td>G</td>\n",
+" <td>Desrosiers</td>\n",
+" <td>Avenida Noruega 42</td>\n",
+" <td>Vila Real</td>\n",
+" <td>VR</td>\n",
+" <td>...</td>\n",
+" <td>Formula Gray</td>\n",
+" <td>LostMillions.com.pt</td>\n",
+" <td>Patricia Desrosiers</td>\n",
+" <td>Patricia Desrosiers</td>\n",
+" <td>Patricia</td>\n",
+" <td></td>\n",
+" <td>Ms.</td>\n",
+" <td></td>\n",
+" <td>Desrosiers</td>\n",
+" <td></td>\n",
+" </tr>\n",
+" <tr>\n",
+" <th>2</th>\n",
+" <td>3</td>\n",
+" <td>female</td>\n",
+" <td>American</td>\n",
+" <td>Ms.</td>\n",
+" <td>Debra</td>\n",
+" <td>O</td>\n",
+" <td>Neal</td>\n",
+" <td>1659 Hoog St</td>\n",
+" <td>Brakpan</td>\n",
+" <td>GA</td>\n",
+" <td>...</td>\n",
+" <td>Dahlkemper's</td>\n",
+" <td>MediumTube.co.za</td>\n",
+" <td>Debra O Neal</td>\n",
+" <td>Debra O Neal</td>\n",
+" <td>Debra</td>\n",
+" <td></td>\n",
+" <td>Ms.</td>\n",
+" <td></td>\n",
+" <td>Neal</td>\n",
+" <td></td>\n",
+" </tr>\n",
+" <tr>\n",
+" <th>3</th>\n",
+" <td>4</td>\n",
+" <td>male</td>\n",
+" <td>French</td>\n",
+" <td>Mr.</td>\n",
+" <td>Peverell</td>\n",
+" <td>C</td>\n",
+" <td>Racine</td>\n",
+" <td>183 Epimenidou Street</td>\n",
+" <td>Limassol</td>\n",
+" <td>LI</td>\n",
+" <td>...</td>\n",
+" <td>Quickbiz</td>\n",
+" <td>ImproveLook.com.cy</td>\n",
+" <td>Peverell Racine</td>\n",
+" <td>Peverell Racine</td>\n",
+" <td></td>\n",
+" <td>Peverell</td>\n",
+" <td></td>\n",
+" <td>Mr.</td>\n",
+" <td></td>\n",
+" <td>Racine</td>\n",
+" </tr>\n",
+" <tr>\n",
+" <th>4</th>\n",
+" <td>5</td>\n",
+" <td>female</td>\n",
+" <td>Slovenian</td>\n",
+" <td>Mrs.</td>\n",
+" <td>Iolanda</td>\n",
+" <td>S</td>\n",
+" <td>Tratnik</td>\n",
+" <td>Karu põik 61</td>\n",
+" <td>Pärnu</td>\n",
+" <td>PR</td>\n",
+" <td>...</td>\n",
+" <td>Dubrow's Cafeteria</td>\n",
+" <td>PostTan.com.ee</td>\n",
+" <td>Iolanda Tratnik</td>\n",
+" <td>Iolanda Tratnik</td>\n",
+" <td>Iolanda</td>\n",
+" <td></td>\n",
+" <td>Mrs.</td>\n",
+" <td></td>\n",
+" <td>Tratnik</td>\n",
+" <td></td>\n",
+" </tr>\n",
+" </tbody>\n",
+"</table>\n",
+"<p>5 rows × 37 columns</p>\n",
+"</div>"
+],
+"text/plain": [
+" number gender nationality prefix first_name middle_initial last_name \\\n",
+"0 1 female Czech Mrs. Marie J Hamanová \n",
+"1 2 female French Ms. Patricia G Desrosiers \n",
+"2 3 female American Ms. Debra O Neal \n",
+"3 4 male French Mr. Peverell C Racine \n",
+"4 5 female Slovenian Mrs. Iolanda S Tratnik \n",
+"\n",
+" street_name city state_abbr ... company \\\n",
+"0 P.O. Box 255 Kangerlussuaq QE ... Simple Solutions \n",
+"1 Avenida Noruega 42 Vila Real VR ... Formula Gray \n",
+"2 1659 Hoog St Brakpan GA ... Dahlkemper's \n",
+"3 183 Epimenidou Street Limassol LI ... Quickbiz \n",
+"4 Karu põik 61 Pärnu PR ... Dubrow's Cafeteria \n",
+"\n",
+" domain_name person name \\\n",
+"0 MarathonDancing.gl Marie J Hamanová Marie J Hamanová \n",
+"1 LostMillions.com.pt Patricia Desrosiers Patricia Desrosiers \n",
+"2 MediumTube.co.za Debra O Neal Debra O Neal \n",
+"3 ImproveLook.com.cy Peverell Racine Peverell Racine \n",
+"4 PostTan.com.ee Iolanda Tratnik Iolanda Tratnik \n",
+"\n",
+" first_name_female first_name_male prefix_female prefix_male \\\n",
+"0 Marie Mrs. \n",
+"1 Patricia Ms. \n",
+"2 Debra Ms. \n",
+"3 Peverell Mr. \n",
+"4 Iolanda Mrs. \n",
+"\n",
+" last_name_female last_name_male \n",
+"0 Hamanová \n",
+"1 Desrosiers \n",
+"2 Neal \n",
+"3 Racine \n",
+"4 Tratnik \n",
+"\n",
+"[5 rows x 37 columns]"
+]
+},
+"execution_count": 5,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
 "source": [
 "# Read FakeNameGenerator CSV\n",
 "fake_name_generator_df = pd.read_csv(fake_name_generator_file)\n",
@@ -178,8 +434,9 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 6,
 "metadata": {
+"is_executing": true,
 "scrolled": true
 },
 "outputs": [],
@@ -197,8 +454,10 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
-"metadata": {},
+"execution_count": 7,
+"metadata": {
+"is_executing": true
+},
 "outputs": [],
 "source": [
 "fake.add_provider(IpAddressProvider)  # Both Ipv4 and IPv6 IP addresses\n",
@@ -223,8 +482,9 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 8,
 "metadata": {
+"is_executing": true,
 "pycharm": {
 "name": "#%%\n"
 }
@@ -258,13 +518,36 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 9,
 "metadata": {
 "is_executing": true,
 "pycharm": {
 "name": "#%%\n"
 }
 },
-"outputs": [],
+"outputs": [
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"Sampling: 100%|███████████████████████████████████████████████████████████████████████████████████████████| 1500/1500 [00:00<00:00, 17987.56it/s]"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"{\"fake\": \"Title VII of the Civil Rights Act of 2005 protects individuals against employment discrimination on the basis of race and color as well as national origin, sex, or religion.\\n\", \"spans\": [{\"value\": \"2005\", \"start\": 37, \"end\": 41, \"type\": \"year\"}], \"template\": \"Title VII of the Civil Rights Act of {{year}} protects individuals against employment discrimination on the basis of race and color as well as national origin, sex, or religion.\\n\", \"template_id\": 190}\n"
+]
+},
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"\n"
+]
+}
+],
 "source": [
 "sentence_templates = PresidioDataGenerator.read_template_file(templates_file_path)\n",
 "fake_records = data_generator.generate_fake_data(\n",
@@ -284,11 +567,23 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 10,
 "metadata": {
 "is_executing": true,
 "scrolled": true
 },
-"outputs": [],
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"Total: 1500\n",
+"Avg # of records per template: 7.142857142857143\n",
+"Median # of records per template: 7.0\n",
+"Std: 2.5872528966106905\n"
+]
+}
+],
 "source": [
 "count_per_template_id = Counter([sample.template_id for sample in fake_records])\n",
 "\n",
@@ -311,13 +606,65 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 11,
 "metadata": {
 "is_executing": true,
 "pycharm": {
 "name": "#%%\n"
 }
 },
-"outputs": [],
+"outputs": [
+{
+"data": {
+"text/plain": [
+"Counter({'organization': 257,\n",
+" 'first_name': 244,\n",
+" 'person': 238,\n",
+" 'city': 235,\n",
+" 'address': 209,\n",
+" 'street_name': 164,\n",
+" 'name': 162,\n",
+" 'country': 154,\n",
+" 'credit_card_number': 152,\n",
+" 'phone_number': 121,\n",
+" 'last_name': 119,\n",
+" 'building_number': 110,\n",
+" 'age': 72,\n",
+" 'secondary_address': 64,\n",
+" 'year': 58,\n",
+" 'nationality': 55,\n",
+" 'postcode': 49,\n",
+" 'zipcode': 45,\n",
+" 'url': 39,\n",
+" 'email': 39,\n",
+" 'name_female': 37,\n",
+" 'job': 33,\n",
+" 'first_name_male': 31,\n",
+" 'name_male': 29,\n",
+" 'prefix_male': 28,\n",
+" 'date_of_birth': 24,\n",
+" 'iban': 22,\n",
+" 'date_time': 21,\n",
+" 'prefix_female': 21,\n",
+" 'day_of_week': 16,\n",
+" 'state_abbr': 15,\n",
+" 'last_name_male': 15,\n",
+" 'prefix': 12,\n",
+" 'ip_address': 11,\n",
+" 'ssn': 11,\n",
+" 'nation_plural': 9,\n",
+" 'nation_woman': 8,\n",
+" 'first_name_nonbinary': 6,\n",
+" 'us_driver_license': 6,\n",
+" 'first_name_female': 3,\n",
+" 'last_name_female': 3})"
+]
+},
+"execution_count": 11,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
 "source": [
 "count_per_entity = Counter()\n",
 "for record in fake_records:\n",
@@ -339,8 +686,9 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 12,
 "metadata": {
+"is_executing": true,
 "pycharm": {
 "name": "#%%\n"
 }
@@ -409,9 +757,22 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"outputs": [],
+"execution_count": 13,
+"metadata": {
+"is_executing": true
+},
+"outputs": [
+{
+"data": {
+"text/plain": [
+"{\"fake\": \"Title VII of the Civil Rights Act of 2005 protects individuals against employment discrimination on the basis of race and color as well as national origin, sex, or religion.\\n\", \"spans\": [{\"value\": \"2005\", \"start\": 37, \"end\": 41, \"type\": \"DATE_TIME\"}], \"template\": \"Title VII of the Civil Rights Act of {{DATE_TIME}} protects individuals against employment discrimination on the basis of race and color as well as national origin, sex, or religion.\\n\", \"template_id\": 190}"
+]
+},
+"execution_count": 13,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
 "source": [
 "fake_records[0]"
 ]
@@ -425,13 +786,41 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 14,
 "metadata": {
 "is_executing": true,
 "pycharm": {
 "name": "#%%\n"
 }
 },
-"outputs": [],
+"outputs": [
+{
+"data": {
+"text/plain": [
+"[('PERSON', 887),\n",
+" ('STREET_ADDRESS', 596),\n",
+" ('GPE', 404),\n",
+" ('ORGANIZATION', 257),\n",
+" ('CREDIT_CARD', 152),\n",
+" ('PHONE_NUMBER', 121),\n",
+" ('DATE_TIME', 119),\n",
+" ('TITLE', 94),\n",
+" ('NRP', 72),\n",
+" ('AGE', 72),\n",
+" ('ZIP_CODE', 45),\n",
+" ('DOMAIN_NAME', 39),\n",
+" ('EMAIL_ADDRESS', 39),\n",
+" ('IBAN_CODE', 22),\n",
+" ('IP_ADDRESS', 11),\n",
+" ('US_SSN', 11),\n",
+" ('US_DRIVER_LICENSE', 6)]"
+]
+},
+"execution_count": 14,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
 "source": [
 "\n",
 "count_per_entity_new = Counter()\n",
@@ -467,13 +856,51 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 15,
 "metadata": {
 "is_executing": true,
 "pycharm": {
 "name": "#%%\n"
 }
 },
-"outputs": [],
+"outputs": [
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+" 0%| | 0/1500 [00:00<?, ?it/s]"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"loading model en_core_web_sm\n"
+]
+},
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 1500/1500 [00:06<00:00, 215.70it/s]"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"CPU times: user 6.76 s, sys: 33.8 ms, total: 6.8 s\n",
+"Wall time: 6.96 s\n"
+]
+},
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"\n"
+]
+}
+],
 "source": [
 "%%time\n",
 "input_samples = [\n",
@@ -495,8 +922,9 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 16,
 "metadata": {
+"is_executing": true,
 "pycharm": {
 "name": "#%%\n"
 }
@@ -519,21 +947,31 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 17,
 "metadata": {
 "is_executing": true,
 "pycharm": {
 "name": "#%%\n"
 }
 },
-"outputs": [],
+"outputs": [
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 1500/1500 [00:00<00:00, 76888.23it/s]\n"
+]
+}
+],
 "source": [
 "conll = InputSample.create_conll_dataset(input_samples)"
 ]
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 18,
 "metadata": {
 "is_executing": true,
 "pycharm": {
 "name": "#%%\n"
 }
@@ -550,7 +988,7 @@
 "### Next steps\n",
 "\n",
 "- Evaluate Presidio using this fake data. [Sample](4_Evaluate_Presidio_Analyzer.ipynb)\n",
-"- Split to train/test/validation while ensuring sentences originiating from the same template are all on the same subset. [Sample](3_Split_by_pattern_#.ipynb)\n",
+"- Split to train/test/validation while ensuring sentences originiating from the same template are all on the same subset. [Sample](3_Split_by_pattern_number.ipynb)\n",
 "- Conduct a small exploratory data analysis on the generated data. [Sample](2_PII_EDA.ipynb)"
 ]
 },
@@ -72,7 +72,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"for (name, series) in pii_df.iteritems():\n",
+"for (name, series) in pii_df.items():\n",
 "    print(name)\n",
 "    print(\"Unique values: {}\".format(len(series.unique())))\n",
 "    print(series.value_counts())\n",
@@ -123,7 +123,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"series_to_wordcloud(pii_df.country_full)"
+"series_to_wordcloud(pii_df.country)"
 ]
 },
 {
@@ -187,9 +187,9 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"countries = [get_entity_values_from_sample(sample, [\"LOCATION\"]) for sample in synth]\n",
+"countries = [get_entity_values_from_sample(sample, [\"TITLE\"]) for sample in synth]\n",
 "countries = [item for sublist in countries for item in sublist]\n",
-"series_to_wordcloud(pd.Series(countries, name=\"LOCATION\"))"
+"series_to_wordcloud(pd.Series(countries, name=\"TITLE\"))"
 ]
 },
 {
@@ -213,9 +213,9 @@
 ],
 "metadata": {
 "kernelspec": {
-"display_name": "presidio",
+"display_name": "presidio-evaluator",
 "language": "python",
-"name": "presidio"
+"name": "presidio-evaluator"
 },
 "language_info": {
 "codemirror_mode": {
@@ -227,9 +227,9 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.8.12"
+"version": "3.9.18"
 }
 },
 "nbformat": 4,
-"nbformat_minor": 2
-}
+"nbformat_minor": 4
+}
@@ -10,6 +10,18 @@
 "This is different from the normal split since we don't want sentences generated from the same pattern to be in more than one set. (Applicable only if the dataset was generated from templates)"
 ]
 },
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"# install presidio via pip if not yet installed\n",
+"\n",
+"#!pip install presidio-analyzer\n",
+"#!pip install presidio-evaluator"
+]
+},
 {
 "cell_type": "code",
 "execution_count": null,
@@ -131,13 +143,6 @@
 "assert len(train) + len(test) + len(validation) == len(all_samples)"
 ]
 },
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"outputs": [],
-"source": []
-},
 {
 "cell_type": "code",
 "execution_count": null,
@@ -148,9 +153,9 @@
 ],
 "metadata": {
 "kernelspec": {
-"display_name": "presidio",
+"display_name": "presidio-evaluator",
 "language": "python",
-"name": "presidio"
+"name": "presidio-evaluator"
 },
 "language_info": {
 "codemirror_mode": {
@@ -162,9 +167,9 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.8.12"
+"version": "3.9.18"
 }
 },
 "nbformat": 4,
-"nbformat_minor": 2
-}
+"nbformat_minor": 4
+}
@@ -5,7 +5,21 @@
 "id": "847acd88",
 "metadata": {},
 "source": [
-"Evaluate Presidio Analyzer using the Presidio Evaluator framework"
+"# Evaluate Presidio Analyzer using the Presidio Evaluator framework"
 ]
 },
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "b946feda",
+"metadata": {},
+"outputs": [],
+"source": [
+"# install presidio via pip if not yet installed\n",
+"\n",
+"#!pip install presidio-evaluator\n",
+"#!pip install \"presidio-analyzer[transformers]\"\n",
+"#!pip install presidio-evaluator"
+]
+},
 {
@@ -19,6 +33,10 @@
 "from copy import deepcopy\n",
 "from pprint import pprint\n",
 "from collections import Counter\n",
+"from typing import List\n",
 "\n",
+"import warnings\n",
+"warnings.filterwarnings('ignore')\n",
+"\n",
 "from presidio_evaluator import InputSample\n",
 "from presidio_evaluator.evaluation import Evaluator, ModelError\n",
@@ -32,7 +50,8 @@
 "pd.set_option(\"display.max_colwidth\", None)\n",
 "\n",
 "%reload_ext autoreload\n",
-"%autoreload 2"
+"%autoreload 2\n",
+"%matplotlib inline"
 ]
 },
 {
@@ -52,6 +71,9 @@
 "source": [
 "dataset_name = \"synth_dataset_v2.json\"\n",
 "dataset = InputSample.read_dataset_json(Path(Path.cwd().parent, \"data\", dataset_name))\n",
+"\n",
+"dataset = dataset[:300] # top 300 samples\n",
+"\n",
 "print(len(dataset))"
 ]
 },
@@ -62,10 +84,12 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"entity_counter = Counter()\n",
-"for sample in dataset:\n",
-"    for tag in sample.tags:\n",
-"        entity_counter[tag] += 1"
+"def get_entity_counts(dataset:List[InputSample]):\n",
+"    entity_counter = Counter()\n",
+"    for sample in dataset:\n",
+"        for tag in sample.tags:\n",
+"            entity_counter[tag] += 1\n",
+"    return entity_counter\n"
 ]
 },
 {
@@ -76,7 +100,7 @@
 "outputs": [],
 "source": [
 "print(\"Count per entity:\")\n",
-"pprint(entity_counter.most_common())\n",
+"pprint(get_entity_counts(dataset).most_common())\n",
 "\n",
 "print(\"\\nExample sentence:\")\n",
 "print(dataset[1])\n",
@@ -94,12 +118,121 @@
 ")"
 ]
 },
+{
+"cell_type": "markdown",
+"id": "9c5e16cb-bee8-4f0a-a543-4879daa35b9e",
+"metadata": {},
+"source": [
+"### Define the AnalyzerEngine object \n",
+"In this case, using a huggingface model: obi/deid_roberta_i2b2"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "313b508f-e901-40b9-b575-c7fb8a794652",
+"metadata": {},
+"outputs": [],
+"source": [
+"from presidio_analyzer import AnalyzerEngine\n",
+"from presidio_analyzer.nlp_engine import TransformersNlpEngine, NerModelConfiguration\n",
+"\n",
+"\n",
+"# Here we define a transformers based NLP engine, \n",
+"# but you can use this cell to customize your Presidio Analyzer instance\n",
+"\n",
+"# Define which model to use\n",
+"model_config = [{\"lang_code\": \"en\", \"model_name\": {\n",
+"    \"spacy\": \"en_core_web_sm\",  # use a small spaCy model for lemmas, tokens etc.\n",
+"    \"transformers\": \"obi/deid_roberta_i2b2\"\n",
+"    }\n",
+"}]\n",
+"\n",
+"# Map transformers model labels to Presidio's\n",
+"model_to_presidio_entity_mapping = dict(\n",
+"    PER=\"PERSON\",\n",
+"    PERSON=\"PERSON\",\n",
+"    LOC= \"LOCATION\",\n",
+"    LOCATION= \"LOCATION\",\n",
+"    GPE=\"LOCATION\",\n",
+"    ORG=\"ORGANIZATION\",\n",
+"    ORGANIZATION=\"ORGANIZATION\",\n",
+"    NORP=\"NRP\",\n",
+"    AGE=\"AGE\",\n",
+"    ID=\"ID\",\n",
+"    EMAIL=\"EMAIL\",\n",
+"    PATIENT=\"PERSON\",\n",
+"    STAFF=\"PERSON\",\n",
+"    HOSP=\"ORGANIZATION\",\n",
+"    PATORG=\"ORGANIZATION\",\n",
+"    DATE=\"DATE_TIME\",\n",
+"    TIME=\"DATE_TIME\",\n",
+"    PHONE=\"PHONE_NUMBER\",\n",
+"    HCW=\"PERSON\",\n",
+"    HOSPITAL=\"ORGANIZATION\",\n",
+"    FACILITY=\"LOCATION\",\n",
+")\n",
+"\n",
+"ner_model_configuration = NerModelConfiguration(labels_to_ignore = [\"O\"], \n",
+"    model_to_presidio_entity_mapping=model_to_presidio_entity_mapping)\n",
+"\n",
+"nlp_engine = TransformersNlpEngine(models=model_config,\n",
+"    ner_model_configuration=ner_model_configuration)\n",
+"\n",
+"# Set up the engine, loads the NLP module (spaCy model by default) \n",
+"# and other PII recognizers\n",
+"analyzer_engine = AnalyzerEngine(nlp_engine=nlp_engine)"
+]
+},
 {
 "cell_type": "markdown",
 "id": "aae4c379",
 "metadata": {},
 "source": [
-"Run evaluation:"
+"### Run evaluation"
 ]
 },
+{
+"cell_type": "markdown",
+"id": "16dbf6d6-a554-4602-8907-589786d47a12",
+"metadata": {},
+"source": [
+"#### Define experiment"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "29d39ff1-4f14-4e32-ae84-ecc6c739f829",
+"metadata": {},
+"outputs": [],
+"source": [
+"experiment = get_experiment_tracker()\n",
+"model = PresidioAnalyzerWrapper(analyzer_engine)\n",
+"\n",
+"# Define evaluator and experiment tracking\n",
+"\n",
+"evaluator = Evaluator(model=model)\n",
+"dataset = Evaluator.align_entity_types(\n",
+"    deepcopy(dataset), entities_mapping=PresidioAnalyzerWrapper.presidio_entities_map\n",
+")\n",
+"\n",
+"print(\"Count per entity after alignment:\")\n",
+"pprint(get_entity_counts(dataset).most_common())\n",
+"\n",
+"# Track model and dataset params\n",
+"params = {\"dataset_name\": dataset_name, \"model_name\": model.name}\n",
+"params.update(model.to_log())\n",
+"experiment.log_parameters(params)\n",
+"experiment.log_dataset_hash(dataset)"
+]
+},
+{
+"cell_type": "markdown",
+"id": "2a7d6626-d094-4dfd-8f37-c0443edf00dc",
+"metadata": {},
+"source": [
+"#### Run experiment"
+]
+},
 {
@@ -109,39 +242,37 @@
 "metadata": {},
 "outputs": [],
 "source": [
 "print(\"Evaluating Presidio Analyzer\")\n",
 "\n",
-"experiment = get_experiment_tracker()\n",
-"model_name = \"Presidio Analyzer\"\n",
-"model = PresidioAnalyzerWrapper()\n",
-"\n",
-"evaluator = Evaluator(model=model)\n",
-"dataset = Evaluator.align_entity_types(\n",
-"    deepcopy(dataset), entities_mapping=PresidioAnalyzerWrapper.presidio_entities_map\n",
-")\n",
-"\n",
 "# Run experiment\n",
 "evaluation_results = evaluator.evaluate_all(dataset)\n",
 "results = evaluator.calculate_score(evaluation_results)\n",
 "\n",
-"# update params tracking\n",
-"params = {\"dataset_name\": dataset_name, \"model_name\": model_name}\n",
-"params.update(model.to_log())\n",
-"experiment.log_parameters(params)\n",
-"experiment.log_dataset_hash(dataset)\n",
 "# Track experiment results\n",
 "experiment.log_metrics(results.to_log())\n",
 "entities, confmatrix = results.to_confusion_matrix()\n",
-"experiment.log_confusion_matrix(matrix=confmatrix, labels=entities)\n",
+"experiment.log_confusion_matrix(matrix=confmatrix, \n",
+"    labels=entities)\n",
 "\n",
 "print(\"Confusion matrix:\")\n",
 "print(pd.DataFrame(confmatrix, columns=entities, index=entities))\n",
 "\n",
 "print(\"Precision and recall\")\n",
 "print(results)\n",
+"# Plot output\n",
+"plotter = evaluator.Plotter(model=model, \n",
+"    results=results, \n",
+"    output_folder = \".\", \n",
+"    model_name = model.name, \n",
+"    beta = 2)\n",
 "\n",
 "# end experiment\n",
 "experiment.end()"
 ]
 },
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "5b4d662d-596c-4a69-b3c9-1edcda20cc5b",
+"metadata": {},
+"outputs": [],
+"source": [
+"plotter.plot_scores()"
+]
+},
 {
 "cell_type": "markdown",
 "id": "070f8287",
@@ -185,7 +316,7 @@
 "id": "98f4802e",
 "metadata": {},
 "source": [
-"1. Most false positive tokens:"
+"1. Most common false positive tokens:"
 ]
 },
 {
@@ -206,7 +337,7 @@
 "outputs": [],
 "source": [
 "fps_df = ModelError.get_fps_dataframe(errors, entity=[\"LOCATION\"])\n",
-"fps_df[[\"full_text\", \"token\", \"prediction\"]]"
+"fps_df[[\"full_text\", \"token\", \"annotation\", \"prediction\"]]"
 ]
 },
 {
@@ -214,7 +345,7 @@
 "id": "d0852513",
 "metadata": {},
 "source": [
-"2. False negative examples"
+"2. Most common false negative examples"
 ]
 },
 {
@@ -224,7 +355,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"ModelError.most_common_fn_tokens(errors, n=50, entity=[\"PERSON\"])"
+"ModelError.most_common_fn_tokens(errors, n=50)"
 ]
 },
 {
@@ -242,7 +373,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"fns_df = ModelError.get_fns_dataframe(errors, entity=[\"PHONE_NUMBER\"])"
+"fns_df = ModelError.get_fns_dataframe(errors, entity=[\"IP_ADDRESS\"])"
 ]
 },
 {
@@ -265,13 +396,21 @@
 "print(\"All errors:\\n\")\n",
 "[print(error, \"\\n\") for error in errors]"
 ]
 },
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "a67ff38d-0817-4864-9991-b3eb1f80eecc",
+"metadata": {},
+"outputs": [],
+"source": []
+}
 ],
 "metadata": {
 "kernelspec": {
-"display_name": "presidio",
+"display_name": "presidio-evaluator",
 "language": "python",
-"name": "presidio"
+"name": "presidio-evaluator"
 },
 "language_info": {
 "codemirror_mode": {
@@ -283,9 +422,9 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.8.12"
+"version": "3.9.18"
 }
 },
 "nbformat": 4,
 "nbformat_minor": 5
-}
+}
@@ -24,7 +24,12 @@
 "# install presidio via pip if not yet installed\n",
 "\n",
 "#!pip install presidio-analyzer\n",
-"#!pip install presidio-anonymizer"
+"#!pip install presidio-anonymizer\n",
+"#!pip install presidio-evaluator\n",
+"\n",
+"# install trained model for pipeline\n",
+"\n",
+"#!python -m spacy download en_core_web_sm"
 ]
 },
 {
@@ -83,7 +88,7 @@
 {
 "data": {
 "text/plain": [
-"[type: DOMAIN_NAME, start: 57, end: 69, score: 1.0,\n",
+"[type: URL, start: 49, end: 69, score: 0.95,\n",
 " type: PERSON, start: 14, end: 24, score: 0.85]"
 ]
 },
@@ -111,11 +116,11 @@
 {
 "data": {
 "text/plain": [
-"['Hi my name is Albert Cohen and this is my website: https://http://chapman-downs.info/',\n",
-" 'Hi my name is Lisa Miller and this is my website: https://http://benson.org/',\n",
-" 'Hi my name is Kathleen Hale and this is my website: https://http://www.garcia.com/',\n",
-" 'Hi my name is Michelle Frederick and this is my website: https://https://robinson.com/',\n",
-" 'Hi my name is Alicia Santana and this is my website: https://https://www.ray.org/']"
+"['Hi my name is Tammy Ryan and this is my website: https://www.cardenas.info/',\n",
+" 'Hi my name is Jessica Smith and this is my website: http://jones-hunt.info/',\n",
+" 'Hi my name is Michele Marsh and this is my website: https://guerrero.com/',\n",
+" 'Hi my name is Kathleen Miller and this is my website: https://lopez.com/',\n",
+" 'Hi my name is Paul Brown and this is my website: http://www.banks-evans.info/']"
 ]
 },
 "execution_count": 6,
@@ -148,11 +153,11 @@
 "-------------\n",
 "Fake examples:\n",
 "\n",
-"Our son R2D2 used to work in Botswana\n",
-"Our son R2D2 used to work in American Samoa\n",
-"Our son R2D2 used to work in Malawi\n",
-"Our son R2D2 used to work in Montenegro\n",
-"our son r2d2 used to work in lebanon\n"
+"Our son R2D2 used to work in Nigeria\n",
+"Our son R2D2 used to work in Guam\n",
+"Our son R2D2 used to work in Reunion\n",
+"Our son R2D2 used to work in Vanuatu\n",
+"Our son R2D2 used to work in Malaysia\n"
 ]
 }
 ],
@@ -171,13 +176,20 @@
 "print(f\"-------------\\nFake examples:\\n\")\n",
 "print(*fake_samples, sep=\"\\n\")"
 ]
 },
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": []
+}
 ],
 "metadata": {
 "kernelspec": {
-"display_name": "presidio",
+"display_name": "presidio-evaluator",
 "language": "python",
-"name": "presidio"
+"name": "presidio-evaluator"
 },
 "language_info": {
 "codemirror_mode": {
@@ -189,9 +201,9 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.8.12"
+"version": "3.9.18"
 }
 },
 "nbformat": 4,
-"nbformat_minor": 1
-}
+"nbformat_minor": 4
+}
@@ -23,7 +23,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 1,
+"execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -34,7 +34,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 2,
+"execution_count": null,
 "metadata": {
 "pycharm": {
 "name": "#%%\n"
@@ -42,55 +42,18 @@
 },
 "outputs": [],
 "source": [
-"DATA_DATE = \"Dec-19-2021\""
+"DATA_DATE = \"Dec-27-2023\" # Change to the date when notebook 3 (split to train/test) was ran"
 ]
 },
 {
 "cell_type": "code",
-"execution_count": 3,
+"execution_count": null,
 "metadata": {
 "pycharm": {
 "name": "#%%\n"
 }
 },
-"outputs": [
-{
-"name": "stderr",
-"output_type": "stream",
-"text": [
-"\r",
-"tokenizing input: 0%| | 0/2122 [00:00<?, ?it/s]"
-]
-},
-{
-"name": "stdout",
-"output_type": "stream",
-"text": [
-"loading model en_core_web_sm\n"
-]
-},
-{
-"name": "stderr",
-"output_type": "stream",
-"text": [
-"tokenizing input: 100%|███████████████████████████████████████████████████████████| 2122/2122 [00:19<00:00, 109.66it/s]"
-]
-},
-{
-"name": "stdout",
-"output_type": "stream",
-"text": [
-"Read 2122 samples\n"
-]
-},
-{
-"name": "stderr",
-"output_type": "stream",
-"text": [
-"\n"
-]
-}
-],
+"outputs": [],
 "source": [
 "data_path = \"../../data/{}_{}.json\"\n",
 "\n",
@@ -111,17 +74,9 @@
 },
 {
 "cell_type": "code",
-"execution_count": 4,
+"execution_count": null,
 "metadata": {},
-"outputs": [
-{
-"name": "stdout",
-"output_type": "stream",
-"text": [
-"Kept 1940 samples after removal of non-tagged samples\n"
-]
-}
-],
+"outputs": [],
 "source": [
 "train_tagged = [sample for sample in train_samples if len(sample.spans) > 0]\n",
 "print(\"Kept {} samples after removal of non-tagged samples\".format(len(train_tagged)))"
@@ -140,45 +95,13 @@
 },
 {
 "cell_type": "code",
-"execution_count": 5,
+"execution_count": null,
 "metadata": {
 "pycharm": {
 "name": "#%%\n"
 }
 },
-"outputs": [
-{
-"name": "stdout",
-"output_type": "stream",
-"text": [
-"Entities found in training set:\n"
-]
-},
-{
-"data": {
-"text/plain": [
-"{'ADDRESS',\n",
-" 'CREDIT_CARD',\n",
-" 'DATE_TIME',\n",
-" 'DOMAIN_NAME',\n",
-" 'EMAIL_ADDRESS',\n",
-" 'IBAN_CODE',\n",
-" 'IP_ADDRESS',\n",
-" 'LOCATION',\n",
-" 'O',\n",
-" 'ORGANIZATION',\n",
-" 'PERSON',\n",
-" 'PHONE_NUMBER',\n",
-" 'PREFIX',\n",
-" 'TITLE',\n",
-" 'US_SSN'}"
-]
-},
-"execution_count": 5,
-"metadata": {},
-"output_type": "execute_result"
-}
-],
+"outputs": [],
 "source": [
 "print(\"Entities found in training set:\")\n",
 "entities = []\n",
@@ -206,16 +129,7 @@
 "name": "#%%\n"
 }
 },
-"outputs": [
-{
-"name": "stdout",
-"output_type": "stream",
-"text": [
-"Skipping illegal span None, text=ΜΟΝΗ ΑΓΙΩΝ ΑΝΑΡΓΥΡΩΝ\n",
-"Skipping illegal span None, text=U.N\n"
-]
-}
-],
+"outputs": [],
 "source": [
 "spacy_train = InputSample.create_spacy_dataset(\n",
 "    dataset=train_tagged, output_path=\"train.spacy\"\n",
@@ -281,9 +195,9 @@
 ],
 "metadata": {
 "kernelspec": {
-"display_name": "presidio",
+"display_name": "presidio-evaluator",
 "language": "python",
-"name": "presidio"
+"name": "presidio-evaluator"
 },
 "language_info": {
 "codemirror_mode": {
@@ -295,9 +209,9 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.8.12"
+"version": "3.9.18"
 }
 },
 "nbformat": 4,
-"nbformat_minor": 2
-}
+"nbformat_minor": 4
+}
@@ -39,6 +39,16 @@
 "%autoreload 2"
 ]
 },
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "aee00770-a972-4a19-b423-1724214cc88c",
+"metadata": {},
+"outputs": [],
+"source": [
+"#!pip install sklearn_crfsuite"
+]
+},
 {
 "cell_type": "markdown",
 "id": "a0d2d772",
@@ -58,8 +68,9 @@
 },
 "outputs": [],
 "source": [
-"DATA_DATE = \"Jan-15-2022\"\n",
-"dataset = InputSample.read_dataset_json(\"../../data/test_{}.json\".format(DATA_DATE))\n",
+"DATA_DATE = \"Dec-27-2023\" # Date when the split to train/test notebook was ran\n",
+"dataset_name = \"../../data/test_{}.json\".format(DATA_DATE)\n",
+"dataset = InputSample.read_dataset_json(dataset_name)\n",
 "print(len(dataset))"
 ]
 },
@@ -76,7 +87,7 @@
 "source": [
 "entity_counter = Counter()\n",
 "for sample in dataset:\n",
-"    for t>ag in sample.tags:\n",
+"    for tag in sample.tags:\n",
 "        entity_counter[tag] += 1"
 ]
 },
@@ -257,7 +268,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"fps_df = ModelError.get_fps_dataframe(errors, entity=[\"GPE\"])\n",
+"fps_df = ModelError.get_fps_dataframe(errors, entity=[\"PERSON\"])\n",
 "fps_df[[\"full_text\", \"token\", \"prediction\"]]"
 ]
 },
@@ -276,7 +287,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"ModelError.most_common_fn_tokens(errors, n=50, entity=[\"PERSON\"])"
+"ModelError.most_common_fn_tokens(errors, n=50, entity=[\"ORGANIZATION\"])"
 ]
 },
 {
@@ -325,13 +336,21 @@
 "metadata": {},
 "outputs": [],
 "source": []
 },
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "cf3e4646-ca93-44c5-a998-cd77f4bf2708",
+"metadata": {},
+"outputs": [],
+"source": []
+}
 ],
 "metadata": {
 "kernelspec": {
-"display_name": "presidio",
+"display_name": "presidio-evaluator",
 "language": "python",
-"name": "presidio"
+"name": "presidio-evaluator"
 },
 "language_info": {
 "codemirror_mode": {
@@ -343,9 +362,9 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.8.12"
+"version": "3.9.18"
 }
 },
 "nbformat": 4,
 "nbformat_minor": 5
-}
+}
@@ -205,7 +205,7 @@
 ],
 "metadata": {
 "kernelspec": {
-"display_name": "Python 3.9.13 ('presidio')",
+"display_name": "Python 3 (ipykernel)",
 "language": "python",
 "name": "python3"
 },
@@ -219,9 +219,8 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.9.13"
+"version": "3.9.18"
 },
-"orig_nbformat": 4,
 "vscode": {
 "interpreter": {
 "hash": "371968787ec79dd50357533864944a85029366968470cac36beb694745c2f7d6"
@@ -229,5 +228,5 @@
 }
 },
 "nbformat": 4,
-"nbformat_minor": 2
+"nbformat_minor": 4
 }
@@ -35,6 +35,16 @@
 "%autoreload 2"
 ]
 },
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "a0c3285c-06a2-4361-aec2-8375496f75b3",
+"metadata": {},
+"outputs": [],
+"source": [
+"#!pip install flair"
+]
+},
 {
 "cell_type": "markdown",
 "id": "f036de59",
@@ -111,15 +121,14 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"flair_ner = \"ner-english\"\n",
-"flair_ner_fast = \"ner-english-fast\"\n",
-"flair_ontonotes_fast = \"ner-english-ontonotes-fast\"\n",
-"flair_ontonotes_large = \"ner-english-ontonotes-large\"\n",
+"flair_ner = \"flair/ner-english\"\n",
+"flair_ner_fast = \"flair/ner-english-fast\"\n",
+"flair_ontonotes_fast = \"flair/ner-english-ontonotes-fast\"\n",
+"flair_ontonotes_large = \"flair/ner-english-ontonotes-large\"\n",
 "models = [\n",
 "    flair_ner,\n",
 "    flair_ner_fast,\n",
-"    flair_ontonotes_fast,\n",
+"    flair_ner_fast,\n",
 "    flair_ontonotes_large,\n",
 "]"
 ]
@@ -312,9 +321,9 @@
 ],
 "metadata": {
 "kernelspec": {
-"display_name": "presidio",
+"display_name": "presidio-evaluator",
 "language": "python",
-"name": "presidio"
+"name": "presidio-evaluator"
 },
 "language_info": {
 "codemirror_mode": {
@@ -326,7 +335,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.8.12"
+"version": "3.9.18"
 }
 },
 "nbformat": 4,
@@ -109,7 +109,10 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"models = [\"en_core_web_sm\", \"en_core_web_lg\", \"en_core_web_trf\"]"
+"models = [\"en_core_web_sm\", \"en_core_web_lg\", \"en_core_web_trf\"]\n",
+"\n",
+"# If needed, install models using `python -m spacy download X` where x is the model name, or use spacy.cli.download:\n",
+"#spacy.cli.download(\"en_core_web_trf\")"
 ]
 },
 {
@@ -334,9 +337,9 @@
 ],
 "metadata": {
 "kernelspec": {
-"display_name": "presidio",
+"display_name": "presidio-evaluator",
 "language": "python",
-"name": "presidio"
+"name": "presidio-evaluator"
 },
 "language_info": {
 "codemirror_mode": {
@@ -348,9 +351,9 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.8.12"
+"version": "3.9.18"
 }
 },
 "nbformat": 4,
-"nbformat_minor": 2
-}
+"nbformat_minor": 4
+}
@@ -77,7 +77,7 @@ class UsDriverLicenseProvider(BaseProvider):
         formats = yaml.safe_load(open(us_driver_license_file))
         self.formats = formats['en']['faker']['driving_license']['usa']
 
-    def driver_license(self) -> str:
+    def us_driver_license(self) -> str:
         # US driver's licenses patterns vary by state. Here we sample a random state and format
         us_state = random.choice(list(self.formats))
         us_state_format = random.choice(self.formats[us_state])
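For illustration, a hypothetical call site for the renamed provider method. UsDriverLicenseProvider comes from the hunk above; the import path and the registration pattern are assumptions based on standard Faker usage, not taken from this commit:

```python
from faker import Faker
from presidio_evaluator.data_generator.faker_extensions import UsDriverLicenseProvider  # path assumed

fake = Faker()
fake.add_provider(UsDriverLicenseProvider)  # register the custom provider with Faker
print(fake.us_driver_license())  # method renamed from driver_license() in this commit
```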
@@ -170,6 +170,8 @@ class PresidioDataGenerator:
 
         new_provider = BaseProvider(self.faker)
-        setattr(new_provider, new_name, original)
+        setattr(new_provider, new_name.lower(), original)  # avoid case sensitivity
+        setattr(new_provider, new_name.upper(), original)  # avoid case sensitivity
         self.faker.add_provider(new_provider)
 
     @staticmethod
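A sketch of why both casings are registered: template placeholders such as {{CREDIT_CARD}} arrive upper-cased while Faker's native providers are lower-case, so the alias is attached under both spellings. Class and method names come from this diff; the parameter names and the generate_fake_data call are assumptions:

```python
from presidio_evaluator.data_generator import PresidioDataGenerator  # module path assumed

generator = PresidioDataGenerator()
# Expose Faker's credit_card_number provider under the Presidio entity name
generator.add_provider_alias(provider_name="credit_card_number", new_name="CREDIT_CARD")

# Both {{CREDIT_CARD}} and {{credit_card}} now resolve to the same provider
records = generator.generate_fake_data(
    templates=["My card number is {{CREDIT_CARD}}"],  # parameter names assumed
    n_samples=1,
)
```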
@@ -25,7 +25,7 @@ class PresidioPseudonymization(PresidioDataGenerator):
         self.add_provider_alias("credit_card_number", "CREDIT_CARD")
         self.add_provider_alias("iban", "IBAN_CODE")
         self.add_provider_alias("phone_number", "PHONE_NUMBER")
-        self.add_provider_alias("url", "DOMAIN_NAME")
+        self.add_provider_alias("url", "URL")
         self.add_provider_alias("ssn", "US_SSN")
         self.add_provider_alias("email", "EMAIL_ADDRESS")
         self.add_provider_alias("date_time", "DATE_TIME")
@@ -537,7 +537,7 @@ class InputSample(object):
             if span.entity_type in dictionary:
                 span.entity_type = dictionary[span.entity_type]
             elif ignore_unknown:
-                span.entity_value = "O"
+                span.entity_type = "O"
 
         # Remove spans if they were changed to "O"
         self.spans = [span for span in self.spans if span.entity_type != "O"]
@@ -1,6 +1,8 @@
+import copy
 from collections import Counter
 from typing import List, Optional, Dict
+from pathlib import Path
 import string
 
 import numpy as np
 from tqdm import tqdm
@@ -39,7 +41,6 @@ class Evaluator:
         self.entities_to_keep = self.model.entities
 
     def compare(self, input_sample: InputSample, prediction: List[str]):
-
         """
         Compares ground truth tags (annotation) and predicted (prediction)
         :param input_sample: input sample containing list of tags with scheme
@@ -71,6 +72,9 @@ class Evaluator:
         if self.entities_to_keep:
             prediction = self._adjust_per_entities(prediction)
             new_annotation = self._adjust_per_entities(new_annotation)
 
+        skip_words = self.get_skip_words()
+
         for i in range(0, len(new_annotation)):
             results[(new_annotation[i], prediction[i])] += 1
 
@@ -81,6 +85,10 @@ class Evaluator:
 
             # check if there was an error
             is_error = new_annotation[i] != prediction[i]
+            if str(tokens[i]).lower().strip() in skip_words:
+                is_error = False
+                results[(new_annotation[i], prediction[i])] -= 1
+
             if is_error:
                 if prediction[i] == "O":
                     mistakes.append(
@@ -151,7 +159,6 @@ class Evaluator:
             f"Mapping entity values using this dictionary: {self.model.entity_mapping}"
         )
         for sample in tqdm(dataset, desc=f"Evaluating {self.model.__class__}"):
-
             # Align tag values to the ones expected by the model
             self.model.align_entity_types(sample)
 
@@ -345,13 +352,13 @@ class Evaluator:
         if np.isnan(precision) or np.isnan(recall) or (precision == 0 and recall == 0):
             return np.nan
 
-        return ((1 + beta ** 2) * precision * recall) / (
-            ((beta ** 2) * precision) + recall
+        return ((1 + beta**2) * precision * recall) / (
+            ((beta**2) * precision) + recall
         )
 
     class Plotter:
         """
-        Plot scores (f2, precision, recall) and errors (false-positivies, false-negatives)
+        Plot scores (f2, precision, recall) and errors (false-positivies, false-negatives)
         for a PII detection model evaluated via Evaluator
 
         :param model: Instance of a fitted model (of base type BaseModel)
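As a quick sanity check of the F-beta expression above (not part of the commit): f_beta = (1 + beta^2) * P * R / (beta^2 * P + R), which reduces to the harmonic mean of precision and recall at beta = 1 and weighs recall more heavily as beta grows:

```python
def f_beta(precision: float, recall: float, beta: float) -> float:
    # Mirrors the formula in Evaluator.f_beta above
    return ((1 + beta**2) * precision * recall) / (((beta**2) * precision) + recall)

assert abs(f_beta(0.5, 0.5, beta=1) - 0.5) < 1e-9   # harmonic mean at beta=1
print(f_beta(0.9, 0.6, beta=2))  # ~0.643: beta=2 pulls the score toward recall
```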
@ -362,7 +369,9 @@ class Evaluator:
|
|||
which gives more or less weight to precision vs. recall
|
||||
"""
|
||||
|
||||
def __init__(self, model, results, output_folder: Path, model_name: str, beta: float):
|
||||
def __init__(
|
||||
self, model, results, output_folder: Path, model_name: str, beta: float
|
||||
):
|
||||
self.model = model
|
||||
self.results = results
|
||||
self.output_folder = output_folder
|
||||
|
@@ -372,41 +381,66 @@ class Evaluator:

     def plot_scores(self) -> None:
         """
         Plots per-entity recall, precision, or F2 score for evaluated model.
-        :param plot_type: which metric to graph (default is F2 score)
         """
         scores = {}
-        scores['entity'] = list(self.results.entity_recall_dict.keys())
-        scores['recall'] = list(self.results.entity_recall_dict.values())
-        scores['precision'] = list(self.results.entity_precision_dict.values())
-        scores['count'] = list(self.results.n_dict.values())
-        scores[f"f{self.beta}_score"] = [Evaluator.f_beta(precision=precision, recall=recall, beta=self.beta)
-                                         for recall, precision in zip(scores['recall'], scores['precision'])]
+
+        entity_recall_dict = copy.deepcopy(self.results.entity_recall_dict)
+        entity_precision_dict = copy.deepcopy(self.results.entity_precision_dict)
+
+        scores["entity"] = list(entity_recall_dict.keys())
+        scores["recall"] = list(entity_recall_dict.values())
+        scores["precision"] = list(entity_precision_dict.values())
+        scores["count"] = list(self.results.n_dict.values())
+
+        scores[f"f{self.beta}_score"] = [
+            Evaluator.f_beta(precision=precision, recall=recall, beta=self.beta)
+            for recall, precision in zip(scores["recall"], scores["precision"])
+        ]
+
+        # Add PII detection rates
+        scores["entity"].append("PII")
+        scores["recall"].append(self.results.pii_recall)
+        scores["precision"].append(self.results.pii_precision)
+        scores["count"].append(self.results.n)
+        scores[f"f{self.beta}_score"].append(self.results.pii_f)
+
         df = pd.DataFrame(scores)
-        df['model'] = self.model_name
+        df["model"] = self.model_name
         self._plot(df, plot_type="f2_score")
         self._plot(df, plot_type="precision")
         self._plot(df, plot_type="recall")

     def _plot(self, df, plot_type) -> None:
-        fig = px.bar(df, text_auto=".2", y='entity', orientation="h",
-                     x=plot_type, color='count', barmode='group', title=f"Per-entity {plot_type} for {self.model_name}")
-        fig.update_layout(barmode='group', yaxis={
-            'categoryorder': 'total ascending'})
+        fig = px.bar(
+            df,
+            text_auto=".2",
+            y="entity",
+            orientation="h",
+            x=plot_type,
+            color="count",
+            barmode="group",
+            height=30*len(set(df["entity"])),
+            title=f"Per-entity {plot_type} for {self.model_name}",
+        )
+        fig.update_layout(
+            barmode="group", yaxis={"categoryorder": "total ascending"}
+        )
         fig.update_layout(yaxis_title=f"{plot_type}", xaxis_title="PII Entity")
-        fig.update_traces(textfont_size=12, textangle=0,
-                          textposition="outside", cliponaxis=False)
+        fig.update_traces(
+            textfont_size=12, textangle=0, textposition="outside", cliponaxis=False
+        )
         fig.update_layout(
             plot_bgcolor="#FFF",
             xaxis=dict(
                 title="PII entity",
                 linecolor="#BCCCDC",  # Sets color of X-axis line
-                showgrid=False  # Removes X-axis grid lines
+                showgrid=False,  # Removes X-axis grid lines
             ),
             yaxis=dict(
                 title=f"{plot_type}",
                 linecolor="#BCCCDC",  # Sets color of X-axis line
-                showgrid=False  # Removes X-axis grid lines
+                showgrid=False,  # Removes X-axis grid lines
             ),
         )
         fig.show()

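For orientation, a hedged sketch of wiring this Plotter up; `results` would come from `Evaluator.calculate_score(...)`, and `model`/`model_name` stand in for an evaluated model and its label:

```python
from pathlib import Path

plotter = Plotter(
    model=model,      # a fitted BaseModel instance (placeholder)
    results=results,  # result object from calculate_score (placeholder)
    output_folder=Path("plots"),
    model_name="my-model",
    beta=2,
)
plotter.plot_scores()  # per-entity F2 / precision / recall bar charts
```
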
@@ -419,47 +453,100 @@ class Evaluator:
         for entity in self.model.entity_mapping.values():
             fps_df = ModelError.get_fps_dataframe(self.errors, entity=[entity])
             if fps_df is not None:
-                fps_path = self.output_folder / \
-                    f"{self.model_name}-{entity}-fps.csv"
+                fps_path = (
+                    self.output_folder / f"{self.model_name}-{entity}-fps.csv"
+                )
                 fps_df.to_csv(fps_path)
                 fps_frames.append(fps_path)
             fns_df = ModelError.get_fns_dataframe(self.errors, entity=[entity])
             if fns_df is not None:
-                fns_path = self.output_folder / \
-                    f"{self.model_name}-{entity}-fns.csv"
+                fns_path = (
+                    self.output_folder / f"{self.model_name}-{entity}-fns.csv"
+                )
                 fns_df.to_csv(fns_path)
                 fns_frames.append(fns_path)

         def group_tokens(df):
-            return df.groupby(['token', 'annotation']).size().to_frame(
-            ).sort_values([0], ascending=False).head(3).reset_index()
+            return (
+                df.groupby(["token", "annotation"])
+                .size()
+                .to_frame()
+                .sort_values([0], ascending=False)
+                .head(3)
+                .reset_index()
+            )

         fps_tokens_df = pd.concat(
-            [group_tokens(pd.read_csv(df_path)) for df_path in fps_frames])
+            [group_tokens(pd.read_csv(df_path)) for df_path in fps_frames]
+        )
         fns_tokens_df = pd.concat(
-            [group_tokens(pd.read_csv(df_path)) for df_path in fns_frames])
+            [group_tokens(pd.read_csv(df_path)) for df_path in fns_frames]
+        )

         def generate_graph(title, tokens_df):
-            fig = px.histogram(tokens_df, x=0, y="token", orientation='h', color='annotation',
-                               title=f"Most common {title} for {self.model_name}")
+            fig = px.histogram(
+                tokens_df,
+                x=0,
+                y="token",
+                orientation="h",
+                color="annotation",
+                title=f"Most common {title} for {self.model_name}",
+            )

             fig.update_layout(yaxis_title=f"count", xaxis_title="PII Entity")
-            fig.update_traces(textfont_size=12, textangle=0,
-                              textposition="outside", cliponaxis=False)
+            fig.update_traces(
+                textfont_size=12,
+                textangle=0,
+                textposition="outside",
+                cliponaxis=False,
+            )
             fig.update_layout(
                 plot_bgcolor="#FFF",
                 xaxis=dict(
                     title="Count",
                     linecolor="#BCCCDC",  # Sets color of X-axis line
-                    showgrid=False  # Removes X-axis grid lines
+                    showgrid=False,  # Removes X-axis grid lines
                 ),
                 yaxis=dict(
                     title=f"Tokens",
                     linecolor="#BCCCDC",  # Sets color of X-axis line
-                    showgrid=False  # Removes X-axis grid lines
+                    showgrid=False,  # Removes X-axis grid lines
                 ),
             )
-            fig.update_layout(yaxis={'categoryorder': 'total ascending'})
+            fig.update_layout(yaxis={"categoryorder": "total ascending"})
             fig.show()

         generate_graph(title="false-negatives", tokens_df=fns_tokens_df)
         generate_graph(title="false-positives", tokens_df=fps_tokens_df)

+    @staticmethod
+    def get_skip_words():
+        skip_words = [x for x in string.punctuation]
+        skip_words.extend(
+            [
+                "\n",
+                "\n\n",
+                "\n\n\n",
+                ">>",
+                ">>>",
+                ">>>>",
+                "street",
+                "st.",
+                "st",
+                "de",
+                "rue",
+                "via",
+                "and",
+                "or",
+                "do",
+                "as",
+                "of",
+                "day",
+                "address",
+                "country",
+                "state",
+                "city",
+            ]
+        )
+
+        return skip_words

@@ -31,9 +31,10 @@ class BaseModel(ABC):
         self.labeling_scheme = labeling_scheme
         self.entity_mapping = entity_mapping
         self.verbose = verbose
+        self.name = self.__class__.__name__

     @abstractmethod
-    def predict(self, sample: InputSample) -> List[str]:
+    def predict(self, sample: InputSample, **kwargs) -> List[str]:
         """
         Abstract. Returns the predicted tokens/spans from the evaluated model
         :param sample: Sample to be evaluated

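Every concrete model below gains the same `**kwargs` passthrough. A minimal sketch of what the new contract looks like for a custom subclass (the `tag` kwarg is purely illustrative):

```python
from typing import List

from presidio_evaluator import InputSample
from presidio_evaluator.models import BaseModel


class ConstantModel(BaseModel):
    """Toy subclass honoring the new predict(sample, **kwargs) contract."""

    def predict(self, sample: InputSample, **kwargs) -> List[str]:
        tag = kwargs.get("tag", "O")  # per-call override, not part of the API
        return [tag] * len(sample.tokens)
```
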
@@ -85,7 +85,7 @@ class CRFModel(BaseModel):
         y_train = [self.sent2labels(s) for s in sentences]
         return X_train, y_train

-    def predict(self, sample: InputSample) -> List[str]:
+    def predict(self, sample: InputSample, **kwargs) -> List[str]:
         tags = CRFModel.crf_predict(sample, self.model)

         if len(tags) != len(sample.tokens):

@@ -48,7 +48,7 @@ class FlairModel(BaseModel):

         self.spacy_tokenizer = SpacyTokenizer(model=spacy.load("en_core_web_sm"))

-    def predict(self, sample: InputSample) -> List[str]:
+    def predict(self, sample: InputSample, **kwargs) -> List[str]:

         sentence = Sentence(text=sample.full_text, use_tokenizer=self.spacy_tokenizer)
         self.model.predict(sentence)

@@ -1,6 +1,6 @@
 from typing import List, Optional, Dict

-from presidio_analyzer import AnalyzerEngine
+from presidio_analyzer import AnalyzerEngine, EntityRecognizer

 from presidio_evaluator import InputSample, span_to_tag
 from presidio_evaluator.models import BaseModel

@@ -16,6 +16,9 @@ class PresidioAnalyzerWrapper(BaseModel):
         score_threshold: float = 0.4,
         language: str = "en",
         entity_mapping: Optional[Dict[str, str]] = None,
+        ad_hoc_recognizers: Optional[List[EntityRecognizer]] = None,
+        context: Optional[List[str]] = None,
+        allow_list: Optional[List[str]] = None,
     ):
         """
         Evaluation wrapper for the Presidio Analyzer

@@ -29,25 +32,37 @@ class PresidioAnalyzerWrapper(BaseModel):
         )
         self.score_threshold = score_threshold
         self.language = language
+        self.ad_hoc_recognizers = ad_hoc_recognizers
+        self.context = context
+        self.allow_list = allow_list

         if not analyzer_engine:
             analyzer_engine = AnalyzerEngine()
         self._update_recognizers_based_on_entities_to_keep(analyzer_engine)
         self.analyzer_engine = analyzer_engine

-    def predict(self, sample: InputSample) -> List[str]:
+    def predict(self, sample: InputSample, **kwargs) -> List[str]:
+        language = kwargs.get("language", self.language)
+        score_threshold = kwargs.get("score_threshold", self.score_threshold)
+        ad_hoc_recognizers = kwargs.get("ad_hoc_recognizers", self.ad_hoc_recognizers)
+        context = kwargs.get("context", self.context)
+        allow_list = kwargs.get("allow_list", self.allow_list)
+
         results = self.analyzer_engine.analyze(
             text=sample.full_text,
             entities=self.entities,
-            language=self.language,
-            score_threshold=self.score_threshold,
+            language=language,
+            score_threshold=score_threshold,
+            ad_hoc_recognizers=ad_hoc_recognizers,
+            context=context,
+            allow_list=allow_list,
+            **kwargs,
         )
         starts = []
         ends = []
         scores = []
         tags = []
-        #
+
         for res in results:
             starts.append(res.start)
             ends.append(res.end)

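A hedged usage sketch of the new per-call overrides; the sample construction mirrors the test at the end of this change set, and the override values are illustrative:

```python
from presidio_evaluator import InputSample
from presidio_evaluator.models import PresidioAnalyzerWrapper

sample = InputSample(full_text="John is on the street", masked=None, spans=None)

# Defaults are fixed once on the wrapper...
model = PresidioAnalyzerWrapper(score_threshold=0.4, language="en")

# ...but can now be overridden for a single call via **kwargs.
tags = model.predict(sample, score_threshold=0.6, allow_list=["street"])
```
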
@@ -76,23 +91,28 @@ class PresidioAnalyzerWrapper(BaseModel):
         "PHONE_NUMBER": "PHONE_NUMBER",
         "BIRTHDAY": "DATE_TIME",
         "DATE_TIME": "DATE_TIME",
-        "DOMAIN_NAME": "DOMAIN_NAME",
+        "DOMAIN_NAME": "URL",
+        "TIME" : "DATE_TIME",
+        "DATE" : "DATE_TIME",
         "CITY": "LOCATION",
         "ADDRESS": "LOCATION",
+        "STREET_ADDRESS": "LOCATION",
         "NATIONALITY": "LOCATION",
         "LOCATION": "LOCATION",
         "IBAN_CODE": "IBAN_CODE",
-        "URL": "DOMAIN_NAME",
+        "URL": "URL",
         "US_SSN": "US_SSN",
         "IP_ADDRESS": "IP_ADDRESS",
-        "ORGANIZATION": "ORG",
+        "ORGANIZATION": "ORGANIZATION",
+        "ORG": "ORGANIZATION",
         "US_DRIVER_LICENSE": "US_DRIVER_LICENSE",
-        "NRP": "NRP",
-        "TITLE": "O",  # not supported
-        "PREFIX": "O",  # not supported
-        "STREET_ADDRESS": "O",  # not supported
-        "ZIP_CODE": "O",  # not supported
-        "AGE": "O",  # not supported
+        "NRP": "LOCATION",
+        "NORP": "LOCATION",
+        "ID": "ID",
+        "TITLE": "O",  # not supported through spaCy
+        "PREFIX": "O",  # not supported through spaCy
+        "ZIP_CODE": "O",  # not supported through spaCy
+        "AGE": "O",  # not supported through spaCy
         "O": "O",
     }

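This mapping is what entity-type alignment consumes when translating dataset annotations into Presidio's vocabulary. A hedged sketch using `Evaluator.align_entity_types` (whose keyword names appear in the tests below); the `Span` argument names follow this repo's usage, and the one-entry mapping is an illustrative subset:

```python
from presidio_evaluator import InputSample, Span
from presidio_evaluator.evaluation import Evaluator

span = Span(
    entity_type="ORG", entity_value="Microsoft", start_position=0, end_position=9
)
sample = InputSample(full_text="Microsoft is in Redmond", masked=None, spans=[span])

# After alignment, the span's type follows the mapping (ORG -> ORGANIZATION)
Evaluator.align_entity_types(
    input_samples=[sample], entities_mapping={"ORG": "ORGANIZATION"}
)
```
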
@@ -41,12 +41,15 @@ class PresidioRecognizerWrapper(BaseModel):
         self.recognizer = recognizer
         self.nlp_engine = nlp_engine

+        if not self.nlp_engine.is_loaded():
+            self.nlp_engine.load()
+
     #
     def __make_nlp_artifacts(self, text: str):
         return self.nlp_engine.process_text(text, "en")

     #
-    def predict(self, sample: InputSample) -> List[str]:
+    def predict(self, sample: InputSample, **kwargs) -> List[str]:
         nlp_artifacts = None
         if self.with_nlp_artifacts:
             nlp_artifacts = self.__make_nlp_artifacts(sample.full_text)

@@ -31,7 +31,7 @@ class SpacyModel(BaseModel):
         else:
             self.model = model

-    def predict(self, sample: InputSample) -> List[str]:
+    def predict(self, sample: InputSample, **kwargs) -> List[str]:
         """
         Predict a list of tags for an input sample.
         :param sample: InputSample

@@ -51,7 +51,7 @@ class StanzaModel(SpacyModel):
             entity_mapping=entity_mapping,
         )

-    def predict(self, sample: InputSample) -> List[str]:
+    def predict(self, sample: InputSample, **kwargs) -> List[str]:
         """
         Predict the tags using a stanza model.

@@ -48,8 +48,7 @@ class TextAnalyticsWrapper(BaseModel):
         )
         return text_analytics_client

-
-    def predict(self, sample: InputSample) -> List[str]:
+    def predict(self, sample: InputSample, **kwargs) -> List[str]:
         documents = [sample.full_text]
         response = self.ta_client.recognize_pii_entities(documents,
                                                          language="en")

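Note that this wrapper still pins `language="en"` even though `predict` now accepts `**kwargs`. A hedged sketch of threading the same pattern through, mirroring `PresidioAnalyzerWrapper.predict` above (not part of this diff):

```python
# Hypothetical variant of TextAnalyticsWrapper.predict:
def predict(self, sample: InputSample, **kwargs) -> List[str]:
    language = kwargs.get("language", "en")  # per-call override, assumption
    documents = [sample.full_text]
    response = self.ta_client.recognize_pii_entities(documents, language=language)
    # ...span-to-tag conversion then continues as in the original predict
```
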
@@ -0,0 +1,45 @@
+[tool.poetry]
+name = "presidio_evaluator"
+version = "0.1.0"
+description = ""
+authors = ["Omri Mendels <omri374@users.noreply.github.com>"]
+readme = "README.md"
+include = [{ path= "presidio_evaluator/data_generator/raw_data/*"}]
+
+[tool.poetry.dependencies]
+python = "^3.9"
+spacy = "^3.5.0"
+numpy = "^1.22"
+pandas = "^2.1.4"
+tqdm = "^4.60.0"
+faker = "^21.0"
+scikit-learn = "^1.3.2"
+presidio-analyzer = "^2.2.351"
+presidio-anonymizer = "^2.2.351"
+requests = "^2.25"
+xmltodict = "^0.12.0"
+python-dotenv = "^1.0.0"
+plotly = "^5.18.0"
+azure-ai-textanalytics = "^5.3.0"
+en_core_web_sm = {url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz"}
+en_core_web_lg = {url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1.tar.gz"}
+
+# optional dependencies for the different NLP approaches
+[tool.poetry.group.ner]
+optional = true
+
+[tool.poetry.group.ner.dependencies]
+flair = "^0.13.0"
+spacy_stanza = "^1.0.0"
+sklearn_crfsuite = "^0.3.6"
+spacy_huggingface_pipelines = "^0.0.4"
+
+
+[tool.poetry.group.dev.dependencies]
+pytest = ">=6.*"
+flake8 = ">=3.*"
+pytest-azurepipelines = "^1.0.5"
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"

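With this file replacing the two pinned requirements files removed below, installation presumably follows standard Poetry usage; the `ner` group name comes straight from the section above:

``` sh
pip install poetry
poetry install              # base dependencies only
poetry install --with ner   # also pulls in Flair, Stanza, CRF and HF pipelines
```
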
@@ -1,21 +0,0 @@
-spacy>=3.2.0
-numpy>=1.20.2
-jupyter>=1
-pandas>=1.2.4
-tqdm>=4.60.0
-haikunator>=2.1.0
-schwifty
-faker>=9.6.0
-scikit_learn
-#flair
-#stanza
-#spacy_stanza
-#sklearn_crfsuite
-pytest>=6.2.3
-presidio_analyzer
-presidio_anonymizer
-requests>=2.25.1
-xmltodict>=0.12.0
-python-dotenv
-plotly
-azure-ai-textanalytics==5.2.0

@@ -1,20 +0,0 @@
-spacy>=3.2.0
-numpy>=1.12.4
-jupyter>=1
-pandas>=1.3.4
-tqdm>=4.60.0
-haikunator>=2.1.0
-schwifty
-faker>=9.6.0
-scikit_learn<0.24
-pytest>=6.2.3
-presidio_analyzer
-presidio_anonymizer
-requests>=2.25.1
-xmltodict>=0.12.0
-torch>=1.10.1
-python-dotenv
-https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz#egg=en_core_web_sm
-flair>=0.10
-stanza>=1.3.0
-spacy-stanza>=1.0.1

setup.py
@@ -1,54 +1,53 @@
-from setuptools import setup, find_packages
-import os.path
-
-# read the contents of the README file
+# -*- coding: utf-8 -*-
+from setuptools import setup
+import os
 from os import path

 this_directory = path.abspath(path.dirname(__file__))
 with open(path.join(this_directory, "README.md"), encoding="utf-8") as f:
     long_description = f.read()
-# print(long_description)

 with open(os.path.join(this_directory, "VERSION")) as version_file:
-    __version__ = version_file.read().strip()
+    version = version_file.read().strip()
+
+
+packages = [
+    "presidio_evaluator",
+    "presidio_evaluator.data_generator",
+    "presidio_evaluator.data_generator.faker_extensions",
+    "presidio_evaluator.dataset_formatters",
+    "presidio_evaluator.evaluation",
+    "presidio_evaluator.experiment_tracking",
+    "presidio_evaluator.models",
+]
+
+package_data = {"": ["*"], "presidio_evaluator.data_generator": ["raw_data/*"]}
+
+install_requires = [
+    "azure-ai-textanalytics>=5.3.0,<6.0.0",
+    "faker>=21.0,<22.0",
+    "numpy>=1.22,<2.0",
+    "pandas>=2.1.4,<3.0.0",
+    "plotly>=5.18.0,<6.0.0",
+    "presidio-analyzer>=2.2.351,<3.0.0",
+    "presidio-anonymizer>=2.2.351,<3.0.0",
+    "python-dotenv>=1.0.0,<2.0.0",
+    "requests>=2.25,<3.0",
+    "scikit-learn>=1.3.2,<2.0.0",
+    "spacy>=3.5.0,<4.0.0",
+    "tqdm>=4.60.0,<5.0.0",
+    "xmltodict>=0.12.0,<0.13.0",
+]

 setup(
     name="presidio-evaluator",
     long_description=long_description,
     long_description_content_type="text/markdown",
-    version=__version__,
-    packages=find_packages(exclude=["tests"]),
     url="https://www.github.com/microsoft/presidio-research",
+    version=version,
     license="MIT",
     description="PII dataset generator, model evaluator for Presidio and PII data in general",  # noqa
-    data_files=[
-        (
-            "presidio_evaluator/data_generator/raw_data",
-            [
-                "presidio_evaluator/data_generator/raw_data/FakeNameGenerator.com_3000.csv",  # noqa
-                "presidio_evaluator/data_generator/raw_data/templates.txt",
-                "presidio_evaluator/data_generator/raw_data/companies_and_organizations.csv",
-                "presidio_evaluator/data_generator/raw_data/nationalities.csv",
-                "presidio_evaluator/data_generator/raw_data/us_driver_licenses.csv",
-            ],
-        )
-    ],
-    include_package_data=True,
-    install_requires=[
-        "presidio_analyzer",
-        "presidio_anonymizer",
-        "spacy>=3.0.0",
-        "requests",
-        "numpy",
-        "pandas",
-        "tqdm>=4.32.1",
-        "jupyter>=1.0.0",
-        "pytest>=4.6.2",
-        "haikunator",
-        "schwifty",
-        "faker",
-        "sklearn_crfsuite",
-        "python-dotenv",
-        "azure-ai-textanalytics==5.2.0"
-    ],
-)
+    packages=packages,
+    package_data=package_data,
+    install_requires=install_requires,
+    python_requires=">=3.8,<4.0",
+)

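A quick local sanity check of the rewritten metadata (standard packaging commands, not prescribed by this commit):

``` sh
pip install build
python -m build          # builds sdist + wheel from this setup.py
pip install dist/*.whl   # install the freshly built wheel
```
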
@@ -345,3 +345,22 @@ def test_align_entity_types_wrong_mapping_exception():
         Evaluator.align_entity_types(
             input_samples=[sample1], entities_mapping=entities_mapping
         )
+
+
+def test_skip_words_are_not_counted_as_errors():
+    prediction = ["U-PERSON", "O", "O", "O", "U-LOCATION"]
+    model = MockTokensModel(prediction=prediction,
+                            entities_to_keep=["LOCATION", "PERSON"])
+
+    evaluator = Evaluator(model=model)
+    sample = InputSample(
+        full_text="John is on the street", masked="I am the street", spans=None
+    )
+    sample.tokens = ["John", "is", "on", "the", "street"]
+    sample.tags = ["U-PERSON", "O", "O", "O", "O"]
+
+    evaluated = evaluator.evaluate_sample(sample, prediction)
+    final_evaluation = evaluator.calculate_score([evaluated])
+
+    assert final_evaluation.pii_precision == 1
+    assert final_evaluation.pii_recall == 1

@@ -30,7 +30,7 @@ def fake_faker():
     ],
     # fmt: on
 )
-def test_presidio_psudonymize_two_entities(
+def test_presidio_pseudonymize_two_entities(
     text, entity1, entity2, start1, end1, start2, end2, value1, value2, fake_faker
 ):

@@ -51,3 +51,15 @@ def test_presidio_psudonymize_two_entities(
     assert value2 in pseudonym
     assert text[:start1].lower() in pseudonym.lower()
     assert text[end1:start2].lower() in pseudonym.lower()
+
+
+def test_simple_scenario():
+    original_text = "Hi my name is Doug Funny and this is my website: https://www.dougf.io"  # noqa
+    presidio_response = [
+        RecognizerResult(entity_type="PERSON", start=14, end=24, score=0.85),
+        RecognizerResult(entity_type="URL", start=49, end=69, score=0.95),
+    ]
+
+    PresidioPseudonymization().pseudonymize(original_text=original_text,
+                                            presidio_response=presidio_response,
+                                            count=5)