This commit is contained in:
omri374 2021-04-29 14:00:45 +03:00
Родитель 222b466fa2
Коммит 55839323b0
10 изменённых файлов: 97 добавлений и 65 удалений

Просмотреть файл

@ -45,7 +45,7 @@ Then, it creates new synthetic sentences by sampling templates and PII values. F
- For information on data generation/augmentation, see the data generator [README](presidio_evaluator/data_generator/README.md).
- For an example for running the generation process, see [this notebook](notebooks/Generate%20data.ipynb).
- For an example for running the generation process, see [this notebook](notebooks/data%20generation/Generate%20data.ipynb).
- For an understanding of the underlying fake PII data used, see this [exploratory data analysis notebook](notebooks/PII%20EDA.ipynb).
Note that the generation process might not work off-the-shelf as we are not sharing the fake PII datasets and templates used in this analysis, due to copyright and other restrictions.
@ -130,4 +130,5 @@ contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additio
Copyright notice:
Fake Name Generator identities by the [Fake Name Generator](https://www.fakenamegenerator.com/)
are licensed under a [Creative Commons Attribution-Share Alike 3.0 United States License](http://creativecommons.org/licenses/by-sa/3.0/us/). Fake Name Generator and the Fake Name Generator logo are trademarks of Corban Works, LLC.
are licensed under a [Creative Commons Attribution-Share Alike 3.0 United States License](http://creativecommons.org/licenses/by-sa/3.0/us/).
Fake Name Generator and the Fake Name Generator logo are trademarks of Corban Works, LLC.

Просмотреть файл

@ -9,7 +9,7 @@
"outputs": [],
"source": [
"from tqdm import tqdm_notebook as tqdm\n",
"from presidio_evaluator.data_generator.main import generate,read_synth_dataset\n",
"from presidio_evaluator.data_generator.main import generate, read_synth_dataset\n",
"\n",
"import datetime\n",
"import json"
@ -71,7 +71,7 @@
"source": [
"EXAMPLES = 100\n",
"SPAN_TO_TAG = True #Whether to create tokens + token labels (tags)\n",
"TEMPLATES_FILE = '../presidio_evaluator/data_generator/' \\\n",
"TEMPLATES_FILE = '../../presidio_evaluator/data_generator/' \\\n",
" 'raw_data/templates.txt'\n",
"KEEP_ONLY_TAGGED = False\n",
"LOWER_CASE_RATIO = 0.1\n",
@ -79,9 +79,9 @@
"\n",
"cur_time = datetime.date.today().strftime(\"%B_%d_%Y\")\n",
"\n",
"OUTPUT = \"../data/generated_size_{}_date_{}.json\".format(EXAMPLES, cur_time)\n",
"OUTPUT = \"../../data/generated_size_{}_date_{}.json\".format(EXAMPLES, cur_time)\n",
"\n",
"fake_pii_csv = '../presidio_evaluator/data_generator/' \\\n",
"fake_pii_csv = '../../presidio_evaluator/data_generator/' \\\n",
" 'raw_data/FakeNameGenerator.com_3000.csv'\n",
"utterances_file = TEMPLATES_FILE\n",
"dictionary_path = None\n",
@ -198,8 +198,20 @@
"Data generated for evaluation was created using Fake Name Generator.\n",
"\n",
"Fake Name Generator identities by the [Fake Name Generator](https://www.fakenamegenerator.com/) \n",
"are licensed under a [Creative Commons Attribution-Share Alike 3.0 United States License](http://creativecommons.org/licenses/by-sa/3.0/us/). Fake Name Generator and the Fake Name Generator logo are trademarks of Corban Works, LLC.\n"
"are licensed under a [Creative Commons Attribution-Share Alike 3.0 United States License](http://creativecommons.org/licenses/by-sa/3.0/us/). Fake Name Generator and the Fake Name Generator logo are trademarks of Corban Works, LLC."
]
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
}
],
"metadata": {
@ -223,4 +235,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
}
}

Просмотреть файл

@ -11,19 +11,20 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"pd.options.display.max_rows = 4000\n",
"pd.set_option('display.max_colwidth', -1)\n",
"#TODO: fix CONLL2003 download and usage"
"\n",
"from presidio_evaluator.dataset_formatters import CONLL2003Formatter\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {
"pycharm": {
"is_executing": false
@ -31,10 +32,8 @@
},
"outputs": [],
"source": [
"#reader = Conll2003DatasetReader()\n",
"#dataset = reader.read(data_path =\"../../data\",dataset_name='conll2003')\n",
"#Note: make sure you haven't downloaded something else with this function before, \n",
"# as it will not download a new dataset (even if your previous download was for a different dataset)"
"conll_formatter = CONLL2003Formatter()\n",
"train_samples = conll_formatter.to_input_samples(fold=\"train\")"
]
},
{
@ -630,9 +629,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "presidio-research",
"language": "python",
"name": "python3"
"name": "presidio-research"
},
"language_info": {
"codemirror_mode": {
@ -644,9 +643,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
"version": "3.7.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
}

Просмотреть файл

@ -31,7 +31,7 @@ that samples from the same template would only appear in one set
Notes:
- For steps 5, 6, 7 see the main [README](../../README.md).
- For a simple data generation pipeline,
[see this notebook](../../notebooks/Generate data.ipynb).
[see this notebook](../../notebooks/data%20generation/Generate%20data.ipynb).
- For information on transforming a NER dataset into templates,
see the notebooks in the [helper notebooks](../../notebooks/data%20generation) folder.

Просмотреть файл

@ -4,20 +4,10 @@ import pandas as pd
from faker import Faker
from haikunator import Haikunator
from presidio_evaluator.data_generator import (
NationalityGenerator,
OrgNameGenerator,
UsDriverLicenseGenerator,
)
fake = Faker()
haikunator = Haikunator()
IP_V4_RATIO = 0.8
org_name_generator = OrgNameGenerator()
nationality_generator = NationalityGenerator()
us_driver_license_generator = UsDriverLicenseGenerator()
def generate_url(domain: pd.Series):
def generate_url_postfix():
@ -82,7 +72,7 @@ def generate_iban(country: pd.Series):
return country.apply(generate_one_iban)
def generate_company_names(length, org_name_generator):
    """Return ``length`` organization names drawn from ``org_name_generator``.

    :param length: Number of names to produce.
    :param org_name_generator: Object exposing ``get_organization()``;
        it is called once per generated name.
    :return: A list with ``length`` entries (empty when ``length`` is 0).
    """
    return [org_name_generator.get_organization() for _ in range(length)]
@ -149,27 +139,27 @@ def generate_roles(length):
return [random.choice(roles) for _ in range(length)]
def generate_nationality(length, nationality_generator):
    """Return ``length`` nationalities drawn from ``nationality_generator``.

    :param length: Number of values to produce.
    :param nationality_generator: Object exposing ``get_nationality()``;
        it is called once per generated value.
    :return: A list with ``length`` entries (empty when ``length`` is 0).
    """
    return [nationality_generator.get_nationality() for _ in range(length)]
def generate_us_driver_licenses(length, us_driver_license_generator):
    """Return ``length`` driver license numbers from the given generator.

    :param length: Number of values to produce.
    :param us_driver_license_generator: Object exposing
        ``get_driver_license_number()``; it is called once per value.
    :return: A list with ``length`` entries (empty when ``length`` is 0).
    """
    return [
        us_driver_license_generator.get_driver_license_number()
        for _ in range(length)
    ]
def generate_country(length, nationality_generator):
    """Return ``length`` country values drawn from ``nationality_generator``.

    :param length: Number of values to produce.
    :param nationality_generator: Object exposing ``get_country()``;
        it is called once per generated value.
    :return: A list with ``length`` entries (empty when ``length`` is 0).
    """
    return [nationality_generator.get_country() for _ in range(length)]
def generate_nation_woman(length, nationality_generator):
    """Return ``length`` values from ``nationality_generator.get_nation_woman()``.

    :param length: Number of values to produce.
    :param nationality_generator: Object exposing ``get_nation_woman()``;
        it is called once per generated value.
    :return: A list with ``length`` entries (empty when ``length`` is 0).
    """
    return [nationality_generator.get_nation_woman() for _ in range(length)]
def generate_nation_man(length, nationality_generator):
    """Return ``length`` values from ``nationality_generator.get_nation_man()``.

    :param length: Number of values to produce.
    :param nationality_generator: Object exposing ``get_nation_man()``;
        it is called once per generated value.
    :return: A list with ``length`` entries (empty when ``length`` is 0).
    """
    return [nationality_generator.get_nation_man() for _ in range(length)]
def generate_nation_plural(length, nationality_generator):
    """Return ``length`` values from ``nationality_generator.get_nation_plural()``.

    :param length: Number of values to produce.
    :param nationality_generator: Object exposing ``get_nation_plural()``;
        it is called once per generated value.
    :return: A list with ``length`` entries (empty when ``length`` is 0).
    """
    return [nationality_generator.get_nation_plural() for _ in range(length)]

Просмотреть файл

@ -8,6 +8,11 @@ from spacy.tokens import Token
from tqdm import tqdm
from presidio_evaluator import Span, InputSample
from presidio_evaluator.data_generator import (
OrgNameGenerator,
NationalityGenerator,
UsDriverLicenseGenerator,
)
from presidio_evaluator.data_generator.extensions import (
generate_iban,
generate_ip_addresses,
@ -91,6 +96,10 @@ class FakeDataGenerator:
self.span_to_tag = span_to_tag
self.labeling_scheme = labeling_scheme
self.org_name_generator = OrgNameGenerator()
self.nationality_generator = NationalityGenerator()
self.us_driver_license_generator = UsDriverLicenseGenerator()
def get_is_in_vocabulary(self, token):
    """Tell whether the token's lower-cased text is a known vocabulary word.

    :param token: Object with a ``text`` string attribute.
    :return: ``True`` when ``token.text.lower()`` is contained in
        ``self.vocabulary_words``, ``False`` otherwise.
    """
    return token.text.lower() in self.vocabulary_words
@ -138,7 +147,7 @@ class FakeDataGenerator:
if "COUNTRY" not in self.ignore_types:
df["COUNTRY"] = generate_country(
len(df)
len(df), self.nationality_generator
) # replace previous country which has limited options
# Copied entities
@ -174,10 +183,16 @@ class FakeDataGenerator:
if "NATIONALITY" not in self.ignore_types:
print("Generating nationalities")
df["NATIONALITY"] = generate_nationality(len(df))
df["NATION_MAN"] = generate_nation_man(len(df))
df["NATION_WOMAN"] = generate_nation_woman(len(df))
df["NATION_PLURAL"] = generate_nation_plural(len(df))
df["NATIONALITY"] = generate_nationality(
len(df), self.nationality_generator
)
df["NATION_MAN"] = generate_nation_man(len(df), self.nationality_generator)
df["NATION_WOMAN"] = generate_nation_woman(
len(df), self.nationality_generator
)
df["NATION_PLURAL"] = generate_nation_plural(
len(df), self.nationality_generator
)
if "IBAN" not in self.ignore_types:
print("Generating IBANs")
@ -193,7 +208,9 @@ class FakeDataGenerator:
if "US_DRIVER_LICENSE" not in self.ignore_types:
print("Generating US driver license numbers")
df["US_DRIVER_LICENSE"] = generate_us_driver_licenses(len(df))
df["US_DRIVER_LICENSE"] = generate_us_driver_licenses(
len(df), self.us_driver_license_generator
)
if "URL" not in self.ignore_types:
print("Generating URLs")
@ -204,7 +221,7 @@ class FakeDataGenerator:
if "ORGANIZATION" not in self.ignore_types:
print("Generating company names")
df["ORG"] = generate_company_names(len(df))
df["ORG"] = generate_company_names(len(df), self.org_name_generator)
if "Company" in df:
df["ORGANIZATION"] = df[random.choice(["Company", "ORG"])].str.title()
else:
@ -249,7 +266,8 @@ class FakeDataGenerator:
print("Preparing sample sentences for ingestion")
# Todo: introduce typos
templates = [
l.strip().replace("[", "{").replace("]", "}") for l in raw_templates
template.strip().replace("[", "{").replace("]", "}")
for template in raw_templates
]
return templates

Просмотреть файл

@ -1,38 +1,50 @@
"""Packaging script for the presidio-evaluator distribution."""
import os.path

from setuptools import find_packages, setup

# Resolve companion files relative to this script, not the current directory.
this_directory = os.path.abspath(os.path.dirname(__file__))

# The README is passed to setup() as the markdown long description.
with open(os.path.join(this_directory, "README.md"), encoding="utf-8") as f:
    long_description = f.read()

# The version string is read from the standalone VERSION file.
with open(
    os.path.join(this_directory, "VERSION"), encoding="utf-8"
) as version_file:
    __version__ = version_file.read().strip()

setup(
    name="presidio-evaluator",
    long_description=long_description,
    long_description_content_type="text/markdown",
    version=__version__,
    packages=find_packages(exclude=["tests"]),
    url="https://www.github.com/microsoft/presidio",
    license="MIT",
    description="PII dataset generator, model evaluator for Presidio and PII data in general",
    # Raw data files used by the data generator.
    data_files=[
        (
            "presidio_evaluator/data_generator/raw_data",
            [
                "presidio_evaluator/data_generator/raw_data/FakeNameGenerator.com_3000.csv",
                "presidio_evaluator/data_generator/raw_data/templates.txt",
                "presidio_evaluator/data_generator/raw_data/organizations.csv",
                "presidio_evaluator/data_generator/raw_data/nationalities.csv",
                "presidio_evaluator/data_generator/raw_data/us_driver_license.csv",
            ],
        )
    ],
    include_package_data=True,
    install_requires=[
        "spacy>=3.0.0",
        "requests",
        "numpy",
        "pandas",
        "tqdm>=4.32.1",
        "jupyter>=1.0.0",
        "pytest>=4.6.2",
        "haikunator",
        "schwifty",
        "faker",
        "sklearn_crfsuite",
    ],
)