small updates

Parent: 222b466fa2
Commit: 55839323b0
@@ -45,7 +45,7 @@ Then, it creates new synthetic sentences by sampling templates and PII values. F
 - For information on data generation/augmentation, see the data generator [README](presidio_evaluator/data_generator/README.md).
 
-- For an example of running the generation process, see [this notebook](notebooks/Generate%20data.ipynb).
+- For an example of running the generation process, see [this notebook](notebooks/data%20generation/Generate%20data.ipynb).
 
 - For an understanding of the underlying fake PII data used, see this [exploratory data analysis notebook](notebooks/PII%20EDA.ipynb).
 Note that the generation process might not work off-the-shelf, as we are not sharing the fake PII datasets and templates used in this analysis due to copyright and other restrictions.
@@ -130,4 +130,5 @@ contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additio
 Copyright notice:
 
 Fake Name Generator identities by the [Fake Name Generator](https://www.fakenamegenerator.com/)
-are licensed under a [Creative Commons Attribution-Share Alike 3.0 United States License](http://creativecommons.org/licenses/by-sa/3.0/us/). Fake Name Generator and the Fake Name Generator logo are trademarks of Corban Works, LLC.
+are licensed under a [Creative Commons Attribution-Share Alike 3.0 United States License](http://creativecommons.org/licenses/by-sa/3.0/us/).
+Fake Name Generator and the Fake Name Generator logo are trademarks of Corban Works, LLC.

@@ -9,7 +9,7 @@
    "outputs": [],
    "source": [
     "from tqdm import tqdm_notebook as tqdm\n",
-    "from presidio_evaluator.data_generator.main import generate,read_synth_dataset\n",
+    "from presidio_evaluator.data_generator.main import generate, read_synth_dataset\n",
     "\n",
     "import datetime\n",
     "import json"
@@ -71,7 +71,7 @@
    "source": [
     "EXAMPLES = 100\n",
     "SPAN_TO_TAG = True #Whether to create tokens + token labels (tags)\n",
-    "TEMPLATES_FILE = '../presidio_evaluator/data_generator/' \\\n",
+    "TEMPLATES_FILE = '../../presidio_evaluator/data_generator/' \\\n",
     "                 'raw_data/templates.txt'\n",
     "KEEP_ONLY_TAGGED = False\n",
     "LOWER_CASE_RATIO = 0.1\n",
@@ -79,9 +79,9 @@
     "\n",
     "cur_time = datetime.date.today().strftime(\"%B_%d_%Y\")\n",
     "\n",
-    "OUTPUT = \"../data/generated_size_{}_date_{}.json\".format(EXAMPLES, cur_time)\n",
+    "OUTPUT = \"../../data/generated_size_{}_date_{}.json\".format(EXAMPLES, cur_time)\n",
     "\n",
-    "fake_pii_csv = '../presidio_evaluator/data_generator/' \\\n",
+    "fake_pii_csv = '../../presidio_evaluator/data_generator/' \\\n",
     "               'raw_data/FakeNameGenerator.com_3000.csv'\n",
     "utterances_file = TEMPLATES_FILE\n",
     "dictionary_path = None\n",
@@ -198,8 +198,20 @@
     "Data generated for evaluation was created using Fake Name Generator.\n",
     "\n",
     "Fake Name Generator identities by the [Fake Name Generator](https://www.fakenamegenerator.com/) \n",
-    "are licensed under a [Creative Commons Attribution-Share Alike 3.0 United States License](http://creativecommons.org/licenses/by-sa/3.0/us/). Fake Name Generator and the Fake Name Generator logo are trademarks of Corban Works, LLC.\n"
+    "are licensed under a [Creative Commons Attribution-Share Alike 3.0 United States License](http://creativecommons.org/licenses/by-sa/3.0/us/). Fake Name Generator and the Fake Name Generator logo are trademarks of Corban Works, LLC."
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   }
+  }
  ],
  "metadata": {
@@ -223,4 +235,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 2
-}
+}

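The notebook cells above only define the generation settings; for readers who want to see how they fit together, here is a minimal sketch of driving the generator from plain Python. The keyword arguments passed to generate() are assumptions inferred from the notebook's configuration variable names (the authoritative signature lives in presidio_evaluator/data_generator/main.py), and read_synth_dataset() is assumed to accept the output path.

import datetime

from presidio_evaluator.data_generator.main import generate, read_synth_dataset

EXAMPLES = 100
SPAN_TO_TAG = True        # whether to create tokens + token labels (tags)
KEEP_ONLY_TAGGED = False
LOWER_CASE_RATIO = 0.1

cur_time = datetime.date.today().strftime("%B_%d_%Y")
OUTPUT = "../../data/generated_size_{}_date_{}.json".format(EXAMPLES, cur_time)

TEMPLATES_FILE = "../../presidio_evaluator/data_generator/raw_data/templates.txt"
fake_pii_csv = "../../presidio_evaluator/data_generator/raw_data/FakeNameGenerator.com_3000.csv"

# Assumed keyword names -- they mirror the notebook's variable names.
generate(
    fake_pii_csv=fake_pii_csv,
    utterances_file=TEMPLATES_FILE,
    dictionary_path=None,
    output_file=OUTPUT,
    lower_case_ratio=LOWER_CASE_RATIO,
    num_of_examples=EXAMPLES,
    keep_only_tagged=KEEP_ONLY_TAGGED,
    span_to_tag=SPAN_TO_TAG,
)

# Read the generated dataset back for downstream evaluation.
synth_samples = read_synth_dataset(OUTPUT)
print(f"Generated {len(synth_samples)} samples")
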
@@ -11,19 +11,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "import pandas as pd\n",
     "pd.options.display.max_rows = 4000\n",
     "pd.set_option('display.max_colwidth', -1)\n",
-    "#TODO: fix CONLL2003 download and usage"
+    "\n",
+    "from presidio_evaluator.dataset_formatters import CONLL2003Formatter\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "metadata": {
     "pycharm": {
      "is_executing": false
@@ -31,10 +32,8 @@
    },
    "outputs": [],
    "source": [
-    "#reader = Conll2003DatasetReader()\n",
-    "#dataset = reader.read(data_path =\"../../data\",dataset_name='conll2003')\n",
-    "#Note: make sure you haven't downloaded something else with this function before, \n",
-    "# as it will not download a new dataset (even if your previous download was for a different dataset)"
+    "conll_formatter = CONLL2003Formatter()\n",
+    "train_samples = conll_formatter.to_input_samples(fold=\"train\")"
    ]
   },
   {
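The two lines added above replace the commented-out Conll2003DatasetReader code with the new CONLL2003Formatter. A minimal standalone sketch of the same flow follows; the inspection lines at the end are illustrative additions, not part of the notebook, and assume the CoNLL-2003 data is available locally.

from presidio_evaluator.dataset_formatters import CONLL2003Formatter

# Read the CoNLL-2003 training fold and convert it into InputSample objects
conll_formatter = CONLL2003Formatter()
train_samples = conll_formatter.to_input_samples(fold="train")

print(f"Loaded {len(train_samples)} samples")
print(train_samples[0])  # one InputSample: text plus its entity spans
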
@@ -630,9 +629,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "presidio-research",
    "language": "python",
-   "name": "python3"
+   "name": "presidio-research"
   },
   "language_info": {
    "codemirror_mode": {
@@ -644,9 +643,9 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.4"
+   "version": "3.7.9"
   }
  },
  "nbformat": 4,
  "nbformat_minor": 2
-}
+}

@@ -31,7 +31,7 @@ that samples from the same template would only appear in one set
 Notes:
 - For steps 5, 6, 7 see the main [README](../../README.md).
 - For a simple data generation pipeline,
-  [see this notebook](../../notebooks/Generate data.ipynb).
+  [see this notebook](../../notebooks/data%20generation/Generate%20data.ipynb).
 - For information on transforming a NER dataset into templates,
   see the notebooks in the [helper notebooks](../../notebooks/data%20generation) folder.
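The last note above refers to transforming a NER dataset into templates. As a toy illustration of that idea (this is not the project's notebook code, and the tokens and tag names are made up), entity values in a labeled sentence are replaced by bracketed placeholders so the sentence can later be re-filled with fake PII:

tokens = ["John", "lives", "in", "Madrid"]
labels = ["PERSON", "O", "O", "LOCATION"]

# Replace every labeled token with a [LABEL] placeholder, keep the rest as-is
template_tokens = [
    f"[{label}]" if label != "O" else token
    for token, label in zip(tokens, labels)
]

print(" ".join(template_tokens))  # -> "[PERSON] lives in [LOCATION]"
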
@@ -4,20 +4,10 @@ import pandas as pd
 from faker import Faker
 from haikunator import Haikunator
 
-from presidio_evaluator.data_generator import (
-    NationalityGenerator,
-    OrgNameGenerator,
-    UsDriverLicenseGenerator,
-)
-
 fake = Faker()
 haikunator = Haikunator()
 IP_V4_RATIO = 0.8
 
-org_name_generator = OrgNameGenerator()
-nationality_generator = NationalityGenerator()
-us_driver_license_generator = UsDriverLicenseGenerator()
-
 
 def generate_url(domain: pd.Series):
     def generate_url_postfix():
@@ -82,7 +72,7 @@ def generate_iban(country: pd.Series):
     return country.apply(generate_one_iban)
 
 
-def generate_company_names(length):
+def generate_company_names(length, org_name_generator):
     return [org_name_generator.get_organization() for _ in range(length)]
 
 
@@ -149,27 +139,27 @@ def generate_roles(length):
     return [random.choice(roles) for _ in range(length)]
 
 
-def generate_nationality(length):
+def generate_nationality(length, nationality_generator):
     return [nationality_generator.get_nationality() for _ in range(length)]
 
 
-def generate_us_driver_licenses(length):
+def generate_us_driver_licenses(length, us_driver_license_generator):
     return [
         us_driver_license_generator.get_driver_license_number() for _ in range(length)
     ]
 
 
-def generate_country(length):
+def generate_country(length, nationality_generator):
     return [nationality_generator.get_country() for _ in range(length)]
 
 
-def generate_nation_woman(length):
+def generate_nation_woman(length, nationality_generator):
     return [nationality_generator.get_nation_woman() for _ in range(length)]
 
 
-def generate_nation_man(length):
+def generate_nation_man(length, nationality_generator):
     return [nationality_generator.get_nation_man() for _ in range(length)]
 
 
-def generate_nation_plural(length):
+def generate_nation_plural(length, nationality_generator):
     return [nationality_generator.get_nation_plural() for _ in range(length)]

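The change across these helpers is consistent: instead of relying on module-level generator instances, each function now receives its generator explicitly. A small sketch of the new calling convention, assuming the symbols are importable exactly as this commit shows them:

from presidio_evaluator.data_generator import (
    NationalityGenerator,
    OrgNameGenerator,
    UsDriverLicenseGenerator,
)
from presidio_evaluator.data_generator.extensions import (
    generate_company_names,
    generate_nationality,
    generate_us_driver_licenses,
)

# The caller now owns the generator instances and passes them in explicitly.
org_name_generator = OrgNameGenerator()
nationality_generator = NationalityGenerator()
us_driver_license_generator = UsDriverLicenseGenerator()

orgs = generate_company_names(10, org_name_generator)
nationalities = generate_nationality(10, nationality_generator)
licenses = generate_us_driver_licenses(10, us_driver_license_generator)

Passing the generators in keeps the helpers free of import-time side effects and makes them easier to test; the generators themselves are presumably backed by the raw_data CSV files listed in setup.py.
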
@@ -8,6 +8,11 @@ from spacy.tokens import Token
 from tqdm import tqdm
 
 from presidio_evaluator import Span, InputSample
+from presidio_evaluator.data_generator import (
+    OrgNameGenerator,
+    NationalityGenerator,
+    UsDriverLicenseGenerator,
+)
 from presidio_evaluator.data_generator.extensions import (
     generate_iban,
     generate_ip_addresses,
@@ -91,6 +96,10 @@ class FakeDataGenerator:
         self.span_to_tag = span_to_tag
         self.labeling_scheme = labeling_scheme
 
+        self.org_name_generator = OrgNameGenerator()
+        self.nationality_generator = NationalityGenerator()
+        self.us_driver_license_generator = UsDriverLicenseGenerator()
+
     def get_is_in_vocabulary(self, token):
         return token.text.lower() in self.vocabulary_words
 
@@ -138,7 +147,7 @@ class FakeDataGenerator:
 
         if "COUNTRY" not in self.ignore_types:
             df["COUNTRY"] = generate_country(
-                len(df)
+                len(df), self.nationality_generator
             )  # replace previous country which has limited options
 
         # Copied entities
@@ -174,10 +183,16 @@ class FakeDataGenerator:
 
         if "NATIONALITY" not in self.ignore_types:
             print("Generating nationalities")
-            df["NATIONALITY"] = generate_nationality(len(df))
-            df["NATION_MAN"] = generate_nation_man(len(df))
-            df["NATION_WOMAN"] = generate_nation_woman(len(df))
-            df["NATION_PLURAL"] = generate_nation_plural(len(df))
+            df["NATIONALITY"] = generate_nationality(
+                len(df), self.nationality_generator
+            )
+            df["NATION_MAN"] = generate_nation_man(len(df), self.nationality_generator)
+            df["NATION_WOMAN"] = generate_nation_woman(
+                len(df), self.nationality_generator
+            )
+            df["NATION_PLURAL"] = generate_nation_plural(
+                len(df), self.nationality_generator
+            )
 
         if "IBAN" not in self.ignore_types:
             print("Generating IBANs")
@@ -193,7 +208,9 @@ class FakeDataGenerator:
 
         if "US_DRIVER_LICENSE" not in self.ignore_types:
             print("Generating US driver license numbers")
-            df["US_DRIVER_LICENSE"] = generate_us_driver_licenses(len(df))
+            df["US_DRIVER_LICENSE"] = generate_us_driver_licenses(
+                len(df), self.us_driver_license_generator
+            )
 
         if "URL" not in self.ignore_types:
             print("Generating URLs")
@@ -204,7 +221,7 @@ class FakeDataGenerator:
 
         if "ORGANIZATION" not in self.ignore_types:
             print("Generating company names")
-            df["ORG"] = generate_company_names(len(df))
+            df["ORG"] = generate_company_names(len(df), self.org_name_generator)
             if "Company" in df:
                 df["ORGANIZATION"] = df[random.choice(["Company", "ORG"])].str.title()
             else:
@@ -249,7 +266,8 @@ class FakeDataGenerator:
         print("Preparing sample sentences for ingestion")
         # Todo: introduce typos
         templates = [
-            l.strip().replace("[", "{").replace("]", "}") for l in raw_templates
+            template.strip().replace("[", "{").replace("]", "}")
+            for template in raw_templates
         ]
         return templates
 
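Beyond the variable rename, the comprehension above shows what the template preparation step does: square-bracket placeholders are rewritten to curly braces, presumably so each template can be filled with format-style substitution. A standalone illustration (the sample template and entity names are made up):

raw_templates = ["My name is [FIRST_NAME] and I live in [COUNTRY]\n"]

templates = [
    template.strip().replace("[", "{").replace("]", "}")
    for template in raw_templates
]

print(templates[0])
# My name is {FIRST_NAME} and I live in {COUNTRY}
print(templates[0].format(FIRST_NAME="Dana", COUNTRY="Norway"))
# My name is Dana and I live in Norway
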
setup.py (52 changed lines)
@@ -1,38 +1,50 @@
 from setuptools import setup, find_packages
 import os.path
 
 # read the contents of the README file
 from os import path
 
 this_directory = path.abspath(path.dirname(__file__))
-with open(path.join(this_directory, 'README.md'), encoding='utf-8') as f:
+with open(path.join(this_directory, "README.md"), encoding="utf-8") as f:
     long_description = f.read()
 # print(long_description)
 
-with open(os.path.join(this_directory, 'VERSION')) as version_file:
+with open(os.path.join(this_directory, "VERSION")) as version_file:
     __version__ = version_file.read().strip()
 
 setup(
-    name='presidio-evaluator',
+    name="presidio-evaluator",
     long_description=long_description,
-    long_description_content_type='text/markdown',
+    long_description_content_type="text/markdown",
     version=__version__,
     packages=find_packages(exclude=["tests"]),
-    url='https://www.github.com/microsoft/presidio',
-    license='MIT',
-    description='PII dataset generator, model evaluator for Presidio and PII data in general',
-    data_files=[('presidio_evaluator/data_generator/raw_data', ['presidio_evaluator/data_generator/raw_data/FakeNameGenerator.com_3000.csv', 'presidio_evaluator/data_generator/raw_data/templates.txt', 'presidio_evaluator/data_generator/raw_data/organizations.csv', 'presidio_evaluator/data_generator/raw_data/nationalities.csv'])],
+    url="https://www.github.com/microsoft/presidio",
+    license="MIT",
+    description="PII dataset generator, model evaluator for Presidio and PII data in general",
+    data_files=[
+        (
+            "presidio_evaluator/data_generator/raw_data",
+            [
+                "presidio_evaluator/data_generator/raw_data/FakeNameGenerator.com_3000.csv",
+                "presidio_evaluator/data_generator/raw_data/templates.txt",
+                "presidio_evaluator/data_generator/raw_data/organizations.csv",
+                "presidio_evaluator/data_generator/raw_data/nationalities.csv",
+                "presidio_evaluator/data_generator/raw_data/us_driver_license.csv",
+            ],
+        )
+    ],
     include_package_data=True,
     install_requires=[
-        'spacy>=3.0.0',
-        'requests',
-        'numpy',
-        'pandas',
-        'tqdm>=4.32.1',
-        'jupyter>=1.0.0',
-        'pytest>=4.6.2',
-        'haikunator',
-        'schwifty',
-        'faker',
-        'sklearn_crfsuite']
+        "spacy>=3.0.0",
+        "requests",
+        "numpy",
+        "pandas",
+        "tqdm>=4.32.1",
+        "jupyter>=1.0.0",
+        "pytest>=4.6.2",
+        "haikunator",
+        "schwifty",
+        "faker",
+        "sklearn_crfsuite",
+    ],
 )