This commit is contained in:
omri374 2021-10-20 11:05:33 +03:00
Родитель 636743b125
Коммит 3eb0026729
2 изменённых файлов: 67 добавлений и 62 удалений

Просмотреть файл

@ -3,27 +3,28 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"from tqdm import tqdm_notebook as tqdm\n",
"from presidio_evaluator.data_generator.main import generate, read_synth_dataset\n",
"\n",
"import datetime\n",
"import json"
],
"outputs": [],
"metadata": {
"scrolled": true
}
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Generate fake PII data using Presidio's data generator"
],
"metadata": {}
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Presidio's data generator allows you to generate a synthetic dataset with two prerequisites:\n",
"1. A fake PII csv (We used https://www.fakenamegenerator.com/)\n",
@ -50,20 +51,23 @@
"What's your last name? It's [LAST_NAME]\n",
"\n",
"Every time I see you falling I get down on my knees and pray\n"
],
"metadata": {}
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Generate files\n",
"Based on these two prerequisites, a requested number of examples and an output file name:"
],
"metadata": {}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"EXAMPLES = 100\n",
"SPAN_TO_TAG = True #Whether to create tokens + token labels (tags)\n",
@ -91,69 +95,69 @@
" ignore_types=IGNORE_TYPES,\n",
" keep_only_tagged=KEEP_ONLY_TAGGED,\n",
" span_to_tag=SPAN_TO_TAG)"
],
"outputs": [],
"metadata": {
"scrolled": true
}
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To read a dataset file into the InputSample format, use `read_synth_dataset`:"
],
"metadata": {}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"input_samples = read_synth_dataset(OUTPUT)"
],
"outputs": [],
"metadata": {
"scrolled": true
}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"input_samples[0]"
],
"outputs": [],
"metadata": {
"scrolled": true
}
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The full structure of each input_sample is shown below. It includes different feature values per token, as calculated by spaCy."
],
"metadata": {}
]
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"input_samples[0].to_dict()"
],
"outputs": [],
"metadata": {
"scrolled": false
}
},
"outputs": [],
"source": [
"input_samples[0].to_dict()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Verify randomness of dataset"
],
"metadata": {}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"from collections import Counter\n",
"count_per_template_id = Counter([sample.metadata['Template#'] for sample in input_samples])\n",
@ -161,35 +165,32 @@
" print(\"{}: {}\".format(key,count_per_template_id[key]))\n",
" \n",
"print(sum(count_per_template_id.values()))"
],
"outputs": [],
"metadata": {
"scrolled": true
}
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Transform to the CONLL structure:"
],
"metadata": {}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"from presidio_evaluator import InputSample\n",
"\n",
"conll = InputSample.create_conll_dataset(input_samples)\n",
"conll.head(5)"
],
"outputs": [],
"metadata": {
"scrolled": true
}
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Copyright notice:\n",
"\n",
@ -198,14 +199,13 @@
"\n",
"Fake Name Generator identities by the [Fake Name Generator](https://www.fakenamegenerator.com/) \n",
"are licensed under a [Creative Commons Attribution-Share Alike 3.0 United States License](http://creativecommons.org/licenses/by-sa/3.0/us/). Fake Name Generator and the Fake Name Generator logo are trademarks of Corban Works, LLC."
],
"metadata": {}
]
},
{
"cell_type": "code",
"execution_count": null,
"source": [],
"outputs": [],
"source": [],
"metadata": {
"collapsed": false,
"pycharm": {
@ -216,8 +216,9 @@
],
"metadata": {
"kernelspec": {
"name": "python3",
"display_name": "Python 3.8.11 64-bit ('presidio': conda)"
"display_name": "presidio-research",
"language": "python",
"name": "presidio-research"
},
"language_info": {
"codemirror_mode": {
@ -229,10 +230,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.11"
},
"interpreter": {
"hash": "2509fbe9adc3579fd0ef23e6a2c6fb50cb745caa174aafdf017283479e60bc43"
"version": "3.8.8"
}
},
"nbformat": 4,

Просмотреть файл

@ -2,7 +2,7 @@ import dataclasses
import json
import re
from dataclasses import dataclass
from typing import List
from typing import List, Union
from faker import Generator
@ -34,7 +34,7 @@ class SpansResult:
def __repr__(self):
spans_dict = json.dumps([dataclasses.asdict(span) for span in self.spans])
return json.dumps({"fake":self.fake, "spans": spans_dict})
return json.dumps({"fake": self.fake, "spans": spans_dict})
class SpanGenerator(Generator):
@ -57,9 +57,16 @@ class SpanGenerator(Generator):
'My name is Allison Hill and i live in 819 Johnson Course\nEast William, OH 26563.'
"""
def parse(self, text, add_spans=False) -> SpansResult:
def parse(self, text, add_spans=False) -> Union[str, SpansResult]:
if not add_spans:
return super().parse(text)
else:
return self.parse_with_spans(text)
def parse_with_spans(self, text) -> SpansResult:
"""Parses a Faker template and returns a `SpansResult` object.
:param text: Text holding the faker template, e.g. "My name is {{name}}".
"""
spans = self._match_to_span(text)
@ -98,7 +105,7 @@ class SpanGenerator(Generator):
type=formatter,
start=match.start(),
end=match.end(),
value=super().format(formatter.strip())
value=super().format(formatter.strip()),
)
)