minor updates
This commit is contained in:
Родитель
636743b125
Коммит
3eb0026729
|
@ -3,27 +3,28 @@
|
|||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from tqdm import tqdm_notebook as tqdm\n",
|
||||
"from presidio_evaluator.data_generator.main import generate, read_synth_dataset\n",
|
||||
"\n",
|
||||
"import datetime\n",
|
||||
"import json"
|
||||
],
|
||||
"outputs": [],
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Generate fake PII data using Presidio's data generator"
|
||||
],
|
||||
"metadata": {}
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Presidio's data generator allows you to generate a synthetic dataset with two prerequisites:\n",
|
||||
"1. A fake PII csv (We used https://www.fakenamegenerator.com/)\n",
|
||||
|
@ -50,20 +51,23 @@
|
|||
"What's your last name? It's [LAST_NAME]\n",
|
||||
"\n",
|
||||
"Every time I see you falling I get down on my knees and pray\n"
|
||||
],
|
||||
"metadata": {}
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Generate files\n",
|
||||
"Based on these two prerequisites, a requested number of examples and an output file name:"
|
||||
],
|
||||
"metadata": {}
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"EXAMPLES = 100\n",
|
||||
"SPAN_TO_TAG = True #Whether to create tokens + token labels (tags)\n",
|
||||
|
@ -91,69 +95,69 @@
|
|||
" ignore_types=IGNORE_TYPES,\n",
|
||||
" keep_only_tagged=KEEP_ONLY_TAGGED,\n",
|
||||
" span_to_tag=SPAN_TO_TAG)"
|
||||
],
|
||||
"outputs": [],
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"To read a dataset file into the InputSample format, use `read_synth_dataset`:"
|
||||
],
|
||||
"metadata": {}
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"input_samples = read_synth_dataset(OUTPUT)"
|
||||
],
|
||||
"outputs": [],
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"input_samples[0]"
|
||||
],
|
||||
"outputs": [],
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The full structure of each input_sample is the following. It includes different feature values per token as calculated by spaCy"
|
||||
],
|
||||
"metadata": {}
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"source": [
|
||||
"input_samples[0].to_dict()"
|
||||
],
|
||||
"outputs": [],
|
||||
"metadata": {
|
||||
"scrolled": false
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"input_samples[0].to_dict()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Verify randomness of dataset"
|
||||
],
|
||||
"metadata": {}
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from collections import Counter\n",
|
||||
"count_per_template_id = Counter([sample.metadata['Template#'] for sample in input_samples])\n",
|
||||
|
@ -161,35 +165,32 @@
|
|||
" print(\"{}: {}\".format(key,count_per_template_id[key]))\n",
|
||||
" \n",
|
||||
"print(sum(count_per_template_id.values()))"
|
||||
],
|
||||
"outputs": [],
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Transform to the CONLL structure:"
|
||||
],
|
||||
"metadata": {}
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from presidio_evaluator import InputSample\n",
|
||||
"\n",
|
||||
"conll = InputSample.create_conll_dataset(input_samples)\n",
|
||||
"conll.head(5)"
|
||||
],
|
||||
"outputs": [],
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Copyright notice:\n",
|
||||
"\n",
|
||||
|
@ -198,14 +199,13 @@
|
|||
"\n",
|
||||
"Fake Name Generator identities by the [Fake Name Generator](https://www.fakenamegenerator.com/) \n",
|
||||
"are licensed under a [Creative Commons Attribution-Share Alike 3.0 United States License](http://creativecommons.org/licenses/by-sa/3.0/us/). Fake Name Generator and the Fake Name Generator logo are trademarks of Corban Works, LLC."
|
||||
],
|
||||
"metadata": {}
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"source": [],
|
||||
"outputs": [],
|
||||
"source": [],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
|
@ -216,8 +216,9 @@
|
|||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"name": "python3",
|
||||
"display_name": "Python 3.8.11 64-bit ('presidio': conda)"
|
||||
"display_name": "presidio-research",
|
||||
"language": "python",
|
||||
"name": "presidio-research"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
@ -229,10 +230,7 @@
|
|||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.11"
|
||||
},
|
||||
"interpreter": {
|
||||
"hash": "2509fbe9adc3579fd0ef23e6a2c6fb50cb745caa174aafdf017283479e60bc43"
|
||||
"version": "3.8.8"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
|
|
@ -2,7 +2,7 @@ import dataclasses
|
|||
import json
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from typing import List
|
||||
from typing import List, Union
|
||||
|
||||
from faker import Generator
|
||||
|
||||
|
@ -34,7 +34,7 @@ class SpansResult:
|
|||
|
||||
def __repr__(self):
|
||||
spans_dict = json.dumps([dataclasses.asdict(span) for span in self.spans])
|
||||
return json.dumps({"fake":self.fake, "spans": spans_dict})
|
||||
return json.dumps({"fake": self.fake, "spans": spans_dict})
|
||||
|
||||
|
||||
class SpanGenerator(Generator):
|
||||
|
@ -57,9 +57,16 @@ class SpanGenerator(Generator):
|
|||
'My name is Allison Hill and i live in 819 Johnson Course\nEast William, OH 26563.'
|
||||
"""
|
||||
|
||||
def parse(self, text, add_spans=False) -> SpansResult:
|
||||
def parse(self, text, add_spans=False) -> Union[str, SpansResult]:
|
||||
if not add_spans:
|
||||
return super().parse(text)
|
||||
else:
|
||||
return self.parse_with_spans(text)
|
||||
|
||||
def parse_with_spans(self, text) -> SpansResult:
|
||||
"""Parses a Faker template and returns a `SpansResult` object.
|
||||
:param text: Text holding the faker template, e.g. "My name is {{name}}".
|
||||
"""
|
||||
|
||||
spans = self._match_to_span(text)
|
||||
|
||||
|
@ -98,7 +105,7 @@ class SpanGenerator(Generator):
|
|||
type=formatter,
|
||||
start=match.start(),
|
||||
end=match.end(),
|
||||
value=super().format(formatter.strip())
|
||||
value=super().format(formatter.strip()),
|
||||
)
|
||||
)
|
||||
|
||||
|
|
Загрузка…
Ссылка в новой задаче