minor updates

2021-10-20 11:05:33 +03:00 · 2021-10-20 11:05:33 +03:00 · 3eb0026729
--- a/generation/Generate
+++ b/generation/Generate
@ -3,27 +3,28 @@
  {
   "cell_type": "code",
   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
   "source": [
    "from tqdm import tqdm_notebook as tqdm\n",
    "from presidio_evaluator.data_generator.main import generate, read_synth_dataset\n",
    "\n",
    "import datetime\n",
    "import json"
-   ],
-   "outputs": [],
-   "metadata": {
-    "scrolled": true
-   }
+   ]
  },
  {
   "cell_type": "markdown",
+   "metadata": {},
   "source": [
    "# Generate fake PII data using Presidio's data generator"
-   ],
-   "metadata": {}
+   ]
  },
  {
   "cell_type": "markdown",
+   "metadata": {},
   "source": [
    "Presidio's data generator allows you to generate a synthetic dataset with two prerequisites:\n",
    "1. A fake PII csv (We used https://www.fakenamegenerator.com/)\n",
@ -50,20 +51,23 @@
    "What's your last name? It's [LAST_NAME]\n",
    "\n",
    "Every time I see you falling I get down on my knees and pray\n"
-   ],
-   "metadata": {}
+   ]
  },
  {
   "cell_type": "markdown",
+   "metadata": {},
   "source": [
    "### Generate files\n",
    "Based on these two prerequisites, a requested number of examples and an output file name:"
-   ],
-   "metadata": {}
+   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
   "source": [
    "EXAMPLES = 100\n",
    "SPAN_TO_TAG = True #Whether to create tokens + token labels (tags)\n",
@ -91,69 +95,69 @@
    "                        ignore_types=IGNORE_TYPES,\n",
    "                        keep_only_tagged=KEEP_ONLY_TAGGED,\n",
    "                        span_to_tag=SPAN_TO_TAG)"
-   ],
-   "outputs": [],
-   "metadata": {
-    "scrolled": true
-   }
+   ]
  },
  {
   "cell_type": "markdown",
+   "metadata": {},
   "source": [
    "To read a dataset file into the InputSample format, use `read_synth_dataset`:"
-   ],
-   "metadata": {}
+   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
   "source": [
    "input_samples = read_synth_dataset(OUTPUT)"
-   ],
-   "outputs": [],
-   "metadata": {
-    "scrolled": true
-   }
+   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
   "source": [
    "input_samples[0]"
-   ],
-   "outputs": [],
-   "metadata": {
-    "scrolled": true
-   }
+   ]
  },
  {
   "cell_type": "markdown",
+   "metadata": {},
   "source": [
    "The full structure of each input_sample is the following. It includes different feature values per token as calculated by spaCy."
-   ],
-   "metadata": {}
+   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
-   "source": [
-    "input_samples[0].to_dict()"
-   ],
-   "outputs": [],
   "metadata": {
    "scrolled": false
-   }
+   },
+   "outputs": [],
+   "source": [
+    "input_samples[0].to_dict()"
+   ]
  },
  {
   "cell_type": "markdown",
+   "metadata": {},
   "source": [
    "#### Verify randomness of dataset"
-   ],
-   "metadata": {}
+   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
   "source": [
    "from collections import Counter\n",
    "count_per_template_id = Counter([sample.metadata['Template#'] for sample in input_samples])\n",
@ -161,35 +165,32 @@
    "    print(\"{}: {}\".format(key,count_per_template_id[key]))\n",
    "    \n",
    "print(sum(count_per_template_id.values()))"
-   ],
-   "outputs": [],
-   "metadata": {
-    "scrolled": true
-   }
+   ]
  },
  {
   "cell_type": "markdown",
+   "metadata": {},
   "source": [
    "#### Transform to the CONLL structure:"
-   ],
-   "metadata": {}
+   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
   "source": [
    "from presidio_evaluator import InputSample\n",
    "\n",
    "conll = InputSample.create_conll_dataset(input_samples)\n",
    "conll.head(5)"
-   ],
-   "outputs": [],
-   "metadata": {
-    "scrolled": true
-   }
+   ]
  },
  {
   "cell_type": "markdown",
+   "metadata": {},
   "source": [
    "#### Copyright notice:\n",
    "\n",
@ -198,14 +199,13 @@
    "\n",
    "Fake Name Generator identities by the [Fake Name Generator](https://www.fakenamegenerator.com/) \n",
    "are licensed under a [Creative Commons Attribution-Share Alike 3.0 United States License](http://creativecommons.org/licenses/by-sa/3.0/us/). Fake Name Generator and the Fake Name Generator logo are trademarks of Corban Works, LLC."
-   ],
-   "metadata": {}
+   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
-   "source": [],
   "outputs": [],
+   "source": [],
   "metadata": {
    "collapsed": false,
    "pycharm": {
@ -216,8 +216,9 @@
 ],
 "metadata": {
  "kernelspec": {
-   "name": "python3",
-   "display_name": "Python 3.8.11 64-bit ('presidio': conda)"
+   "display_name": "presidio-research",
+   "language": "python",
+   "name": "presidio-research"
  },
  "language_info": {
   "codemirror_mode": {
@ -229,10 +230,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.8.11"
-  },
-  "interpreter": {
-   "hash": "2509fbe9adc3579fd0ef23e6a2c6fb50cb745caa174aafdf017283479e60bc43"
+   "version": "3.8.8"
  }
 },
 "nbformat": 4,
--- a/presidio_evaluator/data_generator/faker_extensions/span_generator.py
+++ b/presidio_evaluator/data_generator/faker_extensions/span_generator.py
@ -2,7 +2,7 @@ import dataclasses
 import json
 import re
 from dataclasses import dataclass
-from typing import List
+from typing import List, Union

 from faker import Generator

@ -34,7 +34,7 @@ class SpansResult:

    def __repr__(self):
        spans_dict = json.dumps([dataclasses.asdict(span) for span in self.spans])
-        return json.dumps({"fake":self.fake, "spans": spans_dict})
+        return json.dumps({"fake": self.fake, "spans": spans_dict})


 class SpanGenerator(Generator):
@ -57,9 +57,16 @@ class SpanGenerator(Generator):
        'My name is Allison Hill and i live in 819 Johnson Course\nEast William, OH 26563.'
    """

-    def parse(self, text, add_spans=False) -> SpansResult:
+    def parse(self, text, add_spans=False) -> Union[str, SpansResult]:
        if not add_spans:
            return super().parse(text)
+        else:
+            return self.parse_with_spans(text)
+
+    def parse_with_spans(self, text) -> SpansResult:
+        """Parses a Faker template and returns a `SpansResult` object.
+        :param text: Text holding the faker template, e.g. "My name is {{name}}".
+        """

        spans = self._match_to_span(text)

@ -98,7 +105,7 @@ class SpanGenerator(Generator):
                    type=formatter,
                    start=match.start(),
                    end=match.end(),
-                    value=super().format(formatter.strip())
+                    value=super().format(formatter.strip()),
                )
            )