Merge pull request #65 from microsoft/mel/remove_square_bracket

Remove option to place faker entities inside square brackets
2022-12-26 12:52:06 +02:00 · 2022-12-26 12:52:06 +02:00 · 6e6dbb9a2a
--- a/presidio_evaluator/data_generator/README.md
+++ b/presidio_evaluator/data_generator/README.md
@ -66,12 +66,14 @@ The process in high level is the following:
 1. Translate a NER dataset (e.g. CONLL or OntoNotes) into a list of
 templates: `My name is John` -> `My name is [PERSON]`
 2. (Optional) add new Faker providers to the `PresidioDataGenerator` to support types of PII not returned by Faker
-3. Generate samples using the templates list
-4. Split the generated dataset to train/test/validation while making sure
+3. (Optional) map dataset entity names to their provider equivalents by calling `PresidioDataGenerator.add_provider_alias`.
+This creates entity aliases (e.g. Faker supports "name" but the templates contain "person").
+4. Generate samples using the templates list
+5. Split the generated dataset to train/test/validation while making sure
 that samples from the same template would only appear in one set
-5. Adapt datasets for the various models (Spacy, Flair, CRF, sklearn)
-6. Train models
-7. Evaluate using one of the [evaluation notebooks](../../notebooks/models)
+6. Adapt datasets for the various models (Spacy, Flair, CRF, sklearn)
+7. Train models
+8. Evaluate using one of the [evaluation notebooks](../../notebooks/models)

 Notes:

--- a/presidio_evaluator/data_generator/faker_extensions/span_generator.py
+++ b/presidio_evaluator/data_generator/faker_extensions/span_generator.py
@ -57,7 +57,7 @@ class SpanGenerator(Generator):
            new_len = len(str(span.value))

            # Update full text
-            fake_text = str(text[span.end : prev_end]) + str(fake_text)
+            fake_text = str(text[span.end: prev_end]) + str(fake_text)
            fake_text = str(span.value) + str(fake_text)
            prev_end = span.start

@ -88,7 +88,7 @@ class SpanGenerator(Generator):

        results: List[FakerSpan] = []
        for match in matches:
-            formatter = match.group()[2:-2]
+            formatter = match.group()[2:-2].lower()
            results.append(
                FakerSpan(
                    type=formatter,
--- a/presidio_evaluator/data_generator/presidio_data_generator.py
+++ b/presidio_evaluator/data_generator/presidio_data_generator.py
@ -125,25 +125,6 @@ class PresidioDataGenerator:
            lines = [line.replace("\\n", "\n") for line in lines]
            return lines

-    @staticmethod
-    def _prep_templates(raw_templates):
-        print("Preparing sample sentences for ingestion")
-
-        def make_lower_case(match_obj):
-            if match_obj.group() is not None:
-                return match_obj.group().lower()
-
-        templates = [
-            (
-                re.sub(r"\[.*?\]", make_lower_case, template.strip())
-                .replace("[", "{" + "{")
-                .replace("]", "}" + "}")
-            )
-            for template in raw_templates
-        ]
-
-        return templates
-
    def generate_fake_data(
        self, templates: List[str], n_samples: int
    ) -> Union[Generator[FakerSpansResult, None, None], Generator[str, None, None]]:
@ -152,10 +133,7 @@ class PresidioDataGenerator:
        :param templates: A list of strings containing templates
        :param n_samples: Number of samples to generate
        """
-
-        if templates:
-            templates = self._prep_templates(templates)
-        else:
+        if not templates:
            templates = None

        for _ in tqdm(range(n_samples), desc="Sampling"):
--- a/tests/test_span_generator.py
+++ b/tests/test_span_generator.py
@ -43,6 +43,7 @@ def span_faker(test_provider):
        ("My name is {{foo}}", "My name is bar"),
        ("My name is {{  foo   }}", "My name is bar"),
        ("my name is {{foofoofoo}}", "my name is bar"),
+        ("my name is {{FOO}}", "my name is bar")
    ],
 )
 def test_one_replacement(span_faker, pattern, expected):