Merge pull request #65 from microsoft/mel/remove_square_bracket
Remove option to place faker entities inside square brackets
This commit is contained in:
Коммит
6e6dbb9a2a
|
@ -66,12 +66,14 @@ The process in high level is the following:
|
|||
1. Translate a NER dataset (e.g. CONLL or OntoNotes) into a list of
|
||||
templates: `My name is John` -> `My name is [PERSON]`
|
||||
2. (Optional) add new Faker providers to the `PresidioDataGenerator` to support types of PII not returned by Faker
|
||||
3. Generate samples using the templates list
|
||||
4. Split the generated dataset to train/test/validation while making sure
|
||||
3. (Optional) map dataset entity names into provider equivalents by calling `PresidioDataGenerator.add_provider_alias`.
|
||||
This creates entity aliases (e.g. Faker supports "name" but the templates contain "person")
|
||||
4. Generate samples using the templates list
|
||||
5. Split the generated dataset into train/test/validation sets while making sure
|
||||
that samples from the same template would only appear in one set
|
||||
5. Adapt datasets for the various models (Spacy, Flair, CRF, sklearn)
|
||||
6. Train models
|
||||
7. Evaluate using one of the [evaluation notebooks](../../notebooks/models)
|
||||
6. Adapt datasets for the various models (Spacy, Flair, CRF, sklearn)
|
||||
7. Train models
|
||||
8. Evaluate using one of the [evaluation notebooks](../../notebooks/models)
|
||||
|
||||
Notes:
|
||||
|
||||
|
|
|
@ -57,7 +57,7 @@ class SpanGenerator(Generator):
|
|||
new_len = len(str(span.value))
|
||||
|
||||
# Update full text
|
||||
fake_text = str(text[span.end : prev_end]) + str(fake_text)
|
||||
fake_text = str(text[span.end: prev_end]) + str(fake_text)
|
||||
fake_text = str(span.value) + str(fake_text)
|
||||
prev_end = span.start
|
||||
|
||||
|
@ -88,7 +88,7 @@ class SpanGenerator(Generator):
|
|||
|
||||
results: List[FakerSpan] = []
|
||||
for match in matches:
|
||||
formatter = match.group()[2:-2]
|
||||
formatter = match.group()[2:-2].lower()
|
||||
results.append(
|
||||
FakerSpan(
|
||||
type=formatter,
|
||||
|
|
|
@ -125,25 +125,6 @@ class PresidioDataGenerator:
|
|||
lines = [line.replace("\\n", "\n") for line in lines]
|
||||
return lines
|
||||
|
||||
@staticmethod
|
||||
def _prep_templates(raw_templates):
|
||||
print("Preparing sample sentences for ingestion")
|
||||
|
||||
def make_lower_case(match_obj):
|
||||
if match_obj.group() is not None:
|
||||
return match_obj.group().lower()
|
||||
|
||||
templates = [
|
||||
(
|
||||
re.sub(r"\[.*?\]", make_lower_case, template.strip())
|
||||
.replace("[", "{" + "{")
|
||||
.replace("]", "}" + "}")
|
||||
)
|
||||
for template in raw_templates
|
||||
]
|
||||
|
||||
return templates
|
||||
|
||||
def generate_fake_data(
|
||||
self, templates: List[str], n_samples: int
|
||||
) -> Union[Generator[FakerSpansResult, None, None], Generator[str, None, None]]:
|
||||
|
@ -152,10 +133,7 @@ class PresidioDataGenerator:
|
|||
:param templates: A list of strings containing templates
|
||||
:param n_samples: Number of samples to generate
|
||||
"""
|
||||
|
||||
if templates:
|
||||
templates = self._prep_templates(templates)
|
||||
else:
|
||||
if not templates:
|
||||
templates = None
|
||||
|
||||
for _ in tqdm(range(n_samples), desc="Sampling"):
|
||||
|
|
|
@ -43,6 +43,7 @@ def span_faker(test_provider):
|
|||
("My name is {{foo}}", "My name is bar"),
|
||||
("My name is {{ foo }}", "My name is bar"),
|
||||
("my name is {{foofoofoo}}", "my name is bar"),
|
||||
("my name is {{FOO}}", "my name is bar")
|
||||
],
|
||||
)
|
||||
def test_one_replacement(span_faker, pattern, expected):
|
||||
|
|
Загрузка…
Ссылка в новой задаче