Merge pull request #65 from microsoft/mel/remove_square_bracket

Remove option to place faker entities inside square brackets
This commit is contained in:
Omri Mendels 2022-12-26 12:52:06 +02:00 коммит произвёл GitHub
Родитель d34e476662 f7dbb94cf8
Коммит 6e6dbb9a2a
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
4 изменённых файлов: 11 добавлений и 30 удалений

Просмотреть файл

@ -66,12 +66,14 @@ The process in high level is the following:
1. Translate a NER dataset (e.g. CONLL or OntoNotes) into a list of
templates: `My name is John` -> `My name is [PERSON]`
2. (Optional) add new Faker providers to the `PresidioDataGenerator` to support types of PII not returned by Faker
3. Generate samples using the templates list
4. Split the generated dataset to train/test/validation while making sure
3. (Optional) Map dataset entity names to their provider equivalents by calling `PresidioDataGenerator.add_provider_alias`.
This creates entity aliases for cases where the names differ (e.g. Faker supports "name" but the templates contain "person").
4. Generate samples using the templates list
5. Split the generated dataset to train/test/validation while making sure
that samples from the same template would only appear in one set
5. Adapt datasets for the various models (Spacy, Flair, CRF, sklearn)
6. Train models
7. Evaluate using one of the [evaluation notebooks](../../notebooks/models)
6. Adapt datasets for the various models (Spacy, Flair, CRF, sklearn)
7. Train models
8. Evaluate using one of the [evaluation notebooks](../../notebooks/models)
Notes:

Просмотреть файл

@ -57,7 +57,7 @@ class SpanGenerator(Generator):
new_len = len(str(span.value))
# Update full text
fake_text = str(text[span.end : prev_end]) + str(fake_text)
fake_text = str(text[span.end: prev_end]) + str(fake_text)
fake_text = str(span.value) + str(fake_text)
prev_end = span.start
@ -88,7 +88,7 @@ class SpanGenerator(Generator):
results: List[FakerSpan] = []
for match in matches:
formatter = match.group()[2:-2]
formatter = match.group()[2:-2].lower()
results.append(
FakerSpan(
type=formatter,

Просмотреть файл

@ -125,25 +125,6 @@ class PresidioDataGenerator:
lines = [line.replace("\\n", "\n") for line in lines]
return lines
@staticmethod
def _prep_templates(raw_templates):
print("Preparing sample sentences for ingestion")
def make_lower_case(match_obj):
if match_obj.group() is not None:
return match_obj.group().lower()
templates = [
(
re.sub(r"\[.*?\]", make_lower_case, template.strip())
.replace("[", "{" + "{")
.replace("]", "}" + "}")
)
for template in raw_templates
]
return templates
def generate_fake_data(
self, templates: List[str], n_samples: int
) -> Union[Generator[FakerSpansResult, None, None], Generator[str, None, None]]:
@ -152,10 +133,7 @@ class PresidioDataGenerator:
:param templates: A list of strings containing templates
:param n_samples: Number of samples to generate
"""
if templates:
templates = self._prep_templates(templates)
else:
if not templates:
templates = None
for _ in tqdm(range(n_samples), desc="Sampling"):

Просмотреть файл

@ -43,6 +43,7 @@ def span_faker(test_provider):
("My name is {{foo}}", "My name is bar"),
("My name is {{ foo }}", "My name is bar"),
("my name is {{foofoofoo}}", "my name is bar"),
("my name is {{FOO}}", "my name is bar")
],
)
def test_one_replacement(span_faker, pattern, expected):