V2 transformation to anonymizer (#526)

* Clean the engine a little by moving the text handling to another entity.
First draft.
This commit is contained in:
Shiran Rubin 2021-02-17 13:51:52 +02:00 коммит произвёл GitHub
Родитель 28dbfae6de
Коммит 7093281062
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
17 изменённых файлов: 117 добавлений и 93 удалений

Просмотреть файл

@ -31,7 +31,7 @@ Presidio _(Origin from Latin praesidium protection, garrison)_ helps to en
## Presidio's modules
1. [Presidio analyzer](analyzer/index.md): PII identification in text
2. [Presidio anonymizer](anonymizer/index.md): Anonymize detected PII entities using different transformations
2. [Presidio anonymizer](anonymizer/index.md): Anonymize detected PII entities using different anonymizers
3. [Presidio image redactor](image-redactor/index.md): Redact PII entities from images using OCR and PII identification
## Installing Presidio

Просмотреть файл

@ -133,13 +133,13 @@
"# Anonymize Text with Identified PII Entities\n",
"\n",
"<br>Presidio Anonymizer iterates over the Presidio Analyzer result, and provides anonymization capabilities for the identified text.\n",
"<br>The anonymizer provides 4 types of transformations - replace, redact, mask and hash. The default is **replace**\n",
"<br>The anonymizer provides 4 types of anonymizers - replace, redact, mask and hash. The default is **replace**\n",
"\n",
"<br>The following code sample will:\n",
"<ol>\n",
"<li>Convert analyzer results to anonymizer input - a list of dict</li>\n",
"<li>Setup the anonymizer engine </li>\n",
"<li>Create an anonymizer request - text to anonymize, list of transformations to apply and the results from the analyzer request</li>\n",
"<li>Create an anonymizer request - text to anonymize, list of anonymizers to apply and the results from the analyzer request</li>\n",
"<li>Anonymize the text</li>\n",
"</ol>"
],
@ -167,7 +167,7 @@
"\n",
"request = {\n",
" \"text\": text_to_anonymize,\n",
" \"transformations\": {\n",
" \"anonymizers\": {\n",
" \"DEFAULT\": {\"type\": \"replace\", \"new_value\": \"<ANONYMIZED>\"},\n",
" \"PHONE_NUMBER\": {\n",
" \"type\": \"mask\",\n",

Просмотреть файл

@ -51,7 +51,7 @@ def test_given_text_with_pii_then_analyze_and_anonymize_successfully():
anonymizer_request = {
"text": analyzer_request["text"],
"transformations": {
"anonymizers": {
"DEFAULT": {"type": "replace", "new_value": "ANONYMIZED"},
"US_DRIVER_LICENSE": {"type": "mask", "masking_char": "*", "chars_to_mask": 4, "from_end": True},
"PERSON": {"type": "replace", "new_value": "<PERSON>"}
@ -87,7 +87,7 @@ def test_given_a_correct_analyze_input_high_threashold_then_anonymize_partially(
anonymizer_request = {
"text": analyzer_request["text"],
"transformations": {
"anonymizers": {
"DEFAULT": {"type": "replace", "new_value": "ANONYMIZED"},
"US_DRIVER_LICENSE": {"type": "mask", "masking_char": "*", "chars_to_mask": 4, "from_end": True},
"PERSON": {"type": "replace", "new_value": "<PERSON>"}
@ -131,7 +131,7 @@ def test_given_a_correct_analyze_input_with_high_threshold_and_unmatched_entitie
anonymizer_request = {
"text": analyzer_request["text"],
"transformations": {
"anonymizers": {
"DEFAULT": {"type": "replace", "new_value": "ANONYMIZED"},
"US_DRIVER_LICENSE": {"type": "mask", "masking_char": "*", "chars_to_mask": 4, "from_end": True},
"PERSON": {"type": "replace", "new_value": "<PERSON>"}
@ -180,7 +180,7 @@ def test_given_an_unknown_entity_then_anonymize_uses_defaults():
anonymizer_request = {
"text": analyzer_request["text"],
"transformations": {
"anonymizers": {
"ABC": {"type": "replace", "new_value": "<PERSON>"}
},
"analyzer_results": analyzer_data

Просмотреть файл

@ -9,7 +9,7 @@ def test_given_anonymize_called_with_valid_request_then_expected_valid_response_
request_body = """
{
"text": "hello world, my name is Jane Doe. My number is: 034453334",
"transformations": {
"anonymizers": {
"DEFAULT": { "type": "replace", "new_value": "ANONYMIZED" },
"PHONE_NUMBER": { "type": "mask", "masking_char": "*", "chars_to_mask": 4, "from_end": true }
},
@ -37,7 +37,7 @@ def test_given_anonymize_called_with_empty_text_then_invalid_input_message_retur
request_body = """
{
"text": "",
"transformations": {
"anonymizers": {
"DEFAULT": { "type": "replace", "new_value": "ANONYMIZED" }
},
"analyzer_results": [
@ -58,7 +58,7 @@ def test_given_anonymize_called_with_empty_analyzer_results_then_invalid_input_m
request_body = """
{
"text": "hello world, my name is Jane Doe. My number is: 034453334",
"transformations": {
"anonymizers": {
"DEFAULT": { "type": "replace", "new_value": "ANONYMIZED" },
"PHONE_NUMBER": { "type": "mask", "masking_char": "*", "chars_to_mask": 4, "from_end": true }
},
@ -79,7 +79,7 @@ def test_given_anonymize_called_with_deformed_body_then_internal_server_error_re
request_body = """
{
"text": "hello world, my name is Jane Doe. My number is: 034453334",
"transformations": {
"anonymizers": {
"DEFAULT": {"type": "replace", "new_value": "ANONYMIZED"},
},
"analyzer_results": [

Просмотреть файл

@ -36,7 +36,7 @@ Presidio anonymizer comes by default with the following anonymizers:
- "masking_char" - the character to be replaced with.
- "from_end" - Whether to mask the PII from its end.
Please notice: if default value is not stated in transformations object, the default
Please notice: if default value is not stated in anonymizers object, the default
anonymizer is "replace" for all entities. The replacing value will be the entity type
e.g.: <PHONE_NUMBER>
@ -44,7 +44,7 @@ As the input text could potentially have overlapping PII entities, there are dif
anonymization scenarios:
- No overlap (single PII) - single PII over text entity, uses a given or default
transformation to anonymize and replace the PII text entity.
anonymizer to anonymize and replace the PII text entity.
- Full overlap of PIIs - When one text has several PIIs, the PII with the higher score
will be taken. Between PIIs with identical scores, the selection will be arbitrary.
- One PII is contained in another - anonymizer will use the PII with larger text.
@ -109,7 +109,7 @@ Payload:
```json
{
"text": "hello world, my name is Jane Doe. My number is: 034453334",
"transformations": {
"anonymizers": {
"PHONE_NUMBER": {
"type": "mask",
"masking_char": "*",

Просмотреть файл

@ -11,7 +11,7 @@ class AnonymizerEngine:
AnonymizerEngine class.
Handles the entire logic of the Presidio-anonymizer. Gets the original text
and replaces the PII entities with the desired transformations.
and replaces the PII entities with the desired anonymizers.
"""
logger = logging.getLogger("presidio-anonymizer")
@ -22,9 +22,9 @@ class AnonymizerEngine:
def __init__(
self,
):
"""Handle text replacement for PIIs with requested transformations.
"""Handle text replacement for PIIs with requested anonymizers.
:param data: a map which contains the transformations, analyzer_results and text
:param data: a map which contains the anonymizers, analyzer_results and text
"""
def anonymize(self, engine_request: AnonymizerRequest) -> str:
@ -48,15 +48,15 @@ class AnonymizerEngine:
text_to_anonymize = text_builder.get_text_in_position(
analyzer_result.start, analyzer_result.end)
transformation = engine_request.get_transformation(
anonymizer_dto = engine_request.get_anonymizer_dto(
analyzer_result.entity_type
)
self.logger.debug(
f"for analyzer result {analyzer_result} received transformation "
f"{str(transformation)}"
f"for analyzer result {analyzer_result} received anonymizer "
f"{str(anonymizer_dto)}"
)
anonymized_text = self.__extract_anonymizer_and_anonymize(transformation,
anonymized_text = self.__extract_anonymizer_and_anonymize(anonymizer_dto,
text_to_anonymize)
text_builder.replace_text(anonymized_text, analyzer_result.start,
analyzer_result.end)
@ -68,11 +68,11 @@ class AnonymizerEngine:
names = [p for p in self.builtin_anonymizers.keys()]
return names
def __extract_anonymizer_and_anonymize(self, transformation, text_to_anonymize):
anonymizer = transformation.get("anonymizer")()
def __extract_anonymizer_and_anonymize(self, anonymizer_dto, text_to_anonymize):
anonymizer = anonymizer_dto.get("anonymizer")()
# if the anonymizer is not valid, an InvalidParamException is raised
anonymizer.validate(params=transformation)
anonymizer.validate(params=anonymizer_dto)
anonymized_text = anonymizer.anonymize(
params=transformation, text=text_to_anonymize
params=anonymizer_dto, text=text_to_anonymize
)
return anonymized_text

Просмотреть файл

@ -11,7 +11,7 @@ class AnalyzerResults(list):
It includes removal of unused results and sort by indices order.
Additional information about the rationale of this class:
- One PII - uses a given or default transformation to anonymize and replace the PII
- One PII - uses a given or default anonymizer to anonymize and replace the PII
text entity.
- Full overlap of PIIs - When one text has several PIIs, the PII with the higher
score will be taken.
@ -29,7 +29,7 @@ class AnalyzerResults(list):
_remove_conflicts method - removes results which impact the same text and
should be ignored.
using the logic:
- One PII - uses a given or default transformation to anonymize and
- One PII - uses a given or default anonymizer to anonymize and
replace the PII text entity.
- Full overlap of PIIs - When one text have several PIIs,
the PII with the higher score will be taken.

Просмотреть файл

@ -18,32 +18,32 @@ class AnonymizerRequest:
def __init__(self, data: dict, anonymizers):
"""Handle and validate data for the text replacement.
:param data: a map which contains the transformations, analyzer_results and text
:param data: a map which contains the anonymizers, analyzer_results and text
"""
self.anonymizers = anonymizers
self._transformations = {}
self._anonymizers = {}
self._analysis_results = AnalyzerResults()
self.__validate_and_insert_input(data)
self.default_transformation = {
self.default_anonymizer = {
"type": "replace",
"anonymizer": self.anonymizers["replace"],
}
def get_transformation(self, entity_type: str):
def get_anonymizer_dto(self, entity_type: str):
"""
Get the right transformation from the list.
Get the right anonymizer_dto from the list.
When transformation does not exist, we fall back to default.
:param analyzer_result: the result we are going to do the transformation on
:return: transformation
When anonymizer_dto does not exist, we fall back to default.
:param entity_type: the entity type of the result we are going to do the anonymization on
:return: anonymizer_dto
"""
transformation = self._transformations.get(entity_type)
if not transformation:
transformation = self._transformations.get("DEFAULT")
if not transformation:
transformation = self.default_transformation
transformation["entity_type"] = entity_type
return transformation
anonymizer_dto = self._anonymizers.get(entity_type)
if not anonymizer_dto:
anonymizer_dto = self._anonymizers.get("DEFAULT")
if not anonymizer_dto:
anonymizer_dto = self.default_anonymizer
anonymizer_dto["entity_type"] = entity_type
return anonymizer_dto
def get_text(self):
"""Get the text we are working on."""
@ -56,13 +56,13 @@ class AnonymizerRequest:
def __validate_and_insert_input(self, data: dict):
self.__handle_text(data)
self.__handle_analyzer_results(data)
self.__handle_transformations(data)
self.__handle_anonymizers(data)
def __handle_analyzer_results(self, data):
"""
Go over analyzer results, check they are valid and convert to AnalyzeResult.
:param data: contains the text, transformations and analyzer_results
:param data: contains the text, anonymizers and analyzer_results
:return: None
"""
analyzer_results = data.get("analyzer_results")
@ -77,22 +77,22 @@ class AnonymizerRequest:
analyzer_result.validate_position_in_text(text_len)
self._analysis_results.append(analyzer_result)
def __handle_transformations(self, data):
def __handle_anonymizers(self, data):
"""
Go over the transformations and get the relevant anonymizer class for it.
Go over the anonymizers and get the relevant anonymizer class for it.
Inserts the class to the transformation so the engine will use it.
:param data: contains the text, transformations and analyzer_results
Inserts the class to the anonymizer so the engine will use it.
:param data: contains the text, anonymizers and analyzer_results
:return: None
"""
transformations = data.get("transformations")
if transformations is not None:
for key, transformation in transformations.items():
self.logger.debug(f"converting {transformation} to anonymizer class")
anonymizer = self.__get_anonymizer(transformation)
self.logger.debug(f"applying class {anonymizer} to {transformation}")
transformation["anonymizer"] = anonymizer
self._transformations[key] = transformation
anonymizers = data.get("anonymizers")
if anonymizers is not None:
for key, anonymizer_dto in anonymizers.items():
self.logger.debug(f"converting {anonymizer_dto} to anonymizer class")
anonymizer = self.__get_anonymizer(anonymizer_dto)
self.logger.debug(f"applying class {anonymizer} to {anonymizer_dto}")
anonymizer_dto["anonymizer"] = anonymizer
self._anonymizers[key] = anonymizer_dto
def __handle_text(self, data):
self._text = data.get("text")
@ -100,14 +100,14 @@ class AnonymizerRequest:
self.logger.debug("invalid input, json is missing text field")
raise InvalidParamException("Invalid input, text can not be empty")
def __get_anonymizer(self, transformation):
def __get_anonymizer(self, anonymizer):
"""
Extract the anonymizer class from the anonymizers list.
:param transformation: a single transformation value
:param anonymizer: a single anonymizer value
:return: Anonymizer
"""
anonymizer_type = transformation.get("type").lower()
anonymizer_type = anonymizer.get("type").lower()
anonymizer = self.anonymizers.get(anonymizer_type)
if not anonymizer:
self.logger.error(f"No such anonymizer class {anonymizer_type}")

Просмотреть файл

@ -1,6 +1,6 @@
{
"text": "hello world, my name is Jane Doe. My number is: 034453334",
"transformations": {
"anonymizers": {
"DEFAULT": { "type": "hash", "hash_type": "md5" }
},
"analyzer_results": [

Просмотреть файл

@ -1,6 +1,6 @@
{
"text": "hello world, my name is Jane Doe. My number is: 034453334",
"transformations": {
"anonymizers": {
"DEFAULT": { "type": "hash", "hash_type": "sha256" }
},
"analyzer_results": [

Просмотреть файл

@ -1,6 +1,6 @@
{
"text": "hello world, my name is Jane Doe. My number is: 034453334",
"transformations": {
"anonymizers": {
"DEFAULT": { "type": "hash", "hash_type": "sha256" }
},
"analyzer_results": [

Просмотреть файл

@ -1,6 +1,6 @@
{
"text": "hello world, my name is Jane Doe. My number is: 034453334",
"transformations": {
"anonymizers": {
"DEFAULT": { "type": "hash", "hash_type": "sha512" }
},
"analyzer_results": [

Просмотреть файл

@ -1,6 +1,6 @@
{
"text": "hello world, my name is Jane Doe. My number is: 03-4453334",
"transformations": {
"anonymizers": {
"DEFAULT": {
"type": "mask",
"masking_char": "*",

Просмотреть файл

@ -1,6 +1,6 @@
{
"text": "hello world, my name is Jane Doe. My number is: 03-4453334",
"transformations": {
"anonymizers": {
"PHONE_NUMBER": {
"type": "mask",
"masking_char": "non_character",

Просмотреть файл

@ -1,6 +1,6 @@
{
"text": "hello world, my name is Jane Doe. My number is: 034453334",
"transformations": {
"anonymizers": {
"NAME": {
"type": "redact",
"new_value": "ANONYMIZED"

Просмотреть файл

@ -24,7 +24,7 @@ from presidio_anonymizer.entities.invalid_exception import InvalidParamException
"entity_type": "NUMBER"
}
],
"transformations": {
"anonymizers": {
"default": {
"type": "none"
}
@ -63,15 +63,15 @@ def test_given_invalid_json_then_request_creation_should_fail(
assert result_text == e.value.err_msg
def test_given_no_transformations_then_we_get_the_default():
def test_given_no_anonymizers_then_we_get_the_default():
content = get_content()
request = AnonymizerRequest(content, AnonymizerEngine().builtin_anonymizers)
request._transformations = {}
request._anonymizers = {}
analyzer_result = Mock()
analyzer_result.entity_type = "PHONE"
transformation = request.get_transformation(analyzer_result.entity_type)
assert transformation.get("type") == "replace"
assert type(transformation.get("anonymizer")) == type(Replace)
anonymizers = request.get_anonymizer_dto(analyzer_result.entity_type)
assert anonymizers.get("type") == "replace"
assert type(anonymizers.get("anonymizer")) == type(Replace)
def test_given_valid_json_then_request_creation_should_succeed():
@ -79,7 +79,7 @@ def test_given_valid_json_then_request_creation_should_succeed():
data = AnonymizerRequest(content, AnonymizerEngine().builtin_anonymizers)
assert data.get_text() == content.get("text")
assert data._text == content.get("text")
assert data._transformations == content.get("transformations")
assert data._anonymizers == content.get("anonymizers")
assert len(data._analysis_results) == len(content.get("analyzer_results"))
assert data._analysis_results == data.get_analysis_results()
for result_a in data._analysis_results:
@ -90,26 +90,50 @@ def test_given_valid_json_then_request_creation_should_succeed():
assert result_a.score == same_result_in_content.get("score")
assert result_a.start == same_result_in_content.get("start")
assert result_a.end == same_result_in_content.get("end")
assert data.get_transformation(result_a.entity_type)
assert data.get_anonymizer_dto(result_a.entity_type)
def test_given_valid_anonymizer_request_then_get_transformations_successfully():
def test_given_valid_anonymizer_request_then_get_anonymizers_successfully():
content = get_content()
data = AnonymizerRequest(content, AnonymizerEngine().builtin_anonymizers)
replace_result = data.get_analysis_results()[0]
default_replace_transformation = data.get_transformation(replace_result.entity_type)
assert default_replace_transformation.get("type") == "replace"
assert default_replace_transformation.get("new_value") == "ANONYMIZED"
assert type(default_replace_transformation.get("anonymizer")) == type(Replace)
mask_transformation = data.get_transformation(
default_replace_anonymizer = data.get_anonymizer_dto(replace_result.entity_type)
assert default_replace_anonymizer.get("type") == "replace"
assert default_replace_anonymizer.get("new_value") == "ANONYMIZED"
assert type(default_replace_anonymizer.get("anonymizer")) == type(Replace)
mask_anonymizer = data.get_anonymizer_dto(
data.get_analysis_results()[3].entity_type
)
assert mask_transformation.get("type") == "mask"
assert mask_transformation.get("from_end")
assert mask_transformation.get("chars_to_mask") == 4
assert mask_transformation.get("masking_char") == "*"
assert mask_transformation.get("anonymizer")
assert type(mask_transformation.get("anonymizer")) == type(Mask)
assert mask_anonymizer.get("type") == "mask"
assert mask_anonymizer.get("from_end")
assert mask_anonymizer.get("chars_to_mask") == 4
assert mask_anonymizer.get("masking_char") == "*"
assert mask_anonymizer.get("anonymizer")
assert type(mask_anonymizer.get("anonymizer")) == type(Mask)
@pytest.mark.parametrize(
    # fmt: off
    "original_text,start,end",
    [
        ("hello world", 5, 12),
        ("hello world", 12, 16),
    ],
    # fmt: on
)
def test_given_analyzer_result_with_an_incorrect_text_positions_then_we_fail(
        original_text, start, end):
    """Check that an analyzer result reaching past the text end is rejected.

    Each parametrized case supplies a span whose ``end`` (and possibly
    ``start``) lies beyond the length of ``original_text``; building an
    ``AnonymizerRequest`` from it is expected to raise
    ``InvalidParamException`` with a message reporting the offending
    positions and the text length.

    :param original_text: the text the analyzer result refers to
    :param start: start index of the analyzer result
    :param end: end index of the analyzer result
    """
    # Local import keeps this test self-contained within the module.
    import re

    content = {
        "text": original_text,
        "analyzer_results": [
            {"start": start, "end": end, "score": 0.8, "entity_type": "NAME"},
        ],
    }
    err_msg = (
        f"Invalid analyzer result, start: {start} and end: "
        f"{end}, while text length is only {len(original_text)}."
    )
    # pytest.raises(match=...) treats its argument as a regular expression,
    # so escape the message to match it literally (the trailing "." would
    # otherwise match any character).
    with pytest.raises(InvalidParamException, match=re.escape(err_msg)):
        AnonymizerRequest(content, AnonymizerEngine().builtin_anonymizers)
@pytest.mark.parametrize(
@ -146,7 +170,7 @@ def __find_element(content: List, entity_type: str):
def get_content():
return {
"text": "hello world, my name is Jane Doe. My number is: 034453334",
"transformations": {
"anonymizers": {
"DEFAULT": {"type": "replace", "new_value": "ANONYMIZED"},
"PHONE_NUMBER": {
"type": "mask",

Просмотреть файл

@ -4,9 +4,9 @@ from presidio_anonymizer.anonymizer_engine import AnonymizerEngine
from presidio_anonymizer.entities import AnalyzerResult
def test_given_several_transformations_then_we_use_the_correct_one():
transformation = Mock()
transformation.get = get_transformation
def test_given_several_anonymizers_then_we_use_the_correct_one():
anonymizer = Mock()
anonymizer.get = get_anonymizer_dto
mock = Mock()
mock.get_text = lambda: "Number: 0554555556"
analyzer_results = Mock()
@ -15,7 +15,7 @@ def test_given_several_transformations_then_we_use_the_correct_one():
)
analyzer_results.to_sorted_unique_results = lambda reverse: [analyzer_result]
mock.get_analysis_results = lambda: analyzer_results
mock.get_transformation = lambda result: transformation
mock.get_anonymizer_dto = lambda result: anonymizer
text = AnonymizerEngine().anonymize(mock)
assert text == "Number: I am your new text!"
@ -28,6 +28,6 @@ class Anonymizer:
pass
def get_transformation(arg):
def get_anonymizer_dto(arg):
assert arg == "anonymizer"
return Anonymizer