From b8d76c6a7e0a66fcfabcfe1dffd46f8010af2d28 Mon Sep 17 00:00:00 2001
From: Rakan <44086695+rakan41@users.noreply.github.com>
Date: Sun, 5 Sep 2021 19:58:17 +1000
Subject: [PATCH] Issue 754 (#755)

---
 docs/supported_entities.md                    |  9 ++
 e2e-tests/resources/demo.txt                  |  2 +-
 e2e-tests/tests/test_analyzer.py              |  5 +-
 .../predefined_recognizers/__init__.py        |  8 ++
 .../au_abn_recognizer.py                      | 97 ++++++++++++++++++
 .../au_acn_recognizer.py                      | 94 ++++++++++++++++++
 .../au_medicare_recognizer.py                 | 93 +++++++++++++++++
 .../au_tfn_recognizer.py                      | 99 +++++++++++++++++++
 .../recognizer_registry.py                    |  8 ++
 .../tests/test_au_abn_recognizer.py           | 50 ++++++++++
 .../tests/test_au_acn_recognizer.py           | 52 ++++++++++
 .../tests/test_au_medicare_recognizer.py      | 49 +++++++++
 .../tests/test_au_tfn_recognizer.py           | 50 ++++++++++
 .../tests/test_recognizer_registry.py         |  4 +-
 14 files changed, 615 insertions(+), 5 deletions(-)
 create mode 100644 presidio-analyzer/presidio_analyzer/predefined_recognizers/au_abn_recognizer.py
 create mode 100644 presidio-analyzer/presidio_analyzer/predefined_recognizers/au_acn_recognizer.py
 create mode 100644 presidio-analyzer/presidio_analyzer/predefined_recognizers/au_medicare_recognizer.py
 create mode 100644 presidio-analyzer/presidio_analyzer/predefined_recognizers/au_tfn_recognizer.py
 create mode 100644 presidio-analyzer/tests/test_au_abn_recognizer.py
 create mode 100644 presidio-analyzer/tests/test_au_acn_recognizer.py
 create mode 100644 presidio-analyzer/tests/test_au_medicare_recognizer.py
 create mode 100644 presidio-analyzer/tests/test_au_tfn_recognizer.py

diff --git a/docs/supported_entities.md b/docs/supported_entities.md
index 26cfc1ac..7cd64b1d 100644
--- a/docs/supported_entities.md
+++ b/docs/supported_entities.md
@@ -53,6 +53,15 @@ For more information, refer to the [adding new recognizers documentation](analyz
 |--- |--- |--- |
 |FIN/NRIC| A National Registration Identification Card | Pattern match and context |
 
+### Australia
+
+|FieldType|Description|Detection Method|
+|--- |--- |--- |
+|AU_ABN| The Australian Business Number (ABN) is a unique 11 digit identifier issued to all entities registered in the Australian Business Register (ABR). | Pattern match, context, and checksum |
+|AU_ACN| An Australian Company Number is a unique nine-digit number issued by the Australian Securities and Investments Commission to every company registered under the Commonwealth Corporations Act 2001 as an identifier. | Pattern match, context, and checksum |
+|AU_TFN| The tax file number (TFN) is a unique identifier issued by the Australian Taxation Office to each taxpaying entity | Pattern match, context, and checksum |
+|AU_MEDICARE| Medicare number is a unique identifier issued by Australian Government that enables the cardholder to receive rebates of medical expenses under Australia's Medicare system| Pattern match, context, and checksum |
+
 ## Adding a custom PII entity
 
 See [this documentation](analyzer/adding_recognizers.md) for instructions on how to add a new Recognizer for a new type of PII entity.
diff --git a/e2e-tests/resources/demo.txt b/e2e-tests/resources/demo.txt
index 25ac4b0b..60479310 100644
--- a/e2e-tests/resources/demo.txt
+++ b/e2e-tests/resources/demo.txt
@@ -5,7 +5,7 @@ My credit card number is 4095-2609-9393-4932 and my crypto wallet id is 16Yeky6G
 
 On September 18 I visited microsoft.com and sent an email to test@presidio.site, from the IP 192.168.0.1.
 
-My passport: 191280345 and my phone number: (212) 555-1234.
+My passport: 191280342 and my phone number: (212) 555-1234.
 
 This is a valid International Bank Account Number: IL150120690000003111111 . Can you please check the status on bank account 954567876544?
 
diff --git a/e2e-tests/tests/test_analyzer.py b/e2e-tests/tests/test_analyzer.py
index 47a6d55b..1fd1646b 100644
--- a/e2e-tests/tests/test_analyzer.py
+++ b/e2e-tests/tests/test_analyzer.py
@@ -234,7 +234,7 @@ def test_given_a_correct_input_for_supported_entities_then_expect_a_correct_resp
     expected_response = """
         ["PHONE_NUMBER", "US_DRIVER_LICENSE", "US_PASSPORT", "SG_NRIC_FIN", "LOCATION", "CREDIT_CARD", "CRYPTO",
         "UK_NHS", "US_SSN", "US_BANK_NUMBER", "EMAIL_ADDRESS", "DATE_TIME", "IP_ADDRESS", "PERSON", "IBAN_CODE",
-        "NRP", "US_ITIN", "DOMAIN_NAME", "MEDICAL_LICENSE"]
+        "NRP", "US_ITIN", "DOMAIN_NAME", "MEDICAL_LICENSE", "AU_ABN", "AU_ACN", "AU_TFN", "AU_MEDICARE"]
         """
     assert response_status == 200
     assert equal_json_strings(expected_response, response_content)
@@ -266,7 +266,8 @@ def test_given_an_illegal_input_for_supported_entities_then_igonre_and_proceed()
     expected_response = """
         ["PHONE_NUMBER", "US_DRIVER_LICENSE", "US_PASSPORT", "SG_NRIC_FIN", "LOCATION", "CREDIT_CARD", "CRYPTO",
         "UK_NHS", "US_SSN", "US_BANK_NUMBER", "EMAIL_ADDRESS", "DATE_TIME", "IP_ADDRESS",
-        "PERSON", "IBAN_CODE", "NRP", "US_ITIN", "DOMAIN_NAME", "MEDICAL_LICENSE"]
+        "PERSON", "IBAN_CODE", "NRP", "US_ITIN", "DOMAIN_NAME", "MEDICAL_LICENSE", "AU_ABN",
+        "AU_ACN", "AU_TFN", "AU_MEDICARE"]
         """
     assert response_status == 200
     assert equal_json_strings(expected_response, response_content)
diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py
index c8bd7f85..c63c853e 100644
--- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py
+++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py
@@ -21,6 +21,10 @@ from .us_passport_recognizer import UsPassportRecognizer
 from .us_phone_recognizer import UsPhoneRecognizer
 from .us_ssn_recognizer import UsSsnRecognizer
 from .es_nif_recognizer import EsNifRecognizer
+from .au_abn_recognizer import AuAbnRecognizer
+from .au_acn_recognizer import AuAcnRecognizer
+from .au_tfn_recognizer import AuTfnRecognizer
+from .au_medicare_recognizer import AuMedicareRecognizer
 
 NLP_RECOGNIZERS = {"spacy": SpacyRecognizer, "stanza": StanzaRecognizer}
 
@@ -49,4 +53,8 @@ __all__ = [
     "SpacyRecognizer",
     "StanzaRecognizer",
     "NLP_RECOGNIZERS",
+    "AuAbnRecognizer",
+    "AuAcnRecognizer",
+    "AuTfnRecognizer",
+    "AuMedicareRecognizer",
 ]
diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/au_abn_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/au_abn_recognizer.py
new file mode 100644
index 00000000..6d556bcd
--- /dev/null
+++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/au_abn_recognizer.py
@@ -0,0 +1,97 @@
+from typing import Optional, List, Tuple
+
+from presidio_analyzer import Pattern, PatternRecognizer
+
+
+class AuAbnRecognizer(PatternRecognizer):
+    """
+    Recognizes Australian Business Number ("ABN").
+
+    The Australian Business Number (ABN) is a unique 11
+    digit identifier issued to all entities registered in
+    the Australian Business Register (ABR).
+    The 11 digit ABN is structured as a 9 digit identifier
+    with two leading check digits.
+    The leading check digits are derived using a modulus 89 calculation.
+    This recognizer identifies ABN using regex, context words and checksum.
+    Reference: https://abr.business.gov.au/Help/AbnFormat
+
+    :param patterns: List of patterns to be used by this recognizer
+    :param context: List of context words to increase confidence in detection
+    :param supported_language: Language this recognizer supports
+    :param supported_entity: The entity this recognizer can detect
+    :param replacement_pairs: List of tuples with potential replacement values
+    for different strings to be used during pattern matching.
+    This can allow a greater variety in input, for example by removing dashes or spaces.
+    """
+
+    PATTERNS = [
+        Pattern(
+            "ABN (Medium)",
+            r"\b\d{2}\s\d{3}\s\d{3}\s\d{3}\b",
+            0.1,
+        ),
+        Pattern(
+            "ABN (Low)",
+            r"\b\d{11}\b",
+            0.01,
+        ),
+    ]
+
+    CONTEXT = [
+        "australian business number",
+        "abn",
+    ]
+
+    def __init__(
+        self,
+        patterns: Optional[List[Pattern]] = None,
+        context: Optional[List[str]] = None,
+        supported_language: str = "en",
+        supported_entity: str = "AU_ABN",
+        replacement_pairs: Optional[List[Tuple[str, str]]] = None,
+    ):
+        self.replacement_pairs = (
+            replacement_pairs if replacement_pairs else [("-", ""), (" ", "")]
+        )
+        context = context if context else self.CONTEXT
+        patterns = patterns if patterns else self.PATTERNS
+        super().__init__(
+            supported_entity=supported_entity,
+            patterns=patterns,
+            context=context,
+            supported_language=supported_language,
+        )
+
+    def validate_result(self, pattern_text: str) -> Optional[bool]:
+        """
+        Validate the pattern logic e.g., by running checksum on a detected pattern.
+
+        :param pattern_text: the text to be validated.
+        Only the part in text that was detected by the regex engine
+        :return: True if the checksum is valid, otherwise None (score is unchanged).
+        """
+        # Pre-processing: keep digits only (\s can match more than replacement_pairs)
+        text = self.__sanitize_value(pattern_text, self.replacement_pairs)
+        abn_list = [int(digit) for digit in text if digit.isdecimal()]
+
+        # Set weights based on digit position
+        weight = [10, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19]
+
+        # Checksum: decrement the first digit (0 wraps to 9), weighted sum mod 89
+        abn_list[0] = 9 if abn_list[0] == 0 else abn_list[0] - 1
+        sum_product = 0
+        for i in range(11):
+            sum_product += abn_list[i] * weight[i]
+        remainder = sum_product % 89
+        if remainder == 0:
+            result = True
+        else:
+            result = None
+        return result
+
+    @staticmethod
+    def __sanitize_value(text: str, replacement_pairs: List[Tuple[str, str]]) -> str:
+        for search_string, replacement_string in replacement_pairs:
+            text = text.replace(search_string, replacement_string)
+        return text
diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/au_acn_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/au_acn_recognizer.py
new file mode 100644
index 00000000..01fd11c5
--- /dev/null
+++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/au_acn_recognizer.py
@@ -0,0 +1,94 @@
+from typing import Optional, List, Tuple
+
+from presidio_analyzer import Pattern, PatternRecognizer
+
+
+class AuAcnRecognizer(PatternRecognizer):
+    """
+    Recognizes Australian Company Number ("ACN").
+
+    The Australian Company Number (ACN) is a nine digit number
+    with the last digit being a check digit calculated using a
+    modified modulus 10 calculation.
+    This recognizer identifies ACN using regex, context words, and checksum.
+    Reference: https://asic.gov.au/
+
+    :param patterns: List of patterns to be used by this recognizer
+    :param context: List of context words to increase confidence in detection
+    :param supported_language: Language this recognizer supports
+    :param supported_entity: The entity this recognizer can detect
+    :param replacement_pairs: List of tuples with potential replacement values
+    for different strings to be used during pattern matching.
+    This can allow a greater variety in input, for example by removing dashes or spaces.
+    """
+
+    PATTERNS = [
+        Pattern(
+            "ACN (Medium)",
+            r"\b\d{3}\s\d{3}\s\d{3}\b",
+            0.1,
+        ),
+        Pattern(
+            "ACN (Low)",
+            r"\b\d{9}\b",
+            0.01,
+        ),
+    ]
+
+    CONTEXT = [
+        "australian company number",
+        "acn",
+    ]
+
+    def __init__(
+        self,
+        patterns: Optional[List[Pattern]] = None,
+        context: Optional[List[str]] = None,
+        supported_language: str = "en",
+        supported_entity: str = "AU_ACN",
+        replacement_pairs: Optional[List[Tuple[str, str]]] = None,
+    ):
+        self.replacement_pairs = (
+            replacement_pairs if replacement_pairs else [("-", ""), (" ", "")]
+        )
+        context = context if context else self.CONTEXT
+        patterns = patterns if patterns else self.PATTERNS
+        super().__init__(
+            supported_entity=supported_entity,
+            patterns=patterns,
+            context=context,
+            supported_language=supported_language,
+        )
+
+    def validate_result(self, pattern_text: str) -> Optional[bool]:
+        """
+        Validate the pattern logic e.g., by running checksum on a detected pattern.
+
+        :param pattern_text: the text to be validated.
+        Only the part in text that was detected by the regex engine
+        :return: True if the checksum is valid, otherwise None (score is unchanged).
+        """
+        # Pre-processing: keep digits only (\s can match more than replacement_pairs)
+        text = self.__sanitize_value(pattern_text, self.replacement_pairs)
+        acn_list = [int(digit) for digit in text if digit.isdecimal()]
+
+        # Set weights based on digit position
+        weight = [8, 7, 6, 5, 4, 3, 2, 1]
+
+        # Perform checksums
+        sum_product = 0
+        for i in range(8):
+            sum_product += acn_list[i] * weight[i]
+        remainder = sum_product % 10
+        complement = (10 - remainder) % 10  # remainder 0 -> check digit 0, not 10
+        if complement == acn_list[-1]:
+            result = True
+        else:
+            result = None
+        return result
+
+    @staticmethod
+    def __sanitize_value(text: str, replacement_pairs: List[Tuple[str, str]]) -> str:
+        for search_string, replacement_string in replacement_pairs:
+            text = text.replace(search_string, replacement_string)
+        return text
diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/au_medicare_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/au_medicare_recognizer.py
new file mode 100644
index 00000000..a75c7f67
--- /dev/null
+++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/au_medicare_recognizer.py
@@ -0,0 +1,93 @@
+from typing import Optional, List, Tuple
+
+from presidio_analyzer import Pattern, PatternRecognizer
+
+
+class AuMedicareRecognizer(PatternRecognizer):
+    """
+    Recognizes Australian Medicare number using regex, context words, and checksum.
+
+    Medicare number is a unique identifier issued by Australian Government
+    that enables the cardholder to receive rebates of medical expenses
+    under Australia's Medicare system.
+    It uses a modulus 10 checksum scheme to validate the number.
+    Reference: https://en.wikipedia.org/wiki/Medicare_card_(Australia)
+
+
+    :param patterns: List of patterns to be used by this recognizer
+    :param context: List of context words to increase confidence in detection
+    :param supported_language: Language this recognizer supports
+    :param supported_entity: The entity this recognizer can detect
+    :param replacement_pairs: List of tuples with potential replacement values
+    for different strings to be used during pattern matching.
+    This can allow a greater variety in input, for example by removing dashes or spaces.
+    """
+
+    PATTERNS = [
+        Pattern(
+            "Australian Medicare Number (Medium)",
+            r"\b[2-6]\d{3}\s\d{5}\s\d\b",
+            0.1,
+        ),
+        Pattern(
+            "Australian Medicare Number (Low)",
+            r"\b[2-6]\d{9}\b",
+            0.01,
+        ),
+    ]
+
+    CONTEXT = [
+        "medicare",
+    ]
+
+    def __init__(
+        self,
+        patterns: Optional[List[Pattern]] = None,
+        context: Optional[List[str]] = None,
+        supported_language: str = "en",
+        supported_entity: str = "AU_MEDICARE",
+        replacement_pairs: Optional[List[Tuple[str, str]]] = None,
+    ):
+        self.replacement_pairs = (
+            replacement_pairs if replacement_pairs else [("-", ""), (" ", "")]
+        )
+        context = context if context else self.CONTEXT
+        patterns = patterns if patterns else self.PATTERNS
+        super().__init__(
+            supported_entity=supported_entity,
+            patterns=patterns,
+            context=context,
+            supported_language=supported_language,
+        )
+
+    def validate_result(self, pattern_text: str) -> Optional[bool]:
+        """
+        Validate the pattern logic e.g., by running checksum on a detected pattern.
+
+        :param pattern_text: the text to be validated.
+        Only the part in text that was detected by the regex engine
+        :return: True if the checksum is valid, otherwise None (score is unchanged).
+        """
+        # Pre-processing: keep digits only (\s can match more than replacement_pairs)
+        text = self.__sanitize_value(pattern_text, self.replacement_pairs)
+        medicare_list = [int(digit) for digit in text if digit.isdecimal()]
+
+        # Set weights based on digit position
+        weight = [1, 3, 7, 9, 1, 3, 7, 9]
+
+        # Checksum: weighted sum of the first 8 digits mod 10 must equal the 9th digit
+        sum_product = 0
+        for i in range(8):
+            sum_product += medicare_list[i] * weight[i]
+        remainder = sum_product % 10
+        if remainder == medicare_list[8]:
+            result = True
+        else:
+            result = None
+        return result
+
+    @staticmethod
+    def __sanitize_value(text: str, replacement_pairs: List[Tuple[str, str]]) -> str:
+        for search_string, replacement_string in replacement_pairs:
+            text = text.replace(search_string, replacement_string)
+        return text
diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/au_tfn_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/au_tfn_recognizer.py
new file mode 100644
index 00000000..de50f067
--- /dev/null
+++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/au_tfn_recognizer.py
@@ -0,0 +1,99 @@
+from typing import Optional, List, Tuple
+
+from presidio_analyzer import Pattern, PatternRecognizer
+
+
+class AuTfnRecognizer(PatternRecognizer):
+    """
+    Recognizes Australian Tax File Numbers ("TFN").
+
+    The tax file number (TFN) is a unique identifier
+    issued by the Australian Taxation Office
+    to each taxpaying entity — an individual, company,
+    superannuation fund, partnership, or trust.
+    The TFN consists of a nine digit number, usually
+    presented in the format NNN NNN NNN.
+    TFN includes a check digit for detecting erroneous
+    number based on simple modulo 11.
+    This recognizer uses regex, context words,
+    and checksum to identify TFN.
+    Reference: https://www.ato.gov.au/individuals/tax-file-number/
+
+    :param patterns: List of patterns to be used by this recognizer
+    :param context: List of context words to increase confidence in detection
+    :param supported_language: Language this recognizer supports
+    :param supported_entity: The entity this recognizer can detect
+    :param replacement_pairs: List of tuples with potential replacement values
+    for different strings to be used during pattern matching.
+    This can allow a greater variety in input, for example by removing dashes or spaces.
+    """
+
+    PATTERNS = [
+        Pattern(
+            "TFN (Medium)",
+            r"\b\d{3}\s\d{3}\s\d{3}\b",
+            0.1,
+        ),
+        Pattern(
+            "TFN (Low)",
+            r"\b\d{9}\b",
+            0.01,
+        ),
+    ]
+
+    CONTEXT = [
+        "tax file number",
+        "tfn",
+    ]
+
+    def __init__(
+        self,
+        patterns: Optional[List[Pattern]] = None,
+        context: Optional[List[str]] = None,
+        supported_language: str = "en",
+        supported_entity: str = "AU_TFN",
+        replacement_pairs: Optional[List[Tuple[str, str]]] = None,
+    ):
+        self.replacement_pairs = (
+            replacement_pairs if replacement_pairs else [("-", ""), (" ", "")]
+        )
+        context = context if context else self.CONTEXT
+        patterns = patterns if patterns else self.PATTERNS
+        super().__init__(
+            supported_entity=supported_entity,
+            patterns=patterns,
+            context=context,
+            supported_language=supported_language,
+        )
+
+    def validate_result(self, pattern_text: str) -> Optional[bool]:
+        """
+        Validate the pattern logic e.g., by running checksum on a detected pattern.
+
+        :param pattern_text: the text to be validated.
+        Only the part in text that was detected by the regex engine
+        :return: True if the checksum is valid, otherwise None (score is unchanged).
+        """
+        # Pre-processing: keep digits only (\s can match more than replacement_pairs)
+        text = self.__sanitize_value(pattern_text, self.replacement_pairs)
+        tfn_list = [int(digit) for digit in text if digit.isdecimal()]
+
+        # Set weights based on digit position
+        weight = [1, 4, 3, 7, 5, 8, 6, 9, 10]
+
+        # Checksum: weighted sum of all 9 digits must be divisible by 11
+        sum_product = 0
+        for i in range(9):
+            sum_product += tfn_list[i] * weight[i]
+        remainder = sum_product % 11
+        if remainder == 0:
+            result = True
+        else:
+            result = None
+        return result
+
+    @staticmethod
+    def __sanitize_value(text: str, replacement_pairs: List[Tuple[str, str]]) -> str:
+        for search_string, replacement_string in replacement_pairs:
+            text = text.replace(search_string, replacement_string)
+        return text
diff --git a/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py b/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py
index 79797abd..3ea9447b 100644
--- a/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py
+++ b/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py
@@ -24,6 +24,10 @@ from presidio_analyzer.predefined_recognizers import (
     SpacyRecognizer,
     EsNifRecognizer,
     StanzaRecognizer,
+    AuAbnRecognizer,
+    AuAcnRecognizer,
+    AuTfnRecognizer,
+    AuMedicareRecognizer,
 )
 
 logger = logging.getLogger("presidio-analyzer")
@@ -68,6 +72,10 @@ class RecognizerRegistry:
                 UsSsnRecognizer,
                 NhsRecognizer,
                 SgFinRecognizer,
+                AuAbnRecognizer,
+                AuAcnRecognizer,
+                AuTfnRecognizer,
+                AuMedicareRecognizer,
             ],
             "es": [EsNifRecognizer],
             "ALL": [
diff --git a/presidio-analyzer/tests/test_au_abn_recognizer.py b/presidio-analyzer/tests/test_au_abn_recognizer.py
new file mode 100644
index 00000000..925c5ffd
--- /dev/null
+++ b/presidio-analyzer/tests/test_au_abn_recognizer.py
@@ -0,0 +1,50 @@
+import pytest
+
+from tests import assert_result_within_score_range
+from presidio_analyzer.predefined_recognizers import AuAbnRecognizer
+
+
+@pytest.fixture(scope="module")
+def recognizer():
+    return AuAbnRecognizer()
+
+
+@pytest.fixture(scope="module")
+def entities():
+    return ["AU_ABN"]
+
+
+@pytest.mark.parametrize(
+    "text, expected_len, expected_positions, expected_score_ranges",
+    [
+        # Valid formatting and valid ABNs
+        ("51 824 753 556", 1, ((0, 14),), ((1.0, 1.0),), ),
+        ("51824753556", 1, ((0, 11),), ((1.0, 1.0),), ),
+        # Valid formatting but invalid ABNs
+        ("52 824 753 556", 1, ((0, 14),), ((0.01, 0.1),),),
+        ("52824753556", 1, ((0, 11),), ((0.01, 0.1),),),
+        # Invalid formatting and ABNs.
+        ("5282475355632", 0, (), (),),
+        ("52824753556AF", 0, (), (),),
+        ("51 824 753 5564", 0, (), (),),
+    ],
+)
+def test_when_all_abns_then_succeed(
+    text,
+    expected_len,
+    expected_positions,
+    expected_score_ranges,
+    recognizer,
+    entities,
+    max_score,
+):
+    results = recognizer.analyze(text, entities)
+    assert len(results) == expected_len
+    for res, (st_pos, fn_pos), (st_score, fn_score) in zip(
+        results, expected_positions, expected_score_ranges
+    ):
+        if fn_score == "max":
+            fn_score = max_score
+        assert_result_within_score_range(
+            res, entities[0], st_pos, fn_pos, st_score, fn_score
+        )
diff --git a/presidio-analyzer/tests/test_au_acn_recognizer.py b/presidio-analyzer/tests/test_au_acn_recognizer.py
new file mode 100644
index 00000000..453109cf
--- /dev/null
+++ b/presidio-analyzer/tests/test_au_acn_recognizer.py
@@ -0,0 +1,52 @@
+import pytest
+
+from tests import assert_result_within_score_range
+from presidio_analyzer.predefined_recognizers import AuAcnRecognizer
+
+
+@pytest.fixture(scope="module")
+def recognizer():
+    return AuAcnRecognizer()
+
+
+@pytest.fixture(scope="module")
+def entities():
+    return ["AU_ACN"]
+
+
+@pytest.mark.parametrize(
+    "text, expected_len, expected_positions, expected_score_ranges",
+    [
+        # Valid formatting and valid ACNs
+        ("000 000 019", 1, ((0, 11),), ((1.0, 1.0),), ),
+        ("005 499 981", 1, ((0, 11),), ((1.0, 1.0),), ),
+        ("006249976", 1, ((0, 9),), ((1.0, 1.0),), ),
+        ("000 250 000", 1, ((0, 11),), ((1.0, 1.0),), ),
+        # Valid formatting but invalid ACNs
+        ("824 753 557", 1, ((0, 11),), ((0.01, 0.1),),),
+        ("824753557", 1, ((0, 9),), ((0.01, 0.1),),),
+        # Invalid formatting and ACNs.
+        ("5282475355632", 0, (), (),),
+        ("52824753556AF", 0, (), (),),
+        ("51 824 753 5564", 0, (), (),),
+    ],
+)
+def test_when_all_acns_then_succeed(
+    text,
+    expected_len,
+    expected_positions,
+    expected_score_ranges,
+    recognizer,
+    entities,
+    max_score,
+):
+    results = recognizer.analyze(text, entities)
+    assert len(results) == expected_len
+    for res, (st_pos, fn_pos), (st_score, fn_score) in zip(
+        results, expected_positions, expected_score_ranges
+    ):
+        if fn_score == "max":
+            fn_score = max_score
+        assert_result_within_score_range(
+            res, entities[0], st_pos, fn_pos, st_score, fn_score
+        )
diff --git a/presidio-analyzer/tests/test_au_medicare_recognizer.py b/presidio-analyzer/tests/test_au_medicare_recognizer.py
new file mode 100644
index 00000000..63d8fb8c
--- /dev/null
+++ b/presidio-analyzer/tests/test_au_medicare_recognizer.py
@@ -0,0 +1,49 @@
+import pytest
+
+from tests import assert_result_within_score_range
+from presidio_analyzer.predefined_recognizers import AuMedicareRecognizer
+
+
+@pytest.fixture(scope="module")
+def recognizer():
+    return AuMedicareRecognizer()
+
+
+@pytest.fixture(scope="module")
+def entities():
+    return ["AU_MEDICARE"]
+
+
+@pytest.mark.parametrize(
+    "text, expected_len, expected_positions, expected_score_ranges",
+    [
+        # Valid formatting and valid Medicare number.
+        ("2123 45670 1", 1, ((0, 12),), ((1.0, 1.0),), ),
+        ("2123456701", 1, ((0, 10),), ((1.0, 1.0),), ),
+        # Valid formatting but invalid Medicare number.
+        ("2123 25870 1", 1, ((0, 12),), ((0.01, 0.1),),),
+        ("2123258701", 1, ((0, 10),), ((0.01, 0.1),),),
+        # Invalid formatting and Medicare number.
+        ("212345670221", 0, (), (),),
+        ("2123456702AF", 0, (), (),),
+    ],
+)
+def test_when_all_medicares_then_succeed(
+    text,
+    expected_len,
+    expected_positions,
+    expected_score_ranges,
+    recognizer,
+    entities,
+    max_score,
+):
+    results = recognizer.analyze(text, entities)
+    assert len(results) == expected_len
+    for res, (st_pos, fn_pos), (st_score, fn_score) in zip(
+        results, expected_positions, expected_score_ranges
+    ):
+        if fn_score == "max":
+            fn_score = max_score
+        assert_result_within_score_range(
+            res, entities[0], st_pos, fn_pos, st_score, fn_score
+        )
diff --git a/presidio-analyzer/tests/test_au_tfn_recognizer.py b/presidio-analyzer/tests/test_au_tfn_recognizer.py
new file mode 100644
index 00000000..a96b475f
--- /dev/null
+++ b/presidio-analyzer/tests/test_au_tfn_recognizer.py
@@ -0,0 +1,50 @@
+import pytest
+
+from tests import assert_result_within_score_range
+from presidio_analyzer.predefined_recognizers import AuTfnRecognizer
+
+
+@pytest.fixture(scope="module")
+def recognizer():
+    return AuTfnRecognizer()
+
+
+@pytest.fixture(scope="module")
+def entities():
+    return ["AU_TFN"]
+
+
+@pytest.mark.parametrize(
+    "text, expected_len, expected_positions, expected_score_ranges",
+    [
+        # Valid formatting and valid TFNs
+        ("876 543 210", 1, ((0, 11),), ((1.0, 1.0),), ),
+        ("876543210", 1, ((0, 9),), ((1.0, 1.0),), ),
+        # Valid formatting but invalid TFNs
+        ("824 753 557", 1, ((0, 11),), ((0.01, 0.1),),),
+        ("824753557", 1, ((0, 9),), ((0.01, 0.1),),),
+        # Invalid formatting and TFNs.
+        ("5282475355632", 0, (), (),),
+        ("52824753556AF", 0, (), (),),
+        ("51 824 753 5564", 0, (), (),),
+    ],
+)
+def test_when_all_tfns_then_succeed(
+    text,
+    expected_len,
+    expected_positions,
+    expected_score_ranges,
+    recognizer,
+    entities,
+    max_score,
+):
+    results = recognizer.analyze(text, entities)
+    assert len(results) == expected_len
+    for res, (st_pos, fn_pos), (st_score, fn_score) in zip(
+        results, expected_positions, expected_score_ranges
+    ):
+        if fn_score == "max":
+            fn_score = max_score
+        assert_result_within_score_range(
+            res, entities[0], st_pos, fn_pos, st_score, fn_score
+        )
diff --git a/presidio-analyzer/tests/test_recognizer_registry.py b/presidio-analyzer/tests/test_recognizer_registry.py
index e8b1f20a..3e271588 100644
--- a/presidio-analyzer/tests/test_recognizer_registry.py
+++ b/presidio-analyzer/tests/test_recognizer_registry.py
@@ -52,8 +52,8 @@ def test_when_get_recognizers_then_all_recognizers_returned(mock_recognizer_regi
     registry = mock_recognizer_registry
     registry.load_predefined_recognizers()
     recognizers = registry.get_recognizers(language="en", all_fields=True)
-    # 1 custom recognizer in english + 17 predefined
-    assert len(recognizers) == 1 + 17
+    # 1 custom recognizer in english + 21 predefined
+    assert len(recognizers) == 1 + 21
 
 
 def test_when_get_recognizers_then_return_all_fields(mock_recognizer_registry):