зеркало из https://github.com/microsoft/presidio.git
Issue 754 (#755)
This commit is contained in:
Родитель
f76dad3961
Коммит
b8d76c6a7e
|
@ -53,6 +53,15 @@ For more information, refer to the [adding new recognizers documentation](analyz
|
|||
|--- |--- |--- |
|
||||
|FIN/NRIC| A National Registration Identification Card | Pattern match and context |
|
||||
|
||||
### Australia
|
||||
|
||||
|FieldType|Description|Detection Method|
|
||||
|--- |--- |--- |
|
||||
|AU_ABN| The Australian Business Number (ABN) is a unique 11 digit identifier issued to all entities registered in the Australian Business Register (ABR). | Pattern match, context, and checksum |
|
||||
|AU_ACN| An Australian Company Number is a unique nine-digit number issued by the Australian Securities and Investments Commission to every company registered under the Commonwealth Corporations Act 2001 as an identifier. | Pattern match, context, and checksum |
|
||||
|AU_TFN| The tax file number (TFN) is a unique identifier issued by the Australian Taxation Office to each taxpaying entity | Pattern match, context, and checksum |
|
||||
|AU_MEDICARE| A Medicare number is a unique identifier issued by the Australian Government that enables the cardholder to receive rebates of medical expenses under Australia's Medicare system. | Pattern match, context, and checksum |
|
||||
|
||||
## Adding a custom PII entity
|
||||
|
||||
See [this documentation](analyzer/adding_recognizers.md) for instructions on how to add a new Recognizer for a new type of PII entity.
|
||||
|
|
|
@ -5,7 +5,7 @@ My credit card number is 4095-2609-9393-4932 and my crypto wallet id is 16Yeky6G
|
|||
|
||||
On September 18 I visited microsoft.com and sent an email to test@presidio.site, from the IP 192.168.0.1.
|
||||
|
||||
My passport: 191280345 and my phone number: (212) 555-1234.
|
||||
My passport: 191280342 and my phone number: (212) 555-1234.
|
||||
|
||||
This is a valid International Bank Account Number: IL150120690000003111111 . Can you please check the status on bank account 954567876544?
|
||||
|
||||
|
|
|
@ -234,7 +234,7 @@ def test_given_a_correct_input_for_supported_entities_then_expect_a_correct_resp
|
|||
expected_response = """
|
||||
["PHONE_NUMBER", "US_DRIVER_LICENSE", "US_PASSPORT", "SG_NRIC_FIN", "LOCATION", "CREDIT_CARD", "CRYPTO",
|
||||
"UK_NHS", "US_SSN", "US_BANK_NUMBER", "EMAIL_ADDRESS", "DATE_TIME", "IP_ADDRESS", "PERSON", "IBAN_CODE",
|
||||
"NRP", "US_ITIN", "DOMAIN_NAME", "MEDICAL_LICENSE"]
|
||||
"NRP", "US_ITIN", "DOMAIN_NAME", "MEDICAL_LICENSE", "AU_ABN", "AU_ACN", "AU_TFN", "AU_MEDICARE"]
|
||||
"""
|
||||
assert response_status == 200
|
||||
assert equal_json_strings(expected_response, response_content)
|
||||
|
@ -266,7 +266,8 @@ def test_given_an_illegal_input_for_supported_entities_then_igonre_and_proceed()
|
|||
expected_response = """
|
||||
["PHONE_NUMBER", "US_DRIVER_LICENSE", "US_PASSPORT", "SG_NRIC_FIN", "LOCATION", "CREDIT_CARD",
|
||||
"CRYPTO", "UK_NHS", "US_SSN", "US_BANK_NUMBER", "EMAIL_ADDRESS", "DATE_TIME", "IP_ADDRESS",
|
||||
"PERSON", "IBAN_CODE", "NRP", "US_ITIN", "DOMAIN_NAME", "MEDICAL_LICENSE"]
|
||||
"PERSON", "IBAN_CODE", "NRP", "US_ITIN", "DOMAIN_NAME", "MEDICAL_LICENSE", "AU_ABN",
|
||||
"AU_ACN", "AU_TFN", "AU_MEDICARE"]
|
||||
"""
|
||||
assert response_status == 200
|
||||
assert equal_json_strings(expected_response, response_content)
|
||||
|
|
|
@ -21,6 +21,10 @@ from .us_passport_recognizer import UsPassportRecognizer
|
|||
from .us_phone_recognizer import UsPhoneRecognizer
|
||||
from .us_ssn_recognizer import UsSsnRecognizer
|
||||
from .es_nif_recognizer import EsNifRecognizer
|
||||
from .au_abn_recognizer import AuAbnRecognizer
|
||||
from .au_acn_recognizer import AuAcnRecognizer
|
||||
from .au_tfn_recognizer import AuTfnRecognizer
|
||||
from .au_medicare_recognizer import AuMedicareRecognizer
|
||||
|
||||
NLP_RECOGNIZERS = {"spacy": SpacyRecognizer, "stanza": StanzaRecognizer}
|
||||
|
||||
|
@ -49,4 +53,8 @@ __all__ = [
|
|||
"SpacyRecognizer",
|
||||
"StanzaRecognizer",
|
||||
"NLP_RECOGNIZERS",
|
||||
"AuAbnRecognizer",
|
||||
"AuAcnRecognizer",
|
||||
"AuTfnRecognizer",
|
||||
"AuMedicareRecognizer",
|
||||
]
|
||||
|
|
|
@ -0,0 +1,97 @@
|
|||
from typing import Optional, List, Tuple
|
||||
|
||||
from presidio_analyzer import Pattern, PatternRecognizer
|
||||
|
||||
|
||||
class AuAbnRecognizer(PatternRecognizer):
    """
    Recognizes Australian Business Number ("ABN").

    The Australian Business Number (ABN) is a unique 11
    digit identifier issued to all entities registered in
    the Australian Business Register (ABR).
    The 11 digit ABN is structured as a 9 digit identifier
    with two leading check digits.
    The leading check digits are derived using a modulus 89 calculation.
    This recognizer identifies ABN using regex, context words and checksum.
    Reference: https://abr.business.gov.au/Help/AbnFormat

    :param patterns: List of patterns to be used by this recognizer
    :param context: List of context words to increase confidence in detection
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    :param replacement_pairs: List of tuples with potential replacement values
    for different strings to be used during pattern matching.
    This can allow a greater variety in input, for example by removing dashes or spaces.
    """

    PATTERNS = [
        Pattern(
            "ABN (Medium)",
            r"\b\d{2}\s\d{3}\s\d{3}\s\d{3}\b",
            0.1,
        ),
        Pattern(
            "ABN (Low)",
            r"\b\d{11}\b",
            0.01,
        ),
    ]

    CONTEXT = [
        "australian business number",
        "abn",
    ]

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "en",
        supported_entity: str = "AU_ABN",
        replacement_pairs: Optional[List[Tuple[str, str]]] = None,
    ):
        # Falsy arguments (None or empty) fall back to the class defaults.
        self.replacement_pairs = (
            replacement_pairs if replacement_pairs else [("-", ""), (" ", "")]
        )
        context = context if context else self.CONTEXT
        patterns = patterns if patterns else self.PATTERNS
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns,
            context=context,
            supported_language=supported_language,
        )

    def validate_result(self, pattern_text: str) -> Optional[bool]:
        """
        Validate the pattern logic e.g., by running checksum on a detected pattern.

        :param pattern_text: the text to validate.
        Only the part in text that was detected by the regex engine
        :return: True when the modulus 89 checksum holds, otherwise None
        (also None when the text cannot be read as at least 11 digits).
        """
        # Pre-processing before validation checks
        text = self.__sanitize_value(pattern_text, self.replacement_pairs)
        # The patterns separate groups with `\s`, which also matches tabs and
        # newlines; the replacement pairs only strip " " and "-". Drop any
        # remaining whitespace so int() below cannot raise on it.
        text = "".join(text.split())
        if len(text) < 11 or not text.isdecimal():
            # Not checkable: leave the match unvalidated instead of crashing.
            return None
        abn_list = [int(digit) for digit in text]

        # Set weights based on digit position
        weight = [10, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19]

        # Perform checksums: subtract 1 from the first digit, then the
        # weighted sum must be divisible by 89.
        # NOTE(review): a leading 0 is mapped to 9 rather than -1 - confirm
        # this is the intended handling.
        abn_list[0] = 9 if abn_list[0] == 0 else abn_list[0] - 1
        sum_product = sum(digit * w for digit, w in zip(abn_list, weight))

        # A failed checksum yields None, not False - presumably so the base
        # class keeps the pattern score; confirm against PatternRecognizer.
        return True if sum_product % 89 == 0 else None

    @staticmethod
    def __sanitize_value(text: str, replacement_pairs: List[Tuple[str, str]]) -> str:
        """Apply each (search, replacement) pair to ``text`` in order."""
        for search_string, replacement_string in replacement_pairs:
            text = text.replace(search_string, replacement_string)
        return text
|
|
@ -0,0 +1,94 @@
|
|||
from typing import Optional, List, Tuple
|
||||
|
||||
from presidio_analyzer import Pattern, PatternRecognizer
|
||||
|
||||
|
||||
class AuAcnRecognizer(PatternRecognizer):
    """
    Recognizes Australian Company Number ("ACN").

    The Australian Company Number (ACN) is a nine digit number
    with the last digit being a check digit calculated using a
    modified modulus 10 calculation.
    This recognizer identifies ACN using regex, context words, and checksum.
    Reference: https://asic.gov.au/

    :param patterns: List of patterns to be used by this recognizer
    :param context: List of context words to increase confidence in detection
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    :param replacement_pairs: List of tuples with potential replacement values
    for different strings to be used during pattern matching.
    This can allow a greater variety in input, for example by removing dashes or spaces.
    """

    PATTERNS = [
        Pattern(
            "ACN (Medium)",
            r"\b\d{3}\s\d{3}\s\d{3}\b",
            0.1,
        ),
        Pattern(
            "ACN (Low)",
            r"\b\d{9}\b",
            0.01,
        ),
    ]

    CONTEXT = [
        "australian company number",
        "acn",
    ]

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "en",
        supported_entity: str = "AU_ACN",
        replacement_pairs: Optional[List[Tuple[str, str]]] = None,
    ):
        # Falsy arguments (None or empty) fall back to the class defaults.
        self.replacement_pairs = (
            replacement_pairs if replacement_pairs else [("-", ""), (" ", "")]
        )
        context = context if context else self.CONTEXT
        patterns = patterns if patterns else self.PATTERNS
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns,
            context=context,
            supported_language=supported_language,
        )

    def validate_result(self, pattern_text: str) -> Optional[bool]:
        """
        Validate the pattern logic e.g., by running checksum on a detected pattern.

        :param pattern_text: the text to validate.
        Only the part in text that was detected by the regex engine
        :return: True when the check digit matches, otherwise None
        (also None when the text cannot be read as at least 9 digits).
        """
        # Pre-processing before validation checks
        text = self.__sanitize_value(pattern_text, self.replacement_pairs)
        # The patterns separate groups with `\s`, which also matches tabs and
        # newlines; the replacement pairs only strip " " and "-". Drop any
        # remaining whitespace so int() below cannot raise on it.
        text = "".join(text.split())
        if len(text) < 9 or not text.isdecimal():
            # Not checkable: leave the match unvalidated instead of crashing.
            return None
        acn_list = [int(digit) for digit in text]

        # Set weights based on digit position
        weight = [8, 7, 6, 5, 4, 3, 2, 1]

        # Perform checksums over the first eight digits
        sum_product = sum(digit * w for digit, w in zip(acn_list, weight))
        remainder = sum_product % 10
        # A complement of 10 (remainder 0) wraps to check digit 0; without
        # the modulo such numbers could never validate.
        complement = (10 - remainder) % 10

        # A failed checksum yields None, not False - presumably so the base
        # class keeps the pattern score; confirm against PatternRecognizer.
        return True if complement == acn_list[-1] else None

    @staticmethod
    def __sanitize_value(text: str, replacement_pairs: List[Tuple[str, str]]) -> str:
        """Apply each (search, replacement) pair to ``text`` in order."""
        for search_string, replacement_string in replacement_pairs:
            text = text.replace(search_string, replacement_string)
        return text
|
|
@ -0,0 +1,93 @@
|
|||
from typing import Optional, List, Tuple
|
||||
|
||||
from presidio_analyzer import Pattern, PatternRecognizer
|
||||
|
||||
|
||||
class AuMedicareRecognizer(PatternRecognizer):
    """
    Recognizes Australian Medicare number using regex, context words, and checksum.

    A Medicare number is a unique identifier issued by the Australian Government
    that enables the cardholder to receive rebates of medical expenses
    under Australia's Medicare system.
    It uses a modulus 10 checksum scheme to validate the number.
    Reference: https://en.wikipedia.org/wiki/Medicare_card_(Australia)


    :param patterns: List of patterns to be used by this recognizer
    :param context: List of context words to increase confidence in detection
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    :param replacement_pairs: List of tuples with potential replacement values
    for different strings to be used during pattern matching.
    This can allow a greater variety in input, for example by removing dashes or spaces.
    """

    PATTERNS = [
        Pattern(
            "Australian Medicare Number (Medium)",
            r"\b[2-6]\d{3}\s\d{5}\s\d\b",
            0.1,
        ),
        Pattern(
            "Australian Medicare Number (Low)",
            r"\b[2-6]\d{9}\b",
            0.01,
        ),
    ]

    CONTEXT = [
        "medicare",
    ]

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "en",
        supported_entity: str = "AU_MEDICARE",
        replacement_pairs: Optional[List[Tuple[str, str]]] = None,
    ):
        # Falsy arguments (None or empty) fall back to the class defaults.
        self.replacement_pairs = (
            replacement_pairs if replacement_pairs else [("-", ""), (" ", "")]
        )
        context = context if context else self.CONTEXT
        patterns = patterns if patterns else self.PATTERNS
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns,
            context=context,
            supported_language=supported_language,
        )

    def validate_result(self, pattern_text: str) -> Optional[bool]:
        """
        Validate the pattern logic e.g., by running checksum on a detected pattern.

        :param pattern_text: the text to validate.
        Only the part in text that was detected by the regex engine
        :return: True when the ninth digit equals the weighted sum of the
        first eight digits modulo 10, otherwise None (also None when the
        text cannot be read as at least 9 digits).
        """
        # Pre-processing before validation checks
        text = self.__sanitize_value(pattern_text, self.replacement_pairs)
        # The patterns separate groups with `\s`, which also matches tabs and
        # newlines; the replacement pairs only strip " " and "-". Drop any
        # remaining whitespace so int() below cannot raise on it.
        text = "".join(text.split())
        if len(text) < 9 or not text.isdecimal():
            # Not checkable: leave the match unvalidated instead of crashing.
            return None
        medicare_list = [int(digit) for digit in text]

        # Set weights based on digit position
        weight = [1, 3, 7, 9, 1, 3, 7, 9]

        # Perform checksums over the first eight digits; the ninth digit
        # (index 8) is compared against the result.
        sum_product = sum(digit * w for digit, w in zip(medicare_list, weight))
        remainder = sum_product % 10

        # A failed checksum yields None, not False - presumably so the base
        # class keeps the pattern score; confirm against PatternRecognizer.
        return True if remainder == medicare_list[8] else None

    @staticmethod
    def __sanitize_value(text: str, replacement_pairs: List[Tuple[str, str]]) -> str:
        """Apply each (search, replacement) pair to ``text`` in order."""
        for search_string, replacement_string in replacement_pairs:
            text = text.replace(search_string, replacement_string)
        return text
|
|
@ -0,0 +1,99 @@
|
|||
from typing import Optional, List, Tuple
|
||||
|
||||
from presidio_analyzer import Pattern, PatternRecognizer
|
||||
|
||||
|
||||
class AuTfnRecognizer(PatternRecognizer):
    """
    Recognizes Australian Tax File Numbers ("TFN").

    The tax file number (TFN) is a unique identifier
    issued by the Australian Taxation Office
    to each taxpaying entity — an individual, company,
    superannuation fund, partnership, or trust.
    The TFN consists of a nine digit number, usually
    presented in the format NNN NNN NNN.
    TFN includes a check digit for detecting erroneous
    number based on simple modulo 11.
    This recognizer uses regex, context words,
    and checksum to identify TFN.
    Reference: https://www.ato.gov.au/individuals/tax-file-number/

    :param patterns: List of patterns to be used by this recognizer
    :param context: List of context words to increase confidence in detection
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    :param replacement_pairs: List of tuples with potential replacement values
    for different strings to be used during pattern matching.
    This can allow a greater variety in input, for example by removing dashes or spaces.
    """

    PATTERNS = [
        Pattern(
            "TFN (Medium)",
            r"\b\d{3}\s\d{3}\s\d{3}\b",
            0.1,
        ),
        Pattern(
            "TFN (Low)",
            r"\b\d{9}\b",
            0.01,
        ),
    ]

    CONTEXT = [
        "tax file number",
        "tfn",
    ]

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "en",
        supported_entity: str = "AU_TFN",
        replacement_pairs: Optional[List[Tuple[str, str]]] = None,
    ):
        # Falsy arguments (None or empty) fall back to the class defaults.
        self.replacement_pairs = (
            replacement_pairs if replacement_pairs else [("-", ""), (" ", "")]
        )
        context = context if context else self.CONTEXT
        patterns = patterns if patterns else self.PATTERNS
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns,
            context=context,
            supported_language=supported_language,
        )

    def validate_result(self, pattern_text: str) -> Optional[bool]:
        """
        Validate the pattern logic e.g., by running checksum on a detected pattern.

        :param pattern_text: the text to validate.
        Only the part in text that was detected by the regex engine
        :return: True when the weighted digit sum is divisible by 11,
        otherwise None (also None when the text cannot be read as at
        least 9 digits).
        """
        # Pre-processing before validation checks
        text = self.__sanitize_value(pattern_text, self.replacement_pairs)
        # The patterns separate groups with `\s`, which also matches tabs and
        # newlines; the replacement pairs only strip " " and "-". Drop any
        # remaining whitespace so int() below cannot raise on it.
        text = "".join(text.split())
        if len(text) < 9 or not text.isdecimal():
            # Not checkable: leave the match unvalidated instead of crashing.
            return None
        tfn_list = [int(digit) for digit in text]

        # Set weights based on digit position
        weight = [1, 4, 3, 7, 5, 8, 6, 9, 10]

        # Perform checksums over the first nine digits
        sum_product = sum(digit * w for digit, w in zip(tfn_list, weight))

        # A failed checksum yields None, not False - presumably so the base
        # class keeps the pattern score; confirm against PatternRecognizer.
        return True if sum_product % 11 == 0 else None

    @staticmethod
    def __sanitize_value(text: str, replacement_pairs: List[Tuple[str, str]]) -> str:
        """Apply each (search, replacement) pair to ``text`` in order."""
        for search_string, replacement_string in replacement_pairs:
            text = text.replace(search_string, replacement_string)
        return text
|
|
@ -24,6 +24,10 @@ from presidio_analyzer.predefined_recognizers import (
|
|||
SpacyRecognizer,
|
||||
EsNifRecognizer,
|
||||
StanzaRecognizer,
|
||||
AuAbnRecognizer,
|
||||
AuAcnRecognizer,
|
||||
AuTfnRecognizer,
|
||||
AuMedicareRecognizer,
|
||||
)
|
||||
|
||||
logger = logging.getLogger("presidio-analyzer")
|
||||
|
@ -68,6 +72,10 @@ class RecognizerRegistry:
|
|||
UsSsnRecognizer,
|
||||
NhsRecognizer,
|
||||
SgFinRecognizer,
|
||||
AuAbnRecognizer,
|
||||
AuAcnRecognizer,
|
||||
AuTfnRecognizer,
|
||||
AuMedicareRecognizer,
|
||||
],
|
||||
"es": [EsNifRecognizer],
|
||||
"ALL": [
|
||||
|
|
|
@ -0,0 +1,50 @@
|
|||
import pytest
|
||||
|
||||
from tests import assert_result_within_score_range
|
||||
from presidio_analyzer.predefined_recognizers import AuAbnRecognizer
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def recognizer():
    """Build the AuAbnRecognizer instance used by the tests in this module."""
    abn_recognizer = AuAbnRecognizer()
    return abn_recognizer
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def entities():
    """Return the entity names passed to the recognizer's analyze call."""
    requested = ["AU_ABN"]
    return requested
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "text, expected_len, expected_positions, expected_score_ranges",
    [
        # Valid formatting and valid ABNs
        ("51 824 753 556", 1, ((0, 14),), ((1.0, 1.0),)),
        ("51824753556", 1, ((0, 11),), ((1.0, 1.0),)),
        # Valid formatting but invalid ABNs
        ("52 824 753 556", 1, ((0, 14),), ((0.01, 0.1),)),
        ("52824753556", 1, ((0, 11),), ((0.01, 0.1),)),
        # Invalid formatting and ABNs.
        ("5282475355632", 0, (), ()),
        ("52824753556AF", 0, (), ()),
        ("51 824 753 5564", 0, (), ()),
    ],
)
def test_when_all_abns_then_succeed(
    text,
    expected_len,
    expected_positions,
    expected_score_ranges,
    recognizer,
    entities,
    max_score,
):
    """Check the count, span and score range of ABN matches for each sample."""
    found = recognizer.analyze(text, entities)
    assert len(found) == expected_len

    expectations = zip(found, expected_positions, expected_score_ranges)
    for match, span, score_range in expectations:
        start, end = span
        low, high = score_range
        # The literal "max" is a placeholder for the max_score fixture value.
        upper = max_score if high == "max" else high
        assert_result_within_score_range(
            match, entities[0], start, end, low, upper
        )
|
|
@ -0,0 +1,51 @@
|
|||
import pytest
|
||||
|
||||
from tests import assert_result_within_score_range
|
||||
from presidio_analyzer.predefined_recognizers import AuAcnRecognizer
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def recognizer():
    """Build the AuAcnRecognizer instance used by the tests in this module."""
    acn_recognizer = AuAcnRecognizer()
    return acn_recognizer
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def entities():
    """Return the entity names passed to the recognizer's analyze call."""
    requested = ["AU_ACN"]
    return requested
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "text, expected_len, expected_positions, expected_score_ranges",
    [
        # Valid formatting and valid ACNs
        ("000 000 019", 1, ((0, 11),), ((1.0, 1.0),)),
        ("005 499 981", 1, ((0, 11),), ((1.0, 1.0),)),
        ("006249976", 1, ((0, 9),), ((1.0, 1.0),)),
        # Valid formatting but invalid ACNs
        ("824 753 557", 1, ((0, 11),), ((0.01, 0.1),)),
        ("824753557", 1, ((0, 9),), ((0.01, 0.1),)),
        # Invalid formatting and ACNs.
        ("5282475355632", 0, (), ()),
        ("52824753556AF", 0, (), ()),
        ("51 824 753 5564", 0, (), ()),
    ],
)
def test_when_all_acns_then_succeed(
    text,
    expected_len,
    expected_positions,
    expected_score_ranges,
    recognizer,
    entities,
    max_score,
):
    """Check the count, span and score range of ACN matches for each sample."""
    found = recognizer.analyze(text, entities)
    assert len(found) == expected_len

    expectations = zip(found, expected_positions, expected_score_ranges)
    for match, span, score_range in expectations:
        start, end = span
        low, high = score_range
        # The literal "max" is a placeholder for the max_score fixture value.
        upper = max_score if high == "max" else high
        assert_result_within_score_range(
            match, entities[0], start, end, low, upper
        )
|
|
@ -0,0 +1,49 @@
|
|||
import pytest
|
||||
|
||||
from tests import assert_result_within_score_range
|
||||
from presidio_analyzer.predefined_recognizers import AuMedicareRecognizer
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def recognizer():
    """Build the AuMedicareRecognizer instance used by the tests in this module."""
    medicare_recognizer = AuMedicareRecognizer()
    return medicare_recognizer
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def entities():
    """Return the entity names passed to the recognizer's analyze call."""
    requested = ["AU_MEDICARE"]
    return requested
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "text, expected_len, expected_positions, expected_score_ranges",
    [
        # Valid formatting and valid Medicare number.
        ("2123 45670 1", 1, ((0, 12),), ((1.0, 1.0),)),
        ("2123456701", 1, ((0, 10),), ((1.0, 1.0),)),
        # Valid formatting but invalid Medicare number.
        ("2123 25870 1", 1, ((0, 12),), ((0.01, 0.1),)),
        ("2123258701", 1, ((0, 10),), ((0.01, 0.1),)),
        # Invalid formatting and Medicare number.
        ("212345670221", 0, (), ()),
        ("2123456702AF", 0, (), ()),
    ],
)
def test_when_all_medicares_then_succeed(
    text,
    expected_len,
    expected_positions,
    expected_score_ranges,
    recognizer,
    entities,
    max_score,
):
    """Check the count, span and score range of Medicare matches for each sample."""
    found = recognizer.analyze(text, entities)
    assert len(found) == expected_len

    expectations = zip(found, expected_positions, expected_score_ranges)
    for match, span, score_range in expectations:
        start, end = span
        low, high = score_range
        # The literal "max" is a placeholder for the max_score fixture value.
        upper = max_score if high == "max" else high
        assert_result_within_score_range(
            match, entities[0], start, end, low, upper
        )
|
|
@ -0,0 +1,50 @@
|
|||
import pytest
|
||||
|
||||
from tests import assert_result_within_score_range
|
||||
from presidio_analyzer.predefined_recognizers import AuTfnRecognizer
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def recognizer():
    """Build the AuTfnRecognizer instance used by the tests in this module."""
    tfn_recognizer = AuTfnRecognizer()
    return tfn_recognizer
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def entities():
    """Return the entity names passed to the recognizer's analyze call."""
    requested = ["AU_TFN"]
    return requested
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "text, expected_len, expected_positions, expected_score_ranges",
    [
        # Valid formatting and valid TFNs
        ("876 543 210", 1, ((0, 11),), ((1.0, 1.0),)),
        ("876543210", 1, ((0, 9),), ((1.0, 1.0),)),
        # Valid formatting but invalid TFNs
        ("824 753 557", 1, ((0, 11),), ((0.01, 0.1),)),
        ("824753557", 1, ((0, 9),), ((0.01, 0.1),)),
        # Invalid formatting and TFNs.
        ("5282475355632", 0, (), ()),
        ("52824753556AF", 0, (), ()),
        ("51 824 753 5564", 0, (), ()),
    ],
)
def test_when_all_tfns_then_succeed(
    text,
    expected_len,
    expected_positions,
    expected_score_ranges,
    recognizer,
    entities,
    max_score,
):
    """Check the count, span and score range of TFN matches for each sample."""
    found = recognizer.analyze(text, entities)
    assert len(found) == expected_len

    expectations = zip(found, expected_positions, expected_score_ranges)
    for match, span, score_range in expectations:
        start, end = span
        low, high = score_range
        # The literal "max" is a placeholder for the max_score fixture value.
        upper = max_score if high == "max" else high
        assert_result_within_score_range(
            match, entities[0], start, end, low, upper
        )
|
|
@ -52,8 +52,8 @@ def test_when_get_recognizers_then_all_recognizers_returned(mock_recognizer_regi
|
|||
registry = mock_recognizer_registry
|
||||
registry.load_predefined_recognizers()
|
||||
recognizers = registry.get_recognizers(language="en", all_fields=True)
|
||||
# 1 custom recognizer in english + 17 predefined
|
||||
assert len(recognizers) == 1 + 17
|
||||
# 1 custom recognizer in english + 21 predefined
|
||||
assert len(recognizers) == 1 + 21
|
||||
|
||||
|
||||
def test_when_get_recognizers_then_return_all_fields(mock_recognizer_registry):
|
||||
|
|
Загрузка…
Ссылка в новой задаче