This commit is contained in:
Rakan 2021-09-05 19:58:17 +10:00 коммит произвёл GitHub
Родитель f76dad3961
Коммит b8d76c6a7e
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
14 изменённых файлов: 614 добавлений и 5 удалений

Просмотреть файл

@ -53,6 +53,15 @@ For more information, refer to the [adding new recognizers documentation](analyz
|--- |--- |--- |
|FIN/NRIC| A National Registration Identification Card | Pattern match and context |
### Australia
|FieldType|Description|Detection Method|
|--- |--- |--- |
|AU_ABN| The Australian Business Number (ABN) is a unique 11 digit identifier issued to all entities registered in the Australian Business Register (ABR). | Pattern match, context, and checksum |
|AU_ACN| An Australian Company Number is a unique nine-digit number issued by the Australian Securities and Investments Commission to every company registered under the Commonwealth Corporations Act 2001 as an identifier. | Pattern match, context, and checksum |
|AU_TFN| The tax file number (TFN) is a unique identifier issued by the Australian Taxation Office to each taxpaying entity | Pattern match, context, and checksum |
|AU_MEDICARE| Medicare number is a unique identifier issued by the Australian Government that enables the cardholder to receive rebates of medical expenses under Australia's Medicare system. | Pattern match, context, and checksum |
## Adding a custom PII entity
See [this documentation](analyzer/adding_recognizers.md) for instructions on how to add a new Recognizer for a new type of PII entity.

Просмотреть файл

@ -5,7 +5,7 @@ My credit card number is 4095-2609-9393-4932 and my crypto wallet id is 16Yeky6G
On September 18 I visited microsoft.com and sent an email to test@presidio.site, from the IP 192.168.0.1.
My passport: 191280345 and my phone number: (212) 555-1234.
My passport: 191280342 and my phone number: (212) 555-1234.
This is a valid International Bank Account Number: IL150120690000003111111 . Can you please check the status on bank account 954567876544?

Просмотреть файл

@ -234,7 +234,7 @@ def test_given_a_correct_input_for_supported_entities_then_expect_a_correct_resp
expected_response = """
["PHONE_NUMBER", "US_DRIVER_LICENSE", "US_PASSPORT", "SG_NRIC_FIN", "LOCATION", "CREDIT_CARD", "CRYPTO",
"UK_NHS", "US_SSN", "US_BANK_NUMBER", "EMAIL_ADDRESS", "DATE_TIME", "IP_ADDRESS", "PERSON", "IBAN_CODE",
"NRP", "US_ITIN", "DOMAIN_NAME", "MEDICAL_LICENSE"]
"NRP", "US_ITIN", "DOMAIN_NAME", "MEDICAL_LICENSE", "AU_ABN", "AU_ACN", "AU_TFN", "AU_MEDICARE"]
"""
assert response_status == 200
assert equal_json_strings(expected_response, response_content)
@ -266,7 +266,8 @@ def test_given_an_illegal_input_for_supported_entities_then_igonre_and_proceed()
expected_response = """
["PHONE_NUMBER", "US_DRIVER_LICENSE", "US_PASSPORT", "SG_NRIC_FIN", "LOCATION", "CREDIT_CARD",
"CRYPTO", "UK_NHS", "US_SSN", "US_BANK_NUMBER", "EMAIL_ADDRESS", "DATE_TIME", "IP_ADDRESS",
"PERSON", "IBAN_CODE", "NRP", "US_ITIN", "DOMAIN_NAME", "MEDICAL_LICENSE"]
"PERSON", "IBAN_CODE", "NRP", "US_ITIN", "DOMAIN_NAME", "MEDICAL_LICENSE", "AU_ABN",
"AU_ACN", "AU_TFN", "AU_MEDICARE"]
"""
assert response_status == 200
assert equal_json_strings(expected_response, response_content)

Просмотреть файл

@ -21,6 +21,10 @@ from .us_passport_recognizer import UsPassportRecognizer
from .us_phone_recognizer import UsPhoneRecognizer
from .us_ssn_recognizer import UsSsnRecognizer
from .es_nif_recognizer import EsNifRecognizer
from .au_abn_recognizer import AuAbnRecognizer
from .au_acn_recognizer import AuAcnRecognizer
from .au_tfn_recognizer import AuTfnRecognizer
from .au_medicare_recognizer import AuMedicareRecognizer
NLP_RECOGNIZERS = {"spacy": SpacyRecognizer, "stanza": StanzaRecognizer}
@ -49,4 +53,8 @@ __all__ = [
"SpacyRecognizer",
"StanzaRecognizer",
"NLP_RECOGNIZERS",
"AuAbnRecognizer",
"AuAcnRecognizer",
"AuTfnRecognizer",
"AuMedicareRecognizer",
]

Просмотреть файл

@ -0,0 +1,97 @@
from typing import Optional, List, Tuple
from presidio_analyzer import Pattern, PatternRecognizer
class AuAbnRecognizer(PatternRecognizer):
    """
    Recognizes Australian Business Number ("ABN").

    The Australian Business Number (ABN) is a unique 11
    digit identifier issued to all entities registered in
    the Australian Business Register (ABR).
    The 11 digit ABN is structured as a 9 digit identifier
    with two leading check digits.
    The leading check digits are derived using a modulus 89 calculation.
    This recognizer identifies ABN using regex, context words and checksum.
    Reference: https://abr.business.gov.au/Help/AbnFormat

    :param patterns: List of patterns to be used by this recognizer
    :param context: List of context words to increase confidence in detection
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    :param replacement_pairs: List of tuples with potential replacement values
    for different strings to be used during pattern matching.
    This can allow a greater variety in input, for example by removing
    dashes or spaces.
    """

    PATTERNS = [
        Pattern(
            "ABN (Medium)",
            r"\b\d{2}\s\d{3}\s\d{3}\s\d{3}\b",
            0.1,
        ),
        Pattern(
            "ABN (Low)",
            r"\b\d{11}\b",
            0.01,
        ),
    ]

    CONTEXT = [
        "australian business number",
        "abn",
    ]

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "en",
        supported_entity: str = "AU_ABN",
        replacement_pairs: Optional[List[Tuple[str, str]]] = None,
    ):
        # Falsy values (None or an empty list) fall back to the defaults.
        self.replacement_pairs = (
            replacement_pairs if replacement_pairs else [("-", ""), (" ", "")]
        )
        context = context if context else self.CONTEXT
        patterns = patterns if patterns else self.PATTERNS
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns,
            context=context,
            supported_language=supported_language,
        )

    def validate_result(self, pattern_text: str) -> Optional[bool]:
        """
        Validate a regex match by running the ABN modulus 89 checksum.

        :param pattern_text: the text to validate.
        Only the part in text that was detected by the regex engine
        :return: True when the checksum passes. None when it fails or when
        the match cannot be reduced to at least 11 decimal digits.
        NOTE(review): None (rather than False) is kept from the original
        implementation; it appears intended to leave the match at its
        pattern score instead of discarding it - confirm against
        PatternRecognizer.
        """
        # Pre-processing before validation checks
        text = self.__sanitize_value(pattern_text, self.replacement_pairs)
        # The "Medium" pattern separates groups with ``\s``, which also
        # matches tabs and newlines that the replacement pairs do not strip.
        # Drop any remaining whitespace so int() below cannot fail on it.
        text = "".join(text.split())
        if len(text) < 11 or not text.isdecimal():
            return None

        abn_list = [int(digit) for digit in text]

        # Set weights based on digit position
        weight = (10, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19)

        # The first digit is decremented before weighting.
        # NOTE(review): a leading 0 is mapped to 9 rather than -1; kept
        # unchanged from the original - TODO confirm against the ABR spec.
        abn_list[0] = 9 if abn_list[0] == 0 else abn_list[0] - 1

        # zip() stops after the 11 weights, so only the first 11 digits count.
        sum_product = sum(d * w for d, w in zip(abn_list, weight))
        return True if sum_product % 89 == 0 else None

    @staticmethod
    def __sanitize_value(text: str, replacement_pairs: List[Tuple[str, str]]) -> str:
        """Apply every (search, replacement) pair to ``text`` in order."""
        for search_string, replacement_string in replacement_pairs:
            text = text.replace(search_string, replacement_string)
        return text

Просмотреть файл

@ -0,0 +1,94 @@
from typing import Optional, List, Tuple
from presidio_analyzer import Pattern, PatternRecognizer
class AuAcnRecognizer(PatternRecognizer):
    """
    Recognizes Australian Company Number ("ACN").

    The Australian Company Number (ACN) is a nine digit number
    with the last digit being a check digit calculated using a
    modified modulus 10 calculation.
    This recognizer identifies ACN using regex, context words, and checksum.
    Reference: https://asic.gov.au/

    :param patterns: List of patterns to be used by this recognizer
    :param context: List of context words to increase confidence in detection
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    :param replacement_pairs: List of tuples with potential replacement values
    for different strings to be used during pattern matching.
    This can allow a greater variety in input, for example by removing
    dashes or spaces.
    """

    PATTERNS = [
        Pattern(
            "ACN (Medium)",
            r"\b\d{3}\s\d{3}\s\d{3}\b",
            0.1,
        ),
        Pattern(
            "ACN (Low)",
            r"\b\d{9}\b",
            0.01,
        ),
    ]

    CONTEXT = [
        "australian company number",
        "acn",
    ]

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "en",
        supported_entity: str = "AU_ACN",
        replacement_pairs: Optional[List[Tuple[str, str]]] = None,
    ):
        # Falsy values (None or an empty list) fall back to the defaults.
        self.replacement_pairs = (
            replacement_pairs if replacement_pairs else [("-", ""), (" ", "")]
        )
        context = context if context else self.CONTEXT
        patterns = patterns if patterns else self.PATTERNS
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns,
            context=context,
            supported_language=supported_language,
        )

    def validate_result(self, pattern_text: str) -> Optional[bool]:
        """
        Validate a regex match by running the ACN modified modulus 10 checksum.

        :param pattern_text: the text to validate.
        Only the part in text that was detected by the regex engine
        :return: True when the checksum passes. None when it fails or when
        the match cannot be reduced to at least 9 decimal digits.
        NOTE(review): None (rather than False) is kept from the original
        implementation; it appears intended to leave the match at its
        pattern score instead of discarding it - confirm against
        PatternRecognizer.
        """
        # Pre-processing before validation checks
        text = self.__sanitize_value(pattern_text, self.replacement_pairs)
        # The "Medium" pattern separates groups with ``\s``, which also
        # matches tabs and newlines that the replacement pairs do not strip.
        # Drop any remaining whitespace so int() below cannot fail on it.
        text = "".join(text.split())
        if len(text) < 9 or not text.isdecimal():
            return None

        acn_list = [int(digit) for digit in text]

        # Set weights based on digit position
        weight = (8, 7, 6, 5, 4, 3, 2, 1)

        # zip() stops after the 8 weights, so only the first 8 digits count.
        sum_product = sum(d * w for d, w in zip(acn_list, weight))
        remainder = sum_product % 10
        # A remainder of 0 would give a complement of 10, which can never
        # equal a single digit; the check digit in that case is 0.
        complement = (10 - remainder) % 10
        return True if complement == acn_list[-1] else None

    @staticmethod
    def __sanitize_value(text: str, replacement_pairs: List[Tuple[str, str]]) -> str:
        """Apply every (search, replacement) pair to ``text`` in order."""
        for search_string, replacement_string in replacement_pairs:
            text = text.replace(search_string, replacement_string)
        return text

Просмотреть файл

@ -0,0 +1,93 @@
from typing import Optional, List, Tuple
from presidio_analyzer import Pattern, PatternRecognizer
class AuMedicareRecognizer(PatternRecognizer):
    """
    Recognizes Australian Medicare number using regex, context words, and checksum.

    Medicare number is a unique identifier issued by the Australian Government
    that enables the cardholder to receive rebates of medical expenses
    under Australia's Medicare system.
    It uses a modulus 10 checksum scheme to validate the number.
    Reference: https://en.wikipedia.org/wiki/Medicare_card_(Australia)

    :param patterns: List of patterns to be used by this recognizer
    :param context: List of context words to increase confidence in detection
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    :param replacement_pairs: List of tuples with potential replacement values
    for different strings to be used during pattern matching.
    This can allow a greater variety in input, for example by removing
    dashes or spaces.
    """

    PATTERNS = [
        Pattern(
            "Australian Medicare Number (Medium)",
            r"\b[2-6]\d{3}\s\d{5}\s\d\b",
            0.1,
        ),
        Pattern(
            "Australian Medicare Number (Low)",
            r"\b[2-6]\d{9}\b",
            0.01,
        ),
    ]

    CONTEXT = [
        "medicare",
    ]

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "en",
        supported_entity: str = "AU_MEDICARE",
        replacement_pairs: Optional[List[Tuple[str, str]]] = None,
    ):
        # Falsy values (None or an empty list) fall back to the defaults.
        self.replacement_pairs = (
            replacement_pairs if replacement_pairs else [("-", ""), (" ", "")]
        )
        context = context if context else self.CONTEXT
        patterns = patterns if patterns else self.PATTERNS
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns,
            context=context,
            supported_language=supported_language,
        )

    def validate_result(self, pattern_text: str) -> Optional[bool]:
        """
        Validate a regex match by running the Medicare modulus 10 checksum.

        The first eight digits are weighted and summed; the sum modulo 10
        must equal the ninth digit. Any digits after the ninth are not
        part of the check.

        :param pattern_text: the text to validate.
        Only the part in text that was detected by the regex engine
        :return: True when the checksum passes. None when it fails or when
        the match cannot be reduced to at least 9 decimal digits.
        NOTE(review): None (rather than False) is kept from the original
        implementation; it appears intended to leave the match at its
        pattern score instead of discarding it - confirm against
        PatternRecognizer.
        """
        # Pre-processing before validation checks
        text = self.__sanitize_value(pattern_text, self.replacement_pairs)
        # The "Medium" pattern separates groups with ``\s``, which also
        # matches tabs and newlines that the replacement pairs do not strip.
        # Drop any remaining whitespace so int() below cannot fail on it.
        text = "".join(text.split())
        if len(text) < 9 or not text.isdecimal():
            return None

        medicare_list = [int(digit) for digit in text]

        # Set weights based on digit position
        weight = (1, 3, 7, 9, 1, 3, 7, 9)

        # zip() stops after the 8 weights, so only the first 8 digits count.
        sum_product = sum(d * w for d, w in zip(medicare_list, weight))
        remainder = sum_product % 10
        return True if remainder == medicare_list[8] else None

    @staticmethod
    def __sanitize_value(text: str, replacement_pairs: List[Tuple[str, str]]) -> str:
        """Apply every (search, replacement) pair to ``text`` in order."""
        for search_string, replacement_string in replacement_pairs:
            text = text.replace(search_string, replacement_string)
        return text

Просмотреть файл

@ -0,0 +1,99 @@
from typing import Optional, List, Tuple
from presidio_analyzer import Pattern, PatternRecognizer
class AuTfnRecognizer(PatternRecognizer):
    """
    Recognizes Australian Tax File Numbers ("TFN").

    The tax file number (TFN) is a unique identifier
    issued by the Australian Taxation Office
    to each taxpaying entity: an individual, company,
    superannuation fund, partnership, or trust.
    The TFN consists of a nine digit number, usually
    presented in the format NNN NNN NNN.
    TFN includes a check digit for detecting erroneous
    numbers based on simple modulo 11.
    This recognizer uses regex, context words,
    and checksum to identify TFN.
    Reference: https://www.ato.gov.au/individuals/tax-file-number/

    :param patterns: List of patterns to be used by this recognizer
    :param context: List of context words to increase confidence in detection
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    :param replacement_pairs: List of tuples with potential replacement values
    for different strings to be used during pattern matching.
    This can allow a greater variety in input, for example by removing
    dashes or spaces.
    """

    PATTERNS = [
        Pattern(
            "TFN (Medium)",
            r"\b\d{3}\s\d{3}\s\d{3}\b",
            0.1,
        ),
        Pattern(
            "TFN (Low)",
            r"\b\d{9}\b",
            0.01,
        ),
    ]

    CONTEXT = [
        "tax file number",
        "tfn",
    ]

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "en",
        supported_entity: str = "AU_TFN",
        replacement_pairs: Optional[List[Tuple[str, str]]] = None,
    ):
        # Falsy values (None or an empty list) fall back to the defaults.
        self.replacement_pairs = (
            replacement_pairs if replacement_pairs else [("-", ""), (" ", "")]
        )
        context = context if context else self.CONTEXT
        patterns = patterns if patterns else self.PATTERNS
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns,
            context=context,
            supported_language=supported_language,
        )

    def validate_result(self, pattern_text: str) -> Optional[bool]:
        """
        Validate a regex match by running the TFN modulus 11 checksum.

        :param pattern_text: the text to validate.
        Only the part in text that was detected by the regex engine
        :return: True when the checksum passes. None when it fails or when
        the match cannot be reduced to at least 9 decimal digits.
        NOTE(review): None (rather than False) is kept from the original
        implementation; it appears intended to leave the match at its
        pattern score instead of discarding it - confirm against
        PatternRecognizer.
        """
        # Pre-processing before validation checks
        text = self.__sanitize_value(pattern_text, self.replacement_pairs)
        # The "Medium" pattern separates groups with ``\s``, which also
        # matches tabs and newlines that the replacement pairs do not strip.
        # Drop any remaining whitespace so int() below cannot fail on it.
        text = "".join(text.split())
        if len(text) < 9 or not text.isdecimal():
            return None

        tfn_list = [int(digit) for digit in text]

        # Set weights based on digit position
        weight = (1, 4, 3, 7, 5, 8, 6, 9, 10)

        # zip() stops after the 9 weights, so only the first 9 digits count.
        sum_product = sum(d * w for d, w in zip(tfn_list, weight))
        return True if sum_product % 11 == 0 else None

    @staticmethod
    def __sanitize_value(text: str, replacement_pairs: List[Tuple[str, str]]) -> str:
        """Apply every (search, replacement) pair to ``text`` in order."""
        for search_string, replacement_string in replacement_pairs:
            text = text.replace(search_string, replacement_string)
        return text

Просмотреть файл

@ -24,6 +24,10 @@ from presidio_analyzer.predefined_recognizers import (
SpacyRecognizer,
EsNifRecognizer,
StanzaRecognizer,
AuAbnRecognizer,
AuAcnRecognizer,
AuTfnRecognizer,
AuMedicareRecognizer,
)
logger = logging.getLogger("presidio-analyzer")
@ -68,6 +72,10 @@ class RecognizerRegistry:
UsSsnRecognizer,
NhsRecognizer,
SgFinRecognizer,
AuAbnRecognizer,
AuAcnRecognizer,
AuTfnRecognizer,
AuMedicareRecognizer,
],
"es": [EsNifRecognizer],
"ALL": [

Просмотреть файл

@ -0,0 +1,50 @@
import pytest
from tests import assert_result_within_score_range
from presidio_analyzer.predefined_recognizers import AuAbnRecognizer
@pytest.fixture(scope="module")
def recognizer():
    """Provide a default-configured AuAbnRecognizer, created once per module."""
    abn_recognizer = AuAbnRecognizer()
    return abn_recognizer
@pytest.fixture(scope="module")
def entities():
    """Provide the entity names passed to the recognizer's analyze call."""
    requested_entities = ["AU_ABN"]
    return requested_entities
@pytest.mark.parametrize(
    "text, expected_len, expected_positions, expected_score_ranges",
    [
        # Valid formatting and valid ABNs
        ("51 824 753 556", 1, ((0, 14),), ((1.0, 1.0),)),
        ("51824753556", 1, ((0, 11),), ((1.0, 1.0),)),
        # Valid formatting but invalid ABNs
        ("52 824 753 556", 1, ((0, 14),), ((0.01, 0.1),)),
        ("52824753556", 1, ((0, 11),), ((0.01, 0.1),)),
        # Invalid formatting and ABNs.
        ("5282475355632", 0, (), ()),
        ("52824753556AF", 0, (), ()),
        ("51 824 753 5564", 0, (), ()),
    ],
)
def test_when_all_abns_then_succeed(
    text,
    expected_len,
    expected_positions,
    expected_score_ranges,
    recognizer,
    entities,
    max_score,
):
    """Check result count, span and score range for each ABN sample."""
    results = recognizer.analyze(text, entities)
    assert len(results) == expected_len

    expectations = zip(results, expected_positions, expected_score_ranges)
    for result, span, score_range in expectations:
        start, end = span
        low, high = score_range
        # "max" is a placeholder for the max_score fixture value.
        upper = max_score if high == "max" else high
        assert_result_within_score_range(result, entities[0], start, end, low, upper)

Просмотреть файл

@ -0,0 +1,51 @@
import pytest
from tests import assert_result_within_score_range
from presidio_analyzer.predefined_recognizers import AuAcnRecognizer
@pytest.fixture(scope="module")
def recognizer():
    """Provide a default-configured AuAcnRecognizer, created once per module."""
    acn_recognizer = AuAcnRecognizer()
    return acn_recognizer
@pytest.fixture(scope="module")
def entities():
    """Provide the entity names passed to the recognizer's analyze call."""
    requested_entities = ["AU_ACN"]
    return requested_entities
@pytest.mark.parametrize(
    "text, expected_len, expected_positions, expected_score_ranges",
    [
        # Valid formatting and valid ACNs
        ("000 000 019", 1, ((0, 11),), ((1.0, 1.0),)),
        ("005 499 981", 1, ((0, 11),), ((1.0, 1.0),)),
        ("006249976", 1, ((0, 9),), ((1.0, 1.0),)),
        # Valid formatting but invalid ACNs
        ("824 753 557", 1, ((0, 11),), ((0.01, 0.1),)),
        ("824753557", 1, ((0, 9),), ((0.01, 0.1),)),
        # Invalid formatting and ACNs.
        ("5282475355632", 0, (), ()),
        ("52824753556AF", 0, (), ()),
        ("51 824 753 5564", 0, (), ()),
    ],
)
def test_when_all_acns_then_succeed(
    text,
    expected_len,
    expected_positions,
    expected_score_ranges,
    recognizer,
    entities,
    max_score,
):
    """Check result count, span and score range for each ACN sample."""
    results = recognizer.analyze(text, entities)
    assert len(results) == expected_len

    expectations = zip(results, expected_positions, expected_score_ranges)
    for result, span, score_range in expectations:
        start, end = span
        low, high = score_range
        # "max" is a placeholder for the max_score fixture value.
        upper = max_score if high == "max" else high
        assert_result_within_score_range(result, entities[0], start, end, low, upper)

Просмотреть файл

@ -0,0 +1,49 @@
import pytest
from tests import assert_result_within_score_range
from presidio_analyzer.predefined_recognizers import AuMedicareRecognizer
@pytest.fixture(scope="module")
def recognizer():
    """Provide a default-configured AuMedicareRecognizer, created once per module."""
    medicare_recognizer = AuMedicareRecognizer()
    return medicare_recognizer
@pytest.fixture(scope="module")
def entities():
    """Provide the entity names passed to the recognizer's analyze call."""
    requested_entities = ["AU_MEDICARE"]
    return requested_entities
@pytest.mark.parametrize(
    "text, expected_len, expected_positions, expected_score_ranges",
    [
        # Valid formatting and valid Medicare number.
        ("2123 45670 1", 1, ((0, 12),), ((1.0, 1.0),)),
        ("2123456701", 1, ((0, 10),), ((1.0, 1.0),)),
        # Valid formatting but invalid Medicare number.
        ("2123 25870 1", 1, ((0, 12),), ((0.01, 0.1),)),
        ("2123258701", 1, ((0, 10),), ((0.01, 0.1),)),
        # Invalid formatting and Medicare number.
        ("212345670221", 0, (), ()),
        ("2123456702AF", 0, (), ()),
    ],
)
def test_when_all_medicares_then_succeed(
    text,
    expected_len,
    expected_positions,
    expected_score_ranges,
    recognizer,
    entities,
    max_score,
):
    """Check result count, span and score range for each Medicare sample."""
    results = recognizer.analyze(text, entities)
    assert len(results) == expected_len

    expectations = zip(results, expected_positions, expected_score_ranges)
    for result, span, score_range in expectations:
        start, end = span
        low, high = score_range
        # "max" is a placeholder for the max_score fixture value.
        upper = max_score if high == "max" else high
        assert_result_within_score_range(result, entities[0], start, end, low, upper)

Просмотреть файл

@ -0,0 +1,50 @@
import pytest
from tests import assert_result_within_score_range
from presidio_analyzer.predefined_recognizers import AuTfnRecognizer
@pytest.fixture(scope="module")
def recognizer():
    """Provide a default-configured AuTfnRecognizer, created once per module."""
    tfn_recognizer = AuTfnRecognizer()
    return tfn_recognizer
@pytest.fixture(scope="module")
def entities():
    """Provide the entity names passed to the recognizer's analyze call."""
    requested_entities = ["AU_TFN"]
    return requested_entities
@pytest.mark.parametrize(
    "text, expected_len, expected_positions, expected_score_ranges",
    [
        # Valid formatting and valid TFNs
        ("876 543 210", 1, ((0, 11),), ((1.0, 1.0),)),
        ("876543210", 1, ((0, 9),), ((1.0, 1.0),)),
        # Valid formatting but invalid TFNs
        ("824 753 557", 1, ((0, 11),), ((0.01, 0.1),)),
        ("824753557", 1, ((0, 9),), ((0.01, 0.1),)),
        # Invalid formatting and TFNs.
        ("5282475355632", 0, (), ()),
        ("52824753556AF", 0, (), ()),
        ("51 824 753 5564", 0, (), ()),
    ],
)
def test_when_all_tfns_then_succeed(
    text,
    expected_len,
    expected_positions,
    expected_score_ranges,
    recognizer,
    entities,
    max_score,
):
    """Check result count, span and score range for each TFN sample."""
    results = recognizer.analyze(text, entities)
    assert len(results) == expected_len

    expectations = zip(results, expected_positions, expected_score_ranges)
    for result, span, score_range in expectations:
        start, end = span
        low, high = score_range
        # "max" is a placeholder for the max_score fixture value.
        upper = max_score if high == "max" else high
        assert_result_within_score_range(result, entities[0], start, end, low, upper)

Просмотреть файл

@ -52,8 +52,8 @@ def test_when_get_recognizers_then_all_recognizers_returned(mock_recognizer_regi
registry = mock_recognizer_registry
registry.load_predefined_recognizers()
recognizers = registry.get_recognizers(language="en", all_fields=True)
# 1 custom recognizer in english + 17 predefined
assert len(recognizers) == 1 + 17
# 1 custom recognizer in english + 21 predefined
assert len(recognizers) == 1 + 21
def test_when_get_recognizers_then_return_all_fields(mock_recognizer_registry):