-
Notifications
You must be signed in to change notification settings - Fork 48
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added changes for Email Address and Credit Card Number (#580)
* Added changes for Email Address and Credit Card Number
- Loading branch information
Showing
7 changed files
with
315 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
106 changes: 106 additions & 0 deletions
106
pebblo/entity_classifier/custom_analyzer/cerdit_card_analyzer.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
from typing import List, Tuple | ||
|
||
from presidio_analyzer import Pattern | ||
from presidio_analyzer.predefined_recognizers.credit_card_recognizer import ( | ||
CreditCardRecognizer, | ||
) | ||
|
||
|
||
class ExtendedCreditCardRecognizer(CreditCardRecognizer):
    """
    Credit card recognizer with per-issuer regex patterns and Luhn validation.

    The patterns in ``ADDITIONAL_PATTERNS`` and the keywords in ``CONTEXT`` are
    handed to the ``CreditCardRecognizer`` constructor. Each pattern matches a
    contiguous run of digits only (no spaces or hyphens inside the number).

    NOTE(review): presumably the list passed as ``patterns`` replaces, rather
    than extends, the base recognizer's built-in patterns -- confirm against
    the installed presidio_analyzer version.
    """

    # One regex per card issuer, each with a base score of 0.5.
    ADDITIONAL_PATTERNS = [
        Pattern("Amex Card", r"\b3[47][0-9]{13}\b", 0.5),
        Pattern("BCGlobal", r"\b(6541|6556)[0-9]{12,15}\b", 0.5),
        Pattern("Carte Blanche Card", r"\b389[0-9]{11}\b", 0.5),
        Pattern("Diners Club", r"\b3(?:0[0-5]|[68][0-9])[0-9]{11}\b", 0.5),
        Pattern(
            "Discover",
            # All alternatives share one group so that the trailing \b applies
            # to each of them; otherwise the first alternatives could match a
            # 16-digit prefix of a longer digit run.
            r"\b(?:65[4-9][0-9]{13}|64[4-9][0-9]{13}|6011[0-9]{12}|622(?:12[6-9]|1[3-9][0-9]|[2-8][0-9]{3}|9[01][0-9]|92[0-5])[0-9]{10})\b",
            0.5,
        ),
        Pattern("Insta Payment", r"\b63[7-9][0-9]{13}\b", 0.5),
        Pattern("JCB Card", r"\b(?:2131|1800|35\d{3})\d{11}\b", 0.5),
        Pattern("KoreanLocalCard", r"\b9[0-9]{15}\b", 0.5),
        Pattern("Laser Card", r"\b(6304|6706|6709|6771)[0-9]{12,15}\b", 0.5),
        Pattern(
            "Maestro Card", r"\b(5018|5020|5038|6304|6759|6761|6763)[0-9]{8,15}\b", 0.5
        ),
        Pattern(
            "Mastercard",
            r"\b5[1-5][0-9]{14}\b|\b2(22[1-9][0-9]{12}|2[3-9][0-9]{13}|[3-6][0-9]{14}|7[0-1][0-9]{13}|720[0-9]{12})\b",
            0.5,
        ),
        Pattern("Solo Card", r"\b(6334|6767)[0-9]{12,15}\b", 0.5),
        Pattern(
            "Switch Card",
            r"\b(4903|4905|4911|4936|6333|6759)[0-9]{12,15}\b|\b564182[0-9]{10,13}\b|\b633110[0-9]{10,13}\b",
            0.5,
        ),
        Pattern("Union Pay", r"\b62[0-9]{14,17}\b", 0.5),
        Pattern("Visa Card", r"\b4[0-9]{12}(?:[0-9]{3})?\b", 0.5),
        # Disabled: weak catch-all pattern that also tolerates "-"/" " separators.
        # Pattern(
        #     "All Credit Cards (weak)",
        #     r"\b((4\d{3})|(5[0-5]\d{2})|(6\d{3})|(1\d{3})|(3\d{3}))[- ]?(\d{3,4})[- ]?(\d{3,4})[- ]?(\d{3,5})\b",  # noqa: E501
        #     0.3,
        # ),
    ]

    # Keywords related to credit cards, passed to the base class as context.
    # "card" and "debit" are separate entries: adjacent string literals without
    # a comma would be concatenated into the single word "carddebit".
    CONTEXT = [
        "credit",
        "credit_card",
        "card",
        "debit",
        "Visa",
        "Mastercard",
        "Amex",
        "Discover",
        "JCB",
        "Diners Club",
        "Carte Blanche",
        "Insta Payment",
        "Maestro",
        "UnionPay",
        "BCGlobal",
        "KoreanLocalCard",
        "cc ",
        "diners",
        "instapayment",
    ]

    def __init__(self):
        """Initialise the base recognizer with the extended patterns and context."""
        super().__init__(
            supported_entity="CREDIT_CARD",  # The entity you are identifying
            patterns=self.ADDITIONAL_PATTERNS,  # Add the extended patterns
            context=self.CONTEXT,
        )

    def validate_result(self, pattern_text: str) -> bool:
        """
        Check a matched text against the Luhn checksum.

        :param pattern_text: the text matched by one of the patterns
        :return: True if the text, after applying ``self.replacement_pairs``,
            consists only of decimal digits and passes the Luhn check;
            False otherwise
        """
        sanitized_value = self.__sanitize_value(pattern_text, self.replacement_pairs)
        # Reject empty or non-numeric input explicitly: the checksum would raise
        # ValueError on a stray character and would accept "" (an empty sum is 0).
        if not sanitized_value.isdecimal():
            return False
        return self.__luhn_checksum(sanitized_value)

    @staticmethod
    def __luhn_checksum(sanitized_value: str) -> bool:
        """Return True if the digit string satisfies the Luhn (mod 10) check."""
        digits = [int(char) for char in sanitized_value]
        # Counting from the rightmost digit: odd positions are added as-is ...
        checksum = sum(digits[-1::-2])
        # ... even positions are doubled and the digits of the product are added.
        for digit in digits[-2::-2]:
            checksum += sum(divmod(digit * 2, 10))
        return checksum % 10 == 0

    @staticmethod
    def __sanitize_value(text: str, replacement_pairs: List[Tuple[str, str]]) -> str:
        """Apply each (search, replacement) pair to ``text`` in order."""
        for search_string, replacement_string in replacement_pairs:
            text = text.replace(search_string, replacement_string)
        return text
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
import unittest | ||
|
||
from presidio_analyzer import AnalyzerEngine | ||
|
||
from pebblo.entity_classifier.custom_analyzer.cerdit_card_analyzer import ( | ||
ExtendedCreditCardRecognizer, | ||
) | ||
|
||
|
||
class TestExtendedCreditCardRecognizer(unittest.TestCase):
    """Tests for ExtendedCreditCardRecognizer registered on an AnalyzerEngine."""

    def setUp(self):
        # Fresh engine per test, with the extended recognizer added to its registry.
        self.recognizer = ExtendedCreditCardRecognizer()
        self.analyzer = AnalyzerEngine()
        self.analyzer.registry.add_recognizer(self.recognizer)

    def _find_cards(self, text):
        """Run the analyzer on ``text`` restricted to CREDIT_CARD entities."""
        return self.analyzer.analyze(text, entities=["CREDIT_CARD"], language="en")

    def _assert_single_card(self, text, start, end):
        """Assert exactly one CREDIT_CARD result spanning ``text[start:end]``."""
        found = self._find_cards(text)
        self.assertEqual(len(found), 1)
        hit = found[0]
        self.assertEqual(hit.entity_type, "CREDIT_CARD")
        self.assertEqual(hit.start, start)
        self.assertEqual(hit.end, end)

    def test_visa_card(self):
        """Visa number written without separators."""
        self._assert_single_card("My card number is 4111111111111111.", 18, 34)

    def test_visa_card_with_spaces(self):
        """Visa number written in space-separated groups."""
        self._assert_single_card("My card number is 4111 1111 1111 1111.", 18, 37)

    def test_mastercard_with_hyphens(self):
        """Mastercard number written in hyphen-separated groups."""
        self._assert_single_card("My card number is 5500-0000-0000-0004.", 18, 37)

    def test_amex_card(self):
        """American Express number (15 digits)."""
        self._assert_single_card("My Amex card number is 378282246310005.", 23, 38)

    def test_diners_club_card(self):
        """Diners Club number (14 digits)."""
        self._assert_single_card("My Diners Club card is 30569309025904.", 23, 37)

    def test_jcb_card(self):
        """JCB number (16 digits)."""
        self._assert_single_card("My JCB card number is 3530111333300000.", 22, 38)

    def test_invalid_card(self):
        """A 16-digit number that is not a valid card yields no results."""
        found = self._find_cards("This is an invalid card number 1234567890123456.")
        self.assertEqual(len(found), 0)

    def test_credit_card_with_context(self):
        """Card number preceded by context words."""
        self._assert_single_card(
            "The credit card number 4111111111111111 is valid.", 23, 39
        )

    def test_validate_result_with_luhn_checksum(self):
        """validate_result accepts a number with a correct Luhn checksum."""
        self.assertTrue(self.recognizer.validate_result("4111111111111111"))

    def test_validate_result_invalid_luhn_checksum(self):
        """validate_result rejects a number with a wrong check digit."""
        self.assertFalse(self.recognizer.validate_result("4111111111111112"))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
import unittest | ||
|
||
from presidio_analyzer import AnalyzerEngine | ||
|
||
|
||
class TestEmailRecognizer(unittest.TestCase):
    """
    Tests for EMAIL_ADDRESS detection through a default AnalyzerEngine.

    NOTE(review): the valid address literals in this class had been mangled to
    "[email protected]" by e-mail obfuscation. They were reconstructed so that
    each address length matches the asserted [start, end) offsets -- confirm
    the exact addresses against the original commit.
    """

    def setUp(self):
        # Set up a fresh analyzer engine for every test
        self.analyzer = AnalyzerEngine()

    def _analyze(self, text):
        """Run the analyzer on ``text`` restricted to EMAIL_ADDRESS entities."""
        return self.analyzer.analyze(text, entities=["EMAIL_ADDRESS"], language="en")

    def test_basic_email(self):
        # Basic email detection
        text = "My email is john.doe@example.com."
        results = self._analyze(text)
        self.assertEqual(len(results), 1)
        self.assertEqual(results[0].entity_type, "EMAIL_ADDRESS")
        self.assertEqual(results[0].start, 12)
        self.assertEqual(results[0].end, 32)

    def test_email_with_numbers(self):
        # Email with numbers in the username
        text = "Contact me at john123@example.com."
        results = self._analyze(text)
        self.assertEqual(len(results), 1)
        self.assertEqual(results[0].entity_type, "EMAIL_ADDRESS")
        self.assertEqual(results[0].start, 14)
        self.assertEqual(results[0].end, 33)

    def test_email_with_subdomain(self):
        # Email with a subdomain
        text = "My work email is john.doe@work.example.com."
        results = self._analyze(text)
        self.assertEqual(len(results), 1)
        self.assertEqual(results[0].entity_type, "EMAIL_ADDRESS")
        self.assertEqual(results[0].start, 17)
        self.assertEqual(results[0].end, 42)

    def test_email_with_special_characters(self):
        # Email with special characters
        text = "My email is john.doe+alias@example.com."
        results = self._analyze(text)
        self.assertEqual(len(results), 1)
        self.assertEqual(results[0].entity_type, "EMAIL_ADDRESS")
        self.assertEqual(results[0].start, 12)
        self.assertEqual(results[0].end, 38)

    def test_multiple_emails(self):
        # Text with multiple emails
        text = "Emails: alice@example.com, bob@example.com."
        # Sort by position so the test does not depend on the analyzer's
        # result ordering.
        results = sorted(self._analyze(text), key=lambda result: result.start)
        self.assertEqual(len(results), 2)
        self.assertEqual(results[0].entity_type, "EMAIL_ADDRESS")
        self.assertEqual(results[0].start, 8)
        self.assertEqual(results[0].end, 25)
        self.assertEqual(results[1].entity_type, "EMAIL_ADDRESS")
        self.assertEqual(results[1].start, 27)
        self.assertEqual(results[1].end, 42)

    def test_invalid_email_missing_at_symbol(self):
        # Invalid email (missing '@' symbol)
        text = "This is not a valid email: john.doeexample.com."
        results = self._analyze(text)
        self.assertEqual(len(results), 0)

    def test_invalid_email_missing_domain(self):
        # Invalid email (missing domain part)
        text = "Invalid email: john.doe@."
        results = self._analyze(text)
        self.assertEqual(len(results), 0)

    def test_invalid_email_missing_username(self):
        # Invalid email (missing username part)
        text = "Invalid email: @example.com."
        results = self._analyze(text)
        self.assertEqual(len(results), 0)

    def test_email_with_context(self):
        # Email with context words like 'email' present
        text = "Please contact me at email: john.doe@example.com."
        results = self._analyze(text)
        self.assertEqual(len(results), 1)
        self.assertEqual(results[0].entity_type, "EMAIL_ADDRESS")
        self.assertEqual(results[0].start, 28)
        self.assertEqual(results[0].end, 48)

    def test_email_with_trailing_punctuation(self):
        # Email with trailing punctuation like comma or period
        text = "My email is john.doe@example.com, contact me soon."
        results = self._analyze(text)
        self.assertEqual(len(results), 1)
        self.assertEqual(results[0].entity_type, "EMAIL_ADDRESS")
        self.assertEqual(results[0].start, 12)
        self.assertEqual(results[0].end, 32)

    def test_invalid_email_with_special_characters(self):
        # Invalid email with special characters in the wrong places
        text = "Invalid email: john.doe@exam#ple.com."
        results = self._analyze(text)
        self.assertEqual(len(results), 0)