Skip to content

Commit

Permalink
Added changes for Email Address and Credit Card Number (#580)
Browse files Browse the repository at this point in the history
* Added changes for Email Address and Credit Card Number
  • Loading branch information
gr8nishan authored Oct 8, 2024
1 parent c68f56e commit 7a60288
Show file tree
Hide file tree
Showing 7 changed files with 315 additions and 0 deletions.
1 change: 1 addition & 0 deletions pebblo/app/pebblo-ui/src/constants/keywordMapping.js
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ export const KEYWORD_MAPPING = {
"us-passport-number": "US Passport number",
"us-drivers-license": "US Drivers License",
"credit-card-number": "Credit card number",
"email-address": "Email Address",
"us-bank-account-number": "US Bank Account Number",
"iban-code": "IBAN code",
"us-itin": "US ITIN",
Expand Down
106 changes: 106 additions & 0 deletions pebblo/entity_classifier/custom_analyzer/cerdit_card_analyzer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
from typing import List, Tuple

from presidio_analyzer import Pattern
from presidio_analyzer.predefined_recognizers.credit_card_recognizer import (
CreditCardRecognizer,
)


class ExtendedCreditCardRecognizer(CreditCardRecognizer):
    """
    Extends the Credit Card Recognizer by adding support for additional credit card types.

    The recognizer is constructed with its own regex patterns (one per card
    brand, each scored 0.5) and its own context words, and reports matches
    under the ``CREDIT_CARD`` entity. Every regex match is additionally
    checked with the Luhn checksum in ``validate_result``.

    Note that all patterns below match contiguous digits only; numbers
    written with spaces or hyphens are not matched by these patterns.
    """

    # Per-brand patterns: Pattern(name, regex, score).
    ADDITIONAL_PATTERNS: List[Pattern] = [
        Pattern("Amex Card", r"\b3[47][0-9]{13}\b", 0.5),
        Pattern("BCGlobal", r"\b(6541|6556)[0-9]{12,15}\b", 0.5),
        Pattern("Carte Blanche Card", r"\b389[0-9]{11}\b", 0.5),
        Pattern("Diners Club", r"\b3(?:0[0-5]|[68][0-9])[0-9]{11}\b", 0.5),
        Pattern(
            "Discover",
            # All alternatives share a single \b...\b wrapper so that none of
            # them can match just the prefix of a longer run of digits. The
            # 622126-622925 range is six prefix digits followed by ten more
            # (16 digits in total).
            r"\b(?:65[4-9][0-9]{13}|64[4-9][0-9]{13}|6011[0-9]{12}|622(?:12[6-9]|1[3-9][0-9]|[2-8][0-9]{2}|9[01][0-9]|92[0-5])[0-9]{10})\b",  # noqa: E501
            0.5,
        ),
        Pattern("Insta Payment", r"\b63[7-9][0-9]{13}\b", 0.5),
        Pattern("JCB Card", r"\b(?:2131|1800|35\d{3})\d{11}\b", 0.5),
        Pattern("KoreanLocalCard", r"\b9[0-9]{15}\b", 0.5),
        Pattern("Laser Card", r"\b(6304|6706|6709|6771)[0-9]{12,15}\b", 0.5),
        Pattern(
            "Maestro Card", r"\b(5018|5020|5038|6304|6759|6761|6763)[0-9]{8,15}\b", 0.5
        ),
        Pattern(
            "Mastercard",
            r"\b5[1-5][0-9]{14}\b|\b2(22[1-9][0-9]{12}|2[3-9][0-9]{13}|[3-6][0-9]{14}|7[0-1][0-9]{13}|720[0-9]{12})\b",  # noqa: E501
            0.5,
        ),
        Pattern("Solo Card", r"\b(6334|6767)[0-9]{12,15}\b", 0.5),
        Pattern(
            "Switch Card",
            r"\b(4903|4905|4911|4936|6333|6759)[0-9]{12,15}\b|\b564182[0-9]{10,13}\b|\b633110[0-9]{10,13}\b",  # noqa: E501
            0.5,
        ),
        Pattern("Union Pay", r"\b62[0-9]{14,17}\b", 0.5),
        Pattern("Visa Card", r"\b4[0-9]{12}(?:[0-9]{3})?\b", 0.5),
        # Pattern(
        #     "All Credit Cards (weak)",
        #     r"\b((4\d{3})|(5[0-5]\d{2})|(6\d{3})|(1\d{3})|(3\d{3}))[- ]?(\d{3,4})[- ]?(\d{3,4})[- ]?(\d{3,5})\b",  # noqa: E501
        #     0.3,
        # ),
    ]

    # Keywords related to credit cards, passed to the base class as context.
    # Each entry must be its own list element: adjacent string literals
    # without a comma are silently concatenated by Python.
    CONTEXT: List[str] = [
        "credit",
        "credit_card",
        "card",
        "debit",
        "Visa",
        "Mastercard",
        "Amex",
        "Discover",
        "JCB",
        "Diners Club",
        "Carte Blanche",
        "Insta Payment",
        "Maestro",
        "UnionPay",
        "BCGlobal",
        "KoreanLocalCard",
        "cc ",
        "diners",
        "instapayment",
    ]

    def __init__(self):
        """Initialise the base recognizer with the extended patterns and context."""
        super().__init__(
            supported_entity="CREDIT_CARD",  # The entity you are identifying
            patterns=self.ADDITIONAL_PATTERNS,  # Add the extended patterns
            context=self.CONTEXT,
        )

    def validate_result(self, pattern_text: str) -> bool:  # noqa D102
        """
        Return True when ``pattern_text`` passes the Luhn checksum.

        The text is first sanitized with ``self.replacement_pairs``
        (presumably set up by the base class constructor - confirm against
        the presidio version in use).

        :param pattern_text: the text matched by one of the patterns.
        :return: True if the sanitized value is a Luhn-valid digit string.
        """
        sanitized_value = self.__sanitize_value(pattern_text, self.replacement_pairs)
        return self.__luhn_checksum(sanitized_value)

    @staticmethod
    def __luhn_checksum(sanitized_value: str) -> bool:
        """
        Compute the Luhn check over ``sanitized_value``.

        :param sanitized_value: candidate number with separators removed.
        :return: True if the Luhn sum is divisible by 10; False for an empty
            value or one containing characters that are not digits.
        """
        # An empty value would otherwise sum to 0 and be reported as valid.
        if not sanitized_value:
            return False
        try:
            digits = [int(char) for char in sanitized_value]
        except ValueError:
            # Leftover non-digit characters cannot form a card number.
            return False

        # Walking from the rightmost digit: every second digit is doubled and
        # the decimal digits of that product are added to the sum.
        checksum = sum(digits[-1::-2])
        for digit in digits[-2::-2]:
            checksum += sum(divmod(digit * 2, 10))
        return checksum % 10 == 0

    @staticmethod
    def __sanitize_value(text: str, replacement_pairs: List[Tuple[str, str]]) -> str:
        """
        Apply each ``(search, replacement)`` pair to ``text`` in order.

        :param text: raw matched text.
        :param replacement_pairs: pairs passed to ``str.replace`` one by one.
        :return: the text after all replacements.
        """
        for search_string, replacement_string in replacement_pairs:
            text = text.replace(search_string, replacement_string)
        return text
7 changes: 7 additions & 0 deletions pebblo/entity_classifier/entity_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@
from presidio_analyzer.context_aware_enhancers import LemmaContextAwareEnhancer
from presidio_anonymizer import AnonymizerEngine

from pebblo.entity_classifier.custom_analyzer.cerdit_card_analyzer import (
ExtendedCreditCardRecognizer,
)
from pebblo.entity_classifier.custom_analyzer.private_key_analyzer import (
PrivateKeyRecognizer,
)
Expand Down Expand Up @@ -47,6 +50,10 @@ def custom_analyze(self):
# Add the private key recognizer to the Presidio Analyzer
self.analyzer.registry.add_recognizer(pk_recognizer)

cc_recognizer = ExtendedCreditCardRecognizer()
# Add the credit card recognizer to the Presidio Analyzer
self.analyzer.registry.add_recognizer(cc_recognizer)

def analyze_response(
self, input_text: str, anonymize_all_entities: bool = True
) -> list:
Expand Down
5 changes: 5 additions & 0 deletions pebblo/entity_classifier/utils/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ class Entities(Enum):
US_PASSPORT = "us-passport-number"
US_DRIVER_LICENSE = "us-drivers-license"

# contactinfo
EMAIL_ADDRESS = "email-address"
# network
IP_ADDRESS = "ip-address"

Expand Down Expand Up @@ -55,13 +57,16 @@ class PIIGroups(Enum):
Financial = "pii-financial"
Secrets = "secrets_and_tokens"
Network = "pii-network"
Contact = "pii-contact-information"


entity_group_conf_mapping = {
# Identification
Entities.US_SSN.value: (0.8, PIIGroups.Identification.value),
Entities.US_PASSPORT.value: (0.4, PIIGroups.Identification.value),
Entities.US_DRIVER_LICENSE.value: (0.4, PIIGroups.Identification.value),
# Contact
Entities.EMAIL_ADDRESS.value: (0.8, PIIGroups.Contact.value),
# Financial
Entities.US_ITIN.value: (0.8, PIIGroups.Financial.value),
Entities.CREDIT_CARD.value: (0.8, PIIGroups.Financial.value),
Expand Down
1 change: 1 addition & 0 deletions pebblo/reports/enums/keyword_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
"us-passport-number": "US Passport number",
"us-drivers-license": "US Drivers License",
"credit-card-number": "Credit card number",
"email-address": "Email Address",
"us-bank-account-number": "US Bank Account Number",
"iban-code": "IBAN code",
"us-itin": "US ITIN",
Expand Down
96 changes: 96 additions & 0 deletions tests/entity_classifier/test_credit_card_analyzer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import unittest

from presidio_analyzer import AnalyzerEngine

from pebblo.entity_classifier.custom_analyzer.cerdit_card_analyzer import (
ExtendedCreditCardRecognizer,
)


class TestExtendedCreditCardRecognizer(unittest.TestCase):
    """Tests for ExtendedCreditCardRecognizer registered in an AnalyzerEngine."""

    def setUp(self):
        # Fresh engine per test, with the extended recognizer added to its registry.
        self.recognizer = None
        self.analyzer = AnalyzerEngine()
        self.recognizer = ExtendedCreditCardRecognizer()
        self.analyzer.registry.add_recognizer(self.recognizer)

    def _find_cards(self, text):
        """Run the analyzer on ``text`` restricted to the CREDIT_CARD entity."""
        return self.analyzer.analyze(text, entities=["CREDIT_CARD"], language="en")

    def _assert_single_card(self, text, start, end):
        """Assert that exactly one CREDIT_CARD result spans ``[start, end)``."""
        found = self._find_cards(text)
        self.assertEqual(len(found), 1)
        hit = found[0]
        self.assertEqual(hit.entity_type, "CREDIT_CARD")
        self.assertEqual(hit.start, start)
        self.assertEqual(hit.end, end)

    def test_visa_card(self):
        """Visa number written without spaces or hyphens."""
        self._assert_single_card("My card number is 4111111111111111.", 18, 34)

    def test_visa_card_with_spaces(self):
        """Visa number written in space-separated groups."""
        self._assert_single_card("My card number is 4111 1111 1111 1111.", 18, 37)

    def test_mastercard_with_hyphens(self):
        """Mastercard number written in hyphen-separated groups."""
        self._assert_single_card("My card number is 5500-0000-0000-0004.", 18, 37)

    def test_amex_card(self):
        """American Express number."""
        self._assert_single_card("My Amex card number is 378282246310005.", 23, 38)

    def test_diners_club_card(self):
        """Diners Club number."""
        self._assert_single_card("My Diners Club card is 30569309025904.", 23, 37)

    def test_jcb_card(self):
        """JCB number."""
        self._assert_single_card("My JCB card number is 3530111333300000.", 22, 38)

    def test_invalid_card(self):
        """A digit string that is not a valid card yields no results."""
        found = self._find_cards("This is an invalid card number 1234567890123456.")
        self.assertEqual(len(found), 0)

    def test_credit_card_with_context(self):
        """Card number accompanied by context words."""
        self._assert_single_card(
            "The credit card number 4111111111111111 is valid.", 23, 39
        )

    def test_validate_result_with_luhn_checksum(self):
        """validate_result accepts a number with a correct Luhn checksum."""
        outcome = self.recognizer.validate_result("4111111111111111")
        self.assertTrue(outcome)

    def test_validate_result_invalid_luhn_checksum(self):
        """validate_result rejects a number with an incorrect Luhn checksum."""
        outcome = self.recognizer.validate_result("4111111111111112")
        self.assertFalse(outcome)
99 changes: 99 additions & 0 deletions tests/entity_classifier/test_email.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
import unittest

from presidio_analyzer import AnalyzerEngine


class TestEmailRecognizer(unittest.TestCase):
    """
    Tests for EMAIL_ADDRESS detection through a default AnalyzerEngine.

    NOTE(review): the addresses in the positive test cases were reconstructed
    so that each one's length agrees with the start/end offsets asserted in
    its test; confirm them against the project's canonical test data.
    """

    def setUp(self):
        # Set up a default analyzer; no custom recognizer is registered here.
        self.analyzer = AnalyzerEngine()

    def test_basic_email(self):
        """Basic email detection."""
        text = "My email is john.doe@example.com."
        results = self.analyzer.analyze(text, entities=["EMAIL_ADDRESS"], language="en")
        self.assertEqual(len(results), 1)
        self.assertEqual(results[0].entity_type, "EMAIL_ADDRESS")
        self.assertEqual(results[0].start, 12)
        self.assertEqual(results[0].end, 32)

    def test_email_with_numbers(self):
        """Email with numbers in the username."""
        text = "Contact me at john123@example.com."
        results = self.analyzer.analyze(text, entities=["EMAIL_ADDRESS"], language="en")
        self.assertEqual(len(results), 1)
        self.assertEqual(results[0].entity_type, "EMAIL_ADDRESS")
        self.assertEqual(results[0].start, 14)
        self.assertEqual(results[0].end, 33)

    def test_email_with_subdomain(self):
        """Email with a subdomain."""
        text = "My work email is john.doe@mail.example.com."
        results = self.analyzer.analyze(text, entities=["EMAIL_ADDRESS"], language="en")
        self.assertEqual(len(results), 1)
        self.assertEqual(results[0].entity_type, "EMAIL_ADDRESS")
        self.assertEqual(results[0].start, 17)
        self.assertEqual(results[0].end, 42)

    def test_email_with_special_characters(self):
        """Email with special characters in the local part."""
        text = "My email is john.doe+alias@example.com."
        results = self.analyzer.analyze(text, entities=["EMAIL_ADDRESS"], language="en")
        self.assertEqual(len(results), 1)
        self.assertEqual(results[0].entity_type, "EMAIL_ADDRESS")
        self.assertEqual(results[0].start, 12)
        self.assertEqual(results[0].end, 38)

    def test_multiple_emails(self):
        """Text with multiple emails; results are expected in text order."""
        text = "Emails: alice@example.com, bob@example.com."
        results = self.analyzer.analyze(text, entities=["EMAIL_ADDRESS"], language="en")
        self.assertEqual(len(results), 2)
        self.assertEqual(results[0].entity_type, "EMAIL_ADDRESS")
        self.assertEqual(results[0].start, 8)
        self.assertEqual(results[0].end, 25)
        self.assertEqual(results[1].entity_type, "EMAIL_ADDRESS")
        self.assertEqual(results[1].start, 27)
        self.assertEqual(results[1].end, 42)

    def test_invalid_email_missing_at_symbol(self):
        """Invalid email (missing '@' symbol)."""
        text = "This is not a valid email: john.doeexample.com."
        results = self.analyzer.analyze(text, entities=["EMAIL_ADDRESS"], language="en")
        self.assertEqual(len(results), 0)

    def test_invalid_email_missing_domain(self):
        """Invalid email (missing domain part)."""
        text = "Invalid email: john.doe@."
        results = self.analyzer.analyze(text, entities=["EMAIL_ADDRESS"], language="en")
        self.assertEqual(len(results), 0)

    def test_invalid_email_missing_username(self):
        """Invalid email (missing username part)."""
        text = "Invalid email: @example.com."
        results = self.analyzer.analyze(text, entities=["EMAIL_ADDRESS"], language="en")
        self.assertEqual(len(results), 0)

    def test_email_with_context(self):
        """Email with context words like 'email' present."""
        text = "Please contact me at email: john.doe@example.com."
        results = self.analyzer.analyze(text, entities=["EMAIL_ADDRESS"], language="en")
        self.assertEqual(len(results), 1)
        self.assertEqual(results[0].entity_type, "EMAIL_ADDRESS")
        self.assertEqual(results[0].start, 28)
        self.assertEqual(results[0].end, 48)

    def test_email_with_trailing_punctuation(self):
        """Email followed by punctuation; the comma must not be part of the span."""
        text = "My email is john.doe@example.com, contact me soon."
        results = self.analyzer.analyze(text, entities=["EMAIL_ADDRESS"], language="en")
        self.assertEqual(len(results), 1)
        self.assertEqual(results[0].entity_type, "EMAIL_ADDRESS")
        self.assertEqual(results[0].start, 12)
        self.assertEqual(results[0].end, 32)

    def test_invalid_email_with_special_characters(self):
        """Invalid email with special characters in the wrong places."""
        text = "Invalid email: john.doe@exam#ple.com."
        results = self.analyzer.analyze(text, entities=["EMAIL_ADDRESS"], language="en")
        self.assertEqual(len(results), 0)

0 comments on commit 7a60288

Please sign in to comment.