"""Privacy Scanner Router - PII Detection and Redaction"""
|
||
from fastapi import APIRouter, UploadFile, File, HTTPException, Form
|
||
from pydantic import BaseModel
|
||
from typing import Optional
|
||
import re
|
||
import io
|
||
import tempfile
|
||
import os
|
||
import duckdb
|
||
import json
|
||
import base64
|
||
import unicodedata
|
||
import html
|
||
|
||
router = APIRouter()
|
||
|
||
# Valid TLDs for email detection - prevents false positives on "at 7pm", "at cvs", etc.
|
||
VALID_TLDS = {
|
||
'com', 'org', 'net', 'edu', 'gov', 'io', 'co', 'ai', 'dev',
|
||
'app', 'xyz', 'info', 'biz', 'me', 'uk', 'de', 'fr', 'ca',
|
||
'au', 'in', 'jp', 'ru', 'br', 'it', 'nl', 'es', 'ch', 'se',
|
||
'no', 'fi', 'dk', 'at', 'be', 'pl', 'pt', 'ie', 'nz', 'sg',
|
||
'hk', 'kr', 'mx', 'ar', 'cl', 'za', 'us', 'mil', 'int', 'eu',
|
||
'asia', 'mobi', 'name', 'pro', 'aero', 'coop', 'museum', 'jobs',
|
||
'travel', 'xxx', 'cat', 'tel', 'post', 'club', 'online', 'site',
|
||
'tech', 'store', 'blog', 'shop', 'live', 'cloud', 'news', 'email'
|
||
}
|
||
|
||
# Words that commonly follow "at" but are NOT domains
|
||
FALSE_TRIGGER_WORDS = {
|
||
'the', 'a', 'an', 'my', 'your', 'his', 'her', 'our', 'their', 'ur',
|
||
'home', 'work', 'office', 'school', 'noon', 'night', 'midnight',
|
||
'dawn', 'dusk', 'once', 'least', 'most', 'first', 'last', 'all',
|
||
'gate', 'terminal', 'platform', 'station', 'airport', 'store',
|
||
'mall', 'gym', 'park', 'beach', 'bar', 'restaurant', 'hotel',
|
||
'clinic', 'hospital', 'bank', 'church', 'cvs', 'target', 'walmart'
|
||
}
|
||
|
||
|
||
# PII Detection Patterns
PII_PATTERNS = {
    "EMAIL": {
        # Negative lookbehind (?<!:) excludes connection strings like password@host
        "pattern": r'(?<!:)\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
        "description": "Email addresses",
        "category": "pii"
    },
    "EMAIL_OBFUSCATED": {
        # Obfuscated emails: t-e-s-t [at] example [dot] com, test(at)example(dot)com
        # IMPORTANT: Do NOT match plain " at " - only match bracketed [at], (at), or literal @
        # Plain " at " causes massive false positives ("meet at 5pm", "close at 9am", etc.)
        "pattern": r'[A-Za-z0-9](?:[-\s]*[A-Za-z0-9]){2,}\s*(?:\[at\]|\(at\))\s*[A-Za-z0-9](?:[-\s]*[A-Za-z0-9]){2,}\s*(?:\[dot\]|\(dot\)|\s+dot\s+)\s*[A-Za-z]{2,}',
        "description": "Obfuscated email addresses",
        "category": "pii"
    },
    "PHONE_US": {
        # US phone numbers: (555) 123-4567, 555-123-4567, +1 555 123 4567
        "pattern": r'\b(?:\+?1[-.\s]?)?\(?[0-9]{3}\)?[-.\s]?[0-9]{3}[-.\s]?[0-9]{4}\b',
        "description": "US Phone numbers",
        "category": "pii"
    },
    "PHONE_INTL": {
        # International phone numbers: +49 30 1234567, +44 20 7946 0958, +55 11 98765-4321
        # EU: 49(DE), 44(UK), 33(FR), 39(IT), 34(ES), 31(NL), 32(BE), 43(AT), 41(CH), 48(PL), etc.
        # LATAM: 55(BR), 52(MX), 54(AR), 56(CL), 57(CO), 51(PE)
        # APAC: 81(JP), 82(KR), 86(CN), 91(IN), 61(AU), 64(NZ), 65(SG), 852(HK)
        "pattern": r'\+(?:49|44|33|39|34|31|32|43|41|48|351|353|358|47|46|45|420|36|40|359|385|386|421|370|371|372|352|356|357|30|55|52|54|56|57|51|81|82|86|91|61|64|65|852)\s?[0-9]{1,4}[\s-]?[0-9]{3,4}[\s-]?[0-9]{3,6}\b',
        "description": "International Phone numbers (Global)",
        "category": "pii"
    },
    "SSN": {
        # US SSN: Requires separators (dashes, dots, spaces, or underscores)
        # Matches: 123-45-6789, 123.45.6789, 123 45 6789, 123_45_6789
        "pattern": r'\b\d{3}[-.\s_]\d{2}[-.\s_]\d{4}\b',
        "description": "Social Security Numbers (US)",
        "category": "pii"
    },
    "MEDICARE_ID": {
        # US Medicare Beneficiary Identifier (MBI): 11 characters, alphanumeric
        # Format: 1A12-B12-CD12 or 1-A-1-2-B-1-2-C-D-1-2 (with extra dashes)
        "pattern": r'\b[1-9][-]?[A-Z][-]?(?:[A-Z0-9][-]?){2}[A-Z][-]?(?:[A-Z0-9][-]?){2}[A-Z][-]?[A-Z][-]?[0-9][-]?[0-9]\b',
        "description": "Medicare Beneficiary Identifier (MBI)",
        "category": "pii"
    },
    "GERMAN_ID": {
        # German Personalausweisnummer (ID card): 10 characters
        # Format: T220001293 or similar
        "pattern": r'\b[A-Z0-9]{10}\b(?=.*(?:ausweis|personalausweis|id\s*card|national\s*id))',
        "description": "German National ID (Personalausweis)",
        "category": "pii"
    },
    "CREDIT_CARD": {
        # Matches Visa, Mastercard, Amex, Discover with optional spaces/dashes
        "pattern": r'\b(?:4[0-9]{3}[-\s]?[0-9]{4}[-\s]?[0-9]{4}[-\s]?[0-9]{4}|4[0-9]{12}(?:[0-9]{3})?|5[1-5][0-9]{2}[-\s]?[0-9]{4}[-\s]?[0-9]{4}[-\s]?[0-9]{4}|5[1-5][0-9]{14}|3[47][0-9]{2}[-\s]?[0-9]{6}[-\s]?[0-9]{5}|3[47][0-9]{13}|6(?:011|5[0-9]{2})[-\s]?[0-9]{4}[-\s]?[0-9]{4}[-\s]?[0-9]{4}|6(?:011|5[0-9]{2})[0-9]{12})\b',
        "description": "Credit card numbers",
        "category": "financial"
    },
    "IP_ADDRESS": {
        "pattern": r'\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b',
        "description": "IP addresses",
        "category": "pii"
    },
    "IP_DEFANGED": {
        # Defanged IPs: 192[.]168[.]1[.]1, 192[dot]168[dot]1[dot]1, 192(.)168(.)1(.)1
        "pattern": r'\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(?:\[(?:\.|\s*dot\s*)\]|\((?:\.|\s*dot\s*)\)|\[\.\])){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b',
        "description": "Defanged IP addresses",
        "category": "pii"
    },
    "DATE_OF_BIRTH": {
        # Matches multiple date formats:
        # MM/DD/YYYY, MM-DD-YYYY, DD/MM/YYYY, YYYY-MM-DD, Month DD, YYYY
        "pattern": r'\b(?:(?:0?[1-9]|1[0-2])[/-](?:0?[1-9]|[12][0-9]|3[01])[/-](?:19|20)\d{2}|(?:0?[1-9]|[12][0-9]|3[01])[/-](?:0?[1-9]|1[0-2])[/-](?:19|20)\d{2}|(?:19|20)\d{2}[-/](?:0?[1-9]|1[0-2])[-/](?:0?[1-9]|[12][0-9]|3[01])|(?:January|February|March|April|May|June|July|August|September|October|November|December|Jan|Feb|Mar|Apr|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[.,]?\s+(?:0?[1-9]|[12][0-9]|3[01])(?:st|nd|rd|th)?[.,]?\s+(?:19|20)\d{2})\b',
        "description": "Dates of birth (multiple formats)",
        "category": "pii"
    },
"IBAN": {
|
||
# International Bank Account Numbers - very flexible pattern
|
||
# Format: 2 letters (country) + 2 digits (check) + up to 30 alphanumeric (BBAN)
|
||
# Supports with/without spaces, common country formats
|
||
"pattern": r'\b(?:DE|GB|FR|ES|IT|NL|BE|AT|CH|PL|PT|IE|FI|NO|SE|DK|CZ|HU|RO|BG|HR|SI|SK|LT|LV|EE|LU|MT|CY|GR)\d{2}[\s]?[A-Z0-9]{4}[\s]?[A-Z0-9]{4}[\s]?[A-Z0-9]{4}[\s]?[A-Z0-9]{0,18}\b',
|
||
"description": "International Bank Account Numbers",
|
||
"category": "financial"
|
||
},
|
||
"BANK_ACCOUNT": {
|
||
# Generic bank account patterns (routing + account, with labels)
|
||
"pattern": r'\b(?:account|acct|a/c|acc)[:\s#]*[0-9]{8,17}\b',
|
||
"description": "Bank Account Numbers",
|
||
"category": "financial"
|
||
},
|
||
"ROUTING_NUMBER": {
|
||
# US Bank Routing/ABA numbers (9 digits)
|
||
"pattern": r'\b(?:routing|aba|rtn)[:\s#]*[0-9]{9}\b',
|
||
"description": "Bank Routing/ABA Numbers",
|
||
"category": "financial"
|
||
},
|
||
"SWIFT_BIC": {
|
||
# SWIFT/BIC codes (8 or 11 chars) - requires colon separator to avoid false positives
|
||
# Format: 4 letters (bank) + 2 letters (country ISO) + 2 alphanum (location) + optional 3 alphanum (branch)
|
||
# Pattern requires colon or "code" keyword to distinguish from "SWIFT transfer" etc.
|
||
"pattern": r'(?:swift|bic|swift/bic)(?::|(?:\s+code\s*))[:\s]*([A-Z]{4}[A-Z]{2}[A-Z0-9]{2}(?:[A-Z0-9]{3})?)\b',
|
||
"description": "SWIFT/BIC Codes",
|
||
"category": "financial"
|
||
},
|
||
"ZIP_CODE": {
|
||
"pattern": r'\b\d{5}(?:[-\s]\d{4})?\b',
|
||
"description": "US ZIP codes",
|
||
"category": "pii"
|
||
},
|
||
"DRIVERS_LICENSE": {
|
||
# US Driver's License - requires context to avoid false positives
|
||
# Most states use letter + 7-12 digits, but pattern too generic without context
|
||
"pattern": r'\b[A-Z][0-9]{7,12}\b(?=.*(?:license|licence|driver|dmv|dl[:#\s]|driving))',
|
||
"description": "Driver's license numbers",
|
||
"category": "pii"
|
||
},
|
||
# Physical Address patterns
|
||
"US_ADDRESS": {
|
||
"pattern": r'\b\d{1,5}\s+(?:[A-Za-z]+\s+){1,4}(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Drive|Dr|Lane|Ln|Court|Ct|Way|Place|Pl|Circle|Cir|Highway|Hwy)\.?(?:\s+(?:Apt|Suite|Ste|Unit|#)\s*[A-Za-z0-9-]+)?(?:,\s*[A-Za-z\s]+,\s*[A-Z]{2}\s+\d{5}(?:-\d{4})?)?\b',
|
||
"description": "US physical addresses",
|
||
"category": "pii"
|
||
},
|
||
# Secrets and API Keys
|
||
"AWS_ACCESS_KEY": {
|
||
"pattern": r'\b(?:AKIA|ABIA|ACCA|ASIA)[A-Z0-9]{16}\b',
|
||
"description": "AWS Access Key IDs",
|
||
"category": "secret"
|
||
},
|
||
"AWS_SECRET_KEY": {
|
||
"pattern": r'\b[A-Za-z0-9/+=]{40}\b',
|
||
"description": "AWS Secret Access Keys (40 char base64)",
|
||
"category": "secret"
|
||
},
|
||
"GITHUB_TOKEN": {
|
||
"pattern": r'\b(?:ghp|gho|ghu|ghs|ghr)_[A-Za-z0-9]{36,}\b',
|
||
"description": "GitHub Personal Access Tokens",
|
||
"category": "secret"
|
||
},
|
||
"GITHUB_TOKEN_CLASSIC": {
|
||
"pattern": r'\bgh[pousr]_[A-Za-z0-9]{36}\b',
|
||
"description": "GitHub Classic Tokens",
|
||
"category": "secret"
|
||
},
|
||
"SLACK_TOKEN": {
|
||
"pattern": r'\bxox[baprs]-[0-9]{10,13}-[0-9]{10,13}[a-zA-Z0-9-]*\b',
|
||
"description": "Slack API Tokens",
|
||
"category": "secret"
|
||
},
|
||
"GENERIC_API_KEY": {
|
||
"pattern": r'\b(?:api[_-]?key|apikey|api[_-]?secret|secret[_-]?key)["\']?\s*[:=]\s*["\']?([A-Za-z0-9_\-]{20,64})["\']?\b',
|
||
"description": "Generic API keys in key=value format",
|
||
"category": "secret"
|
||
},
|
||
"PASSWORD_IN_URL": {
|
||
# Passwords with = or : or "to" keyword
|
||
# Matches: password=secret, pwd: MyPass123, changed pwd to P@ssw0rd123!
|
||
"pattern": r'(?:password|passwd|pwd|pass|secret|credential)["\']?\s*(?:[:=]|(?:\s+(?:is|to|as)\s+))\s*["\']?([^\s"\'&,]{6,})["\']?',
|
||
"description": "Passwords in plaintext",
|
||
"category": "secret"
|
||
},
|
||
"PRIVATE_KEY": {
|
||
"pattern": r'-----BEGIN\s+(?:RSA\s+)?PRIVATE\s+KEY-----',
|
||
"description": "Private key headers",
|
||
"category": "secret"
|
||
},
|
||
"JWT_TOKEN": {
|
||
"pattern": r'\beyJ[A-Za-z0-9_-]*\.eyJ[A-Za-z0-9_-]*\.[A-Za-z0-9_-]*\b',
|
||
"description": "JSON Web Tokens",
|
||
"category": "secret"
|
||
},
|
||
"STRIPE_KEY": {
|
||
# Stripe keys: sk_live_xxx, pk_test_xxx - min 8 chars after prefix to catch truncated
|
||
"pattern": r'\b(?:sk|pk)_(?:test|live)_[A-Za-z0-9]{8,}\b',
|
||
"description": "Stripe API Keys",
|
||
"category": "secret"
|
||
},
|
||
"API_KEY_IN_URL": {
|
||
# API keys embedded in URLs: /key/xxx, /api_key/xxx, /token/xxx
|
||
"pattern": r'(?:/(?:key|api[_-]?key|token|secret|auth)[/=])([A-Za-z0-9_-]{16,})',
|
||
"description": "API Keys in URLs",
|
||
"category": "secret"
|
||
},
|
||
"AUTH_CODE": {
|
||
# Authorization codes: auth_code: 9921, authorization: ABC123
|
||
"pattern": r'(?:auth[_-]?code|authorization|auth[_-]?token|otp|verification[_-]?code)[:\s]+[A-Za-z0-9]{4,12}\b',
|
||
"description": "Authorization/Verification Codes",
|
||
"category": "secret"
|
||
},
|
||
"UUID": {
|
||
# UUIDs/GUIDs: 550e8400-e29b-41d4-a716-446655440000
|
||
"pattern": r'\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\b',
|
||
"description": "UUIDs/Trace IDs",
|
||
"category": "pii"
|
||
},
|
||
"GEO_COORDINATES": {
|
||
# Geo coordinates: 40.7128° N, 74.0060° W or 40.7128, -74.0060
|
||
"pattern": r'\b-?\d{1,3}\.\d{4,}°?\s*[NS]?\s*,?\s*-?\d{1,3}\.\d{4,}°?\s*[EW]?\b',
|
||
"description": "Geographic Coordinates",
|
||
"category": "pii"
|
||
},
|
||
"GOOGLE_API_KEY": {
|
||
"pattern": r'\bAIza[A-Za-z0-9_-]{35}\b',
|
||
"description": "Google API Keys",
|
||
"category": "secret"
|
||
},
|
||
# GCP and Azure Cloud Keys
|
||
"GCP_SERVICE_ACCOUNT": {
|
||
"pattern": r'\b[a-z0-9-]+@[a-z0-9-]+\.iam\.gserviceaccount\.com\b',
|
||
"description": "GCP Service Account Emails",
|
||
"category": "secret"
|
||
},
|
||
"GCP_PRIVATE_KEY_ID": {
|
||
"pattern": r'"private_key_id"\s*:\s*"([a-f0-9]{40})"',
|
||
"description": "GCP Private Key IDs in JSON",
|
||
"category": "secret"
|
||
},
|
||
"AZURE_CLIENT_SECRET": {
|
||
"pattern": r'\b[a-zA-Z0-9~._-]{34,40}\b(?=.*(?:azure|client[_-]?secret|tenant))',
|
||
"description": "Azure Client Secrets",
|
||
"category": "secret"
|
||
},
|
||
"AZURE_CONNECTION_STRING": {
|
||
# Azure connection strings - match AccountKey=xxx pattern
|
||
"pattern": r'AccountKey=[A-Za-z0-9+/=]{10,}',
|
||
"description": "Azure Storage Connection Strings",
|
||
"category": "secret"
|
||
},
|
||
"AZURE_SAS_TOKEN": {
|
||
"pattern": r'\?sv=\d{4}-\d{2}-\d{2}&[^"\s]+sig=[A-Za-z0-9%]+',
|
||
"description": "Azure SAS Tokens",
|
||
"category": "secret"
|
||
},
|
||
# International Address Patterns
|
||
"UK_POSTCODE": {
|
||
# Valid UK postcode formats: A9 9AA, A99 9AA, A9A 9AA, AA9 9AA, AA99 9AA, AA9A 9AA
|
||
# Must have space between outward and inward codes, and be word-bounded
|
||
"pattern": r'\b(?:[A-Z]{1,2}[0-9][0-9A-Z]?\s+[0-9][A-Z]{2})\b',
|
||
"description": "UK Postcodes",
|
||
"category": "pii"
|
||
},
|
||
"UK_ADDRESS": {
|
||
"pattern": r'\b\d{1,5}\s+[A-Za-z]+(?:\s+[A-Za-z]+)*\s*,\s*[A-Za-z]+(?:\s+[A-Za-z]+)*\s*,?\s*[A-Z]{1,2}[0-9][0-9A-Z]?\s*[0-9][A-Z]{2}\b',
|
||
"description": "UK Addresses with Postcode",
|
||
"category": "pii"
|
||
},
|
||
"EU_ADDRESS": {
|
||
# European addresses: German (strasse 42), French (25 rue xxx, 75016 Paris), etc
|
||
"pattern": r'\b(?:[A-Za-z]+(?:strasse|straße|street|straat|calle|via|strada|gasse|weg|platz|plein|place|plaza)\s+\d{1,5}|\d{1,5}\s+(?:rue|av\.?|avenue|boulevard|blvd\.?|chemin|allée|impasse|passage)\s+[A-Za-z][A-Za-z\s]{2,25})[,\s]+\d{4,5}\s+[A-Za-z]+',
|
||
"description": "European Addresses (DE/FR/NL/IT/ES)",
|
||
"category": "pii"
|
||
},
|
||
"INTERNATIONAL_ADDRESS": {
|
||
# Generic pattern for addresses with postal codes
|
||
"pattern": r'\b\d{1,5}\s+[A-Za-z][A-Za-z\s]{2,30},\s*[A-Za-z][A-Za-z\s]{2,20},?\s*(?:[A-Z]{2}\s*)?\d{4,6}(?:-\d{4})?\b',
|
||
"description": "International Addresses with Postal Codes",
|
||
"category": "pii"
|
||
},
|
||
# Medical IDs (distinct from passports)
|
||
"MEDICAL_RECORD_NUMBER": {
|
||
"pattern": r'\b(?:MRN|MR#|Medical\s*Record|Patient\s*ID)[:\s#]*[A-Z0-9]{6,12}\b',
|
||
"description": "Medical Record Numbers",
|
||
"category": "pii"
|
||
},
|
||
"NPI_NUMBER": {
|
||
"pattern": r'\b(?:NPI)[:\s#]*[0-9]{10}\b',
|
||
"description": "National Provider Identifier (NPI)",
|
||
"category": "pii"
|
||
},
|
||
"DEA_NUMBER": {
|
||
"pattern": r'\b[A-Z][A-Z9][0-9]{7}\b',
|
||
"description": "DEA Registration Numbers",
|
||
"category": "pii"
|
||
},
|
||
# EU VAT Numbers (detected directly via dedicated function for better accuracy)
|
||
"EU_VAT_NUMBER": {
|
||
"pattern": r'\b(?:ATU\d{8}|BE0?\d{9,10}|BG\d{9,10}|CY\d{8}[A-Z]|CZ\d{8,10}|DE\d{9}|DK\d{8}|EE\d{9}|EL\d{9}|ES[A-Z0-9]\d{7}[A-Z0-9]|FI\d{8}|FR[A-Z0-9]{2}\d{9}|HR\d{11}|HU\d{8}|IE\d[A-Z0-9+*]\d{5}[A-Z]{1,2}|IT\d{11}|LT\d{9,12}|LU\d{8}|LV\d{11}|MT\d{8}|NL\d{9}B\d{2}|PL\d{10}|PT\d{9}|RO\d{2,10}|SE\d{12}|SI\d{8}|SK\d{10}|GB\d{9,12})\b',
|
||
"description": "European Union VAT Numbers",
|
||
"category": "financial"
|
||
},
|
||
# Spelled-out SSN pattern (detected via NLP layer)
|
||
"SSN_SPELLED": {
|
||
"pattern": r'\b(?:S\.?S\.?N\.?|social\s*security)\s+(?:is\s+)?(?:(?:zero|one|two|three|four|five|six|seven|eight|nine|oh|o)\s+)+(?:dash|hyphen)\s+(?:(?:zero|one|two|three|four|five|six|seven|eight|nine|oh|o)\s+)+(?:dash|hyphen)\s+(?:(?:zero|one|two|three|four|five|six|seven|eight|nine|oh|o)\s*)+',
|
||
"description": "Spelled-out Social Security Numbers",
|
||
"category": "pii"
|
||
},
|
||
# Base64 encoded secrets (marker for decoded content)
|
||
"BASE64_SECRET": {
|
||
"pattern": r'PLACEHOLDER_FOR_DECODED_BASE64', # Detected via decode layer
|
||
"description": "Base64 Encoded Secrets",
|
||
"category": "secret"
|
||
},
|
||
# JSON embedded PII (marker for nested content)
|
||
"JSON_EMBEDDED_PII": {
|
||
"pattern": r'PLACEHOLDER_FOR_JSON_PII', # Detected via JSON extraction layer
|
||
"description": "PII Found in JSON Blobs",
|
||
"category": "pii"
|
||
},
|
||
    # =========================================================================
    # INTERNATIONAL ID PATTERNS (v1.1)
    # =========================================================================
    "UK_NATIONAL_INSURANCE": {
        # UK National Insurance Number: AB123456C
        "pattern": r'\b[A-CEGHJ-PR-TW-Z]{2}\s?\d{2}\s?\d{2}\s?\d{2}\s?[A-D]\b',
        "description": "UK National Insurance Number",
        "category": "pii"
    },
    "CANADIAN_SIN": {
        # Canadian Social Insurance Number: 123-456-789 or 123 456 789
        "pattern": r'\b\d{3}[-\s]\d{3}[-\s]\d{3}\b',
        "description": "Canadian Social Insurance Number",
        "category": "pii"
    },
    "INDIA_AADHAAR": {
        # India Aadhaar: 1234 5678 9012 (12 digits with spaces)
        "pattern": r'\b\d{4}\s\d{4}\s\d{4}\b',
        "description": "India Aadhaar Number",
        "category": "pii"
    },
    "INDIA_PAN": {
        # India PAN: ABCDE1234F (5 letters, 4 digits, 1 letter)
        "pattern": r'\b[A-Z]{5}\d{4}[A-Z]\b',
        "description": "India PAN Card Number",
        "category": "pii"
    },
    "AUSTRALIA_TFN": {
        # Australian Tax File Number: 123 456 789 (9 digits)
        "pattern": r'\b\d{3}\s\d{3}\s\d{3}\b',
        "description": "Australian Tax File Number",
        "category": "pii"
    },
    "BRAZIL_CPF": {
        # Brazil CPF: 123.456.789-00
        "pattern": r'\b\d{3}\.\d{3}\.\d{3}-\d{2}\b',
        "description": "Brazil CPF Number",
        "category": "pii"
    },
    "MEXICO_CURP": {
        # Mexico CURP: 18 alphanumeric
        "pattern": r'\b[A-Z]{4}\d{6}[HM][A-Z]{5}[A-Z\d]{2}\b',
        "description": "Mexico CURP",
        "category": "pii"
    },
    "SOUTH_AFRICA_ID": {
        # South Africa ID: 13 digits YYMMDDGGGSCAZ
        "pattern": r'\b\d{2}(?:0[1-9]|1[0-2])(?:0[1-9]|[12]\d|3[01])\d{4}[01]\d{2}\b',
        "description": "South Africa ID Number",
        "category": "pii"
    },
    # =========================================================================
    # ADDITIONAL CLOUD TOKENS (v1.1)
    # =========================================================================
    "DISCORD_TOKEN": {
        # Discord bot/user tokens: base64.base64.base64 format
        "pattern": r'\b[MN][A-Za-z\d]{23,}\.[\w-]{6}\.[\w-]{27}\b',
        "description": "Discord Bot/User Tokens",
        "category": "secret"
    },
    "DISCORD_WEBHOOK": {
        "pattern": r'https://discord(?:app)?\.com/api/webhooks/\d+/[\w-]+',
        "description": "Discord Webhook URLs",
        "category": "secret"
    },
    "TWILIO_API_KEY": {
        "pattern": r'\bSK[a-f0-9]{32}\b',
        "description": "Twilio API Keys",
        "category": "secret"
    },
    "TWILIO_AUTH_TOKEN": {
        "pattern": r'\b[a-f0-9]{32}\b(?=.*(?:twilio|auth_token))',
        "description": "Twilio Auth Tokens",
        "category": "secret"
    },
    "SENDGRID_API_KEY": {
        "pattern": r'\bSG\.[A-Za-z0-9_-]{22}\.[A-Za-z0-9_-]{43}\b',
        "description": "SendGrid API Keys",
        "category": "secret"
    },
    "OPENAI_API_KEY": {
        "pattern": r'\bsk-[A-Za-z0-9]{48}\b',
        "description": "OpenAI API Keys",
        "category": "secret"
    },
    "ANTHROPIC_API_KEY": {
        "pattern": r'\bsk-ant-[A-Za-z0-9_-]{40,}\b',
        "description": "Anthropic API Keys",
        "category": "secret"
    },
    "MAILCHIMP_API_KEY": {
        "pattern": r'\b[a-f0-9]{32}-us\d{1,2}\b',
        "description": "Mailchimp API Keys",
        "category": "secret"
    },
    "MAILGUN_API_KEY": {
        "pattern": r'\bkey-[a-f0-9]{32}\b',
        "description": "Mailgun API Keys",
        "category": "secret"
    },
    "HEROKU_API_KEY": {
        "pattern": r'\b[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}\b',
        "description": "Heroku API Keys",
        "category": "secret"
    },
    "SHOPIFY_ACCESS_TOKEN": {
        "pattern": r'\bshpat_[a-fA-F0-9]{32}\b',
        "description": "Shopify Access Tokens",
        "category": "secret"
    },
    "SHOPIFY_SHARED_SECRET": {
        "pattern": r'\bshpss_[a-fA-F0-9]{32}\b',
        "description": "Shopify Shared Secrets",
        "category": "secret"
    },
    "NPM_TOKEN": {
        "pattern": r'\bnpm_[A-Za-z0-9]{36}\b',
        "description": "NPM Access Tokens",
        "category": "secret"
    },
    "PYPI_TOKEN": {
        "pattern": r'\bpypi-[A-Za-z0-9_-]{50,}\b',
        "description": "PyPI API Tokens",
        "category": "secret"
    },
    "DOCKER_AUTH": {
        "pattern": r'"auth"\s*:\s*"[A-Za-z0-9+/=]{20,}"',
        "description": "Docker Registry Auth",
        "category": "secret"
    },
    # =========================================================================
    # CRYPTO ADDRESSES (v1.1)
    # =========================================================================
    "BITCOIN_ADDRESS": {
        # Bitcoin: 1xxx, 3xxx (P2SH), bc1xxx (Bech32)
        "pattern": r'\b(?:[13][a-km-zA-HJ-NP-Z1-9]{25,34}|bc1[a-zA-HJ-NP-Z0-9]{39,59})\b',
        "description": "Bitcoin Wallet Addresses",
        "category": "financial"
    },
    "ETHEREUM_ADDRESS": {
        # Ethereum: 0x followed by 40 hex chars
        "pattern": r'\b0x[a-fA-F0-9]{40}\b',
        "description": "Ethereum Wallet Addresses",
        "category": "financial"
    },
    "MONERO_ADDRESS": {
        # Monero: 4 or 8 followed by 93 or 103 chars
        "pattern": r'\b[48][0-9AB][1-9A-HJ-NP-Za-km-z]{93}\b',
        "description": "Monero Wallet Addresses",
        "category": "financial"
    },
    # =========================================================================
    # FINANCIAL IDENTIFIERS (v1.1)
    # =========================================================================
    "CUSIP": {
        # CUSIP: 9 alphanumeric chars (US/Canada securities)
        "pattern": r'\b[A-Z0-9]{9}\b(?=.*(?:cusip|security|stock|bond))',
        "description": "CUSIP Security Identifiers",
        "category": "financial"
    },
    "ISIN": {
        # ISIN: Valid country code + 9 alphanumeric (NSIN) + 1 check digit
        # Requires financial context to reduce false positives
        "pattern": r'\b(?:US|GB|DE|FR|CH|JP|CA|AU|NL|BE|IT|ES|AT|SE|NO|DK|FI|IE|PT|LU|HK|SG|KR|TW|IN|BR|ZA|MX)[A-Z0-9]{9}\d\b(?=.*(?:isin|security|stock|bond|share|equity|fund))',
        "description": "ISIN Security Identifiers",
        "category": "financial"
    },
    "SEDOL": {
        # SEDOL: 7 alphanumeric (UK securities)
        "pattern": r'\b[B-DF-HJ-NP-TV-Z0-9]{7}\b(?=.*(?:sedol|london|lse))',
        "description": "SEDOL Security Identifiers",
        "category": "financial"
    }
}
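# Illustrative sketch (not part of the original module): each entry in
# PII_PATTERNS is a plain regex, so a minimal scan over a text value could look
# like the loop below. `sample_text` is a hypothetical variable; the real
# pipeline applies these patterns inside detect_pii_standard() together with
# the normalization, scoring, and validation layers defined further down.
#
#     sample_text = "Contact jane.doe@example.com or 555-123-4567"
#     for pii_type, spec in PII_PATTERNS.items():
#         for match in re.finditer(spec["pattern"], sample_text):
#             print(pii_type, match.group())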


class PIIEntity(BaseModel):
    type: str
    value: str
    start: int
    end: int
    confidence: float
    masked_value: str


class ScanResult(BaseModel):
    total_entities: int
    entities_by_type: dict
    entities: list[PIIEntity]
    risk_level: str
    risk_score: int
    redacted_preview: Optional[str] = None


class ScanConfig(BaseModel):
    detect_emails: bool = True
    detect_phones: bool = True
    detect_ssn: bool = True
    detect_credit_cards: bool = True
    detect_ip_addresses: bool = True
    detect_dates: bool = True
    detect_zip_codes: bool = False
    detect_addresses: bool = True
    detect_iban: bool = True
    detect_secrets: bool = True  # AWS keys, GitHub tokens, passwords, etc.


# ============================================================================
# FALSE POSITIVE EXCLUSION PATTERNS (v1.1)
# ============================================================================

# Patterns to EXCLUDE from detection (known false positives)
FALSE_POSITIVE_PATTERNS = {
    # Git commit SHAs (40 hex chars) - not AWS secrets
    "GIT_SHA": r'\b[a-f0-9]{40}\b',
    # Unix timestamps (13 digits) - not phone numbers
    "UNIX_TIMESTAMP_MS": r'\b1[0-9]{12}\b',
    # Unix timestamps (10 digits) - not phone numbers
    "UNIX_TIMESTAMP": r'\b1[0-9]{9}\b',
    # Slack webhook path identifiers (T/B followed by alphanumeric)
    "SLACK_WEBHOOK_PATH": r'T[A-Z0-9]{8,}/B[A-Z0-9]{8,}',
    # Common test/example patterns
    "TEST_PATTERN": r'\b(?:test|example|sample|demo|fake|dummy|xxx+|000[-\s]?000[-\s]?0000)\b',
    # Version numbers (could look like dates)
    "VERSION_NUMBER": r'\bv?\d+\.\d+\.\d+(?:\.\d+)?\b',
    # File hashes (MD5, SHA variants)
    "FILE_HASH_MD5": r'\b[a-f0-9]{32}\b(?!.*(?:twilio|auth_token))',
    "FILE_HASH_SHA256": r'\b[a-f0-9]{64}\b',
}


# ============================================================================
# LAYER 2.0: Pre-Processing - Unicode & HTML Normalization (v1.1)
# ============================================================================

def normalize_unicode(text: str) -> str:
    """
    Layer 2.0a: Apply Unicode NFKC normalization to catch:
    - Fullwidth characters: ｊｏｈｎ＠ｅｘａｍｐｌｅ．ｃｏｍ → john@example.com
    - Unicode dots: john․doe@example․com → john.doe@example.com
    - Other homoglyphs and compatibility characters
    """
    return unicodedata.normalize('NFKC', text)


def decode_html_entities(text: str) -> str:
    """
    Layer 2.0b: Decode HTML entities to catch:
    - john&at;example&dot;com → john@example.com
    - john&#64;example.com → john@example.com
    - &amp; &lt; &gt; etc.
    """
    # First pass: standard HTML entities
    decoded = html.unescape(text)

    # Second pass: custom obfuscation patterns
    custom_entities = [
        (r'&at;', '@'),
        (r'&dot;', '.'),
        (r'&#64;', '@'),   # @ in decimal
        (r'&#46;', '.'),   # . in decimal
        (r'&#x40;', '@'),  # @ in hex
        (r'&#x2e;', '.'),  # . in hex
    ]
    for pattern, replacement in custom_entities:
        decoded = re.sub(pattern, replacement, decoded, flags=re.IGNORECASE)

    return decoded


def preprocess_text(text: str) -> tuple[str, list[dict]]:
    """
    Layer 2.0: Full pre-processing pipeline combining Unicode and HTML normalization.
    Returns preprocessed text and a log of transformations applied.
    """
    transformations = []
    processed = text

    # Step 1: Unicode NFKC normalization
    unicode_normalized = normalize_unicode(processed)
    if unicode_normalized != processed:
        transformations.append({
            "layer": "2.0a",
            "type": "unicode_nfkc",
            "changes": len(processed) - len(unicode_normalized)
        })
        processed = unicode_normalized

    # Step 2: HTML entity decoding
    html_decoded = decode_html_entities(processed)
    if html_decoded != processed:
        transformations.append({
            "layer": "2.0b",
            "type": "html_entity_decode",
            "changes": len(processed) - len(html_decoded)
        })
        processed = html_decoded

    return processed, transformations
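# Illustrative example (hypothetical input, not from the original module):
# preprocess_text("ｊａｎｅ&#64;ｅｘａｍｐｌｅ.com") folds the fullwidth characters via
# NFKC and decodes the &#64; entity, yielding "jane@example.com" plus one
# transformation log entry each for layers 2.0a and 2.0b.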


def is_false_positive(value: str, pii_type: str, context: str = "") -> bool:
    """
    Check if a detected value is a known false positive pattern.
    Returns True if the value should be excluded.
    """
    value_lower = value.lower()
    context_lower = context.lower() if context else ""

    # Git SHA check (for AWS_SECRET_KEY false positives)
    if pii_type == "AWS_SECRET_KEY":
        # If it's all lowercase hex and 40 chars, it's probably a git SHA
        if re.match(r'^[a-f0-9]{40}$', value):
            return True
        # Also check if "commit" or "sha" in context
        if any(kw in context_lower for kw in ["commit", "sha", "hash", "git", "rev"]):
            return True

    # Timestamp check (for PHONE_US false positives)
    if pii_type in ["PHONE_US", "PHONE_US_NORMALIZED"]:
        # Check if it looks like a timestamp (13 digits starting with 1)
        digits_only = re.sub(r'\D', '', value)
        if len(digits_only) == 13 and digits_only.startswith('1'):
            return True
        if len(digits_only) == 10 and digits_only.startswith('1'):
            return True

    # Slack webhook path check (for DRIVERS_LICENSE false positives)
    if pii_type == "DRIVERS_LICENSE":
        if re.match(r'^T[A-Z0-9]{8,}$', value) or re.match(r'^B[A-Z0-9]{8,}$', value):
            if "slack" in context_lower or "webhook" in context_lower:
                return True
        # B followed by 7 digits could be HK ID or other codes, need context
        if re.match(r'^B\d{7}$', value):
            # Only valid if context suggests driver's license
            if not any(kw in context_lower for kw in ["license", "licence", "driver", "dmv", "dl#", "dl:"]):
                return True

    # Date check for log timestamps (for DATE_OF_BIRTH false positives)
    if pii_type == "DATE_OF_BIRTH":
        # Log timestamps often have time component nearby
        if any(kw in context_lower for kw in ["log", "timestamp", "created", "updated", "time:", "at ", ":"]):
            if re.search(r'\d{2}:\d{2}:\d{2}', context):  # Has time component
                return True

    # Version number check
    if pii_type != "IP_ADDRESS" and re.match(r'^v?\d+\.\d+\.\d+', value):
        return True

    # EMAIL and EMAIL_OBFUSCATED false positives - connection strings and URIs
    if pii_type in ["EMAIL", "EMAIL_OBFUSCATED"]:
        # Connection string pattern: ://user:password@host
        # This catches mongodb+srv://user:pass@cluster, redis://default:pass@host, etc.
        if re.search(r'://[^@]*:[^@]*@', context):
            return True
        # MongoDB/database connection strings: password@cluster.mongodb.net
        if any(db in value_lower for db in [".mongodb.net", ".mongodb.com", "cluster0", "cluster1"]):
            return True
        # Redis connection strings
        if any(db in value_lower for db in [".redis", "redis-", "redislabs"]):
            return True
        # Other database connection patterns
        if any(db in context_lower for db in ["mongodb://", "mongodb+srv://", "redis://", "postgres://", "mysql://", "amqp://", "connection_string", "conn_str"]):
            return True
        # Part of URL (e.g., user:pass@host pattern)
        if re.search(r'://[^@]+' + re.escape(value), context):
            return True

    # UK_POSTCODE false positives - partial passwords, codes
    if pii_type == "UK_POSTCODE":
        # Too short or looks like part of a password/hash
        if len(value) < 6:
            return True
        # Check if it's surrounded by alphanumeric chars (part of longer string)
        value_pos = context.find(value)
        if value_pos > 0:
            char_before = context[value_pos - 1] if value_pos > 0 else ' '
            char_after = context[value_pos + len(value)] if value_pos + len(value) < len(context) else ' '
            if char_before.isalnum() or char_after.isalnum():
                return True

    # ISIN false positives - needs context validation
    if pii_type == "ISIN":
        # ISIN should have country code followed by alphanumeric
        # Exclude if it looks like a passport number or other ID
        if any(kw in context_lower for kw in ["passport", "travel", "visa", "id card", "identity"]):
            return True
        # Exclude random-looking alphanumeric strings without financial context
        if not any(kw in context_lower for kw in ["isin", "security", "stock", "bond", "cusip", "sedol", "share", "equity", "fund"]):
            return True

    # BANK_ACCOUNT false positives - exclude credit card numbers
    if pii_type == "BANK_ACCOUNT":
        digits_only = re.sub(r'\D', '', value)
        # If it's 15-16 digits and passes Luhn, it's a credit card, not bank account
        if len(digits_only) in [15, 16]:
            # Luhn check inline
            digits = [int(d) for d in digits_only]
            odd_digits = digits[-1::-2]
            even_digits = digits[-2::-2]
            total = sum(odd_digits)
            for d in even_digits:
                total += sum(divmod(d * 2, 10))
            if total % 10 == 0:
                return True  # It's a credit card, not a bank account

    # INDIA_AADHAAR false positives - exclude IBAN fragments
    if pii_type == "INDIA_AADHAAR":
        # If context suggests IBAN or European bank context, it's likely an IBAN fragment
        if any(kw in context_lower for kw in ["iban", "bic", "swift", "sepa", "bank", "de", "gb", "fr", "nl", "at", "ch"]):
            return True
        # Check if preceded by 2-letter country code + 2 digits (IBAN prefix pattern)
        # Pattern: XX## followed by the detected value (spaces in the value allowed)
        iban_prefix_pattern = r'[A-Z]{2}\d{2}\s*' + re.escape(value).replace(re.escape(' '), r'\s*')
        if re.search(iban_prefix_pattern, context, re.IGNORECASE):
            return True

    # CANADIAN_SIN false positives - exclude Australian ABN format
    if pii_type == "CANADIAN_SIN":
        # ABN format is similar (9 digits in 3-3-3) but in Australian context
        if any(kw in context_lower for kw in ["abn", "australia", "australian", "business number", "gst"]):
            return True

    # CREDIT_CARD false positives - exclude order/confirmation/reference numbers
    if pii_type in ["CREDIT_CARD", "POSSIBLE_CARD_PATTERN"]:
        # Check for common prefixes that indicate non-card numbers
        order_prefixes = ["order", "order #", "order#", "order:", "order number",
                          "conf", "confirmation", "conf #", "conf#", "confirmation #",
                          "ref", "reference", "ref #", "ref#", "reference #",
                          "invoice", "invoice #", "inv #", "inv#",
                          "tracking", "tracking #", "track #",
                          "ticket", "ticket #", "case #", "case#",
                          "transaction id", "trans id", "txn"]
        if any(prefix in context_lower for prefix in order_prefixes):
            return True
        # Check for pattern: "# followed immediately by the number"
        if re.search(r'#\s*' + re.escape(value), context):
            return True

    return False
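# Illustrative behaviour (hypothetical values): a 40-char lowercase hex string
# reported as AWS_SECRET_KEY is treated as a git SHA and excluded, and
# is_false_positive("4111 1111 1111 1111", "CREDIT_CARD", "Order # 4111 1111 1111 1111")
# returns True because an order-number prefix appears in the surrounding context.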


# ============================================================================
# LAYER 2: Text Normalization Functions
# ============================================================================

def normalize_text(text: str) -> tuple[str, dict]:
    """
    Layer 2: Normalize text to reveal hidden PII.
    Returns normalized text and a mapping of original positions.

    Handles:
    - [dot] / (dot) / [.] → .
    - [at] / (at) → @
    - Dashes/underscores in numbers → removed
    - Spaced out characters → joined
    """
    normalized = text
    transformations = []

    # Track transformations for position mapping
    # 1. Convert defanged dots: [dot], (dot), [.], (.), [ dot ], etc.
    # Also handle with surrounding spaces: " [dot] " → "."
    dot_patterns = [
        (r'\s*\[\s*dot\s*\]\s*', '.'),  # [dot], [ dot ], etc with optional spaces
        (r'\s*\(\s*dot\s*\)\s*', '.'),  # (dot), ( dot ), etc with optional spaces
        (r'\s*\[\.\]\s*', '.'),         # [.]
        (r'\s*\(\.\)\s*', '.'),         # (.)
        (r'\s+dot\s+', '.'),            # " dot " (word boundary)
    ]
    for pattern, replacement in dot_patterns:
        normalized = re.sub(pattern, replacement, normalized, flags=re.IGNORECASE)

    # 2. Convert defanged at: [at], (at), [ at ], etc.
    # IMPORTANT: Bracketed forms [at] and (at) are always intentional obfuscation
    # But plain " at " must be validated to avoid "meet at 5pm" → "meet@5pm"

    # 2a. Bracketed forms - always convert (these are intentional obfuscation)
    bracketed_at_patterns = [
        (r'\s*\[\s*at\s*\]\s*', '@'),  # [at], [ at ], etc with optional spaces
        (r'\s*\(\s*at\s*\)\s*', '@'),  # (at), ( at ), etc with optional spaces
    ]
    for pattern, replacement in bracketed_at_patterns:
        normalized = re.sub(pattern, replacement, normalized, flags=re.IGNORECASE)

    # 2b. Plain " at " - ONLY convert if it looks like an email pattern:
    # - Preceded by username-like chars (alphanumeric, dots, underscores)
    # - Followed by domain-like pattern (word + dot/space-dot + TLD)
    # Valid: "john at gmail dot com", "mike at company.org"
    # Invalid: "meet at 5pm", "look at this", "ready at cvs", "at gate b7"
    def should_normalize_at(text: str, at_position: int) -> bool:
        """
        Robust validation for ' at ' to '@' conversion.
        Returns True only if this looks like a genuine obfuscated email.

        Validates:
        - Username pattern before "at"
        - Rejects false trigger words after "at" (the, my, home, cvs, etc.)
        - Rejects time patterns ("at 7pm", "at 2:30")
        - Domain pattern with valid TLD after "at"
        """
        # Get context around the "at"
        before_text = text[:at_position]
        after_text = text[at_position + 4:]  # Skip " at "

        # 1. Check username pattern (must end with valid email username chars)
        username_match = re.search(r'([a-zA-Z0-9][a-zA-Z0-9._%+-]{0,63})$', before_text)
        if not username_match:
            return False

        # 2. Get the first word after "at" and check for false triggers
        first_word_match = re.match(r'([a-zA-Z0-9]+)', after_text)
        if not first_word_match:
            return False

        first_word = first_word_match.group(1).lower()

        # Reject if first word is a common false trigger
        if first_word in FALSE_TRIGGER_WORDS:
            return False

        # 3. Reject time patterns: "at 7pm", "at 2:30", "at noon"
        time_pattern = r'^[0-9]{1,2}(?::[0-9]{2})?(?:\s*(?:am|pm|AM|PM))?\s*(?:$|[.,!?\s])'
        if re.match(time_pattern, after_text):
            return False

        # 4. Check for valid domain pattern with TLD
        # Pattern: domain + (dot or " dot ") + valid TLD
        domain_pattern = r'^([a-zA-Z0-9][a-zA-Z0-9-]{0,61}[a-zA-Z0-9]?)(?:\.|(?:\s+dot\s+))([a-zA-Z]{2,10})(?:\s|$|[.,!?])'
        domain_match = re.match(domain_pattern, after_text, re.IGNORECASE)

        if domain_match:
            potential_tld = domain_match.group(2).lower()
            if potential_tld in VALID_TLDS:
                return True

        return False

    def smart_at_replacement(match):
        """Wrapper function for re.sub that uses should_normalize_at()"""
        full_match = match.group(0)
        before = match.group(1)  # Username-like part
        after = match.group(2)   # Domain-like part

        # Find the position of " at " in the original normalized text
        # The match.start() gives us where the full match begins
        # We need to check from where "at" actually appears
        at_pos_in_match = len(before)  # " at " starts right after the username
        at_pos_in_text = match.start() + at_pos_in_match

        if should_normalize_at(normalized, at_pos_in_text):
            return before + '@' + after
        return full_match  # Return unchanged

    # Pattern: (username-like) + " at " + (potential domain)
    normalized = re.sub(
        r'([a-zA-Z0-9][a-zA-Z0-9._%+-]*)\s+at\s+([a-zA-Z0-9][a-zA-Z0-9.\s]*)',
        smart_at_replacement,
        normalized,
        flags=re.IGNORECASE
    )

    # 3. Remove separators from potential numbers (SSN, CC, phone)
    # Only in numeric contexts: sequences that look like numbers with separators
    # Pattern: digit, separator(s), digit, separator(s), digit...
    def clean_numeric_separators(match):
        """Remove dashes, underscores, spaces from numeric sequences"""
        return re.sub(r'[-_\s]', '', match.group())

    # Match sequences that look like formatted numbers (3+ digit groups)
    normalized = re.sub(
        r'\b(\d{2,4})[-_\s]+(\d{2,4})[-_\s]+(\d{2,4})(?:[-_\s]+(\d{2,4}))?\b',
        clean_numeric_separators,
        normalized
    )

    # 4. Join spaced-out characters (t-e-s-t → test, t e s t → test)
    # Only for sequences that look intentionally obfuscated (alternating char-separator)
    def join_spaced_chars(match):
        """Join characters separated by dashes or spaces"""
        chars = re.findall(r'[A-Za-z0-9]', match.group())
        return ''.join(chars)

    # Match spaced-out patterns: a-b-c-d or a b c d (3+ chars)
    normalized = re.sub(
        r'\b([A-Za-z0-9])(?:[-\s]([A-Za-z0-9])){3,}\b',
        join_spaced_chars,
        normalized
    )

    return normalized, {"original_length": len(text), "normalized_length": len(normalized)}
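# Illustrative example (hypothetical input): normalize_text("jane [at] example [dot] com")
# rewrites the bracketed tokens and returns "jane@example.com" as the normalized
# text, while "meet at 5pm" is left untouched because the plain " at " fails the
# should_normalize_at() checks.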


# ============================================================================
# LAYER 2.5: JSON Blob Extraction & Deep Scanning
# ============================================================================

def extract_json_strings(text: str) -> list[tuple[str, int, int]]:
    """
    Extract JSON objects embedded in text for deep scanning.
    Returns list of (json_string, start_pos, end_pos)
    """
    json_strings = []

    # Pattern to find JSON-like structures: {...} or escaped JSON strings
    # Look for JSON objects that might be embedded as string values
    json_patterns = [
        # Standard JSON objects
        r'(\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\})',
        # Escaped JSON in string values (e.g., "{\"key\": \"value\"}")
        r'"(\{(?:\\"|[^"])*\})"',
    ]

    for pattern in json_patterns:
        for match in re.finditer(pattern, text):
            json_str = match.group(1) if match.lastindex else match.group()
            # Unescape if it was escaped JSON
            if json_str.startswith('{') and '\\"' in json_str:
                json_str = json_str.replace('\\"', '"')
            json_strings.append((json_str, match.start(), match.end()))

    return json_strings


def deep_scan_json(json_str: str) -> list[str]:
    """
    Recursively extract all string values from a JSON structure.
    Returns list of string values to scan.
    """
    extracted_values = []

    def extract_strings(obj):
        if isinstance(obj, str):
            extracted_values.append(obj)
        elif isinstance(obj, dict):
            for key, value in obj.items():
                # Also check keys for PII (e.g., "ssn", "email")
                extracted_values.append(f"{key}: {value}" if isinstance(value, str) else key)
                extract_strings(value)
        elif isinstance(obj, list):
            for item in obj:
                extract_strings(item)

    try:
        parsed = json.loads(json_str)
        extract_strings(parsed)
    except json.JSONDecodeError:
        # If not valid JSON, try to extract key-value pairs with regex
        # Pattern for "key": "value" pairs
        kv_pattern = r'"([^"]+)"\s*:\s*"([^"]*)"'
        for match in re.finditer(kv_pattern, json_str):
            extracted_values.append(f"{match.group(1)}: {match.group(2)}")

    return extracted_values
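# Illustrative example (hypothetical input): deep_scan_json('{"email": "jane@example.com"}')
# yields ['email: jane@example.com', 'jane@example.com'], so both the key/value
# pair and the bare value get re-scanned by the regular PII patterns.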


# ============================================================================
# LAYER 2.6: Base64 Auto-Detection & Decoding
# ============================================================================

def is_valid_base64(s: str) -> bool:
    """Check if a string is valid base64 encoded."""
    # Must be at least 8 chars and multiple of 4 (with padding)
    if len(s) < 8:
        return False

    # Check for valid base64 characters
    base64_pattern = r'^[A-Za-z0-9+/]+=*$'
    if not re.match(base64_pattern, s):
        return False

    # Length check (with padding, should be multiple of 4)
    if len(s) % 4 != 0:
        return False

    return True


def decode_base64_strings(text: str, max_depth: int = 2) -> list[tuple[str, str, int, int, int]]:
    """
    Find and decode base64 strings in text with recursive decoding support (v1.1).
    Returns list of (original_b64, decoded_text, start_pos, end_pos, decode_depth)

    Now supports:
    - Recursive decoding (double-encoded base64)
    - JSON extraction from decoded content
    """
    decoded_strings = []

    # Pattern to find potential base64 strings (min 16 chars to avoid false positives)
    # Candidates are further filtered by is_valid_base64() and a printability check below
    b64_pattern = r'\b([A-Za-z0-9+/]{16,}={0,2})\b'

    def recursive_decode(b64_str: str, start: int, end: int, depth: int = 1):
        """Recursively decode base64 strings up to max_depth"""
        if depth > max_depth:
            return

        if not is_valid_base64(b64_str):
            return

        try:
            decoded = base64.b64decode(b64_str).decode('utf-8', errors='ignore')

            # Only keep if decoded text is printable and looks like real text
            if decoded and len(decoded) >= 4:
                printable_ratio = sum(1 for c in decoded if c.isprintable()) / len(decoded)
                letter_ratio = sum(1 for c in decoded if c.isalpha()) / len(decoded)

                if printable_ratio > 0.7 and letter_ratio > 0.2:
                    decoded_strings.append((b64_str, decoded, start, end, depth))

                    # Recursive: Check if decoded content contains more base64
                    if depth < max_depth:
                        for inner_match in re.finditer(b64_pattern, decoded):
                            recursive_decode(
                                inner_match.group(1),
                                start,  # Keep original position
                                end,
                                depth + 1
                            )

                    # Also check if decoded content is JSON and scan it
                    if decoded.strip().startswith('{') or decoded.strip().startswith('['):
                        try:
                            json.loads(decoded)
                            # Valid JSON - will be picked up by JSON layer
                        except json.JSONDecodeError:
                            pass

        except Exception:
            pass

    for match in re.finditer(b64_pattern, text):
        b64_str = match.group(1)
        recursive_decode(b64_str, match.start(), match.end())

    return decoded_strings
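# Illustrative example (hypothetical input): the token "dXNlckBleGFtcGxlLmNvLnVr"
# decodes to "user@example.co.uk", so decode_base64_strings() would return it at
# depth 1, and the decoded text is then re-scanned by the email/PII patterns.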


# ============================================================================
# LAYER 2.7: Spelled-Out Number Detection (NLP-lite)
# ============================================================================

# Word to digit mapping
WORD_TO_DIGIT = {
    'zero': '0', 'one': '1', 'two': '2', 'three': '3', 'four': '4',
    'five': '5', 'six': '6', 'seven': '7', 'eight': '8', 'nine': '9',
    'oh': '0', 'o': '0',  # "oh" is often used for zero
}

def convert_spelled_numbers(text: str) -> tuple[str, list[tuple[str, str, int, int]]]:
    """
    Convert spelled-out numbers to digits.
    Returns (converted_text, list of (original, converted, start, end))

    Examples:
    - "nine zero zero dash one two dash eight eight two one" → "900-12-8821"
    - "four one five dash five five five dash one two one two" → "415-555-1212"
    """
    conversions = []
    result = text

    # Build pattern for number words
    number_words = '|'.join(WORD_TO_DIGIT.keys())

    # Pattern: sequence of number words separated by spaces, possibly with "dash" or "hyphen"
    # Must have at least 3 number words to be considered a spelled-out number
    spelled_pattern = rf'\b((?:(?:{number_words})\s*)+(?:(?:dash|hyphen)\s*(?:(?:{number_words})\s*)+)+)\b'

    matches = list(re.finditer(spelled_pattern, text, re.IGNORECASE))

    for match in reversed(matches):  # Process in reverse to maintain positions
        original = match.group(1)

        # Convert words to digits
        converted_parts = []
        current_number = ''

        words = re.split(r'\s+', original.lower())
        for word in words:
            word = word.strip()
            if word in WORD_TO_DIGIT:
                current_number += WORD_TO_DIGIT[word]
            elif word in ['dash', 'hyphen']:
                if current_number:
                    converted_parts.append(current_number)
                    current_number = ''

        if current_number:
            converted_parts.append(current_number)

        if converted_parts:
            converted = '-'.join(converted_parts)
            conversions.append((original, converted, match.start(), match.end()))
            result = result[:match.start()] + converted + result[match.end():]

    return result, conversions
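# Illustrative example (hypothetical input): convert_spelled_numbers(
#     "SSN is nine zero zero dash one two dash eight eight two one")
# splices "900-12-8821" into the text and records one (original, converted,
# start, end) tuple, so the regular SSN regex can match the rewritten value.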


# ============================================================================
# LAYER 2.8: Non-Latin Character Support & International Patterns
# ============================================================================

# European VAT Number patterns by country
EU_VAT_PATTERNS = {
    "AT": r'ATU\d{8}',  # Austria
    "BE": r'BE0?\d{9,10}',  # Belgium
    "BG": r'BG\d{9,10}',  # Bulgaria
    "CY": r'CY\d{8}[A-Z]',  # Cyprus
    "CZ": r'CZ\d{8,10}',  # Czech Republic
    "DE": r'DE\d{9}',  # Germany
    "DK": r'DK\d{8}',  # Denmark
    "EE": r'EE\d{9}',  # Estonia
    "EL": r'EL\d{9}',  # Greece (uses EL not GR)
    "ES": r'ES[A-Z0-9]\d{7}[A-Z0-9]',  # Spain
    "FI": r'FI\d{8}',  # Finland
    "FR": r'FR[A-Z0-9]{2}\d{9}',  # France
    "HR": r'HR\d{11}',  # Croatia
    "HU": r'HU\d{8}',  # Hungary
    "IE": r'IE\d[A-Z0-9+*]\d{5}[A-Z]{1,2}',  # Ireland
    "IT": r'IT\d{11}',  # Italy
    "LT": r'LT\d{9,12}',  # Lithuania
    "LU": r'LU\d{8}',  # Luxembourg
    "LV": r'LV\d{11}',  # Latvia
    "MT": r'MT\d{8}',  # Malta
    "NL": r'NL\d{9}B\d{2}',  # Netherlands
    "PL": r'PL\d{10}',  # Poland
    "PT": r'PT\d{9}',  # Portugal
    "RO": r'RO\d{2,10}',  # Romania
    "SE": r'SE\d{12}',  # Sweden
    "SI": r'SI\d{8}',  # Slovenia
    "SK": r'SK\d{10}',  # Slovakia
    "GB": r'GB\d{9,12}',  # UK (pre-Brexit, still used)
}

# Combined EU VAT pattern
EU_VAT_COMBINED = '|'.join(f'({pattern})' for pattern in EU_VAT_PATTERNS.values())

# Greek character transliteration (for detecting Greek names/text)
GREEK_TO_LATIN = {
    'α': 'a', 'β': 'b', 'γ': 'g', 'δ': 'd', 'ε': 'e', 'ζ': 'z', 'η': 'h',
    'θ': 'th', 'ι': 'i', 'κ': 'k', 'λ': 'l', 'μ': 'm', 'ν': 'n', 'ξ': 'x',
    'ο': 'o', 'π': 'p', 'ρ': 'r', 'σ': 's', 'ς': 's', 'τ': 't', 'υ': 'y',
    'φ': 'f', 'χ': 'ch', 'ψ': 'ps', 'ω': 'o',
    'Α': 'A', 'Β': 'B', 'Γ': 'G', 'Δ': 'D', 'Ε': 'E', 'Ζ': 'Z', 'Η': 'H',
    'Θ': 'Th', 'Ι': 'I', 'Κ': 'K', 'Λ': 'L', 'Μ': 'M', 'Ν': 'N', 'Ξ': 'X',
    'Ο': 'O', 'Π': 'P', 'Ρ': 'R', 'Σ': 'S', 'Τ': 'T', 'Υ': 'Y',
    'Φ': 'F', 'Χ': 'Ch', 'Ψ': 'Ps', 'Ω': 'O',
    # Accented variants
    'ά': 'a', 'έ': 'e', 'ή': 'h', 'ί': 'i', 'ό': 'o', 'ύ': 'y', 'ώ': 'o',
    'Ά': 'A', 'Έ': 'E', 'Ή': 'H', 'Ί': 'I', 'Ό': 'O', 'Ύ': 'Y', 'Ώ': 'O',
}

# Cyrillic character transliteration
CYRILLIC_TO_LATIN = {
    'а': 'a', 'б': 'b', 'в': 'v', 'г': 'g', 'д': 'd', 'е': 'e', 'ё': 'yo',
    'ж': 'zh', 'з': 'z', 'и': 'i', 'й': 'y', 'к': 'k', 'л': 'l', 'м': 'm',
    'н': 'n', 'о': 'o', 'п': 'p', 'р': 'r', 'с': 's', 'т': 't', 'у': 'u',
    'ф': 'f', 'х': 'kh', 'ц': 'ts', 'ч': 'ch', 'ш': 'sh', 'щ': 'shch',
    'ъ': '', 'ы': 'y', 'ь': '', 'э': 'e', 'ю': 'yu', 'я': 'ya',
    'А': 'A', 'Б': 'B', 'В': 'V', 'Г': 'G', 'Д': 'D', 'Е': 'E', 'Ё': 'Yo',
    'Ж': 'Zh', 'З': 'Z', 'И': 'I', 'Й': 'Y', 'К': 'K', 'Л': 'L', 'М': 'M',
    'Н': 'N', 'О': 'O', 'П': 'P', 'Р': 'R', 'С': 'S', 'Т': 'T', 'У': 'U',
    'Ф': 'F', 'Х': 'Kh', 'Ц': 'Ts', 'Ч': 'Ch', 'Ш': 'Sh', 'Щ': 'Shch',
    'Ъ': '', 'Ы': 'Y', 'Ь': '', 'Э': 'E', 'Ю': 'Yu', 'Я': 'Ya',
}


def transliterate_text(text: str) -> str:
    """Transliterate Greek and Cyrillic characters to Latin."""
    result = text

    # Apply Greek transliteration
    for greek, latin in GREEK_TO_LATIN.items():
        result = result.replace(greek, latin)

    # Apply Cyrillic transliteration
    for cyrillic, latin in CYRILLIC_TO_LATIN.items():
        result = result.replace(cyrillic, latin)

    return result


def detect_eu_vat_numbers(text: str) -> list[tuple[str, str, int, int]]:
    """
    Detect EU VAT numbers in text.
    Returns list of (vat_number, country_code, start, end)
    """
    vat_matches = []

    for country, pattern in EU_VAT_PATTERNS.items():
        for match in re.finditer(pattern, text):
            vat_matches.append((match.group(), country, match.start(), match.end()))

    return vat_matches


def has_non_latin_chars(text: str) -> bool:
    """Check if text contains non-Latin characters (Greek, Cyrillic, etc.)"""
    # Check for Greek (U+0370–U+03FF) or Cyrillic (U+0400–U+04FF)
    return bool(re.search(r'[\u0370-\u03FF\u0400-\u04FF]', text))
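# Illustrative examples (hypothetical inputs): transliterate_text("Иван") returns
# "Ivan", and detect_eu_vat_numbers("VAT: DE123456789") returns
# [("DE123456789", "DE", 5, 16)].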


# ============================================================================
# LAYER 3: Context-Based Confidence Scoring
# ============================================================================

# Context keywords that boost confidence when found near PII
CONTEXT_KEYWORDS = {
    "SSN": ["ssn", "social security", "social sec", "ss#", "ss #", "ss number"],
    "CREDIT_CARD": ["card", "credit", "debit", "visa", "mastercard", "amex", "payment", "cc#", "card number"],
    "EMAIL": ["email", "e-mail", "mail", "contact", "reach", "@"],
    "PHONE_US": ["phone", "tel", "telephone", "mobile", "cell", "call", "fax", "contact"],
    "PHONE_INTL": ["phone", "tel", "telephone", "mobile", "cell", "call", "fax", "contact"],
    "IP_ADDRESS": ["ip", "address", "server", "host", "network"],
    "IP_DEFANGED": ["ip", "address", "server", "host", "network", "indicator", "ioc"],
    "PASSWORD_IN_URL": ["password", "pwd", "pass", "secret", "credential", "login", "auth"],
    "IBAN": ["iban", "bank", "account", "transfer", "payment", "wire"],
    "BANK_ACCOUNT": ["account", "bank", "routing", "aba", "deposit"],
    "AWS_ACCESS_KEY": ["aws", "amazon", "access", "key", "iam", "credential"],
    "GITHUB_TOKEN": ["github", "token", "api", "pat", "access"],
    "STRIPE_KEY": ["stripe", "payment", "api", "key", "publishable", "secret"],
    "MEDICARE_ID": ["medicare", "mbi", "beneficiary", "cms", "health"],
    "DATE_OF_BIRTH": ["dob", "birth", "born", "birthday", "age"],
    "GEO_COORDINATES": ["location", "coordinates", "lat", "lng", "latitude", "longitude", "gps"],
    "AUTH_CODE": ["auth", "code", "otp", "verification", "2fa", "mfa"],
}

# Anti-context: keywords that reduce confidence (false positive indicators)
ANTI_CONTEXT_KEYWORDS = {
    "CREDIT_CARD": ["test", "example", "sample", "demo", "fake", "dummy", "xxxx", "part", "sku", "isbn", "serial", "order", "invoice", "ref", "batch", "processing"],
    "SSN": ["test", "example", "sample", "demo", "fake", "dummy", "000-00-0000", "123-45-6789", "part", "sku", "serial", "order", "ref", "batch", "processing", "validation"],
    "PHONE_US": ["test", "example", "555-", "000-000", "sample", "demo", "fake"],
    "IP_ADDRESS": ["example", "test", "localhost", "127.0.0.1", "0.0.0.0"],
    "EMAIL": ["example.com", "test.com", "example.org", "noreply", "sample", "demo"],
    "DATE_OF_BIRTH": ["batch", "created", "updated", "modified", "expires", "valid", "effective", "due", "ship", "order", "processing"],
    "IBAN": ["test", "example", "sample", "demo", "fake", "dummy", "validation"],
}


def is_future_date(date_str: str) -> bool:
    """Check if a date string represents a future date (not a valid DOB)."""
    from datetime import datetime, date
    current_year = datetime.now().year
    current_date = date.today()

    # Try to extract year from common date formats
    full_year_match = re.search(r'((?:19|20)\d{2})', date_str)
    if full_year_match:
        year = int(full_year_match.group(1))

        # Future year = definitely not a birth date
        if year > current_year:
            return True

        # Current year - check if the full date is in the future
        if year == current_year:
            # Try to parse the full date
            # Common formats: MM-DD-YYYY, MM/DD/YYYY, YYYY-MM-DD
            date_patterns = [
                (r'(\d{1,2})[-/](\d{1,2})[-/](\d{4})', 'MDY'),  # MM-DD-YYYY
                (r'(\d{4})[-/](\d{1,2})[-/](\d{1,2})', 'YMD'),  # YYYY-MM-DD
            ]
            for pattern, fmt in date_patterns:
                match = re.search(pattern, date_str)
                if match:
                    try:
                        if fmt == 'MDY':
                            month, day, _ = int(match.group(1)), int(match.group(2)), int(match.group(3))
                        else:  # YMD
                            _, month, day = int(match.group(1)), int(match.group(2)), int(match.group(3))
                        parsed_date = date(year, month, day)
                        if parsed_date > current_date:
                            return True
                    except ValueError:
                        pass  # Invalid date

        # Very old date (before 1900) = probably not a birth date
        if year < 1900:
            return True

    return False


def apply_context_scoring(text: str, entities: list[PIIEntity], window: int = 100) -> list[PIIEntity]:
    """
    Layer 3: Adjust confidence scores based on surrounding context.

    Args:
        text: Original text
        entities: List of detected PII entities
        window: Number of characters to look around each entity

    Returns:
        Entities with adjusted confidence scores
    """
    text_lower = text.lower()
    adjusted_entities = []

    for entity in entities:
        # Get context window around the entity
        start = max(0, entity.start - window)
        end = min(len(text), entity.end + window)
        context = text_lower[start:end]

        confidence_adjustment = 0.0

        # Check for boosting keywords
        if entity.type in CONTEXT_KEYWORDS:
            for keyword in CONTEXT_KEYWORDS[entity.type]:
                if keyword in context:
                    confidence_adjustment += 0.10  # Boost 10% per matching keyword (increased)
                    break  # Only apply once per type

        # Check for anti-context (false positive indicators)
        if entity.type in ANTI_CONTEXT_KEYWORDS:
            for keyword in ANTI_CONTEXT_KEYWORDS[entity.type]:
                if keyword in context:
                    confidence_adjustment -= 0.30  # Reduce 30% for test/example data (increased penalty)
                    break

        # LOGIC GATE: Date validation - future dates cannot be birth dates
        if entity.type == "DATE_OF_BIRTH":
            if is_future_date(entity.value):
                confidence_adjustment -= 0.50  # Heavily penalize future dates

        # Apply adjustment (cap between 0.3 and 0.99)
        new_confidence = max(0.3, min(0.99, entity.confidence + confidence_adjustment))

        adjusted_entities.append(PIIEntity(
            type=entity.type,
            value=entity.value,
            start=entity.start,
            end=entity.end,
            confidence=new_confidence,
            masked_value=entity.masked_value
        ))

    return adjusted_entities
|
||
|
||
|
||
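# Worked example for apply_context_scoring above (illustrative numbers): an
# SSN-formatted match detected at confidence 0.95 that appears near the word
# "serial" (an ANTI_CONTEXT_KEYWORDS entry for SSN) is adjusted to
# max(0.3, min(0.99, 0.95 - 0.30)) = 0.65; a DATE_OF_BIRTH whose value is a
# future date loses a further 0.50, with 0.30 as the floor.
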
# ============================================================================
# LAYER 4: Checksum Validation Functions
# ============================================================================

def validate_iban(iban: str) -> bool:
    """
    Validate IBAN using MOD-97 checksum algorithm (ISO 7064).
    Returns True if valid, False otherwise.
    """
    # Remove spaces and convert to uppercase
    iban = re.sub(r'\s', '', iban).upper()

    # Check minimum length
    if len(iban) < 15:
        return False

    # Move first 4 chars to end
    rearranged = iban[4:] + iban[:4]

    # Convert letters to numbers (A=10, B=11, ..., Z=35)
    numeric = ''
    for char in rearranged:
        if char.isdigit():
            numeric += char
        elif char.isalpha():
            numeric += str(ord(char) - ord('A') + 10)
        else:
            return False  # Invalid character

    # Perform MOD-97 check
    try:
        return int(numeric) % 97 == 1
    except ValueError:
        return False


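# Quick check of validate_iban above, using the widely published example IBAN
# (illustrative, not executed here):
#
#     >>> validate_iban("GB82 WEST 1234 5698 7654 32")
#     True
#     >>> validate_iban("GB82 WEST 1234 5698 7654 33")   # last digit altered
#     False
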
def validate_luhn(number: str) -> bool:
    """Validate number using Luhn algorithm (credit cards, etc.)"""
    digits = [int(d) for d in re.sub(r'\D', '', number)]
    if len(digits) < 8:
        return False

    checksum = 0
    for i, d in enumerate(reversed(digits)):
        if i % 2 == 1:
            d *= 2
            if d > 9:
                d -= 9
        checksum += d
    return checksum % 10 == 0


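# Illustrative results for validate_luhn above (not executed here):
#
#     >>> validate_luhn("4111 1111 1111 1111")   # well-known Luhn-valid test number
#     True
#     >>> validate_luhn("4111 1111 1111 1112")
#     False
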
# ============================================================================
# Multi-Layer Detection Pipeline
# ============================================================================

def detect_pii_multilayer(text: str, config: Optional[ScanConfig] = None, coordinates_only: bool = False) -> list[PIIEntity]:
    """
    Multi-layer PII detection pipeline (v1.1):
    - Layer 2.0: Pre-processing (Unicode NFKC + HTML entity decode)
    - Layer 1: Standard regex matching (high-speed DuckDB-compatible)
    - Layer 2: Text normalization to catch obfuscated PII
    - Layer 2.5: JSON blob extraction and deep scanning
    - Layer 2.6: Base64 auto-decoding (with recursive decode + JSON scan)
    - Layer 2.7: Spelled-out number detection (NLP-lite)
    - Layer 2.8: Non-Latin character support (Greek, Cyrillic)
    - Layer 3: Context-based confidence adjustment
    - Layer 4: Checksum verification (Luhn, IBAN MOD-97)
    - Layer 5: False positive filtering
    """
    if config is None:
        config = ScanConfig()

    # Layer 2.0: Pre-processing - Unicode NFKC + HTML entity decode (v1.1)
    preprocessed_text, preprocessing_log = preprocess_text(text)

    # For coordinates_only mode: if preprocessing changed text length,
    # we need to use original text for detection to get accurate coordinates
    if coordinates_only and len(preprocessed_text) != len(text):
        # Text length changed during preprocessing - use original text for accurate coords
        detection_text = text
    else:
        detection_text = preprocessed_text

    # Layer 1: Standard detection
    entities = detect_pii_standard(detection_text, config)

    # Layer 2: Normalize text and re-scan for hidden PII
    normalized_text, transform_info = normalize_text(text)
    if normalized_text != text:
        # Only re-scan if normalization changed something
        normalized_entities = detect_pii_standard(normalized_text, config)

        # Mark normalized entities and add unique ones
        existing_values = {e.value.lower() for e in entities}
        for entity in normalized_entities:
            if entity.value.lower() not in existing_values:
                # This is a newly discovered entity from normalization
                # Reduce confidence slightly as it required normalization
                entity_dict = entity.model_dump()
                entity_dict['confidence'] = entity.confidence * 0.95
                entity_dict['type'] = f"{entity.type}_NORMALIZED"
                entities.append(PIIEntity(**entity_dict))
                existing_values.add(entity.value.lower())

    # Layer 2.5: JSON Blob Extraction & Deep Scanning
    existing_values = {e.value.lower() for e in entities}
    json_blobs = extract_json_strings(text)
    for json_str, start_pos, end_pos in json_blobs:
        extracted_values = deep_scan_json(json_str)
        for extracted in extracted_values:
            # Scan extracted JSON values for PII
            json_entities = detect_pii_standard(extracted, config)
            for entity in json_entities:
                if entity.value.lower() not in existing_values:
                    # Mark as found within JSON, adjust positions to original text
                    entities.append(PIIEntity(
                        type=f"{entity.type}_IN_JSON",
                        value=entity.value,
                        start=start_pos,  # Use JSON blob position
                        end=end_pos,
                        confidence=entity.confidence * 0.90,  # Slight reduction for nested detection
                        masked_value=entity.masked_value
                    ))
                    existing_values.add(entity.value.lower())

    # Layer 2.6: Base64 Auto-Decoding (v1.1 - recursive + JSON scan)
    decoded_b64_strings = decode_base64_strings(preprocessed_text)
    for original_b64, decoded_text, start_pos, end_pos, depth in decoded_b64_strings:
        # Scan decoded base64 content for PII/secrets
        b64_entities = detect_pii_standard(decoded_text, config)

        # Also scan decoded JSON content if present
        if decoded_text.strip().startswith('{') or decoded_text.strip().startswith('['):
            try:
                json_values = deep_scan_json(decoded_text)
                for json_val in json_values:
                    json_entities = detect_pii_standard(json_val, config)
                    for je in json_entities:
                        if je.value.lower() not in existing_values:
                            entities.append(PIIEntity(
                                type=f"{je.type}_BASE64_JSON",
                                value=je.value,
                                start=start_pos,
                                end=end_pos,
                                confidence=je.confidence * 0.80,  # Lower for double-nested
                                masked_value=je.masked_value
                            ))
                            existing_values.add(je.value.lower())
            except Exception:
                pass

        for entity in b64_entities:
            if entity.value.lower() not in existing_values:
                # Mark as found in base64 encoded content
                depth_suffix = f"_DEPTH{depth}" if depth > 1 else ""
                entities.append(PIIEntity(
                    type=f"{entity.type}_BASE64{depth_suffix}",
                    value=entity.value,
                    start=start_pos,
                    end=end_pos,
                    confidence=entity.confidence * (0.85 ** depth),  # Reduce more for deeper encoding
                    masked_value=entity.masked_value
                ))
                existing_values.add(entity.value.lower())

        # Also flag the base64 string itself if it decoded to sensitive content
        if b64_entities:
            entities.append(PIIEntity(
                type="BASE64_ENCODED_SECRET",
                value=original_b64[:50] + "..." if len(original_b64) > 50 else original_b64,
                start=start_pos,
                end=end_pos,
                confidence=0.85,
                masked_value="[BASE64 ENCODED CONTENT REDACTED]"
            ))

    # Layer 2.7: Spelled-Out Number Detection
    converted_text, conversions = convert_spelled_numbers(text)
    if conversions:
        # Scan the converted text for SSN patterns
        for original, converted, start_pos, end_pos in conversions:
            # Check if converted looks like an SSN (XXX-XX-XXXX format)
            ssn_match = re.match(r'^(\d{3})-(\d{2})-(\d{4})$', converted)
            if ssn_match:
                entities.append(PIIEntity(
                    type="SSN_SPELLED_OUT",
                    value=converted,
                    start=start_pos,
                    end=end_pos,
                    confidence=0.90,  # High confidence for spelled-out SSN
                    masked_value=f"***-**-{ssn_match.group(3)}"
                ))

    # Layer 2.8: Non-Latin Character Support
    if has_non_latin_chars(text):
        # Transliterate and re-scan
        transliterated = transliterate_text(text)
        if transliterated != text:
            trans_entities = detect_pii_standard(transliterated, config)
            for entity in trans_entities:
                if entity.value.lower() not in existing_values:
                    entities.append(PIIEntity(
                        type=f"{entity.type}_TRANSLITERATED",
                        value=entity.value,
                        start=entity.start,
                        end=entity.end,
                        confidence=entity.confidence * 0.90,
                        masked_value=entity.masked_value
                    ))
                    existing_values.add(entity.value.lower())

    # Layer 3: Apply context-based scoring
    entities = apply_context_scoring(text, entities)

    # Layer 4: Apply checksum verification for applicable types
    verified_entities = []
    for entity in entities:
        if entity.type == "IBAN":
            if validate_iban(entity.value):
                # Valid IBAN - boost confidence
                verified_entities.append(PIIEntity(
                    type=entity.type,
                    value=entity.value,
                    start=entity.start,
                    end=entity.end,
                    confidence=min(0.99, entity.confidence + 0.10),
                    masked_value=entity.masked_value
                ))
            else:
                # Invalid checksum - cap at 30% ceiling (consistent across all types)
                verified_entities.append(PIIEntity(
                    type="POSSIBLE_IBAN_PATTERN",
                    value=entity.value,
                    start=entity.start,
                    end=entity.end,
                    confidence=0.30,  # Consistent 30% ceiling for invalid checksums
                    masked_value=entity.masked_value
                ))
        elif entity.type == "UUID":
            # UUIDs are typically trace IDs, not PII - cap at 40% max confidence
            verified_entities.append(PIIEntity(
                type=entity.type,
                value=entity.value,
                start=entity.start,
                end=entity.end,
                confidence=min(0.40, entity.confidence),  # Cap UUID confidence at 40%
                masked_value=entity.masked_value
            ))
        elif entity.type == "CREDIT_CARD" or entity.type == "CREDIT_CARD_NORMALIZED":
            # LOGIC GATE: Apply Luhn validation - reclassify invalid credit cards
            if validate_luhn(entity.value):
                # Valid credit card - boost confidence
                verified_entities.append(PIIEntity(
                    type=entity.type,
                    value=entity.value,
                    start=entity.start,
                    end=entity.end,
                    confidence=min(0.99, entity.confidence + 0.10),
                    masked_value=entity.masked_value
                ))
            else:
                # Invalid Luhn checksum - reclassify as POSSIBLE_CARD_PATTERN
                # Check if context suggests it's test/example data - if so, discard
                context_start = max(0, entity.start - 50)
                context_end = min(len(text), entity.end + 50)
                context = text[context_start:context_end].lower()

                # Discard if context clearly indicates test/example data
                if any(kw in context for kw in ["test", "example", "sample", "demo", "fake", "dummy"]):
                    # Skip this entity entirely - it's test data
                    continue

                # Reclassify as POSSIBLE_CARD_PATTERN with 30% ceiling (consistent)
                verified_entities.append(PIIEntity(
                    type="POSSIBLE_CARD_PATTERN",
                    value=entity.value,
                    start=entity.start,
                    end=entity.end,
                    confidence=0.30,  # Consistent 30% ceiling for invalid checksums
                    masked_value=mask_value(entity.value, "CREDIT_CARD")
                ))
        else:
            verified_entities.append(entity)

    # Deduplicate by position AND by overlapping normalized variants
    # Step 1: Group by exact position
    deduplicated = {}
    for entity in verified_entities:
        key = (entity.start, entity.end)
        if key not in deduplicated or entity.confidence > deduplicated[key].confidence:
            deduplicated[key] = entity

    # Step 2: Merge overlapping entities of same base type (e.g., PHONE_US vs PHONE_US_NORMALIZED)
    # This prevents showing both "PHONE_US" and "PHONE_US_NORMALIZED" for the same number
    def get_base_type(pii_type: str) -> str:
        """Get base type without suffixes like _NORMALIZED, _IN_JSON, _BASE64, etc."""
        suffixes = ['_NORMALIZED', '_IN_JSON', '_BASE64', '_TRANSLITERATED', '_SPELLED_OUT']
        for suffix in suffixes:
            if pii_type.endswith(suffix):
                return pii_type[:-len(suffix)]
        return pii_type

    def normalize_value_for_comparison(value: str, pii_type: str) -> str:
        """Normalize a value for duplicate comparison (strip formatting)."""
        base = get_base_type(pii_type)
        if base in ['PHONE_US', 'PHONE_INTL', 'SSN', 'CREDIT_CARD']:
            # For these types, compare just the digits
            return re.sub(r'\D', '', value)
        return value.lower().strip()

    def overlaps(e1, e2, threshold: int = 5) -> bool:
        """Check if two entities overlap or are within threshold characters."""
        return not (e1.end + threshold < e2.start or e2.end + threshold < e1.start)

    def same_normalized_value(e1, e2) -> bool:
        """Check if two entities represent the same underlying value."""
        return normalize_value_for_comparison(e1.value, e1.type) == normalize_value_for_comparison(e2.value, e2.type)

    # Sort by start position for overlap detection
    sorted_entities = sorted(deduplicated.values(), key=lambda x: x.start)
    merged = []

    for entity in sorted_entities:
        base_type = get_base_type(entity.type)
        merged_with_existing = False

        # Check if this overlaps with any existing entity of same base type
        # OR has the same normalized value (handles PHONE_US vs PHONE_US_NORMALIZED with different positions)
        for i, existing in enumerate(merged):
            existing_base = get_base_type(existing.type)

            if base_type == existing_base:
                # Merge if overlapping OR same value (for _NORMALIZED variants)
                if overlaps(entity, existing) or same_normalized_value(entity, existing):
                    # Keep the one with higher confidence
                    if entity.confidence > existing.confidence:
                        merged[i] = entity
                    merged_with_existing = True
                    break

        if not merged_with_existing:
            merged.append(entity)

    deduplicated = {(e.start, e.end): e for e in merged}

    # Layer 5: False positive filtering (v1.1)
    filtered_entities = []
    for entity in deduplicated.values():
        # Get context for false positive check
        context_start = max(0, entity.start - 50)
        context_end = min(len(detection_text), entity.end + 50)
        context = detection_text[context_start:context_end]

        # Skip if detected as false positive
        if is_false_positive(entity.value, entity.type, context):
            continue

        filtered_entities.append(entity)

    # Layer 6: Cross-type overlap resolution
    # Sort by confidence (desc), then by length (desc) to prioritize better matches
    sorted_by_priority = sorted(filtered_entities, key=lambda x: (-x.confidence, -(x.end - x.start)))

    final_entities = []
    for entity in sorted_by_priority:
        # Check if this entity overlaps with any already accepted entity
        overlaps_existing = False
        for existing in final_entities:
            # Check for any overlap
            if not (entity.end <= existing.start or entity.start >= existing.end):
                overlaps_existing = True
                break
        if not overlaps_existing:
            final_entities.append(entity)

    # Sort by position for output
    result = sorted(final_entities, key=lambda x: x.start)

    return result


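# Illustrative call into the pipeline above (fabricated values; the exact entity
# list depends on the context-scoring and false-positive layers, not executed here):
#
#     >>> hits = detect_pii_multilayer("Reach j.smith@acmemail.io, card 4111 1111 1111 1111")
#     >>> sorted({e.type for e in hits})
#     ['CREDIT_CARD', 'EMAIL']   # typical result for this input
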
def detect_pii_standard(text: str, config: Optional[ScanConfig] = None) -> list[PIIEntity]:
    """Layer 1: Standard regex-based PII detection"""
    if config is None:
        config = ScanConfig()

    entities = []

    # Map config to pattern types
    type_mapping = {
        "EMAIL": config.detect_emails,
        # Phone numbers (US and International)
        "PHONE_US": config.detect_phones,
        "PHONE_INTL": config.detect_phones,
        # Identity documents
        "SSN": config.detect_ssn,
        "MEDICARE_ID": config.detect_ssn,  # Group with SSN as it's US health ID
        "GERMAN_ID": True,  # Always detect national IDs
        "CREDIT_CARD": config.detect_credit_cards,
        "IP_ADDRESS": config.detect_ip_addresses,
        "DATE_OF_BIRTH": config.detect_dates,
        "ZIP_CODE": config.detect_zip_codes,
        # Addresses (US and International)
        "US_ADDRESS": config.detect_addresses,
        "UK_POSTCODE": config.detect_addresses,
        "UK_ADDRESS": config.detect_addresses,
        "EU_ADDRESS": config.detect_addresses,
        "INTERNATIONAL_ADDRESS": config.detect_addresses,
        # Financial
        "IBAN": config.detect_iban,
        "BANK_ACCOUNT": config.detect_iban,
        "ROUTING_NUMBER": config.detect_iban,
        "SWIFT_BIC": config.detect_iban,
        # Medical IDs (always on - high sensitivity)
        "MEDICAL_RECORD_NUMBER": True,
        "NPI_NUMBER": True,
        "DEA_NUMBER": True,
        # Secrets - AWS
        "AWS_ACCESS_KEY": config.detect_secrets,
        "AWS_SECRET_KEY": config.detect_secrets,
        # Secrets - GitHub
        "GITHUB_TOKEN": config.detect_secrets,
        "GITHUB_TOKEN_CLASSIC": config.detect_secrets,
        # Secrets - GCP/Azure
        "GCP_SERVICE_ACCOUNT": config.detect_secrets,
        "GCP_PRIVATE_KEY_ID": config.detect_secrets,
        "AZURE_CLIENT_SECRET": config.detect_secrets,
        "AZURE_CONNECTION_STRING": config.detect_secrets,
        "AZURE_SAS_TOKEN": config.detect_secrets,
        # Secrets - Other
        "SLACK_TOKEN": config.detect_secrets,
        "GENERIC_API_KEY": config.detect_secrets,
        "PASSWORD_IN_URL": config.detect_secrets,
        "PRIVATE_KEY": config.detect_secrets,
        "JWT_TOKEN": config.detect_secrets,
        "STRIPE_KEY": config.detect_secrets,
        "GOOGLE_API_KEY": config.detect_secrets,
        "API_KEY_IN_URL": config.detect_secrets,
        "AUTH_CODE": config.detect_secrets,
        # Evasion-resistant patterns
        "EMAIL_OBFUSCATED": config.detect_emails,
        "IP_DEFANGED": config.detect_ip_addresses,
        # Identifiers
        "UUID": True,  # Always detect UUIDs/trace IDs
        "GEO_COORDINATES": config.detect_addresses,  # Group with addresses
        # =====================================================================
        # NEW v1.1 PATTERNS
        # =====================================================================
        # International IDs
        "UK_NATIONAL_INSURANCE": True,
        "CANADIAN_SIN": True,
        "INDIA_AADHAAR": True,
        "INDIA_PAN": True,
        "AUSTRALIA_TFN": True,
        "BRAZIL_CPF": True,
        "MEXICO_CURP": True,
        "SOUTH_AFRICA_ID": True,
        # Additional cloud tokens
        "DISCORD_TOKEN": config.detect_secrets,
        "DISCORD_WEBHOOK": config.detect_secrets,
        "TWILIO_API_KEY": config.detect_secrets,
        "TWILIO_AUTH_TOKEN": config.detect_secrets,
        "SENDGRID_API_KEY": config.detect_secrets,
        "OPENAI_API_KEY": config.detect_secrets,
        "ANTHROPIC_API_KEY": config.detect_secrets,
        "MAILCHIMP_API_KEY": config.detect_secrets,
        "MAILGUN_API_KEY": config.detect_secrets,
        "HEROKU_API_KEY": config.detect_secrets,
        "SHOPIFY_ACCESS_TOKEN": config.detect_secrets,
        "SHOPIFY_SHARED_SECRET": config.detect_secrets,
        "NPM_TOKEN": config.detect_secrets,
        "PYPI_TOKEN": config.detect_secrets,
        "DOCKER_AUTH": config.detect_secrets,
        # Crypto addresses
        "BITCOIN_ADDRESS": config.detect_iban,  # Group with financial
        "ETHEREUM_ADDRESS": config.detect_iban,
        "MONERO_ADDRESS": config.detect_iban,
        # Financial identifiers
        "CUSIP": config.detect_iban,
        "ISIN": config.detect_iban,
        "SEDOL": config.detect_iban,
    }

    for pii_type, pattern_info in PII_PATTERNS.items():
        # Check if this type should be detected
        if pii_type in type_mapping and not type_mapping[pii_type]:
            continue

        pattern = pattern_info["pattern"]

        for match in re.finditer(pattern, text, re.IGNORECASE):
            value = match.group()

            # Calculate confidence based on pattern specificity
            confidence = 0.85
            if pii_type == "EMAIL":
                confidence = 0.95
            elif pii_type == "SSN":
                # Pattern now requires dashes, so high confidence
                confidence = 0.95
            elif pii_type in ["PHONE_US", "PHONE_INTL"]:
                confidence = 0.85
            elif pii_type == "MEDICARE_ID":
                confidence = 0.90
            elif pii_type == "GERMAN_ID":
                confidence = 0.85  # Requires context (ausweis keyword)
            elif pii_type == "CREDIT_CARD":
                # Validate with Luhn algorithm
                if validate_credit_card(value):
                    confidence = 0.95
                else:
                    confidence = 0.60
            elif pii_type == "ZIP_CODE":
                confidence = 0.70  # Could be any 5-digit number
            elif pii_type in ["US_ADDRESS", "UK_ADDRESS", "EU_ADDRESS", "INTERNATIONAL_ADDRESS"]:
                confidence = 0.80
            elif pii_type == "UK_POSTCODE":
                confidence = 0.85
            elif pii_type == "IBAN":
                # Validate IBAN length and format
                clean_iban = re.sub(r'\s', '', value)
                if len(clean_iban) >= 15 and len(clean_iban) <= 34:
                    confidence = 0.95
                else:
                    confidence = 0.70
            elif pii_type in ["BANK_ACCOUNT", "ROUTING_NUMBER"]:
                confidence = 0.85
            elif pii_type == "SWIFT_BIC":
                confidence = 0.80  # Could be other 8/11 char codes
            # Cloud provider keys
            elif pii_type in ["AWS_ACCESS_KEY", "GITHUB_TOKEN", "GITHUB_TOKEN_CLASSIC", "STRIPE_KEY", "GOOGLE_API_KEY"]:
                confidence = 0.95  # Very specific patterns
            elif pii_type in ["GCP_SERVICE_ACCOUNT", "GCP_PRIVATE_KEY_ID"]:
                confidence = 0.95
            elif pii_type in ["AZURE_CONNECTION_STRING", "AZURE_SAS_TOKEN"]:
                confidence = 0.95
            elif pii_type == "AZURE_CLIENT_SECRET":
                confidence = 0.75  # Less specific pattern
            elif pii_type in ["AWS_SECRET_KEY", "JWT_TOKEN"]:
                confidence = 0.85
            elif pii_type in ["PASSWORD_IN_URL", "GENERIC_API_KEY"]:
                confidence = 0.80
            elif pii_type == "PRIVATE_KEY":
                confidence = 0.99
            # Medical IDs
            elif pii_type in ["MEDICAL_RECORD_NUMBER", "NPI_NUMBER"]:
                confidence = 0.90
            elif pii_type == "DEA_NUMBER":
                confidence = 0.85
            # Evasion-resistant patterns
            elif pii_type == "EMAIL_OBFUSCATED":
                confidence = 0.90  # High - intentional obfuscation is suspicious
            elif pii_type == "IP_DEFANGED":
                confidence = 0.95  # Very high - defanging is deliberate
            elif pii_type == "API_KEY_IN_URL":
                confidence = 0.85
            elif pii_type == "AUTH_CODE":
                confidence = 0.80
            elif pii_type == "UUID":
                confidence = 0.70  # Could be any GUID
            elif pii_type == "GEO_COORDINATES":
                confidence = 0.85

            # Create masked value
            masked_value = mask_value(value, pii_type)

            entities.append(PIIEntity(
                type=pii_type,
                value=value,
                start=match.start(),
                end=match.end(),
                confidence=confidence,
                masked_value=masked_value
            ))

    # Sort by position
    entities.sort(key=lambda x: x.start)

    return entities


def validate_credit_card(number: str) -> bool:
    """Validate credit card using Luhn algorithm"""
    digits = [int(d) for d in re.sub(r'\D', '', number)]
    if len(digits) < 13 or len(digits) > 19:
        return False

    checksum = 0
    for i, d in enumerate(reversed(digits)):
        if i % 2 == 1:
            d *= 2
            if d > 9:
                d -= 9
        checksum += d
    return checksum % 10 == 0


def mask_value(value: str, pii_type: str) -> str:
    """Mask a PII value based on its type"""
    # Handle various suffixes by using base type for masking
    base_type = pii_type
    for suffix in ["_NORMALIZED", "_IN_JSON", "_BASE64", "_TRANSLITERATED", "_SPELLED_OUT"]:
        base_type = base_type.replace(suffix, "")

    if base_type == "EMAIL":
        parts = value.split('@')
        if len(parts) == 2:
            return parts[0][:2] + '***@' + parts[1]
        return '***@***'
    elif base_type in ["PHONE_US", "PHONE_INTL"]:
        clean = re.sub(r'\D', '', value)
        if len(clean) >= 4:
            return '***-***-' + clean[-4:]
        return '***-***-****'
    elif base_type == "MEDICARE_ID":
        return '[MEDICARE ID REDACTED]'
    elif base_type == "GERMAN_ID":
        return '[GERMAN ID REDACTED]'
    elif base_type == "SSN":
        return '***-**-' + value[-4:] if len(value) >= 4 else '***-**-****'
    elif base_type == "CREDIT_CARD":
        clean = re.sub(r'\D', '', value)
        return '****-****-****-' + clean[-4:] if len(clean) >= 4 else '****-****-****-****'
    elif base_type == "IP_ADDRESS":
        parts = value.split('.')
        if len(parts) == 4:
            return f'{parts[0]}.***.***.*'
        return '***.***.***.***'
    elif base_type == "DATE_OF_BIRTH":
        return '**/**/' + value[-4:] if len(value) >= 4 else '**/**/****'
    elif base_type == "IBAN":
        # Show country code and last 4 chars
        if len(value) >= 6:
            return value[:2] + '**' + '*' * (len(value) - 6) + value[-4:]
        return '*' * len(value)
    elif base_type in ["US_ADDRESS", "UK_ADDRESS", "EU_ADDRESS", "INTERNATIONAL_ADDRESS"]:
        # Mask the street number and name
        return '[ADDRESS REDACTED]'
    elif base_type == "UK_POSTCODE":
        return value[:2] + '** ***' if len(value) >= 2 else '[POSTCODE REDACTED]'
    elif base_type in ["BANK_ACCOUNT", "ROUTING_NUMBER"]:
        # Show last 4 digits
        clean = re.sub(r'\D', '', value)
        return '******' + clean[-4:] if len(clean) >= 4 else '*' * len(value)
    elif base_type == "SWIFT_BIC":
        return value[:4] + '****' if len(value) >= 4 else '*' * len(value)
    elif base_type in ["AWS_ACCESS_KEY", "AWS_SECRET_KEY"]:
        return value[:4] + '*' * (len(value) - 8) + value[-4:] if len(value) > 8 else '*' * len(value)
    elif base_type in ["GITHUB_TOKEN", "GITHUB_TOKEN_CLASSIC"]:
        return value[:4] + '*' * (len(value) - 8) + value[-4:] if len(value) > 8 else '*' * len(value)
    elif base_type in ["STRIPE_KEY", "GOOGLE_API_KEY"]:
        return value[:7] + '*' * (len(value) - 11) + value[-4:] if len(value) > 11 else '*' * len(value)
    elif base_type == "PASSWORD_IN_URL":
        return '[PASSWORD REDACTED]'
    elif base_type == "PRIVATE_KEY":
        return '[PRIVATE KEY REDACTED]'
    elif base_type == "JWT_TOKEN":
        return value[:10] + '...[JWT REDACTED]...' + value[-10:] if len(value) > 20 else '[JWT REDACTED]'
    elif base_type in ["SLACK_TOKEN", "GENERIC_API_KEY"]:
        if len(value) > 8:
            return value[:4] + '*' * (len(value) - 8) + value[-4:]
        return '*' * len(value)
    # GCP/Azure cloud credentials
    elif base_type == "GCP_SERVICE_ACCOUNT":
        parts = value.split('@')
        if len(parts) == 2:
            return parts[0][:3] + '***@' + parts[1]
        return '[GCP SERVICE ACCOUNT REDACTED]'
    elif base_type == "GCP_PRIVATE_KEY_ID":
        return '[GCP PRIVATE KEY ID REDACTED]'
    elif base_type in ["AZURE_CLIENT_SECRET", "AZURE_CONNECTION_STRING", "AZURE_SAS_TOKEN"]:
        return '[AZURE CREDENTIAL REDACTED]'
    # Medical IDs
    elif base_type in ["MEDICAL_RECORD_NUMBER", "NPI_NUMBER", "DEA_NUMBER"]:
        return '[MEDICAL ID REDACTED]'
    # Evasion-resistant patterns
    elif base_type == "EMAIL_OBFUSCATED":
        return '[OBFUSCATED EMAIL REDACTED]'
    elif base_type == "IP_DEFANGED":
        return '[DEFANGED IP REDACTED]'
    elif base_type == "API_KEY_IN_URL":
        return '[API KEY IN URL REDACTED]'
    elif base_type == "AUTH_CODE":
        return '[AUTH CODE REDACTED]'
    elif base_type == "UUID":
        return value[:8] + '-****-****-****-' + value[-12:] if len(value) >= 36 else '[UUID REDACTED]'
    elif base_type == "GEO_COORDINATES":
        return '[COORDINATES REDACTED]'
    # New advanced detection types
    elif base_type == "EU_VAT_NUMBER":
        return value[:2] + '***' + value[-4:] if len(value) > 6 else '[VAT REDACTED]'
    elif base_type == "SSN_SPELLED_OUT":
        return '***-**-' + value[-4:] if len(value) >= 4 else '***-**-****'
    elif base_type == "BASE64_ENCODED_SECRET":
        return '[BASE64 SECRET REDACTED]'
    else:
        # Generic masking - show first and last char
        if len(value) > 2:
            return value[0] + '*' * (len(value) - 2) + value[-1]
        return '*' * len(value)


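# Spot-checks for mask_value above (illustrative, not executed here):
#
#     >>> mask_value("jane.doe@acme.io", "EMAIL")
#     'ja***@acme.io'
#     >>> mask_value("4111-1111-1111-1111", "CREDIT_CARD")
#     '****-****-****-1111'
#     >>> mask_value("555-123-4567", "PHONE_US_NORMALIZED")   # suffixes are stripped first
#     '***-***-4567'
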
def calculate_risk_level(entities: list[PIIEntity]) -> tuple[str, int]:
    """Calculate risk level based on PII found"""
    if not entities:
        return "LOW", 0

    # Weight by sensitivity
    sensitivity_weights = {
        # High sensitivity - identity theft risk
        "SSN": 100,
        "CREDIT_CARD": 95,
        "DRIVERS_LICENSE": 85,
        "MEDICARE_ID": 90,  # Health insurance ID
        "GERMAN_ID": 90,  # National ID
        # Financial
        "IBAN": 85,
        "BANK_ACCOUNT": 80,
        "ROUTING_NUMBER": 75,
        "SWIFT_BIC": 60,
        # Secrets - security breach risk (AWS)
        "AWS_ACCESS_KEY": 100,
        "AWS_SECRET_KEY": 100,
        "PRIVATE_KEY": 100,
        # Secrets - GitHub
        "GITHUB_TOKEN": 95,
        "GITHUB_TOKEN_CLASSIC": 95,
        # Secrets - GCP/Azure
        "GCP_SERVICE_ACCOUNT": 95,
        "GCP_PRIVATE_KEY_ID": 100,
        "AZURE_CLIENT_SECRET": 95,
        "AZURE_CONNECTION_STRING": 100,
        "AZURE_SAS_TOKEN": 90,
        # Secrets - Other
        "STRIPE_KEY": 95,
        "SLACK_TOKEN": 90,
        "GOOGLE_API_KEY": 85,
        "JWT_TOKEN": 85,
        "PASSWORD_IN_URL": 90,
        "GENERIC_API_KEY": 80,
        # Medical IDs - HIPAA compliance
        "MEDICAL_RECORD_NUMBER": 90,
        "NPI_NUMBER": 85,
        "DEA_NUMBER": 80,
        # Phone numbers
        "PHONE_US": 35,
        "PHONE_INTL": 35,
        # Medium sensitivity - Addresses
        "DATE_OF_BIRTH": 50,
        "US_ADDRESS": 55,
        "UK_ADDRESS": 55,
        "EU_ADDRESS": 55,
        "INTERNATIONAL_ADDRESS": 50,
        "UK_POSTCODE": 40,
        # Lower sensitivity
        "EMAIL": 40,
        "IP_ADDRESS": 30,
        "ZIP_CODE": 20,
        # Possible patterns (failed validation but still flagged)
        "POSSIBLE_CARD_PATTERN": 25,  # Low - failed Luhn, likely not a real card
        # Advanced detection types
        "EU_VAT_NUMBER": 70,
        "SSN_SPELLED_OUT": 95,  # High - intentional evasion
        "BASE64_ENCODED_SECRET": 90,  # High - deliberately hidden
        # JSON embedded types inherit from base type + 5
        # Base64 decoded types inherit from base type + 5
        # Transliterated types inherit from base type
    }

    total_score = 0
    for entity in entities:
        # Handle various suffixes by looking up base type
        base_type = entity.type
        for suffix in ["_NORMALIZED", "_IN_JSON", "_BASE64", "_TRANSLITERATED", "_SPELLED_OUT"]:
            base_type = base_type.replace(suffix, "")

        # Get weight - add bonus for encoded/hidden PII (evasion attempts)
        weight = sensitivity_weights.get(base_type, sensitivity_weights.get(entity.type, 25))

        # Bonus for evasion techniques (intentionally hidden PII is more suspicious)
        if "_IN_JSON" in entity.type or "_BASE64" in entity.type:
            weight = min(100, weight + 10)  # Cap at 100
        total_score += weight * entity.confidence

    # Normalize score (0-100)
    risk_score = min(100, int(total_score / max(1, len(entities)) + len(entities) * 5))

    if risk_score >= 70:
        return "CRITICAL", risk_score
    elif risk_score >= 50:
        return "HIGH", risk_score
    elif risk_score >= 30:
        return "MEDIUM", risk_score
    else:
        return "LOW", risk_score


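# Worked example for calculate_risk_level above: a single SSN at confidence 0.95
# scores min(100, int(100 * 0.95 / 1 + 1 * 5)) = 100 -> "CRITICAL", while a single
# EMAIL at 0.95 scores int(40 * 0.95 + 5) = 43 -> "MEDIUM".
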
def redact_text(text: str, entities: list[PIIEntity], mode: str = "mask") -> str:
    """Redact PII from text"""
    if not entities:
        return text

    # Sort by position in reverse so replacements don't shift the indices of earlier entities
    sorted_entities = sorted(entities, key=lambda x: x.start, reverse=True)

    result = text
    for entity in sorted_entities:
        if mode == "mask":
            replacement = entity.masked_value
        elif mode == "remove":
            replacement = "[REDACTED]"
        elif mode == "type":
            replacement = f"[{entity.type}]"
        else:
            replacement = entity.masked_value

        result = result[:entity.start] + replacement + result[entity.end:]

    return result


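# Illustrative redaction (PIIEntity fields as used throughout this module; offsets
# match the sample string below, not executed here):
#
#     >>> ents = [PIIEntity(type="EMAIL", value="jane.doe@acme.io", start=11, end=27,
#     ...                   confidence=0.95, masked_value="ja***@acme.io")]
#     >>> redact_text("Contact me jane.doe@acme.io today", ents, mode="type")
#     'Contact me [EMAIL] today'
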
# Alias for backwards compatibility
detect_pii = detect_pii_multilayer


@router.post("/scan-text")
|
||
async def scan_text(
|
||
text: str = Form(...),
|
||
detect_emails: bool = Form(True),
|
||
detect_phones: bool = Form(True),
|
||
detect_ssn: bool = Form(True),
|
||
detect_credit_cards: bool = Form(True),
|
||
detect_ip_addresses: bool = Form(True),
|
||
detect_dates: bool = Form(True),
|
||
detect_addresses: bool = Form(True),
|
||
detect_iban: bool = Form(True),
|
||
detect_secrets: bool = Form(True),
|
||
coordinates_only: bool = Form(False)
|
||
):
|
||
"""
|
||
Scan text for PII and secrets using multi-layer detection.
|
||
|
||
Security Options:
|
||
- coordinates_only: If True, returns only PII positions (start, end, type, confidence)
|
||
without the actual values. The frontend can then perform client-side masking
|
||
using these coordinates, ensuring the backend never "sees" the raw PII.
|
||
This is useful for ultra-sensitive data where even the backend shouldn't
|
||
have access to actual PII values.
|
||
"""
|
||
# Normalize CRLF to LF when returning coordinates for client-side redaction
|
||
# Browser FormData converts LF->CRLF per RFC 7578, but frontend uses LF text
|
||
if coordinates_only:
|
||
text = text.replace("\r\n", "\n")
|
||
|
||
config = ScanConfig(
|
||
detect_emails=detect_emails,
|
||
detect_phones=detect_phones,
|
||
detect_ssn=detect_ssn,
|
||
detect_credit_cards=detect_credit_cards,
|
||
detect_ip_addresses=detect_ip_addresses,
|
||
detect_dates=detect_dates,
|
||
detect_addresses=detect_addresses,
|
||
detect_iban=detect_iban,
|
||
detect_secrets=detect_secrets
|
||
)
|
||
|
||
entities = detect_pii_multilayer(text, config, coordinates_only=coordinates_only)
|
||
|
||
# Group by type
|
||
entities_by_type = {}
|
||
for entity in entities:
|
||
if entity.type not in entities_by_type:
|
||
entities_by_type[entity.type] = 0
|
||
entities_by_type[entity.type] += 1
|
||
|
||
risk_level, risk_score = calculate_risk_level(entities)
|
||
|
||
# Coordinates-only mode: Strip actual PII values for client-side redaction
|
||
if coordinates_only:
|
||
# Return only positions and types - no actual PII values
|
||
# Frontend will use these coordinates to mask text client-side
|
||
entities_coords = [
|
||
{
|
||
"type": e.type,
|
||
"start": e.start,
|
||
"end": e.end,
|
||
"confidence": e.confidence,
|
||
"length": e.end - e.start # So frontend knows how many chars to mask
|
||
}
|
||
for e in entities
|
||
]
|
||
return {
|
||
"total_entities": len(entities),
|
||
"entities_by_type": entities_by_type,
|
||
"entities": entities_coords, # Coordinates only - no values!
|
||
"risk_level": risk_level,
|
||
"risk_score": risk_score,
|
||
"redacted_preview": None, # Frontend handles redaction
|
||
"coordinates_only": True
|
||
}
|
||
|
||
# Standard mode: Return full entity details including values
|
||
redacted_preview = redact_text(text, entities, "mask")
|
||
|
||
return {
|
||
"total_entities": len(entities),
|
||
"entities_by_type": entities_by_type,
|
||
"entities": [e.model_dump() for e in entities],
|
||
"risk_level": risk_level,
|
||
"risk_score": risk_score,
|
||
"redacted_preview": redacted_preview,
|
||
"coordinates_only": False
|
||
}
|
||
|
||
|
||
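# Example request against the /scan-text endpoint above (illustrative; host, port
# and any router prefix depend on how the application mounts this router):
#
#   curl -X POST http://localhost:8000/scan-text \
#        -F "text=Contact jane.doe@acme.io or 555-123-4567" \
#        -F "coordinates_only=true"
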
@router.post("/scan-file")
|
||
async def scan_file(file: UploadFile = File(...)):
|
||
"""Scan a file for PII (CSV, TXT, JSON)"""
|
||
try:
|
||
content = await file.read()
|
||
filename = file.filename.lower()
|
||
|
||
if filename.endswith('.csv'):
|
||
# Use duckdb to read CSV and extract all text
|
||
with tempfile.NamedTemporaryFile(delete=False, suffix='.csv') as tmp:
|
||
tmp.write(content)
|
||
tmp_path = tmp.name
|
||
try:
|
||
con = duckdb.connect(':memory:')
|
||
result = con.execute(f"SELECT * FROM read_csv_auto('{tmp_path}')").fetchall()
|
||
columns = con.execute(f"DESCRIBE SELECT * FROM read_csv_auto('{tmp_path}')").fetchall()
|
||
all_text = ' '.join(str(cell) for row in result for cell in row if cell is not None)
|
||
con.close()
|
||
finally:
|
||
os.unlink(tmp_path)
|
||
elif filename.endswith('.json'):
|
||
# Use duckdb to read JSON
|
||
with tempfile.NamedTemporaryFile(delete=False, suffix='.json') as tmp:
|
||
tmp.write(content)
|
||
tmp_path = tmp.name
|
||
try:
|
||
con = duckdb.connect(':memory:')
|
||
result = con.execute(f"SELECT * FROM read_json_auto('{tmp_path}')").fetchall()
|
||
all_text = ' '.join(str(cell) for row in result for cell in row if cell is not None)
|
||
con.close()
|
||
finally:
|
||
os.unlink(tmp_path)
|
||
else:
|
||
# Treat as text file
|
||
all_text = content.decode('utf-8', errors='ignore')
|
||
|
||
entities = detect_pii(all_text)
|
||
|
||
# Group by type
|
||
entities_by_type = {}
|
||
for entity in entities:
|
||
if entity.type not in entities_by_type:
|
||
entities_by_type[entity.type] = 0
|
||
entities_by_type[entity.type] += 1
|
||
|
||
risk_level, risk_score = calculate_risk_level(entities)
|
||
|
||
# Limit preview length
|
||
preview_text = all_text[:2000] if len(all_text) > 2000 else all_text
|
||
redacted_preview = redact_text(preview_text, [e for e in entities if e.end <= 2000], "mask")
|
||
|
||
return {
|
||
"filename": file.filename,
|
||
"total_entities": len(entities),
|
||
"entities_by_type": entities_by_type,
|
||
"entities": [e.model_dump() for e in entities[:100]], # Limit to first 100
|
||
"risk_level": risk_level,
|
||
"risk_score": risk_score,
|
||
"redacted_preview": redacted_preview
|
||
}
|
||
except Exception as e:
|
||
raise HTTPException(status_code=400, detail=f"Could not process file: {str(e)}")
|
||
|
||
|
||
@router.post("/scan-dataframe")
|
||
async def scan_dataframe(file: UploadFile = File(...)):
|
||
"""Scan a CSV/Excel file and analyze each column for PII"""
|
||
try:
|
||
content = await file.read()
|
||
filename = file.filename.lower()
|
||
|
||
# Determine file extension and create temp file
|
||
if filename.endswith('.csv'):
|
||
suffix = '.csv'
|
||
elif filename.endswith(('.xls', '.xlsx')):
|
||
suffix = '.xlsx' if filename.endswith('.xlsx') else '.xls'
|
||
else:
|
||
raise HTTPException(status_code=400, detail="Unsupported file format. Use CSV or Excel.")
|
||
|
||
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
|
||
tmp.write(content)
|
||
tmp_path = tmp.name
|
||
|
||
try:
|
||
con = duckdb.connect(':memory:')
|
||
|
||
# Read file based on type
|
||
if suffix == '.csv':
|
||
# Get column names and data
|
||
schema = con.execute(f"DESCRIBE SELECT * FROM read_csv_auto('{tmp_path}')").fetchall()
|
||
columns = [col[0] for col in schema]
|
||
data = con.execute(f"SELECT * FROM read_csv_auto('{tmp_path}')").fetchall()
|
||
else:
|
||
# For Excel files, duckdb needs the spatial extension or we use a workaround
|
||
# DuckDB can read xlsx via read_xlsx (requires extension) - fall back to CSV-like approach
|
||
try:
|
||
schema = con.execute(f"DESCRIBE SELECT * FROM st_read('{tmp_path}')").fetchall()
|
||
columns = [col[0] for col in schema]
|
||
data = con.execute(f"SELECT * FROM st_read('{tmp_path}')").fetchall()
|
||
except:
|
||
# If spatial extension not available, try xlsx reader
|
||
schema = con.execute(f"DESCRIBE SELECT * FROM read_xlsx('{tmp_path}')").fetchall()
|
||
columns = [col[0] for col in schema]
|
||
data = con.execute(f"SELECT * FROM read_xlsx('{tmp_path}')").fetchall()
|
||
|
||
total_rows = len(data)
|
||
total_columns = len(columns)
|
||
|
||
column_results = []
|
||
total_pii = 0
|
||
|
||
for col_idx, col in enumerate(columns):
|
||
# Extract all values for this column
|
||
col_values = [str(row[col_idx]) for row in data if row[col_idx] is not None]
|
||
col_text = ' '.join(col_values)
|
||
entities = detect_pii(col_text)
|
||
|
||
# Group by type
|
||
entities_by_type = {}
|
||
for entity in entities:
|
||
if entity.type not in entities_by_type:
|
||
entities_by_type[entity.type] = 0
|
||
entities_by_type[entity.type] += 1
|
||
|
||
pii_count = len(entities)
|
||
total_pii += pii_count
|
||
|
||
column_results.append({
|
||
"column": col,
|
||
"pii_count": pii_count,
|
||
"pii_types": entities_by_type,
|
||
"sample_pii": [e.model_dump() for e in entities[:3]] if entities else []
|
||
})
|
||
|
||
con.close()
|
||
finally:
|
||
os.unlink(tmp_path)
|
||
|
||
# Sort by PII count descending
|
||
column_results.sort(key=lambda x: x["pii_count"], reverse=True)
|
||
|
||
risk_level = "CRITICAL" if total_pii > 50 else "HIGH" if total_pii > 20 else "MEDIUM" if total_pii > 5 else "LOW"
|
||
|
||
return {
|
||
"filename": file.filename,
|
||
"total_rows": total_rows,
|
||
"total_columns": total_columns,
|
||
"total_pii_found": total_pii,
|
||
"risk_level": risk_level,
|
||
"columns_with_pii": len([c for c in column_results if c["pii_count"] > 0]),
|
||
"column_analysis": column_results
|
||
}
|
||
except Exception as e:
|
||
raise HTTPException(status_code=400, detail=f"Could not process file: {str(e)}")
|
||
|
||
|
||
@router.post("/redact")
|
||
async def redact_text_endpoint(
|
||
text: str = Form(...),
|
||
mode: str = Form("mask")
|
||
):
|
||
"""Redact PII from text"""
|
||
entities = detect_pii(text)
|
||
redacted = redact_text(text, entities, mode)
|
||
|
||
return {
|
||
"original_length": len(text),
|
||
"redacted_length": len(redacted),
|
||
"entities_redacted": len(entities),
|
||
"redacted_text": redacted
|
||
}
|
||
|
||
|
||
@router.get("/entity-types")
|
||
async def list_entity_types():
|
||
"""List supported PII entity types"""
|
||
return {
|
||
"entity_types": [
|
||
{"type": key, "description": value["description"]}
|
||
for key, value in PII_PATTERNS.items()
|
||
]
|
||
}
|
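

# Minimal local smoke test (illustrative sketch: assumes FastAPI's TestClient and
# httpx are installed, and that this router is mounted without a prefix).
if __name__ == "__main__":
    from fastapi import FastAPI
    from fastapi.testclient import TestClient

    _app = FastAPI()
    _app.include_router(router)
    _client = TestClient(_app)

    # Redact a fabricated phone number, replacing it with its entity type
    print(_client.post("/redact", data={"text": "Call 555-123-4567", "mode": "type"}).json())
    # List the first few supported entity types
    print(_client.get("/entity-types").json()["entity_types"][:3])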