ai-tools-suite/docs/building-privacy-scanner.html

975 lines
65 KiB
HTML
Raw Normal View History

2025-12-27 15:33:06 +00:00
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en"><head>
<meta charset="utf-8">
<meta name="generator" content="quarto-1.6.33">
<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">
<meta name="author" content="AI Tools Suite">
<meta name="dcterms.date" content="2024-12-23">
<title>Building a Privacy Scanner: A Step-by-Step Implementation Guide</title>
<style>
code{white-space: pre-wrap;}
span.smallcaps{font-variant: small-caps;}
div.columns{display: flex; gap: min(4vw, 1.5em);}
div.column{flex: auto; overflow-x: auto;}
div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
ul.task-list{list-style: none;}
ul.task-list li input[type="checkbox"] {
width: 0.8em;
margin: 0 0.8em 0.2em -1em; /* quarto-specific, see https://github.com/quarto-dev/quarto-cli/issues/4556 */
vertical-align: middle;
}
/* CSS for syntax highlighting */
pre > code.sourceCode { white-space: pre; position: relative; }
pre > code.sourceCode > span { line-height: 1.25; }
pre > code.sourceCode > span:empty { height: 1.2em; }
.sourceCode { overflow: visible; }
code.sourceCode > span { color: inherit; text-decoration: inherit; }
div.sourceCode { margin: 1em 0; }
pre.sourceCode { margin: 0; }
@media screen {
div.sourceCode { overflow: auto; }
}
@media print {
pre > code.sourceCode { white-space: pre-wrap; }
pre > code.sourceCode > span { display: inline-block; text-indent: -5em; padding-left: 5em; }
}
pre.numberSource code
{ counter-reset: source-line 0; }
pre.numberSource code > span
{ position: relative; left: -4em; counter-increment: source-line; }
pre.numberSource code > span > a:first-child::before
{ content: counter(source-line);
position: relative; left: -1em; text-align: right; vertical-align: baseline;
border: none; display: inline-block;
-webkit-touch-callout: none; -webkit-user-select: none;
-khtml-user-select: none; -moz-user-select: none;
-ms-user-select: none; user-select: none;
padding: 0 4px; width: 4em;
}
pre.numberSource { margin-left: 3em; padding-left: 4px; }
div.sourceCode
{ }
@media screen {
pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
}
</style>
<script src="building-privacy-scanner_files/libs/clipboard/clipboard.min.js"></script>
<script src="building-privacy-scanner_files/libs/quarto-html/quarto.js"></script>
<script src="building-privacy-scanner_files/libs/quarto-html/popper.min.js"></script>
<script src="building-privacy-scanner_files/libs/quarto-html/tippy.umd.min.js"></script>
<script src="building-privacy-scanner_files/libs/quarto-html/anchor.min.js"></script>
<link href="building-privacy-scanner_files/libs/quarto-html/tippy.css" rel="stylesheet">
<link href="building-privacy-scanner_files/libs/quarto-html/quarto-syntax-highlighting-07ba0ad10f5680c660e360ac31d2f3b6.css" rel="stylesheet" id="quarto-text-highlighting-styles">
<script src="building-privacy-scanner_files/libs/bootstrap/bootstrap.min.js"></script>
<link href="building-privacy-scanner_files/libs/bootstrap/bootstrap-icons.css" rel="stylesheet">
<link href="building-privacy-scanner_files/libs/bootstrap/bootstrap-fe6593aca1dacbc749dc3d2ba78c8639.min.css" rel="stylesheet" append-hash="true" id="quarto-bootstrap" data-mode="light">
</head>
<body>
<div id="quarto-content" class="page-columns page-rows-contents page-layout-article">
<div id="quarto-margin-sidebar" class="sidebar margin-sidebar">
<nav id="TOC" role="doc-toc" class="toc-active">
<h2 id="toc-title">Table of contents</h2>
<ul>
<li><a href="#introduction" id="toc-introduction" class="nav-link active" data-scroll-target="#introduction">Introduction</a></li>
<li><a href="#step-1-project-structure" id="toc-step-1-project-structure" class="nav-link" data-scroll-target="#step-1-project-structure">Step 1: Project Structure</a></li>
<li><a href="#step-2-define-pii-patterns" id="toc-step-2-define-pii-patterns" class="nav-link" data-scroll-target="#step-2-define-pii-patterns">Step 2: Define PII Patterns</a></li>
<li><a href="#step-3-build-the-basic-detection-engine" id="toc-step-3-build-the-basic-detection-engine" class="nav-link" data-scroll-target="#step-3-build-the-basic-detection-engine">Step 3: Build the Basic Detection Engine</a></li>
<li><a href="#step-4-add-text-normalization-layer-2" id="toc-step-4-add-text-normalization-layer-2" class="nav-link" data-scroll-target="#step-4-add-text-normalization-layer-2">Step 4: Add Text Normalization (Layer 2)</a></li>
<li><a href="#step-5-implement-checksum-validation-layer-4" id="toc-step-5-implement-checksum-validation-layer-4" class="nav-link" data-scroll-target="#step-5-implement-checksum-validation-layer-4">Step 5: Implement Checksum Validation (Layer 4)</a></li>
<li><a href="#step-6-json-blob-extraction-layer-2.5" id="toc-step-6-json-blob-extraction-layer-2.5" class="nav-link" data-scroll-target="#step-6-json-blob-extraction-layer-2.5">Step 6: JSON Blob Extraction (Layer 2.5)</a></li>
<li><a href="#step-7-base64-auto-decoding-layer-2.6" id="toc-step-7-base64-auto-decoding-layer-2.6" class="nav-link" data-scroll-target="#step-7-base64-auto-decoding-layer-2.6">Step 7: Base64 Auto-Decoding (Layer 2.6)</a></li>
<li><a href="#step-8-build-the-fastapi-endpoint" id="toc-step-8-build-the-fastapi-endpoint" class="nav-link" data-scroll-target="#step-8-build-the-fastapi-endpoint">Step 8: Build the FastAPI Endpoint</a></li>
<li><a href="#step-9-create-the-sveltekit-frontend" id="toc-step-9-create-the-sveltekit-frontend" class="nav-link" data-scroll-target="#step-9-create-the-sveltekit-frontend">Step 9: Create the SvelteKit Frontend</a></li>
<li><a href="#step-10-add-security-features" id="toc-step-10-add-security-features" class="nav-link" data-scroll-target="#step-10-add-security-features">Step 10: Add Security Features</a></li>
<li><a href="#conclusion" id="toc-conclusion" class="nav-link" data-scroll-target="#conclusion">Conclusion</a></li>
</ul>
</nav>
</div>
<main class="content" id="quarto-document-content">
<header id="title-block-header" class="quarto-title-block default">
<div class="quarto-title">
<h1 class="title">Building a Privacy Scanner: A Step-by-Step Implementation Guide</h1>
<div class="quarto-categories">
<div class="quarto-category">tutorial</div>
<div class="quarto-category">privacy</div>
<div class="quarto-category">pii-detection</div>
<div class="quarto-category">python</div>
<div class="quarto-category">svelte</div>
</div>
</div>
<div class="quarto-title-meta">
<div>
<div class="quarto-title-meta-heading">Author</div>
<div class="quarto-title-meta-contents">
<p>AI Tools Suite </p>
</div>
</div>
<div>
<div class="quarto-title-meta-heading">Published</div>
<div class="quarto-title-meta-contents">
<p class="date">December 23, 2024</p>
</div>
</div>
</div>
</header>
<section id="introduction" class="level2">
<h2 class="anchored" data-anchor-id="introduction">Introduction</h2>
<p>In this tutorial, we'll build a production-grade Privacy Scanner from scratch. By the end, you'll have a tool that detects 40+ types of Personally Identifiable Information (PII) using an eight-layer detection pipeline, complete with a modern web interface.</p>
<p>Our stack: <strong>FastAPI</strong> for the backend API, <strong>SvelteKit</strong> for the frontend, and <strong>Python regex</strong> with validation logic for detection.</p>
</section>
<section id="step-1-project-structure" class="level2">
<h2 class="anchored" data-anchor-id="step-1-project-structure">Step 1: Project Structure</h2>
<p>First, create the project scaffolding:</p>
<div class="sourceCode" id="cb1"><pre class="sourceCode numberSource bash number-lines code-with-copy"><code class="sourceCode bash"><span id="cb1-1"><a href="#cb1-1"></a><span class="fu">mkdir</span> <span class="at">-p</span> ai_tools_suite/<span class="dt">{backend/routers</span><span class="op">,</span><span class="dt">frontend/src/routes/privacy-scanner}</span></span>
<span id="cb1-2"><a href="#cb1-2"></a><span class="bu">cd</span> ai_tools_suite</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<p>Your directory structure should look like:</p>
<pre><code>ai_tools_suite/
├── backend/
│   ├── main.py
│   └── routers/
│       └── privacy.py
└── frontend/
    └── src/
        └── routes/
            └── privacy-scanner/
                └── +page.svelte</code></pre>
</section>
<section id="step-2-define-pii-patterns" class="level2">
<h2 class="anchored" data-anchor-id="step-2-define-pii-patterns">Step 2: Define PII Patterns</h2>
<p>The foundation of any PII scanner is its pattern library. Create <code>backend/routers/privacy.py</code> and start with the core patterns:</p>
<div class="sourceCode" id="cb3"><pre class="sourceCode numberSource python number-lines code-with-copy"><code class="sourceCode python"><span id="cb3-1"><a href="#cb3-1"></a><span class="im">import</span> re</span>
<span id="cb3-2"><a href="#cb3-2"></a><span class="im">from</span> typing <span class="im">import</span> List, Dict, Any</span>
<span id="cb3-3"><a href="#cb3-3"></a><span class="im">from</span> pydantic <span class="im">import</span> BaseModel</span>
<span id="cb3-4"><a href="#cb3-4"></a></span>
<span id="cb3-5"><a href="#cb3-5"></a><span class="kw">class</span> PIIEntity(BaseModel):</span>
<span id="cb3-6"><a href="#cb3-6"></a> <span class="bu">type</span>: <span class="bu">str</span></span>
<span id="cb3-7"><a href="#cb3-7"></a> value: <span class="bu">str</span></span>
<span id="cb3-8"><a href="#cb3-8"></a> start: <span class="bu">int</span></span>
<span id="cb3-9"><a href="#cb3-9"></a> end: <span class="bu">int</span></span>
<span id="cb3-10"><a href="#cb3-10"></a> confidence: <span class="bu">float</span></span>
<span id="cb3-11"><a href="#cb3-11"></a> context: <span class="bu">str</span> <span class="op">=</span> <span class="st">""</span></span>
<span id="cb3-12"><a href="#cb3-12"></a></span>
<span id="cb3-13"><a href="#cb3-13"></a>PII_PATTERNS <span class="op">=</span> {</span>
<span id="cb3-14"><a href="#cb3-14"></a> <span class="co"># Identity Documents</span></span>
<span id="cb3-15"><a href="#cb3-15"></a> <span class="st">"SSN"</span>: {</span>
<span id="cb3-16"><a href="#cb3-16"></a> <span class="st">"pattern"</span>: <span class="vs">r'\b\d</span><span class="sc">{3}</span><span class="vs">-\d</span><span class="sc">{2}</span><span class="vs">-\d</span><span class="sc">{4}</span><span class="vs">\b'</span>,</span>
<span id="cb3-17"><a href="#cb3-17"></a> <span class="st">"description"</span>: <span class="st">"US Social Security Number"</span>,</span>
<span id="cb3-18"><a href="#cb3-18"></a> <span class="st">"category"</span>: <span class="st">"identity"</span></span>
<span id="cb3-19"><a href="#cb3-19"></a> },</span>
<span id="cb3-20"><a href="#cb3-20"></a> <span class="st">"PASSPORT"</span>: {</span>
<span id="cb3-21"><a href="#cb3-21"></a> <span class="st">"pattern"</span>: <span class="vs">r'\b[A-Z]{1,2}\d{6,9}\b'</span>,</span>
<span id="cb3-22"><a href="#cb3-22"></a> <span class="st">"description"</span>: <span class="st">"Passport Number"</span>,</span>
<span id="cb3-23"><a href="#cb3-23"></a> <span class="st">"category"</span>: <span class="st">"identity"</span></span>
<span id="cb3-24"><a href="#cb3-24"></a> },</span>
<span id="cb3-25"><a href="#cb3-25"></a></span>
<span id="cb3-26"><a href="#cb3-26"></a> <span class="co"># Financial Information</span></span>
<span id="cb3-27"><a href="#cb3-27"></a> <span class="st">"CREDIT_CARD"</span>: {</span>
<span id="cb3-28"><a href="#cb3-28"></a> <span class="st">"pattern"</span>: <span class="vs">r'\b(?:4[0-9]</span><span class="sc">{12}</span><span class="vs">(?:[0-9]</span><span class="sc">{3}</span><span class="vs">)?|5[1-5][0-9]</span><span class="sc">{14}</span><span class="vs">|3[47][0-9]</span><span class="sc">{13}</span><span class="vs">)\b'</span>,</span>
<span id="cb3-29"><a href="#cb3-29"></a> <span class="st">"description"</span>: <span class="st">"Credit Card Number (Visa, MC, Amex)"</span>,</span>
<span id="cb3-30"><a href="#cb3-30"></a> <span class="st">"category"</span>: <span class="st">"financial"</span></span>
<span id="cb3-31"><a href="#cb3-31"></a> },</span>
<span id="cb3-32"><a href="#cb3-32"></a> <span class="st">"IBAN"</span>: {</span>
<span id="cb3-33"><a href="#cb3-33"></a> <span class="st">"pattern"</span>: <span class="vs">r'\b[A-Z]</span><span class="sc">{2}</span><span class="vs">\d</span><span class="sc">{2}</span><span class="vs">[A-Z0-9]{4,30}\b'</span>,</span>
<span id="cb3-34"><a href="#cb3-34"></a> <span class="st">"description"</span>: <span class="st">"International Bank Account Number"</span>,</span>
<span id="cb3-35"><a href="#cb3-35"></a> <span class="st">"category"</span>: <span class="st">"financial"</span></span>
<span id="cb3-36"><a href="#cb3-36"></a> },</span>
<span id="cb3-37"><a href="#cb3-37"></a></span>
<span id="cb3-38"><a href="#cb3-38"></a> <span class="co"># Contact Information</span></span>
<span id="cb3-39"><a href="#cb3-39"></a> <span class="st">"EMAIL"</span>: {</span>
<span id="cb3-40"><a href="#cb3-40"></a> <span class="st">"pattern"</span>: <span class="vs">r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'</span>,</span>
<span id="cb3-41"><a href="#cb3-41"></a> <span class="st">"description"</span>: <span class="st">"Email Address"</span>,</span>
<span id="cb3-42"><a href="#cb3-42"></a> <span class="st">"category"</span>: <span class="st">"contact"</span></span>
<span id="cb3-43"><a href="#cb3-43"></a> },</span>
<span id="cb3-44"><a href="#cb3-44"></a> <span class="st">"PHONE_US"</span>: {</span>
<span id="cb3-45"><a href="#cb3-45"></a> <span class="st">"pattern"</span>: <span class="vs">r'\b(?:\+1[-.\s]?)?\(?\d</span><span class="sc">{3}</span><span class="vs">\)?[-.\s]?\d</span><span class="sc">{3}</span><span class="vs">[-.\s]?\d</span><span class="sc">{4}</span><span class="vs">\b'</span>,</span>
<span id="cb3-46"><a href="#cb3-46"></a> <span class="st">"description"</span>: <span class="st">"US Phone Number"</span>,</span>
<span id="cb3-47"><a href="#cb3-47"></a> <span class="st">"category"</span>: <span class="st">"contact"</span></span>
<span id="cb3-48"><a href="#cb3-48"></a> },</span>
<span id="cb3-49"><a href="#cb3-49"></a></span>
<span id="cb3-50"><a href="#cb3-50"></a> <span class="co"># Add more patterns as needed...</span></span>
<span id="cb3-51"><a href="#cb3-51"></a>}</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<p>Each pattern includes a regex, human-readable description, and category for risk classification.</p>
</section>
<section id="step-3-build-the-basic-detection-engine" class="level2">
<h2 class="anchored" data-anchor-id="step-3-build-the-basic-detection-engine">Step 3: Build the Basic Detection Engine</h2>
<p>Add the core detection function:</p>
<div class="sourceCode" id="cb4"><pre class="sourceCode numberSource python number-lines code-with-copy"><code class="sourceCode python"><span id="cb4-1"><a href="#cb4-1"></a><span class="kw">def</span> detect_pii_basic(text: <span class="bu">str</span>) <span class="op">-&gt;</span> List[PIIEntity]:</span>
<span id="cb4-2"><a href="#cb4-2"></a> <span class="co">"""Layer 1: Standard regex pattern matching."""</span></span>
<span id="cb4-3"><a href="#cb4-3"></a> entities <span class="op">=</span> []</span>
<span id="cb4-4"><a href="#cb4-4"></a></span>
<span id="cb4-5"><a href="#cb4-5"></a> <span class="cf">for</span> pii_type, config <span class="kw">in</span> PII_PATTERNS.items():</span>
<span id="cb4-6"><a href="#cb4-6"></a> pattern <span class="op">=</span> re.<span class="bu">compile</span>(config[<span class="st">"pattern"</span>], re.IGNORECASE)</span>
<span id="cb4-7"><a href="#cb4-7"></a></span>
<span id="cb4-8"><a href="#cb4-8"></a> <span class="cf">for</span> match <span class="kw">in</span> pattern.finditer(text):</span>
<span id="cb4-9"><a href="#cb4-9"></a> entity <span class="op">=</span> PIIEntity(</span>
<span id="cb4-10"><a href="#cb4-10"></a> <span class="bu">type</span><span class="op">=</span>pii_type,</span>
<span id="cb4-11"><a href="#cb4-11"></a> value<span class="op">=</span>match.group(),</span>
<span id="cb4-12"><a href="#cb4-12"></a> start<span class="op">=</span>match.start(),</span>
<span id="cb4-13"><a href="#cb4-13"></a> end<span class="op">=</span>match.end(),</span>
<span id="cb4-14"><a href="#cb4-14"></a> confidence<span class="op">=</span><span class="fl">0.8</span>, <span class="co"># Base confidence</span></span>
<span id="cb4-15"><a href="#cb4-15"></a> context<span class="op">=</span>text[<span class="bu">max</span>(<span class="dv">0</span>, match.start()<span class="op">-</span><span class="dv">20</span>):match.end()<span class="op">+</span><span class="dv">20</span>]</span>
<span id="cb4-16"><a href="#cb4-16"></a> )</span>
<span id="cb4-17"><a href="#cb4-17"></a> entities.append(entity)</span>
<span id="cb4-18"><a href="#cb4-18"></a></span>
<span id="cb4-19"><a href="#cb4-19"></a> <span class="cf">return</span> entities</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<p>This gives us working PII detection, but it's easily fooled by obfuscation.</p>
</section>
<section id="step-4-add-text-normalization-layer-2" class="level2">
<h2 class="anchored" data-anchor-id="step-4-add-text-normalization-layer-2">Step 4: Add Text Normalization (Layer 2)</h2>
<p>Attackers often hide PII using separators, leetspeak, or unicode tricks. Add normalization:</p>
<div class="sourceCode" id="cb5"><pre class="sourceCode numberSource python number-lines code-with-copy"><code class="sourceCode python"><span id="cb5-1"><a href="#cb5-1"></a><span class="kw">def</span> normalize_text(text: <span class="bu">str</span>) <span class="op">-&gt;</span> <span class="bu">tuple</span>[<span class="bu">str</span>, <span class="bu">dict</span>]:</span>
<span id="cb5-2"><a href="#cb5-2"></a> <span class="co">"""Layer 2: Remove obfuscation techniques."""</span></span>
<span id="cb5-3"><a href="#cb5-3"></a> original <span class="op">=</span> text</span>
<span id="cb5-4"><a href="#cb5-4"></a> mappings <span class="op">=</span> {}</span>
<span id="cb5-5"><a href="#cb5-5"></a></span>
<span id="cb5-6"><a href="#cb5-6"></a> <span class="co"># Remove common separators</span></span>
<span id="cb5-7"><a href="#cb5-7"></a> normalized <span class="op">=</span> re.sub(<span class="vs">r'[\s\-\.\(\)]+'</span>, <span class="st">''</span>, text)</span>
<span id="cb5-8"><a href="#cb5-8"></a></span>
<span id="cb5-9"><a href="#cb5-9"></a> <span class="co"># Leetspeak conversion</span></span>
<span id="cb5-10"><a href="#cb5-10"></a> leet_map <span class="op">=</span> {<span class="st">'0'</span>: <span class="st">'o'</span>, <span class="st">'1'</span>: <span class="st">'i'</span>, <span class="st">'3'</span>: <span class="st">'e'</span>, <span class="st">'4'</span>: <span class="st">'a'</span>, <span class="st">'5'</span>: <span class="st">'s'</span>, <span class="st">'7'</span>: <span class="st">'t'</span>}</span>
<span id="cb5-11"><a href="#cb5-11"></a> <span class="cf">for</span> leet, char <span class="kw">in</span> leet_map.items():</span>
<span id="cb5-12"><a href="#cb5-12"></a> normalized <span class="op">=</span> normalized.replace(leet, char)</span>
<span id="cb5-13"><a href="#cb5-13"></a></span>
<span id="cb5-14"><a href="#cb5-14"></a> <span class="co"># Track position mappings for accurate reporting</span></span>
<span id="cb5-15"><a href="#cb5-15"></a> <span class="co"># (simplified - production code needs full position tracking)</span></span>
<span id="cb5-16"><a href="#cb5-16"></a></span>
<span id="cb5-17"><a href="#cb5-17"></a> <span class="cf">return</span> normalized, mappings</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
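<p>Now a separator-obfuscated value such as <code>1-2-3-4-5-6-7-8-9</code> collapses to nine contiguous digits once separators are stripped, making it a candidate SSN. Two caveats apply to the simplified code above: the SSN pattern from Step 2 requires hyphens, so normalized text must be matched with a separator-free variant of that pattern; and the leetspeak map rewrites digits as letters, so it should be applied to a separate copy used only for text-based PII, not to the copy scanned for numeric patterns.</p>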
</section>
<section id="step-5-implement-checksum-validation-layer-4" class="level2">
<h2 class="anchored" data-anchor-id="step-5-implement-checksum-validation-layer-4">Step 5: Implement Checksum Validation (Layer 4)</h2>
<p>Not every number sequence is valid PII. Add validation logic:</p>
<div class="sourceCode" id="cb6"><pre class="sourceCode numberSource python number-lines code-with-copy"><code class="sourceCode python"><span id="cb6-1"><a href="#cb6-1"></a><span class="kw">def</span> luhn_checksum(card_number: <span class="bu">str</span>) <span class="op">-&gt;</span> <span class="bu">bool</span>:</span>
<span id="cb6-2"><a href="#cb6-2"></a> <span class="co">"""Validate credit card using Luhn algorithm."""</span></span>
<span id="cb6-3"><a href="#cb6-3"></a> digits <span class="op">=</span> [<span class="bu">int</span>(d) <span class="cf">for</span> d <span class="kw">in</span> card_number <span class="cf">if</span> d.isdigit()]</span>
<span id="cb6-4"><a href="#cb6-4"></a> odd_digits <span class="op">=</span> digits[<span class="op">-</span><span class="dv">1</span>::<span class="op">-</span><span class="dv">2</span>]</span>
<span id="cb6-5"><a href="#cb6-5"></a> even_digits <span class="op">=</span> digits[<span class="op">-</span><span class="dv">2</span>::<span class="op">-</span><span class="dv">2</span>]</span>
<span id="cb6-6"><a href="#cb6-6"></a></span>
<span id="cb6-7"><a href="#cb6-7"></a> total <span class="op">=</span> <span class="bu">sum</span>(odd_digits)</span>
<span id="cb6-8"><a href="#cb6-8"></a> <span class="cf">for</span> d <span class="kw">in</span> even_digits:</span>
<span id="cb6-9"><a href="#cb6-9"></a> total <span class="op">+=</span> <span class="bu">sum</span>(<span class="bu">divmod</span>(d <span class="op">*</span> <span class="dv">2</span>, <span class="dv">10</span>))</span>
<span id="cb6-10"><a href="#cb6-10"></a></span>
<span id="cb6-11"><a href="#cb6-11"></a> <span class="cf">return</span> total <span class="op">%</span> <span class="dv">10</span> <span class="op">==</span> <span class="dv">0</span></span>
<span id="cb6-12"><a href="#cb6-12"></a></span>
<span id="cb6-13"><a href="#cb6-13"></a><span class="kw">def</span> validate_iban(iban: <span class="bu">str</span>) <span class="op">-&gt;</span> <span class="bu">bool</span>:</span>
<span id="cb6-14"><a href="#cb6-14"></a> <span class="co">"""Validate IBAN using MOD-97 algorithm."""</span></span>
<span id="cb6-15"><a href="#cb6-15"></a> iban <span class="op">=</span> iban.replace(<span class="st">' '</span>, <span class="st">''</span>).upper()</span>
<span id="cb6-16"><a href="#cb6-16"></a></span>
<span id="cb6-17"><a href="#cb6-17"></a> <span class="co"># Move first 4 chars to end</span></span>
<span id="cb6-18"><a href="#cb6-18"></a> rearranged <span class="op">=</span> iban[<span class="dv">4</span>:] <span class="op">+</span> iban[:<span class="dv">4</span>]</span>
<span id="cb6-19"><a href="#cb6-19"></a></span>
<span id="cb6-20"><a href="#cb6-20"></a> <span class="co"># Convert letters to numbers (A=10, B=11, etc.)</span></span>
<span id="cb6-21"><a href="#cb6-21"></a> numeric <span class="op">=</span> <span class="st">''</span></span>
<span id="cb6-22"><a href="#cb6-22"></a> <span class="cf">for</span> char <span class="kw">in</span> rearranged:</span>
<span id="cb6-23"><a href="#cb6-23"></a> <span class="cf">if</span> char.isdigit():</span>
<span id="cb6-24"><a href="#cb6-24"></a> numeric <span class="op">+=</span> char</span>
<span id="cb6-25"><a href="#cb6-25"></a> <span class="cf">else</span>:</span>
<span id="cb6-26"><a href="#cb6-26"></a> numeric <span class="op">+=</span> <span class="bu">str</span>(<span class="bu">ord</span>(char) <span class="op">-</span> <span class="dv">55</span>)</span>
<span id="cb6-27"><a href="#cb6-27"></a></span>
<span id="cb6-28"><a href="#cb6-28"></a> <span class="cf">return</span> <span class="bu">int</span>(numeric) <span class="op">%</span> <span class="dv">97</span> <span class="op">==</span> <span class="dv">1</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<p>With validation, we can boost confidence for valid numbers and flag invalid ones as <code>POSSIBLE_CARD_PATTERN</code>.</p>
</section>
<section id="step-6-json-blob-extraction-layer-2.5" class="level2">
<h2 class="anchored" data-anchor-id="step-6-json-blob-extraction-layer-2.5">Step 6: JSON Blob Extraction (Layer 2.5)</h2>
<p>PII often hides in JSON payloads within logs or messages:</p>
<div class="sourceCode" id="cb7"><pre class="sourceCode numberSource python number-lines code-with-copy"><code class="sourceCode python"><span id="cb7-1"><a href="#cb7-1"></a><span class="im">import</span> json</span>
<span id="cb7-2"><a href="#cb7-2"></a></span>
<span id="cb7-3"><a href="#cb7-3"></a><span class="kw">def</span> extract_json_strings(text: <span class="bu">str</span>) <span class="op">-&gt;</span> <span class="bu">list</span>[<span class="bu">tuple</span>[<span class="bu">str</span>, <span class="bu">int</span>, <span class="bu">int</span>]]:</span>
<span id="cb7-4"><a href="#cb7-4"></a> <span class="co">"""Find and extract JSON objects from text."""</span></span>
<span id="cb7-5"><a href="#cb7-5"></a> json_objects <span class="op">=</span> []</span>
<span id="cb7-6"><a href="#cb7-6"></a></span>
<span id="cb7-7"><a href="#cb7-7"></a> <span class="co"># Find potential JSON starts</span></span>
<span id="cb7-8"><a href="#cb7-8"></a> <span class="cf">for</span> i, char <span class="kw">in</span> <span class="bu">enumerate</span>(text):</span>
<span id="cb7-9"><a href="#cb7-9"></a> <span class="cf">if</span> char <span class="op">==</span> <span class="st">'{'</span>:</span>
<span id="cb7-10"><a href="#cb7-10"></a> depth <span class="op">=</span> <span class="dv">0</span></span>
<span id="cb7-11"><a href="#cb7-11"></a> <span class="cf">for</span> j <span class="kw">in</span> <span class="bu">range</span>(i, <span class="bu">len</span>(text)):</span>
<span id="cb7-12"><a href="#cb7-12"></a> <span class="cf">if</span> text[j] <span class="op">==</span> <span class="st">'{'</span>:</span>
<span id="cb7-13"><a href="#cb7-13"></a> depth <span class="op">+=</span> <span class="dv">1</span></span>
<span id="cb7-14"><a href="#cb7-14"></a> <span class="cf">elif</span> text[j] <span class="op">==</span> <span class="st">'}'</span>:</span>
<span id="cb7-15"><a href="#cb7-15"></a> depth <span class="op">-=</span> <span class="dv">1</span></span>
<span id="cb7-16"><a href="#cb7-16"></a> <span class="cf">if</span> depth <span class="op">==</span> <span class="dv">0</span>:</span>
<span id="cb7-17"><a href="#cb7-17"></a> <span class="cf">try</span>:</span>
<span id="cb7-18"><a href="#cb7-18"></a> candidate <span class="op">=</span> text[i:j<span class="op">+</span><span class="dv">1</span>]</span>
<span id="cb7-19"><a href="#cb7-19"></a> json.loads(candidate) <span class="co"># Validate</span></span>
<span id="cb7-20"><a href="#cb7-20"></a> json_objects.append((candidate, i, j<span class="op">+</span><span class="dv">1</span>))</span>
<span id="cb7-21"><a href="#cb7-21"></a> <span class="cf">except</span> json.JSONDecodeError:</span>
<span id="cb7-22"><a href="#cb7-22"></a> <span class="cf">pass</span></span>
<span id="cb7-23"><a href="#cb7-23"></a> <span class="cf">break</span></span>
<span id="cb7-24"><a href="#cb7-24"></a></span>
<span id="cb7-25"><a href="#cb7-25"></a> <span class="cf">return</span> json_objects</span>
<span id="cb7-26"><a href="#cb7-26"></a></span>
<span id="cb7-27"><a href="#cb7-27"></a><span class="kw">def</span> deep_scan_json(json_str: <span class="bu">str</span>) <span class="op">-&gt;</span> <span class="bu">list</span>[<span class="bu">str</span>]:</span>
<span id="cb7-28"><a href="#cb7-28"></a> <span class="co">"""Recursively extract all string values from JSON."""</span></span>
<span id="cb7-29"><a href="#cb7-29"></a> values <span class="op">=</span> []</span>
<span id="cb7-30"><a href="#cb7-30"></a></span>
<span id="cb7-31"><a href="#cb7-31"></a> <span class="kw">def</span> extract(obj):</span>
<span id="cb7-32"><a href="#cb7-32"></a> <span class="cf">if</span> <span class="bu">isinstance</span>(obj, <span class="bu">str</span>):</span>
<span id="cb7-33"><a href="#cb7-33"></a> values.append(obj)</span>
<span id="cb7-34"><a href="#cb7-34"></a> <span class="cf">elif</span> <span class="bu">isinstance</span>(obj, <span class="bu">dict</span>):</span>
<span id="cb7-35"><a href="#cb7-35"></a> <span class="cf">for</span> v <span class="kw">in</span> obj.values():</span>
<span id="cb7-36"><a href="#cb7-36"></a> extract(v)</span>
<span id="cb7-37"><a href="#cb7-37"></a> <span class="cf">elif</span> <span class="bu">isinstance</span>(obj, <span class="bu">list</span>):</span>
<span id="cb7-38"><a href="#cb7-38"></a> <span class="cf">for</span> item <span class="kw">in</span> obj:</span>
<span id="cb7-39"><a href="#cb7-39"></a> extract(item)</span>
<span id="cb7-40"><a href="#cb7-40"></a></span>
<span id="cb7-41"><a href="#cb7-41"></a> <span class="cf">try</span>:</span>
<span id="cb7-42"><a href="#cb7-42"></a> extract(json.loads(json_str))</span>
<span id="cb7-43"><a href="#cb7-43"></a> <span class="cf">except</span>:</span>
<span id="cb7-44"><a href="#cb7-44"></a> <span class="cf">pass</span></span>
<span id="cb7-45"><a href="#cb7-45"></a></span>
<span id="cb7-46"><a href="#cb7-46"></a> <span class="cf">return</span> values</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</section>
<section id="step-7-base64-auto-decoding-layer-2.6" class="level2">
<h2 class="anchored" data-anchor-id="step-7-base64-auto-decoding-layer-2.6">Step 7: Base64 Auto-Decoding (Layer 2.6)</h2>
<p>Encoded PII is common in API responses and logs:</p>
<div class="sourceCode" id="cb8"><pre class="sourceCode numberSource python number-lines code-with-copy"><code class="sourceCode python"><span id="cb8-1"><a href="#cb8-1"></a><span class="im">import</span> base64</span>
<span id="cb8-2"><a href="#cb8-2"></a></span>
<span id="cb8-3"><a href="#cb8-3"></a><span class="kw">def</span> is_valid_base64(s: <span class="bu">str</span>) <span class="op">-&gt;</span> <span class="bu">bool</span>:</span>
<span id="cb8-4"><a href="#cb8-4"></a> <span class="co">"""Check if string is valid base64."""</span></span>
<span id="cb8-5"><a href="#cb8-5"></a> <span class="cf">if</span> <span class="bu">len</span>(s) <span class="op">&lt;</span> <span class="dv">20</span> <span class="kw">or</span> <span class="bu">len</span>(s) <span class="op">%</span> <span class="dv">4</span> <span class="op">!=</span> <span class="dv">0</span>:</span>
<span id="cb8-6"><a href="#cb8-6"></a> <span class="cf">return</span> <span class="va">False</span></span>
<span id="cb8-7"><a href="#cb8-7"></a> <span class="cf">try</span>:</span>
<span id="cb8-8"><a href="#cb8-8"></a> decoded <span class="op">=</span> base64.b64decode(s, validate<span class="op">=</span><span class="va">True</span>)</span>
<span id="cb8-9"><a href="#cb8-9"></a> decoded.decode(<span class="st">'utf-8'</span>) <span class="co"># Must be valid UTF-8</span></span>
<span id="cb8-10"><a href="#cb8-10"></a> <span class="cf">return</span> <span class="va">True</span></span>
<span id="cb8-11"><a href="#cb8-11"></a> <span class="cf">except</span>:</span>
<span id="cb8-12"><a href="#cb8-12"></a> <span class="cf">return</span> <span class="va">False</span></span>
<span id="cb8-13"><a href="#cb8-13"></a></span>
<span id="cb8-14"><a href="#cb8-14"></a><span class="kw">def</span> decode_base64_strings(text: <span class="bu">str</span>) <span class="op">-&gt;</span> <span class="bu">list</span>[<span class="bu">tuple</span>[<span class="bu">str</span>, <span class="bu">str</span>, <span class="bu">int</span>, <span class="bu">int</span>]]:</span>
<span id="cb8-15"><a href="#cb8-15"></a> <span class="co">"""Find and decode base64 strings."""</span></span>
<span id="cb8-16"><a href="#cb8-16"></a> results <span class="op">=</span> []</span>
<span id="cb8-17"><a href="#cb8-17"></a> pattern <span class="op">=</span> <span class="vs">r'[A-Za-z0-9+/]{20,}={0,2}'</span></span>
<span id="cb8-18"><a href="#cb8-18"></a></span>
<span id="cb8-19"><a href="#cb8-19"></a> <span class="cf">for</span> match <span class="kw">in</span> re.finditer(pattern, text):</span>
<span id="cb8-20"><a href="#cb8-20"></a> candidate <span class="op">=</span> match.group()</span>
<span id="cb8-21"><a href="#cb8-21"></a> <span class="cf">if</span> is_valid_base64(candidate):</span>
<span id="cb8-22"><a href="#cb8-22"></a> <span class="cf">try</span>:</span>
<span id="cb8-23"><a href="#cb8-23"></a> decoded <span class="op">=</span> base64.b64decode(candidate).decode(<span class="st">'utf-8'</span>)</span>
<span id="cb8-24"><a href="#cb8-24"></a> results.append((candidate, decoded, match.start(), match.end()))</span>
<span id="cb8-25"><a href="#cb8-25"></a> <span class="cf">except</span>:</span>
<span id="cb8-26"><a href="#cb8-26"></a> <span class="cf">pass</span></span>
<span id="cb8-27"><a href="#cb8-27"></a></span>
<span id="cb8-28"><a href="#cb8-28"></a> <span class="cf">return</span> results</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</section>
<section id="step-8-build-the-fastapi-endpoint" class="level2">
<h2 class="anchored" data-anchor-id="step-8-build-the-fastapi-endpoint">Step 8: Build the FastAPI Endpoint</h2>
<p>Wire everything together in an API endpoint:</p>
<div class="sourceCode" id="cb9"><pre class="sourceCode numberSource python number-lines code-with-copy"><code class="sourceCode python"><span id="cb9-1"><a href="#cb9-1"></a><span class="im">from</span> fastapi <span class="im">import</span> APIRouter, Form</span>
<span id="cb9-2"><a href="#cb9-2"></a></span>
<span id="cb9-3"><a href="#cb9-3"></a>router <span class="op">=</span> APIRouter(prefix<span class="op">=</span><span class="st">"/api/privacy"</span>, tags<span class="op">=</span>[<span class="st">"privacy"</span>])</span>
<span id="cb9-4"><a href="#cb9-4"></a></span>
<span id="cb9-5"><a href="#cb9-5"></a><span class="at">@router.post</span>(<span class="st">"/scan-text"</span>)</span>
<span id="cb9-6"><a href="#cb9-6"></a><span class="cf">async</span> <span class="kw">def</span> scan_text(</span>
<span id="cb9-7"><a href="#cb9-7"></a> text: <span class="bu">str</span> <span class="op">=</span> Form(...),</span>
<span id="cb9-8"><a href="#cb9-8"></a> sensitivity: <span class="bu">str</span> <span class="op">=</span> Form(<span class="st">"medium"</span>)</span>
<span id="cb9-9"><a href="#cb9-9"></a>):</span>
<span id="cb9-10"><a href="#cb9-10"></a> <span class="co">"""Main PII scanning endpoint."""</span></span>
<span id="cb9-11"><a href="#cb9-11"></a></span>
<span id="cb9-12"><a href="#cb9-12"></a> <span class="co"># Layer 1: Basic pattern matching</span></span>
<span id="cb9-13"><a href="#cb9-13"></a> entities <span class="op">=</span> detect_pii_basic(text)</span>
<span id="cb9-14"><a href="#cb9-14"></a></span>
<span id="cb9-15"><a href="#cb9-15"></a> <span class="co"># Layer 2: Normalized text scan</span></span>
<span id="cb9-16"><a href="#cb9-16"></a> normalized, mappings <span class="op">=</span> normalize_text(text)</span>
<span id="cb9-17"><a href="#cb9-17"></a> normalized_entities <span class="op">=</span> detect_pii_basic(normalized)</span>
<span id="cb9-18"><a href="#cb9-18"></a> <span class="co"># ... map positions back to original</span></span>
<span id="cb9-19"><a href="#cb9-19"></a></span>
<span id="cb9-20"><a href="#cb9-20"></a> <span class="co"># Layer 2.5: JSON extraction</span></span>
<span id="cb9-21"><a href="#cb9-21"></a> <span class="cf">for</span> json_str, start, end <span class="kw">in</span> extract_json_strings(text):</span>
<span id="cb9-22"><a href="#cb9-22"></a> <span class="cf">for</span> value <span class="kw">in</span> deep_scan_json(json_str):</span>
<span id="cb9-23"><a href="#cb9-23"></a> entities.extend(detect_pii_basic(value))</span>
<span id="cb9-24"><a href="#cb9-24"></a></span>
<span id="cb9-25"><a href="#cb9-25"></a> <span class="co"># Layer 2.6: Base64 decoding</span></span>
<span id="cb9-26"><a href="#cb9-26"></a> <span class="cf">for</span> original, decoded, start, end <span class="kw">in</span> decode_base64_strings(text):</span>
<span id="cb9-27"><a href="#cb9-27"></a> decoded_entities <span class="op">=</span> detect_pii_basic(decoded)</span>
<span id="cb9-28"><a href="#cb9-28"></a> <span class="cf">for</span> e <span class="kw">in</span> decoded_entities:</span>
<span id="cb9-29"><a href="#cb9-29"></a> e.<span class="bu">type</span> <span class="op">=</span> <span class="ss">f"</span><span class="sc">{</span>e<span class="sc">.</span><span class="bu">type</span><span class="sc">}</span><span class="ss">_BASE64_ENCODED"</span></span>
<span id="cb9-30"><a href="#cb9-30"></a> entities.extend(decoded_entities)</span>
<span id="cb9-31"><a href="#cb9-31"></a></span>
<span id="cb9-32"><a href="#cb9-32"></a> <span class="co"># Layer 4: Validation</span></span>
<span id="cb9-33"><a href="#cb9-33"></a> <span class="cf">for</span> entity <span class="kw">in</span> entities:</span>
<span id="cb9-34"><a href="#cb9-34"></a> <span class="cf">if</span> entity.<span class="bu">type</span> <span class="op">==</span> <span class="st">"CREDIT_CARD"</span>:</span>
<span id="cb9-35"><a href="#cb9-35"></a> <span class="cf">if</span> luhn_checksum(entity.value):</span>
<span id="cb9-36"><a href="#cb9-36"></a> entity.confidence <span class="op">=</span> <span class="fl">0.95</span></span>
<span id="cb9-37"><a href="#cb9-37"></a> <span class="cf">else</span>:</span>
<span id="cb9-38"><a href="#cb9-38"></a> entity.<span class="bu">type</span> <span class="op">=</span> <span class="st">"POSSIBLE_CARD_PATTERN"</span></span>
<span id="cb9-39"><a href="#cb9-39"></a> entity.confidence <span class="op">=</span> <span class="fl">0.5</span></span>
<span id="cb9-40"><a href="#cb9-40"></a></span>
<span id="cb9-41"><a href="#cb9-41"></a> <span class="co"># Deduplicate and sort</span></span>
<span id="cb9-42"><a href="#cb9-42"></a> entities <span class="op">=</span> deduplicate_entities(entities)</span>
<span id="cb9-43"><a href="#cb9-43"></a></span>
<span id="cb9-44"><a href="#cb9-44"></a> <span class="co"># Generate masked preview</span></span>
<span id="cb9-45"><a href="#cb9-45"></a> redacted <span class="op">=</span> mask_pii(text, entities)</span>
<span id="cb9-46"><a href="#cb9-46"></a></span>
<span id="cb9-47"><a href="#cb9-47"></a> <span class="cf">return</span> {</span>
<span id="cb9-48"><a href="#cb9-48"></a> <span class="st">"entities"</span>: [e.<span class="bu">dict</span>() <span class="cf">for</span> e <span class="kw">in</span> entities],</span>
<span id="cb9-49"><a href="#cb9-49"></a> <span class="st">"redacted_preview"</span>: redacted,</span>
<span id="cb9-50"><a href="#cb9-50"></a> <span class="st">"summary"</span>: generate_summary(entities)</span>
<span id="cb9-51"><a href="#cb9-51"></a> }</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</section>
<section id="step-9-create-the-sveltekit-frontend" class="level2">
<h2 class="anchored" data-anchor-id="step-9-create-the-sveltekit-frontend">Step 9: Create the SvelteKit Frontend</h2>
<p>Build an interactive UI in <code>frontend/src/routes/privacy-scanner/+page.svelte</code>:</p>
<div class="sourceCode" id="cb10"><pre class="sourceCode numberSource svelte number-lines code-with-copy"><code class="sourceCode"><span id="cb10-1"><a href="#cb10-1"></a>&lt;script lang="ts"&gt;</span>
<span id="cb10-2"><a href="#cb10-2"></a> let inputText = '';</span>
<span id="cb10-3"><a href="#cb10-3"></a> let results: any = null;</span>
<span id="cb10-4"><a href="#cb10-4"></a> let loading = false;</span>
<span id="cb10-5"><a href="#cb10-5"></a></span>
<span id="cb10-6"><a href="#cb10-6"></a> async function scanText() {</span>
<span id="cb10-7"><a href="#cb10-7"></a> loading = true;</span>
<span id="cb10-8"><a href="#cb10-8"></a> const formData = new FormData();</span>
<span id="cb10-9"><a href="#cb10-9"></a> formData.append('text', inputText);</span>
<span id="cb10-10"><a href="#cb10-10"></a></span>
<span id="cb10-11"><a href="#cb10-11"></a> const response = await fetch('/api/privacy/scan-text', {</span>
<span id="cb10-12"><a href="#cb10-12"></a> method: 'POST',</span>
<span id="cb10-13"><a href="#cb10-13"></a> body: formData</span>
<span id="cb10-14"><a href="#cb10-14"></a> });</span>
<span id="cb10-15"><a href="#cb10-15"></a></span>
<span id="cb10-16"><a href="#cb10-16"></a> results = await response.json();</span>
<span id="cb10-17"><a href="#cb10-17"></a> loading = false;</span>
<span id="cb10-18"><a href="#cb10-18"></a> }</span>
<span id="cb10-19"><a href="#cb10-19"></a>&lt;/script&gt;</span>
<span id="cb10-20"><a href="#cb10-20"></a></span>
<span id="cb10-21"><a href="#cb10-21"></a>&lt;div class="container mx-auto p-6"&gt;</span>
<span id="cb10-22"><a href="#cb10-22"></a> &lt;h1 class="text-2xl font-bold mb-4"&gt;Privacy Scanner&lt;/h1&gt;</span>
<span id="cb10-23"><a href="#cb10-23"></a></span>
<span id="cb10-24"><a href="#cb10-24"></a> &lt;textarea</span>
<span id="cb10-25"><a href="#cb10-25"></a> bind:value={inputText}</span>
<span id="cb10-26"><a href="#cb10-26"></a> class="w-full h-48 p-4 border rounded"</span>
<span id="cb10-27"><a href="#cb10-27"></a> placeholder="Paste text to scan for PII..."</span>
<span id="cb10-28"><a href="#cb10-28"></a> &gt;&lt;/textarea&gt;</span>
<span id="cb10-29"><a href="#cb10-29"></a></span>
<span id="cb10-30"><a href="#cb10-30"></a> &lt;button</span>
<span id="cb10-31"><a href="#cb10-31"></a> on:click={scanText}</span>
<span id="cb10-32"><a href="#cb10-32"></a> disabled={loading}</span>
<span id="cb10-33"><a href="#cb10-33"></a> class="mt-4 px-6 py-2 bg-blue-600 text-white rounded"</span>
<span id="cb10-34"><a href="#cb10-34"></a> &gt;</span>
<span id="cb10-35"><a href="#cb10-35"></a> {loading ? 'Scanning...' : 'Scan for PII'}</span>
<span id="cb10-36"><a href="#cb10-36"></a> &lt;/button&gt;</span>
<span id="cb10-37"><a href="#cb10-37"></a></span>
<span id="cb10-38"><a href="#cb10-38"></a> {#if results}</span>
<span id="cb10-39"><a href="#cb10-39"></a> &lt;div class="mt-6"&gt;</span>
<span id="cb10-40"><a href="#cb10-40"></a> &lt;h2 class="text-xl font-semibold"&gt;Results&lt;/h2&gt;</span>
<span id="cb10-41"><a href="#cb10-41"></a></span>
<span id="cb10-42"><a href="#cb10-42"></a> &lt;!-- Entity badges --&gt;</span>
<span id="cb10-43"><a href="#cb10-43"></a> &lt;div class="flex flex-wrap gap-2 mt-4"&gt;</span>
<span id="cb10-44"><a href="#cb10-44"></a> {#each results.entities as entity}</span>
<span id="cb10-45"><a href="#cb10-45"></a> &lt;span class="px-3 py-1 rounded-full bg-red-100 text-red-800"&gt;</span>
<span id="cb10-46"><a href="#cb10-46"></a> {entity.type}: {entity.value}</span>
<span id="cb10-47"><a href="#cb10-47"></a> &lt;/span&gt;</span>
<span id="cb10-48"><a href="#cb10-48"></a> {/each}</span>
<span id="cb10-49"><a href="#cb10-49"></a> &lt;/div&gt;</span>
<span id="cb10-50"><a href="#cb10-50"></a></span>
<span id="cb10-51"><a href="#cb10-51"></a> &lt;!-- Redacted preview --&gt;</span>
<span id="cb10-52"><a href="#cb10-52"></a> &lt;div class="mt-4 p-4 bg-gray-100 rounded font-mono"&gt;</span>
<span id="cb10-53"><a href="#cb10-53"></a> {results.redacted_preview}</span>
<span id="cb10-54"><a href="#cb10-54"></a> &lt;/div&gt;</span>
<span id="cb10-55"><a href="#cb10-55"></a> &lt;/div&gt;</span>
<span id="cb10-56"><a href="#cb10-56"></a> {/if}</span>
<span id="cb10-57"><a href="#cb10-57"></a>&lt;/div&gt;</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</section>
<section id="step-10-add-security-features" class="level2">
<h2 class="anchored" data-anchor-id="step-10-add-security-features">Step 10: Add Security Features</h2>
<p>For production deployment, implement ephemeral processing:</p>
<div class="sourceCode" id="cb11"><pre class="sourceCode numberSource python number-lines code-with-copy"><code class="sourceCode python"><span id="cb11-1"><a href="#cb11-1"></a><span class="co"># In main.py - ensure no PII logging</span></span>
<span id="cb11-2"><a href="#cb11-2"></a><span class="im">import</span> logging</span>
<span id="cb11-3"><a href="#cb11-3"></a></span>
<span id="cb11-4"><a href="#cb11-4"></a><span class="kw">class</span> PIIFilter(logging.Filter):</span>
<span id="cb11-5"><a href="#cb11-5"></a> <span class="kw">def</span> <span class="bu">filter</span>(<span class="va">self</span>, record):</span>
<span id="cb11-6"><a href="#cb11-6"></a> <span class="co"># Never log request bodies that might contain PII</span></span>
<span id="cb11-7"><a href="#cb11-7"></a> <span class="cf">return</span> <span class="st">'text='</span> <span class="kw">not</span> <span class="kw">in</span> <span class="bu">str</span>(record.msg)</span>
<span id="cb11-8"><a href="#cb11-8"></a></span>
<span id="cb11-9"><a href="#cb11-9"></a>logging.getLogger().addFilter(PIIFilter())</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<p>And add coordinates-only mode for ultra-sensitive clients:</p>
<div class="sourceCode" id="cb12"><pre class="sourceCode numberSource python number-lines code-with-copy"><code class="sourceCode python"><span id="cb12-1"><a href="#cb12-1"></a><span class="at">@router.post</span>(<span class="st">"/scan-text"</span>)</span>
<span id="cb12-2"><a href="#cb12-2"></a><span class="cf">async</span> <span class="kw">def</span> scan_text(</span>
<span id="cb12-3"><a href="#cb12-3"></a> text: <span class="bu">str</span> <span class="op">=</span> Form(...),</span>
<span id="cb12-4"><a href="#cb12-4"></a> coordinates_only: <span class="bu">bool</span> <span class="op">=</span> Form(<span class="va">False</span>) <span class="co"># Client-side redaction mode</span></span>
<span id="cb12-5"><a href="#cb12-5"></a>):</span>
<span id="cb12-6"><a href="#cb12-6"></a> entities <span class="op">=</span> detect_pii_multilayer(text)</span>
<span id="cb12-7"><a href="#cb12-7"></a></span>
<span id="cb12-8"><a href="#cb12-8"></a> <span class="cf">if</span> coordinates_only:</span>
<span id="cb12-9"><a href="#cb12-9"></a> <span class="co"># Return only positions, not actual values</span></span>
<span id="cb12-10"><a href="#cb12-10"></a> <span class="cf">return</span> {</span>
<span id="cb12-11"><a href="#cb12-11"></a> <span class="st">"entities"</span>: [</span>
<span id="cb12-12"><a href="#cb12-12"></a> {<span class="st">"type"</span>: e.<span class="bu">type</span>, <span class="st">"start"</span>: e.start, <span class="st">"end"</span>: e.end, <span class="st">"length"</span>: e.end <span class="op">-</span> e.start}</span>
<span id="cb12-13"><a href="#cb12-13"></a> <span class="cf">for</span> e <span class="kw">in</span> entities</span>
<span id="cb12-14"><a href="#cb12-14"></a> ],</span>
<span id="cb12-15"><a href="#cb12-15"></a> <span class="st">"coordinates_only"</span>: <span class="va">True</span></span>
<span id="cb12-16"><a href="#cb12-16"></a> }</span>
<span id="cb12-17"><a href="#cb12-17"></a></span>
<span id="cb12-18"><a href="#cb12-18"></a> <span class="co"># Normal response with values</span></span>
<span id="cb12-19"><a href="#cb12-19"></a> <span class="cf">return</span> {<span class="st">"entities"</span>: [e.<span class="bu">dict</span>() <span class="cf">for</span> e <span class="kw">in</span> entities], ...}</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</section>
<section id="conclusion" class="level2">
<h2 class="anchored" data-anchor-id="conclusion">Conclusion</h2>
<p>You've now built a multi-layer Privacy Scanner that can:</p>
<ul>
<li>Detect 40+ PII types using regex patterns</li>
<li>Defeat obfuscation through text normalization</li>
<li>Extract PII from JSON payloads and Base64 encodings</li>
<li>Validate checksums to reduce false positives</li>
<li>Provide a clean web interface for interactive scanning</li>
<li>Operate in secure, coordinates-only mode</li>
</ul>
<p><strong>Next steps</strong> to enhance your scanner:</p>
<ol type="1">
<li>Add machine learning for name/address detection</li>
<li>Implement language-specific patterns (EU VAT, UK NI numbers)</li>
<li>Build CI/CD integration for automated pre-commit scanning</li>
<li>Add PDF and document parsing capabilities</li>
</ol>
<p>The complete source code is available in the AI Tools Suite repository. Happy scanning!</p>
</section>
</main>
<!-- /main column -->
<script id="quarto-html-after-body" type="application/javascript">
window.document.addEventListener("DOMContentLoaded", function (event) {
// Mirror a stylesheet's `data-mode` attribute onto <body>: add the matching
// `quarto-dark` / `quarto-light` class and remove the opposite one.
//   bsSheetEl: element whose `data-mode` attribute is read ("dark" selects
//              the dark class; any other value selects the light class).
const toggleBodyColorMode = (bsSheetEl) => {
  const isDark = bsSheetEl.getAttribute("data-mode") === "dark";
  const [wanted, unwanted] = isDark
    ? ["quarto-dark", "quarto-light"]
    : ["quarto-light", "quarto-dark"];
  const bodyClasses = window.document.querySelector("body").classList;
  bodyClasses.add(wanted);
  bodyClasses.remove(unwanted);
};
// Apply the colour mode of the `link#quarto-bootstrap` stylesheet to <body>.
// Does nothing when the page has no such element.
const toggleBodyColorPrimary = () => {
  const primarySheet = window.document.querySelector("link#quarto-bootstrap");
  if (!primarySheet) {
    return;
  }
  toggleBodyColorMode(primarySheet);
};
toggleBodyColorPrimary();
// Glyph used for the heading anchor links. "\uE9CB" is AnchorJS's built-in
// link icon, a private-use code point drawn with the library's icon font.
// It is written as an escape because the raw character is invisible in most
// editors and is easily dropped on copy or re-encoding, which leaves the
// anchors with an empty, invisible icon.
const icon = "\uE9CB";
const anchorJS = new window.AnchorJS();
anchorJS.options = {
  placement: 'right',
  icon: icon
};
// Attach an anchor link to every element marked with the `anchored` class.
anchorJS.add('.anchored');
// True when the element carries at least one class starting with
// `code-annotation-`.
const isCodeAnnotation = (el) => {
  return Array.from(el.classList).some((clz) => clz.startsWith('code-annotation-'));
};
// ClipboardJS "success" handler: flash a "Copied!" state on the copy button
// for one second, then restore it.
//   e: the success event; `e.trigger` is the clicked button and
//      `e.clearSelection()` drops the temporary text selection.
const onCopySuccess = function(e) {
  const copyButton = e.trigger;
  const copiedLabel = "Copied!";

  // Don't keep focus on the button, and switch it to the "checked" look.
  copyButton.blur();
  copyButton.classList.add('code-copy-button-checked');
  const previousTitle = copyButton.getAttribute("title");
  copyButton.setAttribute("title", copiedLabel);

  // A Bootstrap tooltip is shown only when Bootstrap is loaded on the page.
  let tip;
  if (window.bootstrap) {
    const tipAttributes = [
      ["data-bs-toggle", "tooltip"],
      ["data-bs-placement", "left"],
      ["data-bs-title", copiedLabel],
    ];
    for (const [attrName, attrValue] of tipAttributes) {
      copyButton.setAttribute(attrName, attrValue);
    }
    tip = new bootstrap.Tooltip(copyButton, {
      trigger: "manual",
      customClass: "code-copy-button-tooltip",
      offset: [0, -8]
    });
    tip.show();
  }

  // Undo everything above after one second.
  const revert = () => {
    if (tip) {
      tip.hide();
      for (const attrName of ["data-bs-title", "data-bs-toggle", "data-bs-placement"]) {
        copyButton.removeAttribute(attrName);
      }
    }
    copyButton.setAttribute("title", previousTitle);
    copyButton.classList.remove('code-copy-button-checked');
  };
  setTimeout(revert, 1000);

  // clear code selection
  e.clearSelection();
};
// Build the clipboard text for a copy button: the text of the element that
// immediately precedes the button, with code-annotation children removed.
//   trigger: the clicked copy button.
//   returns: `innerText` of a cleaned deep clone; the page itself is untouched.
const getTextToCopy = function(trigger) {
  const codeEl = trigger.previousElementSibling.cloneNode(true);
  // `children` is a live collection: removing an element while iterating it
  // shifts the later ones down, so the element right after each removed node
  // would be skipped. Iterate over a snapshot so consecutive annotation
  // markers are all removed.
  for (const childEl of Array.from(codeEl.children)) {
    if (isCodeAnnotation(childEl)) {
      childEl.remove();
    }
  }
  return codeEl.innerText;
}
// Copy buttons in the main document.
const clipboard = new window.ClipboardJS('.code-copy-button:not([data-in-quarto-modal])', {
  text: getTextToCopy
});
clipboard.on('success', onCopySuccess);

// Copy buttons inside the embedded-source-code modal, when the page has one.
const sourceModalEl = window.document.getElementById('quarto-embedded-source-code-modal');
if (sourceModalEl) {
  // For code content inside modals, clipBoardJS needs to be initialized with a container option
  // TODO: Check when it could be a function (https://github.com/zenorocha/clipboard.js/issues/860)
  const modalClipboard = new window.ClipboardJS('.code-copy-button[data-in-quarto-modal]', {
    text: getTextToCopy,
    container: sourceModalEl
  });
  modalClipboard.on('success', onCopySuccess);
}
// A link counts as internal when it points at the current host, at a
// localhost URL, or is a mailto: link.
var localhostRegex = /^(?:http|https):\/\/localhost\:?[0-9]*\//;
var mailtoRegex = /^mailto:/;
// The host is matched literally. Regex metacharacters are escaped so that "."
// only matches a dot and an IPv6 host such as "[::1]:8080" is not read as a
// character class.
var escapedHost = window.location.host.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
var filterRegex = new RegExp('/' + escapedHost + '/');
var isInternal = (href) => {
  return filterRegex.test(href) || localhostRegex.test(href) || mailtoRegex.test(href);
}
// Inspect non-navigation links and adorn them if external
var links = window.document.querySelectorAll('a[href]:not(.nav-link):not(.navbar-brand):not(.toc-action):not(.sidebar-link):not(.sidebar-item-toggle):not(.pagination-link):not(.no-external):not([aria-hidden]):not(.dropdown-item):not(.quarto-navigation-tool):not(.about-link)');
for (var i=0; i<links.length; i++) {
  const link = links[i];
  if (!isInternal(link.href)) {
    // undo the damage that might have been done by quarto-nav.js in the case of
    // links that we want to consider external
    if (link.dataset.originalHref !== undefined) {
      link.href = link.dataset.originalHref;
    }
  }
}
// Attach a Quarto-themed tippy tooltip to `el`.
//   el:            element the tooltip is attached to.
//   contentFn:     optional; passed to tippy as `content`.
//   onTriggerFn:   optional; passed to tippy as `onTrigger`.
//   onUntriggerFn: optional; passed to tippy as `onUntrigger`.
// A hook that is not supplied is left out of the configuration entirely.
function tippyHover(el, contentFn, onTriggerFn, onUntriggerFn) {
  const tooltipConfig = {
    allowHTML: true,
    maxWidth: 500,
    delay: 100,
    arrow: false,
    // Mount the tooltip next to the element it belongs to.
    appendTo: (reference) => reference.parentElement,
    interactive: true,
    interactiveBorder: 10,
    theme: 'quarto',
    placement: 'bottom-start',
  };
  const optionalHooks = [
    ['content', contentFn],
    ['onTrigger', onTriggerFn],
    ['onUntrigger', onUntriggerFn],
  ];
  for (const [optionName, hook] of optionalHooks) {
    if (hook) {
      tooltipConfig[optionName] = hook;
    }
  }
  window.tippy(el, tooltipConfig);
}
// Footnote references: hovering one shows the footnote's HTML in a tooltip.
const noterefs = window.document.querySelectorAll('a[role="doc-noteref"]');
for (const ref of noterefs) {
  // Tooltip content: inner HTML of the footnote the reference points at, or
  // "" when no element with that id is in the page.
  const footnoteHtml = () => {
    // use id or data attribute instead here
    let target = ref.getAttribute('data-footnote-href') || ref.getAttribute('href');
    // Reduce an absolute URL to its fragment; anything else is kept as-is.
    try { target = new URL(target).hash; } catch {}
    const noteEl = window.document.getElementById(target.replace(/^#\/?/, ""));
    return noteEl ? noteEl.innerHTML : "";
  };
  tippyHover(ref, footnoteHtml);
}
// Cross-reference links (`a.quarto-xref`) found in the page.
const xrefs = window.document.querySelectorAll('a.quarto-xref');
// Produce the HTML string shown in a cross-reference tooltip.
//   id:   id of the referenced element, or null when the whole page is the
//         target.
//   note: the referenced element. It is modified in place (layout classes are
//         stripped and, in the non-section branch, its AnchorJS link is
//         removed), so pass a node that is safe to mutate.
//   returns: an HTML string taken from `note` (or from a trimmed copy of it).
const processXRef = (id, note) => {
// Strip column container classes
// (recursively, from `el` and every descendant element)
const stripColumnClz = (el) => {
el.classList.remove("page-full", "page-columns");
if (el.children) {
for (const child of el.children) {
stripColumnClz(child);
}
}
}
stripColumnClz(note)
if (id === null || id.startsWith('sec-')) {
// Special case sections, only their first couple elements
const container = document.createElement("div");
if (note.children && note.children.length > 2) {
// Keep the first child, then the first following child that is not an
// empty <p>; everything after that is left out of the preview.
container.appendChild(note.children[0].cloneNode(true));
for (let i = 1; i < note.children.length; i++) {
const child = note.children[i];
if (child.tagName === "P" && child.innerText === "") {
continue;
} else {
container.appendChild(child.cloneNode(true));
break;
}
}
// Typeset math in the preview when Quarto exposes a typesetter.
if (window.Quarto?.typesetMath) {
window.Quarto.typesetMath(container);
}
return container.innerHTML
} else {
// Two children or fewer: show the element's content unabridged.
if (window.Quarto?.typesetMath) {
window.Quarto.typesetMath(note);
}
return note.innerHTML;
}
} else {
// Remove any anchor links if they are present
const anchorLink = note.querySelector('a.anchorjs-link');
if (anchorLink) {
anchorLink.remove();
}
if (window.Quarto?.typesetMath) {
window.Quarto.typesetMath(note);
}
// TODO in 1.5, we should make sure this works without a callout special case
// Callouts are returned with their own wrapper element (outerHTML);
// everything else contributes only its content.
if (note.classList.contains("callout")) {
return note.outerHTML;
} else {
return note.innerHTML;
}
}
}
// Give every cross-reference link a tooltip whose content is resolved when the
// tooltip is triggered: from an element in this page when the link's fragment
// matches one, otherwise from the linked page, which is fetched and parsed.
for (var i=0; i<xrefs.length; i++) {
const xref = xrefs[i];
tippyHover(xref, undefined, function(instance) {
// Disable the tooltip while its content is being resolved; each branch below
// re-enables and shows it once it is done.
instance.disable();
let url = xref.getAttribute('href');
let hash = undefined;
if (url.startsWith('#')) {
hash = url;
} else {
// `hash` stays undefined when the URL constructor rejects the href.
try { hash = new URL(url).hash; } catch {}
}
if (hash) {
const id = hash.replace(/^#\/?/, "");
const note = window.document.getElementById(id);
if (note !== null) {
// Target is in this page: preview a clone so the page itself is not altered.
try {
const html = processXRef(id, note.cloneNode(true));
instance.setContent(html);
} finally {
// Runs even if building the preview throws.
instance.enable();
instance.show();
}
} else {
// See if we can fetch this
// (the href without its fragment), then look the id up in the parsed document
fetch(url.split('#')[0])
.then(res => res.text())
.then(html => {
const parser = new DOMParser();
const htmlDoc = parser.parseFromString(html, "text/html");
const note = htmlDoc.getElementById(id);
if (note !== null) {
const html = processXRef(id, note);
instance.setContent(html);
}
}).finally(() => {
// Show the tooltip whether or not content was found or the fetch succeeded.
instance.enable();
instance.show();
});
}
} else {
// See if we can fetch a full url (with no hash to target)
// This is a special case and we should probably do some content thinning / targeting
fetch(url)
.then(res => res.text())
.then(html => {
const parser = new DOMParser();
const htmlDoc = parser.parseFromString(html, "text/html");
const note = htmlDoc.querySelector('main.content');
if (note !== null) {
// This should only happen for chapter cross references
// (since there is no id in the URL)
// remove the first header
if (note.children.length > 0 && note.children[0].tagName === "HEADER") {
note.children[0].remove();
}
const html = processXRef(null, note);
instance.setContent(html);
}
}).finally(() => {
instance.enable();
instance.show();
});
}
// The untrigger hook below is intentionally empty.
}, function(instance) {
});
}
// The annotation element whose code lines are currently highlighted, or
// undefined when nothing is selected.
let selectedAnnoteEl;
// CSS selector for the <span> that carries the given code cell id and
// annotation number in its data-code-cell / data-code-annotation attributes.
const selectorForAnnotation = ( cell, annotation) => {
  return `span[data-code-cell="${cell}"][data-code-annotation="${annotation}"]`;
};
// Highlight the code lines that belong to an annotation.
//   annoteEl: element carrying `data-target-cell` and `data-target-annotation`.
// Looks up the matching annotation <span>, reads its comma-separated
// `data-code-lines`, and positions two absolutely positioned overlay <div>s
// (one over the lines, one in the cell's annotation gutter) to cover the first
// through last listed line. Records `annoteEl` as the current selection.
const selectCodeLines = (annoteEl) => {
// NOTE(review): `doc` is never used in this function.
const doc = window.document;
const targetCell = annoteEl.getAttribute("data-target-cell");
const targetAnnotation = annoteEl.getAttribute("data-target-annotation");
// NOTE(review): querySelector returns null when no span matches, in which
// case the next line throws. Presumably the generated markup always contains
// the span; confirm.
const annoteSpan = window.document.querySelector(selectorForAnnotation(targetCell, targetAnnotation));
const lines = annoteSpan.getAttribute("data-code-lines").split(",");
// Line element ids have the form "<cell id>-<line>".
const lineIds = lines.map((line) => {
return targetCell + "-" + line;
})
let top = null;
let height = null;
let parent = null;
// split() always yields at least one entry, so this branch is always taken.
if (lineIds.length > 0) {
//compute the position of the single el (top and bottom and make a div)
const el = window.document.getElementById(lineIds[0]);
top = el.offsetTop;
height = el.offsetHeight;
// The overlay is attached two levels above the line element.
parent = el.parentElement.parentElement;
if (lineIds.length > 1) {
// Several lines: stretch from the top of the first to the bottom of the last.
const lastEl = window.document.getElementById(lineIds[lineIds.length - 1]);
const bottom = lastEl.offsetTop + lastEl.offsetHeight;
height = bottom - top;
}
if (top !== null && height !== null && parent !== null) {
// cook up a div (if necessary) and position it
let div = window.document.getElementById("code-annotation-line-highlight");
if (div === null) {
div = window.document.createElement("div");
div.setAttribute("id", "code-annotation-line-highlight");
div.style.position = 'absolute';
parent.appendChild(div);
}
// Extend the highlight 2px above and below the measured lines.
div.style.top = top - 2 + "px";
div.style.height = height + 4 + "px";
div.style.left = 0;
// Same treatment for the marker in the cell's annotation gutter.
let gutterDiv = window.document.getElementById("code-annotation-line-highlight-gutter");
if (gutterDiv === null) {
gutterDiv = window.document.createElement("div");
gutterDiv.setAttribute("id", "code-annotation-line-highlight-gutter");
gutterDiv.style.position = 'absolute';
const codeCell = window.document.getElementById(targetCell);
const gutter = codeCell.querySelector('.code-annotation-gutter');
gutter.appendChild(gutterDiv);
}
gutterDiv.style.top = top - 2 + "px";
gutterDiv.style.height = height + 4 + "px";
}
// Remember the selection so it can be re-applied or toggled off later.
selectedAnnoteEl = annoteEl;
}
};
// Remove both highlight overlays (if present) and clear the current selection.
const unselectCodeLines = () => {
  for (const highlightId of ["code-annotation-line-highlight", "code-annotation-line-highlight-gutter"]) {
    const highlightEl = window.document.getElementById(highlightId);
    if (highlightEl) {
      highlightEl.remove();
    }
  }
  selectedAnnoteEl = undefined;
};
// Handle positioning of the toggle
// On window resize, recompute the highlight for the currently selected
// annotation (if any), since the measured line offsets may have changed.
// Calls are rate-limited through throttle() with a 10ms delay.
window.addEventListener(
  "resize",
  throttle(() => {
    if (selectedAnnoteEl) {
      selectCodeLines(selectedAnnoteEl);
    }
  }, 10)
);
// Wrap `fn` so that bursts of calls are collapsed.
//   fn: function to rate-limit.
//   ms: delay, in milliseconds, applied to calls that are held back.
//   returns: a wrapper with this behaviour:
//     - the first call runs `fn` immediately and closes the gate;
//     - while the gate is closed, each call cancels the previously scheduled
//       run and schedules `fn` (with the latest arguments) `ms` later;
//     - when a scheduled run fires, the gate reopens.
function throttle(fn, ms) {
  let gateClosed = false;
  let pendingTimer;
  return (...args) => {
    if (gateClosed) {
      // Only the most recent held-back call survives.
      if (pendingTimer) clearTimeout(pendingTimer);
      pendingTimer = setTimeout(() => {
        fn.apply(this, args);
        pendingTimer = gateClosed = false;
      }, ms);
      return;
    }
    // Gate open: run right away, then close it.
    fn.apply(this, args);
    gateClosed = true;
  };
}
// Attach click handler to the DT
// Clicking an annotation term highlights its code lines; clicking the term
// that is already selected clears the highlight again.
const annoteDls = window.document.querySelectorAll('dt[data-target-cell]');
const onAnnotationClick = (event) => {
  const clickedEl = event.target;
  // Decide before unselecting, because unselecting resets the selection.
  const wasSelected = clickedEl === selectedAnnoteEl;
  unselectCodeLines();
  if (wasSelected) {
    // Unselect the line
    clickedEl.classList.remove('code-annotation-active');
    return;
  }
  const activeEl = window.document.querySelector('dt[data-target-cell].code-annotation-active');
  if (activeEl) {
    activeEl.classList.remove('code-annotation-active');
  }
  selectCodeLines(clickedEl);
  clickedEl.classList.add('code-annotation-active');
};
annoteDls.forEach((annoteDlNode) => {
  annoteDlNode.addEventListener('click', onAnnotationClick);
});
// Walk up from `el` to the nearest ancestor that has a `data-cites` value.
//   returns: `{ el, cites }`, where `el` is the node directly below that
//            ancestor on the path walked and `cites` is the attribute split
//            on spaces; `undefined` when no ancestor has one.
const findCites = (el) => {
  let node = el;
  while (node.parentElement) {
    const citeList = node.parentElement.dataset.cites;
    if (citeList) {
      return {
        el: node,
        cites: citeList.split(' ')
      };
    }
    node = node.parentElement;
  }
  return undefined;
};
// Bibliography references: hovering a citation shows the entries it cites.
var bibliorefs = window.document.querySelectorAll('a[role="doc-biblioref"]');
for (const ref of bibliorefs) {
  const citeInfo = findCites(ref);
  if (!citeInfo) {
    continue;
  }
  // Tooltip content: one entry <div> per cited key, filled from the page's
  // `ref-<key>` element when it exists (left empty otherwise).
  const citationHtml = () => {
    const popup = window.document.createElement('div');
    for (const cite of citeInfo.cites) {
      const citeDiv = window.document.createElement('div');
      citeDiv.classList.add('hanging-indent');
      citeDiv.classList.add('csl-entry');
      const biblioDiv = window.document.getElementById('ref-' + cite);
      if (biblioDiv) {
        citeDiv.innerHTML = biblioDiv.innerHTML;
      }
      popup.appendChild(citeDiv);
    }
    return popup.innerHTML;
  };
  tippyHover(citeInfo.el, citationHtml);
}
});
</script>
</div> <!-- /content -->
</body></html>