ai-tools-suite/docs/building-privacy-scanner.html
2025-12-27 15:33:06 +00:00

975 lines
No EOL
65 KiB
HTML
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en"><head>
<meta charset="utf-8">
<meta name="generator" content="quarto-1.6.33">
<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">
<meta name="author" content="AI Tools Suite">
<meta name="dcterms.date" content="2024-12-23">
<title>Building a Privacy Scanner: A Step-by-Step Implementation Guide</title>
<style>
code{white-space: pre-wrap;}
span.smallcaps{font-variant: small-caps;}
div.columns{display: flex; gap: min(4vw, 1.5em);}
div.column{flex: auto; overflow-x: auto;}
div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
ul.task-list{list-style: none;}
ul.task-list li input[type="checkbox"] {
width: 0.8em;
margin: 0 0.8em 0.2em -1em; /* quarto-specific, see https://github.com/quarto-dev/quarto-cli/issues/4556 */
vertical-align: middle;
}
/* CSS for syntax highlighting */
pre > code.sourceCode { white-space: pre; position: relative; }
pre > code.sourceCode > span { line-height: 1.25; }
pre > code.sourceCode > span:empty { height: 1.2em; }
.sourceCode { overflow: visible; }
code.sourceCode > span { color: inherit; text-decoration: inherit; }
div.sourceCode { margin: 1em 0; }
pre.sourceCode { margin: 0; }
@media screen {
div.sourceCode { overflow: auto; }
}
@media print {
pre > code.sourceCode { white-space: pre-wrap; }
pre > code.sourceCode > span { display: inline-block; text-indent: -5em; padding-left: 5em; }
}
pre.numberSource code
{ counter-reset: source-line 0; }
pre.numberSource code > span
{ position: relative; left: -4em; counter-increment: source-line; }
pre.numberSource code > span > a:first-child::before
{ content: counter(source-line);
position: relative; left: -1em; text-align: right; vertical-align: baseline;
border: none; display: inline-block;
-webkit-touch-callout: none; -webkit-user-select: none;
-khtml-user-select: none; -moz-user-select: none;
-ms-user-select: none; user-select: none;
padding: 0 4px; width: 4em;
}
pre.numberSource { margin-left: 3em; padding-left: 4px; }
div.sourceCode
{ }
@media screen {
pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
}
</style>
<script src="building-privacy-scanner_files/libs/clipboard/clipboard.min.js"></script>
<script src="building-privacy-scanner_files/libs/quarto-html/quarto.js"></script>
<script src="building-privacy-scanner_files/libs/quarto-html/popper.min.js"></script>
<script src="building-privacy-scanner_files/libs/quarto-html/tippy.umd.min.js"></script>
<script src="building-privacy-scanner_files/libs/quarto-html/anchor.min.js"></script>
<link href="building-privacy-scanner_files/libs/quarto-html/tippy.css" rel="stylesheet">
<link href="building-privacy-scanner_files/libs/quarto-html/quarto-syntax-highlighting-07ba0ad10f5680c660e360ac31d2f3b6.css" rel="stylesheet" id="quarto-text-highlighting-styles">
<script src="building-privacy-scanner_files/libs/bootstrap/bootstrap.min.js"></script>
<link href="building-privacy-scanner_files/libs/bootstrap/bootstrap-icons.css" rel="stylesheet">
<link href="building-privacy-scanner_files/libs/bootstrap/bootstrap-fe6593aca1dacbc749dc3d2ba78c8639.min.css" rel="stylesheet" append-hash="true" id="quarto-bootstrap" data-mode="light">
</head>
<body>
<div id="quarto-content" class="page-columns page-rows-contents page-layout-article">
<div id="quarto-margin-sidebar" class="sidebar margin-sidebar">
<nav id="TOC" role="doc-toc" class="toc-active">
<h2 id="toc-title">Table of contents</h2>
<ul>
<li><a href="#introduction" id="toc-introduction" class="nav-link active" data-scroll-target="#introduction">Introduction</a></li>
<li><a href="#step-1-project-structure" id="toc-step-1-project-structure" class="nav-link" data-scroll-target="#step-1-project-structure">Step 1: Project Structure</a></li>
<li><a href="#step-2-define-pii-patterns" id="toc-step-2-define-pii-patterns" class="nav-link" data-scroll-target="#step-2-define-pii-patterns">Step 2: Define PII Patterns</a></li>
<li><a href="#step-3-build-the-basic-detection-engine" id="toc-step-3-build-the-basic-detection-engine" class="nav-link" data-scroll-target="#step-3-build-the-basic-detection-engine">Step 3: Build the Basic Detection Engine</a></li>
<li><a href="#step-4-add-text-normalization-layer-2" id="toc-step-4-add-text-normalization-layer-2" class="nav-link" data-scroll-target="#step-4-add-text-normalization-layer-2">Step 4: Add Text Normalization (Layer 2)</a></li>
<li><a href="#step-5-implement-checksum-validation-layer-4" id="toc-step-5-implement-checksum-validation-layer-4" class="nav-link" data-scroll-target="#step-5-implement-checksum-validation-layer-4">Step 5: Implement Checksum Validation (Layer 4)</a></li>
<li><a href="#step-6-json-blob-extraction-layer-2.5" id="toc-step-6-json-blob-extraction-layer-2.5" class="nav-link" data-scroll-target="#step-6-json-blob-extraction-layer-2.5">Step 6: JSON Blob Extraction (Layer 2.5)</a></li>
<li><a href="#step-7-base64-auto-decoding-layer-2.6" id="toc-step-7-base64-auto-decoding-layer-2.6" class="nav-link" data-scroll-target="#step-7-base64-auto-decoding-layer-2.6">Step 7: Base64 Auto-Decoding (Layer 2.6)</a></li>
<li><a href="#step-8-build-the-fastapi-endpoint" id="toc-step-8-build-the-fastapi-endpoint" class="nav-link" data-scroll-target="#step-8-build-the-fastapi-endpoint">Step 8: Build the FastAPI Endpoint</a></li>
<li><a href="#step-9-create-the-sveltekit-frontend" id="toc-step-9-create-the-sveltekit-frontend" class="nav-link" data-scroll-target="#step-9-create-the-sveltekit-frontend">Step 9: Create the SvelteKit Frontend</a></li>
<li><a href="#step-10-add-security-features" id="toc-step-10-add-security-features" class="nav-link" data-scroll-target="#step-10-add-security-features">Step 10: Add Security Features</a></li>
<li><a href="#conclusion" id="toc-conclusion" class="nav-link" data-scroll-target="#conclusion">Conclusion</a></li>
</ul>
</nav>
</div>
<main class="content" id="quarto-document-content">
<header id="title-block-header" class="quarto-title-block default">
<div class="quarto-title">
<h1 class="title">Building a Privacy Scanner: A Step-by-Step Implementation Guide</h1>
<div class="quarto-categories">
<div class="quarto-category">tutorial</div>
<div class="quarto-category">privacy</div>
<div class="quarto-category">pii-detection</div>
<div class="quarto-category">python</div>
<div class="quarto-category">svelte</div>
</div>
</div>
<div class="quarto-title-meta">
<div>
<div class="quarto-title-meta-heading">Author</div>
<div class="quarto-title-meta-contents">
<p>AI Tools Suite </p>
</div>
</div>
<div>
<div class="quarto-title-meta-heading">Published</div>
<div class="quarto-title-meta-contents">
<p class="date">December 23, 2024</p>
</div>
</div>
</div>
</header>
<section id="introduction" class="level2">
<h2 class="anchored" data-anchor-id="introduction">Introduction</h2>
<p>In this tutorial, we’ll build a production-grade Privacy Scanner from scratch. By the end, you’ll have a tool that detects 40+ types of Personally Identifiable Information (PII) using an eight-layer detection pipeline, complete with a modern web interface.</p>
<p>Our stack: <strong>FastAPI</strong> for the backend API, <strong>SvelteKit</strong> for the frontend, and <strong>Python regex</strong> with validation logic for detection.</p>
</section>
<section id="step-1-project-structure" class="level2">
<h2 class="anchored" data-anchor-id="step-1-project-structure">Step 1: Project Structure</h2>
<p>First, create the project scaffolding:</p>
<div class="sourceCode" id="cb1"><pre class="sourceCode numberSource bash number-lines code-with-copy"><code class="sourceCode bash"><span id="cb1-1"><a href="#cb1-1"></a><span class="fu">mkdir</span> <span class="at">-p</span> ai_tools_suite/<span class="dt">{backend/routers</span><span class="op">,</span><span class="dt">frontend/src/routes/privacy-scanner}</span></span>
<span id="cb1-2"><a href="#cb1-2"></a><span class="bu">cd</span> ai_tools_suite</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<p>Your directory structure should look like:</p>
<pre><code>ai_tools_suite/
├── backend/
│   ├── main.py
│   └── routers/
│       └── privacy.py
└── frontend/
    └── src/
        └── routes/
            └── privacy-scanner/
                └── +page.svelte</code></pre>
</section>
<section id="step-2-define-pii-patterns" class="level2">
<h2 class="anchored" data-anchor-id="step-2-define-pii-patterns">Step 2: Define PII Patterns</h2>
<p>The foundation of any PII scanner is its pattern library. Create <code>backend/routers/privacy.py</code> and start with the core patterns:</p>
<div class="sourceCode" id="cb3"><pre class="sourceCode numberSource python number-lines code-with-copy"><code class="sourceCode python"><span id="cb3-1"><a href="#cb3-1"></a><span class="im">import</span> re</span>
<span id="cb3-2"><a href="#cb3-2"></a><span class="im">from</span> typing <span class="im">import</span> List, Dict, Any</span>
<span id="cb3-3"><a href="#cb3-3"></a><span class="im">from</span> pydantic <span class="im">import</span> BaseModel</span>
<span id="cb3-4"><a href="#cb3-4"></a></span>
<span id="cb3-5"><a href="#cb3-5"></a><span class="kw">class</span> PIIEntity(BaseModel):</span>
<span id="cb3-6"><a href="#cb3-6"></a> <span class="bu">type</span>: <span class="bu">str</span></span>
<span id="cb3-7"><a href="#cb3-7"></a> value: <span class="bu">str</span></span>
<span id="cb3-8"><a href="#cb3-8"></a> start: <span class="bu">int</span></span>
<span id="cb3-9"><a href="#cb3-9"></a> end: <span class="bu">int</span></span>
<span id="cb3-10"><a href="#cb3-10"></a> confidence: <span class="bu">float</span></span>
<span id="cb3-11"><a href="#cb3-11"></a> context: <span class="bu">str</span> <span class="op">=</span> <span class="st">""</span></span>
<span id="cb3-12"><a href="#cb3-12"></a></span>
<span id="cb3-13"><a href="#cb3-13"></a>PII_PATTERNS <span class="op">=</span> {</span>
<span id="cb3-14"><a href="#cb3-14"></a> <span class="co"># Identity Documents</span></span>
<span id="cb3-15"><a href="#cb3-15"></a> <span class="st">"SSN"</span>: {</span>
<span id="cb3-16"><a href="#cb3-16"></a> <span class="st">"pattern"</span>: <span class="vs">r'\b\d</span><span class="sc">{3}</span><span class="vs">-\d</span><span class="sc">{2}</span><span class="vs">-\d</span><span class="sc">{4}</span><span class="vs">\b'</span>,</span>
<span id="cb3-17"><a href="#cb3-17"></a> <span class="st">"description"</span>: <span class="st">"US Social Security Number"</span>,</span>
<span id="cb3-18"><a href="#cb3-18"></a> <span class="st">"category"</span>: <span class="st">"identity"</span></span>
<span id="cb3-19"><a href="#cb3-19"></a> },</span>
<span id="cb3-20"><a href="#cb3-20"></a> <span class="st">"PASSPORT"</span>: {</span>
<span id="cb3-21"><a href="#cb3-21"></a> <span class="st">"pattern"</span>: <span class="vs">r'\b[A-Z]{1,2}\d{6,9}\b'</span>,</span>
<span id="cb3-22"><a href="#cb3-22"></a> <span class="st">"description"</span>: <span class="st">"Passport Number"</span>,</span>
<span id="cb3-23"><a href="#cb3-23"></a> <span class="st">"category"</span>: <span class="st">"identity"</span></span>
<span id="cb3-24"><a href="#cb3-24"></a> },</span>
<span id="cb3-25"><a href="#cb3-25"></a></span>
<span id="cb3-26"><a href="#cb3-26"></a> <span class="co"># Financial Information</span></span>
<span id="cb3-27"><a href="#cb3-27"></a> <span class="st">"CREDIT_CARD"</span>: {</span>
<span id="cb3-28"><a href="#cb3-28"></a> <span class="st">"pattern"</span>: <span class="vs">r'\b(?:4[0-9]</span><span class="sc">{12}</span><span class="vs">(?:[0-9]</span><span class="sc">{3}</span><span class="vs">)?|5[1-5][0-9]</span><span class="sc">{14}</span><span class="vs">|3[47][0-9]</span><span class="sc">{13}</span><span class="vs">)\b'</span>,</span>
<span id="cb3-29"><a href="#cb3-29"></a> <span class="st">"description"</span>: <span class="st">"Credit Card Number (Visa, MC, Amex)"</span>,</span>
<span id="cb3-30"><a href="#cb3-30"></a> <span class="st">"category"</span>: <span class="st">"financial"</span></span>
<span id="cb3-31"><a href="#cb3-31"></a> },</span>
<span id="cb3-32"><a href="#cb3-32"></a> <span class="st">"IBAN"</span>: {</span>
<span id="cb3-33"><a href="#cb3-33"></a> <span class="st">"pattern"</span>: <span class="vs">r'\b[A-Z]</span><span class="sc">{2}</span><span class="vs">\d</span><span class="sc">{2}</span><span class="vs">[A-Z0-9]{4,30}\b'</span>,</span>
<span id="cb3-34"><a href="#cb3-34"></a> <span class="st">"description"</span>: <span class="st">"International Bank Account Number"</span>,</span>
<span id="cb3-35"><a href="#cb3-35"></a> <span class="st">"category"</span>: <span class="st">"financial"</span></span>
<span id="cb3-36"><a href="#cb3-36"></a> },</span>
<span id="cb3-37"><a href="#cb3-37"></a></span>
<span id="cb3-38"><a href="#cb3-38"></a> <span class="co"># Contact Information</span></span>
<span id="cb3-39"><a href="#cb3-39"></a> <span class="st">"EMAIL"</span>: {</span>
<span id="cb3-40"><a href="#cb3-40"></a> <span class="st">"pattern"</span>: <span class="vs">r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'</span>,</span>
<span id="cb3-41"><a href="#cb3-41"></a> <span class="st">"description"</span>: <span class="st">"Email Address"</span>,</span>
<span id="cb3-42"><a href="#cb3-42"></a> <span class="st">"category"</span>: <span class="st">"contact"</span></span>
<span id="cb3-43"><a href="#cb3-43"></a> },</span>
<span id="cb3-44"><a href="#cb3-44"></a> <span class="st">"PHONE_US"</span>: {</span>
<span id="cb3-45"><a href="#cb3-45"></a> <span class="st">"pattern"</span>: <span class="vs">r'\b(?:\+1[-.\s]?)?\(?\d</span><span class="sc">{3}</span><span class="vs">\)?[-.\s]?\d</span><span class="sc">{3}</span><span class="vs">[-.\s]?\d</span><span class="sc">{4}</span><span class="vs">\b'</span>,</span>
<span id="cb3-46"><a href="#cb3-46"></a> <span class="st">"description"</span>: <span class="st">"US Phone Number"</span>,</span>
<span id="cb3-47"><a href="#cb3-47"></a> <span class="st">"category"</span>: <span class="st">"contact"</span></span>
<span id="cb3-48"><a href="#cb3-48"></a> },</span>
<span id="cb3-49"><a href="#cb3-49"></a></span>
<span id="cb3-50"><a href="#cb3-50"></a> <span class="co"># Add more patterns as needed...</span></span>
<span id="cb3-51"><a href="#cb3-51"></a>}</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<p>Each pattern includes a regex, human-readable description, and category for risk classification.</p>
</section>
<section id="step-3-build-the-basic-detection-engine" class="level2">
<h2 class="anchored" data-anchor-id="step-3-build-the-basic-detection-engine">Step 3: Build the Basic Detection Engine</h2>
<p>Add the core detection function:</p>
<div class="sourceCode" id="cb4"><pre class="sourceCode numberSource python number-lines code-with-copy"><code class="sourceCode python"><span id="cb4-1"><a href="#cb4-1"></a><span class="kw">def</span> detect_pii_basic(text: <span class="bu">str</span>) <span class="op">-&gt;</span> List[PIIEntity]:</span>
<span id="cb4-2"><a href="#cb4-2"></a> <span class="co">"""Layer 1: Standard regex pattern matching."""</span></span>
<span id="cb4-3"><a href="#cb4-3"></a> entities <span class="op">=</span> []</span>
<span id="cb4-4"><a href="#cb4-4"></a></span>
<span id="cb4-5"><a href="#cb4-5"></a> <span class="cf">for</span> pii_type, config <span class="kw">in</span> PII_PATTERNS.items():</span>
<span id="cb4-6"><a href="#cb4-6"></a> pattern <span class="op">=</span> re.<span class="bu">compile</span>(config[<span class="st">"pattern"</span>], re.IGNORECASE)</span>
<span id="cb4-7"><a href="#cb4-7"></a></span>
<span id="cb4-8"><a href="#cb4-8"></a> <span class="cf">for</span> match <span class="kw">in</span> pattern.finditer(text):</span>
<span id="cb4-9"><a href="#cb4-9"></a> entity <span class="op">=</span> PIIEntity(</span>
<span id="cb4-10"><a href="#cb4-10"></a> <span class="bu">type</span><span class="op">=</span>pii_type,</span>
<span id="cb4-11"><a href="#cb4-11"></a> value<span class="op">=</span>match.group(),</span>
<span id="cb4-12"><a href="#cb4-12"></a> start<span class="op">=</span>match.start(),</span>
<span id="cb4-13"><a href="#cb4-13"></a> end<span class="op">=</span>match.end(),</span>
<span id="cb4-14"><a href="#cb4-14"></a> confidence<span class="op">=</span><span class="fl">0.8</span>, <span class="co"># Base confidence</span></span>
<span id="cb4-15"><a href="#cb4-15"></a> context<span class="op">=</span>text[<span class="bu">max</span>(<span class="dv">0</span>, match.start()<span class="op">-</span><span class="dv">20</span>):match.end()<span class="op">+</span><span class="dv">20</span>]</span>
<span id="cb4-16"><a href="#cb4-16"></a> )</span>
<span id="cb4-17"><a href="#cb4-17"></a> entities.append(entity)</span>
<span id="cb4-18"><a href="#cb4-18"></a></span>
<span id="cb4-19"><a href="#cb4-19"></a> <span class="cf">return</span> entities</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<p>This gives us working PII detection, but it’s easily fooled by obfuscation.</p>
</section>
<section id="step-4-add-text-normalization-layer-2" class="level2">
<h2 class="anchored" data-anchor-id="step-4-add-text-normalization-layer-2">Step 4: Add Text Normalization (Layer 2)</h2>
<p>Attackers often hide PII using separators, leetspeak, or unicode tricks. Add normalization:</p>
<div class="sourceCode" id="cb5"><pre class="sourceCode numberSource python number-lines code-with-copy"><code class="sourceCode python"><span id="cb5-1"><a href="#cb5-1"></a><span class="kw">def</span> normalize_text(text: <span class="bu">str</span>) <span class="op">-&gt;</span> <span class="bu">tuple</span>[<span class="bu">str</span>, <span class="bu">dict</span>]:</span>
<span id="cb5-2"><a href="#cb5-2"></a> <span class="co">"""Layer 2: Remove obfuscation techniques."""</span></span>
<span id="cb5-3"><a href="#cb5-3"></a> original <span class="op">=</span> text</span>
<span id="cb5-4"><a href="#cb5-4"></a> mappings <span class="op">=</span> {}</span>
<span id="cb5-5"><a href="#cb5-5"></a></span>
<span id="cb5-6"><a href="#cb5-6"></a> <span class="co"># Remove common separators</span></span>
<span id="cb5-7"><a href="#cb5-7"></a> normalized <span class="op">=</span> re.sub(<span class="vs">r'[\s\-\.\(\)]+'</span>, <span class="st">''</span>, text)</span>
<span id="cb5-8"><a href="#cb5-8"></a></span>
<span id="cb5-9"><a href="#cb5-9"></a> <span class="co"># Leetspeak conversion</span></span>
<span id="cb5-10"><a href="#cb5-10"></a> leet_map <span class="op">=</span> {<span class="st">'0'</span>: <span class="st">'o'</span>, <span class="st">'1'</span>: <span class="st">'i'</span>, <span class="st">'3'</span>: <span class="st">'e'</span>, <span class="st">'4'</span>: <span class="st">'a'</span>, <span class="st">'5'</span>: <span class="st">'s'</span>, <span class="st">'7'</span>: <span class="st">'t'</span>}</span>
<span id="cb5-11"><a href="#cb5-11"></a> <span class="cf">for</span> leet, char <span class="kw">in</span> leet_map.items():</span>
<span id="cb5-12"><a href="#cb5-12"></a> normalized <span class="op">=</span> normalized.replace(leet, char)</span>
<span id="cb5-13"><a href="#cb5-13"></a></span>
<span id="cb5-14"><a href="#cb5-14"></a> <span class="co"># Track position mappings for accurate reporting</span></span>
<span id="cb5-15"><a href="#cb5-15"></a> <span class="co"># (simplified - production code needs full position tracking)</span></span>
<span id="cb5-16"><a href="#cb5-16"></a></span>
<span id="cb5-17"><a href="#cb5-17"></a> <span class="cf">return</span> normalized, mappings</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
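<p>With separators stripped, <code>1-2-3-4-5-6-7-8-9</code> collapses to the nine-digit run <code>123456789</code>, which a separator-free SSN pattern such as <code>\b\d{9}\b</code> can flag as a potential SSN. Note that the leetspeak map above rewrites digits into letters, so apply it only to a separate copy of the text used for word-based patterns; running it before numeric matching would turn those digits into <code>i2eas6t89</code>.</p>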
</section>
<section id="step-5-implement-checksum-validation-layer-4" class="level2">
<h2 class="anchored" data-anchor-id="step-5-implement-checksum-validation-layer-4">Step 5: Implement Checksum Validation (Layer 4)</h2>
<p>Not every number sequence is valid PII. Add validation logic:</p>
<div class="sourceCode" id="cb6"><pre class="sourceCode numberSource python number-lines code-with-copy"><code class="sourceCode python"><span id="cb6-1"><a href="#cb6-1"></a><span class="kw">def</span> luhn_checksum(card_number: <span class="bu">str</span>) <span class="op">-&gt;</span> <span class="bu">bool</span>:</span>
<span id="cb6-2"><a href="#cb6-2"></a> <span class="co">"""Validate credit card using Luhn algorithm."""</span></span>
<span id="cb6-3"><a href="#cb6-3"></a> digits <span class="op">=</span> [<span class="bu">int</span>(d) <span class="cf">for</span> d <span class="kw">in</span> card_number <span class="cf">if</span> d.isdigit()]</span>
<span id="cb6-4"><a href="#cb6-4"></a> odd_digits <span class="op">=</span> digits[<span class="op">-</span><span class="dv">1</span>::<span class="op">-</span><span class="dv">2</span>]</span>
<span id="cb6-5"><a href="#cb6-5"></a> even_digits <span class="op">=</span> digits[<span class="op">-</span><span class="dv">2</span>::<span class="op">-</span><span class="dv">2</span>]</span>
<span id="cb6-6"><a href="#cb6-6"></a></span>
<span id="cb6-7"><a href="#cb6-7"></a> total <span class="op">=</span> <span class="bu">sum</span>(odd_digits)</span>
<span id="cb6-8"><a href="#cb6-8"></a> <span class="cf">for</span> d <span class="kw">in</span> even_digits:</span>
<span id="cb6-9"><a href="#cb6-9"></a> total <span class="op">+=</span> <span class="bu">sum</span>(<span class="bu">divmod</span>(d <span class="op">*</span> <span class="dv">2</span>, <span class="dv">10</span>))</span>
<span id="cb6-10"><a href="#cb6-10"></a></span>
<span id="cb6-11"><a href="#cb6-11"></a> <span class="cf">return</span> total <span class="op">%</span> <span class="dv">10</span> <span class="op">==</span> <span class="dv">0</span></span>
<span id="cb6-12"><a href="#cb6-12"></a></span>
<span id="cb6-13"><a href="#cb6-13"></a><span class="kw">def</span> validate_iban(iban: <span class="bu">str</span>) <span class="op">-&gt;</span> <span class="bu">bool</span>:</span>
<span id="cb6-14"><a href="#cb6-14"></a> <span class="co">"""Validate IBAN using MOD-97 algorithm."""</span></span>
<span id="cb6-15"><a href="#cb6-15"></a> iban <span class="op">=</span> iban.replace(<span class="st">' '</span>, <span class="st">''</span>).upper()</span>
<span id="cb6-16"><a href="#cb6-16"></a></span>
<span id="cb6-17"><a href="#cb6-17"></a> <span class="co"># Move first 4 chars to end</span></span>
<span id="cb6-18"><a href="#cb6-18"></a> rearranged <span class="op">=</span> iban[<span class="dv">4</span>:] <span class="op">+</span> iban[:<span class="dv">4</span>]</span>
<span id="cb6-19"><a href="#cb6-19"></a></span>
<span id="cb6-20"><a href="#cb6-20"></a> <span class="co"># Convert letters to numbers (A=10, B=11, etc.)</span></span>
<span id="cb6-21"><a href="#cb6-21"></a> numeric <span class="op">=</span> <span class="st">''</span></span>
<span id="cb6-22"><a href="#cb6-22"></a> <span class="cf">for</span> char <span class="kw">in</span> rearranged:</span>
<span id="cb6-23"><a href="#cb6-23"></a> <span class="cf">if</span> char.isdigit():</span>
<span id="cb6-24"><a href="#cb6-24"></a> numeric <span class="op">+=</span> char</span>
<span id="cb6-25"><a href="#cb6-25"></a> <span class="cf">else</span>:</span>
<span id="cb6-26"><a href="#cb6-26"></a> numeric <span class="op">+=</span> <span class="bu">str</span>(<span class="bu">ord</span>(char) <span class="op">-</span> <span class="dv">55</span>)</span>
<span id="cb6-27"><a href="#cb6-27"></a></span>
<span id="cb6-28"><a href="#cb6-28"></a> <span class="cf">return</span> <span class="bu">int</span>(numeric) <span class="op">%</span> <span class="dv">97</span> <span class="op">==</span> <span class="dv">1</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<p>With validation, we can boost confidence for valid numbers and flag invalid ones as <code>POSSIBLE_CARD_PATTERN</code>.</p>
</section>
<section id="step-6-json-blob-extraction-layer-2.5" class="level2">
<h2 class="anchored" data-anchor-id="step-6-json-blob-extraction-layer-2.5">Step 6: JSON Blob Extraction (Layer 2.5)</h2>
<p>PII often hides in JSON payloads within logs or messages:</p>
<div class="sourceCode" id="cb7"><pre class="sourceCode numberSource python number-lines code-with-copy"><code class="sourceCode python"><span id="cb7-1"><a href="#cb7-1"></a><span class="im">import</span> json</span>
<span id="cb7-2"><a href="#cb7-2"></a></span>
<span id="cb7-3"><a href="#cb7-3"></a><span class="kw">def</span> extract_json_strings(text: <span class="bu">str</span>) <span class="op">-&gt;</span> <span class="bu">list</span>[<span class="bu">tuple</span>[<span class="bu">str</span>, <span class="bu">int</span>, <span class="bu">int</span>]]:</span>
<span id="cb7-4"><a href="#cb7-4"></a> <span class="co">"""Find and extract JSON objects from text."""</span></span>
<span id="cb7-5"><a href="#cb7-5"></a> json_objects <span class="op">=</span> []</span>
<span id="cb7-6"><a href="#cb7-6"></a></span>
<span id="cb7-7"><a href="#cb7-7"></a> <span class="co"># Find potential JSON starts</span></span>
<span id="cb7-8"><a href="#cb7-8"></a> <span class="cf">for</span> i, char <span class="kw">in</span> <span class="bu">enumerate</span>(text):</span>
<span id="cb7-9"><a href="#cb7-9"></a> <span class="cf">if</span> char <span class="op">==</span> <span class="st">'{'</span>:</span>
<span id="cb7-10"><a href="#cb7-10"></a> depth <span class="op">=</span> <span class="dv">0</span></span>
<span id="cb7-11"><a href="#cb7-11"></a> <span class="cf">for</span> j <span class="kw">in</span> <span class="bu">range</span>(i, <span class="bu">len</span>(text)):</span>
<span id="cb7-12"><a href="#cb7-12"></a> <span class="cf">if</span> text[j] <span class="op">==</span> <span class="st">'{'</span>:</span>
<span id="cb7-13"><a href="#cb7-13"></a> depth <span class="op">+=</span> <span class="dv">1</span></span>
<span id="cb7-14"><a href="#cb7-14"></a> <span class="cf">elif</span> text[j] <span class="op">==</span> <span class="st">'}'</span>:</span>
<span id="cb7-15"><a href="#cb7-15"></a> depth <span class="op">-=</span> <span class="dv">1</span></span>
<span id="cb7-16"><a href="#cb7-16"></a> <span class="cf">if</span> depth <span class="op">==</span> <span class="dv">0</span>:</span>
<span id="cb7-17"><a href="#cb7-17"></a> <span class="cf">try</span>:</span>
<span id="cb7-18"><a href="#cb7-18"></a> candidate <span class="op">=</span> text[i:j<span class="op">+</span><span class="dv">1</span>]</span>
<span id="cb7-19"><a href="#cb7-19"></a> json.loads(candidate) <span class="co"># Validate</span></span>
<span id="cb7-20"><a href="#cb7-20"></a> json_objects.append((candidate, i, j<span class="op">+</span><span class="dv">1</span>))</span>
<span id="cb7-21"><a href="#cb7-21"></a> <span class="cf">except</span> json.JSONDecodeError:</span>
<span id="cb7-22"><a href="#cb7-22"></a> <span class="cf">pass</span></span>
<span id="cb7-23"><a href="#cb7-23"></a> <span class="cf">break</span></span>
<span id="cb7-24"><a href="#cb7-24"></a></span>
<span id="cb7-25"><a href="#cb7-25"></a> <span class="cf">return</span> json_objects</span>
<span id="cb7-26"><a href="#cb7-26"></a></span>
<span id="cb7-27"><a href="#cb7-27"></a><span class="kw">def</span> deep_scan_json(json_str: <span class="bu">str</span>) <span class="op">-&gt;</span> <span class="bu">list</span>[<span class="bu">str</span>]:</span>
<span id="cb7-28"><a href="#cb7-28"></a> <span class="co">"""Recursively extract all string values from JSON."""</span></span>
<span id="cb7-29"><a href="#cb7-29"></a> values <span class="op">=</span> []</span>
<span id="cb7-30"><a href="#cb7-30"></a></span>
<span id="cb7-31"><a href="#cb7-31"></a> <span class="kw">def</span> extract(obj):</span>
<span id="cb7-32"><a href="#cb7-32"></a> <span class="cf">if</span> <span class="bu">isinstance</span>(obj, <span class="bu">str</span>):</span>
<span id="cb7-33"><a href="#cb7-33"></a> values.append(obj)</span>
<span id="cb7-34"><a href="#cb7-34"></a> <span class="cf">elif</span> <span class="bu">isinstance</span>(obj, <span class="bu">dict</span>):</span>
<span id="cb7-35"><a href="#cb7-35"></a> <span class="cf">for</span> v <span class="kw">in</span> obj.values():</span>
<span id="cb7-36"><a href="#cb7-36"></a> extract(v)</span>
<span id="cb7-37"><a href="#cb7-37"></a> <span class="cf">elif</span> <span class="bu">isinstance</span>(obj, <span class="bu">list</span>):</span>
<span id="cb7-38"><a href="#cb7-38"></a> <span class="cf">for</span> item <span class="kw">in</span> obj:</span>
<span id="cb7-39"><a href="#cb7-39"></a> extract(item)</span>
<span id="cb7-40"><a href="#cb7-40"></a></span>
<span id="cb7-41"><a href="#cb7-41"></a> <span class="cf">try</span>:</span>
<span id="cb7-42"><a href="#cb7-42"></a> extract(json.loads(json_str))</span>
<span id="cb7-43"><a href="#cb7-43"></a> <span class="cf">except</span>:</span>
<span id="cb7-44"><a href="#cb7-44"></a> <span class="cf">pass</span></span>
<span id="cb7-45"><a href="#cb7-45"></a></span>
<span id="cb7-46"><a href="#cb7-46"></a> <span class="cf">return</span> values</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</section>
<section id="step-7-base64-auto-decoding-layer-2.6" class="level2">
<h2 class="anchored" data-anchor-id="step-7-base64-auto-decoding-layer-2.6">Step 7: Base64 Auto-Decoding (Layer 2.6)</h2>
<p>Encoded PII is common in API responses and logs:</p>
<div class="sourceCode" id="cb8"><pre class="sourceCode numberSource python number-lines code-with-copy"><code class="sourceCode python"><span id="cb8-1"><a href="#cb8-1"></a><span class="im">import</span> base64</span>
<span id="cb8-2"><a href="#cb8-2"></a></span>
<span id="cb8-3"><a href="#cb8-3"></a><span class="kw">def</span> is_valid_base64(s: <span class="bu">str</span>) <span class="op">-&gt;</span> <span class="bu">bool</span>:</span>
<span id="cb8-4"><a href="#cb8-4"></a> <span class="co">"""Check if string is valid base64."""</span></span>
<span id="cb8-5"><a href="#cb8-5"></a> <span class="cf">if</span> <span class="bu">len</span>(s) <span class="op">&lt;</span> <span class="dv">20</span> <span class="kw">or</span> <span class="bu">len</span>(s) <span class="op">%</span> <span class="dv">4</span> <span class="op">!=</span> <span class="dv">0</span>:</span>
<span id="cb8-6"><a href="#cb8-6"></a> <span class="cf">return</span> <span class="va">False</span></span>
<span id="cb8-7"><a href="#cb8-7"></a> <span class="cf">try</span>:</span>
<span id="cb8-8"><a href="#cb8-8"></a> decoded <span class="op">=</span> base64.b64decode(s, validate<span class="op">=</span><span class="va">True</span>)</span>
<span id="cb8-9"><a href="#cb8-9"></a> decoded.decode(<span class="st">'utf-8'</span>) <span class="co"># Must be valid UTF-8</span></span>
<span id="cb8-10"><a href="#cb8-10"></a> <span class="cf">return</span> <span class="va">True</span></span>
<span id="cb8-11"><a href="#cb8-11"></a> <span class="cf">except</span>:</span>
<span id="cb8-12"><a href="#cb8-12"></a> <span class="cf">return</span> <span class="va">False</span></span>
<span id="cb8-13"><a href="#cb8-13"></a></span>
<span id="cb8-14"><a href="#cb8-14"></a><span class="kw">def</span> decode_base64_strings(text: <span class="bu">str</span>) <span class="op">-&gt;</span> <span class="bu">list</span>[<span class="bu">tuple</span>[<span class="bu">str</span>, <span class="bu">str</span>, <span class="bu">int</span>, <span class="bu">int</span>]]:</span>
<span id="cb8-15"><a href="#cb8-15"></a> <span class="co">"""Find and decode base64 strings."""</span></span>
<span id="cb8-16"><a href="#cb8-16"></a> results <span class="op">=</span> []</span>
<span id="cb8-17"><a href="#cb8-17"></a> pattern <span class="op">=</span> <span class="vs">r'[A-Za-z0-9+/]{20,}={0,2}'</span></span>
<span id="cb8-18"><a href="#cb8-18"></a></span>
<span id="cb8-19"><a href="#cb8-19"></a> <span class="cf">for</span> match <span class="kw">in</span> re.finditer(pattern, text):</span>
<span id="cb8-20"><a href="#cb8-20"></a> candidate <span class="op">=</span> match.group()</span>
<span id="cb8-21"><a href="#cb8-21"></a> <span class="cf">if</span> is_valid_base64(candidate):</span>
<span id="cb8-22"><a href="#cb8-22"></a> <span class="cf">try</span>:</span>
<span id="cb8-23"><a href="#cb8-23"></a> decoded <span class="op">=</span> base64.b64decode(candidate).decode(<span class="st">'utf-8'</span>)</span>
<span id="cb8-24"><a href="#cb8-24"></a> results.append((candidate, decoded, match.start(), match.end()))</span>
<span id="cb8-25"><a href="#cb8-25"></a> <span class="cf">except</span>:</span>
<span id="cb8-26"><a href="#cb8-26"></a> <span class="cf">pass</span></span>
<span id="cb8-27"><a href="#cb8-27"></a></span>
<span id="cb8-28"><a href="#cb8-28"></a> <span class="cf">return</span> results</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</section>
<section id="step-8-build-the-fastapi-endpoint" class="level2">
<h2 class="anchored" data-anchor-id="step-8-build-the-fastapi-endpoint">Step 8: Build the FastAPI Endpoint</h2>
<p>Wire everything together in an API endpoint:</p>
<div class="sourceCode" id="cb9"><pre class="sourceCode numberSource python number-lines code-with-copy"><code class="sourceCode python"><span id="cb9-1"><a href="#cb9-1"></a><span class="im">from</span> fastapi <span class="im">import</span> APIRouter, Form</span>
<span id="cb9-2"><a href="#cb9-2"></a></span>
<span id="cb9-3"><a href="#cb9-3"></a>router <span class="op">=</span> APIRouter(prefix<span class="op">=</span><span class="st">"/api/privacy"</span>, tags<span class="op">=</span>[<span class="st">"privacy"</span>])</span>
<span id="cb9-4"><a href="#cb9-4"></a></span>
<span id="cb9-5"><a href="#cb9-5"></a><span class="at">@router.post</span>(<span class="st">"/scan-text"</span>)</span>
<span id="cb9-6"><a href="#cb9-6"></a><span class="cf">async</span> <span class="kw">def</span> scan_text(</span>
<span id="cb9-7"><a href="#cb9-7"></a> text: <span class="bu">str</span> <span class="op">=</span> Form(...),</span>
<span id="cb9-8"><a href="#cb9-8"></a> sensitivity: <span class="bu">str</span> <span class="op">=</span> Form(<span class="st">"medium"</span>)</span>
<span id="cb9-9"><a href="#cb9-9"></a>):</span>
<span id="cb9-10"><a href="#cb9-10"></a> <span class="co">"""Main PII scanning endpoint."""</span></span>
<span id="cb9-11"><a href="#cb9-11"></a></span>
<span id="cb9-12"><a href="#cb9-12"></a> <span class="co"># Layer 1: Basic pattern matching</span></span>
<span id="cb9-13"><a href="#cb9-13"></a> entities <span class="op">=</span> detect_pii_basic(text)</span>
<span id="cb9-14"><a href="#cb9-14"></a></span>
<span id="cb9-15"><a href="#cb9-15"></a> <span class="co"># Layer 2: Normalized text scan</span></span>
<span id="cb9-16"><a href="#cb9-16"></a> normalized, mappings <span class="op">=</span> normalize_text(text)</span>
<span id="cb9-17"><a href="#cb9-17"></a> normalized_entities <span class="op">=</span> detect_pii_basic(normalized)</span>
<span id="cb9-18"><a href="#cb9-18"></a> <span class="co"># ... map positions back to original</span></span>
<span id="cb9-19"><a href="#cb9-19"></a></span>
<span id="cb9-20"><a href="#cb9-20"></a> <span class="co"># Layer 2.5: JSON extraction</span></span>
<span id="cb9-21"><a href="#cb9-21"></a> <span class="cf">for</span> json_str, start, end <span class="kw">in</span> extract_json_strings(text):</span>
<span id="cb9-22"><a href="#cb9-22"></a> <span class="cf">for</span> value <span class="kw">in</span> deep_scan_json(json_str):</span>
<span id="cb9-23"><a href="#cb9-23"></a> entities.extend(detect_pii_basic(value))</span>
<span id="cb9-24"><a href="#cb9-24"></a></span>
<span id="cb9-25"><a href="#cb9-25"></a> <span class="co"># Layer 2.6: Base64 decoding</span></span>
<span id="cb9-26"><a href="#cb9-26"></a> <span class="cf">for</span> original, decoded, start, end <span class="kw">in</span> decode_base64_strings(text):</span>
<span id="cb9-27"><a href="#cb9-27"></a> decoded_entities <span class="op">=</span> detect_pii_basic(decoded)</span>
<span id="cb9-28"><a href="#cb9-28"></a> <span class="cf">for</span> e <span class="kw">in</span> decoded_entities:</span>
<span id="cb9-29"><a href="#cb9-29"></a> e.<span class="bu">type</span> <span class="op">=</span> <span class="ss">f"</span><span class="sc">{</span>e<span class="sc">.</span><span class="bu">type</span><span class="sc">}</span><span class="ss">_BASE64_ENCODED"</span></span>
<span id="cb9-30"><a href="#cb9-30"></a> entities.extend(decoded_entities)</span>
<span id="cb9-31"><a href="#cb9-31"></a></span>
<span id="cb9-32"><a href="#cb9-32"></a> <span class="co"># Layer 4: Validation</span></span>
<span id="cb9-33"><a href="#cb9-33"></a> <span class="cf">for</span> entity <span class="kw">in</span> entities:</span>
<span id="cb9-34"><a href="#cb9-34"></a> <span class="cf">if</span> entity.<span class="bu">type</span> <span class="op">==</span> <span class="st">"CREDIT_CARD"</span>:</span>
<span id="cb9-35"><a href="#cb9-35"></a> <span class="cf">if</span> luhn_checksum(entity.value):</span>
<span id="cb9-36"><a href="#cb9-36"></a> entity.confidence <span class="op">=</span> <span class="fl">0.95</span></span>
<span id="cb9-37"><a href="#cb9-37"></a> <span class="cf">else</span>:</span>
<span id="cb9-38"><a href="#cb9-38"></a> entity.<span class="bu">type</span> <span class="op">=</span> <span class="st">"POSSIBLE_CARD_PATTERN"</span></span>
<span id="cb9-39"><a href="#cb9-39"></a> entity.confidence <span class="op">=</span> <span class="fl">0.5</span></span>
<span id="cb9-40"><a href="#cb9-40"></a></span>
<span id="cb9-41"><a href="#cb9-41"></a> <span class="co"># Deduplicate and sort</span></span>
<span id="cb9-42"><a href="#cb9-42"></a> entities <span class="op">=</span> deduplicate_entities(entities)</span>
<span id="cb9-43"><a href="#cb9-43"></a></span>
<span id="cb9-44"><a href="#cb9-44"></a> <span class="co"># Generate masked preview</span></span>
<span id="cb9-45"><a href="#cb9-45"></a> redacted <span class="op">=</span> mask_pii(text, entities)</span>
<span id="cb9-46"><a href="#cb9-46"></a></span>
<span id="cb9-47"><a href="#cb9-47"></a> <span class="cf">return</span> {</span>
<span id="cb9-48"><a href="#cb9-48"></a> <span class="st">"entities"</span>: [e.<span class="bu">dict</span>() <span class="cf">for</span> e <span class="kw">in</span> entities],</span>
<span id="cb9-49"><a href="#cb9-49"></a> <span class="st">"redacted_preview"</span>: redacted,</span>
<span id="cb9-50"><a href="#cb9-50"></a> <span class="st">"summary"</span>: generate_summary(entities)</span>
<span id="cb9-51"><a href="#cb9-51"></a> }</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</section>
<section id="step-9-create-the-sveltekit-frontend" class="level2">
<h2 class="anchored" data-anchor-id="step-9-create-the-sveltekit-frontend">Step 9: Create the SvelteKit Frontend</h2>
<p>Build an interactive UI in <code>frontend/src/routes/privacy-scanner/+page.svelte</code>:</p>
<div class="sourceCode" id="cb10"><pre class="sourceCode numberSource svelte number-lines code-with-copy"><code class="sourceCode"><span id="cb10-1"><a href="#cb10-1"></a>&lt;script lang="ts"&gt;</span>
<span id="cb10-2"><a href="#cb10-2"></a> let inputText = '';</span>
<span id="cb10-3"><a href="#cb10-3"></a> let results: any = null;</span>
<span id="cb10-4"><a href="#cb10-4"></a> let loading = false;</span>
<span id="cb10-5"><a href="#cb10-5"></a></span>
<span id="cb10-6"><a href="#cb10-6"></a> async function scanText() {</span>
<span id="cb10-7"><a href="#cb10-7"></a> loading = true;</span>
<span id="cb10-8"><a href="#cb10-8"></a> const formData = new FormData();</span>
<span id="cb10-9"><a href="#cb10-9"></a> formData.append('text', inputText);</span>
<span id="cb10-10"><a href="#cb10-10"></a></span>
<span id="cb10-11"><a href="#cb10-11"></a> const response = await fetch('/api/privacy/scan-text', {</span>
<span id="cb10-12"><a href="#cb10-12"></a> method: 'POST',</span>
<span id="cb10-13"><a href="#cb10-13"></a> body: formData</span>
<span id="cb10-14"><a href="#cb10-14"></a> });</span>
<span id="cb10-15"><a href="#cb10-15"></a></span>
<span id="cb10-16"><a href="#cb10-16"></a> results = await response.json();</span>
<span id="cb10-17"><a href="#cb10-17"></a> loading = false;</span>
<span id="cb10-18"><a href="#cb10-18"></a> }</span>
<span id="cb10-19"><a href="#cb10-19"></a>&lt;/script&gt;</span>
<span id="cb10-20"><a href="#cb10-20"></a></span>
<span id="cb10-21"><a href="#cb10-21"></a>&lt;div class="container mx-auto p-6"&gt;</span>
<span id="cb10-22"><a href="#cb10-22"></a> &lt;h1 class="text-2xl font-bold mb-4"&gt;Privacy Scanner&lt;/h1&gt;</span>
<span id="cb10-23"><a href="#cb10-23"></a></span>
<span id="cb10-24"><a href="#cb10-24"></a> &lt;textarea</span>
<span id="cb10-25"><a href="#cb10-25"></a> bind:value={inputText}</span>
<span id="cb10-26"><a href="#cb10-26"></a> class="w-full h-48 p-4 border rounded"</span>
<span id="cb10-27"><a href="#cb10-27"></a> placeholder="Paste text to scan for PII..."</span>
<span id="cb10-28"><a href="#cb10-28"></a> &gt;&lt;/textarea&gt;</span>
<span id="cb10-29"><a href="#cb10-29"></a></span>
<span id="cb10-30"><a href="#cb10-30"></a> &lt;button</span>
<span id="cb10-31"><a href="#cb10-31"></a> on:click={scanText}</span>
<span id="cb10-32"><a href="#cb10-32"></a> disabled={loading}</span>
<span id="cb10-33"><a href="#cb10-33"></a> class="mt-4 px-6 py-2 bg-blue-600 text-white rounded"</span>
<span id="cb10-34"><a href="#cb10-34"></a> &gt;</span>
<span id="cb10-35"><a href="#cb10-35"></a> {loading ? 'Scanning...' : 'Scan for PII'}</span>
<span id="cb10-36"><a href="#cb10-36"></a> &lt;/button&gt;</span>
<span id="cb10-37"><a href="#cb10-37"></a></span>
<span id="cb10-38"><a href="#cb10-38"></a> {#if results}</span>
<span id="cb10-39"><a href="#cb10-39"></a> &lt;div class="mt-6"&gt;</span>
<span id="cb10-40"><a href="#cb10-40"></a> &lt;h2 class="text-xl font-semibold"&gt;Results&lt;/h2&gt;</span>
<span id="cb10-41"><a href="#cb10-41"></a></span>
<span id="cb10-42"><a href="#cb10-42"></a> &lt;!-- Entity badges --&gt;</span>
<span id="cb10-43"><a href="#cb10-43"></a> &lt;div class="flex flex-wrap gap-2 mt-4"&gt;</span>
<span id="cb10-44"><a href="#cb10-44"></a> {#each results.entities as entity}</span>
<span id="cb10-45"><a href="#cb10-45"></a> &lt;span class="px-3 py-1 rounded-full bg-red-100 text-red-800"&gt;</span>
<span id="cb10-46"><a href="#cb10-46"></a> {entity.type}: {entity.value}</span>
<span id="cb10-47"><a href="#cb10-47"></a> &lt;/span&gt;</span>
<span id="cb10-48"><a href="#cb10-48"></a> {/each}</span>
<span id="cb10-49"><a href="#cb10-49"></a> &lt;/div&gt;</span>
<span id="cb10-50"><a href="#cb10-50"></a></span>
<span id="cb10-51"><a href="#cb10-51"></a> &lt;!-- Redacted preview --&gt;</span>
<span id="cb10-52"><a href="#cb10-52"></a> &lt;div class="mt-4 p-4 bg-gray-100 rounded font-mono"&gt;</span>
<span id="cb10-53"><a href="#cb10-53"></a> {results.redacted_preview}</span>
<span id="cb10-54"><a href="#cb10-54"></a> &lt;/div&gt;</span>
<span id="cb10-55"><a href="#cb10-55"></a> &lt;/div&gt;</span>
<span id="cb10-56"><a href="#cb10-56"></a> {/if}</span>
<span id="cb10-57"><a href="#cb10-57"></a>&lt;/div&gt;</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</section>
<section id="step-10-add-security-features" class="level2">
<h2 class="anchored" data-anchor-id="step-10-add-security-features">Step 10: Add Security Features</h2>
<p>For production deployment, implement ephemeral processing:</p>
<div class="sourceCode" id="cb11"><pre class="sourceCode numberSource python number-lines code-with-copy"><code class="sourceCode python"><span id="cb11-1"><a href="#cb11-1"></a><span class="co"># In main.py - ensure no PII logging</span></span>
<span id="cb11-2"><a href="#cb11-2"></a><span class="im">import</span> logging</span>
<span id="cb11-3"><a href="#cb11-3"></a></span>
<span id="cb11-4"><a href="#cb11-4"></a><span class="kw">class</span> PIIFilter(logging.Filter):</span>
<span id="cb11-5"><a href="#cb11-5"></a> <span class="kw">def</span> <span class="bu">filter</span>(<span class="va">self</span>, record):</span>
<span id="cb11-6"><a href="#cb11-6"></a> <span class="co"># Never log request bodies that might contain PII</span></span>
<span id="cb11-7"><a href="#cb11-7"></a> <span class="cf">return</span> <span class="st">'text='</span> <span class="kw">not</span> <span class="kw">in</span> <span class="bu">str</span>(record.msg)</span>
<span id="cb11-8"><a href="#cb11-8"></a></span>
<span id="cb11-9"><a href="#cb11-9"></a>logging.getLogger().addFilter(PIIFilter())</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<p>And add coordinates-only mode for ultra-sensitive clients:</p>
<div class="sourceCode" id="cb12"><pre class="sourceCode numberSource python number-lines code-with-copy"><code class="sourceCode python"><span id="cb12-1"><a href="#cb12-1"></a><span class="at">@router.post</span>(<span class="st">"/scan-text"</span>)</span>
<span id="cb12-2"><a href="#cb12-2"></a><span class="cf">async</span> <span class="kw">def</span> scan_text(</span>
<span id="cb12-3"><a href="#cb12-3"></a> text: <span class="bu">str</span> <span class="op">=</span> Form(...),</span>
<span id="cb12-4"><a href="#cb12-4"></a> coordinates_only: <span class="bu">bool</span> <span class="op">=</span> Form(<span class="va">False</span>) <span class="co"># Client-side redaction mode</span></span>
<span id="cb12-5"><a href="#cb12-5"></a>):</span>
<span id="cb12-6"><a href="#cb12-6"></a> entities <span class="op">=</span> detect_pii_multilayer(text)</span>
<span id="cb12-7"><a href="#cb12-7"></a></span>
<span id="cb12-8"><a href="#cb12-8"></a> <span class="cf">if</span> coordinates_only:</span>
<span id="cb12-9"><a href="#cb12-9"></a> <span class="co"># Return only positions, not actual values</span></span>
<span id="cb12-10"><a href="#cb12-10"></a> <span class="cf">return</span> {</span>
<span id="cb12-11"><a href="#cb12-11"></a> <span class="st">"entities"</span>: [</span>
<span id="cb12-12"><a href="#cb12-12"></a> {<span class="st">"type"</span>: e.<span class="bu">type</span>, <span class="st">"start"</span>: e.start, <span class="st">"end"</span>: e.end, <span class="st">"length"</span>: e.end <span class="op">-</span> e.start}</span>
<span id="cb12-13"><a href="#cb12-13"></a> <span class="cf">for</span> e <span class="kw">in</span> entities</span>
<span id="cb12-14"><a href="#cb12-14"></a> ],</span>
<span id="cb12-15"><a href="#cb12-15"></a> <span class="st">"coordinates_only"</span>: <span class="va">True</span></span>
<span id="cb12-16"><a href="#cb12-16"></a> }</span>
<span id="cb12-17"><a href="#cb12-17"></a></span>
<span id="cb12-18"><a href="#cb12-18"></a> <span class="co"># Normal response with values</span></span>
<span id="cb12-19"><a href="#cb12-19"></a> <span class="cf">return</span> {<span class="st">"entities"</span>: [e.<span class="bu">dict</span>() <span class="cf">for</span> e <span class="kw">in</span> entities], ...}</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</section>
<section id="conclusion" class="level2">
<h2 class="anchored" data-anchor-id="conclusion">Conclusion</h2>
<p>You've now built a multi-layer Privacy Scanner that can:</p>
<ul>
<li>Detect 40+ PII types using regex patterns</li>
<li>Defeat obfuscation through text normalization</li>
<li>Extract PII from JSON payloads and Base64 encodings</li>
<li>Validate checksums to reduce false positives</li>
<li>Provide a clean web interface for interactive scanning</li>
<li>Operate in secure, coordinates-only mode</li>
</ul>
<p><strong>Next steps</strong> to enhance your scanner:</p>
<ol type="1">
<li>Add machine learning for name/address detection</li>
<li>Implement language-specific patterns (EU VAT, UK NI numbers)</li>
<li>Build CI/CD integration for automated pre-commit scanning</li>
<li>Add PDF and document parsing capabilities</li>
</ol>
<p>The complete source code is available in the AI Tools Suite repository. Happy scanning!</p>
</section>
</main>
<!-- /main column -->
<script id="quarto-html-after-body" type="application/javascript">
window.document.addEventListener("DOMContentLoaded", function (event) {
/**
 * Mirrors the colour mode of a stylesheet element onto the <body> classes.
 *
 * @param {Element} bsSheetEl - Element whose `data-mode` attribute is read;
 *   the value "dark" selects the dark class, anything else the light class.
 */
const toggleBodyColorMode = (bsSheetEl) => {
  const isDark = bsSheetEl.getAttribute("data-mode") === "dark";
  const bodyClasses = window.document.querySelector("body").classList;
  // Exactly one of the two marker classes ends up on <body>.
  const [classToAdd, classToRemove] = isDark
    ? ["quarto-dark", "quarto-light"]
    : ["quarto-light", "quarto-dark"];
  bodyClasses.add(classToAdd);
  bodyClasses.remove(classToRemove);
};
/**
 * Applies the body colour-mode classes based on the `link#quarto-bootstrap`
 * stylesheet element, when that element exists.
 */
const toggleBodyColorPrimary = () => {
  const bootstrapSheet = window.document.querySelector("link#quarto-bootstrap");
  if (!bootstrapSheet) {
    return;
  }
  toggleBodyColorMode(bootstrapSheet);
};
toggleBodyColorPrimary();
const icon = "";
// Add AnchorJS links, placed to the right, to every element with the "anchored" class.
const anchorJS = new window.AnchorJS();
anchorJS.options = { placement: 'right', icon };
anchorJS.add('.anchored');
/**
 * Tells whether an element carries any class starting with "code-annotation-".
 *
 * @param {{classList: Iterable<string>}} el - Element to inspect.
 * @returns {boolean} true when at least one class name has that prefix.
 */
const isCodeAnnotation = (el) =>
  Array.from(el.classList).some((className) => className.startsWith('code-annotation-'));
/**
 * ClipboardJS "success" handler: gives one second of visual feedback on the
 * copy button (checked class, "Copied!" title and, when Bootstrap is loaded,
 * a manual tooltip), then restores the button and clears the selection.
 *
 * @param {{trigger: Element, clearSelection: Function}} e - ClipboardJS event.
 */
const onCopySuccess = function(e) {
  const copyButton = e.trigger;

  // Drop focus and flash the "checked" state.
  copyButton.blur();
  copyButton.classList.add('code-copy-button-checked');
  const originalTitle = copyButton.getAttribute("title");
  copyButton.setAttribute("title", "Copied!");

  // Only show a tooltip when Bootstrap is available on the page.
  let tooltip;
  if (window.bootstrap) {
    const tooltipAttrs = [
      ["data-bs-toggle", "tooltip"],
      ["data-bs-placement", "left"],
      ["data-bs-title", "Copied!"],
    ];
    for (const [attrName, attrValue] of tooltipAttrs) {
      copyButton.setAttribute(attrName, attrValue);
    }
    tooltip = new bootstrap.Tooltip(copyButton, {
      trigger: "manual",
      customClass: "code-copy-button-tooltip",
      offset: [0, -8]
    });
    tooltip.show();
  }

  // Undo the feedback after one second.
  setTimeout(() => {
    if (tooltip) {
      tooltip.hide();
      for (const attrName of ["data-bs-title", "data-bs-toggle", "data-bs-placement"]) {
        copyButton.removeAttribute(attrName);
      }
    }
    copyButton.setAttribute("title", originalTitle);
    copyButton.classList.remove('code-copy-button-checked');
  }, 1000);

  // clear code selection
  e.clearSelection();
};
/**
 * Produces the text placed on the clipboard for a copy button.
 *
 * Clones the element immediately preceding the button, removes every direct
 * child that is a code-annotation element, and returns the remaining text.
 *
 * @param {Element} trigger - The copy button that was activated.
 * @returns {string} `innerText` of the cleaned clone.
 */
const getTextToCopy = function(trigger) {
  const codeEl = trigger.previousElementSibling.cloneNode(true);
  // `children` is a live HTMLCollection: removing a child while iterating it
  // shifts the remaining entries and skips the element right after each
  // removal. Iterate over a static snapshot instead.
  for (const childEl of Array.from(codeEl.children)) {
    if (isCodeAnnotation(childEl)) {
      childEl.remove();
    }
  }
  return codeEl.innerText;
};
// Wire ClipboardJS to every copy button that lives outside the embedded-source modal.
const clipboard = new window.ClipboardJS('.code-copy-button:not([data-in-quarto-modal])', {
  text: getTextToCopy
});
clipboard.on('success', onCopySuccess);

const embeddedSourceModal = window.document.getElementById('quarto-embedded-source-code-modal');
if (embeddedSourceModal) {
  // For code content inside modals, clipBoardJS needs to be initialized with a container option
  // TODO: Check when it could be a function (https://github.com/zenorocha/clipboard.js/issues/860)
  const clipboardModal = new window.ClipboardJS('.code-copy-button[data-in-quarto-modal]', {
    text: getTextToCopy,
    container: embeddedSourceModal
  });
  clipboardModal.on('success', onCopySuccess);
}
// Patterns used to classify an href as internal: localhost URLs (any port),
// mailto: links, and URLs containing "/<current host>/".
var localhostRegex = /^(?:http|https):\/\/localhost\:?[0-9]*\//;
var mailtoRegex = /^mailto:/;
// The host is inserted into a regular expression, so its metacharacters must
// be escaped; otherwise the dots in e.g. "docs.example.com" match any
// character and a look-alike host such as "docsXexample.com" counts as internal.
var escapedHost = window.location.host.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
var filterRegex = new RegExp('/' + escapedHost + '/');
/**
 * Tells whether an href should be treated as internal.
 *
 * @param {string} href - URL to classify.
 * @returns {boolean} true when the href matches the current host, localhost or mailto: pattern.
 */
var isInternal = (href) => {
  return filterRegex.test(href) || localhostRegex.test(href) || mailtoRegex.test(href);
}
// Inspect non-navigation links and, for external ones, restore the original href
var links = window.document.querySelectorAll('a[href]:not(.nav-link):not(.navbar-brand):not(.toc-action):not(.sidebar-link):not(.sidebar-item-toggle):not(.pagination-link):not(.no-external):not([aria-hidden]):not(.dropdown-item):not(.quarto-navigation-tool):not(.about-link)');
links.forEach((link) => {
  if (isInternal(link.href)) {
    return;
  }
  // undo the damage that might have been done by quarto-nav.js in the case of
  // links that we want to consider external
  const { originalHref } = link.dataset;
  if (originalHref !== undefined) {
    link.href = originalHref;
  }
});
/**
 * Attaches a Quarto-themed tippy popup to an element.
 *
 * @param {Element} el - Element that receives the popup.
 * @param {Function} [contentFn] - Set as tippy's `content` option when truthy.
 * @param {Function} [onTriggerFn] - Set as tippy's `onTrigger` option when truthy.
 * @param {Function} [onUntriggerFn] - Set as tippy's `onUntrigger` option when truthy.
 */
function tippyHover(el, contentFn, onTriggerFn, onUntriggerFn) {
  const optionalHooks = [
    ['content', contentFn],
    ['onTrigger', onTriggerFn],
    ['onUntrigger', onUntriggerFn],
  ];
  const config = {
    allowHTML: true,
    maxWidth: 500,
    delay: 100,
    arrow: false,
    // Mount the popup inside the parent of the element it is given.
    appendTo: (target) => target.parentElement,
    interactive: true,
    interactiveBorder: 10,
    theme: 'quarto',
    placement: 'bottom-start',
  };
  // Only hooks that were actually supplied are added to the configuration.
  for (const [optionName, hook] of optionalHooks) {
    if (hook) {
      config[optionName] = hook;
    }
  }
  window.tippy(el, config);
}
// Give every footnote reference a hover popup showing the footnote's HTML.
const noterefs = window.document.querySelectorAll('a[role="doc-noteref"]');
noterefs.forEach((ref) => {
  tippyHover(ref, () => {
    // use id or data attribute instead here
    let target = ref.getAttribute('data-footnote-href') || ref.getAttribute('href');
    // An absolute URL is reduced to its hash; anything URL() rejects is kept as-is.
    try { target = new URL(target).hash; } catch {}
    const footnote = window.document.getElementById(target.replace(/^#\/?/, ""));
    return footnote ? footnote.innerHTML : "";
  });
});
// Every cross-reference link on the page (anchors with the quarto-xref class).
const xrefs = window.document.querySelectorAll('a.quarto-xref');
/**
 * Builds the HTML shown in a cross-reference hover popup.
 *
 * `note` is modified in place (column classes are stripped and, for
 * non-section targets, its AnchorJS link is removed).
 *
 * @param {?string} id - Id of the referenced element, or null.
 * @param {Element} note - Element to preview.
 * @returns {string} HTML markup for the popup: a short excerpt when `id` is
 *   null or starts with "sec-", otherwise the element's inner HTML (outer
 *   HTML when the element has the "callout" class).
 */
const processXRef = (id, note) => {
  // Strip column container classes
  const stripColumnClz = (el) => {
    el.classList.remove("page-full", "page-columns");
    if (el.children) {
      for (const child of el.children) {
        stripColumnClz(child);
      }
    }
  }
  stripColumnClz(note)
  if (id === null || id.startsWith('sec-')) {
    // Special case sections, only their first couple elements
    const container = document.createElement("div");
    if (note.children && note.children.length > 2) {
      // Keep the first child, then the first following child that is not an empty paragraph.
      container.appendChild(note.children[0].cloneNode(true));
      for (let i = 1; i < note.children.length; i++) {
        const child = note.children[i];
        if (child.tagName === "P" && child.innerText === "") {
          continue;
        } else {
          container.appendChild(child.cloneNode(true));
          break;
        }
      }
      // typesetMath is invoked only when window.Quarto provides it.
      if (window.Quarto?.typesetMath) {
        window.Quarto.typesetMath(container);
      }
      return container.innerHTML
    } else {
      // Two children or fewer: return the whole content.
      if (window.Quarto?.typesetMath) {
        window.Quarto.typesetMath(note);
      }
      return note.innerHTML;
    }
  } else {
    // Remove any anchor links if they are present
    const anchorLink = note.querySelector('a.anchorjs-link');
    if (anchorLink) {
      anchorLink.remove();
    }
    if (window.Quarto?.typesetMath) {
      window.Quarto.typesetMath(note);
    }
    // TODO in 1.5, we should make sure this works without a callout special case
    if (note.classList.contains("callout")) {
      return note.outerHTML;
    } else {
      return note.innerHTML;
    }
  }
}
// Attach a hover popup to every cross-reference link. No content function is
// passed; the content is resolved in the onTrigger callback instead.
for (var i=0; i<xrefs.length; i++) {
  const xref = xrefs[i];
  tippyHover(xref, undefined, function(instance) {
    // Disable the popup while its content is resolved; every path below
    // re-enables and shows it once done.
    instance.disable();
    let url = xref.getAttribute('href');
    let hash = undefined;
    if (url.startsWith('#')) {
      hash = url;
    } else {
      // Non-fragment hrefs: take the hash of the parsed URL; parse failures leave hash undefined.
      try { hash = new URL(url).hash; } catch {}
    }
    if (hash) {
      const id = hash.replace(/^#\/?/, "");
      const note = window.document.getElementById(id);
      if (note !== null) {
        // Target is in this document: preview a clone so the page itself is not modified.
        try {
          const html = processXRef(id, note.cloneNode(true));
          instance.setContent(html);
        } finally {
          instance.enable();
          instance.show();
        }
      } else {
        // See if we can fetch this
        fetch(url.split('#')[0])
        .then(res => res.text())
        .then(html => {
          const parser = new DOMParser();
          const htmlDoc = parser.parseFromString(html, "text/html");
          const note = htmlDoc.getElementById(id);
          if (note !== null) {
            const html = processXRef(id, note);
            instance.setContent(html);
          }
        }).finally(() => {
          instance.enable();
          instance.show();
        });
      }
    } else {
      // See if we can fetch a full url (with no hash to target)
      // This is a special case and we should probably do some content thinning / targeting
      fetch(url)
      .then(res => res.text())
      .then(html => {
        const parser = new DOMParser();
        const htmlDoc = parser.parseFromString(html, "text/html");
        const note = htmlDoc.querySelector('main.content');
        if (note !== null) {
          // This should only happen for chapter cross references
          // (since there is no id in the URL)
          // remove the first header
          if (note.children.length > 0 && note.children[0].tagName === "HEADER") {
            note.children[0].remove();
          }
          const html = processXRef(null, note);
          instance.setContent(html);
        }
      }).finally(() => {
        instance.enable();
        instance.show();
      });
    }
  }, function(instance) {
    // onUntrigger: intentionally empty.
  });
}
// Annotation element whose code lines are currently highlighted (undefined when none).
let selectedAnnoteEl;
/**
 * Builds the CSS selector for the span that marks one annotation in a code cell.
 *
 * @param {string} cell - Value expected in the span's `data-code-cell` attribute.
 * @param {string} annotation - Value expected in its `data-code-annotation` attribute.
 * @returns {string} A `span[...][...]` attribute selector.
 */
const selectorForAnnotation = (cell, annotation) =>
  `span[data-code-cell="${cell}"][data-code-annotation="${annotation}"]`;
/**
 * Highlights the code lines that belong to an annotation entry.
 *
 * Reads `data-target-cell` and `data-target-annotation` from `annoteEl`, finds
 * the matching annotation span, and positions two absolutely positioned divs
 * (one over the code, one inside the annotation gutter) that reach from the
 * top of the first listed line to the bottom of the last listed line. Finally
 * records `annoteEl` in `selectedAnnoteEl`.
 *
 * @param {Element} annoteEl - Annotation element carrying the data-target-* attributes.
 */
const selectCodeLines = (annoteEl) => {
  // NOTE(review): `doc` is not used anywhere below.
  const doc = window.document;
  const targetCell = annoteEl.getAttribute("data-target-cell");
  const targetAnnotation = annoteEl.getAttribute("data-target-annotation");
  const annoteSpan = window.document.querySelector(selectorForAnnotation(targetCell, targetAnnotation));
  // data-code-lines is split on commas; each entry becomes the element id "<cell>-<line>".
  const lines = annoteSpan.getAttribute("data-code-lines").split(",");
  const lineIds = lines.map((line) => {
    return targetCell + "-" + line;
  })
  let top = null;
  let height = null;
  let parent = null;
  // NOTE(review): split(",") always yields at least one entry, so this check looks always true.
  if (lineIds.length > 0) {
    //compute the position of the single el (top and bottom and make a div)
    const el = window.document.getElementById(lineIds[0]);
    top = el.offsetTop;
    height = el.offsetHeight;
    // The highlight div is appended to the grandparent of the first line element.
    parent = el.parentElement.parentElement;
    if (lineIds.length > 1) {
      // Several lines: extend the height down to the bottom of the last listed line.
      const lastEl = window.document.getElementById(lineIds[lineIds.length - 1]);
      const bottom = lastEl.offsetTop + lastEl.offsetHeight;
      height = bottom - top;
    }
    if (top !== null && height !== null && parent !== null) {
      // cook up a div (if necessary) and position it
      let div = window.document.getElementById("code-annotation-line-highlight");
      if (div === null) {
        div = window.document.createElement("div");
        div.setAttribute("id", "code-annotation-line-highlight");
        div.style.position = 'absolute';
        parent.appendChild(div);
      }
      // The highlight extends 2px above and below the measured lines.
      div.style.top = top - 2 + "px";
      div.style.height = height + 4 + "px";
      div.style.left = 0;
      // Same treatment for the gutter marker, which lives in the cell's .code-annotation-gutter element.
      let gutterDiv = window.document.getElementById("code-annotation-line-highlight-gutter");
      if (gutterDiv === null) {
        gutterDiv = window.document.createElement("div");
        gutterDiv.setAttribute("id", "code-annotation-line-highlight-gutter");
        gutterDiv.style.position = 'absolute';
        const codeCell = window.document.getElementById(targetCell);
        const gutter = codeCell.querySelector('.code-annotation-gutter');
        gutter.appendChild(gutterDiv);
      }
      gutterDiv.style.top = top - 2 + "px";
      gutterDiv.style.height = height + 4 + "px";
    }
    selectedAnnoteEl = annoteEl;
  }
};
/**
 * Removes the line-highlight and gutter-highlight divs when they exist and
 * clears the record of the currently selected annotation.
 */
const unselectCodeLines = () => {
  for (const highlightId of ["code-annotation-line-highlight", "code-annotation-line-highlight-gutter"]) {
    window.document.getElementById(highlightId)?.remove();
  }
  selectedAnnoteEl = undefined;
};
// Recompute the highlight of the selected annotation whenever the window is
// resized. The handler is wrapped in throttle() with a 10ms delay.
// (A stray `elRect = undefined` assignment was removed: `elRect` is declared
// nowhere in this script, so the assignment only created an implicit global.)
window.addEventListener(
  "resize",
  throttle(() => {
    if (selectedAnnoteEl) {
      selectCodeLines(selectedAnnoteEl);
    }
  }, 10)
);
/**
 * Wraps `fn` so the first call runs immediately and later calls are delayed:
 * each later call cancels the pending timer and schedules `fn` to run `ms`
 * milliseconds afterwards with that call's arguments. Once the scheduled call
 * has run, the next call runs immediately again.
 *
 * @param {Function} fn - Function to wrap.
 * @param {number} ms - Delay in milliseconds for the deferred call.
 * @returns {Function} The wrapped function.
 */
function throttle(fn, ms) {
  let engaged = false;
  let pendingTimer;
  return (...args) => {
    if (!engaged) {
      // first call gets through
      fn.apply(this, args);
      engaged = true;
      return;
    }
    // all the others get deferred; a newer call replaces the pending one
    if (pendingTimer) {
      clearTimeout(pendingTimer);
    }
    pendingTimer = setTimeout(() => {
      fn.apply(this, args);
      pendingTimer = engaged = false;
    }, ms);
  };
}
// Attach click handler to the DT: clicking an annotation entry selects its
// code lines, clicking the selected entry again clears the selection.
const annoteDls = window.document.querySelectorAll('dt[data-target-cell]');
annoteDls.forEach((annoteDlNode) => {
  annoteDlNode.addEventListener('click', (event) => {
    const clickedEl = event.target;
    if (clickedEl === selectedAnnoteEl) {
      // Unselect the line
      unselectCodeLines();
      clickedEl.classList.remove('code-annotation-active');
      return;
    }
    // Switch the selection: clear the old highlight and active marker first.
    unselectCodeLines();
    const previouslyActive = window.document.querySelector('dt[data-target-cell].code-annotation-active');
    if (previouslyActive) {
      previouslyActive.classList.remove('code-annotation-active');
    }
    selectCodeLines(clickedEl);
    clickedEl.classList.add('code-annotation-active');
  });
});
/**
 * Walks up from `el` to the closest ancestor whose `data-cites` attribute is
 * non-empty.
 *
 * @param {Element} el - Element to start from.
 * @returns {{el: Element, cites: string[]}|undefined} The child of that
 *   ancestor on the path from `el` (or `el` itself) together with the
 *   space-separated cite keys, or undefined when no ancestor has cites.
 */
const findCites = (el) => {
  for (let node = el; node.parentElement; node = node.parentElement) {
    const { cites } = node.parentElement.dataset;
    if (cites) {
      return {
        el: node,
        cites: cites.split(' ')
      };
    }
  }
  return undefined;
};
// Give every bibliography reference a hover popup listing the entries it cites.
var bibliorefs = window.document.querySelectorAll('a[role="doc-biblioref"]');
bibliorefs.forEach((ref) => {
  const citeInfo = findCites(ref);
  if (!citeInfo) {
    return;
  }
  tippyHover(citeInfo.el, () => {
    const popup = window.document.createElement('div');
    for (const cite of citeInfo.cites) {
      const citeDiv = window.document.createElement('div');
      citeDiv.classList.add('hanging-indent');
      citeDiv.classList.add('csl-entry');
      // Copy the entry from the element with id "ref-<cite>" when it exists.
      const biblioDiv = window.document.getElementById('ref-' + cite);
      if (biblioDiv) {
        citeDiv.innerHTML = biblioDiv.innerHTML;
      }
      popup.appendChild(citeDiv);
    }
    return popup.innerHTML;
  });
});
});
</script>
</div> <!-- /content -->
</body></html>