1838 lines
No EOL
97 KiB
HTML
1838 lines
No EOL
97 KiB
HTML
<!DOCTYPE html>
|
||
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en"><head>
|
||
|
||
<meta charset="utf-8">
|
||
<meta name="generator" content="quarto-1.6.33">
|
||
|
||
<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">
|
||
|
||
<meta name="author" content="AI Tools Suite">
|
||
<meta name="dcterms.date" content="2024-12-23">
|
||
|
||
<title>Privacy Scanner: Security &amp; Compliance White Paper</title>
|
||
<style>
|
||
code{white-space: pre-wrap;}
|
||
span.smallcaps{font-variant: small-caps;}
|
||
div.columns{display: flex; gap: min(4vw, 1.5em);}
|
||
div.column{flex: auto; overflow-x: auto;}
|
||
div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
|
||
ul.task-list{list-style: none;}
|
||
ul.task-list li input[type="checkbox"] {
|
||
width: 0.8em;
|
||
margin: 0 0.8em 0.2em -1em; /* quarto-specific, see https://github.com/quarto-dev/quarto-cli/issues/4556 */
|
||
vertical-align: middle;
|
||
}
|
||
/* CSS for syntax highlighting */
|
||
pre > code.sourceCode { white-space: pre; position: relative; }
|
||
pre > code.sourceCode > span { line-height: 1.25; }
|
||
pre > code.sourceCode > span:empty { height: 1.2em; }
|
||
.sourceCode { overflow: visible; }
|
||
code.sourceCode > span { color: inherit; text-decoration: inherit; }
|
||
div.sourceCode { margin: 1em 0; }
|
||
pre.sourceCode { margin: 0; }
|
||
@media screen {
|
||
div.sourceCode { overflow: auto; }
|
||
}
|
||
@media print {
|
||
pre > code.sourceCode { white-space: pre-wrap; }
|
||
pre > code.sourceCode > span { display: inline-block; text-indent: -5em; padding-left: 5em; }
|
||
}
|
||
pre.numberSource code
|
||
{ counter-reset: source-line 0; }
|
||
pre.numberSource code > span
|
||
{ position: relative; left: -4em; counter-increment: source-line; }
|
||
pre.numberSource code > span > a:first-child::before
|
||
{ content: counter(source-line);
|
||
position: relative; left: -1em; text-align: right; vertical-align: baseline;
|
||
border: none; display: inline-block;
|
||
-webkit-touch-callout: none; -webkit-user-select: none;
|
||
-khtml-user-select: none; -moz-user-select: none;
|
||
-ms-user-select: none; user-select: none;
|
||
padding: 0 4px; width: 4em;
|
||
}
|
||
pre.numberSource { margin-left: 3em; padding-left: 4px; }
|
||
div.sourceCode
|
||
{ }
|
||
@media screen {
|
||
pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
|
||
}
|
||
</style>
|
||
|
||
|
||
<script src="security-compliance-whitepaper_files/libs/clipboard/clipboard.min.js"></script>
|
||
<script src="security-compliance-whitepaper_files/libs/quarto-html/quarto.js"></script>
|
||
<script src="security-compliance-whitepaper_files/libs/quarto-html/popper.min.js"></script>
|
||
<script src="security-compliance-whitepaper_files/libs/quarto-html/tippy.umd.min.js"></script>
|
||
<script src="security-compliance-whitepaper_files/libs/quarto-html/anchor.min.js"></script>
|
||
<link href="security-compliance-whitepaper_files/libs/quarto-html/tippy.css" rel="stylesheet">
|
||
<link href="security-compliance-whitepaper_files/libs/quarto-html/quarto-syntax-highlighting-07ba0ad10f5680c660e360ac31d2f3b6.css" rel="stylesheet" id="quarto-text-highlighting-styles">
|
||
<script src="security-compliance-whitepaper_files/libs/bootstrap/bootstrap.min.js"></script>
|
||
<link href="security-compliance-whitepaper_files/libs/bootstrap/bootstrap-icons.css" rel="stylesheet">
|
||
<link href="security-compliance-whitepaper_files/libs/bootstrap/bootstrap-fe6593aca1dacbc749dc3d2ba78c8639.min.css" rel="stylesheet" append-hash="true" id="quarto-bootstrap" data-mode="light">
|
||
|
||
|
||
</head>
|
||
|
||
<body>
|
||
|
||
<div id="quarto-content" class="page-columns page-rows-contents page-layout-article">
|
||
<div id="quarto-margin-sidebar" class="sidebar margin-sidebar">
|
||
<nav id="TOC" role="doc-toc" class="toc-active">
|
||
<h2 id="toc-title">Table of contents</h2>
|
||
|
||
<ul>
|
||
<li><a href="#executive-summary" id="toc-executive-summary" class="nav-link active" data-scroll-target="#executive-summary"><span class="header-section-number">1</span> Executive Summary</a>
|
||
<ul class="collapse">
|
||
<li><a href="#value-realization" id="toc-value-realization" class="nav-link" data-scroll-target="#value-realization"><span class="header-section-number">1.1</span> Value Realization</a></li>
|
||
</ul></li>
|
||
<li><a href="#security-architecture" id="toc-security-architecture" class="nav-link" data-scroll-target="#security-architecture"><span class="header-section-number">2</span> Security Architecture</a>
|
||
<ul class="collapse">
|
||
<li><a href="#defense-in-depth" id="toc-defense-in-depth" class="nav-link" data-scroll-target="#defense-in-depth"><span class="header-section-number">2.1</span> Defense in Depth</a></li>
|
||
<li><a href="#ephemeral-processing-model" id="toc-ephemeral-processing-model" class="nav-link" data-scroll-target="#ephemeral-processing-model"><span class="header-section-number">2.2</span> Ephemeral Processing Model</a></li>
|
||
<li><a href="#client-side-redaction-mode" id="toc-client-side-redaction-mode" class="nav-link" data-scroll-target="#client-side-redaction-mode"><span class="header-section-number">2.3</span> Client-Side Redaction Mode</a></li>
|
||
</ul></li>
|
||
<li><a href="#detection-capabilities" id="toc-detection-capabilities" class="nav-link" data-scroll-target="#detection-capabilities"><span class="header-section-number">3</span> Detection Capabilities</a>
|
||
<ul class="collapse">
|
||
<li><a href="#pii-categories-and-types" id="toc-pii-categories-and-types" class="nav-link" data-scroll-target="#pii-categories-and-types"><span class="header-section-number">3.1</span> PII Categories and Types</a></li>
|
||
<li><a href="#eight-layer-detection-pipeline" id="toc-eight-layer-detection-pipeline" class="nav-link" data-scroll-target="#eight-layer-detection-pipeline"><span class="header-section-number">3.2</span> Eight-Layer Detection Pipeline</a></li>
|
||
<li><a href="#anti-evasion-capabilities" id="toc-anti-evasion-capabilities" class="nav-link" data-scroll-target="#anti-evasion-capabilities"><span class="header-section-number">3.3</span> Anti-Evasion Capabilities</a></li>
|
||
</ul></li>
|
||
<li><a href="#compliance-mapping" id="toc-compliance-mapping" class="nav-link" data-scroll-target="#compliance-mapping"><span class="header-section-number">4</span> Compliance Mapping</a>
|
||
<ul class="collapse">
|
||
<li><a href="#gdpr-general-data-protection-regulation" id="toc-gdpr-general-data-protection-regulation" class="nav-link" data-scroll-target="#gdpr-general-data-protection-regulation"><span class="header-section-number">4.1</span> GDPR (General Data Protection Regulation)</a></li>
|
||
<li><a href="#hipaa-health-insurance-portability-and-accountability-act" id="toc-hipaa-health-insurance-portability-and-accountability-act" class="nav-link" data-scroll-target="#hipaa-health-insurance-portability-and-accountability-act"><span class="header-section-number">4.2</span> HIPAA (Health Insurance Portability and Accountability Act)</a></li>
|
||
<li><a href="#pci-dss-payment-card-industry-data-security-standard" id="toc-pci-dss-payment-card-industry-data-security-standard" class="nav-link" data-scroll-target="#pci-dss-payment-card-industry-data-security-standard"><span class="header-section-number">4.3</span> PCI-DSS (Payment Card Industry Data Security Standard)</a></li>
|
||
<li><a href="#soc-2-service-organization-control" id="toc-soc-2-service-organization-control" class="nav-link" data-scroll-target="#soc-2-service-organization-control"><span class="header-section-number">4.4</span> SOC 2 (Service Organization Control)</a></li>
|
||
<li><a href="#ccpa-california-consumer-privacy-act" id="toc-ccpa-california-consumer-privacy-act" class="nav-link" data-scroll-target="#ccpa-california-consumer-privacy-act"><span class="header-section-number">4.5</span> CCPA (California Consumer Privacy Act)</a></li>
|
||
</ul></li>
|
||
<li><a href="#integration-patterns" id="toc-integration-patterns" class="nav-link" data-scroll-target="#integration-patterns"><span class="header-section-number">5</span> Integration Patterns</a>
|
||
<ul class="collapse">
|
||
<li><a href="#pre-commit-hook-developer-workflow" id="toc-pre-commit-hook-developer-workflow" class="nav-link" data-scroll-target="#pre-commit-hook-developer-workflow"><span class="header-section-number">5.1</span> Pre-Commit Hook (Developer Workflow)</a></li>
|
||
<li><a href="#cicd-pipeline-integration" id="toc-cicd-pipeline-integration" class="nav-link" data-scroll-target="#cicd-pipeline-integration"><span class="header-section-number">5.2</span> CI/CD Pipeline Integration</a></li>
|
||
<li><a href="#data-pipeline-integration" id="toc-data-pipeline-integration" class="nav-link" data-scroll-target="#data-pipeline-integration"><span class="header-section-number">5.3</span> Data Pipeline Integration</a></li>
|
||
<li><a href="#log-sanitization-service" id="toc-log-sanitization-service" class="nav-link" data-scroll-target="#log-sanitization-service"><span class="header-section-number">5.4</span> Log Sanitization Service</a></li>
|
||
</ul></li>
|
||
<li><a href="#performance-characteristics" id="toc-performance-characteristics" class="nav-link" data-scroll-target="#performance-characteristics"><span class="header-section-number">6</span> Performance Characteristics</a>
|
||
<ul class="collapse">
|
||
<li><a href="#benchmarks" id="toc-benchmarks" class="nav-link" data-scroll-target="#benchmarks"><span class="header-section-number">6.1</span> Benchmarks</a></li>
|
||
<li><a href="#scalability" id="toc-scalability" class="nav-link" data-scroll-target="#scalability"><span class="header-section-number">6.2</span> Scalability</a></li>
|
||
</ul></li>
|
||
<li><a href="#deployment-options" id="toc-deployment-options" class="nav-link" data-scroll-target="#deployment-options"><span class="header-section-number">7</span> Deployment Options</a>
|
||
<ul class="collapse">
|
||
<li><a href="#on-premises" id="toc-on-premises" class="nav-link" data-scroll-target="#on-premises"><span class="header-section-number">7.1</span> On-Premises</a></li>
|
||
<li><a href="#private-cloud-vpc" id="toc-private-cloud-vpc" class="nav-link" data-scroll-target="#private-cloud-vpc"><span class="header-section-number">7.2</span> Private Cloud (VPC)</a></li>
|
||
<li><a href="#air-gapped-deployment" id="toc-air-gapped-deployment" class="nav-link" data-scroll-target="#air-gapped-deployment"><span class="header-section-number">7.3</span> Air-Gapped Deployment</a></li>
|
||
</ul></li>
|
||
<li><a href="#security-hardening-checklist" id="toc-security-hardening-checklist" class="nav-link" data-scroll-target="#security-hardening-checklist"><span class="header-section-number">8</span> Security Hardening Checklist</a>
|
||
<ul class="collapse">
|
||
<li><a href="#pre-deployment" id="toc-pre-deployment" class="nav-link" data-scroll-target="#pre-deployment"><span class="header-section-number">8.1</span> Pre-Deployment</a></li>
|
||
<li><a href="#runtime" id="toc-runtime" class="nav-link" data-scroll-target="#runtime"><span class="header-section-number">8.2</span> Runtime</a></li>
|
||
<li><a href="#audit" id="toc-audit" class="nav-link" data-scroll-target="#audit"><span class="header-section-number">8.3</span> Audit</a></li>
|
||
</ul></li>
|
||
<li><a href="#appendix-a-api-reference" id="toc-appendix-a-api-reference" class="nav-link" data-scroll-target="#appendix-a-api-reference"><span class="header-section-number">9</span> Appendix A: API Reference</a>
|
||
<ul class="collapse">
|
||
<li><a href="#scan-text-endpoint" id="toc-scan-text-endpoint" class="nav-link" data-scroll-target="#scan-text-endpoint"><span class="header-section-number">9.1</span> Scan Text Endpoint</a></li>
|
||
</ul></li>
|
||
<li><a href="#appendix-b-confidence-scoring" id="toc-appendix-b-confidence-scoring" class="nav-link" data-scroll-target="#appendix-b-confidence-scoring"><span class="header-section-number">10</span> Appendix B: Confidence Scoring</a></li>
|
||
<li><a href="#appendix-c-version-history" id="toc-appendix-c-version-history" class="nav-link" data-scroll-target="#appendix-c-version-history"><span class="header-section-number">11</span> Appendix C: Version History</a></li>
|
||
<li><a href="#contact-support" id="toc-contact-support" class="nav-link" data-scroll-target="#contact-support"><span class="header-section-number">12</span> Contact &amp; Support</a></li>
|
||
</ul>
|
||
</nav>
|
||
</div>
|
||
<main class="content" id="quarto-document-content">
|
||
|
||
<header id="title-block-header" class="quarto-title-block default">
|
||
<div class="quarto-title">
|
||
<h1 class="title">Privacy Scanner: Security &amp; Compliance White Paper</h1>
|
||
<p class="subtitle lead">Enterprise-Grade PII Detection with Zero-Trust Architecture</p>
|
||
<div class="quarto-categories">
|
||
<div class="quarto-category">security</div>
|
||
<div class="quarto-category">compliance</div>
|
||
<div class="quarto-category">enterprise</div>
|
||
<div class="quarto-category">privacy</div>
|
||
<div class="quarto-category">whitepaper</div>
|
||
</div>
|
||
</div>
|
||
|
||
|
||
|
||
<div class="quarto-title-meta">
|
||
|
||
<div>
|
||
<div class="quarto-title-meta-heading">Author</div>
|
||
<div class="quarto-title-meta-contents">
|
||
<p>AI Tools Suite </p>
|
||
</div>
|
||
</div>
|
||
|
||
<div>
|
||
<div class="quarto-title-meta-heading">Published</div>
|
||
<div class="quarto-title-meta-contents">
|
||
<p class="date">December 23, 2024</p>
|
||
</div>
|
||
</div>
|
||
|
||
|
||
</div>
|
||
|
||
|
||
|
||
</header>
|
||
|
||
|
||
<section id="executive-summary" class="level2" data-number="1">
|
||
<h2 data-number="1" class="anchored" data-anchor-id="executive-summary"><span class="header-section-number">1</span> Executive Summary</h2>
|
||
<section id="value-realization" class="level3" data-number="1.1">
|
||
<h3 data-number="1.1" class="anchored" data-anchor-id="value-realization"><span class="header-section-number">1.1</span> Value Realization</h3>
|
||
<table class="caption-top table">
|
||
<colgroup>
|
||
<col style="width: 43%">
|
||
<col style="width: 56%">
|
||
</colgroup>
|
||
<thead>
|
||
<tr class="header">
|
||
<th>Stakeholder</th>
|
||
<th>Primary Benefit</th>
|
||
</tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr class="odd">
|
||
<td><strong>Developer</strong></td>
|
||
<td>Prevents secrets/keys from ever reaching GitHub</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td><strong>Data Engineer</strong></td>
|
||
<td>Automates PII scrubbing before data enters the warehouse</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td><strong>Compliance Officer</strong></td>
|
||
<td>Provides proof of “Privacy by Design” for GDPR/SOC2 audits</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td><strong>CISO</strong></td>
|
||
<td>Reduces the overall blast radius of a potential data breach</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td><strong>Legal/DPO</strong></td>
|
||
<td>Supports DSAR (Data Subject Access Request) fulfillment</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td><strong>DevOps/SRE</strong></td>
|
||
<td>Sanitizes logs before shipping to centralized observability</td>
|
||
</tr>
|
||
</tbody>
|
||
</table>
|
||
<hr>
|
||
<p>The Privacy Scanner is an enterprise-grade Personally Identifiable Information (PII) detection and redaction solution designed with security-first principles. This white paper details the security architecture, compliance capabilities, and technical safeguards that make the Privacy Scanner suitable for organizations with stringent data protection requirements.</p>
|
||
<p><strong>Key Highlights:</strong></p>
|
||
<ul>
|
||
<li><strong>40+ PII Types Detected</strong> across identity, financial, contact, medical, secret, and cryptocurrency categories</li>
|
||
<li><strong>8-Layer Detection Pipeline</strong> for comprehensive coverage including obfuscation bypass</li>
|
||
<li><strong>Zero-Trust Architecture</strong> with optional client-side redaction mode</li>
|
||
<li><strong>Ephemeral Processing</strong> - no data persistence, no logging of sensitive content</li>
|
||
<li><strong>Supports Compliance Programs</strong> - technical controls aligned with GDPR, HIPAA, PCI-DSS, SOC 2, and CCPA requirements (tool assists compliance efforts; does not guarantee compliance)</li>
|
||
</ul>
|
||
<hr>
|
||
</section>
|
||
</section>
|
||
<section id="security-architecture" class="level2" data-number="2">
|
||
<h2 data-number="2" class="anchored" data-anchor-id="security-architecture"><span class="header-section-number">2</span> Security Architecture</h2>
|
||
<section id="defense-in-depth" class="level3" data-number="2.1">
|
||
<h3 data-number="2.1" class="anchored" data-anchor-id="defense-in-depth"><span class="header-section-number">2.1</span> Defense in Depth</h3>
|
||
<p>The Privacy Scanner implements multiple layers of security controls:</p>
|
||
<pre><code>┌─────────────────────────────────────────────────────────────┐
|
||
│ CLIENT BROWSER │
|
||
│ ┌─────────────────────────────────────────────────────┐ │
|
||
│ │ Client-Side Redaction Mode (Optional) │ │
|
||
│ │ • PII never leaves browser │ │
|
||
│ │ • Only coordinates returned from backend │ │
|
||
│ │ • Maximum privacy guarantee │ │
|
||
│ └─────────────────────────────────────────────────────┘ │
|
||
└─────────────────────────────────────────────────────────────┘
|
||
│
|
||
▼
|
||
┌─────────────────────────────────────────────────────────────┐
|
||
│ TRANSPORT LAYER │
|
||
│ • TLS 1.3 encryption in transit │
|
||
│ • Certificate pinning (recommended) │
|
||
│ • No sensitive data in URL parameters │
|
||
└─────────────────────────────────────────────────────────────┘
|
||
│
|
||
▼
|
||
┌─────────────────────────────────────────────────────────────┐
|
||
│ APPLICATION LAYER │
|
||
│ ┌─────────────────────────────────────────────────────┐ │
|
||
│ │ FastAPI Backend │ │
|
||
│ │ • Request validation via Pydantic │ │
|
||
│ │ • No database connections for scan operations │ │
|
||
│ │ • Stateless processing │ │
|
||
│ │ • PII-filtered logging │ │
|
||
│ └─────────────────────────────────────────────────────┘ │
|
||
└─────────────────────────────────────────────────────────────┘
|
||
│
|
||
▼
|
||
┌─────────────────────────────────────────────────────────────┐
|
||
│ PROCESSING LAYER │
|
||
│ • In-memory only - no disk writes │
|
||
│ • Automatic garbage collection post-response │
|
||
│ • No caching of scanned content │
|
||
│ • Deterministic regex patterns (no ML model storage) │
|
||
└─────────────────────────────────────────────────────────────┘</code></pre>
|
||
</section>
|
||
<section id="ephemeral-processing-model" class="level3" data-number="2.2">
|
||
<h3 data-number="2.2" class="anchored" data-anchor-id="ephemeral-processing-model"><span class="header-section-number">2.2</span> Ephemeral Processing Model</h3>
|
||
<p>The Privacy Scanner operates on a strict ephemeral processing model:</p>
|
||
<table class="caption-top table">
|
||
<colgroup>
|
||
<col style="width: 33%">
|
||
<col style="width: 66%">
|
||
</colgroup>
|
||
<thead>
|
||
<tr class="header">
|
||
<th>Aspect</th>
|
||
<th>Implementation</th>
|
||
</tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr class="odd">
|
||
<td><strong>Data Retention</strong></td>
|
||
<td>Zero - content exists only during request processing</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td><strong>Disk Writes</strong></td>
|
||
<td>None - all processing in-memory</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td><strong>Database Storage</strong></td>
|
||
<td>None - stateless architecture</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td><strong>Log Sanitization</strong></td>
|
||
<td>PII-filtered logging prevents accidental exposure</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td><strong>Session State</strong></td>
|
||
<td>None - each request is independent</td>
|
||
</tr>
|
||
</tbody>
|
||
</table>
|
||
<div class="sourceCode" id="cb2"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Example: PII-Safe Logging Filter</span></span>
|
||
<span id="cb2-2"><a href="#cb2-2" aria-hidden="true" tabindex="-1"></a><span class="kw">class</span> PIIFilter(logging.Filter):</span>
|
||
<span id="cb2-3"><a href="#cb2-3" aria-hidden="true" tabindex="-1"></a> <span class="kw">def</span> <span class="bu">filter</span>(<span class="va">self</span>, record):</span>
|
||
<span id="cb2-4"><a href="#cb2-4" aria-hidden="true" tabindex="-1"></a> <span class="co"># Block any log message containing request body content</span></span>
|
||
<span id="cb2-5"><a href="#cb2-5" aria-hidden="true" tabindex="-1"></a> sensitive_patterns <span class="op">=</span> [<span class="st">'text='</span>, <span class="st">'content='</span>, <span class="st">'body='</span>]</span>
|
||
<span id="cb2-6"><a href="#cb2-6" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> <span class="kw">not</span> <span class="bu">any</span>(p <span class="kw">in</span> <span class="bu">str</span>(record.msg) <span class="cf">for</span> p <span class="kw">in</span> sensitive_patterns)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||
</section>
|
||
<section id="client-side-redaction-mode" class="level3" data-number="2.3">
|
||
<h3 data-number="2.3" class="anchored" data-anchor-id="client-side-redaction-mode"><span class="header-section-number">2.3</span> Client-Side Redaction Mode</h3>
|
||
<p>For organizations with ultra-sensitive data, the Privacy Scanner offers <strong>Coordinates-Only Mode</strong>:</p>
|
||
<p><strong>Standard Mode:</strong></p>
|
||
<pre><code>Client → Server: "John's SSN is 123-45-6789"
|
||
Server → Client: {type: "SSN", value: "123-45-6789", masked: "[SSN:***-**-6789]"}</code></pre>
|
||
<p><strong>Client-Side Redaction Mode:</strong></p>
|
||
<pre><code>Client → Server: "John's SSN is 123-45-6789"
|
||
Server → Client: {type: "SSN", start: 15, end: 26, length: 11}
|
||
Client performs local redaction - actual PII value never returned</code></pre>
|
||
<p>This mode ensures:</p>
|
||
<ul>
|
||
<li>Backend <strong>never echoes PII values</strong> back to the client</li>
|
||
<li>Redaction occurs <strong>entirely in the browser</strong></li>
|
||
<li>Suitable for <strong>air-gapped environments</strong> with strict data egress policies</li>
|
||
<li><strong>Reduced data leakage risk</strong> from server-side processing, because detected PII values are never included in server responses</li>
|
||
</ul>
|
||
<hr>
|
||
</section>
|
||
</section>
|
||
<section id="detection-capabilities" class="level2" data-number="3">
|
||
<h2 data-number="3" class="anchored" data-anchor-id="detection-capabilities"><span class="header-section-number">3</span> Detection Capabilities</h2>
|
||
<section id="pii-categories-and-types" class="level3" data-number="3.1">
|
||
<h3 data-number="3.1" class="anchored" data-anchor-id="pii-categories-and-types"><span class="header-section-number">3.1</span> PII Categories and Types</h3>
|
||
<p>The Privacy Scanner detects <strong>40+ distinct PII types</strong> across six categories:</p>
|
||
<section id="identity-documents" class="level4" data-number="3.1.1">
|
||
<h4 data-number="3.1.1" class="anchored" data-anchor-id="identity-documents"><span class="header-section-number">3.1.1</span> Identity Documents</h4>
|
||
<table class="caption-top table">
|
||
<colgroup>
|
||
<col style="width: 22%">
|
||
<col style="width: 33%">
|
||
<col style="width: 44%">
|
||
</colgroup>
|
||
<thead>
|
||
<tr class="header">
|
||
<th>Type</th>
|
||
<th>Pattern</th>
|
||
<th>Validation</th>
|
||
</tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr class="odd">
|
||
<td>US Social Security Number (SSN)</td>
|
||
<td><code>XXX-XX-XXXX</code></td>
|
||
<td>Format + Area validation</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td>US Medicare ID (MBI)</td>
|
||
<td><code>XAXX-XXX-XXXX</code></td>
|
||
<td>Format validation</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td>US Driver’s License</td>
|
||
<td>State-specific</td>
|
||
<td>Context-aware</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td>UK National Insurance</td>
|
||
<td><code>AB123456C</code></td>
|
||
<td>Format + prefix validation</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td>Canadian SIN</td>
|
||
<td><code>XXX-XXX-XXX</code></td>
|
||
<td>Luhn checksum</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td>India Aadhaar</td>
|
||
<td>12 digits</td>
|
||
<td>Verhoeff checksum</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td>India PAN</td>
|
||
<td><code>ABCDE1234F</code></td>
|
||
<td>Format validation</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td>Australia TFN</td>
|
||
<td>8-9 digits</td>
|
||
<td>Checksum validation</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td>Brazil CPF</td>
|
||
<td><code>XXX.XXX.XXX-XX</code></td>
|
||
<td>MOD-11 checksum</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td>Mexico CURP</td>
|
||
<td>18 chars</td>
|
||
<td>Format validation</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td>South Africa ID</td>
|
||
<td>13 digits</td>
|
||
<td>Luhn checksum</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td>Passport Numbers</td>
|
||
<td>Country-specific</td>
|
||
<td>Format validation</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td>German Personalausweis</td>
|
||
<td>10 chars</td>
|
||
<td>Context-aware</td>
|
||
</tr>
|
||
</tbody>
|
||
</table>
|
||
</section>
|
||
<section id="financial-information" class="level4" data-number="3.1.2">
|
||
<h4 data-number="3.1.2" class="anchored" data-anchor-id="financial-information"><span class="header-section-number">3.1.2</span> Financial Information</h4>
|
||
<table class="caption-top table">
|
||
<colgroup>
|
||
<col style="width: 22%">
|
||
<col style="width: 33%">
|
||
<col style="width: 44%">
|
||
</colgroup>
|
||
<thead>
|
||
<tr class="header">
|
||
<th>Type</th>
|
||
<th>Pattern</th>
|
||
<th>Validation</th>
|
||
</tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr class="odd">
|
||
<td>Credit Card (Visa/MC/Amex/Discover)</td>
|
||
<td>13-19 digits</td>
|
||
<td><strong>Luhn Algorithm</strong></td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td>IBAN</td>
|
||
<td>Country + check digits + BBAN</td>
|
||
<td><strong>MOD-97 Algorithm</strong></td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td>SWIFT/BIC</td>
|
||
<td>8 or 11 chars</td>
|
||
<td>Format + context</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td>Bank Account Numbers</td>
|
||
<td>8-17 digits</td>
|
||
<td>Context-aware</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td>Routing/ABA Numbers</td>
|
||
<td>9 digits</td>
|
||
<td>Context-aware</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td>CUSIP</td>
|
||
<td>9 chars</td>
|
||
<td>Check digit</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td>ISIN</td>
|
||
<td>12 chars</td>
|
||
<td>Luhn checksum</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td>SEDOL</td>
|
||
<td>7 chars</td>
|
||
<td>Checksum</td>
|
||
</tr>
|
||
</tbody>
|
||
</table>
|
||
</section>
|
||
<section id="contact-information" class="level4" data-number="3.1.3">
|
||
<h4 data-number="3.1.3" class="anchored" data-anchor-id="contact-information"><span class="header-section-number">3.1.3</span> Contact Information</h4>
|
||
<table class="caption-top table">
|
||
<thead>
|
||
<tr class="header">
|
||
<th>Type</th>
|
||
<th>Pattern</th>
|
||
<th>Validation</th>
|
||
</tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr class="odd">
|
||
<td>Email Addresses</td>
|
||
<td>RFC 5322 compliant</td>
|
||
<td>Domain validation</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td>Obfuscated Emails</td>
|
||
<td><code>[at]</code>, <code>(dot)</code> variants</td>
|
||
<td>TLD validation</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td>US Phone Numbers</td>
|
||
<td>Multiple formats</td>
|
||
<td>Area code validation</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td>International Phone</td>
|
||
<td>30+ country codes</td>
|
||
<td>Country-specific</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td>Physical Addresses</td>
|
||
<td>US format</td>
|
||
<td>Context-aware</td>
|
||
</tr>
|
||
</tbody>
|
||
</table>
|
||
</section>
|
||
<section id="secrets-and-api-keys" class="level4" data-number="3.1.4">
|
||
<h4 data-number="3.1.4" class="anchored" data-anchor-id="secrets-and-api-keys"><span class="header-section-number">3.1.4</span> Secrets and API Keys</h4>
|
||
<table class="caption-top table">
|
||
<thead>
|
||
<tr class="header">
|
||
<th>Type</th>
|
||
<th>Pattern</th>
|
||
<th>Example</th>
|
||
</tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr class="odd">
|
||
<td>AWS Access Key</td>
|
||
<td><code>AKIA[A-Z0-9]{16}</code></td>
|
||
<td><code>AKIAIOSFODNN7EXAMPLE</code></td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td>AWS Secret Key</td>
|
||
<td>40-char base64</td>
|
||
<td><code>wJalrXUtnFEMI/K7MDENG...</code></td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td>GitHub Token</td>
|
||
<td><code>gh[pousr]_[A-Za-z0-9]{36,}</code></td>
|
||
<td><code>ghp_xxxxxxxxxxxx...</code></td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td>Slack Token</td>
|
||
<td><code>xox[baprs]-...</code></td>
|
||
<td><code>xoxb-123456-789012-...</code></td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td>Stripe Key</td>
|
||
<td><code>sk_live_...</code> / <code>pk_test_...</code></td>
|
||
<td><code>sk_live_abc123...</code></td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td>JWT Token</td>
|
||
<td>Base64.Base64.Base64</td>
|
||
<td><code>eyJhbGci...</code></td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td>OpenAI API Key</td>
|
||
<td><code>sk-[A-Za-z0-9]{48}</code></td>
|
||
<td><code>sk-abc123...</code></td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td>Anthropic API Key</td>
|
||
<td><code>sk-ant-...</code></td>
|
||
<td><code>sk-ant-api03-...</code></td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td>Discord Token</td>
|
||
<td>Base64 format</td>
|
||
<td>Token pattern</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td>Private Keys</td>
|
||
<td>PEM headers</td>
|
||
<td><code>-----BEGIN PRIVATE KEY-----</code></td>
|
||
</tr>
|
||
</tbody>
|
||
</table>
|
||
</section>
|
||
<section id="medical-information" class="level4" data-number="3.1.5">
|
||
<h4 data-number="3.1.5" class="anchored" data-anchor-id="medical-information"><span class="header-section-number">3.1.5</span> Medical Information</h4>
|
||
<table class="caption-top table">
|
||
<thead>
|
||
<tr class="header">
|
||
<th>Type</th>
|
||
<th>Pattern</th>
|
||
<th>Validation</th>
|
||
</tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr class="odd">
|
||
<td>Medical Record Number</td>
|
||
<td>6-10 digits</td>
|
||
<td>Context-aware</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td>NPI (Provider ID)</td>
|
||
<td>10 digits</td>
|
||
<td>Luhn checksum</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td>DEA Number</td>
|
||
<td>2 letters + 7 digits</td>
|
||
<td>Checksum</td>
|
||
</tr>
|
||
</tbody>
|
||
</table>
|
||
</section>
|
||
<section id="cryptocurrency" class="level4" data-number="3.1.6">
|
||
<h4 data-number="3.1.6" class="anchored" data-anchor-id="cryptocurrency"><span class="header-section-number">3.1.6</span> Cryptocurrency</h4>
|
||
<table class="caption-top table">
|
||
<thead>
|
||
<tr class="header">
|
||
<th>Type</th>
|
||
<th>Pattern</th>
|
||
<th>Validation</th>
|
||
</tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr class="odd">
|
||
<td>Bitcoin Address</td>
|
||
<td><code>1</code>, <code>3</code>, or <code>bc1</code> prefix</td>
|
||
<td>Base58Check / Bech32</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td>Ethereum Address</td>
|
||
<td><code>0x</code> + 40 hex</td>
|
||
<td>Checksum optional</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td>Monero Address</td>
|
||
<td><code>4</code> prefix, 95 chars</td>
|
||
<td>Format validation</td>
|
||
</tr>
|
||
</tbody>
|
||
</table>
|
||
</section>
|
||
</section>
|
||
<section id="eight-layer-detection-pipeline" class="level3" data-number="3.2">
|
||
<h3 data-number="3.2" class="anchored" data-anchor-id="eight-layer-detection-pipeline"><span class="header-section-number">3.2</span> Eight-Layer Detection Pipeline</h3>
|
||
<pre><code>┌────────────────────────────────────────────────────────────────┐
|
||
│ INPUT TEXT │
|
||
└────────────────────────────────────────────────────────────────┘
|
||
│
|
||
▼
|
||
┌────────────────────────────────────────────────────────────────┐
|
||
│ LAYER 1: Unicode Normalization (NFKC) │
|
||
│ • Converts fullwidth chars: ｅｍａｉｌ → email │
|
||
│ • Normalizes homoglyphs: е (Cyrillic) → e (Latin) │
|
||
│ • Decodes HTML entities: &amp;#64; → @ │
|
||
└────────────────────────────────────────────────────────────────┘
|
||
│
|
||
▼
|
||
┌────────────────────────────────────────────────────────────────┐
|
||
│ LAYER 2: Text Normalization │
|
||
│ • Defanging reversal: [dot] → ., [at] → @ │
|
||
│ • Smart "at" detection (TLD validation, false trigger filter) │
|
||
│ • Separator removal: 123-45-6789 → 123456789 │
|
||
│ • Character unspacing: t-e-s-t → test │
|
||
└────────────────────────────────────────────────────────────────┘
|
||
│
|
||
▼
|
||
┌────────────────────────────────────────────────────────────────┐
|
||
│ LAYER 2.5: Structured Data Extraction │
|
||
│ • JSON blob detection and deep value extraction │
|
||
│ • Recursive scanning of nested objects/arrays │
|
||
│ • Key-value pair analysis │
|
||
└────────────────────────────────────────────────────────────────┘
|
||
│
|
||
▼
|
||
┌────────────────────────────────────────────────────────────────┐
|
||
│ LAYER 2.6: Encoding Detection │
|
||
│ • Base64 auto-detection and decoding │
|
||
│ • UTF-8 validation of decoded content │
|
||
│ • Recursive PII scan on decoded payloads │
|
||
└────────────────────────────────────────────────────────────────┘
|
||
│
|
||
▼
|
||
┌────────────────────────────────────────────────────────────────┐
|
||
│ LAYER 3: Pattern Matching │
|
||
│ • 40+ regex patterns with category classification │
|
||
│ • Context-aware matching (lookbehind/lookahead) │
|
||
│ • Multi-format support per PII type │
|
||
└────────────────────────────────────────────────────────────────┘
|
||
│
|
||
▼
|
||
┌────────────────────────────────────────────────────────────────┐
|
||
│ LAYER 4: Checksum Validation │
|
||
│ • Luhn algorithm (credit cards, Canadian SIN) │
|
||
│ • MOD-97 (IBAN) │
|
||
│ • Verhoeff (Aadhaar) │
|
||
│ • Custom checksums (DEA, NPI) │
|
||
└────────────────────────────────────────────────────────────────┘
|
||
│
|
||
▼
|
||
┌────────────────────────────────────────────────────────────────┐
|
||
│ LAYER 5: Context Analysis │
|
||
│ • Surrounding text analysis for disambiguation │
|
||
│ • False positive filtering (connection strings, UUIDs) │
|
||
│ • Confidence adjustment based on context │
|
||
└────────────────────────────────────────────────────────────────┘
|
||
│
|
||
▼
|
||
┌────────────────────────────────────────────────────────────────┐
|
||
│ LAYER 6: Deduplication & Scoring │
|
||
│ • Overlapping entity resolution │
|
||
│ • Confidence score aggregation │
|
||
│ • Risk level classification │
|
||
└────────────────────────────────────────────────────────────────┘
|
||
│
|
||
▼
|
||
┌────────────────────────────────────────────────────────────────┐
|
||
│ OUTPUT: Structured PII Report │
|
||
│ • Entity list with types, values, positions, confidence │
|
||
│ • Redacted text preview │
|
||
│ • Risk assessment summary │
|
||
└────────────────────────────────────────────────────────────────┘</code></pre>
|
||
</section>
|
||
<section id="anti-evasion-capabilities" class="level3" data-number="3.3">
|
||
<h3 data-number="3.3" class="anchored" data-anchor-id="anti-evasion-capabilities"><span class="header-section-number">3.3</span> Anti-Evasion Capabilities</h3>
|
||
<p>The Privacy Scanner is designed to detect PII even when intentionally obfuscated:</p>
|
||
<table class="caption-top table">
|
||
<colgroup>
|
||
<col style="width: 41%">
|
||
<col style="width: 19%">
|
||
<col style="width: 39%">
|
||
</colgroup>
|
||
<thead>
|
||
<tr class="header">
|
||
<th>Evasion Technique</th>
|
||
<th>Example</th>
|
||
<th>Detection Method</th>
|
||
</tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr class="odd">
|
||
<td><strong>Defanging</strong></td>
|
||
<td><code>john[at]gmail[dot]com</code></td>
|
||
<td>Layer 2 normalization</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td><strong>Spacing</strong></td>
|
||
<td><code>j-o-h-n @ g-m-a-i-l</code></td>
|
||
<td>Character joining</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td><strong>Leetspeak</strong></td>
|
||
<td><code>j0hn@gm4il.c0m</code></td>
|
||
<td>Leetspeak reversal</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td><strong>Unicode tricks</strong></td>
|
||
<td><code>ｊｏｈｎ＠ｇｍａｉｌ．ｃｏｍ</code></td>
|
||
<td>NFKC normalization</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td><strong>HTML encoding</strong></td>
|
||
<td><code>john&amp;#64;gmail&amp;#46;com</code></td>
|
||
<td>Entity decoding</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td><strong>Base64 hiding</strong></td>
|
||
<td><code>am9obkBnbWFpbC5jb20=</code></td>
|
||
<td>Auto-decode + scan</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td><strong>JSON embedding</strong></td>
|
||
<td><code>{"email":"john@gmail.com"}</code></td>
|
||
<td>Deep extraction</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td><strong>Number formatting</strong></td>
|
||
<td><code>123.45.6789</code> (SSN with dots)</td>
|
||
<td>Multi-separator support</td>
|
||
</tr>
|
||
</tbody>
|
||
</table>
|
||
<hr>
|
||
</section>
|
||
</section>
|
||
<section id="compliance-mapping" class="level2" data-number="4">
|
||
<h2 data-number="4" class="anchored" data-anchor-id="compliance-mapping"><span class="header-section-number">4</span> Compliance Mapping</h2>
|
||
<section id="gdpr-general-data-protection-regulation" class="level3" data-number="4.1">
|
||
<h3 data-number="4.1" class="anchored" data-anchor-id="gdpr-general-data-protection-regulation"><span class="header-section-number">4.1</span> GDPR (General Data Protection Regulation)</h3>
|
||
<table class="caption-top table">
|
||
<colgroup>
|
||
<col style="width: 40%">
|
||
<col style="width: 60%">
|
||
</colgroup>
|
||
<thead>
|
||
<tr class="header">
|
||
<th>GDPR Requirement</th>
|
||
<th>Privacy Scanner Capability</th>
|
||
</tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr class="odd">
|
||
<td><strong>Art. 5(1)(c)</strong> - Data Minimization</td>
|
||
<td>Client-side redaction mode ensures minimal data processing</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td><strong>Art. 5(1)(e)</strong> - Storage Limitation</td>
|
||
<td>Zero data retention - ephemeral processing only</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td><strong>Art. 25</strong> - Privacy by Design</td>
|
||
<td>Built-in PII detection before data enters downstream systems</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td><strong>Art. 32</strong> - Security of Processing</td>
|
||
<td>TLS encryption, no persistent storage, PII-filtered logs</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td><strong>Art. 33/34</strong> - Breach Notification</td>
|
||
<td>Detection of exposed PII in logs/documents aids breach assessment</td>
|
||
</tr>
|
||
</tbody>
|
||
</table>
|
||
<p><strong>GDPR PII Types Detected:</strong></p>
<ul>
<li>Names (via context analysis)</li>
<li>Email addresses</li>
<li>Phone numbers (EU formats)</li>
<li>National IDs (UK NI, German Ausweis)</li>
<li>Financial identifiers (IBAN, EU VAT)</li>
<li>IP addresses</li>
<li>Physical addresses</li>
</ul>
|
||
</section>
|
||
<section id="hipaa-health-insurance-portability-and-accountability-act" class="level3" data-number="4.2">
|
||
<h3 data-number="4.2" class="anchored" data-anchor-id="hipaa-health-insurance-portability-and-accountability-act"><span class="header-section-number">4.2</span> HIPAA (Health Insurance Portability and Accountability Act)</h3>
|
||
<table class="caption-top table">
|
||
<colgroup>
|
||
<col style="width: 40%">
|
||
<col style="width: 60%">
|
||
</colgroup>
|
||
<thead>
|
||
<tr class="header">
|
||
<th>HIPAA Requirement</th>
|
||
<th>Privacy Scanner Capability</th>
|
||
</tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr class="odd">
|
||
<td><strong>§164.502</strong> - Minimum Necessary</td>
|
||
<td>Detects PHI before transmission to reduce exposure</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td><strong>§164.312(a)(1)</strong> - Access Control</td>
|
||
<td>Coordinates-only mode prevents PHI echo</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td><strong>§164.312(c)(1)</strong> - Integrity</td>
|
||
<td>Immutable detection - no modification of source data</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td><strong>§164.312(e)(1)</strong> - Transmission Security</td>
|
||
<td>TLS 1.3 for all communications</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td><strong>§164.530(c)</strong> - Safeguards</td>
|
||
<td>Multi-layer detection prevents PHI leakage</td>
|
||
</tr>
|
||
</tbody>
|
||
</table>
|
||
<p><strong>HIPAA PHI Types Detected:</strong></p>
<ul>
<li>Social Security Numbers</li>
<li>Medicare Beneficiary Identifiers (MBI)</li>
<li>Medical Record Numbers</li>
<li>NPI (National Provider Identifier)</li>
<li>DEA Numbers</li>
<li>Dates of Birth</li>
<li>Phone Numbers</li>
<li>Email Addresses</li>
<li>Physical Addresses</li>
</ul>
|
||
</section>
|
||
<section id="pci-dss-payment-card-industry-data-security-standard" class="level3" data-number="4.3">
|
||
<h3 data-number="4.3" class="anchored" data-anchor-id="pci-dss-payment-card-industry-data-security-standard"><span class="header-section-number">4.3</span> PCI-DSS (Payment Card Industry Data Security Standard)</h3>
|
||
<table class="caption-top table">
|
||
<colgroup>
|
||
<col style="width: 42%">
|
||
<col style="width: 57%">
|
||
</colgroup>
|
||
<thead>
|
||
<tr class="header">
|
||
<th>PCI-DSS Requirement</th>
|
||
<th>Privacy Scanner Capability</th>
|
||
</tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr class="odd">
|
||
<td><strong>Req. 3.4</strong> - Render PAN Unreadable</td>
|
||
<td>Automatic credit card detection and masking</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td><strong>Req. 4.1</strong> - Encrypt Transmission</td>
|
||
<td>TLS 1.3 encryption</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td><strong>Req. 6.5</strong> - Secure Development</td>
|
||
<td>Input validation, no SQL/command injection vectors</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td><strong>Req. 10.2</strong> - Audit Trails</td>
|
||
<td>PII-safe logging with detection events</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td><strong>Req. 12.3</strong> - Usage Policies</td>
|
||
<td>Supports policy enforcement via API integration</td>
|
||
</tr>
|
||
</tbody>
|
||
</table>
|
||
<p><strong>PCI-DSS Data Types Detected:</strong></p>
<ul>
<li>Primary Account Numbers (PAN) - Visa, Mastercard, Amex, Discover</li>
<li><strong>Luhn validation</strong> reduces false positives</li>
<li>Detects formatted (<code>4111-1111-1111-1111</code>) and unformatted (<code>4111111111111111</code>)</li>
<li>Bank routing numbers</li>
<li>IBAN/SWIFT codes</li>
</ul>
|
||
</section>
|
||
<section id="soc-2-service-organization-control" class="level3" data-number="4.4">
|
||
<h3 data-number="4.4" class="anchored" data-anchor-id="soc-2-service-organization-control"><span class="header-section-number">4.4</span> SOC 2 (Service Organization Control)</h3>
|
||
<table class="caption-top table">
|
||
<colgroup>
|
||
<col style="width: 37%">
|
||
<col style="width: 62%">
|
||
</colgroup>
|
||
<thead>
|
||
<tr class="header">
|
||
<th>SOC 2 Criteria</th>
|
||
<th>Privacy Scanner Capability</th>
|
||
</tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr class="odd">
|
||
<td><strong>CC6.1</strong> - Logical Access</td>
|
||
<td>API-based access with optional authentication</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td><strong>CC6.6</strong> - System Boundaries</td>
|
||
<td>Clear input/output contracts via OpenAPI spec</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td><strong>CC6.7</strong> - Transmission Integrity</td>
|
||
<td>TLS encryption, request validation</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td><strong>CC7.2</strong> - System Monitoring</td>
|
||
<td>Structured detection logs (without PII content)</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td><strong>P1.1</strong> - Privacy Notice</td>
|
||
<td>Transparent processing - documented detection categories</td>
|
||
</tr>
|
||
</tbody>
|
||
</table>
|
||
</section>
|
||
<section id="ccpa-california-consumer-privacy-act" class="level3" data-number="4.5">
|
||
<h3 data-number="4.5" class="anchored" data-anchor-id="ccpa-california-consumer-privacy-act"><span class="header-section-number">4.5</span> CCPA (California Consumer Privacy Act)</h3>
|
||
<table class="caption-top table">
|
||
<colgroup>
|
||
<col style="width: 38%">
|
||
<col style="width: 61%">
|
||
</colgroup>
|
||
<thead>
|
||
<tr class="header">
|
||
<th>CCPA Requirement</th>
|
||
<th>Privacy Scanner Capability</th>
|
||
</tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr class="odd">
|
||
<td><strong>§1798.100</strong> - Right to Know</td>
|
||
<td>Identifies all PII categories in documents</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td><strong>§1798.105</strong> - Right to Delete</td>
|
||
<td>Supports identification for deletion workflows</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td><strong>§1798.110</strong> - Disclosure</td>
|
||
<td>Structured output for compliance reporting</td>
|
||
</tr>
|
||
</tbody>
|
||
</table>
|
||
<hr>
|
||
</section>
|
||
</section>
|
||
<section id="integration-patterns" class="level2" data-number="5">
|
||
<h2 data-number="5" class="anchored" data-anchor-id="integration-patterns"><span class="header-section-number">5</span> Integration Patterns</h2>
|
||
<section id="pre-commit-hook-developer-workflow" class="level3" data-number="5.1">
|
||
<h3 data-number="5.1" class="anchored" data-anchor-id="pre-commit-hook-developer-workflow"><span class="header-section-number">5.1</span> Pre-Commit Hook (Developer Workflow)</h3>
|
||
<div class="sourceCode" id="cb6"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb6-1"><a href="#cb6-1" aria-hidden="true" tabindex="-1"></a><span class="co">#!/bin/bash</span></span>
|
||
<span id="cb6-2"><a href="#cb6-2" aria-hidden="true" tabindex="-1"></a><span class="co"># .git/hooks/pre-commit</span></span>
|
||
<span id="cb6-3"><a href="#cb6-3" aria-hidden="true" tabindex="-1"></a></span>
|
||
<span id="cb6-4"><a href="#cb6-4" aria-hidden="true" tabindex="-1"></a><span class="co"># Scan staged files for PII</span></span>
|
||
<span id="cb6-5"><a href="#cb6-5" aria-hidden="true" tabindex="-1"></a><span class="cf">for</span> file <span class="kw">in</span> <span class="va">$(</span><span class="fu">git</span> diff <span class="at">--cached</span> <span class="at">--name-only</span><span class="va">)</span><span class="kw">;</span> <span class="cf">do</span></span>
|
||
<span id="cb6-6"><a href="#cb6-6" aria-hidden="true" tabindex="-1"></a> <span class="va">response</span><span class="op">=</span><span class="va">$(</span><span class="ex">curl</span> <span class="at">-s</span> <span class="at">-X</span> POST http://localhost:8000/api/privacy/scan-text <span class="dt">\</span></span>
|
||
<span id="cb6-7"><a href="#cb6-7" aria-hidden="true" tabindex="-1"></a> <span class="at">-F</span> <span class="st">"text=</span><span class="va">$(</span><span class="fu">cat</span> <span class="va">$file)</span><span class="st">"</span> <span class="dt">\</span></span>
|
||
<span id="cb6-8"><a href="#cb6-8" aria-hidden="true" tabindex="-1"></a> <span class="at">-F</span> <span class="st">"coordinates_only=true"</span><span class="va">)</span></span>
|
||
<span id="cb6-9"><a href="#cb6-9" aria-hidden="true" tabindex="-1"></a></span>
|
||
<span id="cb6-10"><a href="#cb6-10" aria-hidden="true" tabindex="-1"></a> <span class="va">count</span><span class="op">=</span><span class="va">$(</span><span class="bu">echo</span> <span class="va">$response</span> <span class="kw">|</span> <span class="ex">jq</span> <span class="st">'.entities | length'</span><span class="va">)</span></span>
|
||
<span id="cb6-11"><a href="#cb6-11" aria-hidden="true" tabindex="-1"></a> <span class="cf">if</span> <span class="bu">[</span> <span class="st">"</span><span class="va">$count</span><span class="st">"</span> <span class="ot">-gt</span> 0 <span class="bu">]</span><span class="kw">;</span> <span class="cf">then</span></span>
|
||
<span id="cb6-12"><a href="#cb6-12" aria-hidden="true" tabindex="-1"></a> <span class="bu">echo</span> <span class="st">"PII detected in </span><span class="va">$file</span><span class="st"> - commit blocked"</span></span>
|
||
<span id="cb6-13"><a href="#cb6-13" aria-hidden="true" tabindex="-1"></a> <span class="bu">exit</span> 1</span>
|
||
<span id="cb6-14"><a href="#cb6-14" aria-hidden="true" tabindex="-1"></a> <span class="cf">fi</span></span>
|
||
<span id="cb6-15"><a href="#cb6-15" aria-hidden="true" tabindex="-1"></a><span class="cf">done</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||
</section>
|
||
<section id="cicd-pipeline-integration" class="level3" data-number="5.2">
|
||
<h3 data-number="5.2" class="anchored" data-anchor-id="cicd-pipeline-integration"><span class="header-section-number">5.2</span> CI/CD Pipeline Integration</h3>
|
||
<div class="sourceCode" id="cb7"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb7-1"><a href="#cb7-1" aria-hidden="true" tabindex="-1"></a><span class="co"># GitHub Actions example</span></span>
|
||
<span id="cb7-2"><a href="#cb7-2" aria-hidden="true" tabindex="-1"></a><span class="kw">-</span><span class="at"> </span><span class="fu">name</span><span class="kw">:</span><span class="at"> PII Scan</span></span>
|
||
<span id="cb7-3"><a href="#cb7-3" aria-hidden="true" tabindex="-1"></a><span class="fu"> run</span><span class="kw">: </span><span class="ch">|</span></span>
|
||
<span id="cb7-4"><a href="#cb7-4" aria-hidden="true" tabindex="-1"></a> for file in $(find . -name "*.log" -o -name "*.json"); do</span>
|
||
<span id="cb7-5"><a href="#cb7-5" aria-hidden="true" tabindex="-1"></a> result=$(curl -s -X POST $PII_SCANNER_URL/api/privacy/scan-text \</span>
|
||
<span id="cb7-6"><a href="#cb7-6" aria-hidden="true" tabindex="-1"></a> -F "text=$(cat $file)")</span>
|
||
<span id="cb7-7"><a href="#cb7-7" aria-hidden="true" tabindex="-1"></a> if echo "$result" | jq -e '.entities | length > 0' > /dev/null; then</span>
|
||
<span id="cb7-8"><a href="#cb7-8" aria-hidden="true" tabindex="-1"></a> echo "::error::PII detected in $file"</span>
|
||
<span id="cb7-9"><a href="#cb7-9" aria-hidden="true" tabindex="-1"></a> exit 1</span>
|
||
<span id="cb7-10"><a href="#cb7-10" aria-hidden="true" tabindex="-1"></a> fi</span>
|
||
<span id="cb7-11"><a href="#cb7-11" aria-hidden="true" tabindex="-1"></a> done</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||
</section>
|
||
<section id="data-pipeline-integration" class="level3" data-number="5.3">
|
||
<h3 data-number="5.3" class="anchored" data-anchor-id="data-pipeline-integration"><span class="header-section-number">5.3</span> Data Pipeline Integration</h3>
|
||
<div class="sourceCode" id="cb8"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb8-1"><a href="#cb8-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Apache Airflow DAG example</span></span>
|
||
<span id="cb8-2"><a href="#cb8-2" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> airflow.decorators <span class="im">import</span> task</span>
|
||
<span id="cb8-3"><a href="#cb8-3" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> requests</span>
|
||
<span id="cb8-4"><a href="#cb8-4" aria-hidden="true" tabindex="-1"></a></span>
|
||
<span id="cb8-5"><a href="#cb8-5" aria-hidden="true" tabindex="-1"></a><span class="at">@task</span></span>
|
||
<span id="cb8-6"><a href="#cb8-6" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> scan_for_pii(data: <span class="bu">str</span>, coordinates_only: <span class="bu">bool</span> <span class="op">=</span> <span class="va">True</span>) <span class="op">-></span> <span class="bu">dict</span>:</span>
|
||
<span id="cb8-7"><a href="#cb8-7" aria-hidden="true" tabindex="-1"></a> <span class="co">"""Scan data for PII before loading to data warehouse"""</span></span>
|
||
<span id="cb8-8"><a href="#cb8-8" aria-hidden="true" tabindex="-1"></a> response <span class="op">=</span> requests.post(</span>
|
||
<span id="cb8-9"><a href="#cb8-9" aria-hidden="true" tabindex="-1"></a> <span class="ss">f"</span><span class="sc">{</span>PII_SCANNER_URL<span class="sc">}</span><span class="ss">/api/privacy/scan-text"</span>,</span>
|
||
<span id="cb8-10"><a href="#cb8-10" aria-hidden="true" tabindex="-1"></a> data<span class="op">=</span>{</span>
|
||
<span id="cb8-11"><a href="#cb8-11" aria-hidden="true" tabindex="-1"></a> <span class="st">"text"</span>: data,</span>
|
||
<span id="cb8-12"><a href="#cb8-12" aria-hidden="true" tabindex="-1"></a> <span class="st">"coordinates_only"</span>: coordinates_only</span>
|
||
<span id="cb8-13"><a href="#cb8-13" aria-hidden="true" tabindex="-1"></a> }</span>
|
||
<span id="cb8-14"><a href="#cb8-14" aria-hidden="true" tabindex="-1"></a> )</span>
|
||
<span id="cb8-15"><a href="#cb8-15" aria-hidden="true" tabindex="-1"></a> result <span class="op">=</span> response.json()</span>
|
||
<span id="cb8-16"><a href="#cb8-16" aria-hidden="true" tabindex="-1"></a></span>
|
||
<span id="cb8-17"><a href="#cb8-17" aria-hidden="true" tabindex="-1"></a> <span class="cf">if</span> result.get(<span class="st">"entities"</span>):</span>
|
||
<span id="cb8-18"><a href="#cb8-18" aria-hidden="true" tabindex="-1"></a> <span class="cf">raise</span> <span class="pp">ValueError</span>(<span class="ss">f"PII detected: </span><span class="sc">{</span><span class="bu">len</span>(result[<span class="st">'entities'</span>])<span class="sc">}</span><span class="ss"> entities"</span>)</span>
|
||
<span id="cb8-19"><a href="#cb8-19" aria-hidden="true" tabindex="-1"></a></span>
|
||
<span id="cb8-20"><a href="#cb8-20" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> {<span class="st">"status"</span>: <span class="st">"clean"</span>, <span class="st">"data"</span>: data}</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||
</section>
|
||
<section id="log-sanitization-service" class="level3" data-number="5.4">
|
||
<h3 data-number="5.4" class="anchored" data-anchor-id="log-sanitization-service"><span class="header-section-number">5.4</span> Log Sanitization Service</h3>
|
||
<div class="sourceCode" id="cb9"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb9-1"><a href="#cb9-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Real-time log sanitization</span></span>
|
||
<span id="cb9-2"><a href="#cb9-2" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> asyncio</span>
|
||
<span id="cb9-3"><a href="#cb9-3" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> aiohttp</span>
|
||
<span id="cb9-4"><a href="#cb9-4" aria-hidden="true" tabindex="-1"></a></span>
|
||
<span id="cb9-5"><a href="#cb9-5" aria-hidden="true" tabindex="-1"></a><span class="cf">async</span> <span class="kw">def</span> sanitize_log_stream(log_lines: <span class="bu">list</span>[<span class="bu">str</span>]) <span class="op">-></span> <span class="bu">list</span>[<span class="bu">str</span>]:</span>
|
||
<span id="cb9-6"><a href="#cb9-6" aria-hidden="true" tabindex="-1"></a> <span class="co">"""Sanitize logs before shipping to centralized logging"""</span></span>
|
||
<span id="cb9-7"><a href="#cb9-7" aria-hidden="true" tabindex="-1"></a> <span class="cf">async</span> <span class="cf">with</span> aiohttp.ClientSession() <span class="im">as</span> session:</span>
|
||
<span id="cb9-8"><a href="#cb9-8" aria-hidden="true" tabindex="-1"></a> tasks <span class="op">=</span> []</span>
|
||
<span id="cb9-9"><a href="#cb9-9" aria-hidden="true" tabindex="-1"></a> <span class="cf">for</span> line <span class="kw">in</span> log_lines:</span>
|
||
<span id="cb9-10"><a href="#cb9-10" aria-hidden="true" tabindex="-1"></a> task <span class="op">=</span> session.post(</span>
|
||
<span id="cb9-11"><a href="#cb9-11" aria-hidden="true" tabindex="-1"></a> <span class="ss">f"</span><span class="sc">{</span>PII_SCANNER_URL<span class="sc">}</span><span class="ss">/api/privacy/scan-text"</span>,</span>
|
||
<span id="cb9-12"><a href="#cb9-12" aria-hidden="true" tabindex="-1"></a> data<span class="op">=</span>{<span class="st">"text"</span>: line}</span>
|
||
<span id="cb9-13"><a href="#cb9-13" aria-hidden="true" tabindex="-1"></a> )</span>
|
||
<span id="cb9-14"><a href="#cb9-14" aria-hidden="true" tabindex="-1"></a> tasks.append(task)</span>
|
||
<span id="cb9-15"><a href="#cb9-15" aria-hidden="true" tabindex="-1"></a></span>
|
||
<span id="cb9-16"><a href="#cb9-16" aria-hidden="true" tabindex="-1"></a> responses <span class="op">=</span> <span class="cf">await</span> asyncio.gather(<span class="op">*</span>tasks)</span>
|
||
<span id="cb9-17"><a href="#cb9-17" aria-hidden="true" tabindex="-1"></a> sanitized <span class="op">=</span> []</span>
|
||
<span id="cb9-18"><a href="#cb9-18" aria-hidden="true" tabindex="-1"></a> <span class="cf">for</span> resp, original <span class="kw">in</span> <span class="bu">zip</span>(responses, log_lines):</span>
|
||
<span id="cb9-19"><a href="#cb9-19" aria-hidden="true" tabindex="-1"></a> result <span class="op">=</span> <span class="cf">await</span> resp.json()</span>
|
||
<span id="cb9-20"><a href="#cb9-20" aria-hidden="true" tabindex="-1"></a> sanitized.append(result.get(<span class="st">"redacted_preview"</span>, original))</span>
|
||
<span id="cb9-21"><a href="#cb9-21" aria-hidden="true" tabindex="-1"></a></span>
|
||
<span id="cb9-22"><a href="#cb9-22" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> sanitized</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||
<hr>
|
||
</section>
|
||
</section>
|
||
<section id="performance-characteristics" class="level2" data-number="6">
|
||
<h2 data-number="6" class="anchored" data-anchor-id="performance-characteristics"><span class="header-section-number">6</span> Performance Characteristics</h2>
|
||
<section id="benchmarks" class="level3" data-number="6.1">
|
||
<h3 data-number="6.1" class="anchored" data-anchor-id="benchmarks"><span class="header-section-number">6.1</span> Benchmarks</h3>
|
||
<table class="caption-top table">
|
||
<colgroup>
|
||
<col style="width: 29%">
|
||
<col style="width: 25%">
|
||
<col style="width: 44%">
|
||
</colgroup>
|
||
<thead>
|
||
<tr class="header">
|
||
<th>Metric</th>
|
||
<th>Value</th>
|
||
<th>Conditions</th>
|
||
</tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr class="odd">
|
||
<td><strong>Throughput</strong></td>
|
||
<td>~10,000 chars/sec</td>
|
||
<td>Single-threaded, all layers enabled</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td><strong>Latency (P50)</strong></td>
|
||
<td>&lt;50ms</td>
|
||
<td>1KB text input</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td><strong>Latency (P99)</strong></td>
|
||
<td>&lt;200ms</td>
|
||
<td>10KB text input</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td><strong>Memory Usage</strong></td>
|
||
<td>&lt;100MB</td>
|
||
<td>Per-request peak</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td><strong>Startup Time</strong></td>
|
||
<td>&lt;2 seconds</td>
|
||
<td>Cold start with pattern compilation</td>
|
||
</tr>
|
||
</tbody>
|
||
</table>
|
||
</section>
|
||
<section id="scalability" class="level3" data-number="6.2">
|
||
<h3 data-number="6.2" class="anchored" data-anchor-id="scalability"><span class="header-section-number">6.2</span> Scalability</h3>
|
||
<p>The Privacy Scanner is designed for horizontal scalability:</p>
|
||
<ul>
|
||
<li><strong>Stateless Architecture</strong>: Any instance can handle any request</li>
|
||
<li><strong>No Shared State</strong>: No database or cache dependencies for scan operations</li>
|
||
<li><strong>Container-Ready</strong>: Single-process model ideal for Kubernetes</li>
|
||
<li><strong>Load Balancer Compatible</strong>: Round-robin distribution works optimally</li>
|
||
</ul>
|
||
<div class="sourceCode" id="cb10"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb10-1"><a href="#cb10-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Kubernetes HPA example</span></span>
|
||
<span id="cb10-2"><a href="#cb10-2" aria-hidden="true" tabindex="-1"></a><span class="fu">apiVersion</span><span class="kw">:</span><span class="at"> autoscaling/v2</span></span>
|
||
<span id="cb10-3"><a href="#cb10-3" aria-hidden="true" tabindex="-1"></a><span class="fu">kind</span><span class="kw">:</span><span class="at"> HorizontalPodAutoscaler</span></span>
|
||
<span id="cb10-4"><a href="#cb10-4" aria-hidden="true" tabindex="-1"></a><span class="fu">metadata</span><span class="kw">:</span></span>
|
||
<span id="cb10-5"><a href="#cb10-5" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">name</span><span class="kw">:</span><span class="at"> privacy-scanner</span></span>
|
||
<span id="cb10-6"><a href="#cb10-6" aria-hidden="true" tabindex="-1"></a><span class="fu">spec</span><span class="kw">:</span></span>
|
||
<span id="cb10-7"><a href="#cb10-7" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">scaleTargetRef</span><span class="kw">:</span></span>
|
||
<span id="cb10-8"><a href="#cb10-8" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">apiVersion</span><span class="kw">:</span><span class="at"> apps/v1</span></span>
|
||
<span id="cb10-9"><a href="#cb10-9" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">kind</span><span class="kw">:</span><span class="at"> Deployment</span></span>
|
||
<span id="cb10-10"><a href="#cb10-10" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">name</span><span class="kw">:</span><span class="at"> privacy-scanner</span></span>
|
||
<span id="cb10-11"><a href="#cb10-11" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">minReplicas</span><span class="kw">:</span><span class="at"> </span><span class="dv">2</span></span>
|
||
<span id="cb10-12"><a href="#cb10-12" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">maxReplicas</span><span class="kw">:</span><span class="at"> </span><span class="dv">20</span></span>
|
||
<span id="cb10-13"><a href="#cb10-13" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">metrics</span><span class="kw">:</span></span>
|
||
<span id="cb10-14"><a href="#cb10-14" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> </span><span class="fu">type</span><span class="kw">:</span><span class="at"> Resource</span></span>
|
||
<span id="cb10-15"><a href="#cb10-15" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">resource</span><span class="kw">:</span></span>
|
||
<span id="cb10-16"><a href="#cb10-16" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">name</span><span class="kw">:</span><span class="at"> cpu</span></span>
|
||
<span id="cb10-17"><a href="#cb10-17" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">target</span><span class="kw">:</span></span>
|
||
<span id="cb10-18"><a href="#cb10-18" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">type</span><span class="kw">:</span><span class="at"> Utilization</span></span>
|
||
<span id="cb10-19"><a href="#cb10-19" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">averageUtilization</span><span class="kw">:</span><span class="at"> </span><span class="dv">70</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||
<hr>
|
||
</section>
|
||
</section>
|
||
<section id="deployment-options" class="level2" data-number="7">
|
||
<h2 data-number="7" class="anchored" data-anchor-id="deployment-options"><span class="header-section-number">7</span> Deployment Options</h2>
|
||
<section id="on-premises" class="level3" data-number="7.1">
|
||
<h3 data-number="7.1" class="anchored" data-anchor-id="on-premises"><span class="header-section-number">7.1</span> On-Premises</h3>
|
||
<p>For maximum data sovereignty:</p>
|
||
<div class="sourceCode" id="cb11"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb11-1"><a href="#cb11-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Docker deployment</span></span>
|
||
<span id="cb11-2"><a href="#cb11-2" aria-hidden="true" tabindex="-1"></a><span class="ex">docker</span> run <span class="at">-d</span> <span class="dt">\</span></span>
|
||
<span id="cb11-3"><a href="#cb11-3" aria-hidden="true" tabindex="-1"></a> <span class="at">--name</span> privacy-scanner <span class="dt">\</span></span>
|
||
<span id="cb11-4"><a href="#cb11-4" aria-hidden="true" tabindex="-1"></a> <span class="at">-p</span> 8000:8000 <span class="dt">\</span></span>
|
||
<span id="cb11-5"><a href="#cb11-5" aria-hidden="true" tabindex="-1"></a> <span class="at">--memory</span><span class="op">=</span>512m <span class="dt">\</span></span>
|
||
<span id="cb11-6"><a href="#cb11-6" aria-hidden="true" tabindex="-1"></a> <span class="at">--cpus</span><span class="op">=</span>1 <span class="dt">\</span></span>
|
||
<span id="cb11-7"><a href="#cb11-7" aria-hidden="true" tabindex="-1"></a> privacy-scanner:latest</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||
<p><strong>Benefits:</strong></p>
<ul>
<li>Data never leaves your network</li>
<li>Full control over infrastructure</li>
<li>No external dependencies</li>
</ul>
|
||
</section>
|
||
<section id="private-cloud-vpc" class="level3" data-number="7.2">
|
||
<h3 data-number="7.2" class="anchored" data-anchor-id="private-cloud-vpc"><span class="header-section-number">7.2</span> Private Cloud (VPC)</h3>
|
||
<div class="sourceCode" id="cb12"><pre class="sourceCode terraform code-with-copy"><code class="sourceCode terraform"><span id="cb12-1"><a href="#cb12-1" aria-hidden="true" tabindex="-1"></a><span class="co"># AWS VPC deployment example</span></span>
|
||
<span id="cb12-2"><a href="#cb12-2" aria-hidden="true" tabindex="-1"></a><span class="kw">resource</span> <span class="st">"aws_ecs_service"</span> <span class="st">"privacy_scanner"</span> {</span>
|
||
<span id="cb12-3"><a href="#cb12-3" aria-hidden="true" tabindex="-1"></a> name <span class="op">=</span> <span class="st">"privacy-scanner"</span></span>
|
||
<span id="cb12-4"><a href="#cb12-4" aria-hidden="true" tabindex="-1"></a> cluster <span class="op">=</span> aws_ecs_cluster.main.id</span>
|
||
<span id="cb12-5"><a href="#cb12-5" aria-hidden="true" tabindex="-1"></a> task_definition <span class="op">=</span> aws_ecs_task_definition.privacy_scanner.arn</span>
|
||
<span id="cb12-6"><a href="#cb12-6" aria-hidden="true" tabindex="-1"></a> desired_count <span class="op">=</span> <span class="dv">2</span></span>
|
||
<span id="cb12-7"><a href="#cb12-7" aria-hidden="true" tabindex="-1"></a></span>
|
||
<span id="cb12-8"><a href="#cb12-8" aria-hidden="true" tabindex="-1"></a> network_configuration {</span>
|
||
<span id="cb12-9"><a href="#cb12-9" aria-hidden="true" tabindex="-1"></a> subnets <span class="op">=</span> aws_subnet.private[<span class="op">*</span>].id</span>
|
||
<span id="cb12-10"><a href="#cb12-10" aria-hidden="true" tabindex="-1"></a> security_groups <span class="op">=</span> [aws_security_group.privacy_scanner.id]</span>
|
||
<span id="cb12-11"><a href="#cb12-11" aria-hidden="true" tabindex="-1"></a> assign_public_ip <span class="op">=</span> <span class="va">false</span> <span class="co"># No public access</span></span>
|
||
<span id="cb12-12"><a href="#cb12-12" aria-hidden="true" tabindex="-1"></a> }</span>
|
||
<span id="cb12-13"><a href="#cb12-13" aria-hidden="true" tabindex="-1"></a>}</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||
<p><strong>Benefits:</strong></p>
<ul>
<li>Network isolation via VPC</li>
<li>Integration with cloud IAM</li>
<li>Auto-scaling capabilities</li>
</ul>
|
||
</section>
|
||
<section id="air-gapped-deployment" class="level3" data-number="7.3">
|
||
<h3 data-number="7.3" class="anchored" data-anchor-id="air-gapped-deployment"><span class="header-section-number">7.3</span> Air-Gapped Deployment</h3>
|
||
<p>For highly restricted environments:</p>
|
||
<ol type="1">
|
||
<li><strong>Client-Side Redaction Mode</strong>: Backend only returns coordinates</li>
|
||
<li><strong>No Outbound Connections</strong>: Zero external API calls</li>
|
||
<li><strong>Offline Pattern Updates</strong>: Manual pattern file updates</li>
|
||
<li><strong>Local-Only Logging</strong>: No telemetry or metrics export</li>
|
||
</ol>
|
||
<hr>
|
||
</section>
|
||
</section>
|
||
<section id="security-hardening-checklist" class="level2" data-number="8">
|
||
<h2 data-number="8" class="anchored" data-anchor-id="security-hardening-checklist"><span class="header-section-number">8</span> Security Hardening Checklist</h2>
|
||
<section id="pre-deployment" class="level3" data-number="8.1">
|
||
<h3 data-number="8.1" class="anchored" data-anchor-id="pre-deployment"><span class="header-section-number">8.1</span> Pre-Deployment</h3>
|
||
<ul class="task-list">
|
||
<li><label><input type="checkbox">Enable TLS 1.3 with strong cipher suites</label></li>
|
||
<li><label><input type="checkbox">Configure rate limiting (recommend: 100 req/min per IP)</label></li>
|
||
<li><label><input type="checkbox">Set up authentication (API keys or OAuth 2.0)</label></li>
|
||
<li><label><input type="checkbox">Review and customize PII patterns for your use case</label></li>
|
||
<li><label><input type="checkbox">Configure PII-safe logging</label></li>
|
||
<li><label><input type="checkbox">Set appropriate request size limits (default: 10MB)</label></li>
|
||
</ul>
|
||
</section>
|
||
<section id="runtime" class="level3" data-number="8.2">
|
||
<h3 data-number="8.2" class="anchored" data-anchor-id="runtime"><span class="header-section-number">8.2</span> Runtime</h3>
|
||
<ul class="task-list">
|
||
<li><label><input type="checkbox">Monitor for unusual request patterns</label></li>
|
||
<li><label><input type="checkbox">Set up alerting on high PII detection rates</label></li>
|
||
<li><label><input type="checkbox">Implement request timeout (default: 30 seconds)</label></li>
|
||
<li><label><input type="checkbox">Enable health check endpoints for orchestration</label></li>
|
||
<li><label><input type="checkbox">Configure graceful shutdown handling</label></li>
|
||
</ul>
|
||
</section>
|
||
<section id="audit" class="level3" data-number="8.3">
|
||
<h3 data-number="8.3" class="anchored" data-anchor-id="audit"><span class="header-section-number">8.3</span> Audit</h3>
|
||
<ul class="task-list">
|
||
<li><label><input type="checkbox">Log detection events (without PII content)</label></li>
|
||
<li><label><input type="checkbox">Track API usage metrics</label></li>
|
||
<li><label><input type="checkbox">Periodic pattern effectiveness review</label></li>
|
||
<li><label><input type="checkbox">Regular security scanning of container images</label></li>
|
||
</ul>
|
||
<hr>
|
||
</section>
|
||
</section>
|
||
<section id="appendix-a-api-reference" class="level2" data-number="9">
|
||
<h2 data-number="9" class="anchored" data-anchor-id="appendix-a-api-reference"><span class="header-section-number">9</span> Appendix A: API Reference</h2>
|
||
<section id="scan-text-endpoint" class="level3" data-number="9.1">
|
||
<h3 data-number="9.1" class="anchored" data-anchor-id="scan-text-endpoint"><span class="header-section-number">9.1</span> Scan Text Endpoint</h3>
|
||
<pre><code>POST /api/privacy/scan-text
|
||
Content-Type: multipart/form-data</code></pre>
|
||
<p><strong>Parameters:</strong></p>
|
||
<table class="caption-top table">
|
||
<colgroup>
|
||
<col style="width: 27%">
|
||
<col style="width: 15%">
|
||
<col style="width: 25%">
|
||
<col style="width: 32%">
|
||
</colgroup>
|
||
<thead>
|
||
<tr class="header">
|
||
<th>Parameter</th>
|
||
<th>Type</th>
|
||
<th>Required</th>
|
||
<th>Description</th>
|
||
</tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr class="odd">
|
||
<td><code>text</code></td>
|
||
<td>string</td>
|
||
<td>Yes</td>
|
||
<td>Text content to scan</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td><code>coordinates_only</code></td>
|
||
<td>boolean</td>
|
||
<td>No</td>
|
||
<td>Return only positions (default: false)</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td><code>detect_emails</code></td>
|
||
<td>boolean</td>
|
||
<td>No</td>
|
||
<td>Enable email detection (default: true)</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td><code>detect_phones</code></td>
|
||
<td>boolean</td>
|
||
<td>No</td>
|
||
<td>Enable phone detection (default: true)</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td><code>detect_ssn</code></td>
|
||
<td>boolean</td>
|
||
<td>No</td>
|
||
<td>Enable SSN detection (default: true)</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td><code>detect_credit_cards</code></td>
|
||
<td>boolean</td>
|
||
<td>No</td>
|
||
<td>Enable credit card detection (default: true)</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td><code>detect_secrets</code></td>
|
||
<td>boolean</td>
|
||
<td>No</td>
|
||
<td>Enable secrets detection (default: true)</td>
|
||
</tr>
|
||
</tbody>
|
||
</table>
|
||
<p><strong>Response (Standard Mode):</strong></p>
|
||
<div class="sourceCode" id="cb14"><pre class="sourceCode json code-with-copy"><code class="sourceCode json"><span id="cb14-1"><a href="#cb14-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span></span>
|
||
<span id="cb14-2"><a href="#cb14-2" aria-hidden="true" tabindex="-1"></a> <span class="dt">"entities"</span><span class="fu">:</span> <span class="ot">[</span></span>
|
||
<span id="cb14-3"><a href="#cb14-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">{</span></span>
|
||
<span id="cb14-4"><a href="#cb14-4" aria-hidden="true" tabindex="-1"></a> <span class="dt">"type"</span><span class="fu">:</span> <span class="st">"EMAIL"</span><span class="fu">,</span></span>
|
||
<span id="cb14-5"><a href="#cb14-5" aria-hidden="true" tabindex="-1"></a> <span class="dt">"value"</span><span class="fu">:</span> <span class="st">"john@example.com"</span><span class="fu">,</span></span>
|
||
<span id="cb14-6"><a href="#cb14-6" aria-hidden="true" tabindex="-1"></a> <span class="dt">"masked_value"</span><span class="fu">:</span> <span class="st">"[EMAIL:j***@example.com]"</span><span class="fu">,</span></span>
|
||
<span id="cb14-7"><a href="#cb14-7" aria-hidden="true" tabindex="-1"></a> <span class="dt">"start"</span><span class="fu">:</span> <span class="dv">15</span><span class="fu">,</span></span>
|
||
<span id="cb14-8"><a href="#cb14-8" aria-hidden="true" tabindex="-1"></a> <span class="dt">"end"</span><span class="fu">:</span> <span class="dv">31</span><span class="fu">,</span></span>
|
||
<span id="cb14-9"><a href="#cb14-9" aria-hidden="true" tabindex="-1"></a> <span class="dt">"confidence"</span><span class="fu">:</span> <span class="fl">0.95</span><span class="fu">,</span></span>
|
||
<span id="cb14-10"><a href="#cb14-10" aria-hidden="true" tabindex="-1"></a> <span class="dt">"category"</span><span class="fu">:</span> <span class="st">"pii"</span></span>
|
||
<span id="cb14-11"><a href="#cb14-11" aria-hidden="true" tabindex="-1"></a> <span class="fu">}</span></span>
|
||
<span id="cb14-12"><a href="#cb14-12" aria-hidden="true" tabindex="-1"></a> <span class="ot">]</span><span class="fu">,</span></span>
|
||
<span id="cb14-13"><a href="#cb14-13" aria-hidden="true" tabindex="-1"></a> <span class="dt">"redacted_preview"</span><span class="fu">:</span> <span class="st">"Contact: [EMAIL:j***@example.com] for info"</span><span class="fu">,</span></span>
|
||
<span id="cb14-14"><a href="#cb14-14" aria-hidden="true" tabindex="-1"></a> <span class="dt">"summary"</span><span class="fu">:</span> <span class="fu">{</span></span>
|
||
<span id="cb14-15"><a href="#cb14-15" aria-hidden="true" tabindex="-1"></a> <span class="dt">"total_entities"</span><span class="fu">:</span> <span class="dv">1</span><span class="fu">,</span></span>
|
||
<span id="cb14-16"><a href="#cb14-16" aria-hidden="true" tabindex="-1"></a> <span class="dt">"by_category"</span><span class="fu">:</span> <span class="fu">{</span><span class="dt">"pii"</span><span class="fu">:</span> <span class="dv">1</span><span class="fu">},</span></span>
|
||
<span id="cb14-17"><a href="#cb14-17" aria-hidden="true" tabindex="-1"></a> <span class="dt">"risk_level"</span><span class="fu">:</span> <span class="st">"medium"</span></span>
|
||
<span id="cb14-18"><a href="#cb14-18" aria-hidden="true" tabindex="-1"></a> <span class="fu">}</span></span>
|
||
<span id="cb14-19"><a href="#cb14-19" aria-hidden="true" tabindex="-1"></a><span class="fu">}</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||
<p><strong>Response (Coordinates-Only Mode):</strong></p>
|
||
<div class="sourceCode" id="cb15"><pre class="sourceCode json code-with-copy"><code class="sourceCode json"><span id="cb15-1"><a href="#cb15-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span></span>
|
||
<span id="cb15-2"><a href="#cb15-2" aria-hidden="true" tabindex="-1"></a> <span class="dt">"entities"</span><span class="fu">:</span> <span class="ot">[</span></span>
|
||
<span id="cb15-3"><a href="#cb15-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">{</span></span>
|
||
<span id="cb15-4"><a href="#cb15-4" aria-hidden="true" tabindex="-1"></a> <span class="dt">"type"</span><span class="fu">:</span> <span class="st">"EMAIL"</span><span class="fu">,</span></span>
|
||
<span id="cb15-5"><a href="#cb15-5" aria-hidden="true" tabindex="-1"></a> <span class="dt">"start"</span><span class="fu">:</span> <span class="dv">15</span><span class="fu">,</span></span>
|
||
<span id="cb15-6"><a href="#cb15-6" aria-hidden="true" tabindex="-1"></a> <span class="dt">"end"</span><span class="fu">:</span> <span class="dv">31</span><span class="fu">,</span></span>
|
||
<span id="cb15-7"><a href="#cb15-7" aria-hidden="true" tabindex="-1"></a> <span class="dt">"length"</span><span class="fu">:</span> <span class="dv">16</span></span>
|
||
<span id="cb15-8"><a href="#cb15-8" aria-hidden="true" tabindex="-1"></a> <span class="fu">}</span></span>
|
||
<span id="cb15-9"><a href="#cb15-9" aria-hidden="true" tabindex="-1"></a> <span class="ot">]</span><span class="fu">,</span></span>
|
||
<span id="cb15-10"><a href="#cb15-10" aria-hidden="true" tabindex="-1"></a> <span class="dt">"coordinates_only"</span><span class="fu">:</span> <span class="kw">true</span></span>
|
||
<span id="cb15-11"><a href="#cb15-11" aria-hidden="true" tabindex="-1"></a><span class="fu">}</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||
<hr>
|
||
</section>
|
||
</section>
|
||
<section id="appendix-b-confidence-scoring" class="level2" data-number="10">
|
||
<h2 data-number="10" class="anchored" data-anchor-id="appendix-b-confidence-scoring"><span class="header-section-number">10</span> Appendix B: Confidence Scoring</h2>
|
||
<table class="caption-top table">
|
||
<thead>
|
||
<tr class="header">
|
||
<th>Confidence Level</th>
|
||
<th>Score Range</th>
|
||
<th>Meaning</th>
|
||
</tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr class="odd">
|
||
<td><strong>Very High</strong></td>
|
||
<td>0.95 - 1.00</td>
|
||
<td>Checksum validated (Luhn, MOD-97)</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td><strong>High</strong></td>
|
||
<td>0.85 - 0.94</td>
|
||
<td>Strong pattern match with context</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td><strong>Medium</strong></td>
|
||
<td>0.70 - 0.84</td>
|
||
<td>Pattern match, limited context</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td><strong>Low</strong></td>
|
||
<td>0.50 - 0.69</td>
|
||
<td>Possible match, needs review</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td><strong>Uncertain</strong></td>
|
||
<td>&lt; 0.50</td>
|
||
<td>Flagged for manual review</td>
|
||
</tr>
|
||
</tbody>
|
||
</table>
|
||
<p><strong>Confidence Adjustments:</strong></p>
|
||
<ul>
|
||
<li><strong>+15%</strong>: Checksum validation passed</li>
|
||
<li><strong>+10%</strong>: Contextual keywords present (e.g., “SSN:”, “card number”)</li>
|
||
<li><strong>-30%</strong>: Anti-context detected (e.g., “order number”, “reference ID”)</li>
|
||
<li><strong>-20%</strong>: Common false positive pattern (UUID format, connection string)</li>
|
||
</ul>
|
||
<hr>
|
||
</section>
|
||
<section id="appendix-c-version-history" class="level2" data-number="11">
|
||
<h2 data-number="11" class="anchored" data-anchor-id="appendix-c-version-history"><span class="header-section-number">11</span> Appendix C: Version History</h2>
|
||
<table class="caption-top table">
|
||
<colgroup>
|
||
<col style="width: 37%">
|
||
<col style="width: 25%">
|
||
<col style="width: 37%">
|
||
</colgroup>
|
||
<thead>
|
||
<tr class="header">
|
||
<th>Version</th>
|
||
<th>Date</th>
|
||
<th>Changes</th>
|
||
</tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr class="odd">
|
||
<td><strong>1.1</strong></td>
|
||
<td>2024-12-23</td>
|
||
<td>Added international IDs (UK NI, Canadian SIN, India Aadhaar/PAN, etc.), cloud tokens (OpenAI, Anthropic, Discord), crypto addresses, financial identifiers (CUSIP, ISIN), improved false positive filtering</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td><strong>1.0</strong></td>
|
||
<td>2024-12-20</td>
|
||
<td>Initial release with 30+ PII types, 8-layer detection pipeline</td>
|
||
</tr>
|
||
</tbody>
|
||
</table>
|
||
<hr>
|
||
</section>
|
||
<section id="contact-support" class="level2" data-number="12">
|
||
<h2 data-number="12" class="anchored" data-anchor-id="contact-support"><span class="header-section-number">12</span> Contact &amp; Support</h2>
|
||
<p>For enterprise licensing, custom integrations, or security assessments:</p>
|
||
<ul>
|
||
<li><strong>Documentation</strong>: See <code>privacy-scanner-overview.qmd</code> and <code>building-privacy-scanner.qmd</code></li>
|
||
<li><strong>Issues</strong>: Report via your organization’s support channel</li>
|
||
<li><strong>Updates</strong>: Pattern updates released quarterly</li>
|
||
</ul>
|
||
<hr>
|
||
<p><em>This document is intended for enterprise security and compliance teams evaluating the Privacy Scanner for production deployment. All technical specifications are subject to change. Please refer to the latest documentation for current capabilities.</em></p>
|
||
</section>
|
||
|
||
</main>
|
||
<!-- /main column -->
|
||
<script id="quarto-html-after-body" type="application/javascript">
|
||
window.document.addEventListener("DOMContentLoaded", function (event) {
|
||
/**
 * Mirror a stylesheet's color mode onto <body>.
 *
 * Reads the `data-mode` attribute of the given element and leaves exactly one
 * of the `quarto-dark` / `quarto-light` classes set on <body>.
 *
 * @param {Element} bsSheetEl - element carrying the `data-mode` attribute.
 */
const toggleBodyColorMode = (bsSheetEl) => {
  const mode = bsSheetEl.getAttribute("data-mode");
  const bodyEl = window.document.querySelector("body");
  if (mode === "dark") {
    bodyEl.classList.add("quarto-dark");
    bodyEl.classList.remove("quarto-light");
  } else {
    // Any value other than "dark" (including a missing attribute) is treated as light.
    bodyEl.classList.add("quarto-light");
    bodyEl.classList.remove("quarto-dark");
  }
}
|
||
// Apply the color mode of the `link#quarto-bootstrap` stylesheet to <body>.
// Does nothing when that link element is not present.
const toggleBodyColorPrimary = () => {
  const bsSheetEl = window.document.querySelector("link#quarto-bootstrap");
  if (bsSheetEl) {
    toggleBodyColorMode(bsSheetEl);
  }
}
|
||
// Sync <body> with the stylesheet's color mode once, at load time.
toggleBodyColorPrimary();
// Add AnchorJS links to every `.anchored` element, placed on the right.
// NOTE(review): the icon string is empty as it appears here; if a glyph was
// intended it may have been lost in transit — confirm against the generator.
const icon = "";
const anchorJS = new window.AnchorJS();
anchorJS.options = {
  placement: 'right',
  icon: icon
};
anchorJS.add('.anchored');
|
||
/**
 * Tell whether an element is a code-annotation marker.
 *
 * @param {Element} el - element whose class list is inspected.
 * @returns {boolean} true when any class name starts with 'code-annotation-'.
 */
const isCodeAnnotation = (el) => {
  for (const clz of el.classList) {
    if (clz.startsWith('code-annotation-')) {
      return true;
    }
  }
  return false;
}
|
||
/**
 * Clipboard 'success' handler: flash "Copied!" feedback on the copy button
 * and restore its previous title one second later.
 *
 * @param {object} e - success event; `e.trigger` is the clicked button and
 *   `e.clearSelection()` is invoked before returning.
 */
const onCopySuccess = function(e) {
  // button target
  const button = e.trigger;
  // don't keep focus
  button.blur();
  // flash "checked"
  button.classList.add('code-copy-button-checked');
  var currentTitle = button.getAttribute("title");
  button.setAttribute("title", "Copied!");
  let tooltip;
  // A tooltip is only shown when `window.bootstrap` is available.
  if (window.bootstrap) {
    button.setAttribute("data-bs-toggle", "tooltip");
    button.setAttribute("data-bs-placement", "left");
    button.setAttribute("data-bs-title", "Copied!");
    tooltip = new bootstrap.Tooltip(button,
      { trigger: "manual",
        customClass: "code-copy-button-tooltip",
        offset: [0, -8]});
    tooltip.show();
  }
  // Undo the visual feedback after 1000 ms.
  setTimeout(function() {
    if (tooltip) {
      tooltip.hide();
      button.removeAttribute("data-bs-title");
      button.removeAttribute("data-bs-toggle");
      button.removeAttribute("data-bs-placement");
    }
    button.setAttribute("title", currentTitle);
    button.classList.remove('code-copy-button-checked');
  }, 1000);
  // clear code selection
  e.clearSelection();
}
|
||
/**
 * Produce the text a copy button should place on the clipboard.
 *
 * Clones the element immediately preceding the button, drops every direct
 * child that is a code-annotation marker, and returns the remaining text.
 *
 * @param {Element} trigger - the copy button that was activated.
 * @returns {string} `innerText` of the cleaned clone.
 */
const getTextToCopy = function(trigger) {
  const codeEl = trigger.previousElementSibling.cloneNode(true);
  // Iterate a snapshot: `children` is a live collection, so removing elements
  // while iterating it directly skips the sibling after each removal.
  for (const childEl of Array.from(codeEl.children)) {
    if (isCodeAnnotation(childEl)) {
      childEl.remove();
    }
  }
  return codeEl.innerText;
}
|
||
// Wire ClipboardJS to every copy button that is not inside the embedded
// source-code modal; the copied text comes from getTextToCopy.
const clipboard = new window.ClipboardJS('.code-copy-button:not([data-in-quarto-modal])', {
  text: getTextToCopy
});
clipboard.on('success', onCopySuccess);
if (window.document.getElementById('quarto-embedded-source-code-modal')) {
  // For code content inside modals, clipBoardJS needs to be initialized with a container option
  // TODO: Check when it could be a function (https://github.com/zenorocha/clipboard.js/issues/860)
  const clipboardModal = new window.ClipboardJS('.code-copy-button[data-in-quarto-modal]', {
    text: getTextToCopy,
    container: window.document.getElementById('quarto-embedded-source-code-modal')
  });
  clipboardModal.on('success', onCopySuccess);
}
|
||
// URL classifiers consulted by isInternal().
// Matches http(s)://localhost with an optional port, followed by a slash.
var localhostRegex = new RegExp(/^(?:http|https):\/\/localhost\:?[0-9]*\//);
var mailtoRegex = new RegExp(/^mailto:/);
// Matches "/<current host>/" anywhere in the URL. The host is escaped so it is
// matched literally; unescaped, the dots in a hostname would match any
// character (e.g. "a.example.com" would also match "aXexample.com").
var filterRegex = new RegExp('/' + window.location.host.replace(/[.*+?^${}()|[\]\\]/g, '\\$&') + '/');
|
||
/**
 * Classify a link target as internal.
 *
 * @param {string} href - URL to test.
 * @returns {boolean} true when the URL matches the current-host pattern, the
 *   localhost pattern, or is a `mailto:` link.
 */
var isInternal = (href) => {
  return filterRegex.test(href) || localhostRegex.test(href) || mailtoRegex.test(href);
}
|
||
// Inspect non-navigation links and adorn them if external
var links = window.document.querySelectorAll('a[href]:not(.nav-link):not(.navbar-brand):not(.toc-action):not(.sidebar-link):not(.sidebar-item-toggle):not(.pagination-link):not(.no-external):not([aria-hidden]):not(.dropdown-item):not(.quarto-navigation-tool):not(.about-link)');
for (var i=0; i<links.length; i++) {
  const link = links[i];
  if (!isInternal(link.href)) {
    // undo the damage that might have been done by quarto-nav.js in the case of
    // links that we want to consider external
    if (link.dataset.originalHref !== undefined) {
      link.href = link.dataset.originalHref;
    }
  }
}
|
||
/**
 * Attach a tippy hover popup to an element using the shared 'quarto' theme.
 *
 * @param {Element} el - element the popup is attached to.
 * @param {Function} [contentFn] - passed as tippy's `content` when truthy.
 * @param {Function} [onTriggerFn] - passed as tippy's `onTrigger` when truthy.
 * @param {Function} [onUntriggerFn] - passed as tippy's `onUntrigger` when truthy.
 */
function tippyHover(el, contentFn, onTriggerFn, onUntriggerFn) {
  const config = {
    allowHTML: true,
    maxWidth: 500,
    delay: 100,
    arrow: false,
    // Render the popup inside the reference element's parent.
    appendTo: function(el) {
      return el.parentElement;
    },
    interactive: true,
    interactiveBorder: 10,
    theme: 'quarto',
    placement: 'bottom-start',
  };
  // Optional callbacks are only set when supplied, so the keys stay absent otherwise.
  if (contentFn) {
    config.content = contentFn;
  }
  if (onTriggerFn) {
    config.onTrigger = onTriggerFn;
  }
  if (onUntriggerFn) {
    config.onUntrigger = onUntriggerFn;
  }
  window.tippy(el, config);
}
|
||
// Footnote references: on hover, show the HTML of the footnote they point to.
const noterefs = window.document.querySelectorAll('a[role="doc-noteref"]');
for (var i=0; i<noterefs.length; i++) {
  const ref = noterefs[i];
  tippyHover(ref, function() {
    // use id or data attribute instead here
    let href = ref.getAttribute('data-footnote-href') || ref.getAttribute('href');
    // Absolute URLs are reduced to their hash; anything URL() rejects
    // (e.g. a bare "#fn1") is kept unchanged.
    try { href = new URL(href).hash; } catch {}
    const id = href.replace(/^#\/?/, "");
    const note = window.document.getElementById(id);
    if (note) {
      return note.innerHTML;
    } else {
      return "";
    }
  });
}
|
||
// All cross-reference links (`a.quarto-xref`) in the document.
const xrefs = window.document.querySelectorAll('a.quarto-xref');
|
||
/**
 * Build the popup HTML for a cross-reference target.
 *
 * The passed node is modified in place (column classes and any AnchorJS link
 * are removed), so callers pass a clone or a node from a parsed document.
 *
 * @param {string|null} id - target id; null or an id starting with 'sec-' is
 *   treated as a section and summarized.
 * @param {Element} note - the target element.
 * @returns {string} HTML for the popup.
 */
const processXRef = (id, note) => {
  // Strip column container classes
  const stripColumnClz = (el) => {
    el.classList.remove("page-full", "page-columns");
    if (el.children) {
      for (const child of el.children) {
        stripColumnClz(child);
      }
    }
  }
  stripColumnClz(note);
  if (id === null || id.startsWith('sec-')) {
    // Special case sections, only their first couple elements
    const container = document.createElement("div");
    if (note.children && note.children.length > 2) {
      // Keep the first child plus the first following child that is not an empty <p>.
      container.appendChild(note.children[0].cloneNode(true));
      for (let i = 1; i < note.children.length; i++) {
        const child = note.children[i];
        if (child.tagName === "P" && child.innerText === "") {
          continue;
        } else {
          container.appendChild(child.cloneNode(true));
          break;
        }
      }
      if (window.Quarto?.typesetMath) {
        window.Quarto.typesetMath(container);
      }
      return container.innerHTML
    } else {
      if (window.Quarto?.typesetMath) {
        window.Quarto.typesetMath(note);
      }
      return note.innerHTML;
    }
  } else {
    // Remove any anchor links if they are present
    const anchorLink = note.querySelector('a.anchorjs-link');
    if (anchorLink) {
      anchorLink.remove();
    }
    if (window.Quarto?.typesetMath) {
      window.Quarto.typesetMath(note);
    }
    // TODO in 1.5, we should make sure this works without a callout special case
    if (note.classList.contains("callout")) {
      return note.outerHTML;
    } else {
      return note.innerHTML;
    }
  }
}
|
||
// Cross-reference hover popups. The content is resolved when the popup is
// triggered: from this document when the target id exists here, otherwise by
// fetching the linked page and reading the target from the parsed response.
for (var i=0; i<xrefs.length; i++) {
  const xref = xrefs[i];
  tippyHover(xref, undefined, function(instance) {
    // Keep the popup disabled until content has been resolved.
    instance.disable();
    let url = xref.getAttribute('href');
    let hash = undefined;
    if (url.startsWith('#')) {
      hash = url;
    } else {
      try { hash = new URL(url).hash; } catch {}
    }
    if (hash) {
      const id = hash.replace(/^#\/?/, "");
      const note = window.document.getElementById(id);
      if (note !== null) {
        try {
          // processXRef modifies the node it receives, so hand it a clone.
          const html = processXRef(id, note.cloneNode(true));
          instance.setContent(html);
        } finally {
          instance.enable();
          instance.show();
        }
      } else {
        // See if we can fetch this
        fetch(url.split('#')[0])
        .then(res => res.text())
        .then(html => {
          const parser = new DOMParser();
          const htmlDoc = parser.parseFromString(html, "text/html");
          const note = htmlDoc.getElementById(id);
          if (note !== null) {
            const html = processXRef(id, note);
            instance.setContent(html);
          }
        }).finally(() => {
          instance.enable();
          instance.show();
        });
      }
    } else {
      // See if we can fetch a full url (with no hash to target)
      // This is a special case and we should probably do some content thinning / targeting
      fetch(url)
      .then(res => res.text())
      .then(html => {
        const parser = new DOMParser();
        const htmlDoc = parser.parseFromString(html, "text/html");
        const note = htmlDoc.querySelector('main.content');
        if (note !== null) {
          // This should only happen for chapter cross references
          // (since there is no id in the URL)
          // remove the first header
          if (note.children.length > 0 && note.children[0].tagName === "HEADER") {
            note.children[0].remove();
          }
          const html = processXRef(null, note);
          instance.setContent(html);
        }
      }).finally(() => {
        instance.enable();
        instance.show();
      });
    }
  }, function(instance) {
  });
}
|
||
// The annotation element whose code lines are currently highlighted, if any.
let selectedAnnoteEl;
/**
 * Build the CSS selector for the span tagged with a given code cell and
 * annotation.
 *
 * @param {string} cell - value of the `data-code-cell` attribute.
 * @param {string} annotation - value of the `data-code-annotation` attribute.
 * @returns {string} selector of the form
 *   `span[data-code-cell="…"][data-code-annotation="…"]`.
 */
const selectorForAnnotation = ( cell, annotation) => {
  let cellAttr = 'data-code-cell="' + cell + '"';
  let lineAttr = 'data-code-annotation="' + annotation + '"';
  const selector = 'span[' + cellAttr + '][' + lineAttr + ']';
  return selector;
}
|
||
/**
 * Highlight the code lines that belong to an annotation.
 *
 * Looks up the annotation span named by the element's `data-target-cell` and
 * `data-target-annotation` attributes, reads its comma-separated
 * `data-code-lines`, and positions two absolutely-positioned divs (created on
 * first use) over the first-to-last line range: one in the code block and one
 * in the cell's `.code-annotation-gutter`. Records the element as the current
 * selection.
 *
 * @param {Element} annoteEl - annotation element that was activated.
 */
const selectCodeLines = (annoteEl) => {
  const targetCell = annoteEl.getAttribute("data-target-cell");
  const targetAnnotation = annoteEl.getAttribute("data-target-annotation");
  const annoteSpan = window.document.querySelector(selectorForAnnotation(targetCell, targetAnnotation));
  const lines = annoteSpan.getAttribute("data-code-lines").split(",");
  // Line element ids have the form "<cell>-<line>".
  const lineIds = lines.map((line) => {
    return targetCell + "-" + line;
  })
  let top = null;
  let height = null;
  let parent = null;
  if (lineIds.length > 0) {
    //compute the position of the single el (top and bottom and make a div)
    const el = window.document.getElementById(lineIds[0]);
    top = el.offsetTop;
    height = el.offsetHeight;
    parent = el.parentElement.parentElement;
    if (lineIds.length > 1) {
      // Span from the top of the first line to the bottom of the last one.
      const lastEl = window.document.getElementById(lineIds[lineIds.length - 1]);
      const bottom = lastEl.offsetTop + lastEl.offsetHeight;
      height = bottom - top;
    }
    if (top !== null && height !== null && parent !== null) {
      // cook up a div (if necessary) and position it
      let div = window.document.getElementById("code-annotation-line-highlight");
      if (div === null) {
        div = window.document.createElement("div");
        div.setAttribute("id", "code-annotation-line-highlight");
        div.style.position = 'absolute';
        parent.appendChild(div);
      }
      // The highlight extends 2px above and below the line range.
      div.style.top = top - 2 + "px";
      div.style.height = height + 4 + "px";
      div.style.left = 0;
      let gutterDiv = window.document.getElementById("code-annotation-line-highlight-gutter");
      if (gutterDiv === null) {
        gutterDiv = window.document.createElement("div");
        gutterDiv.setAttribute("id", "code-annotation-line-highlight-gutter");
        gutterDiv.style.position = 'absolute';
        const codeCell = window.document.getElementById(targetCell);
        const gutter = codeCell.querySelector('.code-annotation-gutter');
        gutter.appendChild(gutterDiv);
      }
      gutterDiv.style.top = top - 2 + "px";
      gutterDiv.style.height = height + 4 + "px";
    }
    selectedAnnoteEl = annoteEl;
  }
};
|
||
// Remove both highlight divs (code area and gutter) if they exist and clear
// the recorded selection.
const unselectCodeLines = () => {
  const elementsIds = ["code-annotation-line-highlight", "code-annotation-line-highlight-gutter"];
  elementsIds.forEach((elId) => {
    const div = window.document.getElementById(elId);
    if (div) {
      div.remove();
    }
  });
  selectedAnnoteEl = undefined;
};
|
||
// Handle positioning of the toggle
// On window resize, re-run the highlight positioning for the current selection.
window.addEventListener(
  "resize",
  throttle(() => {
    // `elRect` is not declared in this script; write the global explicitly
    // rather than through an undeclared identifier.
    // NOTE(review): nothing in this script reads elRect — confirm whether any
    // other script does before removing the assignment.
    window.elRect = undefined;
    if (selectedAnnoteEl) {
      selectCodeLines(selectedAnnoteEl);
    }
  }, 10)
);
|
||
/**
 * Wrap `fn` so that bursts of calls are collapsed.
 *
 * The very first call runs `fn` immediately. Every later call (re)starts a
 * timer of `ms` milliseconds; when the timer fires, `fn` runs with the
 * arguments of the most recent call and the wrapper is reset, so the next
 * call runs immediately again.
 *
 * @param {Function} fn - function to invoke.
 * @param {number} ms - delay in milliseconds for deferred calls.
 * @returns {Function} the wrapped function.
 */
function throttle(fn, ms) {
  let throttled = false;
  let timer;
  return (...args) => {
    if(!throttled) { // first call gets through
      fn.apply(this, args);
      throttled = true;
    } else { // all the others get throttled
      if(timer) clearTimeout(timer); // cancel #2
      timer = setTimeout(() => {
        fn.apply(this, args);
        timer = throttled = false;
      }, ms);
    }
  };
}
|
||
// Attach click handler to the DT
// Clicking an annotation term toggles the highlight of its code lines; only
// one term is active at a time.
const annoteDls = window.document.querySelectorAll('dt[data-target-cell]');
for (const annoteDlNode of annoteDls) {
  annoteDlNode.addEventListener('click', (event) => {
    // Use the <dt> the listener is bound to: event.target may be a descendant
    // that lacks the data-target-* attributes.
    const clickedEl = event.currentTarget;
    if (clickedEl !== selectedAnnoteEl) {
      unselectCodeLines();
      const activeEl = window.document.querySelector('dt[data-target-cell].code-annotation-active');
      if (activeEl) {
        activeEl.classList.remove('code-annotation-active');
      }
      selectCodeLines(clickedEl);
      clickedEl.classList.add('code-annotation-active');
    } else {
      // Unselect the line
      unselectCodeLines();
      clickedEl.classList.remove('code-annotation-active');
    }
  });
}
|
||
/**
 * Walk up from `el` to the nearest ancestor carrying a `data-cites` attribute.
 *
 * @param {Element} el - starting element.
 * @returns {{el: Element, cites: string[]}|undefined} the child of the
 *   matching ancestor on the path from `el`, together with the
 *   space-separated cite keys; undefined when no ancestor has `data-cites`.
 */
const findCites = (el) => {
  const parentEl = el.parentElement;
  if (parentEl) {
    const cites = parentEl.dataset.cites;
    if (cites) {
      return {
        el,
        cites: cites.split(' ')
      };
    } else {
      return findCites(el.parentElement)
    }
  } else {
    return undefined;
  }
};
|
||
// Citation links: on hover, show the bibliography entries (`#ref-<key>`) for
// every key listed in the enclosing `data-cites` attribute.
var bibliorefs = window.document.querySelectorAll('a[role="doc-biblioref"]');
for (var i=0; i<bibliorefs.length; i++) {
  const ref = bibliorefs[i];
  const citeInfo = findCites(ref);
  if (citeInfo) {
    tippyHover(citeInfo.el, function() {
      var popup = window.document.createElement('div');
      citeInfo.cites.forEach(function(cite) {
        var citeDiv = window.document.createElement('div');
        citeDiv.classList.add('hanging-indent');
        citeDiv.classList.add('csl-entry');
        var biblioDiv = window.document.getElementById('ref-' + cite);
        // A key without a matching entry still contributes an empty div.
        if (biblioDiv) {
          citeDiv.innerHTML = biblioDiv.innerHTML;
        }
        popup.appendChild(citeDiv);
      });
      return popup.innerHTML;
    });
  }
}
|
||
});
|
||
</script>
|
||
</div> <!-- /content -->
|
||
|
||
|
||
|
||
|
||
</body></html> |