""" PII Detector & Scrubber - Format Indonesia ============================================ Deteksi dan mask data pribadi: NIK, KTP, NPWP, No HP, Email, dll. """ import re from typing import Dict, List, Tuple # Daftar kode provinsi valid untuk validasi NIK _VALID_PROVINCE_CODES = { "11", "12", "13", "14", "15", "16", "17", "18", "19", "21", "31", "32", "33", "34", "35", "36", "51", "52", "53", "61", "62", "63", "64", "65", "71", "72", "73", "74", "75", "76", "81", "82", "91", "92", "94", } def _validate_nik(match_str: str) -> bool: """Validasi struktur NIK: kode provinsi + tanggal lahir.""" if len(match_str) != 16: return False province = match_str[:2] if province not in _VALID_PROVINCE_CODES: return False day = int(match_str[6:8]) month = int(match_str[8:10]) # Perempuan: hari + 40 if day > 40: day -= 40 if not (1 <= day <= 31 and 1 <= month <= 12): return False return True # Pattern definitions for Indonesian PII PII_PATTERNS = { "nik": { "pattern": r'\b(\d{16})\b', "label": "NIK/KTP", "description": "Nomor Induk Kependudukan (16 digit)", "validate": _validate_nik, }, "phone": { "pattern": r'(? Dict: """ Deteksi PII dalam teks. Returns: { "has_pii": bool, "findings": list of {type, label, value, position}, "count": int, } """ findings = [] for name, pattern in self.compiled.items(): info = self.patterns[name] validate_fn = info.get("validate") for match in pattern.finditer(text): # Gunakan full match untuk posisi (konsisten dengan scrub) full_start, full_end = match.start(), match.end() value = match.group(1) if match.lastindex else match.group(0) # Jalankan validasi jika ada if validate_fn and not validate_fn(value): continue findings.append({ "type": name, "label": info["label"], "value": value, "position": (full_start, full_end), "description": info["description"], }) return { "has_pii": len(findings) > 0, "findings": findings, "count": len(findings), } def scrub(self, text: str, replacement: str = "[REDACTED]") -> Dict: """ Deteksi dan mask semua PII dalam teks. Returns: { "original": str, "scrubbed": str, "replacements": list of {type, original, replacement}, } """ result = self.detect(text) scrubbed = text replacements = [] # Filter overlapping matches: keep yang lebih panjang by_length = sorted(result["findings"], key=lambda x: x["position"][1] - x["position"][0], reverse=True) filtered = [] for finding in by_length: start, end = finding["position"] overlaps = False for kept in filtered: ks, ke = kept["position"] if start < ke and end > ks: overlaps = True break if not overlaps: filtered.append(finding) # Sort by position (reverse) to replace from end filtered.sort(key=lambda x: x["position"][0], reverse=True) for finding in filtered: start, end = finding["position"] label = f"[{finding['label'].upper()}]" original_text = scrubbed[start:end] scrubbed = scrubbed[:start] + label + scrubbed[end:] replacements.append({ "type": finding["type"], "original": original_text, "replacement": label, }) return { "original": text, "scrubbed": scrubbed, "replacements": replacements, "pii_found": len(replacements) > 0, }