""" Prompt Injection Detector ========================== Deteksi usaha manipulasi/injection terhadap AI model. """ import re from typing import Dict, List # Injection patterns (English + Indonesian) INJECTION_PATTERNS = [ # English injection attempts r"ignore\s+(all\s+)?(previous|prior|above)\s+(instructions?|prompts?|rules?)", r"disregard\s+(all\s+)?(previous|prior)\s+(instructions?|rules?)", r"forget\s+(all\s+)?(previous|your)\s+(instructions?|rules?|training)", r"you\s+are\s+now\s+(?:a|an)\s+\w+", r"act\s+as\s+(?:if|though)\s+you", r"pretend\s+(you\s+are|to\s+be)", r"new\s+instruction[s]?\s*:", r"system\s+prompt\s*:", r"override\s+(all\s+)?instructions?", r"do\s+not\s+follow\s+(any|your)\s+(rules?|instructions?)", r"jailbreak", r"DAN\s+mode", r"developer\s+mode", # Indonesian injection attempts r"abaikan\s+(semua\s+)?(instruksi|perintah|aturan)", r"lupakan\s+(semua\s+)?(instruksi|perintah|aturan)", r"kamu\s+sekarang\s+adalah", r"berpura[\-\s]?pura\s+(menjadi|jadi|sebagai)", r"instruksi\s+baru\s*:", r"perintah\s+baru\s*:", r"langgar\s+(semua\s+)?aturan", r"jangan\s+ikuti\s+(aturan|instruksi)", r"ubah\s+(peran|perilaku|karakter)\s*(mu|kamu)", ] # Role hijacking patterns ROLE_HIJACK_PATTERNS = [ r"(you\s+are|kamu\s+adalah)\s+(evil|jahat|dangerous|berbahaya)", r"(respond|jawab)\s+(as|sebagai)\s+(villain|penjahat|hacker)", r"mode\s+(tanpa\s+filter|unfiltered|uncensored|bebas)", r"(tanpa|tanpa\s+ada|hilangkan)\s+(batasan|filter|sensor|aturan)", ] # Data exfiltration patterns EXFIL_PATTERNS = [ r"(tampilkan|show|reveal|tunjukkan)\s+(system\s+prompt|instruksi\s+sistem)", r"(apa|what\s+is)\s+(system\s+prompt|instruksi\s+awal)(mu|you)?", r"(repeat|ulangi)\s+(your|semua)\s+(instructions?|instruksi)", r"(ceritakan|beritahu)\s+(semua\s+)?(aturan|instruksi)\s+(rahasia|tersembunyi)", ] class InjectionDetector: """Deteksi prompt injection attacks.""" def __init__(self, sensitivity: str = "medium"): self.sensitivity = sensitivity self.injection_patterns = [re.compile(p, re.IGNORECASE) for p in INJECTION_PATTERNS] self.role_patterns = [re.compile(p, re.IGNORECASE) for p in ROLE_HIJACK_PATTERNS] self.exfil_patterns = [re.compile(p, re.IGNORECASE) for p in EXFIL_PATTERNS] def check(self, text: str) -> Dict: """ Cek teks untuk prompt injection. Returns: { "safe": bool, "score": float (0-1), "violations": list, "injection_type": str or None, } """ violations = [] score = 0.0 injection_type = None # 1. Check injection patterns for pattern in self.injection_patterns: if pattern.search(text): score += 0.5 injection_type = "prompt_injection" violations.append({ "type": "prompt_injection", "severity": "high", "detail": "Pola prompt injection terdeteksi", }) break # 2. Check role hijacking for pattern in self.role_patterns: if pattern.search(text): score += 0.4 injection_type = injection_type or "role_hijack" violations.append({ "type": "role_hijack", "severity": "high", "detail": "Usaha mengubah peran/karakter AI terdeteksi", }) break # 3. Check data exfiltration for pattern in self.exfil_patterns: if pattern.search(text): score += 0.3 injection_type = injection_type or "data_exfiltration" violations.append({ "type": "data_exfiltration", "severity": "medium", "detail": "Usaha mengakses instruksi sistem terdeteksi", }) break # 4. Heuristic: suspicious formatting suspicious_markers = [ "```system", "###system", "[SYSTEM]", "<>", "---\nrole:", "```\nignore", ] for marker in suspicious_markers: if marker.lower() in text.lower(): score += 0.2 violations.append({ "type": "suspicious_format", "severity": "medium", "detail": "Format mencurigakan terdeteksi", }) break threshold = {"low": 0.6, "medium": 0.3, "high": 0.1}[self.sensitivity] score = min(score, 1.0) return { "safe": score < threshold, "score": round(score, 2), "violations": violations, "injection_type": injection_type, }