""" Guardrails Pipeline - Core Engine ==================================== Pipeline utama yang menggabungkan semua guards. Author: Jekardah AI Lab """ from typing import Dict, List, Optional from .guards import ToxicDetector, PIIDetector, InjectionDetector, TopicFilter, LanguageDetector class GuardrailsPipeline: """ Pipeline guardrails untuk Bahasa Indonesia. Menggabungkan berbagai guard untuk mengecek keamanan input dan output dari AI model. Usage: pipeline = GuardrailsPipeline() # Cek input result = pipeline.check_input("Apa itu fotosintesis?") print(result["safe"]) # True # Cek output result = pipeline.check_output( output_text="Nama saya AI assistant.", input_text="Siapa namamu?" ) # Full pipeline result = pipeline.run( input_text="Jelaskan demokrasi", output_text="Demokrasi adalah..." ) """ def __init__( self, enable_toxic: bool = True, enable_pii: bool = True, enable_injection: bool = True, enable_topic: bool = True, enable_language: bool = True, sensitivity: str = "medium", language: str = "id", ): self.guards = {} self.sensitivity = sensitivity self.language = language if enable_toxic: self.guards["toxic"] = ToxicDetector(sensitivity=sensitivity) if enable_pii: self.guards["pii"] = PIIDetector() if enable_injection: self.guards["injection"] = InjectionDetector(sensitivity=sensitivity) if enable_topic: self.guards["topic"] = TopicFilter(sensitivity=sensitivity) if enable_language: self.guards["language"] = LanguageDetector() def check_input(self, text: str) -> Dict: """ Cek keamanan input dari user. Returns: { "safe": bool, "input": str, "sanitized_input": str (PII removed), "violations": list, "guard_results": dict, "summary": str, } """ results = {} all_violations = [] is_safe = True # Run each guard if "toxic" in self.guards: r = self.guards["toxic"].check(text) results["toxic"] = r if not r["safe"]: is_safe = False all_violations.extend(r["violations"]) if "injection" in self.guards: r = self.guards["injection"].check(text) results["injection"] = r if not r["safe"]: is_safe = False all_violations.extend(r["violations"]) if "topic" in self.guards: r = self.guards["topic"].check(text) results["topic"] = r if not r["safe"]: is_safe = False all_violations.extend(r["violations"]) if "language" in self.guards: r = self.guards["language"].check(text) results["language"] = r # PII detection (detect but don't block — just flag) sanitized = text if "pii" in self.guards: r = self.guards["pii"].scrub(text) results["pii"] = { "has_pii": r["pii_found"], "replacements": r["replacements"], } sanitized = r["scrubbed"] if r["pii_found"]: all_violations.append({ "type": "pii_detected", "severity": "warning", "detail": f"Data pribadi terdeteksi: {len(r['replacements'])} item", }) # Build summary if is_safe and not all_violations: summary = "✅ Input aman — tidak ada masalah terdeteksi." elif is_safe and all_violations: summary = "⚠️ Input diterima dengan peringatan." else: violation_types = list(set(v["type"] for v in all_violations if v.get("severity") != "warning")) summary = f"⛔ Input diblokir — {', '.join(violation_types)}" # Check for self-harm (special handling) if "topic" in results and results["topic"].get("is_self_harm"): summary += "\n\n" + results["topic"]["help_message"] return { "safe": is_safe, "input": text, "sanitized_input": sanitized, "violations": all_violations, "guard_results": results, "summary": summary, } def check_output(self, output_text: str, input_text: str = "") -> Dict: """ Cek keamanan output dari AI. Returns: { "safe": bool, "output": str, "sanitized_output": str, "violations": list, "guard_results": dict, "summary": str, } """ results = {} all_violations = [] is_safe = True # Check toxic content in output if "toxic" in self.guards: r = self.guards["toxic"].check(output_text) results["toxic"] = r if not r["safe"]: is_safe = False all_violations.extend(r["violations"]) # PII scrubbing in output (important!) sanitized = output_text if "pii" in self.guards: r = self.guards["pii"].scrub(output_text) results["pii"] = { "has_pii": r["pii_found"], "replacements": r["replacements"], } sanitized = r["scrubbed"] if r["pii_found"]: all_violations.append({ "type": "pii_in_output", "severity": "high", "detail": f"PII ditemukan dalam output: {len(r['replacements'])} item — otomatis di-mask", }) # Relevance check (basic) if input_text and output_text: input_words = set(input_text.lower().split()) output_words = set(output_text.lower().split()) overlap = len(input_words & output_words) if len(input_words) > 3 and overlap == 0: all_violations.append({ "type": "low_relevance", "severity": "warning", "detail": "Output mungkin tidak relevan dengan input", }) # Length check if len(output_text) > 5000: all_violations.append({ "type": "output_too_long", "severity": "warning", "detail": f"Output terlalu panjang ({len(output_text)} chars)", }) elif len(output_text.strip()) < 5: all_violations.append({ "type": "output_too_short", "severity": "warning", "detail": "Output terlalu pendek", }) # Summary if is_safe and not all_violations: summary = "✅ Output aman." elif is_safe: summary = "⚠️ Output diterima dengan peringatan." else: summary = "⛔ Output memiliki masalah keamanan." return { "safe": is_safe, "output": output_text, "sanitized_output": sanitized, "violations": all_violations, "guard_results": results, "summary": summary, } def run(self, input_text: str, output_text: str = "") -> Dict: """ Full pipeline: cek input + output sekaligus. """ input_result = self.check_input(input_text) output_result = None if output_text: output_result = self.check_output(output_text, input_text) overall_safe = input_result["safe"] if output_result: overall_safe = overall_safe and output_result["safe"] return { "safe": overall_safe, "input_check": input_result, "output_check": output_result, }