romizone's picture
Upload Guardrails ID
356fd47 verified
Raw
History Blame Contribute Delete
8.14 kB
"""
Guardrails Pipeline - Core Engine
====================================
Pipeline utama yang menggabungkan semua guards.
Author: Jekardah AI Lab
"""
from typing import Dict, List, Optional
from .guards import ToxicDetector, PIIDetector, InjectionDetector, TopicFilter, LanguageDetector
class GuardrailsPipeline:
    """
    Guardrails pipeline for Indonesian-language (Bahasa Indonesia) text.

    Combines several guards to check the safety of the input to, and the
    output from, an AI model.

    Usage:
        pipeline = GuardrailsPipeline()

        # Check input
        result = pipeline.check_input("Apa itu fotosintesis?")
        print(result["safe"])  # True

        # Check output
        result = pipeline.check_output(
            output_text="Nama saya AI assistant.",
            input_text="Siapa namamu?"
        )

        # Full pipeline
        result = pipeline.run(
            input_text="Jelaskan demokrasi",
            output_text="Demokrasi adalah..."
        )
    """

    # Guards whose failing check() blocks the input, in the order they run.
    _BLOCKING_INPUT_GUARDS = ("toxic", "injection", "topic")

    # Outputs longer than this many characters get an "output_too_long" warning.
    MAX_OUTPUT_CHARS = 5000
    # Outputs whose stripped length is below this get an "output_too_short" warning.
    MIN_OUTPUT_CHARS = 5
    # The relevance heuristic only runs when the input has MORE than this many
    # distinct words; shorter inputs are too small to judge overlap.
    RELEVANCE_MIN_INPUT_WORDS = 3

    def __init__(
        self,
        enable_toxic: bool = True,
        enable_pii: bool = True,
        enable_injection: bool = True,
        enable_topic: bool = True,
        enable_language: bool = True,
        sensitivity: str = "medium",
        language: str = "id",
    ):
        """
        Build the pipeline and instantiate every enabled guard.

        Args:
            enable_toxic: Register a ``ToxicDetector`` under ``"toxic"``.
            enable_pii: Register a ``PIIDetector`` under ``"pii"``.
            enable_injection: Register an ``InjectionDetector`` under
                ``"injection"``.
            enable_topic: Register a ``TopicFilter`` under ``"topic"``.
            enable_language: Register a ``LanguageDetector`` under
                ``"language"``.
            sensitivity: Forwarded unchanged to the toxic, injection and
                topic guards.
            language: Stored on the instance; not read by any method of
                this class.
        """
        self.guards = {}
        self.sensitivity = sensitivity
        self.language = language

        if enable_toxic:
            self.guards["toxic"] = ToxicDetector(sensitivity=sensitivity)
        if enable_pii:
            self.guards["pii"] = PIIDetector()
        if enable_injection:
            self.guards["injection"] = InjectionDetector(sensitivity=sensitivity)
        if enable_topic:
            self.guards["topic"] = TopicFilter(sensitivity=sensitivity)
        if enable_language:
            self.guards["language"] = LanguageDetector()

    @staticmethod
    def _tokenize(text: str) -> set:
        """Return the set of lower-cased alphanumeric words in *text*."""
        # Replace every non-alphanumeric character with a space so that
        # punctuation glued to a word ("demokrasi?") does not prevent it
        # from matching the same word elsewhere ("demokrasi,").
        normalized = "".join(ch if ch.isalnum() else " " for ch in text.lower())
        return set(normalized.split())

    def _run_guards(self, names: tuple, text: str) -> tuple:
        """
        Run ``check(text)`` on each enabled guard listed in *names*.

        Returns:
            ``(results, violations, is_safe)`` where ``results`` maps guard
            name to its raw result, ``violations`` collects the violations
            of every guard that reported ``safe`` as falsy, and ``is_safe``
            is False as soon as one guard did.
        """
        results = {}
        violations = []
        is_safe = True
        for name in names:
            guard = self.guards.get(name)
            if guard is None:  # guard disabled
                continue
            outcome = guard.check(text)
            results[name] = outcome
            if not outcome["safe"]:
                is_safe = False
                violations.extend(outcome["violations"])
        return results, violations, is_safe

    def _scrub_pii(self, text: str) -> tuple:
        """
        Mask personal data in *text* with the PII guard, if enabled.

        Returns:
            ``(sanitized_text, pii_result)``. ``pii_result`` is None when
            the PII guard is disabled (``sanitized_text`` is then *text*
            itself); otherwise it is
            ``{"has_pii": ..., "replacements": ...}``.
        """
        guard = self.guards.get("pii")
        if guard is None:
            return text, None
        scrubbed = guard.scrub(text)
        pii_result = {
            "has_pii": scrubbed["pii_found"],
            "replacements": scrubbed["replacements"],
        }
        return scrubbed["scrubbed"], pii_result

    def check_input(self, text: str) -> Dict:
        """
        Check the safety of user input.

        The toxic, injection and topic guards can block the input. The
        language guard's result is recorded only. Detected PII is masked in
        ``sanitized_input`` and reported as a warning without blocking.

        Returns:
            {
                "safe": bool,
                "input": str,
                "sanitized_input": str (PII removed),
                "violations": list,
                "guard_results": dict,
                "summary": str,
            }
        """
        results, violations, is_safe = self._run_guards(
            self._BLOCKING_INPUT_GUARDS, text
        )

        # Language detection is informational: it never affects the verdict.
        language_guard = self.guards.get("language")
        if language_guard is not None:
            results["language"] = language_guard.check(text)

        # PII detection (detect but don't block - just flag).
        sanitized, pii_result = self._scrub_pii(text)
        if pii_result is not None:
            results["pii"] = pii_result
            if pii_result["has_pii"]:
                violations.append({
                    "type": "pii_detected",
                    "severity": "warning",
                    "detail": f"Data pribadi terdeteksi: {len(pii_result['replacements'])} item",
                })

        # Build summary.
        if not is_safe:
            # dict.fromkeys de-duplicates while keeping first-seen order, so
            # the summary text is stable from run to run (a set is not).
            blocking_types = dict.fromkeys(
                v["type"] for v in violations if v.get("severity") != "warning"
            )
            summary = f"⛔ Input diblokir — {', '.join(blocking_types)}"
        elif violations:
            summary = "⚠️ Input diterima dengan peringatan."
        else:
            summary = "✅ Input aman — tidak ada masalah terdeteksi."

        # Self-harm gets special handling: the topic guard's help message is
        # appended whenever it flags self-harm, whatever the verdict, so the
        # message is never withheld.
        if "topic" in results and results["topic"].get("is_self_harm"):
            summary += "\n\n" + results["topic"]["help_message"]

        return {
            "safe": is_safe,
            "input": text,
            "sanitized_input": sanitized,
            "violations": violations,
            "guard_results": results,
            "summary": summary,
        }

    def check_output(self, output_text: str, input_text: str = "") -> Dict:
        """
        Check the safety of AI output.

        Only the toxic guard can mark the output unsafe. PII is masked in
        ``sanitized_output`` and reported; relevance and length problems are
        reported as warnings.

        Args:
            output_text: The model output to check.
            input_text: The prompt that produced it. When empty, the
                relevance heuristic is skipped.

        Returns:
            {
                "safe": bool,
                "output": str,
                "sanitized_output": str,
                "violations": list,
                "guard_results": dict,
                "summary": str,
            }
        """
        # Check toxic content in output.
        results, violations, is_safe = self._run_guards(("toxic",), output_text)

        # PII scrubbing in output (important!).
        sanitized, pii_result = self._scrub_pii(output_text)
        if pii_result is not None:
            results["pii"] = pii_result
            if pii_result["has_pii"]:
                violations.append({
                    "type": "pii_in_output",
                    "severity": "high",
                    "detail": f"PII ditemukan dalam output: {len(pii_result['replacements'])} item — otomatis di-mask",
                })

        # Relevance check (basic): warn when a non-trivial input shares no
        # word at all with the output.
        if input_text and output_text:
            input_words = self._tokenize(input_text)
            if (
                len(input_words) > self.RELEVANCE_MIN_INPUT_WORDS
                and input_words.isdisjoint(self._tokenize(output_text))
            ):
                violations.append({
                    "type": "low_relevance",
                    "severity": "warning",
                    "detail": "Output mungkin tidak relevan dengan input",
                })

        # Length check.
        if len(output_text) > self.MAX_OUTPUT_CHARS:
            violations.append({
                "type": "output_too_long",
                "severity": "warning",
                "detail": f"Output terlalu panjang ({len(output_text)} chars)",
            })
        elif len(output_text.strip()) < self.MIN_OUTPUT_CHARS:
            violations.append({
                "type": "output_too_short",
                "severity": "warning",
                "detail": "Output terlalu pendek",
            })

        # Summary.
        if not is_safe:
            summary = "⛔ Output memiliki masalah keamanan."
        elif violations:
            summary = "⚠️ Output diterima dengan peringatan."
        else:
            summary = "✅ Output aman."

        return {
            "safe": is_safe,
            "output": output_text,
            "sanitized_output": sanitized,
            "violations": violations,
            "guard_results": results,
            "summary": summary,
        }

    def run(self, input_text: str, output_text: str = "") -> Dict:
        """
        Full pipeline: check input and output in one call.

        Args:
            input_text: The user input to check.
            output_text: The model output to check. When empty, no output
                check is performed and ``output_check`` is None.

        Returns:
            {
                "safe": bool (input safe and, if checked, output safe),
                "input_check": dict (result of ``check_input``),
                "output_check": dict or None (result of ``check_output``),
            }
        """
        input_result = self.check_input(input_text)
        output_result = (
            self.check_output(output_text, input_text) if output_text else None
        )

        overall_safe = input_result["safe"]
        # Compare against None explicitly: the question is whether an output
        # check ran, not whether its result dict happens to be truthy.
        if output_result is not None:
            overall_safe = overall_safe and output_result["safe"]

        return {
            "safe": overall_safe,
            "input_check": input_result,
            "output_check": output_result,
        }