""" Topic Filter & Language Detector ================================== """ import re from typing import Dict, List class TopicFilter: """Filter topik yang tidak diizinkan.""" DEFAULT_BLOCKED = { "kekerasan_eksplisit": [ r"cara\s+(membuat|merakit|bikin)\s+(bom|senjata|racun|narkoba)", r"tutorial\s+(membunuh|menyerang|meretas|hack)", r"langkah[\-\s]langkah\s+(membuat|merakit)\s+(senjata|bahan\s+peledak)", ], "konten_ilegal": [ r"cara\s+(membobol|meretas|hack)\s+(bank|rekening|akun|sistem)", r"(membeli|mendapatkan|membuat)\s+(narkoba|obat\s+terlarang)", r"cara\s+(memalsukan|membuat\s+palsu)\s+(dokumen|ijazah|ktp|uang)", ], "self_harm": [ r"cara\s+(bunuh\s+diri|menyakiti\s+diri)", r"(ingin|mau)\s+(mengakhiri\s+hidup|mati)", ], } def __init__(self, blocked_topics: Dict[str, List[str]] = None, custom_blocked: Dict[str, List[str]] = None, sensitivity: str = "medium"): self.blocked = blocked_topics or self.DEFAULT_BLOCKED self.sensitivity = sensitivity if custom_blocked: self.blocked.update(custom_blocked) self.compiled = {} for topic, patterns in self.blocked.items(): self.compiled[topic] = [re.compile(p, re.IGNORECASE) for p in patterns] def check(self, text: str) -> Dict: violations = [] score = 0.0 for topic, patterns in self.compiled.items(): for pattern in patterns: if pattern.search(text): severity = "critical" if topic == "self_harm" else "high" score += 0.6 violations.append({ "type": "blocked_topic", "topic": topic, "severity": severity, "detail": f"Topik terlarang terdeteksi: {topic.replace('_', ' ')}", }) break score = min(score, 1.0) # Topic thresholds lebih ketat karena topik berbahaya harus selalu diblokir # low: hanya blokir jika multiple topics atau self_harm # medium/high: blokir semua match threshold = {"low": 0.7, "medium": 0.5, "high": 0.1}[self.sensitivity] # Special handling for self-harm is_self_harm = any(v["topic"] == "self_harm" for v in violations) return { "safe": score < threshold, "score": round(score, 2), "violations": violations, "is_self_harm": is_self_harm, "help_message": ( "Jika Anda atau seseorang yang Anda kenal membutuhkan bantuan, " "hubungi Into The Light Indonesia di 021-7884-5555 " "atau Hotline Kemenkes 119 ext. 8." ) if is_self_harm else None, } class LanguageDetector: """Deteksi bahasa teks (sederhana, rule-based).""" # Common Indonesian words ID_MARKERS = { "yang", "dan", "di", "ini", "itu", "dengan", "untuk", "dari", "adalah", "pada", "ke", "tidak", "akan", "juga", "sudah", "ada", "bisa", "saya", "kamu", "mereka", "kami", "kita", "apa", "atau", "dalam", "oleh", "agar", "bahwa", "karena", "tetapi", "jadi", "sangat", "lebih", "belum", "harus", "mau", "perlu", "seperti", "antara", "setiap", "semua", "masih", "banyak", "orang", "dapat", "telah", "sedang", "saat", "lagi", "punya", "ingin", "belajar", "bahasa", "indonesia", "tentang", "bagaimana", "mengapa", "dimana", "kapan", "siapa", "berapa", "tolong", "mohon", "silakan", "terima", "kasih", "selamat", } EN_MARKERS = { "the", "is", "are", "was", "were", "have", "has", "had", "will", "would", "could", "should", "can", "may", "might", "this", "that", "these", "those", "what", "which", "who", "how", "why", "where", "when", "not", "but", "and", "for", "with", "from", "about", "into", "been", "being", "does", "i", "you", "he", "she", "it", "we", "they", "me", "my", "your", "his", "her", "our", "their", "to", "of", "do", "did", "want", "need", "like", "know", "think", "learn", "make", "get", "go", "come", "see", "say", "because", "very", "also", "just", "only", "still", "really", "actually", "however", "although", "though", "useful", "important", "different", "available", "possible", "development", "machine", "learning", "please", "thank", "before", "after", "between", "through", "during", } # Kata yang ada di kedua bahasa atau terlalu pendek untuk jadi sinyal AMBIGUOUS = {"di", "a", "an", "air", "data", "era", "unit", "drama", "status", "pun"} def check(self, text: str) -> Dict: words = re.findall(r'\b\w+\b', text.lower()) unique_words = set(words) total = max(len(unique_words), 1) # Hapus kata ambigu dari penghitungan id_matches = (unique_words & self.ID_MARKERS) - self.AMBIGUOUS en_matches = (unique_words & self.EN_MARKERS) - self.AMBIGUOUS id_count = len(id_matches) en_count = len(en_matches) id_ratio = id_count / total en_ratio = en_count / total # Deteksi mixed language (code-switching) both_significant = id_count >= 2 and en_count >= 2 ratio_close = (min(id_ratio, en_ratio) / max(id_ratio, en_ratio, 0.001)) > 0.4 if both_significant and ratio_close: language = "mixed" confidence = round(min((id_ratio + en_ratio) * 2, 1.0), 2) elif id_ratio > en_ratio and id_count >= 2: language = "id" confidence = min(id_ratio * 3, 1.0) elif en_ratio > id_ratio and en_count >= 2: language = "en" confidence = min(en_ratio * 3, 1.0) else: language = "unknown" confidence = 0.0 return { "language": language, "confidence": round(confidence, 2), "id_score": round(id_ratio, 3), "en_score": round(en_ratio, 3), }