"""
Topic Filter & Language Detector
==================================
"""

import re
from typing import Dict, List


class TopicFilter:
    """Regex-based filter that flags text touching disallowed topics.

    Each topic maps to a list of regular-expression patterns (matched
    case-insensitively). A topic counts at most once per text, no matter how
    many of its patterns match.
    """

    DEFAULT_BLOCKED = {
        "kekerasan_eksplisit": [
            r"cara\s+(membuat|merakit|bikin)\s+(bom|senjata|racun|narkoba)",
            r"tutorial\s+(membunuh|menyerang|meretas|hack)",
            r"langkah[\-\s]langkah\s+(membuat|merakit)\s+(senjata|bahan\s+peledak)",
        ],
        "konten_ilegal": [
            r"cara\s+(membobol|meretas|hack)\s+(bank|rekening|akun|sistem)",
            r"(membeli|mendapatkan|membuat)\s+(narkoba|obat\s+terlarang)",
            r"cara\s+(memalsukan|membuat\s+palsu)\s+(dokumen|ijazah|ktp|uang)",
        ],
        "self_harm": [
            r"cara\s+(bunuh\s+diri|menyakiti\s+diri)",
            r"(ingin|mau)\s+(mengakhiri\s+hidup|mati)",
        ],
    }

    # Score at or above which a text is reported as unsafe, per sensitivity.
    # With 0.6 per matched topic: "low" needs two topics, "medium"/"high"
    # block on any single match.
    _THRESHOLDS = {"low": 0.7, "medium": 0.5, "high": 0.1}

    # Score contribution of each matched topic.
    _TOPIC_SCORE = 0.6

    def __init__(self, blocked_topics: Dict[str, List[str]] = None,
                 custom_blocked: Dict[str, List[str]] = None,
                 sensitivity: str = "medium"):
        """Build the filter and pre-compile all patterns.

        Args:
            blocked_topics: Topic -> regex patterns. When omitted or empty,
                ``DEFAULT_BLOCKED`` is used.
            custom_blocked: Extra topics layered on top of the base mapping.
                A topic name that already exists replaces the base patterns
                for that topic.
            sensitivity: One of ``"low"``, ``"medium"`` or ``"high"``. Any
                other value makes ``check`` raise ``KeyError``.
        """
        base = blocked_topics or self.DEFAULT_BLOCKED
        # Copy the mapping (and its lists) so that adding custom topics never
        # mutates the shared class-level defaults or the caller's own dict.
        self.blocked = {topic: list(patterns) for topic, patterns in base.items()}
        self.sensitivity = sensitivity
        if custom_blocked:
            self.blocked.update(
                {topic: list(patterns) for topic, patterns in custom_blocked.items()}
            )

        self.compiled = {
            topic: [re.compile(p, re.IGNORECASE) for p in patterns]
            for topic, patterns in self.blocked.items()
        }

    def check(self, text: str) -> Dict:
        """Scan ``text`` for blocked topics.

        Returns:
            A dict with keys ``safe`` (bool), ``score`` (0.0-1.0, rounded to
            two decimals), ``violations`` (one entry per matched topic),
            ``is_self_harm`` (bool) and ``help_message`` (a support-hotline
            string when self-harm was detected, otherwise ``None``).

        Raises:
            KeyError: If ``self.sensitivity`` is not a known level.
        """
        violations = []
        for topic, patterns in self.compiled.items():
            # One violation per topic, regardless of how many patterns hit.
            if any(pattern.search(text) for pattern in patterns):
                violations.append({
                    "type": "blocked_topic",
                    "topic": topic,
                    "severity": "critical" if topic == "self_harm" else "high",
                    "detail": f"Topik terlarang terdeteksi: {topic.replace('_', ' ')}",
                })

        score = min(self._TOPIC_SCORE * len(violations), 1.0)
        threshold = self._THRESHOLDS[self.sensitivity]

        is_self_harm = any(v["topic"] == "self_harm" for v in violations)

        # Self-harm must be blocked at every sensitivity level. A single match
        # scores 0.6, which would otherwise slip under the "low" threshold
        # (0.7) and be reported as safe.
        safe = score < threshold and not is_self_harm

        return {
            "safe": safe,
            "score": round(score, 2),
            "violations": violations,
            "is_self_harm": is_self_harm,
            "help_message": (
                "Jika Anda atau seseorang yang Anda kenal membutuhkan bantuan, "
                "hubungi Into The Light Indonesia di 021-7884-5555 "
                "atau Hotline Kemenkes 119 ext. 8."
            ) if is_self_harm else None,
        }


class LanguageDetector:
    """Simple rule-based detector distinguishing Indonesian, English and mixed text."""

    # Frequent Indonesian words used as language markers.
    ID_MARKERS = {
        "yang", "dan", "di", "ini", "itu", "dengan", "untuk", "dari",
        "adalah", "pada", "ke", "tidak", "akan", "juga", "sudah", "ada",
        "bisa", "saya", "kamu", "mereka", "kami", "kita", "apa", "atau",
        "dalam", "oleh", "agar", "bahwa", "karena", "tetapi", "jadi",
        "sangat", "lebih", "belum", "harus", "mau", "perlu", "seperti",
        "antara", "setiap", "semua", "masih", "banyak", "orang",
        "dapat", "telah", "sedang", "saat", "lagi", "punya",
        "ingin", "belajar", "bahasa", "indonesia", "tentang", "bagaimana",
        "mengapa", "dimana", "kapan", "siapa", "berapa", "tolong",
        "mohon", "silakan", "terima", "kasih", "selamat",
    }

    # Frequent English words used as language markers.
    EN_MARKERS = {
        "the", "is", "are", "was", "were", "have", "has", "had",
        "will", "would", "could", "should", "can", "may", "might",
        "this", "that", "these", "those", "what", "which", "who",
        "how", "why", "where", "when", "not", "but", "and", "for",
        "with", "from", "about", "into", "been", "being", "does",
        "i", "you", "he", "she", "it", "we", "they", "me", "my",
        "your", "his", "her", "our", "their", "to", "of",
        "do", "did", "want", "need", "like", "know", "think",
        "learn", "make", "get", "go", "come", "see", "say",
        "because", "very", "also", "just", "only", "still",
        "really", "actually", "however", "although", "though",
        "useful", "important", "different", "available", "possible",
        "development", "machine", "learning", "please", "thank",
        "before", "after", "between", "through", "during",
    }

    # Words that exist in both languages or are too short to be a signal.
    AMBIGUOUS = {"di", "a", "an", "air", "data", "era", "unit", "drama", "status", "pun"}

    def _marker_hits(self, vocabulary, markers) -> int:
        """Count distinct words that are markers, ignoring ambiguous ones."""
        return len((vocabulary & markers) - self.AMBIGUOUS)

    def check(self, text: str) -> Dict:
        """Classify ``text`` as ``"id"``, ``"en"``, ``"mixed"`` or ``"unknown"``.

        Scores are the share of distinct words in the text that are markers
        of each language. At least two marker hits are required before a
        language is reported.

        Returns:
            A dict with ``language``, ``confidence`` (rounded to two
            decimals), ``id_score`` and ``en_score`` (rounded to three).
        """
        vocabulary = set(re.findall(r'\b\w+\b', text.lower()))
        # Guard against division by zero on empty input.
        denominator = max(len(vocabulary), 1)

        id_hits = self._marker_hits(vocabulary, self.ID_MARKERS)
        en_hits = self._marker_hits(vocabulary, self.EN_MARKERS)
        id_share = id_hits / denominator
        en_share = en_hits / denominator

        # How evenly the two languages are represented (1.0 = perfectly even);
        # the 0.001 floor avoids dividing by zero when neither language hits.
        balance = min(id_share, en_share) / max(id_share, en_share, 0.001)

        enough_id = id_hits >= 2
        enough_en = en_hits >= 2

        verdict = "unknown"
        certainty = 0.0
        if enough_id and enough_en and balance > 0.4:
            # Code-switching: both languages present in comparable amounts.
            verdict = "mixed"
            certainty = min((id_share + en_share) * 2, 1.0)
        elif enough_id and id_share > en_share:
            verdict = "id"
            certainty = min(id_share * 3, 1.0)
        elif enough_en and en_share > id_share:
            verdict = "en"
            certainty = min(en_share * 3, 1.0)

        return {
            "language": verdict,
            "confidence": round(certainty, 2),
            "id_score": round(id_share, 3),
            "en_score": round(en_share, 3),
        }