"""
═══════════════════════════════════════════════════════════════════════════════
TAU Platform v4.0 - Enhanced Hebrew Encoder
═══════════════════════════════════════════════════════════════════════════════

Unified Hebrew text encoder combining:
1. Gematria encoding (log-scale + bigrams + position weights)
2. Spherical trajectory encoding (with curvature/torsion)
3. Morphological analysis (root extraction)
4. τ complexity metrics (Heaps' Law adjusted)
5. Vocabulary encoding (word-to-index)
6. Compression integration

Author: Avri Barzel
Date: November 2025
═══════════════════════════════════════════════════════════════════════════════
"""

import numpy as np
import re
import hashlib
from typing import Dict, List, Tuple, Optional, Any
from dataclasses import dataclass, field
from collections import Counter
from functools import lru_cache

# ═══════════════════════════════════════════════════════════════════════════════
# PRE-COMPILED REGEX PATTERNS (Performance optimization)
# ═══════════════════════════════════════════════════════════════════════════════
# Matches every code point in U+0591-U+05C7 (Hebrew cantillation marks and
# vowel points); used to strip diacritics before tokenizing.
# NOTE(review): this range also contains punctuation such as maqaf (U+05BE)
# and sof pasuq (U+05C3); deleting maqaf joins the two words it separates -
# confirm that is intended.
NIQQUD_PATTERN = re.compile(r"[\u0591-\u05C7]")
# One or more consecutive Hebrew letters (U+05D0-U+05EA, final forms included).
HEBREW_WORD_PATTERN = re.compile(r"[\u05D0-\u05EA]+")

# Multilingual word pattern: each alternative matches a run of characters from
# a single Unicode range, so one token never spans two of the listed scripts.
# Alternatives are tried left to right; the broader "Hebrew extended" range is
# only reached when the run does not start with a plain Hebrew letter.
MULTILINGUAL_WORD_PATTERN = re.compile(
    r"[\u05D0-\u05EA]+"  # Hebrew
    r"|[a-zA-Z]+"  # Latin/English
    r"|[\u0400-\u04FF]+"  # Cyrillic (Russian, etc.)
    r"|[\u0600-\u06FF]+"  # Arabic
    r"|[\u4E00-\u9FFF]+"  # Chinese (CJK)
    r"|[\u3040-\u309F]+"  # Japanese Hiragana
    r"|[\u30A0-\u30FF]+"  # Japanese Katakana
    r"|[\uAC00-\uD7AF]+"  # Korean Hangul
    r"|[\u0900-\u097F]+"  # Hindi/Devanagari
    r"|[\u0370-\u03FF]+"  # Greek
    r"|[\u0E00-\u0E7F]+"  # Thai
    r"|[\u0590-\u05FF]+"  # Hebrew extended
    r"|\d+"  # Numbers
)
# Tokenizer for vocabulary encoding: a run of word characters, or a single
# character that is neither a word character nor whitespace.
WORD_OR_PUNCT_PATTERN = re.compile(r"[\w]+|[^\w\s]")

# ═══════════════════════════════════════════════════════════════════════════════
# CONSTANTS - UNIVERSAL CHARACTER VALUES
# ═══════════════════════════════════════════════════════════════════════════════

# Hebrew Gematria (original, 1-400)
# Maps each Hebrew letter to its numeric value. Every final form carries the
# same value as its regular form (e.g. final kaf = kaf = 20).
GEMATRIA = {
    "א": 1,
    "ב": 2,
    "ג": 3,
    "ד": 4,
    "ה": 5,
    "ו": 6,
    "ז": 7,
    "ח": 8,
    "ט": 9,
    "י": 10,
    "כ": 20,
    "ך": 20,
    "ל": 30,
    "מ": 40,
    "ם": 40,
    "נ": 50,
    "ן": 50,
    "ס": 60,
    "ע": 70,
    "פ": 80,
    "ף": 80,
    "צ": 90,
    "ץ": 90,
    "ק": 100,
    "ר": 200,
    "ש": 300,
    "ת": 400,
}

# English/Latin letters: hand-assigned values from 400 ("e") down to 10 ("z"),
# listed in descending order of the intended letter frequency.
LATIN_VALUES = {
    # Lowercase - frequency weighted (common letters = higher values)
    "e": 400,
    "t": 380,
    "a": 360,
    "o": 340,
    "i": 320,
    "n": 300,
    "s": 280,
    "h": 260,
    "r": 240,
    "d": 220,
    "l": 200,
    "c": 180,
    "u": 160,
    "m": 140,
    "w": 120,
    "f": 100,
    "g": 90,
    "y": 80,
    "p": 70,
    "b": 60,
    "v": 50,
    "k": 40,
    "j": 30,
    "x": 20,
    "q": 15,
    "z": 10,
}
# Add uppercase with same values (the comprehension is fully built before
# update() runs, so the dict is not mutated while being iterated)
LATIN_VALUES.update({c.upper(): v for c, v in LATIN_VALUES.items()})

# Cyrillic: the 33 lowercase letters of the Russian alphabet, with values that
# rise in alphabetical order from 1 to 400 (unevenly spaced). Other Cyrillic
# letters (e.g. Ukrainian-only ones) are not in this table.
CYRILLIC_VALUES = {
    # Russian alphabet (33 letters) - scaled to 1-400
    "а": 1,
    "б": 15,
    "в": 30,
    "г": 45,
    "д": 60,
    "е": 75,
    "ё": 80,
    "ж": 95,
    "з": 110,
    "и": 125,
    "й": 140,
    "к": 155,
    "л": 170,
    "м": 185,
    "н": 200,
    "о": 215,
    "п": 230,
    "р": 245,
    "с": 260,
    "т": 275,
    "у": 290,
    "ф": 305,
    "х": 320,
    "ц": 335,
    "ч": 350,
    "ш": 365,
    "щ": 375,
    "ъ": 380,
    "ы": 385,
    "ь": 390,
    "э": 393,
    "ю": 396,
    "я": 400,
}
# Add uppercase letters with the same values as their lowercase forms
CYRILLIC_VALUES.update({c.upper(): v for c, v in CYRILLIC_VALUES.items()})

# Arabic letters: 28 base letters with values rising from 1 to 400 in the
# order listed. Characters outside this table (e.g. hamza forms, diacritics)
# have no entry here and are handled by the fallback branches of
# EnhancedHebrewEncoder._word_gematria_cached.
ARABIC_VALUES = {
    "ا": 1,
    "ب": 15,
    "ت": 30,
    "ث": 45,
    "ج": 60,
    "ح": 75,
    "خ": 90,
    "د": 105,
    "ذ": 120,
    "ر": 135,
    "ز": 150,
    "س": 165,
    "ش": 180,
    "ص": 195,
    "ض": 210,
    "ط": 225,
    "ظ": 240,
    "ع": 255,
    "غ": 270,
    "ف": 285,
    "ق": 300,
    "ك": 315,
    "ل": 330,
    "م": 345,
    "ن": 360,
    "ه": 375,
    "و": 385,
    "ي": 400,
}

# Greek letters: 24 lowercase letters with values rising in alphabetical
# order, except pi, which is deliberately set to 314 (out of sequence).
# Final sigma is not in the table.
GREEK_VALUES = {
    "α": 1,
    "β": 15,
    "γ": 30,
    "δ": 45,
    "ε": 60,
    "ζ": 75,
    "η": 90,
    "θ": 105,
    "ι": 120,
    "κ": 135,
    "λ": 150,
    "μ": 165,
    "ν": 180,
    "ξ": 195,
    "ο": 210,
    "π": 314,
    "ρ": 240,
    "σ": 255,
    "τ": 270,
    "υ": 285,
    "φ": 300,
    "χ": 315,
    "ψ": 330,
    "ω": 400,
}
# Add uppercase letters with the same values as their lowercase forms
GREEK_VALUES.update({c.upper(): v for c, v in GREEK_VALUES.items()})
# Capital pi keeps the special value 314 (echoing 3.14...). The update above
# has already stored this value, so this assignment only makes it explicit.
GREEK_VALUES["Π"] = 314

# Hindi/Devanagari: 50 consecutive code points U+0905-U+0936, valued 8..400
# in code-point order. U+0937-U+0939 are not covered by this range.
DEVANAGARI_VALUES = {chr(0x0905 + i): (i + 1) * 8 for i in range(50)}  # U+0905 to U+0936

# Chinese/CJK: hand-assigned values for 21 common characters only; any other
# CJK character has no entry in this table.
# NOTE(review): the values look like frequency ranks rather than stroke
# counts - confirm which was intended.
CJK_COMMON = {
    "的": 400,
    "一": 1,
    "是": 380,
    "不": 360,
    "了": 340,
    "在": 320,
    "人": 300,
    "有": 280,
    "我": 260,
    "他": 240,
    "这": 220,
    "个": 200,
    "们": 180,
    "中": 160,
    "来": 140,
    "上": 120,
    "大": 100,
    "为": 80,
    "和": 60,
    "国": 40,
    "地": 20,
}

# Japanese Hiragana: 83 consecutive code points U+3041-U+3093 (small and
# voiced kana included), valued 8..664 - i.e. above the 400 ceiling used by
# the other tables.
HIRAGANA_VALUES = {chr(0x3041 + i): (i + 1) * 8 for i in range(83)}  # ぁ to ん

# Japanese Katakana: 83 consecutive code points U+30A1-U+30F3, valued 8..664.
KATAKANA_VALUES = {chr(0x30A1 + i): (i + 1) * 8 for i in range(83)}  # ァ to ン

# Korean Hangul Jamo: 40 consecutive code points U+1100-U+1127, valued
# 10..400.
# NOTE(review): MULTILINGUAL_WORD_PATTERN matches precomposed syllables
# (U+AC00-U+D7AF), which are not keys of this table - confirm whether
# syllables were meant to be decomposed into jamo first.
HANGUL_JAMO = {chr(0x1100 + i): (i + 1) * 10 for i in range(40)}  # U+1100 to U+1127

# Thai: 58 consecutive code points U+0E01-U+0E3A (consonants followed by
# vowel signs), valued 8..464.
THAI_VALUES = {chr(0x0E01 + i): (i + 1) * 8 for i in range(58)}  # U+0E01 to U+0E3A

# Mathematical, punctuation, currency and programming symbols with
# hand-assigned semantic values. Every key is a single character and must
# appear only once: a repeated key in a dict literal is silently overridden
# by its last occurrence.
MATH_SYMBOLS = {
    # Operators
    "+": 100,
    "-": 100,
    "*": 150,
    "/": 150,
    "=": 200,
    "×": 150,
    "÷": 150,
    "±": 100,
    "∓": 100,
    # Comparison
    "<": 75,
    ">": 75,
    "≤": 80,
    "≥": 80,
    "≠": 85,
    "≈": 90,
    "≡": 95,
    # Brackets
    # ("|" used to be listed here with value 50 as well; that entry was dead
    # because the later "Programming" entry overrode it. It is now defined
    # once, below, with the value that was actually in effect: 100.)
    "(": 50,
    ")": 50,
    "[": 50,
    "]": 50,
    "{": 50,
    "}": 50,
    "⟨": 50,
    "⟩": 50,
    # Punctuation
    ".": 25,
    ",": 25,
    ":": 25,
    ";": 25,
    "?": 50,
    "!": 50,
    "@": 75,
    "#": 75,
    # Math symbols
    "%": 125,
    "^": 175,
    "√": 200,
    "∞": 400,
    "∑": 300,
    "∏": 280,
    "∫": 350,
    "∂": 250,
    "∇": 270,
    "∆": 260,
    # Logic
    "∧": 150,
    "∨": 150,
    "¬": 100,
    "→": 180,
    "↔": 190,
    "∀": 200,
    "∃": 200,
    # Set theory
    "∈": 160,
    "∉": 165,
    "⊂": 170,
    "⊃": 170,
    "∪": 180,
    "∩": 180,
    "∅": 50,
    # Currency
    "$": 200,
    "€": 200,
    "₪": 200,
    "£": 200,
    "¥": 200,
    "₹": 200,
    "₽": 200,
    # Programming
    "&": 100,
    "|": 100,
    "~": 80,
    "`": 30,
    "_": 40,
    "\\": 60,
}

# Programming/Code symbols: values for multi-character operators.
# The keys are 2-3 characters long, so the per-character lookups used for
# word values cannot hit them; nothing in the visible part of this file
# reads this table.
CODE_SYMBOLS = {
    "==": 200,
    "!=": 200,
    "<=": 200,
    ">=": 200,
    "&&": 150,
    "||": 150,
    "++": 120,
    "--": 120,
    "->": 180,
    "=>": 180,
    "::": 160,
    "...": 100,
}

# Language identifiers for embedding separation
# Each script gets a unique integer ID. _word_gematria_cached multiplies the
# ID by 1000 to obtain the per-word offset, so Hebrew (ID 0) gets no offset.
LANGUAGE_IDS = {
    "hebrew": 0,  # Primary - original gematria
    "latin": 1,  # English/Latin
    "cyrillic": 2,  # Russian etc.
    "arabic": 3,  # Arabic
    "greek": 4,  # Greek
    "devanagari": 5,  # Hindi
    "cjk": 6,  # Chinese
    "hiragana": 7,  # Japanese
    "katakana": 8,  # Japanese
    "hangul": 9,  # Korean
    "thai": 10,  # Thai
    "math": 11,  # Mathematical symbols
    "unknown": 12,  # Unknown scripts
}

# Character -> script-name lookup, built from the per-script value tables.
# Tables are applied in the order listed, so if a character ever appeared in
# two tables the later table would win.
CHAR_TO_LANGUAGE = {}
for _lang, _table in (
    ("hebrew", GEMATRIA),
    ("latin", LATIN_VALUES),
    ("cyrillic", CYRILLIC_VALUES),
    ("arabic", ARABIC_VALUES),
    ("greek", GREEK_VALUES),
    ("devanagari", DEVANAGARI_VALUES),
    ("cjk", CJK_COMMON),
    ("hiragana", HIRAGANA_VALUES),
    ("katakana", KATAKANA_VALUES),
    ("hangul", HANGUL_JAMO),
    ("thai", THAI_VALUES),
    ("math", MATH_SYMBOLS),
):
    for c in _table:
        CHAR_TO_LANGUAGE[c] = _lang
# Drop the loop helpers so the module namespace gains no new names.
del _lang, _table

# Combined character values: one flat lookup merging every per-script table
# and the math symbols (CODE_SYMBOLS is not included).
# Values are stored exactly as defined in each table - no offset is applied
# here. The per-language offset that keeps scripts apart is added once per
# word in EnhancedHebrewEncoder._word_gematria_cached.
# Tables are unpacked in order, so a key present in two tables would take the
# value from the later one.
CHAR_VALUES = {
    **GEMATRIA,  # Hebrew (priority - original 1-400)
    **LATIN_VALUES,  # English/Latin
    **CYRILLIC_VALUES,  # Russian etc.
    **ARABIC_VALUES,  # Arabic
    **GREEK_VALUES,  # Greek
    **DEVANAGARI_VALUES,  # Hindi
    **CJK_COMMON,  # Chinese common
    **HIRAGANA_VALUES,  # Japanese
    **KATAKANA_VALUES,  # Japanese
    **HANGUL_JAMO,  # Korean
    **THAI_VALUES,  # Thai
    **MATH_SYMBOLS,  # Math
}

# Two-component coordinate in [0, 1] x [0, 1] for each Hebrew letter,
# final forms included.
# NOTE(review): the meaning of the two axes is not stated and the table is
# not read anywhere in the visible part of this file - confirm with the
# consumer before changing any value.
LETTER_COORDS = {
    "א": (0.05, 0.85),
    "ה": (0.10, 0.95),
    "ח": (0.15, 0.45),
    "ע": (0.08, 0.55),
    "ג": (0.30, 0.25),
    "י": (0.35, 0.75),
    "כ": (0.38, 0.50),
    "ך": (0.38, 0.20),
    "ק": (0.32, 0.40),
    "ד": (0.50, 0.45),
    "ט": (0.52, 0.20),
    "ת": (0.55, 0.70),
    "ז": (0.58, 0.25),
    "ס": (0.54, 0.35),
    "צ": (0.56, 0.30),
    "ץ": (0.56, 0.10),
    "ש": (0.53, 0.65),
    "ל": (0.62, 0.80),
    "נ": (0.65, 0.55),
    "ן": (0.65, 0.25),
    "ר": (0.68, 0.70),
    "ב": (0.85, 0.60),
    "ו": (0.88, 0.90),
    "מ": (0.90, 0.75),
    "ם": (0.90, 0.35),
    "פ": (0.92, 0.40),
    "ף": (0.92, 0.15),
}

# Known Hebrew roots mapped to an English gloss (legal terms first, then
# common verbs). MorphologyLayer.extract_root returns the first key, in this
# insertion order, that occurs as a substring of a word. Most keys have three
# letters; two have four.
HEBREW_ROOTS = {
    "שפט": "judge",
    "דין": "law",
    "חוק": "statute",
    "עבר": "violate",
    "טען": "claim",
    "ערר": "appeal",
    "פסק": "rule",
    "תבע": "sue",
    "זכה": "acquit",
    "חייב": "convict",
    "ענש": "punish",
    "קנס": "fine",
    "חתם": "sign",
    "הסכם": "agree",
    "בטל": "cancel",
    "תקף": "valid",
    "כתב": "write",
    "קרא": "read",
    "אמר": "say",
    "דבר": "speak",
    "הלך": "walk",
    "בוא": "come",
    "ראה": "see",
    "שמע": "hear",
    "ידע": "know",
    "חשב": "think",
    "רצה": "want",
    "יכל": "can",
    "נתן": "give",
    "לקח": "take",
    "עשה": "make",
    "היה": "be",
}

# Per-domain mixing weights for the three sub-embeddings (gematria,
# trajectory, morphology). Each row sums to 1.0; "default" is presumably the
# fallback for unlisted domains - the consumer is outside the visible part
# of this file.
DOMAIN_WEIGHTS = {
    "hebrew_legal": {"gematria": 0.50, "trajectory": 0.30, "morphology": 0.20},
    "hebrew_medical": {"gematria": 0.40, "trajectory": 0.30, "morphology": 0.30},
    "hebrew_general": {"gematria": 0.40, "trajectory": 0.40, "morphology": 0.20},
    "default": {"gematria": 0.45, "trajectory": 0.35, "morphology": 0.20},
}

# Heaps' law parameters (vocabulary size ~ K * N ** BETA).
# NOTE(review): not used in the visible part of this file - presumably read
# by the tau-metric computation; confirm.
HEAPS_K = 10.0
HEAPS_BETA = 0.5


# ═══════════════════════════════════════════════════════════════════════════════
# DATA CLASSES
# ═══════════════════════════════════════════════════════════════════════════════


@dataclass
class GematriaFeatures:
    """Per-text gematria statistics returned next to the gematria embedding."""

    # 50-bin histogram of log1p(word value), normalised to sum to 1
    histogram: np.ndarray
    # Length-20 vector: 8 bigram statistics divided by 1000, zero-padded
    bigram_stats: np.ndarray
    # Value of every word, in text order
    word_values: List[int]
    # Sum of word_values
    total: int
    # Mean of word_values
    mean: float
    # Population standard deviation of word_values
    std: float


@dataclass
class TrajectoryFeatures:
    """Geometry of the unit-sphere path derived from the word-value sequence."""

    # (trajectory_points, 3) array of x, y, z points on the unit sphere
    coords: np.ndarray
    # Sum of Euclidean distances between consecutive points
    path_length: float
    # Mean of |d2| / |d1|**2 over the path (finite-difference derivatives)
    curvature: float
    # Mean absolute torsion estimate over the path
    torsion: float
    # Mean of coords along the point axis (length 3)
    centroid: np.ndarray

    @property
    def arc_length(self) -> float:
        """Alias for path_length to maintain consistency with multimodal encoders."""
        return self.path_length


@dataclass
class MorphologyFeatures:
    """Result of MorphologyLayer.analyze for a list of words."""

    # One entry per input word: the extracted root, or None if none was found
    roots: List[Optional[str]]
    # Distinct non-empty roots (order not guaranteed; built from a set)
    unique_roots: List[str]
    # Percentage (0-100) of words for which a root was found
    coverage: float
    # Number of words from which a prefix was stripped
    prefix_count: int
    # Number of words from which a suffix was stripped
    suffix_count: int


@dataclass
class TAUMetrics:
    """Container for the tau complexity metrics of one text.

    NOTE(review): the code that fills these fields is outside the visible
    part of this file; ranges and exact definitions should be confirmed there.
    """

    tau: float
    complexity: float
    activity: float
    coherence: float


@dataclass
class VocabularyFeatures:
    """Vocabulary-encoding result for one text.

    NOTE(review): constructed outside the visible part of this file; the field
    notes below assume it mirrors VocabularyLayer.encode - confirm.
    """

    # Vocabulary index of each token
    indices: List[int]
    # Presumably the fraction of tokens that were not mapped to <UNK>
    coverage: float
    # Presumably the number of tokens mapped to <UNK>
    oov_count: int


@dataclass
class EncodingResult:
    """Bundle of the final embedding plus every intermediate feature set.

    NOTE(review): assembled outside the visible part of this file; how the
    sub-embeddings are combined into ``embedding`` should be confirmed there.
    """

    # Combined embedding vector
    embedding: np.ndarray
    gematria: GematriaFeatures
    trajectory: TrajectoryFeatures
    morphology: MorphologyFeatures
    vocabulary: VocabularyFeatures
    tau_metrics: TAUMetrics
    # Free-form extra information about the encoding run
    metadata: Dict[str, Any]


# ═══════════════════════════════════════════════════════════════════════════════
# VOCABULARY LAYER
# ═══════════════════════════════════════════════════════════════════════════════


class VocabularyLayer:
    """Word-to-index mapping with out-of-vocabulary (OOV) handling.

    Indices 0-5 are reserved for ``SPECIAL_TOKENS`` in a freshly created
    layer. Words are lower-cased and stripped before being stored or looked
    up.
    """

    SPECIAL_TOKENS = ["<PAD>", "<UNK>", "<START>", "<END>", "<NUM>", "<PUNC>"]

    def __init__(self, vocab_file: Optional[str] = None):
        """Create the layer, optionally loading a vocabulary saved by ``save``.

        Args:
            vocab_file: Path of a JSON vocabulary file, or None/"" to start
                with only the special tokens.
        """
        self.word2idx: Dict[str, int] = {}
        self.idx2word: Dict[int, str] = {}
        self.frequencies: Counter = Counter()

        # Add special tokens
        for i, token in enumerate(self.SPECIAL_TOKENS):
            self.word2idx[token] = i
            self.idx2word[i] = token

        self.next_idx = len(self.SPECIAL_TOKENS)

        if vocab_file:
            self.load(vocab_file)

    def add_word(self, word: str) -> int:
        """Register one occurrence of ``word`` and return its index.

        Empty (after stripping) input returns the ``<PAD>`` index and is not
        counted. A new word receives the next free index.
        """
        word = word.lower().strip()
        if not word:
            return self.word2idx["<PAD>"]

        self.frequencies[word] += 1

        if word not in self.word2idx:
            self.word2idx[word] = self.next_idx
            self.idx2word[self.next_idx] = word
            self.next_idx += 1

        return self.word2idx[word]

    def get_index(self, word: str) -> int:
        """Return the index for ``word`` without modifying the vocabulary.

        Resolution order: empty -> ``<PAD>``; all digits -> ``<NUM>``; any
        non-alphanumeric character present -> ``<PUNC>``; otherwise the stored
        index, or ``<UNK>`` when the word is unknown.
        """
        word = word.lower().strip()

        if not word:
            return self.word2idx["<PAD>"]
        if word.isdigit():
            return self.word2idx["<NUM>"]
        if not word.isalnum():
            return self.word2idx["<PUNC>"]

        return self.word2idx.get(word, self.word2idx["<UNK>"])

    def encode(self, text: str) -> Tuple[List[int], float]:
        """Tokenize ``text`` and map every token to an index.

        Returns:
            ``(indices, coverage)`` where ``coverage`` is the fraction of
            tokens (words and punctuation) not mapped to ``<UNK>``; 1.0 for
            text without tokens.
        """
        words = WORD_OR_PUNCT_PATTERN.findall(text)  # Use pre-compiled pattern
        unk_idx = self.word2idx["<UNK>"]  # Cache lookup

        indices = [self.get_index(word) for word in words]
        oov_count = sum(1 for idx in indices if idx == unk_idx)

        coverage = 1 - (oov_count / len(words)) if words else 1.0
        return indices, coverage

    def size(self) -> int:
        """Return the number of entries, special tokens included."""
        return len(self.word2idx)

    def save(self, path: str):
        """Write the vocabulary and the 50000 most frequent counts as JSON."""
        import json

        data = {
            "word2idx": self.word2idx,
            "frequencies": dict(self.frequencies.most_common(50000)),
        }
        with open(path, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False)

    def load(self, path: str):
        """Replace the current vocabulary with the one stored at ``path``.

        Indices are coerced to ``int``. Special tokens missing from the file
        are appended after the highest loaded index, so ``get_index`` and
        ``encode`` keep working even for an empty or hand-written vocabulary
        file.

        Raises:
            KeyError: If the file has no ``"word2idx"`` entry.
        """
        import json

        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)

        self.word2idx = {word: int(idx) for word, idx in data["word2idx"].items()}
        self.frequencies = Counter(data.get("frequencies", {}))
        # default=-1 keeps an empty vocabulary from raising in max()
        self.next_idx = max(self.word2idx.values(), default=-1) + 1

        # The lookup methods index word2idx with the special tokens directly,
        # so every one of them must exist after loading.
        for token in self.SPECIAL_TOKENS:
            if token not in self.word2idx:
                self.word2idx[token] = self.next_idx
                self.next_idx += 1

        self.idx2word = {idx: word for word, idx in self.word2idx.items()}


# ═══════════════════════════════════════════════════════════════════════════════
# MORPHOLOGY LAYER
# ═══════════════════════════════════════════════════════════════════════════════


class MorphologyLayer:
    """Heuristic Hebrew morphology: known-root lookup plus affix stripping."""

    PREFIXES = ["ב", "כ", "ל", "מ", "ש", "ה", "ו", "וב", "וכ", "ול", "ומ", "וש", "וה"]
    SUFFIXES = [
        "ים",
        "ות",
        "ה",
        "י",
        "ך",
        "ו",
        "נו",
        "כם",
        "כן",
        "הם",
        "הן",
        "תי",
        "תם",
        "תן",
    ]

    def __init__(self):
        self.roots = HEBREW_ROOTS

    def extract_root(self, word: str) -> Tuple[Optional[str], int, int]:
        """Guess the root of ``word``.

        Returns ``(root, prefix_count, suffix_count)``. A known root that
        occurs anywhere inside the word wins immediately (counts are 0).
        Otherwise at most one prefix and one suffix are stripped, longest
        candidates first and only while more than two characters would
        remain, and the first three remaining characters are the root.
        ``root`` is None for words shorter than two characters or when fewer
        than three characters are left.
        """
        if not word or len(word) < 2:
            return None, 0, 0

        # Known roots take precedence, in the insertion order of the table.
        known = next((candidate for candidate in self.roots if candidate in word), None)
        if known is not None:
            return known, 0, 0

        stem = word
        stripped_prefix = 0
        stripped_suffix = 0

        # Longest prefix first; strip at most one.
        for affix in sorted(self.PREFIXES, key=len, reverse=True):
            if len(stem) > len(affix) + 2 and stem.startswith(affix):
                stem = stem[len(affix) :]
                stripped_prefix = 1
                break

        # Longest suffix first; strip at most one.
        for affix in sorted(self.SUFFIXES, key=len, reverse=True):
            if len(stem) > len(affix) + 2 and stem.endswith(affix):
                stem = stem[: -len(affix)]
                stripped_suffix = 1
                break

        root = stem[:3] if len(stem) >= 3 else None
        return root, stripped_prefix, stripped_suffix

    def analyze(self, words: List[str]) -> MorphologyFeatures:
        """Run ``extract_root`` on every word and aggregate the results.

        ``coverage`` is the percentage (0-100) of words with a root; it is 0
        for an empty word list.
        """
        results = [self.extract_root(token) for token in words]

        per_word_roots = [result[0] for result in results]
        prefix_total = sum(result[1] for result in results)
        suffix_total = sum(result[2] for result in results)

        located = [root for root in per_word_roots if root]
        distinct = list(set(located))
        if words:
            coverage = len(located) / len(words) * 100
        else:
            coverage = 0

        return MorphologyFeatures(
            roots=per_word_roots,
            unique_roots=distinct,
            coverage=coverage,
            prefix_count=prefix_total,
            suffix_count=suffix_total,
        )


# ═══════════════════════════════════════════════════════════════════════════════
# ENHANCED HEBREW ENCODER
# ═══════════════════════════════════════════════════════════════════════════════


class EnhancedHebrewEncoder:
    """
    Complete Hebrew encoder with all features.

    Encoding pipeline:
    1. Preprocessing (clean, tokenize)
    2. Vocabulary encoding (word → index)
    3. Gematria encoding (numerical + bigrams)
    4. Trajectory encoding (3D spherical)
    5. Morphology encoding (root extraction)
    6. τ metrics calculation
    7. Combined embedding
    """

    def __init__(
        self,
        embedding_dim: int = 256,
        trajectory_points: int = 100,
        vocab_file: Optional[str] = None,
    ):
        """Set up the sub-layers and the fixed random projection matrices.

        Args:
            embedding_dim: Length of every sub-embedding.
            trajectory_points: Number of points the word-value sequence is
                resampled to when building the spherical trajectory.
            vocab_file: Optional path of a saved vocabulary to load.
        """
        self.embedding_dim = embedding_dim
        self.trajectory_points = trajectory_points

        # Initialize layers
        self.vocab_layer = VocabularyLayer(vocab_file)
        self.morph_layer = MorphologyLayer()

        # Projection matrices.
        # A private RandomState seeded with 42 yields the same values as
        # seeding the global legacy generator, without resetting the random
        # state of the rest of the program.
        rng = np.random.RandomState(42)
        # 78 = 50 histogram bins + 20 bigram stats + 8 basic stats
        self.W_gematria = rng.randn(78, embedding_dim).astype(np.float32) * 0.1
        # The trajectory feature vector holds 3 coordinates per point plus 6
        # geometric statistics, so the row count must follow
        # trajectory_points (306 for the default of 100).
        self.W_trajectory = (
            rng.randn(3 * trajectory_points + 6, embedding_dim).astype(np.float32)
            * 0.1
        )
        # 110 = 100 root-histogram bins + 10 morphology stats
        self.W_morphology = rng.randn(110, embedding_dim).astype(np.float32) * 0.1

    def _clean_text(self, text: str) -> str:
        """Remove niqqud/cantillation marks and collapse whitespace runs to single spaces."""
        without_marks = NIQQUD_PATTERN.sub("", text)
        pieces = without_marks.split()
        return " ".join(pieces)

    def _extract_words(self, text: str) -> List[str]:
        """Return every single-script word or digit run matched by MULTILINGUAL_WORD_PATTERN."""
        tokens = MULTILINGUAL_WORD_PATTERN.findall(text)
        return tokens

    @staticmethod
    @lru_cache(maxsize=50000)
    def _word_gematria_cached(word: str) -> int:
        """Return the numeric value of ``word`` (memoised).

        The value is the sum of the per-character values plus one
        language offset for the whole word:

        - A character found in CHAR_VALUES contributes its table value.
        - A decimal digit contributes ``digit * 10``.
        - Any other character contributes ``ord(c) % 100``.
        - The script of the first character found in CHAR_TO_LANGUAGE decides
          the offset ``LANGUAGE_IDS[script] * 1000``. Hebrew has ID 0, so
          Hebrew words keep their plain gematria; Latin words get +1000,
          Cyrillic +2000, and so on. The offset is added once per word, not
          once per character.

        The result is never 0: an empty word or an all-zero sum returns 1.
        """
        total = 0
        lang_offset = 0
        detected_lang = None

        for c in word:
            # Detect language from first recognized character
            if detected_lang is None and c in CHAR_TO_LANGUAGE:
                detected_lang = CHAR_TO_LANGUAGE[c]
                lang_id = LANGUAGE_IDS.get(detected_lang, 12)
                # Hebrew (id=0) has NO offset - preserves original gematria
                # Other languages get offset = lang_id * 1000
                lang_offset = lang_id * 1000 if lang_id > 0 else 0

            if c in CHAR_VALUES:
                total += CHAR_VALUES[c]
            elif c.isdecimal():
                # isdecimal(), not isdigit(): characters such as superscript
                # or circled digits pass isdigit() but make int() raise
                # ValueError. They now fall through to the generic branch.
                total += int(c) * 10
            else:
                # Unknown character: use ord value scaled
                total += ord(c) % 100

        # Apply language offset (Hebrew stays pure, others get offset)
        total += lang_offset

        return total if total > 0 else 1  # Ensure non-zero

    @staticmethod
    def detect_language(text: str) -> str:
        """Return the script name with the most recognised characters in ``text``.

        Characters missing from CHAR_TO_LANGUAGE are ignored. Returns
        "unknown" when no character is recognised; on a tie the script seen
        first in the text wins.
        """
        tally = Counter(
            CHAR_TO_LANGUAGE[ch] for ch in text if ch in CHAR_TO_LANGUAGE
        )
        if tally:
            return max(tally, key=tally.get)
        return "unknown"

    def _word_gematria(self, word: str) -> int:
        """Return the value of ``word`` by delegating to the memoised static helper."""
        return self._word_gematria_cached(word)

    def calculate_gematria(self, text: str) -> int:
        """Return the sum of the word values of ``text`` (public API).

        The text is split with ``_extract_words``; text without any word
        yields 0.
        """
        running_total = 0
        for token in self._extract_words(text):
            running_total += self._word_gematria(token)
        return running_total

    def _entropy(self, items: List) -> float:
        if not items:
            return 0.0
        counts = Counter(items)
        total = len(items)
        probs = np.array(list(counts.values())) / total
        return float(-np.sum(probs * np.log2(probs + 1e-10)))

    def _encode_gematria(self, words: List[str]) -> Tuple[np.ndarray, GematriaFeatures]:
        """Encode ``words`` into a unit-norm gematria embedding plus raw statistics.

        The 78-element feature vector (50 histogram bins, 20 bigram slots,
        8 basic statistics) is projected through ``self.W_gematria`` and
        L2-normalised.

        Args:
            words: Tokens in text order.

        Returns:
            ``(embedding, features)``. For an empty word list the embedding is
            all zeros and the features are zero/empty.
        """
        if not words:
            return np.zeros(self.embedding_dim, dtype=np.float32), GematriaFeatures(
                histogram=np.zeros(50),
                bigram_stats=np.zeros(20),
                word_values=[],
                total=0,
                mean=0.0,
                std=0.0,
            )

        values = [self._word_gematria(w) for w in words]

        # Log-scale histogram
        # np.histogram ignores samples outside `range`, so words whose value
        # exceeds 1000 (any word carrying a language offset, and large Hebrew
        # words) are not counted in any bin.
        log_vals = np.log1p(values)
        hist, _ = np.histogram(log_vals, bins=50, range=(0, np.log1p(1000)))
        hist = hist.astype(np.float32)
        if hist.sum() > 0:
            hist /= hist.sum()

        # Bigram stats
        # Each bigram is valued as ONE concatenated word, so a language offset
        # is added once per pair (not once per word).
        bigram_vals = (
            [
                self._word_gematria(words[i] + words[i + 1])
                for i in range(len(words) - 1)
            ]
            if len(words) > 1
            else []
        )

        if bigram_vals:
            # Eight summary statistics, all scaled by 1/1000 (the bigram
            # count included), then zero-padded to a fixed length of 20.
            bigram_stats = (
                np.array(
                    [
                        np.mean(bigram_vals),
                        np.std(bigram_vals),
                        np.median(bigram_vals),
                        np.min(bigram_vals),
                        np.max(bigram_vals),
                        np.percentile(bigram_vals, 25),
                        np.percentile(bigram_vals, 75),
                        len(bigram_vals),
                    ],
                    dtype=np.float32,
                )
                / 1000
            )
            bigram_stats = np.pad(bigram_stats, (0, 20 - len(bigram_stats)))
        else:
            bigram_stats = np.zeros(20, dtype=np.float32)

        # Basic stats
        # Scaled word-value statistics, word count, total, and the share of
        # distinct values.
        basic = np.array(
            [
                np.mean(values) / 1000,
                np.std(values) / 1000,
                np.median(values) / 1000,
                np.min(values) / 1000,
                np.max(values) / 1000,
                len(words) / 100,
                sum(values) / 10000,
                len(set(values)) / max(len(values), 1),
            ],
            dtype=np.float32,
        )

        # Combine and project
        features = np.concatenate([hist, bigram_stats, basic])
        embedding = features @ self.W_gematria

        # L2-normalise unless the projection is exactly zero
        norm = np.linalg.norm(embedding)
        if norm > 0:
            embedding /= norm

        return embedding.astype(np.float32), GematriaFeatures(
            histogram=hist,
            bigram_stats=bigram_stats,
            word_values=values,
            total=sum(values),
            mean=float(np.mean(values)),
            std=float(np.std(values)),
        )

    def _encode_trajectory(
        self, words: List[str]
    ) -> Tuple[np.ndarray, TrajectoryFeatures]:
        """Encode ``words`` as a path on the unit sphere and embed its geometry.

        The word-value sequence is resampled to ``self.trajectory_points``
        samples, mapped to spherical angles, and described by the flattened
        coordinates plus six geometric statistics. That vector is projected
        through ``self.W_trajectory`` (which therefore needs
        ``3 * trajectory_points + 6`` rows) and L2-normalised.

        Returns:
            ``(embedding, features)``; zeros for an empty word list.
        """
        if not words:
            return np.zeros(self.embedding_dim, dtype=np.float32), TrajectoryFeatures(
                coords=np.zeros((self.trajectory_points, 3)),
                path_length=0.0,
                curvature=0.0,
                torsion=0.0,
                centroid=np.zeros(3),
            )

        seq = [self._word_gematria(w) for w in words]

        # Resample
        # Short sequences are linearly interpolated up to the target length;
        # long ones are subsampled at evenly spaced (truncated) indices.
        if len(seq) < self.trajectory_points:
            idx = np.linspace(0, len(seq) - 1, self.trajectory_points)
            resampled = np.interp(idx, range(len(seq)), seq)
        else:
            idx = np.linspace(0, len(seq) - 1, self.trajectory_points).astype(int)
            resampled = np.array([seq[i] for i in idx])

        # Scale by the maximum so every sample lies in (0, 1]
        normalized = resampled / (np.max(resampled) + 1e-8)

        # Spherical coordinates
        # theta (azimuth) accumulates along the sequence; phi (polar angle) is
        # kept within [0.1*pi, 0.9*pi], away from the poles.
        theta = np.cumsum(normalized) * 2 * np.pi / self.trajectory_points
        phi = normalized * np.pi * 0.8 + 0.1 * np.pi

        x = np.sin(phi) * np.cos(theta)
        y = np.sin(phi) * np.sin(theta)
        z = np.cos(phi)

        coords = np.stack([x, y, z], axis=1).astype(np.float32)

        # Geometric features
        diffs = np.diff(coords, axis=0)
        path_length = float(np.sum(np.linalg.norm(diffs, axis=1)))

        # Finite-difference derivatives along the point axis; the epsilon
        # avoids division by zero where the path is stationary.
        d1 = np.gradient(coords, axis=0)
        d2 = np.gradient(d1, axis=0)
        d1_norm = np.linalg.norm(d1, axis=1) + 1e-10
        d2_norm = np.linalg.norm(d2, axis=1)
        curvature = float(np.mean(d2_norm / (d1_norm**2)))

        # Torsion estimate: (d1 x d2) . d3 / |d1 x d2|^2, averaged in
        # absolute value
        d3 = np.gradient(d2, axis=0)
        cross = np.cross(d1, d2)
        cross_norm_sq = np.sum(cross**2, axis=1) + 1e-10
        torsion_vals = np.sum(cross * d3, axis=1) / cross_norm_sq
        torsion = float(np.mean(np.abs(torsion_vals)))

        centroid = np.mean(coords, axis=0)

        # Project
        # Six scalars: mean step length, curvature, torsion, and the spread of
        # distance-to-centroid, step length and pointwise curvature.
        geom_features = np.array(
            [
                path_length / self.trajectory_points,
                curvature,
                torsion,
                np.std(np.linalg.norm(coords - centroid, axis=1)),
                np.std(np.linalg.norm(diffs, axis=1)),
                np.std(d2_norm / (d1_norm**2)),
            ],
            dtype=np.float32,
        )

        features = np.concatenate([coords.flatten(), geom_features])
        embedding = features @ self.W_trajectory

        # L2-normalise unless the projection is exactly zero
        norm = np.linalg.norm(embedding)
        if norm > 0:
            embedding /= norm

        return embedding.astype(np.float32), TrajectoryFeatures(
            coords=coords,
            path_length=path_length,
            curvature=curvature,
            torsion=torsion,
            centroid=centroid,
        )

    def _encode_morphology(
        self, words: List[str]
    ) -> Tuple[np.ndarray, MorphologyFeatures]:
        """Encode the morphology of ``words`` into a unit-norm embedding.

        The 110-element feature vector (a 100-bin histogram of hashed roots
        plus 10 summary statistics) is projected through
        ``self.W_morphology`` and L2-normalised.

        Roots are bucketed with a SHA-256 based hash rather than the built-in
        ``hash()``: string hashes from ``hash()`` are salted per interpreter
        process, which made the histogram - and therefore the embedding -
        differ from one run to the next for the same text.

        Returns:
            ``(embedding, morphology_features)``; the embedding is all zeros
            for an empty word list.
        """
        morph = self.morph_layer.analyze(words)

        if not words:
            return np.zeros(self.embedding_dim, dtype=np.float32), morph

        # Root histogram: stable bucket in [0, 10000) per root
        root_indices = [
            int.from_bytes(
                hashlib.sha256(r.encode("utf-8", "surrogatepass")).digest()[:8],
                "big",
            )
            % 10000
            for r in morph.roots
            if r
        ]
        if root_indices:
            hist, _ = np.histogram(root_indices, bins=100, range=(0, 10000))
            hist = hist.astype(np.float32)
            if hist.sum() > 0:
                hist /= hist.sum()
        else:
            hist = np.zeros(100, dtype=np.float32)

        # Stats
        word_count = max(len(words), 1)
        word_lengths = [len(w) for w in words]
        root_lengths = [len(r) for r in morph.unique_roots]
        known_roots = sum(1 for r in morph.unique_roots if r in HEBREW_ROOTS)

        stats = np.array(
            [
                morph.coverage / 100,  # coverage is a percentage
                len(morph.unique_roots) / word_count,
                morph.prefix_count / word_count,
                morph.suffix_count / word_count,
                np.mean(word_lengths) / 10,
                np.std(word_lengths) / 5,
                np.mean(root_lengths) / 5 if root_lengths else 0,
                np.std(root_lengths) / 3 if len(root_lengths) > 1 else 0,
                # Share of distinct roots that are in the known-root table
                known_roots / max(len(morph.unique_roots), 1),
                len(words) / 100,
            ],
            dtype=np.float32,
        )

        features = np.concatenate([hist, stats])
        embedding = features @ self.W_morphology

        # L2-normalise unless the projection is exactly zero
        norm = np.linalg.norm(embedding)
        if norm > 0:
            embedding /= norm

        return embedding.astype(np.float32), morph

    # ═══════════════════════════════════════════════════════════════════════════
    # OPTIMIZED ENCODING METHODS (avoid duplicate gematria calculation)
    # ═══════════════════════════════════════════════════════════════════════════

    def _encode_gematria_optimized(
        self, words: List[str], word_values: List[int]
    ) -> Tuple[np.ndarray, GematriaFeatures]:
        """Gematria encoding that reuses already computed word values.

        Builds the same 78-element layout as ``_encode_gematria`` (50
        histogram bins, 20 bigram slots, 8 basic statistics), projects it
        through ``self.W_gematria`` and L2-normalises the result.

        Differences from ``_encode_gematria`` worth knowing:

        - A bigram is valued as the SUM of the two adjacent word values, so a
          language offset is counted twice per pair instead of once.
        - Statistics are computed on a float32 array.

        Args:
            words: Tokens in text order.
            word_values: Value of each token, aligned with ``words``.

        Returns:
            ``(embedding, features)``; zeros/empty for an empty word list.
            ``features.total`` is the exact integer sum of ``word_values``.
        """
        if not words:
            return np.zeros(self.embedding_dim, dtype=np.float32), GematriaFeatures(
                histogram=np.zeros(50),
                bigram_stats=np.zeros(20),
                word_values=[],
                total=0,
                mean=0.0,
                std=0.0,
            )

        values = word_values  # Use pre-computed values

        # Log-scale histogram (vectorized). Samples outside the range - word
        # values above 1000 - are ignored by np.histogram.
        values_arr = np.array(values, dtype=np.float32)
        log_vals = np.log1p(values_arr)
        hist, _ = np.histogram(log_vals, bins=50, range=(0, np.log1p(1000)))
        hist = hist.astype(np.float32)
        hist_sum = hist.sum()
        if hist_sum > 0:
            hist /= hist_sum

        # Bigram stats - use pre-computed values for adjacent word sums
        if len(values) > 1:
            bigram_vals = np.array(
                [left + right for left, right in zip(values, values[1:])],
                dtype=np.float32,
            )
            # Eight summary statistics scaled by 1/1000, zero-padded to 20
            bigram_stats = (
                np.array(
                    [
                        np.mean(bigram_vals),
                        np.std(bigram_vals),
                        np.median(bigram_vals),
                        np.min(bigram_vals),
                        np.max(bigram_vals),
                        np.percentile(bigram_vals, 25),
                        np.percentile(bigram_vals, 75),
                        len(bigram_vals),
                    ],
                    dtype=np.float32,
                )
                / 1000
            )
            bigram_stats = np.pad(bigram_stats, (0, 20 - len(bigram_stats)))
        else:
            bigram_stats = np.zeros(20, dtype=np.float32)

        # Basic stats (vectorized)
        basic = np.array(
            [
                values_arr.mean() / 1000,
                values_arr.std() / 1000,
                np.median(values_arr) / 1000,
                values_arr.min() / 1000,
                values_arr.max() / 1000,
                len(words) / 100,
                values_arr.sum() / 10000,
                len(np.unique(values_arr)) / len(values_arr),
            ],
            dtype=np.float32,
        )

        # Combine and project
        features = np.concatenate([hist, bigram_stats, basic])
        embedding = features @ self.W_gematria

        # L2-normalise unless the projection is exactly zero
        norm = np.linalg.norm(embedding)
        if norm > 0:
            embedding /= norm

        return embedding.astype(np.float32), GematriaFeatures(
            histogram=hist,
            bigram_stats=bigram_stats,
            word_values=values,
            # Exact integer sum: a float32 sum cannot represent every integer
            # above 2**24, which long texts can reach.
            total=int(sum(values)),
            mean=float(values_arr.mean()),
            std=float(values_arr.std()),
        )

    def _encode_trajectory_optimized(
        self, words: List[str], word_values: List[int]
    ) -> Tuple[np.ndarray, TrajectoryFeatures]:
        """Encode a word sequence as a path on the unit sphere.

        The per-word values are resampled to ``self.trajectory_points``
        samples, scaled by their maximum, mapped to spherical angles and
        converted to 3-D coordinates. The flattened coordinates plus six
        geometric statistics are projected through ``self.W_trajectory`` and
        L2-normalised.

        Args:
            words: Words of the text; only consulted for the empty-input check.
            word_values: One pre-computed numeric value per word.

        Returns:
            ``(embedding, features)`` where ``embedding`` is a float32 vector
            and ``features`` is the ``TrajectoryFeatures`` describing the
            path. Empty input yields a zero vector of length
            ``self.embedding_dim`` and all-zero features.
        """
        # Everything below reads ``word_values`` only, so guard on it as well:
        # with no values np.interp would be handed empty sample arrays and
        # raise instead of returning the neutral result.
        if not words or len(word_values) == 0:
            return np.zeros(self.embedding_dim, dtype=np.float32), TrajectoryFeatures(
                coords=np.zeros((self.trajectory_points, 3)),
                path_length=0.0,
                curvature=0.0,
                torsion=0.0,
                centroid=np.zeros(3),
            )

        seq = np.array(word_values, dtype=np.float32)  # Use pre-computed values

        # Resample to a fixed number of points: linear interpolation when the
        # text is shorter than the target, evenly spaced index picks otherwise.
        n_points = len(seq)
        if n_points < self.trajectory_points:
            idx = np.linspace(0, n_points - 1, self.trajectory_points)
            resampled = np.interp(idx, np.arange(n_points), seq)
        else:
            idx = np.linspace(0, n_points - 1, self.trajectory_points).astype(int)
            resampled = seq[idx]

        # Scale by the maximum; the epsilon avoids 0/0 when every value is 0.
        max_val = resampled.max()
        normalized = resampled / (max_val + 1e-8)

        # Spherical coordinates (fully vectorized). theta accumulates along
        # the sequence; phi is an affine map of the scaled value, which for
        # values in [0, 1] stays within [0.1*pi, 0.9*pi].
        theta = np.cumsum(normalized) * 2 * np.pi / self.trajectory_points
        phi = normalized * np.pi * 0.8 + 0.1 * np.pi

        sin_phi = np.sin(phi)
        cos_phi = np.cos(phi)
        cos_theta = np.cos(theta)
        sin_theta = np.sin(theta)

        x = sin_phi * cos_theta
        y = sin_phi * sin_theta
        z = cos_phi

        coords = np.stack([x, y, z], axis=1).astype(np.float32)

        # Path length: sum of the distances between consecutive points.
        diffs = np.diff(coords, axis=0)
        diff_norms = np.linalg.norm(diffs, axis=1)
        path_length = float(diff_norms.sum())

        # Successive finite-difference derivatives along the path.
        d1 = np.gradient(coords, axis=0)
        d2 = np.gradient(d1, axis=0)
        d3 = np.gradient(d2, axis=0)

        # Per-point curvature measure |r''| / |r'|^2 (epsilon guards a zero
        # first derivative). Computed once and reused for mean and spread.
        d1_norm = np.linalg.norm(d1, axis=1) + 1e-10
        d2_norm = np.linalg.norm(d2, axis=1)
        pointwise_curvature = d2_norm / (d1_norm**2)
        curvature = float(np.mean(pointwise_curvature))

        # Torsion magnitude |(r' x r'') . r'''| / |r' x r''|^2, averaged.
        cross = np.cross(d1, d2)
        cross_norm_sq = np.sum(cross**2, axis=1) + 1e-10
        torsion_vals = np.abs(np.sum(cross * d3, axis=1) / cross_norm_sq)
        torsion = float(torsion_vals.mean())

        centroid = coords.mean(axis=0)

        # Six scalar descriptors appended to the raw coordinates.
        geom_features = np.array(
            [
                path_length / self.trajectory_points,
                curvature,
                torsion,
                np.std(np.linalg.norm(coords - centroid, axis=1)),
                np.std(diff_norms),
                np.std(pointwise_curvature),
            ],
            dtype=np.float32,
        )

        # Project into the embedding space and L2-normalise.
        features = np.concatenate([coords.flatten(), geom_features])
        embedding = features @ self.W_trajectory

        norm = np.linalg.norm(embedding)
        if norm > 0:
            embedding /= norm

        return embedding.astype(np.float32), TrajectoryFeatures(
            coords=coords,
            path_length=path_length,
            curvature=curvature,
            torsion=torsion,
            centroid=centroid,
        )

    def _calculate_tau(self, words: List[str]) -> TAUMetrics:
        """Compute the τ complexity metrics for a list of words.

        * complexity: distinct-word count relative to the Heaps' Law
          expectation ``HEAPS_K * n**HEAPS_BETA``, as a percentage capped
          at 200.
        * activity: character entropy (from ``self._entropy``) multiplied by
          ``log2(mean word length + 1)``.
        * coherence: ``1 - min(2 * std, 1)`` of the distinct-word ratio over
          sliding 5-word windows; 0.5 when fewer than 5 words are available.
        * tau: ``complexity * activity * (1 - coherence) / 100``.

        All four values are rounded to 4 decimals. An empty word list yields
        all zeros.
        """
        if not words:
            return TAUMetrics(tau=0.0, complexity=0.0, activity=0.0, coherence=0.0)

        total_words = len(words)
        distinct_words = len(set(words))

        # Complexity: observed vocabulary vs. Heaps' Law prediction.
        heaps_expected = HEAPS_K * (total_words**HEAPS_BETA)
        vocabulary_pct = (distinct_words / max(heaps_expected, 1)) * 100
        complexity = min(vocabulary_pct, 200)

        # Activity: character-level entropy scaled by word length.
        char_entropy = self._entropy(list("".join(words)))
        mean_word_len = np.mean([len(word) for word in words])
        activity = char_entropy * np.log2(mean_word_len + 1)

        # Coherence: stability of lexical variety across sliding windows.
        span = 5
        if total_words < span:
            coherence = 0.5
        else:
            variety = [
                len(set(words[start : start + span])) / span
                for start in range(total_words - span + 1)
            ]
            coherence = 1 - min(np.std(variety) * 2, 1)

        tau = complexity * activity * (1 - coherence) / 100

        return TAUMetrics(
            tau=round(tau, 4),
            complexity=round(complexity, 4),
            activity=round(activity, 4),
            coherence=round(coherence, 4),
        )

    def _encode_language_signature(self, text: str) -> np.ndarray:
        """
        Create language signature embedding.

        Uses SIGNED values to separate languages in embedding space:
        - Hebrew: POSITIVE values (preserves original gematria semantics)
        - Other languages: NEGATIVE values (clear separation)

        Each slot holds the share of recognised characters (those present in
        ``CHAR_TO_LANGUAGE``) that belong to that language. Languages without
        an entry in ``LANGUAGE_IDS`` are pooled into the "unknown" slot.

        Args:
            text: Text whose characters are classified by language.

        Returns:
            float32 vector of length ``len(LANGUAGE_IDS)``; all zeros when no
            character of ``text`` is recognised.
        """
        # Count recognised characters per language
        lang_counts = Counter(
            CHAR_TO_LANGUAGE[c] for c in text if c in CHAR_TO_LANGUAGE
        )
        total_chars = sum(lang_counts.values())

        lang_vec = np.zeros(len(LANGUAGE_IDS), dtype=np.float32)
        if total_chars == 0:
            return lang_vec

        unknown_id = LANGUAGE_IDS["unknown"]
        for lang, count in lang_counts.items():
            lang_id = LANGUAGE_IDS.get(lang, unknown_id)
            proportion = count / total_chars

            # Accumulate rather than assign: several languages can resolve to
            # the same slot (the "unknown" fallback), and plain assignment
            # would keep only the last one seen.
            # Hebrew (id=0) gets POSITIVE values
            # All other languages get NEGATIVE values
            if lang_id == 0:  # Hebrew
                lang_vec[lang_id] += proportion  # Positive
            else:
                lang_vec[lang_id] -= proportion  # Negative

        return lang_vec

    def encode(self, text: str, domain: str = "default") -> EncodingResult:
        """
        Encode multilingual text with language-aware embedding.

        The text is cleaned and split into words, each channel (gematria,
        trajectory, morphology) is encoded, the channel embeddings are mixed
        with the domain weights, the signed language signature is added to
        the trailing dimensions, and the result is L2-normalised.

        Args:
            text: Text to encode (Hebrew, English, or any supported language)
            domain: Domain for weight adjustment; unknown domains fall back
                to the "default" entry of ``DOMAIN_WEIGHTS``.

        Returns:
            EncodingResult with all features including language signature.
            ``metadata`` holds the cleaned text length, word count, domain,
            a copy of the weights used, and the MD5 hex digest of the
            cleaned text.
        """
        text = self._clean_text(text)
        words = self._extract_words(text)
        weights = DOMAIN_WEIGHTS.get(domain, DOMAIN_WEIGHTS["default"])

        # OPTIMIZATION: Calculate gematria values once, reuse everywhere
        word_values = [self._word_gematria(w) for w in words]

        # Encode all channels with pre-computed gematria values
        gematria_emb, gematria_feat = self._encode_gematria_optimized(
            words, word_values
        )
        traj_emb, traj_feat = self._encode_trajectory_optimized(words, word_values)
        morph_emb, morph_feat = self._encode_morphology(words)

        # Language signature (SIGNED: Hebrew=positive, others=negative)
        lang_signature = self._encode_language_signature(text)

        # Vocabulary: indices plus how many of them are the <UNK> token
        indices, coverage = self.vocab_layer.encode(text)
        unk_idx = self.vocab_layer.word2idx["<UNK>"]
        oov_count = sum(1 for idx in indices if idx == unk_idx)
        vocab_feat = VocabularyFeatures(
            indices=indices, coverage=coverage, oov_count=oov_count
        )

        # τ metrics
        tau_metrics = self._calculate_tau(words)

        # Combined embedding (before language signature)
        base_embedding = (
            weights["gematria"] * gematria_emb
            + weights["trajectory"] * traj_emb
            + weights["morphology"] * morph_emb
        )

        # Inject language signature into last dimensions
        # This creates SEPARATION in embedding space:
        # - Hebrew texts cluster together (positive values)
        # - Non-Hebrew texts cluster separately (negative values)
        num_lang_dims = len(LANGUAGE_IDS)
        embedding = base_embedding.copy()
        embedding[-num_lang_dims:] += lang_signature * 0.3  # 30% influence

        norm = np.linalg.norm(embedding)
        if norm > 0:
            embedding /= norm

        metadata = {
            "text_length": len(text),
            "word_count": len(words),
            "domain": domain,
            # Copy so that callers editing the result's metadata cannot alter
            # the shared DOMAIN_WEIGHTS entry used by later encodes.
            "weights": dict(weights),
            "hash": hashlib.md5(text.encode()).hexdigest(),
        }

        return EncodingResult(
            embedding=embedding,
            gematria=gematria_feat,
            trajectory=traj_feat,
            morphology=morph_feat,
            vocabulary=vocab_feat,
            tau_metrics=tau_metrics,
            metadata=metadata,
        )

    def encode_batch(
        self, texts: List[str], domain: str = "default"
    ) -> List[EncodingResult]:
        """Encode every text with the same domain, preserving input order."""
        encoded = []
        for item in texts:
            encoded.append(self.encode(item, domain))
        return encoded

    def similarity(self, vec1: np.ndarray, vec2: np.ndarray) -> float:
        """Return the cosine similarity of two vectors.

        If either vector has zero norm the similarity is defined as 0.0.
        """
        magnitude_a = np.linalg.norm(vec1)
        magnitude_b = np.linalg.norm(vec2)
        # A zero-length vector has no direction; avoid dividing by zero.
        if not (magnitude_a and magnitude_b):
            return 0.0
        cosine = np.dot(vec1, vec2) / (magnitude_a * magnitude_b)
        return float(cosine)

    def find_similar(
        self, query: str, corpus: List[str], domain: str = "default", top_k: int = 10
    ) -> List[Tuple[int, float, str]]:
        """Rank corpus texts by embedding similarity to the query.

        Args:
            query: Text to compare against.
            corpus: Candidate texts; each one is encoded on every call.
            domain: Domain passed to ``encode`` for query and candidates.
            top_k: Number of leading results to keep.

        Returns:
            ``(corpus_index, similarity, preview)`` tuples in descending
            similarity order, where ``preview`` is the text itself or its
            first 100 characters followed by "..." for longer texts.
        """
        reference = self.encode(query, domain).embedding

        def _score(position: int, document: str) -> Tuple[int, float, str]:
            candidate = self.encode(document, domain).embedding
            score = self.similarity(reference, candidate)
            if len(document) > 100:
                snippet = document[:100] + "..."
            else:
                snippet = document
            return (position, score, snippet)

        scored = [_score(position, document) for position, document in enumerate(corpus)]
        ranked = sorted(scored, key=lambda entry: entry[1], reverse=True)
        return ranked[:top_k]


# Alias for backward compatibility: ``HebrewEncoder`` is bound to the very
# same class object as ``EnhancedHebrewEncoder``, so either name can be used.
HebrewEncoder = EnhancedHebrewEncoder


# ═══════════════════════════════════════════════════════════════════════════════
# DEMO
# ═══════════════════════════════════════════════════════════════════════════════

if __name__ == "__main__":
    # Demo: encode one Hebrew legal sentence and print a summary of every
    # feature group carried by the EncodingResult.
    rule = "=" * 70
    print(rule, "TAU Platform v4.0 - Enhanced Hebrew Encoder Demo", rule, "", sep="\n")

    encoder = EnhancedHebrewEncoder()

    text = "בית המשפט העליון פסק כי הערעור יתקבל. השופטים קבעו פה אחד כי יש לבטל את ההחלטה."

    print(f"Text: {text}", "", sep="\n")

    result = encoder.encode(text, domain="hebrew_legal")

    # Final combined embedding
    vector = result.embedding
    print(
        f"Embedding shape: {vector.shape}",
        f"Embedding norm: {np.linalg.norm(vector):.4f}",
        "",
        sep="\n",
    )

    # τ complexity metrics
    tau_info = result.tau_metrics
    print(
        "τ Metrics:",
        f"  τ: {tau_info.tau}",
        f"  Complexity: {tau_info.complexity}",
        f"  Activity: {tau_info.activity}",
        f"  Coherence: {tau_info.coherence}",
        "",
        sep="\n",
    )

    # Gematria channel
    gematria_info = result.gematria
    print(
        "Gematria:",
        f"  Total: {gematria_info.total}",
        f"  Mean: {gematria_info.mean:.2f}",
        "",
        sep="\n",
    )

    # Trajectory channel
    trajectory_info = result.trajectory
    print(
        "Trajectory:",
        f"  Path length: {trajectory_info.path_length:.4f}",
        f"  Curvature: {trajectory_info.curvature:.4f}",
        "",
        sep="\n",
    )

    # Morphology channel
    morphology_info = result.morphology
    print(
        "Morphology:",
        f"  Coverage: {morphology_info.coverage:.1f}%",
        f"  Unique roots: {morphology_info.unique_roots[:5]}",
        "",
        sep="\n",
    )

    # Vocabulary channel
    vocabulary_info = result.vocabulary
    print(
        "Vocabulary:",
        f"  Coverage: {vocabulary_info.coverage:.1%}",
        f"  OOV count: {vocabulary_info.oov_count}",
        "",
        sep="\n",
    )

    print(rule, "✅ Demo Complete!", rule, sep="\n")