""" ═══════════════════════════════════════════════════════════════════════════════ TAU Platform v4.0 - Enhanced Hebrew Encoder ═══════════════════════════════════════════════════════════════════════════════ Unified Hebrew text encoder combining: 1. Gematria encoding (log-scale + bigrams + position weights) 2. Spherical trajectory encoding (with curvature/torsion) 3. Morphological analysis (root extraction) 4. τ complexity metrics (Heaps' Law adjusted) 5. Vocabulary encoding (word-to-index) 6. Compression integration Author: Avri Barzel Date: November 2025 ═══════════════════════════════════════════════════════════════════════════════ """ import numpy as np import re import hashlib from typing import Dict, List, Tuple, Optional, Any from dataclasses import dataclass, field from collections import Counter from functools import lru_cache # ═══════════════════════════════════════════════════════════════════════════════ # PRE-COMPILED REGEX PATTERNS (Performance optimization) # ═══════════════════════════════════════════════════════════════════════════════ NIQQUD_PATTERN = re.compile(r"[\u0591-\u05C7]") HEBREW_WORD_PATTERN = re.compile(r"[\u05D0-\u05EA]+") # Universal multilingual word pattern - supports ALL major scripts MULTILINGUAL_WORD_PATTERN = re.compile( r"[\u05D0-\u05EA]+" # Hebrew r"|[a-zA-Z]+" # Latin/English r"|[\u0400-\u04FF]+" # Cyrillic (Russian, etc.) r"|[\u0600-\u06FF]+" # Arabic r"|[\u4E00-\u9FFF]+" # Chinese (CJK) r"|[\u3040-\u309F]+" # Japanese Hiragana r"|[\u30A0-\u30FF]+" # Japanese Katakana r"|[\uAC00-\uD7AF]+" # Korean Hangul r"|[\u0900-\u097F]+" # Hindi/Devanagari r"|[\u0370-\u03FF]+" # Greek r"|[\u0E00-\u0E7F]+" # Thai r"|[\u0590-\u05FF]+" # Hebrew extended r"|\d+" # Numbers ) WORD_OR_PUNCT_PATTERN = re.compile(r"[\w]+|[^\w\s]") # ═══════════════════════════════════════════════════════════════════════════════ # CONSTANTS - UNIVERSAL CHARACTER VALUES # ═══════════════════════════════════════════════════════════════════════════════ # Hebrew Gematria (original, 1-400) GEMATRIA = { "א": 1, "ב": 2, "ג": 3, "ד": 4, "ה": 5, "ו": 6, "ז": 7, "ח": 8, "ט": 9, "י": 10, "כ": 20, "ך": 20, "ל": 30, "מ": 40, "ם": 40, "נ": 50, "ן": 50, "ס": 60, "ע": 70, "פ": 80, "ף": 80, "צ": 90, "ץ": 90, "ק": 100, "ר": 200, "ש": 300, "ת": 400, } # English/Latin letters (scaled to 1-400 range, frequency-aware) LATIN_VALUES = { # Lowercase - frequency weighted (common letters = higher values) "e": 400, "t": 380, "a": 360, "o": 340, "i": 320, "n": 300, "s": 280, "h": 260, "r": 240, "d": 220, "l": 200, "c": 180, "u": 160, "m": 140, "w": 120, "f": 100, "g": 90, "y": 80, "p": 70, "b": 60, "v": 50, "k": 40, "j": 30, "x": 20, "q": 15, "z": 10, } # Add uppercase with same values LATIN_VALUES.update({c.upper(): v for c, v in LATIN_VALUES.items()}) # Cyrillic (Russian, Ukrainian, etc.) - mapped to similar Hebrew values CYRILLIC_VALUES = { # Russian alphabet (33 letters) - scaled to 1-400 "а": 1, "б": 15, "в": 30, "г": 45, "д": 60, "е": 75, "ё": 80, "ж": 95, "з": 110, "и": 125, "й": 140, "к": 155, "л": 170, "м": 185, "н": 200, "о": 215, "п": 230, "р": 245, "с": 260, "т": 275, "у": 290, "ф": 305, "х": 320, "ц": 335, "ч": 350, "ш": 365, "щ": 375, "ъ": 380, "ы": 385, "ь": 390, "э": 393, "ю": 396, "я": 400, } # Add uppercase CYRILLIC_VALUES.update({c.upper(): v for c, v in CYRILLIC_VALUES.items()}) # Arabic letters - mapped to similar range as Hebrew ARABIC_VALUES = { "ا": 1, "ب": 15, "ت": 30, "ث": 45, "ج": 60, "ح": 75, "خ": 90, "د": 105, "ذ": 120, "ر": 135, "ز": 150, "س": 165, "ش": 180, "ص": 195, "ض": 210, "ط": 225, "ظ": 240, "ع": 255, "غ": 270, "ف": 285, "ق": 300, "ك": 315, "ل": 330, "م": 345, "ن": 360, "ه": 375, "و": 385, "ي": 400, } # Greek letters - scientific/math importance GREEK_VALUES = { "α": 1, "β": 15, "γ": 30, "δ": 45, "ε": 60, "ζ": 75, "η": 90, "θ": 105, "ι": 120, "κ": 135, "λ": 150, "μ": 165, "ν": 180, "ξ": 195, "ο": 210, "π": 314, "ρ": 240, "σ": 255, "τ": 270, "υ": 285, "φ": 300, "χ": 315, "ψ": 330, "ω": 400, } # Add uppercase GREEK_VALUES.update({c.upper(): v for c, v in GREEK_VALUES.items()}) # Special: π gets special value (3.14...) GREEK_VALUES["Π"] = 314 # Hindi/Devanagari - 46 primary characters DEVANAGARI_VALUES = {chr(0x0905 + i): (i + 1) * 8 for i in range(50)} # अ to ह # Chinese/CJK - use stroke count approximation (1-400 range) # Common characters mapped by frequency CJK_COMMON = { "的": 400, "一": 1, "是": 380, "不": 360, "了": 340, "在": 320, "人": 300, "有": 280, "我": 260, "他": 240, "这": 220, "个": 200, "们": 180, "中": 160, "来": 140, "上": 120, "大": 100, "为": 80, "和": 60, "国": 40, "地": 20, } # Japanese Hiragana (46 characters) HIRAGANA_VALUES = {chr(0x3041 + i): (i + 1) * 8 for i in range(83)} # ぁ to ん # Japanese Katakana (46 characters) KATAKANA_VALUES = {chr(0x30A1 + i): (i + 1) * 8 for i in range(83)} # ァ to ン # Korean Hangul - use syllable block value # Basic Jamo (consonants and vowels) HANGUL_JAMO = {chr(0x1100 + i): (i + 1) * 10 for i in range(40)} # ᄀ to ᄒ # Thai alphabet (44 consonants + vowels) THAI_VALUES = {chr(0x0E01 + i): (i + 1) * 8 for i in range(58)} # ก to ฮ # Mathematical symbols with semantic values MATH_SYMBOLS = { # Operators "+": 100, "-": 100, "*": 150, "/": 150, "=": 200, "×": 150, "÷": 150, "±": 100, "∓": 100, # Comparison "<": 75, ">": 75, "≤": 80, "≥": 80, "≠": 85, "≈": 90, "≡": 95, # Brackets "(": 50, ")": 50, "[": 50, "]": 50, "{": 50, "}": 50, "⟨": 50, "⟩": 50, "|": 50, # Punctuation ".": 25, ",": 25, ":": 25, ";": 25, "?": 50, "!": 50, "@": 75, "#": 75, # Math symbols "%": 125, "^": 175, "√": 200, "∞": 400, "∑": 300, "∏": 280, "∫": 350, "∂": 250, "∇": 270, "∆": 260, # Logic "∧": 150, "∨": 150, "¬": 100, "→": 180, "↔": 190, "∀": 200, "∃": 200, # Set theory "∈": 160, "∉": 165, "⊂": 170, "⊃": 170, "∪": 180, "∩": 180, "∅": 50, # Currency "$": 200, "€": 200, "₪": 200, "£": 200, "¥": 200, "₹": 200, "₽": 200, # Programming "&": 100, "|": 100, "~": 80, "`": 30, "_": 40, "\\": 60, } # Programming/Code symbols CODE_SYMBOLS = { "==": 200, "!=": 200, "<=": 200, ">=": 200, "&&": 150, "||": 150, "++": 120, "--": 120, "->": 180, "=>": 180, "::": 160, "...": 100, } # Language identifiers for embedding separation # Each language gets a unique ID that affects embedding dimensions LANGUAGE_IDS = { "hebrew": 0, # Primary - גימטריה מקורית "latin": 1, # English/Latin "cyrillic": 2, # Russian etc. "arabic": 3, # Arabic "greek": 4, # Greek "devanagari": 5, # Hindi "cjk": 6, # Chinese "hiragana": 7, # Japanese "katakana": 8, # Japanese "hangul": 9, # Korean "thai": 10, # Thai "math": 11, # Mathematical symbols "unknown": 12, # Unknown scripts } # Character to language mapping (for embedding separation) CHAR_TO_LANGUAGE = {} for c in GEMATRIA: CHAR_TO_LANGUAGE[c] = "hebrew" for c in LATIN_VALUES: CHAR_TO_LANGUAGE[c] = "latin" for c in CYRILLIC_VALUES: CHAR_TO_LANGUAGE[c] = "cyrillic" for c in ARABIC_VALUES: CHAR_TO_LANGUAGE[c] = "arabic" for c in GREEK_VALUES: CHAR_TO_LANGUAGE[c] = "greek" for c in DEVANAGARI_VALUES: CHAR_TO_LANGUAGE[c] = "devanagari" for c in CJK_COMMON: CHAR_TO_LANGUAGE[c] = "cjk" for c in HIRAGANA_VALUES: CHAR_TO_LANGUAGE[c] = "hiragana" for c in KATAKANA_VALUES: CHAR_TO_LANGUAGE[c] = "katakana" for c in HANGUL_JAMO: CHAR_TO_LANGUAGE[c] = "hangul" for c in THAI_VALUES: CHAR_TO_LANGUAGE[c] = "thai" for c in MATH_SYMBOLS: CHAR_TO_LANGUAGE[c] = "math" # Combined character values (ALL languages + Math + Code) # Hebrew gematria values are PRESERVED as-is (1-400) # Other languages use OFFSET ranges to avoid collision CHAR_VALUES = { **GEMATRIA, # Hebrew (priority - original 1-400) **LATIN_VALUES, # English/Latin **CYRILLIC_VALUES, # Russian etc. **ARABIC_VALUES, # Arabic **GREEK_VALUES, # Greek **DEVANAGARI_VALUES, # Hindi **CJK_COMMON, # Chinese common **HIRAGANA_VALUES, # Japanese **KATAKANA_VALUES, # Japanese **HANGUL_JAMO, # Korean **THAI_VALUES, # Thai **MATH_SYMBOLS, # Math } LETTER_COORDS = { "א": (0.05, 0.85), "ה": (0.10, 0.95), "ח": (0.15, 0.45), "ע": (0.08, 0.55), "ג": (0.30, 0.25), "י": (0.35, 0.75), "כ": (0.38, 0.50), "ך": (0.38, 0.20), "ק": (0.32, 0.40), "ד": (0.50, 0.45), "ט": (0.52, 0.20), "ת": (0.55, 0.70), "ז": (0.58, 0.25), "ס": (0.54, 0.35), "צ": (0.56, 0.30), "ץ": (0.56, 0.10), "ש": (0.53, 0.65), "ל": (0.62, 0.80), "נ": (0.65, 0.55), "ן": (0.65, 0.25), "ר": (0.68, 0.70), "ב": (0.85, 0.60), "ו": (0.88, 0.90), "מ": (0.90, 0.75), "ם": (0.90, 0.35), "פ": (0.92, 0.40), "ף": (0.92, 0.15), } HEBREW_ROOTS = { "שפט": "judge", "דין": "law", "חוק": "statute", "עבר": "violate", "טען": "claim", "ערר": "appeal", "פסק": "rule", "תבע": "sue", "זכה": "acquit", "חייב": "convict", "ענש": "punish", "קנס": "fine", "חתם": "sign", "הסכם": "agree", "בטל": "cancel", "תקף": "valid", "כתב": "write", "קרא": "read", "אמר": "say", "דבר": "speak", "הלך": "walk", "בוא": "come", "ראה": "see", "שמע": "hear", "ידע": "know", "חשב": "think", "רצה": "want", "יכל": "can", "נתן": "give", "לקח": "take", "עשה": "make", "היה": "be", } DOMAIN_WEIGHTS = { "hebrew_legal": {"gematria": 0.50, "trajectory": 0.30, "morphology": 0.20}, "hebrew_medical": {"gematria": 0.40, "trajectory": 0.30, "morphology": 0.30}, "hebrew_general": {"gematria": 0.40, "trajectory": 0.40, "morphology": 0.20}, "default": {"gematria": 0.45, "trajectory": 0.35, "morphology": 0.20}, } HEAPS_K = 10.0 HEAPS_BETA = 0.5 # ═══════════════════════════════════════════════════════════════════════════════ # DATA CLASSES # ═══════════════════════════════════════════════════════════════════════════════ @dataclass class GematriaFeatures: histogram: np.ndarray bigram_stats: np.ndarray word_values: List[int] total: int mean: float std: float @dataclass class TrajectoryFeatures: coords: np.ndarray path_length: float curvature: float torsion: float centroid: np.ndarray @property def arc_length(self) -> float: """Alias for path_length to maintain consistency with multimodal encoders.""" return self.path_length @dataclass class MorphologyFeatures: roots: List[Optional[str]] unique_roots: List[str] coverage: float prefix_count: int suffix_count: int @dataclass class TAUMetrics: tau: float complexity: float activity: float coherence: float @dataclass class VocabularyFeatures: indices: List[int] coverage: float oov_count: int @dataclass class EncodingResult: embedding: np.ndarray gematria: GematriaFeatures trajectory: TrajectoryFeatures morphology: MorphologyFeatures vocabulary: VocabularyFeatures tau_metrics: TAUMetrics metadata: Dict[str, Any] # ═══════════════════════════════════════════════════════════════════════════════ # VOCABULARY LAYER # ═══════════════════════════════════════════════════════════════════════════════ class VocabularyLayer: """Word-to-index mapping with OOV handling""" SPECIAL_TOKENS = ["", "", "", "", "", ""] def __init__(self, vocab_file: Optional[str] = None): self.word2idx: Dict[str, int] = {} self.idx2word: Dict[int, str] = {} self.frequencies: Counter = Counter() # Add special tokens for i, token in enumerate(self.SPECIAL_TOKENS): self.word2idx[token] = i self.idx2word[i] = token self.next_idx = len(self.SPECIAL_TOKENS) if vocab_file: self.load(vocab_file) def add_word(self, word: str) -> int: word = word.lower().strip() if not word: return self.word2idx[""] self.frequencies[word] += 1 if word not in self.word2idx: self.word2idx[word] = self.next_idx self.idx2word[self.next_idx] = word self.next_idx += 1 return self.word2idx[word] def get_index(self, word: str) -> int: word = word.lower().strip() if not word: return self.word2idx[""] if word.isdigit(): return self.word2idx[""] if not word.isalnum(): return self.word2idx[""] return self.word2idx.get(word, self.word2idx[""]) def encode(self, text: str) -> Tuple[List[int], float]: words = WORD_OR_PUNCT_PATTERN.findall(text) # Use pre-compiled pattern indices = [] oov_count = 0 unk_idx = self.word2idx[""] # Cache lookup for word in words: idx = self.get_index(word) indices.append(idx) if idx == unk_idx: oov_count += 1 coverage = 1 - (oov_count / len(words)) if words else 1.0 return indices, coverage def size(self) -> int: return len(self.word2idx) def save(self, path: str): import json data = { "word2idx": self.word2idx, "frequencies": dict(self.frequencies.most_common(50000)), } with open(path, "w", encoding="utf-8") as f: json.dump(data, f, ensure_ascii=False) def load(self, path: str): import json with open(path, "r", encoding="utf-8") as f: data = json.load(f) self.word2idx = data["word2idx"] self.idx2word = {int(v): k for k, v in self.word2idx.items()} self.frequencies = Counter(data.get("frequencies", {})) self.next_idx = max(self.word2idx.values()) + 1 # ═══════════════════════════════════════════════════════════════════════════════ # MORPHOLOGY LAYER # ═══════════════════════════════════════════════════════════════════════════════ class MorphologyLayer: """Hebrew morphological analysis""" PREFIXES = ["ב", "כ", "ל", "מ", "ש", "ה", "ו", "וב", "וכ", "ול", "ומ", "וש", "וה"] SUFFIXES = [ "ים", "ות", "ה", "י", "ך", "ו", "נו", "כם", "כן", "הם", "הן", "תי", "תם", "תן", ] def __init__(self): self.roots = HEBREW_ROOTS def extract_root(self, word: str) -> Tuple[Optional[str], int, int]: if not word or len(word) < 2: return None, 0, 0 # Check known roots for root in self.roots: if root in word: return root, 0, 0 prefix_count = 0 suffix_count = 0 cleaned = word # Remove prefixes for prefix in sorted(self.PREFIXES, key=len, reverse=True): if cleaned.startswith(prefix) and len(cleaned) > len(prefix) + 2: cleaned = cleaned[len(prefix) :] prefix_count += 1 break # Remove suffixes for suffix in sorted(self.SUFFIXES, key=len, reverse=True): if cleaned.endswith(suffix) and len(cleaned) > len(suffix) + 2: cleaned = cleaned[: -len(suffix)] suffix_count += 1 break if len(cleaned) >= 3: return cleaned[:3], prefix_count, suffix_count return None, prefix_count, suffix_count def analyze(self, words: List[str]) -> MorphologyFeatures: roots = [] total_prefixes = 0 total_suffixes = 0 for word in words: root, pref, suff = self.extract_root(word) roots.append(root) total_prefixes += pref total_suffixes += suff found = [r for r in roots if r] unique = list(set(found)) coverage = len(found) / len(words) * 100 if words else 0 return MorphologyFeatures( roots=roots, unique_roots=unique, coverage=coverage, prefix_count=total_prefixes, suffix_count=total_suffixes, ) # ═══════════════════════════════════════════════════════════════════════════════ # ENHANCED HEBREW ENCODER # ═══════════════════════════════════════════════════════════════════════════════ class EnhancedHebrewEncoder: """ Complete Hebrew encoder with all features. Encoding pipeline: 1. Preprocessing (clean, tokenize) 2. Vocabulary encoding (word → index) 3. Gematria encoding (numerical + bigrams) 4. Trajectory encoding (3D spherical) 5. Morphology encoding (root extraction) 6. τ metrics calculation 7. Combined embedding """ def __init__( self, embedding_dim: int = 256, trajectory_points: int = 100, vocab_file: Optional[str] = None, ): self.embedding_dim = embedding_dim self.trajectory_points = trajectory_points # Initialize layers self.vocab_layer = VocabularyLayer(vocab_file) self.morph_layer = MorphologyLayer() # Projection matrices np.random.seed(42) self.W_gematria = np.random.randn(78, embedding_dim).astype(np.float32) * 0.1 self.W_trajectory = np.random.randn(306, embedding_dim).astype(np.float32) * 0.1 self.W_morphology = np.random.randn(110, embedding_dim).astype(np.float32) * 0.1 def _clean_text(self, text: str) -> str: text = NIQQUD_PATTERN.sub("", text) # Use pre-compiled pattern return " ".join(text.split()) def _extract_words(self, text: str) -> List[str]: """Extract words from text - supports Hebrew, English, and numbers.""" return MULTILINGUAL_WORD_PATTERN.findall(text) @staticmethod @lru_cache(maxsize=50000) def _word_gematria_cached(word: str) -> int: """ Cached character value calculation with LANGUAGE SEPARATION. Hebrew gematria is PRESERVED (1-400). Other languages get OFFSET values to avoid collision: - Hebrew: 1-400 (original gematria) - Latin: 1000 + value (1010-1400) - Cyrillic: 2000 + value (2001-2400) - Arabic: 3000 + value (3001-3400) - Greek: 4000 + value (4001-4400) - etc. This ensures Hebrew gematria uniqueness is preserved. """ total = 0 lang_offset = 0 detected_lang = None for c in word: # Detect language from first recognized character if detected_lang is None and c in CHAR_TO_LANGUAGE: detected_lang = CHAR_TO_LANGUAGE[c] lang_id = LANGUAGE_IDS.get(detected_lang, 12) # Hebrew (id=0) has NO offset - preserves original gematria # Other languages get offset = lang_id * 1000 lang_offset = lang_id * 1000 if lang_id > 0 else 0 if c in CHAR_VALUES: total += CHAR_VALUES[c] elif c.isdigit(): # Numbers: use digit value * position weight total += int(c) * 10 else: # Unknown character: use ord value scaled total += ord(c) % 100 # Apply language offset (Hebrew stays pure, others get offset) total += lang_offset return total if total > 0 else 1 # Ensure non-zero @staticmethod def detect_language(text: str) -> str: """Detect primary language of text based on character distribution.""" lang_counts = {} for c in text: if c in CHAR_TO_LANGUAGE: lang = CHAR_TO_LANGUAGE[c] lang_counts[lang] = lang_counts.get(lang, 0) + 1 if not lang_counts: return "unknown" return max(lang_counts, key=lang_counts.get) def _word_gematria(self, word: str) -> int: return self._word_gematria_cached(word) def calculate_gematria(self, text: str) -> int: """Calculate gematria value for Hebrew text (public API).""" words = self._extract_words(text) if not words: return 0 return sum(self._word_gematria(w) for w in words) def _entropy(self, items: List) -> float: if not items: return 0.0 counts = Counter(items) total = len(items) probs = np.array(list(counts.values())) / total return float(-np.sum(probs * np.log2(probs + 1e-10))) def _encode_gematria(self, words: List[str]) -> Tuple[np.ndarray, GematriaFeatures]: if not words: return np.zeros(self.embedding_dim, dtype=np.float32), GematriaFeatures( histogram=np.zeros(50), bigram_stats=np.zeros(20), word_values=[], total=0, mean=0.0, std=0.0, ) values = [self._word_gematria(w) for w in words] # Log-scale histogram log_vals = np.log1p(values) hist, _ = np.histogram(log_vals, bins=50, range=(0, np.log1p(1000))) hist = hist.astype(np.float32) if hist.sum() > 0: hist /= hist.sum() # Bigram stats bigram_vals = ( [ self._word_gematria(words[i] + words[i + 1]) for i in range(len(words) - 1) ] if len(words) > 1 else [] ) if bigram_vals: bigram_stats = ( np.array( [ np.mean(bigram_vals), np.std(bigram_vals), np.median(bigram_vals), np.min(bigram_vals), np.max(bigram_vals), np.percentile(bigram_vals, 25), np.percentile(bigram_vals, 75), len(bigram_vals), ], dtype=np.float32, ) / 1000 ) bigram_stats = np.pad(bigram_stats, (0, 20 - len(bigram_stats))) else: bigram_stats = np.zeros(20, dtype=np.float32) # Basic stats basic = np.array( [ np.mean(values) / 1000, np.std(values) / 1000, np.median(values) / 1000, np.min(values) / 1000, np.max(values) / 1000, len(words) / 100, sum(values) / 10000, len(set(values)) / max(len(values), 1), ], dtype=np.float32, ) # Combine and project features = np.concatenate([hist, bigram_stats, basic]) embedding = features @ self.W_gematria norm = np.linalg.norm(embedding) if norm > 0: embedding /= norm return embedding.astype(np.float32), GematriaFeatures( histogram=hist, bigram_stats=bigram_stats, word_values=values, total=sum(values), mean=float(np.mean(values)), std=float(np.std(values)), ) def _encode_trajectory( self, words: List[str] ) -> Tuple[np.ndarray, TrajectoryFeatures]: if not words: return np.zeros(self.embedding_dim, dtype=np.float32), TrajectoryFeatures( coords=np.zeros((self.trajectory_points, 3)), path_length=0.0, curvature=0.0, torsion=0.0, centroid=np.zeros(3), ) seq = [self._word_gematria(w) for w in words] # Resample if len(seq) < self.trajectory_points: idx = np.linspace(0, len(seq) - 1, self.trajectory_points) resampled = np.interp(idx, range(len(seq)), seq) else: idx = np.linspace(0, len(seq) - 1, self.trajectory_points).astype(int) resampled = np.array([seq[i] for i in idx]) normalized = resampled / (np.max(resampled) + 1e-8) # Spherical coordinates theta = np.cumsum(normalized) * 2 * np.pi / self.trajectory_points phi = normalized * np.pi * 0.8 + 0.1 * np.pi x = np.sin(phi) * np.cos(theta) y = np.sin(phi) * np.sin(theta) z = np.cos(phi) coords = np.stack([x, y, z], axis=1).astype(np.float32) # Geometric features diffs = np.diff(coords, axis=0) path_length = float(np.sum(np.linalg.norm(diffs, axis=1))) d1 = np.gradient(coords, axis=0) d2 = np.gradient(d1, axis=0) d1_norm = np.linalg.norm(d1, axis=1) + 1e-10 d2_norm = np.linalg.norm(d2, axis=1) curvature = float(np.mean(d2_norm / (d1_norm**2))) d3 = np.gradient(d2, axis=0) cross = np.cross(d1, d2) cross_norm_sq = np.sum(cross**2, axis=1) + 1e-10 torsion_vals = np.sum(cross * d3, axis=1) / cross_norm_sq torsion = float(np.mean(np.abs(torsion_vals))) centroid = np.mean(coords, axis=0) # Project geom_features = np.array( [ path_length / self.trajectory_points, curvature, torsion, np.std(np.linalg.norm(coords - centroid, axis=1)), np.std(np.linalg.norm(diffs, axis=1)), np.std(d2_norm / (d1_norm**2)), ], dtype=np.float32, ) features = np.concatenate([coords.flatten(), geom_features]) embedding = features @ self.W_trajectory norm = np.linalg.norm(embedding) if norm > 0: embedding /= norm return embedding.astype(np.float32), TrajectoryFeatures( coords=coords, path_length=path_length, curvature=curvature, torsion=torsion, centroid=centroid, ) def _encode_morphology( self, words: List[str] ) -> Tuple[np.ndarray, MorphologyFeatures]: morph = self.morph_layer.analyze(words) if not words: return np.zeros(self.embedding_dim, dtype=np.float32), morph # Root histogram root_indices = [hash(r) % 10000 for r in morph.roots if r] if root_indices: hist, _ = np.histogram(root_indices, bins=100, range=(0, 10000)) hist = hist.astype(np.float32) if hist.sum() > 0: hist /= hist.sum() else: hist = np.zeros(100, dtype=np.float32) # Stats stats = np.array( [ morph.coverage / 100, len(morph.unique_roots) / max(len(words), 1), morph.prefix_count / max(len(words), 1), morph.suffix_count / max(len(words), 1), np.mean([len(w) for w in words]) / 10, np.std([len(w) for w in words]) / 5, ( np.mean([len(r) for r in morph.unique_roots]) / 5 if morph.unique_roots else 0 ), ( np.std([len(r) for r in morph.unique_roots]) / 3 if len(morph.unique_roots) > 1 else 0 ), sum(1 for r in morph.unique_roots if r in HEBREW_ROOTS) / max(len(morph.unique_roots), 1), len(words) / 100, ], dtype=np.float32, ) features = np.concatenate([hist, stats]) embedding = features @ self.W_morphology norm = np.linalg.norm(embedding) if norm > 0: embedding /= norm return embedding.astype(np.float32), morph # ═══════════════════════════════════════════════════════════════════════════ # OPTIMIZED ENCODING METHODS (avoid duplicate gematria calculation) # ═══════════════════════════════════════════════════════════════════════════ def _encode_gematria_optimized( self, words: List[str], word_values: List[int] ) -> Tuple[np.ndarray, GematriaFeatures]: """Optimized gematria encoding with pre-computed values.""" if not words: return np.zeros(self.embedding_dim, dtype=np.float32), GematriaFeatures( histogram=np.zeros(50), bigram_stats=np.zeros(20), word_values=[], total=0, mean=0.0, std=0.0, ) values = word_values # Use pre-computed values # Log-scale histogram (vectorized) values_arr = np.array(values, dtype=np.float32) log_vals = np.log1p(values_arr) hist, _ = np.histogram(log_vals, bins=50, range=(0, np.log1p(1000))) hist = hist.astype(np.float32) hist_sum = hist.sum() if hist_sum > 0: hist /= hist_sum # Bigram stats - use pre-computed values for adjacent word sums if len(values) > 1: bigram_vals = np.array( [values[i] + values[i + 1] for i in range(len(values) - 1)], dtype=np.float32, ) bigram_stats = ( np.array( [ np.mean(bigram_vals), np.std(bigram_vals), np.median(bigram_vals), np.min(bigram_vals), np.max(bigram_vals), np.percentile(bigram_vals, 25), np.percentile(bigram_vals, 75), len(bigram_vals), ], dtype=np.float32, ) / 1000 ) bigram_stats = np.pad(bigram_stats, (0, 20 - len(bigram_stats))) else: bigram_stats = np.zeros(20, dtype=np.float32) # Basic stats (vectorized) basic = np.array( [ values_arr.mean() / 1000, values_arr.std() / 1000, np.median(values_arr) / 1000, values_arr.min() / 1000, values_arr.max() / 1000, len(words) / 100, values_arr.sum() / 10000, len(np.unique(values_arr)) / len(values_arr), ], dtype=np.float32, ) # Combine and project features = np.concatenate([hist, bigram_stats, basic]) embedding = features @ self.W_gematria norm = np.linalg.norm(embedding) if norm > 0: embedding /= norm return embedding.astype(np.float32), GematriaFeatures( histogram=hist, bigram_stats=bigram_stats, word_values=values, total=int(values_arr.sum()), mean=float(values_arr.mean()), std=float(values_arr.std()), ) def _encode_trajectory_optimized( self, words: List[str], word_values: List[int] ) -> Tuple[np.ndarray, TrajectoryFeatures]: """Optimized trajectory encoding with pre-computed gematria values.""" if not words: return np.zeros(self.embedding_dim, dtype=np.float32), TrajectoryFeatures( coords=np.zeros((self.trajectory_points, 3)), path_length=0.0, curvature=0.0, torsion=0.0, centroid=np.zeros(3), ) seq = np.array(word_values, dtype=np.float32) # Use pre-computed values # Resample (vectorized) n_points = len(seq) if n_points < self.trajectory_points: idx = np.linspace(0, n_points - 1, self.trajectory_points) resampled = np.interp(idx, np.arange(n_points), seq) else: idx = np.linspace(0, n_points - 1, self.trajectory_points).astype(int) resampled = seq[idx] max_val = resampled.max() normalized = resampled / (max_val + 1e-8) # Spherical coordinates (fully vectorized) theta = np.cumsum(normalized) * 2 * np.pi / self.trajectory_points phi = normalized * np.pi * 0.8 + 0.1 * np.pi sin_phi = np.sin(phi) cos_phi = np.cos(phi) cos_theta = np.cos(theta) sin_theta = np.sin(theta) x = sin_phi * cos_theta y = sin_phi * sin_theta z = cos_phi coords = np.stack([x, y, z], axis=1).astype(np.float32) # Geometric features (optimized - compute all derivatives at once) diffs = np.diff(coords, axis=0) diff_norms = np.linalg.norm(diffs, axis=1) path_length = float(diff_norms.sum()) # Compute derivatives in sequence (d1 -> d2 -> d3) d1 = np.gradient(coords, axis=0) d2 = np.gradient(d1, axis=0) d3 = np.gradient(d2, axis=0) d1_norm = np.linalg.norm(d1, axis=1) + 1e-10 d2_norm = np.linalg.norm(d2, axis=1) curvature = float(np.mean(d2_norm / (d1_norm**2))) cross = np.cross(d1, d2) cross_norm_sq = np.sum(cross**2, axis=1) + 1e-10 torsion_vals = np.abs(np.sum(cross * d3, axis=1) / cross_norm_sq) torsion = float(torsion_vals.mean()) centroid = coords.mean(axis=0) # Project geom_features = np.array( [ path_length / self.trajectory_points, curvature, torsion, np.std(np.linalg.norm(coords - centroid, axis=1)), np.std(diff_norms), np.std(d2_norm / (d1_norm**2)), ], dtype=np.float32, ) features = np.concatenate([coords.flatten(), geom_features]) embedding = features @ self.W_trajectory norm = np.linalg.norm(embedding) if norm > 0: embedding /= norm return embedding.astype(np.float32), TrajectoryFeatures( coords=coords, path_length=path_length, curvature=curvature, torsion=torsion, centroid=centroid, ) def _calculate_tau(self, words: List[str]) -> TAUMetrics: if not words: return TAUMetrics(tau=0.0, complexity=0.0, activity=0.0, coherence=0.0) n = len(words) unique = len(set(words)) # Complexity (Heaps' Law) expected = HEAPS_K * (n**HEAPS_BETA) complexity = min((unique / max(expected, 1)) * 100, 200) # Activity (entropy) chars = "".join(words) char_entropy = self._entropy(list(chars)) avg_len = np.mean([len(w) for w in words]) activity = char_entropy * np.log2(avg_len + 1) # Coherence (sliding window) window = 5 if n >= window: ratios = [ len(set(words[i : i + window])) / window for i in range(n - window + 1) ] coherence = 1 - min(np.std(ratios) * 2, 1) else: coherence = 0.5 tau = complexity * activity * (1 - coherence) / 100 return TAUMetrics( tau=round(tau, 4), complexity=round(complexity, 4), activity=round(activity, 4), coherence=round(coherence, 4), ) def _encode_language_signature(self, text: str) -> np.ndarray: """ Create language signature embedding. Uses SIGNED values to separate languages in embedding space: - Hebrew: POSITIVE values (preserves original gematria semantics) - Other languages: NEGATIVE values (clear separation) This ensures Hebrew gematria uniqueness is preserved in the embedding. """ # Count characters per language lang_counts = {} total_chars = 0 for c in text: if c in CHAR_TO_LANGUAGE: lang = CHAR_TO_LANGUAGE[c] lang_counts[lang] = lang_counts.get(lang, 0) + 1 total_chars += 1 if total_chars == 0: return np.zeros(len(LANGUAGE_IDS), dtype=np.float32) # Create language distribution vector with SIGNED values lang_vec = np.zeros(len(LANGUAGE_IDS), dtype=np.float32) for lang, count in lang_counts.items(): lang_id = LANGUAGE_IDS.get(lang, LANGUAGE_IDS["unknown"]) proportion = count / total_chars # Hebrew (id=0) gets POSITIVE values # All other languages get NEGATIVE values if lang_id == 0: # Hebrew lang_vec[lang_id] = proportion # Positive else: lang_vec[lang_id] = -proportion # Negative return lang_vec def encode(self, text: str, domain: str = "default") -> EncodingResult: """ Encode multilingual text with language-aware embedding. Args: text: Text to encode (Hebrew, English, or any supported language) domain: Domain for weight adjustment Returns: EncodingResult with all features including language signature """ text = self._clean_text(text) words = self._extract_words(text) weights = DOMAIN_WEIGHTS.get(domain, DOMAIN_WEIGHTS["default"]) # OPTIMIZATION: Calculate gematria values once, reuse everywhere word_values = [self._word_gematria(w) for w in words] # Encode all channels with pre-computed gematria values gematria_emb, gematria_feat = self._encode_gematria_optimized( words, word_values ) traj_emb, traj_feat = self._encode_trajectory_optimized(words, word_values) morph_emb, morph_feat = self._encode_morphology(words) # Language signature (SIGNED: Hebrew=positive, others=negative) lang_signature = self._encode_language_signature(text) # Vocabulary indices, coverage = self.vocab_layer.encode(text) unk_idx = self.vocab_layer.word2idx[""] oov_count = sum(1 for idx in indices if idx == unk_idx) # Faster than .count() vocab_feat = VocabularyFeatures( indices=indices, coverage=coverage, oov_count=oov_count ) # τ metrics tau_metrics = self._calculate_tau(words) # Combined embedding (before language signature) base_embedding = ( weights["gematria"] * gematria_emb + weights["trajectory"] * traj_emb + weights["morphology"] * morph_emb ) # Inject language signature into last dimensions # This creates SEPARATION in embedding space: # - Hebrew texts cluster together (positive values) # - Non-Hebrew texts cluster separately (negative values) num_lang_dims = len(LANGUAGE_IDS) embedding = base_embedding.copy() embedding[-num_lang_dims:] += lang_signature * 0.3 # 30% influence norm = np.linalg.norm(embedding) if norm > 0: embedding /= norm metadata = { "text_length": len(text), "word_count": len(words), "domain": domain, "weights": weights, "hash": hashlib.md5(text.encode()).hexdigest(), } return EncodingResult( embedding=embedding, gematria=gematria_feat, trajectory=traj_feat, morphology=morph_feat, vocabulary=vocab_feat, tau_metrics=tau_metrics, metadata=metadata, ) def encode_batch( self, texts: List[str], domain: str = "default" ) -> List[EncodingResult]: return [self.encode(text, domain) for text in texts] def similarity(self, vec1: np.ndarray, vec2: np.ndarray) -> float: norm1, norm2 = np.linalg.norm(vec1), np.linalg.norm(vec2) if norm1 == 0 or norm2 == 0: return 0.0 return float(np.dot(vec1, vec2) / (norm1 * norm2)) def find_similar( self, query: str, corpus: List[str], domain: str = "default", top_k: int = 10 ) -> List[Tuple[int, float, str]]: query_emb = self.encode(query, domain).embedding results = [] for i, text in enumerate(corpus): emb = self.encode(text, domain).embedding sim = self.similarity(query_emb, emb) preview = text[:100] + "..." if len(text) > 100 else text results.append((i, sim, preview)) results.sort(key=lambda x: x[1], reverse=True) return results[:top_k] # Alias for backward compatibility HebrewEncoder = EnhancedHebrewEncoder # ═══════════════════════════════════════════════════════════════════════════════ # DEMO # ═══════════════════════════════════════════════════════════════════════════════ if __name__ == "__main__": print("=" * 70) print("TAU Platform v4.0 - Enhanced Hebrew Encoder Demo") print("=" * 70) print() encoder = EnhancedHebrewEncoder() text = "בית המשפט העליון פסק כי הערעור יתקבל. השופטים קבעו פה אחד כי יש לבטל את ההחלטה." print(f"Text: {text}") print() result = encoder.encode(text, domain="hebrew_legal") print(f"Embedding shape: {result.embedding.shape}") print(f"Embedding norm: {np.linalg.norm(result.embedding):.4f}") print() print("τ Metrics:") print(f" τ: {result.tau_metrics.tau}") print(f" Complexity: {result.tau_metrics.complexity}") print(f" Activity: {result.tau_metrics.activity}") print(f" Coherence: {result.tau_metrics.coherence}") print() print("Gematria:") print(f" Total: {result.gematria.total}") print(f" Mean: {result.gematria.mean:.2f}") print() print("Trajectory:") print(f" Path length: {result.trajectory.path_length:.4f}") print(f" Curvature: {result.trajectory.curvature:.4f}") print() print("Morphology:") print(f" Coverage: {result.morphology.coverage:.1f}%") print(f" Unique roots: {result.morphology.unique_roots[:5]}") print() print("Vocabulary:") print(f" Coverage: {result.vocabulary.coverage:.1%}") print(f" OOV count: {result.vocabulary.oov_count}") print() print("=" * 70) print("✅ Demo Complete!") print("=" * 70)