| """ |
| ═══════════════════════════════════════════════════════════════════════════════ |
| TAU Platform v4.0 - Enhanced Hebrew Encoder |
| ═══════════════════════════════════════════════════════════════════════════════ |
| |
| Unified Hebrew text encoder combining: |
| 1. Gematria encoding (log-scale + bigrams + position weights) |
| 2. Spherical trajectory encoding (with curvature/torsion) |
| 3. Morphological analysis (root extraction) |
| 4. τ complexity metrics (Heaps' Law adjusted) |
| 5. Vocabulary encoding (word-to-index) |
| 6. Compression integration |
| |
| Author: Avri Barzel |
| Date: November 2025 |
| ═══════════════════════════════════════════════════════════════════════════════ |
| """ |
|
|
| import numpy as np |
| import re |
| import hashlib |
| from typing import Dict, List, Tuple, Optional, Any |
| from dataclasses import dataclass, field |
| from collections import Counter |
| from functools import lru_cache |
|
|
| |
| |
| |
# Hebrew cantillation marks and vowel points (U+0591-U+05C7).
NIQQUD_PATTERN = re.compile(r"[\u0591-\u05C7]")
# A run of Hebrew letters alef..tav (U+05D0-U+05EA), final forms included.
HEBREW_WORD_PATTERN = re.compile(r"[\u05D0-\u05EA]+")

# A "word" is a maximal run of characters from a single script range, or a run
# of digits.  Alternatives are tried left to right, so the narrow Hebrew-letter
# range wins over the catch-all Hebrew block listed near the end.
MULTILINGUAL_WORD_PATTERN = re.compile(
    "|".join(
        (
            r"[\u05D0-\u05EA]+",  # Hebrew letters
            r"[a-zA-Z]+",  # ASCII Latin letters
            r"[\u0400-\u04FF]+",  # Cyrillic block
            r"[\u0600-\u06FF]+",  # Arabic block
            r"[\u4E00-\u9FFF]+",  # CJK Unified Ideographs
            r"[\u3040-\u309F]+",  # Hiragana block
            r"[\u30A0-\u30FF]+",  # Katakana block
            r"[\uAC00-\uD7AF]+",  # Hangul Syllables block
            r"[\u0900-\u097F]+",  # Devanagari block
            r"[\u0370-\u03FF]+",  # Greek and Coptic block
            r"[\u0E00-\u0E7F]+",  # Thai block
            r"[\u0590-\u05FF]+",  # rest of the Hebrew block (marks, punctuation)
            r"\d+",  # digit runs
        )
    )
)
# Either a run of word characters or one non-space, non-word character.
WORD_OR_PUNCT_PATTERN = re.compile(r"[\w]+|[^\w\s]")
|
|
| |
| |
| |
|
|
| |
# Hebrew letter values used as the base of the encoding.  Each final (sofit)
# form carries the same value as its regular form.
GEMATRIA = {
    # Units: 1-9
    "א": 1, "ב": 2, "ג": 3, "ד": 4, "ה": 5, "ו": 6, "ז": 7, "ח": 8, "ט": 9,
    # Tens: 10-90
    "י": 10, "כ": 20, "ך": 20, "ל": 30, "מ": 40, "ם": 40, "נ": 50, "ן": 50,
    "ס": 60, "ע": 70, "פ": 80, "ף": 80, "צ": 90, "ץ": 90,
    # Hundreds: 100-400
    "ק": 100, "ר": 200, "ש": 300, "ת": 400,
}
|
|
| |
# Latin letter values on the same 1-400 scale as GEMATRIA.  The letters are
# listed from highest to lowest value (the order resembles English letter
# frequency -- presumably intentional; confirm with the author).
LATIN_VALUES = {
    "e": 400, "t": 380, "a": 360, "o": 340, "i": 320, "n": 300, "s": 280,
    "h": 260, "r": 240, "d": 220, "l": 200, "c": 180, "u": 160, "m": 140,
    "w": 120, "f": 100, "g": 90, "y": 80, "p": 70, "b": 60, "v": 50,
    "k": 40, "j": 30, "x": 20, "q": 15, "z": 10,
}
# Upper-case letters reuse the value of their lower-case counterpart.  The list
# is built completely before update() runs, so the dict is not mutated while it
# is being iterated.
LATIN_VALUES.update(
    [(letter.upper(), weight) for letter, weight in LATIN_VALUES.items()]
)
|
|
| |
# Cyrillic letter values in alphabetical order, spread over 1-400.
CYRILLIC_VALUES = {
    "а": 1, "б": 15, "в": 30, "г": 45, "д": 60, "е": 75, "ё": 80,
    "ж": 95, "з": 110, "и": 125, "й": 140, "к": 155, "л": 170, "м": 185,
    "н": 200, "о": 215, "п": 230, "р": 245, "с": 260, "т": 275, "у": 290,
    "ф": 305, "х": 320, "ц": 335, "ч": 350, "ш": 365, "щ": 375, "ъ": 380,
    "ы": 385, "ь": 390, "э": 393, "ю": 396, "я": 400,
}
# Upper-case letters reuse the value of their lower-case counterpart.  The list
# is materialised first so the dict is not mutated during iteration.
CYRILLIC_VALUES.update(
    [(letter.upper(), weight) for letter, weight in CYRILLIC_VALUES.items()]
)
|
|
| |
# Arabic letter values spread over 1-400 in the order listed.  Arabic has no
# letter case, so no upper-case mirror is added.
ARABIC_VALUES = {
    "ا": 1, "ب": 15, "ت": 30, "ث": 45,
    "ج": 60, "ح": 75, "خ": 90, "د": 105,
    "ذ": 120, "ر": 135, "ز": 150, "س": 165,
    "ش": 180, "ص": 195, "ض": 210, "ط": 225,
    "ظ": 240, "ع": 255, "غ": 270, "ف": 285,
    "ق": 300, "ك": 315, "ل": 330, "م": 345,
    "ن": 360, "ه": 375, "و": 385, "ي": 400,
}
|
|
| |
# Greek letter values in alphabetical order, spread over 1-400.  Pi is
# deliberately off the 15-step grid at 314.
GREEK_VALUES = {
    "α": 1, "β": 15, "γ": 30, "δ": 45, "ε": 60, "ζ": 75,
    "η": 90, "θ": 105, "ι": 120, "κ": 135, "λ": 150, "μ": 165,
    "ν": 180, "ξ": 195, "ο": 210, "π": 314, "ρ": 240, "σ": 255,
    "τ": 270, "υ": 285, "φ": 300, "χ": 315, "ψ": 330, "ω": 400,
}
# Upper-case letters reuse the value of their lower-case counterpart.  The list
# is materialised first so the dict is not mutated during iteration.
GREEK_VALUES.update(
    [(letter.upper(), weight) for letter, weight in GREEK_VALUES.items()]
)
# Explicit pin for capital pi; the update above has already stored this value.
GREEK_VALUES["Π"] = 314
|
|
| |
# 50 consecutive code points starting at U+0905, valued 8, 16, ..., 400 by
# their position in that range.
DEVANAGARI_VALUES = {
    chr(code_point): 8 * rank
    for rank, code_point in enumerate(range(0x0905, 0x0905 + 50), start=1)
}
|
|
| |
| |
# Values for a small hand-picked set of CJK ideographs.  Any ideograph that is
# not listed here has no entry in this table.
CJK_COMMON = {
    "的": 400, "一": 1, "是": 380, "不": 360, "了": 340, "在": 320, "人": 300,
    "有": 280, "我": 260, "他": 240, "这": 220, "个": 200, "们": 180, "中": 160,
    "来": 140, "上": 120, "大": 100, "为": 80, "和": 60, "国": 40, "地": 20,
}
|
|
| |
# 83 consecutive code points from U+3041, valued 8, 16, ..., 664.
HIRAGANA_VALUES = dict(
    zip(map(chr, range(0x3041, 0x3041 + 83)), range(8, 8 * 83 + 1, 8))
)

# 83 consecutive code points from U+30A1, valued 8, 16, ..., 664.
KATAKANA_VALUES = dict(
    zip(map(chr, range(0x30A1, 0x30A1 + 83)), range(8, 8 * 83 + 1, 8))
)

# 40 consecutive code points from U+1100, valued 10, 20, ..., 400.
# NOTE(review): MULTILINGUAL_WORD_PATTERN matches U+AC00-U+D7AF, which this
# table does not cover -- confirm whether those characters are meant to fall
# through to the default per-character value.
HANGUL_JAMO = dict(
    zip(map(chr, range(0x1100, 0x1100 + 40)), range(10, 10 * 40 + 1, 10))
)

# 58 consecutive code points from U+0E01, valued 8, 16, ..., 464.
THAI_VALUES = dict(
    zip(map(chr, range(0x0E01, 0x0E01 + 58)), range(8, 8 * 58 + 1, 8))
)
|
|
| |
# Values for single-character mathematical, punctuation and currency symbols.
#
# "|" used to be listed twice (50 with the brackets, 100 with the operators).
# A dict literal keeps only the last value for a repeated key, so the entry
# that was silently ignored has been removed; the effective value stays 100.
MATH_SYMBOLS = {
    # Arithmetic
    "+": 100, "-": 100, "*": 150, "/": 150, "=": 200,
    "×": 150, "÷": 150, "±": 100, "∓": 100,
    # Comparison
    "<": 75, ">": 75, "≤": 80, "≥": 80, "≠": 85, "≈": 90, "≡": 95,
    # Brackets
    "(": 50, ")": 50, "[": 50, "]": 50, "{": 50, "}": 50, "⟨": 50, "⟩": 50,
    # Punctuation
    ".": 25, ",": 25, ":": 25, ";": 25, "?": 50, "!": 50, "@": 75, "#": 75,
    # Analysis / calculus
    "%": 125, "^": 175, "√": 200, "∞": 400, "∑": 300,
    "∏": 280, "∫": 350, "∂": 250, "∇": 270, "∆": 260,
    # Logic
    "∧": 150, "∨": 150, "¬": 100, "→": 180, "↔": 190, "∀": 200, "∃": 200,
    # Set theory
    "∈": 160, "∉": 165, "⊂": 170, "⊃": 170, "∪": 180, "∩": 180, "∅": 50,
    # Currency
    "$": 200, "€": 200, "₪": 200, "£": 200, "¥": 200, "₹": 200, "₽": 200,
    # Miscellaneous operators and marks
    "&": 100, "|": 100, "~": 80, "`": 30, "_": 40, "\\": 60,
}
|
|
| |
# Values for multi-character programming operators.  These keys are longer than
# one character, so they are not part of the per-character CHAR_VALUES table.
CODE_SYMBOLS = {
    # Comparison
    "==": 200, "!=": 200, "<=": 200, ">=": 200,
    # Boolean
    "&&": 150, "||": 150,
    # Increment / decrement
    "++": 120, "--": 120,
    # Arrows
    "->": 180, "=>": 180,
    # Scope and ellipsis
    "::": 160, "...": 100,
}
|
|
| |
| |
# Integer id per language, assigned by position.  Hebrew is 0 on purpose: the
# encoder multiplies the id by 1000 to get a per-word offset, which leaves
# Hebrew values unshifted.  The id is also the slot index in the language
# signature vector.
LANGUAGE_IDS = {
    language: language_id
    for language_id, language in enumerate(
        (
            "hebrew",
            "latin",
            "cyrillic",
            "arabic",
            "greek",
            "devanagari",
            "cjk",
            "hiragana",
            "katakana",
            "hangul",
            "thai",
            "math",
            "unknown",
        )
    )
}
|
|
| |
# Reverse lookup: character -> language name.  Tables are visited in the order
# listed; if a character appeared in two tables the later one would win.
CHAR_TO_LANGUAGE = {
    char: language
    for language, table in (
        ("hebrew", GEMATRIA),
        ("latin", LATIN_VALUES),
        ("cyrillic", CYRILLIC_VALUES),
        ("arabic", ARABIC_VALUES),
        ("greek", GREEK_VALUES),
        ("devanagari", DEVANAGARI_VALUES),
        ("cjk", CJK_COMMON),
        ("hiragana", HIRAGANA_VALUES),
        ("katakana", KATAKANA_VALUES),
        ("hangul", HANGUL_JAMO),
        ("thai", THAI_VALUES),
        ("math", MATH_SYMBOLS),
    )
    for char in table
}
|
|
| |
| |
| |
# Single lookup table merging every per-script value table.  Tables are merged
# in the order listed; a character present in two tables would take the value
# from the later one.
CHAR_VALUES = {
    char: value
    for table in (
        GEMATRIA,
        LATIN_VALUES,
        CYRILLIC_VALUES,
        ARABIC_VALUES,
        GREEK_VALUES,
        DEVANAGARI_VALUES,
        CJK_COMMON,
        HIRAGANA_VALUES,
        KATAKANA_VALUES,
        HANGUL_JAMO,
        THAI_VALUES,
        MATH_SYMBOLS,
    )
    for char, value in table.items()
}
|
|
| LETTER_COORDS = { |
| "א": (0.05, 0.85), |
| "ה": (0.10, 0.95), |
| "ח": (0.15, 0.45), |
| "ע": (0.08, 0.55), |
| "ג": (0.30, 0.25), |
| "י": (0.35, 0.75), |
| "כ": (0.38, 0.50), |
| "ך": (0.38, 0.20), |
| "ק": (0.32, 0.40), |
| "ד": (0.50, 0.45), |
| "ט": (0.52, 0.20), |
| "ת": (0.55, 0.70), |
| "ז": (0.58, 0.25), |
| "ס": (0.54, 0.35), |
| "צ": (0.56, 0.30), |
| "ץ": (0.56, 0.10), |
| "ש": (0.53, 0.65), |
| "ל": (0.62, 0.80), |
| "נ": (0.65, 0.55), |
| "ן": (0.65, 0.25), |
| "ר": (0.68, 0.70), |
| "ב": (0.85, 0.60), |
| "ו": (0.88, 0.90), |
| "מ": (0.90, 0.75), |
| "ם": (0.90, 0.35), |
| "פ": (0.92, 0.40), |
| "ף": (0.92, 0.15), |
| } |
|
|
# Known roots with an English gloss.  MorphologyLayer checks every key as a
# substring of the analysed word, in insertion order, so the ordering below
# decides which root wins when several match.
HEBREW_ROOTS = {
    # Legal vocabulary
    "שפט": "judge", "דין": "law", "חוק": "statute", "עבר": "violate",
    "טען": "claim", "ערר": "appeal", "פסק": "rule", "תבע": "sue",
    "זכה": "acquit", "חייב": "convict", "ענש": "punish", "קנס": "fine",
    "חתם": "sign", "הסכם": "agree", "בטל": "cancel", "תקף": "valid",
    # General vocabulary
    "כתב": "write", "קרא": "read", "אמר": "say", "דבר": "speak",
    "הלך": "walk", "בוא": "come", "ראה": "see", "שמע": "hear",
    "ידע": "know", "חשב": "think", "רצה": "want", "יכל": "can",
    "נתן": "give", "לקח": "take", "עשה": "make", "היה": "be",
}
|
|
# Mixing weights for the three sub-embeddings, per domain.  Unknown domain
# names fall back to "default" in EnhancedHebrewEncoder.encode.
DOMAIN_WEIGHTS = {
    "hebrew_legal": dict(gematria=0.50, trajectory=0.30, morphology=0.20),
    "hebrew_medical": dict(gematria=0.40, trajectory=0.30, morphology=0.30),
    "hebrew_general": dict(gematria=0.40, trajectory=0.40, morphology=0.20),
    "default": dict(gematria=0.45, trajectory=0.35, morphology=0.20),
}

# Heaps' Law parameters: the expected number of distinct words in an n-word
# text is HEAPS_K * n ** HEAPS_BETA.
HEAPS_K = 10.0
HEAPS_BETA = 0.5
|
|
|
|
| |
| |
| |
|
|
|
|
@dataclass
class GematriaFeatures:
    """Interpretable by-products of the gematria encoding step."""

    # 50-bin histogram of log1p(word value); normalised to sum to 1 when any
    # value falls inside the histogram range.
    histogram: np.ndarray
    # Length-20 vector: the first 8 slots hold bigram statistics divided by
    # 1000, the rest is zero padding.
    bigram_stats: np.ndarray
    # One numeric value per word, in text order.
    word_values: List[int]
    # Sum of word_values.
    total: int
    # Mean of word_values.
    mean: float
    # Standard deviation of word_values.
    std: float
|
|
|
|
@dataclass
class TrajectoryFeatures:
    """Geometry of the curve traced by the word-value sequence on the unit sphere."""

    # One (x, y, z) row per resampled trajectory point.
    coords: np.ndarray
    # Sum of the distances between consecutive points.
    path_length: float
    # Mean of |second derivative| / |first derivative|**2 along the curve.
    curvature: float
    # Mean absolute torsion estimate along the curve.
    torsion: float
    # Mean of coords over all points.
    centroid: np.ndarray

    @property
    def arc_length(self) -> float:
        """Alias for path_length to maintain consistency with multimodal encoders."""
        length = self.path_length
        return length
|
|
|
|
@dataclass
class MorphologyFeatures:
    """Result of root extraction over a list of words."""

    # One entry per input word; None where no root could be derived.
    roots: List[Optional[str]]
    # Distinct non-empty roots (built from a set, so order is arbitrary).
    unique_roots: List[str]
    # Percentage (0-100) of words for which a root was found.
    coverage: float
    # Number of prefixes stripped, summed over all words.
    prefix_count: int
    # Number of suffixes stripped, summed over all words.
    suffix_count: int
|
|
|
|
@dataclass
class TAUMetrics:
    """Scalar complexity metrics for a text; the encoder rounds each to 4 decimals."""

    # Combined score: complexity * activity * (1 - coherence) / 100.
    tau: float
    # Observed vocabulary size relative to the Heaps' Law expectation, as a
    # percentage capped at 200.
    complexity: float
    # Character entropy scaled by log2(average word length + 1).
    activity: float
    # Stability of the distinct-word ratio over sliding windows, in [0, 1].
    coherence: float
|
|
|
|
@dataclass
class VocabularyFeatures:
    """Vocabulary lookup result for a text."""

    # Vocabulary index of every token, in text order.
    indices: List[int]
    # Fraction (0-1) of tokens that did not map to <UNK>.
    coverage: float
    # Number of tokens that mapped to <UNK>.
    oov_count: int
|
|
|
|
@dataclass
class EncodingResult:
    """Everything EnhancedHebrewEncoder.encode produces for one text."""

    # Final L2-normalised embedding vector.
    embedding: np.ndarray
    # Interpretable features from each encoding stage.
    gematria: GematriaFeatures
    trajectory: TrajectoryFeatures
    morphology: MorphologyFeatures
    vocabulary: VocabularyFeatures
    tau_metrics: TAUMetrics
    # Bookkeeping: text length, word count, domain, weights used, text hash.
    metadata: Dict[str, Any]
|
|
|
|
| |
| |
| |
|
|
|
|
class VocabularyLayer:
    """Word-to-index mapping with OOV handling.

    Words are lower-cased and stripped before they are stored or looked up.
    The special tokens in SPECIAL_TOKENS are always present: a fresh layer
    puts them at indices 0-5, and load() re-adds any that a vocabulary file
    lacks.
    """

    SPECIAL_TOKENS = ["<PAD>", "<UNK>", "<START>", "<END>", "<NUM>", "<PUNC>"]

    def __init__(self, vocab_file: Optional[str] = None):
        """Create a layer holding only the special tokens, then optionally load a file.

        Args:
            vocab_file: Path of a JSON vocabulary written by save(); when
                given, its content replaces the initial mapping.
        """
        self.word2idx: Dict[str, int] = {}
        self.idx2word: Dict[int, str] = {}
        self.frequencies: Counter = Counter()

        for i, token in enumerate(self.SPECIAL_TOKENS):
            self.word2idx[token] = i
            self.idx2word[i] = token

        self.next_idx = len(self.SPECIAL_TOKENS)

        if vocab_file:
            self.load(vocab_file)

    def add_word(self, word: str) -> int:
        """Register one occurrence of word and return its index.

        A new word gets the next free index.  An empty or whitespace-only
        word is not stored and yields the <PAD> index.
        """
        word = word.lower().strip()
        if not word:
            return self.word2idx["<PAD>"]

        self.frequencies[word] += 1

        if word not in self.word2idx:
            self.word2idx[word] = self.next_idx
            self.idx2word[self.next_idx] = word
            self.next_idx += 1

        return self.word2idx[word]

    def get_index(self, word: str) -> int:
        """Look up word without modifying the vocabulary.

        Returns <PAD> for empty input, <NUM> for all-digit tokens, <PUNC> for
        tokens containing any non-alphanumeric character, the stored index
        for known words and <UNK> otherwise.
        """
        word = word.lower().strip()

        if not word:
            return self.word2idx["<PAD>"]
        if word.isdigit():
            return self.word2idx["<NUM>"]
        if not word.isalnum():
            return self.word2idx["<PUNC>"]

        return self.word2idx.get(word, self.word2idx["<UNK>"])

    def encode(self, text: str) -> Tuple[List[int], float]:
        """Tokenise text and map every token to an index.

        Returns:
            (indices, coverage), where coverage is the fraction of tokens that
            did not map to <UNK>; it is 1.0 for text without tokens.
        """
        words = WORD_OR_PUNCT_PATTERN.findall(text)
        unk_idx = self.word2idx["<UNK>"]

        indices = [self.get_index(word) for word in words]
        oov_count = indices.count(unk_idx)

        coverage = 1 - (oov_count / len(words)) if words else 1.0
        return indices, coverage

    def size(self) -> int:
        """Return the number of entries, special tokens included."""
        return len(self.word2idx)

    def save(self, path: str):
        """Write the mapping and the 50000 most frequent word counts as UTF-8 JSON."""
        import json

        data = {
            "word2idx": self.word2idx,
            "frequencies": dict(self.frequencies.most_common(50000)),
        }
        with open(path, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False)

    def load(self, path: str):
        """Replace the current vocabulary with the one stored at path.

        The file must hold a JSON object with a "word2idx" mapping;
        "frequencies" is optional.  Indices are coerced to int.  Special
        tokens missing from the file are appended at fresh indices so that
        lookups never fail, and an empty mapping is accepted.

        Raises:
            OSError: If the file cannot be opened.
            KeyError: If the JSON object has no "word2idx" entry.
        """
        import json

        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)

        self.word2idx = {word: int(idx) for word, idx in data["word2idx"].items()}
        self.idx2word = {idx: word for word, idx in self.word2idx.items()}
        self.frequencies = Counter(data.get("frequencies", {}))
        # default=-1 keeps an empty vocabulary from raising ValueError.
        self.next_idx = max(self.word2idx.values(), default=-1) + 1

        # get_index/encode index these tokens unconditionally; make sure a
        # hand-written or truncated file cannot leave them undefined.
        for token in self.SPECIAL_TOKENS:
            if token not in self.word2idx:
                self.word2idx[token] = self.next_idx
                self.idx2word[self.next_idx] = token
                self.next_idx += 1
|
|
|
|
| |
| |
| |
|
|
|
|
class MorphologyLayer:
    """Hebrew morphological analysis.

    Root extraction is heuristic: a dictionary substring match first, then
    stripping at most one prefix and one suffix and keeping the first three
    letters of what remains.
    """

    PREFIXES = [
        "ב",
        "כ",
        "ל",
        "מ",
        "ש",
        "ה",
        "ו",
        "וב",
        "וכ",
        "ול",
        "ומ",
        "וש",
        "וה",
    ]
    SUFFIXES = [
        "ים",
        "ות",
        "ה",
        "י",
        "ך",
        "ו",
        "נו",
        "כם",
        "כן",
        "הם",
        "הן",
        "תי",
        "תם",
        "תן",
    ]

    def __init__(self):
        """Bind the module-level HEBREW_ROOTS dictionary as the root lexicon."""
        self.roots = HEBREW_ROOTS

    def extract_root(self, word: str) -> Tuple[Optional[str], int, int]:
        """Guess the root of a single word.

        Returns:
            (root, prefixes_stripped, suffixes_stripped).  root is None when
            the word has fewer than two characters or fewer than three remain
            after stripping.  A dictionary hit reports zero stripped affixes.
        """
        if not word or len(word) < 2:
            return None, 0, 0

        # Dictionary pass: the first known root found anywhere in the word wins.
        known = next((entry for entry in self.roots if entry in word), None)
        if known is not None:
            return known, 0, 0

        stem = word
        stripped_prefixes = 0
        stripped_suffixes = 0

        # Longest prefix first; strip only if more than two letters remain.
        for candidate in sorted(self.PREFIXES, key=len, reverse=True):
            if stem.startswith(candidate) and len(stem) - len(candidate) > 2:
                stem = stem[len(candidate):]
                stripped_prefixes += 1
                break

        # Same rule for the suffix, applied to what is left.
        for candidate in sorted(self.SUFFIXES, key=len, reverse=True):
            if stem.endswith(candidate) and len(stem) - len(candidate) > 2:
                stem = stem[: len(stem) - len(candidate)]
                stripped_suffixes += 1
                break

        guess = stem[:3] if len(stem) >= 3 else None
        return guess, stripped_prefixes, stripped_suffixes

    def analyze(self, words: List[str]) -> MorphologyFeatures:
        """Run extract_root over words and aggregate the results.

        coverage is the percentage (0-100) of words that produced a root; it
        is 0 for an empty list.
        """
        per_word = [self.extract_root(word) for word in words]

        roots = [root for root, _, _ in per_word]
        total_prefixes = sum(prefixes for _, prefixes, _ in per_word)
        total_suffixes = sum(suffixes for _, _, suffixes in per_word)

        found = [root for root in roots if root]
        coverage = len(found) / len(words) * 100 if words else 0

        return MorphologyFeatures(
            roots=roots,
            unique_roots=list(set(found)),
            coverage=coverage,
            prefix_count=total_prefixes,
            suffix_count=total_suffixes,
        )
|
|
|
|
| |
| |
| |
|
|
|
|
class EnhancedHebrewEncoder:
    """
    Complete Hebrew encoder with all features.

    Encoding pipeline:
    1. Preprocessing (clean, tokenize)
    2. Vocabulary encoding (word → index)
    3. Gematria encoding (numerical + bigrams)
    4. Trajectory encoding (3D spherical)
    5. Morphology encoding (root extraction)
    6. τ metrics calculation
    7. Combined embedding

    The projection matrices are drawn from a private generator seeded with 42,
    so two encoders built with the same arguments produce identical embeddings
    and constructing an encoder leaves NumPy's global random state untouched.
    """

    # Widths of the feature vectors that feed each projection matrix.
    _GEMATRIA_FEATURES = 78  # 50 histogram bins + 20 bigram slots + 8 statistics
    _MORPHOLOGY_FEATURES = 110  # 100 root buckets + 10 statistics
    _GEOMETRY_FEATURES = 6  # scalars appended to the flattened trajectory

    def __init__(
        self,
        embedding_dim: int = 256,
        trajectory_points: int = 100,
        vocab_file: Optional[str] = None,
    ):
        """Set up the sub-layers and the fixed random projection matrices.

        Args:
            embedding_dim: Length of every embedding vector.  encode() writes
                the language signature into the last len(LANGUAGE_IDS)
                components, so it must be at least that large.
            trajectory_points: Number of points the word-value sequence is
                resampled to; np.gradient needs at least 2.
            vocab_file: Optional vocabulary JSON passed to VocabularyLayer.
        """
        self.embedding_dim = embedding_dim
        self.trajectory_points = trajectory_points

        self.vocab_layer = VocabularyLayer(vocab_file)
        self.morph_layer = MorphologyLayer()

        # RandomState(42) yields the same stream as np.random.seed(42) followed
        # by np.random.randn, without reseeding the process-wide generator.
        rng = np.random.RandomState(42)
        self.W_gematria = (
            rng.randn(self._GEMATRIA_FEATURES, embedding_dim).astype(np.float32) * 0.1
        )
        # Row count follows trajectory_points (306 for the default of 100); a
        # fixed 306 made every other setting fail at projection time.
        self.W_trajectory = (
            rng.randn(
                3 * trajectory_points + self._GEOMETRY_FEATURES, embedding_dim
            ).astype(np.float32)
            * 0.1
        )
        self.W_morphology = (
            rng.randn(self._MORPHOLOGY_FEATURES, embedding_dim).astype(np.float32) * 0.1
        )

    # ------------------------------------------------------------------
    # Preprocessing
    # ------------------------------------------------------------------

    def _clean_text(self, text: str) -> str:
        """Remove niqqud/cantillation marks and collapse whitespace to single spaces."""
        return " ".join(NIQQUD_PATTERN.sub("", text).split())

    def _extract_words(self, text: str) -> List[str]:
        """Extract words from text - one token per single-script run or digit run."""
        return MULTILINGUAL_WORD_PATTERN.findall(text)

    # ------------------------------------------------------------------
    # Word values and language detection
    # ------------------------------------------------------------------

    @staticmethod
    @lru_cache(maxsize=50000)
    def _word_gematria_cached(word: str) -> int:
        """
        Cached numeric value of one word, with LANGUAGE SEPARATION.

        The value is the sum of per-character values plus a per-word offset:
        - characters in CHAR_VALUES contribute their table value;
        - other decimal digits contribute digit * 10;
        - anything else contributes ord(char) % 100;
        - the language of the first character found in CHAR_TO_LANGUAGE
          decides the offset, LANGUAGE_IDS[language] * 1000, added ONCE per
          word (Hebrew has id 0, so Hebrew words keep their plain gematria).

        A total of 0 is reported as 1, so the result is never below 1 for
        input without negative table values.
        """
        total = 0
        lang_offset = 0
        detected_lang = None

        for c in word:
            if detected_lang is None and c in CHAR_TO_LANGUAGE:
                detected_lang = CHAR_TO_LANGUAGE[c]
                lang_id = LANGUAGE_IDS.get(detected_lang, LANGUAGE_IDS["unknown"])
                lang_offset = lang_id * 1000 if lang_id > 0 else 0

            if c in CHAR_VALUES:
                total += CHAR_VALUES[c]
            elif c.isdecimal():
                # isdecimal(), not isdigit(): int() rejects characters such as
                # superscript digits that isdigit() accepts.
                total += int(c) * 10
            else:
                total += ord(c) % 100

        total += lang_offset

        return total if total > 0 else 1

    @staticmethod
    def _count_languages(text: str) -> Dict[str, int]:
        """Count characters per language, skipping characters with no known language."""
        counts: Dict[str, int] = {}
        for c in text:
            if c in CHAR_TO_LANGUAGE:
                lang = CHAR_TO_LANGUAGE[c]
                counts[lang] = counts.get(lang, 0) + 1
        return counts

    @staticmethod
    def detect_language(text: str) -> str:
        """Detect primary language of text based on character distribution.

        Returns the language with the most recognised characters ("unknown"
        if there are none); ties go to the language seen first in the text.
        """
        lang_counts = EnhancedHebrewEncoder._count_languages(text)
        if not lang_counts:
            return "unknown"
        return max(lang_counts, key=lang_counts.get)

    def _word_gematria(self, word: str) -> int:
        """Instance-level entry point to the cached word value."""
        return self._word_gematria_cached(word)

    def calculate_gematria(self, text: str) -> int:
        """Calculate the summed word value of text (public API).

        The text is tokenised as-is (no niqqud stripping).  Non-Hebrew words
        include their language offset.  Returns 0 when no word is found.
        """
        words = self._extract_words(text)
        if not words:
            return 0
        return sum(self._word_gematria(w) for w in words)

    def _entropy(self, items: List) -> float:
        """Shannon entropy (bits) of the empirical distribution of items."""
        if not items:
            return 0.0
        counts = Counter(items)
        total = len(items)
        probs = np.array(list(counts.values())) / total
        # The small epsilon guards log2 against a zero probability.
        return float(-np.sum(probs * np.log2(probs + 1e-10)))

    # ------------------------------------------------------------------
    # Shared numeric helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _project(features: np.ndarray, weights: np.ndarray) -> np.ndarray:
        """Project features through weights and L2-normalise the result."""
        embedding = features @ weights
        norm = np.linalg.norm(embedding)
        if norm > 0:
            embedding /= norm
        return embedding.astype(np.float32)

    @staticmethod
    def _log_histogram(log_vals) -> np.ndarray:
        """50-bin histogram of log-scaled values over [0, log1p(1000)].

        Values outside that range are not counted by np.histogram, so word
        values above 1000 (e.g. those carrying a language offset) do not
        appear.  The histogram sums to 1 unless it is empty.
        """
        hist, _ = np.histogram(log_vals, bins=50, range=(0, np.log1p(1000)))
        hist = hist.astype(np.float32)
        mass = hist.sum()
        if mass > 0:
            hist /= mass
        return hist

    @staticmethod
    def _bigram_summary(bigram_vals) -> np.ndarray:
        """Summarise bigram values as a zero-padded length-20 float32 vector.

        Slots 0-7 hold mean, std, median, min, max, 25th and 75th percentile
        and the count, all divided by 1000.
        """
        if len(bigram_vals) == 0:
            return np.zeros(20, dtype=np.float32)
        stats = (
            np.array(
                [
                    np.mean(bigram_vals),
                    np.std(bigram_vals),
                    np.median(bigram_vals),
                    np.min(bigram_vals),
                    np.max(bigram_vals),
                    np.percentile(bigram_vals, 25),
                    np.percentile(bigram_vals, 75),
                    len(bigram_vals),
                ],
                dtype=np.float32,
            )
            / 1000
        )
        return np.pad(stats, (0, 20 - len(stats)))

    def _empty_gematria(self) -> Tuple[np.ndarray, GematriaFeatures]:
        """Zero embedding and empty features, returned for text without words."""
        return np.zeros(self.embedding_dim, dtype=np.float32), GematriaFeatures(
            histogram=np.zeros(50),
            bigram_stats=np.zeros(20),
            word_values=[],
            total=0,
            mean=0.0,
            std=0.0,
        )

    def _empty_trajectory(self) -> Tuple[np.ndarray, TrajectoryFeatures]:
        """Zero embedding and all-zero trajectory, returned for text without words."""
        return np.zeros(self.embedding_dim, dtype=np.float32), TrajectoryFeatures(
            coords=np.zeros((self.trajectory_points, 3)),
            path_length=0.0,
            curvature=0.0,
            torsion=0.0,
            centroid=np.zeros(3),
        )

    def _trajectory_from_samples(
        self, resampled: np.ndarray
    ) -> Tuple[np.ndarray, TrajectoryFeatures]:
        """Map resampled word values onto the unit sphere and embed the curve.

        resampled holds trajectory_points values.  They are scaled by their
        maximum; the cumulative sum drives the azimuth and the value itself
        drives the polar angle, which is kept inside [0.1*pi, 0.9*pi] for
        scaled values in [0, 1].
        """
        normalized = resampled / (np.max(resampled) + 1e-8)

        theta = np.cumsum(normalized) * 2 * np.pi / self.trajectory_points
        phi = normalized * np.pi * 0.8 + 0.1 * np.pi

        sin_phi = np.sin(phi)
        coords = np.stack(
            [sin_phi * np.cos(theta), sin_phi * np.sin(theta), np.cos(phi)], axis=1
        ).astype(np.float32)

        # Polyline length.
        diffs = np.diff(coords, axis=0)
        diff_norms = np.linalg.norm(diffs, axis=1)
        path_length = float(diff_norms.sum())

        # Finite-difference derivatives; the epsilons avoid division by zero
        # on stationary or straight segments.
        d1 = np.gradient(coords, axis=0)
        d2 = np.gradient(d1, axis=0)
        d3 = np.gradient(d2, axis=0)

        d1_norm = np.linalg.norm(d1, axis=1) + 1e-10
        d2_norm = np.linalg.norm(d2, axis=1)
        local_curvature = d2_norm / (d1_norm**2)
        curvature = float(np.mean(local_curvature))

        cross = np.cross(d1, d2)
        cross_norm_sq = np.sum(cross**2, axis=1) + 1e-10
        torsion_vals = np.abs(np.sum(cross * d3, axis=1) / cross_norm_sq)
        torsion = float(torsion_vals.mean())

        centroid = coords.mean(axis=0)

        geom_features = np.array(
            [
                path_length / self.trajectory_points,
                curvature,
                torsion,
                np.std(np.linalg.norm(coords - centroid, axis=1)),
                np.std(diff_norms),
                np.std(local_curvature),
            ],
            dtype=np.float32,
        )

        features = np.concatenate([coords.flatten(), geom_features])
        embedding = self._project(features, self.W_trajectory)

        return embedding, TrajectoryFeatures(
            coords=coords,
            path_length=path_length,
            curvature=curvature,
            torsion=torsion,
            centroid=centroid,
        )

    @staticmethod
    def _root_bucket(root: str) -> int:
        """Map a root to a stable bucket in [0, 10000).

        The built-in hash() of a str is randomised per process, which made
        morphology embeddings differ between runs; an MD5 digest is stable.
        """
        digest = hashlib.md5(root.encode("utf-8")).digest()
        return int.from_bytes(digest[:8], "big") % 10000

    # ------------------------------------------------------------------
    # Reference (non-optimized) encoders
    # ------------------------------------------------------------------

    def _encode_gematria(self, words: List[str]) -> Tuple[np.ndarray, GematriaFeatures]:
        """Gematria embedding computed directly from words.

        Bigram values here are the word value of the two concatenated words,
        so a shared language offset is counted once per pair.
        """
        if not words:
            return self._empty_gematria()

        values = [self._word_gematria(w) for w in words]

        hist = self._log_histogram(np.log1p(values))

        bigram_vals = [
            self._word_gematria(first + second)
            for first, second in zip(words, words[1:])
        ]
        bigram_stats = self._bigram_summary(bigram_vals)

        basic = np.array(
            [
                np.mean(values) / 1000,
                np.std(values) / 1000,
                np.median(values) / 1000,
                np.min(values) / 1000,
                np.max(values) / 1000,
                len(words) / 100,
                sum(values) / 10000,
                len(set(values)) / max(len(values), 1),
            ],
            dtype=np.float32,
        )

        features = np.concatenate([hist, bigram_stats, basic])
        embedding = self._project(features, self.W_gematria)

        return embedding, GematriaFeatures(
            histogram=hist,
            bigram_stats=bigram_stats,
            word_values=values,
            total=sum(values),
            mean=float(np.mean(values)),
            std=float(np.std(values)),
        )

    def _encode_trajectory(
        self, words: List[str]
    ) -> Tuple[np.ndarray, TrajectoryFeatures]:
        """Trajectory embedding computed directly from words."""
        if not words:
            return self._empty_trajectory()

        seq = [self._word_gematria(w) for w in words]

        # Short sequences are linearly interpolated up to trajectory_points;
        # long ones are subsampled at evenly spaced indices.
        if len(seq) < self.trajectory_points:
            idx = np.linspace(0, len(seq) - 1, self.trajectory_points)
            resampled = np.interp(idx, range(len(seq)), seq)
        else:
            idx = np.linspace(0, len(seq) - 1, self.trajectory_points).astype(int)
            resampled = np.array([seq[i] for i in idx])

        return self._trajectory_from_samples(resampled)

    def _encode_morphology(
        self, words: List[str]
    ) -> Tuple[np.ndarray, MorphologyFeatures]:
        """Morphology embedding: root-bucket histogram plus ten summary statistics."""
        morph = self.morph_layer.analyze(words)

        if not words:
            return np.zeros(self.embedding_dim, dtype=np.float32), morph

        root_indices = [self._root_bucket(r) for r in morph.roots if r]
        if root_indices:
            hist, _ = np.histogram(root_indices, bins=100, range=(0, 10000))
            hist = hist.astype(np.float32)
            mass = hist.sum()
            if mass > 0:
                hist /= mass
        else:
            hist = np.zeros(100, dtype=np.float32)

        word_count = max(len(words), 1)
        word_lengths = [len(w) for w in words]
        root_lengths = [len(r) for r in morph.unique_roots]
        known_roots = sum(1 for r in morph.unique_roots if r in HEBREW_ROOTS)

        stats = np.array(
            [
                morph.coverage / 100,
                len(morph.unique_roots) / word_count,
                morph.prefix_count / word_count,
                morph.suffix_count / word_count,
                np.mean(word_lengths) / 10,
                np.std(word_lengths) / 5,
                np.mean(root_lengths) / 5 if root_lengths else 0,
                np.std(root_lengths) / 3 if len(root_lengths) > 1 else 0,
                known_roots / max(len(morph.unique_roots), 1),
                len(words) / 100,
            ],
            dtype=np.float32,
        )

        features = np.concatenate([hist, stats])
        return self._project(features, self.W_morphology), morph

    # ------------------------------------------------------------------
    # Optimized encoders (reuse pre-computed word values)
    # ------------------------------------------------------------------

    def _encode_gematria_optimized(
        self, words: List[str], word_values: List[int]
    ) -> Tuple[np.ndarray, GematriaFeatures]:
        """Optimized gematria encoding with pre-computed values.

        Bigram values here are the SUM of the two adjacent word values, so a
        language offset is counted once per word of the pair.  Statistics are
        computed in float32; the reported total is an exact integer sum.
        """
        if not words:
            return self._empty_gematria()

        values = word_values
        values_arr = np.array(values, dtype=np.float32)

        hist = self._log_histogram(np.log1p(values_arr))

        if len(values) > 1:
            bigram_vals = np.array(
                [first + second for first, second in zip(values, values[1:])],
                dtype=np.float32,
            )
        else:
            bigram_vals = np.zeros(0, dtype=np.float32)
        bigram_stats = self._bigram_summary(bigram_vals)

        basic = np.array(
            [
                values_arr.mean() / 1000,
                values_arr.std() / 1000,
                np.median(values_arr) / 1000,
                values_arr.min() / 1000,
                values_arr.max() / 1000,
                len(words) / 100,
                values_arr.sum() / 10000,
                len(np.unique(values_arr)) / len(values_arr),
            ],
            dtype=np.float32,
        )

        features = np.concatenate([hist, bigram_stats, basic])
        embedding = self._project(features, self.W_gematria)

        return embedding, GematriaFeatures(
            histogram=hist,
            bigram_stats=bigram_stats,
            word_values=values,
            # Summed as integers: a float32 accumulator cannot represent
            # totals above 2**24 exactly.
            total=int(sum(values)),
            mean=float(values_arr.mean()),
            std=float(values_arr.std()),
        )

    def _encode_trajectory_optimized(
        self, words: List[str], word_values: List[int]
    ) -> Tuple[np.ndarray, TrajectoryFeatures]:
        """Optimized trajectory encoding with pre-computed gematria values."""
        if not words:
            return self._empty_trajectory()

        seq = np.array(word_values, dtype=np.float32)

        n_points = len(seq)
        if n_points < self.trajectory_points:
            idx = np.linspace(0, n_points - 1, self.trajectory_points)
            resampled = np.interp(idx, np.arange(n_points), seq)
        else:
            idx = np.linspace(0, n_points - 1, self.trajectory_points).astype(int)
            resampled = seq[idx]

        return self._trajectory_from_samples(resampled)

    # ------------------------------------------------------------------
    # τ metrics and language signature
    # ------------------------------------------------------------------

    def _calculate_tau(self, words: List[str]) -> TAUMetrics:
        """Compute the τ complexity metrics for a word list.

        - complexity: distinct words relative to the Heaps' Law expectation
          HEAPS_K * n ** HEAPS_BETA, as a percentage capped at 200;
        - activity: character entropy times log2(average word length + 1);
        - coherence: 1 - min(2 * std of the distinct-word ratio over sliding
          5-word windows, 1); fixed at 0.5 for fewer than 5 words;
        - tau: complexity * activity * (1 - coherence) / 100.

        All four are rounded to 4 decimals; an empty list gives all zeros.
        """
        if not words:
            return TAUMetrics(tau=0.0, complexity=0.0, activity=0.0, coherence=0.0)

        n = len(words)
        unique = len(set(words))

        expected = HEAPS_K * (n**HEAPS_BETA)
        complexity = min((unique / max(expected, 1)) * 100, 200)

        char_entropy = self._entropy(list("".join(words)))
        avg_len = np.mean([len(w) for w in words])
        activity = char_entropy * np.log2(avg_len + 1)

        window = 5
        if n >= window:
            ratios = [
                len(set(words[start : start + window])) / window
                for start in range(n - window + 1)
            ]
            coherence = 1 - min(np.std(ratios) * 2, 1)
        else:
            coherence = 0.5

        tau = complexity * activity * (1 - coherence) / 100

        return TAUMetrics(
            tau=round(tau, 4),
            complexity=round(complexity, 4),
            activity=round(activity, 4),
            coherence=round(coherence, 4),
        )

    def _encode_language_signature(self, text: str) -> np.ndarray:
        """
        Create language signature embedding.

        Returns a float32 vector with one slot per entry of LANGUAGE_IDS.
        Each slot holds that language's share of the recognised characters,
        using SIGNED values to separate languages in embedding space:
        - Hebrew: POSITIVE share
        - Other languages: NEGATIVE share

        Text without recognised characters gives an all-zero vector.
        """
        lang_vec = np.zeros(len(LANGUAGE_IDS), dtype=np.float32)

        lang_counts = self._count_languages(text)
        total_chars = sum(lang_counts.values())
        if total_chars == 0:
            return lang_vec

        for lang, count in lang_counts.items():
            lang_id = LANGUAGE_IDS.get(lang, LANGUAGE_IDS["unknown"])
            proportion = count / total_chars
            lang_vec[lang_id] = proportion if lang_id == 0 else -proportion

        return lang_vec

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def encode(self, text: str, domain: str = "default") -> EncodingResult:
        """
        Encode multilingual text with language-aware embedding.

        Args:
            text: Text to encode (Hebrew, English, or any supported language)
            domain: Key of DOMAIN_WEIGHTS; unknown names use "default"

        Returns:
            EncodingResult with the unit-length embedding, the features of
            every stage and metadata (text length, word count, domain,
            weights, MD5 hex digest of the cleaned text).
        """
        text = self._clean_text(text)
        words = self._extract_words(text)
        weights = DOMAIN_WEIGHTS.get(domain, DOMAIN_WEIGHTS["default"])

        # Word values are computed once and shared by both numeric encoders.
        word_values = [self._word_gematria(w) for w in words]

        gematria_emb, gematria_feat = self._encode_gematria_optimized(
            words, word_values
        )
        traj_emb, traj_feat = self._encode_trajectory_optimized(words, word_values)
        morph_emb, morph_feat = self._encode_morphology(words)

        lang_signature = self._encode_language_signature(text)

        indices, coverage = self.vocab_layer.encode(text)
        unk_idx = self.vocab_layer.word2idx["<UNK>"]
        vocab_feat = VocabularyFeatures(
            indices=indices, coverage=coverage, oov_count=indices.count(unk_idx)
        )

        tau_metrics = self._calculate_tau(words)

        base_embedding = (
            weights["gematria"] * gematria_emb
            + weights["trajectory"] * traj_emb
            + weights["morphology"] * morph_emb
        )

        # The signature is added onto the trailing components (scaled by 0.3)
        # rather than appended, so the embedding keeps its length.
        num_lang_dims = len(LANGUAGE_IDS)
        embedding = base_embedding.copy()
        embedding[-num_lang_dims:] += lang_signature * 0.3

        norm = np.linalg.norm(embedding)
        if norm > 0:
            embedding /= norm

        metadata = {
            "text_length": len(text),
            "word_count": len(words),
            "domain": domain,
            "weights": weights,
            "hash": hashlib.md5(text.encode()).hexdigest(),
        }

        return EncodingResult(
            embedding=embedding,
            gematria=gematria_feat,
            trajectory=traj_feat,
            morphology=morph_feat,
            vocabulary=vocab_feat,
            tau_metrics=tau_metrics,
            metadata=metadata,
        )

    def encode_batch(
        self, texts: List[str], domain: str = "default"
    ) -> List[EncodingResult]:
        """Encode every text independently with the same domain."""
        return [self.encode(text, domain) for text in texts]

    def similarity(self, vec1: np.ndarray, vec2: np.ndarray) -> float:
        """Cosine similarity of two vectors; 0.0 if either has zero norm."""
        norm1, norm2 = np.linalg.norm(vec1), np.linalg.norm(vec2)
        if norm1 == 0 or norm2 == 0:
            return 0.0
        return float(np.dot(vec1, vec2) / (norm1 * norm2))

    def find_similar(
        self, query: str, corpus: List[str], domain: str = "default", top_k: int = 10
    ) -> List[Tuple[int, float, str]]:
        """Rank corpus texts by cosine similarity to query.

        Every corpus text is re-encoded on each call.

        Returns:
            Up to top_k tuples (corpus index, similarity, preview), best
            first; the preview is the text cut to 100 characters plus "...".
        """
        query_emb = self.encode(query, domain).embedding
        results = []

        for i, text in enumerate(corpus):
            emb = self.encode(text, domain).embedding
            sim = self.similarity(query_emb, emb)
            preview = text[:100] + "..." if len(text) > 100 else text
            results.append((i, sim, preview))

        results.sort(key=lambda item: item[1], reverse=True)
        return results[:top_k]


# Second name for the same class (presumably kept for older imports -- confirm).
HebrewEncoder = EnhancedHebrewEncoder
|
|
|
|
| |
| |
| |
|
|
# Demo: encode one Hebrew legal sentence and print the main features.
if __name__ == "__main__":
    rule = "=" * 70

    print(rule)
    print("TAU Platform v4.0 - Enhanced Hebrew Encoder Demo")
    print(rule)
    print()

    encoder = EnhancedHebrewEncoder()
    text = "בית המשפט העליון פסק כי הערעור יתקבל. השופטים קבעו פה אחד כי יש לבטל את ההחלטה."

    print(f"Text: {text}")
    print()

    result = encoder.encode(text, domain="hebrew_legal")

    # Embedding overview
    print(f"Embedding shape: {result.embedding.shape}")
    print(f"Embedding norm: {np.linalg.norm(result.embedding):.4f}")
    print()

    # Complexity metrics
    print("τ Metrics:")
    print(f" τ: {result.tau_metrics.tau}")
    print(f" Complexity: {result.tau_metrics.complexity}")
    print(f" Activity: {result.tau_metrics.activity}")
    print(f" Coherence: {result.tau_metrics.coherence}")
    print()

    # Numeric word values
    print("Gematria:")
    print(f" Total: {result.gematria.total}")
    print(f" Mean: {result.gematria.mean:.2f}")
    print()

    # Spherical trajectory
    print("Trajectory:")
    print(f" Path length: {result.trajectory.path_length:.4f}")
    print(f" Curvature: {result.trajectory.curvature:.4f}")
    print()

    # Root extraction
    print("Morphology:")
    print(f" Coverage: {result.morphology.coverage:.1f}%")
    print(f" Unique roots: {result.morphology.unique_roots[:5]}")
    print()

    # Vocabulary lookup
    print("Vocabulary:")
    print(f" Coverage: {result.vocabulary.coverage:.1%}")
    print(f" OOV count: {result.vocabulary.oov_count}")
    print()

    print(rule)
    print("✅ Demo Complete!")
    print(rule)
|
|