Spaces:

qainsights
/

valluvar-or-ai

Sleeping

NaveenKumar Namachivayam committed on May 10

Commit

b817849

1 Parent(s): ba05e77

feat: add Thirukkural Tamil text dataset with English translations

- Add complete Thirukkural text (7046 lines) with Tamil verses and English translations
- Include all 133 chapters covering virtue, wealth, and love
- Format each kural with Tamil original, transliteration, and English couplet translation
- Organize by sections: domestic virtue, ascetic virtue, royalty, love, and more

Files changed (7) hide show

data/thirukkural_clean.txt +0 -0
hf-space/README.md +47 -0
hf-space/app.py +363 -0
hf-space/model.py +118 -0
hf-space/requirements.txt +2 -0
hf-space/thirukkural_clean.txt +0 -0
train.py +4 -4

data/thirukkural_clean.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

hf-space/README.md ADDED Viewed

	@@ -0,0 +1,47 @@

+---
+title: Valluvar or AI?
+emoji: 🕉️
+colorFrom: orange
+colorTo: red
+sdk: gradio
+sdk_version: 4.x
+app_file: app.py
+pinned: false
+license: mit
+---
+# Valluvar or AI? 🕉️
+An AI that writes new Thirukkurals in the style of Thiruvalluvar.
+## Features
+- **Generate Kural**: Enter a Tamil theme and get a bilingual couplet
+- **Valluvar or AI Quiz**: Can you tell which is original and which is AI-generated?
+- **Temperature Control**: Adjust creativity from coherent (around 0.5) to wild (2.0); the slider ranges from 0.1 to 2.0
+## Model
+- **Architecture**: GPT (8L/8H/512D, 25.4M params)
+- **Training Data**: Thirukkural (1330 kurals + English translations)
+- **Tokenization**: Character-level
+## Examples
+**Traditional themes work great:**
+- `கடவுள் வாழ்த்து` (Praise of God) ✅
+- `அரசியல்` (Politics/Governance) ✅
+- `நட்பு` (Friendship) ✅
+**Modern topics don't work:**
+- `விஞ்ஞானம்` (Science) ❌
+- `கணிதம்` (Mathematics) ❌
+The model learned Thiruvalluvar's form and traditional themes, but not modern concepts.
+## How to Use
+1. Enter a Tamil word or theme
+2. Adjust temperature (0.8 recommended)
+3. Click Generate
+4. Check the "Source" box to see whether the model reproduced a memorized kural or created something new!

hf-space/app.py ADDED Viewed

	@@ -0,0 +1,363 @@

+"""Gradio app for Thirukkural GPT - Valluvar or AI."""
+import random
+import re
+import gradio as gr
+import torch
+from model import GPT, GPTConfig
+def load_model(checkpoint_path="checkpoint_final.pt"):
+    """Load the trained model and its character vocabulary from a checkpoint.
+    Args:
+        checkpoint_path: Path of the checkpoint file. Defaults to
+            "checkpoint_final.pt", resolved against the working directory.
+    Returns:
+        A ``(model, stoi, itos)`` tuple: the GPT model switched to eval mode
+        plus the ``stoi`` and ``itos`` mappings stored in the checkpoint.
+    Raises:
+        KeyError: If the checkpoint lacks one of the expected entries.
+    """
+    # weights_only=True refuses to unpickle arbitrary classes, so GPTConfig
+    # (imported at module level) has to be allow-listed before loading.
+    # NOTE(review): add_safe_globals looks to be available only in newer
+    # PyTorch releases than the torch>=2.0.0 pin allows - confirm and raise
+    # the pin if so.
+    torch.serialization.add_safe_globals([GPTConfig])
+    checkpoint = torch.load(checkpoint_path, map_location="cpu", weights_only=True)
+    required = ("config", "stoi", "itos", "model_state_dict")
+    missing = [key for key in required if key not in checkpoint]
+    if missing:
+        raise KeyError(f"checkpoint {checkpoint_path!r} is missing keys: {missing}")
+    model = GPT(checkpoint["config"])
+    model.load_state_dict(checkpoint["model_state_dict"])
+    model.eval()
+    return model, checkpoint["stoi"], checkpoint["itos"]
+def generate(model, prompt, stoi, itos, max_new_tokens=200, temperature=0.8, device="cpu"):
+    """Sample a continuation of ``prompt`` one character at a time.
+    Args:
+        model: Callable returning ``(logits, loss)`` for a token tensor and
+            exposing ``config.block_size`` and ``.to(device)``.
+        prompt: Text to continue. Characters missing from ``stoi`` are
+            replaced by the space token (or id 0 if space is missing too).
+        stoi: Mapping from character to token id.
+        itos: Mapping from token id to character.
+        max_new_tokens: Number of tokens to sample.
+        temperature: Divisor applied to the logits before sampling. A value
+            of 0 selects the most likely token at every step.
+        device: Device the model and token tensor are placed on.
+    Returns:
+        The prompt followed by the sampled text, decoded with ``itos``.
+    """
+    model = model.to(device)
+    fallback = stoi.get(" ", 0)
+    prompt_tokens = [stoi.get(ch, fallback) for ch in prompt]
+    # An empty prompt would give a zero-length sequence, and picking the last
+    # position of the logits would then fail. Seed with the fallback token and
+    # drop that seed again when decoding.
+    seeded = not prompt_tokens
+    if seeded:
+        prompt_tokens = [fallback]
+    idx = torch.tensor([prompt_tokens], dtype=torch.long, device=device)
+    with torch.no_grad():
+        for _ in range(max_new_tokens):
+            # The model only has block_size positions, so feed the tail.
+            idx_cond = idx[:, -model.config.block_size :]
+            logits, _ = model(idx_cond)
+            logits = logits[:, -1, :]
+            if temperature == 0:
+                # Dividing by zero would turn the logits into inf/nan; a zero
+                # temperature is treated as greedy decoding instead.
+                idx_next = torch.argmax(logits, dim=-1, keepdim=True)
+            else:
+                probs = torch.softmax(logits / temperature, dim=-1)
+                idx_next = torch.multinomial(probs, num_samples=1)
+            idx = torch.cat((idx, idx_next), dim=1)
+    tokens = idx[0].tolist()
+    if seeded:
+        tokens = tokens[1:]
+    return "".join(itos.get(t, "") for t in tokens)
+def is_real_kural(text, original_text):
+    """Check whether the first kural in ``text`` appears in the original corpus.
+    The first two lines containing Tamil characters are taken as the couplet
+    and the first non-empty line without Tamil characters as its translation.
+    The kural counts as real when the two Tamil lines occur as consecutive
+    whole lines of ``original_text`` and, if a translation line is present,
+    that line occurs within the few lines following the couplet.
+    Lines are compared after stripping surrounding whitespace, so indentation
+    or trailing spaces in either text do not affect the result.
+    Args:
+        text: Generated text to classify.
+        original_text: Full corpus text to search.
+    Returns:
+        True if the kural is found in the corpus, False otherwise (including
+        when ``text`` has fewer than two Tamil lines).
+    """
+    tamil_re = re.compile(r"[\u0B80-\u0BFF]")
+    # How many lines after the couplet may hold the translation.
+    translation_window = 4
+    candidate_lines = [ln.strip() for ln in text.strip().split("\n")]
+    tamil_lines = [ln for ln in candidate_lines if tamil_re.search(ln)]
+    english_lines = [ln for ln in candidate_lines if ln and not tamil_re.search(ln)]
+    if len(tamil_lines) < 2:
+        return False
+    first_tamil, second_tamil = tamil_lines[:2]
+    first_english = english_lines[0] if english_lines else None
+    corpus = [ln.strip() for ln in original_text.split("\n")]
+    for pos in range(len(corpus) - 1):
+        # Whole-line comparison: a substring test would also accept a couplet
+        # stitched from the end of one line and the start of the next.
+        if corpus[pos] != first_tamil or corpus[pos + 1] != second_tamil:
+            continue
+        if first_english is None:
+            return True
+        if first_english in corpus[pos + 2 : pos + 2 + translation_window]:
+            return True
+    return False
+# Start-up work done once at import time: load the model and its vocabulary
+# so every request handler below can reuse the same objects.
+print("Loading model...")
+model, stoi, itos = load_model()
+print(f"Model loaded: {sum(p.numel() for p in model.parameters()) / 1e6:.1f}M params")
+# Keep the whole original text in memory; it is searched to decide whether a
+# generated kural is a verbatim original and sampled for the quiz.
+# The path is relative, so the file must sit in the working directory.
+with open("thirukkural_clean.txt", "r", encoding="utf-8") as f:
+    ORIGINAL_TEXT = f.read()
+def generate_kural(prompt, temperature, max_tokens):
+    """Generate text for a theme and return its first kural with a source label.
+    Args:
+        prompt: Theme text used to start generation.
+        temperature: Sampling temperature passed to ``generate``.
+        max_tokens: Requested token budget; 100 extra tokens are generated.
+    Returns:
+        A ``(kural_text, source_label)`` tuple. ``kural_text`` holds up to two
+        Tamil lines followed by up to two non-Tamil lines; ``source_label``
+        says whether that text was found in the original corpus.
+    """
+    tamil_re = re.compile(r"[\u0B80-\u0BFF]")
+    latin_re = re.compile(r"[a-zA-Z]")
+    # Over-generate so the first kural is less likely to be cut off.
+    output_raw = generate(model, prompt, stoi, itos, int(max_tokens) + 100, temperature)
+    tamil_lines = []
+    english_lines = []
+    for line in output_raw.strip().split("\n"):
+        line = line.strip()
+        # Blank lines and lines containing " - " are treated as headers.
+        if not line or " - " in line:
+            continue
+        if tamil_re.search(line):
+            # A Tamil-only line of one or two words is treated as a chapter
+            # title rather than verse text.
+            is_title = len(line.split()) <= 2 and not latin_re.search(line)
+            if not is_title and len(tamil_lines) < 2:
+                tamil_lines.append(line)
+        elif len(english_lines) < 2:
+            english_lines.append(line)
+    formatted_lines = tamil_lines + english_lines
+    output = "\n".join(formatted_lines) if formatted_lines else format_kural(output_raw)
+    # Classify the text that is actually shown. The raw output starts with the
+    # prompt/header line, which would otherwise be mistaken for the first line
+    # of the couplet and make the label disagree with the displayed kural.
+    is_real = is_real_kural(output, ORIGINAL_TEXT)
+    source = "📖 Original Thirukkural" if is_real else "🤖 AI Generated"
+    return output, source
+def format_kural(text):
+    """Reduce ``text`` to one kural: two Tamil lines, then two other lines.
+    Header lines are dropped first. The remaining lines are split into those
+    containing Tamil characters and those without; the first two of each kind
+    are kept, Tamil first. When only one line of a kind exists, an empty
+    string is added as a placeholder for the missing second line.
+    Args:
+        text: Raw text, one line per verse line.
+    Returns:
+        The selected lines joined with newlines (an empty string if nothing
+        but headers or blank lines was found).
+    """
+    tamil_re = re.compile(r"[\u0B80-\u0BFF]")
+    latin_re = re.compile(r"[a-zA-Z]")
+    def is_header(line):
+        # Lines containing " - " are headers.
+        if " - " in line:
+            return True
+        # A Tamil-only line of one or two words is a chapter title. The limit
+        # must stay at two: the second line of a kural has three words and
+        # would otherwise be discarded as a header.
+        return bool(tamil_re.search(line)) and len(line.split()) <= 2 and not latin_re.search(line)
+    content_lines = [ln.strip() for ln in text.strip().split("\n") if ln.strip() and not is_header(ln)]
+    tamil_lines = [ln for ln in content_lines if tamil_re.search(ln)]
+    english_lines = [ln for ln in content_lines if not tamil_re.search(ln)]
+    formatted = []
+    for group in (tamil_lines, english_lines):
+        if len(group) >= 2:
+            formatted.extend(group[:2])
+        elif len(group) == 1:
+            # Keep the two-line shape with an empty placeholder.
+            formatted.extend([group[0], ""])
+    return "\n".join(formatted)
+def valluvar_or_ai_quiz():
+    """Build one quiz round pairing an original kural with a generated one.
+    Returns:
+        A 4-tuple ``(markdown, a_is_real, b_is_real, correct)`` where
+        ``markdown`` shows both options in shuffled order, the two booleans
+        say whether option A / option B is the original, and ``correct`` is
+        the letter ("A" or "B") of the original.
+    """
+    tamil_re = re.compile(r"[\u0B80-\u0BFF]")
+    def is_english(line):
+        # Non-empty line without any Tamil character.
+        return bool(line.strip()) and not tamil_re.search(line)
+    lines = ORIGINAL_TEXT.strip().split("\n")
+    real_kural = ""
+    # random.randint raises ValueError on an empty range, so only sample when
+    # the corpus has at least one 4-line window.
+    if len(lines) >= 4:
+        for _ in range(100):
+            start = random.randint(0, len(lines) - 4)
+            chunk = lines[start : start + 4]
+            # Require the Tamil couplet first and the two translation lines
+            # after it. Counting two lines of each kind would also accept a
+            # window that straddles two neighbouring kurals.
+            couplet_ok = all(tamil_re.search(ln) for ln in chunk[:2])
+            if couplet_ok and all(is_english(ln) for ln in chunk[2:]):
+                real_kural = "\n".join(chunk).strip()
+                break
+    # Fall back to a fixed kural when no suitable window was found.
+    if not real_kural:
+        real_kural = "அகர முதல எழுத்தெல்லாம் ஆதி\nபகவன் முதற்றே உலகு\n'A' leads letters; the Ancient Lord\nLeads and lords the entire world"
+    prompts = ["கடவுள் வாழ்த்து", "நட்பு", "அறன்", "வான் சிறப்பு", "அரசியல்"]
+    prompt = random.choice(prompts)
+    ai_kural = format_kural(generate(model, prompt, stoi, itos, 150, 0.8))
+    # Both options go through the same formatter so layout gives no hint.
+    real_kural = format_kural(real_kural)
+    options = [(real_kural, True), (ai_kural, False)]
+    random.shuffle(options)
+    (text_a, a_real), (text_b, b_real) = options
+    return (
+        f"## Option A\n```\n{text_a}\n```\n\n---\n\n## Option B\n```\n{text_b}\n```",
+        a_real,
+        b_real,
+        "A" if a_real else "B",
+    )
+# Gradio interface: one tab to generate kurals, one for the quiz, one with
+# model details.
+with gr.Blocks(title="Valluvar or AI?") as demo:
+    gr.Markdown("# 🕉️ Valluvar or AI?")
+    gr.Markdown(
+        "An AI that writes new Thirukkurals in the style of Thiruvalluvar. "
+        "Enter a Tamil theme to generate bilingual wisdom."
+    )
+    with gr.Tab("✨ Generate Kural"):
+        with gr.Row():
+            with gr.Column():
+                prompt = gr.Textbox(
+                    label="Theme (Tamil)",
+                    placeholder="e.g., கடவுள் வாழ்த்து, நட்பு, அரசியல்",
+                    value="கடவுள் வாழ்த்து",
+                )
+                temperature = gr.Slider(
+                    minimum=0.1,
+                    maximum=2.0,
+                    value=0.8,
+                    step=0.1,
+                    label="Temperature (Creativity)",
+                )
+                max_tokens = gr.Slider(
+                    minimum=50,
+                    maximum=400,
+                    value=200,
+                    step=50,
+                    label="Max Tokens",
+                )
+                generate_btn = gr.Button("Generate", variant="primary")
+            with gr.Column():
+                output = gr.Textbox(
+                    label="Generated Kural",
+                    lines=10,
+                )
+                source = gr.Textbox(label="Source")
+        generate_btn.click(
+            fn=generate_kural,
+            inputs=[prompt, temperature, max_tokens],
+            outputs=[output, source],
+        )
+        # Quick theme buttons: each one copies its label into the theme box.
+        gr.Markdown("### Quick Themes")
+        with gr.Row():
+            themes = [
+                "கடவுள் வாழ்த்து",
+                "வான் சிறப்பு",
+                "நட்பு",
+                "அரசியல்",
+                "அறன் வலியுறுத்தல்",
+            ]
+            for theme in themes:
+                btn = gr.Button(theme, size="sm")
+                # Bind the label as a default argument so each button keeps
+                # its own theme instead of the loop's last value.
+                btn.click(lambda t=theme: t, outputs=prompt)
+    with gr.Tab("🎯 Valluvar or AI? Quiz"):
+        gr.Markdown("Can you tell which is the original Thirukkural and which is AI-generated?")
+        quiz_output = gr.Markdown()
+        with gr.Row():
+            guess_a = gr.Button("Option A is Real", variant="secondary")
+            guess_b = gr.Button("Option B is Real", variant="secondary")
+        quiz_result = gr.Markdown()
+        new_quiz_btn = gr.Button("New Quiz", variant="primary")
+        # Per-session storage for the answers of the current quiz round.
+        a_is_real = gr.State()
+        b_is_real = gr.State()
+        correct_answer = gr.State()
+        def check_answer(guess, a_real, b_real, correct):
+            """Return the feedback text for a guess ("A" or "B")."""
+            # The stored answer is empty until "New Quiz" has been clicked;
+            # without this guard the concatenation below fails on None.
+            if correct is None:
+                return "ℹ️ Click **New Quiz** first to get a pair of kurals."
+            if guess == correct:
+                return "✅ Correct! You identified the original Thirukkural."
+            return "❌ Wrong! The original Thirukkural was: " + correct
+        new_quiz_btn.click(
+            fn=valluvar_or_ai_quiz,
+            outputs=[quiz_output, a_is_real, b_is_real, correct_answer],
+        )
+        guess_a.click(
+            fn=lambda a, b, c: check_answer("A", a, b, c),
+            inputs=[a_is_real, b_is_real, correct_answer],
+            outputs=quiz_result,
+        )
+        guess_b.click(
+            fn=lambda a, b, c: check_answer("B", a, b, c),
+            inputs=[a_is_real, b_is_real, correct_answer],
+            outputs=quiz_result,
+        )
+    with gr.Tab("📊 About"):
+        gr.Markdown(
+            f"""
+            ## Model Details
+            - **Architecture:** GPT ({model.config.n_layer}L/{model.config.n_head}H/{model.config.n_embd}D)
+            - **Parameters:** {sum(p.numel() for p in model.parameters()) / 1e6:.1f}M
+            - **Vocabulary:** {len(stoi)} characters (Tamil + English)
+            - **Training Data:** Thirukkural (1330 kurals with English translations)
+            - **Tokenization:** Character-level
+            ## Training
+            - Steps: 10,000
+            - Device: Apple MPS (Mac Mini)
+            - Time: ~5 hours
+            - Final Loss: ~1.5
+            ## Capabilities
+            - ✅ Generate authentic Tamil couplets (2 lines: 4 + 3 words)
+            - ✅ Produce coherent English translations
+            - ✅ Handle traditional themes (virtue, politics, love)
+            - ❌ Modern topics (science, technology) - not in training data
+            ## Examples of AI vs Original
+            The model sometimes generates exact memorized kurals from the 1330,
+            and sometimes creates entirely new ones in Thiruvalluvar's style.
+            Built with ❤️ using PyTorch and Gradio.
+            """
+        )
+if __name__ == "__main__":
+    demo.launch()

hf-space/model.py ADDED Viewed

	@@ -0,0 +1,118 @@

+"""GPT Model Architecture for Thirukkural Training."""
+from dataclasses import dataclass
+import torch
+import torch.nn as nn
+@dataclass
+class GPTConfig:
+    """Size hyperparameters consumed by the GPT model classes in this module.
+    The values below are only defaults; callers construct the config with the
+    values that match their dataset and checkpoint.
+    """
+    vocab_size: int = 65  # Number of distinct tokens; set from the dataset
+    block_size: int = 256  # Max sequence length (size of the position table)
+    n_layer: int = 6  # Number of transformer blocks
+    n_head: int = 6  # Number of attention heads; must divide n_embd evenly
+    n_embd: int = 384  # Embedding dimension
+class CausalSelfAttention(nn.Module):
+    """Causal multi-head self-attention with a single fused QKV projection."""
+    def __init__(self, config: GPTConfig):
+        super().__init__()
+        assert config.n_embd % config.n_head == 0
+        # One linear layer yields queries, keys and values side by side.
+        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
+        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
+        self.n_head = config.n_head
+        self.n_embd = config.n_embd
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        batch, seq_len, width = x.shape
+        per_head = width // self.n_head
+        def to_heads(part: torch.Tensor) -> torch.Tensor:
+            # (batch, seq, width) -> (batch, n_head, seq, per_head)
+            return part.view(batch, seq_len, self.n_head, per_head).transpose(1, 2)
+        query, key, value = (
+            to_heads(part) for part in self.c_attn(x).split(self.n_embd, dim=2)
+        )
+        attended = torch.nn.functional.scaled_dot_product_attention(
+            query, key, value, is_causal=True
+        )
+        # Merge the heads back into one feature axis before projecting.
+        merged = attended.transpose(1, 2).contiguous().view(batch, seq_len, width)
+        return self.c_proj(merged)
+class MLP(nn.Module):
+    """Two-layer feed-forward network: expand 4x, GELU (tanh), project back."""
+    def __init__(self, config: GPTConfig):
+        super().__init__()
+        hidden_width = 4 * config.n_embd
+        self.c_fc = nn.Linear(config.n_embd, hidden_width)
+        self.gelu = nn.GELU(approximate="tanh")
+        self.c_proj = nn.Linear(hidden_width, config.n_embd)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.c_proj(self.gelu(self.c_fc(x)))
+class Block(nn.Module):
+    """One transformer layer: attention then MLP, each pre-normed and residual."""
+    def __init__(self, config: GPTConfig):
+        super().__init__()
+        self.ln_1 = nn.LayerNorm(config.n_embd)
+        self.attn = CausalSelfAttention(config)
+        self.ln_2 = nn.LayerNorm(config.n_embd)
+        self.mlp = MLP(config)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # LayerNorm is applied to the input of each sub-layer and the
+        # sub-layer output is added back onto the unnormalised stream.
+        after_attn = x + self.attn(self.ln_1(x))
+        return after_attn + self.mlp(self.ln_2(after_attn))
+class GPT(nn.Module):
+    """GPT language model: token + position embeddings, blocks, output head."""
+    def __init__(self, config: GPTConfig):
+        super().__init__()
+        self.config = config
+        self.transformer = nn.ModuleDict(
+            dict(
+                wte=nn.Embedding(config.vocab_size, config.n_embd),
+                wpe=nn.Embedding(config.block_size, config.n_embd),
+                h=nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
+                ln_f=nn.LayerNorm(config.n_embd),
+            )
+        )
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+        # Weight tying: the token embedding and the output projection share
+        # one parameter tensor.
+        self.transformer.wte.weight = self.lm_head.weight
+    def forward(
+        self, idx: torch.Tensor, targets: torch.Tensor | None = None
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
+        """Compute next-token logits and, when targets are given, the loss.
+        Args:
+            idx: Token ids of shape (batch, seq); seq must not exceed
+                ``config.block_size``.
+            targets: Optional token ids with the same number of elements as
+                ``idx``, used for the cross-entropy loss.
+        Returns:
+            ``(logits, loss)`` where ``logits`` has one vocabulary-sized row
+            per input position and ``loss`` is None when ``targets`` is None.
+        Raises:
+            ValueError: If the sequence is longer than ``config.block_size``.
+        """
+        B, T = idx.shape
+        # The position table has only block_size rows; fail with a clear
+        # message instead of an index error from inside the embedding lookup.
+        if T > self.config.block_size:
+            raise ValueError(
+                f"sequence length {T} exceeds block_size {self.config.block_size}"
+            )
+        pos = torch.arange(0, T, device=idx.device)
+        x = self.transformer.wte(idx) + self.transformer.wpe(pos)
+        for block in self.transformer.h:
+            x = block(x)
+        x = self.transformer.ln_f(x)
+        logits = self.lm_head(x)
+        loss = None
+        if targets is not None:
+            # Flatten batch and sequence so every position is one sample.
+            loss = nn.functional.cross_entropy(
+                logits.view(-1, logits.size(-1)), targets.view(-1)
+            )
+        return logits, loss

hf-space/requirements.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ torch>=2.0.0
2	+ gradio>=4.0.0

hf-space/thirukkural_clean.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

train.py CHANGED Viewed

@@ -61,11 +61,11 @@ def get_lr(
 def train(
     data_path: str,
-    max_steps: int = 5000,
     batch_size: int = 64,
-    n_layer: int = 6,
-    n_head: int = 6,
-    n_embd: int = 384,
     block_size: int = 256,
 ) -> tuple[GPT, dict[str, int], dict[int, str]]:
     """Train a GPT model on the given dataset."""

 def train(
     data_path: str,
+    max_steps: int = 10000,
     batch_size: int = 64,
+    n_layer: int = 8,
+    n_head: int = 8,
+    n_embd: int = 512,
     block_size: int = 256,
 ) -> tuple[GPT, dict[str, int], dict[int, str]]:
     """Train a GPT model on the given dataset."""