valluvar-or-ai / app.py
NaveenKumar Namachivayam
fix: restore HfFolder shim for Gradio 3.50.0
fd6258d
raw
history blame
12.8 kB
"""Gradio app for Thirukkural GPT - Valluvar or AI."""
import random
import re
import huggingface_hub
if not hasattr(huggingface_hub, "HfFolder"):
class _HfFolder:
path = None
@staticmethod
def get_token():
return None
huggingface_hub.HfFolder = _HfFolder
import gradio as gr
import torch
from model import GPT, GPTConfig
def load_model():
"""Load the trained model and tokenizer."""
# Allow GPTConfig for safe loading
from model import GPTConfig
torch.serialization.add_safe_globals([GPTConfig])
checkpoint = torch.load("checkpoint_final.pt", map_location="cpu", weights_only=True)
config = checkpoint["config"]
stoi = checkpoint["stoi"]
itos = checkpoint["itos"]
model = GPT(config)
model.load_state_dict(checkpoint["model_state_dict"])
model.eval()
return model, stoi, itos
def generate(model, prompt, stoi, itos, max_new_tokens=200, temperature=0.8, device="cpu"):
"""Generate text from prompt."""
model = model.to(device)
# Encode prompt
prompt_tokens = [stoi.get(c, stoi.get(" ", 0)) for c in prompt]
idx = torch.tensor([prompt_tokens], dtype=torch.long, device=device)
# Generate
with torch.no_grad():
for _ in range(max_new_tokens):
# Crop to block size
idx_cond = idx[:, -model.config.block_size :]
# Get predictions
logits, _ = model(idx_cond)
logits = logits[:, -1, :] / temperature
# Sample
probs = torch.softmax(logits, dim=-1)
idx_next = torch.multinomial(probs, num_samples=1)
# Append
idx = torch.cat((idx, idx_next), dim=1)
# Decode
tokens = idx[0].tolist()
result = "".join([itos.get(t, "") for t in tokens])
return result
def is_real_kural(text, original_text):
"""Check if generated text exists in original kurals.
A kural is considered "real" if:
1. The Tamil couplet (2 lines) exists in original
2. The English translation matches
"""
lines = text.strip().split("\n")
# Get Tamil lines (contain Tamil Unicode)
tamil_lines = [l.strip() for l in lines if re.search(r"[\u0B80-\u0BFF]", l)]
# Get English lines (no Tamil, just text)
english_lines = [l.strip() for l in lines if l.strip() and not re.search(r"[\u0B80-\u0BFF]", l)]
if len(tamil_lines) < 2:
return False
# Check if Tamil couplet exists in original
first_tamil = tamil_lines[0]
second_tamil = tamil_lines[1] if len(tamil_lines) > 1 else ""
# A true kural needs both Tamil lines to exist consecutively
tamil_couplet = first_tamil + "\n" + second_tamil
if tamil_couplet not in original_text:
return False
# Also check that English lines roughly match (at least one should exist)
if english_lines:
first_english = english_lines[0]
# Check if this English translation exists near the Tamil
return first_english in original_text
return True
# Load model and data
print("Loading model...")
model, stoi, itos = load_model()
print(f"Model loaded: {sum(p.numel() for p in model.parameters()) / 1e6:.1f}M params")
# Load original text for verification
with open("thirukkural_clean.txt", "r", encoding="utf-8") as f:
ORIGINAL_TEXT = f.read()
def generate_kural(prompt, temperature, max_tokens):
"""Generate and format kural with proper structure."""
# Generate with higher token count to ensure complete kural
output_raw = generate(model, prompt, stoi, itos, int(max_tokens) + 100, temperature)
# Extract first complete kural from generated text
lines = output_raw.strip().split("\n")
# Find the first proper kural (skip headers, get 2 Tamil + 2 English lines)
tamil_lines = []
english_lines = []
for line in lines:
line = line.strip()
if not line or " - " in line:
continue
# Skip short Tamil headers (1-2 words)
if re.search(r"[\u0B80-\u0BFF]", line) and len(line.split()) <= 2 and not re.search(r"[a-zA-Z]", line):
continue
if re.search(r"[\u0B80-\u0BFF]", line):
if len(tamil_lines) < 2:
tamil_lines.append(line)
elif line and len(english_lines) < 2:
english_lines.append(line)
# Build formatted output
formatted_lines = []
if tamil_lines:
formatted_lines.extend(tamil_lines[:2])
if english_lines:
formatted_lines.extend(english_lines[:2])
output = "\n".join(formatted_lines) if formatted_lines else format_kural(output_raw)
# Check if real or AI
is_real = is_real_kural(output_raw, ORIGINAL_TEXT)
source = "📖 Original Thirukkural" if is_real else "🤖 AI Generated"
return output, source
def format_kural(text):
"""Format kural text with proper structure (2 Tamil + 2 English lines)."""
lines = text.strip().split("\n")
# Skip headers: lines with " - " OR short single Tamil words (chapter names)
def is_header(line):
# Headers have " - " or are short Tamil-only phrases (1-3 words)
if " - " in line:
return True
# Check if it's a short Tamil phrase (likely a chapter title)
if re.search(r"[\u0B80-\u0BFF]", line) and len(line.split()) <= 3:
# And no English words
if not re.search(r"[a-zA-Z]", line):
return True
return False
content_lines = [l.strip() for l in lines if l.strip() and not is_header(l)]
# Classify lines
tamil_lines = [l for l in content_lines if re.search(r"[\u0B80-\u0BFF]", l)]
english_lines = [l for l in content_lines if l and not re.search(r"[\u0B80-\u0BFF]", l)]
# Build proper 4-line kural
formatted = []
# Tamil couplet (2 lines)
if len(tamil_lines) >= 2:
formatted.extend(tamil_lines[:2])
elif len(tamil_lines) == 1:
formatted.append(tamil_lines[0])
formatted.append("") # Placeholder
# English translation (2 lines)
if len(english_lines) >= 2:
formatted.extend(english_lines[:2])
elif len(english_lines) == 1:
formatted.append(english_lines[0])
formatted.append("")
return "\n".join(formatted)
def valluvar_or_ai_quiz():
"""Generate a quiz: one real, one AI."""
# Get random real kural - find a proper 4-line kural
lines = ORIGINAL_TEXT.strip().split("\n")
# Find a random valid kural (2 Tamil + 2 English lines)
attempts = 0
real_kural = ""
while attempts < 100:
idx = random.randint(0, len(lines) - 4)
chunk = lines[idx:idx+4]
tamil_count = sum(1 for l in chunk if re.search(r"[\u0B80-\u0BFF]", l))
english_count = sum(1 for l in chunk if l.strip() and not re.search(r"[\u0B80-\u0BFF]", l))
if tamil_count == 2 and english_count == 2:
real_kural = "\n".join(chunk).strip()
break
attempts += 1
# Fallback if no proper kural found
if not real_kural:
real_kural = "அகர முதல எழுத்தெல்லாம் ஆதி\nபகவன் முதற்றே உலகு\n'A' leads letters; the Ancient Lord\nLeads and lords the entire world"
# Generate AI kural with random prompt
prompts = ["கடவுள் வாழ்த்து", "நட்பு", "அறன்", "வான் சிறப்பு", "அரசியல்"]
prompt = random.choice(prompts)
ai_kural_raw = generate(model, prompt, stoi, itos, 150, 0.8)
ai_kural = format_kural(ai_kural_raw)
# Format real kural too
real_kural = format_kural(real_kural)
# Shuffle
kurals = [("A", real_kural, True), ("B", ai_kural, False)]
random.shuffle(kurals)
return (
f"## Option A\n```\n{kurals[0][1]}\n```\n\n---\n\n## Option B\n```\n{kurals[1][1]}\n```",
kurals[0][2],
kurals[1][2],
"A" if kurals[0][2] else "B",
)
# Gradio Interface
with gr.Blocks(title="Valluvar or AI?") as demo:
gr.Markdown("# 🕉️ Valluvar or AI?")
gr.Markdown(
"An AI that writes new Thirukkurals in the style of Thiruvalluvar. "
"Enter a Tamil theme to generate bilingual wisdom."
)
with gr.Tab("✨ Generate Kural"):
with gr.Row():
with gr.Column():
prompt = gr.Textbox(
label="Theme (Tamil)",
placeholder="e.g., கடவுள் வாழ்த்து, நட்பு, அரசியல்",
value="கடவுள் வாழ்த்து",
)
temperature = gr.Slider(
minimum=0.1,
maximum=2.0,
value=0.8,
step=0.1,
label="Temperature (Creativity)",
)
max_tokens = gr.Slider(
minimum=50,
maximum=400,
value=200,
step=50,
label="Max Tokens",
)
generate_btn = gr.Button("Generate", variant="primary")
with gr.Column():
output = gr.Textbox(
label="Generated Kural",
lines=10,
)
source = gr.Textbox(label="Source")
generate_btn.click(
fn=generate_kural,
inputs=[prompt, temperature, max_tokens],
outputs=[output, source],
)
# Quick theme buttons
gr.Markdown("### Quick Themes")
with gr.Row():
themes = [
"கடவுள் வாழ்த்து",
"வான் சிறப்பு",
"நட்பு",
"அரசியல்",
"அறன் வலியுறுத்தல்",
]
for theme in themes:
btn = gr.Button(theme)
btn.click(lambda t=theme: t, outputs=prompt)
with gr.Tab("🎯 Valluvar or AI? Quiz"):
gr.Markdown("Can you tell which is the original Thirukkural and which is AI-generated?")
quiz_output = gr.Markdown()
with gr.Row():
guess_a = gr.Button("Option A is Real", variant="secondary")
guess_b = gr.Button("Option B is Real", variant="secondary")
quiz_result = gr.Markdown()
new_quiz_btn = gr.Button("New Quiz", variant="primary")
# Store answers
a_is_real = gr.State()
b_is_real = gr.State()
correct_answer = gr.State()
def check_answer(guess, a_real, b_real, correct):
if guess == correct:
return "✅ Correct! You identified the original Thirukkural."
return "❌ Wrong! The original Thirukkural was: " + correct
new_quiz_btn.click(
fn=valluvar_or_ai_quiz,
outputs=[quiz_output, a_is_real, b_is_real, correct_answer],
)
guess_a.click(
fn=lambda a, b, c: check_answer("A", a, b, c),
inputs=[a_is_real, b_is_real, correct_answer],
outputs=quiz_result,
)
guess_b.click(
fn=lambda a, b, c: check_answer("B", a, b, c),
inputs=[a_is_real, b_is_real, correct_answer],
outputs=quiz_result,
)
with gr.Tab("📊 About"):
gr.Markdown(
f"""
## Model Details
- **Architecture:** GPT ({model.config.n_layer}L/{model.config.n_head}H/{model.config.n_embd}D)
- **Parameters:** {sum(p.numel() for p in model.parameters()) / 1e6:.1f}M
- **Vocabulary:** {len(stoi)} characters (Tamil + English)
- **Training Data:** Thirukkural (1330 kurals with English translations)
- **Tokenization:** Character-level
## Training
- Steps: 10,000
- Device: Apple MPS (Mac Mini)
- Time: ~5 hours
- Final Loss: ~1.5
## Capabilities
- ✅ Generate authentic Tamil couplets (2 lines × 4 words)
- ✅ Produce coherent English translations
- ✅ Handle traditional themes (virtue, politics, love)
- ❌ Modern topics (science, technology) - not in training data
## Examples of AI vs Original
The model sometimes generates exact memorized kurals from the 1330,
and sometimes creates entirely new ones in Thiruvalluvar's style.
Built with ❤️ using PyTorch and Gradio.
"""
)
if __name__ == "__main__":
demo.launch(
server_name="0.0.0.0",
server_port=7860,
)