"""Kural text processing, parsing, and structured generation.""" import random from typing import Any from config import ( _TAMIL_RE, _ENGLISH_RE, _KURAL_FILE, _MAX_GENERATE_ATTEMPTS, _TEMP_INCREMENT, _MAX_TEMP, _MAX_SEED, _GENERATE_MAX_TOKENS, ) from model_engine import generate, model, stoi, itos # --------------------------------------------------------------------------- # Raw corpus # --------------------------------------------------------------------------- with _KURAL_FILE.open("r", encoding="utf-8") as fh: ORIGINAL_TEXT = fh.read() # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- def _is_tamil_line(line: str) -> bool: """Return True if the line contains Tamil Unicode characters.""" return bool(_TAMIL_RE.search(line)) def _is_header(line: str) -> bool: """Return True if the line is a chapter header, not a kural.""" if " - " in line: return True words = line.split() return bool(_TAMIL_RE.search(line) and len(words) <= 2 and not _ENGLISH_RE.search(line)) def _extract_tamil_couplet(text: str) -> str | None: """Extract the first 2 Tamil lines from text, skipping headers.""" lines = text.strip().split("\n") tamil_lines = [] for line in lines: line = line.strip() if not line or " - " in line or _is_header(line): continue if _is_tamil_line(line) and len(tamil_lines) < 2: tamil_lines.append(line) return "\n".join(tamil_lines[:2]) if len(tamil_lines) >= 2 else None def _is_couplet_in_original(couplet: str | None, original_text: str) -> bool: """Check if a Tamil couplet exists in the original text.""" return bool(couplet and couplet in original_text) def _is_valid_kural_structure(tamil_lines: list[str]) -> bool: """Check if Tamil lines follow Thirukkural structure: 4 words first line, 3 words second line.""" return len(tamil_lines) >= 2 and len(tamil_lines[0].split()) == 4 and len(tamil_lines[1].split()) == 3 def _extract_kural_lines(lines: list[str]) -> tuple[list[str], list[str]]: """Extract Tamil and English lines from raw output, skipping headers.""" tamil_lines: list[str] = [] english_lines: list[str] = [] for line in lines: line = line.strip() if not line or " - " in line or _is_header(line): continue if _is_tamil_line(line): if len(tamil_lines) < 2: tamil_lines.append(line) elif len(english_lines) < 2: english_lines.append(line) return tamil_lines, english_lines # --------------------------------------------------------------------------- # Formatting # --------------------------------------------------------------------------- def format_kural(text: str) -> str: """Format kural text with proper structure (2 Tamil + 2 English lines).""" content_lines = [l.strip() for l in text.strip().split("\n") if l.strip() and not _is_header(l)] tamil_lines = [l for l in content_lines if _is_tamil_line(l)] english_lines = [l for l in content_lines if l and not _is_tamil_line(l)] formatted: list[str] = [] formatted.extend(tamil_lines[:2] if len(tamil_lines) >= 2 else [tamil_lines[0], ""] if tamil_lines else []) formatted.extend(english_lines[:2] if len(english_lines) >= 2 else [english_lines[0], ""] if english_lines else []) return "\n".join(formatted) # --------------------------------------------------------------------------- # Parsing # --------------------------------------------------------------------------- def parse_kurals(text: str) -> list[dict[str, Any]]: """Parse original text into structured kurals with chapter, number, and text.""" kurals: list[dict[str, Any]] = [] lines = text.strip().split("\n") current_chapter = "" current_chapter_en = "" kural_number = 0 i = 0 total = len(lines) while i < total: line = lines[i].strip() if " - " in line: current_chapter, current_chapter_en = line.split(" - ", 1) current_chapter = current_chapter.strip() current_chapter_en = current_chapter_en.strip() i += 1 continue if not line or i + 3 >= total: i += 1 continue chunk = [lines[i + j].strip() for j in range(4)] tamil = [l for l in chunk if _is_tamil_line(l)] english = [l for l in chunk if l and not _is_tamil_line(l)] if len(tamil) == 2 and len(english) == 2: kural_number += 1 kurals.append({ "number": kural_number, "chapter_tamil": current_chapter, "chapter_english": current_chapter_en, "tamil_1": tamil[0], "tamil_2": tamil[1], "english_1": english[0], "english_2": english[1], }) i += 5 continue i += 1 return kurals _KURALS_DB: list[dict[str, Any]] | None = None def _get_kurals_db() -> list[dict[str, Any]]: global _KURALS_DB if _KURALS_DB is None: _KURALS_DB = parse_kurals(ORIGINAL_TEXT) return _KURALS_DB # --------------------------------------------------------------------------- # Structured generation # --------------------------------------------------------------------------- def generate_kural(prompt: str, temperature: float, max_tokens: float) -> tuple[str, str]: """Generate and format kural with proper structure.""" output_raw = "" final_attempt = 0 is_ai_generated = False for attempt in range(_MAX_GENERATE_ATTEMPTS): final_attempt = attempt + 1 temp = min(temperature + attempt * _TEMP_INCREMENT, _MAX_TEMP) seed = random.randint(1, _MAX_SEED) output_raw = generate(model, prompt, stoi, itos, max_new_tokens=int(max_tokens) + 100, temperature=temp, seed=seed) couplet = _extract_tamil_couplet(output_raw) if not couplet: continue tamil_lines = couplet.split("\n") if not _is_valid_kural_structure(tamil_lines): is_ai_generated = True break if not _is_couplet_in_original(couplet, ORIGINAL_TEXT): is_ai_generated = True break lines = output_raw.strip().split("\n") tamil_lines, english_lines = _extract_kural_lines(lines) # If we don't have valid 4-3 structure, try to find lines that do if tamil_lines and len(tamil_lines) >= 2 and not _is_valid_kural_structure(tamil_lines): all_tamil = [l.strip() for l in lines if _is_tamil_line(l)] for i in range(len(all_tamil) - 1): candidate = [all_tamil[i], all_tamil[i + 1]] if _is_valid_kural_structure(candidate): tamil_lines = candidate break formatted_lines = tamil_lines[:2] + english_lines[:2] output = "\n".join(formatted_lines) if formatted_lines else format_kural(output_raw) if is_ai_generated: source = "🤖 AI Generated" confidence = max(20, 100 - (final_attempt - 1) * 4) else: source = "📖 Original Thirukkural" confidence = 100 return output, f"{source} (Confidence: {confidence}%)"