"""Kural text processing, parsing, and structured generation."""
import random
from typing import Any

from config import (
    _TAMIL_RE,
    _ENGLISH_RE,
    _KURAL_FILE,
    _MAX_GENERATE_ATTEMPTS,
    _TEMP_INCREMENT,
    _MAX_TEMP,
    _MAX_SEED,
    _GENERATE_MAX_TOKENS,
)
from model_engine import generate, model, stoi, itos

# ---------------------------------------------------------------------------
# Raw corpus
# ---------------------------------------------------------------------------
# The corpus is read in full as a side effect of importing this module, so any
# error raised by opening or decoding _KURAL_FILE surfaces at import time.
with _KURAL_FILE.open("r", encoding="utf-8") as fh:
    # Entire file contents as one string; used below for substring membership
    # checks and as the input to parse_kurals().
    ORIGINAL_TEXT = fh.read()


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _is_tamil_line(line: str) -> bool:
    """Report whether ``_TAMIL_RE`` matches anywhere inside *line*."""
    return _TAMIL_RE.search(line) is not None


def _is_header(line: str) -> bool:
    """Decide whether *line* should be treated as a chapter header.

    A line counts as a header when it contains the ``" - "`` separator, or
    when it has at most two whitespace-separated words, matches ``_TAMIL_RE``
    and does not match ``_ENGLISH_RE``.
    """
    # Any line carrying the " - " separator is a header outright.
    if " - " in line:
        return True
    # Otherwise only short, Tamil-only lines qualify.
    if len(line.split()) > 2:
        return False
    if _ENGLISH_RE.search(line):
        return False
    return _TAMIL_RE.search(line) is not None


def _extract_tamil_couplet(text: str) -> str | None:
    """Return the first two non-header Tamil lines of *text* joined by a newline.

    Lines are stripped before classification. ``None`` is returned when fewer
    than two qualifying lines are present.
    """
    picked: list[str] = []
    for raw in text.strip().split("\n"):
        candidate = raw.strip()
        # _is_header() already reports lines containing " - " as headers.
        if not candidate or _is_header(candidate) or not _is_tamil_line(candidate):
            continue
        picked.append(candidate)
        if len(picked) == 2:
            # Later lines can never alter the result, so stop scanning here.
            return "\n".join(picked)
    return None


def _is_couplet_in_original(couplet: str | None, original_text: str) -> bool:
    """Check if a Tamil couplet exists in the original text."""
    return bool(couplet and couplet in original_text)


def _is_valid_kural_structure(tamil_lines: list[str]) -> bool:
    """Check if Tamil lines follow Thirukkural structure: 4 words first line, 3 words second line."""
    return len(tamil_lines) >= 2 and len(tamil_lines[0].split()) == 4 and len(tamil_lines[1].split()) == 3


def _extract_kural_lines(lines: list[str]) -> tuple[list[str], list[str]]:
    """Split raw output lines into up to two Tamil and up to two other lines.

    Each line is stripped; blank lines and headers are discarded. Lines that
    are not Tamil go to the second list. Once a list holds two entries,
    further lines of that kind are dropped.
    """
    tamil: list[str] = []
    english: list[str] = []
    for raw in lines:
        text = raw.strip()
        # _is_header() already reports lines containing " - " as headers.
        if not text or _is_header(text):
            continue
        bucket = tamil if _is_tamil_line(text) else english
        if len(bucket) < 2:
            bucket.append(text)
    return tamil, english


# ---------------------------------------------------------------------------
# Formatting
# ---------------------------------------------------------------------------
def format_kural(text: str) -> str:
    """Lay out *text* as up to two Tamil lines followed by up to two other lines.

    Blank lines and headers are dropped first. A group with exactly one line
    is padded with an empty string so it still occupies two output lines; an
    empty group contributes nothing.
    """
    kept: list[str] = []
    for raw in text.strip().split("\n"):
        stripped = raw.strip()
        # The header test deliberately sees the unstripped line.
        if stripped and not _is_header(raw):
            kept.append(stripped)

    tamil_group = [entry for entry in kept if _is_tamil_line(entry)]
    english_group = [entry for entry in kept if not _is_tamil_line(entry)]

    def _pair(group: list[str]) -> list[str]:
        # Two or more -> first two; exactly one -> pad with ""; none -> nothing.
        if len(group) >= 2:
            return group[:2]
        if group:
            return [group[0], ""]
        return []

    return "\n".join(_pair(tamil_group) + _pair(english_group))


# ---------------------------------------------------------------------------
# Parsing
# ---------------------------------------------------------------------------
def parse_kurals(text: str) -> list[dict[str, Any]]:
    """Parse corpus text into structured kurals with chapter, number, and text.

    The text is scanned line by line:

    * A line containing ``" - "`` is a chapter header; the parts before and
      after the first ``" - "`` become the Tamil and English chapter names
      for every kural that follows.
    * Otherwise the four lines starting at the cursor form a kural when
      exactly two of them are Tamil (per ``_is_tamil_line``) and the other
      two are non-blank, non-Tamil lines.

    Args:
        text: Raw corpus text.

    Returns:
        One dict per kural, in file order, with keys ``number`` (1-based
        running count), ``chapter_tamil``, ``chapter_english``, ``tamil_1``,
        ``tamil_2``, ``english_1`` and ``english_2``.
    """
    kurals: list[dict[str, Any]] = []
    lines = text.strip().split("\n")
    current_chapter = ""
    current_chapter_en = ""
    i = 0
    total = len(lines)

    while i < total:
        line = lines[i].strip()

        if " - " in line:
            tamil_name, english_name = line.split(" - ", 1)
            current_chapter = tamil_name.strip()
            current_chapter_en = english_name.strip()
            i += 1
            continue

        # Skip blanks, and stop probing once fewer than four lines remain.
        if not line or i + 3 >= total:
            i += 1
            continue

        chunk = [candidate.strip() for candidate in lines[i:i + 4]]
        tamil = [entry for entry in chunk if _is_tamil_line(entry)]
        english = [entry for entry in chunk if entry and not _is_tamil_line(entry)]

        if len(tamil) == 2 and len(english) == 2:
            kurals.append({
                "number": len(kurals) + 1,
                "chapter_tamil": current_chapter,
                "chapter_english": current_chapter_en,
                "tamil_1": tamil[0],
                "tamil_2": tamil[1],
                "english_1": english[0],
                "english_2": english[1],
            })
            # Consume only the four kural lines. A blank separator, if any,
            # is skipped by the blank-line branch above; jumping five lines
            # would drop a chapter header or the next kural's first line
            # whenever no blank line follows.
            i += 4
            continue

        i += 1

    return kurals


# Parsed corpus cache; stays None until _get_kurals_db() is first called.
_KURALS_DB: list[dict[str, Any]] | None = None


def _get_kurals_db() -> list[dict[str, Any]]:
    """Return the parsed kurals, parsing ``ORIGINAL_TEXT`` on first use only."""
    global _KURALS_DB
    if _KURALS_DB is not None:
        return _KURALS_DB
    _KURALS_DB = parse_kurals(ORIGINAL_TEXT)
    return _KURALS_DB


# ---------------------------------------------------------------------------
# Structured generation
# ---------------------------------------------------------------------------
def generate_kural(prompt: str, temperature: float, max_tokens: float) -> tuple[str, str]:
    """Generate a kural from *prompt* and return it with a provenance label.

    The model is sampled up to ``_MAX_GENERATE_ATTEMPTS`` times with a fresh
    random seed each time. The temperature rises by ``_TEMP_INCREMENT`` per
    attempt and is capped at ``_MAX_TEMP``. Sampling stops at the first
    attempt whose Tamil couplet is either not in 4-word / 3-word form or not
    found verbatim in ``ORIGINAL_TEXT``. Attempts that yield no couplet, or
    a couplet copied from the corpus, are retried.

    Args:
        prompt: Text passed to the model as the generation prompt.
        temperature: Sampling temperature for the first attempt.
        max_tokens: Requested length; truncated to ``int`` and increased by
            100 before being passed as ``max_new_tokens``.

    Returns:
        ``(output, label)``. ``output`` holds up to two Tamil lines followed
        by up to two non-Tamil lines from the last attempt. ``label`` is the
        source marker plus a confidence percentage.
    """
    output_raw = ""
    final_attempt = 0
    # Describes the LAST attempt. It is cleared only when that attempt's
    # couplet is well formed and found verbatim in the corpus, so output with
    # no usable couplet is never reported as an original kural.
    is_ai_generated = True

    for attempt in range(_MAX_GENERATE_ATTEMPTS):
        final_attempt = attempt + 1
        temp = min(temperature + attempt * _TEMP_INCREMENT, _MAX_TEMP)
        seed = random.randint(1, _MAX_SEED)
        # NOTE(review): the extra 100 tokens look like headroom for headers
        # and partial lines that get discarded later; confirm the intent.
        output_raw = generate(
            model,
            prompt,
            stoi,
            itos,
            max_new_tokens=int(max_tokens) + 100,
            temperature=temp,
            seed=seed,
        )

        couplet = _extract_tamil_couplet(output_raw)
        if not couplet:
            # Nothing usable this round: unverified, so try again.
            is_ai_generated = True
            continue

        is_ai_generated = not (
            _is_valid_kural_structure(couplet.split("\n"))
            and _is_couplet_in_original(couplet, ORIGINAL_TEXT)
        )
        if is_ai_generated:
            break
        # Otherwise the model reproduced a corpus couplet; keep sampling.

    lines = output_raw.strip().split("\n")
    tamil_lines, english_lines = _extract_kural_lines(lines)

    # If the first two Tamil lines are not in 4-3 form, fall back to the first
    # adjacent pair of Tamil lines (headers included) that is.
    if len(tamil_lines) >= 2 and not _is_valid_kural_structure(tamil_lines):
        all_tamil = [entry.strip() for entry in lines if _is_tamil_line(entry)]
        for first, second in zip(all_tamil, all_tamil[1:]):
            if _is_valid_kural_structure([first, second]):
                tamil_lines = [first, second]
                break

    formatted_lines = tamil_lines[:2] + english_lines[:2]
    output = "\n".join(formatted_lines) if formatted_lines else format_kural(output_raw)

    if is_ai_generated:
        source = "🤖 AI Generated"
        # Four points off per retry, floored at 20. The inner max() keeps the
        # value at 100 or below when no attempt ran at all.
        confidence = max(20, 100 - max(final_attempt - 1, 0) * 4)
    else:
        source = "📖 Original Thirukkural"
        confidence = 100

    return output, f"{source} (Confidence: {confidence}%)"