#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
app.py
Gradio demo for Kabyle POS Tagger v2.
Pre-splits punctuation and hyphenated clitics, strips hyphens from clitics
before tokenization, then applies a post-processing lookup table to fix
remaining clitic misclassifications.
"""

import html
import re

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Identifier of the fine-tuned checkpoint handed to `from_pretrained`.
MODEL_NAME = "boffire/kabyle-pos-v2"

# Both objects are created once, at import time, and shared by every request.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# `.eval()` switches the module to inference mode and returns the module
# itself, so it can be chained onto the constructor call.
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME).eval()
# Class index -> POS label string, used to decode the argmax predictions.
id2label = model.config.id2label

# Human-readable name for each Universal Dependencies POS tag; shown in the
# "Description" column of the results table.
POS_DESCRIPTIONS = dict(
    ADJ="Adjective",
    ADP="Adposition",
    ADV="Adverb",
    AUX="Auxiliary verb",
    CCONJ="Coordinating conjunction",
    DET="Determiner",
    INTJ="Interjection",
    NOUN="Noun",
    NUM="Numeral",
    PART="Particle",
    PRON="Pronoun",
    PROPN="Proper noun",
    PUNCT="Punctuation",
    SCONJ="Subordinating conjunction",
    SYM="Symbol",
    VERB="Verb",
    X="Other",
)

# Badge background color per POS tag (the matching foreground colors live in
# POS_TEXT_COLORS). Related tags are kept in the same color family.
POS_COLORS = {
    # Blues
    "NOUN": "#1565c0",   # dark blue
    "PROPN": "#0d47a1",  # darker blue
    "PRON": "#0277bd",   # ocean blue
    # Greens
    "VERB": "#2e7d32",   # forest green
    "AUX": "#1b5e20",    # dark green
    # Oranges / yellows
    "ADJ": "#ef6c00",    # burnt orange
    "ADV": "#f9a825",    # golden; the one background that takes dark text
    # Purples / pinks / reds
    "ADP": "#6a1b9a",    # deep purple
    "PART": "#ad1457",   # dark pink
    "DET": "#c62828",    # dark red
    # Teals / cyans
    "NUM": "#00838f",    # teal
    "CCONJ": "#00695c",  # dark cyan
    "SCONJ": "#004d40",  # darker cyan
    # Remaining tags
    "INTJ": "#d84315",   # deep orange
    "PUNCT": "#455a64",  # blue grey
    "SYM": "#37474f",    # darker blue grey
    "X": "#5d4037",      # brown
}

# Badge foreground color per POS tag: white everywhere except ADV, whose
# yellow background needs dark text.
POS_TEXT_COLORS = {
    **dict.fromkeys(
        (
            "NOUN", "PROPN", "PRON", "VERB", "AUX", "ADJ", "ADV", "ADP",
            "PART", "DET", "NUM", "CCONJ", "SCONJ", "INTJ", "PUNCT", "SYM",
            "X",
        ),
        "#ffffff",
    ),
    # Re-assigning an existing key keeps its position in the dict.
    "ADV": "#000000",
}


# =============================================================================
# POST-PROCESSING: Clitic Lookup Table
# =============================================================================
# These morphemes are closed-class and their POS is deterministic.
# We split into two tiers:
#   1. Unambiguous forms (override always): multi-letter clitics and the
#      copula/directional particle "d".
#   2. Hyphen-only forms (override only when hyphenated): short subject
#      affixes and preposition-like clitics where standalone usage differs
#      from affix usage (e.g., "i" = ADP preposition, "i-" = PRON subject).
# -----------------------------------------------------------------------------

# Tier 1 lookup: forms whose tag is forced whether or not the token was
# written with a hyphen. Keys are matched after leading/trailing hyphens have
# been stripped from the token.
CLITIC_POS = {
    # Directional / copula particle
    "d": "PART",
    **dict.fromkeys(
        (
            # Possessive plural
            "nneɣ", "neɣ", "nteɣ", "nnteɣ",
            "wen", "nwen",
            "nkent", "tkent",
            "nsen", "tsen",
            "nsent", "tsent",
            "nnek", "nnem", "nnes",
            # Accusative / dative
            "iyi", "yi", "ayi",
            "kem", "akem",
            "tt", "itt",
            "aɣ", "yaɣ",
            "ken",
            "akent",
            "ten",
            "ak",
            "am",
            "as", "asen",
            "aneɣ", "anaɣ", "yanaɣ",
            "atneɣ", "atenteɣ",
            "awen", "atwen",
            "atkent",
            "atsen", "sen",
            "asent", "atsent",
        ),
        "PRON",
    ),
    # Demonstratives / determiners
    **dict.fromkeys(("agi", "a", "nni", "nniḍen", "niḍen"), "DET"),
}

# Tier 2 lookup: short forms that are ambiguous when written on their own
# (e.g. standalone "i" is a preposition, hyphenated "i" a pronominal affix).
# They are forced to PRON only when the token carries a leading or trailing
# hyphen.
# NOTE(review): "sen" is also a key of CLITIC_POS, which is consulted first,
# so the entry here never takes effect — confirm which tier is intended.
CLITIC_POS_HYPHEN_ONLY = dict.fromkeys(
    (
        "ɣ",            # 1st sg subject affix
        "t",            # 2nd sg / 3rd fem sg subject affix
        "k",            # 2nd masc sg
        "m",            # 2nd fem sg
        "n",            # 1st pl / 3rd masc pl subject affix
        "i",            # 3rd masc sg subject affix (vs. standalone prep.)
        "w", "iw", "inu",
        "ik",
        "im",
        "is",           # possessive / dative
        "kent",
        "sen", "sent",
    ),
    "PRON",
)


def apply_clitic_override(results):
    """
    Post-process model predictions using the clitic lookup tables.

    Each token's surface form is looked up with its leading/trailing hyphens
    removed. The lookup is case-sensitive, so a capitalised form such as a
    sentence-initial "D" is left to the model.

    Args:
        results: list of dicts with the keys "word" (display form, possibly
            hyphenated), "entity_group" (predicted tag) and "score"
            (confidence as a float).

    Returns:
        The same list object. Matching tokens are modified in place: their
        "entity_group" is replaced by the table value and their "score" is
        raised to at least 0.99.
    """
    for token in results:
        word = token["word"]
        stripped = word.strip("-")

        # A token made only of hyphens has nothing to look up.
        if not stripped:
            continue

        if stripped in CLITIC_POS:
            # Tier 1: unambiguous forms, overridden regardless of hyphenation.
            forced_tag = CLITIC_POS[stripped]
        elif (word.startswith("-") or word.endswith("-")) \
                and stripped in CLITIC_POS_HYPHEN_ONLY:
            # Tier 2: short affixes, overridden only in affix (hyphenated)
            # context because the bare form has a different reading.
            forced_tag = CLITIC_POS_HYPHEN_ONLY[stripped]
        else:
            continue

        token["entity_group"] = forced_tag
        # The override is rule-based, so never report less than 0.99.
        token["score"] = max(token["score"], 0.99)

    return results


# Tokenisation pattern; the alternatives are tried left to right:
#   1. a word (word characters and apostrophes) together with the hyphen that
#      marks it as a clitic: a leading hyphen ("-nneɣ"), or a trailing hyphen
#      that is not directly followed by another word ("akent-"). In "X-Y" the
#      hyphen therefore still goes with Y;
#   2. any single character that is not a word character, whitespace,
#      apostrophe or hyphen (punctuation and symbols);
#   3. a free-standing hyphen.
_TOKEN_RE = re.compile(r"-?[\w'’]+(?:-(?![\w'’]))?|[^\w\s'’-]|-", re.UNICODE)

# Characters Markdown would otherwise interpret inside a table cell.
_MARKDOWN_SPECIALS_RE = re.compile(r"([\\`*_\[\]<>|~&])")


def _split_tokens(text):
    """Split *text* into (display_tokens, model_tokens), two parallel lists."""
    display_tokens = _TOKEN_RE.findall(text.strip())
    # The model sees clitics without their hyphen; a token that consists only
    # of a hyphen is passed through unchanged.
    model_tokens = [tok.strip("-") or tok for tok in display_tokens]
    return display_tokens, model_tokens


def _tag_words(display_tokens, model_tokens):
    """Run the model and return one {"word", "entity_group", "score"} per word."""
    inputs = tokenizer(
        model_tokens,
        is_split_into_words=True,
        return_tensors="pt",
        return_offsets_mapping=False,
    )
    word_ids = inputs.word_ids(batch_index=0)

    with torch.no_grad():
        outputs = model(**inputs)

    predictions = torch.argmax(outputs.logits, dim=-1)[0].tolist()
    scores = torch.softmax(outputs.logits, dim=-1)[0].max(dim=-1).values.tolist()

    # Collect the subword predictions that belong to each input word.
    word_groups = {}
    for idx, wid in enumerate(word_ids):
        if wid is None:
            # Positions without a word id (special tokens) carry no tag.
            continue
        group = word_groups.setdefault(wid, {"labels": [], "scores": []})
        group["labels"].append(id2label[predictions[idx]])
        group["scores"].append(scores[idx])

    results = []
    for wid in sorted(word_groups):
        info = word_groups[wid]

        # Majority vote over the subword labels; on a tie the label that
        # occurred first wins.
        counts = {}
        for lbl in info["labels"]:
            counts[lbl] = counts.get(lbl, 0) + 1
        majority = max(counts, key=counts.get)

        results.append({
            "word": display_tokens[wid],
            "entity_group": majority,
            "score": sum(info["scores"]) / len(info["scores"]),
        })
    return results


def _render_html(results):
    """Render the tagged tokens as a row of colored two-line badges."""
    parts = ['<div style="font-size: 1.15rem; line-height: 2.2; font-family: system-ui, sans-serif; padding: 10px;">']
    for token in results:
        label = token["entity_group"]
        score = token["score"]
        bg_color = POS_COLORS.get(label, "#333333")
        text_color = POS_TEXT_COLORS.get(label, "#ffffff")
        # Tokens come straight from user input: escape them so characters
        # such as "<" or "&" are displayed instead of parsed as markup.
        word_html = html.escape(token["word"], quote=False)
        label_html = html.escape(label, quote=False)
        parts.append(
            '<span style="display: inline-block; margin: 3px; vertical-align: top; box-shadow: 0 1px 3px rgba(0,0,0,0.3);">'
            '<span style="background: ' + bg_color + '; color: ' + text_color + '; border-radius: 6px 6px 0 0; padding: 5px 10px; display: block; text-align: center; font-weight: 600; font-size: 1.1rem;">'
            + word_html +
            '</span>'
            '<span style="background: #1a1a1a; color: #fff; border-radius: 0 0 6px 6px; padding: 3px 10px; display: block; text-align: center; font-size: 0.8rem; font-weight: 500;">'
            + label_html + ' <span style="opacity: 0.7;">(' + f"{score:.2f}" + ')</span>'
            '</span>'
            '</span>'
        )
    parts.append('</div>')
    return "\n".join(parts)


def _render_markdown(results):
    """Render the tagged tokens as a Markdown table."""
    lines = [
        "| Token | POS Tag | Description | Confidence |",
        "|-------|---------|-------------|------------|"
    ]
    for token in results:
        label = token["entity_group"]
        desc = POS_DESCRIPTIONS.get(label, label)
        score = token["score"]
        # Backslash-escape Markdown punctuation so that a token like "|" or
        # "*" neither breaks the row nor gets interpreted as formatting.
        word_md = _MARKDOWN_SPECIALS_RE.sub(r"\\\1", token["word"])
        lines.append(f"| {word_md} | `{label}` | {desc} | {score:.3f} |")
    return "\n".join(lines)


def tag_text(text):
    """
    Tag a Kabyle sentence and return its visualizations.

    The text is split into words, hyphen-marked clitics and punctuation, the
    hyphen-less forms are tagged by the model, subword predictions are merged
    per word by majority vote, and the clitic lookup tables are applied last.

    Args:
        text: raw user input; may be empty or None.

    Returns:
        A ``(html, markdown)`` pair of strings. On empty input, on an error
        during tagging, or when no token was tagged, ``html`` is "" and
        ``markdown`` holds a short message instead of a table.
    """
    if not text or not text.strip():
        return "", "Please enter some Kabyle text."

    try:
        display_tokens, model_tokens = _split_tokens(text)
        results = apply_clitic_override(_tag_words(display_tokens, model_tokens))
    except Exception as e:
        # UI boundary: report the failure in the output pane instead of
        # letting the exception propagate to the web framework.
        return "", f"Error: {str(e)}"

    if not results:
        return "", "No tokens found."

    return _render_html(results), _render_markdown(results)


# Sample sentences passed to gr.Interface(examples=...) below.
# The last two contain hyphenated forms ("Tameddakelt-nneɣ", "adlis-is",
# "d-yusan") and therefore exercise the hyphen pre-splitting in tag_text.
examples = [
    "Aṭas n medden i yessen.",
    "Taqbaylit d tutlayt deg Lezzayer.",
    "Yella wuccen ameqqran deg taddart.",
    "Tameddakelt-nneɣ teɣra adlis-is.",
    "D nekkni i d-yusan d imezwura.",
]

# Web UI: one text box in; the HTML badge view and the Markdown table
# returned by tag_text out.
demo = gr.Interface(
    title="Kabyle POS Tagger v2",
    description="""
    <div style="text-align: center;">
        <h2>Kabyle Part-of-Speech Tagger</h2>
        <p>Enter a sentence in <strong>Kabyle</strong> (Berber language) to see POS tags predicted by
        <a href="https://huggingface.co/boffire/kabyle-pos-v2" target="_blank">boffire/kabyle-pos-v2</a>
        (XLM-RoBERTa-base, Test F1: 93.8%).</p>
        <p style="font-size: 0.9rem; color: #666;">
            Tags follow the <a href="https://universaldependencies.org/u/pos/" target="_blank">Universal Dependencies</a> POS tagset.
        </p>
    </div>
    """,
    fn=tag_text,
    inputs=gr.Textbox(
        lines=2,
        label="Kabyle Text",
        placeholder="Enter a sentence in Kabyle (e.g., Aṭas n medden i yessen.)",
    ),
    # Order matches the (html, markdown) pair returned by tag_text.
    outputs=[gr.HTML(label="Tagged Visualization"), gr.Markdown(label="Results Table")],
    examples=examples,
)

# Launch the Gradio app only when this file is run as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    demo.launch()