"""Minimal baseline Gradio entry point for the Month 1-3 rebuild.

Wires the simplest possible slice: Whisper (zero-shot) -> Aya-Expanse -> MMS-TTS,
with a curated phrasebook lookup (src.llm.phrasebook) in front of the LLM.
No LoRA adapters, no memory loop, no speaker ID, no voice cloning, no IoT,
no phrase matcher. Used for field testing and building a real-user eval set.

See docs/baseline_rebuild.md for the plan this fits into.

Run locally:
    HF_TOKEN=hf_xxx python app_minimal.py

Environment variables (all optional except HF_TOKEN, which is needed for the
HF Serverless LLM call):
    HF_TOKEN       — HuggingFace token with read access
    LLM_MODEL_ID   — default "CohereLabs/aya-expanse-32b"
                     (23-language multilingual, strong African-language coverage)
    DEVICE         — "cuda" or "cpu" (auto if unset)
    LOG_LEVEL      — default "INFO"
"""
from __future__ import annotations

import logging
import os
from typing import Optional, Tuple

import numpy as np

# Load .env (HF_TOKEN etc.) before reading os.environ below. Silent no-op if
# python-dotenv is not installed or no .env is present.
try:
    from dotenv import load_dotenv
    load_dotenv()
except ImportError:
    pass

# Local imports — the modules the baseline-rebuild plan authorizes.
# Everything else in src/ is intentionally unused here.
from src.data.bam_normalize import normalize as bam_normalize
from src.engine.turn_logger import TurnLogger
from src.engine.whisper_base import WhisperBackbone
from src.llm.minimal_client import MinimalClient
from src.llm.phrasebook import lookup as phrasebook_lookup, top_k as phrasebook_top_k
from src.tts.mms_tts import MMSTTSEngine

# LOG_LEVEL is normalised before it reaches logging: the stdlib only accepts
# upper-case level names, so "debug" (or an empty `LOG_LEVEL=` line in .env)
# would otherwise make basicConfig raise ValueError at import time.
_requested_log_level = (os.getenv("LOG_LEVEL") or "INFO").strip().upper()
# getLevelName() maps a known level *name* to its int; anything else comes
# back as a "Level X" string, which we treat as "unknown".
_log_level = (
    _requested_log_level
    if isinstance(logging.getLevelName(_requested_log_level), int)
    else "INFO"
)
logging.basicConfig(
    level=_log_level,
    format="%(asctime)s  %(name)-30s  %(levelname)-7s  %(message)s",
)
logger = logging.getLogger(__name__)
if _log_level != _requested_log_level:
    logger.warning(
        "Unknown LOG_LEVEL %r — falling back to INFO", _requested_log_level
    )


# ── Environment ──────────────────────────────────────────────────────────────
HF_TOKEN = os.environ.get("HF_TOKEN")  # None when unset
LLM_MODEL_ID = os.environ.get("LLM_MODEL_ID", "CohereLabs/aya-expanse-32b")
_REQUESTED_DEVICE = os.environ.get("DEVICE")  # explicit device override, if any

# (display label, language code) pairs, in the order the dropdowns list them.
LANG_CHOICES = [
    ("Bambara", "bam"),
    ("Fula", "ful"),
    ("French", "fr"),
    ("English", "en"),
]

# Reverse view of LANG_CHOICES: language code -> display label.
LANG_NAMES = {code: label for label, code in LANG_CHOICES}

# Language hint handed to Whisper's generate() per input language. Bambara and
# Fula map to None so that no `language` is passed and Whisper auto-detects
# (they are not treated as first-class Whisper languages here); French and
# English get an explicit hint for clean decoding.
LANG_TO_WHISPER_HINT = {
    "bam": None,
    "ful": None,
    "fr": "french",
    "en": "english",
}


# Reply-language steering is handled inside MinimalClient via a dialect-anchored
# system prompt (see src/llm/minimal_client.py). No per-turn directive needed.


# ── Service singletons (lazy-loaded) ────────────────────────────────────────
# The three heavy services start as None and are populated on first use by
# get_backbone() / get_llm() / get_tts(). The turn logger is the exception:
# it is instantiated eagerly, at import time.
_backbone:    Optional[WhisperBackbone] = None
_llm:         Optional[MinimalClient]   = None
_tts:         Optional[MMSTTSEngine]    = None
_turn_logger: TurnLogger                = TurnLogger()


def _resolve_device() -> str:
    """Return the device string to run models on.

    Resolution order:
      1. The DEVICE environment override (``_REQUESTED_DEVICE``), returned
         verbatim when it is set to a non-empty value.
      2. ``"cuda"`` when torch reports CUDA as available *and* sees at least
         one device.
      3. ``"cpu"`` otherwise, including when the CUDA probe itself raises.

    The extra ``device_count()`` check and the exception fallback exist to
    keep the app usable on CPU-only machines where ``is_available()`` alone
    is not trustworthy.
    """
    # The override needs no probing, so decide it before importing torch.
    if _REQUESTED_DEVICE:
        return _REQUESTED_DEVICE

    import torch  # lazy

    try:
        if torch.cuda.is_available() and torch.cuda.device_count() > 0:
            return "cuda"
    except Exception:
        # Deliberate best-effort: a broken CUDA probe must not take the app
        # down, but leave a trace so the fallback is diagnosable.
        logger.debug("CUDA probe failed — falling back to cpu", exc_info=True)
    return "cpu"


def get_backbone() -> WhisperBackbone:
    """Return the shared Whisper backbone, loading it on first call.

    Zero-shot — no adapters. The instance is built from
    ``configs/base_config.yaml`` and loaded onto the device chosen by
    ``_resolve_device()`` using ``HF_TOKEN``.

    The module-level cache is only populated after ``load()`` succeeds, so a
    failed load (network error, out of memory, ...) propagates to the caller
    and the next call retries instead of handing out an unloaded backbone.
    """
    global _backbone
    if _backbone is None:
        backbone = WhisperBackbone(config_path="configs/base_config.yaml")
        backbone.load(device=_resolve_device(), hf_token=HF_TOKEN)
        logger.info("Whisper backbone ready: %s on %s",
                    backbone.model_id, backbone.device)
        # Publish only once fully loaded.
        _backbone = backbone
    return _backbone


def get_llm() -> MinimalClient:
    """Return the shared LLM client, constructing it on first call.

    The client is configured with ``LLM_MODEL_ID`` and ``HF_TOKEN`` and then
    cached in the module-level ``_llm``.
    """
    global _llm
    if _llm is not None:
        return _llm
    _llm = MinimalClient(model_id=LLM_MODEL_ID, hf_token=HF_TOKEN)
    logger.info("Minimal LLM client configured: %s", LLM_MODEL_ID)
    return _llm


def get_tts() -> MMSTTSEngine:
    """Return the shared MMS-TTS engine, constructing it on first call.

    The engine object is cached in the module-level ``_tts``.
    """
    global _tts
    if _tts is not None:
        return _tts
    _tts = MMSTTSEngine()
    logger.info("MMS-TTS engine ready (lazy per-language load)")
    return _tts


# ── Core pipeline ────────────────────────────────────────────────────────────
def transcribe(audio_np: np.ndarray, sample_rate: int, input_lang: str) -> str:
    """Run zero-shot Whisper on a numpy audio array and return the transcript.

    Steps: down-mix to mono, convert to float32 in roughly [-1, 1], resample
    to 16 kHz if needed, run ``generate`` on the shared backbone, decode, and
    strip surrounding whitespace.

    Args:
        audio_np: 1-D samples, or a 2-D array that is averaged over axis 1
            (assumes a (samples, channels) layout — TODO confirm for sources
            other than gr.Audio). Integer PCM and float input are accepted.
        sample_rate: Sampling rate of ``audio_np`` in Hz.
        input_lang: Language code. Drives two things only: the Whisper
            language hint (via ``LANG_TO_WHISPER_HINT``, set for fr/en) and
            whether ``bam_normalize`` is applied to the result ("bam" only).
            It has no effect on the TTS voice or on the LLM reply language —
            those follow the separate output-language dropdown in the UI.

    Returns:
        The decoded transcript; passed through ``bam_normalize`` when
        ``input_lang == "bam"`` and the transcript is non-empty.
    """
    import torch  # lazy
    import librosa  # lazy — resample if the mic gave us something non-16k

    backbone = get_backbone()
    target_sr = 16_000

    # Capture the dtype before any conversion: for integer PCM it tells us
    # the true full-scale value (int16 -> 32768, int32 -> 2**31).
    source_dtype = audio_np.dtype

    # Ensure mono float32
    if audio_np.ndim == 2:
        audio_np = audio_np.mean(axis=1)
    audio_np = audio_np.astype(np.float32)

    if np.issubdtype(source_dtype, np.signedinteger):
        # Scale by the source width. A fixed 32768 would leave int32 audio
        # far outside [-1, 1], and a peak-based test would skip near-silent
        # int16 clips entirely.
        audio_np = audio_np / (float(np.iinfo(source_dtype).max) + 1.0)
    else:
        # Non-integer input that still looks like raw int16 values cast to
        # float: fall back to the peak heuristic.
        peak = np.max(np.abs(audio_np)) if audio_np.size else 0.0
        if peak > 1.5:
            audio_np = audio_np / 32768.0

    if sample_rate != target_sr:
        audio_np = librosa.resample(audio_np, orig_sr=sample_rate, target_sr=target_sr)

    inputs = backbone.processor(
        audio_np, sampling_rate=target_sr, return_tensors="pt"
    )
    input_features = inputs.input_features.to(backbone.device)
    # NOTE(review): presumably the backbone loads fp16 weights on CUDA, hence
    # the half() cast. The exact-match test does not cover "cuda:0" — confirm
    # against WhisperBackbone.load before relying on indexed device strings.
    if backbone.device == "cuda":
        input_features = input_features.half()

    gen_kwargs: dict = {"max_new_tokens": 128}
    hint = LANG_TO_WHISPER_HINT.get(input_lang)
    if hint:
        gen_kwargs["language"] = hint
        gen_kwargs["task"] = "transcribe"

    with torch.no_grad():
        output_ids = backbone.model.generate(input_features, **gen_kwargs)

    transcript = backbone.processor.batch_decode(
        output_ids, skip_special_tokens=True
    )[0].strip()

    if input_lang == "bam" and transcript:
        transcript = bam_normalize(transcript)

    return transcript


# Sentinel text returned by _translate_only() when the phrasebook has no entry
# and the output language is not en/fr. It is displayed as-is, never spoken.
NO_TRANSLATION = "(no curated translation — try Generate reply)"


def _synthesize(text: str, output_lang: str
                ) -> Tuple[Optional[Tuple[int, np.ndarray]], Optional[int], Optional[str]]:
    """Speak `text` in `output_lang` via the shared TTS engine.

    Returns a triple ``(audio, tts_ms, error)``:
      * success        -> ``((sample_rate, waveform), elapsed_ms, None)``
      * empty `text`   -> ``(None, None, None)`` (nothing attempted)
      * failure        -> ``(None, None, "tts: <exception>")``

    An AssertionError on a non-cpu device triggers exactly one retry on cpu;
    ``elapsed_ms`` then covers both attempts.
    """
    import time
    if not text:
        return None, None, None

    started = time.perf_counter()
    device = _resolve_device()
    try:
        wav, sr = get_tts().synthesize(text, language=output_lang, device=device)
    except AssertionError as exc:
        # Most common: "Torch not compiled with CUDA enabled" on CPU-only boxes
        # where is_available() lied.
        if device == "cpu":
            # Already on cpu — there is nothing left to fall back to.
            logger.exception("TTS failed")
            return None, None, f"tts: {exc}"
        logger.warning("TTS failed on %s (%s) — retrying on cpu", device, exc)
        try:
            wav, sr = get_tts().synthesize(text, language=output_lang, device="cpu")
        except Exception as exc2:  # pragma: no cover
            logger.exception("TTS failed on cpu fallback")
            return None, None, f"tts: {exc2}"
    except Exception as exc:  # pragma: no cover
        logger.exception("TTS failed")
        return None, None, f"tts: {exc}"

    elapsed_ms = int((time.perf_counter() - started) * 1000)
    return (sr, wav), elapsed_ms, None


def _translate_only(user_text: str, output_lang: str
                    ) -> Tuple[str, Optional[Tuple[int, np.ndarray]], Optional[dict], Optional[int]]:
    """Translate via the curated phrasebook only — the LLM is never called.

    Returns ``(translation_text, translation_audio, hit_or_None, tts_ms)``.

    * Blank input          -> ``("", None, None, None)``.
    * Phrasebook hit       -> the hit's target text, spoken in `output_lang`.
    * Miss, en/fr target   -> the input itself is echoed and spoken, so the
      app still works as a plain TTS box for those languages.
    * Miss, any other lang -> the NO_TRANSLATION sentinel with no audio; the
      user can then click "Generate reply" to involve the LLM.

    TTS errors are not surfaced here: on failure the audio is simply None.
    """
    cleaned = (user_text or "").strip()
    if not cleaned:
        return "", None, None, None

    match = phrasebook_lookup(cleaned, output_lang)
    if not match:
        if output_lang not in ("en", "fr"):
            return NO_TRANSLATION, None, None, None
        # No curated entry, but en/fr input can still be read aloud verbatim.
        spoken, elapsed_ms, _ = _synthesize(cleaned, output_lang)
        return cleaned, spoken, None, elapsed_ms

    logger.info(
        "Phrasebook hit (%s, score=%.2f): %r → %r [cat=%s]",
        match["match"], match["score"], cleaned, match["target"], match["category"],
    )
    translated = match["target"] or ""
    spoken, elapsed_ms, _ = _synthesize(translated, output_lang)
    return translated, spoken, match, elapsed_ms


def _generate_reply(user_text: str, output_lang: str
                    ) -> Tuple[str, Optional[Tuple[int, np.ndarray]], Optional[int], Optional[int], Optional[str]]:
    """Dialect-anchored LLM reply (with RAG top-3 few-shot) + TTS.

    Returns ``(reply_text, reply_audio, llm_ms, tts_ms, error)``.

    ``reply_text`` is always a usable string so the UI never goes blank:
      * blank input      -> "(nothing to reply to)", no LLM call
      * LLM raised       -> "(LLM error: ...)", error "llm: ..."
      * LLM returned ""  -> "(empty reply)", error "llm: empty reply"
      * otherwise        -> the stripped LLM reply

    Parenthetical placeholders are status messages for the screen; they are
    never sent to TTS, so ``reply_audio`` is None in all placeholder cases.
    ``error`` otherwise carries the TTS error string, if any.
    """
    import time
    text = (user_text or "").strip()
    if not text:
        return "(nothing to reply to)", None, None, None, None

    # Few-shot examples are an optional enhancement: if retrieval breaks, we
    # still want a reply, just without the extra examples.
    try:
        extras = phrasebook_top_k(text, output_lang, k=3) or None
    except Exception:
        logger.exception("Phrasebook retrieval failed — continuing without examples")
        extras = None
    if extras:
        logger.info(
            "RAG-injecting top-%d nearest phrasebook entries (top score=%.2f)",
            len(extras), extras[0]["score"],
        )

    t_llm = time.perf_counter()
    try:
        reply = get_llm().chat(
            text, target_lang=output_lang, extra_examples=extras,
        )
    except Exception as exc:  # pragma: no cover
        logger.exception("LLM call failed")
        llm_ms = int((time.perf_counter() - t_llm) * 1000)
        return f"(LLM error: {exc})", None, llm_ms, None, f"llm: {exc}"
    llm_ms = int((time.perf_counter() - t_llm) * 1000)

    reply = (reply or "").strip()
    if not reply:
        # Synthesizing the placeholder would make the TTS voice read
        # "(empty reply)" aloud in the target language; surface it as an
        # error for the turn log instead.
        logger.warning("LLM returned an empty reply")
        return "(empty reply)", None, llm_ms, None, "llm: empty reply"

    audio, tts_ms, tts_error = _synthesize(reply, output_lang)
    return reply, audio, llm_ms, tts_ms, tts_error


# ── Tab handlers ─────────────────────────────────────────────────────────────
def run_text_translate(
    text: str,
    output_lang: str,
) -> Tuple[str, Optional[Tuple[int, np.ndarray]], str]:
    """Handle Text tab → Send: phrasebook-only translation, no LLM involved.

    Returns ``(translation_text, translation_audio, transcript_state)``.
    ``transcript_state`` is the stripped input, stored so the Generate-reply
    button can reuse it without reading the textbox again. Blank input
    short-circuits with a hint message, no audio, an empty state, and no
    turn-log entry.
    """
    import time
    started = time.perf_counter()

    message = (text or "").strip()
    if not message:
        return "(no text entered)", None, ""

    translation, spoken, hit, tts_ms = _translate_only(message, output_lang)
    elapsed_ms = int((time.perf_counter() - started) * 1000)
    _turn_logger.log(
        phase="translate", tab="text",
        input_lang=None, output_lang=output_lang,
        user_text=message, transcript=None, transcribe_ms=None,
        phrasebook=hit, llm_model=None, llm_ms=None,
        reply_text=translation, tts_ms=tts_ms,
        total_ms=elapsed_ms,
        error=None,
    )
    return translation, spoken, message


def run_text_reply(
    transcript_state: str,
    output_lang: str,
) -> Tuple[str, Optional[Tuple[int, np.ndarray]]]:
    """Handle Text tab → Generate reply: dialect-anchored LLM reply plus TTS.

    Works from the text stored by the last Send. Returns
    ``(reply_text, reply_audio)``; with nothing stored it returns a hint
    message, no audio, and writes no turn-log entry.
    """
    import time
    started = time.perf_counter()

    has_input = bool((transcript_state or "").strip())
    if not has_input:
        return "(send a message first)", None

    outcome = _generate_reply(transcript_state, output_lang)
    reply_text, reply_audio, llm_ms, tts_ms, failure = outcome
    elapsed_ms = int((time.perf_counter() - started) * 1000)
    _turn_logger.log(
        phase="reply", tab="text",
        input_lang=None, output_lang=output_lang,
        user_text=transcript_state, transcript=None, transcribe_ms=None,
        phrasebook=None, llm_model=LLM_MODEL_ID, llm_ms=llm_ms,
        reply_text=reply_text, tts_ms=tts_ms,
        total_ms=elapsed_ms,
        error=failure,
    )
    return reply_text, reply_audio


def run_voice_translate(
    audio: Optional[Tuple[int, np.ndarray]],
    input_lang: str,
    output_lang: str,
) -> Tuple[str, str, Optional[Tuple[int, np.ndarray]], str]:
    """Handle Voice tab → Submit: Whisper transcription, then phrasebook lookup.

    Returns ``(transcript, translation_text, translation_audio,
    transcript_state)``. Every non-success path returns an empty transcript
    and empty state together with a parenthetical status message:

    * no audio / zero-length audio -> returned immediately, nothing logged
    * transcription raised         -> logged with error "stt: ..."
    * empty transcript             -> logged with error "no_speech"
    """
    import time
    started = time.perf_counter()

    def _log_turn(*, user_text=None, transcript=None, transcribe_ms=None,
                  phrasebook=None, reply_text=None, tts_ms=None, error=None):
        # One place for the fields every voice/translate log entry shares;
        # total_ms is measured at the moment the entry is written.
        _turn_logger.log(
            phase="translate", tab="voice",
            input_lang=input_lang, output_lang=output_lang,
            user_text=user_text, transcript=transcript,
            transcribe_ms=transcribe_ms,
            phrasebook=phrasebook, llm_model=None, llm_ms=None,
            reply_text=reply_text, tts_ms=tts_ms,
            total_ms=int((time.perf_counter() - started) * 1000),
            error=error,
        )

    if audio is None:
        return "", "(no audio received)", None, ""
    sample_rate, samples = audio
    if samples.size == 0:
        return "", "(empty audio)", None, ""

    stt_started = time.perf_counter()
    try:
        transcript = transcribe(samples, sample_rate, input_lang)
    except Exception as exc:  # pragma: no cover
        logger.exception("Transcription failed")
        _log_turn(error=f"stt: {exc}")
        return "", f"(STT error: {exc})", None, ""
    stt_ms = int((time.perf_counter() - stt_started) * 1000)

    if not transcript:
        _log_turn(transcript="", transcribe_ms=stt_ms, error="no_speech")
        return "", "(no speech detected)", None, ""

    translation, spoken, hit, tts_ms = _translate_only(transcript, output_lang)
    _log_turn(
        user_text=transcript, transcript=transcript, transcribe_ms=stt_ms,
        phrasebook=hit, reply_text=translation, tts_ms=tts_ms,
    )
    return transcript, translation, spoken, transcript


def run_voice_reply(
    transcript_state: str,
    output_lang: str,
) -> Tuple[str, Optional[Tuple[int, np.ndarray]]]:
    """Handle Voice tab → Generate reply using the stored transcript.

    Whisper is not re-run; the transcript saved by the last Submit is fed to
    the LLM. Returns ``(reply_text, reply_audio)``; with nothing stored it
    returns a hint message, no audio, and writes no turn-log entry.
    """
    import time
    started = time.perf_counter()

    has_transcript = bool((transcript_state or "").strip())
    if not has_transcript:
        return "(record audio and submit first)", None

    outcome = _generate_reply(transcript_state, output_lang)
    reply_text, reply_audio, llm_ms, tts_ms, failure = outcome
    elapsed_ms = int((time.perf_counter() - started) * 1000)
    _turn_logger.log(
        phase="reply", tab="voice",
        input_lang=None, output_lang=output_lang,
        user_text=transcript_state, transcript=transcript_state,
        transcribe_ms=None,
        phrasebook=None, llm_model=LLM_MODEL_ID, llm_ms=llm_ms,
        reply_text=reply_text, tts_ms=tts_ms,
        total_ms=elapsed_ms,
        error=failure,
    )
    return reply_text, reply_audio


# ── Gradio UI ────────────────────────────────────────────────────────────────
def build_ui():
    """Assemble the Gradio Blocks app and return it without launching.

    Layout: an intro banner, a row with the shared input/output language
    dropdowns, a Voice tab and a Text tab, and a closing notes section. Both
    tabs have the same right-hand results column (phrasebook translation +
    audio, a Generate-reply button, LLM reply + audio) and share a single
    ``gr.State`` that carries the last submitted text or transcript to the
    Generate-reply handlers.
    """
    import gradio as gr  # lazy — keeps module importable without gradio installed

    intro_md = (
        "# 🌾 Sahel-Voice — Minimal Baseline\n"
        f"Zero-shot Whisper → {LLM_MODEL_ID} → MMS-TTS, with a curated "
        "Bambara/Pular phrasebook short-circuit in front of the LLM. "
        "No adapters, no memory, no polish. This is the field-test "
        "baseline — see `docs/baseline_rebuild.md`."
    )
    notes_md = (
        "---\n"
        "**What's intentionally missing:** LoRA adapters, memory/vocabulary "
        "persistence, speaker ID, Waxal/F5 TTS, IoT sensor integration, "
        "phrase-matcher shortcuts. All of those live in `app.py` — this is the "
        "stripped-down baseline used to measure what Whisper zero-shot does on "
        "real Bambara/Fula recordings and to collect a real-user eval set.\n\n"
        "The **Text** tab skips Whisper — it's for fast iteration on the "
        "LLM + TTS path, not for field-test measurement.\n\n"
        "**How the two boxes differ:** the top pair is a phrasebook lookup "
        "(no LLM, instant, gold-curated translation). If your input isn't "
        "in the curated list you'll see *(no curated translation)* — click "
        "**Generate reply** to get a dialect-anchored LLM response in the "
        "bottom pair."
    )

    def _results_column():
        """Build the right-hand column both tabs share; return its widgets."""
        with gr.Column():
            translation_box = gr.Textbox(
                label="Phrasebook translation",
                lines=3, interactive=False,
            )
            translation_player = gr.Audio(
                label="Translation audio",
                type="numpy", autoplay=False,
            )
            reply_button = gr.Button(
                "Generate reply (LLM)", variant="secondary"
            )
            reply_box = gr.Textbox(
                label="LLM reply", lines=4, interactive=False,
            )
            reply_player = gr.Audio(
                label="Reply audio", type="numpy", autoplay=False,
            )
        return (translation_box, translation_player,
                reply_button, reply_box, reply_player)

    with gr.Blocks(title="Sahel-Voice — Minimal Baseline") as demo:
        gr.Markdown(intro_md)

        # Two separate dropdowns so input and output language never get
        # conflated. The Voice tab reads both; the Text tab reads only the
        # output language because it never touches Whisper.
        with gr.Row():
            input_lang = gr.Dropdown(
                choices=LANG_CHOICES, value="bam", label="Input language",
                info="Language you're speaking/typing. Drives Whisper hint "
                     "(fr/en only) and bam_normalize (bam only).",
            )
            output_lang = gr.Dropdown(
                choices=LANG_CHOICES, value="bam", label="Output language",
                info="Language the LLM should reply in. Also picks the TTS voice.",
            )

        # Holds the last submitted input (typed text or Whisper transcript)
        # for the Generate-reply buttons, so they neither re-transcribe nor
        # re-read the textbox.
        transcript_state = gr.State("")

        with gr.Tabs():
            # ── Voice tab: the path the field test actually measures ────────
            with gr.Tab("🎤 Voice (full STT → translation + optional reply)"):
                with gr.Row():
                    with gr.Column():
                        recorder = gr.Audio(
                            sources=["microphone", "upload"],
                            type="numpy",
                            label="Speak (or upload a .wav)",
                        )
                        transcribe_button = gr.Button(
                            "Transcribe + translate", variant="primary"
                        )
                        transcript_box = gr.Textbox(
                            label="Transcript (zero-shot Whisper)",
                            lines=2, interactive=False,
                        )
                    (voice_translation_box, voice_translation_player,
                     voice_reply_button, voice_reply_box,
                     voice_reply_player) = _results_column()

                transcribe_button.click(
                    fn=run_voice_translate,
                    inputs=[recorder, input_lang, output_lang],
                    outputs=[
                        transcript_box,
                        voice_translation_box,
                        voice_translation_player,
                        transcript_state,
                    ],
                )
                voice_reply_button.click(
                    fn=run_voice_reply,
                    inputs=[transcript_state, output_lang],
                    outputs=[voice_reply_box, voice_reply_player],
                )

            # ── Text tab: developer loop that bypasses Whisper ──────────────
            with gr.Tab("⌨️ Text (translation + optional reply, dev loop)"):
                with gr.Row():
                    with gr.Column():
                        message_box = gr.Textbox(
                            label="Type your message",
                            lines=3,
                            placeholder="e.g. Good morning, how are you?",
                        )
                        send_button = gr.Button("Send", variant="primary")
                    (text_translation_box, text_translation_player,
                     text_reply_button, text_reply_box,
                     text_reply_player) = _results_column()

                # The Send button and pressing Enter in the textbox run the
                # same handler. Only output_lang is wired in; input_lang has
                # no effect on this tab.
                for trigger in (send_button.click, message_box.submit):
                    trigger(
                        fn=run_text_translate,
                        inputs=[message_box, output_lang],
                        outputs=[
                            text_translation_box,
                            text_translation_player,
                            transcript_state,
                        ],
                    )
                text_reply_button.click(
                    fn=run_text_reply,
                    inputs=[transcript_state, output_lang],
                    outputs=[text_reply_box, text_reply_player],
                )

        gr.Markdown(notes_md)

    return demo


def main() -> None:
    """Build the UI and serve it with Gradio's request queue enabled.

    A missing HF_TOKEN does not stop startup; it only produces a warning,
    since the LLM call is the part that needs it.
    """
    token_missing = not HF_TOKEN
    if token_missing:
        logger.warning(
            "HF_TOKEN is not set — the LLM call will fail. "
            "Export HF_TOKEN before launching for the pipeline to work end-to-end."
        )

    app = build_ui()
    queued_app = app.queue()
    queued_app.launch()


# Script entry point: `python app_minimal.py`.
if __name__ == "__main__":
    main()