"""
LumynaX Translate NLLB-200 3.3B — LumynaX quickstart.

This script fetches the upstream model from Hugging Face and runs a short
LumynaX-flavoured prompt. Run it on a host that satisfies the resource budget
documented in the README (LumynaX Translate NLLB-200 3.3B).

Usage:
    python quickstart.py                # one-shot demo prompt
    python quickstart.py --prompt TEXT  # one-shot run with your own text
    python quickstart.py --interactive  # REPL
    python quickstart.py --gguf         # use the GGUF mirror via llama-cpp

LumynaX package repo: https://huggingface.co/AbteeXAILab/lumynax-translate-nllb-200-3b
Upstream weights:     https://huggingface.co/facebook/nllb-200-3.3B
"""
from __future__ import annotations
import argparse, os, sys

# System message sent as the "system" turn by the chat-style backends.
# Built from one sentence per entry; the sentences are joined with single spaces.
LUMYNAX_SYSTEM = " ".join(
    [
        "You are LumynaX, the AbteeX AI Labs assistant from Aotearoa New Zealand.",
        "Ko te marama te tuapapa - the light is the foundation.",
        "Answer with care, cite uncertainty, and prefer local-first reasoning.",
        "Refuse unsafe, unlawful, or sovereignty-violating requests.",
    ]
)

# Default text used when the CLI is run without --prompt.
DEMO_PROMPT = (
    "Explain in 3 bullets why local-first AI matters for Aotearoa New Zealand."
)

def _run_hf(
    prompt: str,
    interactive: bool,
    *,
    src_lang: str = "eng_Latn",
    tgt_lang: str = "mri_Latn",
) -> None:
    """Translate text with the upstream NLLB-200 3.3B checkpoint via transformers.

    NLLB-200 is an encoder-decoder translation model, not a chat model: it has
    no chat template and no notion of a system prompt, so the input text is
    passed to the encoder as-is and the decoder is forced to start with the
    target-language token.

    Args:
        prompt: Text to translate in one-shot mode (ignored when ``interactive``).
        interactive: When true, read lines from stdin and translate each one
            until an empty line or EOF.
        src_lang: FLORES-200 code of the input language (default English).
        tgt_lang: FLORES-200 code of the output language (default Maori).

    Raises:
        ValueError: If ``src_lang`` or ``tgt_lang`` is not a language token
            known to the tokenizer.
    """
    from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

    model_id = "facebook/nllb-200-3.3B"
    print(
        f"[lumynax] Loading {model_id} (3.3B encoder-decoder translation model, "
        f"{src_lang} -> {tgt_lang})."
    )
    # The architecture is supported natively by transformers, so there is no
    # need to opt in to executing remote code from the Hub.
    tok = AutoTokenizer.from_pretrained(model_id, src_lang=src_lang)
    model = AutoModelForSeq2SeqLM.from_pretrained(
        model_id, device_map="auto", torch_dtype="auto"
    )

    # Unknown language codes silently map to <unk>; fail loudly instead of
    # generating text in an arbitrary language.
    for code in (src_lang, tgt_lang):
        if tok.convert_tokens_to_ids(code) == tok.unk_token_id:
            raise ValueError(f"Unknown NLLB language code: {code!r}")
    tgt_token_id = tok.convert_tokens_to_ids(tgt_lang)

    def translate(text: str) -> str:
        inputs = tok(text, return_tensors="pt").to(model.device)
        out = model.generate(
            **inputs,
            forced_bos_token_id=tgt_token_id,  # selects the output language
            max_new_tokens=512,
        )
        # Seq2seq output holds only decoder tokens, so nothing is sliced off.
        return tok.decode(out[0], skip_special_tokens=True)

    if not interactive:
        print(translate(prompt))
        return

    print("[lumynax] interactive mode — empty line exits.")
    while True:
        try:
            line = input("you> ").strip()
        except EOFError:
            break
        if not line:
            break
        print("lumynax> " + translate(line))


def _run_gguf(prompt: str, interactive: bool):
    from llama_cpp import Llama
    mirror = ""
    if not mirror:
        print("[lumynax] No community GGUF mirror registered for this build."); sys.exit(2)
    print(f"[lumynax] Loading GGUF from {mirror}...")
    llm = Llama.from_pretrained(
        repo_id=mirror, filename="*Q4_K_M*.gguf",
        n_ctx=1024,
        n_gpu_layers=int(os.environ.get("N_GPU_LAYERS", "-1")), verbose=False,
    )
    def chat(user):
        out = llm.create_chat_completion(messages=[
            {"role": "system", "content": LUMYNAX_SYSTEM},
            {"role": "user",   "content": user},
        ], max_tokens=512, temperature=0.4)
        return out["choices"][0]["message"]["content"]
    if interactive:
        while True:
            try: q = input("you> ").strip()
            except EOFError: break
            if not q: break
            print("lumynax> " + chat(q))
    else:
        print(chat(prompt))


def main():
    """Parse the command-line flags and hand off to the selected backend.

    ``--gguf`` selects the llama-cpp runner; otherwise the transformers runner
    is used. ``--prompt`` and ``--interactive`` are passed through unchanged.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--interactive", action="store_true")
    parser.add_argument("--prompt", default=DEMO_PROMPT)
    parser.add_argument("--gguf", action="store_true")
    opts = parser.parse_args()

    backend = _run_gguf if opts.gguf else _run_hf
    backend(opts.prompt, opts.interactive)


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()