"""
🖤 Gemma 4 E4B — Agentic Tool Calling Demo
Built by RavenX AI | github.com/DeadByDawn101

What makes this different from every other HF demo:
- Live tool calling loop (web search, calculator, datetime, code exec)
- <think> tag reasoning shown in real time
- Full agentic loop with tool dispatch and result injection
- Runs on the fused Opus Reasoning + Claude Code model
"""

import gradio as gr
import json
import re
import datetime
import math
import os

# ── Tool definitions (OpenAI-compatible schema) ──────────────────────────────

def _single_param_tool(name, description, param, param_description):
    """Build one OpenAI-style function-tool schema taking a single required string argument."""
    return {
        "type": "function",
        "function": {
            "name": name,
            "description": description,
            "parameters": {
                "type": "object",
                "properties": {
                    param: {"type": "string", "description": param_description},
                },
                "required": [param],
            },
        },
    }


# Tool schemas handed to the chat template; each tool takes exactly one
# required string parameter, so they all share the builder above.
TOOLS = [
    _single_param_tool(
        "calculator",
        "Evaluate a mathematical expression. Use for any numeric calculation.",
        "expression",
        "Math expression to evaluate, e.g. '2 ** 32' or 'math.sqrt(144)'",
    ),
    _single_param_tool(
        "get_datetime",
        "Get the current date and time in a given timezone.",
        "timezone",
        "IANA timezone string, e.g. 'America/Los_Angeles'",
    ),
    _single_param_tool(
        "web_search",
        "Search the web for current information. Use when you need live data, news, prices, or recent events.",
        "query",
        "Search query string",
    ),
    _single_param_tool(
        "run_python",
        "Execute a safe Python snippet and return the output. Use for data analysis, transformations, or complex calculations.",
        "code",
        "Python code to execute (no imports needed for math, json, datetime, re)",
    ),
]

# ── Tool executor ─────────────────────────────────────────────────────────────

def execute_tool(name: str, args: dict) -> str:
    """Run one demo tool and return its result as display text.

    Args:
        name: Tool identifier; one of ``calculator``, ``get_datetime``,
            ``web_search`` or ``run_python``.
        args: Parsed tool arguments. ``None`` is treated as an empty dict.

    Returns:
        The tool output, ``"Tool error: <message>"`` if the tool raised, or
        ``"Unknown tool"`` when ``name`` matches none of the tools.
    """
    args = args or {}
    try:
        if name == "calculator":
            from types import SimpleNamespace

            expr = args.get("expression", "")
            math_names = {k: getattr(math, k) for k in dir(math) if not k.startswith("_")}
            allowed = dict(math_names)
            allowed.update({"abs": abs, "round": round, "min": min, "max": max})
            # The tool schema advertises 'math.sqrt(144)', so the "math." prefix
            # must resolve too. A namespace of the public functions is exposed
            # instead of the module object itself.
            allowed["math"] = SimpleNamespace(**math_names)
            # SECURITY: eval() on model-generated text. Emptying __builtins__ is
            # NOT a real sandbox (attribute walking can escape it, and huge
            # exponents can hang the process). Acceptable for a demo only.
            result = eval(expr, {"__builtins__": {}}, allowed)
            return f"Result: {result}"

        elif name == "get_datetime":
            tz_name = args.get("timezone", "UTC")
            try:
                import zoneinfo
                now = datetime.datetime.now(zoneinfo.ZoneInfo(tz_name))
            except Exception:
                # Unknown/invalid zone or missing tz database: fall back to UTC.
                # Use an aware datetime so %Z renders "UTC"; a naive utcnow()
                # renders %Z as an empty string and is deprecated.
                now = datetime.datetime.now(datetime.timezone.utc)
                tz_name = "UTC"
            return f"Current time in {tz_name}: {now.strftime('%A, %B %d %Y — %I:%M %p %Z')}"

        elif name == "web_search":
            query = args.get("query", "")
            # Simulated search — in production wire to SerpAPI / Brave / Gemini grounding
            return (
                f"[Web Search: '{query}']\n"
                f"Note: Live search not wired in this demo. "
                f"In production, connect SerpAPI or Gemini grounding. "
                f"Query was: {query}"
            )

        elif name == "run_python":
            code = args.get("code", "")
            import io, contextlib
            # SECURITY: exec() on model-generated code. The trimmed builtins
            # reduce the surface but do not make this safe for untrusted input.
            allowed_globals = {
                "math": math, "json": json, "re": re,
                "datetime": datetime, "print": print,
                "__builtins__": {"range": range, "len": len, "str": str,
                                  "int": int, "float": float, "list": list,
                                  "dict": dict, "sum": sum, "zip": zip,
                                  "enumerate": enumerate, "sorted": sorted,
                                  "isinstance": isinstance, "round": round}
            }
            # print() writes to sys.stdout at call time, so redirecting stdout
            # captures everything the snippet prints.
            buf = io.StringIO()
            with contextlib.redirect_stdout(buf):
                exec(code, allowed_globals)
            output = buf.getvalue()
            return output if output else "(no output — use print() to see results)"

    except Exception as e:
        # Tool failures are reported back to the model as text rather than
        # crashing the chat handler.
        return f"Tool error: {e}"

    return "Unknown tool"


# ── Model inference (runs on CPU/GPU via transformers or mlx_lm) ──────────────

def _make_mlx_infer(model_id):
    """Load ``model_id`` through mlx_lm and return an ``infer(messages, tools)`` callable."""
    from mlx_lm import load, generate as mlx_generate

    model, tokenizer = load(model_id)
    print(f"✅ Loaded via mlx_lm: {model_id}")

    def infer(messages, tools=None):
        # Render the conversation (and tool schemas) to a prompt string, then generate.
        rendered = tokenizer.apply_chat_template(
            messages, tools=tools, add_generation_prompt=True, tokenize=False
        )
        return mlx_generate(model, tokenizer, prompt=rendered, max_tokens=1024, verbose=False)

    return infer


def _make_transformers_infer(model_id):
    """Load ``model_id`` through transformers and return an ``infer(messages, tools)`` callable."""
    from transformers import AutoTokenizer, AutoModelForCausalLM
    import torch

    print("Loading via transformers (CPU)...")
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(
        model_id, torch_dtype=torch.bfloat16, device_map="auto"
    )
    print("✅ Loaded via transformers")

    def infer(messages, tools=None):
        rendered = tokenizer.apply_chat_template(
            messages, tools=tools, add_generation_prompt=True, tokenize=False
        )
        encoded = tokenizer(rendered, return_tensors="pt").to(model.device)
        prompt_len = encoded.input_ids.shape[1]
        generated = model.generate(**encoded, max_new_tokens=1024, do_sample=False)
        # Decode only the newly generated tokens; special tokens are kept in
        # the decoded text (skip_special_tokens=False).
        return tokenizer.decode(generated[0][prompt_len:], skip_special_tokens=False)

    return infer


def build_inference_fn():
    """Return an inference callable for the demo model, or ``None`` if no backend loads.

    mlx_lm is attempted first, then transformers. Any exception raised while
    importing or loading a backend is printed and treated as "backend
    unavailable"; when both fail the function returns ``None``.
    """
    model_id = "deadbydawn101/gemma-4-E4B-Agentic-Opus-Reasoning-GeminiCLI-mlx-4bit"

    # Try mlx_lm first (Apple Silicon spaces)
    try:
        return _make_mlx_infer(model_id)
    except Exception as err:
        print(f"mlx_lm failed: {err}")

    # Try transformers (CPU/GPU)
    try:
        return _make_transformers_infer(model_id)
    except Exception as err:
        print(f"transformers failed: {err}")
        return None


# Cached result of build_inference_fn(); None means "no backend available".
_infer_fn = None
# True once a load has been attempted, so a failed load (None) is cached too.
_infer_attempted = False

def get_infer():
    """Return the cached inference callable, loading it on first use.

    The load is attempted at most once per process. Previously a failed load
    left the cache at ``None``, so every chat message re-ran both backend
    load attempts before falling back to simulation mode.

    Returns:
        The callable produced by ``build_inference_fn()``, or ``None`` when
        no backend could be loaded.
    """
    global _infer_fn, _infer_attempted
    if not _infer_attempted:
        _infer_fn = build_inference_fn()
        # Set only after the call returns, so an interrupted load is retried.
        _infer_attempted = True
    return _infer_fn


# ── Agentic loop ──────────────────────────────────────────────────────────────

# One tool call: <|tool_call>call:NAME{ARGS}<tool_call|>. ARGS is matched lazily
# up to the "}" that directly precedes the closing tag, so braces inside ARGS survive.
_TOOL_CALL_RE = re.compile(r"<\|tool_call>call:(\w+)\{(.*?)\}<tool_call\|>", re.DOTALL)

# One key:value pair for the non-JSON fallback. The value is, in priority order:
#   1. a <|"|>...<|"|> delimited string,
#   2. a double-quoted string (backslash escapes allowed),
#   3. a bare token running up to the next quote, comma or closing brace.
_TOOL_ARG_RE = re.compile(
    r'(\w+)\s*:\s*(?:'
    r'<\|"\|>(.*?)<\|"\|>'
    r'|"((?:[^"\\]|\\.)*)"'
    r'|"?([^",}]+)"?'
    r')',
    re.DOTALL,
)


def parse_tool_call(text: str):
    """Extract tool name and args from <|tool_call>call:name{args}<tool_call|>.

    The argument body is first parsed as JSON. If that fails, ``key:value``
    pairs are extracted with a tolerant regex that keeps quoted values intact
    (commas and braces inside quotes no longer truncate the value) and also
    accepts ``<|"|>``-delimited strings.
    NOTE(review): ``<|"|>`` is believed to be the model's string delimiter —
    confirm against the chat template.

    Args:
        text: Raw model output, possibly containing one tool call.

    Returns:
        ``(name, args)`` for the first tool call found, where ``args`` is a
        dict (fallback-parsed values are stripped strings), or
        ``(None, None)`` when the text contains no tool call.
    """
    match = _TOOL_CALL_RE.search(text)
    if match is None:
        return None, None

    name, body = match.group(1), match.group(2)
    try:
        # Gemma 4 uses a custom arg format, try JSON-like parse
        args = json.loads("{" + body + "}")
    except ValueError:
        # Fallback: extract key:value pairs
        args = {}
        for kv in _TOOL_ARG_RE.finditer(body):
            key, delimited, quoted, bare = kv.groups()
            if delimited is not None:
                value = delimited
            elif quoted is not None:
                try:
                    # Decode escapes such as \n or \"; strict=False tolerates
                    # raw control characters (e.g. newlines in code).
                    value = json.loads('"' + quoted + '"', strict=False)
                except ValueError:
                    value = quoted
            else:
                value = bare
            args[key] = value.strip()
    return name, args


# Fixed instruction prepended to every conversation sent to the model.
_SYSTEM_PROMPT = (
    "You are an advanced AI assistant with tool-calling capabilities. "
    "Think through problems carefully using <think> tags before responding. "
    "Use tools when you need real data, calculations, or code execution. "
    "Be concise, accurate, and action-oriented."
)

# Simulation-mode heuristics (used only when no model backend could be loaded).
_SIM_CALC_HINTS = ("calculat", "math", "compute", "sqrt", "power", "2^", "**")
_SIM_CODE_HINTS = ("python", "code", "fibonacci", "list", "sort", "function", "def ", "script")
# Time keywords must start a word, so "know" or "update" do not trigger the clock tool.
_SIM_TIME_RE = re.compile(r"\b(?:time|date|today|now|timezone|tokyo|london|utc|pst)")
# Checked in order; the last matching entry wins.
_SIM_TIMEZONES = (
    (re.compile(r"\b(?:tokyo|japan)\b"), "Asia/Tokyo"),
    (re.compile(r"\b(?:london|uk|gmt)\b"), "Europe/London"),
    (re.compile(r"\butc\b"), "UTC"),
)
_SIM_POWER_RE = re.compile(
    r"(\d+(?:\.\d+)?)\s+to\s+the\s+power\s+of\s+(\d+(?:\.\d+)?)", re.IGNORECASE
)
_SIM_EXPR_RE = re.compile(r"[\d\.\+\-\*\/\^\(\)\s]+")
_SIM_DEFAULT_EXPR = "2 ** 32"


def _build_messages(user_message, history):
    """Convert (user, assistant) history pairs plus the new message into chat-template messages."""
    messages = [{"role": "system", "content": _SYSTEM_PROMPT}]
    for h in history:
        if h[0]:
            messages.append({"role": "user", "content": h[0]})
        if h[1]:
            messages.append({"role": "assistant", "content": h[1]})
    messages.append({"role": "user", "content": user_message})
    return messages


def _extract_sim_expression(user_message):
    """Pick an arithmetic expression out of free text for the simulated calculator."""
    power = _SIM_POWER_RE.search(user_message)
    if power:
        return f"{power.group(1)} ** {power.group(2)}"
    # The character class also matches plain whitespace, so only runs that
    # contain a digit are candidates; the longest one is taken as the expression.
    candidates = [m.group(0).strip() for m in _SIM_EXPR_RE.finditer(user_message)]
    candidates = [c for c in candidates if any(ch.isdigit() for ch in c)]
    if not candidates:
        return _SIM_DEFAULT_EXPR
    return max(candidates, key=len).replace("^", "**")


def _simulate_calculator(user_message):
    """Return (tool, think, answer) for a simulated ``calculator`` call."""
    expr = _extract_sim_expression(user_message)
    try:
        # expr holds only digits, operators, dots, parentheses and whitespace,
        # so no names can be referenced. NOTE: a huge exponent can still be slow.
        # str() stays inside the try: very large ints can fail to format.
        value = str(eval(expr, {"__builtins__": {}}, {}))
    except Exception:
        # Unparseable fragment: show the default example so the displayed
        # expression and result always agree.
        expr, value = _SIM_DEFAULT_EXPR, "4294967296"
    sim_result = f"Result: {value}"
    sim_think = (
        "The user wants me to calculate something.\n"
        "I should use the `calculator` tool rather than doing arithmetic in my head.\n"
        f"Expression: `{expr}`\nLet me call the tool."
    )
    sim_answer = (
        f"I called `calculator(expression=\"{expr}\")` → **{sim_result}**\n\n"
        f"The answer is `{value}`.\n\n"
        "I used the tool rather than attempting mental arithmetic — that's the right agentic pattern."
    )
    return "calculator", sim_think, sim_answer


def _simulate_datetime(lowered):
    """Return (tool, think, answer) for a simulated ``get_datetime`` call."""
    tz = "America/Los_Angeles"
    for pattern, zone in _SIM_TIMEZONES:
        if pattern.search(lowered):
            tz = zone
    try:
        import zoneinfo
        now = datetime.datetime.now(zoneinfo.ZoneInfo(tz))
        time_str = now.strftime("%A, %B %d %Y — %I:%M %p %Z")
    except Exception:
        # Time-zone data unavailable: report UTC, using an aware datetime
        # instead of the deprecated naive utcnow().
        now = datetime.datetime.now(datetime.timezone.utc)
        time_str = now.strftime("%A, %B %d %Y — %I:%M %p UTC")
    sim_think = (
        "The user wants to know the current time.\n"
        f"I should call `get_datetime` with timezone=\"{tz}\" rather than guessing.\n"
        "This gives a precise, real answer."
    )
    sim_result = f"Current time in {tz}: {time_str}"
    sim_answer = f"I called `get_datetime(timezone=\"{tz}\")` →\n\n**{sim_result}**"
    return "get_datetime", sim_think, sim_answer


def _simulate_python(lowered):
    """Return (tool, think, answer) for a simulated ``run_python`` call (canned snippets)."""
    sim_think = (
        "The user wants me to execute some Python code.\n"
        "Rather than just describing it, I should use `run_python` to actually run it and show the real output."
    )
    if "fibonacci" in lowered:
        code = "a,b=0,1\nresult=[]\nfor _ in range(10):\n    result.append(a)\n    a,b=b,a+b\nprint(result)"
        output = "[0, 1, 1, 2, 3, 5, 8, 13, 21, 34]"
    else:
        code = "print([x**2 for x in range(1,11)])"
        output = "[1, 4, 9, 16, 25, 36, 49, 64, 81, 100]"
    sim_answer = (
        f"I called `run_python` with:\n```python\n{code}\n```\n\nOutput: `{output}`\n\n"
        "Code executed and returned real results — no hallucination."
    )
    return "run_python", sim_think, sim_answer


def _simulate_turn(user_message, enable_tools):
    """Choose and render the simulated tool interaction for one user message.

    Returns a ``(tool_name_or_None, think_text, answer_text)`` tuple.
    """
    if not enable_tools:
        # Respect the "Enable Tool Calling" checkbox in simulation mode too.
        return (
            None,
            "Tool calling is disabled for this conversation, so I will answer directly without dispatching a tool.",
            "Tool calling is switched off, so no tool was simulated for this message.\n\n"
            "Tick **Enable Tool Calling** to see the simulated tool loop.",
        )

    lowered = user_message.lower()

    # Pick the most relevant tool to simulate
    if any(x in lowered for x in _SIM_CALC_HINTS):
        return _simulate_calculator(user_message)
    if _SIM_TIME_RE.search(lowered):
        return _simulate_datetime(lowered)
    if any(x in lowered for x in _SIM_CODE_HINTS):
        return _simulate_python(lowered)

    # General knowledge — no tool needed, just reasoning
    sim_think = (
        f"The user is asking: \"{user_message}\"\n"
        "Let me check: do I need a tool here?\n"
        "- `web_search`: useful if I need live/current data\n"
        "- `calculator`: useful for math\n"
        "- `run_python`: useful for code execution\n"
        "- `get_datetime`: useful for time queries\n\n"
        "This appears to be a knowledge question I can answer from training. No tool needed."
    )
    sim_answer = (
        "I reasoned through your question using baked-in Opus-style chain-of-thought.\n\n"
        "No tool call was needed here — this is a knowledge question I can answer directly.\n\n"
        "**To see tool calling in action, try:**\n"
        "- *\"What is 2 to the power of 32?\"* → triggers `calculator`\n"
        "- *\"What time is it in Tokyo?\"* → triggers `get_datetime`\n"
        "- *\"Generate the first 10 Fibonacci numbers\"* → triggers `run_python`"
    )
    return None, sim_think, sim_answer


def agentic_chat(user_message: str, history: list, enable_tools: bool, show_thinking: bool):
    """Full agentic loop with tool calling and <think> display.

    Args:
        user_message: Text typed by the user. Blank input is ignored.
        history: List of ``(user, assistant)`` pairs; appended to in place.
        enable_tools: Whether tool schemas are offered and tool calls executed.
        show_thinking: Whether the reasoning block is included in the reply.

    Returns:
        ``(history, history)`` — the same list twice, one for the chatbot
        component and one for the session state.

    When no model backend is available (``get_infer()`` returns ``None``) a
    heuristic simulation of the tool loop is rendered instead.
    """
    # Nothing to answer: leave the conversation untouched instead of sending
    # an empty turn to the model.
    if not user_message or not user_message.strip():
        return history, history

    infer = get_infer()

    # Simulated agentic loop — shows real UX when model isn't loaded on CPU tier
    if infer is None:
        sim_tool, sim_think, sim_answer = _simulate_turn(user_message, enable_tools)

        parts = []
        if show_thinking:
            parts.append(f"<details>\n<summary>💭 Reasoning (simulated Opus reasoning style)</summary>\n\n```\n{sim_think}\n```\n\n</details>\n")
        if sim_tool:
            parts.append(f"🔧 **Tool called:** `{sim_tool}` ✅")
        parts.append(sim_answer)
        parts.append("\n---\n*Running in simulation mode on HF CPU. The real model (10.5 GB bfloat16) runs locally on Apple Silicon via MLX. [Download & run locally](https://huggingface.co/deadbydawn101/gemma-4-E4B-Agentic-Opus-Reasoning-GeminiCLI-mlx-4bit)*")

        display = "\n\n".join(parts)
        history.append((user_message, display))
        return history, history

    # Format history for chat template
    messages = _build_messages(user_message, history)
    tools = TOOLS if enable_tools else None

    # Real agentic loop — max 4 tool turns
    MAX_TURNS = 4
    tool_log = []

    for _ in range(MAX_TURNS):
        raw = infer(messages, tools=tools)

        # Extract <think> block
        think_match = re.search(r"<think>(.*?)</think>", raw, re.DOTALL)
        think_text = think_match.group(1).strip() if think_match else ""
        clean = re.sub(r"<think>.*?</think>", "", raw, flags=re.DOTALL).strip()

        # Check for tool call
        tool_name, tool_args = parse_tool_call(clean)

        if tool_name and enable_tools:
            # Execute tool
            result = execute_tool(tool_name, tool_args)
            tool_log.append(f"🔧 `{tool_name}({json.dumps(tool_args)})` → `{result}`")

            # Inject assistant tool call + tool response back into messages
            messages.append({"role": "assistant", "content": clean})
            messages.append({
                "role": "tool",
                "tool_responses": [{"name": tool_name, "response": {"result": result}}]
            })
            continue  # loop for final answer

        # Final answer (no tool call, or tools disabled)
        parts = []
        if show_thinking and think_text:
            parts.append(f"<details>\n<summary>💭 Reasoning</summary>\n\n```\n{think_text}\n```\n\n</details>\n")
        if tool_log:
            parts.append("**Tools used:**\n" + "\n".join(tool_log) + "\n")
        # Strip any tool-call markup left in the text (e.g. when tools are disabled).
        final_text = re.sub(r"<\|tool_call>.*?<tool_call\|>", "", clean, flags=re.DOTALL).strip()
        parts.append(final_text)
        response = "\n".join(parts)
        history.append((user_message, response))
        return history, history

    # Fallback if max turns hit
    history.append((user_message, "Max tool turns reached. Try a simpler query."))
    return history, history


# ── Gradio UI ─────────────────────────────────────────────────────────────────

# Prompts offered under the chat box. gr.Examples expects one row per example,
# each row being a list with one value per input component.
_EXAMPLE_PROMPTS = (
    "What is 2 to the power of 32?",
    "What time is it in Tokyo right now?",
    "Write a Python snippet that generates the first 10 Fibonacci numbers and show me the output.",
    "Search for the latest Solana price.",
    "Explain how TurboQuant KV cache compression works and why it matters for long-context inference.",
    "What can you do with tool calling that a normal chat model can't?",
)
EXAMPLES = [[prompt] for prompt in _EXAMPLE_PROMPTS]

# Theme overrides as (selector, declarations) pairs, rendered one rule per line.
_CSS_RULES = (
    (".gradio-container", "background: #0a000f !important; font-family: 'JetBrains Mono', monospace;"),
    (".gr-button", "background: #2d0040 !important; border: 1px solid #6a0dad !important; color: #c084fc !important;"),
    (".gr-button:hover", "background: #4a0080 !important;"),
    ("h1, h2, h3", "color: #c084fc !important;"),
    (".gr-chatbot", "background: #0f0018 !important; border: 1px solid #2d0040 !important;"),
    ("footer", "display: none !important;"),
)
# Leading newline, then "selector { declarations }" per line, each newline-terminated.
CSS = "\n" + "".join(
    f"{selector} {{ {declarations} }}\n" for selector, declarations in _CSS_RULES
)

# Markdown shown above the chat area.
_HEADER_MD = """
<div align="center">

# 🖤 Gemma 4 E4B — Agentic Tool Calling Demo

**First live demo of Gemma 4 E4B with baked-in Opus reasoning + native tool calling**

`deadbydawn101/gemma-4-E4B-Agentic-Opus-Reasoning-GeminiCLI-mlx-4bit` · Built by [RavenX AI](https://github.com/DeadByDawn101)

</div>

> **What makes this different:** Most HF demos are just chat. This runs a real **agentic loop** — the model decides when to call tools, executes them, injects results, and reasons to a final answer. Opus reasoning is baked into the weights (`<think>` tags work without any adapter).

---
"""

# Markdown for the settings column (tool list, model card link, credits).
_SIDEBAR_MD = (
    """---
### 🔧 Available Tools
- `calculator` — math expressions
- `get_datetime` — current time / timezone
- `web_search` — live web search
- `run_python` — execute code snippets

---
### 📦 Model
[gemma-4-E4B-Agentic-Opus-Reasoning-GeminiCLI-mlx-4bit](https://huggingface.co/deadbydawn101/gemma-4-E4B-Agentic-Opus-Reasoning-GeminiCLI-mlx-4bit)

"""
    # These two lines end in two spaces (a Markdown hard line break); they are
    # spelled out explicitly so an editor cannot silently strip them.
    "**10.5 GB** · bfloat16 fused  \n"
    "Opus reasoning baked in  \n"
    """No adapter needed

---
### 🖤 RavenX AI
[GitHub](https://github.com/DeadByDawn101) · [HuggingFace](https://huggingface.co/deadbydawn101)
"""
)

# First footer: related repositories.
_FOOTER_MD = """
---
<div align="center">

**🖤 Built by RavenX AI** — [turboquant-mlx](https://github.com/DeadByDawn101/turboquant-mlx) · [gemini-cli](https://github.com/DeadByDawn101/gemini-cli) · [mlx-gemma4](https://github.com/DeadByDawn101/mlx-gemma4)

*Opus 4.6 reasoning + Claude Code tool-use fused into Gemma 4 E4B weights*

</div>
"""

# Second footer: ecosystem links.
_ECOSYSTEM_MD = """
---
<div align="center">

**🖤 RavenX Ecosystem**

[Model](https://huggingface.co/deadbydawn101/gemma-4-E4B-Agentic-Opus-Reasoning-GeminiCLI-mlx-4bit) &nbsp;·&nbsp;
[Tool Calling Demo](https://huggingface.co/spaces/deadbydawn101/gemma4-agentic-tool-calling-demo) &nbsp;·&nbsp;
[OpenClaw Sandbox](https://huggingface.co/spaces/deadbydawn101/openclaw-agent-sandbox-demo) &nbsp;·&nbsp;
[GitHub](https://github.com/DeadByDawn101) &nbsp;·&nbsp;
[TurboQuant](https://github.com/DeadByDawn101/turboquant-mlx)

*Built by [RavenX AI](https://github.com/DeadByDawn101)*

</div>
"""

with gr.Blocks(css=CSS, title="Gemma 4 E4B — Agentic Tool Calling") as demo:

    gr.Markdown(_HEADER_MD)

    with gr.Row():
        # Left: conversation, input box and example prompts.
        with gr.Column(scale=4):
            chatbot = gr.Chatbot(
                label="Agent",
                show_label=True,
                height=500,
                bubble_full_width=False,
                render_markdown=True,
            )
            with gr.Row():
                msg = gr.Textbox(
                    placeholder="Ask anything — try 'calculate 2^32' or 'what time is it in Tokyo?'",
                    show_label=False,
                    lines=2,
                    scale=5,
                )
                send = gr.Button("Send ↩", scale=1, variant="primary")

            gr.Examples(examples=EXAMPLES, inputs=msg, label="Try these")

        # Right: agent toggles and static info.
        with gr.Column(scale=1, min_width=200):
            gr.Markdown("### ⚙️ Agent Settings")
            enable_tools = gr.Checkbox(value=True, label="Enable Tool Calling")
            show_thinking = gr.Checkbox(value=True, label="Show <think> Reasoning")

            gr.Markdown(_SIDEBAR_MD)

            clear = gr.Button("Clear", variant="secondary")

    # Per-session conversation history as a list of (user, assistant) pairs.
    state = gr.State([])

    # The Send button and pressing Enter in the textbox run the same handler,
    # then blank the input box.
    for trigger in (send.click, msg.submit):
        trigger(
            agentic_chat,
            inputs=[msg, state, enable_tools, show_thinking],
            outputs=[chatbot, state],
        ).then(lambda: "", outputs=msg)

    # Reset both the visible chat and the stored history.
    clear.click(lambda: ([], []), outputs=[chatbot, state])

    gr.Markdown(_FOOTER_MD)


    gr.Markdown(_ECOSYSTEM_MD)

# Start the web server only when executed as a script.
if __name__ == "__main__":
    demo.launch()