import gradio as gr
from llama_cpp import Llama

# Where the GGUF weights come from: a Hugging Face repo id plus a pattern
# selecting the Q4_K_M file inside that repo.
_MODEL_REPO_ID = "WithinUsAI/Gemma4-Overlooked.Thinker.Uncensored-E2B.gguf"
_MODEL_FILE_PATTERN = "*Q4_K_M*"

# Runtime options forwarded unchanged to the Llama constructor.
_MODEL_OPTIONS = {
    "n_ctx": 8192,
    "n_threads": 4,
    "verbose": False,
}

# Loaded once at import time and shared by every chat request.
llm = Llama.from_pretrained(
    repo_id=_MODEL_REPO_ID,
    filename=_MODEL_FILE_PATTERN,
    **_MODEL_OPTIONS,
)

def _content_text(content):
    """Flatten one history ``content`` value into plain prompt text.

    Strings pass through unchanged and ``None`` becomes an empty string.
    A list is treated as a sequence of parts that are flattened and
    concatenated; a dict contributes only its ``"text"`` entry (anything
    else, such as a file payload, contributes nothing). Any other value is
    rendered with ``str()``, which matches plain f-string interpolation.
    """
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    if isinstance(content, dict):
        return str(content.get("text") or "")
    if isinstance(content, list):
        return "".join(_content_text(part) for part in content)
    return str(content)


def _history_turns(history):
    """Yield ``(role, text)`` for every past turn, oldest first.

    Two history layouts are accepted:

    * pairs -- ``[user_msg, assistant_msg]`` sequences; a ``None`` half is
      skipped instead of being rendered as the text ``"None"``;
    * message dicts -- ``{"role": ..., "content": ...}``, where the
      ``"assistant"`` role is mapped to the ``model`` tag used in the prompt.

    Dict entries whose role is neither ``"user"`` nor ``"assistant"`` are
    skipped because the prompt template below only has those two turn tags.
    """
    for entry in history or ():
        if isinstance(entry, dict):
            role = {"user": "user", "assistant": "model"}.get(entry.get("role"))
            if role is not None:
                yield role, _content_text(entry.get("content"))
            continue

        user_msg, assistant_msg = entry
        if user_msg is not None:
            yield "user", _content_text(user_msg)
        if assistant_msg is not None:
            yield "model", _content_text(assistant_msg)


def chat(message, history):
    """Generate the model's reply to ``message`` given the prior ``history``.

    The conversation is rendered as ``<start_of_turn>``/``<end_of_turn>``
    delimited turns, ending with an open ``model`` turn for the model to
    complete.

    Args:
        message: The new user message.
        history: Previous turns, either as ``[user, assistant]`` pairs or as
            ``{"role", "content"}`` dicts. ``None`` or an empty list means
            there is no prior conversation.

    Returns:
        The generated text with surrounding whitespace stripped.
    """
    # NOTE(review): llama-cpp-python may prepend its own BOS token when
    # tokenizing, which would make this literal "<bos>" a duplicate --
    # confirm against the installed version before removing it.
    pieces = ["<bos>"]
    pieces.extend(
        f"<start_of_turn>{role}\n{text}<end_of_turn>\n"
        for role, text in _history_turns(history)
    )
    # Leave the final model turn open so generation continues from it.
    pieces.append(
        f"<start_of_turn>user\n{message}<end_of_turn>\n<start_of_turn>model\n"
    )

    output = llm(
        "".join(pieces),
        max_tokens=1024,
        stop=["<end_of_turn>", "<eos>"],
        echo=False,
    )
    return output["choices"][0]["text"].strip()

# The theme is set on an enclosing gr.Blocks, and the ChatInterface is
# created inside that Blocks context.
_THEME = gr.themes.Default(primary_hue="red")

# Starter prompts offered by the chat interface.
_EXAMPLE_PROMPTS = [
    "Explain quantum entanglement simply",
    "Write a Python web scraper",
    "What are the ethics of AI censorship?",
    "Debug this code: for i in range(10) print(i)",
]

with gr.Blocks(theme=_THEME) as demo:
    gr.ChatInterface(
        fn=chat,
        title="🔥 Gemma4 Overlooked Thinker — Uncensored",
        description="5B uncensored reasoning model by **WithIn Us AI**. Abliterated with norm-preserving biprojection. 0.4% refusal rate.",
        examples=_EXAMPLE_PROMPTS,
    )

# Start the web server for the assembled app.
demo.launch()