from __future__ import annotations
import gc
import json
import os
from pathlib import Path
from typing import Dict, Tuple
import gradio as gr
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
try:
    import spaces
except Exception:  # Allows local CPU/dev testing outside Hugging Face Spaces.
    # The broad catch is deliberate: any failure to import `spaces` should
    # degrade to the no-op stand-in below rather than stop the app.

    class _SpacesFallback:
        """No-op stand-in used when the `spaces` package cannot be imported."""

        @staticmethod
        def GPU(task=None, duration=120, **_options):
            """Mimic the `spaces.GPU` decorator without doing anything.

            Works both as a bare decorator (``@spaces.GPU``) and as a
            decorator factory (``@spaces.GPU(duration=180)``).

            Args:
                task: The function being decorated when used bare; anything
                    non-callable (including a positional duration) is ignored.
                duration: Accepted for signature compatibility; unused here.
                **_options: Any other keyword options; ignored.

            Returns:
                ``task`` unchanged when it is callable, otherwise a decorator
                that returns its argument unchanged.
            """
            if callable(task):
                # Bare usage: `@spaces.GPU` hands us the function directly.
                return task

            def deco(fn):
                return fn

            return deco

    spaces = _SpacesFallback()
# Directory containing this file; adapter paths are resolved against it.
ROOT = Path(__file__).resolve().parents[0]
# Manifest describing the adapters shipped next to this file.
MANIFEST_PATH = ROOT.joinpath("manifest.json")
# System message placed at the start of every conversation.
SYSTEM_PROMPT = """You are Benjamin Franklin: printer, writer, experimenter, civic improver, diplomat, and practical philosopher. Speak in clear modern English with a Franklin flavor: useful, warm, concise, witty when appropriate, and honest about uncertainty. Do not claim to be Qwen, Alibaba, or a generic assistant. Do not reveal tool-call tags or hidden reasoning."""
# Entries used when manifest.json is absent. Both share the same base model
# and differ only in the adapter name / directory.
FALLBACK_MODELS = [
    {
        "name": adapter_name,
        "base_model": "unsloth/Qwen2.5-7B-Instruct-bnb-4bit",
        "adapter_dir": f"adapters/{adapter_name}",
        "base_class": "Qwen2.5 7B Instruct 4-bit",
    }
    for adapter_name in (
        "qwen2.5-7b-ben-franklin-v3-factual-coherence-r4-qv",
        "qwen2.5-7b-ben-franklin-v1-lite-r4-qv",
    )
]
def load_manifest_models(manifest_path=None, root=None):
    """Return the adapter entries to offer in the demo, preferred ones first.

    Args:
        manifest_path: JSON manifest to read. Defaults to ``MANIFEST_PATH``.
        root: Directory that each entry's ``adapter_dir`` is relative to.
            Defaults to ``ROOT``.

    Returns:
        ``FALLBACK_MODELS`` when the manifest file does not exist. Otherwise
        the manifest's ``models`` entries that have a name, base model and
        adapter directory, and whose adapter directory contains
        ``adapter_config.json``, sorted by preference rank, then
        ``base_class``, then name.

    Raises:
        json.JSONDecodeError: If the manifest exists but is not valid JSON.
    """
    manifest_path = MANIFEST_PATH if manifest_path is None else Path(manifest_path)
    root = ROOT if root is None else Path(root)
    if not manifest_path.exists():
        return FALLBACK_MODELS
    # JSON is UTF-8 by specification; do not depend on the platform locale.
    data = json.loads(manifest_path.read_text(encoding="utf-8"))
    entries = data.get("models") if isinstance(data, dict) else None
    models = []
    for m in entries or []:
        if not isinstance(m, dict):
            # Tolerate stray non-object entries instead of crashing at import.
            continue
        adapter_dir = m.get("adapter_dir")
        base_model = m.get("base_model")
        name = m.get("name")
        if not (adapter_dir and base_model and name):
            continue
        if not (root / adapter_dir / "adapter_config.json").exists():
            continue
        models.append(m)
    # Put the strongest/lightest demo candidates first.
    preferred = [
        "qwen2.5-7b-ben-franklin-v3-factual-coherence-r4-qv",
        "qwen2.5-7b-ben-franklin-v2-coherence-r4-qv",
        "qwen2.5-7b-ben-franklin-v1-lite-r4-qv",
        "qwen3-4b-instruct-2507-ben-franklin-v5-english-lock-lora",
    ]
    rank = {name: i for i, name in enumerate(preferred)}
    # Names outside the preferred list share rank 100 and fall back to
    # ordering by base_class and then name.
    models.sort(key=lambda m: (rank.get(m["name"], 100), m.get("base_class", ""), m["name"]))
    return models
# Adapter entries available to the UI, already in display order.
MODELS = load_manifest_models()
# Dropdown label -> manifest entry. The label shows the adapter name followed
# by its base_class, or by base_model when base_class is absent.
MODEL_BY_LABEL = {
    f"{entry['name']} — {entry.get('base_class', entry.get('base_model', ''))}": entry
    for entry in MODELS
}
# First label in display order, or "" when no adapters are available.
DEFAULT_LABEL = next(iter(MODEL_BY_LABEL), "")
# Loaded (tokenizer, model) pairs keyed by adapter name.
_CACHE: Dict[str, Tuple[AutoTokenizer, AutoModelForCausalLM]] = {}
# Adapter name most recently passed to unload_previous_if_needed.
_LAST_KEY: str | None = None
def unload_previous_if_needed(key: str):
    """Drop cached models when the requested adapter differs from the last one.

    When a previous key is recorded and it is not ``key``, the whole cache is
    cleared, a garbage collection is run, and the CUDA allocator cache is
    emptied if CUDA is available. ``key`` is then recorded as the last key.

    Args:
        key: Name of the adapter about to be used.
    """
    global _LAST_KEY
    switching_adapter = bool(_LAST_KEY) and _LAST_KEY != key
    if switching_adapter:
        _CACHE.clear()
        # Collect first so the dropped model's tensors are released before
        # asking CUDA to free its cached memory.
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    _LAST_KEY = key
def load_model(label: str):
    """Return the ``(tokenizer, model)`` pair for a dropdown label.

    The pair is served from ``_CACHE`` when present; otherwise the base model
    is loaded, wrapped with the LoRA adapter, switched to eval mode and cached.

    Args:
        label: A key of ``MODEL_BY_LABEL``.

    Returns:
        Tuple of the tokenizer and the adapter-wrapped model.

    Raises:
        gr.Error: If ``label`` is not a known model label.
    """
    entry = MODEL_BY_LABEL.get(label)
    if entry is None:
        raise gr.Error("Selected adapter is not in manifest.json.")

    key = entry["name"]
    # Evict a different, previously loaded adapter before looking in the cache.
    unload_previous_if_needed(key)
    cached = _CACHE.get(key)
    if cached is not None:
        return cached

    base_model = entry["base_model"]
    adapter_path = ROOT / entry["adapter_dir"]

    tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True, use_fast=True)
    # Reuse EOS as the pad token when the tokenizer defines no pad token.
    if tokenizer.pad_token_id is None and tokenizer.eos_token_id is not None:
        tokenizer.pad_token = tokenizer.eos_token

    use_cuda = torch.cuda.is_available()
    if use_cuda and torch.cuda.is_bf16_supported():
        torch_dtype = torch.bfloat16
    else:
        torch_dtype = torch.float16

    model_kwargs = {
        "trust_remote_code": True,
        "device_map": "auto" if use_cuda else None,
        "torch_dtype": torch_dtype,
        "low_cpu_mem_usage": True,
    }
    if use_cuda:
        # A 4-bit NF4 quantization config is only supplied when CUDA is available.
        model_kwargs["quantization_config"] = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch_dtype,
            bnb_4bit_use_double_quant=True,
        )

    base = AutoModelForCausalLM.from_pretrained(base_model, **model_kwargs)
    model = PeftModel.from_pretrained(base, adapter_path)
    model.eval()

    loaded = (tokenizer, model)
    _CACHE[key] = loaded
    return loaded
def _content_text(content):
    """Flatten a "messages"-style ``content`` value to plain text."""
    if isinstance(content, str):
        return content
    if isinstance(content, (list, tuple)):
        pieces = []
        for part in content:
            if isinstance(part, str):
                pieces.append(part)
            elif isinstance(part, dict) and isinstance(part.get("text"), str):
                pieces.append(part["text"])
        return "".join(pieces)
    return ""


def _history_messages(history):
    """Normalize chat history (pair-style or dict-style) to role/content dicts."""
    turns = []
    for item in history or []:
        if isinstance(item, dict):
            # "messages" format: one dict per message with role and content.
            role = item.get("role")
            text = _content_text(item.get("content"))
            if role in ("user", "assistant") and text:
                turns.append({"role": role, "content": text})
            continue
        # Pair format: (user_text, assistant_text); either side may be empty.
        user, assistant = item
        if user:
            turns.append({"role": "user", "content": user})
        if assistant:
            turns.append({"role": "assistant", "content": assistant})
    return turns


def build_prompt(tokenizer, message: str, history, *, system_prompt=None):
    """Render the conversation as a single prompt string for generation.

    Args:
        tokenizer: Object whose ``chat_template`` / ``apply_chat_template``
            are used when a chat template is set.
        message: The new user message, appended after the history.
        history: Prior turns, either as ``(user, assistant)`` pairs or as
            dicts with ``role`` and ``content`` keys. ``None`` or empty means
            no prior turns.
        system_prompt: System message to use; defaults to ``SYSTEM_PROMPT``.

    Returns:
        The tokenizer's chat-template rendering (with a generation prompt)
        when a template exists, otherwise a plain "Role: text" transcript
        ending with ``Assistant:``.
    """
    if system_prompt is None:
        system_prompt = SYSTEM_PROMPT
    messages = [{"role": "system", "content": system_prompt}]
    messages.extend(_history_messages(history))
    messages.append({"role": "user", "content": message})
    if getattr(tokenizer, "chat_template", None):
        return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # No chat template: fall back to a simple labelled transcript.
    lines = [f"{msg['role'].title()}: {msg['content']}\n" for msg in messages[1:]]
    return system_prompt.strip() + "\n\n" + "".join(lines) + "Assistant:"
def clean_response(text: str) -> str:
    """Strip end-of-sequence markers and think/tool tags from generated text.

    Only the marker and tag strings themselves are removed; any text between
    a pair of tags is kept.

    Args:
        text: Decoded model output, possibly containing special tokens.

    Returns:
        The text with the markers removed and surrounding whitespace stripped.
    """
    # NOTE(review): the third marker was an empty string (a no-op replace).
    # `</s>` is assumed to be the intended end-of-sequence marker; confirm.
    for marker in ("<|im_end|>", "<|endoftext|>", "</s>"):
        text = text.replace(marker, "")
    # If a model exposes a think/tool artifact, hide it in the UI rather than showcasing it.
    # The previous replace() calls searched for "", which never changes the text.
    for tag in ("<think>", "</think>", "<tool_call>", "</tool_call>"):
        text = text.replace(tag, "")
    return text.strip()
@spaces.GPU(duration=180)
def respond(message, history, model_label, temperature, max_new_tokens):
    """Generate one chat reply with the selected adapter.

    Args:
        message: The user's new message.
        history: Prior conversation, passed through to ``build_prompt``.
        model_label: Dropdown label identifying the adapter to load.
        temperature: Sampling temperature; sampling is enabled only above 0.
        max_new_tokens: Upper bound on generated tokens.

    Returns:
        The cleaned reply text, or a fixed nudge when the message is blank.
    """
    question = message.strip() if message else ""
    if not question:
        return "Pray, give me a question worth setting in type."

    tokenizer, model = load_model(model_label)
    prompt = build_prompt(tokenizer, question, history)
    inputs = tokenizer(prompt, return_tensors="pt")
    if torch.cuda.is_available():
        inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    temp = float(temperature)
    generation_args = {
        "max_new_tokens": int(max_new_tokens),
        "do_sample": temp > 0,
        # Temperature is floored at 0.01 even when sampling is off.
        "temperature": max(temp, 0.01),
        "top_p": 0.9,
        "repetition_penalty": 1.08,
        "pad_token_id": tokenizer.eos_token_id,
        "eos_token_id": tokenizer.eos_token_id,
    }
    with torch.inference_mode():
        output = model.generate(**inputs, **generation_args)

    # Keep only the tokens that follow the prompt.
    prompt_length = inputs["input_ids"].shape[-1]
    new_tokens = output[0][prompt_length:]
    # Special tokens are kept by the decoder and removed by clean_response.
    decoded = tokenizer.decode(new_tokens, skip_special_tokens=False)
    return clean_response(decoded)
def model_info(label):
    """Build the Markdown summary shown for the selected adapter.

    Args:
        label: Dropdown label; may be empty or unknown.

    Returns:
        Paragraphs separated by blank lines describing the adapter, or ``""``
        when the label is empty or not in ``MODEL_BY_LABEL``.
    """
    entry = MODEL_BY_LABEL.get(label) if label else None
    if entry is None:
        return ""

    lines = [
        f"Adapter: `{entry['adapter_dir']}`",
        f"Base: `{entry['base_model']}`",
        f"Family: {entry.get('base_class', 'unknown')}",
        f"LoRA r={entry.get('lora_r', '?')} alpha={entry.get('lora_alpha', '?')}",
        f"Model card: `{f'model_cards/' + entry['name'] + '.md'}`",
    ]
    benchmark = entry.get("benchmark")
    # The benchmark line is added only when the entry holds a dict for it.
    if isinstance(benchmark, dict):
        lines.append(
            f"Offline benchmark score: {benchmark.get('score')} flags: `{benchmark.get('flags')}`"
        )
    return "\n\n".join(lines)
def example_prompts():
    """Return the starter questions offered as chat examples.

    Returns:
        A new list of four prompt strings on every call.
    """
    starters = (
        "Ben, give me one practical maxim for debugging a stubborn problem.",
        "Are you Qwen, or Benjamin Franklin? Answer briefly and stay in character.",
        "What is the Craven Street bones story, and what should I not exaggerate about it?",
        "Explain smartphones as if they were a new postal invention in Philadelphia.",
    )
    return list(starters)
# UI layout: a header, a two-column row (adapter controls | chat), and a
# footer note. Nesting below is reconstructed from the `with` statements.
with gr.Blocks(title="Qwen (Ben) Franklin") as demo:
    gr.Markdown(
        value="""
# Qwen (Ben) Franklin
Try a local Benjamin Franklin LoRA from this model zoo. The adapters are experimental: they are good for persona demos and local-project prototypes, but hard historical facts still benefit from retrieval or explicit source context.
""".strip()
    )
    with gr.Row():
        # Left column: adapter picker, its description, and generation controls.
        with gr.Column(scale=2):
            model_dropdown = gr.Dropdown(
                label="Adapter",
                value=DEFAULT_LABEL,
                choices=list(MODEL_BY_LABEL),
            )
            info = gr.Markdown(value=model_info(DEFAULT_LABEL))
            # Refresh the description whenever a different adapter is picked.
            model_dropdown.change(fn=model_info, inputs=model_dropdown, outputs=info)
            temperature = gr.Slider(
                minimum=0.0,
                maximum=1.0,
                value=0.35,
                step=0.05,
                label="Temperature",
            )
            max_new_tokens = gr.Slider(
                minimum=32,
                maximum=512,
                value=220,
                step=16,
                label="Max new tokens",
            )
        # Right column: the chat itself; the left-column controls are passed
        # to `respond` as extra inputs.
        with gr.Column(scale=3):
            chat = gr.ChatInterface(
                fn=respond,
                additional_inputs=[model_dropdown, temperature, max_new_tokens],
                examples=example_prompts(),
                cache_examples=False,
            )
    gr.Markdown(
        value="""
## ZeroGPU notes
This Space uses `@spaces.GPU` for generation. The first response after changing adapters may be slow because the selected public base model and local LoRA adapter have to load. For a faster public demo, choose one default 7B adapter and remove the full model selector.
""".strip()
    )

# Launch only when run as a script; the request queue is capped at 8.
if __name__ == "__main__":
    demo.queue(max_size=8).launch()