"""Gradio chat demo for Benjamin Franklin persona LoRA adapters.

The module discovers adapters from ``manifest.json`` (falling back to a
built-in list), lazily loads the selected base model plus LoRA adapter, and
serves a chat UI whose generation function is wrapped in ``@spaces.GPU``.
"""

from __future__ import annotations

import gc
import json
import logging
import os
import re
from pathlib import Path
from typing import Dict, Tuple

import gradio as gr
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

try:
    import spaces
except Exception:
    # Allows local CPU/dev testing outside Hugging Face Spaces.
    class _SpacesFallback:
        """Stand-in for the ``spaces`` package: ``GPU(...)`` is a no-op decorator."""

        @staticmethod
        def GPU(duration=120):
            def deco(fn):
                return fn

            return deco

    spaces = _SpacesFallback()

logger = logging.getLogger(__name__)

ROOT = Path(__file__).resolve().parent
MANIFEST_PATH = ROOT / "manifest.json"

SYSTEM_PROMPT = (
    "You are Benjamin Franklin: printer, writer, experimenter, civic improver, "
    "diplomat, and practical philosopher. "
    "Speak in clear modern English with a Franklin flavor: useful, warm, concise, "
    "witty when appropriate, and honest about uncertainty. "
    "Do not claim to be Qwen, Alibaba, or a generic assistant. "
    "Do not reveal tool-call tags or hidden reasoning."
)

# Used when manifest.json is missing or cannot be parsed.
FALLBACK_MODELS = [
    {
        "name": "qwen2.5-7b-ben-franklin-v3-factual-coherence-r4-qv",
        "base_model": "unsloth/Qwen2.5-7B-Instruct-bnb-4bit",
        "adapter_dir": "adapters/qwen2.5-7b-ben-franklin-v3-factual-coherence-r4-qv",
        "base_class": "Qwen2.5 7B Instruct 4-bit",
    },
    {
        "name": "qwen2.5-7b-ben-franklin-v1-lite-r4-qv",
        "base_model": "unsloth/Qwen2.5-7B-Instruct-bnb-4bit",
        "adapter_dir": "adapters/qwen2.5-7b-ben-franklin-v1-lite-r4-qv",
        "base_class": "Qwen2.5 7B Instruct 4-bit",
    },
]

# Put the strongest/lightest demo candidates first.
_PREFERRED_ORDER = [
    "qwen2.5-7b-ben-franklin-v3-factual-coherence-r4-qv",
    "qwen2.5-7b-ben-franklin-v2-coherence-r4-qv",
    "qwen2.5-7b-ben-franklin-v1-lite-r4-qv",
    "qwen3-4b-instruct-2507-ben-franklin-v5-english-lock-lora",
]

# End-of-turn tokens that survive decoding because respond() decodes with
# skip_special_tokens=False.
_STOP_MARKERS = ("<|im_end|>", "<|endoftext|>")

# NOTE(review): in the reviewed source the think/tool tag literals were empty
# strings (making the replacements no-ops). The tag names below are inferred
# from the accompanying comment about "think/tool artifacts" -- confirm they
# match what the adapters actually emit.
_HIDDEN_BLOCK_RE = re.compile(r"<(think|tool_call)>.*?</\1>", re.DOTALL)
_HIDDEN_TAG_RE = re.compile(r"</?(?:think|tool_call)>")


def load_manifest_models():
    """Return the list of model entries the UI should offer.

    Returns ``FALLBACK_MODELS`` when ``manifest.json`` is missing, unreadable,
    or not a JSON object. Otherwise returns the manifest's ``"models"`` entries
    that define ``name``, ``base_model`` and ``adapter_dir`` and whose adapter
    directory contains ``adapter_config.json``, sorted with the preferred demo
    candidates first. The result may be empty if no manifest entry qualifies.
    """
    if not MANIFEST_PATH.exists():
        return FALLBACK_MODELS

    try:
        data = json.loads(MANIFEST_PATH.read_text(encoding="utf-8"))
    except (OSError, ValueError) as err:
        # ValueError covers both JSONDecodeError and UnicodeDecodeError. A
        # broken manifest should not prevent the app from starting.
        logger.warning("Could not read %s (%s); using fallback models.", MANIFEST_PATH, err)
        return FALLBACK_MODELS
    if not isinstance(data, dict):
        logger.warning("%s is not a JSON object; using fallback models.", MANIFEST_PATH)
        return FALLBACK_MODELS

    models = []
    for m in data.get("models") or []:
        if not isinstance(m, dict):
            continue
        adapter_dir = m.get("adapter_dir")
        base_model = m.get("base_model")
        name = m.get("name")
        if not (adapter_dir and base_model and name):
            continue
        # Only offer adapters that are actually present on disk.
        if not (ROOT / adapter_dir / "adapter_config.json").exists():
            continue
        models.append(m)

    rank = {name: i for i, name in enumerate(_PREFERRED_ORDER)}
    # Unranked adapters sort after the preferred ones, then by family and name.
    models.sort(key=lambda m: (rank.get(m["name"], 100), m.get("base_class", ""), m["name"]))
    return models


MODELS = load_manifest_models()
MODEL_BY_LABEL = {
    f"{m['name']} — {m.get('base_class', m.get('base_model', ''))}": m for m in MODELS
}
DEFAULT_LABEL = next(iter(MODEL_BY_LABEL), "")

# Holds the (tokenizer, model) pair keyed by adapter name. It is cleared
# whenever a different adapter is requested, so it holds at most one entry.
_CACHE: Dict[str, Tuple[AutoTokenizer, AutoModelForCausalLM]] = {}
_LAST_KEY: str | None = None


def unload_previous_if_needed(key: str):
    """Drop the cached model when *key* differs from the last requested one.

    Clears ``_CACHE``, runs the garbage collector and, when CUDA is available,
    empties the CUDA cache. Always records *key* as the last requested key.
    """
    global _LAST_KEY
    if _LAST_KEY and _LAST_KEY != key:
        _CACHE.clear()
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    _LAST_KEY = key


def load_model(label: str):
    """Return ``(tokenizer, model)`` for the adapter selected by *label*.

    Args:
        label: A key of ``MODEL_BY_LABEL`` (the dropdown label).

    Returns:
        The tokenizer of the base model and the base model wrapped with the
        LoRA adapter, in eval mode. The pair is cached until a different
        adapter is requested.

    Raises:
        gr.Error: If *label* is not a known adapter label.
    """
    if label not in MODEL_BY_LABEL:
        raise gr.Error("Selected adapter is not in manifest.json.")
    m = MODEL_BY_LABEL[label]
    key = m["name"]
    unload_previous_if_needed(key)
    if key in _CACHE:
        return _CACHE[key]

    adapter_path = ROOT / m["adapter_dir"]
    base_model = m["base_model"]

    tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True, use_fast=True)
    if tokenizer.pad_token_id is None and tokenizer.eos_token_id is not None:
        tokenizer.pad_token = tokenizer.eos_token

    use_cuda = torch.cuda.is_available()
    torch_dtype = torch.bfloat16 if use_cuda and torch.cuda.is_bf16_supported() else torch.float16
    model_kwargs = {
        "trust_remote_code": True,
        "device_map": "auto" if use_cuda else None,
        "torch_dtype": torch_dtype,
        "low_cpu_mem_usage": True,
    }
    if use_cuda:
        # 4-bit NF4 quantization is only configured on the CUDA path.
        model_kwargs["quantization_config"] = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch_dtype,
            bnb_4bit_use_double_quant=True,
        )

    base = AutoModelForCausalLM.from_pretrained(base_model, **model_kwargs)
    model = PeftModel.from_pretrained(base, adapter_path)
    model.eval()
    _CACHE[key] = (tokenizer, model)
    return tokenizer, model


def _history_messages(history):
    """Convert chat history into a list of ``{"role", "content"}`` dicts.

    Accepts both history shapes a chat UI may hand over: a sequence of
    ``(user, assistant)`` pairs, or a sequence of message dicts with ``role``
    and ``content`` keys. Empty turns are skipped; in dict form, only
    user/assistant messages with string content are kept.
    """
    messages = []
    for turn in history or []:
        if isinstance(turn, dict):
            role = turn.get("role")
            content = turn.get("content")
            if role in ("user", "assistant") and isinstance(content, str) and content:
                messages.append({"role": role, "content": content})
            continue
        user, assistant = turn
        if user:
            messages.append({"role": "user", "content": user})
        if assistant:
            messages.append({"role": "assistant", "content": assistant})
    return messages


def build_prompt(tokenizer, message: str, history):
    """Render the system prompt, prior turns and *message* into one prompt string.

    Uses the tokenizer's chat template (with a generation prompt appended) when
    it has one; otherwise falls back to a plain ``Role: content`` transcript
    ending in ``Assistant:``.
    """
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    messages.extend(_history_messages(history))
    messages.append({"role": "user", "content": message})

    if getattr(tokenizer, "chat_template", None):
        return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    transcript = "".join(f"{msg['role'].title()}: {msg['content']}\n" for msg in messages[1:])
    return SYSTEM_PROMPT.strip() + "\n\n" + transcript + "Assistant:"


def clean_response(text: str) -> str:
    """Strip end-of-turn markers and think/tool artifacts from decoded output.

    Closed ``<think>…</think>`` and ``<tool_call>…</tool_call>`` blocks are
    removed together with their content; any remaining unpaired tag is removed
    on its own. The result is whitespace-stripped.
    """
    for marker in _STOP_MARKERS:
        text = text.replace(marker, "")
    # If a model exposes a think/tool artifact, hide it in the UI rather than
    # showcasing it.
    text = _HIDDEN_BLOCK_RE.sub("", text)
    text = _HIDDEN_TAG_RE.sub("", text)
    return text.strip()


@spaces.GPU(duration=180)
def respond(message, history, model_label, temperature, max_new_tokens):
    """Generate one in-character reply for the chat interface.

    Args:
        message: The user's new message.
        history: Prior conversation turns as supplied by the chat UI.
        model_label: Dropdown label selecting the adapter.
        temperature: Sampling temperature; ``0`` disables sampling.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The cleaned reply text, or a canned prompt when *message* is blank.
    """
    if not message or not message.strip():
        return "Pray, give me a question worth setting in type."

    tokenizer, model = load_model(model_label)
    prompt = build_prompt(tokenizer, message.strip(), history)
    inputs = tokenizer(prompt, return_tensors="pt")
    if torch.cuda.is_available():
        inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.inference_mode():
        output = model.generate(
            **inputs,
            max_new_tokens=int(max_new_tokens),
            do_sample=float(temperature) > 0,
            # Clamp so a zero slider value never reaches generate() as 0.
            temperature=max(float(temperature), 0.01),
            top_p=0.9,
            repetition_penalty=1.08,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Keep only the newly generated tokens, not the echoed prompt.
    generated = output[0][inputs["input_ids"].shape[-1]:]
    text = tokenizer.decode(generated, skip_special_tokens=False)
    return clean_response(text)


def model_info(label):
    """Return a Markdown summary of the adapter selected by *label*.

    Returns an empty string when *label* is empty or unknown.
    """
    if not label or label not in MODEL_BY_LABEL:
        return ""
    m = MODEL_BY_LABEL[label]
    card = f"model_cards/{m['name']}.md"
    bits = [
        f"Adapter: `{m['adapter_dir']}`",
        f"Base: `{m['base_model']}`",
        f"Family: {m.get('base_class', 'unknown')}",
        f"LoRA r={m.get('lora_r', '?')} alpha={m.get('lora_alpha', '?')}",
        f"Model card: `{card}`",
    ]
    bench = m.get("benchmark")
    if isinstance(bench, dict):
        bits.append(f"Offline benchmark score: {bench.get('score')} flags: `{bench.get('flags')}`")
    return "\n\n".join(bits)


def example_prompts():
    """Return the example prompts shown in the chat interface."""
    return [
        "Ben, give me one practical maxim for debugging a stubborn problem.",
        "Are you Qwen, or Benjamin Franklin? Answer briefly and stay in character.",
        "What is the Craven Street bones story, and what should I not exaggerate about it?",
        "Explain smartphones as if they were a new postal invention in Philadelphia.",
    ]


_INTRO_MARKDOWN = (
    "# Qwen (Ben) Franklin\n\n"
    "Try a local Benjamin Franklin LoRA from this model zoo. The adapters are "
    "experimental: they are good for persona demos and local-project prototypes, "
    "but hard historical facts still benefit from retrieval or explicit source context."
)

_ZEROGPU_MARKDOWN = (
    "## ZeroGPU notes\n\n"
    "This Space uses `@spaces.GPU` for generation. The first response after "
    "changing adapters may be slow because the selected public base model and "
    "local LoRA adapter have to load. For a faster public demo, choose one "
    "default 7B adapter and remove the full model selector."
)

with gr.Blocks(title="Qwen (Ben) Franklin") as demo:
    gr.Markdown(_INTRO_MARKDOWN)
    with gr.Row():
        with gr.Column(scale=2):
            model_dropdown = gr.Dropdown(
                choices=list(MODEL_BY_LABEL),
                # Avoid preselecting "" (not a valid choice) when no adapter is available.
                value=DEFAULT_LABEL or None,
                label="Adapter",
            )
            info = gr.Markdown(model_info(DEFAULT_LABEL))
            model_dropdown.change(model_info, model_dropdown, info)
            temperature = gr.Slider(0.0, 1.0, value=0.35, step=0.05, label="Temperature")
            max_new_tokens = gr.Slider(32, 512, value=220, step=16, label="Max new tokens")
        with gr.Column(scale=3):
            chat = gr.ChatInterface(
                fn=respond,
                additional_inputs=[model_dropdown, temperature, max_new_tokens],
                examples=example_prompts(),
                cache_examples=False,
            )
    gr.Markdown(_ZEROGPU_MARKDOWN)


if __name__ == "__main__":
    demo.queue(max_size=8).launch()