Spaces:

StentorLabs
/

StentorLabs-demo_space

Running

App Files Files Community

StentorLabs-demo_space / app.py

StentorLabs

Update app.py

0c38029 verified about 7 hours ago

raw

history blame contribute delete

100 kB

	import os
	import time
	import random
	import queue
	from datetime import datetime, timezone
	import html
	from threading import Thread
	from typing import Any

	import gradio as gr
	import torch
	import torch.nn.functional as F
	try:
	from huggingface_hub import login
	except ImportError:
	login = None
	from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteria, StoppingCriteriaList
	try:
	from transformers import TextIteratorStreamer
	except ImportError: # transformers >=5 may move streamers under generation
	from transformers.generation.streamers import TextIteratorStreamer

	# ── PERFORMANCE BOTTLENECK FIXES ──
	# CPU-only tuning; skipped entirely when CUDA is available.
	if not torch.cuda.is_available():
	    # Hugging Face Free Spaces typically have 2 vCPUs.
	    # Limiting threads prevents context-switching overhead which is a major bottleneck for small models.
	    torch.set_num_threads(2)
	    # Flush denormal numbers to zero to avoid expensive CPU subnormal float calculations.
	    # This stays best-effort, but the outcome is reported instead of being silently
	    # dropped: set_flush_denormal() returns False when the CPU does not support it.
	    try:
	        if not torch.set_flush_denormal(True):
	            print("[Stentor] Flush-denormal mode is not supported on this CPU; continuing without it.")
	    except Exception as e:
	        print(f"[Stentor] Could not enable flush-denormal mode: {e}")

	# Import tokenmonster for models that require it
	try:
	import tokenmonster
	except ImportError:
	tokenmonster = None

	class InterruptCallback(StoppingCriteria):
	    """Stopping criterion controlled by an externally toggled boolean flag.

	    The generation functions in this file pass the shared instance to
	    ``model.generate`` and flip ``stop_signal`` to request an early stop.
	    """

	    def __init__(self) -> None:
	        # False means "keep generating"; set to True to ask generate() to stop.
	        self.stop_signal = False

	    def __call__(self, input_ids, scores, **kwargs):
	        """Return the current flag; the generation state passed in is not inspected."""
	        return self.stop_signal


	# One module-level instance, shared by every generation path in this file.
	interrupt_callback = InterruptCallback()

	# ─────────────────────────────────── CONFIG ───────────────────────────────────
	APP_TITLE = "Stentor Labs"
	# Hub namespace that hosts the in-house models; overridable through the environment.
	MODEL_OWNER = os.getenv("HF_MODEL_OWNER", "StentorLabs")
	FLAGSHIP_MODELS = ["Portimbria-150M", "Stentor3-50M"]

	# Display name -> Hub repo id for the in-house models (all live under MODEL_OWNER).
	STENTOR_MODELS = {
	    name: f"{MODEL_OWNER}/{name}"
	    for name in (
	        "Portimbria-150M",
	        "Stentor3-50M",
	        "Stentor3-20M",
	        "Stentor2-30M",
	        "Stentor2-12M",
	        "Stentor-30M",
	        "Stentor-12M",
	        "Stentor-30M-Instruct",
	        "Stentor-12M-Instruct",
	    )
	}

	# Display name -> Hub repo id for the third-party comparison models.
	ARENA_MODELS = {
	    "DistilGPT2": "distilbert/distilgpt2",
	    "Pythia-14M": "EleutherAI/pythia-14m",
	    "Pythia-31M": "EleutherAI/pythia-31m",
	    "Pythia-70M": "EleutherAI/pythia-70m",
	    "gpt2 small": "openai-community/gpt2",
	    "SmolLM2-135M": "HuggingFaceTB/SmolLM2-135M",
	    "NanoWhale-100M-Base": "HuggingFaceTB/nanowhale-100m-base",
	    "Pythia-160M": "EleutherAI/pythia-160m",
	    "OPT-125M": "facebook/opt-125m",
	    "GPT-Neo 125M": "EleutherAI/gpt-neo-125M",
	}

	# Merged lookup used wherever a display name must be resolved to a repo id.
	# Both dicts are unpacked: a bare ``{STENTOR_MODELS, ARENA_MODELS}`` is a *set*
	# literal of two dicts, which raises TypeError (dicts are unhashable) at import.
	ALL_MODELS = {**STENTOR_MODELS, **ARENA_MODELS}
	DEFAULT_MODEL = "Portimbria-150M"

	def _max_tokens_cap(model_key: str) -> int:
	    """Return the upper bound on new tokens allowed for ``model_key``.

	    The match is a case-insensitive prefix test on the display name:
	    "portimbria*" and "stentor3-*" get 4096, "stentor2-*" gets 1024,
	    and every other name gets 512.
	    """
	    key = model_key.lower()
	    if key.startswith(("portimbria", "stentor3-")):
	        return 4096
	    return 1024 if key.startswith("stentor2-") else 512

	# Default generation parameters
	DEFAULT_TEMP = 0.8
	DEFAULT_REP_PENALTY = 1.35
	DEFAULT_TOP_P = 0.9
	DEFAULT_MAX_TOKENS = 100

	# Token cap of the default model, computed once at import time.
	INITIAL_MAX_TOKENS = _max_tokens_cap(DEFAULT_MODEL)

	# Named sampling presets. The presets differ only in temperature and top_p;
	# token budget and repetition penalty always come from the defaults above.
	PRESETS = {
	    preset_name: {
	        "temperature": preset_temperature,
	        "top_p": preset_top_p,
	        "max_tokens": DEFAULT_MAX_TOKENS,
	        "repetition_penalty": DEFAULT_REP_PENALTY,
	    }
	    for preset_name, preset_temperature, preset_top_p in (
	        ("🎨 Creative", 1.1, 0.95),
	        ("⚖️ Balanced", 0.8, 0.9),
	        ("🎯 Focused", 0.6, 0.9),
	    )
	}

	# Caption markup recommending which preset mode to use.
	MODE_RECOMMENDATION_HTML = "".join([
	    '<p class="mode-caption">',
	    '<strong>Balanced</strong> or <strong>Focused</strong> modes are recommended; ',
	    'Creative mode can get chaotic.',
	    '</p>',
	])

	# (icon, prompt text) pairs offered as ready-made starting prompts.
	EXAMPLE_PROMPTS = [
	    # Story openers
	    ("📖", "Once upon a time in a world where"),
	    ("📖", "The last explorer on Earth discovered"),
	    # Science
	    ("🔬", "The theory of relativity states that"),
	    ("🔬", "Scientists recently discovered that"),
	    # Code
	    ("💻", "def quicksort(arr):\n "),
	    ("💻", "class NeuralNetwork:\n def __init__"),
	    # Reflection
	    ("🧠", "The most important thing about AI is"),
	    ("🧠", "The philosophy of consciousness suggests"),
	]

	# Loaded (tokenizer, model) pairs, keyed by the id they were loaded with.
	_model_cache: dict[str, tuple[Any, Any]] = {}

	def _hf_auth_token():
	    """Return the Hub access token from the environment.

	    ``HF_TOKEN`` wins when it is set to a non-empty value; otherwise whatever
	    ``HUGGING_FACE_HUB_TOKEN`` holds is returned (``None`` when it is unset).
	    """
	    primary = os.getenv("HF_TOKEN")
	    if primary:
	        return primary
	    return os.getenv("HUGGING_FACE_HUB_TOKEN")

	# Attempt global login if token is available.
	_token = _hf_auth_token()
	if _token:
	    # Do not echo any part of the secret: logs may be readable by others, and a
	    # "last four characters" slice prints the whole token when it is that short.
	    print("[Stentor] Authentication token found.")
	    # ``login`` is None when huggingface_hub could not be imported.
	    if login:
	        try:
	            login(token=_token)
	        except Exception as e:
	            # Best-effort: the token is still passed explicitly on each load call.
	            print(f"[Stentor] Global login failed: {e}")
	else:
	    print("[Stentor] WARNING: No HF_TOKEN found in secrets. Private/Gated models will fail to load.")


	def _hf_repo_kwargs() -> dict[str, Any]:
	    """Return the auth keyword arguments for ``from_pretrained`` calls.

	    The result is ``{"token": <token>}`` when a token is configured and an
	    empty dict otherwise, so it can always be ``**``-unpacked.
	    """
	    token = _hf_auth_token()
	    if not token:
	        return {}
	    return {"token": token}

	# ─────────────────────────────── MODEL LOGIC ──────────────────────────────────
	def _load_tokenizer(model_id: str):
	    """Load the tokenizer for ``model_id``, retrying once with minimal options.

	    Args:
	        model_id: Hub repo id of the model whose tokenizer should be loaded.

	    Returns:
	        The object returned by ``AutoTokenizer.from_pretrained``.

	    Raises:
	        RuntimeError: if both attempts fail. The message contains both errors
	            and the second one is attached as ``__cause__``.
	    """
	    # For Stentor models, trust_remote_code is mandatory for TokenMonster support
	    lowered = model_id.lower()
	    is_stentor = any(marker in lowered for marker in ("stentor", "portimbria"))

	    kwargs = {"trust_remote_code": is_stentor, "use_fast": not is_stentor}
	    try:
	        # Both dicts must be unpacked as keyword arguments. Passed positionally
	        # they are not treated as options (token, trust_remote_code, use_fast).
	        return AutoTokenizer.from_pretrained(model_id, **_hf_repo_kwargs(), **kwargs)
	    except Exception as first_err:
	        # Fallback to absolute basics if that fails.
	        # NOTE(review): this retry enables trust_remote_code for every repo,
	        # including the third-party ones — confirm that is acceptable.
	        try:
	            return AutoTokenizer.from_pretrained(model_id, **_hf_repo_kwargs(), trust_remote_code=True)
	        except Exception as second_err:
	            raise RuntimeError(f"Tokenizer fail: {first_err} -> {second_err}") from second_err


	def _load_model(model_id: str):
	    """Load the causal LM for ``model_id``, retrying once in float32.

	    The first attempt uses bfloat16 or float16 when CUDA is available
	    (bfloat16 if supported) and float32 otherwise, trusting remote code only
	    for Stentor/Portimbria repos. If that raises, a second attempt is made in
	    float32 with ``trust_remote_code=True``; its exception, if any, propagates.

	    NOTE(review): nothing in this function moves the model to the GPU — confirm
	    that device placement is handled elsewhere when CUDA is present.
	    """
	    lowered = model_id.lower()
	    is_stentor = "stentor" in lowered or "portimbria" in lowered

	    def attempt(load_dtype, trust):
	        # One from_pretrained call; auth kwargs are re-read for each attempt.
	        return AutoModelForCausalLM.from_pretrained(
	            model_id,
	            dtype=load_dtype,
	            low_cpu_mem_usage=True,
	            trust_remote_code=trust,
	            **_hf_repo_kwargs(),
	        )

	    # Determine optimal dtype for loading
	    if not torch.cuda.is_available():
	        # On CPU, float16 is extremely slow due to software emulation.
	        # float32 is the native fast path for CPU inference.
	        # Even for 150M models, memory usage in float32 is only ~600MB.
	        dtype = torch.float32
	        print(f"[Stentor] Using float32 for {model_id} on CPU for maximum speed.")
	    elif torch.cuda.is_bf16_supported():
	        dtype = torch.bfloat16
	        print(f"[Stentor] Using bfloat16 for {model_id} on CUDA.")
	    else:
	        dtype = torch.float16
	        print(f"[Stentor] Using float16 for {model_id} on CUDA.")

	    try:
	        return attempt(dtype, is_stentor)
	    except Exception as e:
	        print(f"[Stentor] Failed to load {model_id} with {dtype}. Retrying with float32 and trust_remote_code=True. Error: {e}")
	        # Fallback to float32 and forced trust_remote_code if initial attempt fails
	        return attempt(torch.float32, True)


	def _get_model(model_id: str):
	    """Return ``(tokenizer, model)`` for ``model_id``, loading it on first use.

	    A cache hit returns the stored pair immediately. On a miss the tokenizer
	    and model are loaded, the model is put in eval mode, a one-token warmup
	    generation is attempted, and the pair is stored in ``_model_cache``.
	    Loading errors propagate; a warmup failure is only logged.
	    """
	    cached = _model_cache.get(model_id)
	    if cached is not None:
	        return cached

	    print(f"[Stentor] Loading {model_id}...")
	    tok = _load_tokenizer(model_id)
	    mdl = _load_model(model_id)
	    mdl.eval()
	    # Removed torch.compile to prevent initialization timeouts/errors on CPU spaces

	    # Warmup pass: a single-token generation before the first real request.
	    print(f"[Stentor] Warming up {model_id}...")
	    try:
	        # Prepare the dummy input exactly like a real request (token_type_ids
	        # dropped, tensors on the model's device, attention_mask present), so
	        # the warmup is not skipped for reasons real requests never hit.
	        dummy = _to_model_device(_prep_inputs(tok, "Hello"), mdl)
	        with torch.no_grad():
	            mdl.generate(**dummy, max_new_tokens=1, pad_token_id=tok.eos_token_id)
	        print(f"[Stentor] {model_id} warmup complete")
	    except Exception as e:
	        # Warmup is an optimisation only; the model is still cached and usable.
	        print(f"[Stentor] {model_id} warmup skipped ({e})")

	    _model_cache[model_id] = (tok, mdl)
	    print(f"[Stentor] {model_id} ready.")
	    return tok, mdl


	def _prep_inputs(tokenizer, prompt):
	    """Tokenize ``prompt`` with ``return_tensors="pt"`` and drop ``token_type_ids``.

	    Returns the tokenizer's own output mapping, minus that key when present.
	    """
	    encoded = tokenizer(prompt, return_tensors="pt")
	    # Removed unconditionally; a missing key is fine.
	    encoded.pop("token_type_ids", None)
	    return encoded


	def _to_model_device(inputs: dict[str, Any], model):
	    """Move the values of ``inputs`` onto ``model``'s device, in place.

	    The device is taken from the model's first parameter. Every value that has
	    a ``.to`` method is replaced by ``value.to(device)``; other values are left
	    alone. If ``input_ids`` is present without an ``attention_mask``, an
	    all-ones mask of the same shape is added. The same mapping is returned.
	    """
	    device = next(model.parameters()).device
	    for key in list(inputs):
	        value = inputs[key]
	        if hasattr(value, "to"):
	            inputs[key] = value.to(device)
	    if "input_ids" in inputs and "attention_mask" not in inputs:
	        inputs["attention_mask"] = torch.ones_like(inputs["input_ids"])
	    return inputs


	def _decode_response_only(tokenizer, full_ids, input_ids):
	    """Decode only the tokens that follow the prompt in the first sequence.

	    The prompt length is ``input_ids.shape[1]``; everything after that index in
	    ``full_ids[0]`` is decoded with ``skip_special_tokens=True``.
	    """
	    generated = full_ids[0][input_ids.shape[1]:]
	    return tokenizer.decode(generated, skip_special_tokens=True)


	def _generate_and_stream(repo_id, prompt, max_tokens, temperature, top_p, repetition_penalty):
	    """Generate text from a model, yielding the accumulated text as chunks arrive.

	    ``model.generate`` runs on a daemon thread and feeds a
	    ``TextIteratorStreamer``; this generator consumes the streamer.

	    Args:
	        repo_id: model id handed to ``_get_model``.
	        prompt: text to continue.
	        max_tokens, temperature, top_p, repetition_penalty: generation settings,
	            coerced to ``int``/``float``. Sampling is enabled only when the
	            temperature is at least 0.05.

	    Yields:
	        The full text generated so far after every streamed chunk, followed by
	        one ``"❌ <message>"`` string if the worker raised.

	    Side effects:
	        Resets the shared ``interrupt_callback.stop_signal`` to False at the
	        start and sets it to True when the generator finishes or is closed.
	    """
	    tokenizer, model = _get_model(repo_id)
	    interrupt_callback.stop_signal = False
	    inputs = _to_model_device(_prep_inputs(tokenizer, prompt), model)

	    # IMPORTANT: Removed skip_special_tokens=True due to TokenMonster incompatibility
	    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)
	    error_container = [None]

	    def worker_fn():
	        try:
	            with torch.inference_mode():
	                model.generate(
	                    **inputs,
	                    max_new_tokens=int(max_tokens),
	                    temperature=float(temperature),
	                    top_p=float(top_p),
	                    repetition_penalty=float(repetition_penalty),
	                    do_sample=float(temperature) >= 0.05,
	                    pad_token_id=tokenizer.eos_token_id,
	                    stopping_criteria=StoppingCriteriaList([interrupt_callback]),
	                    streamer=streamer,
	                )
	        except Exception as e:
	            error_container[0] = str(e)
	            # generate() only ends the stream when it completes. After a failure
	            # the consumer loop below would wait on the queue forever, so push
	            # the streamer's end-of-stream sentinel ourselves.
	            streamer.text_queue.put(streamer.stop_signal)

	    worker = Thread(target=worker_fn, daemon=True)
	    worker.start()
	    full_text = ""

	    # Manually filter common special tokens if they appear (TokenMonster compatibility).
	    # Tokens the tokenizer does not define (None) are skipped.
	    special_tokens_to_remove = [
	        tok
	        for tok in (tokenizer.eos_token, tokenizer.pad_token, tokenizer.bos_token, tokenizer.unk_token)
	        if tok is not None
	    ]

	    try:
	        for chunk in streamer:
	            if interrupt_callback.stop_signal:
	                break
	            for st in special_tokens_to_remove:
	                chunk = chunk.replace(st, "")
	            full_text += chunk
	            yield full_text
	    finally:
	        # Also reached when the consumer abandons this generator: raise the stop
	        # flag so the worker's stopping criterion ends generate(), then wait briefly.
	        interrupt_callback.stop_signal = True
	        worker.join(timeout=1.0)

	    if error_container[0]:
	        yield f"❌ {error_container[0]}"


	def parallel_config_generate(prompt, configs):
	    """Run one streamed generation per config and yield the running results.

	    Despite the name, the configs are processed one after another.

	    Args:
	        prompt: text to continue; nothing is yielded when it is empty after
	            stripping.
	        configs: sequence of dicts with the keys ``model_key``, ``max_tokens``,
	            ``temperature``, ``top_p`` and ``repetition_penalty``.

	    Yields:
	        ``(results, elapsed)`` where ``results`` is one list (the same object on
	        every yield) holding the text produced so far for each config, and
	        ``elapsed`` is the number of seconds since this call started. A failed
	        config has its slot replaced by ``"❌ <message>"``.
	    """
	    text = (prompt or "").strip()
	    if not text:
	        return

	    results = [""] * len(configs)
	    t0 = time.perf_counter()

	    def run_generation(model, tokenizer, inputs, cfg, max_new, streamer, error_container):
	        # Everything arrives as an argument rather than through the enclosing
	        # loop's variables: a worker that outlives its join timeout must never
	        # write to the error slot or streamer of a later config.
	        try:
	            with torch.inference_mode():
	                # Ensure diverse output for sequential same-model runs (Tab 1 multi-response)
	                torch.manual_seed(random.randint(0, 2**31 - 1))
	                model.generate(
	                    **inputs,
	                    max_new_tokens=max_new,
	                    temperature=float(cfg["temperature"]),
	                    top_p=float(cfg["top_p"]),
	                    repetition_penalty=float(cfg["repetition_penalty"]),
	                    do_sample=float(cfg["temperature"]) >= 0.05,
	                    pad_token_id=tokenizer.eos_token_id,
	                    stopping_criteria=StoppingCriteriaList([interrupt_callback]),
	                    streamer=streamer,
	                )
	        except Exception as e:
	            error_container[0] = str(e)
	            # generate() never ended the stream; unblock the consumer loop.
	            streamer.text_queue.put(streamer.stop_signal)

	    interrupt_callback.stop_signal = False
	    for i, cfg in enumerate(configs):
	        if interrupt_callback.stop_signal:
	            break
	        m_key = cfg["model_key"]
	        error_container = [None]
	        try:
	            tokenizer, model = _get_model(ALL_MODELS[m_key])
	            # Never exceed the per-model cap, whatever the config asks for.
	            actual_max = min(int(cfg["max_tokens"]), _max_tokens_cap(m_key))
	            inputs = _to_model_device(_prep_inputs(tokenizer, text), model)
	            # Unified with _generate_and_stream for TokenMonster compatibility
	            streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)

	            worker = Thread(
	                target=run_generation,
	                args=(model, tokenizer, inputs, cfg, actual_max, streamer, error_container),
	                daemon=True,
	            )
	            worker.start()

	            # Filter common special tokens manually for TokenMonster compatibility
	            special_tokens_to_remove = [
	                tok
	                for tok in (tokenizer.eos_token, tokenizer.pad_token, tokenizer.bos_token, tokenizer.unk_token)
	                if tok is not None
	            ]

	            try:
	                for chunk in streamer:
	                    if interrupt_callback.stop_signal:
	                        break
	                    for st in special_tokens_to_remove:
	                        chunk = chunk.replace(st, "")
	                    results[i] += chunk
	                    yield results, time.perf_counter() - t0
	            finally:
	                worker.join(timeout=1.0)

	            if error_container[0]:
	                results[i] = f"❌ {error_container[0]}"
	                yield results, time.perf_counter() - t0

	            if interrupt_callback.stop_signal:
	                break

	        except Exception as e:
	            # Setup failures (unknown model key, load error, ...) are reported in
	            # this config's slot and the remaining configs still run.
	            results[i] = f"❌ {str(e)}"
	            yield results, time.perf_counter() - t0


	# ── Chat generation ─────────────────────────────────────────────────────────────
	def chat_generate(messages_state, user_message, model_key, max_tok, temp, tp, rep_penalty):
	    """Append the user's message, stream the model's reply, and update the chat.

	    The conversation is serialised as ``<role>content</role>`` lines followed
	    by an opening ``<assistant>`` tag and sent through
	    ``parallel_config_generate`` with a single config.

	    Args:
	        messages_state: list of ``{"role", "content"}`` dicts (``None`` is
	            treated as empty). It is mutated in place.
	        user_message: the new user turn; a falsy value sends nothing.
	        model_key, max_tok, temp, tp, rep_penalty: forwarded as the config.

	    Yields:
	        ``(chat_html, messages_state)`` after every streamed update and once
	        more after the reply (stripped) has been appended to the state.
	    """
	    if messages_state is None:
	        messages_state = []
	    if not user_message:
	        # Nothing to send: show the existing transcript again. Yielding an empty
	        # string here would blank the display while the history is still kept.
	        yield _render_chat_html(messages_state, model_key), messages_state
	        return

	    messages_state.append({"role": "user", "content": user_message})

	    turns = []
	    for msg in messages_state:
	        role = msg.get("role", "user")
	        content = msg.get("content", "")
	        turns.append(f"<{role}>{content}</{role}>\n")
	    # The open tag at the end asks the model to continue as the assistant.
	    conversation = "".join(turns) + "<assistant>"

	    config = [{"model_key": model_key, "max_tokens": max_tok, "temperature": temp, "top_p": tp, "repetition_penalty": rep_penalty}]

	    assistant_response = ""
	    for results, _ in parallel_config_generate(conversation, config):
	        assistant_response = results[0]
	        # Preview only: the partial reply is rendered but not stored yet.
	        temp_messages = messages_state + [{"role": "assistant", "content": assistant_response}]
	        yield _render_chat_html(temp_messages, model_key), messages_state

	    messages_state.append({"role": "assistant", "content": assistant_response.strip()})
	    yield _render_chat_html(messages_state, model_key), messages_state


	def _render_chat_html(messages, model_key):
	    """Render chat messages as a string of styled HTML bubbles.

	    Args:
	        messages: iterable of dicts with optional ``role`` (default "user")
	            and ``content`` (default "") keys.
	        model_key: label shown on every non-user bubble.

	    Returns:
	        The concatenated HTML, or an empty string when there are no messages.
	        Message text and the model label are HTML-escaped, so typed or
	        generated markup is displayed literally instead of being interpreted.
	    """
	    speaker = html.escape(str(model_key))
	    html_parts = []
	    for msg in messages:
	        role = msg.get("role", "user")
	        # Both user input and model output are untrusted text.
	        content = html.escape(str(msg.get("content", "")))
	        if role == "user":
	            html_parts.append(
	                f'<div style="margin-bottom:8px;padding:8px 12px;background:#0d1829;border:1px solid #1a2744;'
	                f'border-radius:8px;"><span style="font-weight:700;color:#38bdf8;font-size:11px;'
	                f'text-transform:uppercase;">You</span><p style="margin:4px 0 0;color:#ffffff;font-size:13px;">'
	                f'{content}</p></div>'
	            )
	        else:
	            html_parts.append(
	                f'<div style="margin-bottom:8px;padding:8px 12px;background:#080d1a;border:1px solid rgba(245,158,11,0.25);'
	                f'border-radius:8px;"><span style="font-weight:700;color:#f59e0b;font-size:11px;'
	                f'text-transform:uppercase;">{speaker}</span><p style="margin:4px 0 0;color:#ffffff;font-size:13px;">'
	                f'{content}</p></div>'
	            )
	    return "".join(html_parts)


	def chat_clear():
	    """Return the reset values for the chat: ``([], placeholder_html, "")``."""
	    placeholder = (
	        '<div style="padding:20px;text-align:center;background:#060a14;border:1px solid #1e2d45;'
	        'border-radius:10px;min-height:200px;display:flex;flex-direction:column;align-items:center;'
	        'justify-content:center;">'
	        '<p style="font-family:Sora,sans-serif;font-size:13px;color:#374151;margin:0;">'
	        'Start a conversation by typing a message below.</p></div>'
	    )
	    return [], placeholder, ""


	def apply_standard_preset(name: str):
	    """Return ``(max_tokens, temperature, top_p, repetition_penalty)`` for preset ``name``.

	    Raises ``KeyError`` when ``name`` is not a key of ``PRESETS``.
	    """
	    preset = PRESETS[name]
	    return tuple(
	        preset[field]
	        for field in ("max_tokens", "temperature", "top_p", "repetition_penalty")
	    )


	def apply_sweep_preset(name: str):
	    """Return ``(max_tokens, top_p, repetition_penalty)`` for preset ``name``.

	    Same lookup as the standard preset, but without the temperature value.
	    Raises ``KeyError`` when ``name`` is not a key of ``PRESETS``.
	    """
	    preset = PRESETS[name]
	    return tuple(preset[field] for field in ("max_tokens", "top_p", "repetition_penalty"))


	# ── Token Explorer (sampling + logprobs) ───────────────────────────────────────
	def run_token_explorer(prompt, model_key, max_tokens, temperature, top_p, repetition_penalty):
	    """Generate a continuation and render per-token probabilities as HTML.

	    Args:
	        prompt: text to continue; an empty prompt returns the placeholder.
	        model_key: display name of the model; an unknown name falls back to
	            ``DEFAULT_MODEL`` for loading, the token cap and the status line.
	        max_tokens: requested number of new tokens, clamped to the model's cap.
	        temperature, top_p, repetition_penalty: generation settings. As in the
	            other generation paths of this file, sampling is enabled only when
	            the temperature is at least 0.05.

	    Returns:
	        ``(html, status)``. Load and generation failures are returned as an
	        error paragraph with the status ``"Error"`` rather than raised.
	    """
	    text = (prompt or "").strip()
	    if not text:
	        return _explorer_placeholder(), "Enter a prompt."

	    # Resolve the fallback once so that the loaded model, the cap and the status
	    # line all refer to the same model (and a missing key cannot crash the cap).
	    if model_key not in ALL_MODELS:
	        model_key = DEFAULT_MODEL

	    interrupt_callback.stop_signal = False
	    try:
	        tokenizer, model = _get_model(ALL_MODELS[model_key])
	    except Exception as e:
	        return f'<p style="color:#f87171;padding:20px;">Error loading model: {html.escape(str(e))}</p>', "Error"

	    max_tokens = min(int(max_tokens), _max_tokens_cap(model_key))
	    do_sample = float(temperature) >= 0.05
	    inputs = _to_model_device(_prep_inputs(tokenizer, text), model)
	    try:
	        with torch.inference_mode():
	            outputs = model.generate(
	                **inputs,
	                max_new_tokens=int(max_tokens),
	                do_sample=do_sample,
	                temperature=float(temperature),
	                top_p=float(top_p),
	                repetition_penalty=float(repetition_penalty),
	                output_scores=True,
	                return_dict_in_generate=True,
	                pad_token_id=tokenizer.eos_token_id,
	                stopping_criteria=StoppingCriteriaList([interrupt_callback]),
	            )
	    except Exception as e:
	        return f'<p style="color:#f87171;padding:20px;">Generation error: {html.escape(str(e))}</p>', "Error"

	    input_len = inputs["input_ids"].shape[1]
	    generated_ids = outputs.sequences[0][input_len:]

	    token_data = []
	    for score_t, token_id in zip(outputs.scores, generated_ids):
	        # Softmax in float32 so half-precision scores do not degrade the numbers.
	        probs = F.softmax(score_t[0].float(), dim=-1)
	        top_k = torch.topk(probs, min(8, probs.shape[-1]))
	        token_data.append({
	            "token": tokenizer.decode([token_id.item()]),
	            "prob": probs[token_id].item(),
	            "alternatives": [
	                {"token": tokenizer.decode([idx.item()]), "prob": p.item()}
	                for idx, p in zip(top_k.indices, top_k.values)
	            ],
	        })

	    # Named ``rendered`` (not ``html``) so the ``html`` module stays usable above.
	    rendered = _build_token_html(text, token_data)
	    decoding = "sampled" if do_sample else "greedy"
	    return rendered, f"✓ {len(token_data)} tokens · {decoding} · {model_key}"


	# Confidence bands, most confident first:
	# (minimum probability, text colour, background colour, border colour).
	# The last band has no lower bound and catches everything else.
	_CONFIDENCE_BANDS = (
	    (0.80, "#10b981", "rgba(16,185,129,0.18)", "rgba(16,185,129,0.40)"),
	    (0.50, "#eab308", "rgba(234,179,8,0.18)", "rgba(234,179,8,0.40)"),
	    (0.35, "#f97316", "rgba(249,115,22,0.14)", "rgba(249,115,22,0.40)"),
	    (float("-inf"), "#b91c1c", "rgba(185,28,28,0.12)", "rgba(185,28,28,0.40)"),
	)


	def _confidence_band(p):
	    """Return the index into ``_CONFIDENCE_BANDS`` of the band containing ``p``."""
	    for index, band in enumerate(_CONFIDENCE_BANDS):
	        if p >= band[0]:
	            return index
	    # Only reachable for NaN, which fails every comparison: treat it as lowest.
	    return len(_CONFIDENCE_BANDS) - 1


	def _tok_style(p):
	    """Return ``(text colour, background, border)`` for a token of probability ``p``."""
	    return _CONFIDENCE_BANDS[_confidence_band(p)][1:]


	def _build_token_html(prompt_text, token_data):
	    """Render the prompt plus colour-coded generated tokens with a legend.

	    Args:
	        prompt_text: the prompt, shown (escaped) in front of the tokens.
	        token_data: list of dicts with ``token`` (str), ``prob`` (float) and
	            ``alternatives`` (list of ``{"token", "prob"}`` dicts; at most
	            the first six are listed in the tooltip).

	    Returns:
	        Legend HTML followed by the token block, or a short "No tokens
	        generated." paragraph when ``token_data`` is empty.
	    """
	    if not token_data:
	        return '<p style="color:#64748b;padding:20px;">No tokens generated.</p>'

	    avg_p = sum(td["prob"] for td in token_data) / len(token_data)
	    # Legend counts use the same bands as the colours, so they cannot disagree.
	    counts = [0] * len(_CONFIDENCE_BANDS)
	    for td in token_data:
	        counts[_confidence_band(td["prob"])] += 1
	    high, med, unsure, low = counts

	    spans = []
	    for td in token_data:
	        raw = td["token"]
	        p = td["prob"]
	        pct = int(p * 100)
	        col, bg, brd = _tok_style(p)

	        disp = html.escape(raw).replace("\n", "↵")
	        if not disp.strip():
	            disp = "·"

	        # Build the tooltip as plain text and escape it exactly once. Escaping
	        # the alternatives first and the whole string again double-encodes
	        # entities (``&`` would show up as ``&amp;`` in the tooltip).
	        alts = " | ".join(
	            f'{a["token"].strip() or "·"} {a["prob"]*100:.0f}%'
	            for a in td["alternatives"][:6]
	        )
	        tip = html.escape(f"Token: {raw.strip() or repr(raw)} ({pct}%)\nAlternatives: {alts}").replace("\n", " ")

	        spans.append(
	            f'<span title="{tip}" '
	            f'style="background:{bg};color:{col};border:1px solid {brd};'
	            f'padding:3px 7px;border-radius:5px;margin:2px 1px;display:inline-block;'
	            f'font-family:Space Mono,monospace;font-size:13px;line-height:2.2;'
	            f'cursor:help;transition:transform 0.1s,box-shadow 0.1s;" '
	            f'onmouseover="this.style.transform=\'scale(1.1)\';this.style.boxShadow=\'0 0 12px {brd}\'" '
	            f'onmouseout="this.style.transform=\'\';this.style.boxShadow=\'\'">'
	            f'{disp}<sup style="font-size:8px;opacity:0.6;margin-left:2px;">{pct}%</sup>'
	            f'</span>'
	        )

	    prompt_span = (
	        f'<span style="color:#374151;font-family:Space Mono,monospace;font-size:13px;">'
	        f'{html.escape(prompt_text)}'
	        f'</span>'
	    )

	    token_block = (
	        f'<div style="padding:20px;background:#060a14;border:1px solid #1e2d45;border-radius:10px;'
	        f'line-height:2.4;word-wrap:break-word;min-height:80px;">'
	        + prompt_span + "".join(spans)
	        + "</div>"
	    )

	    legend = (
	        f'<div style="margin-bottom:14px;">'
	        f'<div style="display:flex;gap:10px;margin-bottom:10px;flex-wrap:wrap;align-items:center;">'
	        f'<span style="font-family:Sora,sans-serif;font-size:13px;color:#64748b;">Avg confidence: '
	        f'<strong style="color:#ffffff;">{int(avg_p*100)}%</strong></span>'
	        f'<span style="font-family:Sora,sans-serif;font-size:11px;color:#4a5568;margin-left:4px;">· Hover any token to see top alternatives</span>'
	        f'</div>'
	        f'<div style="display:flex;gap:8px;flex-wrap:wrap;">'
	        f'<div style="display:flex;align-items:center;gap:6px;">'
	        f'<div style="width:12px;height:12px;background:#10b981;border-radius:3px;"></div>'
	        f'<span style="font-family:Space Mono,monospace;font-size:11px;color:#94a3b8;">≥80% confident · {high} tokens</span></div>'
	        f'<div style="display:flex;align-items:center;gap:6px;">'
	        f'<div style="width:12px;height:12px;background:#eab308;border-radius:3px;"></div>'
	        f'<span style="font-family:Space Mono,monospace;font-size:11px;color:#94a3b8;">50–79% moderate · {med} tokens</span></div>'
	        f'<div style="display:flex;align-items:center;gap:6px;">'
	        f'<div style="width:12px;height:12px;background:#f97316;border-radius:3px;"></div>'
	        f'<span style="font-family:Space Mono,monospace;font-size:11px;color:#94a3b8;">35–49% uncertain · {unsure} tokens</span></div>'
	        f'<div style="display:flex;align-items:center;gap:6px;">'
	        f'<div style="width:12px;height:12px;background:#b91c1c;border-radius:3px;"></div>'
	        # "<" is written as an entity so it can never be read as the start of a tag.
	        f'<span style="font-family:Space Mono,monospace;font-size:11px;color:#94a3b8;">&lt;35% low · {low} tokens</span></div>'
	        f'</div></div>'
	    )

	    return legend + token_block


	def _explorer_placeholder():
	    """Return the static HTML shown in the Token Explorer before any run."""
	    pieces = [
	        # Outer card
	        '<div style="padding:32px;text-align:center;background:#060a14;border:1px solid #1e2d45;',
	        'border-radius:10px;min-height:120px;display:flex;flex-direction:column;align-items:center;justify-content:center;">',
	        # Title
	        '<div style="font-family:Bebas Neue,sans-serif;font-size:28px;letter-spacing:0.08em;color:#1e2d45;margin-bottom:8px;">TOKEN EXPLORER</div>',
	        # Hint text
	        '<p style="font-family:Sora,sans-serif;font-size:13px;color:#374151;margin:0;">',
	        'Enter a prompt and click Explore to see per-token confidence heatmap</p>',
	        '</div>',
	    ]
	    return "".join(pieces)


	# ── Temperature Sweep ──────────────────────────────────────────────────────────
	def run_temp_sweep_streamed(prompt, model_key, max_tok, tp, rep_penalty, count, *temps):
	    """Stream one generation per temperature for the same prompt.

	    Only the first ``count`` values of ``temps`` are used. Every yielded value
	    is a 5-tuple of strings; slots at or beyond ``count`` are empty. A blank
	    prompt, or temperatures that collide after rounding to two decimals, yield
	    a single 5-tuple repeating the corresponding message.
	    """
	    count_int = int(count)
	    if not (prompt or "").strip():
	        yield ("Enter a prompt.",) * 5
	        return

	    active_temps = [float(t) for t in temps[:count_int]]

	    # Validate: No duplicates
	    if len({round(t, 2) for t in active_temps}) != len(active_temps):
	        yield ("⚠️ Duplicate temperatures detected. Please make each temperature unique.",) * 5
	        return

	    # One config per temperature; every other setting is shared.
	    configs = [
	        {
	            "model_key": model_key, "max_tokens": max_tok,
	            "temperature": t, "top_p": tp, "repetition_penalty": rep_penalty
	        }
	        for t in active_temps
	    ]

	    for results, _ in parallel_config_generate(prompt, configs):
	        yield tuple(results[i] if i < count_int else "" for i in range(5))


	# ── History helpers ────────────────────────────────────────────────────────────
	def add_to_history(history, prompt, output, model_key, temperature):
	    """Return the history with a new entry in front, capped at ten entries.

	    When ``prompt`` or ``output`` is empty nothing is added and the existing
	    history (or an empty list) is returned. The stored prompt is cut to 55
	    characters plus an ellipsis, the stripped output to 100 characters, and
	    the temperature is rounded to one decimal. ``time`` is the current local
	    time formatted as ``HH:MM``.
	    """
	    previous = history or []
	    if not output or not prompt:
	        return previous

	    short_prompt = prompt[:55]
	    if len(prompt) > 55:
	        short_prompt += "…"

	    entry = {
	        "prompt": short_prompt,
	        "output": output.strip()[:100],
	        "model": model_key,
	        "temp": round(float(temperature), 1),
	        "time": datetime.now().strftime("%H:%M"),
	    }
	    # Newest first; anything past the tenth entry is dropped.
	    combined = [entry] + previous
	    return combined[:10]


	def build_history_html(history):
	    """Render the run history as an HTML panel.

	    Args:
	        history: list of entry dicts with the keys ``time``, ``model``,
	            ``temp``, ``prompt`` and ``output``, newest first; may be empty
	            or ``None``.

	    Returns:
	        A "No generations yet" block for an empty history, otherwise a panel
	        with a header and one row per entry. Rows fade with age (opacity
	        drops by 0.07 per row, never below 0.35). All entry text is
	        HTML-escaped, because prompts and model outputs are untrusted.
	    """
	    if not history:
	        return (
	            '<div style="padding:24px 16px;text-align:center;">'
	            '<div style="font-family:Sora,sans-serif;font-size:12px;color:#374151;">No generations yet</div>'
	            '</div>'
	        )
	    rows = []
	    for i, e in enumerate(history):
	        fade = max(0.35, 1.0 - i * 0.07)
	        # Escape everything that originates from the user or the model.
	        meta = html.escape(f'{e["time"]} · {e["model"]} · t={e["temp"]}')
	        prompt_text = html.escape(str(e["prompt"]))
	        output_text = html.escape(str(e["output"] or "…"))
	        rows.append(
	            f'<div style="padding:10px 14px;border-bottom:1px solid #0d1829;opacity:{fade:.2f};">'
	            f'<div style="font-family:Space Mono,monospace;font-size:10px;color:#f59e0b;margin-bottom:3px;letter-spacing:0.04em;">'
	            f'{meta}</div>'
	            f'<div style="font-family:Sora,sans-serif;font-size:12px;color:#cbd5e1;white-space:nowrap;overflow:hidden;text-overflow:ellipsis;margin-bottom:2px;">'
	            f'<strong style="color:#ffffff;">{prompt_text}</strong></div>'
	            f'<div style="font-family:Sora,sans-serif;font-size:11px;color:#4a5568;white-space:nowrap;overflow:hidden;text-overflow:ellipsis;">'
	            f'{output_text}</div>'
	            f'</div>'
	        )
	    return (
	        '<div style="background:#0a0f1e;border:1px solid #1e2d45;border-radius:10px;overflow:hidden;">'
	        '<div style="padding:10px 14px;border-bottom:1px solid #1e2d45;background:#060a14;display:flex;align-items:center;gap:8px;">'
	        '<span style="font-family:Bebas Neue,sans-serif;font-size:16px;letter-spacing:0.1em;color:#ffffff;">HISTORY</span>'
	        f'<span style="font-family:Space Mono,monospace;font-size:10px;color:#374151;">{len(history)} runs</span>'
	        '</div>'
	        + "".join(rows)
	        + '</div>'
	    )


	# ── Stats HTML ─────────────────────────────────────────────────────────────────
	def build_stats_html(tokens, elapsed, tps):
	    """Return a three-column grid of stat pills: Tokens, Time and Speed.

	    Each argument is interpolated as-is above its label.
	    """
	    cells = []
	    for val, lbl in ((tokens, "Tokens"), (elapsed, "Time"), (tps, "Speed")):
	        cells.append(
	            f'<div style="background:#0d1829;border:1px solid #1e2d45;border-radius:8px;padding:10px;text-align:center;">'
	            f'<span style="font-family:Space Mono,monospace;font-size:15px;font-weight:700;color:#f59e0b;display:block;line-height:1;margin-bottom:3px;">{val}</span>'
	            f'<span style="font-family:Sora,sans-serif;font-size:10px;font-weight:600;letter-spacing:0.12em;text-transform:uppercase;color:#374151;">{lbl}</span>'
	            f'</div>'
	        )
	    grid_open = '<div style="display:grid;grid-template-columns:repeat(3,1fr);gap:8px;margin-top:8px;">'
	    return grid_open + "".join(cells) + '</div>'


	def _on_generate_model_change(model_key: str, current_max_tokens: float):
	    """Return the model card and a max-tokens update for the newly selected model.

	    The new value is the current one raised to at least ``DEFAULT_MAX_TOKENS``
	    and then limited to the model's cap; the cap also becomes the new maximum.
	    The card is looked up in ``MODEL_CARDS`` and defaults to an empty string.
	    """
	    cap = _max_tokens_cap(model_key)
	    floored = max(DEFAULT_MAX_TOKENS, current_max_tokens)
	    value = int(min(floored, cap))
	    card = MODEL_CARDS.get(model_key, "")
	    return card, gr.update(maximum=cap, value=value)


	def _on_model_cap_change(model_key: str, current_max_tokens: float, min_value: int = 5):
	    """Return a max-tokens update clamped to ``[min_value, cap]`` for ``model_key``.

	    The model's cap becomes the new maximum; the current value is raised to at
	    least ``min_value`` and then limited to the cap.
	    """
	    cap = _max_tokens_cap(model_key)
	    floored = max(min_value, current_max_tokens)
	    clamped = int(min(floored, cap))
	    return gr.update(maximum=cap, value=clamped)


	# ── Arena ─────────────────────────────────────────────────────────────────────
	def arena_generate(prompt, mode, model1_key, model2_key, max_tok, temp, tp, rep_penalty, fair_match, internal_pair):
	    """Run two models on the same prompt, one after the other, streaming both.

	    Args:
	        prompt: text to continue; a blank prompt yields one "Enter a prompt."
	            frame and stops.
	        mode: "👀 Show" uses the two given models and shows their names;
	            "🎲 Random" picks both models; "🙈 Blind" uses the given models
	            but replaces the left one with a random Stentor model when both
	            picks are external. Outside Show mode the sides are swapped at
	            random and the outputs are labelled "Model A"/"Model B".
	        model1_key, model2_key: display names of the selected models.
	        max_tok, temp, tp, rep_penalty: generation settings for both models.
	        fair_match: in Random mode, restrict the opponent to the left model's
	            size tier.
	        internal_pair: in Random mode, allow another Stentor model as opponent.

	    Yields:
	        8-tuples: left output update, right output update, left status, right
	        status, an update whose visibility is "not Show mode", an update for
	        the sixth output's visibility, the ``(left, right)`` model names (or
	        ``None``), and an empty string.
	    """
	    text = (prompt or "").strip()
	    show_mode = (mode == "👀 Show")
	    show_vote = not show_mode
	    if not text:
	        yield (
	            gr.update(value="", label="Left Model Output"),
	            gr.update(value="", label="Right Model Output"),
	            "Enter a prompt.",
	            "Enter a prompt.",
	            gr.update(visible=show_vote),
	            gr.update(visible=True),
	            None,
	            "",
	        )
	        return

	    interrupt_callback.stop_signal = False
	    m1 = model1_key
	    m2 = model2_key

	    if mode == "🎲 Random":
	        m1 = random.choice(list(STENTOR_MODELS.keys()))
	        m2 = _arena_random_opponent(m1, fair_match, internal_pair)
	    elif mode == "🙈 Blind":
	        # Rule: No External vs External. If user picks two, force left to a Stentor model.
	        if m1 in ARENA_MODELS and m2 in ARENA_MODELS:
	            m1 = random.choice(list(STENTOR_MODELS.keys()))

	    # Swap randomly in Blind/Random so "Left" isn't always the same type of model
	    if not show_mode and random.random() > 0.5:
	        m1, m2 = m2, m1

	    display_name1 = m1 if show_mode else "Model A"
	    display_name2 = m2 if show_mode else "Model B"

	    label1 = f"{display_name1} Output"
	    label2 = f"{display_name2} Output"

	    def frame(out1, out2, status1, status2, sixth_visible):
	        # One UI update; only the texts, statuses and the sixth output's
	        # visibility vary between frames.
	        return (
	            gr.update(value=out1, label=label1),
	            gr.update(value=out2, label=label2),
	            status1,
	            status2,
	            gr.update(visible=show_vote),
	            gr.update(visible=sixth_visible),
	            (m1, m2),
	            "",
	        )

	    thinking1 = f"⚡ {display_name1} is thinking…"
	    waiting2 = f"Waiting for {display_name2}…"
	    finished1 = f"✓ {display_name1} Finished"

	    # Initial thinking update to clear UI immediately
	    yield frame("", "", thinking1, waiting2, False)

	    # Run model 1
	    # NOTE(review): a stop requested during model 1 only ends this loop; the
	    # second run resets the shared stop flag when it starts. Confirm whether a
	    # stop is meant to cancel the whole battle.
	    output1 = ""
	    for partial in _generate_and_stream(ALL_MODELS[m1], text, max_tok, temp, tp, rep_penalty):
	        if interrupt_callback.stop_signal:
	            break
	        output1 = partial
	        yield frame(output1, "", thinking1, waiting2, False)

	    # Run model 2
	    output2 = ""
	    start2 = time.time()
	    for partial in _generate_and_stream(ALL_MODELS[m2], text, max_tok, temp, tp, rep_penalty):
	        if interrupt_callback.stop_signal:
	            break
	        output2 = partial
	        elapsed = time.time() - start2
	        yield frame(output1, output2, finished1, f"⚡ {display_name2} is thinking… ({elapsed:.1f}s)", False)

	    yield frame(output1, output2, finished1, f"✓ {display_name2} Finished", show_mode)


	def _arena_size_tier(model_key):
	    """Return the size tier ("12m" … "150m") named in ``model_key``, or ``None``.

	    The size is matched together with the hyphen in front of it. A plain
	    substring test is wrong here: "50M" is contained in "Portimbria-150M",
	    which would put the 150M model into the 50M tier.
	    """
	    for size in ("12M", "20M", "30M", "50M", "150M"):
	        if f"-{size}" in model_key:
	            return size.lower()
	    return None


	def _arena_random_opponent(m1, fair_match, internal_pair):
	    """Pick the Random-mode opponent for the Stentor model ``m1``."""
	    potential_m2 = list(ARENA_MODELS.keys())
	    if internal_pair:
	        potential_m2 += [k for k in STENTOR_MODELS if k != m1]

	    if not fair_match:
	        return random.choice(potential_m2)

	    # External models considered a fair opponent for each Stentor size tier.
	    fair_external_allowlist = {
	        "12m": ["Pythia-14M"],
	        "20m": ["Pythia-14M", "Pythia-31M"],
	        "30m": ["Pythia-31M"],
	        "50m": ["Pythia-31M", "Pythia-70M", "NanoWhale-100M-Base"],
	        "150m": ["gpt2 small", "SmolLM2-135M", "NanoWhale-100M-Base", "Pythia-160M", "OPT-125M", "GPT-Neo 125M"],
	    }

	    tier = _arena_size_tier(m1)
	    if tier is None:
	        # Fallback safety for a model without a recognised size tier.
	        allowed_m2 = [k for k in potential_m2 if k != m1]
	    else:
	        allowed_m2 = list(fair_external_allowlist[tier])
	        if internal_pair:
	            # Other Stentor models of the same tier are fair opponents too.
	            allowed_m2 += [k for k in STENTOR_MODELS if k != m1 and _arena_size_tier(k) == tier]

	    # An empty allow-list falls back to the unrestricted pool.
	    return random.choice(allowed_m2 or potential_m2)


	def arena_setup(mode):
	    """Return the idle arena state for ``mode`` as an 8-tuple.

	    Both outputs are cleared and given generic labels, both statuses read
	    "Waiting...", the fifth update is visible only outside "👀 Show" mode,
	    the sixth update is hidden, no identities are stored, and the last
	    value is an empty string.
	    """
	    vote_visible = mode != "👀 Show"
	    left_box = gr.update(value="", label="Left Model Output")
	    right_box = gr.update(value="", label="Right Model Output")
	    return (
	        left_box,
	        right_box,
	        "Waiting...",
	        "Waiting...",
	        gr.update(visible=vote_visible),
	        gr.update(visible=False),
	        None,
	        "",
	    )


	def arena_vote(vote_type, identities):
	    """Return the vote summary text and an update with ``visible=False``.

	    ``identities`` is the ``(left, right)`` pair of model names. When it is
	    empty or ``None`` a reminder to run a battle first is returned instead.
	    """
	    hidden = gr.update(visible=False)
	    if not identities:
	        return "Please run a battle first.", hidden

	    left_name, right_name = identities
	    summary = (
	        f"### Decision Recorded! \n\nWinner: {vote_type}\n\n"
	        f"Left was: `{left_name}`\nRight was: `{right_name}`"
	    )
	    return summary, hidden


	# ─────────────────────────────────── CSS ──────────────────────────────────────
	CSS = """
	@import url('https://fonts.googleapis.com/css2?family=Bebas+Neue&family=Space+Mono:wght@400;700&family=Sora:wght@300;400;500;600;700&display=swap');

	.gradio-container, .gradio-container * { box-sizing: border-box !important; }

	.gradio-container {
	background: #04060e !important;
	max-width: 1280px !important;
	margin: 0 auto !important;
	padding: 0 !important;
	font-family: 'Sora', sans-serif !important;
	color: #ffffff !important;
	}

	body,
	.gradio-container > div,
	.gradio-container .contain,
	.gradio-container .wrap,
	.gradio-container section,
	.gradio-container .tabs,
	.gradio-container .tabitem,
	.gradio-container > div > div,
	.gradio-container .block {
	background: #04060e !important;
	border-color: #1a2744 !important;
	}

	.gradio-container .block {
	box-shadow: none !important;
	border-radius: 0 !important;
	padding: 0 !important;
	border: none !important;
	}

	footer { display: none !important; }

	.gradio-container p,
	.gradio-container span,
	.gradio-container div,
	.gradio-container li,
	.gradio-container td,
	.gradio-container th {
	color: #ffffff !important;
	font-family: 'Sora', sans-serif !important;
	}

	.gradio-container label,
	.gradio-container label span,
	.gradio-container .label-wrap span {
	font-family: 'Sora', sans-serif !important;
	font-size: 11px !important;
	font-weight: 600 !important;
	letter-spacing: 0.1em !important;
	text-transform: uppercase !important;
	color: #374151 !important;
	}

	.gradio-container textarea,
	.gradio-container input[type="text"],
	.gradio-container input[type="number"] {
	font-family: 'Sora', sans-serif !important;
	background: #0d1829 !important;
	border: 1px solid #1a2744 !important;
	color: #ffffff !important;
	border-radius: 8px !important;
	font-size: 14px !important;
	}

	.gradio-container textarea:focus,
	.gradio-container input:focus {
	border-color: #78490a !important;
	box-shadow: 0 0 0 3px rgba(245,158,11,0.07) !important;
	outline: none !important;
	}

	#prompt-box textarea {
	font-size: 15px !important;
	line-height: 1.75 !important;
	min-height: 120px !important;
	}

	#output-box textarea {
	font-family: 'Space Mono', monospace !important;
	font-size: 13px !important;
	line-height: 1.85 !important;
	color: #ffffff !important;
	background: #060a14 !important;
	border-color: #1a2744 !important;
	}

	/* Stop fading during generation updates */
	.gradio-container textarea { transition: none !important; opacity: 1 !important; }

	.status-bar textarea {
	font-family: 'Space Mono', monospace !important;
	font-size: 12px !important;
	color: #374151 !important;
	background: #0a0f1e !important;
	border-color: #1a2744 !important;
	padding: 6px 10px !important;
	}

	.gradio-container input[type="range"] { accent-color: #f59e0b !important; }
	.gradio-container input[type="number"] {
	background: #0d1829 !important;
	color: #e2e8f0 !important;
	border: 1px solid #1a2744 !important;
	font-family: 'Space Mono', monospace !important;
	font-size: 13px !important;
	width: 64px !important;
	}

	.gradio-container [role="tablist"] {
	background: #04060e !important;
	border-bottom: 1px solid #1a2744 !important;
	padding: 0 36px !important;
	gap: 0 !important;
	}

	.gradio-container [role="tab"] {
	font-family: 'Sora', sans-serif !important;
	font-size: 13px !important;
	font-weight: 500 !important;
	color: #374151 !important;
	background: transparent !important;
	border: none !important;
	border-bottom: 2px solid transparent !important;
	border-radius: 0 !important;
	padding: 14px 20px !important;
	letter-spacing: 0.03em !important;
	transition: color 0.15s !important;
	}

	.gradio-container [role="tab"]:hover { color: #94a3b8 !important; background: transparent !important; }

	.gradio-container [role="tab"][aria-selected="true"],
	.gradio-container [role="tab"].selected {
	color: #f59e0b !important;
	border-bottom: 2px solid #f59e0b !important;
	background: transparent !important;
	}

	.gradio-container [role="tabpanel"],
	.gradio-container .tabitem {
	background: #04060e !important;
	padding: 28px 36px !important;
	border: none !important;
	}

	.gradio-container fieldset {
	background: transparent !important;
	border: none !important;
	padding: 0 !important;
	gap: 6px !important;
	}

	.gradio-container fieldset label {
	background: #0d1829 !important;
	border: 1px solid #1a2744 !important;
	border-radius: 8px !important;
	padding: 8px 14px !important;
	cursor: pointer !important;
	color: #64748b !important;
	font-size: 13px !important;
	font-weight: 500 !important;
	text-transform: none !important;
	letter-spacing: 0 !important;
	transition: all 0.15s !important;
	}

	.gradio-container fieldset label:has(input:checked) {
	background: rgba(245,158,11,0.1) !important;
	border-color: #f59e0b !important;
	color: #f59e0b !important;
	}

	.gradio-container button {
	font-family: 'Sora', sans-serif !important;
	cursor: pointer !important;
	transition: all 0.18s !important;
	border-radius: 8px !important;
	}

	.gradio-container button.primary,
	.gradio-container button[variant="primary"] {
	background: #f59e0b !important;
	color: #07090f !important;
	border: none !important;
	font-size: 13px !important;
	font-weight: 700 !important;
	letter-spacing: 0.07em !important;
	text-transform: uppercase !important;
	padding: 11px 22px !important;
	position: relative !important;
	overflow: hidden !important;
	}

	.gradio-container button.primary::after {
	content: '' !important;
	position: absolute !important;
	inset: 0 !important;
	background: linear-gradient(120deg, transparent 30%, rgba(255,255,255,0.15) 50%, transparent 70%) !important;
	transform: translateX(-100%) !important;
	transition: transform 0.4s !important;
	}

	.gradio-container button.primary:hover::after { transform: translateX(100%) !important; }
	.gradio-container button.primary:hover {
	background: #fbbf24 !important;
	box-shadow: 0 0 28px rgba(245,158,11,0.4) !important;
	transform: translateY(-1px) !important;
	}

	.gradio-container button.secondary,
	.gradio-container button[variant="secondary"] {
	background: #0d1829 !important;
	color: #64748b !important;
	border: 1px solid #1a2744 !important;
	font-size: 13px !important;
	font-weight: 500 !important;
	padding: 10px 18px !important;
	}

	.gradio-container button.secondary:hover {
	background: #111d30 !important;
	color: #e2e8f0 !important;
	border-color: #2a3f60 !important;
	}

	.prompt-chip {
	background: transparent !important;
	border: 1px solid #1a2744 !important;
	color: #374151 !important;
	font-size: 11px !important;
	font-weight: 400 !important;
	padding: 5px 11px !important;
	border-radius: 16px !important;
	white-space: nowrap !important;
	overflow: hidden !important;
	text-overflow: ellipsis !important;
	max-width: 200px !important;
	text-transform: none !important;
	letter-spacing: 0 !important;
	}

	.prompt-chip:hover {
	border-color: #0e4a6a !important;
	color: #38bdf8 !important;
	background: rgba(56,189,248,0.05) !important;
	}

	.preset-chip {
	background: #0d1829 !important;
	border: 1px solid #1a2744 !important;
	color: #64748b !important;
	font-size: 12px !important;
	font-weight: 600 !important;
	padding: 6px 14px !important;
	border-radius: 20px !important;
	text-transform: none !important;
	letter-spacing: 0 !important;
	}

	.preset-chip:hover {
	border-color: #78490a !important;
	color: #f59e0b !important;
	background: rgba(245,158,11,0.07) !important;
	}

	.mode-caption {
	margin: 10px 0 0 0;
	font-family: 'Sora', sans-serif;
	font-size: 12px;
	line-height: 1.5;
	color: #94a3b8;
	}

	.mode-caption strong {
	color: #f59e0b;
	}

	@keyframes shimmer { 0%, 100% { opacity: 1; } 50% { opacity: 0.6; } }
	@keyframes pulse-border { 0%,100%{border-color:#1a2744} 50%{border-color:#2a3f60} }

	.stentor-header {
	position: relative;
	padding: 52px 40px 44px;
	overflow: hidden;
	border-bottom: 1px solid #1a2744;
	background: #04060e;
	}

	.stentor-header::before {
	content: '';
	position: absolute;
	inset: 0;
	background:
	radial-gradient(ellipse 70% 55% at 50% -10%, rgba(245,158,11,0.06) 0%, transparent 65%),
	repeating-linear-gradient(90deg, transparent, transparent 79px, rgba(26,39,68,0.2) 80px),
	repeating-linear-gradient(0deg, transparent, transparent 79px, rgba(26,39,68,0.2) 80px);
	pointer-events: none;
	}

	.stentor-header::after {
	content: '';
	position: absolute;
	top: 0; left: 0; right: 0; height: 2px;
	background: linear-gradient(90deg, transparent 0%, #f59e0b 50%, transparent 100%);
	animation: shimmer 5s ease-in-out infinite;
	}

	.header-inner {
	position: relative; z-index: 1;
	display: flex; align-items: flex-end;
	justify-content: space-between; gap: 20px; flex-wrap: wrap;
	}

	.stentor-header h1,
	.stentor-header h1 * {
	color: #ffffff !important;
	-webkit-text-fill-color: #ffffff !important;
	}

	.wordmark-eyebrow {
	font-family: 'Sora', sans-serif;
	font-size: 11px; font-weight: 600;
	letter-spacing: 0.3em; text-transform: uppercase;
	color: #f59e0b; display: block; margin-bottom: 6px;
	}

	.wordmark-title {
	font-family: 'Bebas Neue', sans-serif !important;
	font-size: clamp(56px, 9vw, 96px) !important;
	line-height: 0.88 !important;
	color: #ffffff !important;
	-webkit-text-fill-color: #ffffff !important;
	margin: 0 !important; display: block !important;
	letter-spacing: 0.02em !important;
	}

	.wordmark-sub {
	font-family: 'Space Mono', monospace;
	font-size: 11px; color: #374151;
	margin-top: 10px; display: block; letter-spacing: 0.04em;
	}

	.header-badges { display: flex; flex-direction: column; align-items: flex-end; gap: 8px; }
	.badge-row { display: flex; gap: 6px; flex-wrap: wrap; justify-content: flex-end; }
	.badge { font-family: 'Space Mono', monospace; font-size: 10px; padding: 4px 10px; border-radius: 4px; font-weight: 700; display: inline-block; }
	.badge-gold { background: rgba(245,158,11,0.12); color: #f59e0b; border: 1px solid rgba(245,158,11,0.3); }
	.badge-ice { background: rgba(56,189,248,0.08); color: #38bdf8; border: 1px solid rgba(56,189,248,0.25); }
	.badge-green { background: rgba(16,185,129,0.08); color: #10b981; border: 1px solid rgba(16,185,129,0.25); }

	.section-title {
	font-family: 'Bebas Neue', sans-serif !important;
	font-size: 24px !important; letter-spacing: 0.09em !important;
	color: #ffffff !important; margin: 0 0 16px 0 !important;
	padding-bottom: 10px !important; border-bottom: 1px solid #1a2744 !important;
	line-height: 1 !important; display: block;
	}

	.model-card { background: #080d1a; border: 1px solid #1a2744; border-radius: 10px; padding: 14px; margin-bottom: 12px; }
	.model-card-title { font-family: 'Bebas Neue', sans-serif; font-size: 20px; letter-spacing: 0.06em; color: #ffffff; margin: 0 0 10px 0; line-height: 1; }
	.model-attr { display: flex; justify-content: space-between; align-items: center; padding: 4px 0; border-bottom: 1px solid #0d1829; font-size: 12px; }
	.model-attr:last-child { border-bottom: none; }
	.attr-key { font-family: 'Sora', sans-serif; color: #374151; font-weight: 500; }
	.attr-val { font-family: 'Space Mono', monospace; color: #38bdf8; font-size: 11px; }

	.explorer-info {
	background: rgba(245,158,11,0.05);
	border: 1px solid rgba(245,158,11,0.2);
	border-radius: 8px;
	padding: 12px 16px;
	margin-bottom: 16px;
	font-family: 'Sora', sans-serif;
	font-size: 12px;
	color: #94a3b8;
	line-height: 1.6;
	}

	.about-grid { display: grid; grid-template-columns: 1fr 1fr; gap: 16px; margin-top: 20px; }
	.about-block { background: #080d1a; border: 1px solid #1a2744; border-radius: 10px; padding: 18px; }
	.about-block h3 { font-family: 'Bebas Neue', sans-serif !important; font-size: 17px !important; letter-spacing: 0.07em !important; color: #f59e0b !important; margin: 0 0 12px 0 !important; }
	.about-block p { font-size: 13px; color: #64748b; line-height: 1.7; margin: 0; }
	.about-block li { font-size: 13px; color: #64748b; line-height: 1.85; }
	.about-block a { color: #38bdf8 !important; text-decoration: none !important; }
	.about-block a:hover { text-decoration: underline !important; }

	.arch-table { width: 100%; border-collapse: collapse; font-family: 'Space Mono', monospace; font-size: 12px; }
	.arch-table td { padding: 5px 6px; border-bottom: 1px solid #1a2744; }
	.arch-table td:first-child { color: #374151; font-size: 11px; }
	.arch-table td:last-child { color: #38bdf8; text-align: right; }

	::-webkit-scrollbar { width: 5px; height: 5px; }
	::-webkit-scrollbar-track { background: #080d1a; }
	::-webkit-scrollbar-thumb { background: #1a2744; border-radius: 3px; }
	::-webkit-scrollbar-thumb:hover { background: #2a3f60; }

	@media (max-width: 800px) {
	.stentor-header { padding: 28px 16px 24px; }
	.gradio-container [role="tabpanel"] { padding: 16px !important; }
	.about-grid { grid-template-columns: 1fr; }
	.header-badges { display: none; }
	}
	"""

	# ───────────────────────────────── HTML ────────────────────────────────────────
	# Static banner markup for the top of the page; it is handed to gr.HTML(HEADER_HTML)
	# as the first component inside the gr.Blocks layout.
	# Styling comes from the .stentor-header, .wordmark-* and .badge-* rules in the
	# stylesheet string defined earlier in this file.
	# The badge rows are hard-coded text (model names, context sizes, datasets), so they
	# have to be edited by hand whenever the model line-up changes.
	HEADER_HTML = """
	<div class="stentor-header">
	<div class="header-inner">
	<div>
	<span class="wordmark-eyebrow">Model Showcase</span>
	<h1 class="wordmark-title">STENTOR<br>LABS</h1>
	<span class="wordmark-sub">// compact llama models · cpu-native · open research</span>
	</div>
	<div class="header-badges">
	<div class="badge-row">
	<span class="badge badge-gold">Apache 2.0</span>
	<span class="badge badge-ice">Llama Arch</span>
	<span class="badge badge-green">CPU Native</span>
	</div>
	<div class="badge-row">
	<span class="badge badge-gold">Stentor2-30M + Stentor2-12M</span>
	<span class="badge badge-ice">1024 ctx</span>
	<span class="badge badge-green">Edge Ready</span>
	</div>
	<div class="badge-row">
	<span class="badge badge-gold">Stentor3-50M + Stentor3-20M</span>
	<span class="badge badge-ice">4096 ctx</span>
	<span class="badge badge-green">Next Gen</span>
	</div>
	<div class="badge-row">
	<span class="badge badge-gold">Portimbria-150M</span>
	<span class="badge badge-ice">4096 ctx</span>
	</div>
	<div class="badge-row">
	<span class="badge badge-gold">Flagship: Portimbria-150M</span>
	</div>
	<div class="badge-row">
	<span class="badge badge-gold">FineWeb-Edu</span>
	<span class="badge badge-ice">Cosmopedia v2</span>
	</div>
	</div>
	</div>
	</div>
	"""

	# Spec-sheet HTML for each model, keyed by the model's display name.
	# The Generate tab renders MODEL_CARDS[DEFAULT_MODEL] next to the model picker.
	# Every card has the same skeleton (title + key/value rows), so the cards are
	# generated from plain row specs instead of being concatenated by hand.
	def _model_card_link(url, text="Open ↗"):
	    """Return the new-tab anchor markup used inside a card's value cell."""
	    return f'<a href="{url}" target="_blank" style="color:#38bdf8">{text}</a>'


	def _model_card_html(name, rows):
	    """Render one card: the upper-cased model name, then one row per (key, value)."""
	    attr_rows = "".join(
	        f'<div class="model-attr"><span class="attr-key">{key}</span>'
	        f'<span class="attr-val">{val}</span></div>'
	        for key, val in rows
	    )
	    return (
	        '<div class="model-card">'
	        f'<p class="model-card-title">{name.upper()}</p>'
	        f'{attr_rows}'
	        '</div>'
	    )


	MODEL_CARDS = {
	    name: _model_card_html(name, rows)
	    for name, rows in (
	        (
	            "Portimbria-150M",
	            (
	                ("Parameters", "151M"),
	                ("Family", "Portimbria"),
	                ("Variant", "Base model"),
	                ("Context", "4096 tokens"),
	                ("Architecture", "Llama with GQA"),
	                ("Model Card", _model_card_link("https://huggingface.co/StentorLabs/Portimbria-150M")),
	            ),
	        ),
	        (
	            "Stentor3-50M",
	            (
	                ("Parameters", "50M"),
	                ("Family", "Stentor3"),
	                ("Variant", "Base model"),
	                ("Context", "4096 tokens"),
	                ("Model Card", _model_card_link("https://huggingface.co/StentorLabs/stentor3-50m")),
	            ),
	        ),
	        (
	            "Stentor3-20M",
	            (
	                ("Parameters", "20M"),
	                ("Family", "Stentor3"),
	                ("Variant", "Base model"),
	                ("Context", "4096 tokens"),
	                ("Model Card", _model_card_link("https://huggingface.co/StentorLabs/stentor3-20m")),
	            ),
	        ),
	        (
	            "Stentor2-30M",
	            (
	                ("Family", "Stentor2"),
	                ("Variant", "Base model"),
	                ("Context", "1024 tokens"),
	                ("Model Card", _model_card_link("https://huggingface.co/StentorLabs/stentor2-30m")),
	            ),
	        ),
	        (
	            "Stentor2-12M",
	            (
	                ("Family", "Stentor2"),
	                ("Variant", "Base model"),
	                ("Context", "1024 tokens"),
	                ("Model Card", _model_card_link("https://huggingface.co/StentorLabs/stentor2-12m")),
	            ),
	        ),
	        (
	            "Stentor-30M",
	            (
	                ("Parameters", "30,419,712"),
	                ("Architecture", "LlamaForCausalLM"),
	                ("Layers", "21"),
	                ("Hidden Size", "256"),
	                ("Attn Heads", "4"),
	                ("Context", "512 tokens"),
	                ("Val Loss / PPL", "3.4971 / 33.02"),
	                ("Trained On", "600M tokens"),
	                ("Hardware", "1× Tesla T4 · 7.88h"),
	                (
	                    "GGUF",
	                    _model_card_link(
	                        "https://huggingface.co/mradermacher/Stentor-30M-GGUF",
	                        "mradermacher ↗",
	                    ),
	                ),
	            ),
	        ),
	        (
	            "Stentor-12M",
	            (
	                ("Parameters", "12,047,040"),
	                ("Architecture", "LlamaForCausalLM"),
	                ("Layers", "9"),
	                ("Hidden Size", "192"),
	                ("Attn Heads", "3"),
	                ("Context", "512 tokens"),
	                ("Val Loss / PPL", "4.4887 / 89.01"),
	                ("Trained On", "200M tokens"),
	                ("Hardware", "2× Tesla T4 · 1.3h"),
	            ),
	        ),
	        (
	            "Stentor-30M-Instruct",
	            (
	                ("Variant", "Instruction-tuned"),
	                ("Base Family", "Stentor-30M"),
	                ("Architecture", "LlamaForCausalLM"),
	                ("Context", "512 tokens"),
	                ("Model Card", _model_card_link("https://huggingface.co/StentorLabs/Stentor-30M-Instruct")),
	                ("Status", "Featured in this Space"),
	            ),
	        ),
	        (
	            "Stentor-12M-Instruct",
	            (
	                ("Variant", "Instruction-tuned"),
	                ("Base Family", "Stentor-12M"),
	                ("Architecture", "LlamaForCausalLM"),
	                ("Context", "512 tokens"),
	                ("Model Card", _model_card_link("https://huggingface.co/StentorLabs/Stentor-12M-Instruct")),
	                ("Status", "Featured in this Space"),
	            ),
	        ),
	    )
	}

	# Gold-tinted "flagship" card rendered in the Generate tab's model column via
	# gr.HTML(FLAGSHIP_HTML). It reuses the model-card classes and overrides the
	# border, background and title colour inline.
	FLAGSHIP_HTML = "".join(
	    [
	        '<div class="model-card" style="border-color:rgba(245,158,11,0.35);background:rgba(245,158,11,0.06);">',
	        '<p class="model-card-title" style="color:#f59e0b;">FLAGSHIP MODEL</p>',
	        # One attribute row per (label, value) pair, in display order.
	        *(
	            f'<div class="model-attr"><span class="attr-key">{label}</span>'
	            f'<span class="attr-val">{value}</span></div>'
	            for label, value in (
	                ("Primary", "Portimbria-150M"),
	                ("Context Length", "4,096 tokens"),
	                ("Training Data", "6B tokens"),
	                (
	                    "Model Card",
	                    '<a href="https://huggingface.co/StentorLabs/Portimbria-150M" target="_blank" style="color:#38bdf8">Portimbria-150M ↗</a>',
	                ),
	            )
	        ),
	        '</div>',
	    ]
	)

	# Long-form "about" markup. Its consumer is not visible in this part of the file —
	# presumably an About tab renders it with gr.HTML (TODO confirm).
	# Layout, top to bottom: an intro panel, a hand-written architecture diagram for
	# Portimbria-150M, then an .about-grid of .about-block cards (one per mode, a
	# parameter guide, and a links list) styled by the stylesheet defined earlier.
	# All figures in the diagram (vocab size, block count, head counts, FFN widths)
	# are hard-coded text, not read from any model config.
	# NOTE(review): the links list has no entries for the Stentor3 models even though
	# MODEL_CARDS links to their model cards — confirm whether that is intentional.
	ABOUT_HTML = """
	<div>
	<div style="margin-bottom:24px;padding:20px;background:#080d1a;border:1px solid #1a2744;border-radius:10px;">
	<div style="font-family:Bebas Neue,sans-serif;font-size:22px;letter-spacing:0.08em;color:#f59e0b;margin-bottom:8px;">STENTORLABS PLAYGROUND</div>
	<p style="font-family:Sora,sans-serif;font-size:14px;color:#ffffff;line-height:1.85;margin:0;max-width:850px;">
	Welcome to the official StentorLabs sandbox. This Hugging Face Space is a <strong style="color:#f59e0b;">free, comprehensive testing environment</strong>
	designed to give anyone—from researchers to hobbyists—full access to our family of compact Llama models.
	Unlike traditional demos, this Space provides deep diagnostic tools to help you understand how Small Language Models (SLMs)
	actually process information, manage confidence, and respond to parameter shifts.
	</p>
	</div>

	<div style="margin-bottom:24px;padding:20px;background:#080d1a;border:1px solid #1a2744;border-radius:10px;">
	<div style="font-family:Bebas Neue,sans-serif;font-size:18px;letter-spacing:0.08em;color:#f59e0b;margin-bottom:16px;">ARCHITECTURE DIAGRAM · PORTIMBRIA-150M</div>
	<div style="display:flex;flex-direction:column;align-items:center;gap:0;font-family:Space Mono,monospace;font-size:11px;">
	<div style="background:#0d1829;border:1px solid #1a2744;border-radius:6px;padding:8px 28px;color:#94a3b8;text-align:center;">INPUT TOKENS</div>
	<div style="color:#374151;padding:4px 0;font-size:16px;">↓</div>
	<div style="background:rgba(245,158,11,0.1);border:1px solid rgba(245,158,11,0.3);border-radius:6px;padding:8px 20px;color:#f59e0b;text-align:center;">EMBEDDING LAYER<br><span style="font-size:10px;color:#78490a;">32,768 vocab × 768 hidden</span></div>
	<div style="color:#374151;padding:4px 0;font-size:16px;">↓</div>
	<div style="border:1px solid #1a2744;border-radius:8px;padding:12px 20px;background:#060a14;width:100%;max-width:420px;">
	<div style="font-family:Bebas Neue,sans-serif;font-size:14px;letter-spacing:0.08em;color:#94a3b8;text-align:center;margin-bottom:8px;">× 20 TRANSFORMER BLOCKS</div>
	<div style="display:grid;grid-template-columns:1fr 1fr;gap:6px;">
	<div style="background:rgba(56,189,248,0.08);border:1px solid rgba(56,189,248,0.2);border-radius:5px;padding:6px 10px;text-align:center;color:#38bdf8;font-size:10px;">GQA ATTENTION<br><span style="color:#0e4a6a;">6 heads · 2 KV · RoPE θ=50000</span></div>
	<div style="background:rgba(167,139,250,0.08);border:1px solid rgba(167,139,250,0.2);border-radius:5px;padding:6px 10px;text-align:center;color:#a78bfa;font-size:10px;">FEED-FORWARD<br><span style="color:#4c1d95;">768→2048→768 SiLU</span></div>
	</div>
	<div style="margin-top:6px;text-align:center;font-size:10px;color:#1a2744;">RMSNorm + Residual connections</div>
	</div>
	<div style="color:#374151;padding:4px 0;font-size:16px;">↓</div>
	<div style="background:rgba(16,185,129,0.08);border:1px solid rgba(16,185,129,0.25);border-radius:6px;padding:8px 20px;color:#10b981;text-align:center;">OUTPUT LOGITS<br><span style="font-size:10px;color:#064e3b;">32,768 vocab (tied weights)</span></div>
	</div>
	</div>

	<div class="about-grid">
	<div class="about-block">
	<h3>⚡ Mode: Generate</h3>
	<p>The standard interface for text completion. Test how models handle creative writing, code drafting, or factual continuation.</p>
	<ul style="margin-top:8px;">
	<li><strong>Presets:</strong> Instantly switch between Creative (high temp), Balanced, and Focused (low temp) logic.</li>
	<li><strong>Multi-Response:</strong> Generate up to 5 variations of the same prompt sequentially to test output variance.</li>
	</ul>
	</div>
	<div class="about-block">
	<h3>🔬 Mode: Token Explorer</h3>
	<p>Peek "under the hood" of the model's decision-making process. This mode visualizes internal confidence levels.</p>
	<ul style="margin-top:8px;">
	<li><strong>Confidence Heatmap:</strong> See which tokens the model was certain about vs. which were random guesses.</li>
	<li><strong>Alternatives:</strong> Hover over any generated token to see the <em>top 8 alternatives</em> the model was considering at that exact moment.</li>
	</ul>
	</div>
	<div class="about-block">
	<h3>🌡 Mode: Temp Sweep</h3>
	<p>A visual study in creativity. Run the exact same prompt across 2–5 different temperature settings simultaneously.</p>
	<ul style="margin-top:8px;">
	<li><strong>Visual Divergence:</strong> Observe how low temperatures stay rigid and repetitive while high temperatures become increasingly chaotic.</li>
	<li><strong>Dynamic Slots:</strong> Use the Number of Boxes slider to reveal up to five outputs.</li>
	</ul>
	</div>
	<div class="about-block">
	<h3>💬 Mode: Chat</h3>
	<p>Interactive testing for all model variants, including both <strong>Base</strong> and <strong>Instruct</strong> versions. Test how models handle multi-turn dialogue and maintain context.</p>
	<ul style="margin-top:8px;">
	<li><strong>Memory:</strong> Test how the 512–4096 token context handles conversation history.</li>
	<li><strong>Safety:</strong> Observe how small models handle refusals and helpfulness constraints.</li>
	</ul>
	</div>
	<div class="about-block">
	<h3>🛠 Parameter Guide</h3>
	<ul style="margin-top:0;">
	<li><strong>Temperature:</strong> Controls "creativity." 0.1 is nearly deterministic; 1.5+ is experimental/chaotic.</li>
	<li><strong>Top P:</strong> Nucleus sampling. Limits the model to the most likely group of tokens whose cumulative probability is P. Helps prevent gibberish.</li>
	<li><strong>Repetition Penalty:</strong> Penalizes tokens that have already appeared. Essential for preventing loops in very small models.</li>
	<li><strong>Max Tokens:</strong> Each model has a physical cap (e.g., 4,096 for Portimbria). Setting this too low will cut off thoughts mid-sentence.</li>
	</ul>
	</div>
	<div class="about-block">
	<h3>Links & Resources</h3>
	<ul>
	<li><a href="https://huggingface.co/StentorLabs/Portimbria-150M" target="_blank">Portimbria-150M Model Card ↗</a></li>
	<li><a href="https://huggingface.co/StentorLabs/Stentor-30M" target="_blank">Stentor-30M Model Card ↗</a></li>
	<li><a href="https://huggingface.co/StentorLabs/Stentor-30M-Instruct" target="_blank">Stentor-30M-Instruct Model Card ↗</a></li>
	<li><a href="https://huggingface.co/StentorLabs/Stentor-12M" target="_blank">Stentor-12M Model Card ↗</a></li>
	<li><a href="https://huggingface.co/StentorLabs/Stentor-12M-Instruct" target="_blank">Stentor-12M-Instruct Model Card ↗</a></li>
	<li><a href="https://huggingface.co/StentorLabs/stentor2-30m" target="_blank">Stentor2-30M Model Card ↗</a></li>
	<li><a href="https://huggingface.co/StentorLabs/stentor2-12m" target="_blank">Stentor2-12M Model Card ↗</a></li>
	<li><a href="https://huggingface.co/mradermacher/Stentor-30M-GGUF" target="_blank">GGUF Quantizations (mradermacher) ↗</a></li>
	<li><a href="https://huggingface.co/StentorLabs" target="_blank">StentorLabs on Hugging Face ↗</a></li>
	</ul>
	<p style="margin-top:14px;font-size:12px;color:#1a2744;border-top:1px solid #0d1829;padding-top:12px;">
	⚠ Includes both base and instruct variants · Always set max_new_tokens · Apache 2.0 · Built by Kai Izumoto
	</p>
	</div>
	</div>
	</div>
	"""

	# Startup preloading, run at import time before the UI below is constructed.
	# Two passes: the default model first, then every entry in ARENA_MODELS.
	# Each load is best-effort: any exception is printed and startup continues.
	# _get_model is defined elsewhere; presumably it loads and caches the model so later
	# requests do not pay the load cost — TODO confirm.
	print(f"[Stentor] Preloading default model ({DEFAULT_MODEL}) at startup...")
	try:
	# The ALL_MODELS lookup sits inside the try, so an unknown DEFAULT_MODEL key is
	# reported by the handler below instead of aborting startup.
	_get_model(ALL_MODELS[DEFAULT_MODEL])
	print("[Stentor] Default model loaded and warmed up.")
	except Exception as e:
	print(f"[Stentor] Could not preload default model: {e}")

	print("[Stentor] Preloading arena models at startup...")
	# The try is per model, so one failing arena model does not stop the remaining ones.
	for arena_name, arena_repo in ARENA_MODELS.items():
	try:
	_get_model(arena_repo)
	print(f"[Stentor] Arena model loaded: {arena_name}")
	except Exception as e:
	print(f"[Stentor] Could not preload arena model {arena_name}: {e}")

	# ─────────────────────────────────── UI ───────────────────────────────────────
	with gr.Blocks(title=APP_TITLE) as demo:
	gr.HTML(HEADER_HTML)
	history_state = gr.State([])

	with gr.Tabs():

	# ── TAB 1: GENERATE ────────────────────────────────────────────────────
	with gr.TabItem(" ▶ Generate "):
	with gr.Row():

	with gr.Column(scale=1, min_width=240):
	gr.HTML('<span class="section-title">MODEL</span>')
	model_sel = gr.Radio(
	choices=list(STENTOR_MODELS.keys()),
	value=DEFAULT_MODEL,
	label="",
	interactive=True,
	)
	gr.HTML(FLAGSHIP_HTML)
	model_card_html = gr.HTML(MODEL_CARDS[DEFAULT_MODEL])
	gr.HTML('<span class="section-title" style="margin-top:20px;">PARAMETERS</span>')
	with gr.Row():
	btn_creative = gr.Button("🎨 Creative", size="sm", elem_classes=["preset-chip"])
	btn_balanced = gr.Button("⚖️ Balanced", size="sm", elem_classes=["preset-chip"])
	btn_focused = gr.Button("🎯 Focused", size="sm", elem_classes=["preset-chip"])
	gr.HTML(MODE_RECOMMENDATION_HTML)
	max_tokens = gr.Slider(10, INITIAL_MAX_TOKENS, value=DEFAULT_MAX_TOKENS, step=10, label="Max New Tokens")
	temperature = gr.Slider(0.1, 2.0, value=DEFAULT_TEMP, step=0.05, label="Temperature")
	top_p = gr.Slider(0.05, 1.0, value=DEFAULT_TOP_P, step=0.05, label="Top P")
	repetition_penalty = gr.Slider(0.8, 2.0, value=DEFAULT_REP_PENALTY, step=0.05, label="Repetition Penalty")
	num_responses = gr.Slider(1, 5, value=1, step=1, label="Number of Responses")

	with gr.Column(scale=3):
	gr.HTML('<span class="section-title">GENERATE</span>')
	prompt_box = gr.Textbox(
	label="Prompt",
	placeholder="Start writing or pick an example below…",
	lines=4,
	elem_id="prompt-box",
	)
	example_btns = []
	with gr.Column():
	with gr.Row():
	for emoji, p in EXAMPLE_PROMPTS[:4]:
	short = p[:26] + ("…" if len(p) > 26 else "")
	b = gr.Button(f"{emoji} {short}", size="sm", elem_classes=["prompt-chip"])
	example_btns.append((b, p))
	with gr.Row():
	for emoji, p in EXAMPLE_PROMPTS[4:]:
	short = p[:26] + ("…" if len(p) > 26 else "")
	b = gr.Button(f"{emoji} {short}", size="sm", elem_classes=["prompt-chip"])
	example_btns.append((b, p))

	with gr.Row():
	gen_btn = gr.Button("▶ Generate", variant="primary", scale=3)
	stop_btn = gr.Button("⏹ Stop", variant="secondary", scale=1)

	output_box = gr.Textbox(
	label="Output",
	lines=12,
	interactive=False,
	elem_id="output-box",
	)
	stats_html = gr.HTML(build_stats_html("—", "—", "—"))
	status_box = gr.Textbox(
	value="Ready.", label="",
	interactive=False, elem_classes=["status-bar"],
	)

	with gr.Column(scale=1, min_width=220):
	gr.HTML('<span class="section-title">HISTORY</span>')
	history_html = gr.HTML(build_history_html([]))

	# ── TAB 2: TOKEN EXPLORER ──────────────────────────────────────────────
	with gr.TabItem(" 🔬 Token Explorer "):
	# Token Explorer tab: prompt box, model picker, sampling controls, and an
	# HTML pane (exp_output) that initially shows _explorer_placeholder().
	with gr.Column():
	gr.HTML('<span class="section-title">TOKEN PROBABILITY EXPLORER</span>')
	# Static legend describing the confidence colour bands used in the output.
	gr.HTML(
	'<div class="explorer-info">'
	'🔬 <strong style="color:#f59e0b;">How it works:</strong> Samples tokens while capturing the full probability distribution over the vocabulary at each step. '
	'Tokens are color-coded by confidence: '
	'<span style="color:#10b981;">●</span> green = confident (≥80%), '
	'<span style="color:#eab308;">●</span> yellow = moderate (50–79%), '
	'<span style="color:#f97316;">●</span> orange = uncertain (35–49%), '
	'<span style="color:#b91c1c;">●</span> dark red = low (<35%). '
	'Hover any token to see the top alternatives the model considered.'
	'</div>'
	)
	with gr.Row():
	with gr.Column(scale=3):
	# NOTE(review): elem_id="prompt-box" is also used by the sweep, arena and
	# chat prompt boxes in this file, so the DOM id is not unique — confirm
	# the CSS relies on that before changing it.
	exp_prompt = gr.Textbox(
	label="Prompt",
	placeholder="Enter a prompt to visualize token-by-token confidence…",
	lines=3,
	elem_id="prompt-box",
	)
	with gr.Column(scale=1):
	exp_model = gr.Radio(choices=list(STENTOR_MODELS.keys()), value=DEFAULT_MODEL, label="Model")
	# Preset chips; their click handlers are attached in the event-wiring section.
	with gr.Row():
	exp_creative = gr.Button("🎨 Creative", size="sm", elem_classes=["preset-chip"])
	exp_balanced = gr.Button("⚖️ Balanced", size="sm", elem_classes=["preset-chip"])
	exp_focused = gr.Button("🎯 Focused", size="sm", elem_classes=["preset-chip"])
	gr.HTML(MODE_RECOMMENDATION_HTML)
	# Sampling controls. Max Tokens starts at 5 here (the other tabs start at 10).
	exp_tokens = gr.Slider(5, INITIAL_MAX_TOKENS, value=DEFAULT_MAX_TOKENS, step=10, label="Max Tokens")
	exp_temp = gr.Slider(0.1, 2.0, value=DEFAULT_TEMP, step=0.05, label="Temperature")
	exp_top_p = gr.Slider(0.05, 1.0, value=DEFAULT_TOP_P, step=0.05, label="Top P")
	exp_rep_pen = gr.Slider(0.8, 2.0, value=DEFAULT_REP_PENALTY, step=0.05, label="Repetition Penalty")
	exp_btn = gr.Button("🔬 Explore", variant="primary", scale=3)
	exp_stop_btn = gr.Button("⏹ Stop", variant="secondary", scale=1)

	# Result pane and a label-less status line styled as a status bar.
	exp_output = gr.HTML(_explorer_placeholder())
	exp_status = gr.Textbox(value="", label="", interactive=False, elem_classes=["status-bar"])

	# ── TAB 3: TEMPERATURE SWEEP ───────────────────────────────────────────
	with gr.TabItem(" 🌡 Temp Sweep "):
	# Temperature Sweep tab: one prompt, up to five temperature "boxes", each
	# with its own temperature input and read-only output textbox.
	with gr.Column():
	gr.HTML('<span class="section-title">TEMPERATURE SWEEP</span>')
	gr.HTML(
	'<div class="explorer-info">'
	'🌡 <strong style="color:#f59e0b;">What this shows:</strong> The same prompt run at multiple different temperatures simultaneously. '
	'Low temperature = conservative/repetitive. High temperature = creative/chaotic. '
	'Choose between 2–5 temperature boxes below. <strong>No duplicate temperatures allowed.</strong>'
	'</div>'
	)
	# NOTE(review): sweep_state is not referenced by any wiring visible in this
	# section — confirm whether it is still needed.
	sweep_state = gr.State([0.5, 1.0, 1.5, 2.0])
	with gr.Row():
	with gr.Column(scale=3):
	sweep_prompt = gr.Textbox(
	label="Prompt",
	placeholder="Enter a prompt to run across all temperatures…",
	lines=3,
	elem_id="prompt-box",
	)
	with gr.Column(scale=1):
	sweep_model = gr.Radio(choices=list(STENTOR_MODELS.keys()), value=DEFAULT_MODEL, label="Model")
	with gr.Row():
	sweep_creative = gr.Button("🎨 Creative", size="sm", elem_classes=["preset-chip"])
	sweep_balanced = gr.Button("⚖️ Balanced", size="sm", elem_classes=["preset-chip"])
	sweep_focused = gr.Button("🎯 Focused", size="sm", elem_classes=["preset-chip"])
	gr.HTML(MODE_RECOMMENDATION_HTML)
	# No temperature slider here: temperatures come from the per-box inputs below.
	sweep_tokens = gr.Slider(10, INITIAL_MAX_TOKENS, value=DEFAULT_MAX_TOKENS, step=10, label="Max Tokens")
	sweep_top_p = gr.Slider(0.05, 1.0, value=DEFAULT_TOP_P, step=0.05, label="Top P")
	sweep_rep_pen = gr.Slider(0.8, 2.0, value=DEFAULT_REP_PENALTY, step=0.05, label="Repetition Penalty")
	sweep_count = gr.Slider(2, 5, value=2, step=1, label="Number of Boxes")
	sweep_btn = gr.Button("🌡 Run Sweep", variant="primary", scale=3)
	sweep_stop_btn = gr.Button("⏹ Stop", variant="secondary", scale=1)

	with gr.Row():
	# Parallel lists, all indexed by box number (0-4).
	sweep_temp_inputs = []
	sweep_temp_labels = ["1st Temp", "2nd Temp", "3rd Temp", "4th Temp", "5th Temp"]
	default_temps_for_index = [0.5, 1.0, 1.5, 2.0, 2.5]
	sweep_columns = []
	sweep_outputs_for_fn = []
	# All five boxes are created up front; only the first two start visible,
	# matching the initial sweep_count value of 2.
	for i in range(5):
	with gr.Column(visible=(i < 2)) as col:
	# Heading colour per box (the list is rebuilt on every iteration).
	color_map = ["#38bdf8", "#10b981", "#f59e0b", "#f97316", "#f87171"]
	gr.HTML(
	f'<div style="text-align:center;padding:8px 0 4px;">'
	f'<span style="font-family:Bebas Neue,sans-serif;font-size:20px;'
	f'letter-spacing:0.06em;color:{color_map[i]};">BOX {i+1}</span>'
	f'</div>'
	)
	# Per-box temperature, limited to 0.1–2.5 in 0.05 steps.
	inp = gr.Number(value=default_temps_for_index[i], label=sweep_temp_labels[i], minimum=0.1, maximum=2.5, step=0.05)
	sweep_temp_inputs.append(inp)
	out = gr.Textbox(label="", lines=8, interactive=False, elem_id="output-box")
	sweep_outputs_for_fn.append(out)
	sweep_columns.append(col)

	# Buttons that step the "Number of Boxes" slider up or down by one.
	with gr.Row():
	sweep_add_btn = gr.Button("+ Add Box", variant="secondary", scale=1)
	sweep_rm_btn = gr.Button("− Remove Box", variant="secondary", scale=1)

	def update_sweep_visibility(count, *temps):
	    """Sync the five sweep boxes with the "Number of Boxes" slider.

	    Args:
	        count: Requested number of visible boxes. ``None`` is treated as 2,
	            the same fallback add_sweep_box/remove_sweep_box use.
	        *temps: Current values of the temperature inputs in box order. Only
	            the first five are considered and ``None`` entries are ignored.

	    Returns:
	        A list of ten ``gr.update`` objects: five column updates followed by
	        five temperature-input updates, in the order expected by
	        ``sweep_columns + sweep_temp_inputs``.
	    """
	    count_int = 2 if count is None else int(count)
	    active_temps = [t for t in list(temps[:5])[:count_int] if t is not None]
	    distinct = {round(float(t), 2) for t in active_temps}
	    has_duplicates = len(distinct) != len(active_temps)

	    col_updates = [gr.update(visible=(i < count_int)) for i in range(5)]

	    if has_duplicates:
	        # Duplicate temperatures among the visible boxes: re-space them evenly
	        # from 0.5 to 2.0 and hide the inputs of the unused boxes.
	        temp_updates = [
	            gr.update(visible=True, value=round(0.5 + i * (1.5 / max(count_int - 1, 1)), 2))
	            if i < count_int
	            else gr.update(visible=False)
	            for i in range(5)
	        ]
	    else:
	        # Keep the user's values, but always restate visibility: an earlier
	        # duplicate reset may have hidden inputs that are needed again now
	        # that the box count has grown.
	        temp_updates = [gr.update(visible=(i < count_int)) for i in range(5)]

	    return col_updates + temp_updates

	def add_sweep_box(count):
	    """Return a slider update that raises the box count by one, capped at 5.

	    A missing count (``None``) is treated as 2 before incrementing.
	    """
	    current = int(count) if count is not None else 2
	    bumped = current + 1
	    if bumped > 5:
	        bumped = 5
	    return gr.update(value=bumped)

	def remove_sweep_box(count):
	    """Return a slider update that lowers the box count by one, floored at 2.

	    A missing count (``None``) is treated as 2 before decrementing.
	    """
	    current = int(count) if count is not None else 2
	    reduced = current - 1
	    if reduced < 2:
	        reduced = 2
	    return gr.update(value=reduced)

	# Alias for the per-box output textboxes; used as the sweep run's outputs.
	sweep_outs = sweep_outputs_for_fn

	# Moving the "Number of Boxes" slider re-syncs column visibility and, when
	# duplicate temperatures are found, the temperature inputs as well.
	sweep_count.change(
	fn=update_sweep_visibility,
	inputs=[sweep_count] + sweep_temp_inputs,
	outputs=sweep_columns + sweep_temp_inputs
	)

	# "+ Add Box" / "− Remove Box" first write the new count into the slider,
	# then re-run the same visibility sync with the updated value.
	# NOTE(review): if the slider's change event also fires for programmatic
	# updates, the sync runs twice per click — confirm against the Gradio docs.
	sweep_add_btn.click(
	fn=add_sweep_box,
	inputs=[sweep_count],
	outputs=[sweep_count],
	).then(
	fn=update_sweep_visibility,
	inputs=[sweep_count] + sweep_temp_inputs,
	outputs=sweep_columns + sweep_temp_inputs,
	)
	sweep_rm_btn.click(
	fn=remove_sweep_box,
	inputs=[sweep_count],
	outputs=[sweep_count],
	).then(
	fn=update_sweep_visibility,
	inputs=[sweep_count] + sweep_temp_inputs,
	outputs=sweep_columns + sweep_temp_inputs,
	)

	# ── TAB 4: ARENA ─────────────────────────────────────────────────────
	with gr.TabItem(" 🏟 Arena "):
	# Arena tab: one shared prompt is run through two models side by side.
	with gr.Column():
	gr.HTML('<span class="section-title">MODEL ARENA</span>')
	gr.HTML(
	'<div class="explorer-info">'
	'🏟 <strong style="color:#f59e0b;">Model Arena:</strong> Benchmark performance via blind or open testing.<br>'
	'• <strong>Show Mode:</strong> Pick models and see their names while generating.<br>'
	'• <strong>Blind Mode:</strong> Pick models but their identities are hidden until you vote.<br>'
	'• <strong>Random Mode:</strong> Let the arena pick a random Stentor vs a Baseline model.'
	'</div>'
	)
	# Per-session state written by arena_setup/arena_generate and read by the
	# vote handlers; starts as None.
	arena_identities = gr.State(None)
	with gr.Row():
	with gr.Column(scale=3):
	arena_prompt = gr.Textbox(
	label="Shared Prompt",
	placeholder="Enter a prompt to run through both models…",
	lines=3,
	elem_id="prompt-box",
	)
	with gr.Column(scale=2):
	with gr.Row():
	arena_creative = gr.Button("🎨 Creative", size="sm", elem_classes=["preset-chip"])
	arena_balanced = gr.Button("⚖️ Balanced", size="sm", elem_classes=["preset-chip"])
	arena_focused = gr.Button("🎯 Focused", size="sm", elem_classes=["preset-chip"])
	gr.HTML(MODE_RECOMMENDATION_HTML)
	with gr.Row():
	arena_mode = gr.Dropdown(
	choices=["👀 Show", "🙈 Blind", "🎲 Random"],
	value="👀 Show",
	label="Arena Mode"
	)
	# Extra options that only apply to Random mode; hidden until that mode is
	# selected (see update_arena_ui_visibility).
	with gr.Column(visible=False) as arena_random_options:
	arena_fair_match = gr.Checkbox(value=True, label="Fair Matchmaking", info="Pairs models with similar parameter counts for a balanced fight.")
	gr.HTML('<p style="font-family:Sora,sans-serif;font-size:10px;color:#374151;margin-bottom:8px;text-transform:uppercase;font-weight:600;letter-spacing:0.05em;">Pairing Logic</p>')
	arena_internal_pair = gr.Checkbox(value=True, label="Internal Pairings", info="Allows Stentor models to face other Stentor models.")

	# NOTE(review): the upper bound is a literal 1024 here, whereas the other
	# tabs use INITIAL_MAX_TOKENS — confirm this is intentional.
	arena_max = gr.Slider(10, 1024, value=DEFAULT_MAX_TOKENS, step=10, label="Max Tokens")
	with gr.Row():
	arena_temp = gr.Slider(0.1, 2.0, value=DEFAULT_TEMP, step=0.05, label="Temperature")
	arena_top_p = gr.Slider(0.05, 1.0, value=DEFAULT_TOP_P, step=0.05, label="Top P")
	with gr.Row():
	arena_rep_pen = gr.Slider(0.8, 2.0, value=DEFAULT_REP_PENALTY, step=0.05, label="Repetition Penalty")

	with gr.Row():
	arena_btn = gr.Button("🏟 Battle", variant="primary", scale=2)
	arena_stop_btn = gr.Button("⏹ Stop", variant="secondary", scale=1)

	# Manual model pickers: left side from STENTOR_MODELS, right side from
	# ARENA_MODELS. The row is hidden while Random mode is selected.
	with gr.Row(elem_id="arena-selectors") as arena_selector_row:
	with gr.Column():
	arena_model1 = gr.Dropdown(
	choices=list(STENTOR_MODELS.keys()),
	value=DEFAULT_MODEL,
	label="Stentor Model",
	interactive=True,
	)
	with gr.Column():
	# Default is the first ARENA_MODELS key; indexing [0] raises IndexError if
	# ARENA_MODELS is empty.
	arena_model2 = gr.Dropdown(
	choices=list(ARENA_MODELS.keys()),
	value=list(ARENA_MODELS.keys())[0],
	label="External Model",
	interactive=True,
	)

	def update_arena_ui_visibility(mode):
	    """Show the Random-mode options or the manual model pickers, never both.

	    Returns a pair of updates: the first for the random-options column
	    (visible only in "🎲 Random" mode), the second for the model selector
	    row (visible in every other mode).
	    """
	    is_random = mode == "🎲 Random"
	    random_options_update = gr.update(visible=is_random)
	    selector_row_update = gr.update(visible=not is_random)
	    return random_options_update, selector_row_update

	# Switching the arena mode toggles the Random-only options and the manual
	# model selector row.
	arena_mode.change(fn=update_arena_ui_visibility, inputs=[arena_mode], outputs=[arena_random_options, arena_selector_row])

	# Side-by-side outputs, each with its own status line.
	with gr.Row():
	with gr.Column():
	arena_output1 = gr.Textbox(label="Left Model Output", lines=12, interactive=False, elem_id="output-box")
	arena_status1 = gr.Textbox(value="Ready.", label="", interactive=False, elem_classes=["status-bar"])
	with gr.Column():
	arena_output2 = gr.Textbox(label="Right Model Output", lines=12, interactive=False, elem_id="output-box")
	arena_status2 = gr.Textbox(value="Ready.", label="", interactive=False, elem_classes=["status-bar"])

	# Voting controls; hidden initially and toggled through the vote_col output
	# of the arena handlers below.
	with gr.Column(visible=False) as vote_col:
	gr.HTML('<div style="text-align:center;margin-top:20px;"><span class="section-title">VOTE FOR THE BEST RESPONSE</span></div>')
	with gr.Row():
	left_win = gr.Button("👈 Left is Better", variant="secondary")
	right_win = gr.Button("Right is Better 👉", variant="secondary")
	tie_win = gr.Button("🤝 It's a Tie", variant="secondary")
	both_bad = gr.Button("👎 Both are Bad", variant="secondary")

	arena_results = gr.Markdown("")

	# Arena event wiring
	# "Battle" runs arena_setup first, then arena_generate; both write the same
	# eight outputs. arena_event is whatever the trailing .then() returns, and it
	# is what the Stop button cancels.
	arena_event = arena_btn.click(
	fn=arena_setup,
	inputs=[arena_mode],
	outputs=[arena_output1, arena_output2, arena_status1, arena_status2, vote_col, arena_selector_row, arena_identities, arena_results],
	).then(
	fn=arena_generate,
	inputs=[arena_prompt, arena_mode, arena_model1, arena_model2, arena_max, arena_temp, arena_top_p, arena_rep_pen, arena_fair_match, arena_internal_pair],
	outputs=[arena_output1, arena_output2, arena_status1, arena_status2, vote_col, arena_selector_row, arena_identities, arena_results],
	)

	# Each vote button passes a fixed verdict label plus the stored identities
	# to arena_vote, which updates the results markdown and the vote column.
	vote_inputs = [arena_identities]
	left_win.click(fn=lambda ids: arena_vote("Left Model", ids), inputs=vote_inputs, outputs=[arena_results, vote_col])
	right_win.click(fn=lambda ids: arena_vote("Right Model", ids), inputs=vote_inputs, outputs=[arena_results, vote_col])
	tie_win.click(fn=lambda ids: arena_vote("Tie", ids), inputs=vote_inputs, outputs=[arena_results, vote_col])
	both_bad.click(fn=lambda ids: arena_vote("Both Bad", ids), inputs=vote_inputs, outputs=[arena_results, vote_col])

	def handle_stop_arena():
	    """Stop an in-progress arena battle.

	    Raises the shared interrupt flag, as the Stop handlers of the other tabs
	    do, so that generation checking ``interrupt_callback`` can end instead of
	    continuing after the Gradio event is cancelled.

	    Returns:
	        Values for ``arena_output1``, ``arena_output2``, ``arena_status1``,
	        ``arena_status2`` and ``vote_col``: both outputs show the stop
	        message, both status lines are cleared, and the vote column is hidden.
	    """
	    interrupt_callback.stop_signal = True
	    return "⏹ Stopped.", "⏹ Stopped.", "", "", gr.update(visible=False)

	arena_stop_btn.click(
	    fn=handle_stop_arena,
	    outputs=[arena_output1, arena_output2, arena_status1, arena_status2, vote_col],
	    cancels=[arena_event],
	)

	# ── TAB 6: CHAT ─────────────────────────────────────────────────────────
	with gr.TabItem(" 💬 Chat "):
	# Chat tab: settings column on the left, conversation view and message
	# input on the right.
	with gr.Row():
	with gr.Column(scale=1, min_width=240):
	gr.HTML('<span class="section-title">CHAT SETTINGS</span>')
	with gr.Row():
	chat_creative = gr.Button("🎨 Creative", size="sm", elem_classes=["preset-chip"])
	chat_balanced = gr.Button("⚖️ Balanced", size="sm", elem_classes=["preset-chip"])
	chat_focused = gr.Button("🎯 Focused", size="sm", elem_classes=["preset-chip"])
	gr.HTML(MODE_RECOMMENDATION_HTML)
	chat_model = gr.Radio(
	choices=list(STENTOR_MODELS.keys()),
	value=DEFAULT_MODEL,
	label="Model",
	interactive=True,
	)
	chat_max_tokens = gr.Slider(10, INITIAL_MAX_TOKENS, value=DEFAULT_MAX_TOKENS, step=10, label="Max New Tokens")
	chat_temperature = gr.Slider(0.1, 2.0, value=DEFAULT_TEMP, step=0.05, label="Temperature")
	chat_top_p = gr.Slider(0.05, 1.0, value=DEFAULT_TOP_P, step=0.05, label="Top P")
	chat_rep_penalty = gr.Slider(0.8, 2.0, value=DEFAULT_REP_PENALTY, step=0.05, label="Repetition Penalty")
	with gr.Row():
	chat_stop_btn = gr.Button("⏹ Stop", variant="secondary")
	chat_reset_btn = gr.Button("↺ Reset Chat", variant="secondary")

	with gr.Column(scale=3):
	gr.HTML('<span class="section-title">CONVERSATION</span>')
	# Per-session conversation state; starts as an empty list and is passed to
	# and returned from chat_generate.
	chat_messages = gr.State([])
	# Placeholder shown until chat_generate/chat_clear write to chat_display.
	chat_display = gr.HTML(
	'<div style="padding:20px;text-align:center;background:#060a14;border:1px solid #1e2d45;'
	'border-radius:10px;min-height:200px;display:flex;flex-direction:column;align-items:center;'
	'justify-content:center;">'
	'<p style="font-family:Sora,sans-serif;font-size:13px;color:#374151;margin:0;">'
	'Start a conversation by typing a message below.</p></div>'
	)
	chat_input = gr.Textbox(
	label="Your Message",
	placeholder="Type your message here and press Send…",
	lines=3,
	elem_id="prompt-box",
	)
	chat_send_btn = gr.Button("▶ Send", variant="primary")

	def _recap_chat_max_tokens(m, cur):
	    """Re-cap the chat "Max New Tokens" slider for the newly selected model.

	    The slider maximum becomes ``_max_tokens_cap(m)``. The current value is
	    kept within the slider's minimum (10) and the new cap and cast to int,
	    matching how the explorer and sweep sliders are re-capped on model change.
	    """
	    cap = _max_tokens_cap(m)
	    return gr.update(maximum=cap, value=int(min(max(10, cur), cap)))

	chat_model.change(
	    fn=_recap_chat_max_tokens,
	    inputs=[chat_model, chat_max_tokens],
	    outputs=[chat_max_tokens],
	)

	# ── TAB 7: ABOUT ───────────────────────────────────────────────────────
	with gr.TabItem(" ℹ About "):
	# Static tab: a section title followed by the prebuilt ABOUT_HTML markup.
	with gr.Column():
	gr.HTML('<span class="section-title">THE STENTOR SERIES</span>')
	gr.HTML(ABOUT_HTML)

	# ───────────────────────── EVENT WIRING ───────────────────────────────────

	# Generate tab: changing the model swaps the model card (empty string when
	# the key has no entry in MODEL_CARDS) and re-caps the Max Tokens slider at
	# _max_tokens_cap(m).
	# NOTE(review): the value is first raised to at least DEFAULT_MAX_TOKENS, so
	# a user setting below the default is bumped up on every model switch; the
	# explorer/sweep handlers below floor at the slider minimum instead — confirm
	# this difference is intended.
	model_sel.change(
	fn=lambda m, cur: (MODEL_CARDS.get(m, ""), gr.update(maximum=_max_tokens_cap(m), value=int(min(max(DEFAULT_MAX_TOKENS, cur), _max_tokens_cap(m))))),
	inputs=[model_sel, max_tokens],
	outputs=[model_card_html, max_tokens],
	)

	# Explorer tab: new maximum is the model's cap; the value is held at or above
	# 5 and at or below that cap.
	exp_model.change(
	fn=lambda m, cur: gr.update(maximum=_max_tokens_cap(m), value=int(min(max(5, cur), _max_tokens_cap(m)))),
	inputs=[exp_model, exp_tokens],
	outputs=[exp_tokens],
	)
	# Sweep tab: same re-capping with a floor of 10.
	sweep_model.change(
	fn=lambda m, cur: gr.update(maximum=_max_tokens_cap(m), value=int(min(max(10, cur), _max_tokens_cap(m)))),
	inputs=[sweep_model, sweep_tokens],
	outputs=[sweep_tokens],
	)

	def _preset_handler(apply_fn, preset_name):
	    """Return a zero-argument click callback that applies ``preset_name``."""
	    return lambda: apply_fn(preset_name)

	# Preset labels, in the order the chips are listed for every tab below.
	_PRESET_NAMES = ("🎨 Creative", "⚖️ Balanced", "🎯 Focused")

	# One row per tab: (preset function, (creative, balanced, focused) chips,
	# controls the preset writes to). The sweep tab has no temperature control,
	# so it uses apply_sweep_preset with three targets instead of four.
	_preset_wiring = (
	    (apply_standard_preset, (btn_creative, btn_balanced, btn_focused),
	     (max_tokens, temperature, top_p, repetition_penalty)),
	    (apply_standard_preset, (exp_creative, exp_balanced, exp_focused),
	     (exp_tokens, exp_temp, exp_top_p, exp_rep_pen)),
	    (apply_sweep_preset, (sweep_creative, sweep_balanced, sweep_focused),
	     (sweep_tokens, sweep_top_p, sweep_rep_pen)),
	    (apply_standard_preset, (arena_creative, arena_balanced, arena_focused),
	     (arena_max, arena_temp, arena_top_p, arena_rep_pen)),
	    (apply_standard_preset, (chat_creative, chat_balanced, chat_focused),
	     (chat_max_tokens, chat_temperature, chat_top_p, chat_rep_penalty)),
	)
	for _apply_fn, _chips, _targets in _preset_wiring:
	    for _chip, _preset_name in zip(_chips, _PRESET_NAMES):
	        _chip.click(fn=_preset_handler(_apply_fn, _preset_name), outputs=list(_targets))

	# Example chips copy their full prompt text into the Generate prompt box.
	# The default argument pins each button's own prompt at definition time.
	for btn, full_prompt in example_btns:
	    btn.click(fn=lambda t=full_prompt: t, outputs=[prompt_box])

	def handle_stop_gen():
	    """Stop handler for the Generate tab.

	    Raises the shared interrupt flag and returns the status text plus a
	    stats panel reset to dash placeholders.
	    """
	    interrupt_callback.stop_signal = True
	    dash = "—"
	    cleared_stats = build_stats_html(dash, dash, dash)
	    return "⏹ Stopped.", cleared_stats

	def run_generate_multi(prompt, model_key, max_tok, temp, tp, rep_penalty, num_resp, history):
	    """Stream one or more completions of ``prompt`` for the Generate tab.

	    Generator used as a Gradio event handler. Every yield is a 5-tuple
	    ``(output_text, status_text, stats_html, history, history_html)``.

	    Args:
	        prompt: Raw prompt text; blank or ``None`` yields a hint and stops.
	        model_key: Key into ``STENTOR_MODELS``.
	        max_tok, temp, tp, rep_penalty: Sampling settings copied into every
	            per-response config.
	        num_resp: Number of responses to generate; ``None`` or values below 1
	            are treated as 1.
	        history: Current history value; only replaced in the final yield.

	    Raises:
	        KeyError: If ``model_key`` is not in ``STENTOR_MODELS``.
	    """
	    dash = "—"
	    if not (prompt or "").strip():
	        yield "Enter a prompt.", "Enter a prompt.", build_stats_html(dash, dash, dash), history, build_history_html(history)
	        return

	    # Guard against a zero/negative count so results[0] below is always valid.
	    num = max(1, int(num_resp)) if num_resp is not None else 1
	    tokenizer, _ = _get_model(STENTOR_MODELS[model_key])
	    configs = [{
	        "model_key": model_key, "max_tokens": max_tok, "temperature": temp,
	        "top_p": tp, "repetition_penalty": rep_penalty
	    } for _ in range(num)]

	    def render(results):
	        """Join the responses under numbered headers (or return the single one)."""
	        if num > 1:
	            return "".join(f"─── Response {idx+1} ───\n{r}\n\n" for idx, r in enumerate(results))
	        return results[0]

	    def stats(results, elapsed, elapsed_fmt):
	        """Build the stats panel: token count, elapsed time, tokens per second."""
	        total_tokens = sum(len(tokenizer.encode(r)) for r in results)
	        tps = total_tokens / elapsed if elapsed > 0 else 0
	        return build_stats_html(str(total_tokens), elapsed_fmt.format(elapsed), f"{tps:.1f} t/s")

	    final_results = [""] * num
	    final_elapsed = 0.0
	    # Pre-bound so the final yield works even if the generator below produces
	    # no updates at all (e.g. it is interrupted before the first chunk).
	    display = ""

	    for results, elapsed in parallel_config_generate(prompt, configs):
	        final_results, final_elapsed = results, elapsed
	        display = render(results)
	        yield display, "⚡ Generating…", stats(results, elapsed, "{:.1f}s"), history, build_history_html(history)

	    # Only the first response is recorded in the history.
	    new_history = add_to_history(history, prompt, final_results[0], model_key, temp)
	    yield display, f"✓ Done · {model_key}", stats(final_results, final_elapsed, "{:.2f}s"), new_history, build_history_html(new_history)

	# "Generate" streams run_generate_multi into the output box, status line,
	# stats panel and history (state + rendered HTML).
	gen_event = gen_btn.click(
	fn=run_generate_multi,
	inputs=[prompt_box, model_sel, max_tokens, temperature, top_p, repetition_penalty, num_responses, history_state],
	outputs=[output_box, status_box, stats_html, history_state, history_html],
	)

	# "Stop" cancels the running generate event and resets status and stats;
	# the output box is not among the outputs, so it is left untouched.
	stop_btn.click(
	fn=handle_stop_gen,
	outputs=[status_box, stats_html],
	cancels=[gen_event]
	)

	def handle_stop_exp():
	    """Stop handler for the Token Explorer tab.

	    Raises the shared interrupt flag and returns the status text followed by
	    the explorer placeholder markup.
	    """
	    interrupt_callback.stop_signal = True
	    status_text = "⏹ Stopped."
	    placeholder_html = _explorer_placeholder()
	    return status_text, placeholder_html

	# "Explore" runs run_token_explorer; it writes the HTML pane and status line.
	exp_event = exp_btn.click(
	fn=run_token_explorer,
	inputs=[exp_prompt, exp_model, exp_tokens, exp_temp, exp_top_p, exp_rep_pen],
	outputs=[exp_output, exp_status],
	)
	# Note the output order here is (status, pane), matching handle_stop_exp's
	# return order rather than the order used by exp_btn above.
	exp_stop_btn.click(
	fn=handle_stop_exp,
	outputs=[exp_status, exp_output],
	cancels=[exp_event]
	)

	# "Run Sweep" passes the shared settings, the box count and all five
	# temperature inputs; results go to the five per-box output textboxes.
	sweep_event = sweep_btn.click(
	fn=run_temp_sweep_streamed,
	inputs=[sweep_prompt, sweep_model, sweep_tokens, sweep_top_p, sweep_rep_pen, sweep_count] + sweep_temp_inputs,
	outputs=sweep_outs,
	)
	def handle_stop_sweep():
	    """Stop handler for the Temp Sweep tab.

	    Raises the shared interrupt flag and returns one "stopped" update per
	    sweep output textbox (the same update object repeated).
	    """
	    interrupt_callback.stop_signal = True
	    stopped_update = gr.update(value="⏹ Stopped.")
	    return [stopped_update for _ in sweep_outputs_for_fn]
	# "Stop" cancels the running sweep event and overwrites every sweep output
	# box (including hidden ones) with the stop message.
	sweep_stop_btn.click(
	fn=handle_stop_sweep,
	outputs=sweep_outputs_for_fn,
	cancels=[sweep_event]
	)

	# Inputs shared by the Send button and the textbox submit event.
	_chat_generate_inputs = [chat_messages, chat_input, chat_model, chat_max_tokens, chat_temperature, chat_top_p, chat_rep_penalty]

	# chat_event / chat_input_event must refer to the chat_generate step itself.
	# Binding them to the result of the trailing .then() would make the Stop
	# button's `cancels=[chat_event, chat_input_event]` target the input-clearing
	# step instead of the running generation.
	chat_event = chat_send_btn.click(
	    fn=chat_generate,
	    inputs=_chat_generate_inputs,
	    outputs=[chat_display, chat_messages],
	)
	# Clear the message box once the generation step has finished.
	chat_event.then(
	    fn=lambda: "", outputs=[chat_input],
	)

	chat_input_event = chat_input.submit(
	    fn=chat_generate,
	    inputs=_chat_generate_inputs,
	    outputs=[chat_display, chat_messages],
	)
	chat_input_event.then(
	    fn=lambda: "", outputs=[chat_input],
	)

	def handle_stop_chat():
	    """Stop handler for the Chat tab: raise the shared interrupt flag.

	    Nothing is returned because the Stop button is wired without outputs.
	    """
	    interrupt_callback.stop_signal = True
	    return None

	# "Stop" raises the interrupt flag and cancels the events bound to
	# chat_event and chat_input_event; it has no outputs, so the conversation
	# view is left as it was.
	chat_stop_btn.click(
	fn=handle_stop_chat,
	cancels=[chat_event, chat_input_event]
	)

	# "Reset Chat" lets chat_clear overwrite the message state, the conversation
	# view and the input box.
	chat_reset_btn.click(
	fn=chat_clear,
	outputs=[chat_messages, chat_display, chat_input],
	)


	if __name__ == "__main__":
	    # Collect the launch options first so the call itself stays short:
	    # base theme, the app's CSS, and server-side rendering switched off.
	    launch_options = {
	        "theme": gr.themes.Base(),
	        "css": CSS,
	        "ssr_mode": False,
	    }
	    demo.launch(**launch_options)