import os
import time
import random
import queue
from datetime import datetime, timezone
import html
from threading import Thread
from typing import Any
import gradio as gr
import torch
import torch.nn.functional as F
try:
from huggingface_hub import login
except ImportError:
login = None
from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteria, StoppingCriteriaList
try:
from transformers import TextIteratorStreamer
except ImportError: # transformers >=5 may move streamers under generation
from transformers.generation.streamers import TextIteratorStreamer
# ── PERFORMANCE BOTTLENECK FIXES ──
if not torch.cuda.is_available():
    # No CUDA device: apply CPU-side tuning once, at import time.
    # Hugging Face Free Spaces typically have 2 vCPUs.
    # Limiting threads prevents context-switching overhead which is a major bottleneck for small models.
    torch.set_num_threads(2)
    # Flush denormal numbers to zero to avoid expensive CPU subnormal float calculations.
    try:
        torch.set_flush_denormal(True)
    except Exception:
        # Best-effort optimisation: a failure here is ignored so the app still starts.
        pass
# Import tokenmonster for models that require it
try:
import tokenmonster
except ImportError:
tokenmonster = None
class InterruptCallback(StoppingCriteria):
    """Stopping criterion driven by a mutable boolean flag.

    ``generate`` is handed this object through a ``StoppingCriteriaList``;
    setting ``stop_signal`` to ``True`` makes the criterion report "stop".
    """

    def __init__(self) -> None:
        # False means "keep generating"; other code in this file flips it.
        self.stop_signal = False

    def __call__(self, input_ids, scores, **kwargs):
        """Return the current value of ``stop_signal``; the arguments are ignored."""
        return self.stop_signal


# Single module-level instance shared by every generation entry point in this file.
interrupt_callback = InterruptCallback()
# ─────────────────────────────────── CONFIG ───────────────────────────────────
APP_TITLE = "Stentor Labs"
# Hub namespace for the first-party checkpoints; overridable via HF_MODEL_OWNER.
MODEL_OWNER = os.getenv("HF_MODEL_OWNER", "StentorLabs")
FLAGSHIP_MODELS = ["Portimbria-150M", "Stentor3-50M"]
# Display name -> Hub repo id. First-party repo names equal their display names.
STENTOR_MODELS = {
    name: f"{MODEL_OWNER}/{name}"
    for name in (
        "Portimbria-150M",
        "Stentor3-50M",
        "Stentor3-20M",
        "Stentor2-30M",
        "Stentor2-12M",
        "Stentor-30M",
        "Stentor-12M",
        "Stentor-30M-Instruct",
        "Stentor-12M-Instruct",
    )
}
# Display name -> Hub repo id for the third-party comparison models.
ARENA_MODELS = {
    "DistilGPT2": "distilbert/distilgpt2",
    "Pythia-14M": "EleutherAI/pythia-14m",
    "Pythia-31M": "EleutherAI/pythia-31m",
    "Pythia-70M": "EleutherAI/pythia-70m",
    "gpt2 small": "openai-community/gpt2",
    "SmolLM2-135M": "HuggingFaceTB/SmolLM2-135M",
    "NanoWhale-100M-Base": "HuggingFaceTB/nanowhale-100m-base",
    "Pythia-160M": "EleutherAI/pythia-160m",
    "OPT-125M": "facebook/opt-125m",
    "GPT-Neo 125M": "EleutherAI/gpt-neo-125M",
}
# Combined lookup table; first-party entries come first.
ALL_MODELS = STENTOR_MODELS | ARENA_MODELS
DEFAULT_MODEL = "Portimbria-150M"
def _max_tokens_cap(model_key: str) -> int:
    """Return the maximum number of new tokens allowed for a model display name.

    The match is a case-insensitive prefix test: names starting with
    ``portimbria`` or ``stentor3-`` get 4096, ``stentor2-`` gets 1024 and
    everything else gets 512.
    """
    key = model_key.lower()
    if key.startswith(("portimbria", "stentor3-")):
        return 4096
    return 1024 if key.startswith("stentor2-") else 512
# Default generation parameters
DEFAULT_TEMP = 0.8
DEFAULT_REP_PENALTY = 1.35
DEFAULT_TOP_P = 0.9
DEFAULT_MAX_TOKENS = 100
# Token cap that applies to the model selected at start-up.
INITIAL_MAX_TOKENS = _max_tokens_cap(DEFAULT_MODEL)
# Named sampling presets. Only temperature and top_p differ between them;
# max_tokens and repetition_penalty always use the defaults above.
PRESETS = {
    label: {
        "temperature": temperature,
        "top_p": top_p,
        "max_tokens": DEFAULT_MAX_TOKENS,
        "repetition_penalty": DEFAULT_REP_PENALTY,
    }
    for label, temperature, top_p in (
        ("🎨 Creative", 1.1, 0.95),
        ("⚖️ Balanced", 0.8, 0.9),
        ("🎯 Focused", 0.6, 0.9),
    )
}
# Caption advising which sampling preset to use.
# NOTE(review): the first two and last two fragments below are lone quote
# characters, so this expression does not parse as written; the surrounding
# HTML markup appears to have been stripped. Restore it from version control.
MODE_RECOMMENDATION_HTML = (
'
'
'Balanced or Focused modes are recommended; '
'Creative mode can get chaotic.'
'
'
)
# (icon, prompt text) pairs offered as example prompts.
EXAMPLE_PROMPTS = [
    ("📖", "Once upon a time in a world where"),
    ("📖", "The last explorer on Earth discovered"),
    ("🔬", "The theory of relativity states that"),
    ("🔬", "Scientists recently discovered that"),
    ("💻", "def quicksort(arr):\n "),
    ("💻", "class NeuralNetwork:\n def __init__"),
    ("🧠", "The most important thing about AI is"),
    ("🧠", "The philosophy of consciousness suggests"),
]
# Repo id -> (tokenizer, model); filled lazily by _get_model so each model loads once per process.
_model_cache: dict[str, tuple[Any, Any]] = {}
def _hf_auth_token():
    """Return the Hugging Face token from the environment.

    ``HF_TOKEN`` wins when it is set to a non-empty value; otherwise the value
    of ``HUGGING_FACE_HUB_TOKEN`` is returned as-is (``None`` when unset).
    """
    primary = os.getenv("HF_TOKEN")
    if primary:
        return primary
    return os.getenv("HUGGING_FACE_HUB_TOKEN")
# Attempt global login if token is available
_token = _hf_auth_token()
if _token:
    # Do not echo any part of the secret: logs may be retained or shared, and a
    # short token would be printed in full by a "last four characters" slice.
    print("[Stentor] Authentication token found.")
    if login:
        try:
            login(token=_token)
        except Exception as e:
            # Best-effort: loaders also pass the token explicitly via _hf_repo_kwargs.
            print(f"[Stentor] Global login failed: {e}")
else:
    print("[Stentor] WARNING: No HF_TOKEN found in secrets. Private/Gated models will fail to load.")
def _hf_repo_kwargs() -> dict[str, Any]:
    """Return ``{"token": <token>}`` when a token is configured, else an empty dict.

    The result is meant to be splatted into ``from_pretrained`` calls.
    """
    kwargs: dict[str, Any] = {}
    token = _hf_auth_token()
    if token:
        kwargs["token"] = token
    return kwargs
# ─────────────────────────────── MODEL LOGIC ──────────────────────────────────
def _load_tokenizer(model_id: str):
    """Load the tokenizer for ``model_id``, retrying once with relaxed settings.

    First attempt: repos whose id contains "stentor" or "portimbria"
    (case-insensitive) are loaded with ``trust_remote_code=True`` and the slow
    tokenizer; all other repos with remote code disabled and the fast tokenizer.
    If that fails, a second attempt is made with ``trust_remote_code=True`` and
    default settings.

    Raises:
        RuntimeError: when both attempts fail. The message contains both
            errors and the second error is attached as ``__cause__``.
    """
    # For Stentor models, trust_remote_code is mandatory for TokenMonster support
    lowered = model_id.lower()
    is_stentor = any(marker in lowered for marker in ("stentor", "portimbria"))
    kwargs = {"trust_remote_code": is_stentor, "use_fast": not is_stentor}
    try:
        return AutoTokenizer.from_pretrained(model_id, **_hf_repo_kwargs(), **kwargs)
    except Exception as first_err:
        # Fallback to absolute basics if that fails.
        # NOTE(review): this retry enables trust_remote_code for *any* repo,
        # including third-party ones, which allows code from that repo to run
        # in this process. Confirm this is intended for non-Stentor models.
        try:
            return AutoTokenizer.from_pretrained(model_id, **_hf_repo_kwargs(), trust_remote_code=True)
        except Exception as second_err:
            # Chain explicitly so the traceback keeps the underlying failure.
            raise RuntimeError(f"Tokenizer fail: {first_err} -> {second_err}") from second_err
def _load_model(model_id: str):
    """Load the causal-LM weights for ``model_id`` and place them on the best device.

    Dtype selection: bfloat16 on CUDA when supported, float16 on other CUDA
    devices, float32 on CPU. If loading with that dtype fails, the load is
    retried in float32 with ``trust_remote_code=True``. When CUDA is available
    the loaded model is moved to the GPU; previously a half-precision model was
    left on the CPU, which is the slow path the CPU branch explicitly avoids.

    Any exception raised by the retry (or by the device move) propagates.
    """
    lowered = model_id.lower()
    is_stentor = any(marker in lowered for marker in ("stentor", "portimbria"))
    on_cuda = torch.cuda.is_available()
    # Determine optimal dtype for loading
    if on_cuda:
        if torch.cuda.is_bf16_supported():
            dtype = torch.bfloat16
            print(f"[Stentor] Using bfloat16 for {model_id} on CUDA.")
        else:
            dtype = torch.float16
            print(f"[Stentor] Using float16 for {model_id} on CUDA.")
    else:
        # On CPU, float16 is extremely slow due to software emulation.
        # float32 is the native fast path for CPU inference.
        # Even for 150M models, memory usage in float32 is only ~600MB.
        dtype = torch.float32
        print(f"[Stentor] Using float32 for {model_id} on CPU for maximum speed.")
    try:
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            dtype=dtype,
            low_cpu_mem_usage=True,
            trust_remote_code=is_stentor,
            **_hf_repo_kwargs(),
        )
    except Exception as e:
        print(f"[Stentor] Failed to load {model_id} with {dtype}. Retrying with float32 and trust_remote_code=True. Error: {e}")
        # Fallback to float32 and forced trust_remote_code if initial attempt fails.
        # NOTE(review): this enables remote code for any repo, including
        # third-party ones; confirm that is acceptable.
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            dtype=torch.float32,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
            **_hf_repo_kwargs(),
        )
    if on_cuda:
        # from_pretrained without a device_map leaves weights on the CPU; the
        # CUDA dtypes chosen above only pay off once the model is on the GPU.
        model = model.to("cuda")
    return model
def _get_model(model_id: str):
    """Return the ``(tokenizer, model)`` pair for ``model_id``, loading it on first use.

    Loaded pairs are stored in ``_model_cache`` and reused. After loading, the
    model is switched to eval mode and a one-token warmup generation is
    attempted; a warmup failure is logged and otherwise ignored.
    Exceptions from the tokenizer/model loaders propagate to the caller.
    """
    cached = _model_cache.get(model_id)
    if cached is not None:
        return cached
    print(f"[Stentor] Loading {model_id}...")
    tok = _load_tokenizer(model_id)
    mdl = _load_model(model_id)
    mdl.eval()
    # torch.compile is deliberately not used: it was removed to prevent
    # initialization timeouts/errors on CPU spaces.
    # Warmup pass so first-call overhead is paid here, not on the first request.
    print(f"[Stentor] Warming up {model_id}...")
    try:
        # Build the warmup batch exactly like real requests do (drops
        # token_type_ids, moves tensors to the model's device) so the warmup
        # cannot fail for reasons that real requests would not hit.
        dummy = _to_model_device(_prep_inputs(tok, "Hello"), mdl)
        with torch.no_grad():
            mdl.generate(**dummy, max_new_tokens=1, pad_token_id=tok.eos_token_id)
        print(f"[Stentor] {model_id} warmup complete")
    except Exception as e:
        print(f"[Stentor] {model_id} warmup skipped ({e})")
    _model_cache[model_id] = (tok, mdl)
    print(f"[Stentor] {model_id} ready.")
    return tok, mdl
def _prep_inputs(tokenizer, prompt):
    """Tokenize ``prompt`` into PyTorch tensors and drop ``token_type_ids`` if present."""
    encoded = tokenizer(prompt, return_tensors="pt")
    # Removed unconditionally; a missing key is not an error.
    encoded.pop("token_type_ids", None)
    return encoded
def _to_model_device(inputs: dict[str, Any], model):
    """Move every movable value in ``inputs`` to the device of ``model``'s first parameter.

    Values without a ``.to`` method are left untouched. When ``input_ids`` is
    present but ``attention_mask`` is not, an all-ones mask of the same shape
    is added. ``inputs`` is modified in place and also returned.
    """
    target = next(model.parameters()).device
    movable = [name for name, value in inputs.items() if hasattr(value, "to")]
    for name in movable:
        inputs[name] = inputs[name].to(target)
    if "input_ids" in inputs and "attention_mask" not in inputs:
        inputs["attention_mask"] = torch.ones_like(inputs["input_ids"])
    return inputs
def _decode_response_only(tokenizer, full_ids, input_ids):
    """Decode only the part of the first sequence in ``full_ids`` that follows the prompt.

    The prompt length is taken from ``input_ids.shape[1]``; special tokens are
    skipped during decoding.
    """
    skip = input_ids.shape[1]
    continuation = full_ids[0][skip:]
    return tokenizer.decode(continuation, skip_special_tokens=True)
def _generate_and_stream(repo_id, prompt, max_tokens, temperature, top_p, repetition_penalty):
    """Generate text from a model, yielding the accumulated output as it grows.

    Generation runs in a background thread that feeds a
    ``TextIteratorStreamer``; this generator consumes the streamer and yields
    the full text produced so far after every chunk. Sampling is used when
    ``temperature`` is at least 0.05, greedy decoding otherwise.

    If generation raises, a final ``"❌ <message>"`` string is yielded.

    Side effects on the shared ``interrupt_callback``: the stop flag is
    cleared at the start. It is set again only when the stream is abandoned
    early (interrupted, or this generator is closed), so the worker stops.
    After a normal finish the flag is left clear, so callers that test the
    flag after each item still receive the trailing error message.
    """
    tokenizer, model = _get_model(repo_id)
    interrupt_callback.stop_signal = False
    inputs = _to_model_device(_prep_inputs(tokenizer, prompt), model)
    # IMPORTANT: Removed skip_special_tokens=True due to TokenMonster incompatibility
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)
    error_container = [None]

    def worker_fn():
        try:
            with torch.inference_mode():
                model.generate(
                    **inputs,
                    max_new_tokens=int(max_tokens),
                    temperature=float(temperature),
                    top_p=float(top_p),
                    repetition_penalty=float(repetition_penalty),
                    do_sample=float(temperature) >= 0.05,
                    pad_token_id=tokenizer.eos_token_id,
                    stopping_criteria=StoppingCriteriaList([interrupt_callback]),
                    streamer=streamer,
                )
        except Exception as e:
            # Record the error first, then close the stream: generate() only
            # ends the streamer on success, so without this the consumer loop
            # below would block forever on the streamer's queue.
            error_container[0] = str(e) or repr(e)
            streamer.on_finalized_text("", stream_end=True)

    worker = Thread(target=worker_fn, daemon=True)
    worker.start()
    full_text = ""
    # Manually filter common special tokens if they appear (TokenMonster compatibility)
    special_tokens_to_remove = [
        tok
        for tok in (tokenizer.eos_token, tokenizer.pad_token, tokenizer.bos_token, tokenizer.unk_token)
        if tok is not None
    ]
    finished = False
    try:
        for chunk in streamer:
            if interrupt_callback.stop_signal:
                break
            for st in special_tokens_to_remove:
                chunk = chunk.replace(st, "")
            full_text += chunk
            yield full_text
        else:
            # Stream drained without an interrupt: the worker is done (or failed).
            finished = True
    finally:
        if not finished:
            # Interrupted or generator closed early: tell the worker to stop.
            interrupt_callback.stop_signal = True
        worker.join(timeout=1.0)
    if error_container[0]:
        yield f"❌ {error_container[0]}"
def parallel_config_generate(prompt, configs):
    """Run one generation per config, sequentially, streaming the results.

    Args:
        prompt: Prompt text; surrounding whitespace is stripped. An empty
            prompt ends the generator without yielding.
        configs: Sequence of dicts with keys ``model_key``, ``max_tokens``,
            ``temperature``, ``top_p`` and ``repetition_penalty``.

    Yields:
        ``(results, elapsed_seconds)`` after every streamed chunk, where
        ``results`` is the same list object each time, holding one string per
        config (``""`` until that config starts producing text). A failed
        config gets ``"❌ <message>"`` in its slot and the loop moves on to the
        next config.

    The shared ``interrupt_callback`` flag is cleared before each run; when it
    is set during a run, the remaining configs are skipped.
    """
    text = (prompt or "").strip()
    if not text:
        return
    results = [""] * len(configs)
    t0 = time.perf_counter()
    interrupt_callback.stop_signal = False
    for i, cfg in enumerate(configs):
        if interrupt_callback.stop_signal:
            break
        m_key = cfg["model_key"]
        error_container = [None]
        try:
            interrupt_callback.stop_signal = False
            tokenizer, model = _get_model(ALL_MODELS[m_key])
            # Never exceed the per-model cap, whatever the UI sent.
            actual_max = min(int(cfg["max_tokens"]), _max_tokens_cap(m_key))
            inputs = _to_model_device(_prep_inputs(tokenizer, text), model)
            # Unified with _generate_and_stream for TokenMonster compatibility
            streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)

            def worker_fn():
                try:
                    with torch.inference_mode():
                        # Ensure diverse output for sequential same-model runs (Tab 1 multi-response)
                        torch.manual_seed(random.randint(0, 2**31 - 1))
                        model.generate(
                            **inputs,
                            max_new_tokens=actual_max,
                            temperature=float(cfg["temperature"]),
                            top_p=float(cfg["top_p"]),
                            repetition_penalty=float(cfg["repetition_penalty"]),
                            do_sample=float(cfg["temperature"]) >= 0.05,
                            pad_token_id=tokenizer.eos_token_id,
                            stopping_criteria=StoppingCriteriaList([interrupt_callback]),
                            streamer=streamer,
                        )
                except Exception as e:
                    # Record the error first, then close the stream:
                    # generate() only ends the streamer on success, so without
                    # this the consumer loop below would block forever.
                    error_container[0] = str(e) or repr(e)
                    streamer.on_finalized_text("", stream_end=True)

            worker = Thread(target=worker_fn, daemon=True)
            worker.start()
            # Filter common special tokens manually for TokenMonster compatibility
            special_tokens_to_remove = [
                tok
                for tok in (tokenizer.eos_token, tokenizer.pad_token, tokenizer.bos_token, tokenizer.unk_token)
                if tok is not None
            ]
            try:
                for chunk in streamer:
                    if interrupt_callback.stop_signal:
                        break
                    for st in special_tokens_to_remove:
                        chunk = chunk.replace(st, "")
                    results[i] += chunk
                    yield results, time.perf_counter() - t0
            finally:
                worker.join(timeout=1.0)
            if error_container[0]:
                # A generation error replaces whatever partial text was streamed.
                results[i] = f"❌ {error_container[0]}"
                yield results, time.perf_counter() - t0
            if interrupt_callback.stop_signal:
                break
        except Exception as e:
            # Setup failures (model load, tokenization) are reported in the slot.
            results[i] = f"❌ {str(e)}"
            yield results, time.perf_counter() - t0
# ── Chat generation ─────────────────────────────────────────────────────────────
def chat_generate(messages_state, user_message, model_key, max_tok, temp, tp, rep_penalty):
    """Append ``user_message`` to the chat, generate a reply and stream the rendered chat.

    Yields ``(chat_html, messages_state)`` pairs. An empty ``user_message``
    yields ``("", messages_state)`` once and stops. ``messages_state`` is
    appended to in place (a fresh list is used when it is ``None``).
    """
    if messages_state is None:
        messages_state = []
    if not user_message:
        yield "", messages_state
        return
    messages_state.append({"role": "user", "content": user_message})
    # Flatten the whole history into a single prompt string.
    # NOTE(review): the two f-strings below look damaged (a closing tag that
    # lost its "</", and an empty f"" where an assistant marker would be
    # expected). Confirm the intended prompt template against version control.
    conversation = ""
    for msg in messages_state:
        role = msg.get("role", "user")
        content = msg.get("content", "")
        conversation += f"<{role}>{content}{role}>\n"
    conversation += f""
    # A single-element config list reuses the multi-config streaming path.
    config = [{"model_key": model_key, "max_tokens": max_tok, "temperature": temp, "top_p": tp, "repetition_penalty": rep_penalty}]
    assistant_response = ""
    for results, _ in parallel_config_generate(conversation, config):
        assistant_response = results[0]
        # Render with a provisional assistant turn; the state is only updated once streaming ends.
        temp_messages = messages_state + [{"role": "assistant", "content": assistant_response}]
        yield _render_chat_html(temp_messages, model_key), messages_state
    messages_state.append({"role": "assistant", "content": assistant_response.strip()})
    yield _render_chat_html(messages_state, model_key), messages_state
def _render_chat_html(messages, model_key):
    """Build one HTML string from ``messages`` by concatenating one fragment per message.

    NOTE(review): both branches currently append an empty f-string, so the
    result is always ``""`` and ``content`` / ``model_key`` are unused. The
    markup appears to have been stripped from the literals; restore it from
    version control and make sure ``content`` is HTML-escaped when it is
    interpolated.
    """
    html_parts = []
    for msg in messages:
        role = msg.get("role", "user")
        content = msg.get("content", "")
        if role == "user":
            html_parts.append(
                f''
            )
        else:
            # Every non-"user" role takes this branch.
            html_parts.append(
                f''
            )
    return "".join(html_parts)
def chat_clear():
    """Clear chat messages.

    Returns ``([], placeholder_html, "")``.

    NOTE(review): the ``placeholder`` literal below is split across two lines
    and does not parse as written; its HTML markup appears to have been
    stripped. Restore it from version control.
    """
    placeholder = 'Start a conversation by typing a message below.
'
    return [], placeholder, ""
def apply_standard_preset(name: str):
    """Return ``(max_tokens, temperature, top_p, repetition_penalty)`` for the preset ``name``.

    Raises ``KeyError`` when ``name`` is not a key of ``PRESETS``.
    """
    preset = PRESETS[name]
    ordered_fields = ("max_tokens", "temperature", "top_p", "repetition_penalty")
    return tuple(preset[field] for field in ordered_fields)
def apply_sweep_preset(name: str):
    """Return ``(max_tokens, top_p, repetition_penalty)`` for the preset ``name``.

    Same as ``apply_standard_preset`` minus the temperature.
    Raises ``KeyError`` when ``name`` is not a key of ``PRESETS``.
    """
    preset = PRESETS[name]
    ordered_fields = ("max_tokens", "top_p", "repetition_penalty")
    return tuple(preset[field] for field in ordered_fields)
# ── Token Explorer (sampling + logprobs) ───────────────────────────────────────
def run_token_explorer(prompt, model_key, max_tokens, temperature, top_p, repetition_penalty):
    """Sample a continuation and return ``(html, status_text)`` with per-token probabilities.

    For every generated token the function records its decoded text, its
    probability under the softmax of that step's returned scores, and the 8
    most probable alternatives. An unknown ``model_key`` falls back to
    ``DEFAULT_MODEL`` for loading; ``max_tokens`` is clamped to the per-model cap.

    NOTE(review): the two error ``return`` literals are split across lines and
    do not parse as written (their HTML markup appears to have been stripped).
    Restore them from version control. The local name ``html`` near the end
    also shadows the imported ``html`` module inside this function.
    """
    text = (prompt or "").strip()
    if not text:
        return _explorer_placeholder(), "Enter a prompt."
    interrupt_callback.stop_signal = False
    try:
        tokenizer, model = _get_model(ALL_MODELS.get(model_key, ALL_MODELS[DEFAULT_MODEL]))
    except Exception as e:
        return f'Error loading model: {e}
', "Error"
    max_tokens = min(int(max_tokens), _max_tokens_cap(model_key))
    inputs = _to_model_device(_prep_inputs(tokenizer, text), model)
    try:
        with torch.inference_mode():
            # output_scores + return_dict_in_generate expose the per-step scores used below.
            outputs = model.generate(
                **inputs,
                max_new_tokens=int(max_tokens),
                do_sample=True,
                temperature=float(temperature),
                top_p=float(top_p),
                repetition_penalty=float(repetition_penalty),
                output_scores=True,
                return_dict_in_generate=True,
                pad_token_id=tokenizer.eos_token_id,
                stopping_criteria=StoppingCriteriaList([interrupt_callback]),
            )
    except Exception as e:
        return f'Generation error: {e}
', "Error"
    # Drop the prompt tokens so scores and generated ids line up one-to-one.
    input_len = inputs["input_ids"].shape[1]
    generated_ids = outputs.sequences[0][input_len:]
    token_data = []
    for score_t, token_id in zip(outputs.scores, generated_ids):
        # score_t[0]: scores of the first (only) sequence in the batch at this step.
        probs = F.softmax(score_t[0], dim=-1)
        top_k = torch.topk(probs, 8)
        token_data.append({
            "token": tokenizer.decode([token_id.item()]),
            "prob": probs[token_id].item(),
            "alternatives": [
                {"token": tokenizer.decode([idx.item()]), "prob": p.item()}
                for idx, p in zip(top_k.indices, top_k.values)
            ],
        })
    html = _build_token_html(text, token_data)
    return html, f"✓ {len(token_data)} tokens · sampled · {model_key}"
def _tok_style(p):
    """Map a token probability to a ``(text colour, background, border)`` triple.

    Bands, checked from the top: >= 0.80 green, >= 0.50 yellow, >= 0.35 orange,
    anything else red.
    """
    bands = (
        (0.80, ("#10b981", "rgba(16,185,129,0.18)", "rgba(16,185,129,0.40)")),
        (0.50, ("#eab308", "rgba(234,179,8,0.18)", "rgba(234,179,8,0.40)")),
        (0.35, ("#f97316", "rgba(249,115,22,0.14)", "rgba(249,115,22,0.40)")),
    )
    for floor, palette in bands:
        if p >= floor:
            return palette
    return ("#b91c1c", "rgba(185,28,28,0.12)", "rgba(185,28,28,0.40)")
def _build_token_html(prompt_text, token_data):
    """Build the token-confidence view: a legend followed by the prompt and one span per token.

    ``token_data`` is a list of dicts with keys ``token``, ``prob`` and
    ``alternatives`` (each alternative has ``token`` and ``prob``). The legend
    reports the average probability and how many tokens fall into each band
    (>= 0.80, 0.50-0.80, 0.35-0.50, < 0.35); up to 6 alternatives per token go
    into the tooltip text.

    NOTE(review): most string literals in this function are split across lines
    or reduced to empty strings, so the block does not parse as written; the
    HTML markup appears to have been stripped. Restore it from version control.
    """
    if not token_data:
        return 'No tokens generated.
'
    # Summary statistics shown in the legend.
    avg_p = sum(td["prob"] for td in token_data) / len(token_data)
    high = sum(1 for td in token_data if td["prob"] >= 0.80)
    med = sum(1 for td in token_data if 0.50 <= td["prob"] < 0.80)
    unsure = sum(1 for td in token_data if 0.35 <= td["prob"] < 0.50)
    low = sum(1 for td in token_data if td["prob"] < 0.35)
    spans = []
    for td in token_data:
        raw = td["token"]
        p = td["prob"]
        pct = int(p * 100)
        col, bg, brd = _tok_style(p)
        # Make whitespace-only tokens visible: newlines become "↵", blanks become "·".
        disp = html.escape(raw).replace("\n", "↵")
        if not disp.strip():
            disp = "·"
        alts = " | ".join(
            f'{html.escape(a["token"].strip() or "·")} {a["prob"]*100:.0f}%'
            for a in td["alternatives"][:6]
        )
        tip = html.escape(f"Token: {raw.strip() or repr(raw)} ({pct}%)\nAlternatives: {alts}").replace("\n", "
")
        spans.append(
            f''
            f'{disp}{pct}%'
            f''
        )
    prompt_span = (
        f''
        f'{html.escape(prompt_text)}'
        f''
    )
    token_block = (
        f''
        + prompt_span + "".join(spans)
        + "
"
    )
    legend = (
f''
f'
'
f'Avg confidence: '
f'{int(avg_p*100)}%'
f'· Hover any token to see top alternatives'
f'
'
f'
'
f'
'
f'
'
f'
≥80% confident · {high} tokens'
f'
'
f'
'
f'
50–79% moderate · {med} tokens'
f'
'
f'
'
f'
35–49% uncertain · {unsure} tokens'
f'
'
f'
'
f'
<35% low · {low} tokens'
f'
'
    )
    return legend + token_block
def _explorer_placeholder():
    """Return the static HTML shown in the token explorer before anything has been generated.

    NOTE(review): the literal below is split across lines by lone quote
    characters and does not parse as written; its HTML markup appears to have
    been stripped. Restore it from version control.
    """
    return (
''
'
TOKEN EXPLORER
'
'
'
'Enter a prompt and click Explore to see per-token confidence heatmap
'
'
'
    )
# ── Temperature Sweep ──────────────────────────────────────────────────────────
def run_temp_sweep_streamed(prompt, model_key, max_tok, tp, rep_penalty, count, *temps):
    """Generate the same prompt at several temperatures and stream five output slots.

    Only the first ``count`` entries of ``temps`` are used. Every yielded value
    is a 5-tuple of strings; slots beyond ``count`` are ``""``. An empty prompt,
    or temperatures that collide after rounding to two decimals, yield a single
    5-tuple carrying the same message in every slot.
    """
    slots = 5
    active_count = int(count)
    if not (prompt or "").strip():
        yield ("Enter a prompt.",) * slots
        return
    active_temps = [float(value) for value in temps[:active_count]]
    # Validate: No duplicates (compared at two-decimal precision).
    if len({round(value, 2) for value in active_temps}) != len(active_temps):
        yield ("⚠️ Duplicate temperatures detected. Please make each temperature unique.",) * slots
        return
    configs = [
        {
            "model_key": model_key,
            "max_tokens": max_tok,
            "temperature": value,
            "top_p": tp,
            "repetition_penalty": rep_penalty,
        }
        for value in active_temps
    ]
    for results, _ in parallel_config_generate(prompt, configs):
        yield tuple(results[slot] if slot < active_count else "" for slot in range(slots))
# ── History helpers ────────────────────────────────────────────────────────────
def add_to_history(history, prompt, output, model_key, temperature):
    """Return a new history list with this run prepended, capped at 10 entries.

    Nothing is recorded when ``prompt`` or ``output`` is empty; in that case
    the existing history (or ``[]``) is returned. The stored prompt is cut to
    55 characters plus an ellipsis, the stripped output to 100 characters, and
    the temperature is rounded to one decimal. The timestamp is the current
    local time as ``HH:MM``.
    """
    previous = history or []
    if not output or not prompt:
        return previous
    shortened = prompt[:55]
    if len(prompt) > 55:
        shortened += "…"
    record = {
        "prompt": shortened,
        "output": output.strip()[:100],
        "model": model_key,
        "temp": round(float(temperature), 1),
        "time": datetime.now().strftime("%H:%M"),
    }
    return [record, *previous][:10]
def build_history_html(history):
    """Render the run history (entries as produced by ``add_to_history``) as HTML.

    Row ``i`` gets a ``fade`` value of ``max(0.35, 1.0 - i * 0.07)``.

    NOTE(review): most literals below are split across lines or reduced to
    empty strings, so the block does not parse as written; the HTML markup
    appears to have been stripped (``fade`` is consequently unused). Restore it
    from version control. Also note that ``e["prompt"]``, ``e["output"]`` and
    ``e["model"]`` are interpolated without ``html.escape`` — confirm whether
    they need escaping.
    """
    if not history:
        return (
            ''
        )
    rows = []
    for i, e in enumerate(history):
        fade = max(0.35, 1.0 - i * 0.07)
        rows.append(
f''
f'
'
f'{e["time"]} · {e["model"]} · t={e["temp"]}
'
f'
'
f'{e["prompt"]}
'
f'
'
f'{e["output"] or "…"}
'
f'
'
        )
    return (
''
'
'
'HISTORY'
f'{len(history)} runs'
'
'
+ "".join(rows)
+ '
'
    )
# ── Stats HTML ─────────────────────────────────────────────────────────────────
def build_stats_html(tokens, elapsed, tps):
    """Render three stat "pills" labelled Tokens, Time and Speed from the given values.

    NOTE(review): the literals below are split across lines or reduced to
    empty strings, so the block does not parse as written; the HTML markup
    appears to have been stripped. Restore it from version control.
    """
    def pill(val, lbl):
        # One value/label pair.
        return (
f''
f'{val}'
f'{lbl}'
f'
'
        )
    return (
f''
+ pill(tokens, "Tokens") + pill(elapsed, "Time") + pill(tps, "Speed")
+ '
'
    )
def _on_generate_model_change(model_key: str, current_max_tokens: float):
    """Return ``(model card, max-tokens update)`` for a newly selected model.

    The update sets the maximum to the model's token cap and the value to the
    current setting raised to at least ``DEFAULT_MAX_TOKENS`` and lowered to at
    most the cap. Models without a ``MODEL_CARDS`` entry get ``""`` as the card.
    """
    cap = _max_tokens_cap(model_key)
    at_least_default = max(DEFAULT_MAX_TOKENS, current_max_tokens)
    clamped = int(min(at_least_default, cap))
    card = MODEL_CARDS.get(model_key, "")
    return card, gr.update(maximum=cap, value=clamped)
def _on_model_cap_change(model_key: str, current_max_tokens: float, min_value: int = 5):
    """Return a max-tokens update for a newly selected model.

    The update sets the maximum to the model's token cap and the value to the
    current setting raised to at least ``min_value`` and lowered to at most
    the cap.
    """
    cap = _max_tokens_cap(model_key)
    at_least_min = max(min_value, current_max_tokens)
    clamped = int(min(at_least_min, cap))
    return gr.update(maximum=cap, value=clamped)
# ── Arena ─────────────────────────────────────────────────────────────────────
def arena_generate(prompt, mode, model1_key, model2_key, max_tok, temp, tp, rep_penalty, fair_match, internal_pair):
    """Run two models side by side and stream the results.

    Modes:
        "👀 Show": the two selected models are used and their names are shown.
        "🎲 Random": the left model is a random first-party model; the right
            one is drawn from the arena models (plus other first-party models
            when ``internal_pair`` is set). With ``fair_match`` the draw is
            restricted to opponents listed for the left model's size tier.
        "🙈 Blind": the selected models are used, except that an
            external-vs-external pairing has its left model replaced by a
            random first-party model.
    Outside "👀 Show" the two sides are swapped with probability 0.5 and shown
    as "Model A" / "Model B".

    Yields 8-tuples: (left output update, right output update, left status,
    right status, vote-panel visibility update, second visibility update,
    ``(m1, m2)`` or ``None``, ``""``). The models run one after the other.
    """
    text = (prompt or "").strip()
    show_vote = (mode != "👀 Show")
    if not text:
        yield gr.update(value="", label="Left Model Output"), gr.update(value="", label="Right Model Output"), "Enter a prompt.", "Enter a prompt.", gr.update(visible=show_vote), gr.update(visible=True), None, ""
        return
    interrupt_callback.stop_signal = False
    m1 = model1_key
    m2 = model2_key
    if mode == "🎲 Random":
        m1 = random.choice(list(STENTOR_MODELS.keys()))
        potential_m2 = list(ARENA_MODELS.keys())
        if internal_pair:
            potential_m2 += [k for k in STENTOR_MODELS if k != m1]
        if fair_match:
            # External opponents considered a fair match for each size tier.
            fair_external_allowlist = {
                "12M": ["Pythia-14M"],
                "20M": ["Pythia-14M", "Pythia-31M"],
                "30M": ["Pythia-31M"],
                "50M": ["Pythia-31M", "Pythia-70M", "NanoWhale-100M-Base"],
                "150M": ["gpt2 small", "SmolLM2-135M", "NanoWhale-100M-Base", "Pythia-160M", "OPT-125M", "GPT-Neo 125M"],
            }
            # The size must match a whole hyphen-separated part of the name.
            # A substring test is wrong here: "50M" is contained in "150M", which
            # used to put Portimbria-150M into the 50M tier.
            m1_size = next((size for size in fair_external_allowlist if size in m1.split("-")), None)
            if m1_size is None:
                # Fallback safety for names without a recognised size part.
                allowed_m2 = [k for k in potential_m2 if k != m1]
            else:
                allowed_m2 = list(fair_external_allowlist[m1_size])
                if internal_pair:
                    # Same-tier first-party models may also be drawn.
                    allowed_m2 += [k for k in STENTOR_MODELS if k != m1 and m1_size in k.split("-")]
            m2 = random.choice(allowed_m2 if allowed_m2 else potential_m2)
        else:
            m2 = random.choice(potential_m2)
    elif mode == "🙈 Blind":
        # Rule: No External vs External. If user picks two, force left to a Stentor model.
        if m1 in ARENA_MODELS and m2 in ARENA_MODELS:
            m1 = random.choice(list(STENTOR_MODELS.keys()))
    # Swap randomly in Blind/Random so "Left" isn't always the same type of model
    is_swapped = False
    if mode != "👀 Show" and random.random() > 0.5:
        m1, m2 = m2, m1
        is_swapped = True
    display_name1 = m1 if mode == "👀 Show" else "Model A"
    display_name2 = m2 if mode == "👀 Show" else "Model B"
    label1 = f"{display_name1} Output"
    label2 = f"{display_name2} Output"
    # Initial thinking update to clear UI immediately
    yield gr.update(value="", label=label1), gr.update(value="", label=label2), f"⚡ {display_name1} is thinking…", f"Waiting for {display_name2}…", gr.update(visible=show_vote), gr.update(visible=False), (m1, m2), ""
    # Run model 1
    output1 = ""
    for partial in _generate_and_stream(ALL_MODELS[m1], text, max_tok, temp, tp, rep_penalty):
        if interrupt_callback.stop_signal:
            break
        output1 = partial
        yield gr.update(value=output1, label=label1), gr.update(value="", label=label2), f"⚡ {display_name1} is thinking…", f"Waiting for {display_name2}…", gr.update(visible=show_vote), gr.update(visible=False), (m1, m2), ""
    # Run model 2
    output2 = ""
    start2 = time.time()
    for partial in _generate_and_stream(ALL_MODELS[m2], text, max_tok, temp, tp, rep_penalty):
        if interrupt_callback.stop_signal:
            break
        output2 = partial
        elapsed = time.time() - start2
        yield gr.update(value=output1, label=label1), gr.update(value=output2, label=label2), f"✓ {display_name1} Finished", f"⚡ {display_name2} is thinking… ({elapsed:.1f}s)", gr.update(visible=show_vote), gr.update(visible=False), (m1, m2), ""
    yield gr.update(value=output1, label=label1), gr.update(value=output2, label=label2), f"✓ {display_name1} Finished", f"✓ {display_name2} Finished", gr.update(visible=show_vote), gr.update(visible=(mode == "👀 Show")), (m1, m2), ""
def arena_setup(mode):
    """Return the 8-tuple that resets the arena outputs for ``mode``.

    Both output boxes are emptied and relabelled, both statuses read
    "Waiting...", and the vote panel is visible in every mode except "👀 Show".
    """
    voting_enabled = mode != "👀 Show"
    left_box = gr.update(value="", label="Left Model Output")
    right_box = gr.update(value="", label="Right Model Output")
    vote_panel = gr.update(visible=voting_enabled)
    hidden = gr.update(visible=False)
    return left_box, right_box, "Waiting...", "Waiting...", vote_panel, hidden, None, ""
def arena_vote(vote_type, identities):
    """Return ``(markdown message, visibility update)`` for a cast vote.

    ``identities`` is the ``(left, right)`` model pair of the last battle.
    When it is empty or ``None`` the message asks the user to run a battle
    first. The visibility update is ``visible=False`` in both cases.
    """
    if not identities:
        return "Please run a battle first.", gr.update(visible=False)
    left_model, right_model = identities
    sections = (
        f"### Decision Recorded! \n\n**Winner:** {vote_type}\n\n",
        f"**Left was:** `{left_model}`\n**Right was:** `{right_model}`",
    )
    return "".join(sections), gr.update(visible=False)
# ─────────────────────────────────── CSS ──────────────────────────────────────
CSS = """
@import url('https://fonts.googleapis.com/css2?family=Bebas+Neue&family=Space+Mono:wght@400;700&family=Sora:wght@300;400;500;600;700&display=swap');
.gradio-container, .gradio-container * { box-sizing: border-box !important; }
.gradio-container {
background: #04060e !important;
max-width: 1280px !important;
margin: 0 auto !important;
padding: 0 !important;
font-family: 'Sora', sans-serif !important;
color: #ffffff !important;
}
body,
.gradio-container > div,
.gradio-container .contain,
.gradio-container .wrap,
.gradio-container section,
.gradio-container .tabs,
.gradio-container .tabitem,
.gradio-container > div > div,
.gradio-container .block {
background: #04060e !important;
border-color: #1a2744 !important;
}
.gradio-container .block {
box-shadow: none !important;
border-radius: 0 !important;
padding: 0 !important;
border: none !important;
}
footer { display: none !important; }
.gradio-container p,
.gradio-container span,
.gradio-container div,
.gradio-container li,
.gradio-container td,
.gradio-container th {
color: #ffffff !important;
font-family: 'Sora', sans-serif !important;
}
.gradio-container label,
.gradio-container label span,
.gradio-container .label-wrap span {
font-family: 'Sora', sans-serif !important;
font-size: 11px !important;
font-weight: 600 !important;
letter-spacing: 0.1em !important;
text-transform: uppercase !important;
color: #374151 !important;
}
.gradio-container textarea,
.gradio-container input[type="text"],
.gradio-container input[type="number"] {
font-family: 'Sora', sans-serif !important;
background: #0d1829 !important;
border: 1px solid #1a2744 !important;
color: #ffffff !important;
border-radius: 8px !important;
font-size: 14px !important;
}
.gradio-container textarea:focus,
.gradio-container input:focus {
border-color: #78490a !important;
box-shadow: 0 0 0 3px rgba(245,158,11,0.07) !important;
outline: none !important;
}
#prompt-box textarea {
font-size: 15px !important;
line-height: 1.75 !important;
min-height: 120px !important;
}
#output-box textarea {
font-family: 'Space Mono', monospace !important;
font-size: 13px !important;
line-height: 1.85 !important;
color: #ffffff !important;
background: #060a14 !important;
border-color: #1a2744 !important;
}
/* Stop fading during generation updates */
.gradio-container textarea { transition: none !important; opacity: 1 !important; }
.status-bar textarea {
font-family: 'Space Mono', monospace !important;
font-size: 12px !important;
color: #374151 !important;
background: #0a0f1e !important;
border-color: #1a2744 !important;
padding: 6px 10px !important;
}
.gradio-container input[type="range"] { accent-color: #f59e0b !important; }
.gradio-container input[type="number"] {
background: #0d1829 !important;
color: #e2e8f0 !important;
border: 1px solid #1a2744 !important;
font-family: 'Space Mono', monospace !important;
font-size: 13px !important;
width: 64px !important;
}
.gradio-container [role="tablist"] {
background: #04060e !important;
border-bottom: 1px solid #1a2744 !important;
padding: 0 36px !important;
gap: 0 !important;
}
.gradio-container [role="tab"] {
font-family: 'Sora', sans-serif !important;
font-size: 13px !important;
font-weight: 500 !important;
color: #374151 !important;
background: transparent !important;
border: none !important;
border-bottom: 2px solid transparent !important;
border-radius: 0 !important;
padding: 14px 20px !important;
letter-spacing: 0.03em !important;
transition: color 0.15s !important;
}
.gradio-container [role="tab"]:hover { color: #94a3b8 !important; background: transparent !important; }
.gradio-container [role="tab"][aria-selected="true"],
.gradio-container [role="tab"].selected {
color: #f59e0b !important;
border-bottom: 2px solid #f59e0b !important;
background: transparent !important;
}
.gradio-container [role="tabpanel"],
.gradio-container .tabitem {
background: #04060e !important;
padding: 28px 36px !important;
border: none !important;
}
.gradio-container fieldset {
background: transparent !important;
border: none !important;
padding: 0 !important;
gap: 6px !important;
}
.gradio-container fieldset label {
background: #0d1829 !important;
border: 1px solid #1a2744 !important;
border-radius: 8px !important;
padding: 8px 14px !important;
cursor: pointer !important;
color: #64748b !important;
font-size: 13px !important;
font-weight: 500 !important;
text-transform: none !important;
letter-spacing: 0 !important;
transition: all 0.15s !important;
}
.gradio-container fieldset label:has(input:checked) {
background: rgba(245,158,11,0.1) !important;
border-color: #f59e0b !important;
color: #f59e0b !important;
}
.gradio-container button {
font-family: 'Sora', sans-serif !important;
cursor: pointer !important;
transition: all 0.18s !important;
border-radius: 8px !important;
}
.gradio-container button.primary,
.gradio-container button[variant="primary"] {
background: #f59e0b !important;
color: #07090f !important;
border: none !important;
font-size: 13px !important;
font-weight: 700 !important;
letter-spacing: 0.07em !important;
text-transform: uppercase !important;
padding: 11px 22px !important;
position: relative !important;
overflow: hidden !important;
}
.gradio-container button.primary::after {
content: '' !important;
position: absolute !important;
inset: 0 !important;
background: linear-gradient(120deg, transparent 30%, rgba(255,255,255,0.15) 50%, transparent 70%) !important;
transform: translateX(-100%) !important;
transition: transform 0.4s !important;
}
.gradio-container button.primary:hover::after { transform: translateX(100%) !important; }
.gradio-container button.primary:hover {
background: #fbbf24 !important;
box-shadow: 0 0 28px rgba(245,158,11,0.4) !important;
transform: translateY(-1px) !important;
}
.gradio-container button.secondary,
.gradio-container button[variant="secondary"] {
background: #0d1829 !important;
color: #64748b !important;
border: 1px solid #1a2744 !important;
font-size: 13px !important;
font-weight: 500 !important;
padding: 10px 18px !important;
}
.gradio-container button.secondary:hover {
background: #111d30 !important;
color: #e2e8f0 !important;
border-color: #2a3f60 !important;
}
.prompt-chip {
background: transparent !important;
border: 1px solid #1a2744 !important;
color: #374151 !important;
font-size: 11px !important;
font-weight: 400 !important;
padding: 5px 11px !important;
border-radius: 16px !important;
white-space: nowrap !important;
overflow: hidden !important;
text-overflow: ellipsis !important;
max-width: 200px !important;
text-transform: none !important;
letter-spacing: 0 !important;
}
.prompt-chip:hover {
border-color: #0e4a6a !important;
color: #38bdf8 !important;
background: rgba(56,189,248,0.05) !important;
}
.preset-chip {
background: #0d1829 !important;
border: 1px solid #1a2744 !important;
color: #64748b !important;
font-size: 12px !important;
font-weight: 600 !important;
padding: 6px 14px !important;
border-radius: 20px !important;
text-transform: none !important;
letter-spacing: 0 !important;
}
.preset-chip:hover {
border-color: #78490a !important;
color: #f59e0b !important;
background: rgba(245,158,11,0.07) !important;
}
.mode-caption {
margin: 10px 0 0 0;
font-family: 'Sora', sans-serif;
font-size: 12px;
line-height: 1.5;
color: #94a3b8;
}
.mode-caption strong {
color: #f59e0b;
}
@keyframes shimmer { 0%, 100% { opacity: 1; } 50% { opacity: 0.6; } }
@keyframes pulse-border { 0%,100%{border-color:#1a2744} 50%{border-color:#2a3f60} }
.stentor-header {
position: relative;
padding: 52px 40px 44px;
overflow: hidden;
border-bottom: 1px solid #1a2744;
background: #04060e;
}
.stentor-header::before {
content: '';
position: absolute;
inset: 0;
background:
radial-gradient(ellipse 70% 55% at 50% -10%, rgba(245,158,11,0.06) 0%, transparent 65%),
repeating-linear-gradient(90deg, transparent, transparent 79px, rgba(26,39,68,0.2) 80px),
repeating-linear-gradient(0deg, transparent, transparent 79px, rgba(26,39,68,0.2) 80px);
pointer-events: none;
}
.stentor-header::after {
content: '';
position: absolute;
top: 0; left: 0; right: 0; height: 2px;
background: linear-gradient(90deg, transparent 0%, #f59e0b 50%, transparent 100%);
animation: shimmer 5s ease-in-out infinite;
}
.header-inner {
position: relative; z-index: 1;
display: flex; align-items: flex-end;
justify-content: space-between; gap: 20px; flex-wrap: wrap;
}
.stentor-header h1,
.stentor-header h1 * {
color: #ffffff !important;
-webkit-text-fill-color: #ffffff !important;
}
.wordmark-eyebrow {
font-family: 'Sora', sans-serif;
font-size: 11px; font-weight: 600;
letter-spacing: 0.3em; text-transform: uppercase;
color: #f59e0b; display: block; margin-bottom: 6px;
}
.wordmark-title {
font-family: 'Bebas Neue', sans-serif !important;
font-size: clamp(56px, 9vw, 96px) !important;
line-height: 0.88 !important;
color: #ffffff !important;
-webkit-text-fill-color: #ffffff !important;
margin: 0 !important; display: block !important;
letter-spacing: 0.02em !important;
}
.wordmark-sub {
font-family: 'Space Mono', monospace;
font-size: 11px; color: #374151;
margin-top: 10px; display: block; letter-spacing: 0.04em;
}
.header-badges { display: flex; flex-direction: column; align-items: flex-end; gap: 8px; }
.badge-row { display: flex; gap: 6px; flex-wrap: wrap; justify-content: flex-end; }
.badge { font-family: 'Space Mono', monospace; font-size: 10px; padding: 4px 10px; border-radius: 4px; font-weight: 700; display: inline-block; }
.badge-gold { background: rgba(245,158,11,0.12); color: #f59e0b; border: 1px solid rgba(245,158,11,0.3); }
.badge-ice { background: rgba(56,189,248,0.08); color: #38bdf8; border: 1px solid rgba(56,189,248,0.25); }
.badge-green { background: rgba(16,185,129,0.08); color: #10b981; border: 1px solid rgba(16,185,129,0.25); }
.section-title {
font-family: 'Bebas Neue', sans-serif !important;
font-size: 24px !important; letter-spacing: 0.09em !important;
color: #ffffff !important; margin: 0 0 16px 0 !important;
padding-bottom: 10px !important; border-bottom: 1px solid #1a2744 !important;
line-height: 1 !important; display: block;
}
.model-card { background: #080d1a; border: 1px solid #1a2744; border-radius: 10px; padding: 14px; margin-bottom: 12px; }
.model-card-title { font-family: 'Bebas Neue', sans-serif; font-size: 20px; letter-spacing: 0.06em; color: #ffffff; margin: 0 0 10px 0; line-height: 1; }
.model-attr { display: flex; justify-content: space-between; align-items: center; padding: 4px 0; border-bottom: 1px solid #0d1829; font-size: 12px; }
.model-attr:last-child { border-bottom: none; }
.attr-key { font-family: 'Sora', sans-serif; color: #374151; font-weight: 500; }
.attr-val { font-family: 'Space Mono', monospace; color: #38bdf8; font-size: 11px; }
.explorer-info {
background: rgba(245,158,11,0.05);
border: 1px solid rgba(245,158,11,0.2);
border-radius: 8px;
padding: 12px 16px;
margin-bottom: 16px;
font-family: 'Sora', sans-serif;
font-size: 12px;
color: #94a3b8;
line-height: 1.6;
}
.about-grid { display: grid; grid-template-columns: 1fr 1fr; gap: 16px; margin-top: 20px; }
.about-block { background: #080d1a; border: 1px solid #1a2744; border-radius: 10px; padding: 18px; }
.about-block h3 { font-family: 'Bebas Neue', sans-serif !important; font-size: 17px !important; letter-spacing: 0.07em !important; color: #f59e0b !important; margin: 0 0 12px 0 !important; }
.about-block p { font-size: 13px; color: #64748b; line-height: 1.7; margin: 0; }
.about-block li { font-size: 13px; color: #64748b; line-height: 1.85; }
.about-block a { color: #38bdf8 !important; text-decoration: none !important; }
.about-block a:hover { text-decoration: underline !important; }
.arch-table { width: 100%; border-collapse: collapse; font-family: 'Space Mono', monospace; font-size: 12px; }
.arch-table td { padding: 5px 6px; border-bottom: 1px solid #1a2744; }
.arch-table td:first-child { color: #374151; font-size: 11px; }
.arch-table td:last-child { color: #38bdf8; text-align: right; }
::-webkit-scrollbar { width: 5px; height: 5px; }
::-webkit-scrollbar-track { background: #080d1a; }
::-webkit-scrollbar-thumb { background: #1a2744; border-radius: 3px; }
::-webkit-scrollbar-thumb:hover { background: #2a3f60; }
@media (max-width: 800px) {
.stentor-header { padding: 28px 16px 24px; }
.gradio-container [role="tabpanel"] { padding: 16px !important; }
.about-grid { grid-template-columns: 1fr; }
.header-badges { display: none; }
}
"""
# ───────────────────────────────── HTML ────────────────────────────────────────
HEADER_HTML = """
"""
MODEL_CARDS = {
"Portimbria-150M": (
''
'
PORTIMBRIA-150M
'
'
Parameters151M
'
'
FamilyPortimbria
'
'
VariantBase model
'
'
Context4096 tokens
'
'
ArchitectureLlama with GQA
'
'
'
'
'
),
"Stentor3-50M": (
''
'
STENTOR3-50M
'
'
Parameters50M
'
'
FamilyStentor3
'
'
VariantBase model
'
'
Context4096 tokens
'
'
'
'
'
),
"Stentor3-20M": (
''
'
STENTOR3-20M
'
'
Parameters20M
'
'
FamilyStentor3
'
'
VariantBase model
'
'
Context4096 tokens
'
'
'
'
'
),
"Stentor2-30M": (
''
'
STENTOR2-30M
'
'
FamilyStentor2
'
'
VariantBase model
'
'
Context1024 tokens
'
'
'
'
'
),
"Stentor2-12M": (
''
'
STENTOR2-12M
'
'
FamilyStentor2
'
'
VariantBase model
'
'
Context1024 tokens
'
'
'
'
'
),
"Stentor-30M": (
''
'
STENTOR-30M
'
'
Parameters30,419,712
'
'
ArchitectureLlamaForCausalLM
'
'
Layers21
'
'
Hidden Size256
'
'
Attn Heads4
'
'
Context512 tokens
'
'
Val Loss / PPL3.4971 / 33.02
'
'
Trained On600M tokens
'
'
Hardware1× Tesla T4 · 7.88h
'
'
'
'
'
),
"Stentor-12M": (
''
'
STENTOR-12M
'
'
Parameters12,047,040
'
'
ArchitectureLlamaForCausalLM
'
'
Layers9
'
'
Hidden Size192
'
'
Attn Heads3
'
'
Context512 tokens
'
'
Val Loss / PPL4.4887 / 89.01
'
'
Trained On200M tokens
'
'
Hardware2× Tesla T4 · 1.3h
'
'
'
),
"Stentor-30M-Instruct": (
''
'
STENTOR-30M-INSTRUCT
'
'
VariantInstruction-tuned
'
'
Base FamilyStentor-30M
'
'
ArchitectureLlamaForCausalLM
'
'
Context512 tokens
'
'
'
'
StatusFeatured in this Space
'
'
'
),
"Stentor-12M-Instruct": (
''
'
STENTOR-12M-INSTRUCT
'
'
VariantInstruction-tuned
'
'
Base FamilyStentor-12M
'
'
ArchitectureLlamaForCausalLM
'
'
Context512 tokens
'
'
'
'
StatusFeatured in this Space
'
'
'
),
}
FLAGSHIP_HTML = (
''
'
FLAGSHIP MODEL
'
'
PrimaryPortimbria-150M
'
'
Context Length4,096 tokens
'
'
Training Data6B tokens
'
'
'
'
'
)
ABOUT_HTML = """
STENTORLABS PLAYGROUND
Welcome to the official StentorLabs sandbox. This Hugging Face Space is a free, comprehensive testing environment
designed to give anyone—from researchers to hobbyists—full access to our family of compact Llama models.
Unlike traditional demos, this Space provides deep diagnostic tools to help you understand how Small Language Models (SLMs)
actually process information, manage confidence, and respond to parameter shifts.
ARCHITECTURE DIAGRAM · PORTIMBRIA-150M
INPUT TOKENS
↓
EMBEDDING LAYER
32,768 vocab × 768 hidden
↓
× 20 TRANSFORMER BLOCKS
GQA ATTENTION
6 heads · 2 KV · RoPE θ=50000
FEED-FORWARD
768→2048→768 SiLU
RMSNorm + Residual connections
↓
OUTPUT LOGITS
32,768 vocab (tied weights)
⚡ Mode: Generate
The standard interface for text completion. Test how models handle creative writing, code drafting, or factual continuation.
- Presets: Instantly switch between Creative (high temp), Balanced, and Focused (low temp) logic.
- Multi-Response: Generate up to 5 variations of the same prompt sequentially to test output variance.
🔬 Mode: Token Explorer
Peek "under the hood" of the model's decision-making process. This mode visualizes internal confidence levels.
- Confidence Heatmap: See which tokens the model was certain about vs. which were random guesses.
- Alternatives: Hover over any generated token to see the top 8 alternatives the model was considering at that exact moment.
🌡 Mode: Temp Sweep
A visual study in creativity. Run the exact same prompt across 2–5 different temperature settings simultaneously.
- Visual Divergence: Observe how low temperatures stay rigid and repetitive while high temperatures become increasingly chaotic.
- Dynamic Slots: Use the Number of Boxes slider to reveal up to five outputs.
💬 Mode: Chat
Interactive testing for all model variants, including both Base and Instruct versions. Test how models handle multi-turn dialogue and maintain context.
- Memory: Test how the 512–4096 token context handles conversation history.
- Safety: Observe how small models handle refusals and helpfulness constraints.
🛠 Parameter Guide
- Temperature: Controls "creativity." 0.1 is nearly deterministic; 1.5+ is experimental/chaotic.
- Top P: Nucleus sampling. Limits the model to the most likely group of tokens whose cumulative probability is P. Helps prevent gibberish.
- Repetition Penalty: Penalizes tokens that have already appeared. Essential for preventing loops in very small models.
- Max Tokens: Each model has a physical cap (e.g., 4,096 for Portimbria). Setting this too low will cut off thoughts mid-sentence.
Links & Resources
⚠ Includes both base and instruct variants · Always set max_new_tokens · Apache 2.0 · Built by Kai Izumoto
"""
# Load models at import time, before the UI is built: first the default
# playground model, then every entry in ARENA_MODELS. Each load is best-effort:
# a failure is printed and startup carries on with the next model.
print(f"[Stentor] Preloading default model ({DEFAULT_MODEL}) at startup...")
try:
    _get_model(ALL_MODELS[DEFAULT_MODEL])
    print("[Stentor] Default model loaded and warmed up.")
except Exception as preload_err:
    print(f"[Stentor] Could not preload default model: {preload_err}")

print("[Stentor] Preloading arena models at startup...")
for arena_name in ARENA_MODELS:
    arena_repo = ARENA_MODELS[arena_name]
    try:
        _get_model(arena_repo)
        print(f"[Stentor] Arena model loaded: {arena_name}")
    except Exception as preload_err:
        print(f"[Stentor] Could not preload arena model {arena_name}: {preload_err}")
# ─────────────────────────────────── UI ───────────────────────────────────────
with gr.Blocks(title=APP_TITLE) as demo:
gr.HTML(HEADER_HTML)
history_state = gr.State([])
with gr.Tabs():
# ── TAB 1: GENERATE ────────────────────────────────────────────────────
with gr.TabItem(" ▶ Generate "):
with gr.Row():
with gr.Column(scale=1, min_width=240):
gr.HTML('MODEL')
model_sel = gr.Radio(
choices=list(STENTOR_MODELS.keys()),
value=DEFAULT_MODEL,
label="",
interactive=True,
)
gr.HTML(FLAGSHIP_HTML)
model_card_html = gr.HTML(MODEL_CARDS[DEFAULT_MODEL])
gr.HTML('PARAMETERS')
with gr.Row():
btn_creative = gr.Button("🎨 Creative", size="sm", elem_classes=["preset-chip"])
btn_balanced = gr.Button("⚖️ Balanced", size="sm", elem_classes=["preset-chip"])
btn_focused = gr.Button("🎯 Focused", size="sm", elem_classes=["preset-chip"])
gr.HTML(MODE_RECOMMENDATION_HTML)
max_tokens = gr.Slider(10, INITIAL_MAX_TOKENS, value=DEFAULT_MAX_TOKENS, step=10, label="Max New Tokens")
temperature = gr.Slider(0.1, 2.0, value=DEFAULT_TEMP, step=0.05, label="Temperature")
top_p = gr.Slider(0.05, 1.0, value=DEFAULT_TOP_P, step=0.05, label="Top P")
repetition_penalty = gr.Slider(0.8, 2.0, value=DEFAULT_REP_PENALTY, step=0.05, label="Repetition Penalty")
num_responses = gr.Slider(1, 5, value=1, step=1, label="Number of Responses")
with gr.Column(scale=3):
gr.HTML('GENERATE')
prompt_box = gr.Textbox(
label="Prompt",
placeholder="Start writing or pick an example below…",
lines=4,
elem_id="prompt-box",
)
example_btns = []
with gr.Column():
with gr.Row():
for emoji, p in EXAMPLE_PROMPTS[:4]:
short = p[:26] + ("…" if len(p) > 26 else "")
b = gr.Button(f"{emoji} {short}", size="sm", elem_classes=["prompt-chip"])
example_btns.append((b, p))
with gr.Row():
for emoji, p in EXAMPLE_PROMPTS[4:]:
short = p[:26] + ("…" if len(p) > 26 else "")
b = gr.Button(f"{emoji} {short}", size="sm", elem_classes=["prompt-chip"])
example_btns.append((b, p))
with gr.Row():
gen_btn = gr.Button("▶ Generate", variant="primary", scale=3)
stop_btn = gr.Button("⏹ Stop", variant="secondary", scale=1)
output_box = gr.Textbox(
label="Output",
lines=12,
interactive=False,
elem_id="output-box",
)
stats_html = gr.HTML(build_stats_html("—", "—", "—"))
status_box = gr.Textbox(
value="Ready.", label="",
interactive=False, elem_classes=["status-bar"],
)
with gr.Column(scale=1, min_width=220):
gr.HTML('HISTORY')
history_html = gr.HTML(build_history_html([]))
# ── TAB 2: TOKEN EXPLORER ──────────────────────────────────────────────
with gr.TabItem(" 🔬 Token Explorer "):
with gr.Column():
gr.HTML('TOKEN PROBABILITY EXPLORER')
gr.HTML(
''
'🔬 How it works: Samples tokens while capturing the full probability distribution over the vocabulary at each step. '
'Tokens are color-coded by confidence: '
'● green = confident (≥80%), '
'● yellow = moderate (50–79%), '
'● orange = uncertain (35–49%), '
'● dark red = low (<35%). '
'Hover any token to see the top alternatives the model considered.'
'
'
)
with gr.Row():
with gr.Column(scale=3):
exp_prompt = gr.Textbox(
label="Prompt",
placeholder="Enter a prompt to visualize token-by-token confidence…",
lines=3,
elem_id="prompt-box",
)
with gr.Column(scale=1):
exp_model = gr.Radio(choices=list(STENTOR_MODELS.keys()), value=DEFAULT_MODEL, label="Model")
with gr.Row():
exp_creative = gr.Button("🎨 Creative", size="sm", elem_classes=["preset-chip"])
exp_balanced = gr.Button("⚖️ Balanced", size="sm", elem_classes=["preset-chip"])
exp_focused = gr.Button("🎯 Focused", size="sm", elem_classes=["preset-chip"])
gr.HTML(MODE_RECOMMENDATION_HTML)
exp_tokens = gr.Slider(5, INITIAL_MAX_TOKENS, value=DEFAULT_MAX_TOKENS, step=10, label="Max Tokens")
exp_temp = gr.Slider(0.1, 2.0, value=DEFAULT_TEMP, step=0.05, label="Temperature")
exp_top_p = gr.Slider(0.05, 1.0, value=DEFAULT_TOP_P, step=0.05, label="Top P")
exp_rep_pen = gr.Slider(0.8, 2.0, value=DEFAULT_REP_PENALTY, step=0.05, label="Repetition Penalty")
exp_btn = gr.Button("🔬 Explore", variant="primary", scale=3)
exp_stop_btn = gr.Button("⏹ Stop", variant="secondary", scale=1)
exp_output = gr.HTML(_explorer_placeholder())
exp_status = gr.Textbox(value="", label="", interactive=False, elem_classes=["status-bar"])
# ── TAB 3: TEMPERATURE SWEEP ───────────────────────────────────────────
with gr.TabItem(" 🌡 Temp Sweep "):
with gr.Column():
gr.HTML('TEMPERATURE SWEEP')
gr.HTML(
''
'🌡 What this shows: The same prompt run at multiple different temperatures simultaneously. '
'Low temperature = conservative/repetitive. High temperature = creative/chaotic. '
'Choose between 2–5 temperature boxes below. No duplicate temperatures allowed.'
'
'
)
sweep_state = gr.State([0.5, 1.0, 1.5, 2.0])
with gr.Row():
with gr.Column(scale=3):
sweep_prompt = gr.Textbox(
label="Prompt",
placeholder="Enter a prompt to run across all temperatures…",
lines=3,
elem_id="prompt-box",
)
with gr.Column(scale=1):
sweep_model = gr.Radio(choices=list(STENTOR_MODELS.keys()), value=DEFAULT_MODEL, label="Model")
with gr.Row():
sweep_creative = gr.Button("🎨 Creative", size="sm", elem_classes=["preset-chip"])
sweep_balanced = gr.Button("⚖️ Balanced", size="sm", elem_classes=["preset-chip"])
sweep_focused = gr.Button("🎯 Focused", size="sm", elem_classes=["preset-chip"])
gr.HTML(MODE_RECOMMENDATION_HTML)
sweep_tokens = gr.Slider(10, INITIAL_MAX_TOKENS, value=DEFAULT_MAX_TOKENS, step=10, label="Max Tokens")
sweep_top_p = gr.Slider(0.05, 1.0, value=DEFAULT_TOP_P, step=0.05, label="Top P")
sweep_rep_pen = gr.Slider(0.8, 2.0, value=DEFAULT_REP_PENALTY, step=0.05, label="Repetition Penalty")
sweep_count = gr.Slider(2, 5, value=2, step=1, label="Number of Boxes")
sweep_btn = gr.Button("🌡 Run Sweep", variant="primary", scale=3)
sweep_stop_btn = gr.Button("⏹ Stop", variant="secondary", scale=1)
with gr.Row():
sweep_temp_inputs = []
sweep_temp_labels = ["1st Temp", "2nd Temp", "3rd Temp", "4th Temp", "5th Temp"]
default_temps_for_index = [0.5, 1.0, 1.5, 2.0, 2.5]
sweep_columns = []
sweep_outputs_for_fn = []
for i in range(5):
with gr.Column(visible=(i < 2)) as col:
color_map = ["#38bdf8", "#10b981", "#f59e0b", "#f97316", "#f87171"]
gr.HTML(
f''
f'BOX {i+1}'
f'
'
)
inp = gr.Number(value=default_temps_for_index[i], label=sweep_temp_labels[i], minimum=0.1, maximum=2.5, step=0.05)
sweep_temp_inputs.append(inp)
out = gr.Textbox(label="", lines=8, interactive=False, elem_id="output-box")
sweep_outputs_for_fn.append(out)
sweep_columns.append(col)
with gr.Row():
sweep_add_btn = gr.Button("+ Add Box", variant="secondary", scale=1)
sweep_rm_btn = gr.Button("− Remove Box", variant="secondary", scale=1)
def update_sweep_visibility(count, *temps):
    """Sync the five sweep boxes with the requested box count.

    Args:
        count: Number of boxes to show (value of the "Number of Boxes"
            slider). ``None`` is treated as 2, matching ``add_sweep_box`` /
            ``remove_sweep_box``.
        *temps: Current values of the temperature inputs; only the first
            five are considered and ``None`` entries are ignored.

    Returns:
        A list of ten ``gr.update`` objects: five column updates followed by
        five temperature-input updates, in box order.

    If the temperatures of the visible boxes contain duplicates (compared
    after rounding to two decimals), they are replaced by values spread evenly
    from 0.5 to 2.0. Otherwise the values are left untouched.
    """
    count_int = 2 if count is None else int(count)
    active_temps = [t for t in temps[:5][:count_int] if t is not None]
    distinct_temps = {round(float(t), 2) for t in active_temps}

    col_updates = [gr.update(visible=(i < count_int)) for i in range(5)]

    if len(active_temps) != len(distinct_temps):
        # Duplicates found: reset the visible boxes to an even spread.
        new_temps = [
            round(0.5 + i * (1.5 / max(count_int - 1, 1)), 2)
            for i in range(count_int)
        ]
        temp_updates = [
            gr.update(visible=True, value=new_temps[i])
            if i < count_int
            else gr.update(visible=False)
            for i in range(5)
        ]
    else:
        # Keep the values, but always restate visibility. The duplicate branch
        # hides the inputs beyond `count`; returning an empty update here would
        # leave such an input hidden inside a column that is now visible.
        temp_updates = [gr.update(visible=(i < count_int)) for i in range(5)]

    return col_updates + temp_updates
def add_sweep_box(count):
    """Return a slider update that raises the box count by one, capped at 5.

    A ``None`` count is treated as 2.
    """
    current = int(count) if count is not None else 2
    bumped = current + 1
    return gr.update(value=bumped if bumped < 5 else 5)
def remove_sweep_box(count):
    """Return a slider update that lowers the box count by one, floored at 2.

    A ``None`` count is treated as 2.
    """
    current = int(count) if count is not None else 2
    reduced = current - 1
    return gr.update(value=reduced if reduced > 2 else 2)
# Alias for the list of sweep output textboxes; used as the outputs of the sweep run event.
sweep_outs = sweep_outputs_for_fn
# Whenever the "Number of Boxes" slider changes, recompute which columns and
# temperature inputs are visible (and de-duplicate temperatures if needed).
sweep_count.change(
fn=update_sweep_visibility,
inputs=[sweep_count] + sweep_temp_inputs,
outputs=sweep_columns + sweep_temp_inputs
)
# "+ Add Box": bump the slider value first, then refresh visibility in a chained step.
# NOTE(review): writing to sweep_count may also fire the .change listener above,
# in which case update_sweep_visibility runs twice per click — confirm.
sweep_add_btn.click(
fn=add_sweep_box,
inputs=[sweep_count],
outputs=[sweep_count],
).then(
fn=update_sweep_visibility,
inputs=[sweep_count] + sweep_temp_inputs,
outputs=sweep_columns + sweep_temp_inputs,
)
# "− Remove Box": same two-step flow, decrementing the slider instead.
sweep_rm_btn.click(
fn=remove_sweep_box,
inputs=[sweep_count],
outputs=[sweep_count],
).then(
fn=update_sweep_visibility,
inputs=[sweep_count] + sweep_temp_inputs,
outputs=sweep_columns + sweep_temp_inputs,
)
# ── TAB 4: ARENA ─────────────────────────────────────────────────────
with gr.TabItem(" 🏟 Arena "):
with gr.Column():
gr.HTML('MODEL ARENA')
gr.HTML(
''
'🏟 Model Arena: Benchmark performance via blind or open testing.
'
'• Show Mode: Pick models and see their names while generating.
'
'• Blind Mode: Pick models but their identities are hidden until you vote.
'
'• Random Mode: Let the arena pick a random Stentor vs a Baseline model.'
'
'
)
arena_identities = gr.State(None)
with gr.Row():
with gr.Column(scale=3):
arena_prompt = gr.Textbox(
label="Shared Prompt",
placeholder="Enter a prompt to run through both models…",
lines=3,
elem_id="prompt-box",
)
with gr.Column(scale=2):
with gr.Row():
arena_creative = gr.Button("🎨 Creative", size="sm", elem_classes=["preset-chip"])
arena_balanced = gr.Button("⚖️ Balanced", size="sm", elem_classes=["preset-chip"])
arena_focused = gr.Button("🎯 Focused", size="sm", elem_classes=["preset-chip"])
gr.HTML(MODE_RECOMMENDATION_HTML)
with gr.Row():
arena_mode = gr.Dropdown(
choices=["👀 Show", "🙈 Blind", "🎲 Random"],
value="👀 Show",
label="Arena Mode"
)
with gr.Column(visible=False) as arena_random_options:
arena_fair_match = gr.Checkbox(value=True, label="Fair Matchmaking", info="Pairs models with similar parameter counts for a balanced fight.")
gr.HTML('Pairing Logic
')
arena_internal_pair = gr.Checkbox(value=True, label="Internal Pairings", info="Allows Stentor models to face other Stentor models.")
arena_max = gr.Slider(10, 1024, value=DEFAULT_MAX_TOKENS, step=10, label="Max Tokens")
with gr.Row():
arena_temp = gr.Slider(0.1, 2.0, value=DEFAULT_TEMP, step=0.05, label="Temperature")
arena_top_p = gr.Slider(0.05, 1.0, value=DEFAULT_TOP_P, step=0.05, label="Top P")
with gr.Row():
arena_rep_pen = gr.Slider(0.8, 2.0, value=DEFAULT_REP_PENALTY, step=0.05, label="Repetition Penalty")
with gr.Row():
arena_btn = gr.Button("🏟 Battle", variant="primary", scale=2)
arena_stop_btn = gr.Button("⏹ Stop", variant="secondary", scale=1)
with gr.Row(elem_id="arena-selectors") as arena_selector_row:
with gr.Column():
arena_model1 = gr.Dropdown(
choices=list(STENTOR_MODELS.keys()),
value=DEFAULT_MODEL,
label="Stentor Model",
interactive=True,
)
with gr.Column():
arena_model2 = gr.Dropdown(
choices=list(ARENA_MODELS.keys()),
value=list(ARENA_MODELS.keys())[0],
label="External Model",
interactive=True,
)
def update_arena_ui_visibility(mode):
    """Toggle the arena controls for the selected mode.

    Returns two updates: the random-mode options are shown only in
    "🎲 Random" mode, and the manual model selectors are shown in every
    other mode.
    """
    is_random = mode == "🎲 Random"
    random_options_update = gr.update(visible=is_random)
    selector_row_update = gr.update(visible=not is_random)
    return random_options_update, selector_row_update
arena_mode.change(fn=update_arena_ui_visibility, inputs=[arena_mode], outputs=[arena_random_options, arena_selector_row])
with gr.Row():
with gr.Column():
arena_output1 = gr.Textbox(label="Left Model Output", lines=12, interactive=False, elem_id="output-box")
arena_status1 = gr.Textbox(value="Ready.", label="", interactive=False, elem_classes=["status-bar"])
with gr.Column():
arena_output2 = gr.Textbox(label="Right Model Output", lines=12, interactive=False, elem_id="output-box")
arena_status2 = gr.Textbox(value="Ready.", label="", interactive=False, elem_classes=["status-bar"])
with gr.Column(visible=False) as vote_col:
gr.HTML('VOTE FOR THE BEST RESPONSE
')
with gr.Row():
left_win = gr.Button("👈 Left is Better", variant="secondary")
right_win = gr.Button("Right is Better 👉", variant="secondary")
tie_win = gr.Button("🤝 It's a Tie", variant="secondary")
both_bad = gr.Button("👎 Both are Bad", variant="secondary")
arena_results = gr.Markdown("")
# Arena event wiring
# "Battle" runs in two chained steps that write to the same eight outputs:
# arena_setup (given only the mode) followed by arena_generate (given the prompt,
# mode, both model choices, the sampling sliders and the random-mode checkboxes).
# arena_event refers to the chained arena_generate step, which is what the Stop
# button cancels.
arena_event = arena_btn.click(
fn=arena_setup,
inputs=[arena_mode],
outputs=[arena_output1, arena_output2, arena_status1, arena_status2, vote_col, arena_selector_row, arena_identities, arena_results],
).then(
fn=arena_generate,
inputs=[arena_prompt, arena_mode, arena_model1, arena_model2, arena_max, arena_temp, arena_top_p, arena_rep_pen, arena_fair_match, arena_internal_pair],
outputs=[arena_output1, arena_output2, arena_status1, arena_status2, vote_col, arena_selector_row, arena_identities, arena_results],
)
# Every vote button passes its verdict label plus the stored arena_identities
# state to arena_vote, which updates the results markdown and the vote column.
vote_inputs = [arena_identities]
left_win.click(fn=lambda ids: arena_vote("Left Model", ids), inputs=vote_inputs, outputs=[arena_results, vote_col])
right_win.click(fn=lambda ids: arena_vote("Right Model", ids), inputs=vote_inputs, outputs=[arena_results, vote_col])
tie_win.click(fn=lambda ids: arena_vote("Tie", ids), inputs=vote_inputs, outputs=[arena_results, vote_col])
both_bad.click(fn=lambda ids: arena_vote("Both Bad", ids), inputs=vote_inputs, outputs=[arena_results, vote_col])
def handle_stop_arena():
    """Stop a running arena battle.

    Raises the shared interrupt flag, as the stop handlers of the other tabs
    do, then returns updates that write "⏹ Stopped." into both output boxes,
    clear both status bars and hide the vote column.
    """
    # Previously this handler only cancelled the UI event and never set the
    # flag, unlike every other stop handler in this file.
    # NOTE(review): assumes arena generation honours interrupt_callback like
    # the other generation paths — confirm in arena_generate.
    interrupt_callback.stop_signal = True
    return "⏹ Stopped.", "⏹ Stopped.", "", "", gr.update(visible=False)


arena_stop_btn.click(
    fn=handle_stop_arena,
    outputs=[arena_output1, arena_output2, arena_status1, arena_status2, vote_col],
    cancels=[arena_event],
)
# ── TAB 5: CHAT ─────────────────────────────────────────────────────────
with gr.TabItem(" 💬 Chat "):
with gr.Row():
with gr.Column(scale=1, min_width=240):
gr.HTML('CHAT SETTINGS')
with gr.Row():
chat_creative = gr.Button("🎨 Creative", size="sm", elem_classes=["preset-chip"])
chat_balanced = gr.Button("⚖️ Balanced", size="sm", elem_classes=["preset-chip"])
chat_focused = gr.Button("🎯 Focused", size="sm", elem_classes=["preset-chip"])
gr.HTML(MODE_RECOMMENDATION_HTML)
chat_model = gr.Radio(
choices=list(STENTOR_MODELS.keys()),
value=DEFAULT_MODEL,
label="Model",
interactive=True,
)
chat_max_tokens = gr.Slider(10, INITIAL_MAX_TOKENS, value=DEFAULT_MAX_TOKENS, step=10, label="Max New Tokens")
chat_temperature = gr.Slider(0.1, 2.0, value=DEFAULT_TEMP, step=0.05, label="Temperature")
chat_top_p = gr.Slider(0.05, 1.0, value=DEFAULT_TOP_P, step=0.05, label="Top P")
chat_rep_penalty = gr.Slider(0.8, 2.0, value=DEFAULT_REP_PENALTY, step=0.05, label="Repetition Penalty")
with gr.Row():
chat_stop_btn = gr.Button("⏹ Stop", variant="secondary")
chat_reset_btn = gr.Button("↺ Reset Chat", variant="secondary")
with gr.Column(scale=3):
gr.HTML('CONVERSATION')
chat_messages = gr.State([])
chat_display = gr.HTML(
''
'
'
'Start a conversation by typing a message below.
'
)
chat_input = gr.Textbox(
label="Your Message",
placeholder="Type your message here and press Send…",
lines=3,
elem_id="prompt-box",
)
chat_send_btn = gr.Button("▶ Send", variant="primary")
chat_model.change(
fn=lambda m, cur: gr.update(maximum=_max_tokens_cap(m), value=min(cur, _max_tokens_cap(m))),
inputs=[chat_model, chat_max_tokens],
outputs=[chat_max_tokens],
)
# ── TAB 6: ABOUT ───────────────────────────────────────────────────────
with gr.TabItem(" ℹ About "):
with gr.Column():
gr.HTML('THE STENTOR SERIES')
gr.HTML(ABOUT_HTML)
# ───────────────────────── EVENT WIRING ───────────────────────────────────
# Generate tab: switching model swaps the model card HTML (empty string if the
# key has no card) and re-caps the max-tokens slider at _max_tokens_cap(m).
# NOTE(review): the new value is floored at DEFAULT_MAX_TOKENS, whereas the
# explorer/sweep handlers below floor at their slider minimum (5 / 10). A user
# value below the default is therefore raised on every model switch here —
# confirm this is intended.
model_sel.change(
fn=lambda m, cur: (MODEL_CARDS.get(m, ""), gr.update(maximum=_max_tokens_cap(m), value=int(min(max(DEFAULT_MAX_TOKENS, cur), _max_tokens_cap(m))))),
inputs=[model_sel, max_tokens],
outputs=[model_card_html, max_tokens],
)
# Token Explorer tab: re-cap the slider and clamp the current value into [5, cap].
exp_model.change(
fn=lambda m, cur: gr.update(maximum=_max_tokens_cap(m), value=int(min(max(5, cur), _max_tokens_cap(m)))),
inputs=[exp_model, exp_tokens],
outputs=[exp_tokens],
)
# Temp Sweep tab: re-cap the slider and clamp the current value into [10, cap].
sweep_model.change(
fn=lambda m, cur: gr.update(maximum=_max_tokens_cap(m), value=int(min(max(10, cur), _max_tokens_cap(m)))),
inputs=[sweep_model, sweep_tokens],
outputs=[sweep_tokens],
)
# Preset chips. Each button calls the preset helper with its own label and writes
# the result to that tab's sliders (max tokens, temperature, top-p, repetition penalty).
btn_creative.click(fn=lambda: apply_standard_preset("🎨 Creative"), outputs=[max_tokens, temperature, top_p, repetition_penalty])
btn_balanced.click(fn=lambda: apply_standard_preset("⚖️ Balanced"), outputs=[max_tokens, temperature, top_p, repetition_penalty])
btn_focused .click(fn=lambda: apply_standard_preset("🎯 Focused"), outputs=[max_tokens, temperature, top_p, repetition_penalty])
exp_creative.click(fn=lambda: apply_standard_preset("🎨 Creative"), outputs=[exp_tokens, exp_temp, exp_top_p, exp_rep_pen])
exp_balanced.click(fn=lambda: apply_standard_preset("⚖️ Balanced"), outputs=[exp_tokens, exp_temp, exp_top_p, exp_rep_pen])
exp_focused .click(fn=lambda: apply_standard_preset("🎯 Focused"), outputs=[exp_tokens, exp_temp, exp_top_p, exp_rep_pen])
# The sweep tab has no single temperature slider, so it uses apply_sweep_preset
# with three outputs (max tokens, top-p, repetition penalty).
sweep_creative.click(fn=lambda: apply_sweep_preset("🎨 Creative"), outputs=[sweep_tokens, sweep_top_p, sweep_rep_pen])
sweep_balanced.click(fn=lambda: apply_sweep_preset("⚖️ Balanced"), outputs=[sweep_tokens, sweep_top_p, sweep_rep_pen])
sweep_focused .click(fn=lambda: apply_sweep_preset("🎯 Focused"), outputs=[sweep_tokens, sweep_top_p, sweep_rep_pen])
arena_creative.click(fn=lambda: apply_standard_preset("🎨 Creative"), outputs=[arena_max, arena_temp, arena_top_p, arena_rep_pen])
arena_balanced.click(fn=lambda: apply_standard_preset("⚖️ Balanced"), outputs=[arena_max, arena_temp, arena_top_p, arena_rep_pen])
arena_focused .click(fn=lambda: apply_standard_preset("🎯 Focused"), outputs=[arena_max, arena_temp, arena_top_p, arena_rep_pen])
chat_creative.click(fn=lambda: apply_standard_preset("🎨 Creative"), outputs=[chat_max_tokens, chat_temperature, chat_top_p, chat_rep_penalty])
chat_balanced.click(fn=lambda: apply_standard_preset("⚖️ Balanced"), outputs=[chat_max_tokens, chat_temperature, chat_top_p, chat_rep_penalty])
chat_focused .click(fn=lambda: apply_standard_preset("🎯 Focused"), outputs=[chat_max_tokens, chat_temperature, chat_top_p, chat_rep_penalty])
# Example chips copy their full (untruncated) prompt into the prompt box. The
# default argument t=full_prompt binds the prompt per button at definition time,
# avoiding the late-binding closure pitfall.
for btn, full_prompt in example_btns:
btn.click(fn=lambda t=full_prompt: t, outputs=[prompt_box])
def handle_stop_gen():
    """Stop handler for the Generate tab.

    Sets the shared interrupt flag and returns the status text plus a blank
    stats panel.
    """
    interrupt_callback.stop_signal = True
    status_text = "⏹ Stopped."
    blank_stats = build_stats_html("—", "—", "—")
    return status_text, blank_stats
def run_generate_multi(prompt, model_key, max_tok, temp, tp, rep_penalty, num_resp, history):
    """Stream one or more completions of ``prompt`` for the Generate tab.

    This is a generator. Every yielded value is a 5-tuple
    ``(output_text, status_text, stats_html, history, history_html)``.

    Args:
        prompt: Prompt text; ``None`` or whitespace-only yields a single
            "Enter a prompt." message and stops.
        model_key: Key into ``STENTOR_MODELS``.
        max_tok, temp, tp, rep_penalty: Sampling settings, copied unchanged
            into each generation config.
        num_resp: Number of responses to generate; ``None`` means 1 and values
            below 1 are raised to 1.
        history: Current history list; replaced via ``add_to_history`` once
            generation finishes.

    Only the first response is recorded in the history.
    """
    if not (prompt or "").strip():
        yield "Enter a prompt.", "Enter a prompt.", build_stats_html("—", "—", "—"), history, build_history_html(history)
        return

    # Guard against a zero/negative count so final_results[0] always exists.
    num = max(1, int(num_resp)) if num_resp is not None else 1
    tokenizer, _ = _get_model(STENTOR_MODELS[model_key])
    configs = [
        {
            "model_key": model_key,
            "max_tokens": max_tok,
            "temperature": temp,
            "top_p": tp,
            "repetition_penalty": rep_penalty,
        }
        for _ in range(num)
    ]

    def render(results):
        # A single response is shown bare; several get a numbered separator each.
        if num > 1:
            return "".join(
                f"─── Response {idx+1} ───\n{r}\n\n" for idx, r in enumerate(results)
            )
        return results[0]

    def count_tokens(results):
        return sum(len(tokenizer.encode(r)) for r in results)

    final_results = [""] * num
    final_elapsed = 0.0
    # Pre-initialised so the final yield below cannot hit an unbound name when
    # the generator produces no intermediate updates.
    display = ""
    for results, elapsed in parallel_config_generate(prompt, configs):
        final_results, final_elapsed = results, elapsed
        display = render(results)
        total_tokens = count_tokens(results)
        tps = total_tokens / elapsed if elapsed > 0 else 0
        yield display, "⚡ Generating…", build_stats_html(str(total_tokens), f"{elapsed:.1f}s", f"{tps:.1f} t/s"), history, build_history_html(history)

    new_history = add_to_history(history, prompt, final_results[0], model_key, temp)
    total_tokens = count_tokens(final_results)
    tps = total_tokens / final_elapsed if final_elapsed > 0 else 0
    yield display, f"✓ Done · {model_key}", build_stats_html(str(total_tokens), f"{final_elapsed:.2f}s", f"{tps:.1f} t/s"), new_history, build_history_html(new_history)
# "Generate" runs run_generate_multi; its five yielded values map, in order, to
# the output box, status bar, stats panel, history state and history panel.
gen_event = gen_btn.click(
fn=run_generate_multi,
inputs=[prompt_box, model_sel, max_tokens, temperature, top_p, repetition_penalty, num_responses, history_state],
outputs=[output_box, status_box, stats_html, history_state, history_html],
)
# "Stop" cancels the running generate event and, via handle_stop_gen, sets the
# interrupt flag and resets the status bar and stats panel.
stop_btn.click(
fn=handle_stop_gen,
outputs=[status_box, stats_html],
cancels=[gen_event]
)
def handle_stop_exp():
    """Stop handler for the Token Explorer tab.

    Sets the shared interrupt flag and returns the status text plus the
    explorer placeholder markup.
    """
    interrupt_callback.stop_signal = True
    status_text = "⏹ Stopped."
    placeholder_html = _explorer_placeholder()
    return status_text, placeholder_html
# "Explore" runs run_token_explorer, which writes to the explorer HTML output
# and its status bar.
exp_event = exp_btn.click(
fn=run_token_explorer,
inputs=[exp_prompt, exp_model, exp_tokens, exp_temp, exp_top_p, exp_rep_pen],
outputs=[exp_output, exp_status],
)
# Note the output order here is (status, output), matching the tuple returned by
# handle_stop_exp — the reverse of the run event above.
exp_stop_btn.click(
fn=handle_stop_exp,
outputs=[exp_status, exp_output],
cancels=[exp_event]
)
# "Run Sweep" passes the shared settings, the box count and all five temperature
# inputs to run_temp_sweep_streamed; outputs are the five box textboxes.
sweep_event = sweep_btn.click(
fn=run_temp_sweep_streamed,
inputs=[sweep_prompt, sweep_model, sweep_tokens, sweep_top_p, sweep_rep_pen, sweep_count] + sweep_temp_inputs,
outputs=sweep_outs,
)
def handle_stop_sweep():
    """Stop handler for the Temp Sweep tab.

    Sets the shared interrupt flag and returns one "⏹ Stopped." update per
    sweep output textbox.
    """
    interrupt_callback.stop_signal = True
    return [gr.update(value="⏹ Stopped.") for _ in sweep_outputs_for_fn]
# "Stop" on the sweep tab: cancel the running sweep event and overwrite every
# box textbox with the stop message from handle_stop_sweep.
sweep_stop_btn.click(
fn=handle_stop_sweep,
outputs=sweep_outputs_for_fn,
cancels=[sweep_event]
)
# Chat send wiring. The generation events are bound to their own names BEFORE
# chaining `.then(...)`: a chained call returns the dependency of the *last*
# step, so assigning the whole chain would make the Stop button's
# `cancels=[chat_event, chat_input_event]` target the input-clearing step
# instead of the chat_generate run.
chat_event = chat_send_btn.click(
    fn=chat_generate,
    inputs=[chat_messages, chat_input, chat_model, chat_max_tokens, chat_temperature, chat_top_p, chat_rep_penalty],
    outputs=[chat_display, chat_messages],
)
# Clear the message box once the send step has finished.
chat_event.then(
    fn=lambda: "", outputs=[chat_input],
)

# Pressing Enter in the message box behaves exactly like clicking "Send".
chat_input_event = chat_input.submit(
    fn=chat_generate,
    inputs=[chat_messages, chat_input, chat_model, chat_max_tokens, chat_temperature, chat_top_p, chat_rep_penalty],
    outputs=[chat_display, chat_messages],
)
chat_input_event.then(
    fn=lambda: "", outputs=[chat_input],
)
# Stop handler for the Chat tab: only sets the shared interrupt flag. It returns
# nothing and is wired without outputs, so it does not touch the chat display.
def handle_stop_chat():
interrupt_callback.stop_signal = True
# "Stop" sets the flag and cancels the events bound to chat_event / chat_input_event.
chat_stop_btn.click(
fn=handle_stop_chat,
cancels=[chat_event, chat_input_event]
)
# "Reset Chat" delegates to chat_clear, whose return values replace the message
# state, the conversation display and the input box.
chat_reset_btn.click(
fn=chat_clear,
outputs=[chat_messages, chat_display, chat_input],
)
# Script entry point: start the Gradio server with the base theme, the custom
# CSS string defined above, and server-side rendering disabled.
# NOTE(review): theme/css are passed to launch() rather than gr.Blocks() —
# confirm the installed Gradio version accepts them here.
if __name__ == "__main__":
demo.launch(theme=gr.themes.Base(), css=CSS, ssr_mode=False)