import sys

# Log the interpreter version before anything else is imported so that a
# failed boot still leaves a trace in the container logs.
print(f"[BOOT] Python {sys.version}", flush=True)

import base64
import os
import re
from typing import Generator, Optional

# Gradio is imported on its own so its version is logged before the other
# third-party packages are attempted.
try:
    import gradio as gr
    print(f"[BOOT] gradio {gr.__version__}", flush=True)
except ImportError as import_error:
    print(f"[BOOT] FATAL: {import_error}", flush=True)
    sys.exit(1)

try:
    from huggingface_hub import InferenceClient
    import httpx
    import uvicorn
    from fastapi import FastAPI, Request
    from fastapi.responses import HTMLResponse, RedirectResponse, JSONResponse
    print("[BOOT] All imports OK", flush=True)
except ImportError as import_error:
    print(f"[BOOT] FATAL: {import_error} — add to requirements.txt", flush=True)
    sys.exit(1)

# ══════════════════════════════════════════════════════════════════════════════
# 1. MODEL CAPABILITY MATRIX
# ══════════════════════════════════════════════════════════════════════════════

# Per-model limits and display metadata, keyed by Hugging Face model id.
# "max_tokens" and "temp_max" are used by generate_reply as upper clamps;
# "thinking" and "vision" gate the corresponding request features.
MODEL_CAPS: dict[str, dict] = {
    "Qwen/Qwen3.5-122B-A10B": {
        "arch": "MoE",
        "active": "10B / 122B total",
        "ctx": "262K → 1M",
        "thinking": True,
        "vision": True,
        "max_tokens": 8192,
        "temp_max": 2.0,
        "top_p": True,
        "color": "#7c3aed",
        "badge": "🏆 Best Overall · BFCL 72.2 · GPQA 86.6 · SWE 72.0",
        "desc": "Top reasoning & agents · Complex math · Long context",
    },
    "Qwen/Qwen3.5-27B": {
        "arch": "Dense",
        "active": "27B (all active)",
        "ctx": "262K → 1M",
        "thinking": True,
        "vision": True,
        "max_tokens": 8192,
        "temp_max": 2.0,
        "top_p": True,
        "color": "#0d9488",
        "badge": "🎯 Dense #1 · IFEval 95.0 · SWE 72.4 · PolyMATH 71.2",
        "desc": "Instruction king · Creative writing · 201 languages",
    },
    "Qwen/Qwen3.5-35B-A3B": {
        "arch": "MoE",
        "active": "3B / 35B total",
        "ctx": "262K → 1M",
        "thinking": True,
        "vision": True,
        "max_tokens": 4096,
        "temp_max": 1.5,
        "top_p": True,
        "color": "#d97706",
        "badge": "⚡ Flash Speed · TAU2 81.2 · MMLU-Pro 85.3",
        "desc": "Fastest · 3B active params · ~6× faster than 27B",
    },
}

# Model ids in declaration order; the first entry is the default selection.
MODEL_IDS = list(MODEL_CAPS)
DEFAULT_MODEL = MODEL_IDS[0]

# ══════════════════════════════════════════════════════════════════════════════
# 2.
# SYSTEM PROMPT PRESETS
# ══════════════════════════════════════════════════════════════════════════════

# Named system prompts; the "general" entry seeds the hidden system-prompt box.
PRESETS = {
    "general": "You are a helpful, harmless, and honest AI assistant.",
    "code": "You are an expert software engineer. Write clean, efficient, well-commented code. Explain your approach before writing. Use modern best practices.",
    "math": "You are a world-class mathematician. Break problems step-by-step. Show full working. Use LaTeX where helpful.",
    "creative": "You are a brilliant creative writer. Be imaginative, vivid, and engaging. Adapt tone and style to the request.",
    "translate": "You are a professional translator fluent in 201 languages. Provide accurate, natural-sounding translations with cultural context.",
    "research": "You are a rigorous research analyst. Provide structured, well-reasoned analysis. Identify assumptions and acknowledge uncertainty.",
}

# ══════════════════════════════════════════════════════════════════════════════
# 3. THINKING MODE HELPERS
# ══════════════════════════════════════════════════════════════════════════════

# First <think>…</think> block plus the whitespace that follows it.  Without
# the literal tags the pattern matches the empty string at position 0, so no
# reasoning would ever be extracted.
_THINK_BLOCK_RE = re.compile(r"<think>(.*?)</think>\s*", re.DOTALL)


def build_user_message(text: str, thinking: bool) -> str:
    """Prefix *text* with the ``/think`` or ``/no_think`` switch line."""
    return ("/think\n" if thinking else "/no_think\n") + text


def parse_think_blocks(text: str) -> tuple[str, str]:
    """Split a model reply into ``(reasoning, answer)``.

    Returns ``("", text)`` unchanged when no complete ``<think>…</think>``
    block is present (for example while the block is still streaming).
    Otherwise returns the stripped reasoning and the stripped remainder of
    the reply; text that precedes the block is kept as part of the answer.
    """
    match = _THINK_BLOCK_RE.search(text)
    if match is None:
        return "", text
    answer = text[:match.start()] + text[match.end():]
    return match.group(1).strip(), answer.strip()


def format_response(raw: str) -> str:
    """Render a raw reply as markdown with the reasoning in a collapsed block.

    The reasoning is block-quoted inside ``<details>``/``<summary>``.  A reply
    with no reasoning, or with an empty think block, yields only the answer so
    empty tags are never shown to the user.
    """
    chain, answer = parse_think_blocks(raw)
    if not chain:
        return answer
    quoted = "\n".join(f"> {line}" for line in chain.split("\n"))
    block = (
        "<details>\n"
        "<summary>🧠 Reasoning Chain — click to expand</summary>\n\n"
        f"{quoted}\n\n"
        "</details>\n\n"
    )
    return block + answer


# ══════════════════════════════════════════════════════════════════════════════
# 4. STREAMING BACKEND
# ══════════════════════════════════════════════════════════════════════════════

def _content_text(value) -> Optional[str]:
    """Flatten a chat ``content`` value (None, str, or list of parts) to text."""
    if value is None:
        return None
    if isinstance(value, list):
        return " ".join(
            part.get("text", "")
            for part in value
            if isinstance(part, dict) and part.get("type") == "text"
        )
    return str(value)


def _history_to_messages(history) -> list[dict]:
    """Convert Gradio history (message dicts or legacy pairs) to chat messages.

    Reasoning blocks are stripped from earlier assistant turns so they are not
    sent back to the model.
    """
    messages: list[dict] = []
    for turn in history or []:
        if isinstance(turn, dict):
            role = turn.get("role", "")
            text = _content_text(turn.get("content") or "")
            if role == "user":
                messages.append({"role": "user", "content": text})
            elif role == "assistant":
                _, clean = parse_think_blocks(text)
                messages.append({"role": "assistant", "content": clean})
            continue
        # Legacy format: a (user, assistant) pair; malformed turns are skipped.
        try:
            user_part = turn[0] or None
            bot_part = turn[1] if len(turn) > 1 else None
        except (IndexError, TypeError):
            continue
        if user_text := _content_text(user_part):
            messages.append({"role": "user", "content": user_text})
        if bot_text := _content_text(bot_part):
            _, clean = parse_think_blocks(bot_text)
            messages.append({"role": "assistant", "content": clean})
    return messages


def _has_image(image_input) -> bool:
    """Return True when an image was supplied.

    Plain truthiness is avoided because ``bool(ndarray)`` raises for arrays
    with more than one element.
    """
    if image_input is None:
        return False
    if isinstance(image_input, str):
        return bool(image_input)
    return True


def _image_data_url(image_input) -> str:
    """Return *image_input* as a ``data:`` URL.

    Accepts a base64 data-URL string, a PIL image, or an array accepted by
    ``PIL.Image.fromarray``.

    Raises:
        ValueError: the string is not a well-formed data URL.
        ImportError, OSError, TypeError, AttributeError: Pillow is missing or
            cannot encode the object.
    """
    if isinstance(image_input, str):
        if not image_input.startswith("data:"):
            raise ValueError("image input must be a base64 data URL")
        header, sep, payload = image_input.partition(",")
        if not sep:
            raise ValueError("malformed data URL: missing ',' separator")
        # Keep the declared image type (PNG, WebP, …) instead of relabelling
        # everything as JPEG; non-image headers keep the previous default.
        mime = header[len("data:"):].split(";", 1)[0]
        if not mime.startswith("image/"):
            mime = "image/jpeg"
        return f"data:{mime};base64,{payload}"

    import io
    from PIL import Image as PILImage

    if not isinstance(image_input, PILImage.Image):
        image_input = PILImage.fromarray(image_input)
    # JPEG cannot store alpha or palette modes; saving them raises OSError.
    if image_input.mode not in ("RGB", "L"):
        image_input = image_input.convert("RGB")
    buf = io.BytesIO()
    image_input.save(buf, format="JPEG")
    b64 = base64.b64encode(buf.getvalue()).decode()
    return f"data:image/jpeg;base64,{b64}"


def generate_reply(
    message: str,
    history: list,
    model_id: str,
    thinking_mode: str,
    image_input,
    system_prompt: str,
    max_new_tokens: int,
    temperature: float,
    top_p: float,
) -> Generator[str, None, None]:
    """Stream a chat completion, yielding the formatted reply so far.

    Args:
        message: The new user message.
        history: Earlier turns, as role/content dicts or (user, assistant) pairs.
        model_id: Key into ``MODEL_CAPS``.
        thinking_mode: Mode label; thinking is enabled when it contains
            "Thinking" and the model supports it.
        image_input: Optional image (data-URL string, PIL image, or array);
            ignored for models without vision.
        system_prompt: Optional system prompt; blank values are skipped.
        max_new_tokens: Requested limit, clamped to the model's ``max_tokens``.
        temperature: Requested value, clamped to ``[0, temp_max]``.
        top_p: Nucleus-sampling value, sent only if the model declares support.

    Yields:
        The cumulative markdown reply after each streamed chunk, or a single
        markdown error message if the request cannot be made or fails.
    """
    cap = MODEL_CAPS.get(model_id)
    if cap is None:
        yield (f"**Error:** unknown model `{model_id}`\n\n"
               f"_Choose one of: {', '.join(MODEL_CAPS)}._")
        return

    use_think = "Thinking" in str(thinking_mode or "") and cap["thinking"]
    max_new_tokens = max(1, min(int(max_new_tokens), cap["max_tokens"]))
    temperature = max(0.0, min(float(temperature), cap["temp_max"]))

    messages: list[dict] = []
    prompt = (system_prompt or "").strip()
    if prompt:
        messages.append({"role": "system", "content": prompt})
    messages.extend(_history_to_messages(history))

    user_text = build_user_message(message, use_think)
    content = user_text
    if _has_image(image_input) and cap["vision"]:
        try:
            image_url = _image_data_url(image_input)
        except (ImportError, OSError, TypeError, ValueError, AttributeError) as exc:
            # Report bad uploads in the chat rather than crashing the handler.
            yield f"**Error:** could not read the attached image\n\n```\n{exc}\n```"
            return
        content = [
            {"type": "image_url", "image_url": {"url": image_url}},
            {"type": "text", "text": user_text},
        ]
    messages.append({"role": "user", "content": content})

    client = InferenceClient(token=os.getenv("HF_TOKEN"), timeout=120)
    try:
        stream = client.chat_completion(
            model=model_id,
            messages=messages,
            max_tokens=max_new_tokens,
            temperature=temperature,
            top_p=float(top_p) if cap.get("top_p", True) else None,
            stream=True,
        )
        raw = ""
        for chunk in stream:
            if not chunk.choices:
                continue
            delta = chunk.choices[0].delta
            if not delta or not delta.content:
                continue
            raw += delta.content
            yield format_response(raw)
    except Exception as exc:  # request boundary: surface any backend failure
        yield (f"**Error:** `{model_id}`\n\n```\n{exc}\n```\n\n"
               "_Check HF\\_TOKEN or try another model._")


# ══════════════════════════════════════════════════════════════════════════════
# 5.
# GRADIO BLOCKS (hidden – only serves /gradio/gradio_api/call/chat API)
# ══════════════════════════════════════════════════════════════════════════════

# Labels for the hidden thinking-mode radio.  generate_reply enables thinking
# when the selected label contains the substring "Thinking".
_FAST_MODE_LABEL = "⚡ Fast Mode (direct answer)"
_THINKING_MODE_LABEL = "🧠 Thinking Mode (chain-of-thought reasoning)"

with gr.Blocks(title="Qwen3.5 MultiChat API") as gradio_demo:
    # Every control is invisible: the components exist only so the chat API
    # endpoint accepts these values as additional inputs.
    model_dd = gr.Dropdown(
        choices=MODEL_IDS,
        value=DEFAULT_MODEL,
        visible=False,
    )
    thinking_toggle = gr.Radio(
        choices=[_FAST_MODE_LABEL, _THINKING_MODE_LABEL],
        value=_FAST_MODE_LABEL,
        visible=False,
    )
    # receives base64 data URL from JS
    image_input = gr.Textbox(value="", visible=False)
    system_prompt = gr.Textbox(value=PRESETS["general"], visible=False)
    max_new_tokens = gr.Slider(minimum=64, maximum=8192, value=1024, visible=False)
    temperature = gr.Slider(minimum=0.0, maximum=2.0, value=0.7, visible=False)
    top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.9, visible=False)

    # Order matches the parameters of generate_reply after (message, history).
    _chat_extra_inputs = [
        model_dd,
        thinking_toggle,
        image_input,
        system_prompt,
        max_new_tokens,
        temperature,
        top_p,
    ]
    gr.ChatInterface(
        fn=generate_reply,
        api_name="chat",
        additional_inputs=_chat_extra_inputs,
    )

# ══════════════════════════════════════════════════════════════════════════════
# 6.
# FASTAPI – index.html + HF OAuth + Gradio API
# ══════════════════════════════════════════════════════════════════════════════
import pathlib
import secrets
import time
from urllib.parse import urlencode

fapp = FastAPI()
SESSIONS: dict[str, dict] = {}   # session_id → user info
HTML = pathlib.Path(__file__).parent / "index.html"

SESSION_COOKIE = "mc_session"
STATE_COOKIE = "mc_oauth_state"        # binds the OAuth `state` to this browser
SESSION_TTL = 60 * 60 * 24 * 7         # seconds; also the session cookie max-age
STATE_TTL = 10 * 60                    # seconds allowed to finish the login

# session_id → expiry (epoch seconds).  Kept apart from SESSIONS so the JSON
# returned by /oauth/user keeps its shape.
_SESSION_EXPIRY: dict[str, float] = {}

# ── HF OAuth config (auto-injected by HF Spaces when OAuth is enabled) ────────
CLIENT_ID = os.getenv("OAUTH_CLIENT_ID", "")
CLIENT_SECRET = os.getenv("OAUTH_CLIENT_SECRET", "")
SPACE_HOST = os.getenv("SPACE_HOST", "localhost:7860")
REDIRECT_URI = f"https://{SPACE_HOST}/login/callback"

# Startup OAuth status log
print(f"[OAuth] CLIENT_ID set: {bool(CLIENT_ID)}")
print(f"[OAuth] CLIENT_SECRET set: {bool(CLIENT_SECRET)}")
print(f"[OAuth] SPACE_HOST: {SPACE_HOST}")
print(f"[OAuth] REDIRECT_URI: {REDIRECT_URI}")

HF_AUTH_URL = "https://huggingface.co/oauth/authorize"
HF_TOKEN_URL = "https://huggingface.co/oauth/token"
HF_USER_URL = "https://huggingface.co/oauth/userinfo"
SCOPES = os.getenv("OAUTH_SCOPES", "openid profile")


def _sid(req: Request) -> Optional[str]:
    """Return the session id carried by the request's session cookie, if any."""
    return req.cookies.get(SESSION_COOKIE)


def _drop_session(sid: str) -> None:
    """Forget a session and its expiry record; unknown ids are ignored."""
    SESSIONS.pop(sid, None)
    _SESSION_EXPIRY.pop(sid, None)


def _prune_sessions(now: float) -> None:
    """Remove every expired session so the in-memory store stays bounded."""
    expired = [sid for sid, expires in _SESSION_EXPIRY.items() if expires <= now]
    for sid in expired:
        _drop_session(sid)


def _user(req: Request) -> Optional[dict]:
    """Return the logged-in user's info, or None if absent or expired."""
    sid = _sid(req)
    if not sid:
        return None
    expires = _SESSION_EXPIRY.get(sid)
    if expires is not None and expires <= time.time():
        _drop_session(sid)
        return None
    return SESSIONS.get(sid)


def _auth_failed() -> RedirectResponse:
    """Redirect home with the auth-error flag and clear the pending state."""
    resp = RedirectResponse("/?auth_error=1")
    resp.delete_cookie(STATE_COOKIE)
    return resp


# ── Routes ────────────────────────────────────────────────────────────────────
@fapp.get("/")
async def root(request: Request):
    """Serve index.html, or a short placeholder page when the file is missing."""
    try:
        html = HTML.read_text(encoding="utf-8")
    except FileNotFoundError:
        html = "<h1>index.html missing</h1>"
    return HTMLResponse(html)


@fapp.get("/oauth/user")
async def oauth_user(request: Request):
    """Return the session's user info as JSON, or 401 when not logged in."""
    u = _user(request)
    if u:
        return JSONResponse(u)
    return JSONResponse({"logged_in": False}, status_code=401)


@fapp.get("/oauth/login")
async def oauth_login(request: Request):
    """Start the HF OAuth authorization-code flow.

    A random `state` is sent to the provider and also stored in a short-lived
    cookie so the callback can confirm the response belongs to this browser.
    """
    print(f"[OAuth] /oauth/login called. CLIENT_ID={bool(CLIENT_ID)}")
    if not CLIENT_ID:
        print("[OAuth] ERROR: OAUTH_CLIENT_ID not set — add hf_oauth: true to README.md")
        return RedirectResponse("/?oauth_error=not_configured")
    state = secrets.token_urlsafe(16)
    params = {
        "response_type": "code",
        "client_id": CLIENT_ID,
        "redirect_uri": REDIRECT_URI,
        "scope": SCOPES,
        "state": state,
    }
    url = f"{HF_AUTH_URL}?{urlencode(params)}"
    print(f"[OAuth] Redirecting → {url[:120]}")
    resp = RedirectResponse(url, status_code=302)
    resp.set_cookie(STATE_COOKIE, state, httponly=True, samesite="lax",
                    secure=True, max_age=STATE_TTL)
    return resp


@fapp.get("/login/callback")
async def oauth_callback(request: Request, code: str = "", error: str = "", state: str = ""):
    """Finish the OAuth flow: verify state, exchange the code, open a session.

    `request` is injected by FastAPI and is needed to read the state cookie.
    Every failure path redirects to ``/?auth_error=1``.
    """
    if error or not code:
        print(f"[OAuth] Callback error: {error}")
        return _auth_failed()

    # Reject responses whose state does not match the one issued to this
    # browser (login CSRF).  Bytes are compared because compare_digest only
    # accepts ASCII when given str.
    expected_state = request.cookies.get(STATE_COOKIE, "")
    if not state or not expected_state or not secrets.compare_digest(
        state.encode(), expected_state.encode()
    ):
        print("[OAuth] Callback rejected: state mismatch")
        return _auth_failed()

    # Basic auth as recommended by HF docs
    basic = base64.b64encode(f"{CLIENT_ID}:{CLIENT_SECRET}".encode()).decode()

    try:
        async with httpx.AsyncClient() as client:
            # Exchange code for token — use Authorization: Basic header
            tok = await client.post(HF_TOKEN_URL, data={
                "grant_type": "authorization_code",
                "code": code,
                "redirect_uri": REDIRECT_URI,
            }, headers={
                "Accept": "application/json",
                "Authorization": f"Basic {basic}",
            })
            if tok.status_code != 200:
                print(f"[OAuth] Token exchange FAILED: {tok.status_code} {tok.text[:300]}")
                return _auth_failed()

            token_payload = tok.json()
            access_token = (
                token_payload.get("access_token", "")
                if isinstance(token_payload, dict) else ""
            )
            if not access_token:
                print(f"[OAuth] No access_token: {tok.text[:300]}")
                return _auth_failed()

            # Get user info
            uinfo = await client.get(
                HF_USER_URL, headers={"Authorization": f"Bearer {access_token}"}
            )
            if uinfo.status_code != 200:
                print(f"[OAuth] Userinfo FAILED: {uinfo.status_code}")
                return _auth_failed()
            user = uinfo.json()
    except (httpx.HTTPError, ValueError) as exc:
        # Network failure or a non-JSON body: fail the login, not the request.
        print(f"[OAuth] Callback request failed: {exc!r}")
        return _auth_failed()

    if not isinstance(user, dict):
        print("[OAuth] Userinfo FAILED: unexpected payload")
        return _auth_failed()

    print(f"[OAuth] Login OK: {user.get('preferred_username', '?')}")

    now = time.time()
    _prune_sessions(now)
    sid = secrets.token_urlsafe(32)
    SESSIONS[sid] = {
        "logged_in": True,
        "username": user.get("preferred_username", user.get("name", "User")),
        "name": user.get("name", ""),
        "avatar": user.get("picture", ""),
        "profile": f"https://huggingface.co/{user.get('preferred_username', '')}",
    }
    _SESSION_EXPIRY[sid] = now + SESSION_TTL

    resp = RedirectResponse("/")
    resp.set_cookie(SESSION_COOKIE, sid, httponly=True, samesite="lax",
                    secure=True, max_age=SESSION_TTL)
    resp.delete_cookie(STATE_COOKIE)
    return resp


@fapp.get("/oauth/logout")
async def oauth_logout(request: Request):
    """Drop the caller's session (if any), clear the cookie, redirect home."""
    sid = _sid(request)
    if sid:
        _drop_session(sid)
    resp = RedirectResponse("/")
    resp.delete_cookie(SESSION_COOKIE)
    return resp


@fapp.get("/health")
async def health():
    """Liveness probe."""
    return {"status": "ok"}


# Mount Gradio at /gradio  → API at /gradio/gradio_api/call/chat
app = gr.mount_gradio_app(fapp, gradio_demo, path="/gradio")

# ── Launch ────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    print("[BOOT] All components initialized. Starting uvicorn on :7860", flush=True)
    uvicorn.run(app, host="0.0.0.0", port=7860)