Spaces:

build-small-hackathon
/

lesson-agent

Running on Zero

App Files Files Community

MSGEncrypted committed 23 days ago

Commit

9a7964b

1 Parent(s): 9341111

app inference gradio

Browse files

Files changed (5) hide show

.env.example +16 -3
Dockerfile +1 -1
apps/gradio-space/src/gradio_space/app.py +92 -38
libs/inference/pyproject.toml +1 -0
models.yaml +47 -0

.env.example CHANGED Viewed

@@ -1,12 +1,25 @@
 INFERENCE_BACKEND=llama_cpp
 MODEL_REPO=Qwen/Qwen2.5-3B-Instruct-GGUF
 MODEL_FILE=qwen2.5-3b-instruct-q4_k_m.gguf
 N_CTX=4096
 N_GPU_LAYERS=0
-# Optional: local GGUF path instead of Hub download
 # MODEL_PATH=./models/qwen2.5-3b-instruct-q4_k_m.gguf
-# Optional: transformers backend (requires inference[transformers] extra)
 # INFERENCE_BACKEND=transformers
-# MODEL_ID=Qwen/Qwen2.5-3B-Instruct

+# --- Preset selection (models.yaml is the source of truth) ---
+ACTIVE_MODEL=qwen3b-gguf
+# Dev: set to true to show the model dropdown in Gradio. Space: set to false to pin one model for visitors.
+ALLOW_MODEL_SWITCH=true
+# MODEL_PRESETS_PATH=./models.yaml
+# --- Legacy single-model overrides (optional; applied to ACTIVE_MODEL only) ---
 INFERENCE_BACKEND=llama_cpp
 MODEL_REPO=Qwen/Qwen2.5-3B-Instruct-GGUF
 MODEL_FILE=qwen2.5-3b-instruct-q4_k_m.gguf
 N_CTX=4096
 N_GPU_LAYERS=0
+# Optional: local GGUF path instead of Hub download (set in models.yaml model_path too)
 # MODEL_PATH=./models/qwen2.5-3b-instruct-q4_k_m.gguf
+# Optional: transformers presets (requires inference[transformers] extra)
+# ACTIVE_MODEL=minicpm5-1b
 # INFERENCE_BACKEND=transformers
+# MODEL_ID=openbmb/MiniCPM5-1B
+# TRUST_REMOTE_CODE=true
+# Optional: local fine-tuned merged weights
+# ACTIVE_MODEL=gemma-merged-local
+# MODEL_ID=./gemma_merged_model

Dockerfile CHANGED Viewed

@@ -13,7 +13,7 @@ COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
 WORKDIR /app
-COPY pyproject.toml uv.lock .python-version README.md ./
 COPY apps/gradio-space/pyproject.toml apps/gradio-space/README.md apps/gradio-space/
 COPY libs/inference/pyproject.toml libs/inference/README.md libs/inference/
 COPY apps/gradio-space/src apps/gradio-space/src

 WORKDIR /app
+COPY pyproject.toml uv.lock .python-version README.md models.yaml ./
 COPY apps/gradio-space/pyproject.toml apps/gradio-space/README.md apps/gradio-space/
 COPY libs/inference/pyproject.toml libs/inference/README.md libs/inference/
 COPY apps/gradio-space/src apps/gradio-space/src

apps/gradio-space/src/gradio_space/app.py CHANGED Viewed

@@ -2,36 +2,39 @@ import os
 import gradio as gr
-from inference.factory import get_backend
-_backend = get_backend()
-_model_ready = False
-_load_error: str | None = None
-def _ensure_model_loaded() -> str | None:
-    global _model_ready, _load_error
-    if _model_ready:
         return None
-    if _load_error:
-        return _load_error
     try:
-        _backend.load()
-        _model_ready = True
         return None
     except Exception as exc:  # noqa: BLE001 — surface model load failures in the UI
-        _load_error = f"Failed to load model: {exc}"
-        return _load_error
-def chat(message: str, history: list) -> str:
-    load_error = _ensure_model_loaded()
-    if load_error:
-        return load_error
     messages: list[dict[str, str]] = []
     for item in history:
         if isinstance(item, dict):
@@ -41,48 +44,99 @@ def chat(message: str, history: list) -> str:
             messages.append({"role": "user", "content": user_msg})
             if assistant_msg:
                 messages.append({"role": "assistant", "content": assistant_msg})
     messages.append({"role": "user", "content": message})
-    return _backend.chat(messages)
-def warmup() -> str:
-    if _model_ready:
-        return "Model ready."
-    if _load_error:
-        return _load_error
     return (
-        "Model not loaded yet. It will download from Hugging Face Hub on the "
-        "first chat message — this can take a few minutes on CPU."
     )
 def build_demo() -> gr.Blocks:
-    model_repo = os.environ.get("MODEL_REPO", "Qwen/Qwen2.5-3B-Instruct-GGUF")
-    model_file = os.environ.get("MODEL_FILE", "qwen2.5-3b-instruct-q4_k_m.gguf")
-    backend_name = os.environ.get("INFERENCE_BACKEND", "llama_cpp")
     with gr.Blocks(title="Small Model Hackathon") as demo:
         gr.Markdown(
             f"""
 # Small Model Chat
-Local inference via **{backend_name}**.
-- **Repo:** `{model_repo}`
-- **File:** `{model_file}`
 Part of the [Build Small Hackathon](https://huggingface.co/build-small-hackathon).
 """
         )
-        status = gr.Markdown(warmup())
-        gr.ChatInterface(
-            fn=chat,
-            examples=["Hello! What can you help me with?", "Explain llama.cpp in one sentence."],
-        )
-        demo.load(warmup, outputs=status)
     return demo

 import gradio as gr
+from inference.config import get_app_config, get_model_config
+from inference.factory import get_backend, reset_backend
+_app_config = get_app_config()
+_current_model_key: str | None = None
+_load_state: dict[str, bool] = {}
+_load_errors: dict[str, str] = {}
+def _ensure_model_loaded(model_key: str) -> str | None:
+    global _current_model_key
+    if model_key != _current_model_key:
+        reset_backend()
+        _current_model_key = model_key
+    if _load_state.get(model_key):
         return None
+    if model_key in _load_errors:
+        return _load_errors[model_key]
     try:
+        get_backend(model_key).load()
+        _load_state[model_key] = True
         return None
     except Exception as exc:  # noqa: BLE001 — surface model load failures in the UI
+        message = f"Failed to load model: {exc}"
+        _load_errors[model_key] = message
+        return message
+def _history_to_messages(history: list) -> list[dict[str, str]]:
     messages: list[dict[str, str]] = []
     for item in history:
         if isinstance(item, dict):
             messages.append({"role": "user", "content": user_msg})
             if assistant_msg:
                 messages.append({"role": "assistant", "content": assistant_msg})
+    return messages
+def chat(message: str, history: list, model_key: str) -> str:
+    load_error = _ensure_model_loaded(model_key)
+    if load_error:
+        return load_error
+    messages = _history_to_messages(history)
     messages.append({"role": "user", "content": message})
+    return get_backend(model_key).chat(messages)
+def warmup(model_key: str | None = None) -> str:
+    key = model_key or _app_config.active_model
+    model = get_model_config(key)
+    if _load_state.get(key):
+        return f"Model ready: {model.label}"
+    if key in _load_errors:
+        return _load_errors[key]
     return (
+        f"Preset `{key}` selected ({model.backend}). "
+        "Weights load on the first chat message — this can take a few minutes on CPU."
     )
+def model_status(model_key: str) -> str:
+    model = get_model_config(model_key)
+    return f"**{model.label}**\n\n- Backend: `{model.backend}`\n- {warmup(model_key)}"
 def build_demo() -> gr.Blocks:
+    active = _app_config.active
+    presets_note = (
+        f"Presets file: `{_app_config.presets_path}`"
+        if _app_config.presets_path
+        else "Using built-in presets (models.yaml not found)."
+    )
     with gr.Blocks(title="Small Model Hackathon") as demo:
         gr.Markdown(
             f"""
 # Small Model Chat
+Local inference with preset-based configuration.
+- **Default preset:** `{active.key}` — {active.label}
+- **Backend:** `{active.backend}`
+- {presets_note}
 Part of the [Build Small Hackathon](https://huggingface.co/build-small-hackathon).
 """
         )
+        model_key = gr.State(_app_config.active_model)
+        if _app_config.allow_model_switch and len(_app_config.models) > 1:
+            model_dropdown = gr.Dropdown(
+                choices=_app_config.model_choices(),
+                value=_app_config.active_model,
+                label="Model preset",
+                info="Switch presets for local testing. Each preset loads on first use.",
+            )
+            status = gr.Markdown(model_status(_app_config.active_model))
+            model_dropdown.change(
+                fn=model_status,
+                inputs=model_dropdown,
+                outputs=status,
+            ).then(
+                fn=lambda key: key,
+                inputs=model_dropdown,
+                outputs=model_key,
+            )
+            gr.ChatInterface(
+                fn=chat,
+                additional_inputs=[model_dropdown],
+                examples=[
+                    ["Hello! What can you help me with?", _app_config.active_model],
+                    ["Explain llama.cpp in one sentence.", _app_config.active_model],
+                ],
+            )
+        else:
+            status = gr.Markdown(model_status(_app_config.active_model))
+            gr.ChatInterface(
+                fn=lambda message, history: chat(message, history, _app_config.active_model),
+                examples=["Hello! What can you help me with?", "Explain llama.cpp in one sentence."],
+            )
+            demo.load(lambda: warmup(_app_config.active_model), outputs=status)
     return demo

libs/inference/pyproject.toml CHANGED Viewed

@@ -10,6 +10,7 @@ requires-python = ">=3.12"
 dependencies = [
     "huggingface-hub>=0.27.0",
     "llama-cpp-python>=0.3.0",
 ]
 [project.optional-dependencies]

 dependencies = [
     "huggingface-hub>=0.27.0",
     "llama-cpp-python>=0.3.0",
+    "pyyaml>=6.0.2",
 ]
 [project.optional-dependencies]

models.yaml ADDED Viewed

@@ -0,0 +1,47 @@

+# Model preset registry for dev and Hugging Face Space.
+# Select active preset with ACTIVE_MODEL; override any field via .env (see .env.example).
+defaults:
+  active_model: qwen3b-gguf
+  # Dev: set ALLOW_MODEL_SWITCH=true in .env to expose a dropdown in Gradio.
+  # Space: keep false so visitors use one pinned model.
+  allow_model_switch: false
+models:
+  qwen3b-gguf:
+    label: Qwen 2.5 3B Instruct (GGUF, default)
+    backend: llama_cpp
+    model_repo: Qwen/Qwen2.5-3B-Instruct-GGUF
+    model_file: qwen2.5-3b-instruct-q4_k_m.gguf
+    n_ctx: 4096
+    n_gpu_layers: 0
+  llama32-3b-gguf:
+    label: Llama 3.2 3B Instruct (GGUF)
+    backend: llama_cpp
+    model_repo: bartowski/Llama-3.2-3B-Instruct-GGUF
+    model_file: Llama-3.2-3B-Instruct-Q4_K_M.gguf
+    n_ctx: 4096
+    n_gpu_layers: 0
+  minicpm5-1b:
+    label: MiniCPM5 1B (Transformers)
+    backend: transformers
+    model_id: openbmb/MiniCPM5-1B
+    trust_remote_code: true
+  gemma4-e2b-mobile:
+    label: Gemma 4 E2B IT QAT Mobile (Transformers)
+    backend: transformers
+    model_id: google/gemma-4-E2B-it-qat-mobile-transformers
+    trust_remote_code: true
+  gemma-merged-local:
+    label: Fine-tuned merged model (local path)
+    backend: transformers
+    model_id: ./gemma_merged_model
+  gemma-lora-local:
+    label: Fine-tuned LoRA adapter (local path)
+    backend: transformers
+    model_id: ./gemma_finetuned_model