MSGEncrypted committed on
Commit
9a7964b
·
1 Parent(s): 9341111

app inference gradio

Browse files
.env.example CHANGED
@@ -1,12 +1,25 @@
 
 
 
 
 
 
 
1
  INFERENCE_BACKEND=llama_cpp
2
  MODEL_REPO=Qwen/Qwen2.5-3B-Instruct-GGUF
3
  MODEL_FILE=qwen2.5-3b-instruct-q4_k_m.gguf
4
  N_CTX=4096
5
  N_GPU_LAYERS=0
6
 
7
- # Optional: local GGUF path instead of Hub download
8
  # MODEL_PATH=./models/qwen2.5-3b-instruct-q4_k_m.gguf
9
 
10
- # Optional: transformers backend (requires inference[transformers] extra)
 
11
  # INFERENCE_BACKEND=transformers
12
- # MODEL_ID=Qwen/Qwen2.5-3B-Instruct
 
 
 
 
 
 
1
+ # --- Preset selection (models.yaml is the source of truth) ---
2
+ ACTIVE_MODEL=qwen3b-gguf
3
+ # Dev: set to true to show a model dropdown in Gradio. Space: set to false to pin one model for visitors.
4
+ ALLOW_MODEL_SWITCH=true
5
+ # MODEL_PRESETS_PATH=./models.yaml
6
+
7
+ # --- Legacy single-model overrides (optional; applied to ACTIVE_MODEL only) ---
8
  INFERENCE_BACKEND=llama_cpp
9
  MODEL_REPO=Qwen/Qwen2.5-3B-Instruct-GGUF
10
  MODEL_FILE=qwen2.5-3b-instruct-q4_k_m.gguf
11
  N_CTX=4096
12
  N_GPU_LAYERS=0
13
 
14
+ # Optional: local GGUF path instead of Hub download (can also be set as model_path in models.yaml)
15
  # MODEL_PATH=./models/qwen2.5-3b-instruct-q4_k_m.gguf
16
 
17
+ # Optional: transformers presets (requires inference[transformers] extra)
18
+ # ACTIVE_MODEL=minicpm5-1b
19
  # INFERENCE_BACKEND=transformers
20
+ # MODEL_ID=openbmb/MiniCPM5-1B
21
+ # TRUST_REMOTE_CODE=true
22
+
23
+ # Optional: local fine-tuned merged weights
24
+ # ACTIVE_MODEL=gemma-merged-local
25
+ # MODEL_ID=./gemma_merged_model
Dockerfile CHANGED
@@ -13,7 +13,7 @@ COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
13
 
14
  WORKDIR /app
15
 
16
- COPY pyproject.toml uv.lock .python-version README.md ./
17
  COPY apps/gradio-space/pyproject.toml apps/gradio-space/README.md apps/gradio-space/
18
  COPY libs/inference/pyproject.toml libs/inference/README.md libs/inference/
19
  COPY apps/gradio-space/src apps/gradio-space/src
 
13
 
14
  WORKDIR /app
15
 
16
+ COPY pyproject.toml uv.lock .python-version README.md models.yaml ./
17
  COPY apps/gradio-space/pyproject.toml apps/gradio-space/README.md apps/gradio-space/
18
  COPY libs/inference/pyproject.toml libs/inference/README.md libs/inference/
19
  COPY apps/gradio-space/src apps/gradio-space/src
apps/gradio-space/src/gradio_space/app.py CHANGED
@@ -2,36 +2,39 @@ import os
2
 
3
  import gradio as gr
4
 
5
- from inference.factory import get_backend
 
6
 
7
- _backend = get_backend()
8
- _model_ready = False
9
- _load_error: str | None = None
 
10
 
11
 
12
- def _ensure_model_loaded() -> str | None:
13
- global _model_ready, _load_error
14
 
15
- if _model_ready:
 
 
 
 
16
  return None
17
 
18
- if _load_error:
19
- return _load_error
20
 
21
  try:
22
- _backend.load()
23
- _model_ready = True
24
  return None
25
  except Exception as exc: # noqa: BLE001 — surface model load failures in the UI
26
- _load_error = f"Failed to load model: {exc}"
27
- return _load_error
28
-
29
 
30
- def chat(message: str, history: list) -> str:
31
- load_error = _ensure_model_loaded()
32
- if load_error:
33
- return load_error
34
 
 
35
  messages: list[dict[str, str]] = []
36
  for item in history:
37
  if isinstance(item, dict):
@@ -41,48 +44,99 @@ def chat(message: str, history: list) -> str:
41
  messages.append({"role": "user", "content": user_msg})
42
  if assistant_msg:
43
  messages.append({"role": "assistant", "content": assistant_msg})
 
 
 
 
 
 
 
44
 
 
45
  messages.append({"role": "user", "content": message})
46
- return _backend.chat(messages)
47
 
48
 
49
- def warmup() -> str:
50
- if _model_ready:
51
- return "Model ready."
52
 
53
- if _load_error:
54
- return _load_error
 
 
 
55
 
56
  return (
57
- "Model not loaded yet. It will download from Hugging Face Hub on the "
58
- "first chat message — this can take a few minutes on CPU."
59
  )
60
 
61
 
 
 
 
 
 
62
  def build_demo() -> gr.Blocks:
63
- model_repo = os.environ.get("MODEL_REPO", "Qwen/Qwen2.5-3B-Instruct-GGUF")
64
- model_file = os.environ.get("MODEL_FILE", "qwen2.5-3b-instruct-q4_k_m.gguf")
65
- backend_name = os.environ.get("INFERENCE_BACKEND", "llama_cpp")
 
 
 
66
 
67
  with gr.Blocks(title="Small Model Hackathon") as demo:
68
  gr.Markdown(
69
  f"""
70
  # Small Model Chat
71
 
72
- Local inference via **{backend_name}**.
73
 
74
- - **Repo:** `{model_repo}`
75
- - **File:** `{model_file}`
 
76
 
77
  Part of the [Build Small Hackathon](https://huggingface.co/build-small-hackathon).
78
  """
79
  )
80
- status = gr.Markdown(warmup())
81
- gr.ChatInterface(
82
- fn=chat,
83
- examples=["Hello! What can you help me with?", "Explain llama.cpp in one sentence."],
84
- )
85
- demo.load(warmup, outputs=status)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
 
87
  return demo
88
 
 
2
 
3
  import gradio as gr
4
 
5
+ from inference.config import get_app_config, get_model_config
6
+ from inference.factory import get_backend, reset_backend
7
 
8
+ _app_config = get_app_config()
9
+ _current_model_key: str | None = None
10
+ _load_state: dict[str, bool] = {}
11
+ _load_errors: dict[str, str] = {}
12
 
13
 
14
+ def _ensure_model_loaded(model_key: str) -> str | None:
15
+ global _current_model_key
16
 
17
+ if model_key != _current_model_key:
18
+ reset_backend()
19
+ _current_model_key = model_key
20
+
21
+ if _load_state.get(model_key):
22
  return None
23
 
24
+ if model_key in _load_errors:
25
+ return _load_errors[model_key]
26
 
27
  try:
28
+ get_backend(model_key).load()
29
+ _load_state[model_key] = True
30
  return None
31
  except Exception as exc: # noqa: BLE001 — surface model load failures in the UI
32
+ message = f"Failed to load model: {exc}"
33
+ _load_errors[model_key] = message
34
+ return message
35
 
 
 
 
 
36
 
37
+ def _history_to_messages(history: list) -> list[dict[str, str]]:
38
  messages: list[dict[str, str]] = []
39
  for item in history:
40
  if isinstance(item, dict):
 
44
  messages.append({"role": "user", "content": user_msg})
45
  if assistant_msg:
46
  messages.append({"role": "assistant", "content": assistant_msg})
47
+ return messages
48
+
49
+
50
+ def chat(message: str, history: list, model_key: str) -> str:
51
+ load_error = _ensure_model_loaded(model_key)
52
+ if load_error:
53
+ return load_error
54
 
55
+ messages = _history_to_messages(history)
56
  messages.append({"role": "user", "content": message})
57
+ return get_backend(model_key).chat(messages)
58
 
59
 
60
+ def warmup(model_key: str | None = None) -> str:
61
+ key = model_key or _app_config.active_model
62
+ model = get_model_config(key)
63
 
64
+ if _load_state.get(key):
65
+ return f"Model ready: {model.label}"
66
+
67
+ if key in _load_errors:
68
+ return _load_errors[key]
69
 
70
  return (
71
+ f"Preset `{key}` selected ({model.backend}). "
72
+ "Weights load on the first chat message — this can take a few minutes on CPU."
73
  )
74
 
75
 
76
+ def model_status(model_key: str) -> str:
77
+ model = get_model_config(model_key)
78
+ return f"**{model.label}**\n\n- Backend: `{model.backend}`\n- {warmup(model_key)}"
79
+
80
+
81
  def build_demo() -> gr.Blocks:
82
+ active = _app_config.active
83
+ presets_note = (
84
+ f"Presets file: `{_app_config.presets_path}`"
85
+ if _app_config.presets_path
86
+ else "Using built-in presets (models.yaml not found)."
87
+ )
88
 
89
  with gr.Blocks(title="Small Model Hackathon") as demo:
90
  gr.Markdown(
91
  f"""
92
  # Small Model Chat
93
 
94
+ Local inference with preset-based configuration.
95
 
96
+ - **Default preset:** `{active.key}` — {active.label}
97
+ - **Backend:** `{active.backend}`
98
+ - {presets_note}
99
 
100
  Part of the [Build Small Hackathon](https://huggingface.co/build-small-hackathon).
101
  """
102
  )
103
+
104
+ model_key = gr.State(_app_config.active_model)
105
+
106
+ if _app_config.allow_model_switch and len(_app_config.models) > 1:
107
+ model_dropdown = gr.Dropdown(
108
+ choices=_app_config.model_choices(),
109
+ value=_app_config.active_model,
110
+ label="Model preset",
111
+ info="Switch presets for local testing. Each preset loads on first use.",
112
+ )
113
+ status = gr.Markdown(model_status(_app_config.active_model))
114
+
115
+ model_dropdown.change(
116
+ fn=model_status,
117
+ inputs=model_dropdown,
118
+ outputs=status,
119
+ ).then(
120
+ fn=lambda key: key,
121
+ inputs=model_dropdown,
122
+ outputs=model_key,
123
+ )
124
+
125
+ gr.ChatInterface(
126
+ fn=chat,
127
+ additional_inputs=[model_dropdown],
128
+ examples=[
129
+ ["Hello! What can you help me with?", _app_config.active_model],
130
+ ["Explain llama.cpp in one sentence.", _app_config.active_model],
131
+ ],
132
+ )
133
+ else:
134
+ status = gr.Markdown(model_status(_app_config.active_model))
135
+ gr.ChatInterface(
136
+ fn=lambda message, history: chat(message, history, _app_config.active_model),
137
+ examples=["Hello! What can you help me with?", "Explain llama.cpp in one sentence."],
138
+ )
139
+ demo.load(lambda: warmup(_app_config.active_model), outputs=status)
140
 
141
  return demo
142
 
libs/inference/pyproject.toml CHANGED
@@ -10,6 +10,7 @@ requires-python = ">=3.12"
10
  dependencies = [
11
  "huggingface-hub>=0.27.0",
12
  "llama-cpp-python>=0.3.0",
 
13
  ]
14
 
15
  [project.optional-dependencies]
 
10
  dependencies = [
11
  "huggingface-hub>=0.27.0",
12
  "llama-cpp-python>=0.3.0",
13
+ "pyyaml>=6.0.2",
14
  ]
15
 
16
  [project.optional-dependencies]
models.yaml ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model preset registry for dev and Hugging Face Space.
2
+ # Select the active preset with ACTIVE_MODEL; override that preset's fields via .env (see .env.example).
3
+
4
+ defaults:
5
+ active_model: qwen3b-gguf
6
+ # Dev: set ALLOW_MODEL_SWITCH=true in .env to expose a dropdown in Gradio.
7
+ # Space: keep false so visitors use one pinned model.
8
+ allow_model_switch: false
9
+
10
+ models:
11
+ qwen3b-gguf:
12
+ label: Qwen 2.5 3B Instruct (GGUF, default)
13
+ backend: llama_cpp
14
+ model_repo: Qwen/Qwen2.5-3B-Instruct-GGUF
15
+ model_file: qwen2.5-3b-instruct-q4_k_m.gguf
16
+ n_ctx: 4096
17
+ n_gpu_layers: 0
18
+
19
+ llama32-3b-gguf:
20
+ label: Llama 3.2 3B Instruct (GGUF)
21
+ backend: llama_cpp
22
+ model_repo: bartowski/Llama-3.2-3B-Instruct-GGUF
23
+ model_file: Llama-3.2-3B-Instruct-Q4_K_M.gguf
24
+ n_ctx: 4096
25
+ n_gpu_layers: 0
26
+
27
+ minicpm5-1b:
28
+ label: MiniCPM5 1B (Transformers)
29
+ backend: transformers
30
+ model_id: openbmb/MiniCPM5-1B
31
+ trust_remote_code: true
32
+
33
+ gemma4-e2b-mobile:
34
+ label: Gemma 4 E2B IT QAT Mobile (Transformers)
35
+ backend: transformers
36
+ model_id: google/gemma-4-E2B-it-qat-mobile-transformers
37
+ trust_remote_code: true
38
+
39
+ gemma-merged-local:
40
+ label: Fine-tuned merged model (local path)
41
+ backend: transformers
42
+ model_id: ./gemma_merged_model
43
+
44
+ gemma-lora-local:
45
+ label: Fine-tuned LoRA adapter (local path)
46
+ backend: transformers
47
+ model_id: ./gemma_finetuned_model