MSGEncrypted committed on
Commit
7060f14
·
1 Parent(s): d1d46b8
libs/inference/README.md ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # inference
2
+
3
+ Swappable local inference backends (`llama_cpp` default, `transformers` optional extra).
4
+
5
+ ```python
6
+ from inference.factory import get_backend
7
+
8
+ backend = get_backend()
9
+ backend.load()
10
+ reply = backend.chat([{"role": "user", "content": "Hello!"}])
11
+ ```
libs/inference/pyproject.toml ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "inference"
3
+ version = "0.1.0"
4
+ description = "Swappable local inference backends for the hackathon Space"
5
+ readme = "README.md"
6
+ authors = [
7
+ { name = "MSGhais", email = "msghais135@gmail.com" }
8
+ ]
9
+ requires-python = ">=3.12"
10
+ dependencies = [
11
+ "huggingface-hub>=0.27.0",
12
+ "llama-cpp-python>=0.3.0",
13
+ ]
14
+
15
+ [project.optional-dependencies]
16
+ transformers = [
17
+ "accelerate>=1.2.0",
18
+ "torch>=2.5.0",
19
+ "transformers>=4.47.0",
20
+ ]
21
+
22
+ [build-system]
23
+ requires = ["uv_build>=0.8.13,<0.9.0"]
24
+ build-backend = "uv_build"
libs/inference/src/inference/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from inference.factory import get_backend
2
+
3
+ __all__ = ["get_backend"]
libs/inference/src/inference/base.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Protocol
2
+
3
+
4
class InferenceBackend(Protocol):
    """Structural interface shared by the local inference backends.

    Any class that provides ``load``, ``generate`` and ``chat`` with these
    signatures satisfies the protocol; no inheritance is required.
    """

    def load(self) -> None:
        """Bring the model weights into memory."""
        ...

    def generate(
        self, prompt: str, *, max_tokens: int = 512, temperature: float = 0.7
    ) -> str:
        """Produce a text completion for one prompt string.

        ``max_tokens`` and ``temperature`` are keyword-only.
        """
        ...

    def chat(
        self,
        messages: list[dict[str, str]],
        *,
        max_tokens: int = 512,
        temperature: float = 0.7,
    ) -> str:
        """Produce a reply for a history of chat messages.

        ``max_tokens`` and ``temperature`` are keyword-only.
        """
        ...
libs/inference/src/inference/factory.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from functools import lru_cache
3
+
4
+ from inference.base import InferenceBackend
5
+ from inference.llama_cpp import LlamaCppBackend
6
+
7
+
8
@lru_cache(maxsize=1)
def get_backend() -> InferenceBackend:
    """Return the inference backend selected by ``INFERENCE_BACKEND``.

    The environment variable is matched case-insensitively and with
    surrounding whitespace ignored. When it is unset or blank, the
    ``llama_cpp`` backend is used. The result is memoized by ``lru_cache``,
    so later changes to the environment variable do not affect the backend
    returned by subsequent calls.

    Returns:
        A ``LlamaCppBackend`` or ``TransformersBackend`` instance.

    Raises:
        ValueError: If the variable names a backend other than
            ``llama_cpp`` or ``transformers``.
    """
    raw_name = os.environ.get("INFERENCE_BACKEND", "")
    # Values coming from .env files or shell exports often carry stray
    # whitespace; a blank value is treated the same as an unset one.
    backend_name = raw_name.strip().lower() or "llama_cpp"

    if backend_name == "llama_cpp":
        return LlamaCppBackend()

    if backend_name == "transformers":
        # Imported here so the transformers module is only touched when that
        # backend is actually requested.
        from inference.transformers import TransformersBackend

        return TransformersBackend()

    raise ValueError(
        f"Unknown INFERENCE_BACKEND={backend_name!r}. "
        "Expected 'llama_cpp' or 'transformers'."
    )
libs/inference/src/inference/llama_cpp.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from pathlib import Path
3
+
4
+ from huggingface_hub import hf_hub_download
5
+ from llama_cpp import Llama
6
+
7
+
8
+ DEFAULT_MODEL_REPO = "Qwen/Qwen2.5-3B-Instruct-GGUF"
9
+ DEFAULT_MODEL_FILE = "qwen2.5-3b-instruct-q4_k_m.gguf"
10
+
11
+
12
class LlamaCppBackend:
    """Inference backend that runs a GGUF model through ``llama_cpp.Llama``.

    The model is loaded lazily: ``generate`` and ``chat`` call ``load`` on
    first use. Configuration is read from environment variables:

    * ``MODEL_PATH`` - local model file; when set, nothing is downloaded.
    * ``MODEL_REPO`` / ``MODEL_FILE`` - Hub repo and filename to download
      when ``MODEL_PATH`` is not set (module defaults are used otherwise).
    * ``MODEL_CACHE_DIR`` - optional cache directory for the download.
    * ``N_CTX`` (default 4096) and ``N_GPU_LAYERS`` (default 0) - passed to
      ``Llama``.
    """

    def __init__(self) -> None:
        self._model: Llama | None = None
        self._model_path: str | None = None

    @staticmethod
    def _env_int(name: str, default: int) -> int:
        """Read an integer environment variable, falling back to *default*.

        Raises:
            ValueError: If the variable is set to a non-integer value; the
                message names the offending variable.
        """
        raw = os.environ.get(name)
        if raw is None or not raw.strip():
            return default
        try:
            return int(raw)
        except ValueError as exc:
            raise ValueError(f"{name} must be an integer, got {raw!r}") from exc

    def _resolve_model_path(self) -> str:
        """Return the path of the model file to load.

        Raises:
            FileNotFoundError: If ``MODEL_PATH`` is set but does not point to
                an existing regular file.
        """
        model_path = os.environ.get("MODEL_PATH")
        if model_path:
            path = Path(model_path)
            if not path.exists():
                raise FileNotFoundError(f"MODEL_PATH does not exist: {model_path}")
            # A directory passes exists() but cannot be loaded as a model
            # file, so reject it here with a clear message.
            if not path.is_file():
                raise FileNotFoundError(f"MODEL_PATH is not a file: {model_path}")
            return str(path)

        # Blank values are treated like unset ones, matching MODEL_PATH.
        model_repo = os.environ.get("MODEL_REPO") or DEFAULT_MODEL_REPO
        model_file = os.environ.get("MODEL_FILE") or DEFAULT_MODEL_FILE
        cache_dir = os.environ.get("MODEL_CACHE_DIR") or None

        return hf_hub_download(
            repo_id=model_repo,
            filename=model_file,
            cache_dir=cache_dir,
        )

    def load(self) -> None:
        """Load the model into memory; a no-op if it is already loaded.

        Raises:
            FileNotFoundError: If ``MODEL_PATH`` is set but invalid.
            ValueError: If ``N_CTX`` or ``N_GPU_LAYERS`` is not an integer.
        """
        if self._model is not None:
            return

        # Parse the numeric settings before resolving the path so a bad
        # value fails fast, before any download is attempted.
        n_ctx = self._env_int("N_CTX", 4096)
        n_gpu_layers = self._env_int("N_GPU_LAYERS", 0)
        self._model_path = self._resolve_model_path()

        self._model = Llama(
            model_path=self._model_path,
            n_ctx=n_ctx,
            n_gpu_layers=n_gpu_layers,
            verbose=False,
        )

    def _require_model(self) -> "Llama":
        """Ensure the model is loaded and return it."""
        self.load()
        if self._model is None:
            # Explicit check instead of ``assert`` so it survives ``python -O``.
            raise RuntimeError("LlamaCppBackend model failed to load")
        return self._model

    def generate(
        self,
        prompt: str,
        *,
        max_tokens: int = 512,
        temperature: float = 0.7,
    ) -> str:
        """Complete *prompt* and return the generated text, stripped.

        The prompt itself is not echoed back in the result.
        """
        model = self._require_model()
        result = model(
            prompt,
            max_tokens=max_tokens,
            temperature=temperature,
            echo=False,
        )
        return result["choices"][0]["text"].strip()

    def chat(
        self,
        messages: list[dict[str, str]],
        *,
        max_tokens: int = 512,
        temperature: float = 0.7,
    ) -> str:
        """Return the assistant reply for *messages*, stripped.

        An empty string is returned when the completion carries no text
        content.
        """
        model = self._require_model()
        result = model.create_chat_completion(
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
        )
        # The message content may be None; treat that as an empty reply
        # rather than failing on ``None.strip()``.
        content = result["choices"][0]["message"]["content"]
        return (content or "").strip()
libs/inference/src/inference/transformers.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ from inference.base import InferenceBackend
4
+
5
+
6
class TransformersBackend:
    """Inference backend built on Hugging Face ``transformers``.

    The model named by the ``MODEL_ID`` environment variable (default
    ``Qwen/Qwen2.5-3B-Instruct``) is loaded lazily on first use. ``torch``
    and ``transformers`` are imported inside ``load`` so that this class can
    be defined without the optional dependencies installed.
    """

    def __init__(self) -> None:
        self._model = None
        self._tokenizer = None

    def load(self) -> None:
        """Load tokenizer and model; a no-op if the model is already loaded.

        On CUDA the model is loaded in float16 with ``device_map="auto"``;
        otherwise it is loaded in float32 and moved to the CPU.

        Raises:
            ImportError: If ``torch`` or ``transformers`` cannot be imported.
        """
        if self._model is not None:
            return

        try:
            import torch
            from transformers import AutoModelForCausalLM, AutoTokenizer
        except ImportError as exc:
            raise ImportError(
                "transformers backend requires optional deps. "
                "Install with: uv sync --package inference --extra transformers"
            ) from exc

        model_id = os.environ.get("MODEL_ID", "Qwen/Qwen2.5-3B-Instruct")
        on_cuda = torch.cuda.is_available()

        self._tokenizer = AutoTokenizer.from_pretrained(model_id)
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch.float16 if on_cuda else torch.float32,
            device_map="auto" if on_cuda else None,
        )
        if not on_cuda:
            model.to("cpu")
        self._model = model

    def _require_loaded(self) -> None:
        """Ensure model and tokenizer are available, loading them if needed."""
        self.load()
        if self._model is None or self._tokenizer is None:
            # Explicit check instead of ``assert`` so it survives ``python -O``.
            raise RuntimeError("TransformersBackend model failed to load")

    def generate(
        self,
        prompt: str,
        *,
        max_tokens: int = 512,
        temperature: float = 0.7,
    ) -> str:
        """Complete *prompt* and return only the newly generated text, stripped.

        Sampling is enabled when ``temperature > 0``; otherwise sampling is
        disabled and no temperature is passed to the model.
        """
        self._require_loaded()

        inputs = self._tokenizer(prompt, return_tensors="pt").to(self._model.device)

        sampling = temperature > 0
        generation_kwargs = {"max_new_tokens": max_tokens, "do_sample": sampling}
        if sampling:
            # Only forward the temperature when it is actually used; passing
            # it alongside do_sample=False trips generation-config validation.
            generation_kwargs["temperature"] = temperature

        output = self._model.generate(**inputs, **generation_kwargs)
        # Drop the prompt tokens so only the continuation is decoded.
        prompt_length = inputs["input_ids"].shape[-1]
        generated = output[0][prompt_length:]
        return self._tokenizer.decode(generated, skip_special_tokens=True).strip()

    def chat(
        self,
        messages: list[dict[str, str]],
        *,
        max_tokens: int = 512,
        temperature: float = 0.7,
    ) -> str:
        """Return a reply for *messages*.

        The tokenizer's chat template is used when one is configured.
        Otherwise the history is rendered as ``"role: content"`` lines
        followed by a trailing ``"assistant:"`` line.
        """
        self._require_loaded()

        # Check for a configured template rather than for the method: the
        # method exists on tokenizers even when no template is set, in which
        # case calling it fails instead of reaching the fallback.
        if getattr(self._tokenizer, "chat_template", None):
            prompt = self._tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True,
            )
        else:
            lines = [f"{message['role']}: {message['content']}" for message in messages]
            lines.append("assistant:")
            prompt = "\n".join(lines)

        return self.generate(prompt, max_tokens=max_tokens, temperature=temperature)
86
+
87
+
88
+ # Satisfy static type checkers that expect InferenceBackend.
89
+ _: InferenceBackend = TransformersBackend()