| from __future__ import annotations |
|
|
| import json |
| import os |
| import sys |
| import urllib.parse |
| import urllib.request |
| from pathlib import Path |
|
|
| from huggingface_hub import hf_hub_download |
|
|
| |
# --- Model location -------------------------------------------------------
# Hub repository / file to serve and where the artifacts are kept on disk.
MODEL_REPO = os.environ.get("MODEL_REPO", "mixtao/MixTAO-7Bx2-MoE-v8.1-GGUF")
MODEL_FILE = os.environ.get("MODEL_FILE", "mixtao-7bx2-moe-v8.1.Q4_K_M.gguf")
MODEL_DIR = Path(os.environ.get("MODEL_DIR", "/data/models/llm"))
CHAT_TEMPLATE_FILE = Path(
    os.environ.get("CHAT_TEMPLATE_FILE", "/data/models/llm/chat_template.jinja")
)

# --- llama-server binary and listen address --------------------------------
LLAMA_SERVER_BIN = os.environ.get("LLAMA_SERVER_BIN", "/opt/llama.cpp/llama-server")
LLAMA_HOST = os.environ.get("LLAMA_HOST", "0.0.0.0")
LLAMA_PORT = os.environ.get("LLAMA_PORT", "7860")

# --- Authentication --------------------------------------------------------
# Empty (the default) means no --api-key argument is passed to the server.
API_KEY = os.environ.get("API_KEY", "")

# --- Runtime tuning --------------------------------------------------------
# All values stay strings because they are forwarded verbatim as command-line
# arguments. For the optional settings, build_command() omits the flag when
# the value is one of: "", "default", "auto", "none", "off", "false".
THREADS = os.environ.get("THREADS", "4")
CTX_SIZE = os.environ.get("CTX_SIZE", "2048")
BATCH_SIZE = os.environ.get("BATCH_SIZE", "default")
UBATCH_SIZE = os.environ.get("UBATCH_SIZE", "default")
GPU_LAYERS = os.environ.get("GPU_LAYERS", "0")
FLASH_ATTN = os.environ.get("FLASH_ATTN", "false")
CACHE_TYPE_K = os.environ.get("CACHE_TYPE_K", "default")
CACHE_TYPE_V = os.environ.get("CACHE_TYPE_V", "default")

# --- Sampling defaults -----------------------------------------------------
TEMPERATURE = os.environ.get("TEMPERATURE", "0.2")
TOP_P = os.environ.get("TOP_P", "0.95")
TOP_K = os.environ.get("TOP_K", "64")
REPEAT_PENALTY = os.environ.get("REPEAT_PENALTY", "1.08")
|
|
|
|
def log(message: str) -> None:
    """Print *message* to stdout with a ``[startup]`` prefix, flushing at once."""
    line = "[startup] {}".format(message)
    print(line, flush=True)
|
|
|
|
def download_model() -> str:
    """Make sure the model file is present under ``MODEL_DIR`` and return its path.

    A non-empty regular file at ``MODEL_DIR / MODEL_FILE`` is reused as-is.
    Otherwise the file is fetched from ``MODEL_REPO`` with
    ``hf_hub_download`` into ``MODEL_DIR``.

    Returns:
        The path of the model file as a string.
    """
    MODEL_DIR.mkdir(parents=True, exist_ok=True)
    local_file = MODEL_DIR / MODEL_FILE

    # Require a non-empty regular file (same rule the chat-template cache
    # uses): a directory or a zero-byte leftover at this path is not a usable
    # model and must not short-circuit the download.
    if local_file.is_file() and local_file.stat().st_size > 0:
        log(f"Using cached model: {local_file}")
        return str(local_file)

    log(f"Downloading {MODEL_REPO}/{MODEL_FILE}")
    # NOTE(review): `local_dir_use_symlinks` appears to be deprecated in
    # recent huggingface_hub releases — confirm against the pinned version
    # before removing it.
    model_path = hf_hub_download(
        repo_id=MODEL_REPO,
        filename=MODEL_FILE,
        local_dir=str(MODEL_DIR),
        local_dir_use_symlinks=False,
    )
    log(f"Model ready: {model_path}")
    return model_path
|
|
|
|
def download_chat_template() -> str | None:
    """Return the path of a chat template file, fetching it if necessary.

    A non-empty ``CHAT_TEMPLATE_FILE`` is reused. Otherwise the model's
    metadata is requested from the Hugging Face Hub API and the
    ``gguf.chat_template`` entry, if present, is written to
    ``CHAT_TEMPLATE_FILE``.

    Returns:
        The template file path as a string, or ``None`` when the metadata
        could not be fetched or contains no chat template.
    """
    if CHAT_TEMPLATE_FILE.exists() and CHAT_TEMPLATE_FILE.stat().st_size > 0:
        log(f"Using cached chat template: {CHAT_TEMPLATE_FILE}")
        return str(CHAT_TEMPLATE_FILE)

    # Keep "/" unescaped so "owner/name" stays a two-segment path.
    encoded_repo = urllib.parse.quote(MODEL_REPO, safe="/")
    # The model-info endpoint lives under /api/models/; without that path the
    # repo id is glued onto the host name and the request can never succeed.
    api_url = f"https://huggingface.co/api/models/{encoded_repo}"
    log("Fetching chat template from model metadata")

    try:
        req = urllib.request.Request(api_url, headers={"User-Agent": "Mozilla/5.0"})
        with urllib.request.urlopen(req, timeout=30) as response:
            metadata = json.loads(response.read().decode("utf-8"))
    except Exception as exc:
        # Deliberately broad: the template is optional, so any network or
        # parsing failure falls back to returning None instead of aborting.
        log(f"Could not fetch chat template metadata: {exc}")
        return None

    # Tolerate unexpected JSON shapes (non-object payload or "gguf" value).
    gguf_info = metadata.get("gguf") if isinstance(metadata, dict) else None
    template = gguf_info.get("chat_template") if isinstance(gguf_info, dict) else None
    if not template or not isinstance(template, str):
        log("No chat template found in model metadata; llama-server will use GGUF metadata")
        return None

    CHAT_TEMPLATE_FILE.parent.mkdir(parents=True, exist_ok=True)
    CHAT_TEMPLATE_FILE.write_text(template, encoding="utf-8")
    log(f"Chat template ready: {CHAT_TEMPLATE_FILE}")
    return str(CHAT_TEMPLATE_FILE)
|
|
|
|
def build_command(model_path: str, template_path: str | None) -> list[str]:
    """Assemble the llama-server argument vector from the module settings.

    Args:
        model_path: Path of the model file, passed with ``-m``.
        template_path: Chat template file to pass with
            ``--chat-template-file``; falsy values skip the option.

    Returns:
        The command as a list of strings, with ``LLAMA_SERVER_BIN`` first.
    """
    # Values meaning "leave this option at the server's default".
    unset_markers = {"", "default", "auto", "none", "off", "false"}
    # For the on/off switch, the usual negative spellings must also count as
    # "off"; otherwise FLASH_ATTN=0 or FLASH_ATTN=no would turn the flag on.
    disabled_markers = unset_markers | {"0", "no", "disabled"}

    def is_valid(value: str) -> bool:
        return value.strip().lower() not in unset_markers

    def is_enabled(value: str) -> bool:
        return value.strip().lower() not in disabled_markers

    cmd = [
        LLAMA_SERVER_BIN,
        "-m", model_path,
        "--host", LLAMA_HOST,
        "--port", LLAMA_PORT,
        "--threads", THREADS,
        "--ctx-size", CTX_SIZE,
        "--n-gpu-layers", GPU_LAYERS,
        "--parallel", "4",
        "--cont-batching",
        "--temp", TEMPERATURE,
        "--top-p", TOP_P,
        "--top-k", TOP_K,
        "--repeat-penalty", REPEAT_PENALTY,
    ]

    api_key = API_KEY.strip()
    if api_key:
        cmd.extend(["--api-key", api_key])
        log("OpenAI API authentication enabled with custom API_KEY.")
    else:
        log("Warning: Running without API_KEY. Server is publicly accessible.")

    # Optional value-taking flags, emitted in a fixed order. The value is
    # stripped so the argument matches what was validated.
    optional_values = (
        ("--batch-size", BATCH_SIZE),
        ("--ubatch-size", UBATCH_SIZE),
        ("--cache-type-k", CACHE_TYPE_K),
        ("--cache-type-v", CACHE_TYPE_V),
    )
    for flag, value in optional_values:
        if is_valid(value):
            cmd.extend([flag, value.strip()])

    # NOTE(review): some llama.cpp versions may expect a value after
    # --flash-attn (on/off/auto) — confirm against the deployed binary.
    if is_enabled(FLASH_ATTN):
        cmd.append("--flash-attn")
    if template_path:
        cmd.extend(["--chat-template-file", template_path])

    return cmd
|
|
|
|
def _redact_api_key(cmd: list[str]) -> list[str]:
    """Return a copy of *cmd* with the value following ``--api-key`` masked."""
    redacted = list(cmd)
    for index, arg in enumerate(redacted[:-1]):
        if arg == "--api-key":
            redacted[index + 1] = "***"
    return redacted


def main() -> None:
    """Prepare the environment and model files, then exec llama-server.

    Prepends the server binary's directory to ``LD_LIBRARY_PATH``, gives the
    OMP/OpenBLAS/MKL thread-count variables a default of ``THREADS`` when
    unset, obtains the model and optional chat template, and replaces the
    current process with the server command. Does not return when the exec
    succeeds.
    """
    binary_dir = str(Path(LLAMA_SERVER_BIN).parent)
    existing_lib = os.environ.get("LD_LIBRARY_PATH")
    os.environ["LD_LIBRARY_PATH"] = (
        f"{binary_dir}:{existing_lib}" if existing_lib else binary_dir
    )

    # setdefault: never override a thread count the operator set explicitly.
    for env_var in ("OMP_NUM_THREADS", "OPENBLAS_NUM_THREADS", "MKL_NUM_THREADS"):
        os.environ.setdefault(env_var, THREADS)

    model_path = download_model()
    template_path = download_chat_template()
    cmd = build_command(model_path, template_path)

    log("Starting OpenAI-compatible llama.cpp API server")
    # Log a masked copy so the API key never ends up in the startup log; the
    # real command is what gets executed.
    log(" ".join(_redact_api_key(cmd)))
    os.execvpe(cmd[0], cmd, os.environ)
|
|
|
|
if __name__ == "__main__":
    # Script entry point: report any startup failure on stderr with a
    # "[fatal]" prefix, then re-raise so the traceback and a non-zero exit
    # status are preserved.
    try:
        main()
    except Exception as error:
        sys.stderr.write(f"[fatal] {error}\n")
        sys.stderr.flush()
        raise
|
|