# Inference backend settings (dotenv-style KEY=VALUE).
# Keep one assignment per line and comments on their own lines: text that
# follows a value on the same line may be read as part of that value.

# Active inference backend.
INFERENCE_BACKEND=llama_cpp

# GGUF model downloaded from the Hub: repository and the file within it.
MODEL_REPO=Qwen/Qwen2.5-3B-Instruct-GGUF
MODEL_FILE=qwen2.5-3b-instruct-q4_k_m.gguf

# NOTE(review): presumably the context window size in tokens (llama.cpp
# n_ctx) -- confirm against the code that reads this file.
N_CTX=4096

# NOTE(review): presumably the number of model layers offloaded to the GPU
# (llama.cpp n_gpu_layers), with 0 meaning CPU-only -- confirm.
N_GPU_LAYERS=0

# Optional: local GGUF path instead of Hub download
# MODEL_PATH=./models/qwen2.5-3b-instruct-q4_k_m.gguf

# Optional: transformers backend (requires inference[transformers] extra)
# INFERENCE_BACKEND=transformers
# MODEL_ID=Qwen/Qwen2.5-3B-Instruct