HearthNet-Nemotron

Running on Zero

HearthNet-Nemotron / scripts /modal_deploy.py

GitHub Actions

feat: P5/P6 model swap + modal fix + doc consolidation

495b06b 9 days ago

3.98 kB

	"""Modal deployment script for HearthNet LLM inference.

	Run once to deploy a serverless GPU endpoint on Modal:

	modal deploy scripts/modal_deploy.py

	Then set MODAL_ENDPOINT in your HF Space / local .env to the printed URL of the `hearthnet-llm-chat` endpoint.

	Qualifies for: Modal Best Use Of Modal prize ($10k credits).
	See docs: https://modal.com/docs/guide/webhooks
	"""

	from __future__ import annotations

	# ── Requirements ──────────────────────────────────────────────────────────────
	# pip install modal transformers torch accelerate fastapi

	import modal

	# ── Modal app definition ──────────────────────────────────────────────────────
	app = modal.App("hearthnet-llm")

	# Hugging Face model id loaded by the inference class and echoed in responses.
	MODEL_ID = "HuggingFaceTB/SmolLM2-1.7B-Instruct"

	# Build a container image with the required packages.
	# `hf_transfer` is installed because HF_HUB_ENABLE_HF_TRANSFER=1 is set below:
	# huggingface_hub refuses to download (raises) when the flag is enabled but the
	# package is missing, which would break the model load at container start.
	image = (
	    modal.Image.debian_slim(python_version="3.11")
	    .pip_install(
	        "transformers>=4.40",
	        "torch>=2.2",
	        "accelerate>=0.30",
	        "fastapi",
	        "uvicorn",
	        "hf_transfer",
	    )
	    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
	)


	@app.cls(
	    gpu="T4",
	    image=image,
	    scaledown_window=300,
	    timeout=300,
	)
	class HearthNetLLM:
	    """Serverless text-generation service backed by ``MODEL_ID``.

	    The pipeline is created once per container in ``load_model``; the two web
	    endpoints then serve a health probe and an OpenAI-style chat completion
	    from that already-loaded pipeline.

	    NOTE(review): newer Modal releases rename ``web_endpoint`` to
	    ``fastapi_endpoint`` -- confirm against the Modal version in use before
	    upgrading.
	    """

	    # Roles forwarded to the chat template; anything else is dropped.
	    _KNOWN_ROLES = ("system", "user", "assistant")

	    @modal.enter()
	    def load_model(self):
	        """Create the Hugging Face text-generation pipeline at container start."""
	        # Imported here rather than at module top: transformers is installed
	        # in the container image, so only the container needs to import it.
	        from transformers import pipeline

	        self.pipe = pipeline(
	            "text-generation",
	            model=MODEL_ID,
	            device_map="auto",
	            torch_dtype="auto",
	        )

	    @modal.web_endpoint(method="GET", label="hearthnet-llm")
	    def health(self) -> dict:
	        """Return a static liveness payload naming the served model."""
	        return {"status": "ok", "model": MODEL_ID}

	    @modal.web_endpoint(method="POST", label="hearthnet-llm-chat")
	    def chat_completions(self, request: dict) -> dict:
	        """OpenAI-compatible /v1/chat/completions endpoint.

	        Args:
	            request: Parsed JSON body. Reads ``messages`` (list of
	                ``{"role", "content"}`` dicts), ``max_tokens`` (default 512)
	                and ``temperature`` (default 0.7). A JSON ``null`` for either
	                numeric field falls back to its default.

	        Returns:
	            A ``chat.completion`` shaped dict with a single assistant choice
	            and token usage counted with the model's tokenizer.
	        """
	        max_tokens = request.get("max_tokens")
	        if max_tokens is None:
	            max_tokens = 512
	        temperature = request.get("temperature")
	        if temperature is None:
	            temperature = 0.7

	        # Normalise the incoming messages: default a missing role to "user"
	        # and missing content to "", and ignore roles this endpoint does not
	        # handle.
	        chat = []
	        for msg in request.get("messages") or []:
	            role = msg.get("role", "user")
	            if role in self._KNOWN_ROLES:
	                chat.append({"role": role, "content": msg.get("content", "")})

	        # Let the tokenizer render the prompt with the model's own chat
	        # template instead of hand-writing role tags, so the prompt matches
	        # the format the model was trained on.
	        tokenizer = self.pipe.tokenizer
	        prompt = tokenizer.apply_chat_template(
	            chat,
	            tokenize=False,
	            add_generation_prompt=True,
	        )

	        generation_kwargs = {
	            "max_new_tokens": max_tokens,
	            "return_full_text": False,
	        }
	        if temperature > 0:
	            generation_kwargs["do_sample"] = True
	            generation_kwargs["temperature"] = temperature
	        else:
	            # Greedy decoding: do not forward a zero temperature, which is
	            # not a valid sampling temperature.
	            generation_kwargs["do_sample"] = False

	        result = self.pipe(prompt, **generation_kwargs)
	        text = result[0]["generated_text"]

	        # Count real tokens rather than whitespace-separated words.
	        prompt_tokens = len(tokenizer.encode(prompt, add_special_tokens=False))
	        completion_tokens = len(tokenizer.encode(text, add_special_tokens=False))

	        return {
	            "id": "modal-chat-1",
	            "object": "chat.completion",
	            "model": MODEL_ID,
	            "choices": [
	                {
	                    "index": 0,
	                    "message": {"role": "assistant", "content": text},
	                    "finish_reason": "stop",
	                }
	            ],
	            "usage": {
	                "prompt_tokens": prompt_tokens,
	                "completion_tokens": completion_tokens,
	                "total_tokens": prompt_tokens + completion_tokens,
	            },
	        }


	# ── Local entrypoint for testing ──────────────────────────────────────────────
	@app.local_entrypoint()
	def main():
	    """Print setup hints for pointing HearthNet at the Modal chat endpoint."""
	    hint_lines = (
	        "Deploying HearthNet LLM to Modal...",
	        f"Model: {MODEL_ID}",
	        "After deployment, set MODAL_ENDPOINT to the printed web endpoint URL",
	        "Then add to HearthNet config.toml:",
	        "",
	        " [[llm.backends]]",
	        " name = 'modal'",
	        " endpoint = 'https://YOUR-ORG--hearthnet-llm-chat.modal.run'",
	        "",
	    )
	    # One print per entry; empty entries produce the blank separator lines.
	    for hint in hint_lines:
	        print(hint)