Instructions to use dogtooth/open-lm-3b-202407 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use dogtooth/open-lm-3b-202407 with Transformers:

# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="dogtooth/open-lm-3b-202407", trust_remote_code=True)

# Load model directly
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("dogtooth/open-lm-3b-202407", trust_remote_code=True, dtype="auto")

Notebooks
Google Colab
Kaggle
Local Apps Settings

vLLM

How to use dogtooth/open-lm-3b-202407 with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "dogtooth/open-lm-3b-202407"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "dogtooth/open-lm-3b-202407",
		"prompt": "Once upon a time,",
		"max_tokens": 512,
		"temperature": 0.5
	}'

Use Docker

docker model run hf.co/dogtooth/open-lm-3b-202407

SGLang

How to use dogtooth/open-lm-3b-202407 with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "dogtooth/open-lm-3b-202407" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "dogtooth/open-lm-3b-202407",
		"prompt": "Once upon a time,",
		"max_tokens": 512,
		"temperature": 0.5
	}'

Use Docker images

docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "dogtooth/open-lm-3b-202407" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "dogtooth/open-lm-3b-202407",
		"prompt": "Once upon a time,",
		"max_tokens": 512,
		"temperature": 0.5
	}'

Docker Model Runner
How to use dogtooth/open-lm-3b-202407 with Docker Model Runner:
```
docker model run hf.co/dogtooth/open-lm-3b-202407
```

open-lm-3b-202407 / modeling_open_lm_hf.py

dogtooth

Upload folder using huggingface_hub

bbc6941 verified 4 months ago

raw

history blame contribute delete

4.43 kB

	"""
	Custom HuggingFace model for Open LM checkpoints.

	Open LM uses LayerNorm (not RMSNorm) and QK norm, which standard
	LlamaForCausalLM does not support. This module provides:
	- OpenLMConfig: LlamaConfig subclass with qk_norm flag
	- OpenLMForCausalLM: LlamaForCausalLM subclass with LayerNorm + QK norm

	Usage:
	model = AutoModelForCausalLM.from_pretrained(path, trust_remote_code=True)
	"""

	from typing import Callable, Optional

	import torch
	import torch.nn as nn
	from transformers import LlamaConfig, LlamaForCausalLM
	from transformers.models.llama.modeling_llama import (
	ALL_ATTENTION_FUNCTIONS,
	LlamaAttention,
	LlamaRMSNorm,
	apply_rotary_pos_emb,
	eager_attention_forward,
	)

	try:
	from typing import Unpack
	from transformers.utils.generic import TransformersKwargs
	except ImportError:
	pass

	from transformers.cache_utils import Cache


	class OpenLMConfig(LlamaConfig):
	model_type = "open_lm"

	def __init__(self, qk_norm: bool = True, **kwargs):
	super().__init__(**kwargs)
	self.qk_norm = qk_norm


	class OpenLMAttention(LlamaAttention):
	"""LlamaAttention with QK norm applied before reshape (matching Open LM)."""

	def __init__(self, config: OpenLMConfig, layer_idx: int):
	super().__init__(config, layer_idx)
	if getattr(config, "qk_norm", False):
	self.q_norm = nn.LayerNorm(config.hidden_size, eps=config.rms_norm_eps, bias=False)
	self.k_norm = nn.LayerNorm(config.hidden_size, eps=config.rms_norm_eps, bias=False)
	else:
	self.q_norm = nn.Identity()
	self.k_norm = nn.Identity()

	def forward(
	self,
	hidden_states: torch.Tensor,
	position_embeddings: tuple[torch.Tensor, torch.Tensor],
	attention_mask: Optional[torch.Tensor],
	past_key_values: Optional[Cache] = None,
	cache_position: Optional[torch.LongTensor] = None,
	**kwargs,
	) -> tuple[torch.Tensor, torch.Tensor]:
	input_shape = hidden_states.shape[:-1]
	hidden_shape = (*input_shape, -1, self.head_dim)

	# QK norm applied to flat projected vectors BEFORE reshape (matches Open LM)
	query_states = self.q_norm(self.q_proj(hidden_states)).view(hidden_shape).transpose(1, 2)
	key_states = self.k_norm(self.k_proj(hidden_states)).view(hidden_shape).transpose(1, 2)
	value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)

	cos, sin = position_embeddings
	query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

	if past_key_values is not None:
	cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
	key_states, value_states = past_key_values.update(
	key_states, value_states, self.layer_idx, cache_kwargs
	)

	attention_interface: Callable = eager_attention_forward
	if self.config._attn_implementation != "eager":
	attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

	attn_output, attn_weights = attention_interface(
	self,
	query_states,
	key_states,
	value_states,
	attention_mask,
	dropout=0.0 if not self.training else self.attention_dropout,
	scaling=self.scaling,
	**kwargs,
	)

	attn_output = attn_output.reshape(*input_shape, -1).contiguous()
	attn_output = self.o_proj(attn_output)
	return attn_output, attn_weights


	class OpenLMForCausalLM(LlamaForCausalLM):
	"""LlamaForCausalLM with LayerNorm (instead of RMSNorm) and QK norm support."""

	config_class = OpenLMConfig

	def __init__(self, config: OpenLMConfig):
	super().__init__(config)

	# Replace all LlamaRMSNorm with nn.LayerNorm(bias=False)
	eps = config.rms_norm_eps
	hidden_size = config.hidden_size

	self.model.norm = nn.LayerNorm(hidden_size, eps=eps, bias=False)
	for layer in self.model.layers:
	layer.input_layernorm = nn.LayerNorm(hidden_size, eps=eps, bias=False)
	layer.post_attention_layernorm = nn.LayerNorm(hidden_size, eps=eps, bias=False)

	# Replace attention module with QK norm version
	layer.self_attn = OpenLMAttention(config, layer.self_attn.layer_idx)

	# Re-run post_init to tie weights etc.
	self.post_init()