Instructions to use xlr8harder/talkie-1930-13b-base-tf with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use xlr8harder/talkie-1930-13b-base-tf with Transformers:

# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="xlr8harder/talkie-1930-13b-base-tf", trust_remote_code=True)

# Load model directly
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("xlr8harder/talkie-1930-13b-base-tf", trust_remote_code=True, dtype="auto")

Notebooks
Google Colab
Kaggle
Local Apps Settings

vLLM

How to use xlr8harder/talkie-1930-13b-base-tf with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "xlr8harder/talkie-1930-13b-base-tf"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "xlr8harder/talkie-1930-13b-base-tf",
		"prompt": "Once upon a time,",
		"max_tokens": 512,
		"temperature": 0.5
	}'

Use Docker

docker model run hf.co/xlr8harder/talkie-1930-13b-base-tf

SGLang

How to use xlr8harder/talkie-1930-13b-base-tf with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "xlr8harder/talkie-1930-13b-base-tf" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "xlr8harder/talkie-1930-13b-base-tf",
		"prompt": "Once upon a time,",
		"max_tokens": 512,
		"temperature": 0.5
	}'

Use Docker images

docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "xlr8harder/talkie-1930-13b-base-tf" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "xlr8harder/talkie-1930-13b-base-tf",
		"prompt": "Once upon a time,",
		"max_tokens": 512,
		"temperature": 0.5
	}'

Docker Model Runner
How to use xlr8harder/talkie-1930-13b-base-tf with Docker Model Runner:
```
docker model run hf.co/xlr8harder/talkie-1930-13b-base-tf
```

xlr8harder committed on May 14

Commit

294862a

verified ·

1 Parent(s): 15d5bc6

Fix vLLM CUDA graph capture in forward path

Browse files

Files changed (2) hide show

configuration_talkie.py +54 -0
modeling_talkie.py +221 -42

configuration_talkie.py CHANGED Viewed

@@ -1,5 +1,7 @@
 from __future__ import annotations
 from transformers import PretrainedConfig
@@ -15,6 +17,8 @@ class TalkieConfig(PretrainedConfig):
         head_dim: int = 128,
         max_position_embeddings: int = 2048,
         rope_base: int = 1_000_000,
         logit_scale: float = 1.0,
         use_cache: bool = True,
         tie_word_embeddings: bool = False,
@@ -23,6 +27,11 @@ class TalkieConfig(PretrainedConfig):
         pad_token_id: int | None = None,
         **kwargs,
     ):
         super().__init__(
             bos_token_id=bos_token_id,
             eos_token_id=eos_token_id,
@@ -37,6 +46,8 @@ class TalkieConfig(PretrainedConfig):
         self.head_dim = head_dim
         self.max_position_embeddings = max_position_embeddings
         self.rope_base = rope_base
         self.logit_scale = logit_scale
         self.use_cache = use_cache
@@ -44,3 +55,46 @@ class TalkieConfig(PretrainedConfig):
         self.hidden_size = n_embd
         self.num_hidden_layers = n_layer
         self.num_attention_heads = n_head

 from __future__ import annotations
+from collections.abc import Mapping
 from transformers import PretrainedConfig
         head_dim: int = 128,
         max_position_embeddings: int = 2048,
         rope_base: int = 1_000_000,
+        rope_scaling: dict | None = None,
+        rope_parameters: dict | None = None,
         logit_scale: float = 1.0,
         use_cache: bool = True,
         tie_word_embeddings: bool = False,
         pad_token_id: int | None = None,
         **kwargs,
     ):
+        if rope_scaling is None:
+            rope_scaling = rope_parameters
+        self.max_position_embeddings = max_position_embeddings
+        self.rope_scaling = self._normalize_rope_scaling(rope_scaling)
+        self.rope_parameters = self.rope_scaling
         super().__init__(
             bos_token_id=bos_token_id,
             eos_token_id=eos_token_id,
         self.head_dim = head_dim
         self.max_position_embeddings = max_position_embeddings
         self.rope_base = rope_base
+        self.rope_scaling = self._normalize_rope_scaling(rope_scaling)
+        self.rope_parameters = self.rope_scaling
         self.logit_scale = logit_scale
         self.use_cache = use_cache
         self.hidden_size = n_embd
         self.num_hidden_layers = n_layer
         self.num_attention_heads = n_head
+    @staticmethod
+    def _normalize_rope_scaling(rope_scaling: dict | None) -> dict | None:  # Validate a rope_scaling mapping; return a normalized copy, or None when no scaling applies.
+        if rope_scaling is None:
+            return None
+        if not isinstance(rope_scaling, Mapping):
+            raise TypeError("rope_scaling must be a dictionary")
+        scaling = dict(rope_scaling)  # work on a copy so the caller's mapping is never mutated
+        rope_type = scaling.get("rope_type", scaling.get("type"))  # "rope_type" is read first; "type" is the fallback key
+        if rope_type is None:
+            raise ValueError("rope_scaling must include 'rope_type' or 'type'")
+        rope_type = str(rope_type).lower()  # comparison below is case-insensitive
+        if rope_type == "ntk":  # "ntk" is accepted as an alias for "dynamic"
+            rope_type = "dynamic"
+        supported = {"default", "linear", "dynamic", "yarn"}
+        if rope_type not in supported:
+            raise ValueError(
+                f"unsupported rope_scaling type {rope_type!r}; expected one of {sorted(supported)}"
+            )
+        if rope_type == "default":  # "default" is reported the same way as no scaling at all
+            return None
+        factor = float(scaling.get("factor", 1.0))  # a missing factor defaults to 1.0
+        if factor < 1.0:
+            raise ValueError("rope_scaling factor must be >= 1.0")
+        scaling["rope_type"] = rope_type
+        scaling.pop("type", None)  # the result carries only the "rope_type" key
+        scaling["factor"] = factor
+        if "original_max_position_embeddings" in scaling:  # optional keys are coerced only when present
+            scaling["original_max_position_embeddings"] = int(
+                scaling["original_max_position_embeddings"]
+            )
+        if "beta_fast" in scaling:
+            scaling["beta_fast"] = float(scaling["beta_fast"])
+        if "beta_slow" in scaling:
+            scaling["beta_slow"] = float(scaling["beta_slow"])
+        if "attention_factor" in scaling and scaling["attention_factor"] is not None:  # an explicit None is left untouched
+            scaling["attention_factor"] = float(scaling["attention_factor"])
+        return scaling

modeling_talkie.py CHANGED Viewed

@@ -1,5 +1,7 @@
 from __future__ import annotations
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -185,7 +187,7 @@ class Block(nn.Module):
 class TalkiePreTrainedModel(PreTrainedModel):
     config_class = TalkieConfig
     base_model_prefix = ""
-    supports_gradient_checkpointing = False
     _supports_sdpa = True
     _supports_attention_backend = True
     _no_split_modules = ["Block"]
@@ -200,28 +202,153 @@ class TalkieModel(TalkiePreTrainedModel, GenerationMixin):
         super().__init__(config)
         self.embed = nn.Embedding(config.vocab_size, config.n_embd)
         self.blocks = nn.ModuleList([Block(config, i) for i in range(config.n_layer)])
-        cos, sin = self._precompute_rotary_embeddings(
-            config.max_position_embeddings, config.head_dim, config.rope_base
-        )
         self.register_buffer("cos", cos, persistent=False)
         self.register_buffer("sin", sin, persistent=False)
         self._rotary_initialized = cos.device.type != "meta"
         self.post_init()
     def _precompute_rotary_embeddings(
-        self, seq_len: int, head_dim: int, base: int
     ) -> tuple[torch.Tensor, torch.Tensor]:
         device = self.embed.weight.device if hasattr(self, "embed") else "cpu"
-        channel_range = torch.arange(0, head_dim, 2, dtype=torch.float32, device=device)
-        inv_freq = 1.0 / (base ** (channel_range / head_dim))
         t = torch.arange(seq_len, dtype=torch.float32, device=device)
         freqs = torch.outer(t, inv_freq)
         cos, sin = freqs.cos(), freqs.sin()
         cos, sin = cos.bfloat16(), sin.bfloat16()
         cos, sin = cos[None, :, None, :], sin[None, :, None, :]
         return cos, sin
     def _ensure_rotary_embeddings(self, seq_len: int) -> None:
         device = self.embed.weight.device
         needs_init = (
@@ -232,13 +359,14 @@ class TalkieModel(TalkiePreTrainedModel, GenerationMixin):
         )
         if needs_init:
             max_seq_len = max(seq_len, self.config.max_position_embeddings)
-            cos, sin = self._precompute_rotary_embeddings(
-                max_seq_len, self.config.head_dim, self.config.rope_base
-            )
             self.cos = cos.to(device=device)
             self.sin = sin.to(device=device)
             self._rotary_initialized = True
     def get_input_embeddings(self) -> nn.Embedding:
         return self.embed
@@ -265,7 +393,7 @@ class TalkieModel(TalkiePreTrainedModel, GenerationMixin):
             return cache_position.to(device=input_ids.device, dtype=torch.long)
         past_seen = past_key_values.get_seq_length() if past_key_values is not None else 0
         position_ids = torch.arange(seq_len, device=input_ids.device, dtype=torch.long) + past_seen
-        return position_ids.unsqueeze(0)
     def _attention_mask(
         self,
@@ -279,10 +407,13 @@ class TalkieModel(TalkiePreTrainedModel, GenerationMixin):
             return attention_mask
         batch_size, query_length = input_ids.shape
         past_seen = past_key_values.get_seq_length() if past_key_values is not None else 0
-        key_length = past_seen + query_length
         if attention_mask is not None and attention_mask.dim() != 2:
             return attention_mask
         if attention_mask is not None:
             if attention_mask.shape[-1] == query_length and past_seen:
                 prefix = torch.ones(
@@ -293,25 +424,17 @@ class TalkieModel(TalkiePreTrainedModel, GenerationMixin):
                 )
                 attention_mask = torch.cat([prefix, attention_mask], dim=-1)
             key_length = attention_mask.shape[-1]
-            has_padding = not bool(torch.all(attention_mask == 1))
-        else:
-            has_padding = False
-        if attention_mask is None and past_seen == 0:
-            return None
         key_positions = torch.arange(key_length, device=input_ids.device, dtype=torch.long)
         future_mask = key_positions.view(1, 1, 1, key_length) > position_ids.view(
             batch_size, 1, query_length, 1
         )
-        if attention_mask is not None and has_padding:
             padding_mask = attention_mask[:, None, None, :].to(device=input_ids.device) == 0
             mask = future_mask | padding_mask
         else:
             mask = future_mask
-        if not bool(mask.any()):
-            return None
         min_value = torch.finfo(dtype).min
         causal_mask = torch.zeros(
             batch_size, 1, query_length, key_length, dtype=dtype, device=input_ids.device
@@ -341,17 +464,15 @@ class TalkieModel(TalkiePreTrainedModel, GenerationMixin):
                 device=inputs_embeds.device,
             )
         use_cache = use_cache if use_cache is not None else self.config.use_cache
         if use_cache and past_key_values is None:
             past_key_values = DynamicCache(config=self.config)
         position_ids = self._position_ids(input_ids, position_ids, cache_position, past_key_values)
-        needed_seq_len = int(position_ids.max().item()) + 1
-        self._ensure_rotary_embeddings(needed_seq_len)
-        if needed_seq_len > self.cos.shape[1]:
-            raise ValueError(
-                f"Sequence length {needed_seq_len} exceeds max_position_embeddings "
-                f"{self.cos.shape[1]}"
-            )
         cos = self.cos[0, position_ids, :, :]
         sin = self.sin[0, position_ids, :, :]
@@ -361,14 +482,34 @@ class TalkieModel(TalkiePreTrainedModel, GenerationMixin):
         attention_mask = self._attention_mask(attention_mask, input_ids, position_ids, past_key_values, x.dtype)
         e_x = x
         for block in self.blocks:
-            x = block(
-                e_x,
-                x,
-                cos_sin,
-                attention_mask=attention_mask,
-                past_key_values=past_key_values if use_cache else None,
-                **kwargs,
-            )
         x = F.rms_norm(x, (x.shape[-1],))
         past_key_values = past_key_values if use_cache else None
         use_return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -392,6 +533,32 @@ class TalkieForCausalLM(TalkieModel):
     def set_output_embeddings(self, value: nn.Linear) -> None:
         self.lm_head = value
     def forward(
         self,
         input_ids: torch.LongTensor | None = None,
@@ -403,6 +570,8 @@ class TalkieForCausalLM(TalkieModel):
         use_cache: bool | None = None,
         position_ids: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs,
     ) -> CausalLMOutputWithPast | tuple[torch.Tensor, ...]:
         if input_ids is None and inputs_embeds is None:
@@ -420,13 +589,23 @@ class TalkieForCausalLM(TalkieModel):
             **kwargs,
         )
         hidden_states = outputs.last_hidden_state
-        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
-        logits = self.lm_head(hidden_states[:, slice_indices, :]).float()
-        if self.config.logit_scale != 1.0:
-            logits = logits * self.config.logit_scale
         loss = None
-        if labels is not None:
             shift_logits = logits[..., :-1, :].contiguous()
             shift_labels = labels[..., 1:].contiguous()
             loss = F.cross_entropy(

 from __future__ import annotations
+import math
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 class TalkiePreTrainedModel(PreTrainedModel):
     config_class = TalkieConfig
     base_model_prefix = ""
+    supports_gradient_checkpointing = True
     _supports_sdpa = True
     _supports_attention_backend = True
     _no_split_modules = ["Block"]
         super().__init__(config)
         self.embed = nn.Embedding(config.vocab_size, config.n_embd)
         self.blocks = nn.ModuleList([Block(config, i) for i in range(config.n_layer)])
+        self.gradient_checkpointing = False
+        cos, sin = self._precompute_rotary_embeddings(config.max_position_embeddings)
         self.register_buffer("cos", cos, persistent=False)
         self.register_buffer("sin", sin, persistent=False)
         self._rotary_initialized = cos.device.type != "meta"
         self.post_init()
     def _precompute_rotary_embeddings(  # Build the cos/sin rotary tables for positions 0..seq_len-1.
+        self,
+        seq_len: int,
+        head_dim: int | None = None,
+        base: int | float | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         device = self.embed.weight.device if hasattr(self, "embed") else "cpu"  # falls back to CPU when `embed` is not set yet
+        head_dim = head_dim if head_dim is not None else self.config.head_dim  # omitted arguments come from the config
+        base = base if base is not None else self.config.rope_base
+        inv_freq, attention_factor = self._rotary_inv_freq(seq_len, head_dim, float(base), device)
         t = torch.arange(seq_len, dtype=torch.float32, device=device)
         freqs = torch.outer(t, inv_freq)  # one row per position, one column per inverse frequency
         cos, sin = freqs.cos(), freqs.sin()
+        if attention_factor != 1.0:  # the scaling factor is folded into both tables
+            cos = cos * attention_factor
+            sin = sin * attention_factor
         cos, sin = cos.bfloat16(), sin.bfloat16()  # tables are computed in float32 and stored as bfloat16
         cos, sin = cos[None, :, None, :], sin[None, :, None, :]  # add singleton dims at positions 0 and 2
         return cos, sin
+    def _rotary_inv_freq(  # Pick the inverse frequencies for the configured rope_scaling type; returns (inv_freq, attention_factor).
+        self,
+        seq_len: int,
+        head_dim: int,
+        base: float,
+        device: torch.device | str,
+    ) -> tuple[torch.Tensor, float]:
+        scaling = self.config.rope_scaling
+        rope_type = scaling.get("rope_type") if scaling else None  # a None or empty config means unscaled RoPE
+        if rope_type in (None, "default"):
+            return self._default_rotary_inv_freq(head_dim, base, device), 1.0
+        if rope_type == "linear":  # linear scaling divides every frequency by the factor
+            inv_freq = self._default_rotary_inv_freq(head_dim, base, device)
+            return inv_freq / float(scaling["factor"]), 1.0
+        if rope_type == "dynamic":  # the only branch whose result depends on seq_len
+            return self._dynamic_rotary_inv_freq(seq_len, head_dim, base, device, scaling), 1.0
+        if rope_type == "yarn":  # the YaRN helper supplies its own attention factor
+            return self._yarn_rotary_inv_freq(head_dim, base, device, scaling)
+        raise ValueError(f"unsupported rope_scaling type {rope_type!r}")
+    @staticmethod
+    def _default_rotary_inv_freq(  # Unscaled inverse frequencies: 1 / base ** (i / head_dim) for i = 0, 2, 4, ... below head_dim.
+        head_dim: int, base: float, device: torch.device | str
+    ) -> torch.Tensor:
+        channel_range = torch.arange(0, head_dim, 2, dtype=torch.float32, device=device)  # even channel indices, float32
+        return 1.0 / (base ** (channel_range / head_dim))
+    def _original_max_position_embeddings(self, scaling: dict | None) -> int:  # Original context length used by the scaled RoPE variants.
+        if scaling and "original_max_position_embeddings" in scaling:  # an explicit value in the scaling dict wins
+            return int(scaling["original_max_position_embeddings"])
+        return int(self.config.max_position_embeddings)  # otherwise fall back to the configured context length
+    def _dynamic_rotary_inv_freq(  # Inverse frequencies for "dynamic" scaling: the base is rescaled from seq_len, then the default formula is applied.
+        self,
+        seq_len: int,
+        head_dim: int,
+        base: float,
+        device: torch.device | str,
+        scaling: dict,
+    ) -> torch.Tensor:
+        original_max_position_embeddings = self._original_max_position_embeddings(scaling)
+        scaled_seq_len = max(seq_len, original_max_position_embeddings)  # at or below the original length the base multiplier below is exactly 1
+        factor = float(scaling["factor"])
+        base = base * (
+            (factor * scaled_seq_len / original_max_position_embeddings) - (factor - 1.0)
+        ) ** (head_dim / (head_dim - 2.0))
+        return self._default_rotary_inv_freq(head_dim, base, device)
+    def _yarn_rotary_inv_freq(  # Inverse frequencies for "yarn" scaling; returns (inv_freq, attention_factor).
+        self,
+        head_dim: int,
+        base: float,
+        device: torch.device | str,
+        scaling: dict,
+    ) -> tuple[torch.Tensor, float]:
+        factor = float(scaling["factor"])
+        original_max_position_embeddings = self._original_max_position_embeddings(scaling)
+        beta_fast = float(scaling.get("beta_fast", 32.0))  # defaults used when the scaling dict omits the betas
+        beta_slow = float(scaling.get("beta_slow", 1.0))
+        attention_factor = scaling.get("attention_factor")
+        if attention_factor is None:  # derive the factor from the scaling factor when none is configured
+            attention_factor = 1.0 if factor <= 1.0 else 0.1 * math.log(factor) + 1.0
+        channel_range = torch.arange(0, head_dim, 2, dtype=torch.float32, device=device)
+        pos_freqs = base ** (channel_range / head_dim)
+        inv_freq_extrapolation = 1.0 / pos_freqs  # unscaled frequencies
+        inv_freq_interpolation = 1.0 / (factor * pos_freqs)  # frequencies divided by the scaling factor
+        low, high = self._yarn_correction_range(
+            beta_fast,
+            beta_slow,
+            head_dim,
+            base,
+            original_max_position_embeddings,
+            truncate=bool(scaling.get("truncate", True)),
+        )
+        ramp = self._yarn_linear_ramp(low, high, head_dim // 2, device)  # one ramp value per frequency channel
+        extrapolation_factor = 1.0 - ramp
+        inv_freq = (  # ramp == 0 keeps the unscaled value, ramp == 1 uses the interpolated one, with a linear blend in between
+            inv_freq_interpolation * (1.0 - extrapolation_factor)
+            + inv_freq_extrapolation * extrapolation_factor
+        )
+        return inv_freq, float(attention_factor)
+    @staticmethod
+    def _yarn_correction_range(  # Compute the (low, high) channel-index bounds that the YaRN ramp is built from.
+        low_rot: float,
+        high_rot: float,
+        head_dim: int,
+        base: float,
+        original_max_position_embeddings: int,
+        truncate: bool,
+    ) -> tuple[float, float]:
+        def correction_dim(num_rotations: float) -> float:  # channel index computed for the given rotation count
+            return (
+                head_dim
+                * math.log(original_max_position_embeddings / (num_rotations * 2.0 * math.pi))
+                / (2.0 * math.log(base))
+            )
+        low = correction_dim(low_rot)
+        high = correction_dim(high_rot)
+        if truncate:  # widen the range outward to whole channel indices
+            low = math.floor(low)
+            high = math.ceil(high)
+        return max(low, 0.0), min(high, float(head_dim - 1))  # clamp to [0, head_dim - 1]
+    @staticmethod
+    def _yarn_linear_ramp(  # Linear ramp over `dim` indices: 0 at or below `low`, 1 at or above `high`.
+        low: float,
+        high: float,
+        dim: int,
+        device: torch.device | str,
+    ) -> torch.Tensor:
+        if low == high:  # nudge `high` so the division below is never by zero
+            high += 0.001
+        ramp = (torch.arange(dim, dtype=torch.float32, device=device) - low) / (high - low)
+        return torch.clamp(ramp, 0.0, 1.0)
     def _ensure_rotary_embeddings(self, seq_len: int) -> None:
         device = self.embed.weight.device
         needs_init = (
         )
         if needs_init:
             max_seq_len = max(seq_len, self.config.max_position_embeddings)
+            cos, sin = self._precompute_rotary_embeddings(max_seq_len)
             self.cos = cos.to(device=device)
             self.sin = sin.to(device=device)
             self._rotary_initialized = True
+    def reset_rotary_embeddings(self) -> None:  # Mark the cached rotary tables as uninitialized.
+        self._rotary_initialized = False  # presumably makes the next _ensure_rotary_embeddings call rebuild cos/sin — its condition is not visible here, TODO confirm
     def get_input_embeddings(self) -> nn.Embedding:
         return self.embed
             return cache_position.to(device=input_ids.device, dtype=torch.long)
         past_seen = past_key_values.get_seq_length() if past_key_values is not None else 0
         position_ids = torch.arange(seq_len, device=input_ids.device, dtype=torch.long) + past_seen
+        return position_ids.unsqueeze(0).expand(batch_size, -1)
     def _attention_mask(
         self,
             return attention_mask
         batch_size, query_length = input_ids.shape
         past_seen = past_key_values.get_seq_length() if past_key_values is not None else 0
         if attention_mask is not None and attention_mask.dim() != 2:
             return attention_mask
+        if attention_mask is None and past_seen == 0:
+            return None
+        key_length = past_seen + query_length
         if attention_mask is not None:
             if attention_mask.shape[-1] == query_length and past_seen:
                 prefix = torch.ones(
                 )
                 attention_mask = torch.cat([prefix, attention_mask], dim=-1)
             key_length = attention_mask.shape[-1]
         key_positions = torch.arange(key_length, device=input_ids.device, dtype=torch.long)
         future_mask = key_positions.view(1, 1, 1, key_length) > position_ids.view(
             batch_size, 1, query_length, 1
         )
+        if attention_mask is not None:
             padding_mask = attention_mask[:, None, None, :].to(device=input_ids.device) == 0
             mask = future_mask | padding_mask
         else:
             mask = future_mask
         min_value = torch.finfo(dtype).min
         causal_mask = torch.zeros(
             batch_size, 1, query_length, key_length, dtype=dtype, device=input_ids.device
                 device=inputs_embeds.device,
             )
         use_cache = use_cache if use_cache is not None else self.config.use_cache
+        if self.gradient_checkpointing and self.training:
+            use_cache = False
         if use_cache and past_key_values is None:
             past_key_values = DynamicCache(config=self.config)
         position_ids = self._position_ids(input_ids, position_ids, cache_position, past_key_values)
+        # Keep graph capture free of CUDA tensor -> Python scalar syncs. The
+        # configured context length is the static serving/training contract.
+        self._ensure_rotary_embeddings(int(self.config.max_position_embeddings))
         cos = self.cos[0, position_ids, :, :]
         sin = self.sin[0, position_ids, :, :]
         attention_mask = self._attention_mask(attention_mask, input_ids, position_ids, past_key_values, x.dtype)
         e_x = x
         for block in self.blocks:
+            if self.gradient_checkpointing and self.training:
+                def custom_forward(
+                    e_x: torch.Tensor,
+                    x: torch.Tensor,
+                    cos: torch.Tensor,
+                    sin: torch.Tensor,
+                    attention_mask: torch.Tensor | None,
+                    block: Block = block,
+                ) -> torch.Tensor:
+                    return block(e_x, x, (cos, sin), attention_mask=attention_mask)
+                x = self._gradient_checkpointing_func(
+                    custom_forward,
+                    e_x,
+                    x,
+                    cos,
+                    sin,
+                    attention_mask,
+                )
+            else:
+                x = block(
+                    e_x,
+                    x,
+                    cos_sin,
+                    attention_mask=attention_mask,
+                    past_key_values=past_key_values if use_cache else None,
+                    **kwargs,
+                )
         x = F.rms_norm(x, (x.shape[-1],))
         past_key_values = past_key_values if use_cache else None
         use_return_dict = return_dict if return_dict is not None else self.config.use_return_dict
     def set_output_embeddings(self, value: nn.Linear) -> None:
         self.lm_head = value
+    def _chunked_lm_loss(  # Mean cross-entropy over non-ignored labels, projecting through lm_head one sequence chunk at a time.
+        self,
+        hidden_states: torch.Tensor,
+        labels: torch.Tensor,
+        chunk_size: int,
+    ) -> torch.Tensor:
+        if chunk_size <= 0:
+            raise ValueError("chunk_size must be positive")
+        total_loss = hidden_states.new_zeros((), dtype=torch.float32)  # float32 scalar accumulators on the hidden-state device
+        total_tokens = hidden_states.new_zeros((), dtype=torch.float32)
+        for start in range(0, hidden_states.shape[1], chunk_size):  # chunks run along dim 1; no label shift is applied here
+            end = min(start + chunk_size, hidden_states.shape[1])
+            logits = self.lm_head(hidden_states[:, start:end, :]).float()  # only this chunk's logits are computed per iteration
+            if self.config.logit_scale != 1.0:
+                logits = logits * self.config.logit_scale
+            chunk_labels = labels[:, start:end].contiguous()
+            total_loss = total_loss + F.cross_entropy(
+                logits.reshape(-1, logits.size(-1)),
+                chunk_labels.reshape(-1),
+                ignore_index=-100,
+                reduction="sum",  # summed per chunk; the mean is taken once at the end
+            )
+            total_tokens = total_tokens + (chunk_labels != -100).sum(dtype=torch.float32)  # count labels that are not the ignore index
+        return total_loss / total_tokens.clamp_min(1.0)  # the clamp avoids dividing by zero when every label is ignored
     def forward(
         self,
         input_ids: torch.LongTensor | None = None,
         use_cache: bool | None = None,
         position_ids: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
+        loss_chunk_size: int = 0,
+        return_logits: bool = True,
         **kwargs,
     ) -> CausalLMOutputWithPast | tuple[torch.Tensor, ...]:
         if input_ids is None and inputs_embeds is None:
             **kwargs,
         )
         hidden_states = outputs.last_hidden_state
         loss = None
+        logits = None
+        if labels is not None and loss_chunk_size > 0:
+            loss = self._chunked_lm_loss(
+                hidden_states[:, :-1, :],
+                labels[:, 1:],
+                loss_chunk_size,
+            )
+        if return_logits:
+            slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+            logits = self.lm_head(hidden_states[:, slice_indices, :]).float()
+            if self.config.logit_scale != 1.0:
+                logits = logits * self.config.logit_scale
+        if labels is not None and loss is None:
+            if logits is None:
+                raise ValueError("return_logits must be true when loss_chunk_size is not used")
             shift_logits = logits[..., :-1, :].contiguous()
             shift_labels = labels[..., 1:].contiguous()
             loss = F.cross_entropy(