Upload lambda-160m pretrained model

Browse files

Files changed (14) hide show

config.json +29 -0
configuration_myllm.py +49 -0
generation_config.json +9 -0
kv_cache.py +4 -0
model.pth +3 -0
model.safetensors +3 -0
model_config.json +1 -0
modeling_myllm.py +154 -0
position_encoding.py +34 -0
self_attention.py +123 -0
special_tokens_map.json +17 -0
tokenizer.json +0 -0
tokenizer_config.json +20 -0
transformer.py +329 -0

config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "architectures": [
+    "MyLLMForCausalLM"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_myllm.MyLLMConfig",
+    "AutoModelForCausalLM": "modeling_myllm.MyLLMForCausalLM",
+    "AutoModel": "modeling_myllm.MyLLMForCausalLM"
+  },
+  "bos_token_id": 2,
+  "d_ff": 3072,
+  "d_model": 768,
+  "dtype": "float32",
+  "eos_token_id": 3,
+  "hidden_size": 768,
+  "intermediate_size": 3072,
+  "learning_rate": 0.0002,
+  "max_len": 1024,
+  "max_position_embeddings": 1024,
+  "model_type": "myllm",
+  "num_attention_heads": 12,
+  "num_heads": 12,
+  "num_hidden_layers": 16,
+  "num_layers": 16,
+  "pad_token_id": 0,
+  "tie_word_embeddings": true,
+  "transformers_version": "5.8.0",
+  "vocab_size": 65536
+}

configuration_myllm.py ADDED Viewed

	@@ -0,0 +1,49 @@

+from transformers import PreTrainedConfig
+class MyLLMConfig(PreTrainedConfig):
+    model_type = "myllm"
+    def __init__(
+        self,
+        vocab_size: int = 4,
+        max_len: int = 6,
+        d_model: int = 2,
+        num_layers: int = 2,
+        num_heads: int = 1,
+        d_ff: int = 8,
+        learning_rate: float = 0.1,
+        pad_token_id: int = 0,
+        bos_token_id: int = 2,
+        eos_token_id: int = 3,
+        tie_word_embeddings: bool = True,
+        **kwargs: object,
+    ) -> None:
+        # ---------------------------------------------------------
+        # Store the architecture values needed to rebuild the
+        # PyTorch decoder-only Transformer during AutoModel loading.
+        # ---------------------------------------------------------
+        self.vocab_size = vocab_size
+        self.max_len = max_len
+        self.d_model = d_model
+        self.num_layers = num_layers
+        self.num_heads = num_heads
+        self.d_ff = d_ff
+        self.learning_rate = learning_rate
+        self.tie_word_embeddings = tie_word_embeddings
+        self.hidden_size = d_model
+        self.num_hidden_layers = num_layers
+        self.num_attention_heads = num_heads
+        self.intermediate_size = d_ff
+        self.max_position_embeddings = max_len
+        # ---------------------------------------------------------
+        # Pass standard token ids to the Transformers base config so
+        # generation utilities can resolve special tokens normally.
+        # ---------------------------------------------------------
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            **kwargs,
+        )

generation_config.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 2,
+  "eos_token_id": 3,
+  "output_attentions": false,
+  "output_hidden_states": false,
+  "pad_token_id": 0,
+  "transformers_version": "5.8.0"
+}

kv_cache.py ADDED Viewed

	@@ -0,0 +1,4 @@

+import torch
+# ---------------------------------------------------------
+# Cache entry for a single decoder layer: a (keys, values)
+# pair of tensors.
+# ---------------------------------------------------------
+LayerKeyValueCache = tuple[torch.Tensor, torch.Tensor]
+# ---------------------------------------------------------
+# Cache for the whole model: one per-layer entry per decoder
+# layer.
+# ---------------------------------------------------------
+KeyValueCache = list[LayerKeyValueCache]

model.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ccf44ef8dd3ef60402ff149195e31321f994ced33109e58b09e5e596196b4e05
+size 658115811

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:11f43b55acbb2069c12d4b2bfe9fb3d4ee523ebba0b1bded93f460dea404a4d7
+size 658085248

model_config.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"max_len": 1024, "d_model": 768, "num_layers": 16, "num_heads": 12, "d_ff": 3072, "learning_rate": 0.0002, "lr_schedule": "warmup_cosine", "lr_warmup_steps": 2000, "min_learning_rate": 2e-05, "min_learning_rate_ratio": 0.1, "loss_chunk_size": 32, "pad_token_id": 0, "bos_token_id": 2, "eos_token_id": 3, "corpus_signature": "551ac72eceb57f5f", "dataset_cases": [{"name": "fineweb2-edu-ja", "genre": "web", "language": "ja", "dataset_path": "hotchpotch/fineweb-2-edu-japanese", "config_name": "default", "split": "train", "text_column": "text", "token_percentage": 30.0, "is_ramped": false, "repeat_on_end": true, "excluded_url_domains": ["wikipedia.org"]}, {"name": "cleanedwiki-jp", "genre": "wiki", "language": "ja", "dataset_path": "MK0727/CleanedWiki-jp", "config_name": "all", "split": "train", "text_column": "text", "token_percentage": 70.0, "is_ramped": true, "repeat_on_end": true, "excluded_url_domains": []}], "mix_cycle_tokens": 100000, "ramp_start_progress": 0.5, "val_split_modulo": 100, "val_split_index": 0, "validation_cache_path": "models/lambda-160m/validation-cache-551ac72eceb57f5f-bos-eos-text-hash-len1024-samples6144-split100-0.pt", "validation_sample_count": 6144, "trained_steps": 40960}

modeling_myllm.py ADDED Viewed

	@@ -0,0 +1,154 @@

+import torch
+import torch.nn as nn
+from transformers import PreTrainedModel
+from transformers.generation import GenerationMixin
+from transformers.modeling_outputs import CausalLMOutputWithPast
+from .configuration_myllm import MyLLMConfig
+from .kv_cache import KeyValueCache
+from .position_encoding import PositionEncoding
+from .self_attention import Attention
+from .transformer import DecoderOnlyTransformer
+# ---------------------------------------------------------
+# Reference nested remote-code dependencies directly so local
+# AutoModel loading copies every file needed by relative imports.
+# The tuple is never read in this file; it exists only so the
+# Attention and PositionEncoding imports are referenced here.
+# ---------------------------------------------------------
+REMOTE_CODE_DEPENDENCIES = (Attention, PositionEncoding)
+class MyLLMForCausalLM(PreTrainedModel, GenerationMixin):
+    config_class = MyLLMConfig
+    main_input_name = "input_ids"
+    _tied_weights_keys = {"transformer.fc_layer.weight": "transformer.we.weight"}
+    def __init__(self, config: MyLLMConfig) -> None:
+        super().__init__(config)
+        # ---------------------------------------------------------
+        # Reuse the existing PyTorch Transformer implementation and
+        # keep the HF wrapper responsible only for AutoModel APIs.
+        # ---------------------------------------------------------
+        self.transformer = DecoderOnlyTransformer(
+            num_tokens=config.vocab_size,
+            d_model=config.d_model,
+            max_len=config.max_len,
+            num_layers=config.num_layers,
+            num_heads=config.num_heads,
+            d_ff=config.d_ff,
+            learning_rate=config.learning_rate,
+            pad_token_id=config.pad_token_id,
+        )
+        self.post_init()
+    def get_input_embeddings(self) -> nn.Embedding:
+        # ---------------------------------------------------------
+        # Expose input embeddings through the standard Transformers
+        # interface used by resizing and generation helpers.
+        # ---------------------------------------------------------
+        return self.transformer.we
+    def set_input_embeddings(self, value: nn.Embedding) -> None:
+        # ---------------------------------------------------------
+        # Keep tied output weights aligned when callers replace the
+        # token embedding module through the Transformers interface.
+        # ---------------------------------------------------------
+        self.transformer.we = value
+        self.transformer.fc_layer.weight = value.weight
+    def get_output_embeddings(self) -> nn.Linear:
+        # ---------------------------------------------------------
+        # Expose the tied LM head through the standard Transformers
+        # interface used by causal language model utilities.
+        # ---------------------------------------------------------
+        return self.transformer.fc_layer
+    def set_output_embeddings(self, value: nn.Linear) -> None:
+        # ---------------------------------------------------------
+        # Allow Transformers utilities to replace the LM head while
+        # preserving the module expected by the existing model.
+        # ---------------------------------------------------------
+        self.transformer.fc_layer = value
+    def _supports_default_dynamic_cache(self) -> bool:
+        # ---------------------------------------------------------
+        # Use the existing list-based KV cache instead of letting
+        # Transformers allocate its DynamicCache implementation.
+        # ---------------------------------------------------------
+        return False
+    def prepare_inputs_for_generation(
+        self,
+        input_ids: torch.Tensor,
+        past_key_values: KeyValueCache | None = None,
+        **kwargs: object,
+    ) -> dict[str, torch.Tensor | KeyValueCache | bool | None]:
+        # ---------------------------------------------------------
+        # Feed only the newest token after the cache is populated so
+        # generate can reuse the existing incremental forward path.
+        # ---------------------------------------------------------
+        del kwargs
+        model_input_ids = input_ids[:, -1:] if past_key_values is not None else input_ids
+        return {
+            "input_ids": model_input_ids,
+            "past_key_values": past_key_values,
+            "use_cache": True,
+        }
+    def forward(
+        self,
+        input_ids: torch.Tensor | None = None,
+        labels: torch.Tensor | None = None,
+        past_key_values: KeyValueCache | None = None,
+        use_cache: bool | None = None,
+        return_dict: bool | None = None,
+        **kwargs: object,
+    ) -> CausalLMOutputWithPast | tuple[torch.Tensor, ...]:
+        # ---------------------------------------------------------
+        # Accept the standard AutoModelForCausalLM argument names and
+        # delegate the actual tensor computation to the PyTorch model.
+        # ---------------------------------------------------------
+        del kwargs
+        if input_ids is None:
+            raise ValueError("input_ids is required")
+        should_use_cache = bool(use_cache)
+        if past_key_values is not None or should_use_cache:
+            logits, next_key_values = self.transformer.forward_with_cache(
+                token_ids=input_ids,
+                past_key_values=past_key_values,
+            )
+        else:
+            logits = self.transformer(token_ids=input_ids)
+            next_key_values = None
+        # ---------------------------------------------------------
+        # Follow causal LM convention for labels supplied by HF
+        # Trainer and examples: predict token n+1 from position n.
+        # ---------------------------------------------------------
+        loss = None
+        if labels is not None:
+            shift_logits = logits[:, :-1, :].contiguous()
+            shift_labels = labels[:, 1:].contiguous()
+            loss = nn.functional.cross_entropy(
+                shift_logits.view(-1, self.config.vocab_size),
+                shift_labels.view(-1),
+                ignore_index=self.config.pad_token_id,
+            )
+        # ---------------------------------------------------------
+        # Return either the standard modeling output or a tuple for
+        # callers that explicitly disable dictionary-style outputs.
+        # ---------------------------------------------------------
+        if return_dict is False:
+            output = (logits,)
+            return (loss, *output) if loss is not None else output
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=next_key_values,
+        )

position_encoding.py ADDED Viewed

	@@ -0,0 +1,34 @@

+import torch
+import torch.nn as nn
+class PositionEncoding(nn.Module):
+    def __init__(self, d_model: int = 2, max_len: int = 6) -> None:
+        super().__init__()
+        # ---------------------------------------------------------
+        # Precompute sinusoidal positions once so token embeddings
+        # can be shifted cheaply during training and inference.
+        # ---------------------------------------------------------
+        pe = torch.zeros(max_len, d_model)
+        position = torch.arange(start=0, end=max_len, step=1).float().unsqueeze(1)
+        embedding_index = torch.arange(start=0, end=d_model, step=2).float()
+        div_term = 1 / torch.tensor(10000.0) ** (embedding_index / d_model)
+        pe[:, 0::2] = torch.sin(position * div_term)
+        pe[:, 1::2] = torch.cos(position * div_term)
+        self.register_buffer("pe", pe)
+    def forward(self, word_embeddings: torch.Tensor, position_offset: int = 0) -> torch.Tensor:
+        # ---------------------------------------------------------
+        # Add positions for the visible slice, starting at the cache
+        # length when incremental inference supplies an offset.
+        # ---------------------------------------------------------
+        seq_len = word_embeddings.size(1)
+        position_end = position_offset + seq_len
+        return word_embeddings + self.pe[position_offset:position_end, :].unsqueeze(0)
+# ---------------------------------------------------------
+# Running this file directly only constructs the module with
+# its default sizes; the instance is not used afterwards.
+# ---------------------------------------------------------
+if __name__ == "__main__":
+    n = PositionEncoding()

self_attention.py ADDED Viewed

	@@ -0,0 +1,123 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from src.pretraining.kv_cache import LayerKeyValueCache
+class Attention(nn.Module):
+    def __init__(self, d_model: int = 2, num_heads: int = 1) -> None:
+        super().__init__()
+        # ---------------------------------------------------------
+        # Split the model dimension into multiple heads so the same
+        # attention module can be reused in a more general structure.
+        # ---------------------------------------------------------
+        if d_model % num_heads != 0:
+            raise ValueError("d_model must be divisible by num_heads")
+        self.d_model = d_model
+        self.num_heads = num_heads
+        self.head_dim = d_model // num_heads
+        # ---------------------------------------------------------
+        # Project inputs into query, key, and value spaces and merge
+        # the heads back into the model dimension after attention.
+        # ---------------------------------------------------------
+        self.W_q = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
+        self.W_k = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
+        self.W_v = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
+        self.W_o = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
+    def _split_heads(self, x: torch.Tensor) -> torch.Tensor:
+        # ---------------------------------------------------------
+        # Rearrange the last dimension into head count and head size
+        # so attention can be computed independently per head.
+        # ---------------------------------------------------------
+        batch_size, seq_len, _ = x.size()
+        reshaped = x.view(batch_size, seq_len, self.num_heads, self.head_dim)
+        return reshaped.transpose(1, 2)
+    def _merge_heads(self, x: torch.Tensor) -> torch.Tensor:
+        # ---------------------------------------------------------
+        # Restore the tensor to the original model dimension after
+        # per-head attention has been combined.
+        # ---------------------------------------------------------
+        batch_size, _, seq_len, _ = x.size()
+        transposed = x.transpose(1, 2).contiguous()
+        return transposed.view(batch_size, seq_len, self.d_model)
+    def forward(
+        self,
+        encoding_for_q: torch.Tensor,
+        encoding_for_k: torch.Tensor,
+        encoding_for_v: torch.Tensor,
+        is_causal: bool = False,
+    ) -> torch.Tensor:
+        # ---------------------------------------------------------
+        # Create the projected queries, keys, and values for each
+        # attention head from the incoming hidden states.
+        # ---------------------------------------------------------
+        q = self._split_heads(self.W_q(encoding_for_q))
+        k = self._split_heads(self.W_k(encoding_for_k))
+        v = self._split_heads(self.W_v(encoding_for_v))
+        # ---------------------------------------------------------
+        # Use PyTorch's fused scaled dot-product attention so large
+        # score and softmax tensors do not need to be materialized.
+        # ---------------------------------------------------------
+        attention_scores = F.scaled_dot_product_attention(
+            q,
+            k,
+            v,
+            is_causal=is_causal,
+        )
+        # ---------------------------------------------------------
+        # Merge the attended heads and project the result back into
+        # the model dimension for the next layer.
+        # ---------------------------------------------------------
+        merged_scores = self._merge_heads(attention_scores)
+        return self.W_o(merged_scores)
+    def forward_with_cache(
+        self,
+        encoding_for_q: torch.Tensor,
+        encoding_for_k: torch.Tensor,
+        encoding_for_v: torch.Tensor,
+        past_key_value: LayerKeyValueCache | None,
+        is_causal: bool = False,
+    ) -> tuple[torch.Tensor, LayerKeyValueCache]:
+        # ---------------------------------------------------------
+        # Project the current tokens and append previous keys and
+        # values so generation can avoid recomputing old states.
+        # ---------------------------------------------------------
+        q = self._split_heads(self.W_q(encoding_for_q))
+        current_k = self._split_heads(self.W_k(encoding_for_k))
+        current_v = self._split_heads(self.W_v(encoding_for_v))
+        k = current_k
+        v = current_v
+        if past_key_value is not None:
+            past_k, past_v = past_key_value
+            k = torch.cat((past_k, current_k), dim=2)
+            v = torch.cat((past_v, current_v), dim=2)
+        # ---------------------------------------------------------
+        # Attend the current query positions over cached and current
+        # keys with the fused scaled dot-product implementation.
+        # ---------------------------------------------------------
+        attention_scores = F.scaled_dot_product_attention(
+            q,
+            k,
+            v,
+            is_causal=is_causal,
+        )
+        # ---------------------------------------------------------
+        # Return both the attention result and the updated cache for
+        # this layer so the caller can feed the next token directly.
+        # ---------------------------------------------------------
+        merged_scores = self._merge_heads(attention_scores)
+        return self.W_o(merged_scores), (k, v)

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,17 @@

+{
+  "pad_token": "|<pad>|",
+  "unk_token": "|<unknown>|",
+  "bos_token": "|<bos>|",
+  "eos_token": "|<eos>|",
+  "sep_token": "|<sep>|",
+  "cls_token": "|<cls>|",
+  "mask_token": "|<mask>|",
+  "extra_special_tokens": [
+    "|<system>|",
+    "|<user>|",
+    "|<assistant>|",
+    "|<thinking>|",
+    "|<end_of_thinking>|",
+    "|<end_of_turn>|"
+  ]
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,20 @@

+{
+  "backend": "tokenizers",
+  "bos_token": "|<bos>|",
+  "cls_token": "|<cls>|",
+  "eos_token": "|<eos>|",
+  "extra_special_tokens": [
+    "|<system>|",
+    "|<user>|",
+    "|<assistant>|",
+    "|<thinking>|",
+    "|<end_of_thinking>|",
+    "|<end_of_turn>|"
+  ],
+  "mask_token": "|<mask>|",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "|<pad>|",
+  "sep_token": "|<sep>|",
+  "tokenizer_class": "TokenizersBackend",
+  "unk_token": "|<unknown>|"
+}

transformer.py ADDED Viewed

	@@ -0,0 +1,329 @@

+import math
+import torch
+import torch.nn as nn
+from torch.optim import AdamW
+from torch.optim.lr_scheduler import LambdaLR
+import lightning as L
+from .kv_cache import KeyValueCache, LayerKeyValueCache
+from .position_encoding import PositionEncoding
+from .self_attention import Attention
+class FeedForward(nn.Module):
+    def __init__(self, d_model: int, d_ff: int) -> None:
+        super().__init__()
+        # ---------------------------------------------------------
+        # Use the standard Transformer feed-forward sublayer so each
+        # token can be transformed independently after attention.
+        # ---------------------------------------------------------
+        self.linear_1 = nn.Linear(in_features=d_model, out_features=d_ff)
+        self.activation = nn.GELU()
+        self.linear_2 = nn.Linear(in_features=d_ff, out_features=d_model)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # ---------------------------------------------------------
+        # Expand the channel dimension, apply a non-linearity, and
+        # project back to the model dimension.
+        # ---------------------------------------------------------
+        hidden = self.linear_1(x)
+        activated = self.activation(hidden)
+        return self.linear_2(activated)
+class DecoderBlock(nn.Module):
+    def __init__(self, d_model: int, num_heads: int, d_ff: int) -> None:
+        super().__init__()
+        # ---------------------------------------------------------
+        # Compose one decoder block from attention, feed-forward, and
+        # RMS normalization layers with residual connections.
+        # ---------------------------------------------------------
+        self.norm_1 = nn.RMSNorm(normalized_shape=d_model)
+        self.attention = Attention(d_model=d_model, num_heads=num_heads)
+        self.norm_2 = nn.RMSNorm(normalized_shape=d_model)
+        self.feed_forward = FeedForward(d_model=d_model, d_ff=d_ff)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # ---------------------------------------------------------
+        # Apply pre-norm self-attention so multiple decoder blocks can
+        # be stacked without changing the external interface.
+        # ---------------------------------------------------------
+        attention_input = self.norm_1(x)
+        attention_output = self.attention(
+            attention_input,
+            attention_input,
+            attention_input,
+            is_causal=True,
+        )
+        attention_residual = x + attention_output
+        # ---------------------------------------------------------
+        # Apply the position-wise feed-forward network as the second
+        # sublayer inside the decoder block.
+        # ---------------------------------------------------------
+        feed_forward_input = self.norm_2(attention_residual)
+        feed_forward_output = self.feed_forward(feed_forward_input)
+        return attention_residual + feed_forward_output
+    def forward_with_cache(
+        self,
+        x: torch.Tensor,
+        past_key_value: LayerKeyValueCache | None,
+    ) -> tuple[torch.Tensor, LayerKeyValueCache]:
+        # ---------------------------------------------------------
+        # Apply self-attention with a layer-local cache, then keep the
+        # feed-forward path identical to the full sequence forward.
+        # ---------------------------------------------------------
+        attention_input = self.norm_1(x)
+        attention_output, key_value_cache = self.attention.forward_with_cache(
+            attention_input,
+            attention_input,
+            attention_input,
+            past_key_value,
+            is_causal=past_key_value is None,
+        )
+        attention_residual = x + attention_output
+        # ---------------------------------------------------------
+        # Transform only the visible token states because old states
+        # have already been folded into the cached keys and values.
+        # ---------------------------------------------------------
+        feed_forward_input = self.norm_2(attention_residual)
+        feed_forward_output = self.feed_forward(feed_forward_input)
+        return attention_residual + feed_forward_output, key_value_cache
+class DecoderOnlyTransformer(L.LightningModule):
+    def __init__(
+        self,
+        num_tokens: int = 4,
+        d_model: int = 2,
+        max_len: int = 6,
+        num_layers: int = 2,
+        num_heads: int = 1,
+        d_ff: int = 8,
+        learning_rate: float = 0.1,
+        pad_token_id: int = 0,
+        use_fused_optimizer: bool = False,
+        loss_chunk_size: int = 32,
+        lr_warmup_steps: int | None = None,
+        lr_total_steps: int | None = None,
+        min_learning_rate: float | None = None,
+    ) -> None:
+        super().__init__()
+        # ---------------------------------------------------------
+        # Embed tokens and positions before passing them through a
+        # stack of decoder blocks.
+        # ---------------------------------------------------------
+        self.we = nn.Embedding(num_embeddings=num_tokens, embedding_dim=d_model)
+        self.pe = PositionEncoding(d_model=d_model, max_len=max_len)
+        self.blocks = nn.ModuleList(
+            [DecoderBlock(d_model=d_model, num_heads=num_heads, d_ff=d_ff) for _ in range(num_layers)]
+        )
+        self.final_norm = nn.RMSNorm(normalized_shape=d_model)
+        self.fc_layer = nn.Linear(in_features=d_model, out_features=num_tokens)
+        # ---------------------------------------------------------
+        # Share token embedding weights with the output projection
+        # so small models spend more parameters inside the blocks.
+        # ---------------------------------------------------------
+        self.fc_layer.weight = self.we.weight
+        self.learning_rate = learning_rate
+        self.pad_token_id = pad_token_id
+        self.use_fused_optimizer = use_fused_optimizer
+        self.loss_chunk_size = loss_chunk_size
+        self.lr_warmup_steps = lr_warmup_steps
+        self.lr_total_steps = lr_total_steps
+        self.min_learning_rate = min_learning_rate
+        # ---------------------------------------------------------
+        # Reject partially configured schedules so posttraining can
+        # keep fixed LR while pretraining opts into full scheduling.
+        # ---------------------------------------------------------
+        lr_schedule_values = [lr_warmup_steps, lr_total_steps, min_learning_rate]
+        if any(value is None for value in lr_schedule_values) and any(
+            value is not None for value in lr_schedule_values
+        ):
+            raise ValueError("LR schedule requires warmup steps, total steps, and minimum learning rate")
+        # ---------------------------------------------------------
+        # Keep summed token loss local so large vocabulary logits
+        # can be reduced chunk by chunk during training.
+        # ---------------------------------------------------------
+        self.loss = nn.CrossEntropyLoss(ignore_index=pad_token_id, reduction="sum")
+    def forward_hidden(self, token_ids: torch.Tensor) -> torch.Tensor:
+        # ---------------------------------------------------------
+        # Convert token ids into hidden states and apply positional
+        # information before the decoder stack.
+        # ---------------------------------------------------------
+        word_embeddings = self.we(token_ids)
+        hidden_states = self.pe(word_embeddings)
+        # ---------------------------------------------------------
+        # Reuse the same decoder block interface for every layer to
+        # make the model depth configurable.
+        # ---------------------------------------------------------
+        for block in self.blocks:
+            hidden_states = block(hidden_states)
+        # ---------------------------------------------------------
+        # Normalize the final hidden states and map them into token
+        # logits for next-token prediction.
+        # ---------------------------------------------------------
+        return self.final_norm(hidden_states)
+    def forward(self, token_ids: torch.Tensor) -> torch.Tensor:
+        # ---------------------------------------------------------
+        # Keep the public forward path returning full vocabulary
+        # logits for inference and compatibility with callers.
+        # ---------------------------------------------------------
+        hidden_states = self.forward_hidden(token_ids)
+        return self.fc_layer(hidden_states)
+    def forward_with_cache(
+        self,
+        token_ids: torch.Tensor,
+        past_key_values: KeyValueCache | None,
+    ) -> tuple[torch.Tensor, KeyValueCache]:
+        # ---------------------------------------------------------
+        # Offset positions by the cached sequence length so one-token
+        # inference matches full-sequence absolute positions.
+        # ---------------------------------------------------------
+        position_offset = 0
+        if past_key_values is not None:
+            position_offset = past_key_values[0][0].size(dim=2)
+        word_embeddings = self.we(token_ids)
+        hidden_states = self.pe(word_embeddings, position_offset=position_offset)
+        next_key_values: KeyValueCache = []
+        # ---------------------------------------------------------
+        # Pass each layer its own cache entry and collect the updated
+        # entries in the same order for the next generation step.
+        # ---------------------------------------------------------
+        for layer_index, block in enumerate(self.blocks):
+            past_key_value = None if past_key_values is None else past_key_values[layer_index]
+            hidden_states, key_value_cache = block.forward_with_cache(
+                hidden_states,
+                past_key_value,
+            )
+            next_key_values.append(key_value_cache)
+        # ---------------------------------------------------------
+        # Produce logits only for the currently supplied token slice
+        # while returning cache tensors that include all past tokens.
+        # ---------------------------------------------------------
+        normalized_hidden_states = self.final_norm(hidden_states)
+        return self.fc_layer(normalized_hidden_states), next_key_values
+    def configure_optimizers(self) -> AdamW | dict[str, object]:
+        # ---------------------------------------------------------
+        # Use AdamW for decoupled weight decay and enable the fused
+        # CUDA implementation only when the training script requests it.
+        # ---------------------------------------------------------
+        optimizer = AdamW(
+            self.parameters(),
+            lr=self.learning_rate,
+            fused=self.use_fused_optimizer,
+        )
+        # ---------------------------------------------------------
+        # Keep callers without scheduler settings on fixed learning
+        # rate while pretraining uses step-wise warmup and cosine decay.
+        # ---------------------------------------------------------
+        if self.lr_warmup_steps is None or self.lr_total_steps is None or self.min_learning_rate is None:
+            return optimizer
+        scheduler = LambdaLR(
+            optimizer=optimizer,
+            lr_lambda=lambda step: resolve_warmup_cosine_learning_rate(
+                step=step,
+                max_learning_rate=self.learning_rate,
+                min_learning_rate=self.min_learning_rate,
+                warmup_steps=self.lr_warmup_steps,
+                total_steps=self.lr_total_steps,
+            )
+            / self.learning_rate,
+        )
+        return {
+            "optimizer": optimizer,
+            "lr_scheduler": {
+                "scheduler": scheduler,
+                "interval": "step",
+                "frequency": 1,
+            },
+        }
+    def compute_chunked_loss(self, input_tokens: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
+        # ---------------------------------------------------------
+        # Run the Transformer stack once, then split only the large
+        # vocabulary projection and cross-entropy over token positions.
+        # ---------------------------------------------------------
+        hidden_states = self.forward_hidden(input_tokens)
+        seq_len = hidden_states.size(dim=1)
+        chunk_starts = range(0, seq_len, self.loss_chunk_size)
+        # ---------------------------------------------------------
+        # Accumulate summed token losses so padding can be ignored
+        # with the same weighting as a single full cross-entropy call.
+        # ---------------------------------------------------------
+        loss_chunks = [
+            self.loss(
+                self.fc_layer(
+                    hidden_states[:, chunk_start : chunk_start + self.loss_chunk_size, :]
+                ).transpose(1, 2),
+                labels[:, chunk_start : chunk_start + self.loss_chunk_size],
+            )
+            for chunk_start in chunk_starts
+        ]
+        total_loss = torch.stack(loss_chunks).sum()
+        valid_token_count = labels.ne(self.pad_token_id).sum()
+        return total_loss / valid_token_count
+    def training_step(self, batch: tuple[torch.Tensor, torch.Tensor], batch_idx: int) -> torch.Tensor:
+        # ---------------------------------------------------------
+        # Run the forward pass and compute token-level cross-entropy
+        # against the shifted labels.
+        # ---------------------------------------------------------
+        del batch_idx
+        input_tokens, labels = batch
+        loss = self.compute_chunked_loss(input_tokens=input_tokens, labels=labels)
+        self.log("train_loss", loss, prog_bar=True, on_step=True, on_epoch=False)
+        return loss
+    def validation_step(self, batch: tuple[torch.Tensor, torch.Tensor], batch_idx: int) -> torch.Tensor:
+        # ---------------------------------------------------------
+        # Reuse the same autoregressive loss during validation so
+        # checkpoints can monitor held-out next-token accuracy.
+        # ---------------------------------------------------------
+        del batch_idx
+        input_tokens, labels = batch
+        loss = self.compute_chunked_loss(input_tokens=input_tokens, labels=labels)
+        self.log("val_loss", loss, prog_bar=True, on_step=False, on_epoch=True)
+        return loss
+def resolve_warmup_cosine_learning_rate(
+    step: int,
+    max_learning_rate: float,
+    min_learning_rate: float,
+    warmup_steps: int,
+    total_steps: int,
+) -> float:
+    # ---------------------------------------------------------
+    # Raise the learning rate linearly at the start, then decay it
+    # smoothly to the configured minimum by the final training step.
+    # ---------------------------------------------------------
+    if step < warmup_steps:
+        return max_learning_rate * step / warmup_steps
+    decay_progress = min(1.0, (step - warmup_steps) / (total_steps - warmup_steps))
+    cosine_scale = 0.5 * (1.0 + math.cos(math.pi * decay_progress))
+    return min_learning_rate + (max_learning_rate - min_learning_rate) * cosine_scale