# coding=utf-8
# Copyright 2026 The OdinNext authors.
# Licensed under the Apache License, Version 2.0.
"""OdinNext model configuration."""

from transformers import PretrainedConfig


class OdinNextConfig(PretrainedConfig):
    r"""Configuration class for [`OdinNextForCausalLM`].

    OdinNext is a 138M-parameter HGRN2+RoPE hybrid causal language model.
    The architecture interleaves two layer types:

    * Even layers (0, 2, 4, ..., 14): HGRN2 gated linear recurrence with
      rotary position embeddings (RoPE) on q/k.
    * Odd layers (1, 3, 5, ..., 15): the same HGRN2 recurrence WITHOUT
      positional encoding (position-free, generalizes to any length).

    HGRN2 gives O(T) training and O(1) per-token inference: the per-layer
    recurrent state has a fixed size independent of context length.

    In addition to the native field names below, every instance exposes the
    common Hugging Face aliases `hidden_size`, `num_hidden_layers`,
    `num_attention_heads`, `intermediate_size` and `max_position_embeddings`.
    They are always derived from `d_model`, `n_layers`, `n_heads`,
    `ffn_inner` and `max_seq_len` respectively; alias values passed to the
    constructor (for example from a previously saved `config.json`) are
    discarded in favour of the native fields.

    Args:
        vocab_size (`int`, *optional*, defaults to 32768):
            Vocabulary size of the OdinNext model.
        d_model (`int`, *optional*, defaults to 768):
            Hidden size of the residual stream.
        n_layers (`int`, *optional*, defaults to 16):
            Number of transformer-style blocks.
        n_heads (`int`, *optional*, defaults to 6):
            Number of recurrence heads. Per-head expand dim is
            `d_model // n_heads = 128` for the default configuration.
        ffn_inner (`int`, *optional*, defaults to 2048):
            SwiGLU2 inner dimension.
        max_seq_len (`int`, *optional*, defaults to 2048):
            Maximum sequence length the RoPE cache covers. Generation past
            this position raises an error; to go further, increase
            `max_seq_len` and re-instantiate the model.
        rope_theta (`float`, *optional*, defaults to 100000.0):
            RoPE base frequency. Even layers only.
        tie_embeddings (`bool`, *optional*, defaults to `True`):
            Tie input embedding matrix and output LM-head weight. Forwarded
            to the base class as `tie_word_embeddings`.
        initializer_range (`float`, *optional*, defaults to 0.02):
            Unused at inference; recorded for parity with HF conventions.
        bos_token_id (`int`, *optional*, defaults to 0):
            Same as eos for this tokenizer (`<|endoftext|>`).
        eos_token_id (`int`, *optional*, defaults to 0):
            `<|endoftext|>` token id.
        pad_token_id (`int`, *optional*, defaults to 1):
            `<|pad|>` token id in the odin-32k tokenizer.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether to return per-layer recurrent states from `forward()`,
            and whether `generate()` should consume them. The "cache" here
            is a list of fixed-size HGRN2 states, NOT a growing KV cache.

    Example:

    ```python
    >>> from transformers import AutoConfig
    >>> config = AutoConfig.from_pretrained(
    ...     "joelhenwang/OdinNext-138M-Early-Checkpoint",
    ...     trust_remote_code=True,
    ... )
    >>> config.d_model
    768
    ```
    """

    model_type = "odinnext"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size: int = 32768,
        d_model: int = 768,
        n_layers: int = 16,
        n_heads: int = 6,
        ffn_inner: int = 2048,
        max_seq_len: int = 2048,
        rope_theta: float = 100000.0,
        tie_embeddings: bool = True,
        initializer_range: float = 0.02,
        bos_token_id: int = 0,
        eos_token_id: int = 0,
        pad_token_id: int = 1,
        use_cache: bool = True,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.ffn_inner = ffn_inner
        self.max_seq_len = max_seq_len
        self.rope_theta = rope_theta
        self.tie_embeddings = tie_embeddings
        self.initializer_range = initializer_range
        self.use_cache = use_cache

        # Common HF aliases — many libraries (lm-eval-harness, vLLM compat
        # layers, etc.) reach for these names. Provide them as direct
        # passthroughs so external tooling has a chance of working.
        hf_aliases = {
            "hidden_size": d_model,
            "num_hidden_layers": n_layers,
            "num_attention_heads": n_heads,
            "intermediate_size": ffn_inner,
            "max_position_embeddings": max_seq_len,
        }
        for alias, value in hf_aliases.items():
            # The aliases are ordinary attributes, so they are written to
            # config.json and come back through **kwargs on reload. The base
            # class assigns leftover kwargs as attributes, which would let a
            # stale alias (e.g. after `d_model` was edited by hand) override
            # the value derived here. Drop the incoming copy so the native
            # field is the single source of truth.
            kwargs.pop(alias, None)
            setattr(self, alias, value)

        # Strip keys we are about to pass explicitly so they don't double up
        # via **kwargs (config.json may carry duplicates).
        for explicit_key in (
            "tie_word_embeddings",
            "bos_token_id",
            "eos_token_id",
            "pad_token_id",
        ):
            kwargs.pop(explicit_key, None)

        super().__init__(
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            pad_token_id=pad_token_id,
            tie_word_embeddings=tie_embeddings,
            **kwargs,
        )