# coding=utf-8
# Copyright 2026 The OdinNext authors.
# Licensed under the Apache License, Version 2.0.
"""OdinNext model configuration."""

from transformers import PretrainedConfig


class OdinNextConfig(PretrainedConfig):
    r"""Configuration class for [`OdinNextForCausalLM`].

    OdinNext is a 138M-parameter HGRN2+RoPE hybrid causal language model.
    The architecture interleaves two layer types:
      * Even layers (0, 2, 4, ..., 14): HGRN2 gated linear recurrence with
        rotary position embeddings (RoPE) on q/k.
      * Odd layers (1, 3, 5, ..., 15): the same HGRN2 recurrence WITHOUT
        positional encoding (position-free, generalizes to any length).

    HGRN2 gives O(T) training and O(1) per-token inference: the per-layer
    recurrent state has a fixed size independent of context length.

    In addition to the OdinNext-native field names below, the constructor
    publishes the conventional Hugging Face names as read-along aliases:
    `hidden_size` (= `d_model`), `num_hidden_layers` (= `n_layers`),
    `num_attention_heads` (= `n_heads`), `intermediate_size`
    (= `ffn_inner`) and `max_position_embeddings` (= `max_seq_len`).
    The aliases are always derived from the native fields at construction
    time; alias keys supplied through `**kwargs` (for example from a saved
    `config.json`) are discarded so they cannot disagree with the native
    values. Likewise `tie_word_embeddings` is always derived from
    `tie_embeddings`.

    Args:
        vocab_size (`int`, *optional*, defaults to 32768):
            Vocabulary size of the OdinNext model.
        d_model (`int`, *optional*, defaults to 768):
            Hidden size of the residual stream.
        n_layers (`int`, *optional*, defaults to 16):
            Number of transformer-style blocks.
        n_heads (`int`, *optional*, defaults to 6):
            Number of recurrence heads. Per-head expand dim is
            `d_model // n_heads = 128` for the default configuration.
        ffn_inner (`int`, *optional*, defaults to 2048):
            SwiGLU2 inner dimension.
        max_seq_len (`int`, *optional*, defaults to 2048):
            Maximum sequence length the RoPE cache covers. Generation past
            this position raises (extend by raising and re-instantiating).
        rope_theta (`float`, *optional*, defaults to 100000.0):
            RoPE base frequency. Even layers only.
        tie_embeddings (`bool`, *optional*, defaults to `True`):
            Tie input embedding matrix and output LM-head weight.
        initializer_range (`float`, *optional*, defaults to 0.02):
            Unused at inference; recorded for parity with HF conventions.
        bos_token_id (`int`, *optional*, defaults to 0):
            Same as eos for this tokenizer (`<|endoftext|>`).
        eos_token_id (`int`, *optional*, defaults to 0):
            `<|endoftext|>` token id.
        pad_token_id (`int`, *optional*, defaults to 1):
            `<|pad|>` token id in the odin-32k tokenizer.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether to return per-layer recurrent states from `forward()`,
            and whether `generate()` should consume them. The "cache" here
            is a list of fixed-size HGRN2 states, NOT a growing KV cache.
        **kwargs:
            Forwarded to `PretrainedConfig.__init__` after the derived keys
            described above have been removed.

    Example:

    ```python
    >>> from transformers import AutoConfig
    >>> config = AutoConfig.from_pretrained(
    ...     "joelhenwang/OdinNext-138M-Early-Checkpoint",
    ...     trust_remote_code=True,
    ... )
    >>> config.d_model
    768
    ```
    """

    model_type = "odinnext"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size: int = 32768,
        d_model: int = 768,
        n_layers: int = 16,
        n_heads: int = 6,
        ffn_inner: int = 2048,
        max_seq_len: int = 2048,
        rope_theta: float = 100000.0,
        tie_embeddings: bool = True,
        initializer_range: float = 0.02,
        bos_token_id: int = 0,
        eos_token_id: int = 0,
        pad_token_id: int = 1,
        use_cache: bool = True,
        **kwargs,
    ):
        # OdinNext-native hyper-parameters (the source of truth).
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.ffn_inner = ffn_inner
        self.max_seq_len = max_seq_len
        self.rope_theta = rope_theta
        self.tie_embeddings = tie_embeddings
        self.initializer_range = initializer_range
        self.use_cache = use_cache

        # Common HF aliases — many libraries (lm-eval-harness, vLLM compat
        # layers, etc.) reach for these names. Provide them as direct
        # passthroughs so external tooling has a chance of working.
        self.hidden_size = d_model
        self.num_hidden_layers = n_layers
        self.num_attention_heads = n_heads
        self.intermediate_size = ffn_inner
        self.max_position_embeddings = max_seq_len

        # Every key below is derived from a native field above. Because the
        # aliases are plain instance attributes they get written to
        # config.json, and therefore come back through **kwargs on reload.
        # The base-class constructor assigns leftover kwargs as attributes
        # AFTER the assignments above, so a stale or hand-edited alias would
        # silently override the derived value (e.g. hidden_size != d_model).
        # Dropping them here keeps the native fields authoritative.
        #
        # bos/eos/pad token ids need no such handling: they are declared
        # parameters of this method and so can never appear in **kwargs.
        derived_keys = (
            "hidden_size",
            "num_hidden_layers",
            "num_attention_heads",
            "intermediate_size",
            "max_position_embeddings",
            "tie_word_embeddings",
        )
        for derived_key in derived_keys:
            kwargs.pop(derived_key, None)

        super().__init__(
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            pad_token_id=pad_token_id,
            tie_word_embeddings=tie_embeddings,
            **kwargs,
        )