"""HuggingFace-compatible config class for MimeLens.

Copied verbatim into each per-cell HF repo (`mjbommar/mimelens-001-*`). Lets users
do `AutoConfig.from_pretrained("mjbommar/mimelens-001-medium-bpe-16k-s1",
trust_remote_code=True)` after the auto_map in config.json is honored.

This file has zero non-stdlib dependencies beyond `transformers`. It's
intentionally short — all torch / nn imports live in modeling_mimelens.py.
"""

from __future__ import annotations

from typing import Optional

from transformers import PretrainedConfig


class MimeLensConfig(PretrainedConfig):
    """Configuration for a MimeLens encoder cell.

    A MimeLens cell is one (size × vocab × seed) point of the binary-embedding
    paper's 3x4xN cube: a BERT-style transformer encoder pretrained MLM-only on
    33 GB of position-arbitrary binary content, with one of four input
    pipelines (raw bytes, or BPE at 4K/16K/64K vocabulary).

    For the architectural rationale and pretraining details see
    docs/02-model-architecture.md and docs/04-training-protocol.md in the
    paper repository (https://github.com/mjbommar/binary-embedding-paper).

    Args:
        vocab_size: int — full vocabulary including 7 special tokens (start, end,
            pad, unk, cls, sep, mask). byte cells: 263 (256 bytes + 7 specials).
            BPE cells: 4103 / 16391 / 65543.
        hidden_size: int — transformer model dimension (256 / 384 / 512 for
            tiny / small / medium).
        num_hidden_layers: int — layer count (4 / 8 / 12 for tiny / small /
            medium).
        num_attention_heads: int — head count (4 / 6 / 8). Head dim is always
            64 by design.
        head_dim: int — per-head attention dimension. Fixed at 64 in the paper.
        ffn_multiplier_num / ffn_multiplier_den: int — GeGLU FFN expansion as
            a rational (8/3 ≈ 2.67, the LLaMA convention).
        max_position_embeddings: int — RoPE position table size. Fixed at 1024
            in the paper.
        rope_theta: float — RoPE base frequency. Fixed at 10,000.
        rms_norm_eps: float — RMSNorm epsilon. Fixed at 1e-6.
        pad_token_id / cls_token_id / sep_token_id / mask_token_id: int —
            special-token indices, matching binary_embedding.constants.
            pad_token_id and sep_token_id are forwarded to
            ``PretrainedConfig.__init__``; cls_token_id and mask_token_id are
            stored directly on the instance.
        byte_offset: int — for byte cells, ord(b)+byte_offset gives the token
            id. Fixed at 7 (after the 7 special tokens). Unused for BPE cells.
        cls_pool_dim: int — output dim of the cls_pool layer. Note: this layer
            receives no gradient under MLM-only training (see paper §3.4); the
            mean-pool over body tokens is the trained pooling, not cls_pool.
        initializer_range: float — std of trunc-normal init.

        # MimeLens-specific provenance / tokenizer metadata:
        mimelens_cell_id: str — e.g. "medium/bpe-16k/s1".
        mimelens_vocab_pipeline: str — one of "byte", "bpe-4k", "bpe-16k",
            "bpe-64k". Drives the tokenization in modeling_mimelens.
        mimelens_tokenizer_hub_id: Optional[str] — for BPE cells, the HF Hub
            id of the canonical tokenizer (e.g.
            "mjbommar/binary-tokenizer-001-16k"). None for byte cells.
        mimelens_pretraining_steps: int — total gradient updates (22,888
            standard; 47,808 for the matched-tokens-seen ablation cell).
        mimelens_seed: int — pretraining RNG seed (1, 2, or 3).
        **kwargs: forwarded unchanged to ``PretrainedConfig.__init__``.

    Derived, read-only properties (computed on access, not stored as instance
    attributes):
        head_size: alias for ``head_dim``.
        intermediate_size: ``hidden_size * ffn_multiplier_num //
            ffn_multiplier_den``.
    """

    model_type = "mimelens"

    def __init__(
        self,
        vocab_size: int = 16391,
        hidden_size: int = 512,
        num_hidden_layers: int = 12,
        num_attention_heads: int = 8,
        head_dim: int = 64,
        ffn_multiplier_num: int = 8,
        ffn_multiplier_den: int = 3,
        max_position_embeddings: int = 1024,
        rope_theta: float = 10_000.0,
        rms_norm_eps: float = 1e-6,
        pad_token_id: int = 2,
        cls_token_id: int = 4,
        sep_token_id: int = 5,
        mask_token_id: int = 6,
        byte_offset: int = 7,
        cls_pool_dim: int = 256,
        initializer_range: float = 0.02,
        mimelens_cell_id: str = "medium/bpe-16k/s1",
        mimelens_vocab_pipeline: str = "bpe-16k",
        mimelens_tokenizer_hub_id: Optional[str] = "mjbommar/binary-tokenizer-001-16k",
        mimelens_pretraining_steps: int = 22_888,
        mimelens_seed: int = 1,
        **kwargs,
    ) -> None:
        # Architecture hyper-parameters.
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.head_dim = head_dim
        self.ffn_multiplier_num = ffn_multiplier_num
        self.ffn_multiplier_den = ffn_multiplier_den
        self.max_position_embeddings = max_position_embeddings
        self.rope_theta = rope_theta
        self.rms_norm_eps = rms_norm_eps

        # Special tokens that the base class does not manage itself.
        self.cls_token_id = cls_token_id
        self.mask_token_id = mask_token_id

        self.byte_offset = byte_offset
        self.cls_pool_dim = cls_pool_dim
        self.initializer_range = initializer_range

        # Provenance / tokenizer metadata.
        self.mimelens_cell_id = mimelens_cell_id
        self.mimelens_vocab_pipeline = mimelens_vocab_pipeline
        self.mimelens_tokenizer_hub_id = mimelens_tokenizer_hub_id
        self.mimelens_pretraining_steps = mimelens_pretraining_steps
        self.mimelens_seed = mimelens_seed

        # sep_token_id must travel through super().__init__ together with
        # pad_token_id: in transformers 4.x the base initializer assigns
        # self.sep_token_id from its own kwargs (defaulting to None), so a
        # value set on self beforehand and not forwarded is overwritten.
        super().__init__(
            pad_token_id=pad_token_id,
            sep_token_id=sep_token_id,
            **kwargs,
        )

    @property
    def head_size(self) -> int:
        """For HF compatibility — alias for head_dim."""
        return self.head_dim

    @property
    def intermediate_size(self) -> int:
        """GeGLU expansion: hidden * (ffn_multiplier_num / ffn_multiplier_den).

        Uses integer floor division, so the result is always an int.
        """
        return self.hidden_size * self.ffn_multiplier_num // self.ffn_multiplier_den