"""HuggingFace-compatible config class for MimeLens.

Copied verbatim into each per-cell HF repo (`mjbommar/mimelens-001-*`). Lets
users do
`AutoConfig.from_pretrained("mjbommar/mimelens-001-medium-bpe-16k-s1", trust_remote_code=True)`
after the auto_map in config.json is honored.

This file has zero non-stdlib dependencies beyond `transformers`. It's
intentionally short — all torch / nn imports live in modeling_mimelens.py.
"""

from __future__ import annotations

from typing import Optional

from transformers import PretrainedConfig


class MimeLensConfig(PretrainedConfig):
    """Configuration for a MimeLens encoder cell.

    A MimeLens cell is one (size × vocab × seed) point of the
    binary-embedding paper's 3x4xN cube: a BERT-style transformer encoder
    pretrained MLM-only on 33 GB of position-arbitrary binary content, with
    one of four input pipelines (raw bytes, or BPE at 4K/16K/64K vocabulary).

    For the architectural rationale and pretraining details see
    docs/02-model-architecture.md and docs/04-training-protocol.md in the
    paper repository (https://github.com/mjbommar/binary-embedding-paper).

    Args:
        vocab_size: int — full vocabulary including 7 special tokens
            (start, end, pad, unk, cls, sep, mask). byte cells: 263
            (256 bytes + 7 specials). BPE cells: 4103 / 16391 / 65543.
        hidden_size: int — transformer model dimension (256 / 384 / 512 for
            tiny / small / medium).
        num_hidden_layers: int — layer count (4 / 8 / 12 for tiny / small /
            medium).
        num_attention_heads: int — head count (4 / 6 / 8). Head dim is
            always 64 by design.
        head_dim: int — per-head attention dimension. Fixed at 64 in the
            paper.
        ffn_multiplier_num / ffn_multiplier_den: int — GeGLU FFN expansion
            as a rational (8/3 ≈ 2.67, the LLaMA convention).
        max_position_embeddings: int — RoPE position table size. Fixed at
            1024 in the paper.
        rope_theta: float — RoPE base frequency. Fixed at 10,000.
        rms_norm_eps: float — RMSNorm epsilon. Fixed at 1e-6.
        pad_token_id / cls_token_id / sep_token_id / mask_token_id: int —
            special-token indices, matching binary_embedding.constants.
        byte_offset: int — for byte cells, ord(b)+byte_offset gives the
            token id. Fixed at 7 (after the 7 special tokens). Unused for
            BPE cells.
        cls_pool_dim: int — output dim of the cls_pool layer. Note: this
            layer receives no gradient under MLM-only training (see paper
            §3.4); the mean-pool over body tokens is the trained pooling,
            not cls_pool.
        initializer_range: float — std of trunc-normal init.

        # MimeLens-specific provenance / tokenizer metadata:
        mimelens_cell_id: str — e.g. "medium/bpe-16k/s1".
        mimelens_vocab_pipeline: str — one of "byte", "bpe-4k", "bpe-16k",
            "bpe-64k". Drives the tokenization in modeling_mimelens.
        mimelens_tokenizer_hub_id: Optional[str] — for BPE cells, the HF Hub
            id of the canonical tokenizer (e.g.
            "mjbommar/binary-tokenizer-001-16k"). None for byte cells.
        mimelens_pretraining_steps: int — total gradient updates (22,888
            standard; 47,808 for the matched-tokens-seen ablation cell).
        mimelens_seed: int — pretraining RNG seed (1, 2, or 3).
        **kwargs: forwarded unchanged to ``PretrainedConfig.__init__``.
    """

    model_type = "mimelens"

    def __init__(
        self,
        vocab_size: int = 16391,
        hidden_size: int = 512,
        num_hidden_layers: int = 12,
        num_attention_heads: int = 8,
        head_dim: int = 64,
        ffn_multiplier_num: int = 8,
        ffn_multiplier_den: int = 3,
        max_position_embeddings: int = 1024,
        rope_theta: float = 10_000.0,
        rms_norm_eps: float = 1e-6,
        pad_token_id: int = 2,
        cls_token_id: int = 4,
        sep_token_id: int = 5,
        mask_token_id: int = 6,
        byte_offset: int = 7,
        cls_pool_dim: int = 256,
        initializer_range: float = 0.02,
        mimelens_cell_id: str = "medium/bpe-16k/s1",
        mimelens_vocab_pipeline: str = "bpe-16k",
        mimelens_tokenizer_hub_id: Optional[str] = "mjbommar/binary-tokenizer-001-16k",
        mimelens_pretraining_steps: int = 22_888,
        mimelens_seed: int = 1,
        **kwargs,
    ):
        # Architecture hyper-parameters.
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.head_dim = head_dim
        self.ffn_multiplier_num = ffn_multiplier_num
        self.ffn_multiplier_den = ffn_multiplier_den
        self.max_position_embeddings = max_position_embeddings
        self.rope_theta = rope_theta
        self.rms_norm_eps = rms_norm_eps

        # Special-token ids that this class stores itself. pad_token_id is
        # stored by the base constructor (see the super() call below).
        self.cls_token_id = cls_token_id
        self.sep_token_id = sep_token_id
        self.mask_token_id = mask_token_id
        self.byte_offset = byte_offset

        self.cls_pool_dim = cls_pool_dim
        self.initializer_range = initializer_range

        # Provenance / tokenizer metadata.
        self.mimelens_cell_id = mimelens_cell_id
        self.mimelens_vocab_pipeline = mimelens_vocab_pipeline
        self.mimelens_tokenizer_hub_id = mimelens_tokenizer_hub_id
        self.mimelens_pretraining_steps = mimelens_pretraining_steps
        self.mimelens_seed = mimelens_seed

        # sep_token_id must be forwarded as well as pad_token_id: transformers
        # releases whose base constructor manages sep_token_id assign it from
        # their own kwargs (defaulting to None), which would otherwise replace
        # the value stored above. Releases that do not manage it set unknown
        # kwargs as plain attributes, so forwarding is harmless there.
        super().__init__(
            pad_token_id=pad_token_id,
            sep_token_id=sep_token_id,
            **kwargs,
        )

    @property
    def head_size(self) -> int:
        """For HF compatibility — alias for head_dim."""
        return self.head_dim

    @property
    def intermediate_size(self) -> int:
        """GeGLU expansion: hidden * (ffn_multiplier_num / ffn_multiplier_den).

        Computed with integer floor division, so the result is rounded down
        when the product is not an exact multiple of ``ffn_multiplier_den``
        (e.g. 512 * 8 // 3 == 1365).
        """
        return self.hidden_size * self.ffn_multiplier_num // self.ffn_multiplier_den