mimelens-001-tiny-byte-s2 / configuration_mimelens.py
mjbommar's picture
mimelens-001 cell: tiny/byte/s2
85ca285 verified
"""HuggingFace-compatible config class for MimeLens.
Copied verbatim into each per-cell HF repo (`mjbommar/mimelens-001-*`). Lets users
do `AutoConfig.from_pretrained("mjbommar/mimelens-001-medium-bpe-16k-s1",
trust_remote_code=True)` after the auto_map in config.json is honored.
This file has zero non-stdlib dependencies beyond `transformers`. It's
intentionally short — all torch / nn imports live in modeling_mimelens.py.
"""
from __future__ import annotations
from typing import Optional
from transformers import PretrainedConfig
class MimeLensConfig(PretrainedConfig):
    """Configuration for a MimeLens encoder cell.

    A MimeLens cell is one (size × vocab × seed) point of the binary-embedding
    paper's 3x4xN cube: a BERT-style transformer encoder pretrained MLM-only on
    33 GB of position-arbitrary binary content, with one of four input
    pipelines (raw bytes, or BPE at 4K/16K/64K vocabulary).

    For the architectural rationale and pretraining details see
    docs/02-model-architecture.md and docs/04-training-protocol.md in the
    paper repository (https://github.com/mjbommar/binary-embedding-paper).

    Args:
        vocab_size: int — full vocabulary including 7 special tokens (start, end,
            pad, unk, cls, sep, mask). byte cells: 263 (256 bytes + 7 specials).
            BPE cells: 4103 / 16391 / 65543.
        hidden_size: int — transformer model dimension (256 / 384 / 512 for
            tiny / small / medium).
        num_hidden_layers: int — layer count (4 / 8 / 12 for tiny / small /
            medium).
        num_attention_heads: int — head count (4 / 6 / 8). Head dim is always
            64 by design.
        head_dim: int — per-head attention dimension. Fixed at 64 in the paper.
        ffn_multiplier_num / ffn_multiplier_den: int — GeGLU FFN expansion as
            a rational (8/3 ≈ 2.67, the LLaMA convention).
        max_position_embeddings: int — RoPE position table size. Fixed at 1024
            in the paper.
        rope_theta: float — RoPE base frequency. Fixed at 10,000.
        rms_norm_eps: float — RMSNorm epsilon. Fixed at 1e-6.
        pad_token_id / cls_token_id / sep_token_id / mask_token_id: int —
            special-token indices, matching binary_embedding.constants.
        byte_offset: int — for byte cells, ord(b)+byte_offset gives the token
            id. Fixed at 7 (after the 7 special tokens). Unused for BPE cells.
        cls_pool_dim: int — output dim of the cls_pool layer. Note: this layer
            receives no gradient under MLM-only training (see paper §3.4); the
            mean-pool over body tokens is the trained pooling, not cls_pool.
        initializer_range: float — std of trunc-normal init.

        # MimeLens-specific provenance / tokenizer metadata:
        mimelens_cell_id: str — e.g. "medium/bpe-16k/s1".
        mimelens_vocab_pipeline: str — one of "byte", "bpe-4k", "bpe-16k",
            "bpe-64k". Drives the tokenization in modeling_mimelens.
        mimelens_tokenizer_hub_id: Optional[str] — for BPE cells, the HF Hub
            id of the canonical tokenizer (e.g.
            "mjbommar/binary-tokenizer-001-16k"). None for byte cells.
        mimelens_pretraining_steps: int — total gradient updates (22,888
            standard; 47,808 for the matched-tokens-seen ablation cell).
        mimelens_seed: int — pretraining RNG seed (1, 2, or 3).
        **kwargs: forwarded unchanged to ``PretrainedConfig.__init__``.
    """

    model_type = "mimelens"

    def __init__(
        self,
        vocab_size: int = 16391,
        hidden_size: int = 512,
        num_hidden_layers: int = 12,
        num_attention_heads: int = 8,
        head_dim: int = 64,
        ffn_multiplier_num: int = 8,
        ffn_multiplier_den: int = 3,
        max_position_embeddings: int = 1024,
        rope_theta: float = 10_000.0,
        rms_norm_eps: float = 1e-6,
        pad_token_id: int = 2,
        cls_token_id: int = 4,
        sep_token_id: int = 5,
        mask_token_id: int = 6,
        byte_offset: int = 7,
        cls_pool_dim: int = 256,
        initializer_range: float = 0.02,
        mimelens_cell_id: str = "medium/bpe-16k/s1",
        mimelens_vocab_pipeline: str = "bpe-16k",
        mimelens_tokenizer_hub_id: Optional[str] = "mjbommar/binary-tokenizer-001-16k",
        mimelens_pretraining_steps: int = 22_888,
        mimelens_seed: int = 1,
        **kwargs,
    ) -> None:
        # Architecture hyper-parameters.
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.head_dim = head_dim
        self.ffn_multiplier_num = ffn_multiplier_num
        self.ffn_multiplier_den = ffn_multiplier_den
        self.max_position_embeddings = max_position_embeddings
        self.rope_theta = rope_theta
        self.rms_norm_eps = rms_norm_eps

        # Special tokens that this class owns outright. pad_token_id and
        # sep_token_id are handed to the base constructor below instead.
        self.cls_token_id = cls_token_id
        self.mask_token_id = mask_token_id

        self.byte_offset = byte_offset
        self.cls_pool_dim = cls_pool_dim
        self.initializer_range = initializer_range

        # MimeLens-specific provenance / tokenizer metadata.
        self.mimelens_cell_id = mimelens_cell_id
        self.mimelens_vocab_pipeline = mimelens_vocab_pipeline
        self.mimelens_tokenizer_hub_id = mimelens_tokenizer_hub_id
        self.mimelens_pretraining_steps = mimelens_pretraining_steps
        self.mimelens_seed = mimelens_seed

        # sep_token_id must travel through super().__init__ just like
        # pad_token_id: the base constructor assigns its own
        # `self.sep_token_id = kwargs.pop("sep_token_id", None)`, so a value
        # set on `self` beforehand and not forwarded would be reset to None.
        super().__init__(
            pad_token_id=pad_token_id,
            sep_token_id=sep_token_id,
            **kwargs,
        )

    @property
    def head_size(self) -> int:
        """For HF compatibility — alias for head_dim."""
        return self.head_dim

    @property
    def intermediate_size(self) -> int:
        """GeGLU expansion: hidden * (ffn_multiplier_num / ffn_multiplier_den).

        Computed with integer floor division, so the result is rounded down.
        """
        return self.hidden_size * self.ffn_multiplier_num // self.ffn_multiplier_den