mimelens-001-tiny-byte-s2 / configuration_mimelens.py

mimelens-001 cell: tiny/byte/s2

85ca285 verified 12 days ago

5.82 kB

	"""HuggingFace-compatible config class for MimeLens.

	Copied verbatim into each per-cell HF repo (`mjbommar/mimelens-001-*`). Lets users
	do `AutoConfig.from_pretrained("mjbommar/mimelens-001-medium-bpe-16k-s1",
	trust_remote_code=True)` after the auto_map in config.json is honored.

	This file's only non-stdlib dependency is `transformers`. It's
	intentionally short — all torch / nn imports live in modeling_mimelens.py.
	"""

	from __future__ import annotations

	from typing import Optional

	from transformers import PretrainedConfig


	class MimeLensConfig(PretrainedConfig):
	    """Configuration for a MimeLens encoder cell.

	    A MimeLens cell is one (size × vocab × seed) point of the binary-embedding
	    paper's 3x4xN cube: a BERT-style transformer encoder pretrained MLM-only on
	    33 GB of position-arbitrary binary content, with one of four input
	    pipelines (raw bytes, or BPE at 4K/16K/64K vocabulary).

	    For the architectural rationale and pretraining details see
	    docs/02-model-architecture.md and docs/04-training-protocol.md in the
	    paper repository (https://github.com/mjbommar/binary-embedding-paper).

	    Args:
	        vocab_size: Full vocabulary including 7 special tokens (start, end,
	            pad, unk, cls, sep, mask). Byte cells: 263 (256 bytes + 7
	            specials). BPE cells: 4103 / 16391 / 65543.
	        hidden_size: Transformer model dimension (256 / 384 / 512 for
	            tiny / small / medium).
	        num_hidden_layers: Layer count (4 / 8 / 12 for tiny / small /
	            medium).
	        num_attention_heads: Head count (4 / 6 / 8). Head dim is always
	            64 by design.
	        head_dim: Per-head attention dimension. Fixed at 64 in the paper.
	        ffn_multiplier_num: Numerator of the GeGLU FFN expansion ratio.
	        ffn_multiplier_den: Denominator of the GeGLU FFN expansion ratio
	            (8/3 ≈ 2.67, the LLaMA convention).
	        max_position_embeddings: RoPE position table size. Fixed at 1024
	            in the paper.
	        rope_theta: RoPE base frequency. Fixed at 10,000.
	        rms_norm_eps: RMSNorm epsilon. Fixed at 1e-6.
	        pad_token_id: Padding token index; forwarded to the base class.
	        cls_token_id: CLS token index.
	        sep_token_id: SEP token index.
	        mask_token_id: MASK token index. The four special-token indices
	            match binary_embedding.constants.
	        byte_offset: For byte cells, ord(b)+byte_offset gives the token
	            id. Fixed at 7 (after the 7 special tokens). Unused for BPE
	            cells.
	        cls_pool_dim: Output dim of the cls_pool layer. Note: this layer
	            receives no gradient under MLM-only training (see paper §3.4);
	            the mean-pool over body tokens is the trained pooling, not
	            cls_pool.
	        initializer_range: Std of trunc-normal init.
	        mimelens_cell_id: Provenance tag, e.g. "medium/bpe-16k/s1".
	        mimelens_vocab_pipeline: One of "byte", "bpe-4k", "bpe-16k",
	            "bpe-64k". Drives the tokenization in modeling_mimelens.
	        mimelens_tokenizer_hub_id: For BPE cells, the HF Hub id of the
	            canonical tokenizer (e.g. "mjbommar/binary-tokenizer-001-16k").
	            None for byte cells.
	        mimelens_pretraining_steps: Total gradient updates (22,888
	            standard; 47,808 for the matched-tokens-seen ablation cell).
	        mimelens_seed: Pretraining RNG seed (1, 2, or 3).
	        **kwargs: Remaining options, passed through to PretrainedConfig.
	    """

	    model_type = "mimelens"

	    def __init__(
	        self,
	        vocab_size: int = 16391,
	        hidden_size: int = 512,
	        num_hidden_layers: int = 12,
	        num_attention_heads: int = 8,
	        head_dim: int = 64,
	        ffn_multiplier_num: int = 8,
	        ffn_multiplier_den: int = 3,
	        max_position_embeddings: int = 1024,
	        rope_theta: float = 10_000.0,
	        rms_norm_eps: float = 1e-6,
	        pad_token_id: int = 2,
	        cls_token_id: int = 4,
	        sep_token_id: int = 5,
	        mask_token_id: int = 6,
	        byte_offset: int = 7,
	        cls_pool_dim: int = 256,
	        initializer_range: float = 0.02,
	        mimelens_cell_id: str = "medium/bpe-16k/s1",
	        mimelens_vocab_pipeline: str = "bpe-16k",
	        mimelens_tokenizer_hub_id: Optional[str] = "mjbommar/binary-tokenizer-001-16k",
	        mimelens_pretraining_steps: int = 22_888,
	        mimelens_seed: int = 1,
	        **kwargs,
	    ):
	        # Architecture hyper-parameters.
	        self.vocab_size = vocab_size
	        self.hidden_size = hidden_size
	        self.num_hidden_layers = num_hidden_layers
	        self.num_attention_heads = num_attention_heads
	        self.head_dim = head_dim
	        self.ffn_multiplier_num = ffn_multiplier_num
	        self.ffn_multiplier_den = ffn_multiplier_den
	        self.max_position_embeddings = max_position_embeddings
	        self.rope_theta = rope_theta
	        self.rms_norm_eps = rms_norm_eps

	        # Special-token / tokenization settings (sep_token_id is set below).
	        self.cls_token_id = cls_token_id
	        self.mask_token_id = mask_token_id
	        self.byte_offset = byte_offset

	        self.cls_pool_dim = cls_pool_dim
	        self.initializer_range = initializer_range

	        # MimeLens-specific provenance / tokenizer metadata.
	        self.mimelens_cell_id = mimelens_cell_id
	        self.mimelens_vocab_pipeline = mimelens_vocab_pipeline
	        self.mimelens_tokenizer_hub_id = mimelens_tokenizer_hub_id
	        self.mimelens_pretraining_steps = mimelens_pretraining_steps
	        self.mimelens_seed = mimelens_seed

	        super().__init__(pad_token_id=pad_token_id, **kwargs)

	        # NOTE: the PretrainedConfig initializer in transformers 4.x assigns
	        # self.sep_token_id from its own arguments. Because this signature
	        # captures sep_token_id as a named parameter, it never reaches
	        # **kwargs, and the base class would reset the attribute to None.
	        # Assigning it after super().__init__() keeps the caller's value.
	        self.sep_token_id = sep_token_id

	    @property
	    def head_size(self) -> int:
	        """For HF compatibility — read-only alias for ``head_dim``."""
	        return self.head_dim

	    @property
	    def intermediate_size(self) -> int:
	        """GeGLU FFN width.

	        Computed as ``hidden_size * ffn_multiplier_num // ffn_multiplier_den``;
	        the floor division truncates any fractional part.
	        """
	        return self.hidden_size * self.ffn_multiplier_num // self.ffn_multiplier_den