# coding=utf-8
# Copyright (c) 2025
#
# Ramo configuration for Hugging Face Transformers.
#
# This version is updated to match your tokenizer + exported config.json:
#   bos_token_id=1, eos_token_id=2, pad_token_id=46945, unk_token_id=46946
#   vocab_size=46957
# and includes rope_config + moe_config defaults that align with your config.json.

from __future__ import annotations

from typing import Any, Dict, Optional

from transformers.configuration_utils import PretrainedConfig


class RamoConfig(PretrainedConfig):
    """
    Configuration class for the Ramo model.

    Every constructor argument is stored on the instance under the same name,
    except the token ids and ``tie_word_embeddings``, which are handed to
    ``PretrainedConfig.__init__`` together with any extra ``**kwargs``.

    Notes:
        - Set pad_token_id / unk_token_id explicitly to match your tokenizer.
        - rope_config and moe_config are stored as dicts for flexibility.
          When either is left as ``None`` a built-in default dict is used.
    """

    model_type = "ramo"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        # Core architecture
        vocab_size: int = 46957,
        hidden_size: int = 768,
        intermediate_size: int = 2048,
        num_hidden_layers: int = 24,
        num_attention_heads: int = 12,
        num_key_value_heads: int = 4,
        max_position_embeddings: int = 2048,
        original_max_position_embeddings: int = 512,
        # Attention
        attention_dropout: float = 0.0,
        attention_implementation: str = "auto",
        use_qk_norm: bool = True,
        # Token ids (CRITICAL)
        pad_token_id: int = 46945,
        bos_token_id: int = 1,
        eos_token_id: int = 2,
        unk_token_id: int = 46946,
        # Embeddings / tying
        tie_word_embeddings: bool = False,
        # Optional sub-configs
        rope_config: Optional[Dict[str, Any]] = None,
        moe_config: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """
        Build a Ramo configuration.

        Args:
            vocab_size, hidden_size, intermediate_size, num_hidden_layers,
            num_attention_heads, num_key_value_heads, max_position_embeddings,
            original_max_position_embeddings: Core architecture values,
                stored unchanged on the instance.
            attention_dropout, attention_implementation, use_qk_norm:
                Attention options, stored unchanged on the instance.
            pad_token_id, bos_token_id, eos_token_id, unk_token_id: Token
                ids forwarded to the base-class constructor.
            tie_word_embeddings: Forwarded to the base-class constructor.
            rope_config: Rotary-embedding settings as a dict. ``None`` selects
                the default dict defined below.
            moe_config: Mixture-of-experts settings as a dict. ``None``
                selects the default dict defined below.
            **kwargs: Passed through to ``PretrainedConfig.__init__``.
        """
        # Sub-configs: a caller-supplied dict is stored as-is (same object);
        # otherwise fall back to defaults matching the exported config.json.
        self.rope_config = (
            rope_config
            if rope_config is not None
            else {
                "rope_type": "yarn",
                "rope_theta": 1000000.0,
                "factor": 4.0,
                "partial_rotary_factor": 1.0,
                "beta_fast": 32,
                "beta_slow": 1,
                "mscale": None,
                "mscale_all_dim": None,
                "attention_factor": None,
            }
        )
        self.moe_config = (
            moe_config
            if moe_config is not None
            else {
                "intermediate_size": 1024,
                "n_dense_layer": 1,
                "num_experts_per_tok": 2,
                "n_shared_experts": 1,
                "n_routed_experts": 8,
                "routed_scaling_factor": 1.0,
                "seq_aux": True,
                "norm_topk_prob": True,
            }
        )

        # Attention options.
        self.attention_dropout = attention_dropout
        self.attention_implementation = attention_implementation
        self.use_qk_norm = use_qk_norm

        # Positional limits.
        self.max_position_embeddings = max_position_embeddings
        self.original_max_position_embeddings = original_max_position_embeddings

        # Core dimensions.
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads

        # The base class is initialised last, after all Ramo-specific
        # attributes are in place; it receives the token ids, the tying flag
        # and any extra keyword arguments.
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            unk_token_id=unk_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    def to_dict(self) -> Dict[str, Any]:
        """
        Ensure rope_config/moe_config are preserved in serialization.

        Returns:
            The base-class dictionary with ``"rope_config"`` and
            ``"moe_config"`` each set to a shallow ``dict`` copy of the
            corresponding attribute, or ``None`` when the attribute is
            ``None``.
        """
        serialized = super().to_dict()
        for key in ("rope_config", "moe_config"):
            sub_config = getattr(self, key)
            serialized[key] = None if sub_config is None else dict(sub_config)
        return serialized