"""Configuration class for the ``htr-convtext`` model type."""

from __future__ import annotations

from transformers.configuration_utils import PretrainedConfig


class HTRConfig(PretrainedConfig):
    """Hyper-parameter container registered under ``model_type = "htr-convtext"``.

    Every constructor argument is stored verbatim as an instance attribute of
    the same name. The single exception is ``patch_size``, which is copied
    into a fresh ``list`` before being stored. Any additional keyword
    arguments are forwarded untouched to ``PretrainedConfig.__init__`` after
    all attributes have been assigned.

    No validation of the values is performed here.

    NOTE(review): this file does not show how the model consumes these
    values. The grouping comments inside ``__init__`` are a reading aid
    derived from the parameter names only; confirm the exact semantics
    (units, valid ranges, relationships between fields) against the
    modelling code before relying on them.

    Args:
        vocab_size: Defaults to ``80``.
        blank_token_id: Defaults to ``0``.
        image_height: Defaults to ``64``.
        image_max_width: Defaults to ``3072``.
        width_stride: Defaults to ``32``.
        patch_size: Any iterable of ints; stored as a ``list``. Defaults to
            ``(4, 64)``.
        embed_dim: Defaults to ``512``.
        depth: Defaults to ``8``.
        num_heads: Defaults to ``8``.
        mlp_ratio: Defaults to ``4.0``.
        conv_kernel_size: Defaults to ``7``.
        dropout: Defaults to ``0.1``.
        drop_path: Defaults to ``0.1``.
        down_after: Defaults to ``3``.
        up_after: Defaults to ``7``.
        ds_kernel: Defaults to ``3``.
        max_seq_len: Defaults to ``128``.
        upsample_mode: Defaults to ``"nearest"``.
        use_masking_default: Defaults to ``False``.
        **kwargs: Passed through to ``PretrainedConfig.__init__``.
    """

    model_type = "htr-convtext"

    def __init__(
        self,
        vocab_size: int = 80,
        blank_token_id: int = 0,
        image_height: int = 64,
        image_max_width: int = 3072,
        width_stride: int = 32,
        patch_size: list[int] | tuple[int, int] = (4, 64),
        embed_dim: int = 512,
        depth: int = 8,
        num_heads: int = 8,
        mlp_ratio: float = 4.0,
        conv_kernel_size: int = 7,
        dropout: float = 0.1,
        drop_path: float = 0.1,
        down_after: int = 3,
        up_after: int = 7,
        ds_kernel: int = 3,
        max_seq_len: int = 128,
        upsample_mode: str = "nearest",
        use_masking_default: bool = False,
        **kwargs,
    ) -> None:
        # --- vocabulary-related fields ---
        self.vocab_size = vocab_size
        self.blank_token_id = blank_token_id

        # --- image / patch geometry fields ---
        self.image_height = image_height
        self.image_max_width = image_max_width
        self.width_stride = width_stride
        # Copy into a list so a tuple argument (such as the default) is
        # never stored by reference and the attribute is always a list.
        self.patch_size = list(patch_size)

        # --- network size fields ---
        self.embed_dim = embed_dim
        self.depth = depth
        self.num_heads = num_heads
        self.mlp_ratio = mlp_ratio
        self.conv_kernel_size = conv_kernel_size

        # --- regularisation fields ---
        self.dropout = dropout
        self.drop_path = drop_path

        # --- down/up-sampling and sequence-length fields ---
        self.down_after = down_after
        self.up_after = up_after
        self.ds_kernel = ds_kernel
        self.max_seq_len = max_seq_len
        self.upsample_mode = upsample_mode

        # --- masking default ---
        self.use_masking_default = use_masking_default

        # The base-class initialiser runs last and receives only the
        # keyword arguments not consumed above.
        super().__init__(**kwargs)