# AUTO-GENERATED — do not edit manually. Run build_hub_files.py to regenerate. from __future__ import annotations import argparse import enum import json import logging import math import os import queue import random import re import tempfile import warnings from abc import ABC, abstractmethod from collections import defaultdict from collections.abc import Callable from copy import deepcopy from dataclasses import dataclass, field from functools import cached_property, partial from os import PathLike from pathlib import Path from typing import TYPE_CHECKING, Any, Literal, NotRequired, Self, TypeAlias, TypedDict, cast, overload import numpy as np import torch import torch.multiprocessing as mp import torch.nn as nn import torch.nn.functional as F import torchaudio import torchaudio.functional from torch.nn.utils.rnn import pad_sequence import soundfile as sf from tqdm.auto import trange from transformers import ( Cache, DynamicCache, GenerationMixin, LogitsProcessorList, MimiConfig, PretrainedConfig, PreTrainedModel, Qwen2TokenizerFast, Qwen3Config, Qwen3Model, Qwen3OmniMoePreTrainedModel, Qwen3OmniMoeTalkerCodePredictorModel, StaticCache, TemperatureLogitsWarper, TopKLogitsWarper, TopPLogitsWarper, WhisperFeatureExtractor, ) from transformers.activations import ACT2FN from transformers.cache_utils import Cache as _Cache, DynamicCache as _DynCache from transformers.configuration_utils import PretrainedConfig as _PConfig from transformers.modeling_layers import GradientCheckpointingLayer from transformers.modeling_outputs import BaseModelOutputWithPast from transformers.modeling_utils import PreTrainedModel as _PTModel from transformers.models.mimi import MimiConfig as _MimiCfg, MimiModel from transformers.models.mimi.modeling_mimi import ( MimiConv1d, MimiConv1dPaddingCache, MimiConvTranspose1d, MimiEncoder, MimiEncoderOutput, MimiResnetBlock, MimiTransformerModel, ) from transformers.models.qwen3_omni_moe.configuration_qwen3_omni_moe import ( 
Qwen3OmniMoeAudioEncoderConfig, Qwen3OmniMoeTalkerCodePredictorConfig, Qwen3OmniMoeTextConfig, )
from transformers.models.qwen3_omni_moe.modeling_qwen3_omni_moe import (
    Qwen3OmniMoeAudioEncoder,
    Qwen3OmniMoePreTrainedModel as _QOMMPreTrained,
    Qwen3OmniMoeTalkerCodePredictorOutputWithPast,
    Qwen3OmniMoeThinkerTextModel,
    SinusoidsPositionEmbedding,
)
from transformers.utils.generic import ModelOutput
from transformers.utils.import_utils import is_torchdynamo_compiling

from .configuration_raon import (
    EmbeddingAdaptorConfig,
    RaonConfig,
    RaonDuplexConfig,
    SpeakerEncoderConfig,
    VoxtralRealtimeEncoderConfig,
)

# ── from utils/special_tokens.py ──

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


@dataclass(frozen=True)
class SpecialToken:
    """Frozen container for a special token's id and surface text.

    An instance converts explicitly to either representation:
    ``int(token)`` returns ``id`` and ``str(token)`` returns ``text``.
    The dataclass is declared with ``frozen=True``, so fields cannot be
    reassigned after construction.
    """

    id: int  # Integer token id; this is what ``int(token)`` returns.
    text: str  # Surface string of the token; this is what ``str(token)`` returns.

    def __int__(self) -> int:
        """Return the token id."""
        return self.id

    def __str__(self) -> str:
        """Return the token's surface text."""
        return self.text


# Special tokens exposed as module-level constants, each pairing an id with its text.
# NOTE(review): the ids are hard-coded here; presumably they must agree with the
# tokenizer vocabulary shipped with the model — confirm against the tokenizer files.

# NOTE(review): PAD pairs the "<|endoftext|>" text with id 151679, the id directly
# after AUDIO_OUTPUT_END_PAD (151678) — verify this id/text pairing is intended.
PAD = SpecialToken(id=151679, text="<|endoftext|>")
IM_START = SpecialToken(id=151644, text="<|im_start|>")
IM_END = SpecialToken(id=151645, text="<|im_end|>")
AUDIO_START = SpecialToken(id=151669, text="<|audio_start|>")
AUDIO_END = SpecialToken(id=151670, text="<|audio_end|>")
SPEAKER_EMBEDDING_PLACEHOLDER = SpecialToken(id=151671, text="<|speaker_embedding_placeholder|>")
AUDIO_OUTPUT_PLACEHOLDER = SpecialToken(id=151675, text="<|audio_output_placeholder|>")
AUDIO_INPUT_PLACEHOLDER = SpecialToken(id=151676, text="<|audio_input_placeholder|>")
AUDIO_OUTPUT_PAD = SpecialToken(id=151677, text="<|audio_output_pad|>")
AUDIO_OUTPUT_END_PAD = SpecialToken(id=151678, text="<|audio_output_end_pad|>")
# Duplex SIL token (dedicated token, not repurposed FIM)
DUPLEX_SIL = SpecialToken(id=151672, text="<|audio_output_sil|>")
# Backchannel onset token (marks "uh-huh", "mm-hmm" turns instead of EPAD)
AUDIO_OUTPUT_BC = SpecialToken(id=151673, text="<|audio_output_backchannel|>")
PRETRAINING_AUDIO_TAG = "