Spaces:

milchchan
/

Merkurius

Running on Zero

App Files Files Community

Masaaki Kawata committed on 30 days ago

Commit

e07602f

1 Parent(s): 44e5c9c

initial commit

Browse files

Files changed (34) hide show

.dockerignore +12 -0
.gitignore +167 -0
Dockerfile +47 -0
README.md +4 -3
app.py +288 -0
docker-compose.yml +27 -0
faster_qwen3_tts/__init__.py +7 -0
faster_qwen3_tts/cli.py +407 -0
faster_qwen3_tts/generate.py +215 -0
faster_qwen3_tts/model.py +1370 -0
faster_qwen3_tts/predictor_graph.py +214 -0
faster_qwen3_tts/sampling.py +66 -0
faster_qwen3_tts/streaming.py +359 -0
faster_qwen3_tts/talker_graph.py +214 -0
faster_qwen3_tts/utils.py +30 -0
main.py +144 -0
qwen_tts/__init__.py +24 -0
qwen_tts/__main__.py +24 -0
qwen_tts/core/__init__.py +19 -0
qwen_tts/core/models/__init__.py +18 -0
qwen_tts/core/models/configuration_qwen3_tts.py +502 -0
qwen_tts/core/models/modeling_qwen3_tts.py +0 -0
qwen_tts/core/models/processing_qwen3_tts.py +106 -0
qwen_tts/core/tokenizer_12hz/configuration_qwen3_tts_tokenizer_v2.py +172 -0
qwen_tts/core/tokenizer_12hz/modeling_qwen3_tts_tokenizer_v2.py +1025 -0
qwen_tts/core/tokenizer_25hz/configuration_qwen3_tts_tokenizer_v1.py +332 -0
qwen_tts/core/tokenizer_25hz/modeling_qwen3_tts_tokenizer_v1.py +1528 -0
qwen_tts/core/tokenizer_25hz/vq/assets/mel_filters.npz +3 -0
qwen_tts/core/tokenizer_25hz/vq/core_vq.py +523 -0
qwen_tts/core/tokenizer_25hz/vq/speech_vq.py +357 -0
qwen_tts/core/tokenizer_25hz/vq/whisper_encoder.py +406 -0
qwen_tts/inference/qwen3_tts_model.py +877 -0
qwen_tts/inference/qwen3_tts_tokenizer.py +411 -0
requirements.txt +15 -0

.dockerignore ADDED Viewed

	@@ -0,0 +1,12 @@

+.DS_Store
+.env
+.git
+.gitignore
+__pycache__/
+*.py[cod]
+.cache/
+.venv/
+venv/
+logs/
+data/
+models/

.gitignore ADDED Viewed

	@@ -0,0 +1,167 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+# Generated by MacOS
+.DS_Store
+#GPT_SoVITS/text/ja_userdic/

Dockerfile ADDED Viewed

	@@ -0,0 +1,47 @@

+FROM python:3.10-slim
+ENV DEBIAN_FRONTEND=noninteractive \
+    PYTHONUNBUFFERED=1 \
+    PIP_NO_CACHE_DIR=1 \
+    HF_HOME=/data/huggingface \
+    WHISPER_CACHE_DIR=/data/whisper \
+    GRADIO_SERVER_NAME=0.0.0.0 \
+    GRADIO_SERVER_PORT=7860 \
+    NVIDIA_VISIBLE_DEVICES=all \
+    NVIDIA_DRIVER_CAPABILITIES=compute,utility
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+        build-essential \
+        curl \
+        ffmpeg \
+        git \
+        libsndfile1 \
+        sox \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+WORKDIR /app
+COPY requirements.txt .
+RUN python -m pip install --upgrade pip setuptools wheel \
+    && python -m pip install \
+        --index-url https://download.pytorch.org/whl/cu128 \
+        torch==2.10.0+cu128 \
+        torchaudio==2.10.0+cu128 \
+    && sed '/^torch==/d; /^torchaudio==/d' requirements.txt > /tmp/requirements-no-torch.txt \
+    && python -m pip install -r /tmp/requirements-no-torch.txt
+COPY app.py .
+COPY faster_qwen3_tts ./faster_qwen3_tts
+COPY qwen_tts ./qwen_tts
+RUN useradd --create-home --uid 1000 appuser \
+    && mkdir -p /data/huggingface /data/whisper \
+    && chown -R appuser:appuser /app /data
+USER appuser
+EXPOSE 7860
+CMD ["python", "app.py"]

README.md CHANGED Viewed

@@ -1,13 +1,14 @@
 ---
 title: Merkurius
-emoji: 🦀
-colorFrom: yellow
-colorTo: pink
 sdk: gradio
 sdk_version: 6.14.0
 python_version: '3.12'
 app_file: app.py
 pinned: false
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 title: Merkurius
+emoji: 🌟
+colorFrom: pink
+colorTo: yellow
 sdk: gradio
 sdk_version: 6.14.0
 python_version: '3.12'
 app_file: app.py
 pinned: false
+short_description: milchchan.com
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,288 @@

+#import subprocess
+#subprocess.run('pip install flash-attn==2.7.4.post1', shell=True)
+import io
+import re
+import os
+import json
+import hashlib
+import threading
+import time
+import numpy as np
+import torch
+import spaces
+import whisper
+import gradio as gr
+from gradio.themes.base import Base
+from gradio.themes.utils import colors, fonts, sizes
+from typing import Iterable
+from dotenv import load_dotenv
+from urllib.request import urlopen, Request
+from scipy.signal import resample_poly
+#from huggingface_hub import snapshot_download
+#from qwen_tts import Qwen3TTSModel
+from faster_qwen3_tts import FasterQwen3TTS
+# Read environment variables from a local .env file (python-dotenv) before any os.environ lookups below.
+load_dotenv(verbose=False)
+#TTS_MODEL = Qwen3TTSModel.from_pretrained(snapshot_download('Qwen/Qwen3-TTS-12Hz-1.7B-Base', token=os.environ['HF_TOKEN']), device_map=torch.device('cuda' if torch.cuda.is_available() else 'cpu'), dtype=torch.bfloat16, token=os.environ['HF_TOKEN'], attn_implementation='kernels-community/flash-attn3')
+# Both models are loaded once at import time and shared by every request.
+TTS_MODEL = FasterQwen3TTS.from_pretrained('Qwen/Qwen3-TTS-12Hz-1.7B-Base')
+# Whisper starts on the CPU; _detect_reference_text_and_language moves it to CUDA when a GPU is available.
+WHISPER_MODEL = whisper.load_model('turbo', device='cpu', download_root=os.environ.get('WHISPER_CACHE_DIR'))
+# Transcription cache: audio hash -> (last-used timestamp, reference text, detected language code).
+REFERENCE_AUDIO_TRANSCRIPTION_CACHE: dict[str, tuple[float, str, str]] = {}
+# Guards every read and write of the cache above.
+REFERENCE_AUDIO_TRANSCRIPTION_CACHE_LOCK = threading.Lock()
+# Maximum number of cached transcriptions (default 100, never less than 1).
+REFERENCE_AUDIO_TRANSCRIPTION_CACHE_LIMIT = max(1, int(os.environ.get('REFERENCE_AUDIO_TRANSCRIPTION_CACHE_LIMIT', 100)))
+class Theme(Base):
+    """Gradio theme for the app: neutral hues with a cyan accent and pink loader.
+    All constructor arguments are keyword-only and are forwarded unchanged to
+    ``Base.__init__``; the colour overrides are applied afterwards.
+    """
+    def __init__(
+        self,
+        *,
+        primary_hue: colors.Color | str = colors.neutral,
+        secondary_hue: colors.Color | str = colors.neutral,
+        neutral_hue: colors.Color | str = colors.neutral,
+        spacing_size: sizes.Size | str = sizes.spacing_md,
+        radius_size: sizes.Size | str = sizes.radius_md,
+        text_size: sizes.Size | str = sizes.text_md,
+        font: fonts.Font | str | Iterable[fonts.Font | str] = (fonts.GoogleFont('Barlow'), 'ui-sans-serif', 'sans-serif'),
+        font_mono: fonts.Font | str | Iterable[fonts.Font | str] = (fonts.GoogleFont('IBM Plex Mono'), 'ui-monospace', 'monospace'),
+    ):
+        # Let the base theme set up hues, sizes and fonts first.
+        super().__init__(
+            primary_hue=primary_hue,
+            secondary_hue=secondary_hue,
+            neutral_hue=neutral_hue,
+            spacing_size=spacing_size,
+            radius_size=radius_size,
+            text_size=text_size,
+            font=font,
+            font_mono=font_mono,
+        )
+        # Override individual theme variables; light and dark variants use the same values.
+        # The cyan and pink here match the waveform colours used by the Audio components.
+        super().set(
+            color_accent='rgb(0 231 255 / 1)',
+            slider_color='rgb(0 231 255 / 1)',
+            slider_color_dark='rgb(0 231 255 / 1)',
+            button_primary_background_fill='rgb(0 231 255 / 1)',
+            button_primary_background_fill_hover='rgb(0 231 255 / .75)',
+            button_primary_text_color='#ffffff',
+            button_primary_background_fill_dark='rgb(0 231 255 / 1)',
+            button_primary_background_fill_hover_dark='rgb(0 231 255 / .75)',
+            button_primary_text_color_dark='#ffffff',
+            loader_color='rgb(255 199 229 / 1)',
+            loader_color_dark='rgb(255 199 229 / 1)'
+        )
+def _normalize_audio(wav, eps=1e-12, clip=True):
+    """Convert integer or floating-point audio to mono float32 within [-1, 1].
+    Signed integers are divided by the largest magnitude of their dtype,
+    unsigned integers are re-centred around the middle of their range, and
+    floats are rescaled only when their peak exceeds 1. Multi-dimensional
+    input is averaged over its last axis. Returns ``None`` for any other dtype.
+    """
+    samples = np.asarray(wav)
+    is_integer = np.issubdtype(samples.dtype, np.integer)
+    # Guard clause: anything that is neither integer nor floating is rejected.
+    if not is_integer and not np.issubdtype(samples.dtype, np.floating):
+        return None
+    scaled = samples.astype(np.float32)
+    if is_integer:
+        limits = np.iinfo(samples.dtype)
+        if limits.min < 0:
+            # Signed PCM: scale by the largest representable magnitude.
+            scaled = scaled / max(abs(limits.min), limits.max)
+        else:
+            # Unsigned PCM: shift the midpoint to zero, then scale.
+            half_range = (limits.max + 1) / 2.0
+            scaled = (scaled - half_range) / half_range
+    else:
+        peak = np.max(np.abs(scaled)) if scaled.size else 0.0
+        if peak > 1.0 + 1e-6:
+            # eps keeps the division finite for pathological peaks.
+            scaled = scaled / (peak + eps)
+    if clip:
+        scaled = np.clip(scaled, -1.0, 1.0)
+    if scaled.ndim > 1:
+        # Down-mix by averaging over the last axis.
+        scaled = np.mean(scaled, axis=-1).astype(np.float32)
+    return scaled
+def _resample(x: np.ndarray, original_sample_rate: int, target_sample_rate: int, axis: int = 0) -> np.ndarray:
+    """Resample ``x`` along ``axis`` from one sample rate to another with a polyphase filter."""
+    # Reduce the rate ratio to lowest terms so the up/down factors stay small.
+    common = np.gcd(original_sample_rate, target_sample_rate)
+    up_factor = target_sample_rate // common
+    down_factor = original_sample_rate // common
+    return resample_poly(x, up=up_factor, down=down_factor, axis=axis)
+def _reference_audio_hash(reference_audio: tuple[np.ndarray, int]) -> str:
+    """Return a SHA-256 hex digest identifying a reference recording.
+    The digest covers the sample rate, dtype and shape as well as the raw
+    sample bytes. Hashing the bytes alone would give the same key to recordings
+    that share a byte pattern but differ in rate, dtype or channel layout, and
+    the transcription cache would then return the wrong transcript.
+    Args:
+        reference_audio: ``(samples, sample_rate)``; a 1-tuple holding only the
+            samples is still accepted.
+    """
+    audio = np.ascontiguousarray(np.asarray(reference_audio[0]))
+    sample_rate = reference_audio[1] if len(reference_audio) > 1 else None
+    digest = hashlib.sha256()
+    # The metadata header goes first so it cannot be confused with sample data.
+    digest.update(f'{sample_rate}|{audio.dtype.str}|{audio.shape}|'.encode('utf-8'))
+    digest.update(audio.tobytes())
+    return digest.hexdigest()
+def _detect_reference_text_and_language(reference_audio: tuple[np.ndarray, int], sample_rate: int) -> tuple[str, str]:
+    """Transcribe the reference audio with Whisper and detect its language.
+    Args:
+        reference_audio: tuple whose first element holds the samples; only that
+            element is read here.
+        sample_rate: sample rate of those samples, used to decide on resampling.
+    Returns:
+        ``(reference_text, detected_language)``, where the language is the key
+        with the highest probability reported by Whisper.
+    """
+    audio = np.asarray(reference_audio[0])
+    # Two-dimensional input is averaged over axis 1 (assumes (samples, channels) layout - TODO confirm).
+    if audio.ndim == 2:
+        audio = audio.mean(axis=1)
+    # The audio is brought to 16 kHz before being handed to Whisper.
+    if sample_rate != 16000:
+        audio = _resample(audio, sample_rate, 16000).astype(np.float32)
+    # Moves the shared module-level Whisper model to the GPU when one is available.
+    model = WHISPER_MODEL.to(device='cuda' if torch.cuda.is_available() else 'cpu')
+    audio = np.clip(audio, -1.0, 1.0)
+    # NOTE(review): pad_or_trim presumably fits the audio to Whisper's fixed window, so long references would be transcribed only partially - confirm.
+    audio = whisper.pad_or_trim(audio)
+    mel = whisper.log_mel_spectrogram(audio, n_mels=model.dims.n_mels).to(model.device)
+    _, probs = model.detect_language(mel)
+    detected_language = max(probs, key=probs.get)
+    result = whisper.decode(model, mel, whisper.DecodingOptions())
+    # Remove line breaks together with the whitespace around them.
+    reference_text = re.sub(r'\s*\n\s*', '', result.text)
+    # Japanese transcripts are converted to a kana reading; the raw transcript is kept when conversion yields None.
+    if detected_language == 'ja':
+        converted_reference_text = generate_text(reference_text)
+        if converted_reference_text is not None:
+            reference_text = converted_reference_text
+    return reference_text, detected_language
+def _get_reference_text_and_language(reference_audio: tuple[np.ndarray, int], sample_rate: int) -> tuple[str, str]:
+    """Return the transcript and language for the reference audio, using the cache.
+    A hit refreshes the entry's timestamp. A miss runs the transcription outside
+    the lock, stores the result, and evicts the entries with the oldest
+    timestamps once the cache exceeds its limit.
+    """
+    cache_key = _reference_audio_hash(reference_audio)
+    with REFERENCE_AUDIO_TRANSCRIPTION_CACHE_LOCK:
+        hit = REFERENCE_AUDIO_TRANSCRIPTION_CACHE.get(cache_key)
+        if hit is not None:
+            # Touch the entry so it counts as recently used.
+            text, language_code = hit[1], hit[2]
+            REFERENCE_AUDIO_TRANSCRIPTION_CACHE[cache_key] = (time.time(), text, language_code)
+            return text, language_code
+    # The lock is not held during transcription, so other requests are not blocked.
+    text, language_code = _detect_reference_text_and_language(reference_audio, sample_rate)
+    with REFERENCE_AUDIO_TRANSCRIPTION_CACHE_LOCK:
+        REFERENCE_AUDIO_TRANSCRIPTION_CACHE[cache_key] = (time.time(), text, language_code)
+        overflow = len(REFERENCE_AUDIO_TRANSCRIPTION_CACHE) - REFERENCE_AUDIO_TRANSCRIPTION_CACHE_LIMIT
+        if overflow > 0:
+            oldest_first = sorted(
+                REFERENCE_AUDIO_TRANSCRIPTION_CACHE,
+                key=lambda key: REFERENCE_AUDIO_TRANSCRIPTION_CACHE[key][0]
+            )
+            for stale_key in oldest_first[:overflow]:
+                del REFERENCE_AUDIO_TRANSCRIPTION_CACHE[stale_key]
+    return text, language_code
+def generate_text(prompt: str, *, timeout: float = 15.0) -> str | None:
+    """Convert Japanese text to its kana reading through the OpenAI Responses API.
+    The conversion is an optional refinement: callers fall back to the original
+    text when this returns ``None``. A missing API key, a network or HTTP error,
+    a timeout, or a malformed response therefore returns ``None`` instead of
+    aborting the surrounding request.
+    Args:
+        prompt: text to convert.
+        timeout: seconds to wait for the HTTP request. Without a limit, a
+            stalled connection would block the caller indefinitely.
+    Returns:
+        The first ``output_text`` found in the response, or ``None``.
+    """
+    import logging  # local import: the module does not import logging at top level
+    logger = logging.getLogger(__name__)
+    api_key = os.environ.get('OPENAI_API_KEY')
+    if not api_key:
+        logger.warning('OPENAI_API_KEY is not set; skipping kana conversion')
+        return None
+    system_prompt = '''あなたは日本語テキストを「読み（かな）」だけに変換する変換器です。
+出力に含めてよい文字は ひらがな・カタカナ・長音記号ー・空白 のみです。改行も禁止（1行で出力）。
+入力に含まれる 漢字は必ずかなにする。
+英数字・記号は、可能な範囲で日本語のカナ読みにする（例：AI→えーあい、LLM→えるえるえむ、2026→にせんにじゅうろく）。
+出力は 変換後の本文のみ。説明、注釈、引用符、箇条書き、コードブロックは一切禁止。
+最後に必ず自己検査を行う：出力が ^[ぁ-ゟ゠-ヿー ]+$ に一致しない場合、条件を満たすまで修正してから出力する。
+それでも読めない文字がある場合は、意味を落としてよいので「最も近いかな」に置き換える（記号は省略よりも読みを優先。ただし許可文字以外は絶対に出さない）。'''
+    payload = {
+        'model': os.environ.get('OPENAI_MODEL', 'gpt-5.4-mini'),
+        'input': [
+            {'role': 'developer', 'content': system_prompt},
+            {'role': 'user', 'content': [{'type': 'input_text', 'text': prompt}]},
+        ],
+        'temperature': 1,
+        'reasoning': {'effort': 'none'},
+    }
+    request = Request(
+        'https://api.openai.com/v1/responses',
+        data=json.dumps(payload).encode('utf-8'),
+        method='POST',
+        headers={'Content-Type': 'application/json', 'Authorization': f'Bearer {api_key}'},
+    )
+    try:
+        with urlopen(request, timeout=timeout) as response:
+            result = json.loads(response.read().decode('utf-8'))
+    except (OSError, ValueError) as error:
+        # OSError covers URLError, HTTPError and timeouts; ValueError covers bad JSON or bad UTF-8.
+        logger.warning('Kana conversion request failed: %s', error)
+        return None
+    if not isinstance(result, dict):
+        return None
+    # Return the first text part of the first message output.
+    for output in result.get('output') or []:
+        if isinstance(output, dict) and output.get('type') == 'message':
+            for content in output.get('content') or []:
+                if isinstance(content, dict) and content.get('type') == 'output_text':
+                    return content.get('text')
+    return None
+@spaces.GPU(duration=30)
+def generate_voice_clone(input_text: str, language: str | None, reference_audio: np.ndarray, reference_text: str | None, temperature: float, progress: gr.Progress=gr.Progress(track_tqdm=True)) -> (np.ndarray, str | None, str | None):
+    """Synthesize ``input_text`` in the voice of the reference recording.
+    Args:
+        input_text: text to speak.
+        language: ``'Auto'``, ``None``, a short code (``'en'``/``'ja'``) or a
+            language name passed through to the TTS model unchanged.
+        reference_audio: either a ``(sample_rate, samples)`` tuple as produced by
+            ``gr.Audio(type='numpy')`` or a dict with ``'sampling_rate'`` and
+            ``'data'`` keys.
+        reference_text: transcript of the reference audio. When empty or blank,
+            the audio is transcribed automatically and its language detected.
+        temperature: sampling temperature forwarded to the TTS model.
+        progress: Gradio progress tracker (not used directly).
+    Returns:
+        ``((sample_rate, int16_samples), transcribed_text, detected_language)``.
+        The last two are ``None`` when a reference transcript was supplied.
+    Raises:
+        gr.Error: when the reference audio is missing, empty or not decodable.
+    """
+    language_codes = {'en': 'English', 'ja': 'Japanese'}
+    transcribed_text = None
+    detected_language = None
+    sample_rate = None
+    wav = None
+    if isinstance(reference_audio, tuple) and len(reference_audio) == 2 and isinstance(reference_audio[0], (int, np.integer)):
+        sample_rate, wav = int(reference_audio[0]), reference_audio[1]
+    elif isinstance(reference_audio, dict) and 'sampling_rate' in reference_audio and 'data' in reference_audio:
+        sample_rate, wav = int(reference_audio['sampling_rate']), reference_audio['data']
+    # Previously a missing or unrecognised reference left sample_rate unbound and crashed with UnboundLocalError.
+    if sample_rate is None or sample_rate <= 0:
+        raise gr.Error('Reference audio is required.')
+    normalized_audio = _normalize_audio(wav)
+    # _normalize_audio returns None for non-numeric data.
+    if normalized_audio is None or normalized_audio.size == 0:
+        raise gr.Error('Reference audio is empty or could not be decoded.')
+    reference_audio = (normalized_audio, sample_rate)
+    # A blank transcript is treated like a missing one so the model never receives an empty ref_text.
+    if reference_text is None or len(reference_text.strip()) == 0:
+        reference_text, detected_language = _get_reference_text_and_language(reference_audio, sample_rate)
+        transcribed_text = reference_text
+        if language is None:
+            if detected_language in language_codes:
+                language = language_codes[detected_language]
+            else:
+                language = 'Auto'
+        elif language == 'Auto':
+            # Prefer the detected language when the model has a name for it.
+            if detected_language in language_codes:
+                language = language_codes[detected_language]
+        elif language in language_codes:
+            language = language_codes[language]
+    elif language is None:
+        language = 'Auto'
+    elif language in language_codes:
+        language = language_codes[language]
+    if sample_rate != 48000:
+        reference_audio = (_resample(reference_audio[0], sample_rate, 48000), 48000)
+    wavs, sample_rate = TTS_MODEL.generate_voice_clone(text=input_text.strip(), language=language, ref_audio=reference_audio, ref_text=reference_text.strip(), temperature=temperature, append_silence=False)
+    #wavs, sample_rate = TTS_MODEL.generate_voice_clone(text=input_text.strip(), language=language, ref_audio=reference_audio, ref_text=reference_text, max_new_tokens=2048, temperature=temperature)
+    # Scale by 32767, not 32768: a full-scale +1.0 sample times 32768 does not fit in int16.
+    pcm = (np.clip(wavs[0], -1.0, 1.0) * 32767.0).round().astype(np.int16)
+    return (sample_rate, pcm), transcribed_text, detected_language
+# UI layout: inputs in the left column, results in the right column.
+with gr.Blocks() as demo:
+    with gr.Row():
+        with gr.Column(scale=2):
+            with gr.Group():
+                # type='numpy' delivers the recording to the handler as a (sample_rate, samples) tuple.
+                tts_reference_audio = gr.Audio(label='Reference Audio', type='numpy', buttons=['download'], waveform_options={'waveform_color': 'rgb(0 231 255 / 1)', 'waveform_progress_color': 'rgb(255 199 229 / 1)'})
+                # Leaving this empty makes generate_voice_clone transcribe the reference audio itself.
+                tts_reference_text = gr.Textbox(label='Reference Text', value='', lines=1)
+            tts_input_text = gr.Textbox(label='Input', lines=4)
+            # Submitted values are 'Auto', 'en' or 'ja'; generate_voice_clone maps the short codes to language names.
+            tts_language = gr.Dropdown(label='Language', choices=[('Automatic', 'Auto'), ('English', 'en'), ('Japanese', 'ja')], value='Auto', interactive=True)
+            tts_temperature_slider = gr.Slider(minimum=0.1, maximum=2.0, value=0.9, step=0.1, label='Temperature')
+            tts_generate_button = gr.Button('Generate', variant='primary')
+        with gr.Column(scale=2):
+            tts_audio_output = gr.Audio(label='Output', type='numpy', buttons=['download'], waveform_options={'waveform_color': 'rgb(0 231 255 / 1)', 'waveform_progress_color': 'rgb(255 199 229 / 1)'})
+            tts_transcribed_text = gr.Label(label='Transcript', value='')
+            tts_detected_language = gr.Label(label='Language', value='')
+    # The input and output lists follow generate_voice_clone's parameter and return order; the endpoint is exposed under the API name 'synthesize'.
+    tts_generate_button.click(fn=generate_voice_clone, inputs=[tts_input_text, tts_language, tts_reference_audio, tts_reference_text, tts_temperature_slider], outputs=[tts_audio_output, tts_transcribed_text, tts_detected_language], api_name='synthesize')
+if __name__ == '__main__':
+    # Host and port come from the environment: GRADIO_SERVER_PORT first, then PORT, then 7860.
+    demo.launch(
+        server_name=os.environ.get('GRADIO_SERVER_NAME', '0.0.0.0'),
+        server_port=int(os.environ.get('GRADIO_SERVER_PORT', os.environ.get('PORT', 7860))),
+        theme=Theme(),
+        css='.column>.row>.column:first-of-type .block { border-width: 0px !important; }'
+    )

docker-compose.yml ADDED Viewed

	@@ -0,0 +1,27 @@

+services:
+  ai:
+    container_name: "milchchanai"
+    build:
+      context: .
+    restart: unless-stopped
+    tty: true
+    env_file:
+      - .env
+    environment:
+      GRADIO_SERVER_NAME: 0.0.0.0
+      GRADIO_SERVER_PORT: 7860
+      HF_HOME: /data/huggingface
+      WHISPER_CACHE_DIR: /data/whisper
+    volumes:
+      - hf-cache:/data/huggingface
+      - whisper-cache:/data/whisper
+    ports:
+      - "7860:7860"
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - capabilities: [gpu]
+volumes:
+  hf-cache:
+  whisper-cache:

faster_qwen3_tts/__init__.py ADDED Viewed

	@@ -0,0 +1,7 @@

+"""
+faster-qwen3-tts: Real-time Qwen3-TTS inference using CUDA graphs
+"""
+from .model import FasterQwen3TTS
+__version__ = "0.2.5"
+__all__ = ["FasterQwen3TTS"]

faster_qwen3_tts/cli.py ADDED Viewed

	@@ -0,0 +1,407 @@

+#!/usr/bin/env python3
+"""CLI for FasterQwen3TTS."""
+import argparse
+import os
+import sys
+import time
+import numpy as np
+import soundfile as sf
+import torch
+from faster_qwen3_tts import FasterQwen3TTS
+def _load_model(model_id: str, device: str, dtype: str):
+    """Load a FasterQwen3TTS model with SDPA attention and a 2048-token sequence limit.
+    ``dtype`` is ``"bf16"`` or ``"fp16"``; any other value selects float32.
+    """
+    named_dtypes = {"bf16": torch.bfloat16, "fp16": torch.float16}
+    torch_dtype = named_dtypes.get(dtype, torch.float32)
+    return FasterQwen3TTS.from_pretrained(
+        model_id,
+        device=device,
+        dtype=torch_dtype,
+        attn_implementation="sdpa",
+        max_seq_len=2048,
+    )
+def _write_audio(out_path: str, audio: np.ndarray, sr: int):
+    """Write ``audio`` to ``out_path`` at sample rate ``sr``, creating the parent directory if needed."""
+    parent_dir = os.path.dirname(out_path)
+    # A bare filename has no directory part; fall back to the current directory.
+    os.makedirs(parent_dir if parent_dir else ".", exist_ok=True)
+    sf.write(out_path, audio, sr)
+def _stream_to_audio(gen):
+    """Drain a streaming generator into one array.
+    Each item is an ``(audio_chunk, sample_rate, extra)`` triple. Returns the
+    concatenated chunks with the sample rate of the last item, or one float32
+    zero sample at 24000 Hz when the generator yields nothing.
+    """
+    pieces = []
+    sample_rate = None
+    for item in gen:
+        piece, sample_rate, _ = item
+        pieces.append(piece)
+    if pieces:
+        return np.concatenate(pieces), sample_rate
+    return np.zeros(1, dtype=np.float32), 24000
+def cmd_clone(args):
+    """Run the ``clone`` sub-command: synthesize with a reference voice and write a wav file.
+    Prints the output path, the audio duration and the value reported as RTF,
+    computed here as audio duration divided by wall-clock generation time.
+    """
+    model = _load_model(args.model, args.device, args.dtype)
+    start = time.perf_counter()
+    # Keyword arguments shared by the streaming and one-shot entry points.
+    request = dict(
+        text=args.text,
+        language=args.language,
+        ref_audio=args.ref_audio,
+        ref_text=args.ref_text,
+        max_new_tokens=args.max_new_tokens,
+        temperature=args.temperature,
+        top_k=args.top_k,
+        do_sample=not args.greedy,
+        repetition_penalty=args.repetition_penalty,
+        xvec_only=args.xvec_only,
+        non_streaming_mode=args.non_streaming_mode,
+    )
+    if args.streaming:
+        stream = model.generate_voice_clone_streaming(chunk_size=args.chunk_size, **request)
+        audio, sr = _stream_to_audio(stream)
+    else:
+        audio_list, sr = model.generate_voice_clone(**request)
+        audio = audio_list[0]
+    total_time = time.perf_counter() - start
+    audio_dur = len(audio) / sr if sr else 0.0
+    rtf = audio_dur / total_time if total_time > 0 else 0.0
+    _write_audio(args.output, audio, sr)
+    print(f"Wrote {args.output} (dur {audio_dur:.2f}s, RTF {rtf:.2f})")
+def cmd_custom(args):
+    """Run the ``custom`` sub-command: synthesize with a named built-in speaker.
+    With ``--list-speakers`` it prints the supported speakers and returns. A
+    missing speaker exits with status 2. Otherwise it writes the wav file and
+    prints duration and RTF (audio duration divided by wall-clock time).
+    """
+    model = _load_model(args.model, args.device, args.dtype)
+    if args.list_speakers:
+        speakers = model.model.get_supported_speakers() or []
+        print("\n".join(speakers))
+        return
+    if not args.speaker:
+        print("ERROR: --speaker is required (use --list-speakers)")
+        sys.exit(2)
+    start = time.perf_counter()
+    # Keyword arguments shared by the streaming and one-shot entry points.
+    request = dict(
+        text=args.text,
+        speaker=args.speaker,
+        language=args.language,
+        instruct=args.instruct,
+        max_new_tokens=args.max_new_tokens,
+        temperature=args.temperature,
+        top_k=args.top_k,
+        do_sample=not args.greedy,
+        repetition_penalty=args.repetition_penalty,
+    )
+    if args.streaming:
+        stream = model.generate_custom_voice_streaming(chunk_size=args.chunk_size, **request)
+        audio, sr = _stream_to_audio(stream)
+    else:
+        audio_list, sr = model.generate_custom_voice(**request)
+        audio = audio_list[0]
+    total_time = time.perf_counter() - start
+    audio_dur = len(audio) / sr if sr else 0.0
+    rtf = audio_dur / total_time if total_time > 0 else 0.0
+    _write_audio(args.output, audio, sr)
+    print(f"Wrote {args.output} (dur {audio_dur:.2f}s, RTF {rtf:.2f})")
+def cmd_design(args):
+    """Run the ``design`` sub-command: synthesize a voice described by ``--instruct``.
+    Writes the wav file and prints duration and RTF (audio duration divided by
+    wall-clock generation time).
+    """
+    model = _load_model(args.model, args.device, args.dtype)
+    start = time.perf_counter()
+    # Keyword arguments shared by the streaming and one-shot entry points.
+    request = dict(
+        text=args.text,
+        instruct=args.instruct,
+        language=args.language,
+        max_new_tokens=args.max_new_tokens,
+        temperature=args.temperature,
+        top_k=args.top_k,
+        do_sample=not args.greedy,
+        repetition_penalty=args.repetition_penalty,
+    )
+    if args.streaming:
+        stream = model.generate_voice_design_streaming(chunk_size=args.chunk_size, **request)
+        audio, sr = _stream_to_audio(stream)
+    else:
+        audio_list, sr = model.generate_voice_design(**request)
+        audio = audio_list[0]
+    total_time = time.perf_counter() - start
+    audio_dur = len(audio) / sr if sr else 0.0
+    rtf = audio_dur / total_time if total_time > 0 else 0.0
+    _write_audio(args.output, audio, sr)
+    print(f"Wrote {args.output} (dur {audio_dur:.2f}s, RTF {rtf:.2f})")
+def cmd_serve(args):
+    """Run the ``serve`` sub-command: load the model once, then synthesize one file per stdin line.
+    Blank lines are skipped; ``exit``, ``quit`` or ``stop`` (case-insensitive)
+    ends the loop. Outputs are written to ``args.output_dir`` as
+    ``out_0001.wav``, ``out_0002.wav`` and so on. Any mode other than ``clone``
+    or ``custom`` is handled as voice design. Missing mode-specific arguments
+    exit with status 2.
+    """
+    model = _load_model(args.model, args.device, args.dtype)
+    if args.mode == "clone" and not (args.ref_audio and args.ref_text):
+        print("ERROR: --ref-audio and --ref-text are required for clone mode")
+        sys.exit(2)
+    if args.mode == "custom" and not args.speaker:
+        print("ERROR: --speaker is required for custom mode")
+        sys.exit(2)
+    if args.mode == "design" and not args.instruct:
+        print("ERROR: --instruct is required for design mode")
+        sys.exit(2)
+    print("Server started. Enter text per line. Type 'exit' or 'quit' to stop.")
+    counter = 0
+    for raw_line in sys.stdin:
+        text = raw_line.strip()
+        if not text:
+            continue
+        if text.lower() in ("exit", "quit", "stop"):
+            break
+        counter += 1
+        out_path = os.path.join(args.output_dir, f"out_{counter:04d}.wav")
+        start = time.perf_counter()
+        # Arguments common to every mode; the mode-specific ones are added below.
+        request = dict(
+            text=text,
+            language=args.language,
+            max_new_tokens=args.max_new_tokens,
+            temperature=args.temperature,
+            top_k=args.top_k,
+            do_sample=not args.greedy,
+            repetition_penalty=args.repetition_penalty,
+        )
+        if args.mode == "clone":
+            method_name = "generate_voice_clone"
+            request.update(
+                ref_audio=args.ref_audio,
+                ref_text=args.ref_text,
+                xvec_only=False,
+                non_streaming_mode=args.non_streaming_mode,
+            )
+        elif args.mode == "custom":
+            method_name = "generate_custom_voice"
+            request.update(speaker=args.speaker, instruct=args.instruct)
+        else:
+            method_name = "generate_voice_design"
+            request.update(instruct=args.instruct)
+        if args.streaming:
+            # Each generator has a "_streaming" twin that also takes chunk_size.
+            stream = getattr(model, method_name + "_streaming")(chunk_size=args.chunk_size, **request)
+            audio, sr = _stream_to_audio(stream)
+        else:
+            audio_list, sr = getattr(model, method_name)(**request)
+            audio = audio_list[0]
+        _write_audio(out_path, audio, sr)
+        # The measured time includes writing the file.
+        total_time = time.perf_counter() - start
+        audio_dur = len(audio) / sr if sr else 0.0
+        rtf = audio_dur / total_time if total_time > 0 else 0.0
+        print(f"Wrote {out_path} (dur {audio_dur:.2f}s, RTF {rtf:.2f})")
def build_parser():
    """Construct the ``faster-qwen3-tts`` command-line parser.

    Two global options (``--device``, ``--dtype``) are followed by one
    required sub-command: ``clone``, ``custom``, ``design`` or ``serve``.
    Every sub-parser stores its handler under ``fn`` via ``set_defaults`` so
    the caller can dispatch with ``args.fn(args)``.

    Returns:
        The fully configured ``argparse.ArgumentParser``.
    """
    parser = argparse.ArgumentParser(prog="faster-qwen3-tts", description="FasterQwen3TTS CLI")
    parser.add_argument("--device", default="cuda", help="Device (cuda or cpu)")
    parser.add_argument("--dtype", default="bf16", choices=["bf16", "fp16", "fp32"], help="Model dtype")
    commands = parser.add_subparsers(dest="command", required=True)

    def add_sampling(target):
        # Decoding options that every sub-command exposes with the same defaults.
        target.add_argument("--max-new-tokens", type=int, default=2048)
        target.add_argument("--temperature", type=float, default=0.9)
        target.add_argument("--top-k", type=int, default=50)
        target.add_argument("--repetition-penalty", type=float, default=1.05)
        target.add_argument("--greedy", action="store_true", help="Disable sampling")

    def add_text_feed_switch(target, default):
        # On/off flag pair writing to the same dest; `default` applies when neither is given.
        switch = target.add_mutually_exclusive_group()
        switch.add_argument(
            "--non-streaming-mode",
            dest="non_streaming_mode",
            action="store_true",
            help="Prefill full text before decode",
        )
        switch.add_argument(
            "--no-non-streaming-mode",
            dest="non_streaming_mode",
            action="store_false",
            help="Use upstream step-by-step text feeding during decode",
        )
        target.set_defaults(non_streaming_mode=default)

    def add_common(target):
        # Options shared by the one-shot sub-commands (clone / custom / design).
        target.add_argument("--text", required=True, help="Text to synthesize")
        target.add_argument("--language", default="Auto", help="Language (Auto, English, French, ...)")
        target.add_argument("--output", required=True, help="Output wav path")
        target.add_argument("--model", required=True, help="Model id or local path")
        add_sampling(target)
        target.add_argument("--streaming", action="store_true", help="Use streaming generation")
        add_text_feed_switch(target, True)
        target.add_argument("--chunk-size", type=int, default=8, help="Streaming chunk size")

    clone = commands.add_parser("clone", help="Voice cloning (reference audio)")
    add_common(clone)
    clone.add_argument("--ref-audio", required=True, help="Reference audio path")
    clone.add_argument("--ref-text", required=True, help="Reference transcript")
    clone.add_argument(
        "--xvec-only",
        action="store_true",
        help="Use speaker embedding only instead of upstream-default ICL mode",
    )
    # Cloning overrides the shared default and feeds text step by step unless asked otherwise.
    clone.set_defaults(non_streaming_mode=False, fn=cmd_clone)

    custom = commands.add_parser("custom", help="CustomVoice model (speaker IDs)")
    add_common(custom)
    custom.add_argument("--speaker", help="Speaker ID")
    custom.add_argument("--instruct", default="", help="Optional instruction")
    custom.add_argument("--list-speakers", action="store_true", help="List available speaker IDs")
    custom.set_defaults(fn=cmd_custom)

    design = commands.add_parser("design", help="VoiceDesign model (instruction-based)")
    add_common(design)
    design.add_argument("--instruct", required=True, help="Voice/style instruction")
    design.set_defaults(fn=cmd_design)

    serve = commands.add_parser("serve", help="Keep model hot and generate multiple requests from stdin")
    serve.add_argument("--mode", required=True, choices=["clone", "custom", "design"])
    serve.add_argument("--model", required=True, help="Model id or local path")
    serve.add_argument("--language", default="Auto", help="Language (Auto, English, French, ...)")
    serve.add_argument("--ref-audio", help="Reference audio path (clone)")
    serve.add_argument("--ref-text", help="Reference transcript (clone)")
    serve.add_argument("--speaker", help="Speaker ID (custom)")
    serve.add_argument("--instruct", default="", help="Instruction (custom/design)")
    serve.add_argument("--streaming", action="store_true", help="Use streaming generation")
    add_text_feed_switch(serve, False)
    serve.add_argument("--chunk-size", type=int, default=8, help="Streaming chunk size")
    add_sampling(serve)
    serve.add_argument("--output-dir", default="outputs", help="Directory for output wavs")
    serve.set_defaults(fn=cmd_serve)

    return parser
def main():
    """Parse the command line and run the handler chosen by the sub-command."""
    namespace = build_parser().parse_args()
    # Each sub-parser registered its handler under `fn`.
    namespace.fn(namespace)


if __name__ == "__main__":
    main()

faster_qwen3_tts/generate.py ADDED Viewed

	@@ -0,0 +1,215 @@

+#!/usr/bin/env python3
+"""
+Non-streaming generation loop using CUDA graphs for both predictor and talker.
+"""
+import time
+from typing import Optional, Tuple
+import torch
+from .predictor_graph import PredictorGraph
+from .sampling import apply_repetition_penalty, sample_logits
+from .talker_graph import TalkerGraph
@torch.inference_mode()
def fast_generate(
    talker,
    talker_input_embeds: torch.Tensor,
    attention_mask: torch.Tensor,
    trailing_text_hiddens: torch.Tensor,
    tts_pad_embed: torch.Tensor,
    config,
    predictor_graph: PredictorGraph,
    talker_graph: TalkerGraph,
    max_new_tokens: int = 2048,
    min_new_tokens: int = 2,
    temperature: float = 0.9,
    top_k: int = 50,
    top_p: float = 1.0,
    do_sample: bool = True,
    repetition_penalty: float = 1.05,
    subtalker_dosample: Optional[bool] = None,
    subtalker_top_k: Optional[int] = None,
    subtalker_top_p: Optional[float] = None,
    subtalker_temperature: Optional[float] = None,
    parity_mode: bool = False,
) -> Tuple[Optional[torch.Tensor], dict]:
    """
    Fast autoregressive generation with CUDA-graphed predictor and talker.

    The prompt is prefilled with a regular ``talker.forward`` call. Each
    decode step then runs ``predictor_graph`` to obtain the remaining
    codebook ids, runs ``talker_graph`` for the next hidden state, and samples
    the next first-codebook token. With ``parity_mode=True`` both graphs are
    bypassed and ``talker.generate`` is used instead.

    Args:
        talker: Talker model; must provide ``generate``, ``forward``,
            ``code_predictor``, ``codec_head`` and ``get_input_embeddings``.
        talker_input_embeds: Prompt embeddings used for the prefill.
        attention_mask: Prompt mask; also handed to ``talker_graph``.
        trailing_text_hiddens: Text hidden states, added to the decode input
            one position per step until exhausted.
        tts_pad_embed: Embedding added to the decode input once
            ``trailing_text_hiddens`` has been used up.
        config: Supplies ``codec_eos_token_id``, ``num_code_groups`` and
            ``vocab_size``.
        predictor_graph: Graph wrapper producing the non-first codebook ids.
        talker_graph: Graph wrapper running one talker decode step.
        max_new_tokens: Upper bound on the number of decode steps.
        min_new_tokens: EOS is suppressed while fewer frames than this exist.
        temperature: Sampling temperature for the first codebook.
        top_k: Top-k cutoff for the first codebook.
        top_p: Top-p cutoff for the first codebook.
        do_sample: Forwarded to ``sample_logits`` / ``talker.generate``.
        repetition_penalty: When != 1.0, applied to the logits against the
            first-codebook ids generated so far.
        subtalker_dosample: Parity mode only; falls back to ``do_sample``.
        subtalker_top_k: Parity mode only; falls back to ``top_k``.
        subtalker_top_p: Parity mode only; falls back to ``top_p``.
        subtalker_temperature: Parity mode only; falls back to ``temperature``.
        parity_mode: Use ``talker.generate`` instead of the graph path.

    Returns:
        ``(codes, timing)``. ``codes`` stacks one codebook-id vector per
        generated frame, or is ``None`` when nothing was generated. ``timing``
        has the keys ``prefill_ms``, ``decode_s``, ``steps``, ``ms_per_step``
        and ``steps_per_s``.
    """
    eos_id = config.codec_eos_token_id
    num_code_groups = config.num_code_groups
    vocab_size = config.vocab_size
    device = talker_input_embeds.device
    # Suppress the last (up to) 1024 vocabulary ids except EOS. One slice
    # write replaces up to 1024 separate per-element device writes.
    suppress_mask = torch.zeros(vocab_size, dtype=torch.bool, device=device)
    suppress_start = max(0, vocab_size - 1024)
    suppress_mask[suppress_start:] = True
    if eos_id is not None and suppress_start <= eos_id < vocab_size:
        suppress_mask[eos_id] = False
    if parity_mode:
        suppress_tokens = [i for i in range(suppress_start, vocab_size) if i != eos_id]
        t_start = time.time()
        talker_result = talker.generate(
            inputs_embeds=talker_input_embeds,
            attention_mask=attention_mask,
            trailing_text_hidden=trailing_text_hiddens,
            tts_pad_embed=tts_pad_embed,
            max_new_tokens=max_new_tokens,
            min_new_tokens=min_new_tokens,
            do_sample=do_sample,
            top_k=top_k,
            top_p=top_p,
            temperature=temperature,
            repetition_penalty=repetition_penalty,
            eos_token_id=eos_id,
            suppress_tokens=suppress_tokens,
            subtalker_dosample=subtalker_dosample if subtalker_dosample is not None else do_sample,
            subtalker_top_k=subtalker_top_k if subtalker_top_k is not None else top_k,
            subtalker_top_p=subtalker_top_p if subtalker_top_p is not None else top_p,
            subtalker_temperature=subtalker_temperature if subtalker_temperature is not None else temperature,
            output_hidden_states=True,
            return_dict_in_generate=True,
        )
        # The last entry of each per-step hidden_states item is stacked as the codes.
        talker_codes = torch.stack(
            [hid[-1] for hid in talker_result.hidden_states if hid[-1] is not None],
            dim=1,
        )
        # Cut each sequence at its first EOS in the first codebook (if any).
        first_codebook = talker_codes[:, :, 0]
        is_stop_token = first_codebook == eos_id
        stop_indices = torch.argmax(is_stop_token.int(), dim=1)
        has_stop_token = is_stop_token.any(dim=1)
        effective_lengths = torch.where(has_stop_token, stop_indices, talker_codes.shape[1])
        talker_codes_list = [talker_codes[i, :length, :] for i, length in enumerate(effective_lengths)]
        torch.cuda.synchronize()
        total_time = time.time() - t_start
        steps = int(talker_codes_list[0].shape[0]) if talker_codes_list else 0
        timing = {
            'prefill_ms': 0.0,
            'decode_s': total_time,
            'steps': steps,
            'ms_per_step': (total_time / steps * 1000) if steps > 0 else 0.0,
            'steps_per_s': (steps / total_time) if total_time > 0 else 0.0,
        }
        return talker_codes_list[0] if talker_codes_list else None, timing
    predictor = talker.code_predictor
    talker_codec_embed = talker.get_input_embeddings()
    talker_codec_head = talker.codec_head
    predictor_codec_embeds = predictor.get_input_embeddings()
    # === PREFILL (still uses HF forward for variable-length prefill) ===
    t_start = time.time()
    out = talker.forward(
        inputs_embeds=talker_input_embeds,
        attention_mask=attention_mask,
        use_cache=True,
        output_hidden_states=True,
        return_dict=True,
        trailing_text_hidden=trailing_text_hiddens,
        tts_pad_embed=tts_pad_embed,
        generation_step=None,
        past_hidden=None,
        past_key_values=None,
    )
    talker_past_kv = out.past_key_values
    past_hidden = out.past_hidden
    gen_step = out.generation_step
    logits = out.logits[:, -1, :]
    suppress_eos = min_new_tokens > 0
    token = sample_logits(
        logits,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        do_sample=do_sample,
        suppress_mask=suppress_mask,
        suppress_tokens=[eos_id] if suppress_eos else None,
    )
    # Copy prefill KV cache into talker graph's static cache
    prefill_len = talker_graph.prefill_kv(talker_past_kv)
    # Sync padding mask + rope deltas for decode parity
    rope_deltas = getattr(talker, "rope_deltas", None)
    talker_graph.set_generation_state(attention_mask, rope_deltas)
    torch.cuda.synchronize()
    t_prefill = time.time() - t_start
    # === DECODE LOOP ===
    t_decode_start = time.time()
    all_codec_ids = []
    for step_idx in range(max_new_tokens):
        if token.item() == eos_id:
            break
        # --- CUDA-Graphed Code Predictor ---
        last_id_hidden = talker_codec_embed(token.unsqueeze(1))  # [1, 1, H]
        pred_input = torch.cat((past_hidden, last_id_hidden), dim=1)  # [1, 2, H]
        codebook_token_ids = predictor_graph.run(pred_input)  # [15] long tensor
        # Build full codec: [first_cb, cb1, ..., cb15]
        all_cb = torch.cat([token.view(1), codebook_token_ids])  # [16]
        all_codec_ids.append(all_cb.detach())
        # --- Build input embedding for talker ---
        codec_hiddens = [last_id_hidden]
        for i in range(num_code_groups - 1):
            codec_hiddens.append(predictor_codec_embeds[i](codebook_token_ids[i].unsqueeze(0).unsqueeze(0)))
        inputs_embeds = torch.cat(codec_hiddens, dim=1).sum(1, keepdim=True)
        if gen_step < trailing_text_hiddens.shape[1]:
            inputs_embeds = inputs_embeds + trailing_text_hiddens[:, gen_step].unsqueeze(1)
        else:
            inputs_embeds = inputs_embeds + tts_pad_embed
        # --- CUDA-Graphed Talker decode step ---
        current_pos = prefill_len + step_idx
        if current_pos >= talker_graph.max_seq_len - 1:
            # Stop if we exceed max_seq_len
            break
        hidden_states = talker_graph.run(inputs_embeds, position=current_pos)
        # hidden_states is the static output buffer - use it immediately
        logits = talker_codec_head(hidden_states[:, -1, :]).unsqueeze(0)
        if repetition_penalty != 1.0 and len(all_codec_ids) > 0:
            # Penalize against the first-codebook ids generated so far.
            history = torch.stack([c[0] for c in all_codec_ids])
            logits = apply_repetition_penalty(logits, history, repetition_penalty)
        suppress_eos = len(all_codec_ids) < min_new_tokens
        token = sample_logits(
            logits.squeeze(0),
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            do_sample=do_sample,
            suppress_mask=suppress_mask,
            suppress_tokens=[eos_id] if suppress_eos else None,
        )
        past_hidden = hidden_states[:, -1:, :].clone()  # clone since it's the static buffer
        gen_step += 1
    torch.cuda.synchronize()
    t_decode = time.time() - t_decode_start
    n_steps = len(all_codec_ids)
    timing = {
        'prefill_ms': t_prefill * 1000,
        'decode_s': t_decode,
        'steps': n_steps,
        'ms_per_step': (t_decode / n_steps * 1000) if n_steps > 0 else 0,
        'steps_per_s': (n_steps / t_decode) if t_decode > 0 else 0,
    }
    if all_codec_ids:
        return torch.stack(all_codec_ids), timing
    return None, timing

faster_qwen3_tts/model.py ADDED Viewed

	@@ -0,0 +1,1370 @@

+"""
+FasterQwen3TTS: Real-time TTS using CUDA graph capture.
+Wrapper class that provides a Qwen3-TTS API while using
+CUDA graphs for 6-10x speedup.
+"""
+import logging
+from pathlib import Path
+from typing import Any, Dict, Generator, List, Optional, Tuple, Union
+import numpy as np
+import soundfile as sf
+import torch
+from .utils import suppress_flash_attn_warning
+logger = logging.getLogger(__name__)
+class FasterQwen3TTS:
+    """
+    Qwen3-TTS model with CUDA graphs for real-time inference.
+    Compatible API with Qwen3TTSModel, but uses CUDA graph
+    capture for 6-10x speedup on NVIDIA GPUs.
+    """
+    def __init__(
+        self,
+        base_model,
+        predictor_graph,
+        talker_graph,
+        device: str = "cuda",
+        dtype: torch.dtype = torch.bfloat16,
+        max_seq_len: int = 2048,
+    ):
+        self.model = base_model  # The qwen-tts Qwen3TTSModel instance
+        self.predictor_graph = predictor_graph
+        self.talker_graph = talker_graph
+        self.device = device
+        self.dtype = dtype
+        self.max_seq_len = max_seq_len
+        self.sample_rate = self._infer_sample_rate(base_model)
+        self._warmed_up = False
+        self._voice_prompt_cache = {}  # Cache (ref_audio, ref_text) -> (vcp, ref_ids)
+    @staticmethod
+    def _get_speech_tokenizer(base_model):
+        """Return the nested qwen-tts speech tokenizer when available."""
+        return getattr(getattr(base_model, "model", None), "speech_tokenizer", None)
+    @property
+    def speech_tokenizer(self):
+        """Expose the codec decoder on the wrapper's public surface."""
+        speech_tokenizer = self._get_speech_tokenizer(self.model)
+        if speech_tokenizer is None:
+            raise AttributeError("Underlying model does not expose a speech_tokenizer")
+        return speech_tokenizer
+    @staticmethod
+    def _infer_sample_rate(base_model) -> int:
+        """Infer output audio sample rate from qwen-tts internals."""
+        # Qwen3-TTS model IDs include "12Hz", but that is codec frame-rate (tokens/s),
+        # not waveform sampling rate. Generated audio is 24kHz.
+        sample_rate = None
+        speech_tokenizer = FasterQwen3TTS._get_speech_tokenizer(base_model)
+        if speech_tokenizer is not None:
+            sample_rate = getattr(speech_tokenizer, "sample_rate", None)
+        if sample_rate is None:
+            sample_rate = getattr(base_model, "sample_rate", None)
+        if sample_rate is None:
+            logger.warning(
+                "Could not infer sample rate from base model; defaulting to 24000 Hz."
+            )
+            return 24000
+        return int(sample_rate)
+    @classmethod
+    def from_pretrained(
+        cls,
+        model_name: str,
+        device: str = "cuda",
+        dtype: Union[str, torch.dtype] = torch.bfloat16,
+        attn_implementation: str = "sdpa",
+        max_seq_len: int = 2048,
+    ):
+        """
+        Load Qwen3-TTS model and prepare CUDA graphs.
+        Args:
+            model_name: Model path or HuggingFace Hub ID
+            device: Device to use ("cuda" or "cpu")
+            dtype: Data type for inference
+            attn_implementation: Attention implementation ("sdpa" or "flash_attention_2")
+            max_seq_len: Maximum sequence length for static cache
+        Returns:
+            FasterQwen3TTS instance
+        """
+        if isinstance(dtype, str):
+            dtype = getattr(torch, dtype)
+        if not device.startswith("cuda") or not torch.cuda.is_available():
+            raise ValueError("CUDA graphs require CUDA device")
+        logger.info(f"Loading Qwen3-TTS model: {model_name}")
+        # Import here to avoid dependency issues (and suppress flash-attn warning)
+        with suppress_flash_attn_warning():
+            from qwen_tts import Qwen3TTSModel
+        from .predictor_graph import PredictorGraph
+        from .talker_graph import TalkerGraph
+        # Load base model using qwen-tts library
+        base_model = Qwen3TTSModel.from_pretrained(
+            model_name,
+            device_map=device,
+            torch_dtype=dtype,
+            attn_implementation=attn_implementation,
+        )
+        talker = base_model.model.talker
+        talker_config = base_model.model.config.talker_config
+        # Extract predictor config from loaded model
+        predictor = talker.code_predictor
+        pred_config = predictor.model.config
+        talker_hidden = talker_config.hidden_size
+        # Build CUDA graphs
+        logger.info("Building CUDA graphs...")
+        predictor_graph = PredictorGraph(
+            predictor,
+            pred_config,
+            talker_hidden,
+            device=device,
+            dtype=dtype,
+            do_sample=True, # subtalker_dosample (Default: True)
+            top_k=50, # subtalker_top_k (Default: 50)
+            top_p=1.0, # subtalker_top_p (Default: 1.0)
+            temperature=0.2, # subtalker_temperature (Default: 0.9)
+        )
+        talker_graph = TalkerGraph(
+            talker.model,
+            talker_config,
+            device=device,
+            dtype=dtype,
+            max_seq_len=max_seq_len,
+        )
+        logger.info("CUDA graphs initialized (will capture on first run)")
+        return cls(
+            base_model=base_model,
+            predictor_graph=predictor_graph,
+            talker_graph=talker_graph,
+            device=device,
+            dtype=dtype,
+            max_seq_len=max_seq_len,
+        )
+    def _warmup(self, prefill_len: int):
+        """Warm up and capture CUDA graphs with given prefill length."""
+        if self._warmed_up:
+            return
+        logger.info("Warming up CUDA graphs...")
+        self.predictor_graph.capture(num_warmup=3)
+        self.talker_graph.capture(prefill_len=prefill_len, num_warmup=3)
+        self._warmed_up = True
+        logger.info("CUDA graphs captured and ready")
+    def generate(
+        self,
+        text: str,
+        language: str = "English",
+        max_new_tokens: int = 2048,
+        temperature: float = 0.9,
+        top_k: int = 50,
+        do_sample: bool = True,
+        repetition_penalty: float = 1.05,
+    ) -> Tuple[list, int]:
+        """
+        Generate speech from text using default voice.
+        Not yet implemented - use generate_voice_clone() instead.
+        """
+        raise NotImplementedError(
+            "Default voice generation not yet implemented. "
+            "Use generate_voice_clone() with reference audio."
+        )
+    def _load_ref_audio_with_silence(self, ref_audio: Union[str, Path, tuple], silence_secs: float = 0.5) -> Tuple[np.ndarray, int]:
+        """Load reference audio and optionally append trailing silence.
+        The ICL voice-cloning prompt ends with the last codec token of the reference
+        audio, so the model's first generated token is conditioned on whatever phoneme
+        the reference ends with. Appending a short silence makes the last tokens
+        encode silence instead, preventing that phoneme from bleeding into the start
+        of the generated speech. Set silence_secs=0 to disable this behavior.
+        """
+        if isinstance(ref_audio, tuple):
+            audio, sr = ref_audio
+        else:
+            audio, sr = sf.read(str(ref_audio), dtype="float32", always_2d=False)
+        if audio.ndim > 1:
+            audio = audio.mean(axis=1)  # convert to mono
+        if silence_secs > 0:
+            silence = np.zeros(int(silence_secs * sr), dtype=np.float32)
+            audio = np.concatenate([audio, silence])
+        return audio, sr
+    def _resolve_voice_clone_prompt(
+        self,
+        input_ids,
+        ref_audio: Optional[Union[str, Path, tuple]],
+        ref_text: str,
+        xvec_only: bool,
+        append_silence: bool,
+        voice_clone_prompt: Optional[Union[Dict[str, Any], List[Any]]],
+    ) -> Tuple[Dict[str, Any], list, bool]:
+        """Resolve voice clone prompt data and return (prompt, ref_ids, using_icl_mode)."""
+        if voice_clone_prompt is not None:
+            return self._resolve_precomputed_voice_clone_prompt(
+                input_ids=input_ids,
+                ref_text=ref_text,
+                voice_clone_prompt=voice_clone_prompt,
+            )
+        if ref_audio is None:
+            raise ValueError("ref_audio is required when voice_clone_prompt is not provided")
+        return self._resolve_voice_clone_prompt_from_reference(
+            input_ids=input_ids,
+            ref_audio=ref_audio,
+            ref_text=ref_text,
+            xvec_only=xvec_only,
+            append_silence=append_silence,
+        )
+    def _resolve_precomputed_voice_clone_prompt(
+        self,
+        input_ids,
+        ref_text: str,
+        voice_clone_prompt: Union[Dict[str, Any], List[Any]],
+    ) -> Tuple[Dict[str, Any], list, bool]:
+        if isinstance(voice_clone_prompt, list):
+            if len(voice_clone_prompt) != len(input_ids):
+                raise ValueError(
+                    f"voice_clone_prompt must have length {len(input_ids)}, got {len(voice_clone_prompt)}"
+                )
+            vcp = self.model._prompt_items_to_voice_clone_prompt(voice_clone_prompt)
+            ref_ids = []
+            for item in voice_clone_prompt:
+                if bool(item.icl_mode):
+                    item_ref_text = item.ref_text if item.ref_text else ref_text
+                    if not item_ref_text:
+                        raise ValueError(
+                            "ref_text is required when voice_clone_prompt uses ICL mode."
+                        )
+                    ref_id = self.model._tokenize_texts(
+                        [self.model._build_ref_text(item_ref_text)]
+                    )[0]
+                    ref_ids.append(ref_id)
+                else:
+                    ref_ids.append(None)
+            return vcp, ref_ids, any(vcp["icl_mode"])
+        required_keys = ("ref_spk_embedding",)
+        missing = [k for k in required_keys if k not in voice_clone_prompt]
+        if missing:
+            raise ValueError(
+                f"voice_clone_prompt missing required keys: {missing}. "
+                f"Expected keys: {list(required_keys)}"
+            )
+        list_keys = ("ref_spk_embedding", "x_vector_only_mode", "icl_mode", "ref_code")
+        for key in list_keys:
+            if key not in voice_clone_prompt:
+                continue
+            value = voice_clone_prompt[key]
+            if not isinstance(value, list) or len(value) != len(input_ids):
+                raise ValueError(
+                    f"voice_clone_prompt[{key!r}] must be a list with length {len(input_ids)}"
+                )
+        xvec_modes = voice_clone_prompt.get("x_vector_only_mode", [True] * len(input_ids))
+        if "icl_mode" in voice_clone_prompt:
+            icl_modes = [bool(v) for v in voice_clone_prompt["icl_mode"]]
+            for i, (xvec_mode, icl_mode) in enumerate(zip(xvec_modes, icl_modes)):
+                if bool(xvec_mode) == bool(icl_mode):
+                    raise ValueError(
+                        f"voice_clone_prompt has inconsistent mode flags at index {i}: "
+                        "x_vector_only_mode and icl_mode must be opposites"
+                    )
+        else:
+            icl_modes = [not bool(v) for v in xvec_modes]
+        ref_codes = voice_clone_prompt.get("ref_code", [None] * len(input_ids))
+        for i, (xvec_mode, icl_mode, ref_code) in enumerate(zip(xvec_modes, icl_modes, ref_codes)):
+            if bool(xvec_mode) and ref_code is not None:
+                raise ValueError(
+                    f"voice_clone_prompt index {i}: ref_code must be None in x_vector_only mode"
+                )
+            if bool(icl_mode) and ref_code is None:
+                raise ValueError(
+                    f"voice_clone_prompt index {i}: ref_code is required in ICL mode"
+                )
+        vcp = dict(
+            ref_code=ref_codes,
+            ref_spk_embedding=voice_clone_prompt["ref_spk_embedding"],
+            x_vector_only_mode=[bool(v) for v in xvec_modes],
+            icl_mode=[bool(v) for v in icl_modes],
+        )
+        using_icl_mode = any(vcp["icl_mode"])
+        if using_icl_mode:
+            if not ref_text:
+                raise ValueError(
+                    "ref_text is required when voice_clone_prompt uses ICL mode."
+                )
+            ref_texts = [self.model._build_ref_text(ref_text)]
+            # NOTE: single ref_text is shared across all ICL items in the batch.
+            ref_id = self.model._tokenize_texts(ref_texts)[0]
+            ref_ids = [ref_id if is_icl else None for is_icl in vcp["icl_mode"]]
+        else:
+            ref_ids = [None] * len(input_ids)
+        return vcp, ref_ids, using_icl_mode
+    def _resolve_voice_clone_prompt_from_reference(
+        self,
+        input_ids,
+        ref_audio: Union[str, Path, tuple],
+        ref_text: str,
+        xvec_only: bool,
+        append_silence: bool,
+    ) -> Tuple[Dict[str, Any], list, bool]:
+        using_icl_mode = not xvec_only
+        cache_key = (str(ref_audio), ref_text, xvec_only, append_silence)
+        if cache_key in self._voice_prompt_cache:
+            vcp, ref_ids = self._voice_prompt_cache[cache_key]
+            return vcp, ref_ids, using_icl_mode
+        if xvec_only:
+            prompt_items = self.model.create_voice_clone_prompt(
+                ref_audio=str(ref_audio),
+                ref_text="",
+                x_vector_only_mode=True,
+            )
+            spk_emb = prompt_items[0].ref_spk_embedding
+            vcp = dict(
+                ref_code=[None],
+                ref_spk_embedding=[spk_emb],
+                x_vector_only_mode=[True],
+                icl_mode=[False],
+            )
+            ref_ids = [None] * len(input_ids)
+            self._voice_prompt_cache[cache_key] = (vcp, ref_ids)
+            return vcp, ref_ids, using_icl_mode
+        silence_secs = 0.5 if append_silence else 0.0
+        ref_audio_input = self._load_ref_audio_with_silence(ref_audio, silence_secs=silence_secs)
+        prompt_items = self.model.create_voice_clone_prompt(
+            ref_audio=ref_audio_input,
+            ref_text=ref_text
+        )
+        vcp = self.model._prompt_items_to_voice_clone_prompt(prompt_items)
+        ref_ids = []
+        rt = prompt_items[0].ref_text
+        if rt:
+            ref_texts = [self.model._build_ref_text(rt)]
+            ref_ids.append(self.model._tokenize_texts(ref_texts)[0])
+        else:
+            ref_ids.append(None)
+        self._voice_prompt_cache[cache_key] = (vcp, ref_ids)
+        return vcp, ref_ids, using_icl_mode
+    def _prepare_generation(
+        self,
+        text: str,
+        ref_audio: Optional[Union[str, Path, tuple]] = None,
+        ref_text: str = "",
+        language: str = "English",
+        xvec_only: bool = False,
+        non_streaming_mode: bool = False,
+        append_silence: bool = True,
+        voice_clone_prompt: Optional[Union[Dict[str, Any], List[Any]]] = None,
+        instruct: Optional[str] = None,
+    ):
+        """Prepare inputs for generation (shared by streaming and non-streaming).
+        Args:
+            xvec_only: When True, use only the speaker embedding (x-vector) for voice
+                cloning instead of the full ICL acoustic prompt. This prevents the model from
+                continuing the reference audio's last phoneme and allows natural language switching.
+                Default False to match upstream ICL behavior, where the full reference
+                audio codec tokens are included in context.
+            voice_clone_prompt: Optional precomputed prompt dict from
+                `create_voice_clone_prompt`/`_prompt_items_to_voice_clone_prompt`.
+                When provided, `xvec_only` is ignored. This path supports both:
+                x-vector-only prompts (`ref_spk_embedding` only) and ICL prompts
+                (`ref_spk_embedding` + `ref_code` + mode flags). `ref_text` is ignored
+                for x-vector-only and required for ICL.
+            instruct: Optional instruction string to guide generation style/language (e.g.
+                "请用纯正广东话朗读"). Prepended as a user turn before the assistant TTS turn.
+        """
+        input_texts = [self.model._build_assistant_text(text)]
+        input_ids = self.model._tokenize_texts(input_texts)
+        instruct_ids = [None]
+        if instruct:
+            instruct_ids = [self.model._tokenize_texts([self.model._build_instruct_text(instruct)])[0]]
+        vcp, ref_ids, using_icl_mode = self._resolve_voice_clone_prompt(
+            input_ids=input_ids,
+            ref_audio=ref_audio,
+            ref_text=ref_text,
+            xvec_only=xvec_only,
+            append_silence=append_silence,
+            voice_clone_prompt=voice_clone_prompt,
+        )
+        if instruct and not using_icl_mode:
+            logger.warning(
+                "Base-model instruct with x-vector-only voice cloning is experimental. "
+                "Upstream Qwen3-TTS itself does not follow instructions reliably in this "
+                "mode. Prefer xvec_only=False (ICL mode) when using instruct for voice "
+                "cloning."
+            )
+        m = self.model.model
+        tie, tam, tth, tpe = self._build_talker_inputs_local(
+            m=m,
+            input_ids=input_ids,
+            ref_ids=ref_ids,
+            voice_clone_prompt=vcp,
+            languages=[language] if language is not None else ["Auto"],
+            speakers=None,
+            non_streaming_mode=non_streaming_mode,
+            instruct_ids=instruct_ids,
+        )
+        if not self._warmed_up:
+            self._warmup(tie.shape[1])
+        talker = m.talker
+        config = m.config.talker_config
+        talker.rope_deltas = None
+        # For ICL mode: return ref_codes so the decoder can use them as acoustic context
+        ref_codes = None
+        if using_icl_mode and vcp.get("ref_code") and vcp["ref_code"][0] is not None:
+            ref_codes = vcp["ref_code"][0]
+        return m, talker, config, tie, tam, tth, tpe, ref_codes
+    def _prepare_generation_custom(
+        self,
+        text: str,
+        language: str,
+        speaker: Optional[str],
+        instruct: Optional[str] = None,
+        non_streaming_mode: bool = True,
+    ):
+        input_texts = [self.model._build_assistant_text(text)]
+        input_ids = self.model._tokenize_texts(input_texts)
+        instruct_ids = []
+        if instruct is None or instruct == "":
+            instruct_ids.append(None)
+        else:
+            instruct_ids.append(self.model._tokenize_texts([self.model._build_instruct_text(instruct)])[0])
+        m = self.model.model
+        tie, tam, tth, tpe = self._build_talker_inputs_local(
+            m=m,
+            input_ids=input_ids,
+            ref_ids=[None],
+            voice_clone_prompt=None,
+            languages=[language] if language is not None else ["Auto"],
+            speakers=[speaker],
+            non_streaming_mode=non_streaming_mode,
+            instruct_ids=instruct_ids,
+        )
+        if not self._warmed_up:
+            self._warmup(tie.shape[1])
+        talker = m.talker
+        config = m.config.talker_config
+        talker.rope_deltas = None
+        return m, talker, config, tie, tam, tth, tpe
+    def _build_talker_inputs_local(
+        self,
+        m,
+        input_ids,
+        ref_ids,
+        voice_clone_prompt,
+        languages,
+        speakers,
+        non_streaming_mode: bool,
+        instruct_ids=None,
+    ):
+        """Build batched talker prefill inputs (local copy of the upstream qwen-tts logic).
+        For each sample the pieces are concatenated along the sequence axis in this order:
+        optional projected instruct tokens, the projected first three text tokens, a codec
+        control prefix (think/no-think ids, optional language id, optional speaker embedding,
+        codec pad) summed with TTS pad/BOS text embeddings, and finally either the ICL prompt
+        from `m.generate_icl_prompt` or the target-text embeddings.
+        Args:
+            m: Model object exposing `talker`, `config`, `generate_speaker_prompt` and
+                `generate_icl_prompt`.
+            input_ids: Per-sample token-id tensors, sliced as `[:, a:b]`
+                (assumes shape (1, seq) -- TODO confirm against the tokenizer helper).
+            ref_ids: Per-sample reference-text token ids; only read in the ICL branch.
+            voice_clone_prompt: Optional dict. Keys read here: "x_vector_only_mode",
+                "icl_mode" and "ref_code".
+            languages: Per-sample language names; "auto" (case-insensitive) selects no
+                language id.
+            speakers: Per-sample speaker names, or None to use no named speaker.
+            non_streaming_mode: In the non-ICL branch, True places the whole target text
+                in the prefill; False leaves the remaining text in the trailing hiddens.
+            instruct_ids: Optional per-sample instruct token ids; None entries are skipped.
+        Returns:
+            Tuple `(talker_input_embeds, talker_attention_mask, trailing_text_hiddens,
+            tts_pad_embed)`. Input embeddings are left-padded with zeros, the mask is 1 on
+            real positions, and trailing hiddens are right-padded with the TTS pad embedding.
+        Raises:
+            NotImplementedError: If a speaker or language is missing from the talker config.
+            AssertionError: If a language entry is None.
+        """
+        # One list of embedding pieces per sample; they are concatenated after the loop.
+        talker_input_embeds = [[] for _ in range(len(input_ids))]
+        voice_clone_spk_embeds = None
+        if voice_clone_prompt is not None:
+            voice_clone_spk_embeds = m.generate_speaker_prompt(voice_clone_prompt)
+        if instruct_ids is not None:
+            for index, instruct_id in enumerate(instruct_ids):
+                if instruct_id is not None:
+                    # Projected instruct tokens come first in the sample's sequence.
+                    talker_input_embeds[index].append(
+                        m.talker.text_projection(m.talker.get_text_embeddings()(instruct_id))
+                    )
+        if speakers is None:
+            speakers = [None] * len(input_ids)
+        trailing_text_hiddens = []
+        tts_pad_embed = None
+        for index, (input_id, language, speaker) in enumerate(zip(input_ids, languages, speakers)):
+            # --- Speaker embedding: named speaker lookup, or the voice-clone embedding. ---
+            if voice_clone_spk_embeds is None:
+                if speaker == "" or speaker is None:
+                    speaker_embed = None
+                else:
+                    if speaker.lower() not in m.config.talker_config.spk_id:
+                        raise NotImplementedError(f"Speaker {speaker} not implemented")
+                    spk_id = m.config.talker_config.spk_id[speaker.lower()]
+                    speaker_embed = m.talker.get_input_embeddings()(
+                        torch.tensor(spk_id, device=m.talker.device, dtype=input_id.dtype)
+                    )
+            else:
+                # The clone embedding is only used when one of the two mode flags is set.
+                if voice_clone_prompt["x_vector_only_mode"][index] or voice_clone_prompt["icl_mode"][index]:
+                    speaker_embed = voice_clone_spk_embeds[index]
+                else:
+                    speaker_embed = None
+            # --- Language id: None for "auto", otherwise looked up in the talker config. ---
+            assert language is not None
+            if language.lower() == "auto":
+                language_id = None
+            else:
+                if language.lower() not in m.config.talker_config.codec_language_id:
+                    raise NotImplementedError(f"Language {language} not implemented")
+                language_id = m.config.talker_config.codec_language_id[language.lower()]
+            # A speaker with a truthy `spk_is_dialect` entry overrides the language id with
+            # the dialect's id when the requested language is Chinese or auto.
+            if (
+                language.lower() in ["chinese", "auto"]
+                and speaker not in ("", None)
+                and m.config.talker_config.spk_is_dialect[speaker.lower()]
+            ):
+                dialect = m.config.talker_config.spk_is_dialect[speaker.lower()]
+                language_id = m.config.talker_config.codec_language_id[dialect]
+            # Projected text embeddings of the TTS BOS / EOS / PAD tokens, split per token.
+            tts_bos_embed, tts_eos_embed, tts_pad_embed = m.talker.text_projection(
+                m.talker.get_text_embeddings()(
+                    torch.tensor(
+                        [[m.config.tts_bos_token_id, m.config.tts_eos_token_id, m.config.tts_pad_token_id]],
+                        device=m.talker.device,
+                        dtype=input_id.dtype,
+                    )
+                )
+            ).chunk(3, dim=1)
+            # Codec control prefix: "no-think" variant without a language, "think" variant
+            # with the language id placed between think-BOS and think-EOS.
+            if language_id is None:
+                codec_prefill_list = [[
+                    m.config.talker_config.codec_nothink_id,
+                    m.config.talker_config.codec_think_bos_id,
+                    m.config.talker_config.codec_think_eos_id,
+                ]]
+            else:
+                codec_prefill_list = [[
+                    m.config.talker_config.codec_think_id,
+                    m.config.talker_config.codec_think_bos_id,
+                    language_id,
+                    m.config.talker_config.codec_think_eos_id,
+                ]]
+            codec_input_emebdding_0 = m.talker.get_input_embeddings()(
+                torch.tensor(codec_prefill_list, device=m.talker.device, dtype=input_id.dtype)
+            )
+            # Tail of the codec prefix: codec PAD followed by codec BOS.
+            codec_input_emebdding_1 = m.talker.get_input_embeddings()(
+                torch.tensor(
+                    [[m.config.talker_config.codec_pad_id, m.config.talker_config.codec_bos_id]],
+                    device=m.talker.device,
+                    dtype=input_id.dtype,
+                )
+            )
+            # The speaker embedding, when present, sits between the control prefix and PAD/BOS.
+            if speaker_embed is None:
+                codec_input_emebdding = torch.cat([codec_input_emebdding_0, codec_input_emebdding_1], dim=1)
+            else:
+                codec_input_emebdding = torch.cat([codec_input_emebdding_0, speaker_embed.view(1, 1, -1), codec_input_emebdding_1], dim=1)
+            # First three text tokens (presumably the chat-role prefix -- confirm with the template).
+            _talker_input_embed_role = m.talker.text_projection(
+                m.talker.get_text_embeddings()(input_id[:, :3])
+            )
+            # Text side is PAD repeated then BOS; it is summed with every codec position
+            # except the last one (codec BOS), which is consumed further below.
+            _talker_input_embed = torch.cat(
+                (
+                    tts_pad_embed.expand(-1, codec_input_emebdding.shape[1] - 2, -1),
+                    tts_bos_embed,
+                ),
+                dim=1,
+            ) + codec_input_emebdding[:, :-1]
+            talker_input_embed = torch.cat((_talker_input_embed_role, _talker_input_embed), dim=1)
+            if (
+                voice_clone_prompt is not None
+                and voice_clone_prompt.get("ref_code", None) is not None
+                and voice_clone_prompt["icl_mode"][index]
+            ):
+                # ICL branch: the model builds the prompt from target text, reference text
+                # and reference codes. The 3:-5 / 3:-2 slices presumably strip template
+                # tokens around the text -- TODO confirm against the chat template.
+                icl_input_embed, trailing_text_hidden = m.generate_icl_prompt(
+                    text_id=input_id[:, 3:-5],
+                    ref_id=ref_ids[index][:, 3:-2],
+                    ref_code=voice_clone_prompt["ref_code"][index].to(m.talker.device).clone(),  # escape inference_mode context
+                    tts_pad_embed=tts_pad_embed,
+                    tts_eos_embed=tts_eos_embed,
+                    non_streaming_mode=non_streaming_mode,
+                )
+                talker_input_embed = torch.cat([talker_input_embed, icl_input_embed], dim=1)
+            else:
+                # Default layout: first text token summed with the last codec position (codec BOS).
+                talker_input_embed = torch.cat(
+                    [
+                        talker_input_embed,
+                        m.talker.text_projection(
+                            m.talker.get_text_embeddings()(input_id[:, 3:4])
+                        )
+                        + codec_input_emebdding[:, -1:],
+                    ],
+                    dim=1,
+                )
+                if non_streaming_mode:
+                    # Drop the position just added and instead prefill the whole text plus
+                    # EOS (each summed with codec PAD), then a final PAD + codec BOS position.
+                    talker_input_embed = talker_input_embed[:, :-1]
+                    talker_input_embed = torch.cat(
+                        [
+                            talker_input_embed,
+                            torch.cat(
+                                (
+                                    m.talker.text_projection(
+                                        m.talker.get_text_embeddings()(input_id[:, 3:-5])
+                                    ),
+                                    tts_eos_embed,
+                                ),
+                                dim=1,
+                            )
+                            + m.talker.get_input_embeddings()(
+                                torch.tensor(
+                                    [[m.config.talker_config.codec_pad_id] * (input_id[:, 3:-5].shape[1] + 1)],
+                                    device=m.talker.device,
+                                    dtype=input_id.dtype,
+                                )
+                            ),
+                            tts_pad_embed
+                            + m.talker.get_input_embeddings()(
+                                torch.tensor(
+                                    [[m.config.talker_config.codec_bos_id]],
+                                    device=m.talker.device,
+                                    dtype=input_id.dtype,
+                                )
+                            ),
+                        ],
+                        dim=1,
+                    )
+                    # All text is already in the prefill, so only PAD remains to be fed.
+                    trailing_text_hidden = tts_pad_embed
+                else:
+                    # Remaining text tokens (after the first) plus EOS are returned as trailing hiddens.
+                    trailing_text_hidden = torch.cat(
+                        (
+                            m.talker.text_projection(
+                                m.talker.get_text_embeddings()(input_id[:, 4:-5])
+                            ),
+                            tts_eos_embed,
+                        ),
+                        dim=1,
+                    )
+            talker_input_embeds[index].append(talker_input_embed)
+            trailing_text_hiddens.append(trailing_text_hidden)
+        # Join each sample's pieces along the sequence axis.
+        for index, talker_input_embed in enumerate(talker_input_embeds):
+            talker_input_embeds[index] = torch.cat([item for item in talker_input_embed if item is not None], dim=1)
+        # Left-pad the batch: reverse each sequence, right-pad with zeros, then flip back.
+        original_lengths = torch.tensor([t.shape[1] for t in talker_input_embeds])
+        sequences = [t.squeeze(0) for t in talker_input_embeds]
+        sequences_reversed = [t.flip(dims=[0]) for t in sequences]
+        padded_reversed = torch.nn.utils.rnn.pad_sequence(
+            sequences_reversed,
+            batch_first=True,
+            padding_value=0.0,
+        )
+        talker_input_embeds = padded_reversed.flip(dims=[1])
+        batch_size, max_len = talker_input_embeds.shape[0], talker_input_embeds.shape[1]
+        # Mask is 1 for positions at or after the per-sample pad count (the real tokens).
+        indices = torch.arange(max_len).expand(batch_size, -1)
+        num_pads = max_len - original_lengths
+        talker_attention_mask = (indices >= num_pads.unsqueeze(1)).long().to(talker_input_embeds.device)
+        # Trailing hiddens are right-padded; padded slots are then overwritten with the
+        # TTS pad embedding (taken from the last loop iteration) instead of zeros.
+        pad_embedding_vector = tts_pad_embed.squeeze()
+        sequences_to_pad = [t.squeeze(0) for t in trailing_text_hiddens]
+        trailing_text_original_lengths = [s.shape[0] for s in sequences_to_pad]
+        padded_hiddens = torch.nn.utils.rnn.pad_sequence(
+            sequences_to_pad,
+            batch_first=True,
+            padding_value=0.0,
+        )
+        arange_tensor = torch.arange(max(trailing_text_original_lengths), device=padded_hiddens.device).expand(
+            len(trailing_text_original_lengths), -1
+        )
+        lengths_tensor = torch.tensor(trailing_text_original_lengths, device=padded_hiddens.device).unsqueeze(1)
+        padding_mask = arange_tensor >= lengths_tensor
+        padded_hiddens[padding_mask] = pad_embedding_vector
+        trailing_text_hiddens = padded_hiddens
+        return talker_input_embeds, talker_attention_mask, trailing_text_hiddens, tts_pad_embed
+    @torch.inference_mode()
+    def generate_voice_clone(
+        self,
+        text: str,
+        language: str,
+        ref_audio: Optional[Union[str, Path, tuple]] = None,
+        ref_text: str = "",
+        max_new_tokens: int = 2048,
+        min_new_tokens: int = 2,
+        temperature: float = 0.9,
+        top_k: int = 50,
+        top_p: float = 1.0,
+        do_sample: bool = True,
+        repetition_penalty: float = 1.05,
+        xvec_only: bool = False,
+        non_streaming_mode: bool = False,
+        append_silence: bool = True,
+        instruct: Optional[str] = None,
+        voice_clone_prompt: Optional[Union[Dict[str, Any], List[Any]]] = None,
+    ) -> Tuple[list, int]:
+        """
+        Generate speech with voice cloning using reference audio.
+        Args:
+            text: Text to synthesize
+            language: Target language
+            ref_audio: Path to reference audio file. Required when `voice_clone_prompt` is not provided.
+            ref_text: Transcription of reference audio.
+            max_new_tokens: Maximum tokens to generate
+            min_new_tokens: Minimum tokens before EOS is allowed
+            temperature: Sampling temperature
+            top_k: Top-k sampling
+            top_p: Top-p (nucleus) sampling
+            do_sample: Whether to sample
+            repetition_penalty: Repetition penalty
+            xvec_only: When True, use only the speaker embedding for voice cloning.
+                This prevents phoneme bleed-through from the reference and allows clean
+                language switching. Default False to match upstream ICL behavior
+                (reference audio in context).
+            non_streaming_mode: Match upstream text-feeding layout. Default False to match
+                upstream step-by-step text feeding during decode.
+            voice_clone_prompt: Optional precomputed voice clone prompt dict. When provided,
+                `xvec_only` is ignored and prompt extraction from `ref_audio` is skipped.
+                This path supports x-vector-only prompts (`ref_spk_embedding` only)
+                and ICL prompts (`ref_spk_embedding` + `ref_code` + mode flags).
+                `ref_text` is ignored for x-vector-only and required for ICL.
+            instruct: Optional instruction to guide generation style/dialect (e.g.
+                "请用纯正广东话朗读"). Prepended as a user turn before the TTS assistant turn.
+                Experimental for x-vector-only voice cloning; prefer `xvec_only=False`.
+        Returns:
+            Tuple of ([audio_waveform], sample_rate)
+        """
+        from .generate import fast_generate
+        m, talker, config, tie, tam, tth, tpe, ref_codes = self._prepare_generation(
+            text=text,
+            language=language,
+            ref_audio=ref_audio,
+            ref_text=ref_text,
+            xvec_only=xvec_only,
+            non_streaming_mode=non_streaming_mode,
+            append_silence=append_silence,
+            voice_clone_prompt=voice_clone_prompt,
+            instruct=instruct,
+        )
+        codec_ids, timing = fast_generate(
+            talker=talker,
+            talker_input_embeds=tie,
+            attention_mask=tam,
+            trailing_text_hiddens=tth,
+            tts_pad_embed=tpe,
+            config=config,
+            predictor_graph=self.predictor_graph,
+            talker_graph=self.talker_graph,
+            max_new_tokens=max_new_tokens,
+            min_new_tokens=min_new_tokens,
+            temperature=temperature,
+            top_k=top_k,
+            top_p=top_p,
+            do_sample=do_sample,
+            repetition_penalty=repetition_penalty,
+        )
+        if codec_ids is None:
+            logger.warning("Generation returned no tokens")
+            return [np.zeros(1, dtype=np.float32)], self.sample_rate
+        # In ICL mode: prepend reference codes before decoding so the codec decoder
+        # has acoustic context from the reference audio (matches official implementation).
+        speech_tokenizer = m.speech_tokenizer
+        if ref_codes is not None:
+            ref_codes_dev = ref_codes.to(codec_ids.device)
+            codes_for_decode = torch.cat([ref_codes_dev, codec_ids], dim=0)
+        else:
+            codes_for_decode = codec_ids
+        audio_list, sr = speech_tokenizer.decode({"audio_codes": codes_for_decode.unsqueeze(0)})
+        # Convert to numpy and trim off the reference audio portion
+        ref_len = ref_codes.shape[0] if ref_codes is not None else 0
+        total_len = codes_for_decode.shape[0]
+        audio_arrays = []
+        for a in audio_list:
+            if hasattr(a, 'cpu'):  # torch tensor
+                a = a.flatten().cpu().numpy()
+            else:  # already numpy
+                a = a.flatten() if hasattr(a, 'flatten') else a
+            if ref_len > 0:
+                cut = int(ref_len / max(total_len, 1) * len(a))
+                a = a[cut:]
+            audio_arrays.append(a)
+        n_steps = timing['steps']
+        audio_duration = n_steps / 12.0  # 12 Hz codec
+        total_time = timing['prefill_ms']/1000 + timing['decode_s']
+        rtf = audio_duration / total_time if total_time > 0 else 0
+        logger.info(
+            f"Generated {audio_duration:.2f}s audio in {total_time:.2f}s "
+            f"({timing['ms_per_step']:.1f}ms/step, RTF: {rtf:.2f})"
+        )
+        return audio_arrays, sr
+    @torch.inference_mode()
+    def generate_voice_clone_streaming(
+        self,
+        text: str,
+        language: str,
+        ref_audio: Optional[Union[str, Path, tuple]] = None,
+        ref_text: str = "",
+        max_new_tokens: int = 2048,
+        min_new_tokens: int = 2,
+        temperature: float = 0.9,
+        top_k: int = 50,
+        top_p: float = 1.0,
+        do_sample: bool = True,
+        repetition_penalty: float = 1.05,
+        chunk_size: int = 12,
+        xvec_only: bool = False,
+        non_streaming_mode: bool = False,
+        append_silence: bool = True,
+        parity_mode: bool = False,
+        instruct: Optional[str] = None,
+        voice_clone_prompt: Optional[Union[Dict[str, Any], List[Any]]] = None,
+    ) -> Generator[Tuple[np.ndarray, int, dict], None, None]:
+        """
+        Stream voice-cloned speech generation, yielding audio chunks.
+        Same as generate_voice_clone() but yields (audio_chunk, sample_rate, timing)
+        tuples every chunk_size codec steps (~chunk_size/12 seconds of audio).
+        Args:
+            text: Text to synthesize
+            language: Target language
+            ref_audio: Reference audio. Required when `voice_clone_prompt` is not provided.
+            ref_text: Transcription of reference audio.
+            max_new_tokens: Maximum tokens to generate
+            min_new_tokens: Minimum tokens before EOS is allowed
+            temperature: Sampling temperature
+            top_k: Top-k sampling
+            top_p: Top-p (nucleus) sampling
+            do_sample: Whether to sample
+            repetition_penalty: Repetition penalty
+            chunk_size: Codec steps per chunk (12 = ~1 second)
+            xvec_only: When True, use only the speaker embedding for voice cloning.
+                This prevents phoneme bleed-through from the reference and allows clean
+                language switching. Default False to match upstream ICL behavior
+                (reference audio in context).
+            non_streaming_mode: Default False to match upstream text feeding during decode.
+                Set to True to prefill the full target text before streaming decode.
+            append_silence: Forwarded unchanged to `_prepare_generation`.
+            parity_mode: When True, disables CUDA graphs and uses dynamic cache streaming.
+            voice_clone_prompt: Optional precomputed voice clone prompt dict. When provided,
+                `xvec_only` is ignored and prompt extraction from `ref_audio` is skipped.
+                This path supports x-vector-only prompts (`ref_spk_embedding` only)
+                and ICL prompts (`ref_spk_embedding` + `ref_code` + mode flags).
+                `ref_text` is ignored for x-vector-only and required for ICL.
+            instruct: Optional instruction to guide generation style/dialect (e.g.
+                "请用纯正广东话朗读", i.e. "read aloud in pure Cantonese"). Prepended as a
+                user turn before the TTS assistant turn.
+                Experimental for x-vector-only voice cloning; prefer `xvec_only=False`.
+        Yields:
+            Tuple of (audio_chunk_numpy, sample_rate, timing_dict)
+        """
+        from .streaming import fast_generate_streaming, parity_generate_streaming
+        m, talker, config, tie, tam, tth, tpe, ref_codes = self._prepare_generation(
+            text=text,
+            language=language,
+            ref_audio=ref_audio,
+            ref_text=ref_text,
+            xvec_only=xvec_only,
+            non_streaming_mode=non_streaming_mode,
+            append_silence=append_silence,
+            voice_clone_prompt=voice_clone_prompt,
+            instruct=instruct,
+        )
+        speech_tokenizer = m.speech_tokenizer
+        # Hybrid decode strategy:
+        # 1. Accumulated decode for early chunks (correct, calibrates samples_per_frame)
+        # 2. Sliding window with 25-frame left context once calibrated (constant cost)
+        # This avoids boundary artifacts (pops) while keeping decode cost bounded.
+        context_frames = 25
+        # Calibration needs at least one full context window (or one chunk, if larger).
+        min_calibration_frames = max(context_frames, chunk_size)
+        all_codes = []
+        prev_gen_audio_len = 0  # tracks position within the generated (non-ref) audio
+        samples_per_frame = None  # stays None until phase 1 has calibrated it
+        stream_fn = parity_generate_streaming if parity_mode else fast_generate_streaming
+        stream_kwargs = dict(
+            talker=talker,
+            talker_input_embeds=tie,
+            attention_mask=tam,
+            trailing_text_hiddens=tth,
+            tts_pad_embed=tpe,
+            config=config,
+            max_new_tokens=max_new_tokens,
+            min_new_tokens=min_new_tokens,
+            temperature=temperature,
+            top_k=top_k,
+            top_p=top_p,
+            do_sample=do_sample,
+            repetition_penalty=repetition_penalty,
+            chunk_size=chunk_size,
+        )
+        # The graph objects are only passed to the fast (non-parity) streaming function.
+        if not parity_mode:
+            stream_kwargs["predictor_graph"] = self.predictor_graph
+            stream_kwargs["talker_graph"] = self.talker_graph
+        for codec_chunk, timing in stream_fn(**stream_kwargs):
+            all_codes.append(codec_chunk)
+            n_new = codec_chunk.shape[0]
+            all_flat = torch.cat(all_codes, dim=0)
+            n_total = all_flat.shape[0]
+            if samples_per_frame is None:
+                # Phase 1: accumulated decode until we can calibrate.
+                # In ICL mode prepend reference codes so the codec decoder has acoustic
+                # context from the reference audio (matches official implementation).
+                if ref_codes is not None:
+                    codes_input = torch.cat([ref_codes.to(all_flat.device), all_flat], dim=0)
+                else:
+                    codes_input = all_flat
+                audio_list, sr = speech_tokenizer.decode(
+                    {"audio_codes": codes_input.unsqueeze(0)}
+                )
+                audio = audio_list[0]
+                # Normalize the decoder output to a flat numpy array.
+                if hasattr(audio, 'cpu'):
+                    audio = audio.flatten().cpu().numpy()
+                else:
+                    audio = audio.flatten() if hasattr(audio, 'flatten') else audio
+                # Separate out reference audio portion; track position in generated audio only
+                if ref_codes is not None:
+                    ref_len = ref_codes.shape[0]
+                    total_len = codes_input.shape[0]
+                    # Cut point is proportional to the reference's share of the decoded frames.
+                    ref_audio_cut = int(ref_len / max(total_len, 1) * len(audio))
+                    gen_audio = audio[ref_audio_cut:]
+                else:
+                    gen_audio = audio
+                # Emit only the samples that were not yielded by earlier iterations.
+                new_audio = gen_audio[prev_gen_audio_len:]
+                prev_gen_audio_len = len(gen_audio)
+                if n_total >= min_calibration_frames:
+                    # Enough frames decoded: fix the samples-per-frame ratio and switch to phase 2.
+                    samples_per_frame = len(gen_audio) / n_total
+            else:
+                # Phase 2: sliding window with left context
+                ctx_start = max(0, n_total - n_new - context_frames)
+                window = all_flat[ctx_start:]
+                n_ctx = window.shape[0] - n_new
+                audio_list, sr = speech_tokenizer.decode(
+                    {"audio_codes": window.unsqueeze(0)}
+                )
+                audio = audio_list[0]
+                if hasattr(audio, 'cpu'):
+                    audio = audio.flatten().cpu().numpy()
+                else:
+                    audio = audio.flatten() if hasattr(audio, 'flatten') else audio
+                if n_ctx > 0:
+                    # Skip the samples produced by the context frames; keep only the new chunk.
+                    ctx_samples = int(round(n_ctx * samples_per_frame))
+                    new_audio = audio[ctx_samples:]
+                else:
+                    new_audio = audio
+            yield new_audio, sr, timing
+    @torch.inference_mode()
+    def generate_custom_voice(
+        self,
+        text: str,
+        speaker: str,
+        language: str,
+        instruct: Optional[str] = None,
+        non_streaming_mode: bool = True,
+        max_new_tokens: int = 2048,
+        min_new_tokens: int = 2,
+        temperature: float = 0.9,
+        top_k: int = 50,
+        top_p: float = 1.0,
+        do_sample: bool = True,
+        repetition_penalty: float = 1.05,
+    ) -> Tuple[list, int]:
+        if self.model.model.tts_model_type != "custom_voice":
+            raise ValueError("Loaded model does not support custom voice generation")
+        self.model._validate_languages([language])
+        self.model._validate_speakers([speaker])
+        if self.model.model.tts_model_size in "0b6":
+            instruct = None
+        from .generate import fast_generate
+        m, talker, config, tie, tam, tth, tpe = self._prepare_generation_custom(
+            text=text,
+            language=language,
+            speaker=speaker,
+            instruct=instruct,
+            non_streaming_mode=non_streaming_mode,
+        )
+        codec_ids, timing = fast_generate(
+            talker=talker,
+            talker_input_embeds=tie,
+            attention_mask=tam,
+            trailing_text_hiddens=tth,
+            tts_pad_embed=tpe,
+            config=config,
+            predictor_graph=self.predictor_graph,
+            talker_graph=self.talker_graph,
+            max_new_tokens=max_new_tokens,
+            min_new_tokens=min_new_tokens,
+            temperature=temperature,
+            top_k=top_k,
+            top_p=top_p,
+            do_sample=do_sample,
+            repetition_penalty=repetition_penalty,
+        )
+        if codec_ids is None:
+            logger.warning("Generation returned no tokens")
+            return [np.zeros(1, dtype=np.float32)], self.sample_rate
+        speech_tokenizer = m.speech_tokenizer
+        audio_list, sr = speech_tokenizer.decode({"audio_codes": codec_ids.unsqueeze(0)})
+        audio_arrays = []
+        for a in audio_list:
+            if hasattr(a, "cpu"):
+                audio_arrays.append(a.flatten().cpu().numpy())
+            else:
+                audio_arrays.append(a.flatten() if hasattr(a, "flatten") else a)
+        n_steps = timing["steps"]
+        audio_duration = n_steps / 12.0
+        total_time = timing["prefill_ms"] / 1000 + timing["decode_s"]
+        rtf = audio_duration / total_time if total_time > 0 else 0
+        logger.info(
+            f"Generated {audio_duration:.2f}s audio in {total_time:.2f}s "
+            f"({timing['ms_per_step']:.1f}ms/step, RTF: {rtf:.2f})"
+        )
+        return audio_arrays, sr
+    @torch.inference_mode()
+    def generate_custom_voice_streaming(
+        self,
+        text: str,
+        speaker: str,
+        language: str,
+        instruct: Optional[str] = None,
+        non_streaming_mode: bool = True,
+        max_new_tokens: int = 2048,
+        min_new_tokens: int = 2,
+        temperature: float = 0.9,
+        top_k: int = 50,
+        top_p: float = 1.0,
+        do_sample: bool = True,
+        repetition_penalty: float = 1.05,
+        chunk_size: int = 12,
+    ) -> Generator[Tuple[np.ndarray, int, dict], None, None]:
+        if self.model.model.tts_model_type != "custom_voice":
+            raise ValueError("Loaded model does not support custom voice generation")
+        self.model._validate_languages([language])
+        self.model._validate_speakers([speaker])
+        if self.model.model.tts_model_size in "0b6":
+            instruct = None
+        from .streaming import fast_generate_streaming
+        m, talker, config, tie, tam, tth, tpe = self._prepare_generation_custom(
+            text=text,
+            language=language,
+            speaker=speaker,
+            instruct=instruct,
+            non_streaming_mode=non_streaming_mode,
+        )
+        speech_tokenizer = m.speech_tokenizer
+        context_frames = 25
+        min_calibration_frames = max(context_frames, chunk_size)
+        all_codes = []
+        prev_audio_len = 0
+        samples_per_frame = None
+        for codec_chunk, timing in fast_generate_streaming(
+            talker=talker,
+            talker_input_embeds=tie,
+            attention_mask=tam,
+            trailing_text_hiddens=tth,
+            tts_pad_embed=tpe,
+            config=config,
+            predictor_graph=self.predictor_graph,
+            talker_graph=self.talker_graph,
+            max_new_tokens=max_new_tokens,
+            min_new_tokens=min_new_tokens,
+            temperature=temperature,
+            top_k=top_k,
+            top_p=top_p,
+            do_sample=do_sample,
+            repetition_penalty=repetition_penalty,
+            chunk_size=chunk_size,
+        ):
+            all_codes.append(codec_chunk)
+            n_new = codec_chunk.shape[0]
+            all_flat = torch.cat(all_codes, dim=0)
+            n_total = all_flat.shape[0]
+            if samples_per_frame is None:
+                audio_list, sr = speech_tokenizer.decode({"audio_codes": all_flat.unsqueeze(0)})
+                audio = audio_list[0]
+                if hasattr(audio, "cpu"):
+                    audio = audio.flatten().cpu().numpy()
+                else:
+                    audio = audio.flatten() if hasattr(audio, "flatten") else audio
+                new_audio = audio[prev_audio_len:]
+                prev_audio_len = len(audio)
+                if n_total >= min_calibration_frames:
+                    samples_per_frame = len(audio) / n_total
+            else:
+                ctx_start = max(0, n_total - n_new - context_frames)
+                window = all_flat[ctx_start:]
+                n_ctx = window.shape[0] - n_new
+                audio_list, sr = speech_tokenizer.decode({"audio_codes": window.unsqueeze(0)})
+                audio = audio_list[0]
+                if hasattr(audio, "cpu"):
+                    audio = audio.flatten().cpu().numpy()
+                else:
+                    audio = audio.flatten() if hasattr(audio, "flatten") else audio
+                if n_ctx > 0:
+                    ctx_samples = int(round(n_ctx * samples_per_frame))
+                    new_audio = audio[ctx_samples:]
+                else:
+                    new_audio = audio
+            yield new_audio, sr, timing
+    @torch.inference_mode()
+    def generate_voice_design(
+        self,
+        text: str,
+        instruct: str,
+        language: str,
+        non_streaming_mode: bool = True,
+        max_new_tokens: int = 2048,
+        min_new_tokens: int = 2,
+        temperature: float = 0.9,
+        top_k: int = 50,
+        top_p: float = 1.0,
+        do_sample: bool = True,
+        repetition_penalty: float = 1.05,
+    ) -> Tuple[list, int]:
+        if self.model.model.tts_model_type != "voice_design":
+            raise ValueError("Loaded model does not support voice design generation")
+        self.model._validate_languages([language])
+        from .generate import fast_generate
+        m, talker, config, tie, tam, tth, tpe = self._prepare_generation_custom(
+            text=text,
+            language=language,
+            speaker=None,
+            instruct=instruct,
+            non_streaming_mode=non_streaming_mode,
+        )
+        codec_ids, timing = fast_generate(
+            talker=talker,
+            talker_input_embeds=tie,
+            attention_mask=tam,
+            trailing_text_hiddens=tth,
+            tts_pad_embed=tpe,
+            config=config,
+            predictor_graph=self.predictor_graph,
+            talker_graph=self.talker_graph,
+            max_new_tokens=max_new_tokens,
+            min_new_tokens=min_new_tokens,
+            temperature=temperature,
+            top_k=top_k,
+            top_p=top_p,
+            do_sample=do_sample,
+            repetition_penalty=repetition_penalty,
+        )
+        if codec_ids is None:
+            logger.warning("Generation returned no tokens")
+            return [np.zeros(1, dtype=np.float32)], self.sample_rate
+        speech_tokenizer = m.speech_tokenizer
+        audio_list, sr = speech_tokenizer.decode({"audio_codes": codec_ids.unsqueeze(0)})
+        audio_arrays = []
+        for a in audio_list:
+            if hasattr(a, "cpu"):
+                audio_arrays.append(a.flatten().cpu().numpy())
+            else:
+                audio_arrays.append(a.flatten() if hasattr(a, "flatten") else a)
+        n_steps = timing["steps"]
+        audio_duration = n_steps / 12.0
+        total_time = timing["prefill_ms"] / 1000 + timing["decode_s"]
+        rtf = audio_duration / total_time if total_time > 0 else 0
+        logger.info(
+            f"Generated {audio_duration:.2f}s audio in {total_time:.2f}s "
+            f"({timing['ms_per_step']:.1f}ms/step, RTF: {rtf:.2f})"
+        )
+        return audio_arrays, sr
+    @torch.inference_mode()
+    def generate_voice_design_streaming(
+        self,
+        text: str,
+        instruct: str,
+        language: str,
+        non_streaming_mode: bool = True,
+        max_new_tokens: int = 2048,
+        min_new_tokens: int = 2,
+        temperature: float = 0.9,
+        top_k: int = 50,
+        top_p: float = 1.0,
+        do_sample: bool = True,
+        repetition_penalty: float = 1.05,
+        chunk_size: int = 12,
+    ) -> Generator[Tuple[np.ndarray, int, dict], None, None]:
+        if self.model.model.tts_model_type != "voice_design":
+            raise ValueError("Loaded model does not support voice design generation")
+        self.model._validate_languages([language])
+        from .streaming import fast_generate_streaming
+        m, talker, config, tie, tam, tth, tpe = self._prepare_generation_custom(
+            text=text,
+            language=language,
+            speaker=None,
+            instruct=instruct,
+            non_streaming_mode=non_streaming_mode,
+        )
+        speech_tokenizer = m.speech_tokenizer
+        context_frames = 25
+        min_calibration_frames = max(context_frames, chunk_size)
+        all_codes = []
+        prev_audio_len = 0
+        samples_per_frame = None
+        for codec_chunk, timing in fast_generate_streaming(
+            talker=talker,
+            talker_input_embeds=tie,
+            attention_mask=tam,
+            trailing_text_hiddens=tth,
+            tts_pad_embed=tpe,
+            config=config,
+            predictor_graph=self.predictor_graph,
+            talker_graph=self.talker_graph,
+            max_new_tokens=max_new_tokens,
+            min_new_tokens=min_new_tokens,
+            temperature=temperature,
+            top_k=top_k,
+            top_p=top_p,
+            do_sample=do_sample,
+            repetition_penalty=repetition_penalty,
+            chunk_size=chunk_size,
+        ):
+            all_codes.append(codec_chunk)
+            n_new = codec_chunk.shape[0]
+            all_flat = torch.cat(all_codes, dim=0)
+            n_total = all_flat.shape[0]
+            if samples_per_frame is None:
+                audio_list, sr = speech_tokenizer.decode({"audio_codes": all_flat.unsqueeze(0)})
+                audio = audio_list[0]
+                if hasattr(audio, "cpu"):
+                    audio = audio.flatten().cpu().numpy()
+                else:
+                    audio = audio.flatten() if hasattr(audio, "flatten") else audio
+                new_audio = audio[prev_audio_len:]
+                prev_audio_len = len(audio)
+                if n_total >= min_calibration_frames:
+                    samples_per_frame = len(audio) / n_total
+            else:
+                ctx_start = max(0, n_total - n_new - context_frames)
+                window = all_flat[ctx_start:]
+                n_ctx = window.shape[0] - n_new
+                audio_list, sr = speech_tokenizer.decode({"audio_codes": window.unsqueeze(0)})
+                audio = audio_list[0]
+                if hasattr(audio, "cpu"):
+                    audio = audio.flatten().cpu().numpy()
+                else:
+                    audio = audio.flatten() if hasattr(audio, "flatten") else audio
+                if n_ctx > 0:
+                    ctx_samples = int(round(n_ctx * samples_per_frame))
+                    new_audio = audio[ctx_samples:]
+                else:
+                    new_audio = audio
+            yield new_audio, sr, timing

faster_qwen3_tts/predictor_graph.py ADDED Viewed

	@@ -0,0 +1,214 @@

+#!/usr/bin/env python3
+"""
+CUDA graph capture for the code predictor's 15-step decode loop,
+using transformers StaticCache.
+The predictor generates 15 codebooks autoregressively:
+- Step 0: prefill with 2 tokens (past_hidden + first_codebook_embed), get logits[0]
+- Steps 1-14: decode 1 token at a time using previous codebook token's embedding
+Strategy:
+- Use transformers StaticCache for KV cache management
+- Use the predictor's inner model forward (handles mask, RoPE, attention internally)
+- Unroll the full 15-step loop for deterministic shapes
+- Capture the entire loop as a single CUDA graph
+"""
+import torch
+from transformers import StaticCache
+from transformers.masking_utils import create_causal_mask, create_sliding_window_causal_mask
+from .sampling import sample_logits
+class PredictorGraph:
+    """
+    Captures the full predictor 15-step loop as a CUDA graph,
+    using the model's forward with transformers StaticCache.
+    Usage:
+        mpg = PredictorGraph(code_predictor, pred_config, talker_hidden_size)
+        mpg.capture()
+        codebook_tokens = mpg.run(pred_input)  # pred_input: [1, 2, H]
+    """
+    def __init__(self, code_predictor, pred_config, talker_hidden_size, device='cuda', dtype=torch.bfloat16,
+                 do_sample=True, top_k=50, top_p=1.0, temperature=0.9):
+        self.device = device
+        device_index = torch.device(device).index
+        device_index = device_index if device_index is not None else torch.cuda.current_device()
+        self.device_index = device_index
+        self.dtype = dtype
+        self.num_layers = pred_config.num_hidden_layers
+        self.hidden_size = pred_config.hidden_size
+        self.num_code_groups = pred_config.num_code_groups
+        self.num_codebooks = self.num_code_groups - 1  # 15
+        self.max_seq = 2 + self.num_codebooks  # 17
+        self.do_sample = do_sample
+        self.top_k = top_k
+        self.top_p = top_p
+        self.temperature = temperature
+        # Extract model components (references, not copies)
+        cp = code_predictor
+        self.small_to_mtp = cp.small_to_mtp_projection
+        self.pred_model = cp.model  # Inner transformer model (5 layers)
+        self.lm_heads = cp.lm_head  # ModuleList[15]
+        self.codec_embeds = cp.model.codec_embedding  # ModuleList[15]
+        self.has_sliding_layers = "sliding_attention" in getattr(self.pred_model.config, "layer_types", [])
+        # Transformers StaticCache for the predictor
+        self.static_cache = StaticCache(config=pred_config, max_cache_len=self.max_seq)
+        # Pre-allocate cache_position tensors for each step (avoids CPU→GPU in graph)
+        self.prefill_cache_pos = torch.arange(2, device=device)
+        self.decode_cache_positions = [
+            torch.tensor([2 + i], device=device) for i in range(self.num_codebooks - 1)
+        ]
+        # I/O buffers
+        self.input_buf = torch.zeros(1, 2, talker_hidden_size, dtype=dtype, device=device)
+        self.output_tokens = torch.zeros(self.num_codebooks, dtype=torch.long, device=device)
+        self.graph = None
+        self.captured = False
+        self.prefill_attn = None
+        self.decode_attn = None
+    def _init_cache_layers(self):
+        """Force lazy initialization of StaticCache layers before graph capture."""
+        config = self.pred_model.config
+        num_kv_heads = getattr(config, 'num_key_value_heads', config.num_attention_heads)
+        head_dim = getattr(config, 'head_dim', config.hidden_size // config.num_attention_heads)
+        dummy_k = torch.zeros(1, num_kv_heads, 1, head_dim, dtype=self.dtype, device=self.device)
+        for layer in self.static_cache.layers:
+            if not layer.is_initialized:
+                layer.lazy_initialization(dummy_k)
+    def _make_attn_mask(self, input_embeds: torch.Tensor, cache_position: torch.Tensor):
+        mask = create_causal_mask(
+            config=self.pred_model.config,
+            input_embeds=input_embeds,
+            attention_mask=None,
+            cache_position=cache_position,
+            past_key_values=self.static_cache,
+        )
+        if self.has_sliding_layers:
+            sliding = create_sliding_window_causal_mask(
+                config=self.pred_model.config,
+                input_embeds=input_embeds,
+                attention_mask=None,
+                cache_position=cache_position,
+                past_key_values=self.static_cache,
+            )
+            return {"full_attention": mask, "sliding_attention": sliding}
+        return {"full_attention": mask}
+    def _build_attention_masks(self):
+        dummy_prefill = torch.zeros(1, 2, self.hidden_size, dtype=self.dtype, device=self.device)
+        dummy_decode = torch.zeros(1, 1, self.hidden_size, dtype=self.dtype, device=self.device)
+        self.prefill_attn = self._make_attn_mask(dummy_prefill, self.prefill_cache_pos)
+        self.decode_attn = []
+        for pos in self.decode_cache_positions:
+            self.decode_attn.append(self._make_attn_mask(dummy_decode, pos))
+    def _full_loop(self):
+        """The full 15-step predictor loop on static buffers."""
+        # Project input from talker hidden size to predictor hidden size
+        h = self.small_to_mtp(self.input_buf)  # [1, 2, hidden]
+        # Prefill: 2 tokens through all layers
+        out = self.pred_model(
+            inputs_embeds=h,
+            attention_mask=self.prefill_attn,
+            past_key_values=self.static_cache,
+            cache_position=self.prefill_cache_pos,
+            use_cache=True,
+        )
+        h = out.last_hidden_state  # [1, 2, hidden] — already normalized
+        # First codebook: logits from last position
+        logits = self.lm_heads[0](h[:, -1:, :])  # [1, 1, vocab]
+        tok = sample_logits(
+            logits[:, 0, :],
+            temperature=self.temperature,
+            top_k=self.top_k,
+            top_p=self.top_p,
+            do_sample=self.do_sample,
+        )
+        self.output_tokens[0] = tok[0]
+        # Remaining 14 codebooks
+        for cb_idx in range(1, self.num_codebooks):
+            # Embed previous token using codebook-specific embedding
+            emb = self.codec_embeds[cb_idx - 1](tok.unsqueeze(0))  # [1, 1, codec_hidden]
+            emb = self.small_to_mtp(emb)  # [1, 1, hidden]
+            # Single-token decode through all layers
+            out = self.pred_model(
+                inputs_embeds=emb,
+                attention_mask=self.decode_attn[cb_idx - 1],
+                past_key_values=self.static_cache,
+                cache_position=self.decode_cache_positions[cb_idx - 1],
+                use_cache=True,
+            )
+            h = out.last_hidden_state
+            logits = self.lm_heads[cb_idx](h[:, -1:, :])
+            tok = sample_logits(
+                logits[:, 0, :],
+                temperature=self.temperature,
+                top_k=self.top_k,
+                top_p=self.top_p,
+                do_sample=self.do_sample,
+            )
+            self.output_tokens[cb_idx] = tok[0]
+        return self.output_tokens
+    @torch.inference_mode()
+    def capture(self, num_warmup=3):
+        """Warmup and capture the CUDA graph."""
+        print(f"Warming up predictor ({num_warmup} runs)...")
+        # Force cache initialization before graph capture
+        self._init_cache_layers()
+        self._build_attention_masks()
+        for _ in range(num_warmup):
+            self.static_cache.reset()
+            self._full_loop()
+        torch.cuda.synchronize()
+        print("Capturing CUDA graph for predictor...")
+        with torch.cuda.device(self.device_index):
+            s = torch.cuda.Stream()
+            s.wait_stream(torch.cuda.current_stream())
+            with torch.cuda.stream(s):
+                self.graph = torch.cuda.CUDAGraph()
+                # Warmup in capture stream
+                self.static_cache.reset()
+                self._full_loop()
+                torch.cuda.synchronize()
+                self.static_cache.reset()
+                with torch.cuda.graph(self.graph):
+                    self._full_loop()
+        torch.cuda.current_stream().wait_stream(s)
+        torch.cuda.synchronize()
+        self.captured = True
+        print("CUDA graph captured!")
+    @torch.inference_mode()
+    def run(self, pred_input: torch.Tensor) -> torch.Tensor:
+        """
+        Run the captured graph.
+        pred_input: [1, 2, talker_hidden_size] (past_hidden cat first_codebook_embed)
+        Returns: [15] long tensor of codebook tokens
+        """
+        self.input_buf.copy_(pred_input)
+        self.static_cache.reset()
+        self.graph.replay()
+        return self.output_tokens.clone()

faster_qwen3_tts/sampling.py ADDED Viewed

	@@ -0,0 +1,66 @@

+"""Shared sampling helpers for talker and predictor generation."""
+from __future__ import annotations
+from typing import Iterable, Optional
+import torch
+import torch.nn.functional as F
+def apply_repetition_penalty(
+    logits: torch.Tensor,
+    token_history: torch.Tensor,
+    repetition_penalty: float,
+) -> torch.Tensor:
+    """Apply repetition penalty to logits in-place and return them.
+    Args:
+        logits: Tensor shaped [1, 1, vocab] or [1, vocab].
+        token_history: 1-D tensor of previously generated token ids.
+        repetition_penalty: HF-style repetition penalty (>1.0).
+    """
+    if repetition_penalty == 1.0 or token_history.numel() == 0:
+        return logits
+    unique_toks = token_history.unique()
+    tok_logits = logits[..., unique_toks]
+    logits[..., unique_toks] = torch.where(
+        tok_logits > 0, tok_logits / repetition_penalty, tok_logits * repetition_penalty
+    )
+    return logits
+def sample_logits(
+    logits: torch.Tensor,
+    *,
+    temperature: float,
+    top_k: int,
+    top_p: float,
+    do_sample: bool,
+    suppress_mask: Optional[torch.Tensor] = None,
+    suppress_tokens: Optional[Iterable[int]] = None,
+) -> torch.Tensor:
+    """Sample a token from logits.
+    Mirrors HF order: suppress -> temperature -> top-k -> top-p -> sample.
+    """
+    logits = logits.clone()
+    if suppress_mask is not None:
+        logits[..., suppress_mask] = float("-inf")
+    if suppress_tokens:
+        logits[..., list(suppress_tokens)] = float("-inf")
+    if not do_sample:
+        return torch.argmax(logits, dim=-1)
+    logits = logits / temperature
+    if top_k > 0:
+        topk_vals, _ = torch.topk(logits, min(top_k, logits.size(-1)))
+        logits = torch.where(logits < topk_vals[..., -1:], torch.full_like(logits, float("-inf")), logits)
+    if top_p < 1.0:
+        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
+        probs = F.softmax(sorted_logits, dim=-1)
+        cumulative_probs = torch.cumsum(probs, dim=-1)
+        sorted_indices_to_remove = cumulative_probs > top_p
+        sorted_indices_to_remove[..., 0] = False
+        sorted_logits[sorted_indices_to_remove] = float("-inf")
+        logits = torch.full_like(logits, float("-inf"))
+        logits.scatter_(-1, sorted_indices, sorted_logits)
+    return torch.multinomial(F.softmax(logits, dim=-1), 1).squeeze(-1)

faster_qwen3_tts/streaming.py ADDED Viewed

	@@ -0,0 +1,359 @@

+#!/usr/bin/env python3
+"""
+Streaming generation with CUDA graphs for both predictor and talker.
+Yields codec ID chunks during generation instead of collecting all at once.
+CUDA graph usage is identical to non-streaming — same per-step performance.
+"""
+import time
+from typing import Generator, Tuple
+import torch
+from .predictor_graph import PredictorGraph
+from .sampling import apply_repetition_penalty, sample_logits
+from .talker_graph import TalkerGraph
+@torch.inference_mode()
+def fast_generate_streaming(
+    talker,
+    talker_input_embeds: torch.Tensor,
+    attention_mask: torch.Tensor,
+    trailing_text_hiddens: torch.Tensor,
+    tts_pad_embed: torch.Tensor,
+    config,
+    predictor_graph: PredictorGraph,
+    talker_graph: TalkerGraph,
+    max_new_tokens: int = 2048,
+    min_new_tokens: int = 2,
+    temperature: float = 0.9,
+    top_k: int = 50,
+    top_p: float = 1.0,
+    do_sample: bool = True,
+    repetition_penalty: float = 1.05,
+    chunk_size: int = 12,
+) -> Generator[Tuple[torch.Tensor, dict], None, None]:
+    """
+    Streaming autoregressive generation with CUDA-graphed predictor and talker.
+    Yields (codec_chunk, timing_info) tuples every chunk_size steps.
+    codec_chunk: [chunk_steps, 16] tensor of codec IDs.
+    The final chunk may be shorter than chunk_size.
+    """
+    eos_id = config.codec_eos_token_id
+    vocab_size = config.vocab_size
+    device = talker_input_embeds.device
+    suppress_mask = torch.zeros(vocab_size, dtype=torch.bool, device=device)
+    suppress_start = max(0, vocab_size - 1024)
+    for i in range(suppress_start, vocab_size):
+        if i != eos_id:
+            suppress_mask[i] = True
+    predictor = talker.code_predictor
+    talker_codec_embed = talker.get_input_embeddings()
+    talker_codec_head = talker.codec_head
+    predictor_codec_embeds = predictor.get_input_embeddings()
+    num_code_groups = config.num_code_groups
+    # === PREFILL (still uses HF forward for variable-length prefill) ===
+    t_start = time.time()
+    out = talker.forward(
+        inputs_embeds=talker_input_embeds,
+        attention_mask=attention_mask,
+        use_cache=True,
+        output_hidden_states=True,
+        return_dict=True,
+        trailing_text_hidden=trailing_text_hiddens,
+        tts_pad_embed=tts_pad_embed,
+        generation_step=None,
+        past_hidden=None,
+        past_key_values=None,
+    )
+    talker_past_kv = out.past_key_values
+    past_hidden = out.past_hidden
+    gen_step = out.generation_step
+    logits = out.logits[:, -1, :]
+    suppress_eos = min_new_tokens > 0
+    token = sample_logits(
+        logits,
+        temperature=temperature,
+        top_k=top_k,
+        top_p=top_p,
+        do_sample=do_sample,
+        suppress_mask=suppress_mask,
+        suppress_tokens=[eos_id] if suppress_eos else None,
+    )
+    prefill_len = talker_graph.prefill_kv(talker_past_kv)
+    rope_deltas = getattr(talker, "rope_deltas", None)
+    talker_graph.set_generation_state(attention_mask, rope_deltas)
+    torch.cuda.synchronize()
+    t_prefill = time.time() - t_start
+    # === DECODE LOOP — yield chunks ===
+    chunk_buffer = []
+    all_first_tokens = []  # for repetition penalty across chunks
+    total_steps = 0
+    chunk_count = 0
+    chunk_start = time.time()
+    for step_idx in range(max_new_tokens):
+        if token.item() == eos_id:
+            break
+        # --- CUDA-Graphed Code Predictor ---
+        last_id_hidden = talker_codec_embed(token.unsqueeze(1))
+        pred_input = torch.cat((past_hidden, last_id_hidden), dim=1)
+        codebook_token_ids = predictor_graph.run(pred_input)
+        all_cb = torch.cat([token.view(1), codebook_token_ids])
+        chunk_buffer.append(all_cb.detach())
+        all_first_tokens.append(token.detach())
+        # --- Build input embedding for talker ---
+        codec_hiddens = [last_id_hidden]
+        for i in range(num_code_groups - 1):
+            codec_hiddens.append(predictor_codec_embeds[i](codebook_token_ids[i].unsqueeze(0).unsqueeze(0)))
+        inputs_embeds = torch.cat(codec_hiddens, dim=1).sum(1, keepdim=True)
+        if gen_step < trailing_text_hiddens.shape[1]:
+            inputs_embeds = inputs_embeds + trailing_text_hiddens[:, gen_step].unsqueeze(1)
+        else:
+            inputs_embeds = inputs_embeds + tts_pad_embed
+        # --- CUDA-Graphed Talker decode step ---
+        current_pos = prefill_len + step_idx
+        if current_pos >= talker_graph.max_seq_len - 1:
+            break
+        hidden_states = talker_graph.run(inputs_embeds, position=current_pos)
+        logits = talker_codec_head(hidden_states[:, -1, :]).unsqueeze(0)
+        if repetition_penalty != 1.0 and all_first_tokens:
+            history = torch.stack(all_first_tokens)
+            logits = apply_repetition_penalty(logits, history, repetition_penalty)
+        suppress_eos = len(all_first_tokens) < min_new_tokens
+        token = sample_logits(
+            logits.squeeze(0),
+            temperature=temperature,
+            top_k=top_k,
+            top_p=top_p,
+            do_sample=do_sample,
+            suppress_mask=suppress_mask,
+            suppress_tokens=[eos_id] if suppress_eos else None,
+        )
+        past_hidden = hidden_states[:, -1:, :].clone()
+        gen_step += 1
+        # --- Yield chunk when buffer is full ---
+        if len(chunk_buffer) >= chunk_size:
+            torch.cuda.synchronize()
+            chunk_decode_time = time.time() - chunk_start
+            total_steps += len(chunk_buffer)
+            yield torch.stack(chunk_buffer), {
+                'chunk_index': chunk_count,
+                'chunk_steps': len(chunk_buffer),
+                'prefill_ms': t_prefill * 1000 if chunk_count == 0 else 0,
+                'decode_ms': chunk_decode_time * 1000,
+                'total_steps_so_far': total_steps,
+                'is_final': False,
+            }
+            chunk_buffer = []
+            chunk_count += 1
+            chunk_start = time.time()
+    # --- Yield final partial chunk ---
+    if chunk_buffer:
+        torch.cuda.synchronize()
+        chunk_decode_time = time.time() - chunk_start
+        total_steps += len(chunk_buffer)
+        yield torch.stack(chunk_buffer), {
+            'chunk_index': chunk_count,
+            'chunk_steps': len(chunk_buffer),
+            'prefill_ms': t_prefill * 1000 if chunk_count == 0 else 0,
+            'decode_ms': chunk_decode_time * 1000,
+            'total_steps_so_far': total_steps,
+            'is_final': True,
+        }
+@torch.inference_mode()
+def parity_generate_streaming(
+    talker,
+    talker_input_embeds: torch.Tensor,
+    attention_mask: torch.Tensor,
+    trailing_text_hiddens: torch.Tensor,
+    tts_pad_embed: torch.Tensor,
+    config,
+    max_new_tokens: int = 2048,
+    min_new_tokens: int = 2,
+    temperature: float = 0.9,
+    top_k: int = 50,
+    top_p: float = 1.0,
+    do_sample: bool = True,
+    repetition_penalty: float = 1.05,
+    chunk_size: int = 12,
+) -> Generator[Tuple[torch.Tensor, dict], None, None]:
+    """
+    Streaming generation without CUDA graphs (dynamic cache).
+    Yields (codec_chunk, timing_info) tuples every chunk_size steps.
+    """
+    # NOTE: This function intentionally mirrors fast_generate_streaming. The core
+    # decode loop is duplicated so we can swap CUDA graphs/static cache for the
+    # dynamic-cache path while keeping sampling/chunking identical. If you edit
+    # the fast path, check parity_generate_streaming for matching changes.
+    eos_id = config.codec_eos_token_id
+    vocab_size = config.vocab_size
+    device = talker_input_embeds.device
+    suppress_mask = torch.zeros(vocab_size, dtype=torch.bool, device=device)
+    suppress_start = max(0, vocab_size - 1024)
+    for i in range(suppress_start, vocab_size):
+        if i != eos_id:
+            suppress_mask[i] = True
+    # === PREFILL ===
+    t_start = time.time()
+    out = talker.forward(
+        inputs_embeds=talker_input_embeds,
+        attention_mask=attention_mask,
+        use_cache=True,
+        output_hidden_states=True,
+        return_dict=True,
+        trailing_text_hidden=trailing_text_hiddens,
+        tts_pad_embed=tts_pad_embed,
+        generation_step=None,
+        past_hidden=None,
+        past_key_values=None,
+    )
+    talker_past_kv = out.past_key_values
+    past_hidden = out.past_hidden
+    gen_step = out.generation_step
+    logits = out.logits[:, -1, :]
+    suppress_eos = min_new_tokens > 0
+    token = sample_logits(
+        logits,
+        temperature=temperature,
+        top_k=top_k,
+        top_p=top_p,
+        do_sample=do_sample,
+        suppress_mask=suppress_mask,
+        suppress_tokens=[eos_id] if suppress_eos else None,
+    )
+    if attention_mask is not None:
+        attention_mask = attention_mask.clone()
+    torch.cuda.synchronize()
+    t_prefill = time.time() - t_start
+    # === DECODE LOOP — yield chunks ===
+    chunk_buffer = []
+    all_first_tokens = []
+    total_steps = 0
+    chunk_count = 0
+    chunk_start = time.time()
+    for _ in range(max_new_tokens):
+        if token.item() == eos_id:
+            break
+        cache_position = None
+        if attention_mask is not None:
+            attention_mask = torch.cat(
+                [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))],
+                dim=1,
+            )
+            cache_position = torch.tensor([attention_mask.shape[1] - 1], device=attention_mask.device)
+        out = talker.forward(
+            input_ids=token.view(1, 1),
+            attention_mask=attention_mask,
+            use_cache=True,
+            output_hidden_states=True,
+            return_dict=True,
+            trailing_text_hidden=trailing_text_hiddens,
+            tts_pad_embed=tts_pad_embed,
+            generation_step=gen_step,
+            past_hidden=past_hidden,
+            past_key_values=talker_past_kv,
+            subtalker_dosample=do_sample,
+            subtalker_top_k=top_k,
+            subtalker_top_p=top_p,
+            subtalker_temperature=temperature,
+            cache_position=cache_position,
+        )
+        codec_ids = out.hidden_states[1]
+        if codec_ids is None:
+            break
+        chunk_buffer.append(codec_ids.squeeze(0).detach())
+        all_first_tokens.append(token.detach())
+        logits = out.logits[:, -1, :]
+        if repetition_penalty != 1.0 and all_first_tokens:
+            history = torch.stack(all_first_tokens)
+            logits = apply_repetition_penalty(logits, history, repetition_penalty)
+        suppress_eos = len(all_first_tokens) < min_new_tokens
+        token = sample_logits(
+            logits,
+            temperature=temperature,
+            top_k=top_k,
+            top_p=top_p,
+            do_sample=do_sample,
+            suppress_mask=suppress_mask,
+            suppress_tokens=[eos_id] if suppress_eos else None,
+        )
+        talker_past_kv = out.past_key_values
+        past_hidden = out.past_hidden
+        gen_step = out.generation_step
+        if len(chunk_buffer) >= chunk_size:
+            torch.cuda.synchronize()
+            chunk_decode_time = time.time() - chunk_start
+            total_steps += len(chunk_buffer)
+            yield torch.stack(chunk_buffer), {
+                'chunk_index': chunk_count,
+                'chunk_steps': len(chunk_buffer),
+                'prefill_ms': t_prefill * 1000 if chunk_count == 0 else 0,
+                'decode_ms': chunk_decode_time * 1000,
+                'total_steps_so_far': total_steps,
+                'is_final': False,
+            }
+            chunk_buffer = []
+            chunk_count += 1
+            chunk_start = time.time()
+    if chunk_buffer:
+        torch.cuda.synchronize()
+        chunk_decode_time = time.time() - chunk_start
+        total_steps += len(chunk_buffer)
+        yield torch.stack(chunk_buffer), {
+            'chunk_index': chunk_count,
+            'chunk_steps': len(chunk_buffer),
+            'prefill_ms': t_prefill * 1000 if chunk_count == 0 else 0,
+            'decode_ms': chunk_decode_time * 1000,
+            'total_steps_so_far': total_steps,
+            'is_final': True,
+        }

faster_qwen3_tts/talker_graph.py ADDED Viewed

	@@ -0,0 +1,214 @@

+#!/usr/bin/env python3
+"""
+CUDA graph capture for the talker's single-token decode step,
+using transformers StaticCache.
+The talker has 28 transformer layers. Instead of reimplementing the
+forward pass manually, we use the model's own forward with StaticCache.
+The StaticCache provides fixed-size KV tensors compatible with CUDA graphs.
+Strategy:
+- Use transformers StaticCache for KV cache management
+- Use the model's forward method (handles mask, RoPE, attention internally)
+- Capture the single-token decode as a CUDA graph
+- Update cache_position buffer between replays
+"""
+import torch
+from transformers import StaticCache
+from transformers.masking_utils import create_causal_mask, create_sliding_window_causal_mask
+class TalkerGraph:
+    """
+    Captures the talker's single-token decode step as a CUDA graph,
+    using the model's own forward with transformers StaticCache.
+    """
+    def __init__(self, talker_model, talker_config, device='cuda', dtype=torch.bfloat16,
+                 max_seq_len=512):
+        self.device = device
+        device_index = torch.device(device).index
+        device_index = device_index if device_index is not None else torch.cuda.current_device()
+        self.device_index = device_index
+        self.dtype = dtype
+        self.max_seq_len = max_seq_len
+        self.hidden_size = talker_config.hidden_size
+        self.num_layers = talker_config.num_hidden_layers
+        # Keep reference to the inner model (transformer backbone)
+        self.model = talker_model
+        # Transformers StaticCache — handles index_copy_ and fixed-size KV internally
+        self.static_cache = StaticCache(config=talker_config, max_cache_len=max_seq_len)
+        # Static I/O buffers for CUDA graph
+        self.input_buf = torch.zeros(1, 1, self.hidden_size, dtype=dtype, device=device)
+        self.output_buf = torch.zeros(1, 1, self.hidden_size, dtype=dtype, device=device)
+        # Cache position buffer — updated before each graph replay
+        self.cache_position = torch.zeros(1, dtype=torch.long, device=device)
+        # Rope deltas from prefill (shape [batch, 1]) and position ids buffer.
+        self.rope_deltas = torch.zeros(1, 1, dtype=torch.float32, device=device)
+        self.position_ids = torch.zeros(3, 1, 1, dtype=torch.float32, device=device)
+        self.graph = None
+        self.captured = False
+        self.attn_mask = None
+        self.attn_mask_table = None
+        self._mask_key = None
+    def _init_cache_layers(self):
+        """Force lazy initialization of StaticCache layers before graph capture."""
+        config = self.model.config
+        num_kv_heads = getattr(config, 'num_key_value_heads', config.num_attention_heads)
+        head_dim = getattr(config, 'head_dim', config.hidden_size // config.num_attention_heads)
+        dummy_k = torch.zeros(1, num_kv_heads, 1, head_dim, dtype=self.dtype, device=self.device)
+        for layer in self.static_cache.layers:
+            if not layer.is_initialized:
+                layer.lazy_initialization(dummy_k)
+    def _build_attention_masks(self, attention_mask: torch.Tensor | None = None):
+        dummy = torch.zeros(1, 1, self.hidden_size, dtype=self.dtype, device=self.device)
+        max_len = self.max_seq_len
+        self.attn_mask_table = [None] * max_len
+        mask_fn = create_causal_mask if self.model.config.sliding_window is None else create_sliding_window_causal_mask
+        for i in range(max_len):
+            pos = torch.tensor([i], device=self.device)
+            full = mask_fn(
+                config=self.model.config,
+                input_embeds=dummy,
+                attention_mask=attention_mask,
+                cache_position=pos,
+                past_key_values=self.static_cache,
+            )
+            self.attn_mask_table[i] = full
+        if self.attn_mask is None:
+            self.attn_mask = self.attn_mask_table[0].clone()
+        else:
+            self.attn_mask.copy_(self.attn_mask_table[0])
+    def _set_attention_mask(self, position: int):
+        self.attn_mask.copy_(self.attn_mask_table[position])
+    def _decode_step(self):
+        """Single-token decode through the model's forward."""
+        out = self.model(
+            inputs_embeds=self.input_buf,
+            attention_mask=self.attn_mask,
+            past_key_values=self.static_cache,
+            cache_position=self.cache_position,
+            position_ids=self.position_ids,
+            use_cache=True,
+        )
+        self.output_buf.copy_(out.last_hidden_state)
+    @torch.inference_mode()
+    def capture(self, prefill_len=100, num_warmup=3):
+        """
+        Capture CUDA graph for single-token decode.
+        prefill_len: simulated prefill length for warmup (graph is position-independent).
+            It is also used as an index into the mask table, which holds max_seq_len entries.
+        num_warmup: number of eager decode steps run before the capture.
+        On return self.graph holds the captured graph and self.captured is True.
+        """
+        print(f"Warming up talker graph ({num_warmup} runs)...")
+        # Force cache initialization before graph capture
+        self._init_cache_layers()
+        self._build_attention_masks()
+        # Set cache_position for warmup
+        self.cache_position[0] = prefill_len
+        self._set_attention_mask(prefill_len)
+        # Eager warmup runs on the current stream.
+        for _ in range(num_warmup):
+            self._decode_step()
+        torch.cuda.synchronize()
+        print("Capturing CUDA graph for talker decode...")
+        with torch.cuda.device(self.device_index):
+            self.graph = torch.cuda.CUDAGraph()
+            # The capture runs on a separate stream that first waits for the current stream.
+            s = torch.cuda.Stream()
+            s.wait_stream(torch.cuda.current_stream())
+            with torch.cuda.stream(s):
+                # Warmup in capture stream
+                self._decode_step()
+                torch.cuda.synchronize()
+                # The decode step issued inside this context is what self.graph records.
+                with torch.cuda.graph(self.graph):
+                    self._decode_step()
+        # Make the current stream wait for the capture stream before continuing.
+        torch.cuda.current_stream().wait_stream(s)
+        torch.cuda.synchronize()
+        self.captured = True
+        print("Talker CUDA graph captured!")
+    def reset(self, prefill_len: int):
+        """Reset cache for new sequence."""
+        self.static_cache.reset()
+    def prefill_kv(self, past_key_values):
+        """
+        Copy HF DynamicCache from prefill into our StaticCache.
+        past_key_values: DynamicCache with num_layers layers of [1, kv_heads, seq_len, head_dim]
+        """
+        self.static_cache.reset()
+        seq_len = 0
+        for li in range(self.num_layers):
+            k, v = past_key_values[li]  # each [1, kv_heads, seq_len, head_dim]
+            seq_len = k.shape[2]
+            if seq_len > self.max_seq_len:
+                raise RuntimeError(
+                    f"Input is too long: prefill has {seq_len} tokens but max_seq_len={self.max_seq_len}. "
+                    "Use shorter text or shorter reference audio."
+                )
+            cache_pos = torch.arange(seq_len, device=self.device)
+            self.static_cache.update(k, v, li, {"cache_position": cache_pos})
+        return seq_len
+    def set_generation_state(self, attention_mask: torch.Tensor, rope_deltas: torch.Tensor | None):
+        """Set padding-aware attention mask and rope deltas for decode parity."""
+        mask_key = None
+        full_attention_mask = None
+        if attention_mask is not None:
+            pad_counts = (attention_mask == 0).sum(dim=-1)
+            mask_key = tuple(pad_counts.tolist())
+            full_attention_mask = torch.ones(
+                attention_mask.shape[0],
+                self.max_seq_len,
+                dtype=attention_mask.dtype,
+                device=attention_mask.device,
+            )
+            for b, pads in enumerate(pad_counts.tolist()):
+                if pads > 0:
+                    full_attention_mask[b, :pads] = 0
+        if self.attn_mask_table is None or mask_key != self._mask_key:
+            self._build_attention_masks(full_attention_mask)
+            self._mask_key = mask_key
+        if rope_deltas is None:
+            self.rope_deltas.zero_()
+        else:
+            if rope_deltas.dim() == 1:
+                rope_deltas = rope_deltas.unsqueeze(1)
+            self.rope_deltas.copy_(rope_deltas.to(self.rope_deltas.device, dtype=self.rope_deltas.dtype))
+    @torch.inference_mode()
+    def run(self, input_embeds: torch.Tensor, position: int) -> torch.Tensor:
+        """
+        Run one decode step.
+        input_embeds: [1, 1, hidden_size]
+        position: current sequence position
+        Returns: [1, 1, hidden_size] hidden states
+        """
+        self.input_buf.copy_(input_embeds)
+        self.cache_position[0] = position
+        self._set_attention_mask(position)
+        # position_ids = arange(seq_len=1) + cache_position + rope_deltas
+        delta = self.rope_deltas + self.cache_position[0].to(self.rope_deltas.dtype)
+        self.position_ids.copy_(delta.unsqueeze(0).expand(3, -1, -1))
+        self.graph.replay()
+        return self.output_buf  # static buffer — caller should use immediately or clone

faster_qwen3_tts/utils.py ADDED Viewed

	@@ -0,0 +1,30 @@

+import contextlib
+import sys
+class _FilteredStdout:
+    def __init__(self, stream, suppress_substrings):
+        self._stream = stream
+        self._suppress = suppress_substrings
+    def write(self, data):
+        if any(s in data for s in self._suppress):
+            return len(data)
+        return self._stream.write(data)
+    def flush(self):
+        return self._stream.flush()
+@contextlib.contextmanager
+def suppress_flash_attn_warning():
+    filtered = _FilteredStdout(
+        sys.stdout,
+        suppress_substrings=(
+            "flash-attn is not installed",
+            "manual PyTorch version",
+            "Please install flash-attn",
+        ),
+    )
+    with contextlib.redirect_stdout(filtered):
+        yield

main.py ADDED Viewed

	@@ -0,0 +1,144 @@

+# PYTORCH_ENABLE_MPS_FALLBACK=1 uvicorn main:app --host 0.0.0.0 --port 8888 --reload
+# PYTORCH_ENABLE_MPS_FALLBACK=1 gunicorn main:app -b 0.0.0.0:8000 -w 4 -k uvicorn.workers.UvicornWorker
+import io
+import re
+import os
+import logging
+import json
+from time import gmtime
+from datetime import datetime, timezone
+from scipy.io import wavfile
+from dotenv import load_dotenv
+from contextlib import asynccontextmanager
+from tts import synthesize, device
+from huggingface_hub import hf_hub_download
+from llama_cpp import Llama
+from fastapi import FastAPI, Response, Body, UploadFile, HTTPException
+from starlette.middleware.cors import CORSMiddleware
+load_dotenv(verbose=False)
+LOGGING_DIRECTORY = os.getenv('LOGGING_DIRECTORY', 'logs')
+if not os.path.isdir(LOGGING_DIRECTORY):
+    os.makedirs(LOGGING_DIRECTORY)
+file_handler = logging.FileHandler(os.path.join(LOGGING_DIRECTORY, 'api.log'), mode='a', encoding='utf-8')
+formatter = logging.Formatter(fmt='%(asctime)s.%(msecs)03dZ - %(levelname)s - %(message)s', datefmt='%Y-%m-%dT%H:%M:%S')
+formatter.converter = gmtime
+file_handler.setFormatter(formatter)
+#logger = logging.getLogger('uvicorn')
+logger = logging.getLogger('gunicorn.error')
+logger.addHandler(file_handler)
+llm_prompt_format = os.getenv('LLM_PROMPT_FORMAT', None)
+model_path = os.environ.get('LLAMACPP_PATH', None)
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    global model_path
+    base_directory = 'data'
+    for language in os.listdir(base_directory):
+        path = os.path.join(base_directory, language)
+        if os.path.isdir(path):
+            for filename in os.listdir(path):
+                _, extension = os.path.splitext(filename)
+                if extension.lower() == '.wav':
+                    with open(os.path.join(path, filename), mode='rb') as f, io.BytesIO() as wave_bytes, open(os.path.join(path, 'prompt.txt'), 'r', encoding='utf-8') as prompt_file, open(os.path.join(path, 'input.txt'), 'r', encoding='utf-8') as input_file:
+                        wave_bytes.write(f.read())
+                        wave_bytes.seek(0)
+                        synthesize(prompt_wave=wave_bytes, prompt_text=prompt_file.read(), prompt_language=language, input_text=input_file.read(), input_language=language, top_p=1, temperature=1)
+    if model_path is None:
+        model_path = hf_hub_download(repo_id=os.environ['LLAMACPP_REPO_ID'], filename=os.environ['LLAMACPP_FILENAME'], local_dir='./models')
+    yield
+app = FastAPI(lifespan=lifespan)
+app.add_middleware(CORSMiddleware, allow_origins=['*'], allow_credentials=True, allow_methods=['*'], allow_headers=['*'])
+@app.get("/device")
+async def read_device():
+    return {'device': str(device), 'timestamp': int(datetime.now(timezone.utc).replace(tzinfo=timezone.utc).timestamp())}
+@app.post("/generate", status_code=201)
+def create_generated_text(messages: list[dict[str, str]] = Body(...), temperature: float = Body(default=1.0)):
+    input_text = ''
+    if llm_prompt_format == 'Llama':
+        for message in messages:
+            if message['role'] == 'system':
+                input_text += f"<|start_header_id|>system<|end_header_id|>\n\n{message['content']}<|eot_id|>"
+            elif message['role'] == 'user':
+                input_text += f"<|start_header_id|>user<|end_header_id|>\n\n{message['content']}<|eot_id|>"
+            elif message['role'] == 'assistant':
+                input_text += f"<|start_header_id|>assistant<|end_header_id|>\n\n{message['content']}<|eot_id|>"
+        input_text += '<|start_header_id|>assistant<|end_header_id|>\n\n'
+        pattern = r'<|start_header_id|>assistant<|end_header_id|>\n\n(.+?)(?:(?:<|eot_id|>)|$)'
+    else:
+        for message in messages:
+            if message['role'] == 'system' or message['role'] == 'user':
+                input_text += f"<start_of_turn>user\n{message['content']}<end_of_turn>\n"
+            elif message['role'] == 'assistant':
+                input_text += f"<start_of_turn>model\n{message['content']}<end_of_turn>\n"
+        input_text += '<start_of_turn>model\n'
+        pattern = r'<start_of_turn>model\n(.+?)(?:(?:<end_of_turn>)|$)'
+    if len(input_text) > 0:
+        llm = Llama(model_path=model_path, n_ctx=8192, n_gpu_layers=-1, n_batch=32, verbose=False)
+        choices = []
+        try:
+            for choice in llm(input_text, max_tokens=2048, temperature=temperature, top_p=0.95, echo=True)['choices']:
+                matches = re.findall(pattern, choice['text'], re.DOTALL)
+                if len(matches) > 0:
+                    choices.append({'role': 'assistant', 'content': matches[len(matches) - 1]})
+        finally:
+            llm.close()
+        return {'choices': choices, 'timestamp': int(datetime.now(timezone.utc).replace(tzinfo=timezone.utc).timestamp())}
+    else:
+        raise HTTPException(status_code=400)
+@app.post("/synthesize", status_code=201)
+def create_uploaded_file(file: UploadFile, data = Body(...)):
+    if file.content_type == 'audio/wav':
+        try:
+            data = json.loads(data)
+            with io.BytesIO() as prompt_wave_bytes, io.BytesIO() as output_wave_bytes:
+                prompt_wave_bytes.write(file.file.read())
+                prompt_wave_bytes.seek(0)
+                output, sample_rate = synthesize(prompt_wave=prompt_wave_bytes, prompt_text=data['prompt'] if 'prompt' in data else None, prompt_language=data['language'], input_text=data['input'], input_language=data['language'], top_p=data['top_p'] if 'top_p' in data else 1.0, temperature=data['temperature'] if 'temperature' in data else 1.0)
+                wavfile.write(output_wave_bytes, sample_rate, output)
+                output_wave_bytes.seek(0)
+                return Response(content=output_wave_bytes.read(), media_type="audio/wav")
+        except Exception as e:
+            logging.error(f'{e}')
+            raise HTTPException(status_code=400, detail=str(e))
+    else:
+        raise HTTPException(status_code=400)

qwen_tts/__init__.py ADDED Viewed

	@@ -0,0 +1,24 @@

+# coding=utf-8
+# Copyright 2026 The Alibaba Qwen team.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+qwen_tts: Qwen-TTS package.
+"""
+from .inference.qwen3_tts_model import Qwen3TTSModel, VoiceClonePromptItem
+from .inference.qwen3_tts_tokenizer import Qwen3TTSTokenizer
+__all__ = ["__version__"]

qwen_tts/__main__.py ADDED Viewed

	@@ -0,0 +1,24 @@

+# coding=utf-8
+# Copyright 2026 The Alibaba Qwen team.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+def main():
+    print(
+        "qwen_tts package.\n"
+        "Use CLI entrypoints:\n"
+        "  - qwen-tts-demo\n"
+    )
+if __name__ == "__main__":
+    main()

qwen_tts/core/__init__.py ADDED Viewed

	@@ -0,0 +1,19 @@

+# coding=utf-8
+# Copyright 2026 The Alibaba Qwen team.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .tokenizer_25hz.configuration_qwen3_tts_tokenizer_v1 import Qwen3TTSTokenizerV1Config
+from .tokenizer_25hz.modeling_qwen3_tts_tokenizer_v1 import Qwen3TTSTokenizerV1Model
+from .tokenizer_12hz.configuration_qwen3_tts_tokenizer_v2 import Qwen3TTSTokenizerV2Config
+from .tokenizer_12hz.modeling_qwen3_tts_tokenizer_v2 import Qwen3TTSTokenizerV2Model

qwen_tts/core/models/__init__.py ADDED Viewed

	@@ -0,0 +1,18 @@

+# coding=utf-8
+# Copyright 2026 The Alibaba Qwen team.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .configuration_qwen3_tts import Qwen3TTSConfig
+from .modeling_qwen3_tts import Qwen3TTSForConditionalGeneration
+from .processing_qwen3_tts import Qwen3TTSProcessor

qwen_tts/core/models/configuration_qwen3_tts.py ADDED Viewed

	@@ -0,0 +1,502 @@

+# coding=utf-8
+# Copyright 2026 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from transformers.configuration_utils import PretrainedConfig, layer_type_validation
+from transformers.modeling_rope_utils import rope_config_validation
+from transformers.utils import logging
+logger = logging.get_logger(__name__)
+class Qwen3TTSSpeakerEncoderConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Qwen3TTSSpeakerEncoder`].
+    It is used to instantiate a Qwen3TTS speaker encoder model according to the specified arguments, defining the model
+    architecture. The architecture is based on the ECAPA-TDNN model.
+    Args:
+        mel_dim (`int`, *optional*, defaults to 128):
+            The dimension of the input mel-spectrogram.
+        enc_dim (`int`, *optional*, defaults to 1024):
+            The dimension of the final speaker embedding.
+        enc_channels (`list[int]`, *optional*, defaults to `[512, 512, 512, 512, 1536]`):
+            A list of output channels for each TDNN/SERes2Net layer in the encoder. The first channel size is for the initial TDNN layer,
+            the intermediate ones for the `SqueezeExcitationRes2NetBlock` layers, and the last one for the multi-layer feature aggregation.
+        enc_kernel_sizes (`list[int]`, *optional*, defaults to `[5, 3, 3, 3, 1]`):
+            A list of kernel sizes for each layer in the encoder, corresponding to `enc_channels`.
+        enc_dilations (`list[int]`, *optional*, defaults to `[1, 2, 3, 4, 1]`):
+            A list of dilations for each layer in the encoder, corresponding to `enc_channels`.
+        enc_attention_channels (`int`, *optional*, defaults to 128):
+            The number of attention channels in the `AttentiveStatisticsPooling` layer.
+        enc_res2net_scale (`int`, *optional*, defaults to 8):
+            The scale of the `Res2NetBlock` in the encoder.
+        enc_se_channels (`int`, *optional*, defaults to 128):
+            The number of channels in the squeeze part of the `SqueezeExcitationBlock`.
+        sample_rate (`int`, *optional*, defaults to 24000):
+            Sample rate stored on the config (presumably the audio sample rate in Hz expected by the
+            speaker encoder — TODO confirm against the model code).
+    """
+    # NOTE(review): the list defaults below are created once and shared by every instance
+    # that relies on them; mutating `config.enc_channels` in place would leak into later
+    # instances. Confirm no caller mutates these lists.
+    # NOTE(review): `PretrainedConfig.__init__` is not called from this `__init__` —
+    # confirm that is intentional.
+    def __init__(
+        self,
+        mel_dim=128,
+        enc_dim=1024,
+        enc_channels=[512, 512, 512, 512, 1536],
+        enc_kernel_sizes=[5, 3, 3, 3, 1],
+        enc_dilations=[1, 2, 3, 4, 1],
+        enc_attention_channels=128,
+        enc_res2net_scale=8,
+        enc_se_channels=128,
+        sample_rate=24000,
+    ):
+        self.mel_dim = mel_dim
+        self.enc_dim = enc_dim
+        self.enc_channels = enc_channels
+        self.enc_kernel_sizes = enc_kernel_sizes
+        self.enc_dilations = enc_dilations
+        self.enc_attention_channels = enc_attention_channels
+        self.enc_res2net_scale = enc_res2net_scale
+        self.enc_se_channels = enc_se_channels
+        self.sample_rate = sample_rate
+class Qwen3TTSTalkerCodePredictorConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Qwen3TTSTalkerCodePredictorModel`]. It is used to instantiate a
+    Qwen3TTSTalkerCodePredictor model according to the specified arguments, defining the model architecture.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 2048):
+            Vocabulary size of the Qwen3TTSTalkerCodePredictor model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`Qwen3TTSTalkerCodePredictorModel`]
+        hidden_size (`int`, *optional*, defaults to 1024):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 5):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_key_value_heads (`int`, *optional*, defaults to 8):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://huggingface.co/papers/2305.13245). If `None` is passed, it is set to
+            `num_attention_heads`.
+        head_dim (`int`, *optional*, defaults to 128):
+            The attention head dimension.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 32768):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be tied.
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly. If the dictionary has a legacy `type` key, its value is also written to `rope_type` (the
+            passed dictionary is modified in place).
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `long_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        use_sliding_window (`bool`, *optional*, defaults to `False`):
+            Whether to use sliding window attention.
+        sliding_window (`int`, *optional*, defaults to 4096):
+            Sliding window attention (SWA) window size. The value is only kept when `use_sliding_window` is
+            `True`; otherwise `self.sliding_window` is set to `None`.
+        max_window_layers (`int`, *optional*, defaults to 28):
+            The number of layers using full attention. The first `max_window_layers` layers will use full attention, while any
+            additional layer afterwards will use SWA (Sliding Window Attention).
+        layer_types (`list`, *optional*):
+            Attention pattern for each layer. When omitted, layers with index `>= max_window_layers` are
+            `"sliding_attention"` if `self.sliding_window` is not `None`, and every other layer is `"full_attention"`.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        num_code_groups (`int`, *optional*, defaults to 32):
+            Stored as `self.num_code_groups` (presumably the number of codebook groups the predictor
+            handles — TODO confirm against the model code).
+    """
+    model_type = "qwen3_tts_talker_code_predictor"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    # Default tensor parallel plan for base model `Qwen3TTSTalkerCodePredictor`
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+    def __init__(
+        self,
+        vocab_size=2048,
+        hidden_size=1024,
+        intermediate_size=3072,
+        num_hidden_layers=5,
+        num_attention_heads=16,
+        num_key_value_heads=8,
+        head_dim=128,
+        hidden_act="silu",
+        max_position_embeddings=32768,
+        initializer_range=0.02,
+        rms_norm_eps=0.000001,
+        use_cache=True,
+        tie_word_embeddings=False,
+        rope_theta=10000,
+        rope_scaling=None,
+        attention_bias=False,
+        use_sliding_window=False,
+        sliding_window=4096,
+        max_window_layers=28,
+        layer_types=None,
+        attention_dropout=0,
+        num_code_groups=32,
+        **kwargs,
+    ):
+        super().__init__(
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.use_sliding_window = use_sliding_window
+        # The window size is dropped entirely unless sliding-window attention is enabled.
+        self.sliding_window = sliding_window if self.use_sliding_window else None
+        self.max_window_layers = max_window_layers
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.head_dim = head_dim
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        # Validate the correctness of rotary position embeddings parameters
+        # BC: if there is a 'type' field, move it to 'rope_type'.
+        if self.rope_scaling is not None and "type" in self.rope_scaling:
+            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+        rope_config_validation(self)
+        self.layer_types = layer_types
+        # Derive the per-layer attention pattern when the caller did not supply one.
+        if self.layer_types is None:
+            self.layer_types = [
+                "sliding_attention"
+                if self.sliding_window is not None and i >= self.max_window_layers
+                else "full_attention"
+                for i in range(self.num_hidden_layers)
+            ]
+        layer_type_validation(self.layer_types)
+        self.num_code_groups = num_code_groups
+class Qwen3TTSTalkerConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Qwen3TTSTalkerModel`]. It is used to instantiate a
+    Qwen3TTSTalker model according to the specified arguments, defining the model architecture.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        code_predictor_config (`dict` or [`Qwen3TTSTalkerCodePredictorConfig`], *optional*):
+            Configuration of the code predictor sub-model. A `dict` is expanded into a
+            [`Qwen3TTSTalkerCodePredictorConfig`], an existing instance is used as-is, and `None` yields a
+            default-constructed [`Qwen3TTSTalkerCodePredictorConfig`].
+        vocab_size (`int`, *optional*, defaults to 3072):
+            Vocabulary size of the Qwen3TTSTalker model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`Qwen3TTSTalkerModel`]
+        hidden_size (`int`, *optional*, defaults to 1024):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 2048):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 20):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_key_value_heads (`int`, *optional*, defaults to 2):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `2`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 32768):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be tied.
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly. If the dictionary contains a legacy `type` key, its value is copied into `rope_type` (the
+            dictionary passed in is modified in place).
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `long_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (>
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        use_sliding_window (`bool`, *optional*, defaults to `False`):
+            Whether to use sliding window attention.
+        sliding_window (`int`, *optional*, defaults to 4096):
+            Sliding window attention (SWA) window size. Only kept when `use_sliding_window=True`; otherwise the
+            stored `sliding_window` attribute is `None`.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        num_code_groups (`int`, *optional*, defaults to 32):
+            Stored unchanged as `num_code_groups`. Presumably the number of code groups handled per step; confirm
+            against the modeling code.
+        text_hidden_size (`int`, *optional*, defaults to 2048):
+            Stored unchanged as `text_hidden_size`. Presumably the hidden size of the text-side representations;
+            confirm against the modeling code.
+        codec_eos_token_id (`int`, *optional*, defaults to 4198):
+            Special id stored unchanged as `codec_eos_token_id`.
+        codec_think_id (`int`, *optional*, defaults to 4202):
+            Special id stored unchanged as `codec_think_id`.
+        codec_nothink_id (`int`, *optional*, defaults to 4203):
+            Special id stored unchanged as `codec_nothink_id`.
+        codec_think_bos_id (`int`, *optional*, defaults to 4204):
+            Special id stored unchanged as `codec_think_bos_id`.
+        codec_think_eos_id (`int`, *optional*, defaults to 4205):
+            Special id stored unchanged as `codec_think_eos_id`.
+        codec_pad_id (`int`, *optional*, defaults to 4196):
+            Special id stored unchanged as `codec_pad_id`.
+        codec_bos_id (`int`, *optional*, defaults to 4197):
+            Special id stored unchanged as `codec_bos_id`.
+        spk_id (*optional*):
+            Stored unchanged as `spk_id`; its expected structure is not defined by this class.
+        spk_is_dialect (*optional*):
+            Stored unchanged as `spk_is_dialect`; its expected structure is not defined by this class.
+        codec_language_id (*optional*):
+            Stored unchanged as `codec_language_id`; its expected structure is not defined by this class.
+    """
+    model_type = "qwen3_tts_talker"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    # Default tensor parallel plan for base model `Qwen3TTSTalker`
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+    sub_configs = {"code_predictor_config": Qwen3TTSTalkerCodePredictorConfig}
+    def __init__(
+        self,
+        code_predictor_config=None,
+        vocab_size=3072,
+        hidden_size=1024,
+        intermediate_size=2048,
+        num_hidden_layers=20,
+        num_attention_heads=16,
+        num_key_value_heads=2,
+        hidden_act="silu",
+        max_position_embeddings=32768,
+        initializer_range=0.02,
+        rms_norm_eps=0.000001,
+        use_cache=True,
+        tie_word_embeddings=False,
+        rope_theta=10000,
+        rope_scaling=None,
+        attention_bias=False,
+        use_sliding_window=False,
+        sliding_window=4096,
+        attention_dropout=0,
+        num_code_groups=32,
+        text_hidden_size=2048,
+        codec_eos_token_id=4198,
+        codec_think_id=4202,
+        codec_nothink_id=4203,
+        codec_think_bos_id=4204,
+        codec_think_eos_id=4205,
+        codec_pad_id=4196,
+        codec_bos_id=4197,
+        spk_id=None,
+        spk_is_dialect=None,
+        codec_language_id=None,
+        **kwargs,
+    ):
+        super().__init__(
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.use_sliding_window = use_sliding_window
+        # The window size is only meaningful when sliding-window attention is enabled.
+        self.sliding_window = sliding_window if use_sliding_window else None
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        # BC: if there is a legacy 'type' field, copy it to 'rope_type' (no further validation happens here).
+        if self.rope_scaling is not None and "type" in self.rope_scaling:
+            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+        # Accept the sub-config as None (defaults), a ready-made config instance, or a plain dict of kwargs.
+        if code_predictor_config is None:
+            self.code_predictor_config = Qwen3TTSTalkerCodePredictorConfig()
+            logger.info("code_predictor_config is None. Initializing code_predictor model with default values")
+        elif isinstance(code_predictor_config, Qwen3TTSTalkerCodePredictorConfig):
+            self.code_predictor_config = code_predictor_config
+        else:
+            self.code_predictor_config = Qwen3TTSTalkerCodePredictorConfig(**code_predictor_config)
+        self.num_code_groups = num_code_groups
+        self.text_hidden_size = text_hidden_size
+        self.codec_eos_token_id = codec_eos_token_id
+        self.codec_think_id = codec_think_id
+        self.codec_language_id = codec_language_id
+        self.codec_nothink_id = codec_nothink_id
+        self.codec_think_bos_id = codec_think_bos_id
+        self.codec_think_eos_id = codec_think_eos_id
+        self.codec_pad_id = codec_pad_id
+        self.codec_bos_id = codec_bos_id
+        self.spk_id = spk_id
+        self.spk_is_dialect = spk_is_dialect
+class Qwen3TTSConfig(PretrainedConfig):
+    """
+    This is the configuration class to store the configuration of a [`Qwen3TTSForConditionalGeneration`].
+    It bundles the talker and speaker-encoder sub-configurations together with a few top-level settings.
+    Args:
+        talker_config (`dict` or [`Qwen3TTSTalkerConfig`], *optional*):
+            Talker sub-configuration. A `dict` is expanded into a [`Qwen3TTSTalkerConfig`], an existing instance is
+            used as-is, and `None` yields a default-constructed [`Qwen3TTSTalkerConfig`].
+        speaker_encoder_config (`dict` or [`Qwen3TTSSpeakerEncoderConfig`], *optional*):
+            Speaker-encoder sub-configuration, handled the same way as `talker_config`.
+        tokenizer_type (*optional*):
+            Stored unchanged as `tokenizer_type`.
+        tts_model_size (*optional*):
+            Stored unchanged as `tts_model_size`.
+        tts_model_type (*optional*):
+            Stored unchanged as `tts_model_type`.
+        im_start_token_id (`int`, *optional*, defaults to 151644):
+            Special token id stored unchanged as `im_start_token_id`.
+        im_end_token_id (`int`, *optional*, defaults to 151645):
+            Special token id stored unchanged as `im_end_token_id`.
+        tts_pad_token_id (`int`, *optional*, defaults to 151671):
+            Special token id stored unchanged as `tts_pad_token_id`.
+        tts_bos_token_id (`int`, *optional*, defaults to 151672):
+            Special token id stored unchanged as `tts_bos_token_id`.
+        tts_eos_token_id (`int`, *optional*, defaults to 151673):
+            Special token id stored unchanged as `tts_eos_token_id`.
+    """
+    model_type = "qwen3_tts"
+    sub_configs = {
+        "talker_config": Qwen3TTSTalkerConfig,
+        "speaker_encoder_config": Qwen3TTSSpeakerEncoderConfig,
+    }
+    def __init__(
+        self,
+        talker_config=None,
+        speaker_encoder_config=None,
+        tokenizer_type=None,
+        tts_model_size=None,
+        tts_model_type=None,
+        im_start_token_id=151644,
+        im_end_token_id=151645,
+        tts_pad_token_id=151671,
+        tts_bos_token_id=151672,
+        tts_eos_token_id=151673,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        # Each sub-config may arrive as None (use defaults), as an already-built config object, or as a dict of
+        # keyword arguments; unpacking a config object with `**` would fail, so instances are kept as-is.
+        if talker_config is None:
+            logger.info("talker_config is None. Initializing talker model with default values")
+            self.talker_config = Qwen3TTSTalkerConfig()
+        elif isinstance(talker_config, Qwen3TTSTalkerConfig):
+            self.talker_config = talker_config
+        else:
+            self.talker_config = Qwen3TTSTalkerConfig(**talker_config)
+        if speaker_encoder_config is None:
+            logger.info("speaker_encoder_config is None. Initializing speaker encoder with default values")
+            self.speaker_encoder_config = Qwen3TTSSpeakerEncoderConfig()
+        elif isinstance(speaker_encoder_config, Qwen3TTSSpeakerEncoderConfig):
+            self.speaker_encoder_config = speaker_encoder_config
+        else:
+            self.speaker_encoder_config = Qwen3TTSSpeakerEncoderConfig(**speaker_encoder_config)
+        self.tokenizer_type = tokenizer_type
+        self.tts_model_size = tts_model_size
+        self.tts_model_type = tts_model_type
+        self.im_start_token_id = im_start_token_id
+        self.im_end_token_id = im_end_token_id
+        self.tts_pad_token_id = tts_pad_token_id
+        self.tts_bos_token_id = tts_bos_token_id
+        self.tts_eos_token_id = tts_eos_token_id
+# Public names exported by this module.
+# NOTE(review): `Qwen3TTSTalkerCodePredictorConfig` is used by `Qwen3TTSTalkerConfig` but is not listed here; confirm
+# whether that omission is intentional.
+__all__ = ["Qwen3TTSConfig", "Qwen3TTSTalkerConfig", "Qwen3TTSSpeakerEncoderConfig"]

qwen_tts/core/models/modeling_qwen3_tts.py ADDED Viewed

The diff for this file is too large to render. See raw diff

qwen_tts/core/models/processing_qwen3_tts.py ADDED Viewed

	@@ -0,0 +1,106 @@

+# coding=utf-8
+# Copyright 2026 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from transformers.feature_extraction_utils import BatchFeature
+from transformers.processing_utils import ProcessingKwargs, ProcessorMixin
+class Qwen3TTSProcessorKwargs(ProcessingKwargs, total=False):
+    """Keyword-argument schema for `Qwen3TTSProcessor`; text is left-padded and padding is off by default."""
+    _defaults = {"text_kwargs": dict(padding=False, padding_side="left")}
+class Qwen3TTSProcessor(ProcessorMixin):
+    r"""
+    Constructs a Qwen3TTS processor. It wraps a single text tokenizer and exposes it through the processor API.
+    Args:
+        tokenizer ([`Qwen2TokenizerFast`], *optional*):
+            The text tokenizer.
+        chat_template (`Optional[str]`, *optional*):
+            The Jinja template to use for formatting the conversation. If not provided, the default chat template is used.
+    """
+    attributes = ["tokenizer"]
+    tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")
+    def __init__(
+        self, tokenizer=None, chat_template=None
+    ):
+        super().__init__(tokenizer, chat_template=chat_template)
+    def __call__(self, text=None, **kwargs) -> BatchFeature:
+        """
+        Main method to prepare one or several text sequence(s) for the model. This method forwards the `text` and
+        `kwargs` arguments to the wrapped tokenizer's `__call__` to encode the text.
+        Args:
+            text (`str`, `List[str]`, `List[List[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+        Returns:
+            [`BatchFeature`]: the tokenizer outputs, converted to the tensor type given by `return_tensors` (if any).
+        Raises:
+            ValueError: if `text` is `None`.
+        """
+        if text is None:
+            raise ValueError("You need to specify a `text` input to process.")
+        output_kwargs = self._merge_kwargs(
+            Qwen3TTSProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+        # A single input (anything that is not a list) is wrapped so the tokenizer always receives a batch.
+        if not isinstance(text, list):
+            text = [text]
+        texts_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
+        return BatchFeature(
+            data={**texts_inputs},
+            tensor_type=kwargs.get("return_tensors"),
+        )
+    def batch_decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
+        refer to the docstring of this method for more information.
+        """
+        return self.tokenizer.batch_decode(*args, **kwargs)
+    def decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
+        the docstring of this method for more information.
+        """
+        return self.tokenizer.decode(*args, **kwargs)
+    def apply_chat_template(self, conversations, chat_template=None, **kwargs):
+        """
+        Apply the chat template through [`ProcessorMixin.apply_chat_template`]. When the first element of
+        `conversations` is a `dict`, the input is treated as a single conversation and wrapped into a one-element
+        batch before delegating.
+        """
+        if isinstance(conversations[0], dict):
+            conversations = [conversations]
+        return super().apply_chat_template(conversations, chat_template, **kwargs)
+    @property
+    def model_input_names(self):
+        """Input names reported by the tokenizer, de-duplicated while keeping their original order."""
+        return list(dict.fromkeys(self.tokenizer.model_input_names))
+__all__ = ["Qwen3TTSProcessor"]

qwen_tts/core/tokenizer_12hz/configuration_qwen3_tts_tokenizer_v2.py ADDED Viewed

	@@ -0,0 +1,172 @@

+# coding=utf-8
+# Copyright 2026 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Qwen3TTSTokenizerV2 model configuration"""
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+from transformers import MimiConfig
+logger = logging.get_logger(__name__)
+class Qwen3TTSTokenizerV2DecoderConfig(PretrainedConfig):
+    r"""
+    Configuration of the decoder sub-model of the Qwen3TTSTokenizerV2 tokenizer.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        codebook_size (`int`, *optional*, defaults to 2048):
+            Number of entries in each residual codebook used for acoustic token quantization.
+        hidden_size (`int`, *optional*, defaults to 1024):
+            Dimensionality of the hidden states and embeddings in the autoregressive transformer decoder.
+        latent_dim (`int`, *optional*, defaults to 1024):
+            Stored unchanged as `latent_dim`; where it is consumed is not visible in this class.
+        max_position_embeddings (`int`, *optional*, defaults to 8000):
+            Maximum sequence length that the autoregressive decoder can handle. Determines positional embedding size.
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period for rotary position embeddings (RoPE) applied to attention layers.
+        num_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the decoder.
+        num_key_value_heads (`int`, *optional*, defaults to 16):
+            Number of key and value attention heads used in grouped-query attention (if applicable).
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use bias in the attention projection layers.
+        sliding_window (`int`, *optional*, defaults to 72):
+            Window size for local attention mechanism, limiting attention context to improve efficiency.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimensionality of the feed-forward (intermediate) layer in each transformer block.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function used in the feed-forward layers. Supports `"silu"`, `"relu"`, `"gelu"`, etc.
+        layer_scale_initial_scale (`float`, *optional*, defaults to 0.01):
+            Initial value for LayerScale applied in transformer blocks, helping stabilize training.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-5):
+            Epsilon value for RMS normalization layers to prevent division by zero.
+        num_hidden_layers (`int`, *optional*, defaults to 8):
+            Number of transformer blocks in the autoregressive decoder.
+        num_quantizers (`int`, *optional*, defaults to 16):
+            Number of residual vector quantizers used in the vocoder for fine-grained audio reconstruction.
+        upsample_rates (`Tuple[int]`, *optional*, defaults to `(8, 5, 4, 3)`):
+            Rate at which features are upsampled in the final waveform synthesis stage.
+        upsampling_ratios (`Tuple[int]`, *optional*, defaults to `(2, 2)`):
+            Ratios used in transposed convolutional layers to progressively upsample feature maps to waveform.
+        decoder_dim (`int`, *optional*, defaults to 1536):
+            Final dimensionality of the decoder's output before waveform generation.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            Dropout probability applied to attention weights in the decoder.
+    """
+    def __init__(
+        self,
+        codebook_size=2048,
+        hidden_size=1024,
+        latent_dim=1024,
+        max_position_embeddings=8000,
+        rope_theta=10000,
+        num_attention_heads=16,
+        num_key_value_heads=16,
+        attention_bias=False,
+        sliding_window=72,
+        intermediate_size=3072,
+        hidden_act="silu",
+        layer_scale_initial_scale=0.01,
+        rms_norm_eps=1e-5,
+        num_hidden_layers=8,
+        num_quantizers=16,
+        upsample_rates=(8, 5, 4, 3),
+        upsampling_ratios=(2, 2),
+        decoder_dim=1536,
+        attention_dropout=0.0,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        # Quantizer / codebook settings.
+        self.codebook_size = codebook_size
+        self.num_quantizers = num_quantizers
+        # Transformer dimensions and depth.
+        self.hidden_size = hidden_size
+        self.latent_dim = latent_dim
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.hidden_act = hidden_act
+        self.layer_scale_initial_scale = layer_scale_initial_scale
+        self.rms_norm_eps = rms_norm_eps
+        # Attention and positional settings.
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.sliding_window = sliding_window
+        self.max_position_embeddings = max_position_embeddings
+        self.rope_theta = rope_theta
+        # Upsampling / waveform-synthesis settings.
+        self.upsample_rates = upsample_rates
+        self.upsampling_ratios = upsampling_ratios
+        self.decoder_dim = decoder_dim
+    @property
+    def layer_types(self):
+        """
+        One `"sliding_attention"` entry per hidden layer: every layer in code2wav uses sliding attention.
+        """
+        return ["sliding_attention" for _ in range(self.num_hidden_layers)]
+class Qwen3TTSTokenizerV2Config(PretrainedConfig):
+    """
+    This is the configuration class to store the configuration of a Qwen3TTSTokenizerV2 model. It is used to instantiate a
+    Qwen3TTSTokenizerV2Model model according to the specified sub-models configurations, defining the model architecture.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        encoder_config (`dict` or [`MimiConfig`], *optional*):
+            Configuration of the underlying encoder sub-model. A `dict` is expanded into a [`MimiConfig`], an existing
+            instance is used as-is, and `None` yields a default-constructed [`MimiConfig`].
+        decoder_config (`dict` or [`Qwen3TTSTokenizerV2DecoderConfig`], *optional*):
+            Configuration of the underlying decoder sub-model, handled the same way as `encoder_config`.
+        encoder_valid_num_quantizers (`int`, *optional*, defaults to 16):
+            Stored unchanged as `encoder_valid_num_quantizers`.
+        input_sample_rate (`int`, *optional*, defaults to 24000):
+            Stored unchanged as `input_sample_rate`.
+        output_sample_rate (`int`, *optional*, defaults to 24000):
+            Stored unchanged as `output_sample_rate`.
+        decode_upsample_rate (`int`, *optional*, defaults to 1920):
+            Stored unchanged as `decode_upsample_rate`.
+        encode_downsample_rate (`int`, *optional*, defaults to 1920):
+            Stored unchanged as `encode_downsample_rate`.
+    """
+    model_type = "qwen3_tts_tokenizer_12hz"
+    sub_configs = {
+        "encoder_config": MimiConfig,
+        "decoder_config": Qwen3TTSTokenizerV2DecoderConfig,
+    }
+    def __init__(
+        self,
+        encoder_config=None,
+        decoder_config=None,
+        encoder_valid_num_quantizers=16,
+        input_sample_rate=24000,
+        output_sample_rate=24000,
+        decode_upsample_rate=1920,
+        encode_downsample_rate=1920,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        # Each sub-config may arrive as None (use defaults), as an already-built config object, or as a dict of
+        # keyword arguments; unpacking a config object with `**` would fail, so instances are kept as-is.
+        if encoder_config is None:
+            logger.info("encoder_config is None. Initializing encoder with default values")
+            self.encoder_config = MimiConfig()
+        elif isinstance(encoder_config, MimiConfig):
+            self.encoder_config = encoder_config
+        else:
+            self.encoder_config = MimiConfig(**encoder_config)
+        if decoder_config is None:
+            logger.info("decoder_config is None. Initializing decoder with default values")
+            self.decoder_config = Qwen3TTSTokenizerV2DecoderConfig()
+        elif isinstance(decoder_config, Qwen3TTSTokenizerV2DecoderConfig):
+            self.decoder_config = decoder_config
+        else:
+            self.decoder_config = Qwen3TTSTokenizerV2DecoderConfig(**decoder_config)
+        self.encoder_valid_num_quantizers = encoder_valid_num_quantizers
+        self.input_sample_rate = input_sample_rate
+        self.output_sample_rate = output_sample_rate
+        self.decode_upsample_rate = decode_upsample_rate
+        self.encode_downsample_rate = encode_downsample_rate
+__all__ = ["Qwen3TTSTokenizerV2Config", "Qwen3TTSTokenizerV2DecoderConfig"]

qwen_tts/core/tokenizer_12hz/modeling_qwen3_tts_tokenizer_v2.py ADDED Viewed

	@@ -0,0 +1,1025 @@

+# coding=utf-8
+# Copyright 2026 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Qwen3TTSTokenizerV2 model."""
+import math
+from dataclasses import dataclass
+from typing import Callable, Optional, Union, List
+import numpy as np
+import torch
+from torch import nn
+from torch.nn import Parameter
+from torch.nn import functional as F
+from transformers import MimiConfig, MimiModel
+from transformers.activations import ACT2FN
+from transformers.cache_utils import Cache, DynamicCache
+from transformers.integrations import use_kernel_forward_from_hub
+from transformers.masking_utils import (
+    create_causal_mask,
+    create_sliding_window_causal_mask,
+)
+from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
+from transformers.modeling_layers import GradientCheckpointingLayer
+from transformers.modeling_outputs import BaseModelOutputWithPast
+from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from transformers.processing_utils import Unpack
+from transformers.utils import ModelOutput, auto_docstring, logging
+from transformers.utils.deprecation import deprecate_kwarg
+from transformers.utils.generic import check_model_inputs
+from .configuration_qwen3_tts_tokenizer_v2 import (
+    Qwen3TTSTokenizerV2Config,
+    Qwen3TTSTokenizerV2DecoderConfig,
+)
+logger = logging.get_logger(__name__)
+@dataclass
+@auto_docstring
+class Qwen3TTSTokenizerV2EncoderOutput(ModelOutput):
+    r"""
+    audio_codes (`List[torch.LongTensor]`):
+        Discrete code embeddings computed using `model.encode`, each tensor has shape (codes_length_i, num_quantizers).
+    """
+    audio_codes: List[torch.LongTensor] = None
+@dataclass
+@auto_docstring
+class Qwen3TTSTokenizerV2DecoderOutput(ModelOutput):
+    r"""
+    audio_values (`List[torch.FloatTensor]`):
+        Decoded audio values, obtained using the decoder part of Qwen3TTSTokenizerV2.
+        Each tensor has shape (segment_length_i).
+    """
+    audio_values: List[torch.FloatTensor] = None
+def rotate_half(x):
+    """Split the last dimension at its midpoint and return `concat(-second_half, first_half)`."""
+    midpoint = x.shape[-1] // 2
+    first_half, second_half = x[..., :midpoint], x[..., midpoint:]
+    return torch.cat((-second_half, first_half), dim=-1)
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Rotate the query and key tensors with Rotary Position Embedding.
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            Dimension inserted into `cos` and `sin` so they broadcast against `q` and `k`. With `cos`/`sin` of shape
+            [batch_size, seq_len, head_dim], use `unsqueeze_dim=1` when `q` and `k` are
+            [batch_size, heads, seq_len, head_dim], and `unsqueeze_dim=2` when they are
+            [batch_size, seq_len, heads, head_dim].
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos_b = cos.unsqueeze(unsqueeze_dim)
+    sin_b = sin.unsqueeze(unsqueeze_dim)
+    def _rotate(states):
+        # Same formula for queries and keys: x * cos + rotate_half(x) * sin.
+        return (states * cos_b) + (rotate_half(states) * sin_b)
+    return _rotate(q), _rotate(k)
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    Repeat every key/value head `n_rep` times along the head axis, equivalent to
+    torch.repeat_interleave(x, dim=1, repeats=n_rep). The input goes from (batch, num_key_value_heads, seqlen,
+    head_dim) to (batch, num_attention_heads, seqlen, head_dim). With `n_rep == 1` the input is returned unchanged.
+    """
+    bsz, kv_heads, seq_len, dim = hidden_states.shape
+    if n_rep != 1:
+        # Insert a repeat axis after the head axis, broadcast it, then fold it into the head axis.
+        tiled = hidden_states.unsqueeze(2).expand(bsz, kv_heads, n_rep, seq_len, dim)
+        hidden_states = tiled.reshape(bsz, kv_heads * n_rep, seq_len, dim)
+    return hidden_states
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs,
+):
+    """
+    Plain matmul/softmax attention.
+    Keys and values are first repeated `module.num_key_value_groups` times along the head axis. The scaled scores
+    optionally receive an additive `attention_mask` (cropped on its last axis to the key length), are softmaxed in
+    float32, cast back to the query dtype and passed through dropout (active only when `module.training`).
+    Returns:
+        `(attn_output, attn_weights)` where `attn_output` has its axes 1 and 2 swapped relative to the
+        attention-weighted values and is contiguous.
+    """
+    groups = module.num_key_value_groups
+    expanded_keys = repeat_kv(key, groups)
+    expanded_values = repeat_kv(value, groups)
+    scores = torch.matmul(query, expanded_keys.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        kv_length = expanded_keys.shape[-2]
+        scores = scores + attention_mask[:, :, :, :kv_length]
+    # Softmax runs in float32 and the result is cast back to the query dtype.
+    probs = nn.functional.softmax(scores, dim=-1, dtype=torch.float32).to(query.dtype)
+    probs = nn.functional.dropout(probs, p=dropout, training=module.training)
+    context = torch.matmul(probs, expanded_values).transpose(1, 2).contiguous()
+    return context, probs
+@auto_docstring
+class Qwen3TTSTokenizerV2DecoderPreTrainedModel(PreTrainedModel):
+    # Base class that only declares class-level settings; it defines no methods of its own.
+    # NOTE(review): the attributes below are presumably read by the `PreTrainedModel` machinery in transformers;
+    # confirm their exact effect against the installed transformers version.
+    config: Qwen3TTSTokenizerV2DecoderConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _skip_keys_device_placement = "past_key_values"
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _can_compile_fullgraph = False
+    _supports_attention_backend = True
+class Qwen3TTSTokenizerV2CausalConvNet(nn.Module):
+    """1-D convolution that zero-pads on the left by `effective_kernel - stride` before convolving.
+    `kernel_size` on the instance is the dilated (effective) kernel size, `(kernel_size - 1) * dilation + 1`.
+    """
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        dilation=1,
+        stride=1,
+        groups=1,
+    ):
+        super().__init__()
+        self.conv = nn.Conv1d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            dilation=dilation,
+            groups=groups,
+        )
+        effective_kernel = (kernel_size - 1) * dilation + 1
+        self.stride = stride
+        self.kernel_size = effective_kernel
+        self.dilation = dilation
+        # Left padding applied in `forward`.
+        self.padding = effective_kernel - stride
+    def _get_extra_padding_for_conv1d(self, hidden_state: torch.Tensor) -> int:
+        """Return how many zeros to append on the right so the last (partial) frame is still produced."""
+        length = hidden_state.shape[-1]
+        frame_count = math.ceil((length - self.kernel_size + self.padding) / self.stride + 1)
+        ideal_length = (frame_count - 1) * self.stride + (self.kernel_size - self.padding)
+        return ideal_length - length
+    def forward(self, hidden_state):
+        right_extra = self._get_extra_padding_for_conv1d(hidden_state)
+        padded = F.pad(hidden_state, (self.padding, right_extra), mode="constant", value=0)
+        return self.conv(padded).contiguous()
+class Qwen3TTSTokenizerV2CausalTransConvNet(nn.Module):
+    """Transposed 1-D convolution whose output is trimmed by `kernel_size - stride` samples on each side."""
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1):
+        super().__init__()
+        self.conv = nn.ConvTranspose1d(in_channels, out_channels, kernel_size, stride=stride)
+        trim = math.ceil(kernel_size - stride)
+        self.left_pad = trim
+        # NOTE(review): the right trim equals the left trim, so `2 * (kernel_size - stride)` samples are removed in
+        # total. Splitting the total between the two sides may have been intended; confirm against upstream and
+        # existing checkpoints before changing, since it alters the output length.
+        self.right_pad = trim
+    def forward(self, hidden_state):
+        upsampled = self.conv(hidden_state)
+        stop = upsampled.shape[-1] - self.right_pad
+        return upsampled[..., self.left_pad : stop].contiguous()
+class Qwen3TTSTokenizerV2ConvNeXtBlock(nn.Module):
+    """Residual block: depthwise causal conv -> LayerNorm -> Linear (4x) -> GELU -> Linear -> per-channel scale."""
+    def __init__(self, dim: int):
+        super().__init__()
+        self.dwconv = Qwen3TTSTokenizerV2CausalConvNet(dim, dim, kernel_size=7, groups=dim, dilation=1)
+        self.norm = nn.LayerNorm(dim, eps=1e-6)
+        expanded_dim = 4 * dim
+        self.pwconv1 = nn.Linear(dim, expanded_dim)
+        self.act = nn.GELU()
+        self.pwconv2 = nn.Linear(expanded_dim, dim)
+        # Learnable per-channel scale on the branch output, initialised to 1e-6.
+        self.gamma = nn.Parameter(1e-6 * torch.ones(dim))
+    def forward(self, hidden_states):
+        residual = hidden_states
+        # The conv runs channels-first; the norm and linear layers act on the last axis, so swap axes 1 and 2.
+        branch = self.dwconv(hidden_states).permute(0, 2, 1)
+        branch = self.pwconv2(self.act(self.pwconv1(self.norm(branch))))
+        branch = (self.gamma * branch).permute(0, 2, 1)
+        return residual + branch
+class Qwen3TTSTokenizerV2DecoderRotatoryEmbedding(nn.Module):
+    """Compute the rotary position-embedding `(cos, sin)` pair used by the decoder layers.
+
+    The RoPE variant is read from `config.rope_scaling` (key `rope_type`, falling back to
+    the older `type` key) and is "default" when the config carries no such dict.
+    """
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+    def __init__(self, config: Qwen3TTSTokenizerV2DecoderConfig, device=None):
+        super().__init__()
+        # BC: "rope_type" was originally "type"
+        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
+            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+        self.config = config
+        # Look up the initialiser for the selected RoPE variant; it returns the inverse
+        # frequencies and a scaling factor that is later applied to cos/sin.
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        # Non-persistent: the buffer is not written to the state dict.
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.original_inv_freq = self.inv_freq
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        # `x` is only consulted for its device and dtype.
+        # inv_freq is broadcast to (batch, n_freq, 1); position_ids becomes (batch, 1, seq)
+        # (assumes position_ids is 2-D — TODO confirm).
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        position_ids_expanded = position_ids[:, None, :].float()
+        # autocast is entered with device type "cpu" when the device is "mps".
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            # Duplicate the frequencies along the last axis before taking cos/sin.
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+        # Return in the dtype of `x`.
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+class Qwen3TTSTokenizerV2DecoderAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+    def __init__(self, config: Qwen3TTSTokenizerV2DecoderConfig, layer_idx):
+        super().__init__()
+        self.config = config
+        # Index passed to the key/value cache when it is updated in `forward`.
+        self.layer_idx = layer_idx
+        # Use `config.head_dim` when the config defines it, otherwise hidden_size / num heads.
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+        # Number of query heads per key/value head.
+        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
+        self.scaling = self.head_dim**-0.5
+        self.attention_dropout = config.attention_dropout
+        self.is_causal = True
+        self.q_proj = nn.Linear(
+            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.k_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.v_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.o_proj = nn.Linear(
+            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
+        )
+        # Identity modules: no normalisation is applied to queries or keys.
+        self.q_norm = nn.Identity()
+        self.k_norm = nn.Identity()
+        self.sliding_window = config.sliding_window
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        # All leading axes of the input; the last axis is split into (heads, head_dim).
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)
+        # Project, split into heads, then swap axes 1 and 2
+        # (assumes hidden_states is (batch, seq, hidden) — TODO confirm).
+        query_states = self.q_norm(self.q_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
+        key_states = self.k_norm(self.k_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        # `position_embeddings` is the (cos, sin) pair applied to queries and keys.
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+        # Default to the eager implementation unless the config selects another backend.
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            sliding_window=self.sliding_window,  # diff with Llama
+            **kwargs,
+        )
+        # Merge the per-head outputs back into one feature axis and project out.
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+class Qwen3TTSTokenizerV2DecoderMlp(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+        self.act_fn = ACT2FN[config.hidden_act]
+    def forward(self, x):
+        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+        return down_proj
+@use_kernel_forward_from_hub("RMSNorm")
+class Qwen3TTSTokenizerV2DecoderRMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps: float = 1e-6) -> None:
+        """
+        Qwen3TTSTokenizerV2DecoderRMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+class Qwen3TTSTokenizerV2DecoderLayerScale(nn.Module):
+    """Layer scale from [Touvron et al 2021] (https://huggingface.co/papers/2103.17239).
+    This rescales diagonally the residual outputs close to 0, with a learnt scale.
+    """
+    def __init__(self, config):
+        super().__init__()
+        channels = config.hidden_size
+        initial_scale = config.layer_scale_initial_scale
+        self.scale = nn.Parameter(torch.full((channels,), initial_scale, requires_grad=True))
+    def forward(self, x: torch.Tensor):
+        return self.scale * x
+class Qwen3TTSTokenizerV2DecoderTransformerLayer(GradientCheckpointingLayer):
+    """Pre-norm transformer layer: self-attention and MLP, each with a layer-scaled residual."""
+    def __init__(self, config: Qwen3TTSTokenizerV2DecoderConfig, layer_idx):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.self_attn = Qwen3TTSTokenizerV2DecoderAttention(config, layer_idx)
+        self.mlp = Qwen3TTSTokenizerV2DecoderMlp(config)
+        self.input_layernorm = Qwen3TTSTokenizerV2DecoderRMSNorm(config.hidden_size, config.rms_norm_eps)
+        self.post_attention_layernorm = Qwen3TTSTokenizerV2DecoderRMSNorm(config.hidden_size, config.rms_norm_eps)
+        self.self_attn_layer_scale = Qwen3TTSTokenizerV2DecoderLayerScale(config)
+        self.mlp_layer_scale = Qwen3TTSTokenizerV2DecoderLayerScale(config)
+        # Every layer is tagged as sliding attention; the parent model uses this tag
+        # to pick the attention mask for the layer.
+        self.attention_type = "sliding_attention"
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`, *optional*):
+                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
+                query_sequence_length, key_sequence_length)` if default attention is used.
+            position_ids (`torch.LongTensor`, *optional*):
+                Forwarded unchanged to the attention module.
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+                (see `past_key_values`).
+            past_key_values (`Cache`, *optional*): cached past key and value projection states
+            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+                Indices depicting the position of the input sequence tokens in the sequence
+            kwargs (`dict`, *optional*):
+                Extra keyword arguments forwarded to the attention module. `position_embeddings`
+                must be supplied here: the attention module requires it and this method does
+                not pass it explicitly.
+        Returns:
+            The updated hidden states as a single tensor. NOTE(review): the return annotation
+            still advertises a tuple, but only the tensor is returned.
+        """
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        # Self Attention
+        # The attention weights (second return value) are discarded.
+        hidden_states, _ = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+        hidden_states = residual + self.self_attn_layer_scale(hidden_states)
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + self.mlp_layer_scale(hidden_states)
+        return hidden_states
+@auto_docstring
+class Qwen3TTSTokenizerV2DecoderTransformerModel(Qwen3TTSTokenizerV2DecoderPreTrainedModel):
+    _can_record_outputs = {
+        "hidden_states": Qwen3TTSTokenizerV2DecoderTransformerLayer,
+        "attentions": Qwen3TTSTokenizerV2DecoderAttention,
+    }
+    def __init__(self, config: Qwen3TTSTokenizerV2DecoderConfig):
+        super().__init__(config)
+        self.layers = nn.ModuleList(
+            [Qwen3TTSTokenizerV2DecoderTransformerLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.norm = Qwen3TTSTokenizerV2DecoderRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.rotary_emb = Qwen3TTSTokenizerV2DecoderRotatoryEmbedding(config=config)
+        self.gradient_checkpointing = False
+        self.has_sliding_layers = "sliding_attention" in self.config.layer_types
+        self.window_size = config.sliding_window
+        self.input_proj = nn.Linear(config.latent_dim, config.hidden_size)
+        self.output_proj = nn.Linear(config.hidden_size, config.latent_dim)
+        # Initialize weights and apply final processing
+        self.post_init()
+    @check_model_inputs()
+    @auto_docstring
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        position_ids=None,
+        past_key_values=None,
+        inputs_embeds=None,
+        use_cache=None,
+        cache_position=None,
+        **kwargs,
+    ) -> BaseModelOutputWithPast:
+        if input_ids is not None:
+            raise ValueError("input_ids is not expected")
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+        inputs_embeds = self.input_proj(inputs_embeds)
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache(config=self.config)
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+        # It may already have been prepared by e.g. `generate`
+        if not isinstance(causal_mask_mapping := attention_mask, dict):
+            # Prepare mask arguments
+            mask_kwargs = {
+                "config": self.config,
+                "input_embeds": inputs_embeds,
+                "attention_mask": attention_mask,
+                "cache_position": cache_position,
+                "past_key_values": past_key_values,
+                "position_ids": position_ids,
+            }
+            # Create the masks
+            causal_mask_mapping = {
+                "full_attention": create_causal_mask(**mask_kwargs),
+            }
+            # The sliding window alternating layers are not always activated depending on the config
+            if self.has_sliding_layers:
+                causal_mask_mapping["sliding_attention"] = create_sliding_window_causal_mask(**mask_kwargs)
+        hidden_states = inputs_embeds
+        # create position embeddings to be shared across the decoder layers
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
+            hidden_states = decoder_layer(
+                hidden_states,
+                attention_mask=causal_mask_mapping[decoder_layer.attention_type],
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings,
+                **kwargs,
+            )
+        hidden_states = self.norm(hidden_states)
+        hidden_states = self.output_proj(hidden_states)
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values if use_cache else None,
+        )
+class SnakeBeta(nn.Module):
+    """
+    A modified Snake function which uses separate parameters for the magnitude of the periodic components
+    Shape:
+        - Input: (B, C, T)
+        - Output: (B, C, T), same shape as the input
+    Parameters:
+        - alpha - trainable parameter that controls frequency
+        - beta - trainable parameter that controls magnitude
+    References:
+        - This activation function is a modified version based on this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
+        https://huggingface.co/papers/2006.08195
+    """
+    def __init__(self, in_features, alpha=1.0):
+        super().__init__()
+        self.in_features = in_features
+        # initialize alpha
+        self.alpha = Parameter(torch.zeros(in_features) * alpha)
+        self.beta = Parameter(torch.zeros(in_features) * alpha)
+        self.no_div_by_zero = 0.000000001
+    def forward(self, hidden_states):
+        """
+        Forward pass of the function.
+        Applies the function to the input elementwise.
+        SnakeBeta ∶= x + 1/b * sin^2 (xa)
+        """
+        alpha = self.alpha.unsqueeze(0).unsqueeze(-1)  # line up with x to [B, C, T]
+        beta = self.beta.unsqueeze(0).unsqueeze(-1)
+        alpha = torch.exp(alpha)
+        beta = torch.exp(beta)
+        hidden_states = hidden_states + (1.0 / (beta + self.no_div_by_zero)) * torch.pow(
+            torch.sin(hidden_states * alpha), 2
+        )
+        return hidden_states
+class Qwen3TTSTokenizerV2DecoderDecoderResidualUnit(nn.Module):
+    def __init__(self, dim: int = 16, dilation: int = 1):
+        super().__init__()
+        self.act1 = SnakeBeta(dim)
+        self.conv1 = Qwen3TTSTokenizerV2CausalConvNet(dim, dim, kernel_size=7, dilation=dilation)
+        self.act2 = SnakeBeta(dim)
+        self.conv2 = Qwen3TTSTokenizerV2CausalConvNet(dim, dim, kernel_size=1)
+    def forward(self, hidden_state):
+        residual = hidden_state
+        hidden_state = self.act1(hidden_state)
+        hidden_state = self.conv1(hidden_state)
+        hidden_state = self.act2(hidden_state)
+        hidden_state = self.conv2(hidden_state)
+        return hidden_state + residual
+class Qwen3TTSTokenizerV2DecoderDecoderBlock(Qwen3TTSTokenizerV2DecoderPreTrainedModel):
+    def __init__(self, config: Qwen3TTSTokenizerV2DecoderConfig, layer_idx):
+        super().__init__(config)
+        in_dim = config.decoder_dim // 2**layer_idx
+        out_dim = config.decoder_dim // 2 ** (layer_idx + 1)
+        upsample_rate = config.upsample_rates[layer_idx]
+        block = [
+            SnakeBeta(in_dim),
+            Qwen3TTSTokenizerV2CausalTransConvNet(in_dim, out_dim, 2 * upsample_rate, upsample_rate),
+        ]
+        for dilation in (1, 3, 9):
+            block.append(Qwen3TTSTokenizerV2DecoderDecoderResidualUnit(out_dim, dilation))
+        self.block = nn.ModuleList(block)
+    def forward(self, hidden):
+        for block in self.block:
+            hidden = block(hidden)
+        return hidden
+class EuclideanCodebook(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        codebook_size: int,
+        epsilon: float = 1e-5,
+    ):
+        super().__init__()
+        self.dim = dim
+        self.codebook_size = codebook_size
+        self.epsilon = epsilon
+        self.cluster_usage = nn.Parameter(torch.ones(codebook_size))
+        self.embedding_sum = nn.Parameter(torch.zeros(codebook_size, dim))
+    def decode(self, codes: torch.Tensor) -> torch.Tensor:
+        embedding = self.embedding_sum / self.cluster_usage.clamp(min=self.epsilon)[:, None]
+        quantized = F.embedding(codes, embedding)
+        return quantized
+class VectorQuantization(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        codebook_size: int,
+        codebook_dim: Optional[int] = None,
+        epsilon: float = 1e-5,
+    ):
+        super().__init__()
+        if codebook_dim is None:
+            codebook_dim = dim
+        requires_projection = codebook_dim != dim
+        self.project_out = (
+            nn.Linear(codebook_dim, dim) if requires_projection else nn.Identity()
+        )
+        self.epsilon = epsilon
+        self._codebook = EuclideanCodebook(
+            dim=codebook_dim,
+            codebook_size=codebook_size,
+            epsilon=epsilon
+        )
+        self.codebook_size = codebook_size
+    def decode(self, codes: torch.Tensor) -> torch.Tensor:
+        quantized = self._codebook.decode(codes)
+        quantized = self.project_out(quantized)
+        quantized = quantized.transpose(1, 2)
+        return quantized
+class ResidualVectorQuantization(nn.Module):
+    def __init__(self, *, num_quantizers: int, **kwargs):
+        super().__init__()
+        self.layers = nn.ModuleList(
+            [VectorQuantization(**kwargs) for _ in range(num_quantizers)]
+        )
+    def decode(self, codes: torch.Tensor) -> torch.Tensor:
+        quantized = torch.zeros([1], device=codes.device)[0]
+        for idx, layer_codes in enumerate(codes):
+            layer = self.layers[idx]
+            assert isinstance(layer, VectorQuantization)
+            quantized = quantized + layer.decode(layer_codes)
+        return quantized
+class ResidualVectorQuantizer(nn.Module):
+    def __init__(
+        self,
+        dimension: int = 128,
+        input_dimension: Optional[int] = None,
+        output_dimension: Optional[int] = None,
+        n_q: int = 8,
+        q_dropout: bool = False,
+        no_quantization_rate: float = 0.0,
+        bins: int = 1024,
+        decay: float = 0.99,
+        force_projection: bool = False,
+    ):
+        super().__init__()
+        self.max_n_q = n_q
+        self.n_q = n_q
+        self.q_dropout = q_dropout
+        self.no_quantization_rate = no_quantization_rate
+        self.dimension = dimension
+        self.input_dimension = input_dimension or dimension
+        self.output_dimension = output_dimension or dimension
+        self.bins = bins
+        self.decay = decay
+        self.input_proj: torch.nn.Module
+        self.output_proj: torch.nn.Module
+        if self.input_dimension == self.dimension and not force_projection:
+            self.input_proj = torch.nn.Identity()
+        else:
+            self.input_proj = torch.nn.Conv1d(
+                self.input_dimension, self.dimension, 1, bias=False
+            )
+        if self.output_dimension == self.dimension and not force_projection:
+            self.output_proj = torch.nn.Identity()
+        else:
+            self.output_proj = torch.nn.Conv1d(
+                self.dimension, self.output_dimension, 1, bias=False
+            )
+        self.vq = ResidualVectorQuantization(
+            dim=self.dimension,
+            codebook_size=self.bins,
+            num_quantizers=self.n_q
+        )
+    def decode(self, codes: torch.Tensor) -> torch.Tensor:
+        codes = codes.transpose(0, 1)
+        quantized = self.vq.decode(codes)
+        quantized = self.output_proj(quantized)
+        return quantized
+class SplitResidualVectorQuantizer(nn.Module):
+    """Residual Vector Quantizer with separate projections for the first quantizer and the rest.
+    Args:
+        n_q (int): Number of residual vector quantizers used.
+        n_semantic_q (int): Number of residual vector quantizers used for the semantic quantizer.
+        **kwargs: Arguments to the constructor of `ResidualVectorQuantizer` that are shared between both.
+    """
+    def __init__(
+        self,
+        *,
+        n_q: int = 8,
+        n_q_semantic: int = 1,
+        **kwargs,
+    ):
+        super().__init__()
+        assert n_q > n_q_semantic, (
+            f"Number of quantizers {n_q} must be larger "
+            f"than the number of semantic quantizers {n_q_semantic}."
+        )
+        self.max_n_q = n_q
+        self.n_q_semantic = n_q_semantic
+        self.n_q_acoustic = n_q - n_q_semantic
+        q_dropout = kwargs.pop("q_dropout", False)
+        self.rvq_first = ResidualVectorQuantizer(
+            n_q=n_q_semantic, force_projection=True, q_dropout=False, **kwargs
+        )
+        self.rvq_rest = ResidualVectorQuantizer(
+            n_q=n_q - n_q_semantic,
+            force_projection=True,
+            q_dropout=q_dropout,
+            **kwargs,
+        )
+    def decode(self, codes: torch.Tensor) -> torch.Tensor:
+        """Decode the given codes to the quantized representation."""
+        # codes is [B, K, T], with T frames, K nb of codebooks.
+        quantized = self.rvq_first.decode(codes[:, : self.n_q_semantic])
+        if codes.shape[1] > self.n_q_semantic:
+            quantized += self.rvq_rest.decode(codes[:, self.n_q_semantic :])
+        return quantized
+class Qwen3TTSTokenizerV2Decoder(Qwen3TTSTokenizerV2DecoderPreTrainedModel):
+    def __init__(self, config: Qwen3TTSTokenizerV2DecoderConfig):
+        super().__init__(config)
+        self.total_upsample = np.prod(config.upsample_rates + config.upsampling_ratios)
+        self.pre_transformer = Qwen3TTSTokenizerV2DecoderTransformerModel._from_config(config)
+        self.quantizer = SplitResidualVectorQuantizer(
+            dimension=config.codebook_dim // 2,
+            n_q=config.num_quantizers,
+            n_q_semantic=1,
+            bins=config.codebook_size,
+            input_dimension=config.codebook_dim,
+            output_dimension=config.codebook_dim,
+        )
+        self.pre_conv = Qwen3TTSTokenizerV2CausalConvNet(
+            config.codebook_dim,
+            config.latent_dim,
+            kernel_size=3,
+        )
+        upsample = []
+        for factor in config.upsampling_ratios:
+            upsample.append(
+                nn.ModuleList(
+                    [
+                        Qwen3TTSTokenizerV2CausalTransConvNet(config.latent_dim, config.latent_dim, factor, factor),
+                        Qwen3TTSTokenizerV2ConvNeXtBlock(config.latent_dim),
+                    ]
+                )
+            )
+        self.upsample = nn.ModuleList(upsample)
+        decoder = [Qwen3TTSTokenizerV2CausalConvNet(config.latent_dim, config.decoder_dim, 7)]
+        for i in range(len(config.upsample_rates)):
+            decoder.append(Qwen3TTSTokenizerV2DecoderDecoderBlock(config, i))
+        output_dim = config.decoder_dim // 2 ** len(config.upsample_rates)
+        decoder += [
+            SnakeBeta(output_dim),
+            Qwen3TTSTokenizerV2CausalConvNet(output_dim, 1, 7),
+        ]
+        self.decoder = nn.ModuleList(decoder)
+        self.post_init()
+    def forward(self, codes):
+        if codes.shape[1] != self.config.num_quantizers:
+            raise ValueError(f"Expected {self.config.num_quantizers} layer of codes, got {codes.shape[1]}")
+        hidden = self.quantizer.decode(codes)
+        hidden = self.pre_conv(hidden).transpose(1, 2)
+        hidden = self.pre_transformer(inputs_embeds=hidden).last_hidden_state
+        hidden = hidden.permute(0, 2, 1)
+        for blocks in self.upsample:
+            for block in blocks:
+                hidden = block(hidden)
+        wav = hidden
+        for block in self.decoder:
+            wav = block(wav)
+        return wav.clamp(min=-1, max=1)
+    def chunked_decode(self, codes, chunk_size=300, left_context_size=25):
+        wavs = []
+        start_index = 0
+        while start_index < codes.shape[-1]:
+            end_index = min(start_index + chunk_size, codes.shape[-1])
+            context_size = left_context_size if start_index - left_context_size > 0 else start_index
+            codes_chunk = codes[..., start_index - context_size : end_index]
+            wav_chunk = self(codes_chunk)
+            wavs.append(wav_chunk[..., context_size * self.total_upsample :])
+            start_index = end_index
+        return torch.cat(wavs, dim=-1)
+class Qwen3TTSTokenizerV2Encoder(MimiModel):
+    """`MimiModel` subclass used as the audio encoder of the tokenizer."""
+    def __init__(self, config: MimiConfig):
+        super().__init__(config)
+        self.config = config
+        # Overwrite these attributes with None after the parent constructor has run.
+        # NOTE(review): presumably this drops the Mimi decoding-side modules because only
+        # the encoding path is used here — confirm against MimiModel.
+        self.upsample = None
+        self.decoder_transformer = None
+        self.decoder = None
+        self.post_init()
+@auto_docstring
+class Qwen3TTSTokenizerV2PreTrainedModel(PreTrainedModel):
+    # Base class for the tokenizer model; it only sets class-level flags read by the
+    # `PreTrainedModel` machinery (exact semantics are defined there, not in this file).
+    config: Qwen3TTSTokenizerV2Config
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _skip_keys_device_placement = "past_key_values"
+    # Attention backends declared as supported.
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _can_compile_fullgraph = False
+    _supports_attention_backend = True
+@auto_docstring(
+    custom_intro="""
+    The Qwen3TTSTokenizerV2 model.
+    """
+)
+class Qwen3TTSTokenizerV2Model(Qwen3TTSTokenizerV2PreTrainedModel):
+    def __init__(self, config: Qwen3TTSTokenizerV2Config):
+        super().__init__(config)
+        self.config = config
+        self.encoder_valid_num_quantizers = config.encoder_valid_num_quantizers
+        self.input_sample_rate = config.input_sample_rate
+        self.output_sample_rate = config.output_sample_rate
+        self.decode_upsample_rate = config.decode_upsample_rate
+        self.encode_downsample_rate = config.encode_downsample_rate
+        self.encoder = Qwen3TTSTokenizerV2Encoder._from_config(self.config.encoder_config)
+        self.decoder = Qwen3TTSTokenizerV2Decoder._from_config(self.config.decoder_config)
+        self.post_init()
+    def get_model_type(self):
+        return self.config.model_type
+    def get_input_sample_rate(self):
+        return self.input_sample_rate
+    def get_output_sample_rate(self):
+        return self.output_sample_rate
+    def get_encode_downsample_rate(self):
+        return self.encode_downsample_rate
+    def get_decode_upsample_rate(self):
+        return self.decode_upsample_rate
+    def encode(
+        self,
+        input_values: torch.Tensor,
+        padding_mask: Optional[torch.Tensor] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor, Optional[torch.Tensor]], Qwen3TTSTokenizerV2EncoderOutput]:
+        """
+        Encodes the input audio waveform into discrete codes.
+        Args:
+            input_values (`torch.Tensor` of shape `(batch_size, sequence_length)`):
+                Float values of the input audio waveform.
+            padding_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`):
+                Indicates which inputs are to be ignored due to padding, where elements are either 1 for *not masked* or 0
+                for *masked*.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+        encoded_frames = self.encoder.encode(input_values=input_values.unsqueeze(1),
+                                             return_dict=True)
+        audio_codes = encoded_frames.audio_codes[:, :self.encoder_valid_num_quantizers]
+        audio_codes = [code[..., :-(-mask.sum() // self.encode_downsample_rate)].transpose(0, 1) for code, mask in zip(audio_codes, padding_mask)]
+        if not return_dict:
+            return (
+                audio_codes,
+            )
+        return Qwen3TTSTokenizerV2EncoderOutput(audio_codes)
+    def decode(
+        self,
+        audio_codes: torch.Tensor,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor, torch.Tensor], Qwen3TTSTokenizerV2DecoderOutput]:
+        """
+        Decodes the given frames into an output audio waveform.
+        Note that the output might be a bit bigger than the input. In that case, any extra steps at the end can be
+        trimmed.
+        Args:
+            audio_codes (`torch.LongTensor`  of shape `(batch_size, codes_length, num_quantizers)`, *optional*):
+                Discret code embeddings computed using `model.encode`.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+        audio_values = self.decoder.chunked_decode(audio_codes.transpose(1, 2)).squeeze(1)
+        audio_lengths = (audio_codes[..., 0] > 0).sum(1) * self.decode_upsample_rate
+        audio_values = [a[:l] for a, l in zip(audio_values, audio_lengths)]
+        if not return_dict:
+            return (
+                audio_values,
+            )
+        return Qwen3TTSTokenizerV2DecoderOutput(audio_values)
+# Names exported by `from <this module> import *`.
+__all__ = ["Qwen3TTSTokenizerV2Model", "Qwen3TTSTokenizerV2PreTrainedModel"]

qwen_tts/core/tokenizer_25hz/configuration_qwen3_tts_tokenizer_v1.py ADDED Viewed

	@@ -0,0 +1,332 @@

+# coding=utf-8
+# Copyright 2026 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Qwen3TTSTokenizerV1 model configuration"""
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+logger = logging.get_logger(__name__)
+class Qwen3TTSTokenizerV1DecoderDiTConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of the Qwen3TTSTokenizerV1DecoderToken2WavDiT.
+    It defines the architecture of the DiT model, which is used for generating mel-spectrograms from tokens.
+    Args:
+        hidden_size (`int`, *optional*, defaults to 1024):
+            The dimension of the model.
+        num_hidden_layers (`int`, *optional*, defaults to 22):
+            The number of transformer blocks in the DiT model.
+        num_attention_heads (`int`, *optional*, defaults to 16):
+            The number of attention heads in each transformer block.
+        ff_mult (`int`, *optional*, defaults to 2):
+            The multiplier for the feedforward layer in each transformer block.
+        emb_dim (`int`, *optional*, defaults to 512):
+            The dimension of the embedding layer.
+        head_dim (`int`, *optional*, defaults to 64):
+            The dimension of each attention head.
+        repeats (`int`, *optional*, defaults to 2):
+            The number of times the codec embeddings are repeated.
+        num_embeds (`int`, *optional*, defaults to 8193):
+            The number of unique embeddings in the codec.
+        mel_dim (`int`, *optional*, defaults to 80):
+            The dimension of the mel-spectrogram.
+        dropout (`float`, *optional*, defaults to 0.1):
+            The dropout rate for the transformer blocks.
+        enc_emb_dim (`int`, *optional*, defaults to 192):
+            The dimension of the pre-trained speaker embedding.
+        enc_dim (`int`, *optional*, defaults to 128):
+            The dimension of the encoder output.
+        enc_channels (`list[int]`, *optional*, defaults to `[256, 256, 256, 256, 768]`):
+            A list of output channels for each TDNN/SERes2Net layer in the encoder.
+        enc_kernel_sizes (`list[int]`, *optional*, defaults to `[5, 3, 3, 3, 1]`):
+            A list of kernel sizes for each layer in the encoder.
+        enc_dilations (`list[int]`, *optional*, defaults to `[1, 2, 3, 4, 1]`):
+            A list of dilations for each layer in the encoder.
+        enc_attention_channels (`int`, *optional*, defaults to 64):
+            The number of attention channels in the SqueezeExcitationBlock.
+        enc_res2net_scale (`int`, *optional*, defaults to 2):
+            The scale of the Res2Net block in the encoder.
+        enc_se_channels (`int`, *optional*, defaults to 64):
+            The number of output channels after squeeze in the SqueezeExcitationBlock.
+    """
+    model_type = "qwen3_tts_tokenizer_v1_decoder_dit"
+    def __init__(
+        self,
+        hidden_size=1024,
+        num_hidden_layers=22,
+        num_attention_heads=16,
+        ff_mult=2,
+        emb_dim=512,
+        head_dim=64,
+        rope_theta=10000.0,
+        max_position_embeddings=32768,
+        block_size=24,
+        look_ahead_layers=[10],
+        look_backward_layers=[0, 20],
+        repeats=2,
+        num_embeds=8193,
+        mel_dim=80,
+        dropout=0.1,
+        enc_emb_dim=192,
+        enc_dim=128,
+        enc_channels=[256, 256, 256, 256, 768],
+        enc_kernel_sizes=[5, 3, 3, 3, 1],
+        enc_dilations=[1, 2, 3, 4, 1],
+        enc_attention_channels=64,
+        enc_res2net_scale=2,
+        enc_se_channels=64,
+        **kwargs,
+    ):
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.ff_mult = ff_mult
+        self.emb_dim = emb_dim
+        self.head_dim = head_dim
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+        self.block_size = block_size
+        self.look_ahead_layers = look_ahead_layers
+        self.look_backward_layers = look_backward_layers
+        self.repeats = repeats
+        self.num_embeds = num_embeds
+        self.mel_dim = mel_dim
+        self.dropout = dropout
+        self.enc_emb_dim = enc_emb_dim
+        self.enc_dim = enc_dim
+        self.enc_channels = enc_channels
+        self.enc_kernel_sizes = enc_kernel_sizes
+        self.enc_dilations = enc_dilations
+        self.enc_attention_channels = enc_attention_channels
+        self.enc_res2net_scale = enc_res2net_scale
+        self.enc_se_channels = enc_se_channels
+        super().__init__(**kwargs)
+class Qwen3TTSTokenizerV1DecoderBigVGANConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of the Qwen3TTSTokenizerV1DecoderToken2WavBigVGAN module.
+    It defines the architecture of the BigVGAN model, which is used for converting mel-spectrograms to waveforms.
+    Args:
+        mel_dim (`int`, *optional*, defaults to 80):
+            The dimension of the mel-spectrogram.
+        upsample_initial_channel (`int`, *optional*, defaults to 1536):
+            The number of channels in the initial upsampling layer.
+        resblock_kernel_sizes (`list[int]`, *optional*, defaults to `[3, 7, 11]`):
+            A list of kernel sizes for each residual block.
+        resblock_dilation_sizes (`list[list[int]]`, *optional*, defaults to `[[1, 3, 5], [1, 3, 5], [1, 3, 5]]`):
+            A list of dilation sizes for each residual block.
+        upsample_rates (`list[int]`, *optional*, defaults to `[5, 3, 2, 2, 2, 2]`):
+            A list of upsampling rates for each upsampling layer.
+        upsample_kernel_sizes (`list[int]`, *optional*, defaults to `[11, 7, 4, 4, 4, 4]`):
+            A list of kernel sizes for each upsampling layer.
+    """
+    model_type = "qwen3_tts_tokenizer_v1_decoder_bigvgan"
+    def __init__(
+        self,
+        mel_dim=80,
+        upsample_initial_channel=1536,
+        resblock_kernel_sizes=[3, 7, 11],
+        resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+        upsample_rates=[5, 3, 2, 2, 2, 2],
+        upsample_kernel_sizes=[11, 7, 4, 4, 4, 4],
+        **kwargs,
+    ):
+        self.mel_dim = mel_dim
+        self.upsample_initial_channel = upsample_initial_channel
+        self.resblock_kernel_sizes = resblock_kernel_sizes
+        self.resblock_dilation_sizes = resblock_dilation_sizes
+        self.upsample_rates = upsample_rates
+        self.upsample_kernel_sizes = upsample_kernel_sizes
+        super().__init__(**kwargs)
+class Qwen3TTSTokenizerV1DecoderConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Qwen3TTSTokenizerV1DecoderConfig`].
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        dit_config ([`DiT_Args`], *optional*):
+            Configuration class for the Diffusion Transformer (DiT) module responsible for generating mel-spectrograms.
+        bigvgan_config ([`BigVGAN_Args`], *optional*):
+            Configuration class for the BigVGAN module responsible for converting mel-spectrograms to waveforms.
+    """
+    model_type = "qwen3_tts_tokenizer_v1_decoder"
+    sub_configs = {
+        "dit_config": Qwen3TTSTokenizerV1DecoderDiTConfig,
+        "bigvgan_config": Qwen3TTSTokenizerV1DecoderBigVGANConfig,
+    }
+    def __init__(self, dit_config=None, bigvgan_config=None, **kwargs):
+        if dit_config is None:
+            dit_config = {}
+        if bigvgan_config is None:
+            bigvgan_config = {}
+        self.dit_config = Qwen3TTSTokenizerV1DecoderDiTConfig(**dit_config)
+        self.bigvgan_config = Qwen3TTSTokenizerV1DecoderBigVGANConfig(**bigvgan_config)
+        super().__init__(**kwargs)
+class Qwen3TTSTokenizerV1EncoderConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of the Qwen3TTSTokenizerV1 Encoder.
+    The encoder typically takes mel-spectrogram features and produces high-level audio representations, then (optionally)
+    applies an Audio-VQ module (e.g., GRVQ) to discretize continuous representations into codes.
+    Args:
+        n_mels (`int`, *optional*, defaults to 128):
+            Number of mel bins in the input mel-spectrogram.
+        n_ctx (`int`, *optional*, defaults to 1500):
+            Maximum input sequence length (in frames/tokens) for the encoder.
+        n_state (`int`, *optional*, defaults to 1280):
+            Hidden size (model dimension) of the encoder transformer.
+        n_head (`int`, *optional*, defaults to 20):
+            Number of attention heads in each transformer layer.
+        n_layer (`int`, *optional*, defaults to 32):
+            Number of transformer layers.
+        n_window (`int`, *optional*, defaults to 100):
+            Window size used by the model for local attention / chunking (implementation-dependent).
+        output_dim (`int`, *optional*, defaults to 3584):
+            Output feature dimension produced by the encoder head (before/after projection, implementation-dependent).
+        grad_checkpointing (`bool`, *optional*, defaults to `False`):
+            Whether to enable gradient checkpointing to reduce memory usage during training.
+        enable_mp (`bool`, *optional*, defaults to `False`):
+            Whether to enable model parallel features (implementation-dependent).
+        audio_sequence_parallel (`bool`, *optional*, defaults to `False`):
+            Whether to enable sequence parallelism for audio branch (implementation-dependent).
+        audio_vq_type (`str`, *optional*, defaults to `"GRVQ"`):
+            Type of audio vector-quantization module. Common choices: `"GRVQ"`, `"RVQ"`, etc.
+        audio_vq_layers (`int`, *optional*, defaults to 6):
+            Number of VQ layers / quantizers (e.g., number of residual quantizers for RVQ/GRVQ-like designs).
+        audio_vq_codebook_size (`int`, *optional*, defaults to 32768):
+            Size of each codebook (number of entries).
+        audio_vq_codebook_dim (`int`, *optional*, defaults to 1280):
+            Dimension of codebook vectors (often equals encoder hidden size).
+        audio_vq_pe (`bool`, *optional*, defaults to `True`):
+            Whether to use positional encoding (or position embeddings) inside the VQ module.
+        audio_vq_ds_rate (`int`, *optional*, defaults to 2):
+            Downsampling rate applied before VQ (e.g., temporal downsample factor).
+    """
+    model_type = "qwen3_tts_tokenizer_v1_encoder"
+    def __init__(
+        self,
+        n_mels=128,
+        n_ctx=1500,
+        n_state=1280,
+        n_head=20,
+        n_layer=32,
+        n_window=100,
+        output_dim=3584,
+        grad_checkpointing=False,
+        enable_mp=False,
+        audio_sequence_parallel=False,
+        audio_vq_type="GRVQ",
+        audio_vq_layers=6,
+        audio_vq_codebook_size=32768,
+        audio_vq_codebook_dim=1280,
+        audio_vq_pe=True,
+        audio_vq_ds_rate=2,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.n_mels = n_mels
+        self.n_ctx = n_ctx
+        self.n_state = n_state
+        self.n_head = n_head
+        self.n_layer = n_layer
+        self.n_window = n_window
+        self.output_dim = output_dim
+        self.grad_checkpointing = grad_checkpointing
+        self.enable_mp = enable_mp
+        self.audio_sequence_parallel = audio_sequence_parallel
+        self.audio_vq_type = audio_vq_type
+        self.audio_vq_layers = audio_vq_layers
+        self.audio_vq_codebook_size = audio_vq_codebook_size
+        self.audio_vq_codebook_dim = audio_vq_codebook_dim
+        self.audio_vq_pe = audio_vq_pe
+        self.audio_vq_ds_rate = audio_vq_ds_rate
+class Qwen3TTSTokenizerV1Config(PretrainedConfig):
+    """
+    This is the configuration class to store the configuration of a [`Qwen3TTSTokenizerV1Config`]. It is used to instantiate a Qwen3TTSTokenizerV1Model
+    model according to the specified sub-models configurations, defining the model architecture.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        encoder_config (`dict`, *optional*): Configuration of the underlying encoder sub-model.
+        decoder_config (`dict`, *optional*): Configuration of the underlying decoder sub-model.
+    """
+    model_type = "qwen3_tts_tokenizer_25hz"
+    sub_configs = {
+        "encoder_config": Qwen3TTSTokenizerV1EncoderConfig,
+        "decoder_config": Qwen3TTSTokenizerV1DecoderConfig,
+    }
+    def __init__(
+        self,
+        encoder_config=None,
+        decoder_config=None,
+        input_sample_rate=24000,
+        output_sample_rate=24000,
+        decode_upsample_rate=1920,
+        encode_downsample_rate=1920,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        if encoder_config is None:
+            encoder_config = {}
+            logger.info("encoder_config is None. Initializing encoder with default values")
+        if decoder_config is None:
+            decoder_config = {}
+            logger.info("decoder_config is None. Initializing decoder with default values")
+        self.encoder_config = Qwen3TTSTokenizerV1EncoderConfig(**encoder_config)
+        self.decoder_config = Qwen3TTSTokenizerV1DecoderConfig(**decoder_config)
+        self.input_sample_rate = input_sample_rate
+        self.output_sample_rate = output_sample_rate
+        self.decode_upsample_rate = decode_upsample_rate
+        self.encode_downsample_rate = encode_downsample_rate
+# Names exported by `from <this module> import *`: the top-level tokenizer config and its sub-configs.
+__all__ = [
+    "Qwen3TTSTokenizerV1Config",
+    "Qwen3TTSTokenizerV1EncoderConfig",
+    "Qwen3TTSTokenizerV1DecoderConfig",
+    "Qwen3TTSTokenizerV1DecoderBigVGANConfig",
+    "Qwen3TTSTokenizerV1DecoderDiTConfig"
+]

qwen_tts/core/tokenizer_25hz/modeling_qwen3_tts_tokenizer_v1.py ADDED Viewed

	@@ -0,0 +1,1528 @@

+# coding=utf-8
+# Copyright 2026 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Qwen3TTSTokenizerV1 model."""
+import math
+from dataclasses import dataclass
+from typing import Optional, Union, List
+import numpy as np
+import torch
+from torch import nn
+from torch.nn import Parameter
+from torch.nn import functional as F
+from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from transformers.utils import ModelOutput, auto_docstring, logging
+from transformers.utils.hub import cached_file
+from torch.nn.utils.rnn import pad_sequence
+from .vq.whisper_encoder import get_mel_audio, get_T_after_cnn
+from .vq.speech_vq import WhisperEncoderVQ, XVectorExtractor
+from .configuration_qwen3_tts_tokenizer_v1 import (
+    Qwen3TTSTokenizerV1Config,
+    Qwen3TTSTokenizerV1EncoderConfig,
+    Qwen3TTSTokenizerV1DecoderConfig,
+    Qwen3TTSTokenizerV1DecoderBigVGANConfig,
+    Qwen3TTSTokenizerV1DecoderDiTConfig
+)
+logger = logging.get_logger(__name__)
+@dataclass
+@auto_docstring
+class Qwen3TTSTokenizerV1EncoderOutput(ModelOutput):
+    r"""
+    audio_codes (`List[torch.LongTensor]`):
+        Discrete codes computed using `model.encode`, each tensor has shape (codes_length_i,).
+    xvectors (`List[torch.FloatTensor]`):
+        X-vector embeddings computed using `model.encode`, each tensor has shape (xvector_dim,).
+    ref_mels (`List[torch.FloatTensor]`):
+        Reference mel spectrogram computed using `model.encode`, each tensor has shape (mel_length_i, mel_dim,).
+    """
+    # All three fields default to None, so an output can be built with any subset of them.
+    audio_codes: List[torch.LongTensor] = None
+    xvectors: List[torch.FloatTensor] = None
+    ref_mels: List[torch.FloatTensor] = None
+@dataclass
+@auto_docstring
+class Qwen3TTSTokenizerV1DecoderOutput(ModelOutput):
+    r"""
+    audio_values (`List[torch.FloatTensor]`):
+        Decoded audio values, obtained using the decoder part of Qwen3TTSTokenizerV1.
+        Each tensor has shape (segment_length_i).
+    """
+    # One waveform tensor per decoded item (per the docstring above); None until set.
+    audio_values: List[torch.FloatTensor] = None
+# Base class that pairs `PreTrainedModel` with `Qwen3TTSTokenizerV1DecoderConfig` and sets class-level flags.
+# NOTE(review): the flags below are presumably read by the transformers loading/attention machinery; their
+# exact effect is not visible in this file.
+@auto_docstring
+class Qwen3TTSTokenizerV1DecoderPreTrainedModel(PreTrainedModel):
+    config: Qwen3TTSTokenizerV1DecoderConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _skip_keys_device_placement = "past_key_values"
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    # Full-graph compilation is explicitly declared unsupported.
+    _can_compile_fullgraph = False
+    _supports_attention_backend = True
+# Base class that pairs `PreTrainedModel` with `Qwen3TTSTokenizerV1EncoderConfig` and sets class-level flags.
+# NOTE(review): the flags below are presumably read by the transformers loading/attention machinery; their
+# exact effect is not visible in this file.
+@auto_docstring
+class Qwen3TTSTokenizerV1EncoderPreTrainedModel(PreTrainedModel):
+    config: Qwen3TTSTokenizerV1EncoderConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _skip_keys_device_placement = "past_key_values"
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    # Full-graph compilation is explicitly declared unsupported.
+    _can_compile_fullgraph = False
+    _supports_attention_backend = True
+class Qwen3TTSTokenizerV1DecoderDiTRotaryEmbedding(nn.Module):
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+    def __init__(self, dim, base=10000):
+        super().__init__()
+        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
+        self.register_buffer("inv_freq", inv_freq)
+    def forward(self, x):
+        batch_size, seq_len = x.shape[0], x.shape[1]
+        t = torch.arange(seq_len, device=x.device)
+        device_type = x.device.type
+        device_type = device_type if device_type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):
+            freqs = t.unsqueeze(1).float() @ self.inv_freq.unsqueeze(0).float()
+            freqs = torch.stack((freqs, freqs), dim=-1)
+            freqs = freqs.reshape(*freqs.shape[:-2], -1)
+            freqs = freqs.repeat(batch_size, *([1] * freqs.dim()))
+            cos = freqs.cos()
+            sin = freqs.sin()
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+class TimeDelayNetBlock(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        dilation,
+    ):
+        super().__init__()
+        self.conv = nn.Conv1d(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            dilation=dilation,
+            padding="same",
+            padding_mode="reflect",
+        )
+        self.activation = nn.ReLU()
+    def forward(self, hidden_states: torch.Tensor):
+        return self.activation(self.conv(hidden_states))
+class Res2NetBlock(torch.nn.Module):
+    def __init__(self, in_channels, out_channels, scale=8, kernel_size=3, dilation=1):
+        super().__init__()
+        in_channel = in_channels // scale
+        hidden_channel = out_channels // scale
+        self.blocks = nn.ModuleList(
+            [
+                TimeDelayNetBlock(
+                    in_channel,
+                    hidden_channel,
+                    kernel_size=kernel_size,
+                    dilation=dilation,
+                )
+                for i in range(scale - 1)
+            ]
+        )
+        self.scale = scale
+    def forward(self, hidden_states):
+        outputs = []
+        for i, hidden_part in enumerate(torch.chunk(hidden_states, self.scale, dim=1)):
+            if i == 0:
+                output_part = hidden_part
+            elif i == 1:
+                output_part = self.blocks[i - 1](hidden_part)
+            else:
+                output_part = self.blocks[i - 1](hidden_part + output_part)
+            outputs.append(output_part)
+        output = torch.cat(outputs, dim=1)
+        return output
+class SqueezeExcitationBlock(nn.Module):
+    def __init__(self, in_channels, se_channels, out_channels):
+        super().__init__()
+        self.conv1 = nn.Conv1d(
+            in_channels=in_channels,
+            out_channels=se_channels,
+            kernel_size=1,
+            padding="same",
+            padding_mode="reflect",
+        )
+        self.relu = nn.ReLU(inplace=True)
+        self.conv2 = nn.Conv1d(
+            in_channels=se_channels,
+            out_channels=out_channels,
+            kernel_size=1,
+            padding="same",
+            padding_mode="reflect",
+        )
+        self.sigmoid = nn.Sigmoid()
+    def forward(self, hidden_states):
+        hidden_states_mean = hidden_states.mean(dim=2, keepdim=True)
+        hidden_states_mean = self.relu(self.conv1(hidden_states_mean))
+        hidden_states_mean = self.sigmoid(self.conv2(hidden_states_mean))
+        return hidden_states * hidden_states_mean
+class AttentiveStatisticsPooling(nn.Module):
+    """This class implements an attentive statistic pooling layer for each channel.
+    It returns the concatenated mean and std of the input tensor.
+    """
+    def __init__(self, channels, attention_channels=128):
+        super().__init__()
+        self.eps = 1e-12
+        self.tdnn = TimeDelayNetBlock(channels * 3, attention_channels, 1, 1)
+        self.tanh = nn.Tanh()
+        self.conv = nn.Conv1d(
+            in_channels=attention_channels,
+            out_channels=channels,
+            kernel_size=1,
+            padding="same",
+            padding_mode="reflect",
+        )
+    def _length_to_mask(self, length, max_len=None, dtype=None, device=None):
+        """Creates a binary mask for each sequence.
+        Reference: https://discuss.pytorch.org/t/how-to-generate-variable-length-mask/23397/3
+        Arguments
+        ---------
+        length : torch.LongTensor
+            Containing the length of each sequence in the batch. Must be 1D.
+        max_len : int
+            Max length for the mask, also the size of the second dimension.
+        dtype : torch.dtype, default: None
+            The dtype of the generated mask.
+        device: torch.device, default: None
+            The device to put the mask variable.
+        Returns
+        -------
+        mask : tensor
+            The binary mask.
+        """
+        if max_len is None:
+            max_len = length.max().long().item()  # using arange to generate mask
+        mask = torch.arange(max_len, device=length.device, dtype=length.dtype).expand(
+            len(length), max_len
+        ) < length.unsqueeze(1)
+        mask = torch.as_tensor(mask, dtype=dtype, device=device)
+        return mask
+    def _compute_statistics(self, x, m, dim=2):
+        mean = (m * x).sum(dim)
+        std = torch.sqrt((m * (x - mean.unsqueeze(dim)).pow(2)).sum(dim).clamp(self.eps))
+        return mean, std
+    def forward(self, hidden_states):
+        seq_length = hidden_states.shape[-1]
+        lengths = torch.ones(hidden_states.shape[0], device=hidden_states.device)
+        # Make binary mask of shape [N, 1, L]
+        mask = self._length_to_mask(
+            lengths * seq_length, max_len=seq_length, dtype=hidden_states.dtype, device=hidden_states.device
+        )
+        mask = mask.unsqueeze(1)
+        # Expand the temporal context of the pooling layer by allowing the
+        # self-attention to look at global properties of the utterance.
+        total = mask.sum(dim=2, keepdim=True)
+        mean, std = self._compute_statistics(hidden_states, mask / total)
+        mean = mean.unsqueeze(2).repeat(1, 1, seq_length)
+        std = std.unsqueeze(2).repeat(1, 1, seq_length)
+        attention = torch.cat([hidden_states, mean, std], dim=1)
+        # Apply layers
+        attention = self.conv(self.tanh(self.tdnn(attention)))
+        # Filter out zero-paddings
+        attention = attention.masked_fill(mask == 0, float("-inf"))
+        attention = F.softmax(attention, dim=2)
+        mean, std = self._compute_statistics(hidden_states, attention)
+        # Append mean and std of the batch
+        pooled_stats = torch.cat((mean, std), dim=1)
+        pooled_stats = pooled_stats.unsqueeze(2)
+        return pooled_stats
+class SqueezeExcitationRes2NetBlock(nn.Module):
+    """An implementation of building block in ECAPA-TDNN, i.e.,
+    TDNN-Res2Net-TDNN-SqueezeExcitationBlock.
+    """
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        res2net_scale=8,
+        se_channels=128,
+        kernel_size=1,
+        dilation=1,
+    ):
+        super().__init__()
+        self.out_channels = out_channels
+        self.tdnn1 = TimeDelayNetBlock(
+            in_channels,
+            out_channels,
+            kernel_size=1,
+            dilation=1,
+        )
+        self.res2net_block = Res2NetBlock(out_channels, out_channels, res2net_scale, kernel_size, dilation)
+        self.tdnn2 = TimeDelayNetBlock(
+            out_channels,
+            out_channels,
+            kernel_size=1,
+            dilation=1,
+        )
+        self.se_block = SqueezeExcitationBlock(out_channels, se_channels, out_channels)
+    def forward(self, hidden_state):
+        residual = hidden_state
+        hidden_state = self.tdnn1(hidden_state)
+        hidden_state = self.res2net_block(hidden_state)
+        hidden_state = self.tdnn2(hidden_state)
+        hidden_state = self.se_block(hidden_state)
+        return hidden_state + residual
+class ECAPA_TimeDelayNet(torch.nn.Module):
+    """An implementation of the speaker embedding model in a paper.
+    "ECAPA-TDNN: Emphasized Channel Attention, Propagation and Aggregation in
+    TDNN Based Speaker Verification" (https://huggingface.co/papers/2005.07143).
+    """
+    def __init__(self, config: Qwen3TTSTokenizerV1DecoderBigVGANConfig):
+        super().__init__()
+        if len(config.enc_channels) != len(config.enc_kernel_sizes) or len(config.enc_channels) != len(
+            config.enc_dilations
+        ):
+            raise ValueError("enc_channels, enc_kernel_sizes and enc_dilations should have same length")
+        self.channels = config.enc_channels
+        self.blocks = nn.ModuleList()
+        # The initial TDNN layer
+        self.blocks.append(
+            TimeDelayNetBlock(
+                config.mel_dim,
+                config.enc_channels[0],
+                config.enc_kernel_sizes[0],
+                config.enc_dilations[0],
+            )
+        )
+        # SE-Res2Net layers
+        for i in range(1, len(config.enc_channels) - 1):
+            self.blocks.append(
+                SqueezeExcitationRes2NetBlock(
+                    config.enc_channels[i - 1],
+                    config.enc_channels[i],
+                    res2net_scale=config.enc_res2net_scale,
+                    se_channels=config.enc_se_channels,
+                    kernel_size=config.enc_kernel_sizes[i],
+                    dilation=config.enc_dilations[i],
+                )
+            )
+        # Multi-layer feature aggregation
+        self.mfa = TimeDelayNetBlock(
+            config.enc_channels[-1],
+            config.enc_channels[-1],
+            config.enc_kernel_sizes[-1],
+            config.enc_dilations[-1],
+        )
+        # Attentive Statistical Pooling
+        self.asp = AttentiveStatisticsPooling(
+            config.enc_channels[-1],
+            attention_channels=config.enc_attention_channels,
+        )
+        # Final linear transformation
+        self.fc = nn.Conv1d(
+            in_channels=config.enc_channels[-1] * 2,
+            out_channels=config.enc_dim,
+            kernel_size=1,
+            padding="same",
+            padding_mode="reflect",
+        )
+    def forward(self, hidden_states):
+        # Minimize transpose for efficiency
+        hidden_states = hidden_states.transpose(1, 2)
+        hidden_states_list = []
+        for layer in self.blocks:
+            hidden_states = layer(hidden_states)
+            hidden_states_list.append(hidden_states)
+        # Multi-layer feature aggregation
+        hidden_states = torch.cat(hidden_states_list[1:], dim=1)
+        hidden_states = self.mfa(hidden_states)
+        # Attentive Statistical Pooling
+        hidden_states = self.asp(hidden_states)
+        # Final linear transformation
+        hidden_states = self.fc(hidden_states)
+        hidden_states = hidden_states.squeeze(-1)
+        return hidden_states
+class DiTInputEmbedding(nn.Module):
+    def __init__(self, config: Qwen3TTSTokenizerV1DecoderBigVGANConfig):
+        super().__init__()
+        self.proj = nn.Linear(
+            config.mel_dim + config.enc_dim + config.enc_emb_dim + config.emb_dim,
+            config.hidden_size,
+        )
+        self.spk_encoder = ECAPA_TimeDelayNet(config)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        speaker_embedding: torch.Tensor,
+        condition_vector: torch.Tensor,
+        code_embed: torch.Tensor,
+        drop_audio_cond: Optional[bool] = False,
+        code_embed_uncond: Optional[bool] = None,
+        apply_cfg: Optional[bool] = True,
+    ):
+        if apply_cfg:
+            hidden_states = torch.cat([hidden_states, hidden_states], dim=0)
+            speaker_embedding = torch.cat([speaker_embedding, torch.zeros_like(speaker_embedding)], dim=0)
+            condition_vector = torch.cat([condition_vector, torch.zeros_like(condition_vector)], dim=0)
+            code_embed = torch.cat([code_embed, code_embed_uncond], dim=0)
+        elif drop_audio_cond:  # cfg for cond audio
+            condition_vector = torch.zeros_like(condition_vector)
+            speaker_embedding = torch.zeros_like(speaker_embedding)
+        condition_vector = self.spk_encoder(condition_vector).unsqueeze(1).repeat(1, hidden_states.size(1), 1)
+        hidden_states = self.proj(torch.cat((hidden_states, condition_vector, code_embed, speaker_embedding), dim=-1))
+        return hidden_states
+# Transformer backbone using DiT blocks
+class DiTCodecEmbedding(nn.Module):
+    def __init__(self, codec_num_embeds, codec_dim, repeats):
+        super().__init__()
+        self.repeats = repeats
+        self.codec_embed = nn.Embedding(codec_num_embeds + 1, codec_dim)
+    def forward(self, code, drop_code=False):
+        if drop_code:
+            code = torch.zeros_like(code)
+        code_embed = self.codec_embed(code)
+        code_embed = torch.repeat_interleave(code_embed, repeats=self.repeats, dim=1)
+        return code_embed
+# AdaLayerNormZero
+# return with modulated x for attn input, and params for later mlp modulation
+class AdaLayerNormZero(nn.Module):
+    """Adaptive layer norm that derives six modulation tensors from a conditioning embedding."""
+    def __init__(self, dim):
+        super().__init__()
+        self.silu = nn.SiLU()
+        self.linear = nn.Linear(dim, dim * 6)
+        self.norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
+    def forward(self, hidden_states, emb=None):
+        """Normalise and modulate `hidden_states`; also return the parameters for the later MLP step.
+        Returns:
+            Tuple `(modulated, gate_msa, shift_mlp, scale_mlp, gate_mlp)`.
+        """
+        # `emb` is split along dim 1 into six equal chunks.
+        modulation = self.linear(self.silu(emb)).chunk(6, dim=1)
+        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = modulation
+        normed = self.norm(hidden_states)
+        modulated = normed * (1 + scale_msa.unsqueeze(1)) + shift_msa.unsqueeze(1)
+        return modulated, gate_msa, shift_mlp, scale_mlp, gate_mlp
+# AdaLayerNormZero for final layer
+# return only the modulated x, because there is no further mlp modulation
+class AdaLayerNormZero_Final(nn.Module):
+    """Adaptive layer norm for the output stage: produces only a scale and a shift."""
+    def __init__(self, dim):
+        super().__init__()
+        self.silu = nn.SiLU()
+        self.linear = nn.Linear(dim, dim * 2)
+        self.norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
+    def forward(self, hidden_states, emb):
+        """Return `norm(hidden_states)` scaled and shifted by parameters derived from `emb`."""
+        # The first chunk along dim 1 is the scale, the second the shift.
+        scale, shift = self.linear(self.silu(emb)).chunk(2, dim=1)
+        normed = self.norm(hidden_states)
+        return normed * (1 + scale).unsqueeze(1) + shift.unsqueeze(1)
+# FeedForward
+class DiTMLP(nn.Module):
+    """Feed-forward block: Linear -> tanh-approximated GELU -> Dropout -> Linear."""
+    def __init__(self, dim, mult=4, dropout=0.0):
+        super().__init__()
+        hidden_dim = int(dim * mult)
+        # Kept as a flat ModuleList so the parameters stay registered under `ff.0` and `ff.3`.
+        stages = [
+            nn.Linear(dim, hidden_dim),
+            nn.GELU(approximate="tanh"),
+            nn.Dropout(dropout),
+            nn.Linear(hidden_dim, dim),
+        ]
+        self.ff = nn.ModuleList(stages)
+    def forward(self, hidden_states):
+        """Run `hidden_states` through the four stages in order."""
+        expand, activation, drop, contract = self.ff
+        return contract(drop(activation(expand(hidden_states))))
+# Modified from Llama with a different rotate function; this will be fixed in the next release
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Rotate the query and key tensors with a rotary position embedding.
+    Adjacent feature pairs `(x[2i], x[2i + 1])` on the last dimension form the two components that are rotated
+    together (interleaved layout).
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            Dimension inserted into `cos` and `sin` so they broadcast against `q` and `k`. Use 1 when `q`/`k` are
+            laid out as [batch_size, heads, seq_len, head_dim] and 2 when they are
+            [batch_size, seq_len, heads, head_dim].
+    Returns:
+        `tuple(torch.Tensor)` holding the rotated query and the rotated key.
+    """
+    def rotate_pairs(x):
+        # Map every adjacent pair (a, b) to (-b, a) while keeping the interleaved layout.
+        evens, odds = x[..., 0::2], x[..., 1::2]
+        return torch.stack((-odds, evens), dim=-1).flatten(start_dim=-2)
+    cos_b = cos.unsqueeze(unsqueeze_dim)
+    sin_b = sin.unsqueeze(unsqueeze_dim)
+    rotated_q = (q * cos_b) + (rotate_pairs(q) * sin_b)
+    rotated_k = (k * cos_b) + (rotate_pairs(k) * sin_b)
+    return rotated_q, rotated_k
+class DiTAttention(nn.Module):
+    """Non-causal multi-head self-attention with rotary position embeddings.
+    The attention kernel is looked up in `ALL_ATTENTION_FUNCTIONS` by `config._attn_implementation`.
+    """
+    def __init__(self, config: Qwen3TTSTokenizerV1DecoderBigVGANConfig):
+        super().__init__()
+        self.config = config
+        hidden_size = config.hidden_size
+        self.dim = hidden_size
+        self.heads = config.num_attention_heads
+        self.inner_dim = config.head_dim * config.num_attention_heads
+        self.dropout = config.dropout
+        self.is_causal = False
+        self.to_q = nn.Linear(hidden_size, self.inner_dim)
+        self.to_k = nn.Linear(hidden_size, self.inner_dim)
+        self.to_v = nn.Linear(hidden_size, self.inner_dim)
+        # Index 0 is the output projection, index 1 the dropout applied after it.
+        self.to_out = nn.ModuleList([nn.Linear(self.inner_dim, hidden_size), nn.Dropout(config.dropout)])
+    def forward(
+        self,
+        hidden_states,  # noised input x
+        position_embeddings=None,  # rotary position embedding for x
+        attention_mask=None,
+    ) -> torch.Tensor:
+        """Attend over `hidden_states` and return the projected result.
+        Args:
+            hidden_states: Input whose first dimension is the batch.
+            position_embeddings: `(cos, sin)` pair consumed by `apply_rotary_pos_emb`.
+            attention_mask: Forwarded unchanged to the attention kernel.
+        """
+        batch_size = hidden_states.shape[0]
+        def split_heads(projected):
+            # [batch, seq, heads * head_dim] -> [batch, heads, seq, head_dim]
+            per_head = projected.shape[-1] // self.heads
+            return projected.view(batch_size, -1, self.heads, per_head).transpose(1, 2)
+        query = split_heads(self.to_q(hidden_states))
+        key = split_heads(self.to_k(hidden_states))
+        value = split_heads(self.to_v(hidden_states))
+        head_dim = key.shape[-1]
+        # NOTE(review): the original comment here said "only first head is applied with RoPE, will be fixed at
+        # next release", but `cos`/`sin` are broadcast over the head dimension, so every head is rotated.
+        # Confirm which behaviour the released weights expect.
+        cos, sin = position_embeddings
+        query, key = apply_rotary_pos_emb(query, key, cos, sin)
+        attention_fn = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attended, _ = attention_fn(
+            self,
+            query,
+            key,
+            value,
+            attention_mask=attention_mask,
+            is_causal=False,
+        )
+        # Merge the heads back into one feature dimension and restore the query dtype.
+        attended = attended.reshape(batch_size, -1, self.heads * head_dim).to(query.dtype)
+        out_proj, out_dropout = self.to_out[0], self.to_out[1]
+        return out_dropout(out_proj(attended))
+# time step conditioning embedding
+class SinusPositionEmbedding(nn.Module):
+    """Sinusoidal embedding of a 1-D tensor of scalar values (used for the diffusion time step)."""
+    def __init__(self, dim):
+        super().__init__()
+        self.dim = dim
+    def forward(self, hidden_states, scale=1000):
+        """Return `[sin(...), cos(...)]` features of width `2 * (self.dim // 2)` for each input value."""
+        half_dim = self.dim // 2
+        # Geometric frequency ladder running from 1 down to 1/10000.
+        decay = math.log(10000) / (half_dim - 1)
+        frequencies = torch.exp(torch.arange(half_dim, device=hidden_states.device).float() * -decay)
+        angles = scale * hidden_states.unsqueeze(1) * frequencies.unsqueeze(0)
+        features = torch.cat((angles.sin(), angles.cos()), dim=-1)
+        return features.type_as(hidden_states)
+class DiTTimestepEmbedding(nn.Module):
+    """Sinusoidal time-step features followed by a Linear -> SiLU -> Linear projection."""
+    def __init__(self, dim, freq_embed_dim=256):
+        super().__init__()
+        self.time_embed = SinusPositionEmbedding(freq_embed_dim)
+        projection = [nn.Linear(freq_embed_dim, dim), nn.SiLU(), nn.Linear(dim, dim)]
+        self.time_mlp = nn.ModuleList(projection)
+    def forward(self, timestep):
+        """Embed `timestep` and project it to width `dim`."""
+        # Cast the sinusoidal features back to the dtype of the incoming time step.
+        features = self.time_embed(timestep).to(timestep.dtype)
+        fc_in, activation, fc_out = self.time_mlp
+        return fc_out(activation(fc_in(features)))  # b d
+class DiTDecoderLayer(nn.Module):
+    """One DiT block: modulated block-masked self-attention followed by a modulated feed-forward."""
+    def __init__(self, config: Qwen3TTSTokenizerV1DecoderBigVGANConfig, look_ahead_block=0, look_backward_block=0):
+        super().__init__()
+        width = config.hidden_size
+        self.attn_norm = AdaLayerNormZero(width)
+        self.attn = DiTAttention(config)
+        self.look_ahead_block = look_ahead_block
+        self.look_backward_block = look_backward_block
+        self.ff_norm = nn.LayerNorm(width, elementwise_affine=False, eps=1e-6)
+        self.ff = DiTMLP(dim=width, mult=config.ff_mult, dropout=config.dropout)
+    def forward(
+        self, hidden_states, timestep, position_embeddings=None, block_diff=None
+    ):  # x: noised input, t: time embedding
+        """Apply the block to `hidden_states`.
+        Args:
+            hidden_states: Noised input.
+            timestep: Time embedding that drives every modulation parameter.
+            position_embeddings: Rotary `(cos, sin)` pair passed on to the attention module.
+            block_diff: Key-block index minus query-block index; compared against the look-back and
+                look-ahead limits to build the attention mask.
+        """
+        modulated, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.attn_norm(hidden_states, emb=timestep)
+        # A position may attend to blocks within [-look_backward_block, +look_ahead_block] of its own.
+        lowest = -float(self.look_backward_block)
+        highest = float(self.look_ahead_block)
+        visible = (block_diff >= lowest) & (block_diff <= highest)
+        attn_output = self.attn(
+            hidden_states=modulated,
+            position_embeddings=position_embeddings,
+            attention_mask=visible,
+        )
+        # Gated residual connection around the attention branch.
+        hidden_states = hidden_states + gate_msa[:, None] * attn_output
+        ff_input = self.ff_norm(hidden_states) * (1 + scale_mlp.unsqueeze(1)) + shift_mlp.unsqueeze(1)
+        # Gated residual connection around the feed-forward branch.
+        return hidden_states + gate_mlp[:, None] * self.ff(ff_input)
+class SnakeBeta(nn.Module):
+    """
+    Snake-style periodic activation with separate trainable frequency and magnitude parameters.
+    Both parameters are exponentiated in `forward` before use.
+    Shape:
+        - Input: (B, C, T)
+        - Output: (B, C, T), same shape as the input
+    Parameters:
+        - alpha - trainable parameter that controls frequency
+        - beta - trainable parameter that controls magnitude
+    References:
+        - This activation function is a modified version based on this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
+        https://huggingface.co/papers/2006.08195
+    """
+    def __init__(self, in_features, alpha=1.0):
+        super().__init__()
+        self.in_features = in_features
+        # Both parameters start as zeros (zeros multiplied by `alpha` stay zero), i.e. exp() gives 1.
+        self.alpha = Parameter(torch.zeros(in_features) * alpha)
+        self.beta = Parameter(torch.zeros(in_features) * alpha)
+        self.no_div_by_zero = 0.000000001
+    def forward(self, hidden_states):
+        """
+        Apply the activation elementwise.
+        SnakeBeta := x + 1/exp(beta) * sin^2(x * exp(alpha))
+        """
+        # Reshape the per-channel parameters to [1, C, 1] so they broadcast against [B, C, T].
+        frequency = torch.exp(self.alpha)[None, :, None]
+        magnitude = torch.exp(self.beta)[None, :, None]
+        periodic = torch.sin(hidden_states * frequency).pow(2)
+        return hidden_states + (1.0 / (magnitude + self.no_div_by_zero)) * periodic
+def kaiser_sinc_filter1d(cutoff, half_width, kernel_size):
+    """Build a 1D low-pass filter: a sinc kernel shaped by a Kaiser window.
+    Args:
+        cutoff (float): Normalized cutoff frequency (0 to 0.5).
+        half_width (float): Transition bandwidth.
+        kernel_size (int): Number of filter taps.
+    Returns:
+        torch.Tensor: A float32 tensor of shape (1, 1, kernel_size); all zeros when `cutoff` is 0, otherwise
+        normalized so the taps sum to 1.
+    """
+    half_size = kernel_size // 2
+    # Derive the Kaiser beta from the attenuation implied by the transition width.
+    delta_f = 4 * half_width
+    attenuation = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95
+    if attenuation > 50.0:
+        beta = 0.1102 * (attenuation - 8.7)
+    elif attenuation >= 21.0:
+        excess = attenuation - 21.0
+        beta = 0.5842 * excess ** 0.4 + 0.07886 * excess
+    else:
+        beta = 0.0
+    window = torch.kaiser_window(kernel_size, beta=beta, periodic=False, dtype=torch.float32)
+    # Tap positions are centred on zero; even-length kernels sit on half-sample offsets.
+    if kernel_size % 2 == 0:
+        positions = torch.arange(-half_size, half_size) + 0.5
+    else:
+        positions = torch.arange(kernel_size) - half_size
+    if cutoff == 0:
+        # Degenerate filter: keep the expected shape but pass nothing.
+        return torch.zeros((1, 1, kernel_size), dtype=torch.float32)
+    taps = 2 * cutoff * window * torch.sinc(2 * cutoff * positions)
+    # Unit DC gain, so a constant input is neither amplified nor attenuated.
+    taps = taps / taps.sum()
+    return taps.view(1, 1, kernel_size)
+class UpSample1d(nn.Module):
+    """Upsample a (B, C, T) signal by `ratio` using a Kaiser-windowed sinc filter.
+    Args:
+        ratio: Integer upsampling factor; also the stride of the transposed convolution.
+        kernel_size: Number of filter taps. Defaults to `int(6 * ratio // 2) * 2` when None.
+    """
+    def __init__(self, ratio=2, kernel_size=None):
+        super().__init__()
+        self.ratio = ratio
+        self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size
+        self.stride = ratio
+        self.pad = self.kernel_size // ratio - 1
+        self.pad_left = self.pad * self.stride + (self.kernel_size - self.stride) // 2
+        self.pad_right = self.pad * self.stride + (self.kernel_size - self.stride + 1) // 2
+        # Local name avoids shadowing the builtin `filter`; the buffer keeps its original name.
+        kernel = kaiser_sinc_filter1d(cutoff=0.5 / ratio, half_width=0.6 / ratio, kernel_size=self.kernel_size)
+        self.register_buffer("filter", kernel, persistent=False)
+    def forward(self, hidden_states):
+        """Return `hidden_states` upsampled along the last dimension."""
+        channels = hidden_states.shape[1]
+        hidden_states = F.pad(hidden_states, (self.pad, self.pad), mode="replicate")
+        # Depthwise transposed convolution: the same filter is applied to every channel; the result is
+        # multiplied by `ratio`.
+        hidden_states = self.ratio * F.conv_transpose1d(
+            hidden_states, self.filter.expand(channels, -1, -1), stride=self.stride, groups=channels
+        )
+        # Use an explicit end index: a `-self.pad_right` slice would return an empty tensor when
+        # `pad_right` is 0 (e.g. kernel_size == ratio).
+        end = hidden_states.shape[-1] - self.pad_right
+        return hidden_states[..., self.pad_left : end]
+class DownSample1d(nn.Module):
+    """Low-pass filter and downsample a (B, C, T) signal by `ratio`.
+    Args:
+        ratio: Integer downsampling factor; used as the convolution stride.
+        kernel_size: Number of filter taps. Defaults to `int(6 * ratio // 2) * 2` when None, the same
+            default `UpSample1d` uses.
+    Raises:
+        ValueError: If the derived cutoff `0.5 / ratio` falls outside [0, 0.5].
+    """
+    def __init__(self, ratio=2, kernel_size=None):
+        super().__init__()
+        # The declared default is None; resolve it before any arithmetic, otherwise `kernel_size % 2`
+        # raises a TypeError.
+        if kernel_size is None:
+            kernel_size = int(6 * ratio // 2) * 2
+        cutoff = 0.5 / ratio
+        half_width = 0.6 / ratio
+        if cutoff < 0.0:
+            raise ValueError("Minimum cutoff must be larger than zero.")
+        if cutoff > 0.5:
+            raise ValueError("A cutoff above 0.5 does not make sense.")
+        self.even = kernel_size % 2 == 0
+        # Even kernels pad one sample less on the left.
+        self.pad_left = kernel_size // 2 - int(self.even)
+        self.pad_right = kernel_size // 2
+        self.stride = ratio
+        # Local name avoids shadowing the builtin `filter`; the buffer keeps its original name.
+        kernel = kaiser_sinc_filter1d(cutoff, half_width, kernel_size)
+        self.register_buffer("filter", kernel, persistent=False)
+    def forward(self, hidden_states):
+        """Return `hidden_states` filtered and strided along the last dimension."""
+        channels = hidden_states.shape[1]
+        hidden_states = F.pad(hidden_states, (self.pad_left, self.pad_right), mode="replicate")
+        # Depthwise convolution: the same filter is applied to every channel.
+        out = F.conv1d(hidden_states, self.filter.expand(channels, -1, -1), stride=self.stride, groups=channels)
+        return out
+class TorchActivation1d(nn.Module):
+    """Apply an activation between an upsampling and a downsampling stage."""
+    def __init__(
+        self,
+        activation,
+        up_ratio: int = 2,
+        down_ratio: int = 2,
+        up_kernel_size: int = 12,
+        down_kernel_size: int = 12,
+    ):
+        """
+        Args:
+            activation: Callable applied to the upsampled signal.
+            up_ratio / up_kernel_size: Forwarded to `UpSample1d`.
+            down_ratio / down_kernel_size: Forwarded to `DownSample1d`.
+        Raises:
+            TypeError: If `activation` is not callable.
+        """
+        super().__init__()
+        if not callable(activation):
+            raise TypeError("Activation function must be callable")
+        self.act = activation
+        self.upsample = UpSample1d(up_ratio, up_kernel_size)
+        self.downsample = DownSample1d(down_ratio, down_kernel_size)
+    def forward(self, hidden_states):
+        """Upsample, activate, then downsample `hidden_states`."""
+        upsampled = self.upsample(hidden_states)
+        activated = self.act(upsampled)
+        return self.downsample(activated)
+class CausalConv1d(nn.Conv1d):
+    """`nn.Conv1d` that pads only on the left, so no output depends on later input positions."""
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # Number of past samples needed so the first output sees a full receptive field.
+        taps_minus_one = self.kernel_size[0] - 1
+        self.causal_padding = self.dilation[0] * taps_minus_one
+    def forward(self, x):
+        """Left-pad `x` by `self.causal_padding` and run the convolution."""
+        left_padded = F.pad(x, (self.causal_padding, 0))
+        return self._conv_forward(left_padded, self.weight, self.bias)
+class AMPBlock(torch.nn.Module):
+    """Residual block of three (activation, conv, activation, conv) stages.
+    The first convolution of every stage is a dilated `CausalConv1d`. `causal_type` selects the rest:
+        - '1': second convolutions are symmetric-padded `nn.Conv1d`; no pre-processing stage.
+        - '2': second convolutions are `CausalConv1d`, preceded by a symmetric-padded pre-conv and a
+          pre-activation.
+        - any other value: causal second convolutions and identity pre-processing.
+    """
+    def __init__(
+        self,
+        channels,
+        kernel_size=3,
+        dilation=(1, 3, 5),
+        causal_type='1',
+    ):
+        super().__init__()
+        # Only the first three dilation entries are used.
+        self.convs1 = nn.ModuleList(
+            [CausalConv1d(channels, channels, kernel_size, 1, dilation=dilation[idx]) for idx in range(3)]
+        )
+        if causal_type == '1':
+            second_stage = [
+                nn.Conv1d(
+                    channels,
+                    channels,
+                    kernel_size,
+                    1,
+                    dilation=1,
+                    padding=self._get_padding(kernel_size, 1),
+                )
+                for _ in range(3)
+            ]
+        else:
+            second_stage = [CausalConv1d(channels, channels, kernel_size, 1, dilation=1) for _ in range(3)]
+        self.convs2 = nn.ModuleList(second_stage)
+        self.num_layers = len(self.convs1) + len(self.convs2)  # total number of conv layers
+        # One activation per convolution; even indices pair with convs1, odd indices with convs2.
+        self.activations = nn.ModuleList(
+            [TorchActivation1d(activation=SnakeBeta(channels)) for _ in range(self.num_layers)]
+        )
+        if causal_type == '2':
+            self.pre_conv = nn.Conv1d(
+                channels,
+                channels,
+                kernel_size,
+                stride=1,
+                padding=self._get_padding(kernel_size, 1),
+            )
+            self.pre_act = TorchActivation1d(activation=SnakeBeta(channels))
+        else:
+            self.pre_conv = nn.Identity()
+            self.pre_act = nn.Identity()
+    def _get_padding(self, kernel_size, dilation=1):
+        """Return the per-side padding `int((kernel_size * dilation - dilation) / 2)`."""
+        return int((kernel_size * dilation - dilation) / 2)
+    def forward(self, x):
+        """Run the three stages, adding each stage's output to the running result `x`."""
+        hidden_states = self.pre_act(self.pre_conv(x))
+        for stage, (conv1, conv2) in enumerate(zip(self.convs1, self.convs2)):
+            act1 = self.activations[2 * stage]
+            act2 = self.activations[2 * stage + 1]
+            # `hidden_states` carries over from one stage to the next; it is not reset to `x`.
+            hidden_states = conv2(act2(conv1(act1(hidden_states))))
+            x = x + hidden_states
+        return x
+@auto_docstring
+class Qwen3TTSTokenizerV1DecoderBigVGANModel(Qwen3TTSTokenizerV1DecoderPreTrainedModel):
+    config: Qwen3TTSTokenizerV1DecoderBigVGANConfig
+    def __init__(self, config: Qwen3TTSTokenizerV1DecoderBigVGANConfig):
+        super().__init__(config)
+        self.num_residual_blocks = len(config.resblock_kernel_sizes)
+        self.num_upsample_layers = len(config.upsample_rates)
+        base_channels = config.upsample_initial_channel
+        self.conv_pre = nn.Conv1d(config.mel_dim, base_channels, 5, 1, padding=2)
+        # Removing extra ModuleList breaks official state dict
+        ups = []
+        for layer_idx, (stride, kernel_size) in enumerate(zip(config.upsample_rates, config.upsample_kernel_sizes)):
+            # The channel count halves with every upsampling layer.
+            transposed = nn.ConvTranspose1d(
+                base_channels // (2**layer_idx),
+                base_channels // (2 ** (layer_idx + 1)),
+                kernel_size,
+                stride,
+                padding=(kernel_size - stride) // 2,
+            )
+            ups.append(nn.ModuleList([transposed]))
+        self.ups = nn.ModuleList(ups)
+        # Flat list laid out as [layer 0 blocks..., layer 1 blocks..., ...]; layers 0 and 1 use causal
+        # type '2', later layers use '1'.
+        resblocks = []
+        for layer_idx in range(self.num_upsample_layers):
+            layer_channels = base_channels // (2 ** (layer_idx + 1))
+            causal_type = '1' if layer_idx > 1 else '2'
+            for kernel_size, dilation in zip(config.resblock_kernel_sizes, config.resblock_dilation_sizes):
+                resblocks.append(AMPBlock(layer_channels, kernel_size, dilation, causal_type))
+        self.resblocks = nn.ModuleList(resblocks)
+        final_channels = base_channels // (2**self.num_upsample_layers)
+        self.activation_post = TorchActivation1d(activation=SnakeBeta(final_channels))
+        self.conv_post = nn.Conv1d(final_channels, 1, 7, 1, padding=3, bias=False)
+    def normalize_spectrogram(self, spectrogram, max_value, min_db):
+        """Linearly rescale `spectrogram` using `min_db` and clamp it to [-max_value, max_value]."""
+        rescaled = (2 * max_value) * ((spectrogram - min_db) / (-min_db)) - max_value
+        return torch.clamp(rescaled, -max_value, max_value)
+    def amplitude_to_db(self, amplitude, min_db_level):
+        """Convert amplitudes to decibels, flooring them at the amplitude equal to `min_db_level` dB."""
+        floor_exponent = min_db_level / 20.0 * np.log(10)
+        min_level = torch.exp(torch.tensor(floor_exponent, device=amplitude.device, dtype=amplitude.dtype))
+        floored = torch.clamp(amplitude, min=min_level)
+        return 20 * torch.log10(floored)
+    def process_mel_spectrogram(self, mel_spectrogram):
+        """Exponentiate the input, convert to dB with a -115 floor, subtract 20 and normalise to [-1, 1]."""
+        amplitude = torch.exp(mel_spectrogram)
+        decibels = self.amplitude_to_db(amplitude, -115) - 20
+        return self.normalize_spectrogram(decibels, 1, -115)
+    def forward(self, mel_spectrogram):
+        """Synthesise a waveform, clamped to [-1, 1] with dimension 1 squeezed, from `mel_spectrogram`."""
+        hidden = self.conv_pre(self.process_mel_spectrogram(mel_spectrogram))
+        blocks_per_layer = self.num_residual_blocks
+        for layer_index in range(self.num_upsample_layers):
+            hidden = self.ups[layer_index][0](hidden)
+            # Average the outputs of this layer's residual blocks.
+            first_block = layer_index * blocks_per_layer
+            accumulated = 0
+            for block_index in range(blocks_per_layer):
+                accumulated = accumulated + self.resblocks[first_block + block_index](hidden)
+            hidden = accumulated / blocks_per_layer
+        hidden = self.activation_post(hidden)
+        waveform = self.conv_post(hidden)
+        return torch.clamp(waveform, min=-1.0, max=1.0).squeeze(1)
+@auto_docstring
+class Qwen3TTSTokenizerV1DecoderDiTModel(Qwen3TTSTokenizerV1DecoderPreTrainedModel):
+    config: Qwen3TTSTokenizerV1DecoderDiTConfig
+    _no_split_modules = ["DiTDecoderLayer"]
+    def __init__(self, config: Qwen3TTSTokenizerV1DecoderDiTConfig):
+        super().__init__(config)
+        self.mel_dim = config.mel_dim
+        self.repeats = config.repeats
+        self.time_embed = DiTTimestepEmbedding(config.hidden_size)
+        self.text_embed = DiTCodecEmbedding(config.num_embeds, config.emb_dim, config.repeats)
+        self.input_embed = DiTInputEmbedding(config)
+        self.rotary_embed = Qwen3TTSTokenizerV1DecoderDiTRotaryEmbedding(config.head_dim)
+        self.hidden_size = config.hidden_size
+        self.layers = config.num_hidden_layers
+        self.block_size = config.block_size
+        self.num_attention_heads = config.num_attention_heads
+        self.transformer_blocks = nn.ModuleList()
+        for i in range(config.num_hidden_layers):
+            # Only the layers listed in the config may see one block ahead / one block back.
+            self.transformer_blocks.append(
+                DiTDecoderLayer(
+                    config,
+                    look_ahead_block=1 if i in config.look_ahead_layers else 0,
+                    look_backward_block=1 if i in config.look_backward_layers else 0,
+                )
+            )
+        self.norm_out = AdaLayerNormZero_Final(config.hidden_size)  # final modulation
+        self.proj_out = nn.Linear(config.hidden_size, config.mel_dim)
+    def _create_block_diff(self, hidden_states):
+        """Return key-block minus query-block indices, expanded to (batch, heads, seq_len, seq_len)."""
+        batch, seq_len = hidden_states.shape[0], hidden_states.shape[1]
+        block_indices = torch.arange(seq_len, device=hidden_states.device) // self.block_size  # [seq_length]
+        block_i = block_indices.unsqueeze(1)  # [seq_length, 1]
+        block_j = block_indices.unsqueeze(0)  # [1, seq_length]
+        block_diff = block_j - block_i  # (n, n)
+        return block_diff.expand(batch, self.num_attention_heads, seq_len, seq_len)
+    def forward(
+        self,
+        hidden_states,
+        condition_vector,
+        speaker_embedding,
+        quantized_code,
+        time_step,
+        drop_audio_conditioning=False,
+        drop_code=False,
+        apply_cfg=True,
+    ):
+        """Run the DiT on the current ODE state.
+        Args:
+            hidden_states: Current noised state; dimension 0 is the batch.
+            condition_vector: Reference audio features for the input embedding.
+            speaker_embedding: Per-frame speaker conditioning.
+            quantized_code: Codec ids embedded by `self.text_embed`.
+            time_step: ODE time; a 0-dim tensor is expanded to one value per batch row.
+            drop_audio_conditioning: Forwarded to the input embedding (used only when `apply_cfg` is falsy).
+            drop_code: Replace the codes by id 0 (used only when `apply_cfg` is falsy).
+            apply_cfg: Evaluate a conditional and an unconditional copy in one doubled batch.
+        Returns:
+            The output of `self.proj_out`; its batch is doubled when `apply_cfg` is truthy.
+        """
+        # The input embedding doubles the batch only under CFG, so a scalar time step must be expanded to
+        # the batch the transformer blocks will actually see.
+        batch_size = hidden_states.shape[0] * 2 if apply_cfg else hidden_states.shape[0]
+        if time_step.ndim == 0:
+            time_step = time_step.repeat(batch_size)
+        # Compute embeddings
+        time_embedding = self.time_embed(time_step)
+        text_embedding = self.text_embed(quantized_code, drop_code=False if apply_cfg else drop_code)
+        text_embedding_unconditioned = self.text_embed(quantized_code, drop_code=True) if apply_cfg else None
+        hidden_states = self.input_embed(
+            hidden_states,
+            speaker_embedding,
+            condition_vector,
+            text_embedding,
+            drop_audio_cond=drop_audio_conditioning,
+            code_embed_uncond=text_embedding_unconditioned,
+            apply_cfg=apply_cfg,
+        )
+        # Compute positional encodings
+        position_embeddings = self.rotary_embed(hidden_states)
+        blockwise_difference = self._create_block_diff(hidden_states)
+        # Transformer blocks
+        for transformer_block in self.transformer_blocks:
+            hidden_states = transformer_block(
+                hidden_states,
+                time_embedding,
+                position_embeddings=position_embeddings,
+                block_diff=blockwise_difference,
+            )
+        hidden_states = self.norm_out(hidden_states, time_embedding)
+        output = self.proj_out(hidden_states)
+        return output
+    def optimized_scale(self, positive_flat, negative_flat):
+        """Return `<positive, negative> / (||negative||^2 + 1e-8)` computed along dim 1 (kept as size 1)."""
+        # Calculate dot product
+        dot_product = torch.sum(positive_flat * negative_flat, dim=1, keepdim=True)
+        # Squared norm of the unconditional prediction
+        squared_norm = torch.sum(negative_flat ** 2, dim=1, keepdim=True) + 1e-8
+        # st_star = v_cond^T * v_uncond / ||v_uncond||^2
+        st_star = dot_product / squared_norm
+        return st_star
+    @torch.no_grad()
+    def sample(
+        self,
+        conditioning_vector,
+        reference_mel_spectrogram,
+        quantized_code,
+        num_steps=10,
+        guidance_scale=0.5,
+        sway_coefficient=-1.0,
+    ):
+        """Generate a mel spectrogram by Euler-integrating the model output from noise.
+        Args:
+            conditioning_vector: Per-utterance vector, repeated over every output frame.
+            reference_mel_spectrogram: Reference audio features (also fixes the noise dtype).
+            quantized_code: Codec ids; dimension 0 is the batch, dimension 1 the code length.
+            num_steps: Number of time points on the integration grid.
+            guidance_scale: Classifier-free guidance weight; values below 1e-5 disable guidance.
+            sway_coefficient: Weight of the cosine warp applied to the time grid; None disables it.
+        Returns:
+            Tensor permuted to (batch, mel_dim, frames), with `frames = code_length * self.repeats`.
+        """
+        # NOTE(review): the noise is drawn for a fixed 30000 frames and then cropped, so longer outputs
+        # are not supported; left unchanged to keep seeded results reproducible.
+        noise_initialization = torch.randn([quantized_code.shape[0], 30000, self.mel_dim], dtype=reference_mel_spectrogram.dtype)
+        maximum_duration = quantized_code.shape[1] * self.repeats
+        initial_state = noise_initialization[:, :maximum_duration].to(quantized_code.device)
+        conditioning_vector = conditioning_vector.unsqueeze(1).repeat(1, maximum_duration, 1)
+        def ode_function(time_step, hidden_states):
+            if guidance_scale < 1e-5:
+                # Guidance disabled: run a single conditional pass. `apply_cfg=False` is required here,
+                # otherwise the model returns a doubled batch that no longer matches the ODE state.
+                prediction = self(
+                    hidden_states=hidden_states,
+                    speaker_embedding=conditioning_vector,
+                    condition_vector=reference_mel_spectrogram,
+                    quantized_code=quantized_code,
+                    time_step=time_step,
+                    drop_audio_conditioning=False,
+                    drop_code=False,
+                    apply_cfg=False,
+                )
+                return prediction
+            model_output = self(
+                hidden_states=hidden_states,
+                quantized_code=quantized_code,
+                speaker_embedding=conditioning_vector,
+                condition_vector=reference_mel_spectrogram,
+                time_step=time_step,
+                apply_cfg=True,
+            )
+            # First half of the batch is the conditional prediction, second half the unconditional one.
+            guided_prediction, null_prediction = torch.chunk(model_output, 2, dim=0)
+            return guided_prediction + (guided_prediction - null_prediction) * guidance_scale
+        initial_time = 0
+        time_embedding = torch.linspace(
+            initial_time, 1, num_steps, device=quantized_code.device, dtype=conditioning_vector.dtype
+        )
+        if sway_coefficient is not None:
+            # Warp the uniform grid with a cosine term scaled by `sway_coefficient`.
+            time_embedding += sway_coefficient * (torch.cos(torch.pi / 2 * time_embedding) - 1 + time_embedding)
+        values = initial_state.clone()
+        # Explicit Euler integration over consecutive grid points.
+        for t0, t1 in zip(time_embedding[:-1], time_embedding[1:]):
+            dt = t1 - t0
+            vt = ode_function(t0, values)
+            values = values + vt * dt
+        generated_mel_spectrogram = values.permute(0, 2, 1)
+        return generated_mel_spectrogram
+@auto_docstring
+class Qwen3TTSTokenizerV1Decoder(Qwen3TTSTokenizerV1DecoderPreTrainedModel):
+    config: Qwen3TTSTokenizerV1DecoderConfig
+    base_model_prefix = "model"
+    _no_split_modules = ["Qwen3TTSTokenizerV1DecoderDiTModel", "Qwen3TTSTokenizerV1DecoderBigVGANModel"]
+    def __init__(self, config: Qwen3TTSTokenizerV1DecoderConfig):
+        super().__init__(config)
+        # Attention implementations that are replaced by sdpa, each with the warning logged for it.
+        fallback_warnings = (
+            (
+                "flash_attention_2",
+                "Qwen3TTSTokenizerV1Decoder must inference with fp32, but flash_attention_2 only supports fp16 and bf16, "
+                "attention implementation of Qwen3TTSTokenizerV1Decoder will fallback to sdpa.",
+            ),
+            (
+                "eager",
+                "Qwen3TTSTokenizerV1Decoder does not support eager attention implementation, fall back to sdpa",
+            ),
+        )
+        attn_impl = config._attn_implementation
+        for unsupported, message in fallback_warnings:
+            if config._attn_implementation == unsupported:
+                logger.warning_once(message)
+                attn_impl = "sdpa"
+                break
+        self.dit = Qwen3TTSTokenizerV1DecoderDiTModel._from_config(
+            config.dit_config, attn_implementation=attn_impl
+        )
+        self.bigvgan = Qwen3TTSTokenizerV1DecoderBigVGANModel._from_config(
+            config.bigvgan_config, attn_implementation=attn_impl
+        )
+    def forward(
+        self,
+        code,
+        conditioning,
+        reference_mel,
+        num_steps=10,
+        guidance_scale=0.5,
+        sway_coefficient=-1.0,
+        **kwargs,
+    ):
+        """Generate a waveform: sample a mel spectrogram with the DiT, then vocode it with BigVGAN.
+        Extra keyword arguments are accepted and ignored.
+        """
+        sampling_options = {
+            "num_steps": num_steps,
+            "guidance_scale": guidance_scale,
+            "sway_coefficient": sway_coefficient,
+        }
+        mel_spectrogram = self.dit.sample(conditioning, reference_mel, code, **sampling_options)
+        return self.bigvgan(mel_spectrogram)
+class Qwen3TTSTokenizerV1Encoder(Qwen3TTSTokenizerV1EncoderPreTrainedModel):
+    """Turns speech into discrete codes with a `WhisperEncoderVQ` tokenizer."""
+    config: Qwen3TTSTokenizerV1EncoderConfig
+    def __init__(self, config: Qwen3TTSTokenizerV1EncoderConfig):
+        super().__init__(config)
+        # Every tokenizer option is copied one-to-one from the config.
+        tokenizer_options = dict(
+            n_mels=config.n_mels,
+            n_ctx=config.n_ctx,
+            n_state=config.n_state,
+            n_head=config.n_head,
+            n_layer=config.n_layer,
+            n_window=config.n_window,
+            output_dim=config.output_dim,
+            grad_checkpointing=config.grad_checkpointing,
+            enable_mp=config.enable_mp,
+            audio_sequence_parallel=config.audio_sequence_parallel,
+            audio_vq_type=config.audio_vq_type,
+            audio_vq_layers=config.audio_vq_layers,
+            audio_vq_codebook_size=config.audio_vq_codebook_size,
+            audio_vq_codebook_dim=config.audio_vq_codebook_dim,
+            audio_vq_pe=config.audio_vq_pe,
+            audio_vq_ds_rate=config.audio_vq_ds_rate,
+        )
+        self.tokenizer = WhisperEncoderVQ(**tokenizer_options)
+        self.padding = True
+        self.audio_vq_ds_rate = self.tokenizer.audio_vq_ds_rate
+    def speech2mel(self, speechs):
+        """Return one mel tensor per input, cast to the input dtype and moved to the tokenizer's device."""
+        mels = []
+        for speech in speechs:
+            mel = get_mel_audio(speech, padding=self.padding, audio_vq_ds_rate=self.audio_vq_ds_rate)
+            mel = mel.to(speech.dtype).to(self.tokenizer.conv1.weight.device)
+            mels.append(mel)
+        return mels
+    def mel2code(self, mels):
+        """Quantize the mels and return `(padded_indices, lengths)`.
+        `lengths[i]` is the number of codes of item `i`; shorter rows are right-padded with 0.
+        """
+        audio_mellens = [mel.size(-1) for mel in mels]
+        audio_aftercnnlens = [get_T_after_cnn(mel_len) for mel_len in audio_mellens]
+        # The tokenizer is given each post-CNN length plus 2.
+        audio_seqlens = [cnn_len + 2 for cnn_len in audio_aftercnnlens]
+        with torch.no_grad():
+            _, indices = self.tokenizer(
+                x_list=mels,
+                audio_mellens=audio_mellens,
+                audio_aftercnnlens=audio_aftercnnlens,
+                audio_seqlens=audio_seqlens,
+                return_indices=True,
+            )
+        ds_rate = self.tokenizer.audio_vq_ds_rate
+        indice_lens = [cnn_len // ds_rate for cnn_len in audio_aftercnnlens]
+        # The tokenizer output is split per item by `indice_lens` and re-batched with padding.
+        per_item = torch.split(indices, indice_lens)
+        indices = pad_sequence(per_item, batch_first=True, padding_value=0)
+        return indices, indice_lens
+    def quantize_speech(self, speechs):
+        """Convert speech to mels and then to codes; returns `(padded_indices, lengths)`."""
+        indices, indice_lens = self.mel2code(self.speech2mel(speechs))
+        return indices, indice_lens
+@auto_docstring
+class Qwen3TTSTokenizerV1PreTrainedModel(PreTrainedModel):
+    # Common base class of the V1 tokenizer models. It defines no methods of its own,
+    # only class-level settings (presumably consumed by the `PreTrainedModel`
+    # loading/attention machinery -- confirm against the transformers version in use).
+    config: Qwen3TTSTokenizerV1Config
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _skip_keys_device_placement = "past_key_values"
+    # Attention backends this model family declares support for.
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _can_compile_fullgraph = False
+    _supports_attention_backend = True
+@auto_docstring(
+    custom_intro="""
+    The Qwen3TTSTokenizerV1 model.
+    """
+)
+class Qwen3TTSTokenizerV1Model(Qwen3TTSTokenizerV1PreTrainedModel):
+    def __init__(self, config: Qwen3TTSTokenizerV1Config):
+        super().__init__(config)
+        self.config = config
+        self.input_sample_rate = config.input_sample_rate
+        self.output_sample_rate = config.output_sample_rate
+        self.decode_upsample_rate = config.decode_upsample_rate
+        self.encode_downsample_rate = config.encode_downsample_rate
+        self.encoder = Qwen3TTSTokenizerV1Encoder._from_config(self.config.encoder_config)
+        self.decoder = Qwen3TTSTokenizerV1Decoder._from_config(self.config.decoder_config)
+        self.encoder_xvector_extractor = None
+        self.post_init()
+    def load_encoder_xvector_extractor(self, model_path):
+        self.encoder_xvector_extractor = XVectorExtractor(model_path)
+    def get_model_type(self):
+        return self.config.model_type
+    def get_input_sample_rate(self):
+        return self.input_sample_rate
+    def get_output_sample_rate(self):
+        return self.output_sample_rate
+    def get_encode_downsample_rate(self):
+        return self.encode_downsample_rate
+    def get_decode_upsample_rate(self):
+        return self.decode_upsample_rate
+    @classmethod
+    def from_pretrained(
+        cls,
+        pretrained_model_name_or_path,
+        *model_args,
+        config=None,
+        cache_dir=None,
+        ignore_mismatched_sizes=False,
+        force_download=False,
+        local_files_only=False,
+        token=None,
+        revision="main",
+        use_safetensors=None,
+        weights_only=True,
+        **kwargs,
+    ):
+        model = super().from_pretrained(
+            pretrained_model_name_or_path,
+            *model_args,
+            config=config,
+            cache_dir=cache_dir,
+            ignore_mismatched_sizes=ignore_mismatched_sizes,
+            force_download=force_download,
+            local_files_only=local_files_only,
+            token=token,
+            revision=revision,
+            use_safetensors=use_safetensors,
+            weights_only=weights_only,
+            **kwargs,
+        )
+        encoder_xvector_extractor_path = cached_file(
+            pretrained_model_name_or_path,
+            "campplus.onnx",
+            subfolder=kwargs.pop("subfolder", None),
+            cache_dir=kwargs.pop("cache_dir", None),
+            force_download=kwargs.pop("force_download", False),
+            proxies=kwargs.pop("proxies", None),
+            resume_download=kwargs.pop("resume_download", None),
+            local_files_only=kwargs.pop("local_files_only", False),
+            token=kwargs.pop("use_auth_token", None),
+            revision=kwargs.pop("revision", None),
+        )
+        if encoder_xvector_extractor_path is None:
+            raise ValueError(f"""{pretrained_model_name_or_path}/{encoder_xvector_extractor_path} not exists""")
+        model.load_encoder_xvector_extractor(encoder_xvector_extractor_path)
+        return model
+    def encode(
+        self,
+        input_values: torch.Tensor,
+        padding_mask: Optional[torch.Tensor] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor, Optional[torch.Tensor]], Qwen3TTSTokenizerV1EncoderOutput]:
+        """
+        Encodes the input audio waveform into discrete codes.
+        Args:
+            input_values (`torch.Tensor` of shape `(batch_size, sequence_length)`):
+                Float values of the input audio waveform.
+            padding_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`):
+                Indicates which inputs are to be ignored due to padding, where elements are either 1 for *not masked* or 0
+                for *masked*.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+        wavs = [value[:mask.sum()] for value, mask in zip(input_values, padding_mask)]
+        codes, codes_lens = self.encoder.quantize_speech(wavs)
+        codes = [c[:l] for c, l in zip(codes, codes_lens)]
+        xvectors = []
+        ref_mels = []
+        for wav in wavs:
+            xvector, ref_mel = self.encoder_xvector_extractor.extract_code(wav.cpu().numpy())
+            xvector = torch.tensor(xvector).to(wav.dtype).to(wav.device)
+            ref_mel = torch.tensor(ref_mel).to(wav.dtype).to(wav.device)
+            xvectors.append(xvector)
+            ref_mels.append(ref_mel)
+        if not return_dict:
+            return (
+                codes,
+                xvectors,
+                ref_mels
+            )
+        return Qwen3TTSTokenizerV1EncoderOutput(codes, xvectors, ref_mels)
+    def decode(
+        self,
+        audio_codes: torch.Tensor,
+        xvectors: torch.Tensor,
+        ref_mels: torch.Tensor,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor, torch.Tensor], Qwen3TTSTokenizerV1DecoderOutput]:
+        """
+        Decodes the given frames into an output audio waveform.
+        Note that the output might be a bit bigger than the input. In that case, any extra steps at the end can be
+        trimmed.
+        Args:
+            audio_codes (`torch.LongTensor`  of shape `(batch_size, codes_length)`, *optional*):
+                Discret code embeddings computed using `model.encode`.
+            xvectors (`torch.FloatTensor` of shape `(batch_size, xvector_dim)`, *optional*):
+                X-vector embeddings computed using `model.encode`.
+            ref_mels (`torch.FloatTensor` of shape `(batch_size, mel_length, mel_dim)`, *optional*):
+                Reference mel spectrogram computed using `model.encode`.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+        audio_values = self.decoder(code=audio_codes,
+                                    reference_mel=ref_mels,
+                                    conditioning=xvectors)
+        audio_lengths = (audio_codes > 0).sum(1) * self.decode_upsample_rate
+        audio_values = [a[:l] for a, l in zip(audio_values, audio_lengths)]
+        if not return_dict:
+            return (
+                audio_values,
+            )
+        return Qwen3TTSTokenizerV1DecoderOutput(audio_values)
+# Names exported by `from <this module> import *`.
+__all__ = ["Qwen3TTSTokenizerV1Model", "Qwen3TTSTokenizerV1PreTrainedModel"]

qwen_tts/core/tokenizer_25hz/vq/assets/mel_filters.npz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7450ae70723a5ef9d341e3cee628c7cb0177f36ce42c44b7ed2bf3325f0f6d4c
+size 4271

qwen_tts/core/tokenizer_25hz/vq/core_vq.py ADDED Viewed

	@@ -0,0 +1,523 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+# This implementation is inspired from
+# https://github.com/lucidrains/vector-quantize-pytorch
+# which is released under MIT License. Hereafter, the original license:
+# MIT License
+#
+# Copyright (c) 2020 Phil Wang
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+"""Core vector quantization implementation."""
+import random
+import typing as tp
+from random import randrange
+import numpy as np
+from einops import rearrange, repeat
+from math import ceil
+import torch
+from torch import nn
+import torch.nn.functional as F
+def round_up_multiple(num, mult):
+    return ceil(num / mult) * mult
+def default(val: tp.Any, d: tp.Any) -> tp.Any:
+    return val if val is not None else d
+def ema_inplace(moving_avg, new, decay: float):
+    moving_avg.data.mul_(decay).add_(new, alpha=(1 - decay))
+def laplace_smoothing(x, n_categories: int, epsilon: float = 1e-5):
+    return (x + epsilon) / (x.sum() + n_categories * epsilon)
+def uniform_init(*shape: int):
+    t = torch.empty(shape)
+    nn.init.kaiming_uniform_(t)
+    return t
+def sample_vectors(samples, num: int):
+    num_samples, device = samples.shape[0], samples.device
+    if num_samples >= num:
+        indices = torch.randperm(num_samples, device=device)[:num]
+    else:
+        indices = torch.randint(0, num_samples, (num,), device=device)
+    return samples[indices]
+@torch.no_grad()
+def kmeans(samples, num_clusters: int, num_iters: int = 10):
+    dim, dtype = samples.shape[-1], samples.dtype
+    means = sample_vectors(samples, num_clusters)
+    for _ in range(num_iters):
+        dists = -(
+                samples.pow(2).sum(1, keepdim=True)
+                - 2 * torch.matmul(samples, means.t())
+                + means.t().pow(2).sum(0, keepdim=True)
+        )
+        buckets = dists.max(dim=-1).indices
+        del dists
+        bins = torch.bincount(buckets, minlength=num_clusters)
+        zero_mask = bins == 0
+        bins_min_clamped = bins.masked_fill(zero_mask, 1)
+        new_means = buckets.new_zeros(num_clusters, dim, dtype=dtype)
+        new_means.scatter_add_(0, repeat(buckets, "n -> n d", d=dim), samples)
+        new_means = new_means / bins_min_clamped[..., None]
+        means = torch.where(zero_mask[..., None], means, new_means)
+    return means, bins
+def preprocess(x):
+    x = rearrange(x, "... d -> (...) d")
+    return x
+def postprocess_emb(embed_ind, shape):
+    return embed_ind.view(*shape[:-1])
+class EuclideanCodebook(nn.Module):
+    """Codebook with Euclidean distance.
+    Args:
+        dim (int): Dimension.
+        codebook_size (int): Codebook size.
+        kmeans_init (bool): Whether to use k-means to initialize the codebooks.
+            If set to true, run the k-means algorithm on the first training batch and use
+            the learned centroids as initialization.
+        kmeans_iters (int): Number of iterations used for k-means algorithm at initialization.
+        decay (float): Decay for exponential moving average over the codebooks.
+        epsilon (float): Epsilon value for numerical stability.
+        threshold_ema_dead_code (int): Threshold for dead code expiration. Replace any codes
+            that have an exponential moving average cluster size less than the specified threshold with
+            randomly selected vector from the current batch.
+    """
+    def __init__(
+            self,
+            dim: int,
+            codebook_size: int,
+            kmeans_init: int = False,
+            kmeans_iters: int = 10,
+            decay: float = 0.99,
+            epsilon: float = 1e-5,
+            threshold_ema_dead_code: float = 2.0,
+    ):
+        super().__init__()
+        self.decay = decay
+        self.codebook_size = codebook_size
+        self.kmeans_iters = kmeans_iters
+        self.epsilon = epsilon
+        self.threshold_ema_dead_code = threshold_ema_dead_code
+        self.inited = None
+        self.cluster_size = None
+        self.embed = None
+        self.embed_avg = None
+        self.training = True
+    def init_embed_(self, data):
+        if self.inited:
+            return
+        embed, cluster_size = kmeans(data, self.codebook_size, self.kmeans_iters)
+        self.embed.data.copy_(embed)
+        self.embed_avg.data.copy_(embed.clone())
+        self.cluster_size.data.copy_(cluster_size)
+        self.inited.data.copy_(torch.Tensor([True]))
+        # Make sure all buffers across workers are in sync after initialization
+        # distrib.broadcast_tensors([self.embed, self.embed_avg, self.cluster_size, self.inited])
+    def replace_(self, samples, mask):
+        modified_codebook = torch.where(
+            mask[..., None], sample_vectors(samples, self.codebook_size), self.embed
+        )
+        self.embed.data.copy_(modified_codebook)
+    def expire_codes_(self, batch_samples):
+        if self.threshold_ema_dead_code == 0:
+            return
+        cluster_size = self.cluster_size / sum(self.cluster_size) * self.codebook_size
+        expired_codes = cluster_size < self.threshold_ema_dead_code
+        if not torch.any(expired_codes):
+            return
+        else:
+            print(f"VQ expire infos: num_expire={sum(expired_codes)}, cluster_size[:5]={cluster_size[:5]}")
+        batch_samples = rearrange(batch_samples, "... d -> (...) d")
+        self.replace_(batch_samples, mask=expired_codes)
+        # sync buffers outside for efficiency
+        # distrib.broadcast_tensors(self.buffers())
+    def quantize(self, x):
+        embed = self.embed.t()
+        dist = -(
+            x.pow(2).sum(1, keepdim=True)
+            - 2 * x @ embed
+            + embed.pow(2).sum(0, keepdim=True)
+        )
+        embed_ind = dist.max(dim=-1).indices
+        return embed_ind
+    def dequantize(self, embed_ind):
+        quantize = F.embedding(embed_ind, self.embed)
+        return quantize
+    def encode(self, x, buffers):
+        self.inited, self.cluster_size, self.embed, self.embed_avg = buffers
+        shape = x.shape
+        # pre-process
+        x = preprocess(x)
+        # quantize
+        embed_ind = self.quantize(x)
+        # post-process
+        embed_ind = postprocess_emb(embed_ind, shape)
+        return embed_ind
+    def decode(self, embed_ind, buffers):
+        self.inited, self.cluster_size, self.embed, self.embed_avg = buffers
+        quantize = self.dequantize(embed_ind)
+        return quantize
+    def forward(self, x, buffers):
+        self.inited, self.cluster_size, self.embed, self.embed_avg = buffers
+        shape, dtype = x.shape, x.dtype
+        x = preprocess(x)
+        self.init_embed_(x)
+        if self.training:
+            # We do the expiry of code at that point as buffers are in sync
+            # and all the workers will take the same decision.
+            self.expire_codes_(x)
+        embed_ind = self.quantize(x)
+        embed_onehot = F.one_hot(embed_ind, self.codebook_size).type(dtype)
+        embed_ind = postprocess_emb(embed_ind, shape)
+        quantize = self.dequantize(embed_ind)
+        if self.training:
+            ema_inplace(self.cluster_size, embed_onehot.sum(0), self.decay)
+            embed_sum = x.t() @ embed_onehot
+            ema_inplace(self.embed_avg, embed_sum.t(), self.decay)
+            cluster_size = (
+                laplace_smoothing(self.cluster_size, self.codebook_size, self.epsilon)
+                * self.cluster_size.sum()
+            )
+            embed_normalized = self.embed_avg / cluster_size.unsqueeze(1)
+            self.embed.data.copy_(embed_normalized)
+            # Note: after ema update, there is a very small difference between codebooks on GPUs.
+            # The impact can be very small, ignore it.
+        return quantize, embed_ind
+class VectorQuantization(nn.Module):
+    """Vector quantization implementation.
+    Currently, supports only euclidean distance.
+    Args:
+        dim (int): Dimension
+        codebook_size (int): Codebook size
+        codebook_dim (int): Codebook dimension. If not defined, uses the specified dimension in dim.
+        decay (float): Decay for exponential moving average over the codebooks.
+        epsilon (float): Epsilon value for numerical stability.
+        kmeans_init (bool): Whether to use kmeans to initialize the codebooks.
+        kmeans_iters (int): Number of iterations used for kmeans initialization.
+        threshold_ema_dead_code (int): Threshold for dead code expiration. Replace any codes
+            that have an exponential moving average cluster size less than the specified threshold with
+            randomly selected vector from the current batch.
+        commitment_weight (float): Weight for commitment loss.
+    """
+    def __init__(
+            self,
+            dim: int,
+            codebook_size: int,
+            codebook_dim: tp.Optional[int] = None,
+            decay: float = 0.99,
+            epsilon: float = 1e-5,
+            kmeans_init: bool = True,
+            kmeans_iters: int = 50,
+            threshold_ema_dead_code: float = 2.0,
+            commitment_weight: float = 1.,
+    ):
+        super().__init__()
+        _codebook_dim: int = default(codebook_dim, dim)
+        requires_projection = _codebook_dim != dim
+        self.project_in = (nn.Linear(dim, _codebook_dim)) if requires_projection else (nn.Identity())
+        self.project_out = (nn.Linear(_codebook_dim, dim)) if requires_projection else (nn.Identity())
+        self.epsilon = epsilon
+        self.commitment_weight = commitment_weight
+        self._codebook = EuclideanCodebook(dim=_codebook_dim, codebook_size=codebook_size,
+                                           kmeans_init=kmeans_init, kmeans_iters=kmeans_iters,
+                                           decay=decay, epsilon=epsilon,
+                                           threshold_ema_dead_code=threshold_ema_dead_code)
+        self.codebook_size = codebook_size
+        self.training = True
+    @property
+    def codebook(self):
+        return self._codebook.embed
+    def encode(self, x, buffers):
+        # x = rearrange(x, "b d n -> b n d")
+        x = self.project_in(x)
+        embed_in = self._codebook.encode(x, buffers)
+        return embed_in
+    def decode(self, embed_ind, buffers):
+        quantize = self._codebook.decode(embed_ind, buffers)
+        quantize = self.project_out(quantize)
+        # quantize = rearrange(quantize, "b n d -> b d n")
+        return quantize
+    def forward(self, x, buffers):
+        device = x.device
+        # x = rearrange(x, "b d n -> b n d")
+        x = self.project_in(x)
+        quantize, embed_ind = self._codebook(x, buffers)
+        if self.training:
+            quantize = x + (quantize - x).detach()
+        loss = torch.tensor([0.0], device=device, requires_grad=self.training)
+        if self.training:
+            if self.commitment_weight > 0:
+                commit_loss = F.mse_loss(quantize.detach(), x)
+                loss = loss + commit_loss * self.commitment_weight
+        quantize = self.project_out(quantize)
+        # quantize = rearrange(quantize, "b n d -> b d n")
+        return quantize, embed_ind, loss
+class DistributedResidualVectorQuantization(nn.Module):
+    """Efficient distributed residual vector quantization implementation.
+    Follows Algorithm 1. in https://arxiv.org/pdf/2107.03312.pdf
+    """
+    def __init__(self, *,
+                 num_quantizers,
+                 quantize_dropout: bool = False,
+                 rand_num_quant: tp.Optional[tp.List] = None,
+                 **kwargs):
+        super().__init__()
+        """
+        dim: int,
+        codebook_size: int,
+        codebook_dim: tp.Optional[int] = None,
+        """
+        codebook_size, codebook_dim = kwargs["codebook_size"], kwargs["codebook_dim"] if kwargs["codebook_dim"] else kwargs["dim"]
+        kmeans_init = kwargs["kmeans_init"]
+        if isinstance(kmeans_init, bool):
+            if not kwargs["kmeans_init"]:
+                # use uniform init
+                embed = uniform_init(num_quantizers, codebook_size, codebook_dim)
+                inited = True
+            else:
+                # to perform kmeans init on first batch
+                embed = torch.zeros(num_quantizers, codebook_size, codebook_dim)
+                inited = False
+        elif isinstance(kmeans_init, str):
+            # use prepared kmeans init
+            embed = np.load(kmeans_init)
+            embed = torch.from_numpy(embed)
+            if embed.dim() == 2:
+                embed = embed.unsqueeze(0)
+            inited = True
+        else:
+            raise TypeError("kmeans_init should be either a bool or string path to init weights.")
+        self.register_buffer("inited", torch.Tensor([[inited] for _ in range(num_quantizers)]))
+        self.register_buffer("cluster_size", torch.zeros(num_quantizers, codebook_size))
+        self.register_buffer("embed", embed)
+        self.register_buffer("embed_avg", embed.clone())
+        self.q0_ds_ratio = 1
+        if "q0_ds_ratio" in kwargs:
+            self.q0_ds_ratio = kwargs.pop("q0_ds_ratio")
+        self.layers = nn.ModuleList()
+        for i in range(num_quantizers):
+            vq_args = dict(**kwargs)
+            vq = VectorQuantization(**vq_args)
+            self.layers.append(vq)
+        self.quantize_dropout = quantize_dropout
+        self.rand_num_quant = rand_num_quant
+    def forward(self, x, n_q: tp.Optional[int] = None):
+        quantized_out = torch.zeros_like(x)
+        residual = x
+        bb, cc, tt = x.shape
+        device = x.device
+        all_losses = []
+        all_indices = []
+        all_sub_quants = []
+        n_q = n_q or len(self.layers)
+        should_quantize_dropout = self.training and self.quantize_dropout and self.rand_num_quant is not None
+        if should_quantize_dropout:
+            rand_quantize_dropout_index = random.choice(self.rand_num_quant)
+            null_indices_shape = (x.shape[0], x.shape[2])
+            null_indices = torch.full(null_indices_shape, -1., device=device, dtype=torch.long)
+            null_loss = torch.full((1,), 0., device=device, dtype=x.dtype)
+            null_sub_quant = torch.full(x.shape, -1, device=device, dtype=x.dtype)
+        for quantizer_index, layer in enumerate(self.layers[:n_q]):
+            # dropout except the first quantizer
+            if should_quantize_dropout and quantizer_index >= rand_quantize_dropout_index:
+                all_indices.append(null_indices)
+                all_losses.append(null_loss)
+                all_sub_quants.append(null_sub_quant)
+                continue
+            quant_in = residual
+            if self.q0_ds_ratio > 1 and quantizer_index == 0:
+                quant_in = F.interpolate(quant_in, size=[tt//2])
+            quantized, indices, loss = layer(quant_in, [
+                self.inited[quantizer_index],
+                self.cluster_size[quantizer_index],
+                self.embed[quantizer_index],
+                self.embed_avg[quantizer_index]
+            ])
+            if self.q0_ds_ratio > 1 and quantizer_index == 0:
+                quantized = F.interpolate(quantized, size=[tt])
+                indices = F.interpolate(indices.unsqueeze(1).float(), size=[tt]).squeeze(1).long()
+            residual = residual - quantized
+            quantized_out = quantized_out + quantized
+            all_indices.append(indices)
+            all_losses.append(loss)
+            all_sub_quants.append(quantized)
+        # sync buffers after one forward step
+        # distrib.broadcast_tensors(self.buffers())
+        out_losses, out_indices, out_sub_quants = map(torch.stack, (all_losses, all_indices, all_sub_quants))
+        return quantized_out, out_indices, out_losses
+    def encode(self, x: torch.Tensor, n_q: tp.Optional[int] = None) -> torch.Tensor:
+        residual = x
+        all_indices = []
+        n_q = n_q or len(self.layers)
+        for i, layer in enumerate(self.layers[:n_q]):
+            indices = layer.encode(residual, [
+                self.inited[i],
+                self.cluster_size[i],
+                self.embed[i],
+                self.embed_avg[i]
+            ])
+            quantized = layer.decode(indices, [
+                self.inited[i],
+                self.cluster_size[i],
+                self.embed[i],
+                self.embed_avg[i]
+            ])
+            residual = residual - quantized
+            all_indices.append(indices)
+        out_indices = torch.stack(all_indices)
+        return out_indices
+    def decode(self, q_indices: torch.Tensor) -> torch.Tensor:
+        quantized_out = torch.tensor(0.0, device=q_indices.device)
+        for i, indices in enumerate(q_indices):
+            layer = self.layers[i]
+            quantized = layer.decode(indices, [
+                self.inited[i],
+                self.cluster_size[i],
+                self.embed[i],
+                self.embed_avg[i]
+            ])
+            quantized_out = quantized_out + quantized
+        return quantized_out
+class DistributedGroupResidualVectorQuantization(nn.Module):
+    """Efficient distributed group residual vector quantization implementation.
+    Follows Algorithm 1. in https://arxiv.org/abs/2305.02765
+    Group Then rvq
+    """
+    def __init__(self, *,
+                 num_groups,
+                 num_quantizers,
+                 quantize_dropout: bool = False,
+                 rand_num_quant: tp.Optional[tp.List] = None,
+                 **kwargs):
+        super().__init__()
+        self.rvqs = nn.ModuleList(
+            [
+                DistributedResidualVectorQuantization(
+                    num_quantizers=num_quantizers,
+                    quantize_dropout=quantize_dropout,
+                    rand_num_quant=rand_num_quant,
+                    **kwargs
+                )
+                for _ in range(num_groups)
+            ]
+        )
+        self.num_groups = num_groups
+    def forward(self, x, n_q: tp.Optional[int] = None):
+        x_lst = torch.chunk(x, chunks=self.num_groups, dim=1)
+        all_quantized_out = []
+        all_indices = []
+        all_losses = []
+        for mod, item in zip(self.rvqs, x_lst):
+            quantized_out, out_indices, out_losses = mod(item, n_q)
+            all_quantized_out.append(quantized_out)
+            all_indices.append(out_indices)
+            all_losses.append(out_losses)
+        out_losses = torch.stack(all_losses, dim=1).mean(dim=1)
+        return torch.cat(all_quantized_out, dim=1), torch.stack(all_indices, dim=1), out_losses
+    def encode(self, x: torch.Tensor, n_q: tp.Optional[int] = None) -> torch.Tensor:
+        x_lst = torch.chunk(x, chunks=self.num_groups, dim=1)
+        return torch.stack([mod.encode(item, n_q) for mod, item in zip(self.rvqs, x_lst)], dim=1)
+    def decode(self, q_indices: torch.Tensor) -> torch.Tensor:
+        q_indices_lst = torch.chunk(q_indices, chunks=self.num_groups, dim=1)
+        return torch.cat([mod.decode(item.squeeze(1)) for mod, item in zip(self.rvqs, q_indices_lst)], dim=1)

qwen_tts/core/tokenizer_25hz/vq/speech_vq.py ADDED Viewed

	@@ -0,0 +1,357 @@

+# coding=utf-8
+# Copyright 2026 The Alibaba Qwen team.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sox
+import copy
+import torch
+import operator
+import onnxruntime
+import torch.nn as nn
+import torch.nn.functional as F
+import torchaudio.compliance.kaldi as kaldi
+from librosa.filters import mel as librosa_mel_fn
+from itertools import accumulate
+from typing import List
+from torch import Tensor
+from .core_vq import DistributedGroupResidualVectorQuantization
+from .whisper_encoder import WhisperEncoder, Conv1d, ConvTranspose1d
+def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
+    return torch.log(torch.clamp(x, min=clip_val) * C)
+def spectral_normalize_torch(magnitudes):
+    output = dynamic_range_compression_torch(magnitudes)
+    return output
+class MelSpectrogramFeatures(nn.Module):
+    """
+    Calculate the BigVGAN style mel spectrogram of an input signal.
+    Args:
+        filter_length (int): The number of samples in the filter window, used for the Fourier Transform. Default is 1024.
+        hop_length (int): The number of samples between successive frames (stride of the STFT). Default is 160.
+        win_length (int): The length of the window function applied to each frame, usually less than or equal to the filter length. Default is 640.
+        n_mel_channels (int): The number of Mel-frequency channels to output from the Mel-scale spectrogram. Default is 80.
+        mel_fmin (int): The minimum frequency (in Hz) of the Mel-scale spectrogram. Default is 0.
+        mel_fmax (int): The maximum frequency (in Hz) of the Mel-scale spectrogram. Default is 8000.
+        sampling_rate (int): The sampling rate of the audio data (in Hz). Default is 16000.
+        sampling_rate_org (int, optional): The original sampling rate of the audio data before any resampling (in Hz), if applicable. Default is None.
+        padding (str): The padding mode for the input signal. 'center' pads the signal symmetrically around its center. Default is 'center'.
+    Returns:
+        torch.Tensor: Mel spectrogram.
+    """
+    def __init__(self,
+                 filter_length=1024,
+                 hop_length=160,
+                 win_length=640,
+                 n_mel_channels=80,
+                 mel_fmin=0,
+                 mel_fmax=8000,
+                 sampling_rate=16000,
+                 sampling_rate_org=None,
+                 padding='center',
+                 use_db = False,
+                 ):
+        super().__init__()
+        if padding not in ["center", "same"]:
+            raise ValueError("Padding must be 'center' or 'same'.")
+        self.padding = padding
+        self.filter_length = filter_length
+        self.hop_length = hop_length
+        self.win_length = win_length
+        self.n_mel_channels = n_mel_channels
+        self.mel_fmin = mel_fmin
+        self.mel_fmax = mel_fmax
+        self.sampling_rate = sampling_rate
+        self.sampling_rate_org = sampling_rate_org if sampling_rate_org is not None else sampling_rate
+        self.mel_basis = {}
+        self.hann_window = {}
+    def forward(self, audio: torch.Tensor, **kwargs) -> torch.Tensor:
+        with torch.no_grad():
+            feats = self.extract(audio, **kwargs)
+        return feats
+    def extract(self, audio, **kwargs):
+        if len(audio.shape) == 3:
+            audio = audio.squeeze(1) if audio.shape[1] == 1 else audio.squeeze(2)
+        assert len(audio.shape) == 2
+        y = audio
+        if len(list(self.mel_basis.keys())) == 0:
+            mel = librosa_mel_fn(sr=self.sampling_rate, n_fft=self.filter_length, n_mels=self.n_mel_channels, fmin=self.mel_fmin, fmax=self.mel_fmax)
+            self.mel_basis[str(self.mel_fmax)+'_'+str(y.device)] = torch.from_numpy(mel).float().to(y.device)
+            self.hann_window[str(y.device)] = torch.hann_window(self.win_length).to(y.device)
+        y = torch.nn.functional.pad(y.unsqueeze(1), (int((self.filter_length-self.hop_length)/2), int((self.filter_length-self.hop_length)/2)), mode='reflect')
+        y = y.squeeze(1)
+        spec = torch.stft(y, self.filter_length, hop_length=self.hop_length, win_length=self.win_length, window=self.hann_window[str(y.device)],
+                          center=False, pad_mode='reflect', normalized=False, onesided=True, return_complex=True)
+        spec = torch.view_as_real(spec)
+        spec = torch.sqrt(spec.pow(2).sum(-1)+(1e-9))
+        spec = torch.matmul(self.mel_basis[str(self.mel_fmax)+'_'+str(y.device)], spec)
+        spec = spectral_normalize_torch(spec)
+        return spec
+class XVectorExtractor(nn.Module):
+    def __init__(self, audio_codec_with_xvector):
+        super().__init__()
+        option = onnxruntime.SessionOptions()
+        option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
+        option.intra_op_num_threads = 1
+        providers = ["CPUExecutionProvider"]
+        self.ort_session = onnxruntime.InferenceSession(audio_codec_with_xvector, sess_options=option, providers=providers)
+        self.tfm = sox.Transformer()
+        self.tfm.norm(db_level=-6)
+        self.mel_ext = MelSpectrogramFeatures(
+            filter_length=1024,
+            hop_length=160,
+            win_length=640,
+            n_mel_channels=80,
+            mel_fmin=0,
+            mel_fmax=8000,
+            sampling_rate=16000
+        )
+    def extract_code(self, audio):
+        with torch.no_grad():
+            norm_audio = self.sox_norm(audio)
+            norm_audio = torch.from_numpy(copy.deepcopy(norm_audio)).unsqueeze(0)
+            feat = kaldi.fbank(norm_audio,
+                            num_mel_bins=80,
+                            dither=0,
+                            sample_frequency=16000)
+            feat = feat - feat.mean(dim=0, keepdim=True)
+            norm_embedding = self.ort_session.run(None, {self.ort_session.get_inputs()[0].name: feat.unsqueeze(dim=0).cpu().numpy()})[0].flatten()
+            norm_embedding = F.normalize(torch.from_numpy(norm_embedding), dim=0)
+            ref_mel = self.mel_ext.extract(audio=norm_audio)
+        return norm_embedding.numpy(), ref_mel.permute(0,2,1).squeeze(0).numpy()
+    def sox_norm(self, audio):
+        wav_norm = self.tfm.build_array(input_array=audio, sample_rate_in=16000)
+        return wav_norm
+class WhisperEncoderVQ(WhisperEncoder):
+    def __init__(
+            self,
+            n_mels: int,
+            n_ctx: int,
+            n_state: int,
+            n_head: int,
+            n_layer: int,
+            n_window: int = 1500,
+            output_dim: int = 512,
+            grad_checkpointing: bool = False,
+            enable_mp: bool = False,
+            audio_sequence_parallel: bool = False,
+            audio_vq_layers: int = -1,
+            audio_vq_type: str = "NULL",
+            audio_vq_codebook_size: int = 4096,
+            audio_vq_pe: bool = False,
+            audio_vq_commit_loss: float = 0.0,
+            audio_vq_out_commit_loss: float = 0.0,
+            audio_vq_no_quantize: bool = False,
+            audio_vq_ff_layer: int = 0,
+            audio_vq_threshold_ema_dead_code: float = 0.1,
+            audio_vq_codebook_dim: int = None,
+            audio_vq_ds_rate: int = None,
+    ):
+        super().__init__(n_mels, n_ctx, n_state, n_head, n_layer, n_window, output_dim, grad_checkpointing, enable_mp, audio_sequence_parallel)
+        self.audio_vq_layers = audio_vq_layers
+        self.audio_vq_type = audio_vq_type
+        self.audio_vq_codebook_size = audio_vq_codebook_size
+        self.audio_vq_pe = audio_vq_pe
+        self.audio_vq_commit_loss = audio_vq_commit_loss
+        self.audio_vq_out_commit_loss = audio_vq_out_commit_loss
+        self.audio_vq_no_quantize = audio_vq_no_quantize
+        self.audio_vq_ff_layer = audio_vq_ff_layer
+        if audio_vq_layers > 0:
+            self.vq_feature_dim = self.n_state
+            self.audio_vq_ds_rate = 1
+        else:
+            raise NotImplementedError(f"Unsupported audio_vq_layers: {audio_vq_layers}")
+        if self.audio_vq_ds_rate == audio_vq_ds_rate:
+            self.audio_vq_downsample = nn.Identity()
+            self.audio_vq_upsample   = nn.Identity()
+        else:
+            assert audio_vq_ds_rate % self.audio_vq_ds_rate == 0
+            stride = audio_vq_ds_rate // self.audio_vq_ds_rate
+            self.audio_vq_downsample = Conv1d(self.vq_feature_dim, self.vq_feature_dim, kernel_size=stride, stride=stride)
+            self.audio_vq_upsample = ConvTranspose1d(self.vq_feature_dim, self.vq_feature_dim, kernel_size=stride, stride=stride)
+            self.audio_vq_ds_rate = audio_vq_ds_rate
+        if audio_vq_type == "GRVQ":
+            self.audio_quantizer = DistributedGroupResidualVectorQuantization(
+                codebook_size = audio_vq_codebook_size,
+                dim = self.vq_feature_dim,
+                codebook_dim = self.vq_codebook_dim if audio_vq_codebook_dim is None else audio_vq_codebook_dim,
+                num_groups=1,
+                num_quantizers=1,
+                kmeans_init=False,
+                threshold_ema_dead_code = audio_vq_threshold_ema_dead_code
+            )
+        else:
+            raise NotImplementedError(f"Unsupported audio_vq_type: {audio_vq_type}")
+        if self.audio_vq_pe:
+            self.project_after_vq_pe = nn.Linear(self.n_state, self.n_state)
+    def _calc_quantize_activities(self, indices):
+        indices_onehot = F.one_hot(indices.long().flatten(), self.audio_vq_codebook_size).sum(dim=0)
+        vq_num_activities = sum(indices_onehot>0)
+        vq_num_tokens = sum(indices_onehot)
+        return {
+            "vq_num_activities": vq_num_activities,
+            "vq_num_tokens": vq_num_tokens,
+        }
+    def _do_quantize(self, x, pe=None, y=None):
+        """
+            x: torch.Tensor, shape = (T, D)
+            q: torch.Tensor, shape = (T, D)
+            i: torch.Tensor, shape = (T)
+        """
+        if self.audio_vq_out_commit_loss > 0:
+            x_teacher = x.clone()
+        x = x.unsqueeze(0)
+        x = self.audio_vq_downsample(x.transpose(1, 2))
+        x = x.transpose(1, 2)
+        vq_stats = {}
+        if self.audio_vq_type == "GRVQ":
+            if self.training:
+                raise NotImplementedError
+            else:
+                indices = self.audio_quantizer.encode(x)
+                x = self.audio_quantizer.decode(indices)
+                indices = indices.squeeze(2).squeeze(1)
+        vq_stats.update(self._calc_quantize_activities(indices))
+        x, indices = x.squeeze(0), indices.squeeze(0)
+        if self.audio_vq_pe:
+            x = x + pe
+            x = self.project_after_vq_pe(x)
+        x = self.audio_vq_upsample(x.unsqueeze(0).transpose(1, 2))
+        x = x.transpose(1, 2).squeeze(0)
+        if self.audio_vq_out_commit_loss > 0:
+            vq_out_commit_loss = F.mse_loss(x_teacher.detach(), x)
+            vq_stats["vq_out_commit_loss"] = vq_out_commit_loss * self.audio_vq_out_commit_loss
+        return x, indices, vq_stats
+    def forward(self, x_list: List[Tensor], audio_mellens:List[int], audio_aftercnnlens:List[int], audio_seqlens:List[int], return_indices=False, audio_pitchs=None):
+        """
+        x : torch.Tensor, shape = (n_mels, n_ctx)
+            the mel spectrogram of the audio
+        """
+        aftercnn_x_list = []
+        pe_for_vq_list = []
+        for each_x in x_list:
+            each_x_split_list = each_x.split(self.n_window * 2, dim=1)
+            for each_x_split in each_x_split_list:
+                each_x_split = F.gelu(self.conv1(each_x_split))
+                each_x_split = F.gelu(self.conv2(each_x_split))
+                each_x_split = each_x_split.permute(1, 0) # L,D
+                each_positional_embedding_split = self.positional_embedding[:each_x_split.shape[0]]
+                aftercnn_x_list.append(each_x_split+each_positional_embedding_split.to(each_x_split.dtype))
+                pe_for_vq_split = self.positional_embedding[:each_x_split.shape[0] // self.audio_vq_ds_rate]
+                pe_for_vq_list.append(pe_for_vq_split.to(each_x_split.dtype))
+        pe_for_vq = torch.cat(pe_for_vq_list, dim=0)
+        x = torch.cat(aftercnn_x_list, dim=0)
+        src_len = x.size(0)
+        output_list = []
+        for item in audio_aftercnnlens:
+            while item > self.n_window:
+                output_list.append(self.n_window)
+                item -= self.n_window
+            output_list.append(item)
+        cu_seqlens = list(accumulate(output_list, func=operator.add,initial=0))
+        cu_seqlens = torch.Tensor(cu_seqlens).to(device=x.device, dtype=torch.int32)
+        layer_id = 0
+        for block in self.blocks:
+            layer_id+=1
+            x = block(x, cu_seqlens=cu_seqlens)
+            if self.audio_vq_layers == layer_id: # vq inside encoder
+                x, indices, vq_stats = self._do_quantize(x, pe_for_vq)
+                if return_indices:
+                    return x, indices
+        if self.avg_pooler:
+            x_list = x.split(audio_aftercnnlens, dim=0)
+            token_x_list = []
+            for x in x_list:
+                x = x.permute(1, 0)
+                x = self.avg_pooler(x)
+                x = x.permute(1, 0)
+                token_x_list.append(x)
+            x = torch.cat(token_x_list, dim=0)
+        x = self.ln_post(x)
+        x = self.proj(x)
+        output = torch.zeros(
+            (x.size(0) + len(audio_seqlens) * 2, x.size(1)),
+            device=x.device, dtype=x.dtype
+        )
+        audio_seqlens_acc = list(accumulate(audio_seqlens, func=operator.add, initial=0))
+        start_ids = torch.tensor(audio_seqlens_acc[:-1], device=x.device, dtype=torch.int32)
+        end_ids = torch.tensor(audio_seqlens_acc[1:], device=x.device, dtype=torch.int32) - 1
+        audio_tokens_mask = torch.ones(output.size(0), device=x.device, dtype=torch.bool)
+        audio_tokens_mask[start_ids] = False
+        audio_tokens_mask[end_ids] = False
+        output[start_ids] = self.audio_bos_eos_token.weight[0].to(x.dtype)
+        output[end_ids] = self.audio_bos_eos_token.weight[1].to(x.dtype)
+        output[audio_tokens_mask] = x
+        if self.audio_vq_type != "NULL":
+            return output, vq_stats
+        return output

qwen_tts/core/tokenizer_25hz/vq/whisper_encoder.py ADDED Viewed

	@@ -0,0 +1,406 @@

+# coding=utf-8
+# Copyright 2026 The Alibaba Qwen team.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import math
+import torch
+import operator
+import numpy as np
+import torch.nn.functional as F
+from functools import lru_cache
+from typing import Optional, Union, List
+from torch import nn, Tensor
+from itertools import accumulate
+try:
+    from flash_attn.flash_attn_interface import flash_attn_varlen_func as flash_attn_varlen_func
+except ImportError:
+    try:
+        from flash_attn.flash_attn_interface import flash_attn_unpadded_func as flash_attn_varlen_func
+    except ImportError:
+        print("\n********\nWarning: flash-attn is not installed. Will only run the manual PyTorch version. Please install flash-attn for faster inference.\n********\n ")
+        flash_attn_varlen_func = None
+# FFT size, also the Hann window length, passed to torch.stft in log_mel_spectrogram.
+N_FFT = 400
+# Hop length (in samples) passed to torch.stft in log_mel_spectrogram.
+HOP_LENGTH = 160
+@lru_cache(maxsize=None)
+def mel_filters(device, n_mels: int) -> torch.Tensor:
+    """
+    load the mel filterbank matrix for projecting STFT into a Mel spectrogram.
+    Allows decoupling librosa dependency; saved using:
+        np.savez_compressed(
+            "mel_filters.npz",
+            mel_80=librosa.filters.mel(sr=16000, n_fft=400, n_mels=80),
+            mel_128=librosa.filters.mel(sr=16000, n_fft=400, n_mels=128),
+        )
+    """
+    assert n_mels in {80, 128}, f"Unsupported n_mels: {n_mels}"
+    filters_path = os.path.join(os.path.dirname(__file__), "assets", "mel_filters.npz")
+    with np.load(filters_path, allow_pickle=False) as f:
+        return torch.from_numpy(f[f"mel_{n_mels}"]).to(device)
+def log_mel_spectrogram(
+    audio: Union[str, np.ndarray, torch.Tensor],
+    n_mels: int = 80,
+    padding: int = 0,
+    device: Optional[Union[str, torch.device]] = None,
+):
+    """
+    Compute the log-Mel spectrogram of
+    Parameters
+    ----------
+    audio: Union[str, np.ndarray, torch.Tensor], shape = (*)
+        The path to audio or either a NumPy array or Tensor containing the audio waveform in 16 kHz
+    n_mels: int
+        The number of Mel-frequency filters, only 80 is supported
+    padding: int
+        Number of zero samples to pad to the right
+    device: Optional[Union[str, torch.device]]
+        If given, the audio tensor is moved to this device before STFT
+    Returns
+    -------
+    torch.Tensor, shape = (80, n_frames)
+        A Tensor that contains the Mel spectrogram
+    """
+    if not torch.is_tensor(audio):
+        audio = torch.from_numpy(audio)
+    if device is not None:
+        audio = audio.to(device)
+    if padding > 0:
+        audio = F.pad(audio, (0, padding))
+    window = torch.hann_window(N_FFT).to(audio.device)
+    stft = torch.stft(audio, N_FFT, HOP_LENGTH, window=window, return_complex=True)
+    magnitudes = stft[..., :-1].abs() ** 2
+    filters = mel_filters(audio.device, n_mels)
+    mel_spec = filters @ magnitudes
+    log_spec = torch.clamp(mel_spec, min=1e-10).log10()
+    log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
+    log_spec = (log_spec + 4.0) / 4.0
+    return log_spec
+def get_T_after_cnn(L_in, dilation=1):
+    for (padding, kernel_size, stride) in eval("[(1,3,1)] + [(1,3,2)] "):
+        L_out = L_in + 2 * padding - dilation * (kernel_size - 1) - 1
+        L_out = 1 + L_out // stride
+        L_in = L_out
+    return L_out
+def get_mel_audio(audio, padding=False, audio_vq_ds_rate = 1, n_mels = 128):
+    audio_len = len(audio)
+    if padding:
+        reduction = 160 * 2 * audio_vq_ds_rate
+        audio_pad = math.ceil(audio_len / reduction) * reduction - audio_len
+        mel = log_mel_spectrogram(audio, n_mels=n_mels, padding=audio_pad)
+    else:
+        mel = log_mel_spectrogram(audio, n_mels=n_mels)  # [F,T]
+    return mel
+def sinusoids(length, channels, max_timescale=10000):
+    """Returns sinusoids for positional embedding"""
+    assert channels % 2 == 0
+    log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
+    inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2))
+    scaled_time = torch.arange(length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
+    return torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1)
+class Conv1d(nn.Conv1d):
+    def _conv_forward(
+        self, x: Tensor, weight: Tensor, bias: Optional[Tensor]
+    ) -> Tensor:
+        return super()._conv_forward(
+            x, weight.to(x.dtype), None if bias is None else bias.to(x.dtype)
+        )
+class ConvTranspose1d(nn.ConvTranspose1d):
+    def _conv_forward(
+        self, x: Tensor, weight: Tensor, bias: Optional[Tensor]
+    ) -> Tensor:
+        return super()._conv_forward(
+            x, weight.to(x.dtype), None if bias is None else bias.to(x.dtype)
+        )
+class Linear(nn.Linear):
+    def forward(self, x: Tensor) -> Tensor:
+        return F.linear(x, self.weight.to(x.dtype), None if self.bias is None else self.bias.to(x.dtype) )
+class MultiHeadAttention(nn.Module):
+    def __init__(self, n_state: int, n_head: int):
+        super().__init__()
+        self.n_head = n_head
+        self.query = Linear(n_state, n_state)
+        self.key = Linear(n_state, n_state, bias=False)
+        self.value = Linear(n_state, n_state)
+        self.out = Linear(n_state, n_state)
+        self.use_flash_attention = True
+    def forward(
+        self,
+        x: Tensor,
+        cu_seqlens = None,
+    ):
+        q = self.query(x)
+        k = self.key(x)
+        v = self.value(x)
+        if self.use_flash_attention:
+            if flash_attn_varlen_func is None:
+                x = self.qkv_attention_manual(q, k, v, cu_seqlens=cu_seqlens)
+            else:
+                if q.dtype not in [torch.float16, torch.bfloat16]:
+                    x = self.qkv_attention_manual(q, k, v, cu_seqlens=cu_seqlens)
+                    self.use_flash_attention = False
+                else:
+                    x = self.qkv_flash_attention(q, k, v, cu_seqlens=cu_seqlens)
+        else:
+            x = self.qkv_attention_manual(q, k, v, cu_seqlens=cu_seqlens)
+        output = self.out(x)
+        return output
+    def qkv_flash_attention(
+        self, q: Tensor, k: Tensor, v: Tensor, cu_seqlens=None
+    ):
+        n_ctx, n_state = q.shape
+        # scale = (n_state // self.n_head) ** -0.25
+        q = q.view(n_ctx, self.n_head, -1)# (batch_size, seqlen, nheads, headdim)
+        k = k.view(n_ctx, self.n_head, -1)
+        v = v.view(n_ctx, self.n_head, -1)
+        max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
+        x = flash_attn_varlen_func(
+            q, k, v, cu_seqlens, cu_seqlens, max_seqlen, max_seqlen, dropout_p=0.0
+        )
+        x = x.reshape(n_ctx, n_state)
+        return x
+    def qkv_attention_manual(
+        self, q: Tensor, k: Tensor, v: Tensor, cu_seqlens: Tensor
+    ):
+        n_ctx, n_state = q.shape
+        head_dim = n_state // self.n_head
+        scale = head_dim ** -0.5
+        q = q.view(n_ctx, self.n_head, head_dim)
+        k = k.view(n_ctx, self.n_head, head_dim)
+        v = v.view(n_ctx, self.n_head, head_dim)
+        seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
+        batch_size = len(seqlens)
+        max_seqlen = max(seqlens)
+        q_padded = torch.zeros(batch_size, max_seqlen, self.n_head, head_dim, dtype=q.dtype, device=q.device)
+        k_padded = torch.zeros_like(q_padded)
+        v_padded = torch.zeros_like(q_padded)
+        for i in range(batch_size):
+            start_idx = cu_seqlens[i]
+            end_idx = cu_seqlens[i+1]
+            seq_len = seqlens[i]
+            q_padded[i, :seq_len] = q[start_idx:end_idx]
+            k_padded[i, :seq_len] = k[start_idx:end_idx]
+            v_padded[i, :seq_len] = v[start_idx:end_idx]
+        q_padded = q_padded.transpose(1, 2)
+        k_padded = k_padded.transpose(1, 2)
+        v_padded = v_padded.transpose(1, 2)
+        attn_mask = torch.arange(max_seqlen, device=q.device)[None, :] < torch.tensor(seqlens, device=q.device)[:, None]
+        attn_mask = attn_mask.unsqueeze(1).unsqueeze(2)
+        attn_mask = attn_mask.masked_fill(attn_mask == 0, -torch.finfo(q.dtype).max)
+        attn_scores = torch.matmul(q_padded, k_padded.transpose(-2, -1)) * scale
+        attn_scores = attn_scores + attn_mask
+        attn_weights = F.softmax(attn_scores, dim=-1)
+        context = torch.matmul(attn_weights, v_padded)
+        context = context.transpose(1, 2).contiguous().view(batch_size, max_seqlen, n_state)
+        output_packed = torch.cat([context[i, :seqlens[i]] for i in range(batch_size)], dim=0)
+        assert output_packed.shape == (n_ctx, n_state)
+        return output_packed
+class ResidualAttentionBlock(nn.Module):
+    def __init__(self, n_state: int, n_head: int,
+                 enable_mp: bool = False, sequence_parallel: bool = False):
+        super().__init__()
+        n_mlp = n_state * 4
+        self.attn_ln = nn.LayerNorm(n_state)
+        self.mlp_ln = nn.LayerNorm(n_state)
+        self.attn = MultiHeadAttention(n_state, n_head)
+        self.mlp = nn.Sequential(
+                Linear(n_state, n_mlp), nn.GELU(), Linear(n_mlp, n_state)
+            )
+    def forward(
+        self,
+        x: Tensor,
+        cu_seqlens = None
+    ):
+        x = x + self.attn(self.attn_ln(x), cu_seqlens=cu_seqlens)
+        x = x + self.mlp(self.mlp_ln(x))
+        return x
+class WhisperEncoder(nn.Module):
+    def __init__(
+            self,
+            n_mels: int,
+            n_ctx: int,
+            n_state: int,
+            n_head: int,
+            n_layer: int,
+            n_window: int = 1500,
+            output_dim: int = 512,
+            grad_checkpointing: bool = False,
+            enable_mp: bool = False,
+            audio_sequence_parallel: bool = False,
+    ):
+        super().__init__()
+        self.conv1 = Conv1d(n_mels, n_state, kernel_size=3, padding=1)
+        self.conv2 = Conv1d(n_state, n_state, kernel_size=3, stride=2, padding=1)
+        self.register_buffer("positional_embedding", sinusoids(n_ctx, n_state))
+        self.n_layer = n_layer
+        self.n_mels = n_mels
+        self.blocks = nn.ModuleList(
+            [ResidualAttentionBlock(n_state, n_head, enable_mp=enable_mp, sequence_parallel=audio_sequence_parallel)
+             for _ in range(n_layer)]
+        )
+        self.ln_post = nn.LayerNorm(n_state)
+        self.avg_pooler = nn.AvgPool1d(2, stride=2)
+        self.proj = torch.nn.Linear(n_state, output_dim)
+        self.audio_bos_eos_token = nn.Embedding(2, output_dim)
+        self.output_dim = output_dim
+        self.grad_checkpointing = grad_checkpointing
+        self.enable_mp = enable_mp
+        self.n_head = n_head
+        self.n_state = n_state
+        self.n_window = n_window
+        self.audio_sequence_parallel = audio_sequence_parallel
+        self.tp_world_size = 1
+        self.set_audio_sync()
+    def set_audio_sync(self):
+        for name, param in self.named_parameters():
+            if not name.startswith("blocks"):
+                setattr(param, "audio_sync", True)
+    def forward(self, x_list: List[Tensor], audio_mellens:List[int], audio_aftercnnlens:List[int], audio_seqlens:List[int]):
+        """
+        x : torch.Tensor, shape = (n_mels, n_ctx)
+            the mel spectrogram of the audio
+        """
+        aftercnn_x_list = []
+        for each_x in x_list:
+            each_x_split_list = each_x.split(self.n_window * 2, dim=1)
+            for each_x_split in each_x_split_list:
+                each_x_split = F.gelu(self.conv1(each_x_split))
+                each_x_split = F.gelu(self.conv2(each_x_split))
+                each_x_split = each_x_split.permute(1, 0) # L,D
+                each_positional_embedding_split = self.positional_embedding[:each_x_split.shape[0]]
+                aftercnn_x_list.append(each_x_split+each_positional_embedding_split.to(each_x_split.dtype))
+        x = torch.cat(aftercnn_x_list, dim=0)
+        src_len = x.size(0)
+        output_list = []
+        for item in audio_aftercnnlens:
+            while item > self.n_window:
+                output_list.append(self.n_window)
+                item -= self.n_window
+            output_list.append(item)
+        cu_seqlens = list(accumulate(output_list, func=operator.add,initial=0))
+        cu_seqlens = torch.Tensor(cu_seqlens).to(device=x.device, dtype=torch.int32)
+        layer_id = 0
+        for block in self.blocks:
+            layer_id+=1
+            x = block(x, cu_seqlens=cu_seqlens)
+        if self.avg_pooler:
+            x_list = x.split(audio_aftercnnlens, dim=0)
+            token_x_list = []
+            for x in x_list:
+                x = x.permute(1, 0)
+                x = self.avg_pooler(x)
+                x = x.permute(1, 0)
+                token_x_list.append(x)
+            x = torch.cat(token_x_list, dim=0)
+        x = self.ln_post(x)
+        x = self.proj(x)
+        output = torch.zeros(
+            (x.size(0) + len(audio_seqlens) * 2, x.size(1)),
+            device=x.device, dtype=x.dtype
+        )
+        audio_seqlens_acc = list(accumulate(audio_seqlens, func=operator.add, initial=0))
+        start_ids = torch.tensor(audio_seqlens_acc[:-1], device=x.device, dtype=torch.int32)
+        end_ids = torch.tensor(audio_seqlens_acc[1:], device=x.device, dtype=torch.int32) - 1
+        audio_tokens_mask = torch.ones(output.size(0), device=x.device, dtype=torch.bool)
+        audio_tokens_mask[start_ids] = False
+        audio_tokens_mask[end_ids] = False
+        output[start_ids] = self.audio_bos_eos_token.weight[0].to(x.dtype)
+        output[end_ids] = self.audio_bos_eos_token.weight[1].to(x.dtype)
+        output[audio_tokens_mask] = x
+        return output
+    def lock(self, layers: int):
+        self.conv1.requires_grad_(False)
+        self.conv2.requires_grad_(False)
+        for i in range(min(layers, len(self.blocks))):
+            self.blocks[i].requires_grad_(False)

qwen_tts/inference/qwen3_tts_model.py ADDED Viewed

	@@ -0,0 +1,877 @@

+# coding=utf-8
+# Copyright 2026 The Alibaba Qwen team.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import base64
+import io
+import urllib.request
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple, Union
+from urllib.parse import urlparse
+import librosa
+import numpy as np
+import soundfile as sf
+import torch
+from transformers import AutoConfig, AutoModel, AutoProcessor
+from ..core.models import Qwen3TTSConfig, Qwen3TTSForConditionalGeneration, Qwen3TTSProcessor
+# Accepted forms of an audio input.
+AudioLike = Union[
+    str,                     # wav path, URL, base64
+    np.ndarray,              # waveform (requires sr)
+    Tuple[np.ndarray, int],  # (waveform, sr)
+]
+# Either a single value or a list of values.
+MaybeList = Union[Any, List[Any]]
+@dataclass
+class VoiceClonePromptItem:
+    """
+    Container for one sample's voice-clone prompt information that can be fed to the model.
+    Fields are aligned with `Qwen3TTSForConditionalGeneration.generate(..., voice_clone_prompt=...)`.
+    """
+    ref_code: Optional[torch.Tensor]                 # (T, Q) or (T,) depending on tokenizer 25Hz/12Hz
+    ref_spk_embedding: torch.Tensor                  # (D,)
+    x_vector_only_mode: bool
+    icl_mode: bool
+    ref_text: Optional[str] = None
+class Qwen3TTSModel:
+    """
+    A HuggingFace-style wrapper for Qwen3 TTS models (CustomVoice/VoiceDesign/Base) that provides:
+      - from_pretrained() initialization via AutoModel/AutoProcessor
+      - generation APIs for:
+          * CustomVoice: generate_custom_voice()
+          * VoiceDesign: generate_voice_design()
+          * Base: generate_voice_clone() + create_voice_clone_prompt()
+      - consistent output: (wavs: List[np.ndarray], sample_rate: int)
+    Notes:
+      - This wrapper expects the underlying model class to be `Qwen3TTSForConditionalGeneration`
+      - Language / speaker validation is done via model methods:
+          model.get_supported_languages(), model.get_supported_speakers()
+    """
+    def __init__(self, model: Qwen3TTSForConditionalGeneration, processor, generate_defaults: Optional[Dict[str, Any]] = None):
+        self.model = model
+        self.processor = processor
+        self.generate_defaults = generate_defaults or {}
+        self.device = getattr(model, "device", None)
+        if self.device is None:
+            try:
+                self.device = next(model.parameters()).device
+            except StopIteration:
+                self.device = torch.device("cpu")
+    @classmethod
+    def from_pretrained(
+        cls,
+        pretrained_model_name_or_path: str,
+        **kwargs,
+    ) -> "Qwen3TTSModel":
+        """
+        Load a Qwen3 TTS model and its processor in HuggingFace `from_pretrained` style.
+        This method:
+          1) Loads config via AutoConfig (so your side can register model_type -> config/model).
+          2) Loads the model via AutoModel.from_pretrained(...), forwarding `kwargs` unchanged.
+          3) Loads the processor via AutoProcessor.from_pretrained(model_path).
+          4) Loads optional `generate_config.json` from the model directory/repo snapshot if present.
+        Args:
+            pretrained_model_name_or_path (str):
+                HuggingFace repo id or local directory of the model.
+            **kwargs:
+                Forwarded as-is into `AutoModel.from_pretrained(...)`.
+                Typical examples: device_map="cuda:0", dtype=torch.bfloat16, attn_implementation="flash_attention_2".
+        Returns:
+            Qwen3TTSModel:
+                Wrapper instance containing `model`, `processor`, and generation defaults.
+        """
+        AutoConfig.register("qwen3_tts", Qwen3TTSConfig)
+        AutoModel.register(Qwen3TTSConfig, Qwen3TTSForConditionalGeneration)
+        AutoProcessor.register(Qwen3TTSConfig, Qwen3TTSProcessor)
+        model = AutoModel.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        if not isinstance(model, Qwen3TTSForConditionalGeneration):
+            raise TypeError(
+                f"AutoModel returned {type(model)}, expected Qwen3TTSForConditionalGeneration. "
+            )
+        processor = AutoProcessor.from_pretrained(pretrained_model_name_or_path, fix_mistral_regex=True,)
+        generate_defaults = model.generate_config
+        return cls(model=model, processor=processor, generate_defaults=generate_defaults)
+    def _supported_languages_set(self) -> Optional[set]:
+        langs = getattr(self.model, "get_supported_languages", None)
+        if callable(langs):
+            v = langs()
+            if v is None:
+                return None
+            return set([str(x).lower() for x in v])
+        return None
+    def _supported_speakers_set(self) -> Optional[set]:
+        spks = getattr(self.model, "get_supported_speakers", None)
+        if callable(spks):
+            v = spks()
+            if v is None:
+                return None
+            return set([str(x).lower() for x in v])
+        return None
+    def _validate_languages(self, languages: List[str]) -> None:
+        """
+        Validate that requested languages are supported by the model.
+        Args:
+            languages (List[str]): Language names for each sample.
+        Raises:
+            ValueError: If any language is not supported.
+        """
+        supported = self._supported_languages_set()
+        if supported is None:
+            return
+        bad = []
+        for lang in languages:
+            if lang is None:
+                bad.append(lang)
+                continue
+            if str(lang).lower() not in supported:
+                bad.append(lang)
+        if bad:
+            raise ValueError(f"Unsupported languages: {bad}. Supported: {sorted(supported)}")
+    def _validate_speakers(self, speakers: List[Optional[str]]) -> None:
+        """
+        Validate that requested speakers are supported by the Instruct model.
+        Args:
+            speakers (List[Optional[str]]): Speaker names for each sample.
+        Raises:
+            ValueError: If any speaker is not supported.
+        """
+        supported = self._supported_speakers_set()
+        if supported is None:
+            return
+        bad = []
+        for spk in speakers:
+            if spk is None or spk == "":
+                continue
+            if str(spk).lower() not in supported:
+                bad.append(spk)
+        if bad:
+            raise ValueError(f"Unsupported speakers: {bad}. Supported: {sorted(supported)}")
+    def _is_probably_base64(self, s: str) -> bool:
+        if s.startswith("data:audio"):
+            return True
+        if ("/" not in s and "\\" not in s) and len(s) > 256:
+            return True
+        return False
+    def _is_url(self, s: str) -> bool:
+        try:
+            u = urlparse(s)
+            return u.scheme in ("http", "https") and bool(u.netloc)
+        except Exception:
+            return False
+    def _decode_base64_to_wav_bytes(self, b64: str) -> bytes:
+        if "," in b64 and b64.strip().startswith("data:"):
+            b64 = b64.split(",", 1)[1]
+        return base64.b64decode(b64)
+    def _load_audio_to_np(self, x: str) -> Tuple[np.ndarray, int]:
+        if self._is_url(x):
+            with urllib.request.urlopen(x) as resp:
+                audio_bytes = resp.read()
+            with io.BytesIO(audio_bytes) as f:
+                audio, sr = sf.read(f, dtype="float32", always_2d=False)
+        elif self._is_probably_base64(x):
+            wav_bytes = self._decode_base64_to_wav_bytes(x)
+            with io.BytesIO(wav_bytes) as f:
+                audio, sr = sf.read(f, dtype="float32", always_2d=False)
+        else:
+            audio, sr = librosa.load(x, sr=None, mono=True)
+        if audio.ndim > 1:
+            audio = np.mean(audio, axis=-1)
+        return audio.astype(np.float32), int(sr)
+    def _normalize_audio_inputs(self, audios: Union[AudioLike, List[AudioLike]]) -> List[Tuple[np.ndarray, int]]:
+        """
+        Normalize audio inputs into a list of (waveform, sr).
+        Supported forms:
+          - str: wav path / URL / base64 audio string
+          - (np.ndarray, sr): waveform + sampling rate
+          - list of the above
+        Args:
+            audios:
+                Audio input(s).
+        Returns:
+            List[Tuple[np.ndarray, int]]:
+                List of (float32 waveform, original sr).
+        Raises:
+            ValueError: If a numpy waveform is provided without sr.
+        """
+        if isinstance(audios, list):
+            items = audios
+        else:
+            items = [audios]
+        out: List[Tuple[np.ndarray, int]] = []
+        for a in items:
+            if isinstance(a, str):
+                out.append(self._load_audio_to_np(a))
+            elif isinstance(a, tuple) and len(a) == 2 and isinstance(a[0], np.ndarray):
+                out.append((a[0].astype(np.float32), int(a[1])))
+            elif isinstance(a, np.ndarray):
+                raise ValueError("For numpy waveform input, pass a tuple (audio, sr).")
+            else:
+                raise TypeError(f"Unsupported audio input type: {type(a)}")
+        for i, a in enumerate(out):
+            if a[0].ndim > 1:
+                a[0] = np.mean(a[0], axis=-1).astype(np.float32)
+                out[i] = (a[0], a[1])
+        return out
+    def _ensure_list(self, x: MaybeList) -> List[Any]:
+        return x if isinstance(x, list) else [x]
+    def _build_assistant_text(self, text: str) -> str:
+        return f"<|im_start|>assistant\n{text}<|im_end|>\n<|im_start|>assistant\n"
+    def _build_ref_text(self, text: str) -> str:
+        return f"<|im_start|>assistant\n{text}<|im_end|>\n"
+    def _build_instruct_text(self, instruct: str) -> str:
+        return f"<|im_start|>user\n{instruct}<|im_end|>\n"
+    def _tokenize_texts(self, texts: List[str]) -> List[torch.Tensor]:
+        input_ids = []
+        for text in texts:
+            input = self.processor(text=text, return_tensors="pt", padding=True)
+            input_id = input["input_ids"].to(self.device)
+            input_id = input_id.unsqueeze(0) if input_id.dim() == 1 else input_id
+            input_ids.append(input_id)
+        return input_ids
+    def _merge_generate_kwargs(
+        self,
+        do_sample: Optional[bool] = None,
+        top_k: Optional[int] = None,
+        top_p: Optional[float] = None,
+        temperature: Optional[float] = None,
+        repetition_penalty: Optional[float] = None,
+        subtalker_dosample: Optional[bool] = None,
+        subtalker_top_k: Optional[int] = None,
+        subtalker_top_p: Optional[float] = None,
+        subtalker_temperature: Optional[float] = None,
+        max_new_tokens: Optional[int] = None,
+        **kwargs,
+    ) -> Dict[str, Any]:
+        """
+        Merge user-provided generation arguments with defaults from `generate_config.json`.
+        Rule:
+          - If the user explicitly passes a value (not None), use it.
+          - Otherwise, use the value from generate_config.json if present.
+          - Otherwise, fall back to the hard defaults.
+        Args:
+            do_sample, top_k, top_p, temperature, repetition_penalty,
+            subtalker_dosample, subtalker_top_k, subtalker_top_p, subtalker_temperature, max_new_tokens:
+                Common generation parameters.
+            **kwargs:
+                Other arguments forwarded to model.generate().
+        Returns:
+            Dict[str, Any]: Final kwargs to pass into model.generate().
+        """
+        hard_defaults = dict(
+            do_sample=True,
+            top_k=50,
+            top_p=1.0,
+            temperature=0.9,
+            repetition_penalty=1.05,
+            subtalker_dosample=True,
+            subtalker_top_k=50,
+            subtalker_top_p=1.0,
+            subtalker_temperature=0.9,
+            max_new_tokens=2048,
+        )
+        def pick(name: str, user_val: Any) -> Any:
+            if user_val is not None:
+                return user_val
+            if name in self.generate_defaults:
+                return self.generate_defaults[name]
+            return hard_defaults[name]
+        merged = dict(kwargs)
+        merged.update(
+            do_sample=pick("do_sample", do_sample),
+            top_k=pick("top_k", top_k),
+            top_p=pick("top_p", top_p),
+            temperature=pick("temperature", temperature),
+            repetition_penalty=pick("repetition_penalty", repetition_penalty),
+            subtalker_dosample=pick("subtalker_dosample", subtalker_dosample),
+            subtalker_top_k=pick("subtalker_top_k", subtalker_top_k),
+            subtalker_top_p=pick("subtalker_top_p", subtalker_top_p),
+            subtalker_temperature=pick("subtalker_temperature", subtalker_temperature),
+            max_new_tokens=pick("max_new_tokens", max_new_tokens),
+        )
+        return merged
+    # voice clone model
+    @torch.inference_mode()
+    def create_voice_clone_prompt(
+        self,
+        ref_audio: Union[AudioLike, List[AudioLike]],
+        ref_text: Optional[Union[str, List[Optional[str]]]] = None,
+        x_vector_only_mode: Union[bool, List[bool]] = False,
+    ) -> List[VoiceClonePromptItem]:
+        """
+        Build voice-clone prompt items from reference audio (and optionally reference text) using Base model.
+        Modes:
+          - x_vector_only_mode=True:
+              Only speaker embedding is used to clone voice; ref_text/ref_code are ignored.
+              This is mutually exclusive with ICL.
+          - x_vector_only_mode=False:
+              ICL mode is enabled automatically (icl_mode=True). In this case ref_text is required,
+              because the model continues/conditions on the reference text + reference speech codes.
+        Batch behavior:
+          - ref_audio can be a single item or a list.
+          - ref_text and x_vector_only_mode can be scalars or lists.
+          - If any of them are lists with length > 1, lengths must match.
+        Audio input:
+          - str: local wav path / URL / base64
+          - (np.ndarray, sr): waveform + sampling rate
+        Args:
+            ref_audio:
+                Reference audio(s) used to extract:
+                  - ref_code via `model.speech_tokenizer.encode(...)`
+                  - ref_spk_embedding via `model.extract_speaker_embedding(...)` (resampled to 24k)
+            ref_text:
+                Reference transcript(s). Required when x_vector_only_mode=False (ICL mode).
+            x_vector_only_mode:
+                Whether to use speaker embedding only. If False, ICL mode will be used.
+        Returns:
+            List[VoiceClonePromptItem]:
+                List of prompt items that can be converted into `voice_clone_prompt` dict.
+        Raises:
+            ValueError:
+                - If x_vector_only_mode=False but ref_text is missing.
+                - If batch lengths mismatch.
+        """
+        if self.model.tts_model_type != "base":
+            raise ValueError(
+                f"model with \ntokenizer_type: {self.model.tokenizer_type}\n"
+                f"tts_model_size: {self.model.tts_model_size}\n"
+                f"tts_model_type: {self.model.tts_model_type}\n"
+                "does not support create_voice_clone_prompt, Please check Model Card or Readme for more details."
+            )
+        ref_audio_list = self._ensure_list(ref_audio)
+        ref_text_list = self._ensure_list(ref_text) if isinstance(ref_text, list) else ([ref_text] * len(ref_audio_list))
+        xvec_list = self._ensure_list(x_vector_only_mode) if isinstance(x_vector_only_mode, list) else ([x_vector_only_mode] * len(ref_audio_list))
+        if len(ref_text_list) != len(ref_audio_list) or len(xvec_list) != len(ref_audio_list):
+            raise ValueError(
+                f"Batch size mismatch: ref_audio={len(ref_audio_list)}, ref_text={len(ref_text_list)}, x_vector_only_mode={len(xvec_list)}"
+            )
+        normalized = self._normalize_audio_inputs(ref_audio_list)
+        ref_wavs_for_code: List[np.ndarray] = []
+        ref_sr_for_code: List[int] = []
+        for wav, sr in normalized:
+            ref_wavs_for_code.append(wav)
+            ref_sr_for_code.append(sr)
+        if len(set(ref_sr_for_code)) == 1:
+            enc = self.model.speech_tokenizer.encode(ref_wavs_for_code, sr=ref_sr_for_code[0])
+            ref_codes = enc.audio_codes
+        else:
+            ref_codes = []
+            for wav, sr in normalized:
+                ref_codes.append(self.model.speech_tokenizer.encode(wav, sr=sr).audio_codes[0])
+        items: List[VoiceClonePromptItem] = []
+        for i, ((wav, sr), code, rtext, xvec_only) in enumerate(zip(normalized, ref_codes, ref_text_list, xvec_list)):
+            if not xvec_only:
+                if rtext is None or rtext == "":
+                    raise ValueError(f"ref_text is required when x_vector_only_mode=False (ICL mode). Bad index={i}")
+            wav_resample = wav
+            if sr != self.model.speaker_encoder_sample_rate:
+                wav_resample = librosa.resample(y=wav_resample.astype(np.float32),
+                                           orig_sr=int(sr),
+                                           target_sr=self.model.speaker_encoder_sample_rate)
+            spk_emb = self.model.extract_speaker_embedding(audio=wav_resample,
+                                                           sr=self.model.speaker_encoder_sample_rate)
+            items.append(
+                VoiceClonePromptItem(
+                    ref_code=None if xvec_only else code,
+                    ref_spk_embedding=spk_emb,
+                    x_vector_only_mode=bool(xvec_only),
+                    icl_mode=bool(not xvec_only),
+                    ref_text=rtext,
+                )
+            )
+        return items
+    def _prompt_items_to_voice_clone_prompt(self, items: List[VoiceClonePromptItem]) -> Dict[str, Any]:
+        return dict(
+            ref_code=[it.ref_code for it in items],
+            ref_spk_embedding=[it.ref_spk_embedding for it in items],
+            x_vector_only_mode=[it.x_vector_only_mode for it in items],
+            icl_mode=[it.icl_mode for it in items],
+        )
+    # voice clone model
+    @torch.no_grad()
+    def generate_voice_clone(
+        self,
+        text: Union[str, List[str]],
+        language: Union[str, List[str]] = None,
+        ref_audio: Optional[Union[AudioLike, List[AudioLike]]] = None,
+        ref_text: Optional[Union[str, List[Optional[str]]]] = None,
+        x_vector_only_mode: Union[bool, List[bool]] = False,
+        voice_clone_prompt: Optional[Union[Dict[str, Any], List[VoiceClonePromptItem]]] = None,
+        non_streaming_mode: bool = False,
+        **kwargs,
+    ) -> Tuple[List[np.ndarray], int]:
+        """
+        Voice clone speech using the Base model.
+        You can provide either:
+          - (ref_audio, ref_text, x_vector_only_mode) and let this method build the prompt, OR
+          - `VoiceClonePromptItem` returned by `create_voice_clone_prompt`, OR
+          - a list of `VoiceClonePromptItem` returned by `create_voice_clone_prompt`.
+        `ref_audio` Supported forms:
+        - str: wav path / URL / base64 audio string
+        - (np.ndarray, sr): waveform + sampling rate
+        - list of the above
+        Input flexibility:
+          - text/language can be scalar or list.
+          - prompt can be single or batch.
+          - If batch mode (len(text)>1), lengths must match.
+        Args:
+            text:
+                Text(s) to synthesize.
+            language:
+                Language(s) for each sample.
+            ref_audio:
+                Reference audio(s) for prompt building. Required if voice_clone_prompt is not provided.
+            ref_text:
+                Reference text(s) used for ICL mode (required when x_vector_only_mode=False).
+            x_vector_only_mode:
+                If True, only speaker embedding is used (ignores ref_text/ref_code).
+                If False, ICL mode is used automatically.
+            voice_clone_prompt:
+                list[VoiceClonePromptItem] from `create_voice_clone_prompt`.
+            non_streaming_mode:
+                Using non-streaming text input, this option currently only simulates streaming text input when set to `false`,
+                rather than enabling true streaming input or streaming generation.
+            do_sample:
+                Whether to use sampling, recommended to be set to `true` for most use cases.
+            top_k:
+                Top-k sampling parameter.
+            top_p:
+                Top-p sampling parameter.
+            temperature:
+                Sampling temperature; higher => more random.
+            repetition_penalty:
+                Penalty to reduce repeated tokens/codes.
+            subtalker_dosample:
+                Sampling switch for the sub-talker (only valid for qwen3-tts-tokenizer-v2) if applicable.
+            subtalker_top_k:
+                Top-k for sub-talker sampling (only valid for qwen3-tts-tokenizer-v2).
+            subtalker_top_p:
+                Top-p for sub-talker sampling (only valid for qwen3-tts-tokenizer-v2).
+            subtalker_temperature:
+                Temperature for sub-talker sampling (only valid for qwen3-tts-tokenizer-v2).
+            max_new_tokens:
+                Maximum number of new codec tokens to generate.
+            **kwargs:
+                Any other keyword arguments supported by HuggingFace Transformers `generate()` can be passed.
+                They will be forwarded to the underlying `Qwen3TTSForConditionalGeneration.generate(...)`.
+        Returns:
+            Tuple[List[np.ndarray], int]:
+                (wavs, sample_rate)
+        Raises:
+            ValueError:
+                If batch sizes mismatch or required prompt inputs are missing.
+        """
+        if self.model.tts_model_type != "base":
+            raise ValueError(
+                f"model with \ntokenizer_type: {self.model.tokenizer_type}\n"
+                f"tts_model_size: {self.model.tts_model_size}\n"
+                f"tts_model_type: {self.model.tts_model_type}\n"
+                "does not support generate_voice_clone, Please check Model Card or Readme for more details."
+            )
+        texts = self._ensure_list(text)
+        languages = self._ensure_list(language) if isinstance(language, list) else ([language] * len(texts) if language is not None else ["Auto"] * len(texts))
+        if len(languages) == 1 and len(texts) > 1:
+            languages = languages * len(texts)
+        if len(texts) != len(languages):
+            raise ValueError(f"Batch size mismatch: text={len(texts)}, language={len(languages)}")
+        self._validate_languages(languages)
+        if voice_clone_prompt is None:
+            if ref_audio is None:
+                raise ValueError("Either `voice_clone_prompt` or `ref_audio` must be provided.")
+            prompt_items = self.create_voice_clone_prompt(ref_audio=ref_audio, ref_text=ref_text, x_vector_only_mode=x_vector_only_mode)
+            if len(prompt_items) == 1 and len(texts) > 1:
+                prompt_items = prompt_items * len(texts)
+            if len(prompt_items) != len(texts):
+                raise ValueError(f"Batch size mismatch: prompt={len(prompt_items)}, text={len(texts)}")
+            voice_clone_prompt_dict = self._prompt_items_to_voice_clone_prompt(prompt_items)
+            ref_texts_for_ids = [it.ref_text for it in prompt_items]
+        else:
+            if isinstance(voice_clone_prompt, list):
+                prompt_items = voice_clone_prompt
+                if len(prompt_items) == 1 and len(texts) > 1:
+                    prompt_items = prompt_items * len(texts)
+                if len(prompt_items) != len(texts):
+                    raise ValueError(f"Batch size mismatch: prompt={len(prompt_items)}, text={len(texts)}")
+                voice_clone_prompt_dict = self._prompt_items_to_voice_clone_prompt(prompt_items)
+                ref_texts_for_ids = [it.ref_text for it in prompt_items]
+            else:
+                voice_clone_prompt_dict = voice_clone_prompt
+                ref_texts_for_ids = None
+        input_texts = [self._build_assistant_text(t) for t in texts]
+        input_ids = self._tokenize_texts(input_texts)
+        ref_ids = None
+        if ref_texts_for_ids is not None:
+            ref_ids = []
+            for i, rt in enumerate(ref_texts_for_ids):
+                if rt is None or rt == "":
+                    ref_ids.append(None)
+                else:
+                    ref_tok = self._tokenize_texts([self._build_ref_text(rt)])[0]
+                    ref_ids.append(ref_tok)
+        gen_kwargs = self._merge_generate_kwargs(**kwargs)
+        talker_codes_list, _ = self.model.generate(
+            input_ids=input_ids,
+            ref_ids=ref_ids,
+            voice_clone_prompt=voice_clone_prompt_dict,
+            languages=languages,
+            non_streaming_mode=non_streaming_mode,
+            **gen_kwargs,
+        )
+        codes_for_decode = []
+        for i, codes in enumerate(talker_codes_list):
+            ref_code_list = voice_clone_prompt_dict.get("ref_code", None)
+            if ref_code_list is not None and ref_code_list[i] is not None:
+                codes_for_decode.append(torch.cat([ref_code_list[i].to(codes.device), codes], dim=0))
+            else:
+                codes_for_decode.append(codes)
+        wavs_all, fs = self.model.speech_tokenizer.decode([{"audio_codes": c} for c in codes_for_decode])
+        wavs_out: List[np.ndarray] = []
+        for i, wav in enumerate(wavs_all):
+            ref_code_list = voice_clone_prompt_dict.get("ref_code", None)
+            if ref_code_list is not None and ref_code_list[i] is not None:
+                ref_len = int(ref_code_list[i].shape[0])
+                total_len = int(codes_for_decode[i].shape[0])
+                cut = int(ref_len / max(total_len, 1) * wav.shape[0])
+                wavs_out.append(wav[cut:])
+            else:
+                wavs_out.append(wav)
+        return wavs_out, fs
+    # voice design model
+    @torch.no_grad()
+    def generate_voice_design(
+        self,
+        text: Union[str, List[str]],
+        instruct: Union[str, List[str]],
+        language: Union[str, List[str]] = None,
+        non_streaming_mode: bool = True,
+        **kwargs,
+    ) -> Tuple[List[np.ndarray], int]:
+        """
+        Generate speech with the VoiceDesign model using natural-language style instructions.
+        Args:
+            text:
+                Text(s) to synthesize.
+            language:
+                Language(s) for each sample.
+            instruct:
+                Instruction(s) describing desired voice/style. Empty string is allowed (treated as no instruction).
+            non_streaming_mode:
+                Using non-streaming text input, this option currently only simulates streaming text input when set to `false`,
+                rather than enabling true streaming input or streaming generation.
+            do_sample:
+                Whether to use sampling, recommended to be set to `true` for most use cases.
+            top_k:
+                Top-k sampling parameter.
+            top_p:
+                Top-p sampling parameter.
+            temperature:
+                Sampling temperature; higher => more random.
+            repetition_penalty:
+                Penalty to reduce repeated tokens/codes.
+            subtalker_dosample:
+                Sampling switch for the sub-talker (only valid for qwen3-tts-tokenizer-v2) if applicable.
+            subtalker_top_k:
+                Top-k for sub-talker sampling (only valid for qwen3-tts-tokenizer-v2).
+            subtalker_top_p:
+                Top-p for sub-talker sampling (only valid for qwen3-tts-tokenizer-v2).
+            subtalker_temperature:
+                Temperature for sub-talker sampling (only valid for qwen3-tts-tokenizer-v2).
+            max_new_tokens:
+                Maximum number of new codec tokens to generate.
+            **kwargs:
+                Any other keyword arguments supported by HuggingFace Transformers `generate()` can be passed.
+                They will be forwarded to the underlying `Qwen3TTSForConditionalGeneration.generate(...)`.
+        Returns:
+            Tuple[List[np.ndarray], int]:
+                (wavs, sample_rate)
+        """
+        if self.model.tts_model_type != "voice_design":
+            raise ValueError(
+                f"model with \ntokenizer_type: {self.model.tokenizer_type}\n"
+                f"tts_model_size: {self.model.tts_model_size}\n"
+                f"tts_model_type: {self.model.tts_model_type}\n"
+                "does not support generate_voice_design, Please check Model Card or Readme for more details."
+            )
+        texts = self._ensure_list(text)
+        languages = self._ensure_list(language) if isinstance(language, list) else ([language] * len(texts) if language is not None else ["Auto"] * len(texts))
+        instructs = self._ensure_list(instruct)
+        if len(languages) == 1 and len(texts) > 1:
+            languages = languages * len(texts)
+        if len(instructs) == 1 and len(texts) > 1:
+            instructs = instructs * len(texts)
+        if not (len(texts) == len(languages) == len(instructs)):
+            raise ValueError(f"Batch size mismatch: text={len(texts)}, language={len(languages)}, instruct={len(instructs)}")
+        self._validate_languages(languages)
+        input_ids = self._tokenize_texts([self._build_assistant_text(t) for t in texts])
+        instruct_ids: List[Optional[torch.Tensor]] = []
+        for ins in instructs:
+            if ins is None or ins == "":
+                instruct_ids.append(None)
+            else:
+                instruct_ids.append(self._tokenize_texts([self._build_instruct_text(ins)])[0])
+        gen_kwargs = self._merge_generate_kwargs(**kwargs)
+        talker_codes_list, _ = self.model.generate(
+            input_ids=input_ids,
+            instruct_ids=instruct_ids,
+            languages=languages,
+            non_streaming_mode=non_streaming_mode,
+            **gen_kwargs,
+        )
+        wavs, fs = self.model.speech_tokenizer.decode([{"audio_codes": c} for c in talker_codes_list])
+        return wavs, fs
+    # custom voice model
+    @torch.no_grad()
+    def generate_custom_voice(
+        self,
+        text: Union[str, List[str]],
+        speaker: Union[str, List[str]],
+        language: Union[str, List[str]] = None,
+        instruct: Optional[Union[str, List[str]]] = None,
+        non_streaming_mode: bool = True,
+        **kwargs,
+    ) -> Tuple[List[np.ndarray], int]:
+        """
+        Generate speech with the CustomVoice model using a predefined speaker id, optionally controlled by instruction text.
+        Args:
+            text:
+                Text(s) to synthesize.
+            language:
+                Language(s) for each sample.
+            speaker:
+                Speaker name(s). Will be validated against `model.get_supported_speakers()` (case-insensitive).
+            instruct:
+                Optional instruction(s). If None, treated as empty (no instruction).
+            non_streaming_mode:
+                Using non-streaming text input, this option currently only simulates streaming text input when set to `false`,
+                rather than enabling true streaming input or streaming generation.
+            do_sample:
+                Whether to use sampling, recommended to be set to `true` for most use cases.
+            top_k:
+                Top-k sampling parameter.
+            top_p:
+                Top-p sampling parameter.
+            temperature:
+                Sampling temperature; higher => more random.
+            repetition_penalty:
+                Penalty to reduce repeated tokens/codes.
+            subtalker_dosample:
+                Sampling switch for the sub-talker (only valid for qwen3-tts-tokenizer-v2) if applicable.
+            subtalker_top_k:
+                Top-k for sub-talker sampling (only valid for qwen3-tts-tokenizer-v2).
+            subtalker_top_p:
+                Top-p for sub-talker sampling (only valid for qwen3-tts-tokenizer-v2).
+            subtalker_temperature:
+                Temperature for sub-talker sampling (only valid for qwen3-tts-tokenizer-v2).
+            max_new_tokens:
+                Maximum number of new codec tokens to generate.
+            **kwargs:
+                Any other keyword arguments supported by HuggingFace Transformers `generate()` can be passed.
+                They will be forwarded to the underlying `Qwen3TTSForConditionalGeneration.generate(...)`.
+        Returns:
+            Tuple[List[np.ndarray], int]:
+                (wavs, sample_rate)
+        Raises:
+            ValueError:
+                If any speaker/language is unsupported or batch sizes mismatch.
+        """
+        if self.model.tts_model_type != "custom_voice":
+            raise ValueError(
+                f"model with \ntokenizer_type: {self.model.tokenizer_type}\n"
+                f"tts_model_size: {self.model.tts_model_size}\n"
+                f"tts_model_type: {self.model.tts_model_type}\n"
+                "does not support generate_custom_voice, Please check Model Card or Readme for more details."
+            )
+        texts = self._ensure_list(text)
+        languages = self._ensure_list(language) if isinstance(language, list) else ([language] * len(texts) if language is not None else ["Auto"] * len(texts))
+        speakers = self._ensure_list(speaker)
+        if self.model.tts_model_size in "0b6": # for 0b6 model, instruct is not supported
+            instruct = None
+        instructs = self._ensure_list(instruct) if isinstance(instruct, list) else ([instruct] * len(texts) if instruct is not None else [""] * len(texts))
+        if len(languages) == 1 and len(texts) > 1:
+            languages = languages * len(texts)
+        if len(speakers) == 1 and len(texts) > 1:
+            speakers = speakers * len(texts)
+        if len(instructs) == 1 and len(texts) > 1:
+            instructs = instructs * len(texts)
+        if not (len(texts) == len(languages) == len(speakers) == len(instructs)):
+            raise ValueError(
+                f"Batch size mismatch: text={len(texts)}, language={len(languages)}, speaker={len(speakers)}, instruct={len(instructs)}"
+            )
+        self._validate_languages(languages)
+        self._validate_speakers(speakers)
+        input_ids = self._tokenize_texts([self._build_assistant_text(t) for t in texts])
+        instruct_ids: List[Optional[torch.Tensor]] = []
+        for ins in instructs:
+            if ins is None or ins == "":
+                instruct_ids.append(None)
+            else:
+                instruct_ids.append(self._tokenize_texts([self._build_instruct_text(ins)])[0])
+        gen_kwargs = self._merge_generate_kwargs(**kwargs)
+        talker_codes_list, _ = self.model.generate(
+            input_ids=input_ids,
+            instruct_ids=instruct_ids,
+            languages=languages,
+            speakers=speakers,
+            non_streaming_mode=non_streaming_mode,
+            **gen_kwargs,
+        )
+        wavs, fs = self.model.speech_tokenizer.decode([{"audio_codes": c} for c in talker_codes_list])
+        return wavs, fs
+    def get_supported_speakers(self) -> Optional[List[str]]:
+        """
+        List supported speaker names for the current model.
+        This is a convenience wrapper around `model.get_supported_speakers()`.
+        If the underlying model does not expose speaker constraints (returns None),
+        this method also returns None.
+        Returns:
+            Optional[List[str]]:
+                - A sorted list of supported speaker names (lowercased), if available.
+                - None if the model does not provide supported speakers.
+        """
+        supported = self._supported_speakers_set()
+        if supported is None:
+            return None
+        return sorted(supported)
+    def get_supported_languages(self) -> Optional[List[str]]:
+        """
+        List supported language names for the current model.
+        This is a convenience wrapper around `model.get_supported_languages()`.
+        If the underlying model does not expose language constraints (returns None),
+        this method also returns None.
+        Returns:
+            Optional[List[str]]:
+                - A sorted list of supported language names (lowercased), if available.
+                - None if the model does not provide supported languages.
+        """
+        supported = self._supported_languages_set()
+        if supported is None:
+            return None
+        return sorted(supported)

qwen_tts/inference/qwen3_tts_tokenizer.py ADDED Viewed

	@@ -0,0 +1,411 @@

+# coding=utf-8
+# Copyright 2026 The Alibaba Qwen team.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import base64
+import io
+import urllib.request
+from typing import List, Optional, Tuple, Union
+from urllib.parse import urlparse
+import librosa
+import numpy as np
+import soundfile as sf
+import torch
+from torch.nn.utils.rnn import pad_sequence
+from transformers import AutoConfig, AutoFeatureExtractor, AutoModel
+from ..core import (
+    Qwen3TTSTokenizerV1Config,
+    Qwen3TTSTokenizerV1Model,
+    Qwen3TTSTokenizerV2Config,
+    Qwen3TTSTokenizerV2Model,
+)
+AudioInput = Union[
+    str,  # wav path, or base64 string
+    np.ndarray,  # 1-D float array
+    List[str],
+    List[np.ndarray],
+]
+class Qwen3TTSTokenizer:
+    """
+    A wrapper for Qwen3 TTS Tokenizer 25Hz/12Hz with HuggingFace-style loading.
+    - from_pretrained(): loads speech tokenizer model via AutoModel and feature_extractor via AutoFeatureExtractor.
+    - encode(): supports wav path(s), base64 audio string(s), numpy array(s).
+    - decode(): accepts either the raw model encode output, or a minimal dict/list-of-dicts.
+    Notes:
+    - For numpy array input, you must pass `sr` so the audio can be resampled to model sample rate.
+    - Returned audio is float32 numpy arrays and the output sample rate.
+    """
+    def __init__(self):
+        self.model = None
+        self.feature_extractor = None
+        self.config = None
+        self.device = None
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs) -> "Qwen3TTSTokenizer":
+        """
+        Initialize tokenizer with HuggingFace `from_pretrained` style.
+        Args:
+            pretrained_model_name_or_path (str):
+                HuggingFace repo id or local directory.
+            **kwargs (Any):
+                Forwarded to `AutoModel.from_pretrained(...)` directly.
+                Typical examples: device_map="cuda:0", dtype=torch.bfloat16, attn_implementation="eager".
+        Returns:
+            Qwen3TTSTokenizer:
+                Initialized instance with `model`, `feature_extractor`, `config`.
+        """
+        inst = cls()
+        AutoConfig.register("qwen3_tts_tokenizer_25hz", Qwen3TTSTokenizerV1Config)
+        AutoModel.register(Qwen3TTSTokenizerV1Config, Qwen3TTSTokenizerV1Model)
+        AutoConfig.register("qwen3_tts_tokenizer_12hz", Qwen3TTSTokenizerV2Config)
+        AutoModel.register(Qwen3TTSTokenizerV2Config, Qwen3TTSTokenizerV2Model)
+        inst.feature_extractor = AutoFeatureExtractor.from_pretrained(pretrained_model_name_or_path)
+        inst.model = AutoModel.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        inst.config = inst.model.config
+        inst.device = getattr(inst.model, "device", None)
+        if inst.device is None:
+            # fallback: infer from first parameter device
+            try:
+                inst.device = next(inst.model.parameters()).device
+            except StopIteration:
+                inst.device = torch.device("cpu")
+        return inst
+    def _is_probably_base64(self, s: str) -> bool:
+        if s.startswith("data:audio"):
+            return True
+        # Heuristic: no filesystem path separators and long enough.
+        if ("/" not in s and "\\" not in s) and len(s) > 256:
+            return True
+        return False
+    def _is_url(self, s: str) -> bool:
+        try:
+            u = urlparse(s)
+            return u.scheme in ("http", "https") and bool(u.netloc)
+        except Exception:
+            return False
+    def _decode_base64_to_wav_bytes(self, b64: str) -> bytes:
+        # Accept both "data:audio/wav;base64,...." and raw base64
+        if "," in b64 and b64.strip().startswith("data:"):
+            b64 = b64.split(",", 1)[1]
+        return base64.b64decode(b64)
+    def load_audio(
+        self,
+        x: str,
+        target_sr: int,
+    ) -> np.ndarray:
+        """
+        Load audio from wav path or base64 string, then resample to target_sr.
+        Args:
+            x (str):
+                A wav file path, or a base64 audio string (raw or data URL).
+            target_sr (int):
+                Target sampling rate.
+        Returns:
+            np.ndarray:
+                1-D float32 waveform at target_sr.
+        """
+        if self._is_url(x):
+            with urllib.request.urlopen(x) as resp:
+                audio_bytes = resp.read()
+            with io.BytesIO(audio_bytes) as f:
+                audio, sr = sf.read(f, dtype="float32", always_2d=False)
+        elif self._is_probably_base64(x):
+            wav_bytes = self._decode_base64_to_wav_bytes(x)
+            with io.BytesIO(wav_bytes) as f:
+                audio, sr = sf.read(f, dtype="float32", always_2d=False)
+        else:
+            audio, sr = librosa.load(x, sr=None, mono=True)
+        if audio.ndim > 1:
+            audio = np.mean(audio, axis=-1)
+        if sr != target_sr:
+            audio = librosa.resample(y=audio, orig_sr=sr, target_sr=target_sr)
+        return audio.astype(np.float32)
+    def _normalize_audio_inputs(
+        self,
+        audios: AudioInput,
+        sr: Optional[int],
+    ) -> List[np.ndarray]:
+        """
+        Normalize all supported input types into a list of 1-D numpy float32 waveforms
+        at `self.feature_extractor.sampling_rate`.
+        Args:
+            audios (AudioInput):
+                - str: wav path OR base64 audio string
+                - np.ndarray: raw waveform (sr must be provided)
+                - list[str] / list[np.ndarray]
+            sr (Optional[int]):
+                Sampling rate for raw numpy input. Required if input is np.ndarray or list[np.ndarray].
+        Returns:
+            List[np.ndarray]:
+                List of float32 waveforms resampled to model input SR.
+        """
+        target_sr = int(self.feature_extractor.sampling_rate)
+        if isinstance(audios, (str, np.ndarray)):
+            audios = [audios]
+        if len(audios) == 0:
+            return []
+        if isinstance(audios[0], str):
+            # wav path list or base64 list
+            return [self.load_audio(x, target_sr=target_sr) for x in audios]  # type: ignore[arg-type]
+        # numpy list
+        if sr is None:
+            raise ValueError("For numpy waveform input, you must provide `sr` (original sampling rate).")
+        out: List[np.ndarray] = []
+        for a in audios:  # type: ignore[assignment]
+            if not isinstance(a, np.ndarray):
+                raise TypeError("Mixed input types are not supported. Use all paths/base64 or all numpy arrays.")
+            if a.ndim > 1:
+                a = np.mean(a, axis=-1)
+            if int(sr) != target_sr:
+                a = librosa.resample(y=a.astype(np.float32), orig_sr=int(sr), target_sr=target_sr)
+            out.append(a.astype(np.float32))
+        return out
+    def encode(
+        self,
+        audios: AudioInput,
+        sr: Optional[int] = None,
+        return_dict: bool = True,
+    ):
+        """
+        Batch-encode audio into discrete codes (and optional conditioning, depending on 25Hz/12Hz).
+        Args:
+            audios (AudioInput):
+                Supported forms:
+                - np.ndarray: waveform (requires sr)
+                - list[np.ndarray]: waveforms (requires sr)
+                - str: wav path OR base64 audio string
+                - list[str]: wav paths and/or base64 strings
+            sr (Optional[int], default=None):
+                Original sampling rate for numpy waveform input.
+            return_dict (bool, default=True):
+                Forwarded to model.encode(...). If True, returns ModelOutput.
+        Returns:
+            25Hz:
+                Qwen3TTSTokenizerV1EncoderOutput (if return_dict=True) with fields:
+                  - audio_codes: List[torch.LongTensor] each (codes_len,)
+                  - xvectors:   List[torch.FloatTensor] each (xvector_dim,)
+                  - ref_mels:   List[torch.FloatTensor] each (mel_len, mel_dim)
+            12Hz:
+                Qwen3TTSTokenizerV2EncoderOutput (if return_dict=True) with fields:
+                  - audio_codes: List[torch.LongTensor] each (codes_len, num_quantizers)
+            If return_dict=False, returns the raw tuple from model.encode.
+        """
+        wavs = self._normalize_audio_inputs(audios, sr=sr)
+        inputs = self.feature_extractor(
+            raw_audio=wavs,
+            sampling_rate=int(self.feature_extractor.sampling_rate),
+            return_tensors="pt",
+        )
+        inputs = inputs.to(self.device).to(self.model.dtype)
+        with torch.inference_mode():
+            # model.encode expects (B, T) and (B, T)
+            enc = self.model.encode(
+                inputs["input_values"].squeeze(1),
+                inputs["padding_mask"].squeeze(1),
+                return_dict=return_dict,
+            )
+        return enc
+    def decode(
+        self,
+        encoded,
+    ) -> Tuple[List[np.ndarray], int]:
+        """
+        Decode back to waveform.
+        Usage:
+        1) Pass the raw output of `encode(...)` directly (recommended).
+           - 25Hz: expects fields audio_codes, xvectors, ref_mels
+           - 12Hz: expects field audio_codes
+        2) Pass a dict or list[dict] (minimal form) for custom pipelines:
+           - 25Hz dict keys: {"audio_codes", "xvectors", "ref_mels"}
+           - 12Hz dict keys: {"audio_codes"}
+           Values can be torch tensors or numpy arrays.
+        Args:
+            encoded (Any):
+                - ModelOutput returned by `encode()`, OR
+                - dict, OR
+                - list[dict]
+        Returns:
+            Tuple[List[np.ndarray], int]:
+                - wavs: list of 1-D float32 numpy arrays
+                - sample_rate: int, model output sampling rate
+        """
+        model_type = self.model.get_model_type()
+        def _to_tensor(x, dtype=None):
+            if isinstance(x, torch.Tensor):
+                return x
+            x = np.asarray(x)
+            t = torch.from_numpy(x)
+            if dtype is not None:
+                t = t.to(dtype)
+            return t
+        # Normalize `encoded` into the same shapes as the official demo uses.
+        if hasattr(encoded, "audio_codes"):
+            # ModelOutput from encode()
+            audio_codes_list = encoded.audio_codes
+            xvectors_list = getattr(encoded, "xvectors", None)
+            ref_mels_list = getattr(encoded, "ref_mels", None)
+        elif isinstance(encoded, dict):
+            audio_codes_list = encoded["audio_codes"]
+            xvectors_list = encoded.get("xvectors", None)
+            ref_mels_list = encoded.get("ref_mels", None)
+        elif isinstance(encoded, list):
+            # list of dicts
+            audio_codes_list = [e["audio_codes"] for e in encoded]
+            xvectors_list = [e["xvectors"] for e in encoded] if ("xvectors" in encoded[0]) else None
+            ref_mels_list = [e["ref_mels"] for e in encoded] if ("ref_mels" in encoded[0]) else None
+        else:
+            raise TypeError("`encoded` must be an encode output, a dict, or a list of dicts.")
+        # Ensure list form for per-sample tensors
+        if isinstance(audio_codes_list, torch.Tensor):
+            # Could be a single sample tensor or an already padded batch tensor.
+            t = audio_codes_list
+            if t.dim() == 1:
+                # 25Hz single sample: (C,) -> (1, C)
+                t = t.unsqueeze(0)
+            elif t.dim() == 2:
+                # 12Hz single sample: (C, Q) -> (1, C, Q)
+                t = t.unsqueeze(0)
+            audio_codes_padded = t.to(self.device)
+        else:
+            # List[Tensor/np]
+            audio_codes_list = [_to_tensor(c, dtype=torch.long) for c in audio_codes_list]
+            audio_codes_padded = pad_sequence(audio_codes_list, batch_first=True, padding_value=0).to(self.device)
+        with torch.inference_mode():
+            if model_type == "qwen3_tts_tokenizer_25hz":
+                if xvectors_list is None or ref_mels_list is None:
+                    raise ValueError("25Hz decode requires `xvectors` and `ref_mels`.")
+                if isinstance(xvectors_list, torch.Tensor):
+                    xvectors_batch = xvectors_list
+                    if xvectors_batch.dim() == 1:  # (D,) -> (1, D)
+                        xvectors_batch = xvectors_batch.unsqueeze(0)
+                    xvectors_batch = xvectors_batch.to(self.device).to(self.model.dtype)
+                else:
+                    xvectors_list = [_to_tensor(x, dtype=torch.float32) for x in xvectors_list]
+                    xvectors_batch = torch.stack(xvectors_list, dim=0).to(self.device).to(self.model.dtype)
+                if isinstance(ref_mels_list, torch.Tensor):
+                    ref_mels_padded = ref_mels_list
+                    if ref_mels_padded.dim() == 2:  # (T, M) -> (1, T, M)
+                        ref_mels_padded = ref_mels_padded.unsqueeze(0)
+                    ref_mels_padded = ref_mels_padded.to(self.device).to(self.model.dtype)
+                else:
+                    ref_mels_list = [_to_tensor(m, dtype=torch.float32) for m in ref_mels_list]
+                    ref_mels_padded = pad_sequence(ref_mels_list, batch_first=True, padding_value=0).to(self.device).to(self.model.dtype)
+                dec = self.model.decode(audio_codes_padded, xvectors_batch, ref_mels_padded, return_dict=True)
+                wav_tensors = dec.audio_values
+            elif model_type == "qwen3_tts_tokenizer_12hz":
+                dec = self.model.decode(audio_codes_padded, return_dict=True)
+                wav_tensors = dec.audio_values
+            else:
+                raise ValueError(f"Unknown model type: {model_type}")
+        wavs = [w.to(torch.float32).detach().cpu().numpy() for w in wav_tensors]
+        return wavs, int(self.model.get_output_sample_rate())
+    def get_model_type(self) -> str:
+        """
+        Get the underlying tokenizer model type.
+        Returns:
+            str: Model type string from `self.model.config.model_type`
+                (e.g. "qwen3_tts_tokenizer_25hz" / "qwen3_tts_tokenizer_12hz").
+        """
+        return self.model.get_model_type()
+    def get_input_sample_rate(self) -> int:
+        """
+        Get the expected input sample rate for encoding.
+        Returns:
+            int: Input sample rate (Hz).
+        """
+        return int(self.model.get_input_sample_rate())
+    def get_output_sample_rate(self) -> int:
+        """
+        Get the output sample rate for decoded waveforms.
+        Returns:
+            int: Output sample rate (Hz).
+        """
+        return int(self.model.get_output_sample_rate())
+    def get_encode_downsample_rate(self) -> int:
+        """
+        Get the encoder downsample rate (waveform samples per code step).
+        Returns:
+            int: Encode downsample rate.
+        """
+        return int(self.model.get_encode_downsample_rate())
+    def get_decode_upsample_rate(self) -> int:
+        """
+        Get the decoder upsample rate (waveform samples per code step).
+        Returns:
+            int: Decode upsample rate.
+        """
+        return int(self.model.get_decode_upsample_rate())

requirements.txt ADDED Viewed

	@@ -0,0 +1,15 @@

+python-dotenv
+torch==2.8.0
+torchaudio==2.8.0
+transformers==4.57.3
+accelerate==1.12.0
+einops
+gradio
+librosa
+soundfile
+sox
+onnxruntime
+spaces
+numpy
+kernels
+openai-whisper