fdugyt committed 11 days ago

Commit

2b6d647

verified ·

1 Parent(s): d594ff3

Add files using upload-large-folder tool

Browse files

Files changed (18) hide show

.gitattributes +1 -0
README.md +278 -0
__init__.py +9 -0
added_tokens.json +31 -0
chat_template.jinja +89 -0
config.json +381 -0
configuration_moss_tts.py +158 -0
gpt2_decoder.py +721 -0
merges.txt +0 -0
model.safetensors +3 -0
modeling_moss_tts.py +623 -0
processing_moss_tts.py +899 -0
processor_config.json +7 -0
qwen3_decoder.py +582 -0
special_tokens_map.json +21 -0
tokenizer.json +3 -0
tokenizer_config.json +253 -0
vocab.json +0 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -1,3 +1,281 @@
 ---
 license: apache-2.0
 ---

 ---
 license: apache-2.0
+library_name: transformers
+pipeline_tag: text-to-speech
+tags:
+- text-to-speech
+- voice-cloning
+- custom_code
+- moss-tts
+- moss-tts-local
+- arxiv:2603.18090
+language:
+- zh
+- yue
+- en
+- ar
+- cs
+- da
+- de
+- nl
+- es
+- fr
+- fi
+- el
+- he
+- hi
+- hu
+- ja
+- it
+- ko
+- mk
+- ms
+- ru
+- fa
+- pl
+- pt
+- sv
+- ro
+- sw
+- tl
+- th
+- tr
+- vi
 ---
+# MOSS-TTS Family
+<br>
+<p align="center">
+  &nbsp;&nbsp;&nbsp;&nbsp;
+  <img src="https://speech-demo.oss-cn-shanghai.aliyuncs.com/moss_tts_demo/tts_readme_imgaes_demo/openmoss_x_mosi" height="50" align="middle" />
+</p>
+<div align="center">
+  <a href="https://github.com/OpenMOSS/MOSS-TTS/tree/main"><img src="https://img.shields.io/badge/Project%20Page-GitHub-blue"></a>
+  <a href="https://huggingface.co/OpenMOSS-Team/MOSS-TTS-Local-Transformer-v1.5"><img src="https://img.shields.io/badge/HuggingFace-Model-yellow?logo=huggingface"></a>
+  <a href="https://modelscope.cn/collections/OpenMOSS-Team/MOSS-TTS"><img src="https://img.shields.io/badge/ModelScope-Models-lightgrey?logo=modelscope"></a>
+  <a href="https://mosi.cn/#models"><img src="https://img.shields.io/badge/Blog-View-blue?logo=internet-explorer"></a>
+  <a href="https://arxiv.org/abs/2603.18090"><img src="https://img.shields.io/badge/Arxiv-2603.18090-red?logo=Arxiv"></a>
+  <a href="https://studio.mosi.cn"><img src="https://img.shields.io/badge/AIStudio-Try-green?logo=internet-explorer"></a>
+  <a href="https://studio.mosi.cn/docs/moss-tts"><img src="https://img.shields.io/badge/API-Docs-00A3FF?logo=fastapi"></a>
+  <a href="https://x.com/Open_MOSS"><img src="https://img.shields.io/badge/Twitter-Follow-black?logo=x"></a>
+  <a href="https://discord.gg/fvm5TaWjU3"><img src="https://img.shields.io/badge/Discord-Join-5865F2?logo=discord"></a>
+</div>
+# MOSS-TTS-Local-Transformer-v1.5
+**MOSS-TTS-Local-Transformer-v1.5** is continued from [MOSS-TTS-Local-Transformer-v1.0](https://huggingface.co/OpenMOSS-Team/MOSS-TTS-Local-Transformer). It preserves the main 1.0 capabilities, including zero-shot voice cloning, long-form speech generation, token-level duration control, Pinyin/IPA pronunciation control, multilingual synthesis, and code-switching. For the full 1.0 feature walkthrough, input schema, and evaluation tables, please refer to the [MOSS-TTS-Local-Transformer-v1.0 README](https://huggingface.co/OpenMOSS-Team/MOSS-TTS-Local-Transformer).
+Compared with [MOSS-TTS-Local-Transformer-v1.0](https://huggingface.co/OpenMOSS-Team/MOSS-TTS-Local-Transformer), v1.5 focuses on the following improvements:
+- **Higher-fidelity stereo audio modeling**: v1.5 uses [MOSS-Audio-Tokenizer-v2](https://huggingface.co/OpenMOSS-Team/MOSS-Audio-Tokenizer-v2) as the audio tokenizer, supporting native 48 kHz stereo input and output for richer spatial detail and more natural perceived audio quality. Since the codec output is stereo, save the `[channels, samples]` tensor returned by `processor.decode(...)` directly.
+- **Stronger multilingual synthesis with language tags**: when the `language` field is omitted, v1.5 may improve some languages and regress slightly on others compared with 1.0. When the language is specified, v1.5 is stronger than 1.0 on almost all supported languages. Set the tag when building the user message, for example `processor.build_user_message(text=text_fr, language="French")`.
+- **More stable voice cloning**: v1.5 improves speaker similarity and reduces cloning variance, making repeated generations more consistent.
+- **Better long-reference, short-text cloning**: v1.5 handles scenarios where the reference audio is much longer than the target text more reliably than 1.0.
+- **More stable punctuation-following prosody**: v1.5 follows punctuation-driven pauses more closely, especially in long sentences.
+- **Explicit pause control**: v1.5 supports inline pause markers such as `"[pause 3.2s]"`. For example, `我今天学习了一首中国的古诗，它的名字是[pause 3.2s]静夜思！` inserts an explicit 3.2s pause before `静夜思`.
+## Supported Languages
+MOSS-TTS Local Transformer v1.5 supports **31 languages**. It keeps the 20 languages supported by [MOSS-TTS-Local-Transformer-v1.0](https://huggingface.co/OpenMOSS-Team/MOSS-TTS-Local-Transformer) and extends multilingual continued training to additional languages including Cantonese, Dutch, Finnish, Hindi, Macedonian, Malay, Romanian, Swahili, Tagalog, Thai, and Vietnamese.
+| Language | Code | Flag | Language | Code | Flag | Language | Code | Flag |
+|---|---|---|---|---|---|---|---|---|
+| Chinese | zh | 🇨🇳 | Cantonese | yue | 🇭🇰 | English | en | 🇺🇸 |
+| Arabic | ar | 🇸🇦 | Czech | cs | 🇨🇿 | Danish | da | 🇩🇰 |
+| Dutch | nl | 🇳🇱 | Finnish | fi | 🇫🇮 | French | fr | 🇫🇷 |
+| German | de | 🇩🇪 | Greek | el | 🇬🇷 | Hebrew | he | 🇮🇱 |
+| Hindi | hi | 🇮🇳 | Hungarian | hu | 🇭🇺 | Italian | it | 🇮🇹 |
+| Japanese | ja | 🇯🇵 | Korean | ko | 🇰🇷 | Macedonian | mk | 🇲🇰 |
+| Malay | ms | 🇲🇾 | Persian (Farsi) | fa | 🇮🇷 | Polish | pl | 🇵🇱 |
+| Portuguese | pt | 🇵🇹 | Romanian | ro | 🇷🇴 | Russian | ru | 🇷🇺 |
+| Spanish | es | 🇪🇸 | Swahili | sw | 🇹🇿 | Swedish | sv | 🇸🇪 |
+| Tagalog | tl | 🇵🇭 | Thai | th | 🇹🇭 | Turkish | tr | 🇹🇷 |
+| Vietnamese | vi | 🇻🇳 | | | | | | |
+## Quick Start
+### Environment Setup
+We recommend a clean, isolated Python environment with **Transformers 5.0.0**, or a recent Transformers version with Qwen3 support, to avoid dependency conflicts.
+```bash
+conda create -n moss-tts python=3.12 -y
+conda activate moss-tts
+```
+Install all required dependencies:
+```bash
+git clone https://github.com/OpenMOSS/MOSS-TTS.git
+cd MOSS-TTS
+pip install --extra-index-url https://download.pytorch.org/whl/cu128 -e .
+```
+#### (Optional) Install FlashAttention 2
+For better speed and lower GPU memory usage, you can install FlashAttention 2 if your hardware supports it.
+```bash
+pip install --extra-index-url https://download.pytorch.org/whl/cu128 -e ".[flash-attn]"
+```
+If your machine has limited RAM and many CPU cores, you can cap build parallelism:
+```bash
+MAX_JOBS=4 pip install --extra-index-url https://download.pytorch.org/whl/cu128 -e ".[flash-attn]"
+```
+Notes:
+- Dependencies are managed in `pyproject.toml`, which currently pins `torch==2.9.1+cu128` and `torchaudio==2.9.1+cu128`.
+- If FlashAttention 2 fails to build on your machine, you can skip it and use the default attention backend.
+- FlashAttention 2 is only available on supported GPUs and is typically used with `torch.float16` or `torch.bfloat16`.
+### Basic Usage
+> Tip: MOSS-TTS-Local-Transformer-v1.5 uses a fixed 12-codebook RVQ depth. Do not set `n_vq_for_inference` to a value different from `config.n_vq`.
+MOSS-TTS-Local-Transformer-v1.5 provides the standard Hugging Face `AutoProcessor` and `AutoModel` interface. The examples below cover:
+1. Direct generation with language tags
+2. Voice cloning
+3. Duration control
+4. Explicit pause control with `[pause X.Ys]`
+```python
+from pathlib import Path
+from tqdm import tqdm
+import importlib.util
+import torch
+import torchaudio
+from transformers import AutoModel, AutoProcessor
+# Disable the cuDNN SDPA backend, which is broken on some CUDA/PyTorch combinations.
+torch.backends.cuda.enable_cudnn_sdp(False)
+# Keep these enabled as fallbacks.
+torch.backends.cuda.enable_flash_sdp(True)
+torch.backends.cuda.enable_mem_efficient_sdp(True)
+torch.backends.cuda.enable_math_sdp(True)
+pretrained_model_name_or_path = "OpenMOSS-Team/MOSS-TTS-Local-Transformer-v1.5"
+device = "cuda" if torch.cuda.is_available() else "cpu"
+dtype = torch.bfloat16 if device == "cuda" else torch.float32
+def resolve_attn_implementation() -> str:
+    """Pick the attention backend name passed to ``AutoModel.from_pretrained``.
+
+    Reads the module-level ``device`` and ``dtype`` defined earlier in this
+    script; it takes no arguments.
+
+    Returns:
+        ``"flash_attention_2"`` when running on CUDA, the ``flash_attn``
+        package can be located, ``dtype`` is float16 or bfloat16, and the
+        reported CUDA compute-capability major version is at least 8;
+        ``"sdpa"`` for any other CUDA setup; ``"eager"`` otherwise (CPU).
+    """
+    # Prefer FlashAttention 2 when package + device conditions are met.
+    if (
+        device == "cuda"
+        # find_spec only checks that the package can be located; it does not import it.
+        and importlib.util.find_spec("flash_attn") is not None
+        and dtype in {torch.float16, torch.bfloat16}
+    ):
+        major, _ = torch.cuda.get_device_capability()
+        if major >= 8:
+            return "flash_attention_2"
+    # CUDA fallback: use PyTorch SDPA kernels.
+    if device == "cuda":
+        return "sdpa"
+    # CPU fallback.
+    return "eager"
+attn_implementation = resolve_attn_implementation()
+print(f"[INFO] Using attn_implementation={attn_implementation}")
+processor = AutoProcessor.from_pretrained(
+    pretrained_model_name_or_path,
+    trust_remote_code=True,
+)
+processor.audio_tokenizer = processor.audio_tokenizer.to(device)
+text_zh = "亲爱的你，愿你的每一天都值得被记住，也值得被珍惜。"
+text_en = "We stand on the threshold of the AI era, where intelligence becomes an extension of human creativity."
+text_fr = "Bonjour, je voudrais essayer une voix francaise naturelle et stable."
+text_pause = "我今天学习了一首中国的古诗，它的名字是[pause 3.2s]静夜思！"
+# Use remote demo audio to avoid requiring local assets.
+ref_audio_zh = "https://speech-demo.oss-cn-shanghai.aliyuncs.com/moss_tts_demo/tts_readme_demo/reference_zh.wav"
+ref_audio_en = "https://speech-demo.oss-cn-shanghai.aliyuncs.com/moss_tts_demo/tts_readme_demo/reference_en.m4a"
+conversations = [
+    # Direct TTS. Language tags are recommended in v1.5 when the language is known.
+    [processor.build_user_message(text=text_zh, language="Chinese")],
+    [processor.build_user_message(text=text_en, language="English")],
+    [processor.build_user_message(text=text_fr, language="French")],
+    # Explicit pause control. Use [pause X.Ys], such as [pause 3.2s].
+    [processor.build_user_message(text=text_pause, language="Chinese")],
+    # Voice cloning with a reference audio.
+    [processor.build_user_message(text=text_zh, reference=[ref_audio_zh], language="Chinese")],
+    [processor.build_user_message(text=text_en, reference=[ref_audio_en], language="English")],
+    # Duration control. At 12.5 frames per second, 125 frames is about 10 seconds.
+    [processor.build_user_message(text=text_en, tokens=125, language="English")],
+]
+model = AutoModel.from_pretrained(
+    pretrained_model_name_or_path,
+    trust_remote_code=True,
+    attn_implementation=attn_implementation,
+    torch_dtype=dtype,
+).to(device)
+model.eval()
+batch_size = 1
+save_dir = Path("inference_root_moss_tts_local_v1_5")
+save_dir.mkdir(exist_ok=True, parents=True)
+sample_idx = 0
+with torch.no_grad():
+    for start in tqdm(range(0, len(conversations), batch_size)):
+        batch_conversations = conversations[start : start + batch_size]
+        batch = processor(batch_conversations, mode="generation")
+        input_ids = batch["input_ids"].to(device)
+        attention_mask = batch["attention_mask"].to(device)
+        outputs = model.generate(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            max_new_tokens=4096,
+            do_sample=True,
+            audio_temperature=1.7,
+            audio_top_p=0.8,
+            audio_top_k=25,
+            audio_repetition_penalty=1.0,
+        )
+        for message in processor.decode(outputs):
+            if message is None:
+                continue
+            audio = message.audio_codes_list[0]
+            out_path = save_dir / f"sample{sample_idx}.wav"
+            sample_idx += 1
+            # MOSS-TTS Local v1.5 codec returns stereo audio as [channels, samples].
+            # Save the two-channel tensor directly.
+            torchaudio.save(str(out_path), audio, processor.model_config.sampling_rate)
+```
+## Generation Parameters
+| Parameter | Recommended | Description |
+|---|---:|---|
+| `audio_temperature` | `1.7` | Sampling temperature for audio RVQ layers. |
+| `audio_top_p` | `0.8` | Nucleus sampling cutoff for audio RVQ layers. |
+| `audio_top_k` | `25` | Top-k sampling cutoff for audio RVQ layers. |
+| `audio_repetition_penalty` | `1.0` | Penalty for repeated acoustic token patterns. |
+| `n_vq_for_inference` | `12` | Fixed by this release. Values other than `config.n_vq` are rejected. |
+## Notes
+- This repository uses Hugging Face remote code. Load it with `trust_remote_code=True`.
+- The MOSS-TTS-Local-Transformer-v1.5 codec is stereo. `processor.decode(...)` returns audio tensors shaped as `[channels, samples]`, so save them directly with `torchaudio.save(path, audio, sampling_rate)`.
+- Audio encoding and decoding use `OpenMOSS-Team/MOSS-Audio-Tokenizer-v2`.
+- The model configuration sets `sampling_rate` to 48000 and `n_vq` to 12.
+- If FlashAttention 2 is unavailable, the example falls back to SDPA on CUDA and eager attention on CPU.
+## More Usage
+MOSS-TTS-Local-Transformer-v1.5 is API-compatible with MOSS-TTS-Local-Transformer-v1.0. For continuation with prefix audio, detailed `UserMessage` and `AssistantMessage` fields, generation hyperparameters, Pinyin/IPA preprocessing examples, and evaluation results, see the [MOSS-TTS-Local-Transformer-v1.0 README](https://huggingface.co/OpenMOSS-Team/MOSS-TTS-Local-Transformer).
+## Citation
+If you use this model, please cite the [MOSS-TTS Technical Report](https://arxiv.org/abs/2603.18090).

__init__.py ADDED Viewed

	@@ -0,0 +1,9 @@

+from .configuration_moss_tts import MossTTSLocalConfig
+from .modeling_moss_tts import MossTTSLocalModel
+from .processing_moss_tts import MossTTSLocalProcessor
+__all__ = [
+    "MossTTSLocalConfig",
+    "MossTTSLocalModel",
+    "MossTTSLocalProcessor",
+]

added_tokens.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "</think>": 151668,
+  "</tool_call>": 151658,
+  "</tool_response>": 151666,
+  "<think>": 151667,
+  "<tool_call>": 151657,
+  "<tool_response>": 151665,
+  "<|audio_end|>": 151670,
+  "<|audio_pad|>": 151671,
+  "<|audio_start|>": 151669,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

chat_template.jinja ADDED Viewed

	@@ -0,0 +1,89 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if message.content is string %}
+        {%- set content = message.content %}
+    {%- else %}
+        {%- set content = '' %}
+    {%- endif %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is string %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in content %}
+                {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+                {%- set content = content.split('</think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}

config.json ADDED Viewed

	@@ -0,0 +1,381 @@

+{
+  "model_type": "moss_tts_local",
+  "architectures": [
+    "MossTTSLocalModel"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_moss_tts.MossTTSLocalConfig",
+    "AutoModel": "modeling_moss_tts.MossTTSLocalModel",
+    "AutoProcessor": "processing_moss_tts.MossTTSLocalProcessor"
+  },
+  "processor_class": "MossTTSLocalProcessor",
+  "qwen3_config": {
+    "_name_or_path": "",
+    "add_cross_attention": false,
+    "architectures": [
+      "Qwen3ForCausalLM"
+    ],
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "bad_words_ids": null,
+    "begin_suppress_tokens": null,
+    "bos_token_id": 151643,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "dtype": "bfloat16",
+    "early_stopping": false,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": 151643,
+    "exponential_decay_length_penalty": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "gradient_checkpointing_use_reentrant": false,
+    "head_dim": 128,
+    "hidden_act": "silu",
+    "hidden_size": 2560,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "initializer_range": 0.02,
+    "intermediate_size": 9728,
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "layer_types": [
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention"
+    ],
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "max_position_embeddings": 32768,
+    "max_window_layers": 36,
+    "min_length": 0,
+    "model_type": "qwen3",
+    "no_repeat_ngram_size": 0,
+    "num_attention_heads": 32,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_hidden_layers": 36,
+    "num_key_value_heads": 8,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": 151643,
+    "prefix": null,
+    "problem_type": null,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "rms_norm_eps": 1e-06,
+    "rope_scaling": null,
+    "rope_theta": 1000000,
+    "sep_token_id": null,
+    "sliding_window": null,
+    "suppress_tokens": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torchscript": false,
+    "transformers_version": "4.57.1",
+    "typical_p": 1.0,
+    "use_bfloat16": false,
+    "use_cache": false,
+    "use_sliding_window": false,
+    "vocab_size": 151936
+  },
+  "language_config": {
+    "_name_or_path": "",
+    "add_cross_attention": false,
+    "architectures": [
+      "Qwen3ForCausalLM"
+    ],
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "bad_words_ids": null,
+    "begin_suppress_tokens": null,
+    "bos_token_id": 151643,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "dtype": "bfloat16",
+    "early_stopping": false,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": 151643,
+    "exponential_decay_length_penalty": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "gradient_checkpointing_use_reentrant": false,
+    "head_dim": 128,
+    "hidden_act": "silu",
+    "hidden_size": 2560,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "initializer_range": 0.02,
+    "intermediate_size": 9728,
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "layer_types": [
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention"
+    ],
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "max_position_embeddings": 32768,
+    "max_window_layers": 36,
+    "min_length": 0,
+    "model_type": "qwen3",
+    "no_repeat_ngram_size": 0,
+    "num_attention_heads": 32,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_hidden_layers": 36,
+    "num_key_value_heads": 8,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": 151643,
+    "prefix": null,
+    "problem_type": null,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "rms_norm_eps": 1e-06,
+    "rope_scaling": null,
+    "rope_theta": 1000000,
+    "sep_token_id": null,
+    "sliding_window": null,
+    "suppress_tokens": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torchscript": false,
+    "transformers_version": "4.57.1",
+    "typical_p": 1.0,
+    "use_bfloat16": false,
+    "use_cache": false,
+    "use_sliding_window": false,
+    "vocab_size": 151936
+  },
+  "gpt2_config": {
+    "_name_or_path": "",
+    "activation_function": "silu",
+    "add_cross_attention": false,
+    "architectures": null,
+    "attn_pdrop": 0.0,
+    "bad_words_ids": null,
+    "begin_suppress_tokens": null,
+    "bos_token_id": null,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "dtype": null,
+    "early_stopping": false,
+    "embd_pdrop": 0.0,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": 151645,
+    "exponential_decay_length_penalty": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "initializer_range": 0.02,
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "layer_norm_epsilon": 1e-06,
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "min_length": 0,
+    "model_type": "gpt2",
+    "n_ctx": 10240,
+    "n_embd": 2560,
+    "n_head": 32,
+    "n_inner": 9728,
+    "n_layer": 1,
+    "n_positions": 10240,
+    "no_repeat_ngram_size": 0,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": null,
+    "position_embedding_type": "rope",
+    "prefix": null,
+    "problem_type": null,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "reorder_and_upcast_attn": false,
+    "repetition_penalty": 1.0,
+    "resid_pdrop": 0.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "rope_base": 1000000.0,
+    "scale_attn_by_inverse_layer_idx": false,
+    "scale_attn_weights": true,
+    "sep_token_id": null,
+    "summary_activation": null,
+    "summary_first_dropout": 0.1,
+    "summary_proj_to_labels": true,
+    "summary_type": "cls_index",
+    "summary_use_proj": true,
+    "suppress_tokens": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torchscript": false,
+    "transformers_version": "4.57.1",
+    "typical_p": 1.0,
+    "use_bfloat16": false,
+    "use_cache": true,
+    "vocab_size": 151936
+  },
+  "n_vq": 12,
+  "audio_vocab_size": 1024,
+  "audio_codebook_sizes": [
+    1024,
+    1024,
+    1024,
+    1024,
+    1024,
+    1024,
+    1024,
+    1024,
+    1024,
+    1024,
+    1024,
+    1024
+  ],
+  "audio_pad_token_id": 1024,
+  "audio_pad_code": 1024,
+  "pad_token_id": 151643,
+  "im_start_token_id": 151644,
+  "im_end_token_id": 151645,
+  "audio_start_token_id": 151669,
+  "audio_end_token_id": 151670,
+  "audio_user_slot_token_id": 151654,
+  "audio_assistant_slot_token_id": 151656,
+  "audio_assistant_gen_slot_token_id": 151656,
+  "sampling_rate": 48000,
+  "audio_tokenizer_name_or_path": "OpenMOSS-Team/MOSS-Audio-Tokenizer-v2",
+  "attn_implementation": "flash_attention_2",
+  "local_transformer_layers": 1,
+  "local_text_head_mode": "binary",
+  "use_static_local_kv_cache": true,
+  "initializer_range": 0.02
+}

configuration_moss_tts.py ADDED Viewed

	@@ -0,0 +1,158 @@

+# coding=utf-8
+"""Configuration for the MOSS-TTS-Local-Transformer-v1.5 release."""
+from __future__ import annotations
+from typing import Any, Dict, Optional, Union
+from transformers.configuration_utils import PretrainedConfig
+from transformers.models.gpt2.configuration_gpt2 import GPT2Config
+from transformers.models.qwen3.configuration_qwen3 import Qwen3Config
+SUPPORTED_ATTENTION_IMPLEMENTATIONS = {"flash_attention_2", "sdpa", "eager"}
+def _normalize_attention_implementation(value: Optional[str], default: str = "flash_attention_2") -> str:
+    """Return the canonical attention-implementation name for ``value``.
+
+    Args:
+        value: Requested implementation name. Any falsy value (``None`` or
+            an empty string) selects ``default`` instead.
+        default: Name used when ``value`` is falsy. It goes through the same
+            normalization and validation as an explicit value.
+
+    Returns:
+        One of the names in ``SUPPORTED_ATTENTION_IMPLEMENTATIONS``.
+
+    Raises:
+        ValueError: If the normalized name is not supported. The message
+            reports the original ``value``, not the normalized string.
+    """
+    # Coerce to str, then ignore surrounding whitespace and letter case.
+    # NOTE: a whitespace-only string is truthy, so it does not fall back to
+    # ``default``; it normalizes to "" and is rejected below.
+    normalized = str(value or default).strip().lower()
+    # Accept common shorthand spellings for FlashAttention 2.
+    if normalized in {"flash", "flash_attn", "flash-attn", "flash_attention"}:
+        normalized = "flash_attention_2"
+    if normalized not in SUPPORTED_ATTENTION_IMPLEMENTATIONS:
+        raise ValueError(
+            "attn_implementation must be one of "
+            f"{sorted(SUPPORTED_ATTENTION_IMPLEMENTATIONS)}, got {value!r}."
+        )
+    return normalized
+class MossTTSLocalConfig(PretrainedConfig):
+    model_type = "moss_tts_local"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    def __init__(
+        self,
+        qwen3_config: Optional[Union[Qwen3Config, Dict[str, Any]]] = None,
+        gpt2_config: Optional[Union[GPT2Config, Dict[str, Any]]] = None,
+        language_config: Optional[Union[Qwen3Config, Dict[str, Any]]] = None,
+        n_vq: int = 12,
+        audio_vocab_size: int = 1024,
+        audio_codebook_sizes: Optional[list[int]] = None,
+        audio_pad_token_id: int = 1024,
+        audio_pad_code: Optional[int] = None,
+        pad_token_id: int = 151643,
+        im_start_token_id: int = 151644,
+        im_end_token_id: int = 151645,
+        audio_start_token_id: int = 151669,
+        audio_end_token_id: int = 151670,
+        audio_user_slot_token_id: int = 151654,
+        audio_assistant_slot_token_id: int = 151656,
+        audio_assistant_gen_slot_token_id: Optional[int] = None,
+        sampling_rate: int = 48000,
+        audio_tokenizer_name_or_path: Optional[str] = None,
+        attn_implementation: str = "flash_attention_2",
+        local_transformer_attn_implementation: Optional[str] = None,
+        local_text_head_mode: str = "binary",
+        initializer_range: float = 0.02,
+        **kwargs: Any,
+    ) -> None:
+        if qwen3_config is None and language_config is not None:
+            qwen3_config = language_config
+        if isinstance(qwen3_config, dict):
+            self.qwen3_config = Qwen3Config(**qwen3_config)
+        elif qwen3_config is None:
+            self.qwen3_config = Qwen3Config()
+        else:
+            self.qwen3_config = qwen3_config
+        if isinstance(gpt2_config, dict):
+            self.gpt2_config = GPT2Config(**gpt2_config)
+        elif gpt2_config is None:
+            self.gpt2_config = GPT2Config(
+                vocab_size=int(self.qwen3_config.vocab_size),
+                n_embd=int(self.qwen3_config.hidden_size),
+                n_layer=1,
+                n_head=max(1, int(self.qwen3_config.hidden_size) // 80),
+                n_positions=int(n_vq) + 1,
+                n_ctx=int(n_vq) + 1,
+                activation_function="silu",
+                layer_norm_epsilon=1e-6,
+                resid_pdrop=0.0,
+                embd_pdrop=0.0,
+                attn_pdrop=0.0,
+            )
+        else:
+            self.gpt2_config = gpt2_config
+        self.n_vq = int(n_vq)
+        if self.n_vq <= 0:
+            raise ValueError("n_vq must be positive.")
+        if audio_codebook_sizes is None:
+            self.audio_codebook_sizes = [int(audio_vocab_size)] * self.n_vq
+        else:
+            self.audio_codebook_sizes = [int(size) for size in audio_codebook_sizes]
+        if len(self.audio_codebook_sizes) != self.n_vq:
+            raise ValueError(
+                f"audio_codebook_sizes must have length n_vq={self.n_vq}, "
+                f"got {len(self.audio_codebook_sizes)}."
+            )
+        if any(size <= 0 for size in self.audio_codebook_sizes):
+            raise ValueError("audio_codebook_sizes must contain positive integers.")
+        self.audio_vocab_size = int(max(int(audio_vocab_size), max(self.audio_codebook_sizes)))
+        self.audio_pad_token_id = int(audio_pad_code if audio_pad_code is not None else audio_pad_token_id)
+        self.audio_pad_code = self.audio_pad_token_id
+        if self.audio_pad_token_id < self.audio_vocab_size:
+            raise ValueError("audio_pad_token_id/audio_pad_code must be outside the audio vocab.")
+        self.pad_token_id = int(pad_token_id)
+        self.im_start_token_id = int(im_start_token_id)
+        self.im_end_token_id = int(im_end_token_id)
+        self.audio_start_token_id = int(audio_start_token_id)
+        self.audio_end_token_id = int(audio_end_token_id)
+        self.audio_user_slot_token_id = int(audio_user_slot_token_id)
+        self.audio_assistant_slot_token_id = int(
+            audio_assistant_slot_token_id
+            if audio_assistant_gen_slot_token_id is None
+            else audio_assistant_gen_slot_token_id
+        )
+        self.audio_assistant_gen_slot_token_id = self.audio_assistant_slot_token_id
+        self.sampling_rate = int(sampling_rate)
+        self.audio_tokenizer_name_or_path = audio_tokenizer_name_or_path
+        self.attn_implementation = _normalize_attention_implementation(attn_implementation)
+        self.local_transformer_attn_implementation = _normalize_attention_implementation(
+            local_transformer_attn_implementation,
+            default=self.attn_implementation,
+        )
+        self.initializer_range = float(initializer_range)
+        self.hidden_size = int(self.qwen3_config.hidden_size)
+        self.vocab_size = int(self.qwen3_config.vocab_size)
+        self.local_hidden_size = int(self.gpt2_config.hidden_size)
+        if self.local_hidden_size != self.hidden_size:
+            raise ValueError(
+                "This MOSS-TTS-Local-Transformer-v1.5 release expects local hidden size to "
+                "match Qwen3 hidden size so audio embeddings and heads are tied."
+            )
+        normalized_text_head_mode = str(local_text_head_mode or "full_vocab").strip().lower()
+        if normalized_text_head_mode in {"full", "full-vocab", "vocab"}:
+            normalized_text_head_mode = "full_vocab"
+        if normalized_text_head_mode not in {"full_vocab", "binary"}:
+            raise ValueError("local_text_head_mode must be 'full_vocab' or 'binary'.")
+        self.local_text_head_mode = normalized_text_head_mode
+        kwargs.setdefault("tie_word_embeddings", True)
+        super().__init__(pad_token_id=self.pad_token_id, **kwargs)
+    @property
+    def language_config(self) -> Qwen3Config:
+        return self.qwen3_config
+    def to_dict(self) -> Dict[str, Any]:
+        output = super().to_dict()
+        output["qwen3_config"] = self.qwen3_config.to_dict()
+        output["language_config"] = self.qwen3_config.to_dict()
+        output["gpt2_config"] = self.gpt2_config.to_dict()
+        output["audio_pad_code"] = self.audio_pad_token_id
+        output["audio_assistant_gen_slot_token_id"] = self.audio_assistant_slot_token_id
+        return output

gpt2_decoder.py ADDED Viewed

	@@ -0,0 +1,721 @@

+# coding=utf-8
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Optional
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint
+from transformers.activations import ACT2FN
+from transformers.modeling_outputs import BaseModelOutputWithPast
+from transformers.models.gpt2.configuration_gpt2 import GPT2Config
+# flash-attn is optional: probe for it once at import time and record the outcome in
+# ``_FLASH_ATTN_AVAILABLE`` so that requesting the flash backend without it can be
+# reported with a clear error instead of failing on a ``None`` call.
+try:
+    from flash_attn import flash_attn_func, flash_attn_varlen_func
+    from flash_attn.bert_padding import pad_input, unpad_input
+    _FLASH_ATTN_AVAILABLE = True
+except Exception:  # deliberately broad: any failure while importing flash-attn disables the backend
+    # Keep the names bound so module-level references stay valid.
+    flash_attn_func = None
+    flash_attn_varlen_func = None
+    pad_input = None
+    unpad_input = None
+    _FLASH_ATTN_AVAILABLE = False
+@dataclass
+class PackedSequenceMetadata:
+    """Arguments for the variable-length flash-attention call over packed sequences."""
+    # Cumulative segment boundaries, passed to ``flash_attn_varlen_func`` for both queries and keys.
+    cu_seqlens: torch.Tensor
+    # Longest segment length, passed as both the max query and max key length.
+    max_seqlen: int
+    # Flat token indices used to gather tokens before attention and to re-pad the output;
+    # when ``None`` the inputs are used as given and the kernel output is returned as-is.
+    indices: Optional[torch.Tensor] = None
+    # Batch size handed to ``pad_input`` when ``indices`` is set.
+    batch_size: Optional[int] = None
+    # Padded sequence length handed to ``pad_input`` when ``indices`` is set.
+    seq_len: Optional[int] = None
+def _is_static_kv_cache_layer(layer_past: object) -> bool:
+    return isinstance(layer_past, dict) and bool(layer_past.get("static_kv_cache", False))
+class MossTTSNanoGPT2RotaryEmbedding(nn.Module):
+    def __init__(self, dim: int, base: float = 10000.0) -> None:
+        super().__init__()
+        if dim % 2 != 0:
+            raise ValueError(f"RoPE head_dim must be even, got {dim}")
+        self.dim = int(dim)
+        self.base = float(base)
+        self.register_buffer("inv_freq", self._compute_inv_freq(), persistent=False)
+    def _compute_inv_freq(self, device: Optional[torch.device] = None) -> torch.Tensor:
+        return 1.0 / (
+            self.base ** (torch.arange(0, self.dim, 2, device=device, dtype=torch.float32) / self.dim)
+        )
+    def forward(
+        self,
+        position_ids: torch.LongTensor,
+        *,
+        device: torch.device,
+        dtype: torch.dtype,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        if position_ids.ndim == 1:
+            position_ids = position_ids.unsqueeze(0)
+        inv_freq = self._compute_inv_freq(device=device)
+        freqs = torch.einsum("bs,d->bsd", position_ids.to(device=device, dtype=inv_freq.dtype), inv_freq)
+        cos = freqs.cos().repeat_interleave(2, dim=-1).unsqueeze(2).to(dtype=dtype)
+        sin = freqs.sin().repeat_interleave(2, dim=-1).unsqueeze(2).to(dtype=dtype)
+        return cos, sin
+def rotate_half(hidden_states: torch.Tensor) -> torch.Tensor:
+    even = hidden_states[..., ::2]
+    odd = hidden_states[..., 1::2]
+    return torch.stack((-odd, even), dim=-1).reshape_as(hidden_states)
+def apply_rotary_pos_emb(
+    hidden_states: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+) -> torch.Tensor:
+    return (hidden_states * cos) + (rotate_half(hidden_states) * sin)
+class MossTTSNanoGPT2MLP(nn.Module):
+    def __init__(self, config: GPT2Config) -> None:
+        super().__init__()
+        hidden_size = int(config.hidden_size)
+        inner_size = int(config.n_inner or 4 * hidden_size)
+        self.fc_in = nn.Linear(hidden_size, inner_size)
+        self.fc_out = nn.Linear(inner_size, hidden_size)
+        self.act = ACT2FN[config.activation_function]
+        self.dropout = nn.Dropout(config.resid_pdrop)
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.fc_in(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.fc_out(hidden_states)
+        return self.dropout(hidden_states)
+class MossTTSNanoGPT2Attention(nn.Module):
+    def __init__(self, config: GPT2Config, layer_idx: int, attn_implementation: str) -> None:
+        super().__init__()
+        hidden_size = int(config.hidden_size)
+        num_heads = int(config.num_attention_heads)
+        if hidden_size % num_heads != 0:
+            raise ValueError(f"hidden_size={hidden_size} must be divisible by num_attention_heads={num_heads}")
+        self.num_heads = num_heads
+        self.head_dim = hidden_size // num_heads
+        self.embed_dim = hidden_size
+        self.layer_idx = layer_idx
+        self.attn_implementation = attn_implementation
+        self.attn_dropout = float(config.attn_pdrop)
+        self.resid_dropout = nn.Dropout(config.resid_pdrop)
+        self.scale_attn_weights = bool(getattr(config, "scale_attn_weights", True))
+        self.scale_attn_by_inverse_layer_idx = bool(getattr(config, "scale_attn_by_inverse_layer_idx", False))
+        self.position_embedding_type = str(getattr(config, "position_embedding_type", "absolute")).lower()
+        if self.position_embedding_type not in {"absolute", "rope"}:
+            raise ValueError(f"Unsupported position_embedding_type={self.position_embedding_type!r}")
+        self.c_attn = nn.Linear(hidden_size, 3 * hidden_size)
+        self.c_proj = nn.Linear(hidden_size, hidden_size)
+        self.rotary_emb = None
+        if self.position_embedding_type == "rope":
+            self.rotary_emb = MossTTSNanoGPT2RotaryEmbedding(
+                self.head_dim,
+                base=float(getattr(config, "rope_base", 10000.0)),
+            )
+    def _split_heads(self, tensor: torch.Tensor) -> torch.Tensor:
+        if tensor.ndim == 3:
+            batch_size, seq_len, _ = tensor.shape
+            return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim)
+        if tensor.ndim == 2:
+            total_tokens, _ = tensor.shape
+            return tensor.view(total_tokens, self.num_heads, self.head_dim)
+        raise ValueError(f"Unsupported tensor rank for attention split: {tensor.ndim}")
+    def _merge_heads(self, tensor: torch.Tensor) -> torch.Tensor:
+        if tensor.ndim == 4:
+            batch_size, seq_len, _, _ = tensor.shape
+            return tensor.reshape(batch_size, seq_len, self.embed_dim)
+        if tensor.ndim == 3:
+            total_tokens, _, _ = tensor.shape
+            return tensor.reshape(total_tokens, self.embed_dim)
+        raise ValueError(f"Unsupported tensor rank for attention merge: {tensor.ndim}")
+    def _causal_attention_mask(
+        self,
+        attention_mask: Optional[torch.Tensor],
+        query_length: int,
+        key_length: int,
+        device: torch.device,
+    ) -> torch.Tensor:
+        query_positions = torch.arange(query_length, device=device, dtype=torch.long)
+        query_positions = query_positions + max(key_length - query_length, 0)
+        key_positions = torch.arange(key_length, device=device, dtype=torch.long)
+        causal = key_positions.unsqueeze(0) <= query_positions.unsqueeze(1)
+        causal = causal.unsqueeze(0).unsqueeze(0)
+        if attention_mask is None:
+            return causal
+        key_mask = attention_mask[:, None, None, :].to(dtype=torch.bool)
+        return causal & key_mask
+    def _eager_attention(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        attention_mask: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        query = query.transpose(1, 2)
+        key = key.transpose(1, 2)
+        value = value.transpose(1, 2)
+        scale = 1.0
+        if self.scale_attn_weights:
+            scale /= self.head_dim ** 0.5
+        if self.scale_attn_by_inverse_layer_idx:
+            scale /= float(self.layer_idx + 1)
+        scores = torch.matmul(query, key.transpose(-1, -2)) * scale
+        causal_mask = self._causal_attention_mask(
+            attention_mask=attention_mask,
+            query_length=query.shape[-2],
+            key_length=key.shape[-2],
+            device=query.device,
+        )
+        scores = scores.masked_fill(~causal_mask, torch.finfo(scores.dtype).min)
+        probs = torch.softmax(scores, dim=-1)
+        if self.training and self.attn_dropout > 0:
+            probs = torch.dropout(probs, self.attn_dropout, train=True)
+        output = torch.matmul(probs, value)
+        return output.transpose(1, 2).contiguous()
+    def _sdpa_attention(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        attention_mask: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        query = query.transpose(1, 2)
+        key = key.transpose(1, 2)
+        value = value.transpose(1, 2)
+        mask = None
+        if attention_mask is not None or query.shape[-2] != key.shape[-2]:
+            mask = self._causal_attention_mask(
+                attention_mask=attention_mask,
+                query_length=query.shape[-2],
+                key_length=key.shape[-2],
+                device=query.device,
+            )
+        output = torch.nn.functional.scaled_dot_product_attention(
+            query,
+            key,
+            value,
+            attn_mask=mask,
+            dropout_p=self.attn_dropout if self.training else 0.0,
+            is_causal=mask is None,
+        )
+        return output.transpose(1, 2).contiguous()
+    def _flash_attention(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        attention_mask: Optional[torch.Tensor],
+        packed_metadata: Optional[PackedSequenceMetadata],
+    ) -> torch.Tensor:
+        if not _FLASH_ATTN_AVAILABLE:
+            raise ImportError("flash_attn is not installed, but attn_implementation='flash_attention_2' was requested.")
+        if query.device.type != "cuda":
+            raise ValueError("flash_attention_2 requires CUDA tensors.")
+        if query.dtype not in (torch.float16, torch.bfloat16):
+            raise ValueError(
+                f"flash_attention_2 requires fp16/bf16 tensors, but received dtype={query.dtype}."
+            )
+        dropout_p = self.attn_dropout if self.training else 0.0
+        if packed_metadata is not None:
+            if packed_metadata.indices is not None:
+                query = query.reshape(-1, self.num_heads, self.head_dim).index_select(0, packed_metadata.indices)
+                key = key.reshape(-1, self.num_heads, self.head_dim).index_select(0, packed_metadata.indices)
+                value = value.reshape(-1, self.num_heads, self.head_dim).index_select(0, packed_metadata.indices)
+            output = flash_attn_varlen_func(
+                query,
+                key,
+                value,
+                packed_metadata.cu_seqlens,
+                packed_metadata.cu_seqlens,
+                packed_metadata.max_seqlen,
+                packed_metadata.max_seqlen,
+                dropout_p=dropout_p,
+                causal=True,
+            )
+            if packed_metadata.indices is None:
+                return output
+            return pad_input(
+                output,
+                packed_metadata.indices,
+                packed_metadata.batch_size,
+                packed_metadata.seq_len,
+            )
+        if attention_mask is None or bool(attention_mask.all()):
+            return flash_attn_func(
+                query,
+                key,
+                value,
+                dropout_p=dropout_p,
+                causal=True,
+            )
+        if query.shape[1] != key.shape[1]:
+            query_attention_mask = attention_mask[:, -query.shape[1] :]
+            unpadded_query, query_indices, cu_seqlens_q, max_seqlen_q, _ = unpad_input(
+                query,
+                query_attention_mask,
+            )
+            unpadded_key, _, cu_seqlens_k, max_seqlen_k, _ = unpad_input(key, attention_mask)
+            unpadded_value, _, _, _, _ = unpad_input(value, attention_mask)
+            output = flash_attn_varlen_func(
+                unpadded_query,
+                unpadded_key,
+                unpadded_value,
+                cu_seqlens_q,
+                cu_seqlens_k,
+                max_seqlen_q,
+                max_seqlen_k,
+                dropout_p=dropout_p,
+                causal=True,
+            )
+            return pad_input(output, query_indices, query.shape[0], query.shape[1])
+        unpadded_query, indices, cu_seqlens, max_seqlen, _ = unpad_input(query, attention_mask)
+        unpadded_key, _, _, _, _ = unpad_input(key, attention_mask)
+        unpadded_value, _, _, _, _ = unpad_input(value, attention_mask)
+        output = flash_attn_varlen_func(
+            unpadded_query,
+            unpadded_key,
+            unpadded_value,
+            cu_seqlens,
+            cu_seqlens,
+            max_seqlen,
+            max_seqlen,
+            dropout_p=dropout_p,
+            causal=True,
+        )
+        return pad_input(output, indices, query.shape[0], query.shape[1])
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        packed_metadata: Optional[PackedSequenceMetadata] = None,
+        layer_past: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
+        use_cache: bool = False,
+    ) -> tuple[torch.Tensor, Optional[tuple[torch.Tensor, torch.Tensor]]]:
+        qkv = self.c_attn(hidden_states)
+        query, key, value = qkv.split(self.embed_dim, dim=-1)
+        query = self._split_heads(query)
+        key = self._split_heads(key)
+        value = self._split_heads(value)
+        if self.rotary_emb is not None:
+            if position_ids is None:
+                raise ValueError("position_ids must be provided when position_embedding_type='rope'.")
+            cos, sin = self.rotary_emb(
+                position_ids.to(device=query.device),
+                device=query.device,
+                dtype=query.dtype,
+            )
+            query = apply_rotary_pos_emb(query, cos, sin)
+            key = apply_rotary_pos_emb(key, cos, sin)
+        static_layer_past = layer_past is not None and _is_static_kv_cache_layer(layer_past)
+        if static_layer_past:
+            past_length = int(layer_past.get("length", 0))
+            new_length = past_length + int(key.shape[1])
+            key_cache = layer_past["key"]
+            value_cache = layer_past["value"]
+            if new_length > int(key_cache.shape[1]):
+                raise ValueError(
+                    f"Static KV cache is too short: need {new_length}, capacity={int(key_cache.shape[1])}."
+                )
+            key_cache[:, past_length:new_length].copy_(key)
+            value_cache[:, past_length:new_length].copy_(value)
+            key = key_cache[:, :new_length]
+            value = value_cache[:, :new_length]
+            layer_past["length"] = new_length
+        elif layer_past is not None:
+            past_key, past_value = layer_past
+            key = torch.cat([past_key.to(device=key.device, dtype=key.dtype), key], dim=1)
+            value = torch.cat([past_value.to(device=value.device, dtype=value.dtype), value], dim=1)
+        present = layer_past if (use_cache and static_layer_past) else ((key, value) if use_cache else None)
+        if self.attn_implementation == "flash_attention_2":
+            attn_output = self._flash_attention(
+                query=query,
+                key=key,
+                value=value,
+                attention_mask=attention_mask,
+                packed_metadata=packed_metadata,
+            )
+        elif self.attn_implementation == "sdpa":
+            attn_output = self._sdpa_attention(
+                query=query,
+                key=key,
+                value=value,
+                attention_mask=attention_mask,
+            )
+        else:
+            attn_output = self._eager_attention(
+                query=query,
+                key=key,
+                value=value,
+                attention_mask=attention_mask,
+            )
+        attn_output = self._merge_heads(attn_output)
+        attn_output = self.c_proj(attn_output)
+        return self.resid_dropout(attn_output), present
+class MossTTSNanoGPT2Block(nn.Module):
+    def __init__(self, config: GPT2Config, layer_idx: int, attn_implementation: str) -> None:
+        super().__init__()
+        hidden_size = int(config.hidden_size)
+        self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
+        self.attn = MossTTSNanoGPT2Attention(config, layer_idx=layer_idx, attn_implementation=attn_implementation)
+        self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
+        self.mlp = MossTTSNanoGPT2MLP(config)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        packed_metadata: Optional[PackedSequenceMetadata] = None,
+        layer_past: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
+        use_cache: bool = False,
+    ) -> tuple[torch.Tensor, Optional[tuple[torch.Tensor, torch.Tensor]]]:
+        attn_output, present = self.attn(
+            self.ln_1(hidden_states),
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            packed_metadata=packed_metadata,
+            layer_past=layer_past,
+            use_cache=use_cache,
+        )
+        hidden_states = hidden_states + attn_output
+        hidden_states = hidden_states + self.mlp(self.ln_2(hidden_states))
+        return hidden_states, present
+class MossTTSNanoGPT2Model(nn.Module):
+    def __init__(self, config: GPT2Config, attn_implementation: str = "eager") -> None:
+        super().__init__()
+        self.config = config
+        self.attn_implementation = attn_implementation
+        self.position_embedding_type = str(getattr(config, "position_embedding_type", "absolute")).lower()
+        if self.position_embedding_type not in {"absolute", "rope"}:
+            raise ValueError(f"Unsupported position_embedding_type={self.position_embedding_type!r}")
+        hidden_size = int(config.hidden_size)
+        self.wte = nn.Embedding(config.vocab_size, hidden_size)
+        self.wpe = nn.Embedding(config.n_positions, hidden_size) if self.position_embedding_type == "absolute" else nn.Identity()
+        self.drop = nn.Dropout(config.embd_pdrop)
+        self.h = nn.ModuleList(
+            [MossTTSNanoGPT2Block(config, layer_idx=index, attn_implementation=attn_implementation) for index in range(config.n_layer)]
+        )
+        self.ln_f = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
+        self.gradient_checkpointing = False
+        self._reset_parameters()
+    def _reset_parameters(self) -> None:
+        init_std = float(self.config.initializer_range)
+        for module in self.modules():
+            if isinstance(module, nn.Linear):
+                nn.init.normal_(module.weight, mean=0.0, std=init_std)
+                if module.bias is not None:
+                    nn.init.zeros_(module.bias)
+            elif isinstance(module, nn.Embedding):
+                nn.init.normal_(module.weight, mean=0.0, std=init_std)
+            elif isinstance(module, nn.LayerNorm):
+                nn.init.ones_(module.weight)
+                nn.init.zeros_(module.bias)
+    @staticmethod
+    def _normalize_num_sequences(
+        cu_seqlens: torch.Tensor,
+        num_sequences: Optional[torch.Tensor],
+        device: torch.device,
+    ) -> torch.Tensor:
+        if cu_seqlens.ndim == 1:
+            cu_seqlens = cu_seqlens.unsqueeze(0)
+        if num_sequences is None:
+            diffs = cu_seqlens[:, 1:] - cu_seqlens[:, :-1]
+            return diffs.gt(0).sum(dim=-1).to(device=device, dtype=torch.long)
+        if num_sequences.ndim == 0:
+            num_sequences = num_sequences.unsqueeze(0)
+        return num_sequences.to(device=device, dtype=torch.long)
+    @staticmethod
+    def _packed_segments_from_cu_seqlens(
+        cu_seqlens: torch.Tensor,
+        num_sequences: Optional[torch.Tensor],
+        device: torch.device,
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        if cu_seqlens.ndim == 1:
+            cu_seqlens = cu_seqlens.unsqueeze(0)
+        cu_seqlens = cu_seqlens.to(device=device)
+        batch_size, boundary_count = cu_seqlens.shape
+        segment_slots = boundary_count - 1
+        if segment_slots <= 0:
+            empty = torch.empty(0, dtype=torch.long, device=device)
+            return empty, empty, empty
+        counts = MossTTSNanoGPT2Model._normalize_num_sequences(cu_seqlens, num_sequences, device=device)
+        counts = counts.clamp(min=0, max=segment_slots)
+        segment_slots = int(counts.max().item()) if counts.numel() > 0 else 0
+        if segment_slots <= 0:
+            empty = torch.empty(0, dtype=torch.long, device=device)
+            return empty, empty, empty
+        cu_seqlens = cu_seqlens[:, : segment_slots + 1]
+        slot_ids = torch.arange(segment_slots, device=device).unsqueeze(0)
+        valid_slots = slot_ids < counts.unsqueeze(1)
+        starts = cu_seqlens[:, :-1].to(dtype=torch.long)
+        ends = cu_seqlens[:, 1:].to(dtype=torch.long)
+        lengths = (ends - starts).clamp_min(0)
+        lengths = torch.where(valid_slots, lengths, torch.zeros((), dtype=torch.long, device=device))
+        batch_ids = torch.arange(batch_size, device=device, dtype=torch.long).unsqueeze(1).expand(batch_size, segment_slots)
+        batch_ids = batch_ids.reshape(-1)
+        starts = starts.reshape(-1)
+        lengths = lengths.reshape(-1)
+        valid_segments = lengths.gt(0)
+        valid_count = int(valid_segments.to(dtype=torch.long).sum().item())
+        if valid_count <= 0:
+            empty = torch.empty(0, dtype=torch.long, device=device)
+            return empty, empty, empty
+        if valid_count == lengths.numel():
+            return batch_ids, starts, lengths
+        valid_order = torch.argsort(valid_segments.to(dtype=torch.long), descending=True, stable=True)[:valid_count]
+        return (
+            batch_ids.index_select(0, valid_order),
+            starts.index_select(0, valid_order),
+            lengths.index_select(0, valid_order),
+        )
+    @staticmethod
+    def _packed_token_indices(
+        batch_ids: torch.Tensor,
+        starts: torch.Tensor,
+        lengths: torch.Tensor,
+        seq_len: int,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        total_tokens = int(lengths.sum().item())
+        if total_tokens <= 0:
+            empty = torch.empty(0, dtype=torch.long, device=lengths.device)
+            return empty, empty
+        segment_ids = torch.repeat_interleave(
+            torch.arange(lengths.numel(), device=lengths.device, dtype=torch.long),
+            lengths,
+            output_size=total_tokens,
+        )
+        segment_starts = torch.cumsum(lengths, dim=0) - lengths
+        positions = torch.arange(total_tokens, device=lengths.device, dtype=torch.long) - segment_starts[segment_ids]
+        indices = batch_ids[segment_ids] * seq_len + starts[segment_ids] + positions
+        return indices, positions
+    @staticmethod
+    def build_packed_position_ids(
+        attention_mask: Optional[torch.Tensor],
+        cu_seqlens: torch.Tensor,
+        num_sequences: Optional[torch.Tensor],
+        sequence_length: Optional[int] = None,
+    ) -> torch.Tensor:
+        if cu_seqlens.ndim == 1:
+            cu_seqlens = cu_seqlens.unsqueeze(0)
+        batch_size = cu_seqlens.shape[0]
+        seq_len = int(sequence_length or (cu_seqlens.shape[1] - 1))
+        device = cu_seqlens.device
+        position_ids = torch.zeros((batch_size, seq_len), dtype=torch.long, device=device)
+        batch_ids, starts, lengths = MossTTSNanoGPT2Model._packed_segments_from_cu_seqlens(
+            cu_seqlens,
+            num_sequences,
+            device,
+        )
+        if lengths.numel() > 0:
+            indices, positions = MossTTSNanoGPT2Model._packed_token_indices(batch_ids, starts, lengths, seq_len)
+            position_ids.view(-1).scatter_(0, indices, positions)
+        if attention_mask is not None:
+            position_ids = position_ids * attention_mask.to(dtype=position_ids.dtype)
+        return position_ids
+    @staticmethod
+    def build_packed_metadata(
+        hidden_states: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        num_sequences: Optional[torch.Tensor],
+    ) -> PackedSequenceMetadata:
+        if cu_seqlens.ndim == 1:
+            cu_seqlens = cu_seqlens.unsqueeze(0)
+        device = hidden_states.device
+        seq_len = hidden_states.shape[1]
+        batch_ids, starts, lengths = MossTTSNanoGPT2Model._packed_segments_from_cu_seqlens(
+            cu_seqlens,
+            num_sequences,
+            device,
+        )
+        if lengths.numel() == 0:
+            raise ValueError("cu_seqlens did not describe any non-empty packed sequences.")
+        indices, _ = MossTTSNanoGPT2Model._packed_token_indices(batch_ids, starts, lengths, seq_len)
+        cumulative = torch.empty(lengths.numel() + 1, dtype=torch.int32, device=device)
+        cumulative[0] = 0
+        cumulative[1:] = lengths.to(dtype=torch.int32).cumsum(dim=0)
+        return PackedSequenceMetadata(
+            cu_seqlens=cumulative,
+            max_seqlen=int(lengths.max().item()),
+            indices=indices,
+            batch_size=hidden_states.shape[0],
+            seq_len=hidden_states.shape[1],
+        )
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[tuple[tuple[torch.Tensor, torch.Tensor], ...]] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: bool = True,
+        cu_seqlens: Optional[torch.Tensor] = None,
+        num_sequences: Optional[torch.Tensor] = None,
+    ) -> BaseModelOutputWithPast:
+        del input_ids, output_attentions
+        if inputs_embeds is None:
+            raise ValueError("inputs_embeds must be provided.")
+        use_cache = bool(use_cache)
+        if use_cache and cu_seqlens is not None:
+            raise ValueError("use_cache=True is not supported together with cu_seqlens packing.")
+        hidden_states = inputs_embeds
+        query_attention_mask = None
+        if attention_mask is not None:
+            attention_mask = attention_mask.to(dtype=torch.bool, device=hidden_states.device)
+            query_attention_mask = attention_mask[:, -hidden_states.shape[1] :]
+        packed_metadata = None
+        if position_ids is None:
+            if cu_seqlens is not None:
+                if attention_mask is None:
+                    raise ValueError("attention_mask must be provided with cu_seqlens packing.")
+                position_ids = self.build_packed_position_ids(
+                    attention_mask=attention_mask,
+                    cu_seqlens=cu_seqlens.to(device=hidden_states.device),
+                    num_sequences=num_sequences.to(device=hidden_states.device) if num_sequences is not None else None,
+                    sequence_length=hidden_states.shape[1],
+                )
+            elif attention_mask is not None:
+                position_ids = attention_mask.long().cumsum(dim=-1) - 1
+                position_ids = position_ids.masked_fill(~attention_mask, 0)
+                position_ids = position_ids[:, -hidden_states.shape[1] :]
+            else:
+                past_length = 0
+                if past_key_values is not None and len(past_key_values) > 0:
+                    first_layer_past = past_key_values[0]
+                    if _is_static_kv_cache_layer(first_layer_past):
+                        past_length = int(first_layer_past.get("length", 0))
+                    else:
+                        past_length = first_layer_past[0].shape[1]
+                position_ids = torch.arange(hidden_states.shape[1], device=hidden_states.device, dtype=torch.long)
+                position_ids = position_ids + past_length
+                position_ids = position_ids.unsqueeze(0).expand(hidden_states.shape[0], -1)
+        if cu_seqlens is not None and self.attn_implementation == "flash_attention_2":
+            packed_metadata = self.build_packed_metadata(
+                hidden_states=hidden_states,
+                cu_seqlens=cu_seqlens.to(device=hidden_states.device),
+                num_sequences=num_sequences.to(device=hidden_states.device) if num_sequences is not None else None,
+            )
+        if self.position_embedding_type == "absolute":
+            hidden_states = hidden_states + self.wpe(position_ids)
+        hidden_states = self.drop(hidden_states)
+        if query_attention_mask is not None:
+            hidden_states = hidden_states * query_attention_mask.unsqueeze(-1).to(dtype=hidden_states.dtype)
+        all_hidden_states = () if output_hidden_states else None
+        presents = [] if use_cache else None
+        for layer_index, block in enumerate(self.h):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+            if self.gradient_checkpointing and self.training:
+                if use_cache:
+                    raise ValueError("use_cache=True is not supported when gradient checkpointing is enabled during training.")
+                def custom_forward(*inputs):
+                    output, _ = block(
+                        inputs[0],
+                        attention_mask=inputs[1],
+                        position_ids=inputs[2],
+                        packed_metadata=packed_metadata,
+                        layer_past=None,
+                        use_cache=False,
+                    )
+                    return output
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    custom_forward,
+                    hidden_states,
+                    attention_mask,
+                    position_ids,
+                    use_reentrant=False,
+                )
+                present = None
+            else:
+                hidden_states, present = block(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    position_ids=position_ids,
+                    packed_metadata=packed_metadata,
+                    layer_past=None if past_key_values is None else past_key_values[layer_index],
+                    use_cache=use_cache,
+                )
+            if query_attention_mask is not None:
+                hidden_states = hidden_states * query_attention_mask.unsqueeze(-1).to(dtype=hidden_states.dtype)
+            if presents is not None:
+                presents.append(present)
+        hidden_states = self.ln_f(hidden_states)
+        if query_attention_mask is not None:
+            hidden_states = hidden_states * query_attention_mask.unsqueeze(-1).to(dtype=hidden_states.dtype)
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+        if not return_dict:
+            return (hidden_states, tuple(presents) if presents is not None else None, all_hidden_states, None)
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=tuple(presents) if presents is not None else None,
+            hidden_states=all_hidden_states,
+            attentions=None,
+        )

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:608f1ff64bc6caa9be836060fc7c78a15c4658c4a07b8d73c78d6f70d1b39c23
+size 9100859544

modeling_moss_tts.py ADDED Viewed

	@@ -0,0 +1,623 @@

+# coding=utf-8
+"""Modeling code for the MOSS-TTS-Local-Transformer-v1.5 HuggingFace release."""
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Any, Optional, Union
+import torch
+import torch.nn as nn
+from transformers.modeling_outputs import BaseModelOutputWithPast
+from transformers.modeling_utils import PreTrainedModel
+from transformers.models.gpt2.configuration_gpt2 import GPT2Config
+from transformers.utils import ModelOutput
+from .configuration_moss_tts import MossTTSLocalConfig
+from .gpt2_decoder import MossTTSNanoGPT2Model
+from .qwen3_decoder import MossQwen3Model
+@dataclass
+class MossTTSLocalOutput(ModelOutput):
+    """Output container returned by ``MossTTSLocalModel.forward``.
+    Every field is optional and is copied from the global transformer's output.
+    """
+    # Final hidden states of the global transformer.
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    # Key/value cache returned by the global transformer, if any.
+    past_key_values: Optional[tuple[tuple[torch.Tensor, torch.Tensor], ...]] = None
+    # Per-layer hidden states, if they were requested.
+    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+    # Attention weights as reported by the global transformer, if any.
+    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+def _find_last_equal(input_ids: torch.LongTensor, value: int) -> torch.LongTensor:
+    matches = input_ids.eq(int(value))
+    if not bool(matches.any(dim=1).all().item()):
+        raise ValueError(f"Every sample must contain token id {int(value)}.")
+    positions = torch.arange(input_ids.shape[1], device=input_ids.device, dtype=torch.long)
+    masked_positions = positions.unsqueeze(0).masked_fill(~matches, -1)
+    return masked_positions.max(dim=1).values
+class MossTTSLocalPreTrainedModel(PreTrainedModel):
+    config_class = MossTTSLocalConfig
+    base_model_prefix = "transformer"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["MossTTSNanoGPT2Block", "MossQwen3DecoderLayer"]
+    _supports_flash_attn_2 = True
+    _supports_sdpa = True
+    _supports_cache_class = True
+    def _set_gradient_checkpointing(self, module: nn.Module, value: bool = False) -> None:
+        if isinstance(module, MossTTSNanoGPT2Model) or isinstance(module, MossQwen3Model):
+            module.gradient_checkpointing = value
+class MossTTSLocalModel(MossTTSLocalPreTrainedModel):
+    _tied_weights_keys = None
+    def __init__(self, config: MossTTSLocalConfig) -> None:
+        """Build the global Qwen3 transformer, the local GPT-2 transformer, and all heads.
+        Args:
+            config: Model configuration. Note that ``config.qwen3_config`` is
+                mutated in place (pad token id and attention implementation).
+        """
+        super().__init__(config)
+        # Instance-level override of the class attribute, sized by config.n_vq.
+        self._tied_weights_keys = self._build_tied_weights_keys(config)
+        config.qwen3_config.pad_token_id = config.pad_token_id
+        config.qwen3_config._attn_implementation = config.attn_implementation
+        # Derive the local transformer config from the GPT-2 template: the
+        # layer count may be overridden and the context is fixed to n_vq + 1.
+        local_gpt2_config = config.gpt2_config.to_dict()
+        local_gpt2_config["n_layer"] = int(getattr(config, "local_transformer_layers", config.gpt2_config.n_layer))
+        local_gpt2_config["n_positions"] = int(config.n_vq) + 1
+        local_gpt2_config["n_ctx"] = int(config.n_vq) + 1
+        local_gpt2_config = GPT2Config(**local_gpt2_config)
+        local_gpt2_config.pad_token_id = config.pad_token_id
+        local_gpt2_config._attn_implementation = config.local_transformer_attn_implementation
+        self.transformer = MossQwen3Model(config.qwen3_config)
+        self.local_transformer = MossTTSNanoGPT2Model(
+            local_gpt2_config,
+            attn_implementation=config.local_transformer_attn_implementation,
+        )
+        # The local transformer is only ever fed embeddings in this class, so
+        # its own token embedding is replaced by a no-op.
+        self.local_transformer.wte = nn.Identity()
+        hidden_size = int(config.hidden_size)
+        # One embedding table and one output head per audio codebook.
+        self.audio_embeddings = nn.ModuleList(
+            [
+                nn.Embedding(int(config.audio_codebook_sizes[index]), hidden_size)
+                for index in range(config.n_vq)
+            ]
+        )
+        self.text_lm_head = nn.Linear(hidden_size, int(config.vocab_size), bias=False)
+        self.audio_lm_heads = nn.ModuleList(
+            [
+                nn.Linear(hidden_size, int(config.audio_codebook_sizes[index]), bias=False)
+                for index in range(config.n_vq)
+            ]
+        )
+        # Optional 2-way head, present only in "binary" local text head mode.
+        self.local_text_lm_head = (
+            nn.Linear(hidden_size, 2, bias=False)
+            if self._use_binary_local_text_head()
+            else None
+        )
+        self.post_init()
+        # Tie after post_init, then seed the binary head from the tied text head.
+        self.tie_weights()
+        self.initialize_local_text_lm_head_from_text_lm_head()
+    def can_generate(self) -> bool:
+        """Report that this model provides its own ``generate`` implementation."""
+        return True
+    @staticmethod
+    def _build_tied_weights_keys(config: MossTTSLocalConfig) -> dict[str, str]:
+        tied_weights = {"text_lm_head.weight": "transformer.embed_tokens.weight"}
+        tied_weights.update(
+            {
+                f"audio_lm_heads.{index}.weight": f"audio_embeddings.{index}.weight"
+                for index in range(config.n_vq)
+            }
+        )
+        return tied_weights
+    def tie_weights(self, *args, **kwargs) -> None:
+        del args, kwargs
+        self.text_lm_head.weight = self.transformer.embed_tokens.weight
+        for embedding, head in zip(self.audio_embeddings, self.audio_lm_heads):
+            head.weight = embedding.weight
+    def get_input_embeddings(self) -> nn.Embedding:
+        """Return the text token embedding of the global transformer."""
+        return self.transformer.embed_tokens
+    def set_input_embeddings(self, value: nn.Embedding) -> None:
+        """Replace the global text embedding, re-tie the heads, and refresh the binary local head."""
+        self.transformer.embed_tokens = value
+        # The text head shares the embedding weight, so tying must be redone.
+        self.tie_weights()
+        self.initialize_local_text_lm_head_from_text_lm_head()
+    def get_output_embeddings(self) -> nn.Linear:
+        """Return the full-vocabulary text output head."""
+        return self.text_lm_head
+    def set_output_embeddings(self, new_embeddings: nn.Linear) -> None:
+        """Replace the text output head, re-tie the heads, and refresh the binary local head."""
+        self.text_lm_head = new_embeddings
+        # tie_weights() points the new head's weight at the input embedding weight.
+        self.tie_weights()
+        self.initialize_local_text_lm_head_from_text_lm_head()
+    def _use_binary_local_text_head(self) -> bool:
+        return str(getattr(self.config, "local_text_head_mode", "full_vocab")).strip().lower() == "binary"
+    def _local_text_candidate_ids(self, device: torch.device) -> torch.LongTensor:
+        return torch.tensor(
+            [
+                int(self.config.audio_assistant_slot_token_id),
+                int(self.config.audio_end_token_id),
+            ],
+            dtype=torch.long,
+            device=device,
+        )
+    def initialize_local_text_lm_head_from_text_lm_head(self) -> None:
+        if not self._use_binary_local_text_head() or self.local_text_lm_head is None:
+            return
+        candidate_ids = self._local_text_candidate_ids(self.text_lm_head.weight.device)
+        with torch.no_grad():
+            source_weight = self.text_lm_head.weight.index_select(0, candidate_ids)
+            if tuple(source_weight.shape) == tuple(self.local_text_lm_head.weight.shape):
+                self.local_text_lm_head.weight.copy_(
+                    source_weight.to(
+                        device=self.local_text_lm_head.weight.device,
+                        dtype=self.local_text_lm_head.weight.dtype,
+                    )
+                )
+    def _resolve_fixed_nq(
+        self,
+        n_vq_for_inference: Optional[int] = None,
+        nq: Optional[int] = None,
+    ) -> int:
+        requested = n_vq_for_inference if n_vq_for_inference is not None else nq
+        config_nq = int(self.config.n_vq)
+        if requested is not None and int(requested) != config_nq:
+            raise ValueError(
+                "This MOSS-TTS-Local-Transformer-v1.5 release is trained with a fixed RVQ depth. "
+                f"Expected n_vq={config_nq}, got {int(requested)}."
+            )
+        return config_nq
+    def _build_inputs_embeds(self, input_ids: torch.LongTensor) -> torch.FloatTensor:
+        if input_ids.ndim != 3 or input_ids.shape[-1] != self.config.n_vq + 1:
+            raise ValueError(
+                f"Expected input_ids shape [batch, seq, {self.config.n_vq + 1}], "
+                f"got {tuple(input_ids.shape)}."
+            )
+        text_ids = input_ids[..., 0]
+        inputs_embeds = self.transformer.embed_tokens(text_ids)
+        for channel_index, embedding in enumerate(self.audio_embeddings):
+            channel_ids = input_ids[..., channel_index + 1]
+            valid_mask = channel_ids.ne(self.config.audio_pad_token_id)
+            safe_ids = channel_ids.masked_fill(~valid_mask, 0)
+            audio_embeds = embedding(safe_ids) * valid_mask.unsqueeze(-1)
+            inputs_embeds = inputs_embeds + audio_embeds
+        return inputs_embeds
+    def _global_hidden_to_local(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
+        """Map global hidden states to the local transformer's input; currently the identity."""
+        return hidden_states
+    @staticmethod
+    def _local_past_length(past_key_values: Optional[tuple[Any, ...]]) -> int:
+        if past_key_values is None or len(past_key_values) == 0:
+            return 0
+        first_layer_past = past_key_values[0]
+        if isinstance(first_layer_past, dict) and bool(first_layer_past.get("static_kv_cache", False)):
+            return int(first_layer_past.get("length", 0))
+        return int(first_layer_past[0].shape[1])
+    def _new_static_local_past_key_values(
+        self,
+        batch_size: int,
+        max_length: int,
+        device: torch.device,
+        dtype: torch.dtype,
+    ) -> tuple[dict[str, Any], ...]:
+        layers = []
+        for block in self.local_transformer.h:
+            attn = block.attn
+            cache_shape = (
+                int(batch_size),
+                int(max_length),
+                int(attn.num_heads),
+                int(attn.head_dim),
+            )
+            layers.append(
+                {
+                    "static_kv_cache": True,
+                    "key": torch.empty(cache_shape, device=device, dtype=dtype),
+                    "value": torch.empty(cache_shape, device=device, dtype=dtype),
+                    "length": 0,
+                }
+            )
+        return tuple(layers)
+    def _decode_local_hidden_states_with_cache(
+        self,
+        local_inputs_embeds: torch.FloatTensor,
+        past_key_values: Optional[tuple[Any, ...]] = None,
+    ) -> tuple[torch.FloatTensor, Optional[tuple[Any, ...]]]:
+        if (
+            past_key_values is None
+            and not self.training
+            and bool(getattr(self.config, "use_static_local_kv_cache", True))
+        ):
+            max_length = max(int(getattr(self.config, "n_vq", 0)) + 1, int(local_inputs_embeds.shape[1]))
+            past_key_values = self._new_static_local_past_key_values(
+                batch_size=int(local_inputs_embeds.shape[0]),
+                max_length=max_length,
+                device=local_inputs_embeds.device,
+                dtype=local_inputs_embeds.dtype,
+            )
+        past_length = self._local_past_length(past_key_values)
+        local_seq_len = int(local_inputs_embeds.shape[1])
+        local_position_ids = torch.arange(
+            past_length,
+            past_length + local_seq_len,
+            device=local_inputs_embeds.device,
+            dtype=torch.long,
+        ).unsqueeze(0)
+        if int(local_inputs_embeds.shape[0]) != 1:
+            local_position_ids = local_position_ids.expand(int(local_inputs_embeds.shape[0]), -1)
+        local_outputs = self.local_transformer(
+            input_ids=None,
+            past_key_values=past_key_values,
+            attention_mask=None,
+            position_ids=local_position_ids,
+            inputs_embeds=local_inputs_embeds,
+            use_cache=True,
+            output_attentions=False,
+            output_hidden_states=False,
+            return_dict=True,
+            cu_seqlens=None,
+            num_sequences=None,
+        )
+        return local_outputs.last_hidden_state, local_outputs.past_key_values
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[tuple[tuple[torch.Tensor, torch.Tensor], ...]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = True,
+        **kwargs,
+    ) -> Union[tuple, MossTTSLocalOutput]:
+        del kwargs
+        if inputs_embeds is None:
+            if input_ids is None:
+                raise ValueError("Either input_ids or inputs_embeds must be provided.")
+            inputs_embeds = self._build_inputs_embeds(input_ids)
+        outputs = self.transformer(
+            input_ids=None,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
+            cu_seqlens=None,
+            num_sequences=None,
+        )
+        if not return_dict:
+            return (
+                outputs.last_hidden_state,
+                outputs.past_key_values,
+                outputs.hidden_states,
+                outputs.attentions,
+            )
+        return MossTTSLocalOutput(
+            last_hidden_state=outputs.last_hidden_state,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+    def _decode_local_last_hidden_state(
+        self,
+        local_inputs_embeds: torch.FloatTensor,
+    ) -> torch.FloatTensor:
+        local_seq_len = int(local_inputs_embeds.shape[1])
+        local_position_ids = torch.arange(
+            0,
+            local_seq_len,
+            device=local_inputs_embeds.device,
+            dtype=torch.long,
+        ).unsqueeze(0)
+        if int(local_inputs_embeds.shape[0]) != 1:
+            local_position_ids = local_position_ids.expand(int(local_inputs_embeds.shape[0]), -1)
+        local_outputs = self.local_transformer(
+            input_ids=None,
+            attention_mask=None,
+            position_ids=local_position_ids,
+            inputs_embeds=local_inputs_embeds,
+            use_cache=False,
+            output_attentions=False,
+            output_hidden_states=False,
+            return_dict=True,
+            cu_seqlens=None,
+            num_sequences=None,
+        )
+        return local_outputs.last_hidden_state[:, -1, :]
+    def _filter_logits(
+        self,
+        logits: torch.FloatTensor,
+        top_k: Optional[int],
+        top_p: Optional[float],
+    ) -> torch.FloatTensor:
+        scores = logits
+        if top_k is not None and int(top_k) > 0 and int(top_k) < scores.shape[-1]:
+            kth = torch.topk(scores, int(top_k), dim=-1).values[..., -1, None]
+            scores = scores.masked_fill(scores < kth, -torch.inf)
+        if top_p is not None and 0.0 < float(top_p) < 1.0:
+            sorted_scores, sorted_indices = torch.sort(scores, descending=True, dim=-1)
+            sorted_probs = torch.softmax(sorted_scores, dim=-1)
+            cumulative_probs = sorted_probs.cumsum(dim=-1)
+            sorted_mask = cumulative_probs > float(top_p)
+            sorted_mask[..., 1:] = sorted_mask[..., :-1].clone()
+            sorted_mask[..., 0] = False
+            remove_mask = torch.zeros_like(scores, dtype=torch.bool)
+            remove_mask.scatter_(dim=-1, index=sorted_indices, src=sorted_mask)
+            scores = scores.masked_fill(remove_mask, -torch.inf)
+        return scores
+    def _apply_repetition_penalty(
+        self,
+        scores: torch.FloatTensor,
+        previous_token_ids: Optional[torch.LongTensor],
+        penalty: float,
+    ) -> torch.FloatTensor:
+        if previous_token_ids is None or float(penalty) == 1.0:
+            return scores
+        if previous_token_ids.ndim == 1:
+            previous_token_ids = previous_token_ids.unsqueeze(0)
+        updated = scores.clone()
+        for batch_index in range(updated.shape[0]):
+            unique_token_ids = torch.unique(previous_token_ids[batch_index])
+            unique_token_ids = unique_token_ids[
+                (unique_token_ids >= 0) & (unique_token_ids < updated.shape[-1])
+            ]
+            if unique_token_ids.numel() == 0:
+                continue
+            token_scores = updated[batch_index].index_select(0, unique_token_ids)
+            token_scores = torch.where(
+                token_scores < 0,
+                token_scores * float(penalty),
+                token_scores / float(penalty),
+            )
+            updated[batch_index].scatter_(0, unique_token_ids, token_scores)
+        return updated
+    def _sample_next_token(
+        self,
+        logits: torch.FloatTensor,
+        do_sample: bool,
+        temperature: float,
+        top_k: Optional[int],
+        top_p: Optional[float],
+        previous_token_ids: Optional[torch.LongTensor] = None,
+        repetition_penalty: float = 1.0,
+    ) -> torch.LongTensor:
+        scores = logits.float()
+        scores = self._apply_repetition_penalty(scores, previous_token_ids, repetition_penalty)
+        if not do_sample:
+            return torch.argmax(scores, dim=-1)
+        if float(temperature) <= 0:
+            raise ValueError("temperature must be positive when do_sample=True.")
+        scores = scores / float(temperature)
+        scores = self._filter_logits(scores, top_k=top_k, top_p=top_p)
+        probs = torch.softmax(scores, dim=-1)
+        return torch.multinomial(probs, num_samples=1).squeeze(-1)
+    def _sample_next_assistant_text_token(
+        self,
+        local_hidden_states: torch.FloatTensor,
+        do_sample: bool,
+        temperature: float,
+        top_k: Optional[int],
+        top_p: Optional[float],
+    ) -> torch.LongTensor:
+        if self._use_binary_local_text_head() and self.local_text_lm_head is not None:
+            logits = self.local_text_lm_head(local_hidden_states)
+            sampled_indices = self._sample_next_token(
+                logits=logits,
+                do_sample=do_sample,
+                temperature=temperature,
+                top_k=top_k,
+                top_p=top_p,
+            )
+            candidate_ids = self._local_text_candidate_ids(logits.device)
+            return candidate_ids[sampled_indices]
+        candidate_ids = self._local_text_candidate_ids(local_hidden_states.device)
+        logits = self.text_lm_head(local_hidden_states).index_select(dim=-1, index=candidate_ids)
+        sampled_indices = self._sample_next_token(
+            logits=logits,
+            do_sample=do_sample,
+            temperature=temperature,
+            top_k=top_k,
+            top_p=top_p,
+        )
+        return candidate_ids[sampled_indices]
+    def _build_generation_row(
+        self,
+        batch_size: int,
+        device: torch.device,
+        audio_token_ids: torch.LongTensor,
+    ) -> torch.LongTensor:
+        row = torch.full(
+            (batch_size, 1, self.config.n_vq + 1),
+            int(self.config.audio_pad_token_id),
+            dtype=torch.long,
+            device=device,
+        )
+        row[:, :, 0] = int(self.config.audio_assistant_slot_token_id)
+        row[:, :, 1:] = audio_token_ids.unsqueeze(1)
+        return row
+    @torch.inference_mode()
+    def generate(
+        self,
+        input_ids: torch.LongTensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        max_new_tokens: Optional[int] = None,
+        max_new_frames: Optional[int] = None,
+        do_sample: bool = True,
+        text_temperature: float = 1.0,
+        text_top_p: float = 1.0,
+        text_top_k: int = 50,
+        audio_temperature: Optional[float] = None,
+        audio_top_p: Optional[float] = None,
+        audio_top_k: Optional[int] = None,
+        audio_repetition_penalty: Optional[float] = None,
+        temperature: float = 1.0,
+        top_p: float = 0.95,
+        top_k: int = 50,
+        repetition_penalty: float = 1.0,
+        use_kv_cache: bool = True,
+        n_vq_for_inference: Optional[int] = None,
+        nq: Optional[int] = None,
+        **kwargs,
+    ) -> list[tuple[int, torch.LongTensor]]:
+        del kwargs
+        self._resolve_fixed_nq(n_vq_for_inference=n_vq_for_inference, nq=nq)
+        if input_ids.ndim == 2:
+            input_ids = input_ids.unsqueeze(0)
+        if input_ids.ndim != 3:
+            raise ValueError(f"Expected input_ids with 3 dims, got {tuple(input_ids.shape)}.")
+        if input_ids.shape[-1] != self.config.n_vq + 1:
+            raise ValueError(
+                f"Expected {self.config.n_vq + 1} channels from config.n_vq, got {input_ids.shape[-1]}."
+            )
+        if attention_mask is None:
+            attention_mask = torch.ones(input_ids.shape[:2], dtype=torch.bool, device=input_ids.device)
+        elif attention_mask.ndim == 1:
+            attention_mask = attention_mask.unsqueeze(0)
+        attention_mask = attention_mask.to(device=input_ids.device, dtype=torch.bool)
+        frame_budget = max_new_frames if max_new_frames is not None else max_new_tokens
+        if frame_budget is None:
+            frame_budget = 4096
+        frame_budget = int(frame_budget)
+        audio_temperature = float(temperature if audio_temperature is None else audio_temperature)
+        audio_top_p = float(top_p if audio_top_p is None else audio_top_p)
+        audio_top_k = int(top_k if audio_top_k is None else audio_top_k)
+        audio_repetition_penalty = float(
+            repetition_penalty if audio_repetition_penalty is None else audio_repetition_penalty
+        )
+        batch_size = input_ids.shape[0]
+        input_ids_length = input_ids.shape[1]
+        current_input_ids = input_ids
+        current_attention_mask = attention_mask
+        current_model_input_ids = current_input_ids
+        generated_frames: list[torch.LongTensor] = []
+        finished = torch.zeros(batch_size, dtype=torch.bool, device=input_ids.device)
+        past_key_values = None
+        local_dtype = self.local_transformer.ln_f.weight.dtype
+        for _ in range(frame_budget):
+            generated_audio_history = torch.stack(generated_frames, dim=1) if generated_frames else None
+            global_inputs_embeds = self._build_inputs_embeds(current_model_input_ids)
+            global_outputs = self.transformer(
+                input_ids=None,
+                past_key_values=past_key_values,
+                attention_mask=current_attention_mask,
+                position_ids=None,
+                inputs_embeds=global_inputs_embeds,
+                use_cache=use_kv_cache,
+                output_attentions=False,
+                output_hidden_states=False,
+                return_dict=True,
+                cu_seqlens=None,
+                num_sequences=None,
+            )
+            global_hidden_states = global_outputs.last_hidden_state[:, -1, :]
+            local_global_hidden_states = self._global_hidden_to_local(global_hidden_states).to(dtype=local_dtype)
+            local_prefix_hidden_states, local_prefix_past_key_values = self._decode_local_hidden_states_with_cache(
+                local_global_hidden_states.unsqueeze(1)
+            )
+            local_hidden_states = local_prefix_hidden_states[:, -1, :]
+            next_text_tokens = self._sample_next_assistant_text_token(
+                local_hidden_states=local_hidden_states,
+                do_sample=do_sample,
+                temperature=text_temperature,
+                top_k=text_top_k,
+                top_p=text_top_p,
+            )
+            should_continue = next_text_tokens.eq(int(self.config.audio_assistant_slot_token_id)) & ~finished
+            finished = finished | next_text_tokens.eq(int(self.config.audio_end_token_id))
+            if not bool(should_continue.any().item()):
+                break
+            next_frame_tokens = []
+            for channel_index in range(int(self.config.n_vq)):
+                channel_logits = self.audio_lm_heads[channel_index](local_hidden_states)
+                channel_token = self._sample_next_token(
+                    logits=channel_logits,
+                    do_sample=do_sample,
+                    temperature=audio_temperature,
+                    top_k=audio_top_k,
+                    top_p=audio_top_p,
+                    previous_token_ids=(
+                        None
+                        if generated_audio_history is None
+                        else generated_audio_history[:, :, channel_index]
+                    ),
+                    repetition_penalty=audio_repetition_penalty,
+                )
+                next_frame_tokens.append(channel_token)
+                if channel_index + 1 < int(self.config.n_vq):
+                    current_local_input = self.audio_embeddings[channel_index](channel_token).to(dtype=local_dtype)
+                    local_token_hidden_states, local_prefix_past_key_values = (
+                        self._decode_local_hidden_states_with_cache(
+                            current_local_input.unsqueeze(1),
+                            past_key_values=local_prefix_past_key_values,
+                        )
+                    )
+                    local_hidden_states = local_token_hidden_states[:, -1, :]
+            next_frame = torch.stack(next_frame_tokens, dim=-1)
+            next_frame = next_frame.masked_fill(
+                ~should_continue.unsqueeze(-1),
+                int(self.config.audio_pad_token_id),
+            )
+            generated_frames.append(next_frame)
+            next_row = self._build_generation_row(
+                batch_size=batch_size,
+                device=input_ids.device,
+                audio_token_ids=next_frame,
+            )
+            if bool((~should_continue).any().item()):
+                next_row[~should_continue, 0, 0] = int(self.config.pad_token_id)
+                next_row[~should_continue, 0, 1:] = int(self.config.audio_pad_token_id)
+            current_input_ids = torch.cat([current_input_ids, next_row], dim=1)
+            current_attention_mask = torch.cat(
+                [current_attention_mask, should_continue.unsqueeze(1)],
+                dim=1,
+            )
+            if use_kv_cache:
+                current_model_input_ids = next_row
+                past_key_values = global_outputs.past_key_values
+            else:
+                current_model_input_ids = current_input_ids
+        start_indices = _find_last_equal(input_ids[..., 0], int(self.config.audio_start_token_id))
+        start_lengths = input_ids_length - start_indices - 1
+        outputs: list[tuple[int, torch.LongTensor]] = []
+        for start_index, start_length, generation_ids in zip(
+            start_indices.tolist(),
+            start_lengths.tolist(),
+            current_input_ids,
+        ):
+            outputs.append((int(start_length), generation_ids[int(start_index):].detach().cpu()))
+        return outputs

processing_moss_tts.py ADDED Viewed

	@@ -0,0 +1,899 @@

+# coding=utf-8
+"""Processor for the MOSS-TTS-Local-Transformer-v1.5 HuggingFace release."""
+from __future__ import annotations
+import os
+import re
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Sequence, Tuple, Union, cast
+import torch
+import torchaudio
+from transformers import (
+    AutoConfig,
+    AutoModel,
+    AutoTokenizer,
+    BatchFeature,
+    PreTrainedTokenizerBase,
+    ProcessorMixin,
+    logging,
+    processing_utils,
+)
+from .configuration_moss_tts import MossTTSLocalConfig
+# Register "PreTrainedModel" as a base-class name in transformers' processing_utils mappings.
+# The mapping attribute is probed with hasattr and a different mapping/key is patched when it is
+# absent (presumably to cover different transformers releases — TODO confirm which versions
+# need which branch).
+if hasattr(processing_utils, "MODALITY_TO_BASE_CLASS_MAPPING"):
+    processing_utils.MODALITY_TO_BASE_CLASS_MAPPING["audio_tokenizer"] = "PreTrainedModel"
+else:
+    processing_utils.AUTO_TO_BASE_CLASS_MAPPING["AutoModel"] = "PreTrainedModel"
+logger = logging.get_logger(__name__)
+# Marker inside message content that stands for exactly one audio item.
+AUDIO_PLACEHOLDER = "<|audio|>"
+# Prompt fragments that are tokenized piece by piece when the prompt is built directly
+# (see the _user_prompt_* / _assistant_prompt_* helpers of the processor).
+USER_ROLE_PREFIX = "user\n"
+USER_TEMPLATE_REFERENCE_PREFIX = (
+    "<user_inst>\n"
+    "- Reference(s):\n"
+)
+USER_TEMPLATE_AFTER_REFERENCE_SUFFIX = (
+    "\n"
+    "- Text:\n"
+)
+USER_TEMPLATE_SUFFIX = "\n</user_inst>"
+ASSISTANT_TURN_PREFIX = "\n"
+ASSISTANT_ROLE_PREFIX = "assistant\n"
+# Keys copied from a plain user dict into build_user_message(...) during message normalization.
+USER_MESSAGE_FIELDS = (
+    "text",
+    "reference",
+    "instruction",
+    "tokens",
+    "quality",
+    "sound_event",
+    "ambient_sound",
+    "language",
+)
+def _normalize_template_value(value: Any) -> str:
+    if value is None:
+        return "None"
+    resolved = str(value).strip()
+    return resolved or "None"
+def _render_user_prompt_after_reference(
+    language_code: object | None = None,
+    prompt_fields: Optional[Dict[str, Any]] = None,
+) -> str:
+    fields = dict(prompt_fields or {})
+    return (
+        "\n- Instruction:\n"
+        + _normalize_template_value(fields.get("instruction"))
+        + "\n- Tokens:\n"
+        + _normalize_template_value(fields.get("tokens"))
+        + "\n- Quality:\n"
+        + _normalize_template_value(fields.get("quality"))
+        + "\n- Sound Event:\n"
+        + _normalize_template_value(fields.get("sound_event"))
+        + "\n- Ambient Sound:\n"
+        + _normalize_template_value(fields.get("ambient_sound"))
+        + "\n- Language:\n"
+        + _normalize_template_value(fields.get("language", language_code))
+        + USER_TEMPLATE_AFTER_REFERENCE_SUFFIX
+    )
+@dataclass
+class Message:
+    def to_dict(self) -> Dict[str, Any]:
+        raise NotImplementedError
+@dataclass
+class UserMessage(Message):
+    text: Optional[str] = None
+    reference: Optional[List[Optional[Union[str, os.PathLike, torch.Tensor]]]] = None
+    instruction: Optional[str] = None
+    tokens: Optional[int] = None
+    quality: Optional[str] = None
+    sound_event: Optional[str] = None
+    ambient_sound: Optional[str] = None
+    language: Optional[str] = None
+    def __post_init__(self) -> None:
+        template = """<user_inst>
+- Reference(s):
+{reference}
+- Instruction:
+{instruction}
+- Tokens:
+{tokens}
+- Quality:
+{quality}
+- Sound Event:
+{sound_event}
+- Ambient Sound:
+{ambient_sound}
+- Language:
+{language}
+- Text:
+{text}
+</user_inst>"""
+        audio_codes_list: list[Union[str, os.PathLike, torch.Tensor]] = []
+        if self.reference is None:
+            reference = "None"
+        else:
+            reference_items: list[str] = []
+            for speaker_idx, speaker_reference in enumerate(self.reference):
+                if speaker_reference is None:
+                    continue
+                # Keep raw audio placeholders directly under "- Reference(s):".
+                # Speaker labels such as "[S1]:" change the token sequence and
+                # can affect voice-clone conditioning.
+                reference_items.append(AUDIO_PLACEHOLDER)
+                audio_codes_list.append(speaker_reference)
+            reference = "\n".join(reference_items) if reference_items else "None"
+        self._content = (
+            template.replace("{reference}", str(reference))
+            .replace("{instruction}", str(self.instruction))
+            .replace("{tokens}", str(self.tokens))
+            .replace("{quality}", str(self.quality))
+            .replace("{sound_event}", str(self.sound_event))
+            .replace("{ambient_sound}", str(self.ambient_sound))
+            .replace("{language}", str(self.language))
+            .replace("{text}", str(self.text))
+        )
+        self._audio_codes_list = audio_codes_list
+    def to_dict(self) -> Dict[str, Any]:
+        return {
+            "role": "user",
+            "content": self._content,
+            "audio_codes_list": self._audio_codes_list,
+            "text": self.text,
+            "instruction": self.instruction,
+            "tokens": self.tokens,
+            "quality": self.quality,
+            "sound_event": self.sound_event,
+            "ambient_sound": self.ambient_sound,
+            "language": self.language,
+        }
+@dataclass
+class AssistantMessage(Message):
+    audio_codes_list: List[Union[str, os.PathLike, torch.Tensor]]
+    content: str = AUDIO_PLACEHOLDER
+    def to_dict(self) -> Dict[str, Any]:
+        return {
+            "role": "assistant",
+            "content": self.content,
+            "audio_codes_list": self.audio_codes_list,
+        }
+class MossTTSLocalProcessor(ProcessorMixin):
+    """Processor that converts MOSS-TTS conversations into model input rows and decodes outputs.
+    Each input row has ``n_vq + 1`` columns: column 0 holds a text token id and the remaining
+    columns hold audio codes (see ``_build_text_rows`` / ``_build_audio_rows``).
+    """
+    # Only the text tokenizer is declared as a ProcessorMixin attribute; the audio tokenizer is
+    # loaded separately in from_pretrained.
+    attributes = ["tokenizer"]
+    tokenizer_class = "AutoTokenizer"
+    audio_tokenizer_class = "AutoModel"
+    tokenizer: PreTrainedTokenizerBase
+    audio_tokenizer: Any
+    def __init__(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        audio_tokenizer: Any = None,
+        model_config: Optional[MossTTSLocalConfig] = None,
+        **kwargs,
+    ) -> None:
+        """Store the tokenizers and config and cache the special audio token strings.
+        Args:
+            tokenizer: Text tokenizer used for prompt encoding and decoding.
+            audio_tokenizer: Audio codec model; may be None.
+            model_config: Model configuration; a default ``MossTTSLocalConfig()`` is used when
+                omitted or falsy.
+            **kwargs: Forwarded unchanged to ``ProcessorMixin.__init__``.
+        """
+        super().__init__(tokenizer=tokenizer, audio_tokenizer=audio_tokenizer, **kwargs)
+        # Assigned explicitly as well, independent of what ProcessorMixin.__init__ stores.
+        self.tokenizer = tokenizer
+        self.audio_tokenizer = audio_tokenizer
+        self.model_config = model_config or MossTTSLocalConfig()
+        def _id_to_token(token_id: int) -> str:
+            # convert_ids_to_tokens may return a list; take its first entry ("" when empty).
+            token = tokenizer.convert_ids_to_tokens(int(token_id))
+            if isinstance(token, list):
+                return token[0] if token else ""
+            return cast(str, token)
+        # String forms of the special audio tokens, looked up from the ids in the model config.
+        self.audio_user_slot_token = _id_to_token(self.model_config.audio_user_slot_token_id)
+        self.audio_assistant_slot_token = _id_to_token(self.model_config.audio_assistant_slot_token_id)
+        self.audio_start_token = _id_to_token(self.model_config.audio_start_token_id)
+        self.audio_end_token = _id_to_token(self.model_config.audio_end_token_id)
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
+        """Load the model config, text tokenizer and audio tokenizer and build a processor.
+        The audio tokenizer location is resolved in this order:
+            1. the ``codec_path`` keyword argument;
+            2. ``audio_tokenizer_name_or_path`` from the processor dict, where a value nested
+               under an ``"audio_tokenizer"`` dict takes precedence over the top-level one;
+            3. ``model_config.audio_tokenizer_name_or_path``;
+            4. the default ``"OpenMOSS-Team/MOSS-Audio-Tokenizer-v2"``.
+        Args:
+            pretrained_model_name_or_path: Local directory or model identifier.
+            *args: Forwarded to ``AutoConfig.from_pretrained`` and ``AutoTokenizer.from_pretrained``.
+            **kwargs: ``trust_remote_code`` (default True), ``_from_auto`` (discarded) and
+                ``codec_path`` are consumed here; the rest is forwarded to every loader and to
+                the constructor.
+        Returns:
+            A new processor instance.
+        """
+        trust_remote_code = kwargs.pop("trust_remote_code", True)
+        kwargs.pop("_from_auto", None)
+        codec_path = kwargs.pop("codec_path", None)
+        # Use a Path object when the reference exists locally, otherwise keep the original value.
+        model_ref = Path(str(pretrained_model_name_or_path))
+        model_ref_or_name = model_ref if model_ref.exists() else pretrained_model_name_or_path
+        model_config = cast(
+            MossTTSLocalConfig,
+            AutoConfig.from_pretrained(
+                model_ref_or_name,
+                *args,
+                trust_remote_code=trust_remote_code,
+                **kwargs,
+            ),
+        )
+        if codec_path is None:
+            # Best effort: any failure while reading the processor dict falls through to the
+            # later fallbacks. NOTE(review): the exception is swallowed without logging.
+            try:
+                processor_dict, _ = cls.get_processor_dict(
+                    pretrained_model_name_or_path,
+                    **dict(kwargs),
+                )
+                codec_path = processor_dict.get("audio_tokenizer_name_or_path")
+                audio_tokenizer_dict = processor_dict.get("audio_tokenizer", {})
+                if isinstance(audio_tokenizer_dict, dict):
+                    codec_path = audio_tokenizer_dict.get("audio_tokenizer_name_or_path") or codec_path
+            except Exception:
+                codec_path = None
+        if codec_path is None:
+            codec_path = getattr(model_config, "audio_tokenizer_name_or_path", None)
+        if codec_path is None:
+            codec_path = "OpenMOSS-Team/MOSS-Audio-Tokenizer-v2"
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_ref_or_name,
+            *args,
+            trust_remote_code=trust_remote_code,
+            **kwargs,
+        )
+        # NOTE(review): the same kwargs are applied to the codec checkpoint; repo-specific
+        # options meant for the main model would also reach this call — confirm that is intended.
+        audio_tokenizer = AutoModel.from_pretrained(
+            codec_path,
+            trust_remote_code=trust_remote_code,
+            **kwargs,
+        )
+        # NOTE(review): the remaining kwargs are also passed to the constructor and from there
+        # to ProcessorMixin.__init__ — verify it accepts loader-style keyword arguments.
+        return cls(
+            tokenizer=tokenizer,
+            audio_tokenizer=audio_tokenizer,
+            model_config=model_config,
+            **kwargs,
+        )
+    @staticmethod
+    def build_user_message(
+        text: Optional[str] = None,
+        reference: Optional[List[Optional[Union[str, os.PathLike, torch.Tensor]]]] = None,
+        instruction: Optional[str] = None,
+        tokens: Optional[int] = None,
+        quality: Optional[str] = None,
+        sound_event: Optional[str] = None,
+        ambient_sound: Optional[str] = None,
+        language: Optional[str] = None,
+    ) -> Dict[str, Any]:
+        if reference is not None and not isinstance(reference, list):
+            reference = [reference]
+        return UserMessage(
+            text=text,
+            reference=reference,
+            instruction=instruction,
+            tokens=tokens,
+            quality=quality,
+            sound_event=sound_event,
+            ambient_sound=ambient_sound,
+            language=language,
+        ).to_dict()
+    @staticmethod
+    def build_assistant_message(
+        audio_codes_list: List[Union[str, os.PathLike, torch.Tensor]],
+        content: str = AUDIO_PLACEHOLDER,
+    ) -> Dict[str, Any]:
+        return AssistantMessage(audio_codes_list=audio_codes_list, content=content).to_dict()
+    def _assert_fixed_nq(self, n_vq: Optional[int]) -> int:
+        config_nq = int(self.model_config.n_vq)
+        if n_vq is not None and int(n_vq) != config_nq:
+            raise ValueError(
+                "This MOSS-TTS-Local-Transformer-v1.5 release uses the RVQ depth stored in the model config. "
+                f"Expected n_vq={config_nq}, got {int(n_vq)}."
+            )
+        return config_nq
+    def _encode_text(self, text: str) -> list[int]:
+        try:
+            return list(self.tokenizer.encode(text, add_special_tokens=False))
+        except TypeError:
+            return list(self.tokenizer.encode(text))
+    def _build_text_rows(self, token_ids: Sequence[int], *, device: Optional[torch.device] = None) -> torch.Tensor:
+        rows = torch.full(
+            (len(token_ids), int(self.model_config.n_vq) + 1),
+            int(self.model_config.audio_pad_token_id),
+            dtype=torch.long,
+            device=device,
+        )
+        if token_ids:
+            rows[:, 0] = torch.tensor([int(token_id) for token_id in token_ids], dtype=torch.long, device=rows.device)
+        return rows
+    def _build_audio_rows(self, audio_tokens: torch.Tensor, slot_token_id: int) -> torch.Tensor:
+        rows = torch.full(
+            (int(audio_tokens.shape[0]), int(self.model_config.n_vq) + 1),
+            int(self.model_config.audio_pad_token_id),
+            dtype=torch.long,
+            device=audio_tokens.device,
+        )
+        if rows.shape[0] > 0:
+            rows[:, 0] = int(slot_token_id)
+            rows[:, 1:] = audio_tokens.to(dtype=torch.long)
+        return rows
+    def _user_prompt_prefix_ids(self) -> list[int]:
+        return (
+            [int(self.model_config.im_start_token_id)]
+            + self._encode_text(USER_ROLE_PREFIX)
+            + self._encode_text(USER_TEMPLATE_REFERENCE_PREFIX)
+        )
+    def _user_prompt_after_reference_ids(
+        self,
+        language_code: object | None,
+        prompt_fields: Optional[Dict[str, Any]],
+    ) -> list[int]:
+        return self._encode_text(
+            _render_user_prompt_after_reference(
+                language_code=language_code,
+                prompt_fields=prompt_fields,
+            )
+        )
+    def _assistant_prompt_prefix_ids(self) -> list[int]:
+        return (
+            self._encode_text(USER_TEMPLATE_SUFFIX)
+            + [int(self.model_config.im_end_token_id)]
+            + self._encode_text(ASSISTANT_TURN_PREFIX)
+            + [int(self.model_config.im_start_token_id)]
+            + self._encode_text(ASSISTANT_ROLE_PREFIX)
+        )
+    def _prompt_fields_from_user_message(self, message: Dict[str, Any]) -> dict[str, Any]:
+        fields = {}
+        for key in ("instruction", "tokens", "quality", "sound_event", "ambient_sound"):
+            if key in message and message.get(key) is not None:
+                fields[key] = message.get(key)
+        if "language" in message and message.get("language") is not None:
+            fields["language"] = message.get("language")
+        return fields
+    def _build_generation_or_voice_clone_codes(
+        self,
+        message: Dict[str, Any],
+        n_vq: int,
+    ) -> torch.Tensor:
+        if "text" not in message:
+            raise ValueError("Direct MOSS-TTS-Local-Transformer-v1.5 generation requires messages built by build_user_message(...).")
+        text = "" if message.get("text") is None else str(message.get("text"))
+        prompt_fields = self._prompt_fields_from_user_message(message)
+        language_code = message.get("language")
+        audio_codes_list = self._resolve_audio_items(message.get("audio_codes_list", []), n_vq)
+        text_token_ids = self._encode_text(text)
+        if audio_codes_list:
+            parts: list[torch.Tensor] = [self._build_text_rows(
+                self._user_prompt_prefix_ids(),
+                device=audio_codes_list[0].device,
+            )]
+            for reference_codes in audio_codes_list:
+                parts.append(self._build_text_rows([int(self.model_config.audio_start_token_id)], device=reference_codes.device))
+                parts.append(self._build_audio_rows(reference_codes, int(self.model_config.audio_user_slot_token_id)))
+                parts.append(self._build_text_rows([int(self.model_config.audio_end_token_id)], device=reference_codes.device))
+            parts.append(
+                self._build_text_rows(
+                    self._user_prompt_after_reference_ids(language_code, prompt_fields)
+                    + text_token_ids
+                    + self._assistant_prompt_prefix_ids()
+                    + [int(self.model_config.audio_start_token_id)],
+                    device=audio_codes_list[0].device,
+                )
+            )
+            return torch.cat(parts, dim=0)
+        prompt_token_ids = (
+            self._user_prompt_prefix_ids()
+            + self._encode_text("None")
+            + self._user_prompt_after_reference_ids(language_code, prompt_fields)
+            + text_token_ids
+            + self._assistant_prompt_prefix_ids()
+            + [int(self.model_config.audio_start_token_id)]
+        )
+        return self._build_text_rows(prompt_token_ids)
+    def _build_continuation_codes(
+        self,
+        conversation: list[Dict[str, Any]],
+        n_vq: int,
+    ) -> torch.Tensor:
+        if len(conversation) < 2:
+            raise ValueError("continuation mode requires a user message followed by an assistant audio message.")
+        user_message = conversation[-2]
+        assistant_message = conversation[-1]
+        if user_message.get("role") != "user" or assistant_message.get("role") != "assistant":
+            raise ValueError("continuation mode requires the last two messages to be user, assistant.")
+        if "text" not in user_message:
+            raise ValueError("Direct MOSS-TTS-Local-Transformer-v1.5 continuation requires user messages built by build_user_message(...).")
+        text = "" if user_message.get("text") is None else str(user_message.get("text"))
+        prompt_fields = self._prompt_fields_from_user_message(user_message)
+        language_code = user_message.get("language")
+        prompt_token_ids = (
+            self._user_prompt_prefix_ids()
+            + self._encode_text("None")
+            + self._user_prompt_after_reference_ids(language_code, prompt_fields)
+            + self._encode_text(text)
+            + self._assistant_prompt_prefix_ids()
+            + [int(self.model_config.audio_start_token_id)]
+        )
+        audio_codes_list = self._resolve_audio_items(assistant_message.get("audio_codes_list", []), n_vq)
+        if not audio_codes_list:
+            return self._build_text_rows(prompt_token_ids)
+        if len(audio_codes_list) != 1:
+            raise ValueError("The MOSS-TTS-Local-Transformer-v1.5 continuation path expects exactly one prompt audio item.")
+        prompt_audio_codes = audio_codes_list[0]
+        return torch.cat(
+            [
+                self._build_text_rows(prompt_token_ids, device=prompt_audio_codes.device),
+                self._build_audio_rows(prompt_audio_codes, int(self.model_config.audio_assistant_slot_token_id)),
+            ],
+            dim=0,
+        )
+    def _try_build_direct_codes(
+        self,
+        conversation: list[Dict[str, Any]],
+        mode: str,
+        n_vq: int,
+    ) -> Optional[torch.Tensor]:
+        if mode == "generation" and len(conversation) == 1 and conversation[-1].get("role") == "user":
+            if "text" in conversation[-1]:
+                return self._build_generation_or_voice_clone_codes(conversation[-1], n_vq)
+            return None
+        if mode == "continuation" and len(conversation) >= 2:
+            if "text" in conversation[-2]:
+                return self._build_continuation_codes(conversation, n_vq)
+            return None
+        return None
+    def __call__(self, *args, **kwargs) -> BatchFeature:
+        conversations = args[0] if args else kwargs.pop("conversations")
+        mode: str = kwargs.pop("mode", "generation")
+        apply_chat_template: bool = kwargs.pop("apply_chat_template", True)
+        n_vq = self._assert_fixed_nq(kwargs.pop("n_vq", None))
+        kwargs.pop("return_tensors", None)
+        kwargs.pop("padding", None)
+        kwargs.pop("truncation", None)
+        if mode not in {"generation", "continuation", "computing_loss"}:
+            raise ValueError(f"Unsupported mode: {mode}")
+        if isinstance(conversations, (Message, dict)):
+            conversations = [conversations]
+        elif isinstance(conversations, list) and conversations and all(
+            isinstance(item, (Message, dict)) for item in conversations
+        ):
+            conversations = [conversations]
+        input_ids_list: list[torch.Tensor] = []
+        for conversation in conversations:
+            if isinstance(conversation, (Message, dict)):
+                conversation = [conversation]
+            conversation = [self._normalize_message(message) for message in conversation]
+            if (mode == "generation") ^ (conversation[-1]["role"] == "user"):
+                raise ValueError("generation mode must end with a user message.")
+            if mode == "continuation" and conversation[-1]["role"] != "assistant":
+                raise ValueError("continuation mode must end with an assistant message.")
+            direct_codes = self._try_build_direct_codes(conversation, mode, n_vq)
+            if direct_codes is not None:
+                input_ids_list.append(direct_codes)
+                continue
+            unified_parts = []
+            for message_idx, message in enumerate(conversation):
+                content = str(message["content"])
+                if apply_chat_template:
+                    add_generation_prompt = mode == "generation" and message_idx == len(conversation) - 1
+                    try:
+                        content = self.tokenizer.apply_chat_template(
+                            [{"role": message["role"], "content": content}],
+                            add_generation_prompt=add_generation_prompt,
+                            tokenize=False,
+                        )
+                    except Exception:
+                        logger.warning("apply_chat_template failed; falling back to raw message content.")
+                raw_audio_items = message.get("audio_codes_list", [])
+                audio_codes_list = self._resolve_audio_items(raw_audio_items, n_vq)
+                unified_parts.append(
+                    self._get_unified_codes(
+                        role=message["role"],
+                        content=content,
+                        audio_codes_list=audio_codes_list,
+                        truncation=(mode == "continuation"),
+                    )
+                )
+            unified_codes = torch.cat(unified_parts, dim=0)
+            if mode == "generation":
+                audio_start_row = torch.full(
+                    (1, n_vq + 1),
+                    int(self.model_config.audio_pad_token_id),
+                    dtype=unified_codes.dtype,
+                    device=unified_codes.device,
+                )
+                audio_start_row[:, 0] = int(self.model_config.audio_start_token_id)
+                unified_codes = torch.cat([unified_codes, audio_start_row], dim=0)
+            input_ids_list.append(unified_codes)
+        return BatchFeature(data=self._pad(input_ids_list))
+    def _normalize_message(self, message: Union[Message, Dict[str, Any]]) -> Dict[str, Any]:
+        if isinstance(message, Message):
+            return message.to_dict()
+        if not isinstance(message, dict):
+            raise TypeError("Each message must be a Message or dict.")
+        if "content" in message and "audio_codes_list" in message:
+            return message
+        role = message.get("role")
+        if role == "user":
+            return self.build_user_message(**{key: message.get(key) for key in USER_MESSAGE_FIELDS})
+        if role == "assistant":
+            return self.build_assistant_message(
+                audio_codes_list=message.get("audio_codes_list", []),
+                content=message.get("content", AUDIO_PLACEHOLDER),
+            )
+        raise ValueError(f"Unsupported role: {role}")
+    def _resolve_audio_items(
+        self,
+        raw_audio_items: list[Any],
+        n_vq: int,
+    ) -> list[torch.Tensor]:
+        if not raw_audio_items:
+            return []
+        resolved: list[Optional[torch.Tensor]] = [None] * len(raw_audio_items)
+        paths: list[str] = []
+        path_positions: list[int] = []
+        for index, item in enumerate(raw_audio_items):
+            if isinstance(item, torch.Tensor):
+                if item.ndim != 2 or int(item.shape[1]) != n_vq:
+                    raise ValueError(f"audio code tensor must have shape [T, {n_vq}], got {tuple(item.shape)}.")
+                resolved[index] = item.to(dtype=torch.long).cpu()
+            elif isinstance(item, (str, os.PathLike)):
+                paths.append(str(item))
+                path_positions.append(index)
+            else:
+                raise TypeError("Audio items must be tensors or path-like values.")
+        if paths:
+            encoded = self.encode_audios_from_path(paths, n_vq=n_vq)
+            for position, codes in zip(path_positions, encoded):
+                resolved[position] = codes
+        return [cast(torch.Tensor, item) for item in resolved]
+    def _pad(self, input_ids_list: list[torch.Tensor]) -> Dict[str, torch.Tensor]:
+        device = input_ids_list[0].device
+        lengths = torch.tensor([item.shape[0] for item in input_ids_list], device=device)
+        padded = torch.nn.utils.rnn.pad_sequence(
+            input_ids_list,
+            batch_first=True,
+            padding_value=int(self.model_config.audio_pad_token_id),
+            padding_side="left",
+        )
+        left_pad_mask = (padded.shape[1] - lengths).unsqueeze(1) > torch.arange(
+            padded.shape[1],
+            device=device,
+        ).unsqueeze(0)
+        padded[..., 0][left_pad_mask] = int(self.model_config.pad_token_id)
+        attention_mask = torch.zeros(padded.shape[:2], dtype=torch.bool, device=device)
+        attention_mask[~left_pad_mask] = True
+        return {"input_ids": padded, "attention_mask": attention_mask}
+    @staticmethod
+    def _replace_audio_placeholders(
+        content: str,
+        lengths: list[int],
+        slot_token: str,
+        audio_start_token: str,
+        audio_end_token: str,
+    ) -> str:
+        placeholder_count = content.count(AUDIO_PLACEHOLDER)
+        if placeholder_count != len(lengths):
+            raise ValueError(
+                f"Number of {AUDIO_PLACEHOLDER} ({placeholder_count}) does not match "
+                f"audio item count ({len(lengths)})."
+            )
+        lengths_iter = iter(lengths)
+        def replacer(_: re.Match) -> str:
+            length = int(next(lengths_iter))
+            if length <= 0:
+                return f"{audio_start_token}{audio_end_token}"
+            return f"{audio_start_token}{slot_token * length}{audio_end_token}"
+        return re.sub(re.escape(AUDIO_PLACEHOLDER), replacer, content)
+    def _get_unified_codes(
+        self,
+        role: str,
+        content: str,
+        audio_codes_list: list[torch.Tensor],
+        truncation: bool,
+    ) -> torch.Tensor:
+        """Tokenize ``content`` and align the audio codes with the audio slot tokens.
+        Args:
+            role: ``"user"`` selects the user slot token; any other value the assistant one.
+            content: Message text with one ``AUDIO_PLACEHOLDER`` per audio item.
+            audio_codes_list: One code tensor per placeholder, in order.
+            truncation: When True, no padding rows are added after the last audio item.
+        Returns:
+            A tensor with the text token ids in column 0 and ``n_vq`` audio columns.
+        Raises:
+            ValueError: If the placeholders or the encoded start/end tokens do not match the
+                number of audio items.
+        """
+        n_vq = int(self.model_config.n_vq)
+        slot_token = self.audio_user_slot_token if role == "user" else self.audio_assistant_slot_token
+        content = self._replace_audio_placeholders(
+            content=content,
+            lengths=[int(codes.shape[0]) for codes in audio_codes_list],
+            slot_token=slot_token,
+            audio_start_token=self.audio_start_token,
+            audio_end_token=self.audio_end_token,
+        )
+        text_codes = torch.tensor(
+            self.tokenizer.encode(content),
+            dtype=torch.long,
+            device=audio_codes_list[0].device if audio_codes_list else None,
+        )
+        audio_start_indices = torch.where(text_codes == int(self.model_config.audio_start_token_id))[0]
+        audio_end_indices = torch.where(text_codes == int(self.model_config.audio_end_token_id))[0]
+        if len(audio_start_indices) != len(audio_codes_list) or len(audio_end_indices) != len(audio_codes_list):
+            raise ValueError("Audio placeholders do not match the encoded audio spans.")
+        if not audio_codes_list:
+            # Text-only message: every audio column is padding.
+            audio_codes = torch.full(
+                (len(text_codes), n_vq),
+                int(self.model_config.audio_pad_token_id),
+                dtype=torch.long,
+                device=text_codes.device,
+            )
+        else:
+            pieces: list[torch.Tensor] = []
+            prefix_idx = 0
+            for start_t, end_t, codes in zip(audio_start_indices, audio_end_indices, audio_codes_list):
+                start_idx = int(start_t.item())
+                end_idx = int(end_t.item())
+                # Padding rows for the text positions prefix_idx..start_idx (the start token
+                # included); the codes then cover the positions that follow it.
+                pad_before = torch.full(
+                    (start_idx - prefix_idx + 1, n_vq),
+                    int(self.model_config.audio_pad_token_id),
+                    dtype=torch.long,
+                    device=codes.device,
+                )
+                pieces.extend([pad_before, codes.to(dtype=torch.long)])
+                # The next padding run starts at this item's end token.
+                prefix_idx = end_idx
+            if truncation:
+                # No trailing rows: the audio columns stop after the last code row, and the
+                # length reconciliation below shortens the text column to match.
+                trailing = torch.zeros(
+                    (0, n_vq),
+                    dtype=torch.long,
+                    device=audio_codes_list[0].device,
+                )
+            else:
+                last_end = int(audio_end_indices[-1].item())
+                # Padding rows from the last end token through the end of the text.
+                trailing = torch.full(
+                    (len(text_codes) - last_end, n_vq),
+                    int(self.model_config.audio_pad_token_id),
+                    dtype=torch.long,
+                    device=audio_codes_list[0].device,
+                )
+            pieces.append(trailing)
+            audio_codes = torch.cat(pieces, dim=0)
+        # Cut both columns to the shorter length.
+        # NOTE(review): this also hides a mismatch between slot-token count and code rows
+        # (e.g. if a slot token does not encode to exactly one id) — confirm that is acceptable.
+        if text_codes.shape[0] != audio_codes.shape[0]:
+            min_len = min(text_codes.shape[0], audio_codes.shape[0])
+            text_codes = text_codes[:min_len]
+            audio_codes = audio_codes[:min_len]
+        return torch.cat([text_codes.unsqueeze(1), audio_codes], dim=1)
+    def _parse_text_codes(self, start_length: int, text_codes: torch.LongTensor) -> str:
+        text = cast(str, self.tokenizer.decode(text_codes))
+        prefix = cast(str, self.tokenizer.decode(text_codes[:start_length]))
+        text = text[len(prefix):]
+        audio_pattern = re.compile(
+            rf"(?:{re.escape(self.audio_start_token)})?"
+            rf"(?:{re.escape(self.audio_assistant_slot_token)})*"
+            rf"{re.escape(self.audio_end_token)}"
+        )
+        return audio_pattern.sub(
+            lambda match: AUDIO_PLACEHOLDER if self.audio_assistant_slot_token in match.group(0) else "",
+            text,
+        )
+    def _parse_audio_codes(
+        self,
+        start_length: int,
+        audio_codes: torch.LongTensor,
+        *,
+        return_stereo: bool = True,
+    ) -> list[torch.Tensor]:
+        """Decode the runs of non-padding audio rows and trim the prompt part of the first run.
+        Args:
+            start_length: Number of leading frames to remove from the first decoded segment
+                (presumably the prompt audio length of a continuation — TODO confirm).
+            audio_codes: Audio columns of one generated sequence, one row per position.
+            return_stereo: Forwarded to ``decode_audio_codes``.
+        Returns:
+            The decoded segments in order; an empty list when every row is padding.
+        """
+        # A row counts as padding only if all of its columns equal the audio pad id.
+        is_pad = audio_codes.eq(int(self.model_config.audio_pad_token_id)).all(dim=1)
+        non_pad = ~is_pad
+        if not bool(non_pad.any().item()):
+            return []
+        idx = torch.nonzero(non_pad).squeeze(1)
+        # Positions where consecutive non-pad indices are not adjacent mark segment boundaries.
+        breaks = torch.where(idx[1:] != idx[:-1] + 1)[0] + 1
+        segment_indices = [idx] if breaks.numel() == 0 else list(torch.tensor_split(idx, breaks.cpu().tolist()))
+        code_segments = [audio_codes[segment] for segment in segment_indices]
+        decoded = self.decode_audio_codes(code_segments, return_stereo=return_stereo)
+        if start_length > 0 and code_segments and decoded:
+            first_code_length = int(code_segments[0].shape[0])
+            if first_code_length > 0:
+                # Fraction of the first segment to drop, clamped to [0, 1].
+                trim_ratio = max(0.0, min(float(start_length) / float(first_code_length), 1.0))
+                if trim_ratio >= 1.0:
+                    # The whole first segment is covered by start_length: drop it.
+                    decoded = decoded[1:]
+                elif trim_ratio > 0.0:
+                    # Remove the same fraction of samples from the front of the waveform.
+                    trim_samples = int(decoded[0].shape[-1] * trim_ratio)
+                    decoded[0] = decoded[0][..., trim_samples:]
+        return decoded
+    def decode(self, output: Any, *, return_stereo: bool = True) -> list[Optional[AssistantMessage]]:
+        generated_messages: list[Optional[AssistantMessage]] = []
+        for start_length, generation_ids in output:
+            content = self._parse_text_codes(int(start_length), generation_ids[:, 0])
+            audio_codes_list = self._parse_audio_codes(
+                int(start_length),
+                generation_ids[:, 1:],
+                return_stereo=return_stereo,
+            )
+            if content == "":
+                generated_messages.append(None)
+            else:
+                generated_messages.append(
+                    AssistantMessage(
+                        content=content,
+                        audio_codes_list=cast(list[Union[str, torch.Tensor]], audio_codes_list),
+                    )
+                )
+        return generated_messages
+    @staticmethod
+    def loudness_normalize(
+        wav: torch.Tensor,
+        target_dbfs: float = -20.0,
+        gain_range: tuple[float, float] = (-3.0, 3.0),
+    ) -> torch.Tensor:
+        wav = wav.to(torch.float32)
+        if wav.numel() == 0:
+            return wav
+        current_dbfs = 10.0 * torch.log10(torch.mean(wav**2) + 1e-9)
+        gain = max(gain_range[0], min(float(target_dbfs - current_dbfs), gain_range[1]))
+        return wav * (10.0 ** (gain / 20.0))
+    def _get_audio_tokenizer_device(self) -> torch.device:
+        audio_tokenizer = getattr(self, "audio_tokenizer", None)
+        if audio_tokenizer is None:
+            raise RuntimeError("audio_tokenizer is not set.")
+        try:
+            return next(audio_tokenizer.parameters()).device
+        except StopIteration:
+            return torch.device("cpu")
+    def encode_audios_from_wav(
+        self,
+        wav_list: Union[torch.Tensor, list[torch.Tensor]],
+        sampling_rate: int,
+        n_vq: Optional[int] = None,
+    ) -> list[torch.Tensor]:
+        n_vq = self._assert_fixed_nq(n_vq)
+        if self.audio_tokenizer is None:
+            raise RuntimeError("audio_tokenizer is not set.")
+        if isinstance(wav_list, torch.Tensor):
+            wav_list = [wav_list]
+        target_sr = int(self.model_config.sampling_rate)
+        device = self._get_audio_tokenizer_device()
+        prepared = []
+        for wav in wav_list:
+            if wav.ndim == 1:
+                wav = wav.unsqueeze(0)
+            if wav.shape[0] == 1:
+                wav = wav.repeat(2, 1)
+            elif wav.shape[0] > 2:
+                wav = wav[:2]
+            if int(sampling_rate) != target_sr:
+                wav = torchaudio.functional.resample(wav, int(sampling_rate), target_sr)
+            prepared.append(self.loudness_normalize(wav).to(device))
+        if hasattr(self.audio_tokenizer, "batch_encode"):
+            encoded = self.audio_tokenizer.batch_encode(prepared, num_quantizers=n_vq)
+            audio_codes = encoded.audio_codes
+            audio_lengths = encoded.audio_codes_lengths
+        else:
+            max_len = max(int(wav.shape[-1]) for wav in prepared)
+            input_values = torch.zeros(len(prepared), 1, max_len, dtype=torch.float32, device=device)
+            padding_mask = torch.zeros(len(prepared), max_len, dtype=torch.bool, device=device)
+            for index, wav in enumerate(prepared):
+                input_values[index, 0, : wav.shape[-1]] = wav
+                padding_mask[index, : wav.shape[-1]] = True
+            encoded = self.audio_tokenizer.encode(
+                input_values,
+                padding_mask=padding_mask,
+                num_quantizers=n_vq,
+                return_dict=True,
+            )
+            audio_codes = encoded.audio_codes
+            audio_lengths = encoded.audio_codes_lengths
+        if audio_codes is None or audio_lengths is None:
+            raise RuntimeError("audio_tokenizer did not return audio_codes/audio_codes_lengths.")
+        result = []
+        for index in range(int(audio_codes.shape[1])):
+            length = int(audio_lengths[index].item())
+            result.append(audio_codes[:, index, :length].transpose(0, 1).contiguous().cpu().long())
+        return result
+    def encode_audios_from_path(
+        self,
+        wav_path_list: Union[str, os.PathLike, list[Union[str, os.PathLike]]],
+        n_vq: Optional[int] = None,
+    ) -> list[torch.Tensor]:
+        if isinstance(wav_path_list, (str, os.PathLike)):
+            wav_path_list = [wav_path_list]
+        wavs = []
+        target_sr = int(self.model_config.sampling_rate)
+        for wav_path in wav_path_list:
+            wav, sr = torchaudio.load(str(wav_path))
+            if int(sr) != target_sr:
+                wav = torchaudio.functional.resample(wav, int(sr), target_sr)
+            wavs.append(wav)
+        return self.encode_audios_from_wav(wavs, target_sr, n_vq=n_vq)
+    def decode_audio_codes(
+        self,
+        audio_tokens_list: Union[torch.Tensor, list[torch.Tensor]],
+        *,
+        return_stereo: bool = True,
+    ) -> list[torch.Tensor]:
+        if self.audio_tokenizer is None:
+            raise RuntimeError("audio_tokenizer is not set.")
+        if isinstance(audio_tokens_list, torch.Tensor):
+            audio_tokens_list = [audio_tokens_list]
+        if not audio_tokens_list:
+            return []
+        n_vq = int(self.model_config.n_vq)
+        device = self._get_audio_tokenizer_device()
+        codes_list = [
+            codes[:, :n_vq].transpose(0, 1).contiguous().to(device=device, dtype=torch.long)
+            for codes in audio_tokens_list
+        ]
+        max_len = max(int(codes.shape[1]) for codes in codes_list)
+        audio_codes = torch.zeros(n_vq, len(codes_list), max_len, device=device, dtype=torch.long)
+        padding_mask = torch.zeros(len(codes_list), max_len, device=device, dtype=torch.bool)
+        for index, codes in enumerate(codes_list):
+            length = int(codes.shape[1])
+            audio_codes[:, index, :length] = codes
+            padding_mask[index, :length] = True
+        decoded = self.audio_tokenizer.decode(
+            audio_codes,
+            padding_mask=padding_mask,
+            num_quantizers=n_vq,
+            return_dict=True,
+            chunk_duration=8,
+        )
+        audio = decoded.audio
+        audio_lengths = decoded.audio_lengths
+        if audio is None or audio_lengths is None:
+            raise RuntimeError("audio_tokenizer.decode did not return audio/audio_lengths.")
+        wavs = []
+        for index in range(int(audio.shape[0])):
+            length = int(audio_lengths[index].item())
+            wav = audio[index, :, :length].contiguous().cpu().to(torch.float32)
+            if not return_stereo:
+                if wav.shape[0] == 1:
+                    wav = wav.squeeze(0)
+                else:
+                    wav = wav.mean(dim=0)
+            wavs.append(wav)
+        return wavs

processor_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "processor_class": "MossTTSLocalProcessor",
+  "audio_tokenizer_name_or_path": "OpenMOSS-Team/MOSS-Audio-Tokenizer-v2",
+  "auto_map": {
+    "AutoProcessor": "processing_moss_tts.MossTTSLocalProcessor"
+  }
+}

qwen3_decoder.py ADDED Viewed

	@@ -0,0 +1,582 @@

+# coding=utf-8
+from __future__ import annotations
+import json
+from pathlib import Path
+from typing import Optional
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint
+from safetensors.torch import load_file
+from transformers.activations import ACT2FN
+from transformers.modeling_outputs import BaseModelOutputWithPast
+from .gpt2_decoder import PackedSequenceMetadata, MossTTSNanoGPT2Model
+# flash_attn is treated as an optional dependency. If importing it fails for
+# any reason, the imported names are replaced by None and
+# _FLASH_ATTN_AVAILABLE is set to False so the flash-attention code path can
+# report a clear error instead of failing at import time.
+try:
+    from flash_attn import flash_attn_func, flash_attn_varlen_func
+    from flash_attn.bert_padding import pad_input, unpad_input
+    _FLASH_ATTN_AVAILABLE = True
+except Exception:
+    flash_attn_func = None
+    flash_attn_varlen_func = None
+    pad_input = None
+    unpad_input = None
+    _FLASH_ATTN_AVAILABLE = False
+class MossQwen3RMSNorm(nn.Module):
+    def __init__(self, hidden_size: int, eps: float = 1e-6) -> None:
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+class MossQwen3RotaryEmbedding(nn.Module):
+    def __init__(self, config) -> None:
+        super().__init__()
+        head_dim = int(getattr(config, "head_dim", config.hidden_size // config.num_attention_heads))
+        rope_theta = getattr(config, "rope_theta", None)
+        if rope_theta is None:
+            rope_scaling = getattr(config, "rope_scaling", None)
+            if isinstance(rope_scaling, dict):
+                rope_theta = rope_scaling.get("rope_theta")
+        if rope_theta is None:
+            rope_theta = 1000000.0
+        rope_theta = float(rope_theta)
+        self.head_dim = head_dim
+        self.rope_theta = rope_theta
+        self.register_buffer("inv_freq", self._compute_inv_freq(), persistent=False)
+    def _compute_inv_freq(self, device: Optional[torch.device] = None) -> torch.Tensor:
+        return 1.0 / (
+            self.rope_theta ** (torch.arange(0, self.head_dim, 2, device=device, dtype=torch.float32) / self.head_dim)
+        )
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_ids: torch.LongTensor,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        inv_freq = self._compute_inv_freq(device=hidden_states.device)
+        freqs = torch.einsum(
+            "bs,d->bsd",
+            position_ids.to(device=hidden_states.device, dtype=inv_freq.dtype),
+            inv_freq,
+        )
+        emb = torch.cat((freqs, freqs), dim=-1)
+        return emb.cos().to(dtype=hidden_states.dtype), emb.sin().to(dtype=hidden_states.dtype)
+def rotate_half(hidden_states: torch.Tensor) -> torch.Tensor:
+    first_half = hidden_states[..., : hidden_states.shape[-1] // 2]
+    second_half = hidden_states[..., hidden_states.shape[-1] // 2 :]
+    return torch.cat((-second_half, first_half), dim=-1)
+def apply_rotary_pos_emb(
+    query: torch.Tensor,
+    key: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    cos = cos.unsqueeze(-2)
+    sin = sin.unsqueeze(-2)
+    query = (query * cos) + (rotate_half(query) * sin)
+    key = (key * cos) + (rotate_half(key) * sin)
+    return query, key
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    if n_rep == 1:
+        return hidden_states
+    batch, seq_len, num_key_value_heads, head_dim = hidden_states.shape
+    hidden_states = hidden_states[:, :, :, None, :].expand(batch, seq_len, num_key_value_heads, n_rep, head_dim)
+    return hidden_states.reshape(batch, seq_len, num_key_value_heads * n_rep, head_dim)
+class MossQwen3MLP(nn.Module):
+    def __init__(self, config) -> None:
+        super().__init__()
+        self.gate_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False)
+        self.act_fn = ACT2FN[getattr(config, "hidden_act", "silu")]
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        return self.down_proj(self.act_fn(self.gate_proj(hidden_states)) * self.up_proj(hidden_states))
+class MossQwen3Attention(nn.Module):
+    def __init__(self, config, layer_idx: int) -> None:
+        super().__init__()
+        self.config = config
+        self.layer_idx = int(layer_idx)
+        self.hidden_size = int(config.hidden_size)
+        self.num_heads = int(config.num_attention_heads)
+        self.num_key_value_heads = int(config.num_key_value_heads)
+        self.head_dim = int(getattr(config, "head_dim", self.hidden_size // self.num_heads))
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        self.scaling = self.head_dim ** -0.5
+        self.attention_dropout = float(getattr(config, "attention_dropout", 0.0))
+        self.attn_implementation = str(getattr(config, "_attn_implementation", "eager"))
+        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=bool(config.attention_bias))
+        self.k_proj = nn.Linear(
+            self.hidden_size,
+            self.num_key_value_heads * self.head_dim,
+            bias=bool(config.attention_bias),
+        )
+        self.v_proj = nn.Linear(
+            self.hidden_size,
+            self.num_key_value_heads * self.head_dim,
+            bias=bool(config.attention_bias),
+        )
+        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=bool(config.attention_bias))
+        self.q_norm = MossQwen3RMSNorm(self.head_dim, eps=float(config.rms_norm_eps))
+        self.k_norm = MossQwen3RMSNorm(self.head_dim, eps=float(config.rms_norm_eps))
+    def _causal_attention_mask(
+        self,
+        attention_mask: Optional[torch.Tensor],
+        query_length: int,
+        key_length: int,
+        device: torch.device,
+    ) -> torch.Tensor:
+        query_positions = torch.arange(query_length, device=device, dtype=torch.long)
+        query_positions = query_positions + max(key_length - query_length, 0)
+        key_positions = torch.arange(key_length, device=device, dtype=torch.long)
+        causal = key_positions.unsqueeze(0) <= query_positions.unsqueeze(1)
+        causal = causal.unsqueeze(0).unsqueeze(0)
+        if attention_mask is None:
+            return causal
+        key_mask = attention_mask[:, None, None, :].to(dtype=torch.bool)
+        return causal & key_mask
+    def _eager_attention(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        attention_mask: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        key = repeat_kv(key, self.num_key_value_groups)
+        value = repeat_kv(value, self.num_key_value_groups)
+        query = query.transpose(1, 2)
+        key = key.transpose(1, 2)
+        value = value.transpose(1, 2)
+        scores = torch.matmul(query, key.transpose(-1, -2)) * self.scaling
+        mask = self._causal_attention_mask(
+            attention_mask=attention_mask,
+            query_length=query.shape[-2],
+            key_length=key.shape[-2],
+            device=query.device,
+        )
+        scores = scores.masked_fill(~mask, torch.finfo(scores.dtype).min)
+        probs = torch.softmax(scores, dim=-1)
+        if self.training and self.attention_dropout > 0:
+            probs = torch.dropout(probs, self.attention_dropout, train=True)
+        output = torch.matmul(probs, value)
+        return output.transpose(1, 2).contiguous()
+    def _sdpa_attention(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        attention_mask: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        key = repeat_kv(key, self.num_key_value_groups)
+        value = repeat_kv(value, self.num_key_value_groups)
+        query = query.transpose(1, 2)
+        key = key.transpose(1, 2)
+        value = value.transpose(1, 2)
+        mask = None
+        if attention_mask is not None or query.shape[-2] != key.shape[-2]:
+            mask = self._causal_attention_mask(
+                attention_mask=attention_mask,
+                query_length=query.shape[-2],
+                key_length=key.shape[-2],
+                device=query.device,
+            )
+        output = torch.nn.functional.scaled_dot_product_attention(
+            query,
+            key,
+            value,
+            attn_mask=mask,
+            dropout_p=self.attention_dropout if self.training else 0.0,
+            is_causal=mask is None,
+            scale=self.scaling,
+        )
+        return output.transpose(1, 2).contiguous()
+    def _flash_attention(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        attention_mask: Optional[torch.Tensor],
+        packed_metadata: Optional[PackedSequenceMetadata],
+    ) -> torch.Tensor:
+        if not _FLASH_ATTN_AVAILABLE:
+            raise ImportError("flash_attn is not installed, but attn_implementation='flash_attention_2' was requested.")
+        if query.device.type != "cuda":
+            raise ValueError("flash_attention_2 requires CUDA tensors.")
+        if query.dtype not in (torch.float16, torch.bfloat16):
+            raise ValueError(f"flash_attention_2 requires fp16/bf16 tensors, got dtype={query.dtype}.")
+        dropout_p = self.attention_dropout if self.training else 0.0
+        if packed_metadata is not None:
+            if packed_metadata.indices is not None:
+                query = query.reshape(-1, self.num_heads, self.head_dim).index_select(0, packed_metadata.indices)
+                key = key.reshape(-1, self.num_key_value_heads, self.head_dim).index_select(0, packed_metadata.indices)
+                value = value.reshape(-1, self.num_key_value_heads, self.head_dim).index_select(0, packed_metadata.indices)
+            output = flash_attn_varlen_func(
+                query,
+                key,
+                value,
+                packed_metadata.cu_seqlens,
+                packed_metadata.cu_seqlens,
+                packed_metadata.max_seqlen,
+                packed_metadata.max_seqlen,
+                dropout_p=dropout_p,
+                causal=True,
+            )
+            if packed_metadata.indices is None:
+                return output
+            return pad_input(
+                output,
+                packed_metadata.indices,
+                packed_metadata.batch_size,
+                packed_metadata.seq_len,
+            )
+        if attention_mask is None or bool(attention_mask.all()):
+            return flash_attn_func(query, key, value, dropout_p=dropout_p, causal=True)
+        if query.shape[1] != key.shape[1]:
+            query_attention_mask = attention_mask[:, -query.shape[1] :]
+            unpadded_query, query_indices, cu_seqlens_q, max_seqlen_q, _ = unpad_input(
+                query,
+                query_attention_mask,
+            )
+            unpadded_key, _, cu_seqlens_k, max_seqlen_k, _ = unpad_input(key, attention_mask)
+            unpadded_value, _, _, _, _ = unpad_input(value, attention_mask)
+            output = flash_attn_varlen_func(
+                unpadded_query,
+                unpadded_key,
+                unpadded_value,
+                cu_seqlens_q,
+                cu_seqlens_k,
+                max_seqlen_q,
+                max_seqlen_k,
+                dropout_p=dropout_p,
+                causal=True,
+            )
+            return pad_input(output, query_indices, query.shape[0], query.shape[1])
+        unpadded_query, indices, cu_seqlens, max_seqlen, _ = unpad_input(query, attention_mask)
+        unpadded_key, _, _, _, _ = unpad_input(key, attention_mask)
+        unpadded_value, _, _, _, _ = unpad_input(value, attention_mask)
+        output = flash_attn_varlen_func(
+            unpadded_query,
+            unpadded_key,
+            unpadded_value,
+            cu_seqlens,
+            cu_seqlens,
+            max_seqlen,
+            max_seqlen,
+            dropout_p=dropout_p,
+            causal=True,
+        )
+        return pad_input(output, indices, query.shape[0], query.shape[1])
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        packed_metadata: Optional[PackedSequenceMetadata] = None,
+        layer_past: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
+        use_cache: bool = False,
+    ) -> tuple[torch.Tensor, Optional[tuple[torch.Tensor, torch.Tensor]]]:
+        input_shape = hidden_states.shape[:-1]
+        query_states = self.q_norm(
+            self.q_proj(hidden_states).view(*input_shape, self.num_heads, self.head_dim)
+        )
+        key_states = self.k_norm(
+            self.k_proj(hidden_states).view(*input_shape, self.num_key_value_heads, self.head_dim)
+        )
+        value_states = self.v_proj(hidden_states).view(*input_shape, self.num_key_value_heads, self.head_dim)
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+        if layer_past is not None:
+            past_key, past_value = layer_past
+            key_states = torch.cat([past_key.to(device=key_states.device, dtype=key_states.dtype), key_states], dim=1)
+            value_states = torch.cat(
+                [past_value.to(device=value_states.device, dtype=value_states.dtype), value_states],
+                dim=1,
+            )
+        present = (key_states, value_states) if use_cache else None
+        if self.attn_implementation == "flash_attention_2":
+            attn_output = self._flash_attention(
+                query=query_states,
+                key=key_states,
+                value=value_states,
+                attention_mask=attention_mask,
+                packed_metadata=packed_metadata,
+            )
+        elif self.attn_implementation == "sdpa":
+            attn_output = self._sdpa_attention(
+                query=query_states,
+                key=key_states,
+                value=value_states,
+                attention_mask=attention_mask,
+            )
+        else:
+            attn_output = self._eager_attention(
+                query=query_states,
+                key=key_states,
+                value=value_states,
+                attention_mask=attention_mask,
+            )
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        return self.o_proj(attn_output), present
+class MossQwen3DecoderLayer(nn.Module):
+    def __init__(self, config, layer_idx: int) -> None:
+        super().__init__()
+        self.self_attn = MossQwen3Attention(config=config, layer_idx=layer_idx)
+        self.mlp = MossQwen3MLP(config)
+        self.input_layernorm = MossQwen3RMSNorm(config.hidden_size, eps=float(config.rms_norm_eps))
+        self.post_attention_layernorm = MossQwen3RMSNorm(config.hidden_size, eps=float(config.rms_norm_eps))
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        packed_metadata: Optional[PackedSequenceMetadata] = None,
+        layer_past: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
+        use_cache: bool = False,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
+    ) -> tuple[torch.Tensor, Optional[tuple[torch.Tensor, torch.Tensor]]]:
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        attn_output, present = self.self_attn(
+            hidden_states=hidden_states,
+            position_embeddings=position_embeddings,
+            attention_mask=attention_mask,
+            packed_metadata=packed_metadata,
+            layer_past=layer_past,
+            use_cache=use_cache,
+        )
+        hidden_states = residual + attn_output
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = residual + self.mlp(hidden_states)
+        return hidden_states, present
+class MossQwen3Model(nn.Module):
+    def __init__(self, config) -> None:
+        super().__init__()
+        self.config = config
+        self.attn_implementation = str(getattr(config, "_attn_implementation", "eager"))
+        self.padding_idx = getattr(config, "pad_token_id", None)
+        self.vocab_size = int(config.vocab_size)
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList(
+            [MossQwen3DecoderLayer(config, layer_idx=index) for index in range(config.num_hidden_layers)]
+        )
+        self.norm = MossQwen3RMSNorm(config.hidden_size, eps=float(config.rms_norm_eps))
+        self.rotary_emb = MossQwen3RotaryEmbedding(config)
+        self.gradient_checkpointing = False
+        self.gradient_checkpointing_use_reentrant = bool(
+            getattr(config, "gradient_checkpointing_use_reentrant", False)
+        )
+        self._reset_parameters()
+    def _reset_parameters(self) -> None:
+        init_std = float(getattr(self.config, "initializer_range", 0.02))
+        for module in self.modules():
+            if isinstance(module, nn.Linear):
+                nn.init.normal_(module.weight, mean=0.0, std=init_std)
+                if module.bias is not None:
+                    nn.init.zeros_(module.bias)
+            elif isinstance(module, nn.Embedding):
+                nn.init.normal_(module.weight, mean=0.0, std=init_std)
+    def get_input_embeddings(self):
+        return self.embed_tokens
+    def set_input_embeddings(self, value):
+        self.embed_tokens = value
+    def load_qwen3_pretrained_weights(self, pretrained_path: str) -> None:
+        model_dir = Path(pretrained_path)
+        index_path = model_dir / "model.safetensors.index.json"
+        if not index_path.exists():
+            raise FileNotFoundError(f"Missing Qwen3 safetensors index: {index_path}")
+        with index_path.open("r", encoding="utf-8") as handle:
+            index = json.load(handle)
+        weight_map = index.get("weight_map", {})
+        shard_to_keys: dict[str, list[str]] = {}
+        for key, shard in weight_map.items():
+            if not key.startswith("model."):
+                continue
+            shard_to_keys.setdefault(str(shard), []).append(key)
+        state_dict = self.state_dict()
+        loaded_state = {}
+        for shard, keys in sorted(shard_to_keys.items()):
+            shard_tensors = load_file(str(model_dir / shard), device="cpu")
+            for key in keys:
+                target_key = key[len("model.") :]
+                if target_key not in state_dict:
+                    continue
+                tensor = shard_tensors[key]
+                if tuple(tensor.shape) != tuple(state_dict[target_key].shape):
+                    raise ValueError(
+                        f"Shape mismatch while loading Qwen3 weight {key}: "
+                        f"checkpoint={tuple(tensor.shape)} model={tuple(state_dict[target_key].shape)}"
+                    )
+                loaded_state[target_key] = tensor
+        missing, unexpected = self.load_state_dict(loaded_state, strict=False)
+        unexpected = [key for key in unexpected if key]
+        if unexpected:
+            raise RuntimeError(f"Unexpected Qwen3 pretrained keys after load: {unexpected[:10]}")
+        missing = [key for key in missing if key not in loaded_state]
+        if missing:
+            raise RuntimeError(f"Missing Qwen3 pretrained keys after load: {missing[:10]}")
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[tuple[tuple[torch.Tensor, torch.Tensor], ...]] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: bool = True,
+        cu_seqlens: Optional[torch.Tensor] = None,
+        num_sequences: Optional[torch.Tensor] = None,
+    ) -> BaseModelOutputWithPast:
+        del input_ids, output_attentions
+        if inputs_embeds is None:
+            raise ValueError("inputs_embeds must be provided.")
+        use_cache = bool(use_cache)
+        if use_cache and cu_seqlens is not None:
+            raise ValueError("use_cache=True is not supported together with cu_seqlens packing.")
+        hidden_states = inputs_embeds
+        if attention_mask is None:
+            attention_mask = torch.ones(hidden_states.shape[:2], dtype=torch.bool, device=hidden_states.device)
+        else:
+            attention_mask = attention_mask.to(dtype=torch.bool, device=hidden_states.device)
+        query_attention_mask = attention_mask[:, -hidden_states.shape[1] :]
+        packed_metadata = None
+        if position_ids is None:
+            if cu_seqlens is not None:
+                position_ids = MossTTSNanoGPT2Model.build_packed_position_ids(
+                    attention_mask=attention_mask,
+                    cu_seqlens=cu_seqlens.to(device=hidden_states.device),
+                    num_sequences=num_sequences.to(device=hidden_states.device) if num_sequences is not None else None,
+                    sequence_length=hidden_states.shape[1],
+                )
+            elif attention_mask is not None:
+                position_ids = attention_mask.long().cumsum(dim=-1) - 1
+                position_ids = position_ids.masked_fill(~attention_mask, 0)
+                position_ids = position_ids[:, -hidden_states.shape[1] :]
+            else:
+                past_length = 0
+                if past_key_values is not None and len(past_key_values) > 0:
+                    past_length = past_key_values[0][0].shape[1]
+                position_ids = torch.arange(hidden_states.shape[1], device=hidden_states.device, dtype=torch.long)
+                position_ids = position_ids + past_length
+                position_ids = position_ids.unsqueeze(0).expand(hidden_states.shape[0], -1)
+        if cu_seqlens is not None and self.attn_implementation == "flash_attention_2":
+            packed_metadata = MossTTSNanoGPT2Model.build_packed_metadata(
+                hidden_states=hidden_states,
+                cu_seqlens=cu_seqlens.to(device=hidden_states.device),
+                num_sequences=num_sequences.to(device=hidden_states.device) if num_sequences is not None else None,
+            )
+        hidden_states = hidden_states * query_attention_mask.unsqueeze(-1).to(dtype=hidden_states.dtype)
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+        all_hidden_states = () if output_hidden_states else None
+        presents = [] if use_cache else None
+        for layer_index, decoder_layer in enumerate(self.layers):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+            if self.gradient_checkpointing and self.training:
+                if use_cache:
+                    raise ValueError("use_cache=True is not supported when gradient checkpointing is enabled during training.")
+                def custom_forward(*inputs):
+                    output, _ = decoder_layer(
+                        hidden_states=inputs[0],
+                        attention_mask=inputs[1],
+                        packed_metadata=packed_metadata,
+                        layer_past=None,
+                        use_cache=False,
+                        position_embeddings=position_embeddings,
+                    )
+                    return output
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    custom_forward,
+                    hidden_states,
+                    attention_mask,
+                    use_reentrant=self.gradient_checkpointing_use_reentrant,
+                )
+                present = None
+            else:
+                hidden_states, present = decoder_layer(
+                    hidden_states=hidden_states,
+                    attention_mask=attention_mask,
+                    packed_metadata=packed_metadata,
+                    layer_past=None if past_key_values is None else past_key_values[layer_index],
+                    use_cache=use_cache,
+                    position_embeddings=position_embeddings,
+                )
+            hidden_states = hidden_states * query_attention_mask.unsqueeze(-1).to(dtype=hidden_states.dtype)
+            if presents is not None:
+                presents.append(present)
+        hidden_states = self.norm(hidden_states)
+        hidden_states = hidden_states * query_attention_mask.unsqueeze(-1).to(dtype=hidden_states.dtype)
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+        if not return_dict:
+            return (hidden_states, tuple(presents) if presents is not None else None, all_hidden_states, None)
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=tuple(presents) if presents is not None else None,
+            hidden_states=all_hidden_states,
+            attentions=None,
+        )

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,21 @@

+{
+  "additional_special_tokens": [
+    "<|audio_start|>",
+    "<|audio_end|>",
+    "<|audio_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:06902d1fb775216338802205886a24bc715ccc606bd872a892e3d3c83ca1b9e2
+size 11423220

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,253 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151665": {
+      "content": "<tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151666": {
+      "content": "</tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151667": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151668": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151669": {
+      "content": "<|audio_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151670": {
+      "content": "<|audio_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151671": {
+      "content": "<|audio_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|audio_start|>",
+    "<|audio_end|>",
+    "<|audio_pad|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff