bupalinyu committed on
Commit
9c9ed0e
·
verified ·
1 Parent(s): c9dec28

Add files using upload-large-folder tool

Browse files
.eval_results/open_asr_leaderboard.yaml ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ - dataset:
2
+ id: hf-audio/open-asr-leaderboard
3
+ task_id: mean_wer
4
+ value: 5.13
5
+ date: '2026-06-22'
6
+ source:
7
+ url: https://huggingface.co/datasets/hf-audio/open-asr-leaderboard
8
+ name: open-asr-leaderboard
9
+ user: hf-audio
10
+
11
+ - dataset:
12
+ id: hf-audio/open-asr-leaderboard
13
+ task_id: ami_wer
14
+ value: 8.91
15
+ date: '2026-06-22'
16
+ source:
17
+ url: https://huggingface.co/datasets/hf-audio/open-asr-leaderboard
18
+ name: open-asr-leaderboard
19
+ user: hf-audio
20
+
21
+ - dataset:
22
+ id: hf-audio/open-asr-leaderboard
23
+ task_id: earnings22_wer
24
+ value: 8.25
25
+ date: '2026-06-22'
26
+ source:
27
+ url: https://huggingface.co/datasets/hf-audio/open-asr-leaderboard
28
+ name: open-asr-leaderboard
29
+ user: hf-audio
30
+
31
+ - dataset:
32
+ id: hf-audio/open-asr-leaderboard
33
+ task_id: gigaspeech_wer
34
+ value: 7.30
35
+ date: '2026-06-22'
36
+ source:
37
+ url: https://huggingface.co/datasets/hf-audio/open-asr-leaderboard
38
+ name: open-asr-leaderboard
39
+ user: hf-audio
40
+
41
+ - dataset:
42
+ id: hf-audio/open-asr-leaderboard
43
+ task_id: librispeech_clean_wer
44
+ value: 1.09
45
+ date: '2026-06-22'
46
+ source:
47
+ url: https://huggingface.co/datasets/hf-audio/open-asr-leaderboard
48
+ name: open-asr-leaderboard
49
+ user: hf-audio
50
+
51
+ - dataset:
52
+ id: hf-audio/open-asr-leaderboard
53
+ task_id: librispeech_other_wer
54
+ value: 2.41
55
+ date: '2026-06-22'
56
+ source:
57
+ url: https://huggingface.co/datasets/hf-audio/open-asr-leaderboard
58
+ name: open-asr-leaderboard
59
+ user: hf-audio
60
+
61
+ - dataset:
62
+ id: hf-audio/open-asr-leaderboard
63
+ task_id: spgispeech_wer
64
+ value: 2.49
65
+ date: '2026-06-22'
66
+ source:
67
+ url: https://huggingface.co/datasets/hf-audio/open-asr-leaderboard
68
+ name: open-asr-leaderboard
69
+ user: hf-audio
70
+
71
+ - dataset:
72
+ id: hf-audio/open-asr-leaderboard
73
+ task_id: voxpopuli_wer
74
+ value: 5.48
75
+ date: '2026-06-22'
76
+ source:
77
+ url: https://huggingface.co/datasets/hf-audio/open-asr-leaderboard
78
+ name: open-asr-leaderboard
79
+ user: hf-audio
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ figures/ark_asr_architecture.png filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,291 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ tags:
4
+ - automatic-speech-recognition
5
+ - speech
6
+ - audio
7
+ - transformers
8
+ - pytorch
9
+ - safetensors
10
+ - vllm
11
+ - ark-asr
12
+ pipeline_tag: automatic-speech-recognition
13
+ language:
14
+ - zh
15
+ - en
16
+ - de
17
+ - ja
18
+ - fr
19
+ - ko
20
+ - es
21
+ - pl
22
+ - it
23
+ - ro
24
+ - hu
25
+ - cs
26
+ - nl
27
+ - fi
28
+ - hr
29
+ - sk
30
+ - sl
31
+ - et
32
+ - lt
33
+ license: apache-2.0
34
+ repository: https://github.com/AutoArk/open-audio-opd
35
+ ---
36
+
37
+ <div align="center">
38
+
39
+ # ARK-ASR-3B: State-of-the-Art Multilingual ASR with On-Policy Distillation
40
+
41
+ [![GitHub](https://img.shields.io/badge/GitHub-AutoArk%2Fopen--audio--opd-blue?logo=github)](https://github.com/AutoArk/open-audio-opd)
42
+ [![arXiv](https://img.shields.io/badge/arXiv-2605.28139-b31b1b?logo=arxiv)](https://arxiv.org/abs/2605.28139)
43
+ [![License](https://img.shields.io/badge/License-Apache--2.0-green)](https://www.apache.org/licenses/LICENSE-2.0)
44
+
45
+ </div>
46
+
47
+ > **TL;DR** ARK-ASR-3B is an automatic speech recognition model trained with teacher-data adaptation and on-policy distillation. It achieves current state-of-the-art results on the Hugging Face Open ASR Leaderboard English short-form benchmark, with an average WER of **5.13%** across AMI, Earnings22, GigaSpeech, LibriSpeech, SPGISpeech, and VoxPopuli. The accompanying training, inference, and evaluation code is available at [AutoArk/open-audio-opd](https://github.com/AutoArk/open-audio-opd).
48
+
49
+ ## Abstract
50
+
51
+ ARK-ASR is an ASR student model optimized with the **teacher-data adaptation + on-policy distillation (TD + OPD)** recipe from `open-audio-opd`.
52
+
53
+ Instead of relying only on static supervised transcripts, OPD lets the student generate transcripts online and trains it against token-level teacher scores on the student's own generated behavior. This checkpoint is the 3B-scale ARK-ASR model trained with the TD + OPD recipe.
54
+
55
+ ARK-ASR currently supports Chinese, English, German, Japanese, French, Korean, Spanish, Polish, Italian, Romanian, Hungarian, Czech, Dutch, Finnish, Croatian, Slovak, Slovene, Estonian, and Lithuanian ASR.
56
+
57
+ ## Supported Languages
58
+
59
+ Chinese, English, German, Japanese, French, Korean, Spanish, Polish, Italian, Romanian, Hungarian, Czech, Dutch, Finnish, Croatian, Slovak, Slovene, Estonian, and Lithuanian.
60
+
61
+ ## Model Overview
62
+
63
+ <div align="center">
64
+ <img src="figures/ark_asr_architecture.png" width="95%" alt="ARK-ASR architecture"/>
65
+ <br>
66
+ <p><strong>Figure 1: ARK-ASR architecture.</strong> Audio is encoded by a Whisper-style encoder with RoPE, merged through an MLP adapter, and injected into a Qwen decoder by replacing audio placeholder token embeddings before transcript generation.</p>
67
+ </div>
68
+
69
+ - **Model size:** 3B-scale decoder LLM with a dedicated Whisper-style audio encoder and MLP adapter
70
+ - **Task:** automatic speech recognition
71
+ - **Architecture:** audio-capable autoregressive Transformers model with custom `arkasr` remote code
72
+ - **Checkpoint format:** `safetensors`
73
+ - **Sampling rate:** 16 kHz
74
+ - **Recommended inference code:** [`scripts/infer/ark_asr_transformers.py`](https://github.com/AutoArk/open-audio-opd/blob/main/scripts/infer/ark_asr_transformers.py)
75
+ - **vLLM serving:** [`scripts/vllm/ark_asr_vllm`](https://github.com/AutoArk/open-audio-opd/tree/master/scripts/vllm/ark_asr_vllm)
76
+
77
+ The model should be loaded with `trust_remote_code=True`. The official inference script handles the processor, tokenizer, audio prompt format, generation cleanup, and ASR token filtering.
78
+
79
+ ## Performance
80
+
81
+ The following results are from the Hugging Face [Open ASR Leaderboard](https://huggingface.co/datasets/hf-audio/open-asr-leaderboard). Lower WER is better. ARK-ASR-3B reaches the current state of the art on this English short-form benchmark.
82
+
83
+ ### English WER
84
+
85
+ | Model | AMI | Earnings22 | GigaSpeech | LS Clean | LS Other | SPGISpeech | VoxPopuli | Avg |
86
+ | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |
87
+ | ARK-ASR-3B | **8.91%** | **8.25%** | **7.30%** | **1.09%** | **2.41%** | **2.49%** | **5.48%** | **5.13%** |
88
+ | ARK-ASR-0.6B | 10.02% | 9.77% | 8.00% | 1.53% | 3.51% | 2.63% | 6.31% | 5.97% |
89
+
90
+ ## Inference
91
+
92
+ Run ASR inference with Hugging Face Transformers:
93
+
94
+ ```python
95
+ import torch
96
+ from transformers import AutoModelForCausalLM, AutoProcessor, AutoTokenizer
97
+
98
+ model_path = "AutoArk-AI/ARK-ASR-3B"
99
+ audio_path = "assets/libai.wav"
100
+
101
+ device = "cuda" if torch.cuda.is_available() else "cpu"
102
+ torch_dtype = torch.bfloat16 if device == "cuda" else torch.float32
103
+
104
+ processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
105
+ tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
106
+ model = AutoModelForCausalLM.from_pretrained(
107
+ model_path,
108
+ trust_remote_code=True,
109
+ torch_dtype=torch_dtype,
110
+ attn_implementation="sdpa",
111
+ ).to(device)
112
+ model.eval()
113
+
114
+
115
+ def build_bad_words_ids(tokenizer):
116
+ eos_ids = tokenizer.eos_token_id
117
+ keep_ids = {eos_ids} if isinstance(eos_ids, int) else set(eos_ids or [])
118
+ bad_ids = set(tokenizer.all_special_ids) - keep_ids
119
+ bad_ids.update(
120
+ token_id
121
+ for token, token_id in tokenizer.get_added_vocab().items()
122
+ if token.startswith("<") and token.endswith(">") and token_id not in keep_ids
123
+ )
124
+ return [[token_id] for token_id in sorted(bad_ids)]
125
+
126
+ conversation = [
127
+ {
128
+ "role": "user",
129
+ "content": [
130
+ {"type": "audio", "path": audio_path},
131
+ {"type": "text", "text": "Please transcribe this audio."},
132
+ ],
133
+ }
134
+ ]
135
+
136
+ inputs = processor.apply_chat_template(
137
+ conversation,
138
+ add_generation_prompt=True,
139
+ return_tensors="pt",
140
+ sampling_rate=16000,
141
+ audio_padding="longest",
142
+ text_kwargs={"padding": "longest"},
143
+ audio_max_length=30 * 16000,
144
+ )
145
+ inputs = inputs.to(device)
146
+ if "audios" in inputs:
147
+ inputs["audios"] = inputs["audios"].to(dtype=torch_dtype)
148
+
149
+ bad_words_ids = build_bad_words_ids(tokenizer)
150
+ with torch.inference_mode():
151
+ outputs = model.generate(
152
+ **inputs,
153
+ do_sample=False,
154
+ max_new_tokens=256,
155
+ pad_token_id=tokenizer.pad_token_id,
156
+ eos_token_id=tokenizer.eos_token_id,
157
+ bad_words_ids=bad_words_ids,
158
+ )
159
+ decoded_outputs = tokenizer.batch_decode(
160
+ outputs[:, inputs.input_ids.shape[1] :],
161
+ skip_special_tokens=True,
162
+ )
163
+ print(decoded_outputs)
164
+ ```
165
+
166
+ For batch JSONL inference, use the open-source inference code:
167
+
168
+ ```bash
169
+ git clone https://github.com/AutoArk/open-audio-opd
170
+ cd open-audio-opd
171
+ pip install -e .
172
+ ```
173
+
174
+ The input JSONL should contain one ASR sample per line:
175
+
176
+ ```json
177
+ {"audio":"/path/to/audio.wav","text":"","task":"asr","begin_time":-1,"end_time":-1}
178
+ ```
179
+
180
+ ```bash
181
+ python scripts/infer/ark_asr_transformers.py \
182
+ --input /path/to/input.jsonl \
183
+ --output runs/infer/predictions.jsonl \
184
+ --model_path AutoArk-AI/ARK-ASR-3B \
185
+ --processor_path AutoArk-AI/ARK-ASR-3B \
186
+ --batch_size 40 \
187
+ --dtype bfloat16 \
188
+ --attn_impl sdpa
189
+ ```
190
+
191
+ The output JSONL preserves input metadata and adds:
192
+
193
+ - `pred_text`: cleaned prediction text for downstream evaluation
194
+ - `pred_text_raw`: raw decoded generation before cleanup
195
+
196
+ ## vLLM Online Serving
197
+
198
+ ARK-ASR can also be deployed as a vLLM-backed online ASR service with the
199
+ adapter in
200
+ [`scripts/vllm/ark_asr_vllm`](https://github.com/AutoArk/open-audio-opd/tree/master/scripts/vllm/ark_asr_vllm).
201
+ The service exposes both a compact `/asr` endpoint and an OpenAI-style
202
+ `/v1/audio/transcriptions` endpoint.
203
+
204
+ Clone and install the serving code:
205
+
206
+ ```bash
207
+ git clone https://github.com/AutoArk/open-audio-opd
208
+ cd open-audio-opd
209
+ pip install -e ".[vllm]"
210
+ ```
211
+
212
+ Start the service:
213
+
214
+ ```bash
215
+ MODEL=AutoArk-AI/ARK-ASR-3B \
216
+ GPU=0 \
217
+ PORT=8025 \
218
+ scripts/vllm/deploy_ark_asr_vllm_service.sh start
219
+ ```
220
+
221
+ Check the service:
222
+
223
+ ```bash
224
+ scripts/vllm/deploy_ark_asr_vllm_service.sh status
225
+ curl -sS http://127.0.0.1:8025/health
226
+ curl -sS http://127.0.0.1:8025/token-mask
227
+ ```
228
+
229
+ Run one transcription request:
230
+
231
+ ```bash
232
+ curl -sS -X POST http://127.0.0.1:8025/asr \
233
+ -F file=@/path/to/audio.wav \
234
+ -F max_new_tokens=256
235
+ ```
236
+
237
+ OpenAI-style transcription endpoint:
238
+
239
+ ```bash
240
+ curl -sS -X POST http://127.0.0.1:8025/v1/audio/transcriptions \
241
+ -F file=@/path/to/audio.wav \
242
+ -F model=ark-asr
243
+ ```
244
+
245
+ Stop the service:
246
+
247
+ ```bash
248
+ scripts/vllm/deploy_ark_asr_vllm_service.sh stop
249
+ ```
250
+
251
+ The vLLM adapter registers the custom `arkasr` model, loads the local
252
+ processor/tokenizer with `trust_remote_code=True`, applies generation-time
253
+ token masking for non-ASR control tokens, and keeps `<|im_end|>` as the stop
254
+ token. Service logs and PID files are written under `runs/vllm/`.
255
+
256
+ ## Evaluation
257
+
258
+ The repository also includes a J/WER evaluation entrypoint:
259
+
260
+ ```bash
261
+ python scripts/eval/eval_jwer_ark_asr_transformers.py \
262
+ --input /path/to/test.jsonl \
263
+ --output runs/eval/result.jsonl \
264
+ --model_path AutoArk-AI/ARK-ASR-3B \
265
+ --processor_path AutoArk-AI/ARK-ASR-3B \
266
+ --batch_size 40 \
267
+ --dtype bfloat16 \
268
+ --attn_impl sdpa
269
+ ```
270
+
271
+ No evaluation audio or dataset files are bundled with this model repository.
272
+
273
+ ## Acknowledgements
274
+
275
+ The training code is based on [THUNLP/OPD](https://github.com/thunlp/OPD/) and [verl](https://github.com/volcengine/verl). The OPD recipe uses a stronger ASR teacher to score online student rollouts.
276
+
277
+ ## Citation
278
+
279
+ If you find ARK-ASR or open-audio-opd useful, please cite:
280
+
281
+ ```bibtex
282
+ @misc{lin2026dataefficientopd,
283
+ title={Data-Efficient On-Policy Distillation for Automatic Speech Recognition},
284
+ author={Lin, Yu and Wang, Yiming and Cai, Runyuan and Zeng, Xiaodong},
285
+ year={2026},
286
+ eprint={2605.28139},
287
+ archivePrefix={arXiv},
288
+ primaryClass={cs.AI},
289
+ url={https://arxiv.org/abs/2605.28139}
290
+ }
291
+ ```
added_tokens.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|assistant|>": 151668,
5
+ "<|audio|>": 151663,
6
+ "<|begin_of_audio|>": 151666,
7
+ "<|end_content|>": 151651,
8
+ "<|end_global_token|>": 151649,
9
+ "<|end_of_audio|>": 151667,
10
+ "<|endoftext|>": 151643,
11
+ "<|file_sep|>": 151664,
12
+ "<|fim_middle|>": 151660,
13
+ "<|fim_pad|>": 151662,
14
+ "<|fim_prefix|>": 151659,
15
+ "<|fim_suffix|>": 151661,
16
+ "<|im_end|>": 151645,
17
+ "<|im_start|>": 151644,
18
+ "<|image_pad|>": 151655,
19
+ "<|object_ref_end|>": 151647,
20
+ "<|object_ref_start|>": 151646,
21
+ "<|start_content|>": 151650,
22
+ "<|start_global_token|>": 151648,
23
+ "<|system|>": 151669,
24
+ "<|user|>": 151665,
25
+ "<|video_pad|>": 151656,
26
+ "<|vision_end|>": 151653,
27
+ "<|vision_pad|>": 151654,
28
+ "<|vision_start|>": 151652
29
+ }
chat_template.jinja ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {% for message in messages %}
2
+ {% if message.role == "system" %}
3
+ <|system|>
4
+ {{ message.content }}
5
+ {% elif message.role == "user" %}
6
+ <|user|>
7
+ {{ message.content }}
8
+ {% elif message.role == "assistant" %}
9
+ <|assistant|>
10
+ {{ message.content }}
11
+ {% endif %}
12
+ {% endfor %}
13
+ {% if add_generation_prompt %}
14
+ <|assistant|>
15
+ {% endif %}
config.json ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "adapter_type": "mlp",
3
+ "architectures": [
4
+ "ArkasrForConditionalGeneration"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "audio_token_id": 151663,
8
+ "auto_map": {
9
+ "AutoConfig": "configuration_arkasr.ArkasrConfig",
10
+ "AutoModelForCausalLM": "modeling_arkasr.ArkasrForConditionalGeneration"
11
+ },
12
+ "bos_token_id": 151643,
13
+ "dtype": "bfloat16",
14
+ "eos_token_id": 151645,
15
+ "hidden_act": "silu",
16
+ "hidden_size": 2048,
17
+ "initializer_range": 0.02,
18
+ "intermediate_size": 11008,
19
+ "layer_types": [
20
+ "full_attention",
21
+ "full_attention",
22
+ "full_attention",
23
+ "full_attention",
24
+ "full_attention",
25
+ "full_attention",
26
+ "full_attention",
27
+ "full_attention",
28
+ "full_attention",
29
+ "full_attention",
30
+ "full_attention",
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention",
37
+ "full_attention",
38
+ "full_attention",
39
+ "full_attention",
40
+ "full_attention",
41
+ "full_attention",
42
+ "full_attention",
43
+ "full_attention",
44
+ "full_attention",
45
+ "full_attention",
46
+ "full_attention",
47
+ "full_attention",
48
+ "full_attention",
49
+ "full_attention",
50
+ "full_attention",
51
+ "full_attention",
52
+ "full_attention",
53
+ "full_attention",
54
+ "full_attention",
55
+ "full_attention"
56
+ ],
57
+ "max_position_embeddings": 32768,
58
+ "max_whisper_length": 1500,
59
+ "max_window_layers": 70,
60
+ "merge_factor": 4,
61
+ "mlp_adapter_act": "gelu",
62
+ "model_type": "arkasr",
63
+ "num_attention_heads": 16,
64
+ "num_hidden_layers": 36,
65
+ "num_key_value_heads": 2,
66
+ "pad_token_id": 151643,
67
+ "rms_norm_eps": 1e-06,
68
+ "rope_scaling": null,
69
+ "rope_theta": 1000000.0,
70
+ "sliding_window": null,
71
+ "spec_aug": false,
72
+ "tie_word_embeddings": true,
73
+ "transformers_version": "4.57.6",
74
+ "use_cache": true,
75
+ "use_mrope": false,
76
+ "use_rope": true,
77
+ "use_sliding_window": false,
78
+ "vocab_size": 151936,
79
+ "whisper_config": {
80
+ "activation_dropout": 0.0,
81
+ "activation_function": "gelu",
82
+ "apply_spec_augment": false,
83
+ "architectures": [
84
+ "WhisperForConditionalGeneration"
85
+ ],
86
+ "attention_dropout": 0.0,
87
+ "begin_suppress_tokens": [
88
+ 220,
89
+ 50257
90
+ ],
91
+ "bos_token_id": 50257,
92
+ "classifier_proj_size": 256,
93
+ "d_model": 1280,
94
+ "decoder_attention_heads": 20,
95
+ "decoder_ffn_dim": 5120,
96
+ "decoder_layerdrop": 0.0,
97
+ "decoder_layers": 32,
98
+ "decoder_start_token_id": 50258,
99
+ "dropout": 0.0,
100
+ "dtype": "bfloat16",
101
+ "encoder_attention_heads": 20,
102
+ "encoder_ffn_dim": 5120,
103
+ "encoder_layerdrop": 0.0,
104
+ "encoder_layers": 32,
105
+ "eos_token_id": 50257,
106
+ "init_std": 0.02,
107
+ "mask_feature_length": 10,
108
+ "mask_feature_min_masks": 0,
109
+ "mask_feature_prob": 0.0,
110
+ "mask_time_length": 10,
111
+ "mask_time_min_masks": 2,
112
+ "mask_time_prob": 0.05,
113
+ "max_length": 448,
114
+ "max_source_positions": 1500,
115
+ "max_target_positions": 448,
116
+ "median_filter_width": 7,
117
+ "model_type": "whisper",
118
+ "num_hidden_layers": 32,
119
+ "num_mel_bins": 128,
120
+ "scale_embedding": false,
121
+ "use_cache": true,
122
+ "use_weighted_layer_sum": false,
123
+ "vocab_size": 51866
124
+ }
125
+ }
configuration_arkasr.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, Dict, Optional, Union
2
+ from transformers import Qwen2Config, WhisperConfig
3
+
4
+
5
+ class ArkasrConfig(Qwen2Config):
6
+ model_type = "arkasr"
7
+ is_composition = True
8
+
9
+ def __init__(
10
+ self,
11
+ whisper_config: Optional[Union[Dict[str, Any], WhisperConfig]] = None,
12
+ adapter_type: str = "mlp",
13
+ merge_factor: int = 4,
14
+ spec_aug: bool = False,
15
+ use_rope: bool = True,
16
+ max_whisper_length: int = 1500,
17
+ mlp_adapter_act: str = "gelu",
18
+ **kwargs, # All Qwen2Config parameters are passed in through here
19
+ ):
20
+ # === 1. Key point: initialize Qwen2Config (the LM part) ===
21
+ # This call consumes:
22
+ # vocab_size / hidden_size / num_hidden_layers / rope_scaling / ...
23
+ super().__init__(**kwargs)
24
+
25
+ # === 2. Whisper sub-config (a dict is converted; anything else falls back to a default WhisperConfig) ===
26
+ if isinstance(whisper_config, dict):
27
+ self.whisper_config = WhisperConfig(**whisper_config)
28
+ elif isinstance(whisper_config, WhisperConfig):
29
+ self.whisper_config = whisper_config
30
+ else:
31
+ self.whisper_config = WhisperConfig()
32
+
33
+ # === 3. ArkASR-specific parameters ===
34
+ self.adapter_type = adapter_type
35
+ self.merge_factor = int(merge_factor)
36
+ self.spec_aug = bool(spec_aug)
37
+ self.use_rope = bool(use_rope)
38
+ self.max_whisper_length = int(max_whisper_length)
39
+ self.mlp_adapter_act = mlp_adapter_act
40
+
41
+ def to_dict(self):
42
+ output = super().to_dict()
43
+ output["whisper_config"] = self.whisper_config.to_dict()
44
+ return output
45
+
46
+
47
+ __all__ = ["ArkasrConfig"]
figures/ark_asr_architecture.png ADDED

Git LFS Details

  • SHA256: a31af4173acf07f4ab5984820eb796362a82aa942d4fa133ebe201192e7737b8
  • Pointer size: 132 Bytes
  • Size of remote file: 1.14 MB
generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 151643,
4
+ "eos_token_id": 151645,
5
+ "pad_token_id": 151643,
6
+ "transformers_version": "4.57.6"
7
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b0f93d36c42c3487d95532c7d54952c7810bf629ee2a2265ed4e2f9e988e5342
3
+ size 4996098432
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db6d540d2b230e322efc17176d7f67ae44a25705bd85c5caccdf5d8c426c072c
3
+ size 3130890232
model.safetensors.index.json ADDED
@@ -0,0 +1,934 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_parameters": 4063438848,
4
+ "total_size": 8126877696
5
+ },
6
+ "weight_map": {
7
+ "audio_encoder.adapting.0.bias": "model-00001-of-00002.safetensors",
8
+ "audio_encoder.adapting.0.weight": "model-00002-of-00002.safetensors",
9
+ "audio_encoder.adapting.2.bias": "model-00001-of-00002.safetensors",
10
+ "audio_encoder.adapting.2.weight": "model-00001-of-00002.safetensors",
11
+ "audio_encoder.layer_norm.bias": "model-00001-of-00002.safetensors",
12
+ "audio_encoder.layer_norm.weight": "model-00002-of-00002.safetensors",
13
+ "audio_encoder.whisper.conv1.bias": "model-00002-of-00002.safetensors",
14
+ "audio_encoder.whisper.conv1.weight": "model-00001-of-00002.safetensors",
15
+ "audio_encoder.whisper.conv2.bias": "model-00002-of-00002.safetensors",
16
+ "audio_encoder.whisper.conv2.weight": "model-00002-of-00002.safetensors",
17
+ "audio_encoder.whisper.embed_positions.weight": "model-00001-of-00002.safetensors",
18
+ "audio_encoder.whisper.layers.0.fc1.bias": "model-00001-of-00002.safetensors",
19
+ "audio_encoder.whisper.layers.0.fc1.weight": "model-00001-of-00002.safetensors",
20
+ "audio_encoder.whisper.layers.0.fc2.bias": "model-00001-of-00002.safetensors",
21
+ "audio_encoder.whisper.layers.0.fc2.weight": "model-00002-of-00002.safetensors",
22
+ "audio_encoder.whisper.layers.0.final_layer_norm.bias": "model-00001-of-00002.safetensors",
23
+ "audio_encoder.whisper.layers.0.final_layer_norm.weight": "model-00001-of-00002.safetensors",
24
+ "audio_encoder.whisper.layers.0.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
25
+ "audio_encoder.whisper.layers.0.self_attn.out_proj.bias": "model-00002-of-00002.safetensors",
26
+ "audio_encoder.whisper.layers.0.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
27
+ "audio_encoder.whisper.layers.0.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
28
+ "audio_encoder.whisper.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
29
+ "audio_encoder.whisper.layers.0.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
30
+ "audio_encoder.whisper.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
31
+ "audio_encoder.whisper.layers.0.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
32
+ "audio_encoder.whisper.layers.0.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
33
+ "audio_encoder.whisper.layers.1.fc1.bias": "model-00001-of-00002.safetensors",
34
+ "audio_encoder.whisper.layers.1.fc1.weight": "model-00002-of-00002.safetensors",
35
+ "audio_encoder.whisper.layers.1.fc2.bias": "model-00002-of-00002.safetensors",
36
+ "audio_encoder.whisper.layers.1.fc2.weight": "model-00002-of-00002.safetensors",
37
+ "audio_encoder.whisper.layers.1.final_layer_norm.bias": "model-00002-of-00002.safetensors",
38
+ "audio_encoder.whisper.layers.1.final_layer_norm.weight": "model-00002-of-00002.safetensors",
39
+ "audio_encoder.whisper.layers.1.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
40
+ "audio_encoder.whisper.layers.1.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
41
+ "audio_encoder.whisper.layers.1.self_attn.out_proj.weight": "model-00002-of-00002.safetensors",
42
+ "audio_encoder.whisper.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
43
+ "audio_encoder.whisper.layers.1.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
44
+ "audio_encoder.whisper.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
45
+ "audio_encoder.whisper.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
46
+ "audio_encoder.whisper.layers.1.self_attn_layer_norm.bias": "model-00002-of-00002.safetensors",
47
+ "audio_encoder.whisper.layers.1.self_attn_layer_norm.weight": "model-00002-of-00002.safetensors",
48
+ "audio_encoder.whisper.layers.10.fc1.bias": "model-00002-of-00002.safetensors",
49
+ "audio_encoder.whisper.layers.10.fc1.weight": "model-00001-of-00002.safetensors",
50
+ "audio_encoder.whisper.layers.10.fc2.bias": "model-00002-of-00002.safetensors",
51
+ "audio_encoder.whisper.layers.10.fc2.weight": "model-00001-of-00002.safetensors",
52
+ "audio_encoder.whisper.layers.10.final_layer_norm.bias": "model-00001-of-00002.safetensors",
53
+ "audio_encoder.whisper.layers.10.final_layer_norm.weight": "model-00002-of-00002.safetensors",
54
+ "audio_encoder.whisper.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
55
+ "audio_encoder.whisper.layers.10.self_attn.out_proj.bias": "model-00002-of-00002.safetensors",
56
+ "audio_encoder.whisper.layers.10.self_attn.out_proj.weight": "model-00002-of-00002.safetensors",
57
+ "audio_encoder.whisper.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
58
+ "audio_encoder.whisper.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
59
+ "audio_encoder.whisper.layers.10.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
60
+ "audio_encoder.whisper.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
61
+ "audio_encoder.whisper.layers.10.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
62
+ "audio_encoder.whisper.layers.10.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
63
+ "audio_encoder.whisper.layers.11.fc1.bias": "model-00001-of-00002.safetensors",
64
+ "audio_encoder.whisper.layers.11.fc1.weight": "model-00001-of-00002.safetensors",
65
+ "audio_encoder.whisper.layers.11.fc2.bias": "model-00002-of-00002.safetensors",
66
+ "audio_encoder.whisper.layers.11.fc2.weight": "model-00002-of-00002.safetensors",
67
+ "audio_encoder.whisper.layers.11.final_layer_norm.bias": "model-00001-of-00002.safetensors",
68
+ "audio_encoder.whisper.layers.11.final_layer_norm.weight": "model-00001-of-00002.safetensors",
69
+ "audio_encoder.whisper.layers.11.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
70
+ "audio_encoder.whisper.layers.11.self_attn.out_proj.bias": "model-00002-of-00002.safetensors",
71
+ "audio_encoder.whisper.layers.11.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
72
+ "audio_encoder.whisper.layers.11.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
73
+ "audio_encoder.whisper.layers.11.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
74
+ "audio_encoder.whisper.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
75
+ "audio_encoder.whisper.layers.11.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
76
+ "audio_encoder.whisper.layers.11.self_attn_layer_norm.bias": "model-00002-of-00002.safetensors",
77
+ "audio_encoder.whisper.layers.11.self_attn_layer_norm.weight": "model-00002-of-00002.safetensors",
78
+ "audio_encoder.whisper.layers.12.fc1.bias": "model-00002-of-00002.safetensors",
79
+ "audio_encoder.whisper.layers.12.fc1.weight": "model-00001-of-00002.safetensors",
80
+ "audio_encoder.whisper.layers.12.fc2.bias": "model-00002-of-00002.safetensors",
81
+ "audio_encoder.whisper.layers.12.fc2.weight": "model-00001-of-00002.safetensors",
82
+ "audio_encoder.whisper.layers.12.final_layer_norm.bias": "model-00002-of-00002.safetensors",
83
+ "audio_encoder.whisper.layers.12.final_layer_norm.weight": "model-00002-of-00002.safetensors",
84
+ "audio_encoder.whisper.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
85
+ "audio_encoder.whisper.layers.12.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
86
+ "audio_encoder.whisper.layers.12.self_attn.out_proj.weight": "model-00002-of-00002.safetensors",
87
+ "audio_encoder.whisper.layers.12.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
88
+ "audio_encoder.whisper.layers.12.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
89
+ "audio_encoder.whisper.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
90
+ "audio_encoder.whisper.layers.12.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
91
+ "audio_encoder.whisper.layers.12.self_attn_layer_norm.bias": "model-00002-of-00002.safetensors",
92
+ "audio_encoder.whisper.layers.12.self_attn_layer_norm.weight": "model-00002-of-00002.safetensors",
93
+ "audio_encoder.whisper.layers.13.fc1.bias": "model-00001-of-00002.safetensors",
94
+ "audio_encoder.whisper.layers.13.fc1.weight": "model-00002-of-00002.safetensors",
95
+ "audio_encoder.whisper.layers.13.fc2.bias": "model-00001-of-00002.safetensors",
96
+ "audio_encoder.whisper.layers.13.fc2.weight": "model-00001-of-00002.safetensors",
97
+ "audio_encoder.whisper.layers.13.final_layer_norm.bias": "model-00001-of-00002.safetensors",
98
+ "audio_encoder.whisper.layers.13.final_layer_norm.weight": "model-00002-of-00002.safetensors",
99
+ "audio_encoder.whisper.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
100
+ "audio_encoder.whisper.layers.13.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
101
+ "audio_encoder.whisper.layers.13.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
102
+ "audio_encoder.whisper.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
103
+ "audio_encoder.whisper.layers.13.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
104
+ "audio_encoder.whisper.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
105
+ "audio_encoder.whisper.layers.13.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
106
+ "audio_encoder.whisper.layers.13.self_attn_layer_norm.bias": "model-00002-of-00002.safetensors",
107
+ "audio_encoder.whisper.layers.13.self_attn_layer_norm.weight": "model-00002-of-00002.safetensors",
108
+ "audio_encoder.whisper.layers.14.fc1.bias": "model-00002-of-00002.safetensors",
109
+ "audio_encoder.whisper.layers.14.fc1.weight": "model-00002-of-00002.safetensors",
110
+ "audio_encoder.whisper.layers.14.fc2.bias": "model-00001-of-00002.safetensors",
111
+ "audio_encoder.whisper.layers.14.fc2.weight": "model-00001-of-00002.safetensors",
112
+ "audio_encoder.whisper.layers.14.final_layer_norm.bias": "model-00002-of-00002.safetensors",
113
+ "audio_encoder.whisper.layers.14.final_layer_norm.weight": "model-00002-of-00002.safetensors",
114
+ "audio_encoder.whisper.layers.14.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
115
+ "audio_encoder.whisper.layers.14.self_attn.out_proj.bias": "model-00002-of-00002.safetensors",
116
+ "audio_encoder.whisper.layers.14.self_attn.out_proj.weight": "model-00002-of-00002.safetensors",
117
+ "audio_encoder.whisper.layers.14.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
118
+ "audio_encoder.whisper.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
119
+ "audio_encoder.whisper.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
120
+ "audio_encoder.whisper.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
121
+ "audio_encoder.whisper.layers.14.self_attn_layer_norm.bias": "model-00002-of-00002.safetensors",
122
+ "audio_encoder.whisper.layers.14.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
123
+ "audio_encoder.whisper.layers.15.fc1.bias": "model-00002-of-00002.safetensors",
124
+ "audio_encoder.whisper.layers.15.fc1.weight": "model-00001-of-00002.safetensors",
125
+ "audio_encoder.whisper.layers.15.fc2.bias": "model-00001-of-00002.safetensors",
126
+ "audio_encoder.whisper.layers.15.fc2.weight": "model-00001-of-00002.safetensors",
127
+ "audio_encoder.whisper.layers.15.final_layer_norm.bias": "model-00001-of-00002.safetensors",
128
+ "audio_encoder.whisper.layers.15.final_layer_norm.weight": "model-00002-of-00002.safetensors",
129
+ "audio_encoder.whisper.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
130
+ "audio_encoder.whisper.layers.15.self_attn.out_proj.bias": "model-00002-of-00002.safetensors",
131
+ "audio_encoder.whisper.layers.15.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
132
+ "audio_encoder.whisper.layers.15.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
133
+ "audio_encoder.whisper.layers.15.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
134
+ "audio_encoder.whisper.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
135
+ "audio_encoder.whisper.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
136
+ "audio_encoder.whisper.layers.15.self_attn_layer_norm.bias": "model-00002-of-00002.safetensors",
137
+ "audio_encoder.whisper.layers.15.self_attn_layer_norm.weight": "model-00002-of-00002.safetensors",
138
+ "audio_encoder.whisper.layers.16.fc1.bias": "model-00001-of-00002.safetensors",
139
+ "audio_encoder.whisper.layers.16.fc1.weight": "model-00001-of-00002.safetensors",
140
+ "audio_encoder.whisper.layers.16.fc2.bias": "model-00001-of-00002.safetensors",
141
+ "audio_encoder.whisper.layers.16.fc2.weight": "model-00001-of-00002.safetensors",
142
+ "audio_encoder.whisper.layers.16.final_layer_norm.bias": "model-00002-of-00002.safetensors",
143
+ "audio_encoder.whisper.layers.16.final_layer_norm.weight": "model-00001-of-00002.safetensors",
144
+ "audio_encoder.whisper.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
145
+ "audio_encoder.whisper.layers.16.self_attn.out_proj.bias": "model-00002-of-00002.safetensors",
146
+ "audio_encoder.whisper.layers.16.self_attn.out_proj.weight": "model-00002-of-00002.safetensors",
147
+ "audio_encoder.whisper.layers.16.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
148
+ "audio_encoder.whisper.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
149
+ "audio_encoder.whisper.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
150
+ "audio_encoder.whisper.layers.16.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
151
+ "audio_encoder.whisper.layers.16.self_attn_layer_norm.bias": "model-00002-of-00002.safetensors",
152
+ "audio_encoder.whisper.layers.16.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
153
+ "audio_encoder.whisper.layers.17.fc1.bias": "model-00002-of-00002.safetensors",
154
+ "audio_encoder.whisper.layers.17.fc1.weight": "model-00002-of-00002.safetensors",
155
+ "audio_encoder.whisper.layers.17.fc2.bias": "model-00002-of-00002.safetensors",
156
+ "audio_encoder.whisper.layers.17.fc2.weight": "model-00001-of-00002.safetensors",
157
+ "audio_encoder.whisper.layers.17.final_layer_norm.bias": "model-00001-of-00002.safetensors",
158
+ "audio_encoder.whisper.layers.17.final_layer_norm.weight": "model-00001-of-00002.safetensors",
159
+ "audio_encoder.whisper.layers.17.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
160
+ "audio_encoder.whisper.layers.17.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
161
+ "audio_encoder.whisper.layers.17.self_attn.out_proj.weight": "model-00002-of-00002.safetensors",
162
+ "audio_encoder.whisper.layers.17.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
163
+ "audio_encoder.whisper.layers.17.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
164
+ "audio_encoder.whisper.layers.17.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
165
+ "audio_encoder.whisper.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
166
+ "audio_encoder.whisper.layers.17.self_attn_layer_norm.bias": "model-00002-of-00002.safetensors",
167
+ "audio_encoder.whisper.layers.17.self_attn_layer_norm.weight": "model-00002-of-00002.safetensors",
168
+ "audio_encoder.whisper.layers.18.fc1.bias": "model-00001-of-00002.safetensors",
169
+ "audio_encoder.whisper.layers.18.fc1.weight": "model-00001-of-00002.safetensors",
170
+ "audio_encoder.whisper.layers.18.fc2.bias": "model-00001-of-00002.safetensors",
171
+ "audio_encoder.whisper.layers.18.fc2.weight": "model-00001-of-00002.safetensors",
172
+ "audio_encoder.whisper.layers.18.final_layer_norm.bias": "model-00001-of-00002.safetensors",
173
+ "audio_encoder.whisper.layers.18.final_layer_norm.weight": "model-00001-of-00002.safetensors",
174
+ "audio_encoder.whisper.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
175
+ "audio_encoder.whisper.layers.18.self_attn.out_proj.bias": "model-00002-of-00002.safetensors",
176
+ "audio_encoder.whisper.layers.18.self_attn.out_proj.weight": "model-00002-of-00002.safetensors",
177
+ "audio_encoder.whisper.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
178
+ "audio_encoder.whisper.layers.18.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
179
+ "audio_encoder.whisper.layers.18.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
180
+ "audio_encoder.whisper.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
181
+ "audio_encoder.whisper.layers.18.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
182
+ "audio_encoder.whisper.layers.18.self_attn_layer_norm.weight": "model-00002-of-00002.safetensors",
183
+ "audio_encoder.whisper.layers.19.fc1.bias": "model-00002-of-00002.safetensors",
184
+ "audio_encoder.whisper.layers.19.fc1.weight": "model-00002-of-00002.safetensors",
185
+ "audio_encoder.whisper.layers.19.fc2.bias": "model-00002-of-00002.safetensors",
186
+ "audio_encoder.whisper.layers.19.fc2.weight": "model-00001-of-00002.safetensors",
187
+ "audio_encoder.whisper.layers.19.final_layer_norm.bias": "model-00001-of-00002.safetensors",
188
+ "audio_encoder.whisper.layers.19.final_layer_norm.weight": "model-00001-of-00002.safetensors",
189
+ "audio_encoder.whisper.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
190
+ "audio_encoder.whisper.layers.19.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
191
+ "audio_encoder.whisper.layers.19.self_attn.out_proj.weight": "model-00002-of-00002.safetensors",
192
+ "audio_encoder.whisper.layers.19.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
193
+ "audio_encoder.whisper.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
194
+ "audio_encoder.whisper.layers.19.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
195
+ "audio_encoder.whisper.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
196
+ "audio_encoder.whisper.layers.19.self_attn_layer_norm.bias": "model-00002-of-00002.safetensors",
197
+ "audio_encoder.whisper.layers.19.self_attn_layer_norm.weight": "model-00002-of-00002.safetensors",
198
+ "audio_encoder.whisper.layers.2.fc1.bias": "model-00002-of-00002.safetensors",
199
+ "audio_encoder.whisper.layers.2.fc1.weight": "model-00002-of-00002.safetensors",
200
+ "audio_encoder.whisper.layers.2.fc2.bias": "model-00001-of-00002.safetensors",
201
+ "audio_encoder.whisper.layers.2.fc2.weight": "model-00002-of-00002.safetensors",
202
+ "audio_encoder.whisper.layers.2.final_layer_norm.bias": "model-00002-of-00002.safetensors",
203
+ "audio_encoder.whisper.layers.2.final_layer_norm.weight": "model-00002-of-00002.safetensors",
204
+ "audio_encoder.whisper.layers.2.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
205
+ "audio_encoder.whisper.layers.2.self_attn.out_proj.bias": "model-00002-of-00002.safetensors",
206
+ "audio_encoder.whisper.layers.2.self_attn.out_proj.weight": "model-00002-of-00002.safetensors",
207
+ "audio_encoder.whisper.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
208
+ "audio_encoder.whisper.layers.2.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
209
+ "audio_encoder.whisper.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
210
+ "audio_encoder.whisper.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
211
+ "audio_encoder.whisper.layers.2.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
212
+ "audio_encoder.whisper.layers.2.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
213
+ "audio_encoder.whisper.layers.20.fc1.bias": "model-00002-of-00002.safetensors",
214
+ "audio_encoder.whisper.layers.20.fc1.weight": "model-00002-of-00002.safetensors",
215
+ "audio_encoder.whisper.layers.20.fc2.bias": "model-00002-of-00002.safetensors",
216
+ "audio_encoder.whisper.layers.20.fc2.weight": "model-00002-of-00002.safetensors",
217
+ "audio_encoder.whisper.layers.20.final_layer_norm.bias": "model-00002-of-00002.safetensors",
218
+ "audio_encoder.whisper.layers.20.final_layer_norm.weight": "model-00001-of-00002.safetensors",
219
+ "audio_encoder.whisper.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
220
+ "audio_encoder.whisper.layers.20.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
221
+ "audio_encoder.whisper.layers.20.self_attn.out_proj.weight": "model-00002-of-00002.safetensors",
222
+ "audio_encoder.whisper.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
223
+ "audio_encoder.whisper.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
224
+ "audio_encoder.whisper.layers.20.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
225
+ "audio_encoder.whisper.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
226
+ "audio_encoder.whisper.layers.20.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
227
+ "audio_encoder.whisper.layers.20.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
228
+ "audio_encoder.whisper.layers.21.fc1.bias": "model-00001-of-00002.safetensors",
229
+ "audio_encoder.whisper.layers.21.fc1.weight": "model-00001-of-00002.safetensors",
230
+ "audio_encoder.whisper.layers.21.fc2.bias": "model-00002-of-00002.safetensors",
231
+ "audio_encoder.whisper.layers.21.fc2.weight": "model-00001-of-00002.safetensors",
232
+ "audio_encoder.whisper.layers.21.final_layer_norm.bias": "model-00001-of-00002.safetensors",
233
+ "audio_encoder.whisper.layers.21.final_layer_norm.weight": "model-00001-of-00002.safetensors",
234
+ "audio_encoder.whisper.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
235
+ "audio_encoder.whisper.layers.21.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
236
+ "audio_encoder.whisper.layers.21.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
237
+ "audio_encoder.whisper.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
238
+ "audio_encoder.whisper.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
239
+ "audio_encoder.whisper.layers.21.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
240
+ "audio_encoder.whisper.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
241
+ "audio_encoder.whisper.layers.21.self_attn_layer_norm.bias": "model-00002-of-00002.safetensors",
242
+ "audio_encoder.whisper.layers.21.self_attn_layer_norm.weight": "model-00002-of-00002.safetensors",
243
+ "audio_encoder.whisper.layers.22.fc1.bias": "model-00002-of-00002.safetensors",
244
+ "audio_encoder.whisper.layers.22.fc1.weight": "model-00001-of-00002.safetensors",
245
+ "audio_encoder.whisper.layers.22.fc2.bias": "model-00001-of-00002.safetensors",
246
+ "audio_encoder.whisper.layers.22.fc2.weight": "model-00002-of-00002.safetensors",
247
+ "audio_encoder.whisper.layers.22.final_layer_norm.bias": "model-00001-of-00002.safetensors",
248
+ "audio_encoder.whisper.layers.22.final_layer_norm.weight": "model-00001-of-00002.safetensors",
249
+ "audio_encoder.whisper.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
250
+ "audio_encoder.whisper.layers.22.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
251
+ "audio_encoder.whisper.layers.22.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
252
+ "audio_encoder.whisper.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
253
+ "audio_encoder.whisper.layers.22.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
254
+ "audio_encoder.whisper.layers.22.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
255
+ "audio_encoder.whisper.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
256
+ "audio_encoder.whisper.layers.22.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
257
+ "audio_encoder.whisper.layers.22.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
258
+ "audio_encoder.whisper.layers.23.fc1.bias": "model-00002-of-00002.safetensors",
259
+ "audio_encoder.whisper.layers.23.fc1.weight": "model-00001-of-00002.safetensors",
260
+ "audio_encoder.whisper.layers.23.fc2.bias": "model-00001-of-00002.safetensors",
261
+ "audio_encoder.whisper.layers.23.fc2.weight": "model-00001-of-00002.safetensors",
262
+ "audio_encoder.whisper.layers.23.final_layer_norm.bias": "model-00002-of-00002.safetensors",
263
+ "audio_encoder.whisper.layers.23.final_layer_norm.weight": "model-00001-of-00002.safetensors",
264
+ "audio_encoder.whisper.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
265
+ "audio_encoder.whisper.layers.23.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
266
+ "audio_encoder.whisper.layers.23.self_attn.out_proj.weight": "model-00002-of-00002.safetensors",
267
+ "audio_encoder.whisper.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
268
+ "audio_encoder.whisper.layers.23.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
269
+ "audio_encoder.whisper.layers.23.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
270
+ "audio_encoder.whisper.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
271
+ "audio_encoder.whisper.layers.23.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
272
+ "audio_encoder.whisper.layers.23.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
273
+ "audio_encoder.whisper.layers.24.fc1.bias": "model-00002-of-00002.safetensors",
274
+ "audio_encoder.whisper.layers.24.fc1.weight": "model-00001-of-00002.safetensors",
275
+ "audio_encoder.whisper.layers.24.fc2.bias": "model-00001-of-00002.safetensors",
276
+ "audio_encoder.whisper.layers.24.fc2.weight": "model-00001-of-00002.safetensors",
277
+ "audio_encoder.whisper.layers.24.final_layer_norm.bias": "model-00002-of-00002.safetensors",
278
+ "audio_encoder.whisper.layers.24.final_layer_norm.weight": "model-00001-of-00002.safetensors",
279
+ "audio_encoder.whisper.layers.24.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
280
+ "audio_encoder.whisper.layers.24.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
281
+ "audio_encoder.whisper.layers.24.self_attn.out_proj.weight": "model-00002-of-00002.safetensors",
282
+ "audio_encoder.whisper.layers.24.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
283
+ "audio_encoder.whisper.layers.24.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
284
+ "audio_encoder.whisper.layers.24.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
285
+ "audio_encoder.whisper.layers.24.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
286
+ "audio_encoder.whisper.layers.24.self_attn_layer_norm.bias": "model-00002-of-00002.safetensors",
287
+ "audio_encoder.whisper.layers.24.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
288
+ "audio_encoder.whisper.layers.25.fc1.bias": "model-00002-of-00002.safetensors",
289
+ "audio_encoder.whisper.layers.25.fc1.weight": "model-00002-of-00002.safetensors",
290
+ "audio_encoder.whisper.layers.25.fc2.bias": "model-00001-of-00002.safetensors",
291
+ "audio_encoder.whisper.layers.25.fc2.weight": "model-00001-of-00002.safetensors",
292
+ "audio_encoder.whisper.layers.25.final_layer_norm.bias": "model-00001-of-00002.safetensors",
293
+ "audio_encoder.whisper.layers.25.final_layer_norm.weight": "model-00001-of-00002.safetensors",
294
+ "audio_encoder.whisper.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
295
+ "audio_encoder.whisper.layers.25.self_attn.out_proj.bias": "model-00002-of-00002.safetensors",
296
+ "audio_encoder.whisper.layers.25.self_attn.out_proj.weight": "model-00002-of-00002.safetensors",
297
+ "audio_encoder.whisper.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
298
+ "audio_encoder.whisper.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
299
+ "audio_encoder.whisper.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
300
+ "audio_encoder.whisper.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
301
+ "audio_encoder.whisper.layers.25.self_attn_layer_norm.bias": "model-00002-of-00002.safetensors",
302
+ "audio_encoder.whisper.layers.25.self_attn_layer_norm.weight": "model-00002-of-00002.safetensors",
303
+ "audio_encoder.whisper.layers.26.fc1.bias": "model-00001-of-00002.safetensors",
304
+ "audio_encoder.whisper.layers.26.fc1.weight": "model-00001-of-00002.safetensors",
305
+ "audio_encoder.whisper.layers.26.fc2.bias": "model-00001-of-00002.safetensors",
306
+ "audio_encoder.whisper.layers.26.fc2.weight": "model-00001-of-00002.safetensors",
307
+ "audio_encoder.whisper.layers.26.final_layer_norm.bias": "model-00001-of-00002.safetensors",
308
+ "audio_encoder.whisper.layers.26.final_layer_norm.weight": "model-00002-of-00002.safetensors",
309
+ "audio_encoder.whisper.layers.26.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
310
+ "audio_encoder.whisper.layers.26.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
311
+ "audio_encoder.whisper.layers.26.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
312
+ "audio_encoder.whisper.layers.26.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
313
+ "audio_encoder.whisper.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
314
+ "audio_encoder.whisper.layers.26.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
315
+ "audio_encoder.whisper.layers.26.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
316
+ "audio_encoder.whisper.layers.26.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
317
+ "audio_encoder.whisper.layers.26.self_attn_layer_norm.weight": "model-00002-of-00002.safetensors",
318
+ "audio_encoder.whisper.layers.27.fc1.bias": "model-00002-of-00002.safetensors",
319
+ "audio_encoder.whisper.layers.27.fc1.weight": "model-00002-of-00002.safetensors",
320
+ "audio_encoder.whisper.layers.27.fc2.bias": "model-00001-of-00002.safetensors",
321
+ "audio_encoder.whisper.layers.27.fc2.weight": "model-00002-of-00002.safetensors",
322
+ "audio_encoder.whisper.layers.27.final_layer_norm.bias": "model-00001-of-00002.safetensors",
323
+ "audio_encoder.whisper.layers.27.final_layer_norm.weight": "model-00002-of-00002.safetensors",
324
+ "audio_encoder.whisper.layers.27.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
325
+ "audio_encoder.whisper.layers.27.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
326
+ "audio_encoder.whisper.layers.27.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
327
+ "audio_encoder.whisper.layers.27.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
328
+ "audio_encoder.whisper.layers.27.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
329
+ "audio_encoder.whisper.layers.27.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
330
+ "audio_encoder.whisper.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
331
+ "audio_encoder.whisper.layers.27.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
332
+ "audio_encoder.whisper.layers.27.self_attn_layer_norm.weight": "model-00002-of-00002.safetensors",
333
+ "audio_encoder.whisper.layers.28.fc1.bias": "model-00001-of-00002.safetensors",
334
+ "audio_encoder.whisper.layers.28.fc1.weight": "model-00001-of-00002.safetensors",
335
+ "audio_encoder.whisper.layers.28.fc2.bias": "model-00001-of-00002.safetensors",
336
+ "audio_encoder.whisper.layers.28.fc2.weight": "model-00002-of-00002.safetensors",
337
+ "audio_encoder.whisper.layers.28.final_layer_norm.bias": "model-00001-of-00002.safetensors",
338
+ "audio_encoder.whisper.layers.28.final_layer_norm.weight": "model-00002-of-00002.safetensors",
339
+ "audio_encoder.whisper.layers.28.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
340
+ "audio_encoder.whisper.layers.28.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
341
+ "audio_encoder.whisper.layers.28.self_attn.out_proj.weight": "model-00002-of-00002.safetensors",
342
+ "audio_encoder.whisper.layers.28.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
343
+ "audio_encoder.whisper.layers.28.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
344
+ "audio_encoder.whisper.layers.28.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
345
+ "audio_encoder.whisper.layers.28.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
346
+ "audio_encoder.whisper.layers.28.self_attn_layer_norm.bias": "model-00002-of-00002.safetensors",
347
+ "audio_encoder.whisper.layers.28.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
348
+ "audio_encoder.whisper.layers.29.fc1.bias": "model-00001-of-00002.safetensors",
349
+ "audio_encoder.whisper.layers.29.fc1.weight": "model-00002-of-00002.safetensors",
350
+ "audio_encoder.whisper.layers.29.fc2.bias": "model-00001-of-00002.safetensors",
351
+ "audio_encoder.whisper.layers.29.fc2.weight": "model-00002-of-00002.safetensors",
352
+ "audio_encoder.whisper.layers.29.final_layer_norm.bias": "model-00002-of-00002.safetensors",
353
+ "audio_encoder.whisper.layers.29.final_layer_norm.weight": "model-00001-of-00002.safetensors",
354
+ "audio_encoder.whisper.layers.29.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
355
+ "audio_encoder.whisper.layers.29.self_attn.out_proj.bias": "model-00002-of-00002.safetensors",
356
+ "audio_encoder.whisper.layers.29.self_attn.out_proj.weight": "model-00002-of-00002.safetensors",
357
+ "audio_encoder.whisper.layers.29.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
358
+ "audio_encoder.whisper.layers.29.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
359
+ "audio_encoder.whisper.layers.29.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
360
+ "audio_encoder.whisper.layers.29.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
361
+ "audio_encoder.whisper.layers.29.self_attn_layer_norm.bias": "model-00002-of-00002.safetensors",
362
+ "audio_encoder.whisper.layers.29.self_attn_layer_norm.weight": "model-00002-of-00002.safetensors",
363
+ "audio_encoder.whisper.layers.3.fc1.bias": "model-00002-of-00002.safetensors",
364
+ "audio_encoder.whisper.layers.3.fc1.weight": "model-00002-of-00002.safetensors",
365
+ "audio_encoder.whisper.layers.3.fc2.bias": "model-00001-of-00002.safetensors",
366
+ "audio_encoder.whisper.layers.3.fc2.weight": "model-00001-of-00002.safetensors",
367
+ "audio_encoder.whisper.layers.3.final_layer_norm.bias": "model-00002-of-00002.safetensors",
368
+ "audio_encoder.whisper.layers.3.final_layer_norm.weight": "model-00001-of-00002.safetensors",
369
+ "audio_encoder.whisper.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
370
+ "audio_encoder.whisper.layers.3.self_attn.out_proj.bias": "model-00002-of-00002.safetensors",
371
+ "audio_encoder.whisper.layers.3.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
372
+ "audio_encoder.whisper.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
373
+ "audio_encoder.whisper.layers.3.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
374
+ "audio_encoder.whisper.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
375
+ "audio_encoder.whisper.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
376
+ "audio_encoder.whisper.layers.3.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
377
+ "audio_encoder.whisper.layers.3.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
378
+ "audio_encoder.whisper.layers.30.fc1.bias": "model-00002-of-00002.safetensors",
379
+ "audio_encoder.whisper.layers.30.fc1.weight": "model-00001-of-00002.safetensors",
380
+ "audio_encoder.whisper.layers.30.fc2.bias": "model-00002-of-00002.safetensors",
381
+ "audio_encoder.whisper.layers.30.fc2.weight": "model-00002-of-00002.safetensors",
382
+ "audio_encoder.whisper.layers.30.final_layer_norm.bias": "model-00001-of-00002.safetensors",
383
+ "audio_encoder.whisper.layers.30.final_layer_norm.weight": "model-00002-of-00002.safetensors",
384
+ "audio_encoder.whisper.layers.30.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
385
+ "audio_encoder.whisper.layers.30.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
386
+ "audio_encoder.whisper.layers.30.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
387
+ "audio_encoder.whisper.layers.30.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
388
+ "audio_encoder.whisper.layers.30.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
389
+ "audio_encoder.whisper.layers.30.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
390
+ "audio_encoder.whisper.layers.30.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
391
+ "audio_encoder.whisper.layers.30.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
392
+ "audio_encoder.whisper.layers.30.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
393
+ "audio_encoder.whisper.layers.31.fc1.bias": "model-00002-of-00002.safetensors",
394
+ "audio_encoder.whisper.layers.31.fc1.weight": "model-00001-of-00002.safetensors",
395
+ "audio_encoder.whisper.layers.31.fc2.bias": "model-00002-of-00002.safetensors",
396
+ "audio_encoder.whisper.layers.31.fc2.weight": "model-00001-of-00002.safetensors",
397
+ "audio_encoder.whisper.layers.31.final_layer_norm.bias": "model-00002-of-00002.safetensors",
398
+ "audio_encoder.whisper.layers.31.final_layer_norm.weight": "model-00002-of-00002.safetensors",
399
+ "audio_encoder.whisper.layers.31.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
400
+ "audio_encoder.whisper.layers.31.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
401
+ "audio_encoder.whisper.layers.31.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
402
+ "audio_encoder.whisper.layers.31.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
403
+ "audio_encoder.whisper.layers.31.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
404
+ "audio_encoder.whisper.layers.31.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
405
+ "audio_encoder.whisper.layers.31.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
406
+ "audio_encoder.whisper.layers.31.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
407
+ "audio_encoder.whisper.layers.31.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
408
+ "audio_encoder.whisper.layers.4.fc1.bias": "model-00001-of-00002.safetensors",
409
+ "audio_encoder.whisper.layers.4.fc1.weight": "model-00001-of-00002.safetensors",
410
+ "audio_encoder.whisper.layers.4.fc2.bias": "model-00001-of-00002.safetensors",
411
+ "audio_encoder.whisper.layers.4.fc2.weight": "model-00002-of-00002.safetensors",
412
+ "audio_encoder.whisper.layers.4.final_layer_norm.bias": "model-00001-of-00002.safetensors",
413
+ "audio_encoder.whisper.layers.4.final_layer_norm.weight": "model-00001-of-00002.safetensors",
414
+ "audio_encoder.whisper.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
415
+ "audio_encoder.whisper.layers.4.self_attn.out_proj.bias": "model-00002-of-00002.safetensors",
416
+ "audio_encoder.whisper.layers.4.self_attn.out_proj.weight": "model-00002-of-00002.safetensors",
417
+ "audio_encoder.whisper.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
418
+ "audio_encoder.whisper.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
419
+ "audio_encoder.whisper.layers.4.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
420
+ "audio_encoder.whisper.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
421
+ "audio_encoder.whisper.layers.4.self_attn_layer_norm.bias": "model-00002-of-00002.safetensors",
422
+ "audio_encoder.whisper.layers.4.self_attn_layer_norm.weight": "model-00002-of-00002.safetensors",
423
+ "audio_encoder.whisper.layers.5.fc1.bias": "model-00002-of-00002.safetensors",
424
+ "audio_encoder.whisper.layers.5.fc1.weight": "model-00002-of-00002.safetensors",
425
+ "audio_encoder.whisper.layers.5.fc2.bias": "model-00001-of-00002.safetensors",
426
+ "audio_encoder.whisper.layers.5.fc2.weight": "model-00002-of-00002.safetensors",
427
+ "audio_encoder.whisper.layers.5.final_layer_norm.bias": "model-00001-of-00002.safetensors",
428
+ "audio_encoder.whisper.layers.5.final_layer_norm.weight": "model-00002-of-00002.safetensors",
429
+ "audio_encoder.whisper.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
430
+ "audio_encoder.whisper.layers.5.self_attn.out_proj.bias": "model-00002-of-00002.safetensors",
431
+ "audio_encoder.whisper.layers.5.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
432
+ "audio_encoder.whisper.layers.5.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
433
+ "audio_encoder.whisper.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
434
+ "audio_encoder.whisper.layers.5.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
435
+ "audio_encoder.whisper.layers.5.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
436
+ "audio_encoder.whisper.layers.5.self_attn_layer_norm.bias": "model-00002-of-00002.safetensors",
437
+ "audio_encoder.whisper.layers.5.self_attn_layer_norm.weight": "model-00002-of-00002.safetensors",
438
+ "audio_encoder.whisper.layers.6.fc1.bias": "model-00002-of-00002.safetensors",
439
+ "audio_encoder.whisper.layers.6.fc1.weight": "model-00001-of-00002.safetensors",
440
+ "audio_encoder.whisper.layers.6.fc2.bias": "model-00001-of-00002.safetensors",
441
+ "audio_encoder.whisper.layers.6.fc2.weight": "model-00002-of-00002.safetensors",
442
+ "audio_encoder.whisper.layers.6.final_layer_norm.bias": "model-00001-of-00002.safetensors",
443
+ "audio_encoder.whisper.layers.6.final_layer_norm.weight": "model-00002-of-00002.safetensors",
444
+ "audio_encoder.whisper.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
445
+ "audio_encoder.whisper.layers.6.self_attn.out_proj.bias": "model-00002-of-00002.safetensors",
446
+ "audio_encoder.whisper.layers.6.self_attn.out_proj.weight": "model-00002-of-00002.safetensors",
447
+ "audio_encoder.whisper.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
448
+ "audio_encoder.whisper.layers.6.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
449
+ "audio_encoder.whisper.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
450
+ "audio_encoder.whisper.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
451
+ "audio_encoder.whisper.layers.6.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
452
+ "audio_encoder.whisper.layers.6.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
453
+ "audio_encoder.whisper.layers.7.fc1.bias": "model-00002-of-00002.safetensors",
454
+ "audio_encoder.whisper.layers.7.fc1.weight": "model-00001-of-00002.safetensors",
455
+ "audio_encoder.whisper.layers.7.fc2.bias": "model-00002-of-00002.safetensors",
456
+ "audio_encoder.whisper.layers.7.fc2.weight": "model-00001-of-00002.safetensors",
457
+ "audio_encoder.whisper.layers.7.final_layer_norm.bias": "model-00002-of-00002.safetensors",
458
+ "audio_encoder.whisper.layers.7.final_layer_norm.weight": "model-00002-of-00002.safetensors",
459
+ "audio_encoder.whisper.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
460
+ "audio_encoder.whisper.layers.7.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
461
+ "audio_encoder.whisper.layers.7.self_attn.out_proj.weight": "model-00002-of-00002.safetensors",
462
+ "audio_encoder.whisper.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
463
+ "audio_encoder.whisper.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
464
+ "audio_encoder.whisper.layers.7.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
465
+ "audio_encoder.whisper.layers.7.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
466
+ "audio_encoder.whisper.layers.7.self_attn_layer_norm.bias": "model-00002-of-00002.safetensors",
467
+ "audio_encoder.whisper.layers.7.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
468
+ "audio_encoder.whisper.layers.8.fc1.bias": "model-00001-of-00002.safetensors",
469
+ "audio_encoder.whisper.layers.8.fc1.weight": "model-00002-of-00002.safetensors",
470
+ "audio_encoder.whisper.layers.8.fc2.bias": "model-00001-of-00002.safetensors",
471
+ "audio_encoder.whisper.layers.8.fc2.weight": "model-00001-of-00002.safetensors",
472
+ "audio_encoder.whisper.layers.8.final_layer_norm.bias": "model-00002-of-00002.safetensors",
473
+ "audio_encoder.whisper.layers.8.final_layer_norm.weight": "model-00002-of-00002.safetensors",
474
+ "audio_encoder.whisper.layers.8.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
475
+ "audio_encoder.whisper.layers.8.self_attn.out_proj.bias": "model-00002-of-00002.safetensors",
476
+ "audio_encoder.whisper.layers.8.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
477
+ "audio_encoder.whisper.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
478
+ "audio_encoder.whisper.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
479
+ "audio_encoder.whisper.layers.8.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
480
+ "audio_encoder.whisper.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
481
+ "audio_encoder.whisper.layers.8.self_attn_layer_norm.bias": "model-00002-of-00002.safetensors",
482
+ "audio_encoder.whisper.layers.8.self_attn_layer_norm.weight": "model-00002-of-00002.safetensors",
483
+ "audio_encoder.whisper.layers.9.fc1.bias": "model-00001-of-00002.safetensors",
484
+ "audio_encoder.whisper.layers.9.fc1.weight": "model-00001-of-00002.safetensors",
485
+ "audio_encoder.whisper.layers.9.fc2.bias": "model-00001-of-00002.safetensors",
486
+ "audio_encoder.whisper.layers.9.fc2.weight": "model-00001-of-00002.safetensors",
487
+ "audio_encoder.whisper.layers.9.final_layer_norm.bias": "model-00001-of-00002.safetensors",
488
+ "audio_encoder.whisper.layers.9.final_layer_norm.weight": "model-00001-of-00002.safetensors",
489
+ "audio_encoder.whisper.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
490
+ "audio_encoder.whisper.layers.9.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
491
+ "audio_encoder.whisper.layers.9.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
492
+ "audio_encoder.whisper.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
493
+ "audio_encoder.whisper.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
494
+ "audio_encoder.whisper.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
495
+ "audio_encoder.whisper.layers.9.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
496
+ "audio_encoder.whisper.layers.9.self_attn_layer_norm.bias": "model-00002-of-00002.safetensors",
497
+ "audio_encoder.whisper.layers.9.self_attn_layer_norm.weight": "model-00002-of-00002.safetensors",
498
+ "lm_head.weight": "model-00001-of-00002.safetensors",
499
+ "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
500
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
501
+ "model.layers.0.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
502
+ "model.layers.0.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
503
+ "model.layers.0.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
504
+ "model.layers.0.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
505
+ "model.layers.0.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
506
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
507
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
508
+ "model.layers.0.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
509
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
510
+ "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
511
+ "model.layers.0.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
512
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
513
+ "model.layers.1.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
514
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
515
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
516
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
517
+ "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
518
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
519
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
520
+ "model.layers.1.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
521
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
522
+ "model.layers.1.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
523
+ "model.layers.1.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
524
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00002.safetensors",
525
+ "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
526
+ "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
527
+ "model.layers.10.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
528
+ "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
529
+ "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
530
+ "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
531
+ "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
532
+ "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
533
+ "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
534
+ "model.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
535
+ "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
536
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00002.safetensors",
537
+ "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
538
+ "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
539
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
540
+ "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
541
+ "model.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
542
+ "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
543
+ "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
544
+ "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
545
+ "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
546
+ "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
547
+ "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
548
+ "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
549
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
550
+ "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
551
+ "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
552
+ "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
553
+ "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
554
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
555
+ "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
556
+ "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
557
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
558
+ "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
559
+ "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
560
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00002.safetensors",
561
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
562
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
563
+ "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
564
+ "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
565
+ "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
566
+ "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
567
+ "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
568
+ "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
569
+ "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
570
+ "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
571
+ "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
572
+ "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
573
+ "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
574
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
575
+ "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
576
+ "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
577
+ "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
578
+ "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
579
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
580
+ "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
581
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
582
+ "model.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
583
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
584
+ "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
585
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
586
+ "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
587
+ "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
588
+ "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
589
+ "model.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
590
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
591
+ "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
592
+ "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
593
+ "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
594
+ "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
595
+ "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
596
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00002.safetensors",
597
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
598
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
599
+ "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
600
+ "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
601
+ "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
602
+ "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
603
+ "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
604
+ "model.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
605
+ "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
606
+ "model.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
607
+ "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
608
+ "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
609
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
610
+ "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
611
+ "model.layers.17.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
612
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
613
+ "model.layers.17.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
614
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
615
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
616
+ "model.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
617
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
618
+ "model.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
619
+ "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
620
+ "model.layers.18.input_layernorm.weight": "model-00002-of-00002.safetensors",
621
+ "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
622
+ "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
623
+ "model.layers.18.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
624
+ "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
625
+ "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
626
+ "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
627
+ "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
628
+ "model.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
629
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
630
+ "model.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
631
+ "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
632
+ "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors",
633
+ "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
634
+ "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
635
+ "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
636
+ "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
637
+ "model.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
638
+ "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
639
+ "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
640
+ "model.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
641
+ "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
642
+ "model.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
643
+ "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
644
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
645
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
646
+ "model.layers.2.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
647
+ "model.layers.2.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
648
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
649
+ "model.layers.2.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
650
+ "model.layers.2.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
651
+ "model.layers.2.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
652
+ "model.layers.2.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
653
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
654
+ "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
655
+ "model.layers.2.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
656
+ "model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors",
657
+ "model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
658
+ "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
659
+ "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
660
+ "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
661
+ "model.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
662
+ "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
663
+ "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
664
+ "model.layers.20.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
665
+ "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
666
+ "model.layers.20.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
667
+ "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
668
+ "model.layers.21.input_layernorm.weight": "model-00001-of-00002.safetensors",
669
+ "model.layers.21.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
670
+ "model.layers.21.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
671
+ "model.layers.21.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
672
+ "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
673
+ "model.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
674
+ "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
675
+ "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
676
+ "model.layers.21.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
677
+ "model.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
678
+ "model.layers.21.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
679
+ "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
680
+ "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors",
681
+ "model.layers.22.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
682
+ "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
683
+ "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
684
+ "model.layers.22.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
685
+ "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
686
+ "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
687
+ "model.layers.22.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
688
+ "model.layers.22.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
689
+ "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
690
+ "model.layers.22.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
691
+ "model.layers.22.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
692
+ "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors",
693
+ "model.layers.23.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
694
+ "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
695
+ "model.layers.23.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
696
+ "model.layers.23.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
697
+ "model.layers.23.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
698
+ "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
699
+ "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
700
+ "model.layers.23.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
701
+ "model.layers.23.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
702
+ "model.layers.23.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
703
+ "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
704
+ "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors",
705
+ "model.layers.24.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
706
+ "model.layers.24.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
707
+ "model.layers.24.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
708
+ "model.layers.24.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
709
+ "model.layers.24.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
710
+ "model.layers.24.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
711
+ "model.layers.24.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
712
+ "model.layers.24.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
713
+ "model.layers.24.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
714
+ "model.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
715
+ "model.layers.24.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
716
+ "model.layers.25.input_layernorm.weight": "model-00001-of-00002.safetensors",
717
+ "model.layers.25.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
718
+ "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
719
+ "model.layers.25.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
720
+ "model.layers.25.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
721
+ "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
722
+ "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
723
+ "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
724
+ "model.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
725
+ "model.layers.25.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
726
+ "model.layers.25.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
727
+ "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
728
+ "model.layers.26.input_layernorm.weight": "model-00001-of-00002.safetensors",
729
+ "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
730
+ "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
731
+ "model.layers.26.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
732
+ "model.layers.26.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
733
+ "model.layers.26.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
734
+ "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
735
+ "model.layers.26.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
736
+ "model.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
737
+ "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
738
+ "model.layers.26.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
739
+ "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
740
+ "model.layers.27.input_layernorm.weight": "model-00001-of-00002.safetensors",
741
+ "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
742
+ "model.layers.27.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
743
+ "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
744
+ "model.layers.27.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
745
+ "model.layers.27.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
746
+ "model.layers.27.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
747
+ "model.layers.27.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
748
+ "model.layers.27.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
749
+ "model.layers.27.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
750
+ "model.layers.27.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
751
+ "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
752
+ "model.layers.28.input_layernorm.weight": "model-00001-of-00002.safetensors",
753
+ "model.layers.28.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
754
+ "model.layers.28.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
755
+ "model.layers.28.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
756
+ "model.layers.28.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
757
+ "model.layers.28.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
758
+ "model.layers.28.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
759
+ "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
760
+ "model.layers.28.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
761
+ "model.layers.28.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
762
+ "model.layers.28.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
763
+ "model.layers.28.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
764
+ "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors",
765
+ "model.layers.29.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
766
+ "model.layers.29.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
767
+ "model.layers.29.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
768
+ "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
769
+ "model.layers.29.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
770
+ "model.layers.29.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
771
+ "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
772
+ "model.layers.29.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
773
+ "model.layers.29.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
774
+ "model.layers.29.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
775
+ "model.layers.29.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
776
+ "model.layers.3.input_layernorm.weight": "model-00002-of-00002.safetensors",
777
+ "model.layers.3.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
778
+ "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
779
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
780
+ "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
781
+ "model.layers.3.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
782
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
783
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
784
+ "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
785
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
786
+ "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
787
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
788
+ "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors",
789
+ "model.layers.30.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
790
+ "model.layers.30.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
791
+ "model.layers.30.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
792
+ "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
793
+ "model.layers.30.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
794
+ "model.layers.30.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
795
+ "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
796
+ "model.layers.30.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
797
+ "model.layers.30.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
798
+ "model.layers.30.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
799
+ "model.layers.30.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
800
+ "model.layers.31.input_layernorm.weight": "model-00001-of-00002.safetensors",
801
+ "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
802
+ "model.layers.31.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
803
+ "model.layers.31.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
804
+ "model.layers.31.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
805
+ "model.layers.31.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
806
+ "model.layers.31.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
807
+ "model.layers.31.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
808
+ "model.layers.31.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
809
+ "model.layers.31.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
810
+ "model.layers.31.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
811
+ "model.layers.31.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
812
+ "model.layers.32.input_layernorm.weight": "model-00002-of-00002.safetensors",
813
+ "model.layers.32.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
814
+ "model.layers.32.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
815
+ "model.layers.32.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
816
+ "model.layers.32.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
817
+ "model.layers.32.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
818
+ "model.layers.32.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
819
+ "model.layers.32.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
820
+ "model.layers.32.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
821
+ "model.layers.32.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
822
+ "model.layers.32.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
823
+ "model.layers.32.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
824
+ "model.layers.33.input_layernorm.weight": "model-00002-of-00002.safetensors",
825
+ "model.layers.33.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
826
+ "model.layers.33.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
827
+ "model.layers.33.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
828
+ "model.layers.33.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
829
+ "model.layers.33.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
830
+ "model.layers.33.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
831
+ "model.layers.33.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
832
+ "model.layers.33.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
833
+ "model.layers.33.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
834
+ "model.layers.33.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
835
+ "model.layers.33.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
836
+ "model.layers.34.input_layernorm.weight": "model-00001-of-00002.safetensors",
837
+ "model.layers.34.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
838
+ "model.layers.34.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
839
+ "model.layers.34.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
840
+ "model.layers.34.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
841
+ "model.layers.34.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
842
+ "model.layers.34.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
843
+ "model.layers.34.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
844
+ "model.layers.34.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
845
+ "model.layers.34.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
846
+ "model.layers.34.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
847
+ "model.layers.34.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
848
+ "model.layers.35.input_layernorm.weight": "model-00001-of-00002.safetensors",
849
+ "model.layers.35.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
850
+ "model.layers.35.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
851
+ "model.layers.35.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
852
+ "model.layers.35.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
853
+ "model.layers.35.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
854
+ "model.layers.35.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
855
+ "model.layers.35.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
856
+ "model.layers.35.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
857
+ "model.layers.35.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
858
+ "model.layers.35.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
859
+ "model.layers.35.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
860
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
861
+ "model.layers.4.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
862
+ "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
863
+ "model.layers.4.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
864
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
865
+ "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
866
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
867
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
868
+ "model.layers.4.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
869
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
870
+ "model.layers.4.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
871
+ "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
872
+ "model.layers.5.input_layernorm.weight": "model-00002-of-00002.safetensors",
873
+ "model.layers.5.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
874
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
875
+ "model.layers.5.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
876
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
877
+ "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
878
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
879
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
880
+ "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
881
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
882
+ "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
883
+ "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
884
+ "model.layers.6.input_layernorm.weight": "model-00002-of-00002.safetensors",
885
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
886
+ "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
887
+ "model.layers.6.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
888
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
889
+ "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
890
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
891
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
892
+ "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
893
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
894
+ "model.layers.6.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
895
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
896
+ "model.layers.7.input_layernorm.weight": "model-00002-of-00002.safetensors",
897
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
898
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
899
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
900
+ "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
901
+ "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
902
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
903
+ "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
904
+ "model.layers.7.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
905
+ "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
906
+ "model.layers.7.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
907
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
908
+ "model.layers.8.input_layernorm.weight": "model-00002-of-00002.safetensors",
909
+ "model.layers.8.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
910
+ "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
911
+ "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
912
+ "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
913
+ "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
914
+ "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
915
+ "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
916
+ "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
917
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
918
+ "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
919
+ "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
920
+ "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
921
+ "model.layers.9.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
922
+ "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
923
+ "model.layers.9.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
924
+ "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
925
+ "model.layers.9.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
926
+ "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
927
+ "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
928
+ "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
929
+ "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
930
+ "model.layers.9.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
931
+ "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
932
+ "model.norm.weight": "model-00002-of-00002.safetensors"
933
+ }
934
+ }
modeling_arkasr.py ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import Optional, List, Tuple, Union, Dict
4
+
5
+ import torch
6
+ from torch import Tensor, nn
7
+ from transformers import Qwen2ForCausalLM
8
+ from transformers.modeling_outputs import CausalLMOutputWithPast
9
+
10
+ from .configuration_arkasr import ArkasrConfig
11
+ from .modeling_audio import WhisperSpecialEncoder
12
+
13
+
14
class AudioMLPAdapter(nn.Module):
    """Audio front-end: encoder, LayerNorm, frame merging and a two-layer MLP.

    The encoder output is normalised, ``merge_factor`` consecutive frames are
    concatenated along the feature axis, and the merged frames are projected
    to ``config.hidden_size`` so they can be used as LLM input embeddings.
    """

    def __init__(self, config: ArkasrConfig):
        super().__init__()
        encoder_cfg = config.whisper_config
        self.merge_factor = int(config.merge_factor)

        # Audio encoder (RoPE is opt-in through `config.use_rope`).
        self.whisper = WhisperSpecialEncoder(
            encoder_cfg,
            use_rope=getattr(config, "use_rope", False),
        )
        # Turn off the encoder's built-in final LayerNorm; this adapter applies
        # its own `layer_norm` to the encoder output instead.
        self.whisper.layer_norm = nn.Identity()
        self.layer_norm = nn.LayerNorm(encoder_cfg.hidden_size)

        # Unknown (or missing) activation names fall back to GELU.
        activations = {"gelu": nn.GELU(), "relu": nn.ReLU(), "selu": nn.SELU()}
        requested_act = getattr(config, "mlp_adapter_act", "gelu")
        activation = activations.get(requested_act, nn.GELU())

        merged_width = encoder_cfg.hidden_size * self.merge_factor
        llm_width = config.hidden_size

        self.adapting = nn.Sequential(
            nn.Linear(merged_width, llm_width * 2),
            activation,
            nn.Linear(llm_width * 2, llm_width),
        )

    def forward(self, audios: Tensor) -> Tensor:
        """Encode audio and project it into the LLM embedding space.

        Args:
            audios: (B, mel, T) or (B, raw_len) -- the exact layout is decided
                by ``WhisperSpecialEncoder``.

        Returns:
            Adapted features of shape (B, Seq_Audio, LLM_Hidden_Dim).
        """
        batch = audios.size(0)
        group = self.merge_factor

        frames = self.layer_norm(self.whisper(audios)[0])  # (B, T, D)

        n_frames = frames.size(1)
        leftover = n_frames % group
        if leftover != 0:
            # Largest multiple of `group` not exceeding n_frames.
            keep = n_frames - leftover
            if keep <= 0:
                # Very short audio: fall back to a single zero-padded group.
                keep = group
            if n_frames < keep:
                filler = frames.new_zeros((batch, keep - n_frames, frames.size(-1)))
                frames = torch.cat([frames, filler], dim=1)
            else:
                # Drop the trailing frames that do not fill a whole group.
                frames = frames[:, :keep, :]

        merged = frames.reshape(batch, -1, frames.size(-1) * group)
        return self.adapting(merged)  # (B, T/k, hidden)
73
+
74
+
75
class ArkasrForConditionalGeneration(Qwen2ForCausalLM):
    """Qwen2 causal LM whose audio placeholder tokens are replaced by audio features.

    Positions in ``input_ids`` equal to ``config.audio_token_id`` have their
    token embeddings overwritten with the output of :class:`AudioMLPAdapter`
    during the first (cache-empty) forward pass.
    """

    config_class = ArkasrConfig
    _no_split_modules = ["WhisperSpecialEncoder"]

    def __init__(self, config: ArkasrConfig):
        super().__init__(config)
        self.audio_encoder = AudioMLPAdapter(config)

        self.audio_token_id = getattr(config, "audio_token_id", None)
        if self.audio_token_id is None:
            raise ValueError("`audio_token_id` must be defined in config.")

    @staticmethod
    def _cache_seq_len(past_key_values) -> int:
        """Return the number of positions already held in the KV cache.

        Supports cache objects exposing ``get_seq_length()`` as well as
        indexable legacy caches (``past[0][0].shape[-2]``). This is a
        deliberate best-effort probe: any failure is reported as ``0``,
        i.e. "no cache yet".
        """
        if past_key_values is None:
            return 0
        if hasattr(past_key_values, "get_seq_length"):
            try:
                return int(past_key_values.get_seq_length())
            except Exception:
                return 0
        try:
            return int(past_key_values[0][0].shape[-2])
        except Exception:
            return 0

    def _inject_audio_embeddings_batch_encode_then_loop_scatter(
        self,
        input_ids: torch.LongTensor,        # (B, S)
        inputs_embeds: torch.FloatTensor,   # (B, S, H)
        audios: Tensor,                     # (B, ...)
    ) -> torch.FloatTensor:
        """Encode audio once per batch, then scatter features row by row.

        Only rows that contain at least one audio token are encoded (a single
        batched encoder call). Each row's features are then written to that
        row's audio-token positions, so features can never leak across
        samples; rows without audio tokens are left untouched.

        If a row has ``n_i`` audio tokens and the encoder produced ``Sa``
        frames, the features are truncated or zero-padded to ``n_i`` instead
        of raising.

        Args:
            input_ids: token ids, shape (B, S).
            inputs_embeds: token embeddings, shape (B, S, H).
            audios: batched audio input whose first dimension is aligned with
                the batch dimension of ``input_ids``.

        Returns:
            The embeddings with audio positions replaced. When at least one
            row is modified the result is a new tensor: the argument is not
            mutated, which also keeps the call valid when the caller passes a
            leaf tensor that requires grad.
        """
        H = inputs_embeds.size(-1)
        device = inputs_embeds.device
        dtype = inputs_embeds.dtype

        # Which rows need injection?
        mask = input_ids == self.audio_token_id                          # (B, S)
        per_counts = mask.sum(dim=1)                                     # (B,)
        need_idx = (per_counts > 0).nonzero(as_tuple=False).squeeze(1)   # (K,)

        if need_idx.numel() == 0:
            return inputs_embeds

        # Encode only the audio belonging to rows that need it: (K, ...).
        # The index must live on the same device as the tensor being indexed.
        audios_sub = audios.index_select(0, need_idx.to(audios.device))
        feats_sub = self.audio_encoder(audios_sub)                       # (K, Sa, H)
        feats_sub = feats_sub.to(device=device, dtype=dtype)
        Sa = feats_sub.size(1)

        # Work on a copy so the in-place scatter below never touches the
        # caller's tensor.
        inputs_embeds = inputs_embeds.clone()

        # One host sync for all rows instead of an `.item()` call per row.
        counts = per_counts.tolist()
        for k, i in enumerate(need_idx.tolist()):
            n_i = counts[i]
            feat_i = feats_sub[k]                                        # (Sa, H)

            # Align the feature length with this row's audio-token count.
            if Sa < n_i:
                pad = feat_i.new_zeros((n_i - Sa, H))
                feat_i_use = torch.cat([feat_i, pad], dim=0)
            elif Sa > n_i:
                feat_i_use = feat_i[:n_i]
            else:
                feat_i_use = feat_i

            pos_i = mask[i].nonzero(as_tuple=False).squeeze(1).to(device)  # (n_i,)
            inputs_embeds[i, pos_i, :] = feat_i_use

        return inputs_embeds

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        audios: Optional[Tensor] = None,
        attention_mask: Optional[Tensor] = None,
        position_ids: Optional[Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        logits_to_keep: int | torch.Tensor = 0,
        **kwargs,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        """Run the language model, injecting audio features on the first step.

        Args:
            input_ids: token ids; needed to locate audio placeholder tokens.
            audios: batched audio input for the audio encoder.
            inputs_embeds: pre-computed embeddings; computed from
                ``input_ids`` when omitted.
            logits_to_keep: a positive int keeps only the last N positions,
                a tensor selects positions by index, ``0`` keeps all.
            **kwargs: forwarded to ``self.loss_function`` only.

        Raises:
            ValueError: if neither ``input_ids`` nor ``inputs_embeds`` is given.
        """
        if inputs_embeds is None:
            if input_ids is None:
                raise ValueError("Either `input_ids` or `inputs_embeds` must be provided.")
            inputs_embeds = self.model.embed_tokens(input_ids)

        # Inject only on the first step (empty cache) so that later generation
        # steps do not re-run the audio encoder.
        past_len = self._cache_seq_len(past_key_values)
        if audios is not None and input_ids is not None and past_len == 0:
            inputs_embeds = self._inject_audio_embeddings_batch_encode_then_loop_scatter(
                input_ids=input_ids,
                inputs_embeds=inputs_embeds,
                audios=audios,
            )

        outputs = self.model(
            input_ids=None,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        hidden_states = outputs[0]

        # Slice before the LM head so logits are only computed where needed.
        if isinstance(logits_to_keep, int) and logits_to_keep > 0:
            hidden_for_logits = hidden_states[:, -logits_to_keep:, :]
        elif isinstance(logits_to_keep, torch.Tensor):
            hidden_for_logits = hidden_states[:, logits_to_keep, :]
        else:
            hidden_for_logits = hidden_states

        logits = self.lm_head(hidden_for_logits)

        loss = None
        if labels is not None:
            loss = self.loss_function(
                logits=logits,
                labels=labels,
                vocab_size=self.config.vocab_size,
                **kwargs,
            )

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        **kwargs,
    ):
        """Build the per-step keyword arguments for :meth:`forward`.

        Once the cache holds tokens only the last token id is fed. ``audios``
        is passed through on every step; ``forward`` injects audio only while
        the cache is empty, so later steps never re-encode.

        ``inputs_embeds`` is forwarded (replacing ``input_ids``) whenever the
        cache is empty. The emptiness test uses ``_cache_seq_len`` rather than
        ``past_key_values is None`` so that a pre-allocated but still empty
        cache object is treated as the first step, consistent with ``forward``.
        """
        past_len = self._cache_seq_len(past_key_values)
        if past_len > 0:
            input_ids = input_ids[:, -1:]

        model_inputs = {
            "input_ids": input_ids,
            "past_key_values": past_key_values,
            "use_cache": kwargs.get("use_cache"),
            "attention_mask": attention_mask,
            "audios": kwargs.get("audios", None),
        }

        if inputs_embeds is not None and past_len == 0:
            model_inputs["inputs_embeds"] = inputs_embeds
            del model_inputs["input_ids"]

        return model_inputs
261
+
262
+
263
+ __all__ = ["ArkasrForConditionalGeneration", "AudioMLPAdapter"]
modeling_audio.py ADDED
@@ -0,0 +1,293 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, Optional, Tuple, Union
2
+
3
+ import torch
4
+ from torch import Tensor, nn
5
+ from torch.nn.functional import scaled_dot_product_attention
6
+ from transformers import WhisperConfig
7
+ from transformers.modeling_outputs import BaseModelOutputWithPastAndCrossAttentions
8
+ from transformers.models.whisper.modeling_whisper import WhisperEncoder, WhisperEncoderLayer
9
+ from transformers.utils import logging
10
+
11
+ logger = logging.get_logger(__name__)
12
+
13
+ # ==========================================
14
+ # 1. Rotary Embedding 核心组件
15
+ # ==========================================
16
+
17
class RotaryEmbedding(nn.Module):
    """Builds rotary position embedding (RoPE) cos/sin tables on demand."""

    def __init__(self, dim, rope_ratio=1):
        super().__init__()
        self.dim = dim
        self.rope_ratio = rope_ratio

    @torch.no_grad()
    def get_emb(self, seq_len: int, dtype: torch.dtype, device: torch.device, base: int = 10000):
        """Return the RoPE table of shape [seq_len, dim/2, 2].

        The last axis holds (cos, sin). The table is computed in float32 and
        is only cast when ``dtype`` is float16 or bfloat16; for any other
        ``dtype`` the float32 table is returned as is.
        """
        scaled_base = base * self.rope_ratio

        # Inverse frequencies, one per rotated pair of channels.
        exponents = torch.arange(0, self.dim, 2, dtype=torch.float, device=device) / self.dim
        inv_freq = 1.0 / (scaled_base ** exponents)

        # Angle for every (position, frequency) pair: [seq_len, dim/2].
        positions = torch.arange(seq_len, device=device, dtype=torch.float)
        angles = torch.outer(positions, inv_freq)

        table = torch.stack([torch.cos(angles), torch.sin(angles)], dim=-1)

        wants_half_precision = dtype == torch.float16 or dtype == torch.bfloat16
        return table.to(dtype) if wants_half_precision else table
41
+
42
def apply_rotary_pos_emb(x: torch.Tensor, rope_cache: torch.Tensor) -> torch.Tensor:
    """Rotate the leading channels of ``x`` with a precomputed RoPE table.

    Args:
        x: [batch, num_heads, seq_len, head_dim]
        rope_cache: [1, seq_len, dim/2, 2] with (cos, sin) on the last axis.

    Returns:
        A tensor shaped like ``x``. The first ``2 * rope_cache.shape[-2]``
        channels are rotated pairwise; the remaining channels are passed
        through unchanged.
    """
    batch, heads, seq, _ = x.shape
    n_pairs = rope_cache.shape[-2]
    rot_dim = n_pairs * 2

    rotating = x[..., :rot_dim]
    passthrough = x[..., rot_dim:]

    # View the rotating channels as adjacent (a, b) pairs: [b, nh, sq, n_pairs, 2].
    pairs = rotating.reshape(batch, heads, seq, n_pairs, 2)
    a = pairs[..., 0]
    b = pairs[..., 1]

    # Insert a head axis so the table broadcasts: [1, 1, sq, n_pairs].
    cos = rope_cache[..., 0].unsqueeze(1)
    sin = rope_cache[..., 1].unsqueeze(1)

    # Complex multiplication (a + bi)(cos + i sin).
    real = a * cos - b * sin
    imag = b * cos + a * sin
    rotated = torch.stack([real, imag], dim=-1).flatten(3)

    return torch.cat([rotated, passthrough], dim=-1)
71
+
72
+ # ==========================================
73
+ # 2. 基于 SDPA 的 RoPE Attention
74
+ # ==========================================
75
+
76
class WhisperRoPESdpaAttention(nn.Module):
    """Self-attention with optional RoPE, computed with PyTorch's native
    ``scaled_dot_product_attention`` instead of WhisperFlashAttention2.

    ``layer_head_mask`` and ``output_attentions`` are accepted for interface
    compatibility but are not used: no attention weights are returned.
    """

    def __init__(self, config: WhisperConfig, embed_dim: int, num_heads: int, dropout: float = 0.0):
        super().__init__()
        self.config = config
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads

        # Whisper-style projections (the key projection has no bias).
        self.q_proj = nn.Linear(embed_dim, embed_dim)
        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=False)
        self.v_proj = nn.Linear(embed_dim, embed_dim)
        self.out_proj = nn.Linear(embed_dim, embed_dim)

        self.is_causal = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        rotary_pos_emb: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], None]:
        """Attend over ``hidden_states``.

        Returns:
            ``(attn_output, None, None)`` where ``attn_output`` has the same
            (batch, seq, embed_dim) layout as the input.
        """
        batch, seq_len, _ = hidden_states.size()

        def split_heads(projected: torch.Tensor) -> torch.Tensor:
            # [batch, seq, embed] -> contiguous [batch, heads, seq, head_dim]
            return (
                projected.view(batch, -1, self.num_heads, self.head_dim)
                .transpose(1, 2)
                .contiguous()
            )

        # 1-2. Project, then split into heads.
        q = split_heads(self.q_proj(hidden_states))
        k = split_heads(self.k_proj(hidden_states))
        v = split_heads(self.v_proj(hidden_states))

        # 3. Rotary position embedding on queries and keys only.
        if rotary_pos_emb is not None:
            q = apply_rotary_pos_emb(q, rotary_pos_emb)
            k = apply_rotary_pos_emb(k, rotary_pos_emb)

        # 4. Align dtypes with the projection weights (guards against a
        #    mismatch introduced by fp32 LayerNorm).
        weight_dtype = self.q_proj.weight.dtype
        q, k, v = q.to(weight_dtype), k.to(weight_dtype), v.to(weight_dtype)

        # 5. SDPA applies the 1/sqrt(head_dim) scaling itself, so the queries
        #    must not be pre-scaled. A 4D attention_mask, if given, is applied.
        dropout_p = self.dropout if self.training else 0.0
        context = scaled_dot_product_attention(
            q,
            k,
            v,
            attn_mask=attention_mask,
            dropout_p=dropout_p,
            is_causal=self.is_causal,
        )

        # 6. Merge heads back and apply the output projection.
        context = context.transpose(1, 2).contiguous().reshape(batch, seq_len, self.embed_dim)
        return self.out_proj(context), None, None
145
+
146
+ # ==========================================
147
+ # 3. 封装好的 Encoder 层和 Encoder
148
+ # ==========================================
149
+
150
class WhisperSpecialEncoderLayer(WhisperEncoderLayer):
    """Whisper encoder layer whose self-attention is the RoPE/SDPA variant."""

    def __init__(self, config: WhisperConfig):
        super().__init__(config)
        # Swap the stock self-attention for the RoPE + SDPA implementation.
        self.self_attn = WhisperRoPESdpaAttention(
            config=config,
            embed_dim=self.embed_dim,
            num_heads=config.encoder_attention_heads,
            dropout=config.attention_dropout,
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        rotary_pos_emb: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, Any]:
        """Pre-norm attention block followed by a pre-norm feed-forward block.

        ``position_ids`` is accepted but not used by this layer.

        Returns:
            ``(hidden_states, None)`` -- the second slot stands in for the
            attention weights, which this layer never produces.
        """
        drop = nn.functional.dropout

        # --- self-attention sub-block ---
        shortcut = hidden_states
        normed = self.self_attn_layer_norm(hidden_states)
        attn_out, _attn_weights, _ = self.self_attn(
            hidden_states=normed,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
            rotary_pos_emb=rotary_pos_emb,
        )
        attn_out = drop(attn_out, p=self.dropout, training=self.training)
        hidden_states = shortcut + attn_out

        # --- feed-forward sub-block ---
        shortcut = hidden_states
        ff = self.final_layer_norm(hidden_states)
        ff = self.activation_fn(self.fc1(ff))
        ff = drop(ff, p=self.activation_dropout, training=self.training)
        ff = self.fc2(ff)
        ff = drop(ff, p=self.dropout, training=self.training)
        hidden_states = shortcut + ff

        # Keep fp16 activations finite by clamping just below the dtype max.
        if hidden_states.dtype == torch.float16:
            limit = torch.finfo(hidden_states.dtype).max - 1000
            hidden_states = torch.clamp(hidden_states, min=-limit, max=limit)

        # Two-element tuple, matching the length of Whisper's layer output.
        return (hidden_states, None)
198
+
199
class WhisperSpecialEncoder(WhisperEncoder):
    """Whisper encoder built from ``WhisperSpecialEncoderLayer`` with optional RoPE.

    With ``use_rope=True`` no absolute position embedding is added; a rotary
    table is built per call and handed to every layer. With ``use_rope=False``
    the parent's learned ``embed_positions`` are added to the conv features.
    """

    def __init__(self, config: WhisperConfig, use_rope=True, rope_ratio=1):
        super().__init__(config)
        self.use_rope = use_rope
        # Replace the parent's layer stack with the RoPE/SDPA layers.
        self.layers = nn.ModuleList(
            [WhisperSpecialEncoderLayer(config) for _ in range(config.encoder_layers)]
        )

        if use_rope:
            # Only half of each head's channels are rotated.
            head_dim = config.d_model // config.encoder_attention_heads
            self.rotary_embedding = RotaryEmbedding(head_dim // 2, rope_ratio)

    def forward(
        self,
        input_features,
        attention_mask=None,
        head_mask=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        position_ids=None,
    ):
        """Encode ``input_features``.

        Args:
            input_features: input to ``conv1`` (channels-first; the conv output
                is permuted to [B, T_down, D]).
            attention_mask: accepted for interface compatibility; the layers
                are always called with ``attention_mask=None``.
            head_mask: optional per-layer mask, indexed by layer and forwarded
                to each layer.
            output_attentions: when true, the ``attentions`` output is a tuple
                with one entry per layer. Each entry is ``None`` because the
                SDPA-based layers do not produce attention weights.
            output_hidden_states: when true, also return every layer's input
                plus the final normalised output.
            return_dict: return a ``BaseModelOutputWithPastAndCrossAttentions``
                instead of a plain tuple.
            position_ids: forwarded to each layer unchanged.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Whisper convolutional feature extraction.
        inputs_embeds = nn.functional.gelu(self.conv1(input_features))
        inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds))
        inputs_embeds = inputs_embeds.permute(0, 2, 1)  # [B, T_down, D]

        if self.use_rope:
            # Build the rotary table for this sequence length and add a
            # leading axis so it broadcasts: [1, seq_len, dim/2, 2].
            rotary_embs = self.rotary_embedding.get_emb(
                seq_len=inputs_embeds.shape[1],
                dtype=inputs_embeds.dtype,
                device=inputs_embeds.device,
            ).unsqueeze(0)
            hidden_states = inputs_embeds
        else:
            rotary_embs = None
            # Fall back to learned absolute position embeddings.
            embed_pos = self.embed_positions.weight[:inputs_embeds.shape[1]]
            hidden_states = inputs_embeds + embed_pos

        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        for idx, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)

            layer_head_mask = head_mask[idx] if head_mask is not None else None

            if self.gradient_checkpointing and self.training:
                # Positional order must match WhisperSpecialEncoderLayer.forward.
                layer_outputs = self._gradient_checkpointing_func(
                    encoder_layer.__call__,
                    hidden_states,
                    None,  # attention_mask
                    layer_head_mask,
                    output_attentions,
                    rotary_embs,
                    position_ids,
                )
            else:
                layer_outputs = encoder_layer(
                    hidden_states,
                    attention_mask=None,
                    layer_head_mask=layer_head_mask,
                    output_attentions=output_attentions,
                    rotary_pos_emb=rotary_embs,
                    position_ids=position_ids,
                )

            hidden_states = layer_outputs[0]

            if output_attentions:
                # The layer returns a 2-tuple (hidden_states, attn_weights);
                # index 1 is the attention slot (index 2 does not exist).
                all_attentions = all_attentions + (layer_outputs[1],)

        hidden_states = self.layer_norm(hidden_states)
        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)

        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            hidden_states=encoder_states,
            attentions=all_attentions,
        )
preprocessor_config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_arkasr.ArkasrProcessor"
4
+ },
5
+ "chunk_length": 30,
6
+ "dither": 0.0,
7
+ "feature_extractor_type": "WhisperFeatureExtractor",
8
+ "feature_size": 128,
9
+ "hop_length": 160,
10
+ "n_fft": 400,
11
+ "n_samples": 480000,
12
+ "nb_max_frames": 3000,
13
+ "padding_side": "right",
14
+ "padding_value": 0.0,
15
+ "processor_class": "ArkasrProcessor",
16
+ "return_attention_mask": false,
17
+ "sampling_rate": 16000
18
+ }
processing_arkasr.py ADDED
@@ -0,0 +1,446 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ from __future__ import annotations
3
+
4
+ import base64
5
+ import io
6
+ import json
7
+ import os
8
+ from typing import Any, Dict, List, Optional, Union
9
+
10
+ import numpy as np
11
+ import torch
12
+ import librosa
13
+ import soundfile as sf # 显式引入 soundfile 以处理 BytesIO
14
+
15
+ from transformers import AutoTokenizer, WhisperFeatureExtractor
16
+ from transformers.feature_extraction_utils import BatchFeature
17
+ from transformers.processing_utils import ProcessorMixin
18
+ from transformers.utils import logging
19
+
20
+ logger = logging.get_logger(__name__)
21
+
22
+ _AUDIO_MARKER = "<<AUDIO_TOKENS>>"
23
+
24
+ def _normalize_dtype_name(name: str) -> str:
25
+ name = name.strip().lower()
26
+ alias = {
27
+ "fp16": "float16",
28
+ "float16": "float16",
29
+ "half": "float16",
30
+ "bf16": "bfloat16",
31
+ "bfloat16": "bfloat16",
32
+ "fp32": "float32",
33
+ "float32": "float32",
34
+ "float": "float32",
35
+ }
36
+ return alias.get(name, name)
37
+
38
+
39
+ def _resolve_torch_dtype(x: Any, default: str = "float32") -> torch.dtype:
40
+ if isinstance(x, torch.dtype):
41
+ return x
42
+ if x is None:
43
+ x = default
44
+ if isinstance(x, str):
45
+ name = _normalize_dtype_name(x)
46
+ if not hasattr(torch, name):
47
+ raise ValueError(f"Unknown torch dtype string: {x} (normalized: {name})")
48
+ return getattr(torch, name)
49
+ raise TypeError(f"audio_dtype/audio_torch_dtype must be str or torch.dtype or None, got {type(x)}")
50
+
51
+
52
class ArkasrProcessor(ProcessorMixin):
    """Processor that pairs an audio feature extractor with a tokenizer.

    It renders chat-style conversations into a prompt string in which every
    audio clip is represented by ``<|begin_of_audio|>`` + N copies of
    ``audio_token`` + ``<|end_of_audio|>``, and returns the tokenized prompt
    together with the extracted audio features under the ``"audios"`` key.
    """

    attributes = ["feature_extractor", "tokenizer"]
    valid_kwargs = ["merge_factor", "audio_token", "audio_dtype"]
    feature_extractor_class = ("WhisperFeatureExtractor", "SequenceFeatureExtractor")
    tokenizer_class = ("PreTrainedTokenizerFast", "PreTrainedTokenizer")

    def __init__(
        self,
        feature_extractor,
        tokenizer,
        merge_factor: int = 4,
        audio_token: str = "<|audio|>",
        audio_dtype: str = "float32",
        **kwargs,
    ):
        """Store the sub-processors and the audio placeholder settings.

        Args:
            feature_extractor: Audio feature extractor instance.
            tokenizer: Text tokenizer instance.
            merge_factor: Divisor applied to the down-sampled frame count when
                computing how many audio placeholder tokens a clip needs.
            audio_token: Placeholder token repeated once per audio position.
            audio_dtype: Default dtype name for the returned audio features.
            **kwargs: Accepted for compatibility and ignored.
        """
        super().__init__(feature_extractor, tokenizer)
        self.merge_factor = int(merge_factor)
        self.audio_token = str(audio_token)
        self.audio_dtype = str(audio_dtype)

        self.bos_audio_token = "<|begin_of_audio|>"
        self.eos_audio_token = "<|end_of_audio|>"
        self.user_token = "<|user|>"
        self.assistant_token = "<|assistant|>"
        self.assistant_end_token = "<|im_end|>"

    @staticmethod
    def _read_processor_config(pretrained_model_name_or_path: str, hub_kwargs: Dict[str, Any]) -> Dict[str, Any]:
        """Load ``processor_config.json`` from a local directory or from the Hub.

        Returns an empty dict when the file cannot be found, so callers fall
        back to their built-in defaults.
        """
        location = str(pretrained_model_name_or_path)
        cfg_path = os.path.join(location, "processor_config.json")
        if not os.path.isfile(cfg_path):
            if os.path.isdir(location):
                # Local checkpoint that simply ships no processor config.
                return {}
            # Not a local directory: treat it as a Hub repo id so the shipped
            # settings (e.g. audio_dtype) are honoured instead of silently
            # replaced by defaults.
            from transformers.utils import cached_file

            try:
                cfg_path = cached_file(pretrained_model_name_or_path, "processor_config.json", **hub_kwargs)
            except Exception as err:  # missing file, offline mode, gated repo, ...
                logger.info(f"processor_config.json could not be resolved ({err}); using default processor settings.")
                return {}
            if cfg_path is None:
                return {}

        with open(cfg_path, "r", encoding="utf-8") as f:
            cfg = json.load(f)
        return cfg if isinstance(cfg, dict) else {}

    @staticmethod
    def _apply_overrides(target: Any, overrides: Dict[str, Any], label: str) -> None:
        """Best-effort ``setattr`` of config overrides onto ``target``.

        Only attributes that already exist on ``target`` are touched. Failures
        are logged rather than raised so a read-only attribute cannot prevent
        the processor from loading.
        """
        for key, value in overrides.items():
            if not hasattr(target, key):
                continue
            try:
                setattr(target, key, value)
            except Exception as err:
                logger.warning(f"Could not set {label}.{key} from processor_config.json: {err}")

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs) -> "ArkasrProcessor":
        """Build the processor from a local checkpoint directory or a Hub repo id.

        Args:
            pretrained_model_name_or_path: Directory or repo id holding the
                feature extractor, tokenizer and optional
                ``processor_config.json``.
            **kwargs: ``trust_remote_code`` is forwarded to the tokenizer;
                ``cache_dir``, ``force_download``, ``local_files_only``,
                ``token``, ``revision`` and ``subfolder`` are forwarded to
                every loader. Other keys are ignored.

        Returns:
            A configured ``ArkasrProcessor``.
        """
        trust_remote_code = bool(kwargs.pop("trust_remote_code", False))
        passthrough_keys = {"cache_dir", "force_download", "local_files_only", "token", "revision", "subfolder"}
        shared_kwargs = {k: v for k, v in kwargs.items() if k in passthrough_keys}

        proc_cfg = cls._read_processor_config(pretrained_model_name_or_path, shared_kwargs)
        merge_factor = int(proc_cfg.get("merge_factor", 4))
        audio_token = str(proc_cfg.get("audio_token", "<|audio|>"))
        audio_dtype = str(proc_cfg.get("audio_dtype", "float32"))
        tokenizer_cfg: Dict[str, Any] = proc_cfg.get("tokenizer_config", {}) or {}
        feat_cfg: Dict[str, Any] = proc_cfg.get("feature_extractor_config", {}) or {}

        feature_extractor = WhisperFeatureExtractor.from_pretrained(pretrained_model_name_or_path, **shared_kwargs)
        cls._apply_overrides(feature_extractor, feat_cfg, "feature_extractor")

        tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path, use_fast=True, trust_remote_code=trust_remote_code, **shared_kwargs
        )
        cls._apply_overrides(tokenizer, tokenizer_cfg, "tokenizer")

        return cls(
            feature_extractor=feature_extractor,
            tokenizer=tokenizer,
            merge_factor=merge_factor,
            audio_token=audio_token,
            audio_dtype=audio_dtype,
        )

    # =========================
    # audio helpers
    # =========================
    def _load_audio_file(
        self,
        path: str,
        sampling_rate: int = 16000,
        offset: float = 0.0,
        duration: Optional[float] = None,
    ) -> np.ndarray:
        """Load a mono float32 waveform from ``path`` with ``librosa.load``.

        Args:
            path: Audio file location.
            sampling_rate: Target sampling rate passed to librosa.
            offset: Start reading after this many seconds.
            duration: Read at most this many seconds (``None`` = to the end).
        """
        audio_array, _ = librosa.load(path, sr=int(sampling_rate), mono=True, offset=offset, duration=duration)
        return np.asarray(audio_array, dtype=np.float32)

    def _strip_data_url_prefix(self, b64: str) -> str:
        """Drop a leading ``data:...,`` header from a data URL, if present."""
        if "," in b64 and b64[:30].lower().startswith("data:"):
            return b64.split(",", 1)[1]
        return b64

    def _load_audio_base64(
        self,
        b64: str,
        sampling_rate: int = 16000,
        offset: float = 0.0,
        duration: Optional[float] = None,
    ) -> np.ndarray:
        """Decode base64 (optionally a data URL) audio into a mono float32 waveform.

        librosa is tried first because it applies ``offset``/``duration`` while
        reading. If that fails, the bytes are decoded with soundfile, averaged
        to mono, resampled when needed and sliced manually.

        Raises:
            ValueError: If neither decoder can read the payload.
        """
        raw = base64.b64decode(self._strip_data_url_prefix(b64))
        bio = io.BytesIO(raw)

        try:
            wav, _sr = librosa.load(bio, sr=int(sampling_rate), mono=True, offset=offset, duration=duration)
            return np.asarray(wav, dtype=np.float32)
        except Exception as librosa_err:
            logger.debug(f"librosa could not decode base64 audio ({librosa_err}); falling back to soundfile.")

        # Fallback: full decode followed by manual slicing (slower).
        try:
            bio.seek(0)
            data, sr = sf.read(bio, dtype="float32", always_2d=True)
            wav = data.mean(axis=1)
            if int(sr) != int(sampling_rate):
                wav = librosa.resample(wav, orig_sr=int(sr), target_sr=int(sampling_rate))

            start_sample = int(offset * sampling_rate)
            end_sample = None
            if duration is not None:
                end_sample = start_sample + int(duration * sampling_rate)

            return np.asarray(wav[start_sample:end_sample], dtype=np.float32)
        except Exception as sf_err:
            raise ValueError("Failed to decode base64 audio.") from sf_err

    def calculate_audio_token_count(self, mel_frames: int) -> int:
        """Return the number of audio placeholder tokens for ``mel_frames`` frames.

        The frame count is halved (rounding up), integer-divided by
        ``merge_factor`` (treated as at least 1), and clamped to a minimum of 1.
        """
        downsampled = (int(mel_frames) + 1) // 2
        merged = downsampled // max(self.merge_factor, 1)
        return max(int(merged), 1)

    @staticmethod
    def _resolve_time_window(part: dict) -> tuple[float, Optional[float]]:
        """Translate a part's ``begin_time``/``end_time`` into ``(offset, duration)`` seconds.

        Slicing is applied only when ``begin_time`` is non-negative; a duration
        is set only when ``end_time`` is additionally greater than ``begin_time``.
        Otherwise the whole clip is used (``offset=0.0``, ``duration=None``).
        """
        begin_time = part.get("begin_time", -1)
        end_time = part.get("end_time", -1)

        offset = 0.0
        duration = None
        if begin_time is not None and begin_time >= 0:
            offset = float(begin_time)
            if end_time is not None and end_time > begin_time:
                duration = float(end_time) - float(begin_time)
        return offset, duration

    def _load_audio_part(self, part: dict, sampling_rate: int) -> np.ndarray:
        """Materialise one ``{"type": "audio", ...}`` content part as a waveform.

        The source is taken from ``"array"``, ``"path"`` or ``"base64"`` (checked
        in that order) and cropped to the part's time window.

        Raises:
            ValueError: If the part carries none of the supported sources.
        """
        offset, duration = self._resolve_time_window(part)

        if "array" in part:
            arr = part["array"]
            if isinstance(arr, torch.Tensor):
                arr = arr.detach().cpu().numpy()
            samples = np.asarray(arr, dtype=np.float32).reshape(-1)

            # In-memory arrays are cropped by sample index.
            start_idx = int(offset * sampling_rate)
            end_idx = None if duration is None else start_idx + int(duration * sampling_rate)
            return samples[start_idx:end_idx]

        if "path" in part:
            return self._load_audio_file(
                part["path"], sampling_rate=sampling_rate, offset=offset, duration=duration
            )

        if "base64" in part:
            return self._load_audio_base64(
                part["base64"], sampling_rate=sampling_rate, offset=offset, duration=duration
            )

        raise ValueError("Audio part must contain 'path' or 'array' or 'base64'.")

    def _build_templates_and_audios(
        self,
        conversations: List[List[dict]],
        sampling_rate: int,
        add_generation_prompt: bool,
    ) -> tuple[List[str], List[np.ndarray], List[int]]:
        """Render conversations to prompt templates and collect their audio clips.

        Each audio part is replaced in the template by
        ``bos_audio_token + _AUDIO_MARKER + eos_audio_token``; the marker is
        expanded later once the per-clip token counts are known.

        Returns:
            ``(templates, audios, audio_counts)`` where ``audios`` is the flat
            list of waveforms in order of appearance and ``audio_counts[i]`` is
            the number of clips used by ``templates[i]``.

        Raises:
            ValueError: On an unknown content part type, an unsupported content
                type, or an audio part without a source.
        """
        prompts_template: List[str] = []
        audios_raw: List[np.ndarray] = []
        prompt_audio_counts: List[int] = []

        role_tokens = {"user": self.user_token, "assistant": self.assistant_token}

        for conv in conversations:
            conv_str = ""
            last_role = None
            audio_count_this_conv = 0

            for msg in conv:
                role = msg["role"]
                last_role = role
                content = msg["content"]

                conv_str += role_tokens.get(role, f"<|{role}|>")

                if isinstance(content, str):
                    conv_str += f"{content}"
                elif isinstance(content, list):
                    for part in content:
                        ptype = part.get("type")
                        if ptype == "audio":
                            audio_raw_this = self._load_audio_part(part, sampling_rate)
                            if np.asarray(audio_raw_this).size == 0:
                                logger.warning("Audio part resolved to an empty waveform; check begin_time/end_time.")
                            audios_raw.append(audio_raw_this)
                            audio_count_this_conv += 1
                            conv_str += f"{self.bos_audio_token}{_AUDIO_MARKER}{self.eos_audio_token}"
                        elif ptype == "text":
                            conv_str += f"{part.get('text', '')}"
                        else:
                            raise ValueError(f"Unknown content part type: {ptype}")
                else:
                    raise ValueError(f"Unsupported message content type: {type(content)}")

            if add_generation_prompt:
                # A trailing assistant turn is closed; anything else opens a new assistant turn.
                if last_role == "assistant":
                    conv_str += f"{self.assistant_end_token}"
                else:
                    conv_str += f"{self.assistant_token}"

            prompts_template.append(conv_str)
            prompt_audio_counts.append(audio_count_this_conv)

        return prompts_template, audios_raw, prompt_audio_counts

    def _calculate_audio_token_counts_per_sample(
        self,
        audios_raw: List[np.ndarray],
        sampling_rate: int,
        audio_max_length: Optional[int],
        audio_pad_to_multiple_of: Optional[int],
    ) -> List[int]:
        """Compute the placeholder-token count for every raw waveform.

        The count is derived from the un-padded sample length: samples are
        capped at the effective truncation length, divided by the feature
        extractor's ``hop_length`` to get frames, then passed to
        ``calculate_audio_token_count``.

        ``sampling_rate`` and ``audio_pad_to_multiple_of`` are accepted for
        signature symmetry with the feature-extractor call and are unused.
        """
        del sampling_rate, audio_pad_to_multiple_of

        hop_length = int(getattr(self.feature_extractor, "hop_length", 160))

        # The Whisper feature extractor truncates by default to ``max_length``
        # or, when that is unset, to its own ``n_samples``. Mirror that here so
        # long clips do not get more placeholder tokens than features exist for.
        sample_cap = audio_max_length if audio_max_length else getattr(self.feature_extractor, "n_samples", None)
        max_audio_samples = int(sample_cap) if sample_cap is not None else None

        token_counts: List[int] = []
        for audio_raw in audios_raw:
            audio_np = np.asarray(audio_raw, dtype=np.float32).reshape(-1)
            effective_len = int(audio_np.shape[0])
            if max_audio_samples is not None:
                effective_len = min(effective_len, max_audio_samples)

            mel_frames = effective_len // max(hop_length, 1)
            token_counts.append(self.calculate_audio_token_count(int(mel_frames)))

        return token_counts

    # =========================
    # apply_chat_template
    # =========================
    def apply_chat_template(
        self,
        conversation: Union[List[dict], List[List[dict]]],
        chat_template: Optional[str] = None,
        add_generation_prompt: bool = True,
        **kwargs,
    ) -> Union[BatchFeature, str, List[str]]:
        """Render one conversation or a batch of conversations, optionally tokenized.

        Args:
            conversation: A list of message dicts, or a list of such lists.
            chat_template: Ignored (a warning is logged when provided).
            add_generation_prompt: Append the generation prompt suffix.
            **kwargs: Recognised keys are ``tokenize`` (default ``True``),
                ``return_tensors`` (default ``"pt"``), ``return_dict``
                (discarded), ``audio_torch_dtype`` / ``audio_dtype``,
                ``text_kwargs``, ``padding``, ``truncation``, ``max_length``,
                ``add_special_tokens``, ``sampling_rate`` (default 16000),
                ``audio_padding`` (default ``"longest"``),
                ``audio_max_length`` and ``audio_pad_to_multiple_of``. Any
                other key is reported in a warning and ignored.

        Returns:
            The prompt string(s) when ``tokenize`` is false; otherwise a
            ``BatchFeature`` with the tokenizer outputs plus ``"audios"`` when
            at least one clip was found.

        Raises:
            ValueError: If audio markers and computed token counts do not line up.
        """
        if chat_template is not None:
            logger.warning("chat_template argument is ignored.")

        tokenize = kwargs.pop("tokenize", True)
        return_tensors = kwargs.pop("return_tensors", "pt")
        kwargs.pop("return_dict", None)

        # ``audio_torch_dtype`` wins over ``audio_dtype``; both fall back to the instance default.
        audio_torch_dtype = kwargs.pop("audio_torch_dtype", None)
        audio_dtype_override = kwargs.pop("audio_dtype", None)
        dtype_source = audio_torch_dtype if audio_torch_dtype is not None else audio_dtype_override
        target_dtype = _resolve_torch_dtype(dtype_source, default=getattr(self, "audio_dtype", "float32"))

        text_kwargs = dict(kwargs.pop("text_kwargs", {}) or {})
        for k in ("padding", "truncation", "max_length", "add_special_tokens"):
            if k in kwargs and k not in text_kwargs:
                text_kwargs[k] = kwargs.pop(k)

        sampling_rate = int(kwargs.pop("sampling_rate", 16000))
        audio_padding = kwargs.pop("audio_padding", "longest")
        audio_max_length = kwargs.pop("audio_max_length", None)
        audio_pad_to_multiple_of = kwargs.pop("audio_pad_to_multiple_of", None)

        if kwargs:
            logger.warning(f"Ignored unused kwargs: {list(kwargs.keys())}")

        # A list whose first element is a message dict is a single conversation.
        if isinstance(conversation, list) and conversation and isinstance(conversation[0], dict):
            conversations = [conversation]
            is_single = True
        else:
            conversations = conversation
            is_single = False

        prompt_templates, audios_raw, prompt_audio_counts = self._build_templates_and_audios(
            conversations=conversations,
            sampling_rate=sampling_rate,
            add_generation_prompt=add_generation_prompt,
        )

        input_features = None
        audio_token_counts: List[int] = []

        if len(audios_raw) > 0:
            feat = self.feature_extractor(
                audios_raw,
                sampling_rate=sampling_rate,
                return_tensors="np",
                return_attention_mask=False,
                padding=audio_padding,
                max_length=audio_max_length,
                pad_to_multiple_of=audio_pad_to_multiple_of,
            )
            input_features = feat["input_features"]
            if not isinstance(input_features, np.ndarray):
                input_features = np.asarray(input_features)

            audio_token_counts = self._calculate_audio_token_counts_per_sample(
                audios_raw=audios_raw,
                sampling_rate=sampling_rate,
                audio_max_length=audio_max_length,
                audio_pad_to_multiple_of=audio_pad_to_multiple_of,
            )

        # Expand every marker, in order, into the right number of audio tokens.
        prompts: List[str] = []
        audio_idx = 0
        for prompt_template, audio_count in zip(prompt_templates, prompt_audio_counts):
            prompt = prompt_template
            for _ in range(audio_count):
                if audio_idx >= len(audio_token_counts):
                    raise ValueError("Audio token count mismatch while building prompts.")
                audio_tokens_str = "".join([self.audio_token] * audio_token_counts[audio_idx])
                prompt = prompt.replace(_AUDIO_MARKER, audio_tokens_str, 1)
                audio_idx += 1
            if _AUDIO_MARKER in prompt:
                raise ValueError("Unresolved audio marker remained in prompt.")
            prompts.append(prompt)

        if audio_idx != len(audio_token_counts):
            raise ValueError("Unused audio token counts remained after prompt construction.")

        if not tokenize:
            return prompts[0] if is_single else prompts

        text_kwargs.setdefault("padding", "longest")
        text_kwargs.setdefault("add_special_tokens", False)
        text_kwargs["return_tensors"] = return_tensors

        enc = self.tokenizer(prompts, **text_kwargs)
        data: Dict[str, Any] = dict(enc)

        if input_features is not None:
            data["audios"] = torch.tensor(input_features, dtype=target_dtype)

        return BatchFeature(data=data, tensor_type=return_tensors)

    def batch_decode(self, *args, **kwargs):
        """Forward to ``tokenizer.batch_decode``."""
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """Forward to ``tokenizer.decode``."""
        return self.tokenizer.decode(*args, **kwargs)

    @staticmethod
    def _flatten_audios(obj: Any) -> List[Any]:
        """Flatten arbitrarily nested lists/tuples of clips into a flat list.

        A list/tuple whose first element is a Python ``float``/``int`` is
        treated as one waveform; arrays and tensors are taken as-is; any other
        leaf value is dropped.
        """
        if isinstance(obj, (np.ndarray, torch.Tensor)):
            return [obj]
        if isinstance(obj, (list, tuple)):
            if len(obj) > 0 and isinstance(obj[0], (float, int)):
                return [obj]
            flat: List[Any] = []
            for item in obj:
                flat.extend(ArkasrProcessor._flatten_audios(item))
            return flat
        return []

    def __call__(
        self,
        text: Union[str, List[str]],
        audios: Union[np.ndarray, torch.Tensor, List[Union[np.ndarray, torch.Tensor]]],
        sampling_rate: int = 16000,
        return_tensors: str = "pt",
        **tokenizer_kwargs,
    ) -> BatchFeature:
        """Tokenize ready-made prompt text and extract features from raw waveforms.

        This simplified path performs no time slicing and no placeholder
        expansion: ``text`` is tokenized as given and ``audios`` are already
        in-memory arrays.

        Args:
            text: Prompt string or list of prompt strings.
            audios: One waveform or a (possibly nested) list of waveforms.
            sampling_rate: Sampling rate reported to the feature extractor.
            return_tensors: Tensor type for the tokenizer and the output batch.
            **tokenizer_kwargs: Extra tokenizer arguments; ``padding`` defaults
                to ``"longest"`` and ``add_special_tokens`` to ``False``.

        Returns:
            A ``BatchFeature`` with the tokenizer outputs and, when audio was
            supplied, the features under ``"audios"`` in the processor's
            configured ``audio_dtype``.
        """
        audios_np: List[np.ndarray] = []
        for a in self._flatten_audios(audios):
            if isinstance(a, torch.Tensor):
                a = a.detach().cpu().numpy()
            audios_np.append(np.asarray(a, dtype=np.float32).reshape(-1))

        input_features = None
        if audios_np:
            feat = self.feature_extractor(
                audios_np,
                sampling_rate=int(sampling_rate),
                return_tensors="np",
                return_attention_mask=False,
                padding="longest",
            )
            input_features = feat["input_features"]
            if not isinstance(input_features, np.ndarray):
                input_features = np.asarray(input_features)

        tokenizer_kwargs = dict(tokenizer_kwargs or {})
        tokenizer_kwargs.setdefault("padding", "longest")
        tokenizer_kwargs.setdefault("add_special_tokens", False)
        tokenizer_kwargs["return_tensors"] = return_tensors

        enc = self.tokenizer(text, **tokenizer_kwargs)
        data: Dict[str, Any] = dict(enc)
        if input_features is not None:
            data["audios"] = torch.tensor(
                input_features, dtype=_resolve_torch_dtype(getattr(self, "audio_dtype", "float32"))
            )
        return BatchFeature(data=data, tensor_type=return_tensors)

    @property
    def model_input_names(self):
        """Names of the model inputs produced by this processor."""
        return ["input_ids", "attention_mask", "audios"]
processor_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "audio_dtype": "bfloat16",
3
+ "audio_token": "<|audio|>",
4
+ "auto_map": {
5
+ "AutoProcessor": "processing_arkasr.ArkasrProcessor"
6
+ },
7
+ "merge_factor": 4,
8
+ "processor_class": "ArkasrProcessor"
9
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|user|>",
4
+ "<|begin_of_audio|>",
5
+ "<|end_of_audio|>",
6
+ "<|assistant|>",
7
+ "<|system|>"
8
+ ],
9
+ "eos_token": {
10
+ "content": "<|im_end|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<|endoftext|>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ }
23
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc63972a406328b950c3dea5f64994846a95e444609657d440f0f2ecc4721b32
3
+ size 11422866
tokenizer_config.json ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|start_global_token|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|end_global_token|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|start_content|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|end_content|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|audio|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<|user|>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": true
188
+ },
189
+ "151666": {
190
+ "content": "<|begin_of_audio|>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": true
196
+ },
197
+ "151667": {
198
+ "content": "<|end_of_audio|>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": true
204
+ },
205
+ "151668": {
206
+ "content": "<|assistant|>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": true
212
+ },
213
+ "151669": {
214
+ "content": "<|system|>",
215
+ "lstrip": false,
216
+ "normalized": false,
217
+ "rstrip": false,
218
+ "single_word": false,
219
+ "special": true
220
+ }
221
+ },
222
+ "additional_special_tokens": [
223
+ "<|user|>",
224
+ "<|begin_of_audio|>",
225
+ "<|end_of_audio|>",
226
+ "<|assistant|>",
227
+ "<|system|>"
228
+ ],
229
+ "bos_token": null,
230
+ "clean_up_tokenization_spaces": false,
231
+ "eos_token": "<|im_end|>",
232
+ "errors": "replace",
233
+ "extra_special_tokens": {},
234
+ "model_max_length": 131072,
235
+ "pad_token": "<|endoftext|>",
236
+ "split_special_tokens": false,
237
+ "tokenizer_class": "Qwen2Tokenizer",
238
+ "unk_token": null
239
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff