| { |
| "name": "MaskGCT", |
| "_name_or_path": "amphion/MaskGCT", |
| "modelId": "amphion/MaskGCT", |
| "architectures": [ |
| "MaskGCTModel" |
| ], |
| "model_type": "maskgct", |
| "task_specific_params": { |
| "text-to-speech": { |
| "supported_tasks": [ |
| "zero-shot-tts", |
| "non-autoregressive-tts" |
| ] |
| } |
| }, |
| "tags": [ |
| "text-to-speech", |
| "safetensors" |
| ], |
| "pipeline_tag": "text-to-speech", |
| "language": [ |
| "en", |
| "zh" |
| ], |
| "license": "cc-by-nc-4.0", |
| "datasets": [ |
| "Emilia-100k" |
| ], |
| "model_structure": { |
| "semantic_codec": { |
| "type": "w2v-bert-2.0", |
| "description": "Converting speech to semantic tokens" |
| }, |
| "acoustic_codec": { |
| "type": "codec", |
| "description": "Converting speech to acoustic tokens and reconstructing the waveform from acoustic tokens" |
| }, |
| "maskgct_t2s": { |
| "type": "transformer", |
| "description": "Predicting semantic tokens from text and prompt semantic tokens" |
| }, |
| "maskgct_s2a": { |
| "variants": { |
| "1layer": { |
| "type": "transformer", |
| "description": "Single layer model for acoustic token prediction" |
| }, |
| "full": { |
| "type": "transformer", |
| "description": "Full model for acoustic token prediction" |
| } |
| }, |
| "description": "Predicting acoustic tokens conditioned on semantic tokens" |
| } |
| }, |
| "training_data": { |
| "total_hours": 100000, |
| "language_split": { |
| "english": 50000, |
| "chinese": 50000 |
| } |
| }, |
| "paper": { |
| "title": "MaskGCT: Zero-Shot Text-to-Speech with Masked Generative Codec Transformer", |
| "arxiv_id": "2409.00750" |
| } |
| } |