{
  "name": "MaskGCT",
  "_name_or_path": "amphion/MaskGCT",
  "modelId": "amphion/MaskGCT",
  "architectures": [
    "MaskGCTModel"
  ],
  "model_type": "maskgct",
  "task_specific_params": {
    "text-to-speech": {
      "supported_tasks": [
        "zero-shot-tts",
        "non-autoregressive-tts"
      ]
    }
  },
  "tags": [
    "text-to-speech",
    "safetensors"
  ],
  "pipeline_tag": "text-to-speech",
  "language": [
    "en",
    "zh"
  ],
  "license": "cc-by-nc-4.0",
  "datasets": [
    "Emilia-100k"
  ],
  "model_structure": {
    "semantic_codec": {
      "type": "w2v-bert-2.0",
      "description": "Converting speech to semantic tokens"
    },
    "acoustic_codec": {
      "type": "codec",
      "description": "Converting speech to acoustic tokens and reconstructing waveform"
    },
    "maskgct_t2s": {
      "type": "transformer",
      "description": "Predicting semantic tokens with text and prompt semantic tokens"
    },
    "maskgct_s2a": {
      "variants": {
        "1layer": {
          "type": "transformer",
          "description": "Single layer model for acoustic token prediction"
        },
        "full": {
          "type": "transformer",
          "description": "Full model for acoustic token prediction"
        }
      },
      "description": "Predicts acoustic tokens conditioned on semantic tokens"
    }
  },
  "training_data": {
    "total_hours": 100000,
    "language_split": {
      "english": 50000,
      "chinese": 50000
    }
  },
  "paper": {
    "title": "MaskGCT: Zero-Shot Text-to-Speech with Masked Generative Codec Transformer",
    "arxiv_id": "2409.00750"
  }
}