Text-to-Speech
Safetensors
MaskGCT / model-index.json
RMSnow's picture
Create model-index.json
3279c63 verified
Raw
History Blame
1.92 kB
{
"name": "MaskGCT",
"_name_or_path": "amphion/MaskGCT",
"modelId": "amphion/MaskGCT",
"architectures": [
"MaskGCTModel"
],
"model_type": "maskgct",
"task_specific_params": {
"text-to-speech": {
"supported_tasks": [
"zero-shot-tts",
"non-autoregressive-tts"
]
}
},
"tags": [
"text-to-speech",
"safetensors"
],
"pipeline_tag": "text-to-speech",
"language": [
"en",
"zh"
],
"license": "cc-by-nc-4.0",
"datasets": [
"Emilia-100k"
],
"model_structure": {
"semantic_codec": {
"type": "w2v-bert-2.0",
"description": "Converting speech to semantic tokens"
},
"acoustic_codec": {
"type": "codec",
"description": "Converting speech to acoustic tokens and reconstructing waveform"
},
"maskgct_t2s": {
"type": "transformer",
"description": "Predicting semantic tokens with text and prompt semantic tokens"
},
"maskgct_s2a": {
"variants": {
"1layer": {
"type": "transformer",
"description": "Single layer model for acoustic token prediction"
},
"full": {
"type": "transformer",
"description": "Full model for acoustic token prediction"
}
},
"description": "Predicts acoustic tokens conditioned on semantic tokens"
}
},
"training_data": {
"total_hours": 100000,
"language_split": {
"english": 50000,
"chinese": 50000
}
},
"paper": {
"title": "MaskGCT: Zero-Shot Text-to-Speech with Masked Generative Codec Transformer",
"arxiv_id": "2409.00750"
}
}