{
    "name": "MaskGCT",
    "_name_or_path": "amphion/MaskGCT",
    "modelId": "amphion/MaskGCT",
    "architectures": [
        "MaskGCTModel"
    ],
    "model_type": "maskgct",
    "task_specific_params": {
        "text-to-speech": {
            "supported_tasks": [
                "zero-shot-tts",
                "non-autoregressive-tts"
            ]
        }
    },
    "tags": [
        "text-to-speech",
        "safetensors"
    ],
    "pipeline_tag": "text-to-speech",
    "language": [
        "en",
        "zh"
    ],
    "license": "cc-by-nc-4.0",
    "datasets": [
        "Emilia-100k"
    ],
    "model_structure": {
        "semantic_codec": {
            "type": "w2v-bert-2.0",
            "description": "Converting speech to semantic tokens"
        },
        "acoustic_codec": {
            "type": "codec",
            "description": "Converting speech to acoustic tokens and reconstructing waveform"
        },
        "maskgct_t2s": {
            "type": "transformer",
            "description": "Predicting semantic tokens with text and prompt semantic tokens"
        },
        "maskgct_s2a": {
            "variants": {
                "1layer": {
                    "type": "transformer",
                    "description": "Single-layer model for acoustic token prediction"
                },
                "full": {
                    "type": "transformer",
                    "description": "Full model for acoustic token prediction"
                }
            },
            "description": "Predicting acoustic tokens conditioned on semantic tokens"
        }
    },
    "training_data": {
        "total_hours": 100000,
        "language_split": {
            "english": 50000,
            "chinese": 50000
        }
    },
    "paper": {
        "title": "MaskGCT: Zero-Shot Text-to-Speech with Masked Generative Codec Transformer",
        "arxiv_id": "2409.00750"
    }
}