amphion
/

MaskGCT

Model card Files Files and versions

MaskGCT / model-index.json

RMSnow's picture

Create model-index.json

3279c63 verified about 1 year ago

1.92 kB

	{
	"name": "MaskGCT",
	"_name_or_path": "amphion/MaskGCT",
	"modelId": "amphion/MaskGCT",
	"architectures": [
	"MaskGCTModel"
	],
	"model_type": "maskgct",
	"task_specific_params": {
	"text-to-speech": {
	"supported_tasks": [
	"zero-shot-tts",
	"non-autoregressive-tts"
	]
	}
	},
	"tags": [
	"text-to-speech",
	"safetensors"
	],
	"pipeline_tag": "text-to-speech",
	"language": [
	"en",
	"zh"
	],
	"license": "cc-by-nc-4.0",
	"datasets": [
	"Emilia-100k"
	],
	"model_structure": {
	"semantic_codec": {
	"type": "w2v-bert-2.0",
	"description": "Converts speech to semantic tokens"
	},
	"acoustic_codec": {
	"type": "codec",
	"description": "Converts speech to acoustic tokens and reconstructs the waveform"
	},
	"maskgct_t2s": {
	"type": "transformer",
	"description": "Predicts semantic tokens conditioned on text and prompt semantic tokens"
	},
	"maskgct_s2a": {
	"variants": {
	"1layer": {
	"type": "transformer",
	"description": "Single layer model for acoustic token prediction"
	},
	"full": {
	"type": "transformer",
	"description": "Full model for acoustic token prediction"
	}
	},
	"description": "Predicts acoustic tokens conditioned on semantic tokens"
	}
	},
	"training_data": {
	"total_hours": 100000,
	"language_split": {
	"english": 50000,
	"chinese": 50000
	}
	},
	"paper": {
	"title": "MaskGCT: Zero-Shot Text-to-Speech with Masked Generative Codec Transformer",
	"arxiv_id": "2409.00750"
	}
	}