{ "model_type": "early-fusion-embedding-mlp", "task": "multimodal-emotion-recognition", "dataset": "declare-lab/MELD", "modalities": ["text", "audio"], "text_encoder": { "model_name": "bert-base-uncased", "embedding_dim": 768, "embedding_extraction": "CLS token", "frozen": true, "max_text_length": 128 }, "audio_encoder": { "feature_type": "pre-extracted acoustic features", "embedding_dim": 768, "frozen": true }, "fusion": { "type": "early", "method": "concatenation", "fusion_dim": 1536 }, "classifier": { "type": "mlp", "hidden_dim": 512, "dropout": 0.3 }, "num_labels": 7, "id2label": { "0": "neutral", "1": "joy", "2": "surprise", "3": "anger", "4": "sadness", "5": "fear", "6": "disgust" }, "label2id": { "neutral": 0, "joy": 1, "surprise": 2, "anger": 3, "sadness": 4, "fear": 5, "disgust": 6 }, "training_setup": { "text_encoder_trainable": false, "audio_encoder_trainable": false, "classifier_trainable": true } }