vk-vlm-gqa-ru-qwen35-08b-lora / benchmark_summary.json
lockR's picture
Update benchmark summary with full GQA-ru metrics
50e8a21 verified
Raw
History Blame Contribute Delete
3.46 kB
{
"project": "VK Education Vision-Language Modeling",
"author": {
"name": "Ибрагимов Далгат Магомедалиевич",
"institution": "МАИ, институт 8",
"group": "М8О-308Б-32"
},
"primary_hf_artifact": "https://huggingface.co/lockR/vk-vlm-gqa-ru-qwen35-08b-lora",
"primary_run": "gqa_ru_qwen35_0_8b_lora_fast_v1",
"base_model": "Qwen/Qwen3.5-0.8B",
"adapter_type": "LoRA",
"dataset": {
"name": "deepvk/GQA-ru",
"source": "https://huggingface.co/datasets/deepvk/GQA-ru",
"train_samples": 38019,
"validation_samples": 1981,
"testdev_samples": 12216,
"usage": "Image-question-answer records were used for multimodal LoRA fine-tuning of the VLM and for the official evaluation with lmms-eval."
},
"training": {
"multimodal": true,
"vision_encoder_trainable": false,
"lora_scope": "language_model attention projection layers",
"seed": 42,
"epochs": 1.0,
"batch_size": 8,
"learning_rate": 0.0002,
"mixed_precision": "bf16",
"max_seq_length": 1024,
"image_resolution": 336,
"lora_r": 16,
"lora_alpha": 32,
"lora_dropout": 0.05,
"target_modules": [
"q_proj",
"k_proj",
"v_proj",
"o_proj"
],
"best_checkpoint": "checkpoint-4560",
"best_metric_name": "eval_loss",
"best_metric_value": 0.4337001144886017
},
"train_metrics": {
"train_runtime": 6219.1947,
"train_samples_per_second": 6.113,
"train_steps_per_second": 0.764,
"train_loss": 0.04432422036801592,
"epoch": 1.0,
"eval_loss": 0.4337001144886017,
"eval_runtime": 116.0177,
"eval_samples_per_second": 17.075,
"eval_steps_per_second": 1.069,
"metrics_path": "runs/gqa_ru_qwen35_0_8b_lora_fast_v1/train_metrics.json"
},
"official_benchmark_full": {
"runner": "lmms-eval",
"task": "gqa-ru",
"dataset_path": "deepvk/GQA-ru",
"dataset_name": "testdev_balanced_instructions",
"split": "testdev",
"metric": "exact_match",
"metric_higher_is_better": true,
"prompt_suffix": "Ответь одним словом.",
"samples": 12216,
"limit": null,
"model_backend": "qwen3_5",
"model_args_common": "enable_thinking=False",
"base_exact_match": 0.2861820563195809,
"adapter_exact_match": 0.48321872953503603,
"exact_match_delta": 0.19703667321545515,
"relative_improvement": 0.6885011441647597,
"base_stderr": 0.004089480999753636,
"adapter_stderr": 0.004521458266039995,
"base_correct": 3496,
"adapter_correct": 5903,
"correct_delta": 2407,
"base_results_path": "runs/lmms_eval/gqa_ru_qwen35_base_full/Qwen__Qwen3.5-0.8B/20260604_141134_results.json",
"adapter_results_path": "runs/lmms_eval/gqa_ru_qwen35_lora_full/artifacts__merged_qwen35_gqa_ru_full/20260604_145224_results.json"
},
"secondary_experiment": {
"base_model": "Qwen/Qwen2.5-VL-3B-Instruct",
"status": "Full GQA-ru training and full base-vs-adapter evaluation are in progress.",
"smoke_hf_artifact": "https://huggingface.co/lockR/vk-vlm-gqa-ru-qwen25vl-3b-lora-smoke",
"smoke_limit_100_base_exact_match": 0.39,
"smoke_limit_100_adapter_exact_match": 0.48
},
"limitations": [
"The vision encoder was frozen; LoRA adapters were trained only on the language model's attention projection layers.",
"MMBench-ru has not yet been measured.",
"The Qwen2.5-VL full experiment is still running and is not included in the primary result."
]
}