{ "project": "VK Education Vision-Language Modeling", "author": { "name": "Ибрагимов Далгат Магомедалиевич", "institution": "МАИ, институт 8", "group": "М8О-308Б-32" }, "primary_hf_artifact": "https://huggingface.co/lockR/vk-vlm-gqa-ru-qwen35-08b-lora", "primary_run": "gqa_ru_qwen35_0_8b_lora_fast_v1", "base_model": "Qwen/Qwen3.5-0.8B", "adapter_type": "LoRA", "dataset": { "name": "deepvk/GQA-ru", "source": "https://huggingface.co/datasets/deepvk/GQA-ru", "train_samples": 38019, "validation_samples": 1981, "testdev_samples": 12216, "usage": "Image-question-answer records were used for multimodal VLM LoRA fine-tuning and official lmms-eval evaluation." }, "training": { "multimodal": true, "vision_encoder_trainable": false, "lora_scope": "language_model attention projection layers", "seed": 42, "epochs": 1.0, "batch_size": 8, "learning_rate": 0.0002, "mixed_precision": "bf16", "max_seq_length": 1024, "image_resolution": 336, "lora_r": 16, "lora_alpha": 32, "lora_dropout": 0.05, "target_modules": [ "q_proj", "k_proj", "v_proj", "o_proj" ], "best_checkpoint": "checkpoint-4560", "best_metric_name": "eval_loss", "best_metric_value": 0.4337001144886017 }, "train_metrics": { "train_runtime": 6219.1947, "train_samples_per_second": 6.113, "train_steps_per_second": 0.764, "train_loss": 0.04432422036801592, "epoch": 1.0, "eval_loss": 0.4337001144886017, "eval_runtime": 116.0177, "eval_samples_per_second": 17.075, "eval_steps_per_second": 1.069, "metrics_path": "runs/gqa_ru_qwen35_0_8b_lora_fast_v1/train_metrics.json" }, "official_benchmark_full": { "runner": "lmms-eval", "task": "gqa-ru", "dataset_path": "deepvk/GQA-ru", "dataset_name": "testdev_balanced_instructions", "split": "testdev", "metric": "exact_match", "metric_higher_is_better": true, "prompt_suffix": "Ответь одним словом.", "samples": 12216, "limit": null, "model_backend": "qwen3_5", "model_args_common": "enable_thinking=False", "base_exact_match": 0.2861820563195809, "adapter_exact_match": 0.48321872953503603, "exact_match_delta": 0.19703667321545515, "relative_improvement": 0.6885011441647597, "base_stderr": 0.004089480999753636, "adapter_stderr": 0.004521458266039995, "base_correct": 3496, "adapter_correct": 5903, "correct_delta": 2407, "base_results_path": "runs/lmms_eval/gqa_ru_qwen35_base_full/Qwen__Qwen3.5-0.8B/20260604_141134_results.json", "adapter_results_path": "runs/lmms_eval/gqa_ru_qwen35_lora_full/artifacts__merged_qwen35_gqa_ru_full/20260604_145224_results.json" }, "secondary_experiment": { "base_model": "Qwen/Qwen2.5-VL-3B-Instruct", "status": "Full GQA-ru training and full base-vs-adapter evaluation are in progress.", "smoke_hf_artifact": "https://huggingface.co/lockR/vk-vlm-gqa-ru-qwen25vl-3b-lora-smoke", "smoke_limit_100_base_exact_match": 0.39, "smoke_limit_100_adapter_exact_match": 0.48 }, "limitations": [ "The vision encoder was frozen; LoRA was trained only in language model attention projection layers.", "MMBench-ru has not yet been measured.", "The Qwen2.5-VL full experiment is still running and is not included in the primary result." ] }