{ "results": { "gqa-ru": { "alias": "gqa-ru", "exact_match,none": 0.48321872953503603, "exact_match_stderr,none": 0.004521458266039995, "exact_match_stderr_clt,none": 0.004521458266040028, "exact_match_stderr_clustered,none": "N/A" } }, "group_subtasks": { "gqa-ru": [] }, "configs": { "gqa-ru": { "task": "gqa-ru", "dataset_path": "deepvk/GQA-ru", "dataset_name": "testdev_balanced_instructions", "dataset_kwargs": { "token": true }, "test_split": "testdev", "full_docs": false, "process_results_use_image": false, "doc_to_visual": "", "doc_to_text": "", "doc_to_target": "answer", "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "num_fewshot": 0, "metric_list": [ { "metric": "exact_match", "aggregation": "mean", "higher_is_better": true, "ignore_case": true, "ignore_punctuation": true } ], "output_type": "generate_until", "generation_kwargs": { "max_new_tokens": 16, "temperature": 0.0, "top_p": 1.0, "num_beams": 1, "do_sample": false, "until": [ "\n\n" ] }, "repeats": 1, "should_decontaminate": false, "score_key": "score", "metadata": [ { "version": 0.0 } ], "lmms_eval_specific_kwargs": { "default": { "pre_prompt": "", "post_prompt": "\nОтветь одним словом." }, "pre_prompt": "", "post_prompt": "\nОтветь одним словом." } } }, "versions": { "gqa-ru": "Yaml" }, "n-shot": { "gqa-ru": 0 }, "higher_is_better": { "gqa-ru": { "exact_match": true } }, "n-samples": { "gqa-ru": { "original": 12216, "effective": 12216 } }, "config": { "model": "qwen3_5", "model_args": "pretrained=artifacts/merged_qwen35_gqa_ru_full,enable_thinking=False", "batch_size": "1", "batch_sizes": [], "device": "cuda:0", "use_cache": null, "limit": null, "offset": 0, "bootstrap_iters": 100000, "gen_kwargs": "", "random_seed": 0, "numpy_seed": 1234, "torch_seed": 1234, "fewshot_seed": 1234, "resolved_cli_args": { "config": "", "model": "qwen3_5", "tasks": "gqa-ru", "model_args": "pretrained=artifacts/merged_qwen35_gqa_ru_full,enable_thinking=False", "launcher_args": null, "num_fewshot": null, "batch_size": "1", "max_batch_size": null, "device": "cuda:0", "output_path": "runs/lmms_eval/gqa_ru_qwen35_lora_full", "limit": null, "offset": 0, "use_cache": null, "cache_requests": null, "check_integrity": false, "write_out": false, "log_samples": true, "wandb_log_samples": false, "log_samples_suffix": "model_outputs", "system_instruction": null, "apply_chat_template": false, "fewshot_as_multiturn": false, "show_config": false, "include_path": null, "gen_kwargs": "", "reasoning_tags": "[[\"\", \"\"], [\"\", \"\"]]", "verbosity": "INFO", "wandb_args": "", "timezone": "Asia/Singapore", "hf_hub_log_args": ",output_path=runs/lmms_eval/gqa_ru_qwen35_lora_full", "predict_only": false, "seed": [ 0, 1234, 1234, 1234 ], "trust_remote_code": true, "process_with_media": false, "agentic_trace_mode": "basic", "force_simple": false, "tui": false, "repeats": 1, "baseline": null, "max_tokens": null, "power_analysis": false, "effect_size": 0.03, "alpha": 0.05, "power": 0.8, "correlation": 0.5, "std_a": null, "std_b": null } }, "git_hash": "66abb85", "git_branch": "HEAD", "lmms_eval_version": "HEAD@66abb85", "date": "20260604_145224", "usage": {}, "efficiency": { "by_task": { "gqa-ru": { "docs": 12216.0, "docs_with_token_counts": 0.0, "total_input_tokens": 0.0, "total_output_tokens": 0.0, "total_tokens": 0.0, "total_correct_score": 5903.0, "avg_output_tokens_per_sample": 0.0, "tokens_per_correct_answer": 0.0 } }, "overall": { "docs": 12216.0, "docs_with_token_counts": 0.0, "total_input_tokens": 0.0, "total_output_tokens": 0.0, "total_tokens": 0.0, "total_correct_score": 5903.0, "avg_output_tokens_per_sample": 0.0, "tokens_per_correct_answer": 0.0 } }, "task_hashes": { "gqa-ru": "2c8b244017701ddcaba1c1b8d0b68381736ba7af712eb04662ecd1049801029e" }, "model_source": "qwen3_5", "model_name": "artifacts/merged_qwen35_gqa_ru_full", "model_name_sanitized": "artifacts__merged_qwen35_gqa_ru_full", "system_instruction": null, "system_instruction_sha": null, "fewshot_as_multiturn": false, "chat_template": null, "chat_template_sha": null, "start_time": 96501.56674432, "end_time": 98634.818651018, "total_evaluation_time_seconds": "2133.2519066979876" }