prodigyhuh's picture
Upload promoted hard recall micro boost adapter
c6ce9e9 verified
{
"base_model": "Qwen/Qwen3-1.7B",
"adapter": "/tmp/atomicvision_publish_runner/output/train/checkpoint-1",
"episodes_per_difficulty": 32,
"seed_start": 10000,
"seed_policy": {
"sft_train": {
"start": 1000,
"stop": 4000
},
"grpo_train": {
"start": 4000,
"stop": 8000
},
"heldout_eval": {
"start": 10000,
"stop": 11000
}
},
"heldout_seed_enforced": true,
"max_tool_steps": 3,
"max_new_tokens": 180,
"modes": [
"strict"
],
"results": {
"medium": {
"baseline_prior_submit": {
"episodes": 32,
"mean_reward": 4.65651721875,
"mean_f1": 0.80580365625,
"mean_mae": 0.0244615625,
"mean_steps": 2.0,
"mean_scan_cost": 1.5,
"done_rate": 1.0,
"tool_failure_rate": 0.0,
"mean_repeated_tool_calls": 0.0,
"strict_tool_call_pass_rate": 1.0,
"normalized_tool_call_pass_rate": 1.0,
"normalized_tool_call_repair_rate": 0.0,
"first_action_valid_rate": 1.0,
"first_action_ask_prior_rate": 1.0,
"submit_action_rate": 1.0,
"mean_identity_reward": 3.2232142812499998,
"mean_concentration_reward": 2.23155896875,
"mean_confidence_reward": 0.270494,
"mean_false_positive_penalty": -0.1875,
"mean_missed_defect_penalty": -0.28125,
"mean_timeout_penalty": 0.0,
"mean_outcome_reward_total": 5.72526725,
"mean_penalty_total": -1.06875
},
"strict_adapter": {
"episodes": 32,
"mean_reward": 4.50648265625,
"mean_f1": 0.789137,
"mean_mae": 0.027124218749999998,
"mean_steps": 2.0,
"mean_scan_cost": 1.5,
"done_rate": 1.0,
"tool_failure_rate": 0.0,
"mean_repeated_tool_calls": 0.0,
"strict_tool_call_pass_rate": 1.0,
"normalized_tool_call_pass_rate": 1.0,
"normalized_tool_call_repair_rate": 0.0,
"first_action_valid_rate": 1.0,
"first_action_ask_prior_rate": 1.0,
"submit_action_rate": 1.0,
"mean_identity_reward": 3.156547625,
"mean_concentration_reward": 2.16269634375,
"mean_confidence_reward": 0.29348875,
"mean_false_positive_penalty": -0.1875,
"mean_missed_defect_penalty": -0.31875,
"mean_timeout_penalty": 0.0,
"mean_outcome_reward_total": 5.61273271875,
"mean_penalty_total": -1.10625
},
"strict_failures": []
},
"hard": {
"baseline_prior_submit": {
"episodes": 32,
"mean_reward": 5.01651990625,
"mean_f1": 0.85153328125,
"mean_mae": 0.02220903125,
"mean_steps": 2.0,
"mean_scan_cost": 1.5,
"done_rate": 1.0,
"tool_failure_rate": 0.0,
"mean_repeated_tool_calls": 0.0,
"strict_tool_call_pass_rate": 1.0,
"normalized_tool_call_pass_rate": 1.0,
"normalized_tool_call_repair_rate": 0.0,
"first_action_valid_rate": 1.0,
"first_action_ask_prior_rate": 1.0,
"submit_action_rate": 1.0,
"mean_identity_reward": 3.40613278125,
"mean_concentration_reward": 2.3444645625,
"mean_confidence_reward": 0.53779759375,
"mean_false_positive_penalty": -0.109375,
"mean_missed_defect_penalty": -0.5625,
"mean_timeout_penalty": 0.0,
"mean_outcome_reward_total": 6.2883949375,
"mean_penalty_total": -1.2718749999999999
},
"strict_adapter": {
"episodes": 32,
"mean_reward": 4.714775875,
"mean_f1": 0.8206800937500001,
"mean_mae": 0.02552296875,
"mean_steps": 2.0,
"mean_scan_cost": 1.5,
"done_rate": 1.0,
"tool_failure_rate": 0.0,
"mean_repeated_tool_calls": 0.0,
"strict_tool_call_pass_rate": 1.0,
"normalized_tool_call_pass_rate": 1.0,
"normalized_tool_call_repair_rate": 0.0,
"first_action_valid_rate": 1.0,
"first_action_ask_prior_rate": 1.0,
"submit_action_rate": 1.0,
"mean_identity_reward": 3.282720125,
"mean_concentration_reward": 2.243760375,
"mean_confidence_reward": 0.5257955,
"mean_false_positive_penalty": -0.09375,
"mean_missed_defect_penalty": -0.64375,
"mean_timeout_penalty": 0.0,
"mean_outcome_reward_total": 6.052276,
"mean_penalty_total": -1.3375
},
"strict_failures": []
}
}
}