{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 30, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 281.53125, "epoch": 0.3333333333333333, "grad_norm": 2.773542642593384, "kl": 0.0580324565526098, "learning_rate": 4.985344892885899e-07, "loss": 0.0001, "reward": 2.4544273018836975, "reward_std": 2.3720036819577217, "rewards/concensus_correctness_reward_func": 0.8223749995231628, "rewards/consensus_reward_func": 0.125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.4375, "rewards/question_recreation_reward_func": 0.5142397312447429, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.5553124938160181, "step": 2 }, { "completion_length": 271.28125, "epoch": 0.6666666666666666, "grad_norm": 2.0007011890411377, "kl": 0.07696286379359663, "learning_rate": 4.869132927957006e-07, "loss": 0.0001, "reward": 1.217789612710476, "reward_std": 1.0034673381596804, "rewards/concensus_correctness_reward_func": 0.0598750002682209, "rewards/consensus_reward_func": 0.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.36272712517529726, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.6076875105500221, "step": 4 }, { "completion_length": 303.25, "epoch": 1.0, "grad_norm": 6.666475772857666, "kl": 0.0992446473101154, "learning_rate": 4.642142940418973e-07, "loss": 0.0001, "reward": 1.2962229549884796, "reward_std": 1.3163017462939024, "rewards/concensus_correctness_reward_func": 0.10431249998509884, "rewards/consensus_reward_func": 0.25, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.42141042836010456, "rewards/soft_format_reward_func": 0.015625, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3798749968409538, "step": 6 }, { "completion_length": 256.21875, "epoch": 1.3333333333333333, "grad_norm": 2.719036340713501, "kl": 0.08407116535818204, "learning_rate": 4.314988729807827e-07, "loss": 0.0001, "reward": 1.9946057051420212, "reward_std": 1.059820756316185, "rewards/concensus_correctness_reward_func": 0.3334375023841858, "rewards/consensus_reward_func": 0.375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.1875, "rewards/question_recreation_reward_func": 0.4136994816362858, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.684968750923872, "step": 8 }, { "completion_length": 321.0, "epoch": 1.6666666666666665, "grad_norm": 2.027712821960449, "kl": 0.10444008558988571, "learning_rate": 3.902967663405956e-07, "loss": 0.0001, "reward": 2.8985730558633804, "reward_std": 2.1391075775027275, "rewards/concensus_correctness_reward_func": 0.9828124903142452, "rewards/consensus_reward_func": 0.375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.6192605346441269, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.6715000197291374, "step": 10 }, { "completion_length": 289.78125, "epoch": 2.0, "grad_norm": 3.4199697971343994, "kl": 0.06481281516607851, "learning_rate": 3.4253453883497864e-07, "loss": 0.0001, "reward": 2.0985126718878746, "reward_std": 1.588769018650055, "rewards/concensus_correctness_reward_func": 0.3907500021159649, "rewards/consensus_reward_func": 0.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.5, "rewards/question_recreation_reward_func": 0.5003876239061356, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.6448750030249357, "step": 12 }, { "completion_length": 322.65625, "epoch": 2.3333333333333335, "grad_norm": 1.4510133266448975, "kl": 0.09526054270099849, "learning_rate": 2.9044549913819124e-07, "loss": 0.0001, "reward": 1.6081246063113213, "reward_std": 1.3514070957899094, "rewards/concensus_correctness_reward_func": 0.26324999891221523, "rewards/consensus_reward_func": 0.1875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.3845308255404234, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.5228437595069408, "step": 14 }, { "completion_length": 300.875, "epoch": 2.6666666666666665, "grad_norm": 32.19089126586914, "kl": 0.08323089242912829, "learning_rate": 2.3646527285364563e-07, "loss": 0.0001, "reward": 2.5157083868980408, "reward_std": 3.12582578510046, "rewards/concensus_correctness_reward_func": 1.4086875282227993, "rewards/consensus_reward_func": 0.125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.39945833198726177, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.45756249502301216, "step": 16 }, { "completion_length": 367.65625, "epoch": 3.0, "grad_norm": 2.201923131942749, "kl": 0.040471521380823106, "learning_rate": 1.8311791536769483e-07, "loss": 0.0, "reward": 1.7202124670147896, "reward_std": 1.1163596734404564, "rewards/concensus_correctness_reward_func": 0.15918750315904617, "rewards/consensus_reward_func": 0.125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.4375, "rewards/question_recreation_reward_func": 0.5503374617546797, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.43256248952820897, "step": 18 }, { "completion_length": 298.71875, "epoch": 3.3333333333333335, "grad_norm": 5.080014705657959, "kl": 0.1327864183112979, "learning_rate": 1.328978898250525e-07, "loss": 0.0001, "reward": 2.0473050475120544, "reward_std": 1.830180624499917, "rewards/concensus_correctness_reward_func": 0.684749998152256, "rewards/consensus_reward_func": 0.125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.38293003430590034, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.7296250090003014, "step": 20 }, { "completion_length": 321.9375, "epoch": 3.6666666666666665, "grad_norm": 1.9241300821304321, "kl": 0.07197530311532319, "learning_rate": 8.81534288045431e-08, "loss": 0.0001, "reward": 1.3912755213677883, "reward_std": 0.9820459727197886, "rewards/concensus_correctness_reward_func": 0.10918749868869781, "rewards/consensus_reward_func": 0.1875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.4397130124270916, "rewards/soft_format_reward_func": 0.015625, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.5142499897629023, "step": 22 }, { "completion_length": 347.75, "epoch": 4.0, "grad_norm": 2.118330240249634, "kl": 0.051218974171206355, "learning_rate": 5.097673357358906e-08, "loss": 0.0001, "reward": 2.470015749335289, "reward_std": 3.3017385974526405, "rewards/concensus_correctness_reward_func": 0.9248749949038029, "rewards/consensus_reward_func": 0.375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.375, "rewards/question_recreation_reward_func": 0.5278282500803471, "rewards/soft_format_reward_func": 0.015625, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.251687491312623, "step": 24 }, { "completion_length": 399.5625, "epoch": 4.333333333333333, "grad_norm": 1.9072325229644775, "kl": 0.042736936593428254, "learning_rate": 2.3106145082260774e-08, "loss": 0.0, "reward": 2.27708288282156, "reward_std": 2.7430423460900784, "rewards/concensus_correctness_reward_func": 0.8621874898672104, "rewards/consensus_reward_func": 0.25, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.375, "rewards/question_recreation_reward_func": 0.5607079211622477, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2291874964721501, "step": 26 }, { "completion_length": 354.09375, "epoch": 4.666666666666667, "grad_norm": 2.1720738410949707, "kl": 0.08552559395320714, "learning_rate": 5.844861072478335e-09, "loss": 0.0001, "reward": 1.2497381269931793, "reward_std": 1.0915438868105412, "rewards/concensus_correctness_reward_func": 0.14025000110268593, "rewards/consensus_reward_func": 0.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.44089438393712044, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.5279687470756471, "step": 28 }, { "completion_length": 294.125, "epoch": 5.0, "grad_norm": 1.8672877550125122, "kl": 0.04691072192508727, "learning_rate": 0.0, "loss": 0.0, "reward": 1.9176766276359558, "reward_std": 1.8577748499810696, "rewards/concensus_correctness_reward_func": 0.338749997317791, "rewards/consensus_reward_func": 0.3125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.1875, "rewards/question_recreation_reward_func": 0.5273328814655542, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.5515937563031912, "step": 30 }, { "epoch": 5.0, "step": 30, "total_flos": 0.0, "train_loss": 7.583472033729776e-05, "train_runtime": 406.4207, "train_samples_per_second": 1.181, "train_steps_per_second": 0.074 } ], "logging_steps": 2, "max_steps": 30, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }