{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 20, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 472.75, "epoch": 0.1, "grad_norm": 4.972134113311768, "kl": 0.0, "learning_rate": 4.965903258506806e-07, "loss": -0.0, "reward": 0.8231485052965581, "reward_std": 1.2553124479018152, "rewards/concensus_correctness_reward_func": 0.13868749793618917, "rewards/consensus_reward_func": 0.1875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.1875, "rewards/question_recreation_reward_func": 0.38033601630013436, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.07087501464411616, "step": 2 }, { "completion_length": 404.34375, "epoch": 0.2, "grad_norm": 7.311157703399658, "kl": 0.0009687572946859291, "learning_rate": 4.698684378016222e-07, "loss": 0.0, "reward": 0.47689660359174013, "reward_std": 0.7753186682239175, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.28077160858083516, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.07112499827053398, "step": 4 }, { "completion_length": 397.8125, "epoch": 0.3, "grad_norm": 12.540438652038574, "kl": 0.001213116767758038, "learning_rate": 4.193203929064353e-07, "loss": 0.0, "reward": 0.320636167190969, "reward_std": 0.6216559750464512, "rewards/concensus_correctness_reward_func": 0.03693750128149986, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.1875, "rewards/question_recreation_reward_func": 0.1836361801251769, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08743749931454659, "step": 6 }, { "completion_length": 253.09375, "epoch": 0.4, "grad_norm": 117.53514862060547, "kl": 0.04831917503543082, "learning_rate": 3.5042385616324236e-07, "loss": 0.0, "reward": 0.3986110386031214, "reward_std": 0.4029649797012098, "rewards/concensus_correctness_reward_func": 0.015625, "rewards/consensus_reward_func": 0.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.2335172927705571, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08696874883025885, "step": 8 }, { "completion_length": 313.5625, "epoch": 0.5, "grad_norm": 8.876846313476562, "kl": 0.00187252888281364, "learning_rate": 2.706448363680831e-07, "loss": 0.0, "reward": 0.46869485336355865, "reward_std": 0.6680965052219108, "rewards/concensus_correctness_reward_func": -0.011062500067055225, "rewards/consensus_reward_func": 0.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.28053860645741224, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.01171874301508069, "step": 10 }, { "completion_length": 403.1875, "epoch": 0.6, "grad_norm": 6.0552473068237305, "kl": 0.0020919858761772048, "learning_rate": 1.886286282148002e-07, "loss": 0.0, "reward": 0.5102090919390321, "reward_std": 0.9569540836382657, "rewards/concensus_correctness_reward_func": 0.10056249983608723, "rewards/consensus_reward_func": 0.125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.30486532411305234, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08271874906495214, "step": 12 }, { "completion_length": 457.28125, "epoch": 0.7, "grad_norm": 4.433928489685059, "kl": 0.0026758583626360632, "learning_rate": 1.1326296046939333e-07, "loss": 0.0, "reward": 0.9141725162044168, "reward_std": 0.7021154585527256, "rewards/concensus_correctness_reward_func": 0.02274999930523336, "rewards/consensus_reward_func": 0.1875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.4190162750892341, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2224062472814694, "step": 14 }, { "completion_length": 321.5, "epoch": 0.8, "grad_norm": 20.360118865966797, "kl": 0.0045296397038328, "learning_rate": 5.271487265090163e-08, "loss": 0.0, "reward": 0.5812048423686065, "reward_std": 0.5352981987709882, "rewards/concensus_correctness_reward_func": 0.03956250101327896, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.1875, "rewards/question_recreation_reward_func": 0.3840485939872451, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.029906250536441803, "step": 16 }, { "completion_length": 505.71875, "epoch": 0.9, "grad_norm": 5.112046718597412, "kl": 0.0033627474076638464, "learning_rate": 1.3545689574841341e-08, "loss": 0.0, "reward": 0.8375814352184534, "reward_std": 0.8647369077662006, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.3907376849092543, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3218437498435378, "step": 18 }, { "completion_length": 352.875, "epoch": 1.0, "grad_norm": 3.8864283561706543, "kl": 0.0029916863204562105, "learning_rate": 0.0, "loss": 0.0, "reward": 1.0474416308570653, "reward_std": 0.6638899724930525, "rewards/concensus_correctness_reward_func": 0.09231250081211329, "rewards/consensus_reward_func": 0.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.4742853690404445, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1683437500614673, "step": 20 }, { "epoch": 1.0, "step": 20, "total_flos": 0.0, "train_loss": 6.804741133237258e-06, "train_runtime": 399.5968, "train_samples_per_second": 0.801, "train_steps_per_second": 0.05 } ], "logging_steps": 2, "max_steps": 20, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }