{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 500, "global_step": 20, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 302.6875, "epoch": 0.4, "grad_norm": 2.107053279876709, "kl": 0.001833398244343698, "learning_rate": 5e-07, "loss": 0.0, "reward": 0.6121698692440987, "reward_std": 0.7188298236578703, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.34723235201090574, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13993750419467688, "step": 2 }, { "completion_length": 302.5, "epoch": 0.8, "grad_norm": 2.666794776916504, "kl": 0.02018403948022751, "learning_rate": 4.864543104251586e-07, "loss": 0.0, "reward": 0.8020964106544852, "reward_std": 0.5309067370835692, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.5, "rewards/question_recreation_reward_func": 0.29853392392396927, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0035625100135803223, "step": 4 }, { "completion_length": 492.0, "epoch": 1.2, "grad_norm": 1.4782581329345703, "kl": 0.0017244854752789252, "learning_rate": 4.472851273490984e-07, "loss": 0.0, "reward": 0.753280152566731, "reward_std": 0.9823426175862551, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.375, "rewards/question_recreation_reward_func": 0.28684263746254146, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09143750462681055, "step": 6 }, { "completion_length": 458.9375, "epoch": 1.6, "grad_norm": 2.416404962539673, "kl": 0.0026180234526691493, "learning_rate": 3.867370395306068e-07, "loss": 0.0, "reward": 1.2846465739421546, "reward_std": 1.141023407690227, "rewards/concensus_correctness_reward_func": 0.21074999868869781, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.45814658515155315, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36574999801814556, "step": 8 }, { "completion_length": 360.6875, "epoch": 2.0, "grad_norm": 8.60208797454834, "kl": 0.0025160184886772186, "learning_rate": 3.1137137178519977e-07, "loss": 0.0, "reward": 1.591901202686131, "reward_std": 1.8193573111202568, "rewards/concensus_correctness_reward_func": 0.5272499993443489, "rewards/consensus_reward_func": 0.125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.39571370277553797, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2939375042915344, "step": 10 }, { "completion_length": 452.625, "epoch": 2.4, "grad_norm": 2.5068929195404053, "kl": 0.0019544935057638213, "learning_rate": 2.2935516363191693e-07, "loss": 0.0, "reward": 0.7135184425860643, "reward_std": 0.8140879347920418, "rewards/concensus_correctness_reward_func": 0.1171249970793724, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.30308092199265957, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.04331250116229057, "step": 12 }, { "completion_length": 315.0625, "epoch": 2.8, "grad_norm": 3.2494900226593018, "kl": 0.01614819017413538, "learning_rate": 1.4957614383675767e-07, "loss": 0.0, "reward": 1.636540960520506, "reward_std": 2.4551502619870007, "rewards/concensus_correctness_reward_func": 1.3697500005364418, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.2268535066395998, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08506249729543924, "step": 14 }, { "completion_length": 459.4375, "epoch": 3.2, "grad_norm": 2.0964574813842773, "kl": 0.0013882217381251394, "learning_rate": 8.067960709356478e-08, "loss": 0.0, "reward": 0.8315258803777397, "reward_std": 0.5543470710981637, "rewards/concensus_correctness_reward_func": 0.3542499914765358, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.28727589966729283, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.06000000424683094, "step": 16 }, { "completion_length": 476.125, "epoch": 3.6, "grad_norm": 2.4637372493743896, "kl": 0.001981627065106295, "learning_rate": 3.013156219837776e-08, "loss": 0.0, "reward": 1.526203976944089, "reward_std": 1.0305481338873506, "rewards/concensus_correctness_reward_func": 0.4403750002384186, "rewards/consensus_reward_func": 0.125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.34639142733067274, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3644375130534172, "step": 18 }, { "completion_length": 404.5, "epoch": 4.0, "grad_norm": 3.0885298252105713, "kl": 0.007382418974884786, "learning_rate": 3.4096741493194193e-09, "loss": 0.0, "reward": 2.3384321611374617, "reward_std": 2.554030758328736, "rewards/concensus_correctness_reward_func": 1.3282499983906746, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.625, "rewards/question_recreation_reward_func": 0.39343212731182575, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.00824999762699008, "step": 20 }, { "epoch": 4.0, "step": 20, "total_flos": 0.0, "train_loss": 5.768292692209798e-06, "train_runtime": 200.4297, "train_samples_per_second": 0.798, "train_steps_per_second": 0.1 } ], "logging_steps": 2, "max_steps": 20, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }