{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.555555555555555, "eval_steps": 500, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 254.0, "epoch": 0.2222222222222222, "grad_norm": 2.455533266067505, "kl": 0.0016071582067525014, "learning_rate": 5e-07, "loss": 0.0, "reward": 0.8821750227361917, "reward_std": 0.7979278466664255, "rewards/concensus_correctness_reward_func": 0.049937501549720764, "rewards/consensus_reward_func": 0.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.3125, "rewards/question_recreation_reward_func": 0.26773752458393574, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1895000054500997, "step": 2 }, { "completion_length": 369.3125, "epoch": 0.4444444444444444, "grad_norm": 2.520486354827881, "kl": 0.0038820735207991675, "learning_rate": 4.978612153434526e-07, "loss": 0.0, "reward": 0.3699257434345782, "reward_std": 0.5709783472120762, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.19958198571112007, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17034375481307507, "step": 4 }, { "completion_length": 467.875, "epoch": 0.6666666666666666, "grad_norm": 1.3698705434799194, "kl": 0.0013275225574034266, "learning_rate": 4.91481456572267e-07, "loss": 0.0, "reward": 0.6990349255502224, "reward_std": 0.9675047248601913, "rewards/concensus_correctness_reward_func": 0.046875, "rewards/consensus_reward_func": 0.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.33225369080901146, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19490624405443668, "step": 6 }, { "completion_length": 323.34375, "epoch": 0.8888888888888888, "grad_norm": 2.7056479454040527, "kl": 0.00173920994711807, "learning_rate": 4.809698831278217e-07, "loss": 0.0, "reward": 0.5385207324288785, "reward_std": 0.8355418732389808, "rewards/concensus_correctness_reward_func": 0.02787500061094761, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.2965519982390106, "rewards/soft_format_reward_func": 0.015625, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.07346874848008156, "step": 8 }, { "completion_length": 276.09375, "epoch": 1.1111111111111112, "grad_norm": 3.8186511993408203, "kl": 0.0028932989152963273, "learning_rate": 4.6650635094610966e-07, "loss": 0.0, "reward": 1.1083624437451363, "reward_std": 1.7071605939418077, "rewards/concensus_correctness_reward_func": 0.625, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.1875, "rewards/question_recreation_reward_func": 0.25786245451308787, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.03799999970942736, "step": 10 }, { "completion_length": 255.59375, "epoch": 1.3333333333333333, "grad_norm": 34.07455062866211, "kl": 0.0037407633935799822, "learning_rate": 4.483383350728088e-07, "loss": 0.0, "reward": 1.2880361340939999, "reward_std": 1.625240983441472, "rewards/concensus_correctness_reward_func": 0.625, "rewards/consensus_reward_func": 0.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.32416112907230854, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15137499663978815, "step": 12 }, { "completion_length": 285.21875, "epoch": 1.5555555555555556, "grad_norm": 3.052701234817505, "kl": 0.002863182016881183, "learning_rate": 4.2677669529663686e-07, "loss": 0.0, "reward": 0.607268082909286, "reward_std": 0.6088872440159321, "rewards/concensus_correctness_reward_func": 0.022187499329447746, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.38814307004213333, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19693750143051147, "step": 14 }, { "completion_length": 347.65625, "epoch": 1.7777777777777777, "grad_norm": 5.15467643737793, "kl": 0.0014800850040046498, "learning_rate": 4.0219035725218013e-07, "loss": 0.0, "reward": 0.1809218251146376, "reward_std": 0.7999278882052749, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.2310780775733292, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.112656245008111, "step": 16 }, { "completion_length": 441.25, "epoch": 2.0, "grad_norm": 4.815533638000488, "kl": 0.0013374313130043447, "learning_rate": 3.75e-07, "loss": 0.0, "reward": 0.5386425573378801, "reward_std": 0.6832009451463819, "rewards/concensus_correctness_reward_func": 0.046875, "rewards/consensus_reward_func": 0.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.23683004680788144, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19243750348687172, "step": 18 }, { "completion_length": 437.25, "epoch": 2.2222222222222223, "grad_norm": 2.5218758583068848, "kl": 0.0015388188403449021, "learning_rate": 3.4567085809127245e-07, "loss": 0.0, "reward": 0.7281985133886337, "reward_std": 0.6127606704831123, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.45726102218031883, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2709374986588955, "step": 20 }, { "completion_length": 387.75, "epoch": 2.4444444444444446, "grad_norm": 1.3259650468826294, "kl": 0.0036219921821611933, "learning_rate": 3.147047612756302e-07, "loss": 0.0, "reward": 0.2731318287551403, "reward_std": 0.2530765999108553, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.20441308384761214, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.06871875282377005, "step": 22 }, { "completion_length": 447.09375, "epoch": 2.6666666666666665, "grad_norm": 6.329639434814453, "kl": 0.0021589112002402544, "learning_rate": 2.826315480550129e-07, "loss": 0.0, "reward": 1.2416966576129198, "reward_std": 2.0539041608572006, "rewards/concensus_correctness_reward_func": 0.625, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.1875, "rewards/question_recreation_reward_func": 0.3013216257095337, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1278750030323863, "step": 24 }, { "completion_length": 301.875, "epoch": 2.888888888888889, "grad_norm": 4.056438446044922, "kl": 0.0024828215537127107, "learning_rate": 2.5e-07, "loss": 0.0, "reward": 0.2829342377372086, "reward_std": 0.33365900977514684, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.1184342410415411, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16449999809265137, "step": 26 }, { "completion_length": 341.15625, "epoch": 3.111111111111111, "grad_norm": 2.550891876220703, "kl": 0.002100169222103432, "learning_rate": 2.1736845194498716e-07, "loss": 0.0, "reward": 0.3925126292742789, "reward_std": 0.7323208572342992, "rewards/concensus_correctness_reward_func": 0.014562499709427357, "rewards/consensus_reward_func": 0.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.29879387514665723, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.016656249528750777, "step": 28 }, { "completion_length": 388.03125, "epoch": 3.3333333333333335, "grad_norm": 1.4323590993881226, "kl": 0.004696541313023772, "learning_rate": 1.8529523872436977e-07, "loss": 0.0, "reward": 0.05710854474455118, "reward_std": 0.5547531270422041, "rewards/concensus_correctness_reward_func": 0.0078125, "rewards/consensus_reward_func": 0.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.19385854969732463, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.20706249913200736, "step": 30 }, { "completion_length": 352.90625, "epoch": 3.5555555555555554, "grad_norm": 2.221634864807129, "kl": 0.002907156704168301, "learning_rate": 1.5432914190872756e-07, "loss": 0.0, "reward": 0.3871938968077302, "reward_std": 0.3934658619109541, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.2629126524552703, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1242812555283308, "step": 32 }, { "completion_length": 384.375, "epoch": 3.7777777777777777, "grad_norm": 2.7794787883758545, "kl": 0.001867200349806808, "learning_rate": 1.2500000000000005e-07, "loss": 0.0, "reward": 0.9573859982192516, "reward_std": 1.0359943909570575, "rewards/concensus_correctness_reward_func": 0.11443750280886889, "rewards/consensus_reward_func": 0.1875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.3830109708942473, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.20993749890476465, "step": 34 }, { "completion_length": 361.375, "epoch": 4.0, "grad_norm": 3.378568172454834, "kl": 0.0019658175588119775, "learning_rate": 9.780964274781983e-08, "loss": 0.0, "reward": 0.550561910495162, "reward_std": 0.5597149441018701, "rewards/concensus_correctness_reward_func": 0.03125, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.27071817917749286, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12359375134110451, "step": 36 }, { "completion_length": 282.96875, "epoch": 4.222222222222222, "grad_norm": 4.209115505218506, "kl": 0.0013284565284266137, "learning_rate": 7.322330470336313e-08, "loss": 0.0, "reward": 1.8079969780519605, "reward_std": 2.005708161741495, "rewards/concensus_correctness_reward_func": 1.268062500283122, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.20130947075085714, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08862500544637442, "step": 38 }, { "completion_length": 431.4375, "epoch": 4.444444444444445, "grad_norm": 29.043092727661133, "kl": 0.003590297739719972, "learning_rate": 5.166166492719124e-08, "loss": 0.0, "reward": 0.5267084827646613, "reward_std": 0.684504278935492, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.35548973828554153, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10871875286102295, "step": 40 }, { "completion_length": 384.59375, "epoch": 4.666666666666667, "grad_norm": 29.70347785949707, "kl": 0.001960429195605684, "learning_rate": 3.349364905389032e-08, "loss": 0.0, "reward": 1.234227987471968, "reward_std": 1.9329941319301724, "rewards/concensus_correctness_reward_func": 0.6629374995827675, "rewards/consensus_reward_func": 0.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.257196772727184, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12659375229850411, "step": 42 }, { "completion_length": 364.0625, "epoch": 4.888888888888889, "grad_norm": 55.00151062011719, "kl": 0.002629825277836062, "learning_rate": 1.9030116872178314e-08, "loss": 0.0, "reward": 0.5272765271365643, "reward_std": 0.8430194035172462, "rewards/concensus_correctness_reward_func": 0.01837499998509884, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.31580778677016497, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13059375621378422, "step": 44 }, { "completion_length": 304.40625, "epoch": 5.111111111111111, "grad_norm": 42.15534973144531, "kl": 0.005508741844096221, "learning_rate": 8.518543427732949e-09, "loss": 0.0, "reward": 0.4785704296082258, "reward_std": 0.8159382930025458, "rewards/concensus_correctness_reward_func": 0.01837499998509884, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.33713291585445404, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.06056249886751175, "step": 46 }, { "completion_length": 353.375, "epoch": 5.333333333333333, "grad_norm": 6.933172225952148, "kl": 0.002127421241311822, "learning_rate": 2.1387846565474044e-09, "loss": 0.0, "reward": 0.6708917245268822, "reward_std": 0.9302016571164131, "rewards/concensus_correctness_reward_func": 0.006812499836087227, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.3125, "rewards/question_recreation_reward_func": 0.2251729667186737, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1264062523841858, "step": 48 }, { "completion_length": 346.9375, "epoch": 5.555555555555555, "grad_norm": 7.681344985961914, "kl": 0.0038058556092437357, "learning_rate": 0.0, "loss": 0.0, "reward": 0.41686041094362736, "reward_std": 0.7147915656678379, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.2795791446696967, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13728125765919685, "step": 50 }, { "epoch": 5.555555555555555, "step": 50, "total_flos": 0.0, "train_loss": 2.602725280667073e-06, "train_runtime": 1884.3714, "train_samples_per_second": 0.425, "train_steps_per_second": 0.027 } ], "logging_steps": 2, "max_steps": 50, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }