{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.4934210526315789, "eval_steps": 500, "global_step": 150, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 10.125, "epoch": 0.003289473684210526, "grad_norm": 1.2648940086364746, "kl": 5.090309143066406, "learning_rate": 9.967105263157895e-07, "loss": 1.0181, "reward": 0.76214998960495, "reward_std": 0.04795105755329132, "rewards/consultenv_reward": 0.76214998960495, "step": 1 }, { "completion_length": 6.0, "epoch": 0.006578947368421052, "grad_norm": 0.040654104202985764, "kl": 6.3098578453063965, "learning_rate": 9.93421052631579e-07, "loss": 1.262, "reward": 0.9271000027656555, "reward_std": 0.0, "rewards/consultenv_reward": 0.9271000027656555, "step": 2 }, { "completion_length": 6.0, "epoch": 0.009868421052631578, "grad_norm": 0.05010877922177315, "kl": 5.868729114532471, "learning_rate": 9.901315789473685e-07, "loss": 1.1737, "reward": 0.8032000064849854, "reward_std": 0.0, "rewards/consultenv_reward": 0.8032000064849854, "step": 3 }, { "completion_length": 6.0, "epoch": 0.013157894736842105, "grad_norm": 0.05608519911766052, "kl": 4.800868034362793, "learning_rate": 9.868421052631579e-07, "loss": 0.9602, "reward": 0.7289999723434448, "reward_std": 0.0, "rewards/consultenv_reward": 0.7289999723434448, "step": 4 }, { "completion_length": 9.75, "epoch": 0.01644736842105263, "grad_norm": 1.6945240497589111, "kl": 5.157371520996094, "learning_rate": 9.835526315789472e-07, "loss": 1.0315, "reward": 0.8761749863624573, "reward_std": 0.004767955280840397, "rewards/consultenv_reward": 0.8761749863624573, "step": 5 }, { "completion_length": 6.0, "epoch": 0.019736842105263157, "grad_norm": 0.046472445130348206, "kl": 4.587093830108643, "learning_rate": 9.802631578947368e-07, "loss": 0.9174, "reward": 0.9162999987602234, "reward_std": 0.0, "rewards/consultenv_reward": 0.9162999987602234, "step": 6 }, { "completion_length": 8.375, "epoch": 0.023026315789473683, "grad_norm": 2.0128767490386963, "kl": 5.368254661560059, "learning_rate": 9.769736842105262e-07, "loss": 1.0737, "reward": 0.7254500389099121, "reward_std": 0.16540874540805817, "rewards/consultenv_reward": 0.7254500389099121, "step": 7 }, { "completion_length": 6.0, "epoch": 0.02631578947368421, "grad_norm": 0.8054580092430115, "kl": 6.3482818603515625, "learning_rate": 9.736842105263158e-07, "loss": 1.2697, "reward": 0.8036999702453613, "reward_std": 0.0, "rewards/consultenv_reward": 0.8036999702453613, "step": 8 }, { "completion_length": 6.0, "epoch": 0.029605263157894735, "grad_norm": 0.09759554266929626, "kl": 5.14717435836792, "learning_rate": 9.703947368421054e-07, "loss": 1.0294, "reward": 0.7419999837875366, "reward_std": 0.0, "rewards/consultenv_reward": 0.7419999837875366, "step": 9 }, { "completion_length": 6.0, "epoch": 0.03289473684210526, "grad_norm": 0.06322209537029266, "kl": 5.835409641265869, "learning_rate": 9.671052631578947e-07, "loss": 1.1671, "reward": 0.7939000129699707, "reward_std": 0.0, "rewards/consultenv_reward": 0.7939000129699707, "step": 10 }, { "completion_length": 6.0, "epoch": 0.03618421052631579, "grad_norm": 0.020609136670827866, "kl": 6.752837657928467, "learning_rate": 9.63815789473684e-07, "loss": 1.3506, "reward": 0.9244999885559082, "reward_std": 0.0, "rewards/consultenv_reward": 0.9244999885559082, "step": 11 }, { "completion_length": 6.0, "epoch": 0.039473684210526314, "grad_norm": 0.0241270512342453, "kl": 6.196539402008057, "learning_rate": 9.605263157894737e-07, "loss": 1.2393, "reward": 0.826200008392334, "reward_std": 0.0, "rewards/consultenv_reward": 0.826200008392334, "step": 12 }, { "completion_length": 6.0, "epoch": 0.04276315789473684, "grad_norm": 0.011420820839703083, "kl": 5.5568623542785645, "learning_rate": 9.57236842105263e-07, "loss": 1.1114, "reward": 0.6934999823570251, "reward_std": 0.0, "rewards/consultenv_reward": 0.6934999823570251, "step": 13 }, { "completion_length": 10.125, "epoch": 0.046052631578947366, "grad_norm": 0.8116201162338257, "kl": 5.411775588989258, "learning_rate": 9.539473684210526e-07, "loss": 1.0824, "reward": 0.8579875230789185, "reward_std": 0.012692571617662907, "rewards/consultenv_reward": 0.8579875230789185, "step": 14 }, { "completion_length": 6.125, "epoch": 0.049342105263157895, "grad_norm": 20.276941299438477, "kl": 6.70836067199707, "learning_rate": 9.506578947368421e-07, "loss": 1.3417, "reward": 0.7178999781608582, "reward_std": 0.0, "rewards/consultenv_reward": 0.7178999781608582, "step": 15 }, { "completion_length": 12.125, "epoch": 0.05263157894736842, "grad_norm": 1.932254672050476, "kl": 2.8613433837890625, "learning_rate": 9.473684210526315e-07, "loss": 0.5723, "reward": 0.8086625337600708, "reward_std": 0.038154371082782745, "rewards/consultenv_reward": 0.8086625337600708, "step": 16 }, { "completion_length": 24.0, "epoch": 0.05592105263157895, "grad_norm": 0.6563337445259094, "kl": 1.2501894235610962, "learning_rate": 9.44078947368421e-07, "loss": 0.25, "reward": 0.6434625387191772, "reward_std": 0.028080938383936882, "rewards/consultenv_reward": 0.6434625387191772, "step": 17 }, { "completion_length": 5.5, "epoch": 0.05921052631578947, "grad_norm": 19.687971115112305, "kl": 5.321070671081543, "learning_rate": 9.407894736842104e-07, "loss": 1.0642, "reward": 0.8448375463485718, "reward_std": 0.04147179052233696, "rewards/consultenv_reward": 0.8448375463485718, "step": 18 }, { "completion_length": 8.5, "epoch": 0.0625, "grad_norm": 1.8981589078903198, "kl": 5.300406455993652, "learning_rate": 9.374999999999999e-07, "loss": 1.0601, "reward": 0.7219375371932983, "reward_std": 0.16829171776771545, "rewards/consultenv_reward": 0.7219375371932983, "step": 19 }, { "completion_length": 6.0, "epoch": 0.06578947368421052, "grad_norm": 1.0172165632247925, "kl": 6.5475568771362305, "learning_rate": 9.342105263157895e-07, "loss": 1.3095, "reward": 0.739799976348877, "reward_std": 0.0, "rewards/consultenv_reward": 0.739799976348877, "step": 20 }, { "completion_length": 6.0, "epoch": 0.06907894736842106, "grad_norm": 0.05717224255204201, "kl": 4.586100101470947, "learning_rate": 9.30921052631579e-07, "loss": 0.9172, "reward": 0.9162999987602234, "reward_std": 0.0, "rewards/consultenv_reward": 0.9162999987602234, "step": 21 }, { "completion_length": 10.25, "epoch": 0.07236842105263158, "grad_norm": 0.9971261024475098, "kl": 5.274713039398193, "learning_rate": 9.276315789473685e-07, "loss": 1.0549, "reward": 0.7757499814033508, "reward_std": 0.06508515775203705, "rewards/consultenv_reward": 0.7757499814033508, "step": 22 }, { "completion_length": 6.0, "epoch": 0.0756578947368421, "grad_norm": 0.05118527263402939, "kl": 6.092126369476318, "learning_rate": 9.243421052631578e-07, "loss": 1.2184, "reward": 0.9082000255584717, "reward_std": 0.0, "rewards/consultenv_reward": 0.9082000255584717, "step": 23 }, { "completion_length": 7.0, "epoch": 0.07894736842105263, "grad_norm": 0.13762526214122772, "kl": 7.502526760101318, "learning_rate": 9.210526315789473e-07, "loss": 1.5005, "reward": 0.7645000219345093, "reward_std": 0.0, "rewards/consultenv_reward": 0.7645000219345093, "step": 24 }, { "completion_length": 6.0, "epoch": 0.08223684210526316, "grad_norm": 0.030327195301651955, "kl": 6.354429244995117, "learning_rate": 9.177631578947368e-07, "loss": 1.2709, "reward": 0.8080999851226807, "reward_std": 0.0, "rewards/consultenv_reward": 0.8080999851226807, "step": 25 }, { "completion_length": 6.0, "epoch": 0.08552631578947369, "grad_norm": 0.032285451889038086, "kl": 6.679554462432861, "learning_rate": 9.144736842105263e-07, "loss": 1.3359, "reward": 0.9332000017166138, "reward_std": 0.0, "rewards/consultenv_reward": 0.9332000017166138, "step": 26 }, { "completion_length": 6.0, "epoch": 0.08881578947368421, "grad_norm": 0.02108626626431942, "kl": 5.518928527832031, "learning_rate": 9.111842105263157e-07, "loss": 1.1038, "reward": 0.859499990940094, "reward_std": 0.0, "rewards/consultenv_reward": 0.859499990940094, "step": 27 }, { "completion_length": 6.0, "epoch": 0.09210526315789473, "grad_norm": 0.07486175000667572, "kl": 5.777313232421875, "learning_rate": 9.078947368421053e-07, "loss": 1.1555, "reward": 0.6912999749183655, "reward_std": 0.0, "rewards/consultenv_reward": 0.6912999749183655, "step": 28 }, { "completion_length": 6.0, "epoch": 0.09539473684210527, "grad_norm": 0.749207615852356, "kl": 6.067619323730469, "learning_rate": 9.046052631578947e-07, "loss": 1.2135, "reward": 0.8069000244140625, "reward_std": 0.0, "rewards/consultenv_reward": 0.8069000244140625, "step": 29 }, { "completion_length": 6.0, "epoch": 0.09868421052631579, "grad_norm": 0.021433958783745766, "kl": 5.274773120880127, "learning_rate": 9.013157894736842e-07, "loss": 1.055, "reward": 0.7319999933242798, "reward_std": 0.0, "rewards/consultenv_reward": 0.7319999933242798, "step": 30 }, { "completion_length": 10.5, "epoch": 0.10197368421052631, "grad_norm": 1.43083655834198, "kl": 3.3337671756744385, "learning_rate": 8.980263157894736e-07, "loss": 0.6668, "reward": 0.8494499921798706, "reward_std": 0.012020353227853775, "rewards/consultenv_reward": 0.8494499921798706, "step": 31 }, { "completion_length": 6.0, "epoch": 0.10526315789473684, "grad_norm": 0.17918534576892853, "kl": 5.648766040802002, "learning_rate": 8.947368421052631e-07, "loss": 1.1298, "reward": 0.8689000010490417, "reward_std": 0.0, "rewards/consultenv_reward": 0.8689000010490417, "step": 32 }, { "completion_length": 6.0, "epoch": 0.10855263157894737, "grad_norm": 0.03201783820986748, "kl": 6.6795334815979, "learning_rate": 8.914473684210526e-07, "loss": 1.3359, "reward": 0.9332000017166138, "reward_std": 0.0, "rewards/consultenv_reward": 0.9332000017166138, "step": 33 }, { "completion_length": 5.5, "epoch": 0.1118421052631579, "grad_norm": 1.7480360269546509, "kl": 4.997650146484375, "learning_rate": 8.881578947368421e-07, "loss": 0.9995, "reward": 0.7120000123977661, "reward_std": 0.0, "rewards/consultenv_reward": 0.7120000123977661, "step": 34 }, { "completion_length": 6.0, "epoch": 0.11513157894736842, "grad_norm": 0.4796940088272095, "kl": 5.869419097900391, "learning_rate": 8.848684210526314e-07, "loss": 1.1739, "reward": 0.739799976348877, "reward_std": 0.0, "rewards/consultenv_reward": 0.739799976348877, "step": 35 }, { "completion_length": 6.0, "epoch": 0.11842105263157894, "grad_norm": 0.02076355554163456, "kl": 5.624121189117432, "learning_rate": 8.815789473684209e-07, "loss": 1.1248, "reward": 0.8032000064849854, "reward_std": 0.0, "rewards/consultenv_reward": 0.8032000064849854, "step": 36 }, { "completion_length": 6.0, "epoch": 0.12171052631578948, "grad_norm": 0.08061272650957108, "kl": 5.065870761871338, "learning_rate": 8.782894736842105e-07, "loss": 1.0132, "reward": 0.9010999798774719, "reward_std": 0.0, "rewards/consultenv_reward": 0.9010999798774719, "step": 37 }, { "completion_length": 6.0, "epoch": 0.125, "grad_norm": 0.07088731229305267, "kl": 4.584712505340576, "learning_rate": 8.75e-07, "loss": 0.9169, "reward": 0.9162999987602234, "reward_std": 0.0, "rewards/consultenv_reward": 0.9162999987602234, "step": 38 }, { "completion_length": 6.0, "epoch": 0.12828947368421054, "grad_norm": 0.09207562357187271, "kl": 5.086410999298096, "learning_rate": 8.717105263157895e-07, "loss": 1.0173, "reward": 0.7663000226020813, "reward_std": 0.0, "rewards/consultenv_reward": 0.7663000226020813, "step": 39 }, { "completion_length": 6.0, "epoch": 0.13157894736842105, "grad_norm": 0.026300476863980293, "kl": 6.287196636199951, "learning_rate": 8.684210526315789e-07, "loss": 1.2574, "reward": 0.8686000108718872, "reward_std": 0.0, "rewards/consultenv_reward": 0.8686000108718872, "step": 40 }, { "completion_length": 37.375, "epoch": 0.13486842105263158, "grad_norm": 2.399390459060669, "kl": 0.6639226078987122, "learning_rate": 8.651315789473684e-07, "loss": 0.1328, "reward": 0.5, "reward_std": 0.0, "rewards/consultenv_reward": 0.5, "step": 41 }, { "completion_length": 6.0, "epoch": 0.13815789473684212, "grad_norm": 3.302048683166504, "kl": 5.2337236404418945, "learning_rate": 8.618421052631578e-07, "loss": 1.0467, "reward": 0.9254124760627747, "reward_std": 0.01948077790439129, "rewards/consultenv_reward": 0.9254124760627747, "step": 42 }, { "completion_length": 6.0, "epoch": 0.14144736842105263, "grad_norm": 0.022969799116253853, "kl": 5.7877020835876465, "learning_rate": 8.585526315789473e-07, "loss": 1.1575, "reward": 0.864799976348877, "reward_std": 0.0, "rewards/consultenv_reward": 0.864799976348877, "step": 43 }, { "completion_length": 6.0, "epoch": 0.14473684210526316, "grad_norm": 0.04024764150381088, "kl": 6.679026126861572, "learning_rate": 8.552631578947367e-07, "loss": 1.3358, "reward": 0.9332000017166138, "reward_std": 0.0, "rewards/consultenv_reward": 0.9332000017166138, "step": 44 }, { "completion_length": 32.125, "epoch": 0.14802631578947367, "grad_norm": 0.3278043866157532, "kl": 0.7964338064193726, "learning_rate": 8.519736842105263e-07, "loss": 0.1593, "reward": 0.5, "reward_std": 0.0, "rewards/consultenv_reward": 0.5, "step": 45 }, { "completion_length": 6.0, "epoch": 0.1513157894736842, "grad_norm": 0.06639762967824936, "kl": 6.105227470397949, "learning_rate": 8.486842105263158e-07, "loss": 1.221, "reward": 0.9271000027656555, "reward_std": 0.0, "rewards/consultenv_reward": 0.9271000027656555, "step": 46 }, { "completion_length": 7.25, "epoch": 0.15460526315789475, "grad_norm": 0.9748040437698364, "kl": 7.596184253692627, "learning_rate": 8.453947368421053e-07, "loss": 1.5192, "reward": 0.8220875263214111, "reward_std": 0.07509636878967285, "rewards/consultenv_reward": 0.8220875263214111, "step": 47 }, { "completion_length": 6.0, "epoch": 0.15789473684210525, "grad_norm": 0.09168459475040436, "kl": 5.086416721343994, "learning_rate": 8.421052631578947e-07, "loss": 1.0173, "reward": 0.7663000226020813, "reward_std": 0.0, "rewards/consultenv_reward": 0.7663000226020813, "step": 48 }, { "completion_length": 6.0, "epoch": 0.1611842105263158, "grad_norm": 0.029680494219064713, "kl": 5.518282413482666, "learning_rate": 8.388157894736841e-07, "loss": 1.1037, "reward": 0.859499990940094, "reward_std": 0.0, "rewards/consultenv_reward": 0.859499990940094, "step": 49 }, { "completion_length": 6.0, "epoch": 0.16447368421052633, "grad_norm": 0.041072551161050797, "kl": 5.699230670928955, "learning_rate": 8.355263157894736e-07, "loss": 1.1398, "reward": 0.9082000255584717, "reward_std": 0.0, "rewards/consultenv_reward": 0.9082000255584717, "step": 50 }, { "completion_length": 6.0, "epoch": 0.16776315789473684, "grad_norm": 0.036333128809928894, "kl": 6.195315837860107, "learning_rate": 8.322368421052631e-07, "loss": 1.2391, "reward": 0.826200008392334, "reward_std": 0.0, "rewards/consultenv_reward": 0.826200008392334, "step": 51 }, { "completion_length": 6.0, "epoch": 0.17105263157894737, "grad_norm": 0.2923116087913513, "kl": 6.882262706756592, "learning_rate": 8.289473684210527e-07, "loss": 1.3765, "reward": 0.7566999793052673, "reward_std": 0.0, "rewards/consultenv_reward": 0.7566999793052673, "step": 52 }, { "completion_length": 6.0, "epoch": 0.17434210526315788, "grad_norm": 0.09189020842313766, "kl": 5.08641242980957, "learning_rate": 8.256578947368421e-07, "loss": 1.0173, "reward": 0.7663000226020813, "reward_std": 0.0, "rewards/consultenv_reward": 0.7663000226020813, "step": 53 }, { "completion_length": 11.0, "epoch": 0.17763157894736842, "grad_norm": 4.741831302642822, "kl": 4.79311466217041, "learning_rate": 8.223684210526315e-07, "loss": 0.9586, "reward": 0.8593250513076782, "reward_std": 0.011526478454470634, "rewards/consultenv_reward": 0.8593250513076782, "step": 54 }, { "completion_length": 6.0, "epoch": 0.18092105263157895, "grad_norm": 0.03661518543958664, "kl": 6.195318222045898, "learning_rate": 8.19078947368421e-07, "loss": 1.2391, "reward": 0.826200008392334, "reward_std": 0.0, "rewards/consultenv_reward": 0.826200008392334, "step": 55 }, { "completion_length": 9.25, "epoch": 0.18421052631578946, "grad_norm": 1.9594019651412964, "kl": 5.646660327911377, "learning_rate": 8.157894736842105e-07, "loss": 1.1293, "reward": 0.8579875230789185, "reward_std": 0.012692571617662907, "rewards/consultenv_reward": 0.8579875230789185, "step": 56 }, { "completion_length": 6.0, "epoch": 0.1875, "grad_norm": 0.2260618358850479, "kl": 6.500757694244385, "learning_rate": 8.125e-07, "loss": 1.3002, "reward": 0.772599995136261, "reward_std": 0.0, "rewards/consultenv_reward": 0.772599995136261, "step": 57 }, { "completion_length": 6.0, "epoch": 0.19078947368421054, "grad_norm": 0.13923606276512146, "kl": 5.272119522094727, "learning_rate": 8.092105263157894e-07, "loss": 1.0544, "reward": 0.7319999933242798, "reward_std": 0.0, "rewards/consultenv_reward": 0.7319999933242798, "step": 58 }, { "completion_length": 6.0, "epoch": 0.19407894736842105, "grad_norm": 0.04126003012061119, "kl": 5.699241638183594, "learning_rate": 8.059210526315789e-07, "loss": 1.1398, "reward": 0.9082000255584717, "reward_std": 0.0, "rewards/consultenv_reward": 0.9082000255584717, "step": 59 }, { "completion_length": 6.0, "epoch": 0.19736842105263158, "grad_norm": 2.0743489265441895, "kl": 6.393285751342773, "learning_rate": 8.026315789473685e-07, "loss": 1.2787, "reward": 0.8684250116348267, "reward_std": 0.00954596046358347, "rewards/consultenv_reward": 0.8684250116348267, "step": 60 }, { "completion_length": 14.875, "epoch": 0.20065789473684212, "grad_norm": 0.5674435496330261, "kl": 2.438453435897827, "learning_rate": 7.993421052631579e-07, "loss": 0.4877, "reward": 0.8589999675750732, "reward_std": 0.025455815717577934, "rewards/consultenv_reward": 0.8589999675750732, "step": 61 }, { "completion_length": 33.125, "epoch": 0.20394736842105263, "grad_norm": 0.736821174621582, "kl": 0.7446178793907166, "learning_rate": 7.960526315789473e-07, "loss": 0.1489, "reward": 0.5, "reward_std": 0.0, "rewards/consultenv_reward": 0.5, "step": 62 }, { "completion_length": 7.25, "epoch": 0.20723684210526316, "grad_norm": 0.6404052376747131, "kl": 7.158998966217041, "learning_rate": 7.927631578947368e-07, "loss": 1.4318, "reward": 0.7519500255584717, "reward_std": 0.02323809638619423, "rewards/consultenv_reward": 0.7519500255584717, "step": 63 }, { "completion_length": 6.0, "epoch": 0.21052631578947367, "grad_norm": 0.019384942948818207, "kl": 5.556180477142334, "learning_rate": 7.894736842105263e-07, "loss": 1.1112, "reward": 0.6934999823570251, "reward_std": 0.0, "rewards/consultenv_reward": 0.6934999823570251, "step": 64 }, { "completion_length": 10.5, "epoch": 0.2138157894736842, "grad_norm": 0.8739587068557739, "kl": 5.285774230957031, "learning_rate": 7.861842105263158e-07, "loss": 1.0572, "reward": 0.8624750375747681, "reward_std": 0.01661846973001957, "rewards/consultenv_reward": 0.8624750375747681, "step": 65 }, { "completion_length": 9.875, "epoch": 0.21710526315789475, "grad_norm": 1.5130972862243652, "kl": 4.849190711975098, "learning_rate": 7.828947368421052e-07, "loss": 0.9698, "reward": 0.8085625171661377, "reward_std": 0.07747609913349152, "rewards/consultenv_reward": 0.8085625171661377, "step": 66 }, { "completion_length": 6.0, "epoch": 0.22039473684210525, "grad_norm": 0.053318992257118225, "kl": 6.415820121765137, "learning_rate": 7.796052631578946e-07, "loss": 1.2832, "reward": 0.6561999917030334, "reward_std": 0.0, "rewards/consultenv_reward": 0.6561999917030334, "step": 67 }, { "completion_length": 8.625, "epoch": 0.2236842105263158, "grad_norm": 2.2133703231811523, "kl": 5.145012855529785, "learning_rate": 7.763157894736841e-07, "loss": 1.029, "reward": 0.8045499324798584, "reward_std": 0.15015636384487152, "rewards/consultenv_reward": 0.8045499324798584, "step": 68 }, { "completion_length": 6.0, "epoch": 0.22697368421052633, "grad_norm": 0.12775838375091553, "kl": 5.847586154937744, "learning_rate": 7.730263157894737e-07, "loss": 1.1695, "reward": 0.7178999781608582, "reward_std": 0.0, "rewards/consultenv_reward": 0.7178999781608582, "step": 69 }, { "completion_length": 6.0, "epoch": 0.23026315789473684, "grad_norm": 0.2844913601875305, "kl": 5.748251914978027, "learning_rate": 7.697368421052632e-07, "loss": 1.1497, "reward": 0.8032000064849854, "reward_std": 0.0, "rewards/consultenv_reward": 0.8032000064849854, "step": 70 }, { "completion_length": 14.875, "epoch": 0.23355263157894737, "grad_norm": 0.5667825937271118, "kl": 2.4429831504821777, "learning_rate": 7.664473684210526e-07, "loss": 0.4886, "reward": 0.866612434387207, "reward_std": 0.003924437798559666, "rewards/consultenv_reward": 0.866612434387207, "step": 71 }, { "completion_length": 6.0, "epoch": 0.23684210526315788, "grad_norm": 0.00787241943180561, "kl": 5.966519832611084, "learning_rate": 7.631578947368421e-07, "loss": 1.1933, "reward": 0.9082000255584717, "reward_std": 0.0, "rewards/consultenv_reward": 0.9082000255584717, "step": 72 }, { "completion_length": 6.0, "epoch": 0.24013157894736842, "grad_norm": 0.023889625445008278, "kl": 5.555818557739258, "learning_rate": 7.598684210526315e-07, "loss": 1.1112, "reward": 0.6934999823570251, "reward_std": 0.0, "rewards/consultenv_reward": 0.6934999823570251, "step": 73 }, { "completion_length": 6.0, "epoch": 0.24342105263157895, "grad_norm": 0.07475114613771439, "kl": 4.655825614929199, "learning_rate": 7.56578947368421e-07, "loss": 0.9312, "reward": 0.9197999835014343, "reward_std": 0.0, "rewards/consultenv_reward": 0.9197999835014343, "step": 74 }, { "completion_length": 6.0, "epoch": 0.24671052631578946, "grad_norm": 0.18682347238063812, "kl": 4.780447959899902, "learning_rate": 7.532894736842104e-07, "loss": 0.9561, "reward": 0.7120000123977661, "reward_std": 0.0, "rewards/consultenv_reward": 0.7120000123977661, "step": 75 }, { "completion_length": 6.0, "epoch": 0.25, "grad_norm": 0.04034702852368355, "kl": 5.622495174407959, "learning_rate": 7.5e-07, "loss": 1.1245, "reward": 0.8032000064849854, "reward_std": 0.0, "rewards/consultenv_reward": 0.8032000064849854, "step": 76 }, { "completion_length": 9.25, "epoch": 0.2532894736842105, "grad_norm": 1.8869643211364746, "kl": 4.61402702331543, "learning_rate": 7.467105263157895e-07, "loss": 0.9228, "reward": 0.7054375410079956, "reward_std": 0.15556176006793976, "rewards/consultenv_reward": 0.7054375410079956, "step": 77 }, { "completion_length": 10.375, "epoch": 0.2565789473684211, "grad_norm": 1.1384512186050415, "kl": 3.3433761596679688, "learning_rate": 7.43421052631579e-07, "loss": 0.6687, "reward": 0.8480250239372253, "reward_std": 0.01060025580227375, "rewards/consultenv_reward": 0.8480250239372253, "step": 78 }, { "completion_length": 6.0, "epoch": 0.2598684210526316, "grad_norm": 0.04677698761224747, "kl": 5.5537519454956055, "learning_rate": 7.401315789473685e-07, "loss": 1.1108, "reward": 0.8689000010490417, "reward_std": 0.0, "rewards/consultenv_reward": 0.8689000010490417, "step": 79 }, { "completion_length": 11.375, "epoch": 0.2631578947368421, "grad_norm": 1.1391830444335938, "kl": 4.534235000610352, "learning_rate": 7.368421052631578e-07, "loss": 0.9068, "reward": 0.8380250334739685, "reward_std": 0.008841577917337418, "rewards/consultenv_reward": 0.8380250334739685, "step": 80 }, { "completion_length": 33.125, "epoch": 0.26644736842105265, "grad_norm": 0.19980522990226746, "kl": 0.6735504865646362, "learning_rate": 7.335526315789473e-07, "loss": 0.1347, "reward": 0.5, "reward_std": 0.0, "rewards/consultenv_reward": 0.5, "step": 81 }, { "completion_length": 6.0, "epoch": 0.26973684210526316, "grad_norm": 0.03516209125518799, "kl": 6.2434163093566895, "learning_rate": 7.302631578947368e-07, "loss": 1.2487, "reward": 0.8111000061035156, "reward_std": 0.0, "rewards/consultenv_reward": 0.8111000061035156, "step": 82 }, { "completion_length": 7.0, "epoch": 0.2730263157894737, "grad_norm": 0.15605852007865906, "kl": 8.064192771911621, "learning_rate": 7.269736842105263e-07, "loss": 1.6128, "reward": 0.8765000104904175, "reward_std": 0.0, "rewards/consultenv_reward": 0.8765000104904175, "step": 83 }, { "completion_length": 6.0, "epoch": 0.27631578947368424, "grad_norm": 0.42963021993637085, "kl": 5.664529800415039, "learning_rate": 7.236842105263158e-07, "loss": 1.1329, "reward": 0.6912999749183655, "reward_std": 0.0, "rewards/consultenv_reward": 0.6912999749183655, "step": 84 }, { "completion_length": 6.0, "epoch": 0.27960526315789475, "grad_norm": 0.030798109248280525, "kl": 6.369731903076172, "learning_rate": 7.203947368421053e-07, "loss": 1.2739, "reward": 0.6395000219345093, "reward_std": 0.0, "rewards/consultenv_reward": 0.6395000219345093, "step": 85 }, { "completion_length": 10.5, "epoch": 0.28289473684210525, "grad_norm": 1.2990237474441528, "kl": 4.914910793304443, "learning_rate": 7.171052631578947e-07, "loss": 0.983, "reward": 0.8601499795913696, "reward_std": 0.043563880026340485, "rewards/consultenv_reward": 0.8601499795913696, "step": 86 }, { "completion_length": 6.0, "epoch": 0.28618421052631576, "grad_norm": 0.16764293611049652, "kl": 5.955133438110352, "learning_rate": 7.138157894736842e-07, "loss": 1.191, "reward": 0.7178999781608582, "reward_std": 0.0, "rewards/consultenv_reward": 0.7178999781608582, "step": 87 }, { "completion_length": 6.0, "epoch": 0.2894736842105263, "grad_norm": 0.02464032731950283, "kl": 6.151308059692383, "learning_rate": 7.105263157894736e-07, "loss": 1.2303, "reward": 0.9855999946594238, "reward_std": 0.0, "rewards/consultenv_reward": 0.9855999946594238, "step": 88 }, { "completion_length": 6.0, "epoch": 0.29276315789473684, "grad_norm": 0.06536629796028137, "kl": 6.750110149383545, "learning_rate": 7.072368421052631e-07, "loss": 1.35, "reward": 0.9244999885559082, "reward_std": 0.0, "rewards/consultenv_reward": 0.9244999885559082, "step": 89 }, { "completion_length": 7.125, "epoch": 0.29605263157894735, "grad_norm": 0.28588587045669556, "kl": 6.390361785888672, "learning_rate": 7.039473684210526e-07, "loss": 1.2781, "reward": 0.8784749507904053, "reward_std": 0.016192736104130745, "rewards/consultenv_reward": 0.8784749507904053, "step": 90 }, { "completion_length": 6.0, "epoch": 0.2993421052631579, "grad_norm": 0.34626102447509766, "kl": 6.636594295501709, "learning_rate": 7.006578947368421e-07, "loss": 1.3273, "reward": 0.9277999997138977, "reward_std": 0.0, "rewards/consultenv_reward": 0.9277999997138977, "step": 91 }, { "completion_length": 6.0, "epoch": 0.3026315789473684, "grad_norm": 3.3909687995910645, "kl": 4.660943984985352, "learning_rate": 6.973684210526314e-07, "loss": 0.9322, "reward": 0.716487467288971, "reward_std": 0.0353906974196434, "rewards/consultenv_reward": 0.716487467288971, "step": 92 }, { "completion_length": 22.75, "epoch": 0.3059210526315789, "grad_norm": 4.014632225036621, "kl": 1.729530692100525, "learning_rate": 6.94078947368421e-07, "loss": 0.3459, "reward": 0.6597249507904053, "reward_std": 0.0340556725859642, "rewards/consultenv_reward": 0.6597249507904053, "step": 93 }, { "completion_length": 6.0, "epoch": 0.3092105263157895, "grad_norm": 0.14212459325790405, "kl": 6.503857135772705, "learning_rate": 6.907894736842105e-07, "loss": 1.3008, "reward": 0.772599995136261, "reward_std": 0.0, "rewards/consultenv_reward": 0.772599995136261, "step": 94 }, { "completion_length": 23.375, "epoch": 0.3125, "grad_norm": 1.256418228149414, "kl": 1.291813850402832, "learning_rate": 6.875e-07, "loss": 0.2584, "reward": 0.6482499837875366, "reward_std": 0.05817146226763725, "rewards/consultenv_reward": 0.6482499837875366, "step": 95 }, { "completion_length": 10.375, "epoch": 0.3157894736842105, "grad_norm": 1.144538402557373, "kl": 3.1393635272979736, "learning_rate": 6.842105263157895e-07, "loss": 0.6279, "reward": 0.9648749828338623, "reward_std": 0.012481950223445892, "rewards/consultenv_reward": 0.9648749828338623, "step": 96 }, { "completion_length": 14.125, "epoch": 0.3190789473684211, "grad_norm": 1.2048355340957642, "kl": 3.006725311279297, "learning_rate": 6.809210526315789e-07, "loss": 0.6013, "reward": 0.7702249884605408, "reward_std": 0.05154811218380928, "rewards/consultenv_reward": 0.7702249884605408, "step": 97 }, { "completion_length": 37.75, "epoch": 0.3223684210526316, "grad_norm": 0.2545125484466553, "kl": 0.6292104721069336, "learning_rate": 6.776315789473684e-07, "loss": 0.1258, "reward": 0.5, "reward_std": 0.0, "rewards/consultenv_reward": 0.5, "step": 98 }, { "completion_length": 14.0, "epoch": 0.3256578947368421, "grad_norm": 1.3225420713424683, "kl": 2.1595888137817383, "learning_rate": 6.743421052631578e-07, "loss": 0.4319, "reward": 0.8472999930381775, "reward_std": 0.015556366182863712, "rewards/consultenv_reward": 0.8472999930381775, "step": 99 }, { "completion_length": 8.5, "epoch": 0.32894736842105265, "grad_norm": 1.0874344110488892, "kl": 5.099480628967285, "learning_rate": 6.710526315789473e-07, "loss": 1.0199, "reward": 0.7386875152587891, "reward_std": 0.13042065501213074, "rewards/consultenv_reward": 0.7386875152587891, "step": 100 }, { "completion_length": 7.375, "epoch": 0.33223684210526316, "grad_norm": 0.5237722992897034, "kl": 7.550236701965332, "learning_rate": 6.677631578947368e-07, "loss": 1.51, "reward": 0.8378874659538269, "reward_std": 0.07137002795934677, "rewards/consultenv_reward": 0.8378874659538269, "step": 101 }, { "completion_length": 6.0, "epoch": 0.3355263157894737, "grad_norm": 0.14063242077827454, "kl": 5.082447528839111, "learning_rate": 6.644736842105263e-07, "loss": 1.0165, "reward": 0.7663000226020813, "reward_std": 0.0, "rewards/consultenv_reward": 0.7663000226020813, "step": 102 }, { "completion_length": 6.0, "epoch": 0.33881578947368424, "grad_norm": 4.373421669006348, "kl": 6.510066032409668, "learning_rate": 6.611842105263158e-07, "loss": 1.302, "reward": 0.9135124683380127, "reward_std": 0.05568467080593109, "rewards/consultenv_reward": 0.9135124683380127, "step": 103 }, { "completion_length": 6.0, "epoch": 0.34210526315789475, "grad_norm": 3.4771292209625244, "kl": 4.881898403167725, "learning_rate": 6.578947368421053e-07, "loss": 0.9764, "reward": 0.8253124952316284, "reward_std": 0.06841256469488144, "rewards/consultenv_reward": 0.8253124952316284, "step": 104 }, { "completion_length": 6.0, "epoch": 0.34539473684210525, "grad_norm": 0.05689006298780441, "kl": 6.351839065551758, "learning_rate": 6.546052631578946e-07, "loss": 1.2704, "reward": 0.8080999851226807, "reward_std": 0.0, "rewards/consultenv_reward": 0.8080999851226807, "step": 105 }, { "completion_length": 6.0, "epoch": 0.34868421052631576, "grad_norm": 0.11382176727056503, "kl": 5.773299694061279, "learning_rate": 6.513157894736841e-07, "loss": 1.1547, "reward": 0.6912999749183655, "reward_std": 0.0, "rewards/consultenv_reward": 0.6912999749183655, "step": 106 }, { "completion_length": 6.0, "epoch": 0.3519736842105263, "grad_norm": 0.042156364768743515, "kl": 5.517341613769531, "learning_rate": 6.480263157894736e-07, "loss": 1.1035, "reward": 0.859499990940094, "reward_std": 0.0, "rewards/consultenv_reward": 0.859499990940094, "step": 107 }, { "completion_length": 6.0, "epoch": 0.35526315789473684, "grad_norm": 0.2488948255777359, "kl": 6.305784225463867, "learning_rate": 6.447368421052632e-07, "loss": 1.2612, "reward": 0.8036999702453613, "reward_std": 0.0, "rewards/consultenv_reward": 0.8036999702453613, "step": 108 }, { "completion_length": 6.0, "epoch": 0.35855263157894735, "grad_norm": 0.04145219922065735, "kl": 6.1041340827941895, "learning_rate": 6.414473684210527e-07, "loss": 1.2208, "reward": 0.9271000027656555, "reward_std": 0.0, "rewards/consultenv_reward": 0.9271000027656555, "step": 109 }, { "completion_length": 6.0, "epoch": 0.3618421052631579, "grad_norm": 0.20263297855854034, "kl": 6.5490498542785645, "learning_rate": 6.381578947368421e-07, "loss": 1.3098, "reward": 0.6395000219345093, "reward_std": 0.0, "rewards/consultenv_reward": 0.6395000219345093, "step": 110 }, { "completion_length": 6.0, "epoch": 0.3651315789473684, "grad_norm": 0.278015673160553, "kl": 5.725656509399414, "learning_rate": 6.348684210526315e-07, "loss": 1.1451, "reward": 0.7939000129699707, "reward_std": 0.0, "rewards/consultenv_reward": 0.7939000129699707, "step": 111 }, { "completion_length": 34.125, "epoch": 0.3684210526315789, "grad_norm": 0.3532399833202362, "kl": 0.7193299531936646, "learning_rate": 6.31578947368421e-07, "loss": 0.1439, "reward": 0.5, "reward_std": 0.0, "rewards/consultenv_reward": 0.5, "step": 112 }, { "completion_length": 6.0, "epoch": 0.3717105263157895, "grad_norm": 0.33866649866104126, "kl": 6.636590957641602, "learning_rate": 6.282894736842105e-07, "loss": 1.3273, "reward": 0.9277999997138977, "reward_std": 0.0, "rewards/consultenv_reward": 0.9277999997138977, "step": 113 }, { "completion_length": 6.0, "epoch": 0.375, "grad_norm": 0.058332037180662155, "kl": 6.351879119873047, "learning_rate": 6.249999999999999e-07, "loss": 1.2704, "reward": 0.8080999851226807, "reward_std": 0.0, "rewards/consultenv_reward": 0.8080999851226807, "step": 114 }, { "completion_length": 6.0, "epoch": 0.3782894736842105, "grad_norm": 0.10112585872411728, "kl": 6.196841239929199, "learning_rate": 6.217105263157894e-07, "loss": 1.2394, "reward": 0.6668999791145325, "reward_std": 0.0, "rewards/consultenv_reward": 0.6668999791145325, "step": 115 }, { "completion_length": 6.0, "epoch": 0.3815789473684211, "grad_norm": 0.12126638740301132, "kl": 6.085808277130127, "learning_rate": 6.18421052631579e-07, "loss": 1.2172, "reward": 0.9082000255584717, "reward_std": 0.0, "rewards/consultenv_reward": 0.9082000255584717, "step": 116 }, { "completion_length": 9.625, "epoch": 0.3848684210526316, "grad_norm": 0.586071789264679, "kl": 4.974440097808838, "learning_rate": 6.151315789473685e-07, "loss": 0.9949, "reward": 0.862725019454956, "reward_std": 0.04471808671951294, "rewards/consultenv_reward": 0.862725019454956, "step": 117 }, { "completion_length": 6.0, "epoch": 0.3881578947368421, "grad_norm": 0.2230241298675537, "kl": 6.54638671875, "learning_rate": 6.118421052631579e-07, "loss": 1.3093, "reward": 0.6395000219345093, "reward_std": 0.0, "rewards/consultenv_reward": 0.6395000219345093, "step": 118 }, { "completion_length": 11.75, "epoch": 0.39144736842105265, "grad_norm": 3.0335872173309326, "kl": 4.1337432861328125, "learning_rate": 6.085526315789473e-07, "loss": 0.8267, "reward": 0.8119999766349792, "reward_std": 0.0665445625782013, "rewards/consultenv_reward": 0.8119999766349792, "step": 119 }, { "completion_length": 6.0, "epoch": 0.39473684210526316, "grad_norm": 0.44197261333465576, "kl": 6.869905471801758, "learning_rate": 6.052631578947368e-07, "loss": 1.374, "reward": 0.7566999793052673, "reward_std": 0.0, "rewards/consultenv_reward": 0.7566999793052673, "step": 120 }, { "completion_length": 8.125, "epoch": 0.3980263157894737, "grad_norm": 1.1296876668930054, "kl": 5.830264568328857, "learning_rate": 6.019736842105263e-07, "loss": 1.1661, "reward": 0.8140624761581421, "reward_std": 0.11861716955900192, "rewards/consultenv_reward": 0.8140624761581421, "step": 121 }, { "completion_length": 8.625, "epoch": 0.40131578947368424, "grad_norm": 1.6957347393035889, "kl": 4.981289863586426, "learning_rate": 5.986842105263158e-07, "loss": 0.9963, "reward": 0.7094249725341797, "reward_std": 0.15443630516529083, "rewards/consultenv_reward": 0.7094249725341797, "step": 122 }, { "completion_length": 6.0, "epoch": 0.40460526315789475, "grad_norm": 0.13093456625938416, "kl": 5.146120071411133, "learning_rate": 5.953947368421052e-07, "loss": 1.0292, "reward": 0.7419999837875366, "reward_std": 0.0, "rewards/consultenv_reward": 0.7419999837875366, "step": 123 }, { "completion_length": 9.875, "epoch": 0.40789473684210525, "grad_norm": 1.79840886592865, "kl": 4.918962478637695, "learning_rate": 5.921052631578946e-07, "loss": 0.9838, "reward": 0.8630000352859497, "reward_std": 0.012514001689851284, "rewards/consultenv_reward": 0.8630000352859497, "step": 124 }, { "completion_length": 10.5, "epoch": 0.41118421052631576, "grad_norm": 1.3767091035842896, "kl": 3.2740769386291504, "learning_rate": 5.888157894736842e-07, "loss": 0.6548, "reward": 0.8647249937057495, "reward_std": 0.009815982542932034, "rewards/consultenv_reward": 0.8647249937057495, "step": 125 }, { "completion_length": 6.0, "epoch": 0.4144736842105263, "grad_norm": 0.060881104320287704, "kl": 4.89548397064209, "learning_rate": 5.855263157894737e-07, "loss": 0.9791, "reward": 0.7120000123977661, "reward_std": 0.0, "rewards/consultenv_reward": 0.7120000123977661, "step": 126 }, { "completion_length": 6.0, "epoch": 0.41776315789473684, "grad_norm": 0.045680202543735504, "kl": 5.553736686706543, "learning_rate": 5.822368421052632e-07, "loss": 1.1107, "reward": 0.8689000010490417, "reward_std": 0.0, "rewards/consultenv_reward": 0.8689000010490417, "step": 127 }, { "completion_length": 6.0, "epoch": 0.42105263157894735, "grad_norm": 0.06434284895658493, "kl": 6.284034729003906, "learning_rate": 5.789473684210526e-07, "loss": 1.2568, "reward": 0.8686000108718872, "reward_std": 0.0, "rewards/consultenv_reward": 0.8686000108718872, "step": 128 }, { "completion_length": 14.5, "epoch": 0.4243421052631579, "grad_norm": 56.9053955078125, "kl": 7.230620384216309, "learning_rate": 5.756578947368421e-07, "loss": 1.4461, "reward": 0.8381749987602234, "reward_std": 0.03326690196990967, "rewards/consultenv_reward": 0.8381749987602234, "step": 129 }, { "completion_length": 13.0, "epoch": 0.4276315789473684, "grad_norm": 1.3861262798309326, "kl": 2.501538038253784, "learning_rate": 5.723684210526315e-07, "loss": 0.5003, "reward": 0.8357375264167786, "reward_std": 0.03201190382242203, "rewards/consultenv_reward": 0.8357375264167786, "step": 130 }, { "completion_length": 6.0, "epoch": 0.4309210526315789, "grad_norm": 0.23955222964286804, "kl": 6.153580665588379, "learning_rate": 5.69078947368421e-07, "loss": 1.2307, "reward": 0.8188999891281128, "reward_std": 0.0, "rewards/consultenv_reward": 0.8188999891281128, "step": 131 }, { "completion_length": 7.25, "epoch": 0.4342105263157895, "grad_norm": 0.09075440466403961, "kl": 6.361033916473389, "learning_rate": 5.657894736842104e-07, "loss": 1.2722, "reward": 0.8727499842643738, "reward_std": 0.02120126783847809, "rewards/consultenv_reward": 0.8727499842643738, "step": 132 }, { "completion_length": 29.25, "epoch": 0.4375, "grad_norm": 0.7662698030471802, "kl": 0.9236090779304504, "learning_rate": 5.625e-07, "loss": 0.1847, "reward": 0.5, "reward_std": 0.0, "rewards/consultenv_reward": 0.5, "step": 133 }, { "completion_length": 26.0, "epoch": 0.4407894736842105, "grad_norm": 0.2385452538728714, "kl": 0.8105572462081909, "learning_rate": 5.592105263157895e-07, "loss": 0.1621, "reward": 0.5, "reward_std": 0.0, "rewards/consultenv_reward": 0.5, "step": 134 }, { "completion_length": 6.0, "epoch": 0.4440789473684211, "grad_norm": 0.32297372817993164, "kl": 4.999083518981934, "learning_rate": 5.55921052631579e-07, "loss": 0.9998, "reward": 0.7120000123977661, "reward_std": 0.0, "rewards/consultenv_reward": 0.7120000123977661, "step": 135 }, { "completion_length": 6.0, "epoch": 0.4473684210526316, "grad_norm": 0.2102341651916504, "kl": 5.738028526306152, "learning_rate": 5.526315789473684e-07, "loss": 1.1476, "reward": 0.8032000064849854, "reward_std": 0.0, "rewards/consultenv_reward": 0.8032000064849854, "step": 136 }, { "completion_length": 6.0, "epoch": 0.4506578947368421, "grad_norm": 0.06473302841186523, "kl": 5.620542049407959, "learning_rate": 5.493421052631578e-07, "loss": 1.1241, "reward": 0.8032000064849854, "reward_std": 0.0, "rewards/consultenv_reward": 0.8032000064849854, "step": 137 }, { "completion_length": 14.0, "epoch": 0.45394736842105265, "grad_norm": 0.012383934110403061, "kl": 2.1373372077941895, "learning_rate": 5.460526315789473e-07, "loss": 0.4275, "reward": 0.8385999798774719, "reward_std": 0.0, "rewards/consultenv_reward": 0.8385999798774719, "step": 138 }, { "completion_length": 11.25, "epoch": 0.45723684210526316, "grad_norm": 2.2385120391845703, "kl": 4.439551830291748, "learning_rate": 5.427631578947368e-07, "loss": 0.8879, "reward": 0.8598875403404236, "reward_std": 0.011286596767604351, "rewards/consultenv_reward": 0.8598875403404236, "step": 139 }, { "completion_length": 6.0, "epoch": 0.4605263157894737, "grad_norm": 0.06162038445472717, "kl": 6.4111456871032715, "learning_rate": 5.394736842105264e-07, "loss": 1.2822, "reward": 0.8718000054359436, "reward_std": 0.0, "rewards/consultenv_reward": 0.8718000054359436, "step": 140 }, { "completion_length": 7.375, "epoch": 0.46381578947368424, "grad_norm": 0.759215235710144, "kl": 7.472259521484375, "learning_rate": 5.361842105263158e-07, "loss": 1.4945, "reward": 0.8039500117301941, "reward_std": 0.0775592103600502, "rewards/consultenv_reward": 0.8039500117301941, "step": 141 }, { "completion_length": 9.75, "epoch": 0.46710526315789475, "grad_norm": 1.7282990217208862, "kl": 5.232257843017578, "learning_rate": 5.328947368421053e-07, "loss": 1.0465, "reward": 0.7743874788284302, "reward_std": 0.06623844802379608, "rewards/consultenv_reward": 0.7743874788284302, "step": 142 }, { "completion_length": 14.75, "epoch": 0.47039473684210525, "grad_norm": 0.9253103733062744, "kl": 2.687042236328125, "learning_rate": 5.296052631578947e-07, "loss": 0.5374, "reward": 0.7884500026702881, "reward_std": 0.06749232113361359, "rewards/consultenv_reward": 0.7884500026702881, "step": 143 }, { "completion_length": 13.125, "epoch": 0.47368421052631576, "grad_norm": 1.721250057220459, "kl": 2.4987120628356934, "learning_rate": 5.263157894736842e-07, "loss": 0.4997, "reward": 0.8135999441146851, "reward_std": 0.0707106739282608, "rewards/consultenv_reward": 0.8135999441146851, "step": 144 }, { "completion_length": 22.375, "epoch": 0.4769736842105263, "grad_norm": 1.2682180404663086, "kl": 1.364314079284668, "learning_rate": 5.230263157894736e-07, "loss": 0.2729, "reward": 0.6870999932289124, "reward_std": 0.033284250646829605, "rewards/consultenv_reward": 0.6870999932289124, "step": 145 }, { "completion_length": 37.5, "epoch": 0.48026315789473684, "grad_norm": 0.20368559658527374, "kl": 0.6978104710578918, "learning_rate": 5.197368421052631e-07, "loss": 0.1396, "reward": 0.5, "reward_std": 0.0, "rewards/consultenv_reward": 0.5, "step": 146 }, { "completion_length": 10.5, "epoch": 0.48355263157894735, "grad_norm": 11.25495433807373, "kl": 4.274662017822266, "learning_rate": 5.164473684210526e-07, "loss": 0.8549, "reward": 0.7791624665260315, "reward_std": 0.15686915814876556, "rewards/consultenv_reward": 0.7791624665260315, "step": 147 }, { "completion_length": 6.0, "epoch": 0.4868421052631579, "grad_norm": 0.06683629751205444, "kl": 5.8632707595825195, "learning_rate": 5.131578947368422e-07, "loss": 1.1727, "reward": 0.7301999926567078, "reward_std": 0.0, "rewards/consultenv_reward": 0.7301999926567078, "step": 148 }, { "completion_length": 6.0, "epoch": 0.4901315789473684, "grad_norm": 0.03876814991235733, "kl": 6.150131702423096, "learning_rate": 5.098684210526315e-07, "loss": 1.23, "reward": 0.9855999946594238, "reward_std": 0.0, "rewards/consultenv_reward": 0.9855999946594238, "step": 149 }, { "completion_length": 22.875, "epoch": 0.4934210526315789, "grad_norm": 0.9633879661560059, "kl": 1.2498282194137573, "learning_rate": 5.06578947368421e-07, "loss": 0.25, "reward": 0.6793124675750732, "reward_std": 0.03390160948038101, "rewards/consultenv_reward": 0.6793124675750732, "step": 150 } ], "logging_steps": 1, "max_steps": 304, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }