diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -2,511 +2,8238 @@ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, - "epoch": 3.0, + "epoch": 1.0, "eval_steps": 500, - "global_step": 465, + "global_step": 1371, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "entropy": 1.0450326681137085, - "epoch": 0.06482982171799027, - "grad_norm": 84.5, - "learning_rate": 3.7500000000000005e-06, - "loss": 3.4523143768310547, - "mean_token_accuracy": 0.5389400094747543, - "num_tokens": 24017.0, - "step": 10 + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.95, + "completions/max_length": 256.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 245.7, + "completions/mean_terminated_length": 24.4, + "completions/min_length": 114.2, + "completions/min_terminated_length": 11.8, + "entropy": 0.547074381262064, + "epoch": 0.0036469730123997084, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.0224609375, + "kl": 0.0009595323979738168, + "learning_rate": 2.5e-06, + "loss": 0.10703855752944946, + "num_tokens": 32128.0, + "reward": 0.04250000091269612, + "reward_std": 0.1351443100720644, + "rewards/reward_correct/mean": 0.0375, + "rewards/reward_correct/std": 0.11831300854682922, + "rewards/reward_format/mean": 0.005000000074505806, + "rewards/reward_format/std": 0.016831301152706146, + "step": 5, + "step_time": 10.844848310761154 }, { - "entropy": 1.1437347918748855, - "epoch": 0.12965964343598055, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9, + "completions/max_length": 256.0, + "completions/max_terminated_length": 31.4, + "completions/mean_length": 234.7125, + "completions/mean_terminated_length": 13.457143402099609, + "completions/min_length": 160.6, + "completions/min_terminated_length": 7.0, + "entropy": 0.5721548050642014, + "epoch": 0.007293946024799417, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.07080078125, + "kl": 0.004356882933643646, + "learning_rate": 4.9999933592389375e-06, + "loss": 0.10316625833511353, + "num_tokens": 70633.0, + "reward": 0.03500000359490514, + "reward_std": 0.07829165942966938, + "rewards/reward_correct/mean": 0.025, + "rewards/reward_correct/std": 0.06831300854682923, + "rewards/reward_format/mean": 0.0100000006146729, + "rewards/reward_format/std": 0.015246951207518577, + "step": 10, + "step_time": 11.166164619848132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.575, + "completions/max_length": 256.0, + "completions/max_terminated_length": 157.6, + "completions/mean_length": 179.6875, + "completions/mean_terminated_length": 66.38484954833984, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "entropy": 0.9433698575943709, + "epoch": 0.010940919037199124, + "frac_reward_zero_std": 0.2, + "grad_norm": 5.125, + "kl": 0.03965476304874756, + "learning_rate": 4.999760936306081e-06, + "loss": 0.21741697788238526, + "num_tokens": 100712.0, + "reward": 0.3325000119395554, + "reward_std": 0.42388612516224383, + "rewards/reward_correct/mean": 0.2875, + "rewards/reward_correct/std": 0.39012446999549866, + "rewards/reward_format/mean": 0.04500000076368451, + "rewards/reward_format/std": 0.04555814005434513, + "step": 15, + "step_time": 10.90111221112311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 256.0, + "completions/max_terminated_length": 210.0, + "completions/mean_length": 123.425, + "completions/mean_terminated_length": 65.1582275390625, + "completions/min_length": 13.6, + "completions/min_terminated_length": 13.6, + "entropy": 0.9993088260293007, + "epoch": 0.014587892049598834, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.53125, + "kl": 0.15501022015232593, + "learning_rate": 4.99919651059899e-06, + "loss": 0.3644798755645752, + "num_tokens": 123234.0, + "reward": 0.2975000262260437, + "reward_std": 0.4273386001586914, + "rewards/reward_correct/mean": 0.2375, + "rewards/reward_correct/std": 0.40396743416786196, + "rewards/reward_format/mean": 0.06000000275671482, + "rewards/reward_format/std": 0.04366411790251732, + "step": 20, + "step_time": 10.735633163712919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1375, + "completions/max_length": 256.0, + "completions/max_terminated_length": 149.8, + "completions/mean_length": 69.175, + "completions/mean_terminated_length": 39.53047866821289, + "completions/min_length": 11.4, + "completions/min_terminated_length": 11.4, + "entropy": 0.8098364531993866, + "epoch": 0.01823486506199854, + "frac_reward_zero_std": 0.0, + "grad_norm": 10.125, + "kl": 0.32382691686507314, + "learning_rate": 4.998300157081193e-06, + "loss": 0.5424751758575439, + "num_tokens": 141800.0, + "reward": 0.5500000238418579, + "reward_std": 0.5090911328792572, + "rewards/reward_correct/mean": 0.4625, + "rewards/reward_correct/std": 0.4957427144050598, + "rewards/reward_format/mean": 0.08750000447034836, + "rewards/reward_format/std": 0.03355616144835949, + "step": 25, + "step_time": 10.772075586020947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2875, + "completions/max_length": 256.0, + "completions/max_terminated_length": 199.6, + "completions/mean_length": 111.3625, + "completions/mean_terminated_length": 53.75607032775879, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "entropy": 1.1369147203862666, + "epoch": 0.02188183807439825, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.875, + "kl": 0.19015779324108734, + "learning_rate": 4.997071994800816e-06, + "loss": 0.3855216264724731, + "num_tokens": 161141.0, + "reward": 0.3812500059604645, + "reward_std": 0.4697671353816986, + "rewards/reward_correct/mean": 0.3125, + "rewards/reward_correct/std": 0.44714781641960144, + "rewards/reward_format/mean": 0.06875000149011612, + "rewards/reward_format/std": 0.04682775363326073, + "step": 30, + "step_time": 10.76372629776597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2625, + "completions/max_length": 256.0, + "completions/max_terminated_length": 162.6, + "completions/mean_length": 101.325, + "completions/mean_terminated_length": 52.44614372253418, + "completions/min_length": 12.6, + "completions/min_terminated_length": 12.6, + "entropy": 1.0375047504901886, + "epoch": 0.025528811086797956, + "frac_reward_zero_std": 0.1, + "grad_norm": 12.9375, + "kl": 0.24681583042256533, + "learning_rate": 4.9955121868747705e-06, + "loss": 0.46199893951416016, + "num_tokens": 181375.0, + "reward": 0.4225000113248825, + "reward_std": 0.4219440996646881, + "rewards/reward_correct/mean": 0.35, + "rewards/reward_correct/std": 0.4024340331554413, + "rewards/reward_format/mean": 0.07250000238418579, + "rewards/reward_format/std": 0.04182775355875492, + "step": 35, + "step_time": 10.80482179876417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1375, + "completions/max_length": 256.0, + "completions/max_terminated_length": 186.6, + "completions/mean_length": 76.2125, + "completions/mean_terminated_length": 47.18564300537109, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "entropy": 0.9317402366548777, + "epoch": 0.029175784099197667, + "frac_reward_zero_std": 0.1, + "grad_norm": 9.9375, + "kl": 0.22398740155622363, + "learning_rate": 4.99362094046709e-06, + "loss": 0.35251405239105227, + "num_tokens": 199200.0, + "reward": 0.42625001072883606, + "reward_std": 0.3446300495415926, + "rewards/reward_correct/mean": 0.3375, + "rewards/reward_correct/std": 0.3296448469161987, + "rewards/reward_format/mean": 0.08875000327825547, + "rewards/reward_format/std": 0.03172486051917076, + "step": 40, + "step_time": 10.718003647401929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 209.2, + "completions/max_terminated_length": 113.6, + "completions/mean_length": 80.975, + "completions/mean_terminated_length": 42.155350875854495, + "completions/min_length": 12.2, + "completions/min_terminated_length": 12.2, + "entropy": 0.7341153841465712, + "epoch": 0.03282275711159737, + "frac_reward_zero_std": 0.0, + "grad_norm": 28.125, + "kl": 0.33811302059330045, + "learning_rate": 4.991398506761417e-06, + "loss": 0.43915524482727053, + "num_tokens": 218142.0, + "reward": 0.4575000196695328, + "reward_std": 0.4790138781070709, + "rewards/reward_correct/mean": 0.375, + "rewards/reward_correct/std": 0.46597431898117064, + "rewards/reward_format/mean": 0.08250000178813935, + "rewards/reward_format/std": 0.03246281631290913, + "step": 45, + "step_time": 9.152494781091809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 256.0, + "completions/max_terminated_length": 163.8, + "completions/mean_length": 80.0125, + "completions/mean_terminated_length": 41.76559352874756, + "completions/min_length": 12.4, + "completions/min_terminated_length": 12.4, + "entropy": 0.821015353128314, + "epoch": 0.03646973012399708, + "frac_reward_zero_std": 0.0, + "grad_norm": 12.875, + "kl": 0.32567108115181326, + "learning_rate": 4.988845180927638e-06, + "loss": 0.4739445686340332, + "num_tokens": 237559.0, + "reward": 0.37750002443790437, + "reward_std": 0.4639891743659973, + "rewards/reward_correct/mean": 0.3, + "rewards/reward_correct/std": 0.4496173322200775, + "rewards/reward_format/mean": 0.07750000432133675, + "rewards/reward_format/std": 0.039766131341457366, + "step": 50, + "step_time": 10.78912747707218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1, + "completions/max_length": 177.8, + "completions/max_terminated_length": 115.8, + "completions/mean_length": 47.4875, + "completions/mean_terminated_length": 25.25436019897461, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "entropy": 0.6518673650920391, + "epoch": 0.04011670313639679, + "frac_reward_zero_std": 0.1, + "grad_norm": 17.375, + "kl": 0.3619930279441178, + "learning_rate": 4.9859613020826855e-06, + "loss": 0.41170291900634765, + "num_tokens": 252854.0, + "reward": 0.5262500196695328, + "reward_std": 0.46595550775527955, + "rewards/reward_correct/mean": 0.4375, + "rewards/reward_correct/std": 0.4532795548439026, + "rewards/reward_format/mean": 0.08875000178813934, + "rewards/reward_format/std": 0.02700653076171875, + "step": 55, + "step_time": 8.004997123591602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05, + "completions/max_length": 181.0, + "completions/max_terminated_length": 142.6, + "completions/mean_length": 44.325, + "completions/mean_terminated_length": 33.7089111328125, + "completions/min_length": 12.4, + "completions/min_terminated_length": 12.4, + "entropy": 0.5414984721690417, + "epoch": 0.0437636761487965, + "frac_reward_zero_std": 0.2, + "grad_norm": 9.6875, + "kl": 0.3424894532188773, + "learning_rate": 4.982747253245498e-06, + "loss": 0.33097453117370607, + "num_tokens": 271568.0, + "reward": 0.6537500202655793, + "reward_std": 0.40773797035217285, + "rewards/reward_correct/mean": 0.5625, + "rewards/reward_correct/std": 0.39847134947776797, + "rewards/reward_format/mean": 0.09125000089406968, + "rewards/reward_format/std": 0.023944272473454476, + "step": 60, + "step_time": 8.301244870945812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.225, + "completions/max_length": 161.4, + "completions/max_terminated_length": 77.2, + "completions/mean_length": 75.475, + "completions/mean_terminated_length": 25.114743804931642, + "completions/min_length": 12.6, + "completions/min_terminated_length": 12.6, + "entropy": 0.754500750079751, + "epoch": 0.04741064916119621, + "frac_reward_zero_std": 0.2, + "grad_norm": 10.0625, + "kl": 0.34600050022127105, + "learning_rate": 4.979203461286147e-06, + "loss": 0.3402897596359253, + "num_tokens": 290166.0, + "reward": 0.5037500202655792, + "reward_std": 0.5245579957962037, + "rewards/reward_correct/mean": 0.425, + "rewards/reward_correct/std": 0.5031512618064881, + "rewards/reward_format/mean": 0.0787500023841858, + "rewards/reward_format/std": 0.027883481234312057, + "step": 65, + "step_time": 7.526004654355347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 218.6, + "completions/max_terminated_length": 155.2, + "completions/mean_length": 30.9375, + "completions/mean_terminated_length": 25.00916690826416, + "completions/min_length": 11.2, + "completions/min_terminated_length": 11.2, + "entropy": 0.5036110047250986, + "epoch": 0.05105762217359591, + "frac_reward_zero_std": 0.4, + "grad_norm": 25.5, + "kl": 0.6928889500908554, + "learning_rate": 4.975330396869143e-06, + "loss": 0.37158665657043455, + "num_tokens": 302801.0, + "reward": 0.5850000143051147, + "reward_std": 0.3965589225292206, + "rewards/reward_correct/mean": 0.4875, + "rewards/reward_correct/std": 0.3939549446105957, + "rewards/reward_format/mean": 0.09750000089406967, + "rewards/reward_format/std": 0.010000000149011612, + "step": 70, + "step_time": 9.44622635692358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 164.6, + "completions/max_terminated_length": 116.0, + "completions/mean_length": 27.0375, + "completions/mean_terminated_length": 23.991666793823242, + "completions/min_length": 10.6, + "completions/min_terminated_length": 10.6, + "entropy": 0.5376770570874214, + "epoch": 0.05470459518599562, + "frac_reward_zero_std": 0.2, + "grad_norm": 34.5, + "kl": 0.5705962640233337, + "learning_rate": 4.9711285743909305e-06, + "loss": 0.41422367095947266, + "num_tokens": 317308.0, + "reward": 0.6487500190734863, + "reward_std": 0.378141188621521, + "rewards/reward_correct/mean": 0.55, + "rewards/reward_correct/std": 0.37636529207229613, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 75, + "step_time": 7.6598183123394845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0875, + "completions/max_length": 187.8, + "completions/max_terminated_length": 90.4, + "completions/mean_length": 41.6625, + "completions/mean_terminated_length": 21.87238140106201, + "completions/min_length": 11.2, + "completions/min_terminated_length": 11.2, + "entropy": 0.6237934932112694, + "epoch": 0.058351568198395334, + "frac_reward_zero_std": 0.1, + "grad_norm": 13.5625, + "kl": 0.4231829413212836, + "learning_rate": 4.96659855191156e-06, + "loss": 0.5371630668640137, + "num_tokens": 333657.0, + "reward": 0.4287500113248825, + "reward_std": 0.35254624411463736, + "rewards/reward_correct/mean": 0.3375, + "rewards/reward_correct/std": 0.3414854288101196, + "rewards/reward_format/mean": 0.09125000238418579, + "rewards/reward_format/std": 0.020775573328137398, + "step": 80, + "step_time": 8.462781658768654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1, + "completions/max_length": 176.4, + "completions/max_terminated_length": 69.4, + "completions/mean_length": 42.7875, + "completions/mean_terminated_length": 18.85842056274414, + "completions/min_length": 10.4, + "completions/min_terminated_length": 10.4, + "entropy": 0.4891991775482893, + "epoch": 0.06199854121079504, + "frac_reward_zero_std": 0.2, + "grad_norm": 15.3125, + "kl": 0.4579021529236343, + "learning_rate": 4.961740931080577e-06, + "loss": 0.5135890483856201, + "num_tokens": 349976.0, + "reward": 0.5525000274181366, + "reward_std": 0.4548910856246948, + "rewards/reward_correct/mean": 0.4625, + "rewards/reward_correct/std": 0.44643059372901917, + "rewards/reward_format/mean": 0.09000000208616257, + "rewards/reward_format/std": 0.021405572816729546, + "step": 85, + "step_time": 8.072975029051303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0875, + "completions/max_length": 244.0, + "completions/max_terminated_length": 169.8, + "completions/mean_length": 46.0875, + "completions/mean_terminated_length": 26.288142013549805, + "completions/min_length": 9.8, + "completions/min_terminated_length": 9.8, + "entropy": 0.641087163425982, + "epoch": 0.06564551422319474, + "frac_reward_zero_std": 0.3, + "grad_norm": 23.375, + "kl": 0.5415010590106248, + "learning_rate": 4.956556357057114e-06, + "loss": 0.47292590141296387, + "num_tokens": 366503.0, + "reward": 0.6150000095367432, + "reward_std": 0.38518666513264177, + "rewards/reward_correct/mean": 0.525, + "rewards/reward_correct/std": 0.3665252387523651, + "rewards/reward_format/mean": 0.09000000208616257, + "rewards/reward_format/std": 0.029893559589982032, + "step": 90, + "step_time": 10.2234088473022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 242.8, + "completions/max_terminated_length": 109.6, + "completions/mean_length": 73.7625, + "completions/mean_terminated_length": 30.631349945068358, + "completions/min_length": 9.4, + "completions/min_terminated_length": 9.4, + "entropy": 0.5248838514089584, + "epoch": 0.06929248723559446, + "frac_reward_zero_std": 0.2, + "grad_norm": 15.0625, + "kl": 0.3561408089939505, + "learning_rate": 4.951045518424198e-06, + "loss": 0.4540065288543701, + "num_tokens": 383004.0, + "reward": 0.4162500023841858, + "reward_std": 0.40364792943000793, + "rewards/reward_correct/mean": 0.3375, + "rewards/reward_correct/std": 0.3884649932384491, + "rewards/reward_format/mean": 0.0787500023841858, + "rewards/reward_format/std": 0.03471478223800659, + "step": 95, + "step_time": 10.334046882390975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.075, + "completions/max_length": 168.0, + "completions/max_terminated_length": 72.8, + "completions/mean_length": 39.375, + "completions/mean_terminated_length": 21.81931381225586, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.5239920597523451, + "epoch": 0.07293946024799416, + "frac_reward_zero_std": 0.2, + "grad_norm": 15.375, + "kl": 0.44049375895410775, + "learning_rate": 4.945209147097306e-06, + "loss": 0.5667720794677734, + "num_tokens": 397410.0, + "reward": 0.5300000131130218, + "reward_std": 0.4832838773727417, + "rewards/reward_correct/mean": 0.4375, + "rewards/reward_correct/std": 0.47500433325767516, + "rewards/reward_format/mean": 0.0925000011920929, + "rewards/reward_format/std": 0.01989355869591236, + "step": 100, + "step_time": 7.740872152335942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15, + "completions/max_length": 215.2, + "completions/max_terminated_length": 110.8, + "completions/mean_length": 61.2125, + "completions/mean_terminated_length": 28.011537551879883, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.7239632446318864, + "epoch": 0.07658643326039387, + "frac_reward_zero_std": 0.0, + "grad_norm": 24.375, + "kl": 0.5406704201828688, + "learning_rate": 4.939048018227151e-06, + "loss": 0.44902925491333007, + "num_tokens": 418747.0, + "reward": 0.3850000262260437, + "reward_std": 0.43974891901016233, + "rewards/reward_correct/mean": 0.3, + "rewards/reward_correct/std": 0.42883480787277223, + "rewards/reward_format/mean": 0.08500000238418579, + "rewards/reward_format/std": 0.030349845066666603, + "step": 105, + "step_time": 9.38893738053739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 78.6, + "completions/max_terminated_length": 53.6, + "completions/mean_length": 29.9875, + "completions/mean_terminated_length": 15.459091186523438, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.19507885100319983, + "epoch": 0.08023340627279359, + "frac_reward_zero_std": 0.2, + "grad_norm": 4.78125, + "kl": 1.03940935311839, + "learning_rate": 4.9325629500967325e-06, + "loss": 0.15453168153762817, + "num_tokens": 435234.0, + "reward": 0.46875002086162565, + "reward_std": 0.43081329464912416, + "rewards/reward_correct/mean": 0.375, + "rewards/reward_correct/std": 0.42777039408683776, + "rewards/reward_format/mean": 0.09375000149011611, + "rewards/reward_format/std": 0.009574271738529205, + "step": 110, + "step_time": 4.701937833800912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.6, + "completions/max_terminated_length": 34.6, + "completions/mean_length": 12.475, + "completions/mean_terminated_length": 12.475, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.12682482046075166, + "epoch": 0.08388037928519329, + "frac_reward_zero_std": 0.4, + "grad_norm": 14.1875, + "kl": 1.0640871480107308, + "learning_rate": 4.925754804012657e-06, + "loss": 0.06728461980819703, + "num_tokens": 449832.0, + "reward": 0.45000002086162566, + "reward_std": 0.45396127700805666, + "rewards/reward_correct/mean": 0.35, + "rewards/reward_correct/std": 0.45396130084991454, + "rewards/reward_format/mean": 0.10000000149011612, + "rewards/reward_format/std": 0.0, + "step": 115, + "step_time": 3.1354188466444612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 60.6, + "completions/max_terminated_length": 23.6, + "completions/mean_length": 14.85, + "completions/mean_terminated_length": 11.837500190734863, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.15330878184176983, + "epoch": 0.087527352297593, + "frac_reward_zero_std": 0.3, + "grad_norm": 10.25, + "kl": 1.144705768674612, + "learning_rate": 4.918624484190746e-06, + "loss": 0.12007564306259155, + "num_tokens": 461972.0, + "reward": 0.5737500190734863, + "reward_std": 0.4788735508918762, + "rewards/reward_correct/mean": 0.475, + "rewards/reward_correct/std": 0.47709767818450927, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 120, + "step_time": 4.030961454100907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 60.6, + "completions/max_terminated_length": 32.4, + "completions/mean_length": 15.3, + "completions/mean_terminated_length": 12.324166870117187, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "entropy": 0.21546696010045707, + "epoch": 0.09117432530999271, + "frac_reward_zero_std": 0.2, + "grad_norm": 21.5, + "kl": 1.035651722829789, + "learning_rate": 4.911172937635942e-06, + "loss": 0.2419037103652954, + "num_tokens": 475260.0, + "reward": 0.6962500154972077, + "reward_std": 0.41330612897872926, + "rewards/reward_correct/mean": 0.6, + "rewards/reward_correct/std": 0.41022524833679197, + "rewards/reward_format/mean": 0.09625000208616256, + "rewards/reward_format/std": 0.011831301078200341, + "step": 125, + "step_time": 3.9624019052833317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 60.0, + "completions/max_terminated_length": 11.2, + "completions/mean_length": 14.1, + "completions/mean_terminated_length": 11.04000015258789, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.09151147305965424, + "epoch": 0.09482129832239242, + "frac_reward_zero_std": 0.5, + "grad_norm": 10.6875, + "kl": 1.1447502395138145, + "learning_rate": 4.903401154016534e-06, + "loss": 0.19530326128005981, + "num_tokens": 498396.0, + "reward": 0.8487500190734864, + "reward_std": 0.3700888991355896, + "rewards/reward_correct/mean": 0.75, + "rewards/reward_correct/std": 0.36831300854682925, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 130, + "step_time": 4.194347287155688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.4, + "completions/max_terminated_length": 11.4, + "completions/mean_length": 11.0125, + "completions/mean_terminated_length": 11.0125, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.07491862165043131, + "epoch": 0.09846827133479212, + "frac_reward_zero_std": 0.6, + "grad_norm": 9.4375, + "kl": 1.0447093114256858, + "learning_rate": 4.895310165532715e-06, + "loss": 0.04008364379405975, + "num_tokens": 510901.0, + "reward": 0.7000000238418579, + "reward_std": 0.45047799348831175, + "rewards/reward_correct/mean": 0.6, + "rewards/reward_correct/std": 0.4504780113697052, + "rewards/reward_format/mean": 0.10000000149011612, + "rewards/reward_format/std": 0.0, + "step": 135, + "step_time": 2.3076109690591693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.2, + "completions/max_terminated_length": 11.2, + "completions/mean_length": 11.0125, + "completions/mean_terminated_length": 11.0125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.07331698536872863, + "epoch": 0.10211524434719182, + "frac_reward_zero_std": 0.6, + "grad_norm": 10.0625, + "kl": 1.440933346748352, + "learning_rate": 4.8869010467794934e-06, + "loss": 0.05762990713119507, + "num_tokens": 524854.0, + "reward": 0.5250000208616257, + "reward_std": 0.38637163639068606, + "rewards/reward_correct/mean": 0.425, + "rewards/reward_correct/std": 0.38637164831161497, + "rewards/reward_format/mean": 0.10000000149011612, + "rewards/reward_format/std": 0.0, + "step": 140, + "step_time": 2.22716761585325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.8, + "completions/max_terminated_length": 12.8, + "completions/mean_length": 11.1125, + "completions/mean_terminated_length": 11.1125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.07434430327266454, + "epoch": 0.10576221735959154, + "frac_reward_zero_std": 0.6, + "grad_norm": 19.75, + "kl": 1.4029747113585471, + "learning_rate": 4.8781749146039705e-06, + "loss": 0.05918262004852295, + "num_tokens": 539287.0, + "reward": 0.7500000238418579, + "reward_std": 0.39766128063201905, + "rewards/reward_correct/mean": 0.65, + "rewards/reward_correct/std": 0.39766131043434144, + "rewards/reward_format/mean": 0.10000000149011612, + "rewards/reward_format/std": 0.0, + "step": 145, + "step_time": 2.3720832807943224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 60.6, + "completions/max_terminated_length": 11.8, + "completions/mean_length": 14.175, + "completions/mean_terminated_length": 11.115000152587891, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.08828974210191518, + "epoch": 0.10940919037199125, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.1875, + "kl": 1.1537816863507033, + "learning_rate": 4.869132927957007e-06, + "loss": 0.12335410118103027, + "num_tokens": 552877.0, + "reward": 0.6862500190734864, + "reward_std": 0.3755089223384857, + "rewards/reward_correct/mean": 0.5875, + "rewards/reward_correct/std": 0.3746281623840332, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 150, + "step_time": 4.018617187254131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.2, + "completions/max_terminated_length": 12.2, + "completions/mean_length": 11.1125, + "completions/mean_terminated_length": 11.1125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.08974357040133327, + "epoch": 0.11305616338439095, + "frac_reward_zero_std": 0.8, + "grad_norm": 0.765625, + "kl": 1.317809948325157, + "learning_rate": 4.8597762877393e-06, + "loss": 0.055012041330337526, + "num_tokens": 569430.0, + "reward": 0.6000000208616256, + "reward_std": 0.40983866453170775, + "rewards/reward_correct/mean": 0.5, + "rewards/reward_correct/std": 0.40983866453170775, + "rewards/reward_format/mean": 0.10000000149011612, + "rewards/reward_format/std": 0.0, + "step": 155, + "step_time": 2.383729504235089 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05, + "completions/max_length": 61.4, + "completions/max_terminated_length": 12.4, + "completions/mean_length": 23.375, + "completions/mean_terminated_length": 11.125, + "completions/min_length": 10.4, + "completions/min_terminated_length": 10.4, + "entropy": 0.0998516078107059, + "epoch": 0.11670313639679067, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.46484375, + "kl": 1.2694853531196713, + "learning_rate": 4.850106236641888e-06, + "loss": 0.20386412143707275, + "num_tokens": 592676.0, + "reward": 0.7700000241398811, + "reward_std": 0.20868916511535646, + "rewards/reward_correct/mean": 0.675, + "rewards/reward_correct/std": 0.2032795548439026, + "rewards/reward_format/mean": 0.09500000178813935, + "rewards/reward_format/std": 0.008944272249937057, + "step": 160, + "step_time": 4.491866340488196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 42.6, + "completions/max_terminated_length": 42.6, + "completions/mean_length": 13.0375, + "completions/mean_terminated_length": 13.0375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.0853432139265351, + "epoch": 0.12035010940919037, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.71875, + "kl": 1.180515643954277, + "learning_rate": 4.840124058981099e-06, + "loss": 0.345825719833374, + "num_tokens": 610815.0, + "reward": 0.7500000238418579, + "reward_std": 0.3557490587234497, + "rewards/reward_correct/mean": 0.65, + "rewards/reward_correct/std": 0.35574907064437866, + "rewards/reward_format/mean": 0.10000000149011612, + "rewards/reward_format/std": 0.0, + "step": 165, + "step_time": 3.590569644793868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0375, + "completions/max_length": 158.4, + "completions/max_terminated_length": 11.6, + "completions/mean_length": 20.2375, + "completions/mean_terminated_length": 11.050833892822265, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.11941310898400843, + "epoch": 0.12399708242159008, + "frac_reward_zero_std": 0.4, + "grad_norm": 10.0625, + "kl": 1.2691643899306655, + "learning_rate": 4.829831080527982e-06, + "loss": 0.5662191867828369, + "num_tokens": 626154.0, + "reward": 0.5587500154972076, + "reward_std": 0.3578946400433779, + "rewards/reward_correct/mean": 0.4625, + "rewards/reward_correct/std": 0.35, + "rewards/reward_format/mean": 0.09625000059604645, + "rewards/reward_format/std": 0.015000000223517418, + "step": 170, + "step_time": 7.5109656000509855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 60.4, + "completions/max_terminated_length": 11.4, + "completions/mean_length": 14.0875, + "completions/mean_terminated_length": 11.025000190734863, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.07720207688398659, + "epoch": 0.1276440554339898, + "frac_reward_zero_std": 0.4, + "grad_norm": 19.625, + "kl": 1.1847386069595813, + "learning_rate": 4.819228668332222e-06, + "loss": 0.19896353483200074, + "num_tokens": 638841.0, + "reward": 0.7237500190734864, + "reward_std": 0.48944908380508423, + "rewards/reward_correct/mean": 0.625, + "rewards/reward_correct/std": 0.4871040344238281, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 175, + "step_time": 3.96020096167922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.4, + "completions/max_terminated_length": 11.4, + "completions/mean_length": 11.0875, + "completions/mean_terminated_length": 11.0875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.09593665869906545, + "epoch": 0.13129102844638948, + "frac_reward_zero_std": 0.3, + "grad_norm": 24.75, + "kl": 1.252165026962757, + "learning_rate": 4.8083182305405794e-06, + "loss": 0.049454343318939206, + "num_tokens": 655472.0, + "reward": 0.650000023841858, + "reward_std": 0.37457751035690307, + "rewards/reward_correct/mean": 0.55, + "rewards/reward_correct/std": 0.37457752227783203, + "rewards/reward_format/mean": 0.10000000149011612, + "rewards/reward_format/std": 0.0, + "step": 180, + "step_time": 2.333094880543649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.6, + "completions/max_terminated_length": 11.6, + "completions/mean_length": 10.925, + "completions/mean_terminated_length": 10.925, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "entropy": 0.04658548422157764, + "epoch": 0.1349380014587892, + "frac_reward_zero_std": 0.6, + "grad_norm": 9.9375, + "kl": 1.2068048641085625, + "learning_rate": 4.797101216209866e-06, + "loss": 0.017128930985927583, + "num_tokens": 668338.0, + "reward": 0.5487500101327896, + "reward_std": 0.2353046417236328, + "rewards/reward_correct/mean": 0.45, + "rewards/reward_correct/std": 0.23390213251113892, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 185, + "step_time": 2.3096012519672513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.4, + "completions/max_terminated_length": 11.4, + "completions/mean_length": 11.025, + "completions/mean_terminated_length": 11.025, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.06313171470537782, + "epoch": 0.13858497447118892, + "frac_reward_zero_std": 0.5, + "grad_norm": 51.0, + "kl": 1.4884838223457337, + "learning_rate": 4.7855791151144936e-06, + "loss": 0.05992478132247925, + "num_tokens": 681436.0, + "reward": 0.5625000193715095, + "reward_std": 0.3951917767524719, + "rewards/reward_correct/mean": 0.4625, + "rewards/reward_correct/std": 0.39519179463386533, + "rewards/reward_format/mean": 0.10000000149011612, + "rewards/reward_format/std": 0.0, + "step": 190, + "step_time": 2.298890479654074 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0375, + "completions/max_length": 109.4, + "completions/max_terminated_length": 53.0, + "completions/mean_length": 22.8375, + "completions/mean_terminated_length": 14.02226219177246, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.09166574152186513, + "epoch": 0.1422319474835886, + "frac_reward_zero_std": 0.6, + "grad_norm": 22.5, + "kl": 1.2180277963983825, + "learning_rate": 4.773753457548608e-06, + "loss": 0.27732343673706056, + "num_tokens": 696159.0, + "reward": 0.6337500169873238, + "reward_std": 0.275569087266922, + "rewards/reward_correct/mean": 0.5375, + "rewards/reward_correct/std": 0.26733527779579164, + "rewards/reward_format/mean": 0.09625000208616256, + "rewards/reward_format/std": 0.011831301078200341, + "step": 195, + "step_time": 5.679303100146353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 61.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 14.1625, + "completions/mean_terminated_length": 11.100000190734864, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.056278343067970124, + "epoch": 0.14587892049598833, + "frac_reward_zero_std": 0.6, + "grad_norm": 16.25, + "kl": 1.1579875383526086, + "learning_rate": 4.761625814122849e-06, + "loss": 0.3294424295425415, + "num_tokens": 707660.0, + "reward": 0.7862500190734864, + "reward_std": 0.4107490539550781, + "rewards/reward_correct/mean": 0.6875, + "rewards/reward_correct/std": 0.40574907064437865, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 200, + "step_time": 4.051652999222279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05, + "completions/max_length": 158.4, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 23.375, + "completions/mean_terminated_length": 11.13190517425537, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.10470445868559182, + "epoch": 0.14952589350838805, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.47265625, + "kl": 1.0083515344187617, + "learning_rate": 4.749197795555744e-06, + "loss": 0.8333715438842774, + "num_tokens": 722890.0, + "reward": 0.7700000166893005, + "reward_std": 0.38229522705078123, + "rewards/reward_correct/mean": 0.675, + "rewards/reward_correct/std": 0.376531594991684, + "rewards/reward_format/mean": 0.09500000178813935, + "rewards/reward_format/std": 0.016831301152706146, + "step": 205, + "step_time": 7.456793288141489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 60.8, + "completions/max_terminated_length": 11.8, + "completions/mean_length": 14.1125, + "completions/mean_terminated_length": 11.050000190734863, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.043511086050421, + "epoch": 0.15317286652078774, + "frac_reward_zero_std": 0.8, + "grad_norm": 0.35546875, + "kl": 1.0842143267393112, + "learning_rate": 4.736471052459793e-06, + "loss": 0.3268296718597412, + "num_tokens": 735115.0, + "reward": 1.0612500190734864, + "reward_std": 0.12331299781799317, + "rewards/reward_correct/mean": 0.9625, + "rewards/reward_correct/std": 0.11831300854682922, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 210, + "step_time": 3.9708797845989467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 60.6, + "completions/max_terminated_length": 11.8, + "completions/mean_length": 14.0375, + "completions/mean_terminated_length": 10.975833511352539, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "entropy": 0.0639728254172951, + "epoch": 0.15681983953318746, + "frac_reward_zero_std": 0.6, + "grad_norm": 12.125, + "kl": 1.3062359392642975, + "learning_rate": 4.723447275122236e-06, + "loss": 0.10097410678863525, + "num_tokens": 747678.0, + "reward": 0.8225000143051148, + "reward_std": 0.362524950504303, + "rewards/reward_correct/mean": 0.725, + "rewards/reward_correct/std": 0.35574907064437866, + "rewards/reward_format/mean": 0.09750000089406967, + "rewards/reward_format/std": 0.010000000149011612, + "step": 215, + "step_time": 4.0270144991576675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.8, + "completions/max_terminated_length": 11.8, + "completions/mean_length": 11.1625, + "completions/mean_terminated_length": 11.1625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.10701510878279805, + "epoch": 0.16046681254558717, + "frac_reward_zero_std": 0.6, + "grad_norm": 24.75, + "kl": 1.1643942534923553, + "learning_rate": 4.710128193280563e-06, + "loss": 0.04864640533924103, + "num_tokens": 762163.0, + "reward": 0.5250000193715095, + "reward_std": 0.36908756494522094, + "rewards/reward_correct/mean": 0.425, + "rewards/reward_correct/std": 0.36908757090568545, + "rewards/reward_format/mean": 0.10000000149011612, + "rewards/reward_format/std": 0.0, + "step": 220, + "step_time": 2.3151350896805525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 60.4, + "completions/max_terminated_length": 11.4, + "completions/mean_length": 26.3375, + "completions/mean_terminated_length": 11.025, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.10675514980684966, + "epoch": 0.16411378555798686, + "frac_reward_zero_std": 0.7, + "grad_norm": 8.625, + "kl": 1.1121211878955364, + "learning_rate": 4.69651557589278e-06, + "loss": 0.16680443286895752, + "num_tokens": 773622.0, + "reward": 0.3812500208616257, + "reward_std": 0.4213937222957611, + "rewards/reward_correct/mean": 0.2875, + "rewards/reward_correct/std": 0.41780426502227785, + "rewards/reward_format/mean": 0.09375000149011611, + "rewards/reward_format/std": 0.009574271738529205, + "step": 225, + "step_time": 3.9202851224690676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05, + "completions/max_length": 179.4, + "completions/max_terminated_length": 81.6, + "completions/mean_length": 27.7, + "completions/mean_terminated_length": 15.451786041259766, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.12723040292039514, + "epoch": 0.16776075857038658, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.546875, + "kl": 1.127267230092548, + "learning_rate": 4.682611230902466e-06, + "loss": 0.3900377511978149, + "num_tokens": 801966.0, + "reward": 0.4575000211596489, + "reward_std": 0.3705937087535858, + "rewards/reward_correct/mean": 0.3625, + "rewards/reward_correct/std": 0.36350480318069456, + "rewards/reward_format/mean": 0.09500000327825546, + "rewards/reward_format/std": 0.013662602007389068, + "step": 230, + "step_time": 8.629118899069727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 158.2, + "completions/max_terminated_length": 11.2, + "completions/mean_length": 26.325, + "completions/mean_terminated_length": 11.012500381469726, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.14824053931515663, + "epoch": 0.1714077315827863, + "frac_reward_zero_std": 0.6, + "grad_norm": 15.125, + "kl": 0.954940097220242, + "learning_rate": 4.668417004998658e-06, + "loss": 0.6346611022949219, + "num_tokens": 816168.0, + "reward": 0.38125001490116117, + "reward_std": 0.30240801759064195, + "rewards/reward_correct/mean": 0.2875, + "rewards/reward_correct/std": 0.2884649932384491, + "rewards/reward_format/mean": 0.09375, + "rewards/reward_format/std": 0.018062257766723634, + "step": 235, + "step_time": 7.433387812599539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05, + "completions/max_length": 109.2, + "completions/max_terminated_length": 11.4, + "completions/mean_length": 23.275, + "completions/mean_terminated_length": 11.026786041259765, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.13740440141409635, + "epoch": 0.175054704595186, + "frac_reward_zero_std": 0.1, + "grad_norm": 37.0, + "kl": 1.0841830573044717, + "learning_rate": 4.653934783370579e-06, + "loss": 0.39234879016876223, + "num_tokens": 831534.0, + "reward": 0.5325000166893006, + "reward_std": 0.4643899977207184, + "rewards/reward_correct/mean": 0.4375, + "rewards/reward_correct/std": 0.456436949968338, + "rewards/reward_format/mean": 0.09500000327825546, + "rewards/reward_format/std": 0.013662602007389068, + "step": 240, + "step_time": 5.670606399327516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.4, + "completions/max_terminated_length": 11.4, + "completions/mean_length": 11.1, + "completions/mean_terminated_length": 11.1, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.07208764422684907, + "epoch": 0.1787016776075857, + "frac_reward_zero_std": 0.6, + "grad_norm": 27.75, + "kl": 1.2417992696166038, + "learning_rate": 4.63916648945726e-06, + "loss": 0.049610266089439393, + "num_tokens": 849518.0, + "reward": 0.6375000193715096, + "reward_std": 0.3278210163116455, + "rewards/reward_correct/mean": 0.5375, + "rewards/reward_correct/std": 0.32782103419303893, + "rewards/reward_format/mean": 0.10000000149011612, + "rewards/reward_format/std": 0.0, + "step": 245, + "step_time": 2.396680700033903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.1, + "completions/mean_terminated_length": 11.1, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.043113839160650966, + "epoch": 0.18234865061998543, + "frac_reward_zero_std": 0.7, + "grad_norm": 2.5625, + "kl": 0.9787846058607101, + "learning_rate": 4.624114084692086e-06, + "loss": 0.039211463928222653, + "num_tokens": 861910.0, + "reward": 0.900000023841858, + "reward_std": 0.2665252208709717, + "rewards/reward_correct/mean": 0.8, + "rewards/reward_correct/std": 0.2665252387523651, + "rewards/reward_format/mean": 0.10000000149011612, + "rewards/reward_format/std": 0.0, + "step": 250, + "step_time": 2.3184084575623274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.1, + "completions/mean_terminated_length": 11.1, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.0639854067005217, + "epoch": 0.18599562363238512, + "frac_reward_zero_std": 0.7, + "grad_norm": 44.75, + "kl": 0.992956705391407, + "learning_rate": 4.608779568242284e-06, + "loss": 0.03908544182777405, + "num_tokens": 872910.0, + "reward": 0.712500023841858, + "reward_std": 0.4123081684112549, + "rewards/reward_correct/mean": 0.6125, + "rewards/reward_correct/std": 0.41230818033218386, + "rewards/reward_format/mean": 0.10000000149011612, + "rewards/reward_format/std": 0.0, + "step": 255, + "step_time": 2.30882707349956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.8, + "completions/max_terminated_length": 11.8, + "completions/mean_length": 11.05, + "completions/mean_terminated_length": 11.05, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.06614171494729817, + "epoch": 0.18964259664478483, + "frac_reward_zero_std": 0.8, + "grad_norm": 21.25, + "kl": 1.5656966656446456, + "learning_rate": 4.59316497674341e-06, + "loss": 0.06254332065582276, + "num_tokens": 886090.0, + "reward": 0.43750001937150956, + "reward_std": 0.3984713315963745, + "rewards/reward_correct/mean": 0.3375, + "rewards/reward_correct/std": 0.39847134947776797, + "rewards/reward_format/mean": 0.10000000149011612, + "rewards/reward_format/std": 0.0, + "step": 260, + "step_time": 2.268936365097761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 60.8, + "completions/max_terminated_length": 42.6, + "completions/mean_length": 16.1375, + "completions/mean_terminated_length": 13.205000305175782, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.11322204252937809, + "epoch": 0.19328956965718452, + "frac_reward_zero_std": 0.6, + "grad_norm": 20.375, + "kl": 1.250516564771533, + "learning_rate": 4.577272384028852e-06, + "loss": 0.2606987714767456, + "num_tokens": 897989.0, + "reward": 0.49875001460313795, + "reward_std": 0.23106872327625752, + "rewards/reward_correct/mean": 0.4, + "rewards/reward_correct/std": 0.22606874108314515, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 265, + "step_time": 4.030756696872413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.8, + "completions/max_terminated_length": 11.8, + "completions/mean_length": 10.9, + "completions/mean_terminated_length": 10.9, + "completions/min_length": 10.2, + "completions/min_terminated_length": 10.2, + "entropy": 0.0970899501349777, + "epoch": 0.19693654266958424, + "frac_reward_zero_std": 0.4, + "grad_norm": 42.0, + "kl": 1.20570567548275, + "learning_rate": 4.561103900854401e-06, + "loss": 0.0584064781665802, + "num_tokens": 915021.0, + "reward": 0.6625000238418579, + "reward_std": 0.4353072166442871, + "rewards/reward_correct/mean": 0.5625, + "rewards/reward_correct/std": 0.43530723452568054, + "rewards/reward_format/mean": 0.10000000149011612, + "rewards/reward_format/std": 0.0, + "step": 270, + "step_time": 2.3582403326407073 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 51.2, + "completions/max_terminated_length": 51.2, + "completions/mean_length": 14.5125, + "completions/mean_terminated_length": 14.5125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.11866076248697936, + "epoch": 0.20058351568198396, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.71484375, + "kl": 1.1673100851476192, + "learning_rate": 4.544661674617913e-06, + "loss": 0.014130707085132598, + "num_tokens": 926542.0, + "reward": 0.6875000238418579, + "reward_std": 0.4123081684112549, + "rewards/reward_correct/mean": 0.5875, + "rewards/reward_correct/std": 0.41230818033218386, + "rewards/reward_format/mean": 0.10000000149011612, + "rewards/reward_format/std": 0.0, + "step": 275, + "step_time": 3.628739892318845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.2, + "completions/max_terminated_length": 60.2, + "completions/mean_length": 17.0625, + "completions/mean_terminated_length": 17.0625, + "completions/min_length": 10.4, + "completions/min_terminated_length": 10.4, + "entropy": 0.13258798802271485, + "epoch": 0.20423048869438365, + "frac_reward_zero_std": 0.9, + "grad_norm": 0.1533203125, + "kl": 1.1400590494275094, + "learning_rate": 4.527947889074097e-06, + "loss": 0.041229134798049925, + "num_tokens": 938267.0, + "reward": 0.7125000208616257, + "reward_std": 0.25655910968780515, + "rewards/reward_correct/mean": 0.6125, + "rewards/reward_correct/std": 0.25655910968780515, + "rewards/reward_format/mean": 0.10000000149011612, + "rewards/reward_format/std": 0.0, + "step": 280, + "step_time": 4.000555062294007 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 47.0, + "completions/max_terminated_length": 47.0, + "completions/mean_length": 16.0375, + "completions/mean_terminated_length": 16.0375, + "completions/min_length": 10.4, + "completions/min_terminated_length": 10.4, + "entropy": 0.13616890476550908, + "epoch": 0.20787746170678337, + "frac_reward_zero_std": 0.8, + "grad_norm": 1.765625, + "kl": 1.0311771709471942, + "learning_rate": 4.510964764044494e-06, + "loss": 0.05273414850234985, + "num_tokens": 951774.0, + "reward": 0.8125000238418579, + "reward_std": 0.3090286135673523, + "rewards/reward_correct/mean": 0.7125, + "rewards/reward_correct/std": 0.30902862548828125, + "rewards/reward_format/mean": 0.10000000149011612, + "rewards/reward_format/std": 0.0, + "step": 285, + "step_time": 3.5707103287801147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.8, + "completions/max_terminated_length": 11.8, + "completions/mean_length": 11.0125, + "completions/mean_terminated_length": 11.0125, + "completions/min_length": 10.6, + "completions/min_terminated_length": 10.6, + "entropy": 0.06727474313229323, + "epoch": 0.21152443471918309, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.2333984375, + "kl": 1.1392212808132172, + "learning_rate": 4.493714555122646e-06, + "loss": 0.04595130383968353, + "num_tokens": 963367.0, + "reward": 0.650000023841858, + "reward_std": 0.44847133159637453, + "rewards/reward_correct/mean": 0.55, + "rewards/reward_correct/std": 0.44847134947776796, + "rewards/reward_format/mean": 0.10000000149011612, + "rewards/reward_format/std": 0.0, + "step": 290, + "step_time": 2.2439120043069125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 60.6, + "completions/max_terminated_length": 11.8, + "completions/mean_length": 14.075, + "completions/mean_terminated_length": 11.010000038146973, + "completions/min_length": 10.2, + "completions/min_terminated_length": 10.2, + "entropy": 0.10070977592840791, + "epoch": 0.21517140773158278, + "frac_reward_zero_std": 0.8, + "grad_norm": 29.0, + "kl": 1.2017444424331187, + "learning_rate": 4.476199553374525e-06, + "loss": 0.330456280708313, + "num_tokens": 981477.0, + "reward": 0.7112500160932541, + "reward_std": 0.3298721075057983, + "rewards/reward_correct/mean": 0.6125, + "rewards/reward_correct/std": 0.3248721182346344, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 295, + "step_time": 4.207831310294568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.6, + "completions/max_terminated_length": 11.6, + "completions/mean_length": 11.075, + "completions/mean_terminated_length": 11.075, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.07061950541101396, + "epoch": 0.2188183807439825, + "frac_reward_zero_std": 0.6, + "grad_norm": 17.625, + "kl": 1.1677941218018533, + "learning_rate": 4.4584220850342455e-06, + "loss": 0.046302270889282224, + "num_tokens": 992963.0, + "reward": 0.6000000193715096, + "reward_std": 0.2514050841331482, + "rewards/reward_correct/mean": 0.5, + "rewards/reward_correct/std": 0.25140510201454164, + "rewards/reward_format/mean": 0.10000000149011612, + "rewards/reward_format/std": 0.0, + "step": 300, + "step_time": 2.317406605742872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 69.2, + "completions/max_terminated_length": 22.8, + "completions/mean_length": 14.8125, + "completions/mean_terminated_length": 11.760833549499512, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.10661213407292962, + "epoch": 0.2224653537563822, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.29296875, + "kl": 1.510790778696537, + "learning_rate": 4.4403845111951125e-06, + "loss": 0.11747499704360961, + "num_tokens": 1004860.0, + "reward": 0.6612500190734864, + "reward_std": 0.23367205262184143, + "rewards/reward_correct/mean": 0.5625, + "rewards/reward_correct/std": 0.2330920934677124, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 305, + "step_time": 4.344296957179904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 110.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 11.13000030517578, + "completions/min_length": 10.4, + "completions/min_terminated_length": 10.4, + "entropy": 0.10651253748219461, + "epoch": 0.2261123267687819, + "frac_reward_zero_std": 0.4, + "grad_norm": 20.75, + "kl": 1.2567486291751266, + "learning_rate": 4.422089227496031e-06, + "loss": 0.39818150997161866, + "num_tokens": 1018728.0, + "reward": 0.4850000113248825, + "reward_std": 0.4156197845935822, + "rewards/reward_correct/mean": 0.3875, + "rewards/reward_correct/std": 0.4126947641372681, + "rewards/reward_format/mean": 0.09750000089406967, + "rewards/reward_format/std": 0.010000000149011612, + "step": 310, + "step_time": 5.7245670586824415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0875, + "completions/max_length": 207.2, + "completions/max_terminated_length": 56.8, + "completions/mean_length": 35.2625, + "completions/mean_terminated_length": 14.470595932006836, + "completions/min_length": 10.4, + "completions/min_terminated_length": 10.4, + "entropy": 0.17611132971942425, + "epoch": 0.22975929978118162, + "frac_reward_zero_std": 0.3, + "grad_norm": 19.25, + "kl": 1.0911721091717481, + "learning_rate": 4.403538663803336e-06, + "loss": 0.6034462928771973, + "num_tokens": 1040973.0, + "reward": 0.5537500143051147, + "reward_std": 0.5126215934753418, + "rewards/reward_correct/mean": 0.4625, + "rewards/reward_correct/std": 0.5039613008499145, + "rewards/reward_format/mean": 0.09125000089406968, + "rewards/reward_format/std": 0.024893558770418166, + "step": 315, + "step_time": 9.428698727861047 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.075, + "completions/max_length": 207.0, + "completions/max_terminated_length": 12.4, + "completions/mean_length": 29.45, + "completions/mean_terminated_length": 11.093141555786133, + "completions/min_length": 10.4, + "completions/min_terminated_length": 10.4, + "entropy": 0.1853351781144738, + "epoch": 0.23340627279358134, + "frac_reward_zero_std": 0.1, + "grad_norm": 17.75, + "kl": 1.0361705672927202, + "learning_rate": 4.384735283888072e-06, + "loss": 0.9261909484863281, + "num_tokens": 1058601.0, + "reward": 0.41750001907348633, + "reward_std": 0.463871031999588, + "rewards/reward_correct/mean": 0.325, + "rewards/reward_correct/std": 0.4577708959579468, + "rewards/reward_format/mean": 0.0925000011920929, + "rewards/reward_format/std": 0.0230622585862875, + "step": 320, + "step_time": 9.2150695707649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 61.6, + "completions/max_terminated_length": 13.8, + "completions/mean_length": 14.325, + "completions/mean_terminated_length": 11.270833396911621, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.1327807642519474, + "epoch": 0.23705324580598103, + "frac_reward_zero_std": 0.4, + "grad_norm": 20.25, + "kl": 1.2340400084853171, + "learning_rate": 4.365681585098765e-06, + "loss": 0.3276486396789551, + "num_tokens": 1072427.0, + "reward": 0.6612500220537185, + "reward_std": 0.3294189989566803, + "rewards/reward_correct/mean": 0.5625, + "rewards/reward_correct/std": 0.3285382568836212, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 325, + "step_time": 4.020631538145244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 12.3875, + "completions/mean_terminated_length": 12.3875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.13266039690934123, + "epoch": 0.24070021881838075, + "frac_reward_zero_std": 0.6, + "grad_norm": 1.1484375, + "kl": 1.1591177150607108, + "learning_rate": 4.3463800980297495e-06, + "loss": 0.04369702041149139, + "num_tokens": 1084666.0, + "reward": 0.31250001192092897, + "reward_std": 0.27078250646591184, + "rewards/reward_correct/mean": 0.2125, + "rewards/reward_correct/std": 0.2707825243473053, + "rewards/reward_format/mean": 0.10000000149011612, + "rewards/reward_format/std": 0.0, + "step": 330, + "step_time": 3.0494307784363626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 139.0, + "completions/max_terminated_length": 90.0, + "completions/mean_length": 19.1375, + "completions/mean_terminated_length": 16.075000190734862, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.12816359531134366, + "epoch": 0.24434719183078046, + "frac_reward_zero_std": 0.4, + "grad_norm": 14.25, + "kl": 1.045123457349837, + "learning_rate": 4.326833386185063e-06, + "loss": 0.2902881622314453, + "num_tokens": 1102789.0, + "reward": 0.4237500160932541, + "reward_std": 0.4266521632671356, + "rewards/reward_correct/mean": 0.325, + "rewards/reward_correct/std": 0.4246281623840332, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 335, + "step_time": 6.870491482689976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05, + "completions/max_length": 158.0, + "completions/max_terminated_length": 99.0, + "completions/mean_length": 28.875, + "completions/mean_terminated_length": 17.001905250549317, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.17933309143409132, + "epoch": 0.24799416484318015, + "frac_reward_zero_std": 0.2, + "grad_norm": 18.25, + "kl": 1.2366713466122747, + "learning_rate": 4.307044045637979e-06, + "loss": 0.39889614582061766, + "num_tokens": 1126027.0, + "reward": 0.4575000137090683, + "reward_std": 0.47183834910392763, + "rewards/reward_correct/mean": 0.3625, + "rewards/reward_correct/std": 0.46678435802459717, + "rewards/reward_format/mean": 0.09500000178813935, + "rewards/reward_format/std": 0.016831301152706146, + "step": 340, + "step_time": 7.858821518532932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 60.4, + "completions/max_terminated_length": 11.4, + "completions/mean_length": 14.0875, + "completions/mean_terminated_length": 11.025000190734863, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.11094034323468804, + "epoch": 0.25164113785557984, + "frac_reward_zero_std": 0.6, + "grad_norm": 1.09375, + "kl": 1.2160658542066813, + "learning_rate": 4.287014704686215e-06, + "loss": 0.3306739330291748, + "num_tokens": 1138434.0, + "reward": 0.46125001162290574, + "reward_std": 0.3726388454437256, + "rewards/reward_correct/mean": 0.3625, + "rewards/reward_correct/std": 0.3706148326396942, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 345, + "step_time": 3.9530823271721602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2, + "completions/max_length": 256.0, + "completions/max_terminated_length": 14.8, + "completions/mean_length": 60.35, + "completions/mean_terminated_length": 11.473660278320313, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2570150036364794, + "epoch": 0.2552881108679796, + "frac_reward_zero_std": 0.3, + "grad_norm": 11.75, + "kl": 0.6521673948736861, + "learning_rate": 4.266748023502858e-06, + "loss": 0.9113063812255859, + "num_tokens": 1153438.0, + "reward": 0.6037500321865081, + "reward_std": 0.39536969661712645, + "rewards/reward_correct/mean": 0.525, + "rewards/reward_correct/std": 0.36124515533447266, + "rewards/reward_format/mean": 0.07875000387430191, + "rewards/reward_format/std": 0.04087340384721756, + "step": 350, + "step_time": 10.89834836255759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 63.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 14.3375, + "completions/mean_terminated_length": 11.275000190734863, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.1417900924105197, + "epoch": 0.2589350838803793, + "frac_reward_zero_std": 0.2, + "grad_norm": 22.25, + "kl": 1.057770232576877, + "learning_rate": 4.246246693783051e-06, + "loss": 0.24158201217651368, + "num_tokens": 1167505.0, + "reward": 0.5987500190734864, + "reward_std": 0.48049641847610475, + "rewards/reward_correct/mean": 0.5, + "rewards/reward_correct/std": 0.47909392714500426, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 355, + "step_time": 4.124299841746688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0375, + "completions/max_length": 60.0, + "completions/max_terminated_length": 51.8, + "completions/mean_length": 22.85, + "completions/mean_terminated_length": 14.276923370361327, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.14749723440036178, + "epoch": 0.26258205689277897, + "frac_reward_zero_std": 0.5, + "grad_norm": 11.375, + "kl": 1.0647744481917472, + "learning_rate": 4.2255134383865065e-06, + "loss": 0.09965692162513733, + "num_tokens": 1179381.0, + "reward": 0.533750033378601, + "reward_std": 0.49806923866271974, + "rewards/reward_correct/mean": 0.4375, + "rewards/reward_correct/std": 0.4921671748161316, + "rewards/reward_format/mean": 0.09625000208616256, + "rewards/reward_format/std": 0.00806225836277008, + "step": 360, + "step_time": 3.9502382948994637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 60.4, + "completions/max_terminated_length": 11.8, + "completions/mean_length": 14.1625, + "completions/mean_terminated_length": 11.101666831970215, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.09400356900878251, + "epoch": 0.2662290299051787, + "frac_reward_zero_std": 0.5, + "grad_norm": 25.375, + "kl": 1.3044617313891649, + "learning_rate": 4.20455101097587e-06, + "loss": 0.20124485492706298, + "num_tokens": 1194578.0, + "reward": 0.573750016093254, + "reward_std": 0.4383556365966797, + "rewards/reward_correct/mean": 0.475, + "rewards/reward_correct/std": 0.4355616092681885, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 365, + "step_time": 4.072852225415408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 109.8, + "completions/max_terminated_length": 31.8, + "completions/mean_length": 19.125, + "completions/mean_terminated_length": 13.13000030517578, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.15153004545718432, + "epoch": 0.2698760029175784, + "frac_reward_zero_std": 0.2, + "grad_norm": 20.25, + "kl": 1.2351090790703894, + "learning_rate": 4.183362195650993e-06, + "loss": 0.4343620777130127, + "num_tokens": 1210412.0, + "reward": 0.7225000143051148, + "reward_std": 0.48535123467445374, + "rewards/reward_correct/mean": 0.625, + "rewards/reward_correct/std": 0.48130432367324827, + "rewards/reward_format/mean": 0.09750000089406967, + "rewards/reward_format/std": 0.010000000149011612, + "step": 370, + "step_time": 5.820126916281879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 11.85, + "completions/mean_terminated_length": 11.85, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.15505906101316214, + "epoch": 0.2735229759299781, + "frac_reward_zero_std": 0.5, + "grad_norm": 13.6875, + "kl": 1.04697345495224, + "learning_rate": 4.161949806579171e-06, + "loss": 0.045387822389602664, + "num_tokens": 1225616.0, + "reward": 0.37500001639127734, + "reward_std": 0.3626947283744812, + "rewards/reward_correct/mean": 0.275, + "rewards/reward_correct/std": 0.3626947641372681, + "rewards/reward_format/mean": 0.10000000149011612, + "rewards/reward_format/std": 0.0, + "step": 375, + "step_time": 2.6013031523674726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.2, + "completions/max_terminated_length": 12.2, + "completions/mean_length": 11.0375, + "completions/mean_terminated_length": 11.0375, + "completions/min_length": 9.4, + "completions/min_terminated_length": 9.4, + "entropy": 0.11527485344558955, + "epoch": 0.27716994894237784, + "frac_reward_zero_std": 0.5, + "grad_norm": 15.5, + "kl": 1.2315083250403405, + "learning_rate": 4.140316687621379e-06, + "loss": 0.03257818520069122, + "num_tokens": 1258643.0, + "reward": 0.34875001162290575, + "reward_std": 0.3371353387832642, + "rewards/reward_correct/mean": 0.25, + "rewards/reward_correct/std": 0.33556160926818845, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 380, + "step_time": 2.907916400022805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.4, + "completions/max_terminated_length": 61.4, + "completions/mean_length": 14.3, + "completions/mean_terminated_length": 14.3, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.12880323510617017, + "epoch": 0.28081692195477753, + "frac_reward_zero_std": 0.4, + "grad_norm": 19.75, + "kl": 1.218659931048751, + "learning_rate": 4.11846571195457e-06, + "loss": -0.03600308001041412, + "num_tokens": 1273859.0, + "reward": 0.5750000238418579, + "reward_std": 0.4896511912345886, + "rewards/reward_correct/mean": 0.475, + "rewards/reward_correct/std": 0.4896512031555176, + "rewards/reward_format/mean": 0.10000000149011612, + "rewards/reward_format/std": 0.0, + "step": 385, + "step_time": 4.11026696395129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 81.2, + "completions/max_terminated_length": 32.6, + "completions/mean_length": 15.4875, + "completions/mean_terminated_length": 12.428333473205566, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.16286818054504693, + "epoch": 0.2844638949671772, + "frac_reward_zero_std": 0.5, + "grad_norm": 20.375, + "kl": 0.9706083960831166, + "learning_rate": 4.096399781690075e-06, + "loss": 0.339518666267395, + "num_tokens": 1287434.0, + "reward": 0.4612500160932541, + "reward_std": 0.35649177320301534, + "rewards/reward_correct/mean": 0.3625, + "rewards/reward_correct/std": 0.35149178504943845, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 390, + "step_time": 4.723254252411425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 60.6, + "completions/max_terminated_length": 11.8, + "completions/mean_length": 14.175, + "completions/mean_terminated_length": 11.115833473205566, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.11281716926023364, + "epoch": 0.28811086797957697, + "frac_reward_zero_std": 0.4, + "grad_norm": 13.0625, + "kl": 1.1161916764453053, + "learning_rate": 4.0741218274881664e-06, + "loss": 0.324251389503479, + "num_tokens": 1299040.0, + "reward": 0.5362500190734864, + "reward_std": 0.35214779861271384, + "rewards/reward_correct/mean": 0.4375, + "rewards/reward_correct/std": 0.34714781641960146, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 395, + "step_time": 4.007868527434766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.8, + "completions/max_terminated_length": 18.8, + "completions/mean_length": 11.5375, + "completions/mean_terminated_length": 11.5375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.11438067462295294, + "epoch": 0.29175784099197666, + "frac_reward_zero_std": 0.6, + "grad_norm": 6.09375, + "kl": 1.0016713306307792, + "learning_rate": 4.0516348081688225e-06, + "loss": 0.07169039249420166, + "num_tokens": 1320259.0, + "reward": 0.5875000193715095, + "reward_std": 0.3972344875335693, + "rewards/reward_correct/mean": 0.4875, + "rewards/reward_correct/std": 0.3972344994544983, + "rewards/reward_format/mean": 0.10000000149011612, + "rewards/reward_format/std": 0.0, + "step": 400, + "step_time": 2.764884624630213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 46.8, + "completions/max_terminated_length": 46.8, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.12019062303006649, + "epoch": 0.29540481400437635, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.5546875, + "kl": 1.278691291809082, + "learning_rate": 4.028941710318757e-06, + "loss": 0.04510431587696075, + "num_tokens": 1334195.0, + "reward": 0.7500000238418579, + "reward_std": 0.29438172578811644, + "rewards/reward_correct/mean": 0.65, + "rewards/reward_correct/std": 0.29438175559043883, + "rewards/reward_format/mean": 0.10000000149011612, + "rewards/reward_format/std": 0.0, + "step": 405, + "step_time": 3.576040597446263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 60.6, + "completions/max_terminated_length": 60.2, + "completions/mean_length": 17.2375, + "completions/mean_terminated_length": 14.381666946411134, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.13666094592772424, + "epoch": 0.2990517870167761, + "frac_reward_zero_std": 0.5, + "grad_norm": 11.5625, + "kl": 1.4160386834293603, + "learning_rate": 4.006045547894756e-06, + "loss": 0.19378458261489867, + "num_tokens": 1350990.0, + "reward": 0.6987500190734863, + "reward_std": 0.46407375335693357, + "rewards/reward_correct/mean": 0.6, + "rewards/reward_correct/std": 0.46307799220085144, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 410, + "step_time": 4.030793973617255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.8, + "completions/max_terminated_length": 12.8, + "completions/mean_length": 11.175, + "completions/mean_terminated_length": 11.175, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.0909872115124017, + "epoch": 0.3026987600291758, + "frac_reward_zero_std": 0.5, + "grad_norm": 11.5625, + "kl": 1.2308201387524604, + "learning_rate": 3.982949361823388e-06, + "loss": 0.047733986377716066, + "num_tokens": 1363396.0, + "reward": 0.712500023841858, + "reward_std": 0.4532795548439026, + "rewards/reward_correct/mean": 0.6125, + "rewards/reward_correct/std": 0.4532795548439026, + "rewards/reward_format/mean": 0.10000000149011612, + "rewards/reward_format/std": 0.0, + "step": 415, + "step_time": 2.3610500536859034 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 122.4, + "completions/max_terminated_length": 25.4, + "completions/mean_length": 18.1, + "completions/mean_terminated_length": 11.983333587646484, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.1709391510114074, + "epoch": 0.3063457330415755, + "frac_reward_zero_std": 0.3, + "grad_norm": 13.5, + "kl": 1.2427557894960046, + "learning_rate": 3.959656219597124e-06, + "loss": 0.6054922103881836, + "num_tokens": 1376892.0, + "reward": 0.5725000143051148, + "reward_std": 0.45437872409820557, + "rewards/reward_correct/mean": 0.475, + "rewards/reward_correct/std": 0.4496173322200775, + "rewards/reward_format/mean": 0.09750000089406967, + "rewards/reward_format/std": 0.010000000149011612, + "step": 420, + "step_time": 6.242776400968433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.125, + "completions/mean_terminated_length": 11.125, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.09185657552443445, + "epoch": 0.3099927060539752, + "frac_reward_zero_std": 0.6, + "grad_norm": 16.75, + "kl": 1.1151645183563232, + "learning_rate": 3.936169214866932e-06, + "loss": 0.04759896695613861, + "num_tokens": 1389326.0, + "reward": 0.6125000208616257, + "reward_std": 0.4281516671180725, + "rewards/reward_correct/mean": 0.5125, + "rewards/reward_correct/std": 0.42815167307853697, + "rewards/reward_format/mean": 0.10000000149011612, + "rewards/reward_format/std": 0.0, + "step": 425, + "step_time": 2.26362033393234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05, + "completions/max_length": 182.2, + "completions/max_terminated_length": 80.8, + "completions/mean_length": 27.6875, + "completions/mean_terminated_length": 15.77869110107422, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.2325332318432629, + "epoch": 0.3136396790663749, + "frac_reward_zero_std": 0.2, + "grad_norm": 19.0, + "kl": 0.9733296466991306, + "learning_rate": 3.912491467031398e-06, + "loss": 0.4203649520874023, + "num_tokens": 1405941.0, + "reward": 0.6950000166893006, + "reward_std": 0.4742667019367218, + "rewards/reward_correct/mean": 0.6, + "rewards/reward_correct/std": 0.4665252387523651, + "rewards/reward_format/mean": 0.09500000178813935, + "rewards/reward_format/std": 0.016831301152706146, + "step": 430, + "step_time": 8.354645171947777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.2, + "completions/max_terminated_length": 12.2, + "completions/mean_length": 11.075, + "completions/mean_terminated_length": 11.075, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.0747150843963027, + "epoch": 0.3172866520787746, + "frac_reward_zero_std": 0.7, + "grad_norm": 11.5, + "kl": 1.284253054857254, + "learning_rate": 3.888626120822423e-06, + "loss": 0.05172805786132813, + "num_tokens": 1417035.0, + "reward": 0.5500000163912773, + "reward_std": 0.24191222190856934, + "rewards/reward_correct/mean": 0.45, + "rewards/reward_correct/std": 0.24191223978996276, + "rewards/reward_format/mean": 0.10000000149011612, + "rewards/reward_format/std": 0.0, + "step": 435, + "step_time": 2.3270694056525825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.2, + "completions/max_terminated_length": 21.2, + "completions/mean_length": 11.6875, + "completions/mean_terminated_length": 11.6875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.12435088921338319, + "epoch": 0.32093362509117435, + "frac_reward_zero_std": 0.9, + "grad_norm": 41.0, + "kl": 1.2318147175014018, + "learning_rate": 3.864576345887569e-06, + "loss": 0.04762374460697174, + "num_tokens": 1429570.0, + "reward": 0.8750000238418579, + "reward_std": 0.2748721122741699, + "rewards/reward_correct/mean": 0.775, + "rewards/reward_correct/std": 0.2748721182346344, + "rewards/reward_format/mean": 0.10000000149011612, + "rewards/reward_format/std": 0.0, + "step": 440, + "step_time": 2.604528891667724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 86.6, + "completions/max_terminated_length": 37.8, + "completions/mean_length": 18.9375, + "completions/mean_terminated_length": 12.814285850524902, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.15180045051965863, + "epoch": 0.32458059810357404, + "frac_reward_zero_std": 0.6, + "grad_norm": 10.4375, + "kl": 1.315756471455097, + "learning_rate": 3.840345336369075e-06, + "loss": 0.14110722541809081, + "num_tokens": 1441253.0, + "reward": 0.6225000232458114, + "reward_std": 0.35925430059432983, + "rewards/reward_correct/mean": 0.525, + "rewards/reward_correct/std": 0.35574907064437866, + "rewards/reward_format/mean": 0.0975000023841858, + "rewards/reward_format/std": 0.006831301003694534, + "step": 445, + "step_time": 4.959737072512508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 111.2, + "completions/max_terminated_length": 16.4, + "completions/mean_length": 17.6375, + "completions/mean_terminated_length": 11.530000305175781, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.11573877576738596, + "epoch": 0.3282275711159737, + "frac_reward_zero_std": 0.4, + "grad_norm": 24.5, + "kl": 1.113444908708334, + "learning_rate": 3.8159363104796375e-06, + "loss": 0.5929186820983887, + "num_tokens": 1464744.0, + "reward": 0.6600000143051148, + "reward_std": 0.3202962338924408, + "rewards/reward_correct/mean": 0.5625, + "rewards/reward_correct/std": 0.31452471017837524, + "rewards/reward_format/mean": 0.09750000089406967, + "rewards/reward_format/std": 0.010000000149011612, + "step": 450, + "step_time": 6.085850327461958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 60.4, + "completions/max_terminated_length": 11.6, + "completions/mean_length": 17.175, + "completions/mean_terminated_length": 11.053571510314942, + "completions/min_length": 10.6, + "completions/min_terminated_length": 10.6, + "entropy": 0.1339926091488451, + "epoch": 0.33187454412837347, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.443359375, + "kl": 1.00316090490669, + "learning_rate": 3.7913525100749854e-06, + "loss": 0.1132813811302185, + "num_tokens": 1479278.0, + "reward": 0.3975000113248825, + "reward_std": 0.25906072854995726, + "rewards/reward_correct/mean": 0.3, + "rewards/reward_correct/std": 0.2577557325363159, + "rewards/reward_format/mean": 0.0975000023841858, + "rewards/reward_format/std": 0.006831301003694534, + "step": 455, + "step_time": 4.049631121009588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.6, + "completions/max_terminated_length": 14.6, + "completions/mean_length": 11.4375, + "completions/mean_terminated_length": 11.4375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.13541655708104372, + "epoch": 0.33552151714077316, + "frac_reward_zero_std": 0.4, + "grad_norm": 23.875, + "kl": 1.189768685400486, + "learning_rate": 3.7665972002233165e-06, + "loss": 0.06251391172409057, + "num_tokens": 1505753.0, + "reward": 0.2875000163912773, + "reward_std": 0.3383782982826233, + "rewards/reward_correct/mean": 0.1875, + "rewards/reward_correct/std": 0.33837831020355225, + "rewards/reward_format/mean": 0.10000000149011612, + "rewards/reward_format/std": 0.0, + "step": 460, + "step_time": 2.65321463085711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 61.4, + "completions/max_terminated_length": 12.8, + "completions/mean_length": 14.3125, + "completions/mean_terminated_length": 11.254166793823241, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.10027080331929028, + "epoch": 0.33916849015317285, + "frac_reward_zero_std": 0.4, + "grad_norm": 25.375, + "kl": 1.0626708364114166, + "learning_rate": 3.741673668771653e-06, + "loss": 0.3159222364425659, + "num_tokens": 1529530.0, + "reward": 0.8237500190734863, + "reward_std": 0.40993900299072267, + "rewards/reward_correct/mean": 0.725, + "rewards/reward_correct/std": 0.4049390316009521, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 465, + "step_time": 4.532307118922472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 61.2, + "completions/max_terminated_length": 12.4, + "completions/mean_length": 14.275, + "completions/mean_terminated_length": 11.21333351135254, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.11476188297383487, + "epoch": 0.3428154631655726, + "frac_reward_zero_std": 0.5, + "grad_norm": 11.375, + "kl": 1.312535627745092, + "learning_rate": 3.716585225909165e-06, + "loss": 0.1281302809715271, + "num_tokens": 1543768.0, + "reward": 0.6237500250339508, + "reward_std": 0.5006997585296631, + "rewards/reward_correct/mean": 0.525, + "rewards/reward_correct/std": 0.4997040152549744, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 470, + "step_time": 4.0860894789919255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 61.8, + "completions/max_terminated_length": 13.4, + "completions/mean_length": 14.55, + "completions/mean_terminated_length": 11.492500114440919, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.09601853825151921, + "epoch": 0.3464624361779723, + "frac_reward_zero_std": 0.3, + "grad_norm": 12.0625, + "kl": 1.1556208044290543, + "learning_rate": 3.6913352037275408e-06, + "loss": 0.31904890537261965, + "num_tokens": 1556788.0, + "reward": 0.7612500190734863, + "reward_std": 0.46066267490386964, + "rewards/reward_correct/mean": 0.6625, + "rewards/reward_correct/std": 0.4571541726589203, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 475, + "step_time": 4.040919363312423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.2, + "completions/max_terminated_length": 13.2, + "completions/mean_length": 11.1875, + "completions/mean_terminated_length": 11.1875, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.05810582416597754, + "epoch": 0.350109409190372, + "frac_reward_zero_std": 0.7, + "grad_norm": 48.25, + "kl": 1.4385133147239686, + "learning_rate": 3.665926955778429e-06, + "loss": 0.054510879516601565, + "num_tokens": 1567691.0, + "reward": 0.8250000238418579, + "reward_std": 0.3014917731285095, + "rewards/reward_correct/mean": 0.725, + "rewards/reward_correct/std": 0.30149178504943847, + "rewards/reward_format/mean": 0.10000000149011612, + "rewards/reward_format/std": 0.0, + "step": 480, + "step_time": 2.372496274486184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.8, + "completions/max_terminated_length": 48.8, + "completions/mean_length": 13.4625, + "completions/mean_terminated_length": 13.4625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.1009595861658454, + "epoch": 0.3537563822027717, + "frac_reward_zero_std": 0.8, + "grad_norm": 12.1875, + "kl": 1.0892627842724323, + "learning_rate": 3.640363856628048e-06, + "loss": 0.03951211571693421, + "num_tokens": 1585584.0, + "reward": 0.6125000208616257, + "reward_std": 0.37734161615371703, + "rewards/reward_correct/mean": 0.5125, + "rewards/reward_correct/std": 0.37734163403511045, + "rewards/reward_format/mean": 0.10000000149011612, + "rewards/reward_format/std": 0.0, + "step": 485, + "step_time": 3.771330860070884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.8, + "completions/max_terminated_length": 11.8, + "completions/mean_length": 11.1125, + "completions/mean_terminated_length": 11.1125, + "completions/min_length": 10.6, + "completions/min_terminated_length": 10.6, + "entropy": 0.11290257377550006, + "epoch": 0.3574033552151714, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.01251220703125, + "kl": 1.179569673538208, + "learning_rate": 3.6146493014089933e-06, + "loss": 0.05009811520576477, + "num_tokens": 1596689.0, + "reward": 0.46250001937150953, + "reward_std": 0.3602674245834351, + "rewards/reward_correct/mean": 0.3625, + "rewards/reward_correct/std": 0.3602674245834351, + "rewards/reward_format/mean": 0.10000000149011612, + "rewards/reward_format/std": 0.0, + "step": 490, + "step_time": 2.265321587398648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 130.6, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 18.6875, + "completions/mean_terminated_length": 12.574166870117187, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.17142658052034676, + "epoch": 0.3610503282275711, + "frac_reward_zero_std": 0.1, + "grad_norm": 21.625, + "kl": 0.8313944321125746, + "learning_rate": 3.5887867053693176e-06, + "loss": 0.6226963996887207, + "num_tokens": 1610256.0, + "reward": 0.5225000083446503, + "reward_std": 0.3943799376487732, + "rewards/reward_correct/mean": 0.425, + "rewards/reward_correct/std": 0.3890955328941345, + "rewards/reward_format/mean": 0.09750000089406967, + "rewards/reward_format/std": 0.010000000149011612, + "step": 495, + "step_time": 6.569896092265845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 61.6, + "completions/max_terminated_length": 12.8, + "completions/mean_length": 14.4875, + "completions/mean_terminated_length": 11.427500152587891, + "completions/min_length": 10.6, + "completions/min_terminated_length": 10.6, + "entropy": 0.16391165722161533, + "epoch": 0.36469730123997085, + "frac_reward_zero_std": 0.3, + "grad_norm": 14.0, + "kl": 1.074128818884492, + "learning_rate": 3.5627795034189394e-06, + "loss": 0.32291080951690676, + "num_tokens": 1624991.0, + "reward": 0.4612500160932541, + "reward_std": 0.39751890301704407, + "rewards/reward_correct/mean": 0.3625, + "rewards/reward_correct/std": 0.39685126543045046, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 500, + "step_time": 4.1251722542569045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 67.0, + "completions/max_terminated_length": 18.2, + "completions/mean_length": 14.6, + "completions/mean_terminated_length": 11.536666870117188, + "completions/min_length": 10.6, + "completions/min_terminated_length": 10.6, + "entropy": 0.14056114722043275, + "epoch": 0.36834427425237054, + "frac_reward_zero_std": 0.3, + "grad_norm": 19.25, + "kl": 1.226618971116841, + "learning_rate": 3.5366311496734394e-06, + "loss": 0.4041902542114258, + "num_tokens": 1638599.0, + "reward": 0.5362500160932541, + "reward_std": 0.40625839233398436, + "rewards/reward_correct/mean": 0.4375, + "rewards/reward_correct/std": 0.40468465685844424, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 505, + "step_time": 4.2246469365432855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 109.4, + "completions/max_terminated_length": 60.6, + "completions/mean_length": 17.2125, + "completions/mean_terminated_length": 14.15250015258789, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.1850255263969302, + "epoch": 0.37199124726477023, + "frac_reward_zero_std": 0.4, + "grad_norm": 2.265625, + "kl": 1.2300702909007668, + "learning_rate": 3.5103451169953063e-06, + "loss": 0.10501872301101685, + "num_tokens": 1653408.0, + "reward": 0.46125001907348634, + "reward_std": 0.3800159513950348, + "rewards/reward_correct/mean": 0.3625, + "rewards/reward_correct/std": 0.3793482959270477, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 510, + "step_time": 5.822630574181676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.8, + "completions/max_terminated_length": 11.8, + "completions/mean_length": 11.1375, + "completions/mean_terminated_length": 11.1375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.1098517581820488, + "epoch": 0.37563822027717, + "frac_reward_zero_std": 0.5, + "grad_norm": 11.25, + "kl": 1.0533027783036233, + "learning_rate": 3.4839248965326917e-06, + "loss": 0.0392227053642273, + "num_tokens": 1666851.0, + "reward": 0.4625000238418579, + "reward_std": 0.453157377243042, + "rewards/reward_correct/mean": 0.3625, + "rewards/reward_correct/std": 0.4531573951244354, + "rewards/reward_format/mean": 0.10000000149011612, + "rewards/reward_format/std": 0.0, + "step": 515, + "step_time": 2.332393659092486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 90.0, + "completions/max_terminated_length": 42.4, + "completions/mean_length": 16.1375, + "completions/mean_terminated_length": 13.082500076293945, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.12428427785634995, + "epoch": 0.37928519328956967, + "frac_reward_zero_std": 0.6, + "grad_norm": 6.34375, + "kl": 1.2909703208133578, + "learning_rate": 3.4573739972557376e-06, + "loss": 0.23775811195373536, + "num_tokens": 1682278.0, + "reward": 0.7487500190734864, + "reward_std": 0.4275705933570862, + "rewards/reward_correct/mean": 0.65, + "rewards/reward_correct/std": 0.42406207919120786, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 520, + "step_time": 5.122372025437653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.2, + "completions/max_terminated_length": 12.2, + "completions/mean_length": 11.1, + "completions/mean_terminated_length": 11.1, + "completions/min_length": 10.6, + "completions/min_terminated_length": 10.6, + "entropy": 0.10681108636781574, + "epoch": 0.38293216630196936, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.357421875, + "kl": 1.2207902833819388, + "learning_rate": 3.430695945490539e-06, + "loss": 0.04800998866558075, + "num_tokens": 1694278.0, + "reward": 0.7125000208616257, + "reward_std": 0.3248721122741699, + "rewards/reward_correct/mean": 0.6125, + "rewards/reward_correct/std": 0.3248721182346344, + "rewards/reward_format/mean": 0.10000000149011612, + "rewards/reward_format/std": 0.0, + "step": 525, + "step_time": 2.2714664824306965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0375, + "completions/max_length": 110.6, + "completions/max_terminated_length": 51.0, + "completions/mean_length": 26.45, + "completions/mean_terminated_length": 18.129286193847655, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.22787839574739338, + "epoch": 0.38657913931436905, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.81640625, + "kl": 1.086973152682185, + "learning_rate": 3.403894284450795e-06, + "loss": 0.24099428653717042, + "num_tokens": 1709306.0, + "reward": 0.6337500274181366, + "reward_std": 0.4585378408432007, + "rewards/reward_correct/mean": 0.5375, + "rewards/reward_correct/std": 0.45477133989334106, + "rewards/reward_format/mean": 0.09625000208616256, + "rewards/reward_format/std": 0.011831301078200341, + "step": 530, + "step_time": 5.844319257698953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 103.6, + "completions/max_terminated_length": 55.6, + "completions/mean_length": 17.1125, + "completions/mean_terminated_length": 14.07083339691162, + "completions/min_length": 10.6, + "completions/min_terminated_length": 10.6, + "entropy": 0.182498611882329, + "epoch": 0.3902261123267688, + "frac_reward_zero_std": 0.2, + "grad_norm": 19.5, + "kl": 1.38779058996588, + "learning_rate": 3.3769725737672255e-06, + "loss": 0.2536503314971924, + "num_tokens": 1723835.0, + "reward": 0.5237500160932541, + "reward_std": 0.4781053185462952, + "rewards/reward_correct/mean": 0.425, + "rewards/reward_correct/std": 0.47653159499168396, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 535, + "step_time": 5.635008576698601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 207.8, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 26.8375, + "completions/mean_terminated_length": 11.55952434539795, + "completions/min_length": 10.6, + "completions/min_terminated_length": 10.6, + "entropy": 0.20077550169080496, + "epoch": 0.3938730853391685, + "frac_reward_zero_std": 0.3, + "grad_norm": 3.078125, + "kl": 1.0603073662146927, + "learning_rate": 3.3499343890148007e-06, + "loss": 0.6534118175506591, + "num_tokens": 1741438.0, + "reward": 0.4312500149011612, + "reward_std": 0.47009692192077634, + "rewards/reward_correct/mean": 0.3375, + "rewards/reward_correct/std": 0.4655475080013275, + "rewards/reward_format/mean": 0.09375000149011611, + "rewards/reward_format/std": 0.02183130122721195, + "step": 540, + "step_time": 9.267667215690016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.4, + "completions/max_terminated_length": 68.4, + "completions/mean_length": 15.7375, + "completions/mean_terminated_length": 15.7375, + "completions/min_length": 10.6, + "completions/min_terminated_length": 10.6, + "entropy": 0.25203739320859314, + "epoch": 0.3975200583515682, + "frac_reward_zero_std": 0.5, + "grad_norm": 8.75, + "kl": 0.8174846898764372, + "learning_rate": 3.3227833212378546e-06, + "loss": 0.1733352541923523, + "num_tokens": 1756433.0, + "reward": 0.5000000238418579, + "reward_std": 0.4887241005897522, + "rewards/reward_correct/mean": 0.4, + "rewards/reward_correct/std": 0.48872411251068115, + "rewards/reward_format/mean": 0.10000000149011612, + "rewards/reward_format/std": 0.0, + "step": 545, + "step_time": 4.352004214562475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 17.0625, + "completions/mean_terminated_length": 17.0625, + "completions/min_length": 10.6, + "completions/min_terminated_length": 10.6, + "entropy": 0.15203919336199762, + "epoch": 0.4011670313639679, + "frac_reward_zero_std": 0.4, + "grad_norm": 15.4375, + "kl": 1.2429417807608842, + "learning_rate": 3.2955229764731454e-06, + "loss": -0.02172479182481766, + "num_tokens": 1774982.0, + "reward": 0.6875000238418579, + "reward_std": 0.453157377243042, + "rewards/reward_correct/mean": 0.5875, + "rewards/reward_correct/std": 0.4531573951244354, + "rewards/reward_format/mean": 0.10000000149011612, + "rewards/reward_format/std": 0.0, + "step": 550, + "step_time": 4.156383008882403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0375, + "completions/max_length": 62.0, + "completions/max_terminated_length": 51.2, + "completions/mean_length": 23.2125, + "completions/mean_terminated_length": 14.596154022216798, + "completions/min_length": 10.2, + "completions/min_terminated_length": 10.2, + "entropy": 0.2312072828412056, + "epoch": 0.4048140043763676, + "frac_reward_zero_std": 0.4, + "grad_norm": 16.25, + "kl": 1.1721166325267405, + "learning_rate": 3.2681569752709275e-06, + "loss": 0.23104314804077147, + "num_tokens": 1791799.0, + "reward": 0.570000022649765, + "reward_std": 0.4669440448284149, + "rewards/reward_correct/mean": 0.475, + "rewards/reward_correct/std": 0.46402024626731875, + "rewards/reward_format/mean": 0.09500000178813935, + "rewards/reward_format/std": 0.008944272249937057, + "step": 555, + "step_time": 4.170228382386267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 56.4, + "completions/max_terminated_length": 56.4, + "completions/mean_length": 14.0625, + "completions/mean_terminated_length": 14.0625, + "completions/min_length": 10.6, + "completions/min_terminated_length": 10.6, + "entropy": 0.20865311063826084, + "epoch": 0.4084609773887673, + "frac_reward_zero_std": 0.2, + "grad_norm": 11.875, + "kl": 1.1729019869118928, + "learning_rate": 3.2406889522140854e-06, + "loss": -0.08613023161888123, + "num_tokens": 1807628.0, + "reward": 0.40000002086162567, + "reward_std": 0.45979843139648435, + "rewards/reward_correct/mean": 0.3, + "rewards/reward_correct/std": 0.45979843735694886, + "rewards/reward_format/mean": 0.10000000149011612, + "rewards/reward_format/std": 0.0, + "step": 560, + "step_time": 3.8736391253769398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0375, + "completions/max_length": 109.8, + "completions/max_terminated_length": 13.6, + "completions/mean_length": 20.6625, + "completions/mean_terminated_length": 11.525595474243165, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.14480989766307176, + "epoch": 0.41210795040116704, + "frac_reward_zero_std": 0.4, + "grad_norm": 8.5625, + "kl": 1.1986199734732508, + "learning_rate": 3.2131225554354173e-06, + "loss": 0.3582606315612793, + "num_tokens": 1820289.0, + "reward": 0.4212500125169754, + "reward_std": 0.34379262924194337, + "rewards/reward_correct/mean": 0.325, + "rewards/reward_correct/std": 0.3351854383945465, + "rewards/reward_format/mean": 0.09625000208616256, + "rewards/reward_format/std": 0.011831301078200341, + "step": 565, + "step_time": 5.795268640480936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 60.2, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 14.175, + "completions/mean_terminated_length": 11.124166870117188, + "completions/min_length": 10.4, + "completions/min_terminated_length": 10.4, + "entropy": 0.11937150268349797, + "epoch": 0.41575492341356673, + "frac_reward_zero_std": 0.5, + "grad_norm": 34.5, + "kl": 1.32432839050889, + "learning_rate": 3.185461446133109e-06, + "loss": 0.33120107650756836, + "num_tokens": 1835759.0, + "reward": 0.6362500190734863, + "reward_std": 0.4752968788146973, + "rewards/reward_correct/mean": 0.5375, + "rewards/reward_correct/std": 0.4738943874835968, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 570, + "step_time": 4.070260091684759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0375, + "completions/max_length": 109.2, + "completions/max_terminated_length": 12.2, + "completions/mean_length": 20.4, + "completions/mean_terminated_length": 11.22761936187744, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.1531581949442625, + "epoch": 0.4194018964259664, + "frac_reward_zero_std": 0.2, + "grad_norm": 25.625, + "kl": 1.045248545333743, + "learning_rate": 3.1577092980844783e-06, + "loss": 0.5447211742401123, + "num_tokens": 1847855.0, + "reward": 0.5337500214576721, + "reward_std": 0.48127453327178954, + "rewards/reward_correct/mean": 0.4375, + "rewards/reward_correct/std": 0.4779077172279358, + "rewards/reward_format/mean": 0.09625000208616256, + "rewards/reward_format/std": 0.011831301078200341, + "step": 575, + "step_time": 5.721678684465587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05, + "completions/max_length": 130.6, + "completions/max_terminated_length": 34.8, + "completions/mean_length": 25.4375, + "completions/mean_terminated_length": 13.289871978759766, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.22714195176959037, + "epoch": 0.42304886943836617, + "frac_reward_zero_std": 0.1, + "grad_norm": 13.6875, + "kl": 1.0361129453405737, + "learning_rate": 3.129869797158046e-06, + "loss": 0.25610668659210206, + "num_tokens": 1874002.0, + "reward": 0.4687500178813934, + "reward_std": 0.4904141962528229, + "rewards/reward_correct/mean": 0.375, + "rewards/reward_correct/std": 0.486371648311615, + "rewards/reward_format/mean": 0.09375000149011611, + "rewards/reward_format/std": 0.013944272324442864, + "step": 580, + "step_time": 7.413618630170822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0375, + "completions/max_length": 110.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 21.425, + "completions/mean_terminated_length": 12.315714645385743, + "completions/min_length": 10.6, + "completions/min_terminated_length": 10.6, + "entropy": 0.22436764594167471, + "epoch": 0.42669584245076586, + "frac_reward_zero_std": 0.4, + "grad_norm": 15.3125, + "kl": 1.123335475102067, + "learning_rate": 3.101946640824e-06, + "loss": 0.5032818794250489, + "num_tokens": 1897060.0, + "reward": 0.49625000506639483, + "reward_std": 0.29726893790066244, + "rewards/reward_correct/mean": 0.4, + "rewards/reward_correct/std": 0.28765495419502257, + "rewards/reward_format/mean": 0.09625000208616256, + "rewards/reward_format/std": 0.011831301078200341, + "step": 585, + "step_time": 6.213371917419136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 61.0, + "completions/max_terminated_length": 14.8, + "completions/mean_length": 17.6625, + "completions/mean_terminated_length": 11.583928680419922, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.10833779159002006, + "epoch": 0.43034281546316555, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.08544921875, + "kl": 1.0097390310838819, + "learning_rate": 3.0739435376631215e-06, + "loss": 0.25488159656524656, + "num_tokens": 1908281.0, + "reward": 0.5475000232458115, + "reward_std": 0.46652376651763916, + "rewards/reward_correct/mean": 0.45, + "rewards/reward_correct/std": 0.4643148422241211, + "rewards/reward_format/mean": 0.0975000023841858, + "rewards/reward_format/std": 0.006831301003694534, + "step": 590, + "step_time": 3.9946171052753927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.6, + "completions/max_terminated_length": 12.6, + "completions/mean_length": 11.0875, + "completions/mean_terminated_length": 11.0875, + "completions/min_length": 10.6, + "completions/min_terminated_length": 10.6, + "entropy": 0.1208916524425149, + "epoch": 0.4339897884755653, + "frac_reward_zero_std": 0.3, + "grad_norm": 19.0, + "kl": 1.2540010496973992, + "learning_rate": 3.045864206874238e-06, + "loss": 0.043874767422676084, + "num_tokens": 1918856.0, + "reward": 0.7000000238418579, + "reward_std": 0.4140708684921265, + "rewards/reward_correct/mean": 0.6, + "rewards/reward_correct/std": 0.4140708863735199, + "rewards/reward_format/mean": 0.10000000149011612, + "rewards/reward_format/std": 0.0, + "step": 595, + "step_time": 2.3449863655492664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 61.2, + "completions/max_terminated_length": 13.4, + "completions/mean_length": 14.2875, + "completions/mean_terminated_length": 11.237500190734863, + "completions/min_length": 10.6, + "completions/min_terminated_length": 10.6, + "entropy": 0.14411209817044437, + "epoch": 0.437636761487965, + "frac_reward_zero_std": 0.6, + "grad_norm": 7.0625, + "kl": 1.2594374112784863, + "learning_rate": 3.0177123777802535e-06, + "loss": 0.11211379766464233, + "num_tokens": 1931439.0, + "reward": 0.7362500160932541, + "reward_std": 0.3248257637023926, + "rewards/reward_correct/mean": 0.6375, + "rewards/reward_correct/std": 0.32325204014778136, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 600, + "step_time": 4.089182897657156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.6, + "completions/max_terminated_length": 12.6, + "completions/mean_length": 11.15, + "completions/mean_terminated_length": 11.15, + "completions/min_length": 10.6, + "completions/min_terminated_length": 10.6, + "entropy": 0.1029292210470885, + "epoch": 0.4412837345003647, + "frac_reward_zero_std": 0.5, + "grad_norm": 24.25, + "kl": 1.3007706478238106, + "learning_rate": 2.989491789332851e-06, + "loss": 0.059204214811325075, + "num_tokens": 1943291.0, + "reward": 0.8500000238418579, + "reward_std": 0.32406206130981446, + "rewards/reward_correct/mean": 0.75, + "rewards/reward_correct/std": 0.3240620791912079, + "rewards/reward_format/mean": 0.10000000149011612, + "rewards/reward_format/std": 0.0, + "step": 605, + "step_time": 2.370541114360094 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.8, + "completions/max_terminated_length": 13.8, + "completions/mean_length": 11.375, + "completions/mean_terminated_length": 11.375, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.11047732485458255, + "epoch": 0.4449307075127644, + "frac_reward_zero_std": 0.3, + "grad_norm": 32.25, + "kl": 1.277146790921688, + "learning_rate": 2.9612061896159065e-06, + "loss": 0.06029477119445801, + "num_tokens": 1956977.0, + "reward": 0.7250000238418579, + "reward_std": 0.48458385467529297, + "rewards/reward_correct/mean": 0.625, + "rewards/reward_correct/std": 0.4845838785171509, + "rewards/reward_format/mean": 0.10000000149011612, + "rewards/reward_format/std": 0.0, + "step": 610, + "step_time": 2.3578942948952317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.4, + "completions/max_terminated_length": 12.4, + "completions/mean_length": 11.125, + "completions/mean_terminated_length": 11.125, + "completions/min_length": 10.4, + "completions/min_terminated_length": 10.4, + "entropy": 0.11945098973810672, + "epoch": 0.4485776805251641, + "frac_reward_zero_std": 0.3, + "grad_norm": 9.3125, + "kl": 1.4077869176864624, + "learning_rate": 2.932859335347687e-06, + "loss": 0.06603010892868041, + "num_tokens": 1973515.0, + "reward": 0.6125000238418579, + "reward_std": 0.4837738037109375, + "rewards/reward_correct/mean": 0.5125, + "rewards/reward_correct/std": 0.4837738394737244, + "rewards/reward_format/mean": 0.10000000149011612, + "rewards/reward_format/std": 0.0, + "step": 615, + "step_time": 2.398899529315531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.4, + "completions/max_terminated_length": 11.4, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 10.6, + "completions/min_terminated_length": 10.6, + "entropy": 0.08357606306672097, + "epoch": 0.4522246535375638, + "frac_reward_zero_std": 0.4, + "grad_norm": 12.875, + "kl": 1.43735980540514, + "learning_rate": 2.9044549913819125e-06, + "loss": 0.0562359631061554, + "num_tokens": 1985603.0, + "reward": 0.5625000193715095, + "reward_std": 0.3383782982826233, + "rewards/reward_correct/mean": 0.4625, + "rewards/reward_correct/std": 0.33837831020355225, + "rewards/reward_format/mean": 0.10000000149011612, + "rewards/reward_format/std": 0.0, + "step": 620, + "step_time": 2.320349162258208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0375, + "completions/max_length": 158.2, + "completions/max_terminated_length": 11.4, + "completions/mean_length": 20.2125, + "completions/mean_terminated_length": 11.027500534057618, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.1310176419094205, + "epoch": 0.45587162654996355, + "frac_reward_zero_std": 0.2, + "grad_norm": 49.75, + "kl": 1.0914184780791403, + "learning_rate": 2.875996930207727e-06, + "loss": 0.3286792755126953, + "num_tokens": 1999244.0, + "reward": 0.48375001549720764, + "reward_std": 0.45775646567344663, + "rewards/reward_correct/mean": 0.3875, + "rewards/reward_correct/std": 0.45493903160095217, + "rewards/reward_format/mean": 0.09625000059604645, + "rewards/reward_format/std": 0.015000000223517418, + "step": 625, + "step_time": 7.507395029440522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.6, + "completions/max_terminated_length": 12.6, + "completions/mean_length": 11.1375, + "completions/mean_terminated_length": 11.1375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.0890437193447724, + "epoch": 0.45951859956236324, + "frac_reward_zero_std": 0.4, + "grad_norm": 26.0, + "kl": 1.1414251357316971, + "learning_rate": 2.8474889314486636e-06, + "loss": 0.049356389045715335, + "num_tokens": 2021023.0, + "reward": 0.6625000208616256, + "reward_std": 0.447147798538208, + "rewards/reward_correct/mean": 0.5625, + "rewards/reward_correct/std": 0.44714781641960144, + "rewards/reward_format/mean": 0.10000000149011612, + "rewards/reward_format/std": 0.0, + "step": 630, + "step_time": 2.837720428779721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 60.4, + "completions/max_terminated_length": 42.2, + "completions/mean_length": 17.975, + "completions/mean_terminated_length": 15.170833587646484, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.169133938010782, + "epoch": 0.46316557257476293, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.10400390625, + "kl": 1.191723581776023, + "learning_rate": 2.8189347813606544e-06, + "loss": 0.24407191276550294, + "num_tokens": 2048981.0, + "reward": 0.7237500160932541, + "reward_std": 0.31257030963897703, + "rewards/reward_correct/mean": 0.625, + "rewards/reward_correct/std": 0.310225248336792, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 635, + "step_time": 5.0386811953037975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.6, + "completions/max_terminated_length": 12.6, + "completions/mean_length": 11.1, + "completions/mean_terminated_length": 11.1, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.09728879313915968, + "epoch": 0.4668125455871627, + "frac_reward_zero_std": 0.7, + "grad_norm": 17.25, + "kl": 0.9838672868907452, + "learning_rate": 2.790338272329166e-06, + "loss": 0.037537801265716556, + "num_tokens": 2062549.0, + "reward": 0.5625000208616256, + "reward_std": 0.47734161615371706, + "rewards/reward_correct/mean": 0.4625, + "rewards/reward_correct/std": 0.4773416340351105, + "rewards/reward_format/mean": 0.10000000149011612, + "rewards/reward_format/std": 0.0, + "step": 640, + "step_time": 2.303885807842016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0375, + "completions/max_length": 158.8, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 20.2625, + "completions/mean_terminated_length": 11.075833892822265, + "completions/min_length": 10.6, + "completions/min_terminated_length": 10.6, + "entropy": 0.1268269836436957, + "epoch": 0.47045951859956237, + "frac_reward_zero_std": 0.5, + "grad_norm": 29.375, + "kl": 1.1911901906132698, + "learning_rate": 2.761703202365518e-06, + "loss": 0.45805997848510743, + "num_tokens": 2078946.0, + "reward": 0.5462500095367432, + "reward_std": 0.45170668959617616, + "rewards/reward_correct/mean": 0.45, + "rewards/reward_correct/std": 0.44847134947776796, + "rewards/reward_format/mean": 0.09625000059604645, + "rewards/reward_format/std": 0.015000000223517418, + "step": 645, + "step_time": 7.6234702337533236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.4, + "completions/max_terminated_length": 13.4, + "completions/mean_length": 11.175, + "completions/mean_terminated_length": 11.175, + "completions/min_length": 10.6, + "completions/min_terminated_length": 10.6, + "entropy": 0.10774052110500634, + "epoch": 0.47410649161196206, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.1005859375, + "kl": 1.2857966154813767, + "learning_rate": 2.7330333746024552e-06, + "loss": 0.04964399933815002, + "num_tokens": 2098712.0, + "reward": 0.712500023841858, + "reward_std": 0.3669942140579224, + "rewards/reward_correct/mean": 0.6125, + "rewards/reward_correct/std": 0.36699422597885134, + "rewards/reward_format/mean": 0.10000000149011612, + "rewards/reward_format/std": 0.0, + "step": 650, + "step_time": 2.6685423662886025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.6, + "completions/max_terminated_length": 11.6, + "completions/mean_length": 11.0375, + "completions/mean_terminated_length": 11.0375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.07678919015452265, + "epoch": 0.4777534646243618, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.283203125, + "kl": 1.2016210861504077, + "learning_rate": 2.7043325967890355e-06, + "loss": 0.04805096983909607, + "num_tokens": 2109979.0, + "reward": 0.6125000193715096, + "reward_std": 0.37061482667922974, + "rewards/reward_correct/mean": 0.5125, + "rewards/reward_correct/std": 0.3706148326396942, + "rewards/reward_format/mean": 0.10000000149011612, + "rewards/reward_format/std": 0.0, + "step": 655, + "step_time": 2.2578060291707516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05, + "completions/max_length": 159.0, + "completions/max_terminated_length": 52.8, + "completions/mean_length": 25.95, + "completions/mean_terminated_length": 13.877857780456543, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.17742527059745045, + "epoch": 0.4814004376367615, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.64453125, + "kl": 1.1215825689956547, + "learning_rate": 2.6756046807849135e-06, + "loss": 0.4276102542877197, + "num_tokens": 2128719.0, + "reward": 0.4212500110268593, + "reward_std": 0.40691953897476196, + "rewards/reward_correct/mean": 0.325, + "rewards/reward_correct/std": 0.4032795548439026, + "rewards/reward_format/mean": 0.09625000059604645, + "rewards/reward_format/std": 0.015000000223517418, + "step": 660, + "step_time": 7.633640008606017 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 61.0, + "completions/max_terminated_length": 12.4, + "completions/mean_length": 14.1625, + "completions/mean_terminated_length": 11.101666831970215, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.07347399080172182, + "epoch": 0.4850474106491612, + "frac_reward_zero_std": 0.8, + "grad_norm": 0.2373046875, + "kl": 1.1273339115083219, + "learning_rate": 2.646853442054068e-06, + "loss": 0.11161750555038452, + "num_tokens": 2140684.0, + "reward": 0.5612500190734864, + "reward_std": 0.3465817987918854, + "rewards/reward_correct/mean": 0.4625, + "rewards/reward_correct/std": 0.34600183367729187, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 665, + "step_time": 4.095501427352429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 109.8, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 17.175, + "completions/mean_terminated_length": 11.050833702087402, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.13329777736216783, + "epoch": 0.4886943836615609, + "frac_reward_zero_std": 0.3, + "grad_norm": 19.75, + "kl": 1.3517018549144268, + "learning_rate": 2.618082699158061e-06, + "loss": 0.2521249771118164, + "num_tokens": 2154746.0, + "reward": 0.5975000113248825, + "reward_std": 0.4522609353065491, + "rewards/reward_correct/mean": 0.5, + "rewards/reward_correct/std": 0.44924116134643555, + "rewards/reward_format/mean": 0.09750000089406967, + "rewards/reward_format/std": 0.010000000149011612, + "step": 670, + "step_time": 5.833194084651768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.075, + "completions/max_length": 202.8, + "completions/max_terminated_length": 57.2, + "completions/mean_length": 32.5125, + "completions/mean_terminated_length": 14.19000015258789, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.18839909620583056, + "epoch": 0.4923413566739606, + "frac_reward_zero_std": 0.3, + "grad_norm": 10.625, + "kl": 0.9344149041920901, + "learning_rate": 2.5892962732488746e-06, + "loss": 0.570836877822876, + "num_tokens": 2171539.0, + "reward": 0.5925000190734864, + "reward_std": 0.5061526596546173, + "rewards/reward_correct/mean": 0.5, + "rewards/reward_correct/std": 0.49438175559043884, + "rewards/reward_format/mean": 0.0925000011920929, + "rewards/reward_format/std": 0.01894427239894867, + "step": 675, + "step_time": 9.125298042595386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 114.8, + "completions/max_terminated_length": 17.6, + "completions/mean_length": 17.625, + "completions/mean_terminated_length": 11.50333366394043, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.1579702500719577, + "epoch": 0.4959883296863603, + "frac_reward_zero_std": 0.6, + "grad_norm": 24.875, + "kl": 1.1326635053381324, + "learning_rate": 2.560497987561412e-06, + "loss": 0.6085105419158936, + "num_tokens": 2185397.0, + "reward": 0.6475000143051147, + "reward_std": 0.44295321106910707, + "rewards/reward_correct/mean": 0.55, + "rewards/reward_correct/std": 0.4371816873550415, + "rewards/reward_format/mean": 0.09750000089406967, + "rewards/reward_format/std": 0.010000000149011612, + "step": 680, + "step_time": 6.066542265377938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 158.2, + "completions/max_terminated_length": 12.4, + "completions/mean_length": 26.5125, + "completions/mean_terminated_length": 11.240000343322754, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.17320034531876444, + "epoch": 0.49963530269876005, + "frac_reward_zero_std": 0.5, + "grad_norm": 28.875, + "kl": 1.1312677177134902, + "learning_rate": 2.5316916669057195e-06, + "loss": 0.7942066669464112, + "num_tokens": 2207662.0, + "reward": 0.5062500104308129, + "reward_std": 0.3586968682706356, + "rewards/reward_correct/mean": 0.4125, + "rewards/reward_correct/std": 0.3443817555904388, + "rewards/reward_format/mean": 0.09375000149011611, + "rewards/reward_format/std": 0.01806225851178169, + "step": 685, + "step_time": 7.992122623883188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 102.2, + "completions/max_terminated_length": 53.8, + "completions/mean_length": 16.9375, + "completions/mean_terminated_length": 13.880833435058594, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.1877401598263532, + "epoch": 0.5032822757111597, + "frac_reward_zero_std": 0.5, + "grad_norm": 12.375, + "kl": 1.0558208076283335, + "learning_rate": 2.5028811371589977e-06, + "loss": 0.09907621145248413, + "num_tokens": 2219697.0, + "reward": 0.473750014603138, + "reward_std": 0.2671928942203522, + "rewards/reward_correct/mean": 0.375, + "rewards/reward_correct/std": 0.2665252387523651, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 690, + "step_time": 5.522791969031095 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 84.6, + "completions/max_terminated_length": 35.8, + "completions/mean_length": 28.6125, + "completions/mean_terminated_length": 13.305681800842285, + "completions/min_length": 10.6, + "completions/min_terminated_length": 10.6, + "entropy": 0.1837477927096188, + "epoch": 0.5069292487235595, + "frac_reward_zero_std": 0.2, + "grad_norm": 15.0, + "kl": 0.9750595092773438, + "learning_rate": 2.4740702247574716e-06, + "loss": 0.27918286323547364, + "num_tokens": 2232978.0, + "reward": 0.6300000190734864, + "reward_std": 0.4936726152896881, + "rewards/reward_correct/mean": 0.5375, + "rewards/reward_correct/std": 0.4855616092681885, + "rewards/reward_format/mean": 0.0925000011920929, + "rewards/reward_format/std": 0.01457427181303501, + "step": 695, + "step_time": 4.946827319636941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 90.8, + "completions/max_terminated_length": 90.8, + "completions/mean_length": 16.1125, + "completions/mean_terminated_length": 16.1125, + "completions/min_length": 10.4, + "completions/min_terminated_length": 10.4, + "entropy": 0.22496891710907221, + "epoch": 0.5105762217359592, + "frac_reward_zero_std": 0.4, + "grad_norm": 13.625, + "kl": 1.1064648594707251, + "learning_rate": 2.4452627561881905e-06, + "loss": -0.1409442186355591, + "num_tokens": 2245939.0, + "reward": 0.4487500131130219, + "reward_std": 0.29248119592666627, + "rewards/reward_correct/mean": 0.35, + "rewards/reward_correct/std": 0.29148542881011963, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 700, + "step_time": 5.160510801523924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.6, + "completions/max_terminated_length": 11.6, + "completions/mean_length": 11.1, + "completions/mean_terminated_length": 11.1, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.11645094295963646, + "epoch": 0.5142231947483589, + "frac_reward_zero_std": 0.3, + "grad_norm": 14.4375, + "kl": 1.230565007030964, + "learning_rate": 2.4164625574808145e-06, + "loss": 0.04428930282592773, + "num_tokens": 2262835.0, + "reward": 0.5750000178813934, + "reward_std": 0.3672485828399658, + "rewards/reward_correct/mean": 0.475, + "rewards/reward_correct/std": 0.3672485947608948, + "rewards/reward_format/mean": 0.10000000149011612, + "rewards/reward_format/std": 0.0, + "step": 705, + "step_time": 2.3278287082910536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0375, + "completions/max_length": 109.6, + "completions/max_terminated_length": 12.6, + "completions/mean_length": 20.325, + "completions/mean_terminated_length": 11.150595474243165, + "completions/min_length": 10.6, + "completions/min_terminated_length": 10.6, + "entropy": 0.133493235334754, + "epoch": 0.5178701677607586, + "frac_reward_zero_std": 0.3, + "grad_norm": 20.125, + "kl": 0.9306610645726323, + "learning_rate": 2.3876734536994618e-06, + "loss": 0.545658540725708, + "num_tokens": 2276749.0, + "reward": 0.5587500214576722, + "reward_std": 0.5114140748977661, + "rewards/reward_correct/mean": 0.4625, + "rewards/reward_correct/std": 0.5072408556938172, + "rewards/reward_format/mean": 0.09625000208616256, + "rewards/reward_format/std": 0.011831301078200341, + "step": 710, + "step_time": 5.847634883970022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 61.2, + "completions/max_terminated_length": 12.2, + "completions/mean_length": 14.2625, + "completions/mean_terminated_length": 11.200000190734864, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.11493070041760803, + "epoch": 0.5215171407731582, + "frac_reward_zero_std": 0.4, + "grad_norm": 22.875, + "kl": 1.275475772470236, + "learning_rate": 2.3588992684346968e-06, + "loss": 0.3383802890777588, + "num_tokens": 2289434.0, + "reward": 0.6737500160932541, + "reward_std": 0.3782520055770874, + "rewards/reward_correct/mean": 0.575, + "rewards/reward_correct/std": 0.37325204014778135, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 715, + "step_time": 4.137530161812902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 105.2, + "completions/max_terminated_length": 56.6, + "completions/mean_length": 16.9625, + "completions/mean_terminated_length": 13.90250015258789, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.15697559248656034, + "epoch": 0.5251641137855579, + "frac_reward_zero_std": 0.5, + "grad_norm": 17.125, + "kl": 1.0811178747564554, + "learning_rate": 2.3301438232956956e-06, + "loss": 0.32156217098236084, + "num_tokens": 2304503.0, + "reward": 0.6737500190734863, + "reward_std": 0.4476358890533447, + "rewards/reward_correct/mean": 0.575, + "rewards/reward_correct/std": 0.4441273808479309, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 720, + "step_time": 5.729240739345551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 95.8, + "completions/max_terminated_length": 47.6, + "completions/mean_length": 16.625, + "completions/mean_terminated_length": 13.577500152587891, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.15090966038405895, + "epoch": 0.5288110867979577, + "frac_reward_zero_std": 0.5, + "grad_norm": 7.71875, + "kl": 1.2612082734704018, + "learning_rate": 2.301410937402688e-06, + "loss": 0.1808980941772461, + "num_tokens": 2321441.0, + "reward": 0.7987500190734863, + "reward_std": 0.3981435656547546, + "rewards/reward_correct/mean": 0.7, + "rewards/reward_correct/std": 0.39714781641960145, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 725, + "step_time": 5.3293802358210085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 60.8, + "completions/max_terminated_length": 12.2, + "completions/mean_length": 17.325, + "completions/mean_terminated_length": 11.210714340209961, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.16485853474587203, + "epoch": 0.5324580598103574, + "frac_reward_zero_std": 0.4, + "grad_norm": 19.125, + "kl": 1.1574596564285458, + "learning_rate": 2.2727044268797234e-06, + "loss": 0.1414148688316345, + "num_tokens": 2333035.0, + "reward": 0.6600000262260437, + "reward_std": 0.4011389672756195, + "rewards/reward_correct/mean": 0.5625, + "rewards/reward_correct/std": 0.3996234655380249, + "rewards/reward_format/mean": 0.0975000023841858, + "rewards/reward_format/std": 0.006831301003694534, + "step": 730, + "step_time": 4.073831165395677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 61.0, + "completions/max_terminated_length": 12.2, + "completions/mean_length": 17.2625, + "completions/mean_terminated_length": 11.139285850524903, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.15900945872999728, + "epoch": 0.5361050328227571, + "frac_reward_zero_std": 0.3, + "grad_norm": 15.6875, + "kl": 1.0457678042352199, + "learning_rate": 2.2440281043478395e-06, + "loss": 0.2766944169998169, + "num_tokens": 2345048.0, + "reward": 0.6600000232458114, + "reward_std": 0.42856777310371397, + "rewards/reward_correct/mean": 0.5625, + "rewards/reward_correct/std": 0.424577522277832, + "rewards/reward_format/mean": 0.0975000023841858, + "rewards/reward_format/std": 0.006831301003694534, + "step": 735, + "step_time": 4.082735797390342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 110.0, + "completions/max_terminated_length": 12.4, + "completions/mean_length": 17.325, + "completions/mean_terminated_length": 11.20333366394043, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.1275663654319942, + "epoch": 0.5397520058351568, + "frac_reward_zero_std": 0.4, + "grad_norm": 21.375, + "kl": 1.0273871816694737, + "learning_rate": 2.2153857784186895e-06, + "loss": 0.6156983375549316, + "num_tokens": 2358202.0, + "reward": 0.7725000143051147, + "reward_std": 0.43836723566055297, + "rewards/reward_correct/mean": 0.675, + "rewards/reward_correct/std": 0.4321143627166748, + "rewards/reward_format/mean": 0.09750000089406967, + "rewards/reward_format/std": 0.010000000149011612, + "step": 740, + "step_time": 5.772066319547593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 61.0, + "completions/max_terminated_length": 12.8, + "completions/mean_length": 14.3125, + "completions/mean_terminated_length": 11.259166717529297, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.10873888838104903, + "epoch": 0.5433989788475565, + "frac_reward_zero_std": 0.6, + "grad_norm": 12.8125, + "kl": 1.1318640105426312, + "learning_rate": 2.1867812531887097e-06, + "loss": 0.15562882423400878, + "num_tokens": 2376819.0, + "reward": 0.6237500160932541, + "reward_std": 0.45610262751579284, + "rewards/reward_correct/mean": 0.525, + "rewards/reward_correct/std": 0.4537575662136078, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 745, + "step_time": 4.1631547754630445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 73.6, + "completions/max_terminated_length": 25.6, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 12.19583339691162, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.12490258021280169, + "epoch": 0.5470459518599562, + "frac_reward_zero_std": 0.4, + "grad_norm": 30.375, + "kl": 1.0008595857769251, + "learning_rate": 2.158218327733882e-06, + "loss": 0.23266313076019288, + "num_tokens": 2389951.0, + "reward": 0.5475000143051147, + "reward_std": 0.5077475905418396, + "rewards/reward_correct/mean": 0.45, + "rewards/reward_correct/std": 0.5047713398933411, + "rewards/reward_format/mean": 0.09750000089406967, + "rewards/reward_format/std": 0.010000000149011612, + "step": 750, + "step_time": 4.563895738311112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 70.8, + "completions/max_terminated_length": 22.2, + "completions/mean_length": 17.95, + "completions/mean_terminated_length": 11.832143020629882, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.11990389078855515, + "epoch": 0.550692924872356, + "frac_reward_zero_std": 0.5, + "grad_norm": 8.6875, + "kl": 1.4239639887586235, + "learning_rate": 2.129700795605163e-06, + "loss": 0.023593926429748537, + "num_tokens": 2414771.0, + "reward": 0.7725000143051147, + "reward_std": 0.45342499017715454, + "rewards/reward_correct/mean": 0.675, + "rewards/reward_correct/std": 0.4468512713909149, + "rewards/reward_format/mean": 0.09750000089406967, + "rewards/reward_format/std": 0.010000000149011612, + "step": 755, + "step_time": 4.98283479642123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.6, + "completions/max_terminated_length": 12.6, + "completions/mean_length": 11.2375, + "completions/mean_terminated_length": 11.2375, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.14313149992376567, + "epoch": 0.5543398978847557, + "frac_reward_zero_std": 0.3, + "grad_norm": 15.625, + "kl": 1.1617460146546363, + "learning_rate": 2.101232444324648e-06, + "loss": 0.03879738450050354, + "num_tokens": 2428950.0, + "reward": 0.4250000238418579, + "reward_std": 0.44761679172515867, + "rewards/reward_correct/mean": 0.325, + "rewards/reward_correct/std": 0.44761680364608764, + "rewards/reward_format/mean": 0.10000000149011612, + "rewards/reward_format/std": 0.0, + "step": 760, + "step_time": 2.318876395188272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 86.0, + "completions/max_terminated_length": 37.4, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 12.690833473205567, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.1481350782327354, + "epoch": 0.5579868708971554, + "frac_reward_zero_std": 0.3, + "grad_norm": 19.25, + "kl": 1.2508833687752485, + "learning_rate": 2.072817054882538e-06, + "loss": 0.2704556703567505, + "num_tokens": 2444866.0, + "reward": 0.4612500160932541, + "reward_std": 0.4147715628147125, + "rewards/reward_correct/mean": 0.3625, + "rewards/reward_correct/std": 0.4119775414466858, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 765, + "step_time": 5.029102122038603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05, + "completions/max_length": 110.4, + "completions/max_terminated_length": 13.4, + "completions/mean_length": 23.525, + "completions/mean_terminated_length": 11.296428871154784, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.16932614110410213, + "epoch": 0.5616338439095551, + "frac_reward_zero_std": 0.6, + "grad_norm": 5.9375, + "kl": 1.1262450924143195, + "learning_rate": 2.04445840123497e-06, + "loss": 0.20620214939117432, + "num_tokens": 2463580.0, + "reward": 0.5962500214576721, + "reward_std": 0.509655499458313, + "rewards/reward_correct/mean": 0.5, + "rewards/reward_correct/std": 0.5047713398933411, + "rewards/reward_format/mean": 0.09625000208616256, + "rewards/reward_format/std": 0.011831301078200341, + "step": 770, + "step_time": 5.97357696723193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 109.8, + "completions/max_terminated_length": 12.8, + "completions/mean_length": 17.2875, + "completions/mean_terminated_length": 11.170000267028808, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.11063970820978283, + "epoch": 0.5652808169219548, + "frac_reward_zero_std": 0.5, + "grad_norm": 16.375, + "kl": 1.1135979294776917, + "learning_rate": 2.0161602498027826e-06, + "loss": 0.42127599716186526, + "num_tokens": 2475339.0, + "reward": 0.49750001579523084, + "reward_std": 0.3348305583000183, + "rewards/reward_correct/mean": 0.4, + "rewards/reward_correct/std": 0.3288348078727722, + "rewards/reward_format/mean": 0.09750000089406967, + "rewards/reward_format/std": 0.010000000149011612, + "step": 775, + "step_time": 5.7840487219393255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0375, + "completions/max_length": 109.4, + "completions/max_terminated_length": 16.8, + "completions/mean_length": 20.8125, + "completions/mean_terminated_length": 11.668214416503906, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.14832511553540825, + "epoch": 0.5689277899343544, + "frac_reward_zero_std": 0.5, + "grad_norm": 26.75, + "kl": 1.0891086495481432, + "learning_rate": 1.9879263589712857e-06, + "loss": 0.48008298873901367, + "num_tokens": 2493172.0, + "reward": 0.5962500214576721, + "reward_std": 0.350608729198575, + "rewards/reward_correct/mean": 0.5, + "rewards/reward_correct/std": 0.340130603313446, + "rewards/reward_format/mean": 0.09625000208616256, + "rewards/reward_format/std": 0.011831301078200341, + "step": 780, + "step_time": 5.8947518732398745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.8, + "completions/max_terminated_length": 35.8, + "completions/mean_length": 12.6375, + "completions/mean_terminated_length": 12.6375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.11841410864144564, + "epoch": 0.5725747629467542, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.55078125, + "kl": 1.332649065554142, + "learning_rate": 1.9597604785910937e-06, + "loss": 0.23699214458465576, + "num_tokens": 2505527.0, + "reward": 0.5375000163912773, + "reward_std": 0.2740620613098145, + "rewards/reward_correct/mean": 0.4375, + "rewards/reward_correct/std": 0.2740620791912079, + "rewards/reward_format/mean": 0.10000000149011612, + "rewards/reward_format/std": 0.0, + "step": 785, + "step_time": 3.203191607631743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05, + "completions/max_length": 109.4, + "completions/max_terminated_length": 12.2, + "completions/mean_length": 23.35, + "completions/mean_terminated_length": 11.112372016906738, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.11188999470323324, + "epoch": 0.5762217359591539, + "frac_reward_zero_std": 0.2, + "grad_norm": 222.0, + "kl": 1.2236792907118796, + "learning_rate": 1.931666349480095e-06, + "loss": 0.42595229148864744, + "num_tokens": 2518867.0, + "reward": 0.44500001668930056, + "reward_std": 0.45924447774887084, + "rewards/reward_correct/mean": 0.35, + "rewards/reward_correct/std": 0.45634413361549375, + "rewards/reward_format/mean": 0.09500000029802322, + "rewards/reward_format/std": 0.013062257692217827, + "step": 790, + "step_time": 5.767134657129645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 60.8, + "completions/max_terminated_length": 12.2, + "completions/mean_length": 14.225, + "completions/mean_terminated_length": 11.166666793823243, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.1320604055188596, + "epoch": 0.5798687089715536, + "frac_reward_zero_std": 0.4, + "grad_norm": 11.875, + "kl": 1.0688392341136932, + "learning_rate": 1.9036477029266182e-06, + "loss": 0.3217693567276001, + "num_tokens": 2534437.0, + "reward": 0.7362500250339508, + "reward_std": 0.43157670497894285, + "rewards/reward_correct/mean": 0.6375, + "rewards/reward_correct/std": 0.43058096170425414, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 795, + "step_time": 4.142721536196769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 89.2, + "completions/max_terminated_length": 40.2, + "completions/mean_length": 16.925, + "completions/mean_terminated_length": 13.862500190734863, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.15747904698364437, + "epoch": 0.5835156819839533, + "frac_reward_zero_std": 0.5, + "grad_norm": 11.25, + "kl": 1.064980798214674, + "learning_rate": 1.8757082601938642e-06, + "loss": 0.37255814075469973, + "num_tokens": 2548223.0, + "reward": 0.7237500190734864, + "reward_std": 0.35193461179733276, + "rewards/reward_correct/mean": 0.625, + "rewards/reward_correct/std": 0.350681746006012, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 800, + "step_time": 5.161932708323002 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 60.4, + "completions/max_terminated_length": 11.4, + "completions/mean_length": 14.0875, + "completions/mean_terminated_length": 11.025000190734863, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.10426421193405985, + "epoch": 0.587162654996353, + "frac_reward_zero_std": 0.5, + "grad_norm": 14.9375, + "kl": 1.284999130293727, + "learning_rate": 1.8478517320256737e-06, + "loss": 0.1272961378097534, + "num_tokens": 2561798.0, + "reward": 0.7362500190734863, + "reward_std": 0.2869613289833069, + "rewards/reward_correct/mean": 0.6375, + "rewards/reward_correct/std": 0.2851854383945465, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 805, + "step_time": 4.098441308923066 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 62.4, + "completions/max_terminated_length": 13.4, + "completions/mean_length": 14.375, + "completions/mean_terminated_length": 11.312500190734863, + "completions/min_length": 10.6, + "completions/min_terminated_length": 10.6, + "entropy": 0.12187190717086197, + "epoch": 0.5908096280087527, + "frac_reward_zero_std": 0.6, + "grad_norm": 15.4375, + "kl": 1.121857824921608, + "learning_rate": 1.8200818181536883e-06, + "loss": 0.2414854049682617, + "num_tokens": 2576524.0, + "reward": 0.39875001162290574, + "reward_std": 0.35686780214309693, + "rewards/reward_correct/mean": 0.3, + "rewards/reward_correct/std": 0.35574907064437866, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 810, + "step_time": 4.105724668502807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 62.4, + "completions/max_terminated_length": 14.2, + "completions/mean_length": 14.425, + "completions/mean_terminated_length": 11.366666793823242, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.12056900856550783, + "epoch": 0.5944566010211525, + "frac_reward_zero_std": 0.5, + "grad_norm": 26.0, + "kl": 1.270766181871295, + "learning_rate": 1.7924022068059755e-06, + "loss": 0.3251985549926758, + "num_tokens": 2591998.0, + "reward": 0.6237500190734864, + "reward_std": 0.4858367443084717, + "rewards/reward_correct/mean": 0.525, + "rewards/reward_correct/std": 0.4845838785171509, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 815, + "step_time": 4.174941363558173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 61.6, + "completions/max_terminated_length": 12.8, + "completions/mean_length": 14.2375, + "completions/mean_terminated_length": 11.17583351135254, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.10346585493534803, + "epoch": 0.5981035740335522, + "frac_reward_zero_std": 0.3, + "grad_norm": 24.25, + "kl": 1.2462828190997244, + "learning_rate": 1.7648165742171788e-06, + "loss": 0.12477167844772338, + "num_tokens": 2603697.0, + "reward": 0.5362500190734864, + "reward_std": 0.44306035041809083, + "rewards/reward_correct/mean": 0.4375, + "rewards/reward_correct/std": 0.44165786504745486, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 820, + "step_time": 4.133172137476504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 61.4, + "completions/max_terminated_length": 12.4, + "completions/mean_length": 14.05, + "completions/mean_terminated_length": 10.987500190734863, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "entropy": 0.11616960091050714, + "epoch": 0.6017505470459519, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.341796875, + "kl": 1.2844669874757528, + "learning_rate": 1.737328584140267e-06, + "loss": 0.3280961990356445, + "num_tokens": 2616837.0, + "reward": 0.5725000143051148, + "reward_std": 0.5024597883224488, + "rewards/reward_correct/mean": 0.475, + "rewards/reward_correct/std": 0.49928138852119447, + "rewards/reward_format/mean": 0.09750000089406967, + "rewards/reward_format/std": 0.010000000149011612, + "step": 825, + "step_time": 4.066914449073375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 109.2, + "completions/max_terminated_length": 11.6, + "completions/mean_length": 17.1875, + "completions/mean_terminated_length": 11.063333702087402, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.14151806011795998, + "epoch": 0.6053975200583516, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.8125, + "kl": 1.3854894034564496, + "learning_rate": 1.7099418873599332e-06, + "loss": 0.19663093090057374, + "num_tokens": 2633116.0, + "reward": 0.3975000098347664, + "reward_std": 0.38154210448265075, + "rewards/reward_correct/mean": 0.3, + "rewards/reward_correct/std": 0.3788854479789734, + "rewards/reward_format/mean": 0.09750000089406967, + "rewards/reward_format/std": 0.010000000149011612, + "step": 830, + "step_time": 5.907595778629184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 97.6, + "completions/max_terminated_length": 48.8, + "completions/mean_length": 19.5875, + "completions/mean_terminated_length": 13.467857170104981, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.1770199915394187, + "epoch": 0.6090444930707513, + "frac_reward_zero_std": 0.4, + "grad_norm": 25.25, + "kl": 1.2550924096256495, + "learning_rate": 1.6826601212077224e-06, + "loss": 0.22758891582489013, + "num_tokens": 2649283.0, + "reward": 0.5725000113248825, + "reward_std": 0.34125515818595886, + "rewards/reward_correct/mean": 0.475, + "rewards/reward_correct/std": 0.3366411805152893, + "rewards/reward_format/mean": 0.0975000023841858, + "rewards/reward_format/std": 0.006831301003694534, + "step": 835, + "step_time": 5.446167773194611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05, + "completions/max_length": 109.4, + "completions/max_terminated_length": 39.8, + "completions/mean_length": 25.125, + "completions/mean_terminated_length": 13.139285850524903, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.14589090985246003, + "epoch": 0.612691466083151, + "frac_reward_zero_std": 0.4, + "grad_norm": 16.875, + "kl": 1.1861960230395199, + "learning_rate": 1.6554869090789421e-06, + "loss": 0.4542790412902832, + "num_tokens": 2665741.0, + "reward": 0.6825000256299972, + "reward_std": 0.40972196459770205, + "rewards/reward_correct/mean": 0.5875, + "rewards/reward_correct/std": 0.40068174004554746, + "rewards/reward_format/mean": 0.09500000327825546, + "rewards/reward_format/std": 0.013662602007389068, + "step": 840, + "step_time": 5.801089775376022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.075, + "completions/max_length": 158.8, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 29.4125, + "completions/mean_terminated_length": 11.034167098999024, + "completions/min_length": 10.6, + "completions/min_terminated_length": 10.6, + "entropy": 0.16116405446082355, + "epoch": 0.6163384390955506, + "frac_reward_zero_std": 0.5, + "grad_norm": 8.875, + "kl": 1.1798366663977504, + "learning_rate": 1.6284258599514275e-06, + "loss": 0.5787829399108887, + "num_tokens": 2678590.0, + "reward": 0.5800000220537186, + "reward_std": 0.36448997259140015, + "rewards/reward_correct/mean": 0.4875, + "rewards/reward_correct/std": 0.3565591096878052, + "rewards/reward_format/mean": 0.0925000011920929, + "rewards/reward_format/std": 0.01894427239894867, + "step": 845, + "step_time": 7.616462749987841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0875, + "completions/max_length": 158.6, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 32.5125, + "completions/mean_terminated_length": 11.083333778381348, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.16811910178512335, + "epoch": 0.6199854121079504, + "frac_reward_zero_std": 0.2, + "grad_norm": 24.25, + "kl": 0.9879746390506625, + "learning_rate": 1.6014805679062185e-06, + "loss": 0.6805807113647461, + "num_tokens": 2692495.0, + "reward": 0.7537500143051148, + "reward_std": 0.45700827836990354, + "rewards/reward_correct/mean": 0.6625, + "rewards/reward_correct/std": 0.44600183367729185, + "rewards/reward_format/mean": 0.09125000238418579, + "rewards/reward_format/std": 0.020775573328137398, + "step": 850, + "step_time": 7.615449336916209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 207.0, + "completions/max_terminated_length": 11.4, + "completions/mean_length": 26.3375, + "completions/mean_terminated_length": 11.026667404174805, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.16620325203984976, + "epoch": 0.6236323851203501, + "frac_reward_zero_std": 0.2, + "grad_norm": 14.4375, + "kl": 1.0341378800570964, + "learning_rate": 1.574654611650214e-06, + "loss": 0.5377101421356201, + "num_tokens": 2706394.0, + "reward": 0.3562500089406967, + "reward_std": 0.42934694290161135, + "rewards/reward_correct/mean": 0.2625, + "rewards/reward_correct/std": 0.42509101033210756, + "rewards/reward_format/mean": 0.09375000149011611, + "rewards/reward_format/std": 0.02183130122721195, + "step": 855, + "step_time": 9.28058768901974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0375, + "completions/max_length": 109.4, + "completions/max_terminated_length": 12.6, + "completions/mean_length": 20.425, + "completions/mean_terminated_length": 11.253571701049804, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.10414154501631856, + "epoch": 0.6272793581327498, + "frac_reward_zero_std": 0.3, + "grad_norm": 16.625, + "kl": 1.111027793586254, + "learning_rate": 1.5479515540408751e-06, + "loss": 0.46784439086914065, + "num_tokens": 2719268.0, + "reward": 0.6837500214576722, + "reward_std": 0.4920764327049255, + "rewards/reward_correct/mean": 0.5875, + "rewards/reward_correct/std": 0.4884649932384491, + "rewards/reward_format/mean": 0.09625000208616256, + "rewards/reward_format/std": 0.011831301078200341, + "step": 860, + "step_time": 5.883879956416786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 239.4, + "completions/max_terminated_length": 118.0, + "completions/mean_length": 33.2375, + "completions/mean_terminated_length": 18.15297660827637, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.22872643619775773, + "epoch": 0.6309263311451495, + "frac_reward_zero_std": 0.3, + "grad_norm": 9.125, + "kl": 0.8511719422414898, + "learning_rate": 1.5213749416130225e-06, + "loss": 0.31956512928009034, + "num_tokens": 2734887.0, + "reward": 0.6062500238418579, + "reward_std": 0.4939933180809021, + "rewards/reward_correct/mean": 0.5125, + "rewards/reward_correct/std": 0.4879140734672546, + "rewards/reward_format/mean": 0.09375000298023224, + "rewards/reward_format/std": 0.018662602081894873, + "step": 865, + "step_time": 10.53759575355798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0375, + "completions/max_length": 158.4, + "completions/max_terminated_length": 12.8, + "completions/mean_length": 20.275, + "completions/mean_terminated_length": 11.09333381652832, + "completions/min_length": 10.6, + "completions/min_terminated_length": 10.6, + "entropy": 0.12340900329872966, + "epoch": 0.6345733041575492, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.34375, + "kl": 1.222047933936119, + "learning_rate": 1.4949283041078125e-06, + "loss": 0.3189678192138672, + "num_tokens": 2753509.0, + "reward": 0.6212500184774399, + "reward_std": 0.3751548707485199, + "rewards/reward_correct/mean": 0.525, + "rewards/reward_correct/std": 0.3715925633907318, + "rewards/reward_format/mean": 0.09625000059604645, + "rewards/reward_format/std": 0.015000000223517418, + "step": 870, + "step_time": 7.870939800702035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0375, + "completions/max_length": 158.2, + "completions/max_terminated_length": 12.6, + "completions/mean_length": 20.35, + "completions/mean_terminated_length": 11.1725004196167, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.12726073777303099, + "epoch": 0.6382202771699489, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.21875, + "kl": 1.0199619920924305, + "learning_rate": 1.468615154003935e-06, + "loss": 0.6692329406738281, + "num_tokens": 2773305.0, + "reward": 0.4712500125169754, + "reward_std": 0.4628027558326721, + "rewards/reward_correct/mean": 0.375, + "rewards/reward_correct/std": 0.45902862548828127, + "rewards/reward_format/mean": 0.09625000059604645, + "rewards/reward_format/std": 0.015000000223517418, + "step": 875, + "step_time": 7.794203354604543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 61.0, + "completions/max_terminated_length": 12.4, + "completions/mean_length": 14.225, + "completions/mean_terminated_length": 11.165833473205566, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.11931565562263131, + "epoch": 0.6418672501823487, + "frac_reward_zero_std": 0.4, + "grad_norm": 30.25, + "kl": 1.0463667506352068, + "learning_rate": 1.4424389860511107e-06, + "loss": 0.32498223781585694, + "num_tokens": 2787379.0, + "reward": 0.5987500190734864, + "reward_std": 0.39922463297843935, + "rewards/reward_correct/mean": 0.5, + "rewards/reward_correct/std": 0.3964305937290192, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 880, + "step_time": 4.161012599989772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.6, + "completions/max_terminated_length": 13.6, + "completions/mean_length": 11.2, + "completions/mean_terminated_length": 11.2, + "completions/min_length": 10.2, + "completions/min_terminated_length": 10.2, + "entropy": 0.14874349618330598, + "epoch": 0.6455142231947484, + "frac_reward_zero_std": 0.4, + "grad_norm": 11.625, + "kl": 1.3606631740927697, + "learning_rate": 1.4164032768059389e-06, + "loss": 0.056450831890106204, + "num_tokens": 2802707.0, + "reward": 0.6500000208616257, + "reward_std": 0.36580801010131836, + "rewards/reward_correct/mean": 0.55, + "rewards/reward_correct/std": 0.3658080160617828, + "rewards/reward_format/mean": 0.10000000149011612, + "rewards/reward_format/std": 0.0, + "step": 885, + "step_time": 2.4683469166979193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.075, + "completions/max_length": 207.8, + "completions/max_terminated_length": 13.8, + "completions/mean_length": 29.5875, + "completions/mean_terminated_length": 11.234423637390137, + "completions/min_length": 10.6, + "completions/min_terminated_length": 10.6, + "entropy": 0.16692863404750824, + "epoch": 0.6491611962071481, + "frac_reward_zero_std": 0.3, "grad_norm": 31.125, - "learning_rate": 7.916666666666667e-06, - "loss": 2.708984375, - "mean_token_accuracy": 0.5555679220706224, - "num_tokens": 48215.0, - "step": 20 + "kl": 1.1874685259535909, + "learning_rate": 1.3905114841701622e-06, + "loss": 1.18980712890625, + "num_tokens": 2815530.0, + "reward": 0.5800000041723251, + "reward_std": 0.19825477786362172, + "rewards/reward_correct/mean": 0.4875, + "rewards/reward_correct/std": 0.18062257766723633, + "rewards/reward_format/mean": 0.09249999970197678, + "rewards/reward_format/std": 0.02306225784122944, + "step": 890, + "step_time": 9.297081774659455 }, { - "entropy": 1.2782868981361388, - "epoch": 0.19448946515397084, - "grad_norm": 15.0, - "learning_rate": 9.996828558775486e-06, - "loss": 1.8321617126464844, - "mean_token_accuracy": 0.6394873388111592, - "num_tokens": 71706.0, - "step": 30 + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0875, + "completions/max_length": 158.2, + "completions/max_terminated_length": 16.4, + "completions/mean_length": 32.925, + "completions/mean_terminated_length": 11.573654174804688, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.18815375985577704, + "epoch": 0.6528081692195478, + "frac_reward_zero_std": 0.3, + "grad_norm": 10.75, + "kl": 1.097396606579423, + "learning_rate": 1.364767046931411e-06, + "loss": 0.4671913146972656, + "num_tokens": 2830164.0, + "reward": 0.5537500143051147, + "reward_std": 0.518068790435791, + "rewards/reward_correct/mean": 0.4625, + "rewards/reward_correct/std": 0.5088609337806702, + "rewards/reward_format/mean": 0.09125000238418579, + "rewards/reward_format/std": 0.021124516800045966, + "step": 895, + "step_time": 7.5685260068625215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 68.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 17.775, + "completions/mean_terminated_length": 11.650000190734863, + "completions/min_length": 10.6, + "completions/min_terminated_length": 10.6, + "entropy": 0.13993122894316912, + "epoch": 0.6564551422319475, + "frac_reward_zero_std": 0.4, + "grad_norm": 25.875, + "kl": 1.0716644663363695, + "learning_rate": 1.3391733843064824e-06, + "loss": 0.24156837463378905, + "num_tokens": 2844250.0, + "reward": 0.5725000262260437, + "reward_std": 0.4568935751914978, + "rewards/reward_correct/mean": 0.475, + "rewards/reward_correct/std": 0.4546846568584442, + "rewards/reward_format/mean": 0.0975000023841858, + "rewards/reward_format/std": 0.006831301003694534, + "step": 900, + "step_time": 4.354886823520064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 110.0, + "completions/max_terminated_length": 12.6, + "completions/mean_length": 26.4875, + "completions/mean_terminated_length": 11.180833625793458, + "completions/min_length": 10.4, + "completions/min_terminated_length": 10.4, + "entropy": 0.1504152875393629, + "epoch": 0.6601021152443471, + "frac_reward_zero_std": 0.5, + "grad_norm": 12.0625, + "kl": 1.1662491869181395, + "learning_rate": 1.3137338954872242e-06, + "loss": 0.49866552352905275, + "num_tokens": 2860537.0, + "reward": 0.41875001341104506, + "reward_std": 0.4104991316795349, + "rewards/reward_correct/mean": 0.325, + "rewards/reward_correct/std": 0.4049390316009521, + "rewards/reward_format/mean": 0.09375000149011611, + "rewards/reward_format/std": 0.013944272324442864, + "step": 905, + "step_time": 5.8045944008976225 }, { - "entropy": 1.2274531334638596, - "epoch": 0.2593192868719611, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 110.0, + "completions/max_terminated_length": 61.2, + "completions/mean_length": 20.4375, + "completions/mean_terminated_length": 17.37583351135254, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.2185256845317781, + "epoch": 0.6637490882567469, + "frac_reward_zero_std": 0.3, "grad_norm": 11.9375, - "learning_rate": 9.97148116317027e-06, - "loss": 1.3626604080200195, - "mean_token_accuracy": 0.7182034332305193, - "num_tokens": 94417.0, - "step": 40 + "kl": 1.1813059832900763, + "learning_rate": 1.288451959189072e-06, + "loss": -0.05511292815208435, + "num_tokens": 2872996.0, + "reward": 0.5237500160932541, + "reward_std": 0.43575005531311034, + "rewards/reward_correct/mean": 0.425, + "rewards/reward_correct/std": 0.434497195482254, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 910, + "step_time": 5.847421420551837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 166.8, + "completions/max_terminated_length": 70.8, + "completions/mean_length": 30.6, + "completions/mean_terminated_length": 15.507820892333985, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.24432018604129552, + "epoch": 0.6673960612691466, + "frac_reward_zero_std": 0.3, + "grad_norm": 13.75, + "kl": 0.9329290691763162, + "learning_rate": 1.263330933202313e-06, + "loss": 0.5744418621063232, + "num_tokens": 2886484.0, + "reward": 0.4937500238418579, + "reward_std": 0.4854832112789154, + "rewards/reward_correct/mean": 0.4, + "rewards/reward_correct/std": 0.47909392714500426, + "rewards/reward_format/mean": 0.09375000149011611, + "rewards/reward_format/std": 0.01806225851178169, + "step": 915, + "step_time": 7.905635695718229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 60.8, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 17.2, + "completions/mean_terminated_length": 11.07857151031494, + "completions/min_length": 10.6, + "completions/min_terminated_length": 10.6, + "entropy": 0.15780966076999903, + "epoch": 0.6710430342815463, + "frac_reward_zero_std": 0.1, + "grad_norm": 25.25, + "kl": 1.1635019179433583, + "learning_rate": 1.2383741539461206e-06, + "loss": 0.27414746284484864, + "num_tokens": 2902748.0, + "reward": 0.4350000262260437, + "reward_std": 0.47432781457901, + "rewards/reward_correct/mean": 0.3375, + "rewards/reward_correct/std": 0.4715570867061615, + "rewards/reward_format/mean": 0.0975000023841858, + "rewards/reward_format/std": 0.006831301003694534, + "step": 920, + "step_time": 4.143374693207443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 114.0, + "completions/max_terminated_length": 17.2, + "completions/mean_length": 26.775, + "completions/mean_terminated_length": 11.490000152587891, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.17352617997676134, + "epoch": 0.674690007293946, + "frac_reward_zero_std": 0.5, + "grad_norm": 14.625, + "kl": 0.9229282831773162, + "learning_rate": 1.213584936025435e-06, + "loss": 0.5839475154876709, + "num_tokens": 2915090.0, + "reward": 0.4687500104308128, + "reward_std": 0.2596869826316833, + "rewards/reward_correct/mean": 0.375, + "rewards/reward_correct/std": 0.24574271440505982, + "rewards/reward_format/mean": 0.09375000149011611, + "rewards/reward_format/std": 0.013944272324442864, + "step": 925, + "step_time": 5.91396897546947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0375, + "completions/max_length": 109.0, + "completions/max_terminated_length": 11.6, + "completions/mean_length": 20.175, + "completions/mean_terminated_length": 10.989286041259765, + "completions/min_length": 10.6, + "completions/min_terminated_length": 10.6, + "entropy": 0.1421708886977285, + "epoch": 0.6783369803063457, + "frac_reward_zero_std": 0.4, + "grad_norm": 32.75, + "kl": 1.306141108646989, + "learning_rate": 1.188966571790738e-06, + "loss": 0.564964485168457, + "num_tokens": 2929032.0, + "reward": 0.6087500274181366, + "reward_std": 0.4773897409439087, + "rewards/reward_correct/mean": 0.5125, + "rewards/reward_correct/std": 0.4740620791912079, + "rewards/reward_format/mean": 0.09625000208616256, + "rewards/reward_format/std": 0.011831301078200341, + "step": 930, + "step_time": 5.844562582299114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 79.2, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 15.6, + "completions/mean_terminated_length": 12.55666675567627, + "completions/min_length": 10.6, + "completions/min_terminated_length": 10.6, + "entropy": 0.1395382797345519, + "epoch": 0.6819839533187454, + "frac_reward_zero_std": 0.7, + "grad_norm": 8.375, + "kl": 1.2526559926569463, + "learning_rate": 1.1645223309007807e-06, + "loss": 0.31260497570037843, + "num_tokens": 2942584.0, + "reward": 0.5862500190734863, + "reward_std": 0.396744542196393, + "rewards/reward_correct/mean": 0.4875, + "rewards/reward_correct/std": 0.3917445480823517, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 935, + "step_time": 4.799822365492583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0875, + "completions/max_length": 159.0, + "completions/max_terminated_length": 13.2, + "completions/mean_length": 32.8125, + "completions/mean_terminated_length": 11.4247257232666, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.18375055314972996, + "epoch": 0.6856309263311452, + "frac_reward_zero_std": 0.4, + "grad_norm": 23.625, + "kl": 1.1651625875383615, + "learning_rate": 1.1402554598883307e-06, + "loss": 0.591346549987793, + "num_tokens": 2971513.0, + "reward": 0.6662500202655792, + "reward_std": 0.3669509470462799, + "rewards/reward_correct/mean": 0.575, + "rewards/reward_correct/std": 0.35669131875038146, + "rewards/reward_format/mean": 0.0912500038743019, + "rewards/reward_format/std": 0.02172486037015915, + "step": 940, + "step_time": 8.087260065786541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 110.4, + "completions/max_terminated_length": 12.8, + "completions/mean_length": 17.425, + "completions/mean_terminated_length": 11.301667022705079, + "completions/min_length": 10.6, + "completions/min_terminated_length": 10.6, + "entropy": 0.13547584163025023, + "epoch": 0.6892778993435449, + "frac_reward_zero_std": 0.4, + "grad_norm": 14.625, + "kl": 1.1704786384478212, + "learning_rate": 1.1161691817289848e-06, + "loss": 0.4187188148498535, + "num_tokens": 2985827.0, + "reward": 0.6475000143051147, + "reward_std": 0.39262746572494506, + "rewards/reward_correct/mean": 0.55, + "rewards/reward_correct/std": 0.3896512031555176, + "rewards/reward_format/mean": 0.09750000089406967, + "rewards/reward_format/std": 0.010000000149011612, + "step": 945, + "step_time": 5.889890012890101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 63.0, + "completions/max_terminated_length": 14.8, + "completions/mean_length": 17.475, + "completions/mean_terminated_length": 11.37321434020996, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.13779198052361608, + "epoch": 0.6929248723559446, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.16796875, + "kl": 1.2816429644823075, + "learning_rate": 1.0922666954131173e-06, + "loss": 0.21606624126434326, + "num_tokens": 3000129.0, + "reward": 0.6225000217556953, + "reward_std": 0.26900014877319334, + "rewards/reward_correct/mean": 0.525, + "rewards/reward_correct/std": 0.2665252387523651, + "rewards/reward_format/mean": 0.0975000023841858, + "rewards/reward_format/std": 0.006831301003694534, + "step": 950, + "step_time": 4.198133673332632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 63.8, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 14.475, + "completions/mean_terminated_length": 11.414166831970215, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.1219971620477736, + "epoch": 0.6965718453683443, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.5703125, + "kl": 1.2146267022937536, + "learning_rate": 1.0685511755210054e-06, + "loss": 0.17830634117126465, + "num_tokens": 3012567.0, + "reward": 0.6737500190734863, + "reward_std": 0.3974202036857605, + "rewards/reward_correct/mean": 0.575, + "rewards/reward_correct/std": 0.39642446041107177, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 955, + "step_time": 4.147014885954559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 110.8, + "completions/max_terminated_length": 13.4, + "completions/mean_length": 17.3625, + "completions/mean_terminated_length": 11.241666984558105, + "completions/min_length": 10.6, + "completions/min_terminated_length": 10.6, + "entropy": 0.1321013109991327, + "epoch": 0.700218818380744, + "frac_reward_zero_std": 0.8, + "grad_norm": 0.55859375, + "kl": 1.061482924595475, + "learning_rate": 1.0450257718012042e-06, + "loss": 0.4734503746032715, + "num_tokens": 3028036.0, + "reward": 0.6475000098347664, + "reward_std": 0.34497573375701907, + "rewards/reward_correct/mean": 0.55, + "rewards/reward_correct/std": 0.3371816873550415, + "rewards/reward_format/mean": 0.09750000089406967, + "rewards/reward_format/std": 0.010000000149011612, + "step": 960, + "step_time": 5.983434215933085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 179.6, + "completions/max_terminated_length": 82.4, + "completions/mean_length": 21.7625, + "completions/mean_terminated_length": 15.641666984558105, + "completions/min_length": 10.4, + "completions/min_terminated_length": 10.4, + "entropy": 0.1807787662371993, + "epoch": 0.7038657913931436, + "frac_reward_zero_std": 0.1, + "grad_norm": 37.5, + "kl": 1.2269989637658, + "learning_rate": 1.021693608752215e-06, + "loss": 0.18453656435012816, + "num_tokens": 3044201.0, + "reward": 0.5225000143051147, + "reward_std": 0.4964973211288452, + "rewards/reward_correct/mean": 0.425, + "rewards/reward_correct/std": 0.49421406388282774, + "rewards/reward_format/mean": 0.09750000089406967, + "rewards/reward_format/std": 0.010000000149011612, + "step": 965, + "step_time": 8.416685522347688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0375, + "completions/max_length": 181.8, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 21.9875, + "completions/mean_terminated_length": 12.8100004196167, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2144255481660366, + "epoch": 0.7075127644055434, + "frac_reward_zero_std": 0.3, + "grad_norm": 11.4375, + "kl": 1.0428111262619495, + "learning_rate": 9.985577852075099e-07, + "loss": 0.42957406044006347, + "num_tokens": 3056632.0, + "reward": 0.5212500154972076, + "reward_std": 0.4731999456882477, + "rewards/reward_correct/mean": 0.425, + "rewards/reward_correct/std": 0.4690875709056854, + "rewards/reward_format/mean": 0.09625000059604645, + "rewards/reward_format/std": 0.015000000223517418, + "step": 970, + "step_time": 8.459058383479714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05, + "completions/max_length": 168.2, + "completions/max_terminated_length": 21.8, + "completions/mean_length": 24.0875, + "completions/mean_terminated_length": 11.852857398986817, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.17413299554027617, + "epoch": 0.7111597374179431, + "frac_reward_zero_std": 0.5, + "grad_norm": 29.125, + "kl": 0.9784929996356369, + "learning_rate": 9.756213739239632e-07, + "loss": 0.7012048721313476, + "num_tokens": 3071191.0, + "reward": 0.42000001668930054, + "reward_std": 0.38715612776577474, + "rewards/reward_correct/mean": 0.325, + "rewards/reward_correct/std": 0.3796448469161987, + "rewards/reward_format/mean": 0.09500000178813935, + "rewards/reward_format/std": 0.016831301152706146, + "step": 975, + "step_time": 7.902792886644602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0375, + "completions/max_length": 159.4, + "completions/max_terminated_length": 12.8, + "completions/mean_length": 20.3875, + "completions/mean_terminated_length": 11.201667213439942, + "completions/min_length": 10.6, + "completions/min_terminated_length": 10.6, + "entropy": 0.12266774578019976, + "epoch": 0.7148067104303428, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5, + "kl": 1.1709866758435965, + "learning_rate": 9.528874211737471e-07, + "loss": 0.6103484153747558, + "num_tokens": 3085086.0, + "reward": 0.7712500095367432, + "reward_std": 0.40152530670166015, + "rewards/reward_correct/mean": 0.675, + "rewards/reward_correct/std": 0.39600183367729186, + "rewards/reward_format/mean": 0.09625000059604645, + "rewards/reward_format/std": 0.015000000223517418, + "step": 980, + "step_time": 7.665781799331308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 60.6, + "completions/max_terminated_length": 11.8, + "completions/mean_length": 14.15, + "completions/mean_terminated_length": 11.08833351135254, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.11113924738019705, + "epoch": 0.7184536834427425, + "frac_reward_zero_std": 0.6, + "grad_norm": 10.75, + "kl": 1.1140049666166305, + "learning_rate": 9.303589463397441e-07, + "loss": 0.12303996086120605, + "num_tokens": 3103258.0, + "reward": 0.7487500190734864, + "reward_std": 0.3056066811084747, + "rewards/reward_correct/mean": 0.65, + "rewards/reward_correct/std": 0.3049390256404877, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 985, + "step_time": 4.173900611326099 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05, + "completions/max_length": 159.0, + "completions/max_terminated_length": 51.0, + "completions/mean_length": 26.0875, + "completions/mean_terminated_length": 14.002143287658692, + "completions/min_length": 10.6, + "completions/min_terminated_length": 10.6, + "entropy": 0.17991922618821263, + "epoch": 0.7221006564551422, + "frac_reward_zero_std": 0.4, + "grad_norm": 6.78125, + "kl": 0.999136315472424, + "learning_rate": 9.080389415145328e-07, + "loss": 0.32036216259002687, + "num_tokens": 3131401.0, + "reward": 0.7825000166893006, + "reward_std": 0.415789520740509, + "rewards/reward_correct/mean": 0.6875, + "rewards/reward_correct/std": 0.40945738554000854, + "rewards/reward_format/mean": 0.09500000178813935, + "rewards/reward_format/std": 0.016831301152706146, + "step": 990, + "step_time": 8.07879895698279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 68.4, + "completions/max_terminated_length": 19.6, + "completions/mean_length": 14.65, + "completions/mean_terminated_length": 11.58833351135254, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.12039419142529369, + "epoch": 0.7257476294675419, + "frac_reward_zero_std": 0.6, + "grad_norm": 17.625, + "kl": 0.9908271584659815, + "learning_rate": 8.85930371102994e-07, + "loss": 0.24936254024505616, + "num_tokens": 3142653.0, + "reward": 0.5487500160932541, + "reward_std": 0.4170133113861084, + "rewards/reward_correct/mean": 0.45, + "rewards/reward_correct/std": 0.4135048031806946, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 995, + "step_time": 4.369975199177861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 158.2, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 26.4, + "completions/mean_terminated_length": 11.10974407196045, + "completions/min_length": 10.4, + "completions/min_terminated_length": 10.4, + "entropy": 0.14572411840781568, + "epoch": 0.7293946024799417, + "frac_reward_zero_std": 0.1, + "grad_norm": 18.5, + "kl": 0.9908344244584442, + "learning_rate": 8.640361714286014e-07, + "loss": 0.3199794769287109, + "num_tokens": 3159965.0, + "reward": 0.618750023841858, + "reward_std": 0.5064317464828492, + "rewards/reward_correct/mean": 0.525, + "rewards/reward_correct/std": 0.499320787191391, + "rewards/reward_format/mean": 0.09375000149011611, + "rewards/reward_format/std": 0.01806225851178169, + "step": 1000, + "step_time": 7.6432202199473975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05, + "completions/max_length": 109.6, + "completions/max_terminated_length": 12.2, + "completions/mean_length": 23.4375, + "completions/mean_terminated_length": 11.194643020629883, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.15537974070757626, + "epoch": 0.7330415754923414, + "frac_reward_zero_std": 0.3, + "grad_norm": 36.25, + "kl": 1.0698428362607957, + "learning_rate": 8.423592503434288e-07, + "loss": 0.42586145401000974, + "num_tokens": 3173296.0, + "reward": 0.4075000211596489, + "reward_std": 0.3828831076622009, + "rewards/reward_correct/mean": 0.3125, + "rewards/reward_correct/std": 0.37734163403511045, + "rewards/reward_format/mean": 0.09500000327825546, + "rewards/reward_format/std": 0.013662602007389068, + "step": 1005, + "step_time": 5.783974103629589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0375, + "completions/max_length": 60.8, + "completions/max_terminated_length": 11.8, + "completions/mean_length": 20.125, + "completions/mean_terminated_length": 10.934615516662598, + "completions/min_length": 10.2, + "completions/min_terminated_length": 10.2, + "entropy": 0.14931348655372859, + "epoch": 0.7366885485047411, + "frac_reward_zero_std": 0.3, + "grad_norm": 23.75, + "kl": 1.037040038406849, + "learning_rate": 8.209024868419596e-07, + "loss": 0.2815678119659424, + "num_tokens": 3200210.0, + "reward": 0.6212500214576722, + "reward_std": 0.45014098286628723, + "rewards/reward_correct/mean": 0.525, + "rewards/reward_correct/std": 0.4472344994544983, + "rewards/reward_format/mean": 0.09625000208616256, + "rewards/reward_format/std": 0.00806225836277008, + "step": 1010, + "step_time": 4.542424210160971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 109.6, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 17.2125, + "completions/mean_terminated_length": 11.090000343322753, + "completions/min_length": 10.6, + "completions/min_terminated_length": 10.6, + "entropy": 0.11048957742750645, + "epoch": 0.7403355215171408, + "frac_reward_zero_std": 0.5, + "grad_norm": 14.25, + "kl": 0.9843526845797896, + "learning_rate": 7.996687306787054e-07, + "loss": 0.38713390827178956, + "num_tokens": 3211995.0, + "reward": 0.5100000143051148, + "reward_std": 0.39578779339790343, + "rewards/reward_correct/mean": 0.4125, + "rewards/reward_correct/std": 0.3939549446105957, + "rewards/reward_format/mean": 0.09750000089406967, + "rewards/reward_format/std": 0.010000000149011612, + "step": 1015, + "step_time": 5.846088585443795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05, + "completions/max_length": 158.4, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 23.2625, + "completions/mean_terminated_length": 11.013214683532714, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "entropy": 0.173574111610651, + "epoch": 0.7439824945295405, + "frac_reward_zero_std": 0.2, + "grad_norm": 11.4375, + "kl": 1.1816275650635362, + "learning_rate": 7.786608019897241e-07, + "loss": 0.47748708724975586, + "num_tokens": 3227256.0, + "reward": 0.44500001668930056, + "reward_std": 0.4806786775588989, + "rewards/reward_correct/mean": 0.35, + "rewards/reward_correct/std": 0.4758143723011017, + "rewards/reward_format/mean": 0.09500000178813935, + "rewards/reward_format/std": 0.016831301152706146, + "step": 1020, + "step_time": 7.6466738833114505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1125, + "completions/max_length": 207.0, + "completions/max_terminated_length": 57.0, + "completions/mean_length": 41.525, + "completions/mean_terminated_length": 14.19267463684082, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.1892614746466279, + "epoch": 0.7476294675419402, + "frac_reward_zero_std": 0.2, + "grad_norm": 20.125, + "kl": 1.0072221651673317, + "learning_rate": 7.578814909180668e-07, + "loss": 0.8787998199462891, + "num_tokens": 3251690.0, + "reward": 0.6262500286102295, + "reward_std": 0.4741853892803192, + "rewards/reward_correct/mean": 0.5375, + "rewards/reward_correct/std": 0.45677800178527833, + "rewards/reward_format/mean": 0.08875000327825547, + "rewards/reward_format/std": 0.027955817803740502, + "step": 1025, + "step_time": 9.90162706375122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 61.6, + "completions/max_terminated_length": 12.8, + "completions/mean_length": 14.2125, + "completions/mean_terminated_length": 11.147500038146973, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.13515494633466005, + "epoch": 0.75127644055434, + "frac_reward_zero_std": 0.2, + "grad_norm": 21.375, + "kl": 1.1974251046776772, + "learning_rate": 7.373335572432083e-07, + "loss": 0.33423709869384766, + "num_tokens": 3263059.0, + "reward": 0.5737500250339508, + "reward_std": 0.4456740617752075, + "rewards/reward_correct/mean": 0.475, + "rewards/reward_correct/std": 0.4446783006191254, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 1030, + "step_time": 4.137141246534884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05, + "completions/max_length": 158.2, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 23.5625, + "completions/mean_terminated_length": 11.350714683532715, + "completions/min_length": 10.6, + "completions/min_terminated_length": 10.6, + "entropy": 0.14335245164111257, + "epoch": 0.7549234135667396, + "frac_reward_zero_std": 0.3, + "grad_norm": 21.75, + "kl": 1.2266500808298588, + "learning_rate": 7.170197300145093e-07, + "loss": 0.7869414329528809, + "num_tokens": 3278928.0, + "reward": 0.7825000166893006, + "reward_std": 0.39596893191337584, + "rewards/reward_correct/mean": 0.6875, + "rewards/reward_correct/std": 0.38135496377944944, + "rewards/reward_format/mean": 0.09500000178813935, + "rewards/reward_format/std": 0.016831301152706146, + "step": 1035, + "step_time": 7.642712612450123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0375, + "completions/max_length": 150.4, + "completions/max_terminated_length": 52.6, + "completions/mean_length": 22.7, + "completions/mean_terminated_length": 13.511667060852051, + "completions/min_length": 10.4, + "completions/min_terminated_length": 10.4, + "entropy": 0.15192348677664996, + "epoch": 0.7585703865791393, + "frac_reward_zero_std": 0.3, + "grad_norm": 27.25, + "kl": 1.213655634224415, + "learning_rate": 6.969427071887591e-07, + "loss": 0.6880573272705078, + "num_tokens": 3291016.0, + "reward": 0.5962500214576721, + "reward_std": 0.4363097190856934, + "rewards/reward_correct/mean": 0.5, + "rewards/reward_correct/std": 0.42883480787277223, + "rewards/reward_format/mean": 0.09625000208616256, + "rewards/reward_format/std": 0.011831301078200341, + "step": 1040, + "step_time": 7.16560839265585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.075, + "completions/max_length": 207.2, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 33.5875, + "completions/mean_terminated_length": 15.49333438873291, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.15568729294463993, + "epoch": 0.762217359591539, + "frac_reward_zero_std": 0.5, + "grad_norm": 30.375, + "kl": 1.0448514215648175, + "learning_rate": 6.771051552718569e-07, + "loss": 0.8683349609375, + "num_tokens": 3303975.0, + "reward": 0.530000002682209, + "reward_std": 0.36571626663208007, + "rewards/reward_correct/mean": 0.4375, + "rewards/reward_correct/std": 0.35493903160095214, + "rewards/reward_format/mean": 0.09249999970197678, + "rewards/reward_format/std": 0.02306225784122944, + "step": 1045, + "step_time": 9.333471669629215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 207.2, + "completions/max_terminated_length": 12.4, + "completions/mean_length": 26.4125, + "completions/mean_terminated_length": 11.103810119628907, + "completions/min_length": 10.2, + "completions/min_terminated_length": 10.2, + "entropy": 0.19476536866277455, + "epoch": 0.7658643326039387, + "frac_reward_zero_std": 0.3, + "grad_norm": 26.625, + "kl": 1.0343118201941253, + "learning_rate": 6.575097089646543e-07, + "loss": 0.9979419708251953, + "num_tokens": 3323376.0, + "reward": 0.38125001788139345, + "reward_std": 0.31709347739815713, + "rewards/reward_correct/mean": 0.2875, + "rewards/reward_correct/std": 0.3024695158004761, + "rewards/reward_format/mean": 0.09375000149011611, + "rewards/reward_format/std": 0.02183130122721195, + "step": 1050, + "step_time": 9.465577307716012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 60.4, + "completions/max_terminated_length": 11.6, + "completions/mean_length": 14.075, + "completions/mean_terminated_length": 11.013333511352538, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.1104262274224311, + "epoch": 0.7695113056163384, + "frac_reward_zero_std": 0.4, + "grad_norm": 31.25, + "kl": 1.2681043207645417, + "learning_rate": 6.381589708130356e-07, + "loss": 0.334909462928772, + "num_tokens": 3339486.0, + "reward": 0.7112500190734863, + "reward_std": 0.3883556306362152, + "rewards/reward_correct/mean": 0.6125, + "rewards/reward_correct/std": 0.3855616092681885, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 1055, + "step_time": 4.137268589437008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0875, + "completions/max_length": 256.0, + "completions/max_terminated_length": 13.2, + "completions/mean_length": 32.775, + "completions/mean_terminated_length": 11.368572235107422, + "completions/min_length": 10.6, + "completions/min_terminated_length": 10.6, + "entropy": 0.17338744243606924, + "epoch": 0.7731582786287381, + "frac_reward_zero_std": 0.3, + "grad_norm": 17.875, + "kl": 1.1310414992272855, + "learning_rate": 6.190555108622626e-07, + "loss": 1.0370418548583984, + "num_tokens": 3367684.0, + "reward": 0.40374999642372134, + "reward_std": 0.38583031184971334, + "rewards/reward_correct/mean": 0.3125, + "rewards/reward_correct/std": 0.3740620791912079, + "rewards/reward_format/mean": 0.09125000238418579, + "rewards/reward_format/std": 0.028662602230906487, + "step": 1060, + "step_time": 12.054812138155103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0375, + "completions/max_length": 158.0, + "completions/max_terminated_length": 58.6, + "completions/mean_length": 23.2625, + "completions/mean_terminated_length": 14.280000495910645, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.14124670568853617, + "epoch": 0.7768052516411379, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.0927734375, + "kl": 1.0903849417343736, + "learning_rate": 6.002018663156375e-07, + "loss": 0.06624346971511841, + "num_tokens": 3380633.0, + "reward": 0.5837500154972076, + "reward_std": 0.2722975671291351, + "rewards/reward_correct/mean": 0.4875, + "rewards/reward_correct/std": 0.27006530165672304, + "rewards/reward_format/mean": 0.09625000059604645, + "rewards/reward_format/std": 0.015000000223517418, + "step": 1065, + "step_time": 7.588603504933417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0375, + "completions/max_length": 110.6, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 20.4875, + "completions/mean_terminated_length": 11.303452682495116, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.1466917991405353, + "epoch": 0.7804522246535376, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.314453125, + "kl": 1.1688736403360962, + "learning_rate": 5.816005411975254e-07, + "loss": 0.5652116775512696, + "num_tokens": 3396168.0, + "reward": 0.6837500214576722, + "reward_std": 0.48513195514678953, + "rewards/reward_correct/mean": 0.5875, + "rewards/reward_correct/std": 0.48062118887901306, + "rewards/reward_format/mean": 0.09625000208616256, + "rewards/reward_format/std": 0.011831301078200341, + "step": 1070, + "step_time": 5.913995978236199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 119.0, + "completions/max_terminated_length": 21.8, + "completions/mean_length": 17.9625, + "completions/mean_terminated_length": 11.844166946411132, + "completions/min_length": 10.6, + "completions/min_terminated_length": 10.6, + "entropy": 0.12736709106247873, + "epoch": 0.7840991976659373, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.26171875, + "kl": 1.095674180611968, + "learning_rate": 5.632540060207875e-07, + "loss": 0.6179062366485596, + "num_tokens": 3408477.0, + "reward": 0.8350000143051147, + "reward_std": 0.18607617020606995, + "rewards/reward_correct/mean": 0.7375, + "rewards/reward_correct/std": 0.1839021325111389, + "rewards/reward_format/mean": 0.09750000089406967, + "rewards/reward_format/std": 0.010000000149011612, + "step": 1075, + "step_time": 6.220561241731048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.075, + "completions/max_length": 110.2, + "completions/max_terminated_length": 12.2, + "completions/mean_length": 29.4875, + "completions/mean_terminated_length": 11.104166984558105, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.13068979703821243, + "epoch": 0.787746170678337, + "frac_reward_zero_std": 0.5, + "grad_norm": 8.4375, + "kl": 1.1333220265805721, + "learning_rate": 5.451646974586638e-07, + "loss": 0.27587137222290037, + "num_tokens": 3424820.0, + "reward": 0.48000001907348633, + "reward_std": 0.40973682403564454, + "rewards/reward_correct/mean": 0.3875, + "rewards/reward_correct/std": 0.40525074005126954, + "rewards/reward_format/mean": 0.09250000268220901, + "rewards/reward_format/std": 0.015775573253631592, + "step": 1080, + "step_time": 5.878227005898952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05, + "completions/max_length": 110.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 23.475, + "completions/mean_terminated_length": 11.230000114440918, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.1353360296227038, + "epoch": 0.7913931436907367, + "frac_reward_zero_std": 0.4, + "grad_norm": 13.3125, + "kl": 0.9516539743170143, + "learning_rate": 5.273350180211453e-07, + "loss": 0.2977238416671753, + "num_tokens": 3437626.0, + "reward": 0.4575000196695328, + "reward_std": 0.43776618242263793, + "rewards/reward_correct/mean": 0.3625, + "rewards/reward_correct/std": 0.4342077076435089, + "rewards/reward_format/mean": 0.09500000178813935, + "rewards/reward_format/std": 0.013062258437275886, + "step": 1085, + "step_time": 5.909720999933779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 61.0, + "completions/max_terminated_length": 12.2, + "completions/mean_length": 14.1875, + "completions/mean_terminated_length": 11.125833511352539, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.10449443897232413, + "epoch": 0.7950401167031363, + "frac_reward_zero_std": 0.5, + "grad_norm": 27.375, + "kl": 1.1727975588291883, + "learning_rate": 5.097673357358906e-07, + "loss": 0.32521967887878417, + "num_tokens": 3450721.0, + "reward": 0.636250014603138, + "reward_std": 0.35900322198867796, + "rewards/reward_correct/mean": 0.5375, + "rewards/reward_correct/std": 0.3554946959018707, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 1090, + "step_time": 4.216670380905271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.6, + "completions/max_terminated_length": 13.6, + "completions/mean_length": 11.65, + "completions/mean_terminated_length": 11.65, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.1377808633260429, + "epoch": 0.7986870897155361, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.19140625, + "kl": 1.2800965502858161, + "learning_rate": 4.924639838337184e-07, + "loss": 0.05056450366973877, + "num_tokens": 3475005.0, + "reward": 0.5375000193715096, + "reward_std": 0.4055813789367676, + "rewards/reward_correct/mean": 0.4375, + "rewards/reward_correct/std": 0.4055813789367676, + "rewards/reward_format/mean": 0.10000000149011612, + "rewards/reward_format/std": 0.0, + "step": 1095, + "step_time": 2.649897381849587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 158.8, + "completions/max_terminated_length": 14.8, + "completions/mean_length": 26.55, + "completions/mean_terminated_length": 11.2621431350708, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.16694942396134138, + "epoch": 0.8023340627279358, + "frac_reward_zero_std": 0.2, + "grad_norm": 23.0, + "kl": 1.2879412140697242, + "learning_rate": 4.7542726043872223e-07, + "loss": 0.5643815517425537, + "num_tokens": 3489697.0, + "reward": 0.5312500238418579, + "reward_std": 0.5059142410755157, + "rewards/reward_correct/mean": 0.4375, + "rewards/reward_correct/std": 0.5001308262348175, + "rewards/reward_format/mean": 0.09375000298023224, + "rewards/reward_format/std": 0.018662602081894873, + "step": 1100, + "step_time": 7.680788661539554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 164.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 26.85, + "completions/mean_terminated_length": 11.543718338012695, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.19747618471737952, + "epoch": 0.8059810357403355, + "frac_reward_zero_std": 0.4, + "grad_norm": 19.25, + "kl": 1.118091570958495, + "learning_rate": 4.586594282630466e-07, + "loss": 0.456316614151001, + "num_tokens": 3501629.0, + "reward": 0.4437500059604645, + "reward_std": 0.3370595008134842, + "rewards/reward_correct/mean": 0.35, + "rewards/reward_correct/std": 0.3246281623840332, + "rewards/reward_format/mean": 0.09375, + "rewards/reward_format/std": 0.018062257766723634, + "step": 1105, + "step_time": 7.802947326749563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 62.2, + "completions/max_terminated_length": 13.2, + "completions/mean_length": 14.2625, + "completions/mean_terminated_length": 11.197500038146973, + "completions/min_length": 10.4, + "completions/min_terminated_length": 10.4, + "entropy": 0.12731944490224123, + "epoch": 0.8096280087527352, + "frac_reward_zero_std": 0.5, + "grad_norm": 27.25, + "kl": 1.3676513850688934, + "learning_rate": 4.4216271430637005e-07, + "loss": 0.34341113567352294, + "num_tokens": 3514842.0, + "reward": 0.49875001013278963, + "reward_std": 0.25372239351272585, + "rewards/reward_correct/mean": 0.4, + "rewards/reward_correct/std": 0.25246951580047605, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 1110, + "step_time": 4.177437866665423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 110.4, + "completions/max_terminated_length": 23.4, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 11.924167060852051, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.14376077065244316, + "epoch": 0.8132749817651349, + "frac_reward_zero_std": 0.5, + "grad_norm": 27.125, + "kl": 1.001600916683674, + "learning_rate": 4.259393095601258e-07, + "loss": 0.5589489936828613, + "num_tokens": 3528354.0, + "reward": 0.5475000098347664, + "reward_std": 0.3620019435882568, + "rewards/reward_correct/mean": 0.45, + "rewards/reward_correct/std": 0.35574907064437866, + "rewards/reward_format/mean": 0.09750000089406967, + "rewards/reward_format/std": 0.010000000149011612, + "step": 1115, + "step_time": 5.851181945577264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05, + "completions/max_length": 109.8, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 23.55, + "completions/mean_terminated_length": 11.332143020629882, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.17104517347179354, + "epoch": 0.8169219547775346, + "frac_reward_zero_std": 0.1, + "grad_norm": 21.375, + "kl": 0.9864880526438355, + "learning_rate": 4.0999136871651244e-07, + "loss": 0.4844799041748047, + "num_tokens": 3543014.0, + "reward": 0.32000001072883605, + "reward_std": 0.35310750007629393, + "rewards/reward_correct/mean": 0.225, + "rewards/reward_correct/std": 0.3506817400455475, + "rewards/reward_format/mean": 0.09500000327825546, + "rewards/reward_format/std": 0.013662602007389068, + "step": 1120, + "step_time": 5.890768299996853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.075, + "completions/max_length": 207.2, + "completions/max_terminated_length": 42.6, + "completions/mean_length": 31.3875, + "completions/mean_terminated_length": 13.149762725830078, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.19566917652264237, + "epoch": 0.8205689277899344, + "frac_reward_zero_std": 0.3, + "grad_norm": 17.0, + "kl": 0.9735424511134625, + "learning_rate": 3.943210098823158e-07, + "loss": 0.8205422401428223, + "num_tokens": 3567189.0, + "reward": 0.7550000190734864, + "reward_std": 0.39593332409858706, + "rewards/reward_correct/mean": 0.6625, + "rewards/reward_correct/std": 0.37883480787277224, + "rewards/reward_format/mean": 0.09250000268220901, + "rewards/reward_format/std": 0.023662602156400682, + "step": 1125, + "step_time": 9.867853231169283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05, + "completions/max_length": 158.2, + "completions/max_terminated_length": 11.6, + "completions/mean_length": 23.275, + "completions/mean_terminated_length": 11.026905059814453, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.1576895572245121, + "epoch": 0.8242159008023341, + "frac_reward_zero_std": 0.2, + "grad_norm": 15.125, + "kl": 1.3187669806182385, + "learning_rate": 3.7893031429759724e-07, + "loss": 0.6695477485656738, + "num_tokens": 3583443.0, + "reward": 0.44500001668930056, + "reward_std": 0.4860190153121948, + "rewards/reward_correct/mean": 0.35, + "rewards/reward_correct/std": 0.48130432367324827, + "rewards/reward_format/mean": 0.09500000178813935, + "rewards/reward_format/std": 0.016831301152706146, + "step": 1130, + "step_time": 7.628186381235719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 102.0, + "completions/max_terminated_length": 89.0, + "completions/mean_length": 22.2, + "completions/mean_terminated_length": 16.408928680419923, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.16459039226174355, + "epoch": 0.8278628738147338, + "frac_reward_zero_std": 0.5, + "grad_norm": 7.0, + "kl": 1.1098931267857552, + "learning_rate": 3.638213260592785e-07, + "loss": 0.29524142742156984, + "num_tokens": 3607619.0, + "reward": 0.5225000202655792, + "reward_std": 0.40159628391265867, + "rewards/reward_correct/mean": 0.425, + "rewards/reward_correct/std": 0.39476498365402224, + "rewards/reward_format/mean": 0.0975000023841858, + "rewards/reward_format/std": 0.006831301003694534, + "step": 1135, + "step_time": 5.897761693410575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0375, + "completions/max_length": 158.2, + "completions/max_terminated_length": 12.4, + "completions/mean_length": 20.3625, + "completions/mean_terminated_length": 11.190000534057617, + "completions/min_length": 10.4, + "completions/min_terminated_length": 10.4, + "entropy": 0.13526572035625578, + "epoch": 0.8315098468271335, + "frac_reward_zero_std": 0.3, + "grad_norm": 22.125, + "kl": 1.0481186473742128, + "learning_rate": 3.489960518496521e-07, + "loss": 0.5957928657531738, + "num_tokens": 3623224.0, + "reward": 0.7337500154972076, + "reward_std": 0.42885017991065977, + "rewards/reward_correct/mean": 0.6375, + "rewards/reward_correct/std": 0.424577522277832, + "rewards/reward_format/mean": 0.09625000059604645, + "rewards/reward_format/std": 0.015000000223517418, + "step": 1140, + "step_time": 7.660480500571429 }, { - "entropy": 1.3004080675542355, - "epoch": 0.3241491085899514, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 109.2, + "completions/max_terminated_length": 46.4, + "completions/mean_length": 19.4625, + "completions/mean_terminated_length": 13.493333625793458, + "completions/min_length": 10.6, + "completions/min_terminated_length": 10.6, + "entropy": 0.12261578012257815, + "epoch": 0.8351568198395332, + "frac_reward_zero_std": 0.3, "grad_norm": 12.9375, - "learning_rate": 9.920914951609189e-06, - "loss": 1.4310997009277344, - "mean_token_accuracy": 0.7223183058202267, - "num_tokens": 118707.0, - "step": 50 + "kl": 1.1614298637956382, + "learning_rate": 3.344564606698722e-07, + "loss": 0.2668048858642578, + "num_tokens": 3637165.0, + "reward": 0.6350000202655792, + "reward_std": 0.3882540941238403, + "rewards/reward_correct/mean": 0.5375, + "rewards/reward_correct/std": 0.3855616092681885, + "rewards/reward_format/mean": 0.09750000089406967, + "rewards/reward_format/std": 0.010000000149011612, + "step": 1145, + "step_time": 5.807115223072469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.075, + "completions/max_length": 207.0, + "completions/max_terminated_length": 51.8, + "completions/mean_length": 32.8125, + "completions/mean_terminated_length": 14.926548194885253, + "completions/min_length": 10.6, + "completions/min_terminated_length": 10.6, + "entropy": 0.1611309928819537, + "epoch": 0.8388037928519329, + "frac_reward_zero_std": 0.4, + "grad_norm": 29.625, + "kl": 1.0498193325474858, + "learning_rate": 3.2020448357843957e-07, + "loss": 0.6072197914123535, + "num_tokens": 3661006.0, + "reward": 0.45375001430511475, + "reward_std": 0.4571046769618988, + "rewards/reward_correct/mean": 0.3625, + "rewards/reward_correct/std": 0.4492813885211945, + "rewards/reward_format/mean": 0.09125000089406968, + "rewards/reward_format/std": 0.024893558770418166, + "step": 1150, + "step_time": 9.794787894934416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 158.2, + "completions/max_terminated_length": 36.6, + "completions/mean_length": 27.9125, + "completions/mean_terminated_length": 12.962692642211914, + "completions/min_length": 10.6, + "completions/min_terminated_length": 10.6, + "entropy": 0.19069184437394143, + "epoch": 0.8424507658643327, + "frac_reward_zero_std": 0.4, + "grad_norm": 1.75, + "kl": 1.176589535176754, + "learning_rate": 3.062420134347344e-07, + "loss": 0.4537034511566162, + "num_tokens": 3676671.0, + "reward": 0.40625001937150956, + "reward_std": 0.3934617698192596, + "rewards/reward_correct/mean": 0.3125, + "rewards/reward_correct/std": 0.3871816873550415, + "rewards/reward_format/mean": 0.09375000149011611, + "rewards/reward_format/std": 0.01806225851178169, + "step": 1155, + "step_time": 7.721098084934056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 109.8, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 17.425, + "completions/mean_terminated_length": 11.31583366394043, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.13128961883485318, + "epoch": 0.8460977388767323, + "frac_reward_zero_std": 0.5, + "grad_norm": 14.75, + "kl": 1.171060440503061, + "learning_rate": 2.925709046476136e-07, + "loss": 0.5961678504943848, + "num_tokens": 3688409.0, + "reward": 0.7475000143051147, + "reward_std": 0.4191900730133057, + "rewards/reward_correct/mean": 0.65, + "rewards/reward_correct/std": 0.4127875804901123, + "rewards/reward_format/mean": 0.09750000089406967, + "rewards/reward_format/std": 0.010000000149011612, + "step": 1160, + "step_time": 5.9130605144426225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.075, + "completions/max_length": 207.0, + "completions/max_terminated_length": 13.2, + "completions/mean_length": 29.6375, + "completions/mean_terminated_length": 11.294423484802246, + "completions/min_length": 10.6, + "completions/min_terminated_length": 10.6, + "entropy": 0.15925701488740743, + "epoch": 0.849744711889132, + "frac_reward_zero_std": 0.2, + "grad_norm": 27.875, + "kl": 1.1029092611745, + "learning_rate": 2.7919297292912403e-07, + "loss": 0.7251657009124756, + "num_tokens": 3702260.0, + "reward": 0.5550000131130218, + "reward_std": 0.47095765471458434, + "rewards/reward_correct/mean": 0.4625, + "rewards/reward_correct/std": 0.46192690134048464, + "rewards/reward_format/mean": 0.0925000011920929, + "rewards/reward_format/std": 0.0230622585862875, + "step": 1165, + "step_time": 9.310750750079752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05, + "completions/max_length": 207.2, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 23.575, + "completions/mean_terminated_length": 11.34583396911621, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.1362998990342021, + "epoch": 0.8533916849015317, + "frac_reward_zero_std": 0.4, + "grad_norm": 27.125, + "kl": 1.1539748897776008, + "learning_rate": 2.661099950533469e-07, + "loss": 1.1619250297546386, + "num_tokens": 3715178.0, + "reward": 0.6700000047683716, + "reward_std": 0.46959633827209474, + "rewards/reward_correct/mean": 0.575, + "rewards/reward_correct/std": 0.4606881022453308, + "rewards/reward_format/mean": 0.09500000029802322, + "rewards/reward_format/std": 0.020000000298023225, + "step": 1170, + "step_time": 9.40191943924874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 110.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 11.385000228881836, + "completions/min_length": 10.6, + "completions/min_terminated_length": 10.6, + "entropy": 0.12817461686208845, + "epoch": 0.8570386579139314, + "frac_reward_zero_std": 0.6, + "grad_norm": 28.125, + "kl": 1.347428990341723, + "learning_rate": 2.5332370862042156e-07, + "loss": 0.5217385768890381, + "num_tokens": 3729074.0, + "reward": 0.5600000157952308, + "reward_std": 0.22001171708106995, + "rewards/reward_correct/mean": 0.4625, + "rewards/reward_correct/std": 0.21831300854682922, + "rewards/reward_format/mean": 0.09750000089406967, + "rewards/reward_format/std": 0.010000000149011612, + "step": 1175, + "step_time": 5.9154639767482875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1125, + "completions/max_length": 218.6, + "completions/max_terminated_length": 43.6, + "completions/mean_length": 40.7875, + "completions/mean_terminated_length": 13.34887409210205, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.22097252551466226, + "epoch": 0.8606856309263311, + "frac_reward_zero_std": 0.1, + "grad_norm": 11.3125, + "kl": 1.017764002084732, + "learning_rate": 2.408358118257659e-07, + "loss": 0.567827558517456, + "num_tokens": 3746009.0, + "reward": 0.5012500047683716, + "reward_std": 0.484658282995224, + "rewards/reward_correct/mean": 0.4125, + "rewards/reward_correct/std": 0.47538756132125853, + "rewards/reward_format/mean": 0.08875000327825547, + "rewards/reward_format/std": 0.027955817803740502, + "step": 1180, + "step_time": 9.862211968749762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0875, + "completions/max_length": 207.0, + "completions/max_terminated_length": 60.4, + "completions/mean_length": 35.825, + "completions/mean_terminated_length": 14.612235260009765, + "completions/min_length": 10.4, + "completions/min_terminated_length": 10.4, + "entropy": 0.1767511057667434, + "epoch": 0.8643326039387309, + "frac_reward_zero_std": 0.2, + "grad_norm": 5.65625, + "kl": 1.1420703403651715, + "learning_rate": 2.2864796323453286e-07, + "loss": 0.7290468215942383, + "num_tokens": 3761683.0, + "reward": 0.6037500113248825, + "reward_std": 0.46659151911735536, + "rewards/reward_correct/mean": 0.5125, + "rewards/reward_correct/std": 0.4538746178150177, + "rewards/reward_format/mean": 0.09125000089406968, + "rewards/reward_format/std": 0.024893558770418166, + "step": 1185, + "step_time": 9.439315206184983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.2, + "completions/max_terminated_length": 24.2, + "completions/mean_length": 11.875, + "completions/mean_terminated_length": 11.875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.10697071934118867, + "epoch": 0.8679795769511306, + "frac_reward_zero_std": 0.7, + "grad_norm": 19.875, + "kl": 1.0984319552779198, + "learning_rate": 2.1676178156133127e-07, + "loss": 0.042615166306495665, + "num_tokens": 3771961.0, + "reward": 0.5875000193715095, + "reward_std": 0.3205281376838684, + "rewards/reward_correct/mean": 0.4875, + "rewards/reward_correct/std": 0.32052814960479736, + "rewards/reward_format/mean": 0.10000000149011612, + "rewards/reward_format/std": 0.0, + "step": 1190, + "step_time": 2.7853813752532006 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 113.2, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 17.5125, + "completions/mean_terminated_length": 11.390000343322754, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.12137670465745032, + "epoch": 0.8716265499635303, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.59765625, + "kl": 1.0660011304542423, + "learning_rate": 2.0517884545523614e-07, + "loss": 0.6065550327301026, + "num_tokens": 3785026.0, + "reward": 0.7725000143051147, + "reward_std": 0.16246949918568135, + "rewards/reward_correct/mean": 0.675, + "rewards/reward_correct/std": 0.15246951580047607, + "rewards/reward_format/mean": 0.09750000089406967, + "rewards/reward_format/std": 0.010000000149011612, + "step": 1195, + "step_time": 6.019032196328044 }, { - "entropy": 1.2273146107792854, - "epoch": 0.3889789303079417, + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0375, + "completions/max_length": 110.0, + "completions/max_terminated_length": 12.4, + "completions/mean_length": 20.3875, + "completions/mean_terminated_length": 11.205238342285156, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.15347529174759983, + "epoch": 0.87527352297593, + "frac_reward_zero_std": 0.3, "grad_norm": 6.46875, - "learning_rate": 9.84538643114539e-06, - "loss": 1.3282580375671387, - "mean_token_accuracy": 0.7365645788609981, - "num_tokens": 144662.0, - "step": 60 - }, - { - "entropy": 1.1238694667816163, - "epoch": 0.4538087520259319, - "grad_norm": 5.5625, - "learning_rate": 9.745278735053345e-06, - "loss": 1.1545157432556152, - "mean_token_accuracy": 0.7437338665127754, - "num_tokens": 166691.0, - "step": 70 - }, - { - "entropy": 1.119261236488819, - "epoch": 0.5186385737439222, - "grad_norm": 5.71875, - "learning_rate": 9.621099679309948e-06, - "loss": 1.1878681182861328, - "mean_token_accuracy": 0.7420044131577015, - "num_tokens": 188243.0, - "step": 80 - }, - { - "entropy": 1.1736014567315578, - "epoch": 0.5834683954619124, - "grad_norm": 4.90625, - "learning_rate": 9.473479186598115e-06, - "loss": 1.2732332229614258, - "mean_token_accuracy": 0.7324138328433036, - "num_tokens": 214178.0, - "step": 90 - }, - { - "entropy": 1.1458057157695294, - "epoch": 0.6482982171799028, - "grad_norm": 4.875, - "learning_rate": 9.303166090900082e-06, - "loss": 1.189667320251465, - "mean_token_accuracy": 0.746358548104763, - "num_tokens": 237658.0, - "step": 100 - }, - { - "entropy": 1.0824428118765355, - "epoch": 0.713128038897893, - "grad_norm": 5.15625, - "learning_rate": 9.111024338889748e-06, - "loss": 1.1823931694030763, - "mean_token_accuracy": 0.7510646276175976, - "num_tokens": 263592.0, - "step": 110 - }, - { - "entropy": 1.12230906188488, - "epoch": 0.7779578606158833, - "grad_norm": 4.8125, - "learning_rate": 8.89802860739326e-06, - "loss": 1.192618465423584, - "mean_token_accuracy": 0.7481291465461254, - "num_tokens": 287665.0, - "step": 120 - }, - { - "entropy": 1.06078160405159, - "epoch": 0.8427876823338736, - "grad_norm": 4.1875, - "learning_rate": 8.665259359149132e-06, - "loss": 1.1851810455322265, - "mean_token_accuracy": 0.7557851374149323, - "num_tokens": 314951.0, - "step": 130 - }, - { - "entropy": 1.0965183354914187, - "epoch": 0.9076175040518638, - "grad_norm": 5.90625, - "learning_rate": 8.413897361948484e-06, - "loss": 1.214451789855957, - "mean_token_accuracy": 0.747528488188982, - "num_tokens": 339584.0, - "step": 140 - }, - { - "entropy": 1.0685688987374307, - "epoch": 0.9724473257698542, - "grad_norm": 5.125, - "learning_rate": 8.145217698958213e-06, - "loss": 1.058650302886963, - "mean_token_accuracy": 0.754636698961258, - "num_tokens": 360451.0, - "step": 150 - }, - { - "epoch": 1.0, - "eval_entropy": 1.1447731256484985, - "eval_loss": 1.1801178455352783, - "eval_mean_token_accuracy": 0.7390438011714391, - "eval_num_tokens": 370054.0, - "eval_runtime": 4.0613, - "eval_samples_per_second": 67.712, - "eval_steps_per_second": 8.618, - "step": 155 - }, - { - "entropy": 1.1084210735720557, - "epoch": 1.032414910858995, - "grad_norm": 4.625, - "learning_rate": 7.860583300610849e-06, - "loss": 1.166214942932129, - "mean_token_accuracy": 0.7552700759591283, - "num_tokens": 382705.0, - "step": 160 - }, - { - "entropy": 1.1435431733727455, - "epoch": 1.0972447325769854, - "grad_norm": 3.90625, - "learning_rate": 7.561438030871886e-06, - "loss": 1.2284239768981933, - "mean_token_accuracy": 0.7437379971146584, - "num_tokens": 410122.0, - "step": 170 - }, - { - "entropy": 1.0213233061134814, - "epoch": 1.1620745542949757, - "grad_norm": 4.90625, - "learning_rate": 7.249299362955846e-06, - "loss": 1.0209275245666505, - "mean_token_accuracy": 0.7661052703857422, - "num_tokens": 432229.0, - "step": 180 - }, - { - "entropy": 1.0373225547373295, - "epoch": 1.2269043760129659, - "grad_norm": 4.21875, - "learning_rate": 6.925750681644954e-06, - "loss": 1.0731925964355469, - "mean_token_accuracy": 0.7609687730669975, - "num_tokens": 454799.0, - "step": 190 - }, - { - "entropy": 1.0361094988882542, - "epoch": 1.2917341977309562, - "grad_norm": 4.90625, - "learning_rate": 6.592433251258423e-06, - "loss": 1.0759617805480957, - "mean_token_accuracy": 0.7617471802979707, - "num_tokens": 477987.0, - "step": 200 - }, - { - "entropy": 1.1075004316866397, - "epoch": 1.3565640194489466, - "grad_norm": 4.5, - "learning_rate": 6.251037890016396e-06, - "loss": 1.1430248260498046, - "mean_token_accuracy": 0.74955914914608, - "num_tokens": 501601.0, - "step": 210 - }, - { - "entropy": 1.011087729036808, - "epoch": 1.4213938411669367, - "grad_norm": 4.40625, - "learning_rate": 5.903296393031996e-06, - "loss": 1.0240952491760253, - "mean_token_accuracy": 0.762733019888401, - "num_tokens": 524365.0, - "step": 220 - }, - { - "entropy": 1.044830472022295, - "epoch": 1.486223662884927, - "grad_norm": 5.0, - "learning_rate": 5.550972747440007e-06, - "loss": 1.0757940292358399, - "mean_token_accuracy": 0.7587831228971481, - "num_tokens": 547139.0, - "step": 230 - }, - { - "entropy": 1.0843433514237404, - "epoch": 1.5510534846029174, - "grad_norm": 4.09375, - "learning_rate": 5.1958541842252145e-06, - "loss": 1.1496910095214843, - "mean_token_accuracy": 0.7557259976863862, - "num_tokens": 571401.0, - "step": 240 - }, - { - "entropy": 1.121275296062231, - "epoch": 1.6158833063209075, - "grad_norm": 4.0, - "learning_rate": 4.839742112141725e-06, - "loss": 1.2097179412841796, - "mean_token_accuracy": 0.7418307565152645, - "num_tokens": 598409.0, - "step": 250 - }, - { - "entropy": 1.0553250342607499, - "epoch": 1.680713128038898, - "grad_norm": 4.09375, - "learning_rate": 4.484442979712783e-06, - "loss": 1.110421371459961, - "mean_token_accuracy": 0.7576163284480572, - "num_tokens": 622419.0, - "step": 260 - }, - { - "entropy": 0.9956472732126713, - "epoch": 1.7455429497568882, - "grad_norm": 4.96875, - "learning_rate": 4.131759111665349e-06, - "loss": 1.0361756324768066, - "mean_token_accuracy": 0.7665786109864712, - "num_tokens": 646753.0, - "step": 270 - }, - { - "entropy": 1.0138130433857442, - "epoch": 1.8103727714748783, - "grad_norm": 4.625, - "learning_rate": 3.783479566283457e-06, - "loss": 0.9922152519226074, - "mean_token_accuracy": 0.7726757541298866, - "num_tokens": 668120.0, - "step": 280 - }, - { - "entropy": 1.076356042176485, - "epoch": 1.8752025931928689, - "grad_norm": 3.75, - "learning_rate": 3.4413710600582096e-06, - "loss": 1.1258915901184081, - "mean_token_accuracy": 0.7554280295968056, - "num_tokens": 693086.0, - "step": 290 - }, - { - "entropy": 1.0491948314011097, - "epoch": 1.940032414910859, - "grad_norm": 4.46875, - "learning_rate": 3.1071690056709125e-06, - "loss": 1.1284616470336915, - "mean_token_accuracy": 0.7621670566499233, - "num_tokens": 717709.0, - "step": 300 - }, - { - "entropy": 1.0401086009837486, - "epoch": 2.0, - "grad_norm": 14.5, - "learning_rate": 2.782568708770933e-06, - "loss": 1.1070916175842285, - "mean_token_accuracy": 0.7640571851988096, - "num_tokens": 740108.0, - "step": 310 - }, - { - "epoch": 2.0, - "eval_entropy": 1.1126139402389525, - "eval_loss": 1.1659893989562988, - "eval_mean_token_accuracy": 0.7409281015396119, - "eval_num_tokens": 740108.0, - "eval_runtime": 4.0121, - "eval_samples_per_second": 68.543, - "eval_steps_per_second": 8.724, - "step": 310 - }, - { - "entropy": 1.04004001095891, - "epoch": 2.06482982171799, - "grad_norm": 3.59375, - "learning_rate": 2.4692167682043855e-06, - "loss": 1.0921714782714844, - "mean_token_accuracy": 0.7642068356275559, - "num_tokens": 764857.0, - "step": 320 - }, - { - "entropy": 1.061173403263092, - "epoch": 2.1296596434359807, - "grad_norm": 4.21875, - "learning_rate": 2.168702723317632e-06, - "loss": 1.0697747230529786, - "mean_token_accuracy": 0.7612246759235859, - "num_tokens": 789431.0, - "step": 330 - }, - { - "entropy": 1.1021161071956158, - "epoch": 2.194489465153971, - "grad_norm": 4.5, - "learning_rate": 1.8825509907063328e-06, - "loss": 1.1517933845520019, - "mean_token_accuracy": 0.7502290509641171, - "num_tokens": 815703.0, - "step": 340 - }, - { - "entropy": 1.105942540615797, - "epoch": 2.259319286871961, - "grad_norm": 4.34375, - "learning_rate": 1.612213131312454e-06, - "loss": 1.1506426811218262, - "mean_token_accuracy": 0.7514899417757988, - "num_tokens": 839522.0, - "step": 350 - }, - { - "entropy": 1.0000801369547845, - "epoch": 2.3241491085899515, - "grad_norm": 3.796875, - "learning_rate": 1.3590604870959046e-06, - "loss": 1.0159631729125977, - "mean_token_accuracy": 0.7752634860575199, - "num_tokens": 863471.0, - "step": 360 - }, - { - "entropy": 1.042192178219557, - "epoch": 2.3889789303079416, - "grad_norm": 3.640625, - "learning_rate": 1.1243772246327416e-06, - "loss": 1.1197192192077636, - "mean_token_accuracy": 0.7630650572478771, - "num_tokens": 888343.0, - "step": 370 - }, - { - "entropy": 1.0621915347874165, - "epoch": 2.4538087520259317, - "grad_norm": 5.3125, - "learning_rate": 9.093538209276487e-07, - "loss": 1.1313201904296875, - "mean_token_accuracy": 0.763028659671545, - "num_tokens": 911118.0, - "step": 380 - }, - { - "entropy": 1.0539461009204387, - "epoch": 2.5186385737439223, - "grad_norm": 5.125, - "learning_rate": 7.150810244852036e-07, - "loss": 1.1286755561828614, - "mean_token_accuracy": 0.7617501951754093, - "num_tokens": 935765.0, - "step": 390 - }, - { - "entropy": 1.0650467693805694, - "epoch": 2.5834683954619124, - "grad_norm": 5.28125, - "learning_rate": 5.425443222735527e-07, - "loss": 1.0615510940551758, - "mean_token_accuracy": 0.7582719139754772, - "num_tokens": 957739.0, - "step": 400 - }, - { - "entropy": 1.0712328039109706, - "epoch": 2.648298217179903, - "grad_norm": 4.40625, - "learning_rate": 3.9261894064796136e-07, - "loss": 1.1574504852294922, - "mean_token_accuracy": 0.7474529884755612, - "num_tokens": 986006.0, - "step": 410 - }, - { - "entropy": 1.0407099336385728, - "epoch": 2.713128038897893, - "grad_norm": 4.21875, - "learning_rate": 2.6606540559298956e-07, - "loss": 1.122768020629883, - "mean_token_accuracy": 0.7613956540822983, - "num_tokens": 1010199.0, - "step": 420 - }, - { - "entropy": 0.9765883333981037, - "epoch": 2.7779578606158832, - "grad_norm": 5.5, - "learning_rate": 1.6352568480485277e-07, - "loss": 0.952810001373291, - "mean_token_accuracy": 0.7726208001375199, - "num_tokens": 1032105.0, - "step": 430 - }, - { - "entropy": 0.9981532819569111, - "epoch": 2.8427876823338734, + "kl": 1.0669576674699783, + "learning_rate": 1.9390069329012356e-07, + "loss": 0.3958942174911499, + "num_tokens": 3800585.0, + "reward": 0.6462500214576721, + "reward_std": 0.4710683345794678, + "rewards/reward_correct/mean": 0.55, + "rewards/reward_correct/std": 0.4661841869354248, + "rewards/reward_format/mean": 0.09625000208616256, + "rewards/reward_format/std": 0.011831301078200341, + "step": 1200, + "step_time": 5.870821896754205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0375, + "completions/max_length": 109.4, + "completions/max_terminated_length": 11.8, + "completions/mean_length": 20.275, + "completions/mean_terminated_length": 11.086309814453125, + "completions/min_length": 10.6, + "completions/min_terminated_length": 10.6, + "entropy": 0.12172106686048209, + "epoch": 0.8789204959883297, + "frac_reward_zero_std": 0.6, + "grad_norm": 21.875, + "kl": 0.9709623863920569, + "learning_rate": 1.829288229603529e-07, + "loss": 0.4008487701416016, + "num_tokens": 3813687.0, + "reward": 0.5337500154972077, + "reward_std": 0.2803670633584261, + "rewards/reward_correct/mean": 0.4375, + "rewards/reward_correct/std": 0.2740620791912079, + "rewards/reward_format/mean": 0.09625000208616256, + "rewards/reward_format/std": 0.011831301078200341, + "step": 1205, + "step_time": 5.889204369299113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 62.2, + "completions/max_terminated_length": 13.2, + "completions/mean_length": 14.35, + "completions/mean_terminated_length": 11.287500190734864, + "completions/min_length": 10.6, + "completions/min_terminated_length": 10.6, + "entropy": 0.13168379915878176, + "epoch": 0.8825674690007294, + "frac_reward_zero_std": 0.6, + "grad_norm": 16.625, + "kl": 1.168185787089169, + "learning_rate": 1.722646916818266e-07, + "loss": 0.23318815231323242, + "num_tokens": 3826635.0, + "reward": 0.611250014603138, + "reward_std": 0.35796160697937013, + "rewards/reward_correct/mean": 0.5125, + "rewards/reward_correct/std": 0.3565591096878052, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 1210, + "step_time": 4.199280779622495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 124.4, + "completions/max_terminated_length": 27.2, + "completions/mean_length": 19.5375, + "completions/mean_terminated_length": 13.420000267028808, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.1678618049249053, + "epoch": 0.8862144420131292, + "frac_reward_zero_std": 0.3, + "grad_norm": 16.625, + "kl": 0.9653506154194474, + "learning_rate": 1.619097157984506e-07, + "loss": 0.3652472019195557, + "num_tokens": 3839462.0, + "reward": 0.5725000143051148, + "reward_std": 0.4312571883201599, + "rewards/reward_correct/mean": 0.475, + "rewards/reward_correct/std": 0.42500433325767517, + "rewards/reward_format/mean": 0.09750000089406967, + "rewards/reward_format/std": 0.010000000149011612, + "step": 1215, + "step_time": 6.370900041237474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 62.0, + "completions/max_terminated_length": 13.2, + "completions/mean_length": 14.3, + "completions/mean_terminated_length": 11.239166831970214, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.13081118185073137, + "epoch": 0.8898614150255288, + "frac_reward_zero_std": 0.6, + "grad_norm": 10.0, + "kl": 1.286240622214973, + "learning_rate": 1.5186527059402573e-07, + "loss": 0.11276096105575562, + "num_tokens": 3861486.0, + "reward": 0.5362500160932541, + "reward_std": 0.35385951995849607, + "rewards/reward_correct/mean": 0.4375, + "rewards/reward_correct/std": 0.3532795548439026, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 1220, + "step_time": 4.45845171995461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 60.8, + "completions/max_terminated_length": 11.8, + "completions/mean_length": 14.1375, + "completions/mean_terminated_length": 11.071666717529297, + "completions/min_length": 10.6, + "completions/min_terminated_length": 10.6, + "entropy": 0.15417862283065914, + "epoch": 0.8935083880379285, + "frac_reward_zero_std": 0.2, + "grad_norm": 16.75, + "kl": 1.1948064217343926, + "learning_rate": 1.4213269010958902e-07, + "loss": 0.3309065818786621, + "num_tokens": 3874545.0, + "reward": 0.5487500250339508, + "reward_std": 0.4563694715499878, + "rewards/reward_correct/mean": 0.45, + "rewards/reward_correct/std": 0.45525074005126953, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 1225, + "step_time": 4.123033397644758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0375, + "completions/max_length": 159.0, + "completions/max_terminated_length": 31.2, + "completions/mean_length": 21.675, + "completions/mean_terminated_length": 12.579166984558105, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.13562840055674313, + "epoch": 0.8971553610503282, + "frac_reward_zero_std": 0.4, + "grad_norm": 27.25, + "kl": 1.0161822739988566, + "learning_rate": 1.327132669662376e-07, + "loss": 0.6893713474273682, + "num_tokens": 3889775.0, + "reward": 0.49625001549720765, + "reward_std": 0.41573981530964377, + "rewards/reward_correct/mean": 0.4, + "rewards/reward_correct/std": 0.40821858644485476, + "rewards/reward_format/mean": 0.09625000059604645, + "rewards/reward_format/std": 0.015000000223517418, + "step": 1230, + "step_time": 7.707901426404715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 95.6, + "completions/max_terminated_length": 46.6, + "completions/mean_length": 16.925, + "completions/mean_terminated_length": 13.862500190734863, + "completions/min_length": 10.4, + "completions/min_terminated_length": 10.4, + "entropy": 0.1752641463186592, + "epoch": 0.9008023340627279, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.1923828125, + "kl": 1.1024759428575635, + "learning_rate": 1.2360825219344702e-07, + "loss": 0.11647883653640748, + "num_tokens": 3902185.0, + "reward": 0.5487500220537186, + "reward_std": 0.4380682408809662, + "rewards/reward_correct/mean": 0.45, + "rewards/reward_correct/std": 0.43740057945251465, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 1235, + "step_time": 5.3195851827040315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1, + "completions/max_length": 207.4, + "completions/max_terminated_length": 11.6, + "completions/mean_length": 35.4875, + "completions/mean_terminated_length": 10.97726230621338, + "completions/min_length": 10.2, + "completions/min_terminated_length": 10.2, + "entropy": 0.1884731743950397, + "epoch": 0.9044493070751276, + "frac_reward_zero_std": 0.4, "grad_norm": 4.375, - "learning_rate": 8.551993118403656e-08, - "loss": 1.029557228088379, - "mean_token_accuracy": 0.7673999711871147, - "num_tokens": 1053781.0, - "step": 440 - }, - { - "entropy": 1.0824145190417767, - "epoch": 2.907617504051864, - "grad_norm": 5.03125, - "learning_rate": 3.2443844257400434e-08, - "loss": 1.1431958198547363, - "mean_token_accuracy": 0.7591245241463185, - "num_tokens": 1076454.0, - "step": 450 - }, - { - "entropy": 1.0491673357784748, - "epoch": 2.972447325769854, - "grad_norm": 4.4375, - "learning_rate": 4.56666291450858e-09, - "loss": 1.1116426467895508, - "mean_token_accuracy": 0.7619525894522667, - "num_tokens": 1100407.0, - "step": 460 - }, - { - "epoch": 3.0, - "eval_entropy": 1.10868159532547, - "eval_loss": 1.1656713485717773, - "eval_mean_token_accuracy": 0.7412662489073617, - "eval_num_tokens": 1110162.0, - "eval_runtime": 4.0591, - "eval_samples_per_second": 67.749, - "eval_steps_per_second": 8.623, - "step": 465 + "kl": 0.8981709321960807, + "learning_rate": 1.1481885506292006e-07, + "loss": 0.8048501968383789, + "num_tokens": 3918840.0, + "reward": 0.6025000154972077, + "reward_std": 0.460212242603302, + "rewards/reward_correct/mean": 0.5125, + "rewards/reward_correct/std": 0.450427371263504, + "rewards/reward_format/mean": 0.09000000208616257, + "rewards/reward_format/std": 0.025775573402643203, + "step": 1240, + "step_time": 9.470035465247928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0375, + "completions/max_length": 158.4, + "completions/max_terminated_length": 11.8, + "completions/mean_length": 20.2875, + "completions/mean_terminated_length": 11.103333854675293, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.1405309045687318, + "epoch": 0.9080962800875274, + "frac_reward_zero_std": 0.3, + "grad_norm": 19.75, + "kl": 1.1071846587583423, + "learning_rate": 1.0634624292797824e-07, + "loss": 0.3469456911087036, + "num_tokens": 3929871.0, + "reward": 0.6337500095367432, + "reward_std": 0.4897329926490784, + "rewards/reward_correct/mean": 0.5375, + "rewards/reward_correct/std": 0.48518543839454653, + "rewards/reward_format/mean": 0.09625000059604645, + "rewards/reward_format/std": 0.015000000223517418, + "step": 1245, + "step_time": 7.678769601136446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 126.8, + "completions/max_terminated_length": 45.6, + "completions/mean_length": 19.4625, + "completions/mean_terminated_length": 13.41500015258789, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.16323285629041492, + "epoch": 0.9117432530999271, + "frac_reward_zero_std": 0.4, + "grad_norm": 9.1875, + "kl": 1.0787921341136097, + "learning_rate": 9.819154106852052e-08, + "loss": 0.14792591333389282, + "num_tokens": 3943380.0, + "reward": 0.5462500154972076, + "reward_std": 0.455259644985199, + "rewards/reward_correct/mean": 0.45, + "rewards/reward_correct/std": 0.45213748812675475, + "rewards/reward_format/mean": 0.09625000059604645, + "rewards/reward_format/std": 0.015000000223517418, + "step": 1250, + "step_time": 6.4878272010013465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 61.8, + "completions/max_terminated_length": 45.0, + "completions/mean_length": 19.4, + "completions/mean_terminated_length": 13.583928680419922, + "completions/min_length": 10.6, + "completions/min_terminated_length": 10.6, + "entropy": 0.11878243619576097, + "epoch": 0.9153902261123268, + "frac_reward_zero_std": 0.4, + "grad_norm": 35.0, + "kl": 1.1625534605234862, + "learning_rate": 9.035583254157072e-08, + "loss": 0.30159051418304444, + "num_tokens": 3956684.0, + "reward": 0.7100000232458115, + "reward_std": 0.3741665780544281, + "rewards/reward_correct/mean": 0.6125, + "rewards/reward_correct/std": 0.3673352777957916, + "rewards/reward_format/mean": 0.0975000023841858, + "rewards/reward_format/std": 0.006831301003694534, + "step": 1255, + "step_time": 4.215127210505306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 117.2, + "completions/max_terminated_length": 19.4, + "completions/mean_length": 17.925, + "completions/mean_terminated_length": 11.800833702087402, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.12006725454702974, + "epoch": 0.9190371991247265, + "frac_reward_zero_std": 0.6, + "grad_norm": 13.3125, + "kl": 1.19089269451797, + "learning_rate": 8.284015803743273e-08, + "loss": 0.30684738159179686, + "num_tokens": 3970718.0, + "reward": 0.535000017285347, + "reward_std": 0.32001171708106996, + "rewards/reward_correct/mean": 0.4375, + "rewards/reward_correct/std": 0.3183130085468292, + "rewards/reward_format/mean": 0.09750000089406967, + "rewards/reward_format/std": 0.010000000149011612, + "step": 1260, + "step_time": 6.2046219816431405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 207.0, + "completions/max_terminated_length": 26.2, + "completions/mean_length": 27.8, + "completions/mean_terminated_length": 12.595834159851075, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.198873932659626, + "epoch": 0.9226841721371262, + "frac_reward_zero_std": 0.4, + "grad_norm": 25.75, + "kl": 0.9969290411099792, + "learning_rate": 7.564551574147372e-08, + "loss": 0.9914497375488281, + "num_tokens": 3986038.0, + "reward": 0.6562500119209289, + "reward_std": 0.4098332166671753, + "rewards/reward_correct/mean": 0.5625, + "rewards/reward_correct/std": 0.397957855463028, + "rewards/reward_format/mean": 0.09375000149011611, + "rewards/reward_format/std": 0.02183130122721195, + "step": 1265, + "step_time": 9.472559287585318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0375, + "completions/max_length": 170.8, + "completions/max_terminated_length": 31.2, + "completions/mean_length": 21.5875, + "completions/mean_terminated_length": 12.434167098999023, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.16186856417916715, + "epoch": 0.9263311451495259, + "frac_reward_zero_std": 0.5, + "grad_norm": 25.875, + "kl": 1.2236642342060804, + "learning_rate": 6.877286120154813e-08, + "loss": 0.8637575149536133, + "num_tokens": 3997861.0, + "reward": 0.6962500095367432, + "reward_std": 0.4358209609985352, + "rewards/reward_correct/mean": 0.6, + "rewards/reward_correct/std": 0.4266244113445282, + "rewards/reward_format/mean": 0.09625000059604645, + "rewards/reward_format/std": 0.015000000223517418, + "step": 1270, + "step_time": 8.050456136651338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 62.0, + "completions/max_terminated_length": 13.8, + "completions/mean_length": 14.4625, + "completions/mean_terminated_length": 11.40833339691162, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.12154140481725335, + "epoch": 0.9299781181619255, + "frac_reward_zero_std": 0.5, + "grad_norm": 17.875, + "kl": 1.2322701148688793, + "learning_rate": 6.22231072010912e-08, + "loss": 0.31382088661193847, + "num_tokens": 4019610.0, + "reward": 0.5237500190734863, + "reward_std": 0.4060746610164642, + "rewards/reward_correct/mean": 0.425, + "rewards/reward_correct/std": 0.4054946959018707, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 1275, + "step_time": 4.552192717604339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05, + "completions/max_length": 217.0, + "completions/max_terminated_length": 23.4, + "completions/mean_length": 24.7625, + "completions/mean_terminated_length": 12.529167175292969, + "completions/min_length": 10.6, + "completions/min_terminated_length": 10.6, + "entropy": 0.15384551649913192, + "epoch": 0.9336250911743253, + "frac_reward_zero_std": 0.5, + "grad_norm": 12.875, + "kl": 0.9660263145342469, + "learning_rate": 5.599712363788673e-08, + "loss": 0.6800155639648438, + "num_tokens": 4038023.0, + "reward": 0.5825000047683716, + "reward_std": 0.4460014820098877, + "rewards/reward_correct/mean": 0.4875, + "rewards/reward_correct/std": 0.4396512031555176, + "rewards/reward_format/mean": 0.09500000029802322, + "rewards/reward_format/std": 0.020000000298023225, + "step": 1280, + "step_time": 9.965804596059025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 61.4, + "completions/max_terminated_length": 37.2, + "completions/mean_length": 17.025, + "completions/mean_terminated_length": 14.15250015258789, + "completions/min_length": 10.4, + "completions/min_terminated_length": 10.4, + "entropy": 0.15362449996173383, + "epoch": 0.937272064186725, + "frac_reward_zero_std": 0.4, + "grad_norm": 15.875, + "kl": 1.2669753028079866, + "learning_rate": 5.009573740853313e-08, + "loss": 0.0688023567199707, + "num_tokens": 4052953.0, + "reward": 0.5862500160932541, + "reward_std": 0.4470571160316467, + "rewards/reward_correct/mean": 0.4875, + "rewards/reward_correct/std": 0.4456546425819397, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 1285, + "step_time": 4.167799991369248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 158.2, + "completions/max_terminated_length": 12.2, + "completions/mean_length": 26.3625, + "completions/mean_terminated_length": 11.052500152587891, + "completions/min_length": 10.6, + "completions/min_terminated_length": 10.6, + "entropy": 0.14020586647093297, + "epoch": 0.9409190371991247, + "frac_reward_zero_std": 0.3, + "grad_norm": 13.125, + "kl": 1.1390698188915849, + "learning_rate": 4.4519732298620445e-08, + "loss": 0.634610939025879, + "num_tokens": 4067622.0, + "reward": 0.6437500238418579, + "reward_std": 0.4798768162727356, + "rewards/reward_correct/mean": 0.55, + "rewards/reward_correct/std": 0.4725348174571991, + "rewards/reward_format/mean": 0.09375000149011611, + "rewards/reward_format/std": 0.01806225851178169, + "step": 1290, + "step_time": 7.63978633992374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 124.6, + "completions/max_terminated_length": 120.8, + "completions/mean_length": 23.175, + "completions/mean_terminated_length": 20.299166870117187, + "completions/min_length": 10.4, + "completions/min_terminated_length": 10.4, + "entropy": 0.1875308462418616, + "epoch": 0.9445660102115244, + "frac_reward_zero_std": 0.6, + "grad_norm": 3.9375, + "kl": 1.140010260976851, + "learning_rate": 3.9269848878631946e-08, + "loss": 0.21886055469512938, + "num_tokens": 4081708.0, + "reward": 0.7737500190734863, + "reward_std": 0.39976497888565066, + "rewards/reward_correct/mean": 0.675, + "rewards/reward_correct/std": 0.39476498365402224, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 1295, + "step_time": 6.518524892255664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05, + "completions/max_length": 109.6, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 23.25, + "completions/mean_terminated_length": 11.001218032836913, + "completions/min_length": 10.4, + "completions/min_terminated_length": 10.4, + "entropy": 0.13531422056257725, + "epoch": 0.9482129832239241, + "frac_reward_zero_std": 0.4, + "grad_norm": 24.5, + "kl": 1.0114584499038757, + "learning_rate": 3.434678440558781e-08, + "loss": 0.6326910018920898, + "num_tokens": 4095872.0, + "reward": 0.7825000166893006, + "reward_std": 0.41665183901786806, + "rewards/reward_correct/mean": 0.6875, + "rewards/reward_correct/std": 0.4110352873802185, + "rewards/reward_format/mean": 0.09500000029802322, + "rewards/reward_format/std": 0.013062257692217827, + "step": 1300, + "step_time": 5.918459895625711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05, + "completions/max_length": 136.6, + "completions/max_terminated_length": 38.8, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 12.75250015258789, + "completions/min_length": 10.6, + "completions/min_terminated_length": 10.6, + "entropy": 0.16376146394759417, + "epoch": 0.9518599562363238, + "frac_reward_zero_std": 0.2, + "grad_norm": 6.25, + "kl": 1.1120296666398644, + "learning_rate": 2.9751192730437527e-08, + "loss": 0.020284974575042726, + "num_tokens": 4109720.0, + "reward": 0.5575000256299972, + "reward_std": 0.4448194861412048, + "rewards/reward_correct/mean": 0.4625, + "rewards/reward_correct/std": 0.4390955328941345, + "rewards/reward_format/mean": 0.09500000178813935, + "rewards/reward_format/std": 0.013062258437275886, + "step": 1305, + "step_time": 6.82063927706331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 111.4, + "completions/max_terminated_length": 16.8, + "completions/mean_length": 17.8375, + "completions/mean_terminated_length": 11.732500267028808, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.15829349178820848, + "epoch": 0.9555069292487236, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.3203125, + "kl": 1.230124199949205, + "learning_rate": 2.548368421122105e-08, + "loss": 0.09629564881324768, + "num_tokens": 4121579.0, + "reward": 0.6112500220537186, + "reward_std": 0.34232552647590636, + "rewards/reward_correct/mean": 0.5125, + "rewards/reward_correct/std": 0.3416578650474548, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 1310, + "step_time": 6.008304981328547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 159.0, + "completions/max_terminated_length": 12.4, + "completions/mean_length": 26.7375, + "completions/mean_terminated_length": 11.439286231994629, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.15575562939047813, + "epoch": 0.9591539022611233, + "frac_reward_zero_std": 0.2, + "grad_norm": 7.84375, + "kl": 1.0837120974436403, + "learning_rate": 2.1544825632004163e-08, + "loss": 0.8264616966247559, + "num_tokens": 4134342.0, + "reward": 0.5437500238418579, + "reward_std": 0.5058443427085877, + "rewards/reward_correct/mean": 0.45, + "rewards/reward_correct/std": 0.499320787191391, + "rewards/reward_format/mean": 0.09375000298023224, + "rewards/reward_format/std": 0.018662602081894873, + "step": 1315, + "step_time": 7.7235252929851415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 62.8, + "completions/max_terminated_length": 13.8, + "completions/mean_length": 17.2625, + "completions/mean_terminated_length": 11.133928680419922, + "completions/min_length": 10.4, + "completions/min_terminated_length": 10.4, + "entropy": 0.1365585310384631, + "epoch": 0.962800875273523, + "frac_reward_zero_std": 0.5, + "grad_norm": 22.0, + "kl": 1.2857500046491623, + "learning_rate": 1.793514012760261e-08, + "loss": 0.29351532459259033, + "num_tokens": 4146339.0, + "reward": 0.6725000143051147, + "reward_std": 0.4918497741222382, + "rewards/reward_correct/mean": 0.575, + "rewards/reward_correct/std": 0.486371648311615, + "rewards/reward_format/mean": 0.0975000023841858, + "rewards/reward_format/std": 0.006831301003694534, + "step": 1320, + "step_time": 4.238193775340915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 61.0, + "completions/max_terminated_length": 60.4, + "completions/mean_length": 17.2375, + "completions/mean_terminated_length": 14.37750015258789, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.12142869005911053, + "epoch": 0.9664478482859227, + "frac_reward_zero_std": 0.5, + "grad_norm": 22.5, + "kl": 1.1785110834985972, + "learning_rate": 1.4655107114101008e-08, + "loss": 0.20103805065155028, + "num_tokens": 4159950.0, + "reward": 0.5487500190734863, + "reward_std": 0.43751977682113646, + "rewards/reward_correct/mean": 0.45, + "rewards/reward_correct/std": 0.43611727356910707, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 1325, + "step_time": 4.162991305999458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0875, + "completions/max_length": 207.0, + "completions/max_terminated_length": 12.6, + "completions/mean_length": 32.6, + "completions/mean_terminated_length": 11.181786155700683, + "completions/min_length": 10.6, + "completions/min_terminated_length": 10.6, + "entropy": 0.174559523537755, + "epoch": 0.9700948212983224, + "frac_reward_zero_std": 0.2, + "grad_norm": 14.75, + "kl": 1.1911510691046714, + "learning_rate": 1.1705162225181254e-08, + "loss": 0.720400619506836, + "num_tokens": 4172942.0, + "reward": 0.4787500262260437, + "reward_std": 0.46417110562324526, + "rewards/reward_correct/mean": 0.3875, + "rewards/reward_correct/std": 0.45493903160095217, + "rewards/reward_format/mean": 0.09125000238418579, + "rewards/reward_format/std": 0.024893559515476227, + "step": 1330, + "step_time": 9.379100299254059 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 71.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 17.2, + "completions/mean_terminated_length": 14.295833587646484, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.173837832827121, + "epoch": 0.973741794310722, + "frac_reward_zero_std": 0.5, + "grad_norm": 3.96875, + "kl": 1.0959458950906993, + "learning_rate": 9.08569725426356e-09, + "loss": 0.16633286476135253, + "num_tokens": 4197062.0, + "reward": 0.6475000262260437, + "reward_std": 0.3818537831306458, + "rewards/reward_correct/mean": 0.55, + "rewards/reward_correct/std": 0.3796448469161987, + "rewards/reward_format/mean": 0.0975000023841858, + "rewards/reward_format/std": 0.006831301003694534, + "step": 1335, + "step_time": 5.301490155234933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 97.4, + "completions/max_terminated_length": 49.2, + "completions/mean_length": 16.4375, + "completions/mean_terminated_length": 13.378333473205567, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.08703776504844427, + "epoch": 0.9773887673231219, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.11181640625, + "kl": 1.2685282507911324, + "learning_rate": 6.797060102470831e-09, + "loss": 0.16221898794174194, + "num_tokens": 4207297.0, + "reward": 0.6987500190734863, + "reward_std": 0.48166019916534425, + "rewards/reward_correct/mean": 0.6, + "rewards/reward_correct/std": 0.478151673078537, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 1340, + "step_time": 5.527895992621779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1, + "completions/max_length": 158.2, + "completions/max_terminated_length": 14.6, + "completions/mean_length": 35.85, + "completions/mean_terminated_length": 11.438205337524414, + "completions/min_length": 10.6, + "completions/min_terminated_length": 10.6, + "entropy": 0.16663135583512484, + "epoch": 0.9810357403355215, + "frac_reward_zero_std": 0.3, + "grad_norm": 10.4375, + "kl": 0.9873619046062231, + "learning_rate": 4.839554732423424e-09, + "loss": 0.5998001575469971, + "num_tokens": 4222845.0, + "reward": 0.4650000184774399, + "reward_std": 0.43344358205795286, + "rewards/reward_correct/mean": 0.375, + "rewards/reward_correct/std": 0.42406207919120786, + "rewards/reward_format/mean": 0.09000000059604644, + "rewards/reward_format/std": 0.022006529942154886, + "step": 1345, + "step_time": 7.704775251820683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 61.6, + "completions/max_terminated_length": 12.8, + "completions/mean_length": 14.2875, + "completions/mean_terminated_length": 11.226666831970215, + "completions/min_length": 10.4, + "completions/min_terminated_length": 10.4, + "entropy": 0.10666983281262218, + "epoch": 0.9846827133479212, + "frac_reward_zero_std": 0.3, + "grad_norm": 27.25, + "kl": 1.2111088124103844, + "learning_rate": 3.213441127867811e-09, + "loss": 0.3297783851623535, + "num_tokens": 4234956.0, + "reward": 0.6362500190734863, + "reward_std": 0.49169819355010985, + "rewards/reward_correct/mean": 0.5375, + "rewards/reward_correct/std": 0.49012446999549864, + "rewards/reward_format/mean": 0.0987500011920929, + "rewards/reward_format/std": 0.005000000074505806, + "step": 1350, + "step_time": 4.193444421328604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05, + "completions/max_length": 121.6, + "completions/max_terminated_length": 23.8, + "completions/mean_length": 24.025, + "completions/mean_terminated_length": 11.775000190734863, + "completions/min_length": 10.2, + "completions/min_terminated_length": 10.2, + "entropy": 0.17740441677160562, + "epoch": 0.9883296863603209, + "frac_reward_zero_std": 0.6, + "grad_norm": 2.90625, + "kl": 1.205637697596103, + "learning_rate": 1.9189352591469923e-09, + "loss": 0.598662281036377, + "num_tokens": 4253094.0, + "reward": 0.4825000137090683, + "reward_std": 0.4217729359865189, + "rewards/reward_correct/mean": 0.3875, + "rewards/reward_correct/std": 0.41230818033218386, + "rewards/reward_format/mean": 0.09500000029802322, + "rewards/reward_format/std": 0.013062257692217827, + "step": 1355, + "step_time": 6.477925518527627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 132.8, + "completions/max_terminated_length": 35.2, + "completions/mean_length": 18.6375, + "completions/mean_terminated_length": 12.514167022705077, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.16600654548965393, + "epoch": 0.9919766593727206, + "frac_reward_zero_std": 0.4, + "grad_norm": 12.9375, + "kl": 1.1097917286679149, + "learning_rate": 9.562090545181534e-10, + "loss": 0.3176872968673706, + "num_tokens": 4267785.0, + "reward": 0.6975000202655792, + "reward_std": 0.4413697779178619, + "rewards/reward_correct/mean": 0.6, + "rewards/reward_correct/std": 0.4384649932384491, + "rewards/reward_format/mean": 0.09750000089406967, + "rewards/reward_format/std": 0.010000000149011612, + "step": 1360, + "step_time": 6.8133282080292705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 109.8, + "completions/max_terminated_length": 12.2, + "completions/mean_length": 17.2125, + "completions/mean_terminated_length": 11.089167022705078, + "completions/min_length": 10.8, + "completions/min_terminated_length": 10.8, + "entropy": 0.13670172039419412, + "epoch": 0.9956236323851203, + "frac_reward_zero_std": 0.3, + "grad_norm": 30.625, + "kl": 1.1015755292028189, + "learning_rate": 3.2539037731593725e-10, + "loss": 0.615378475189209, + "num_tokens": 4289858.0, + "reward": 0.5225000113248826, + "reward_std": 0.4638355731964111, + "rewards/reward_correct/mean": 0.425, + "rewards/reward_correct/std": 0.4606881022453308, + "rewards/reward_format/mean": 0.09750000089406967, + "rewards/reward_format/std": 0.010000000149011612, + "step": 1365, + "step_time": 6.164464436843991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05, + "completions/max_length": 207.0, + "completions/max_terminated_length": 31.4, + "completions/mean_length": 24.6125, + "completions/mean_terminated_length": 12.454167366027832, + "completions/min_length": 10.4, + "completions/min_terminated_length": 10.4, + "entropy": 0.20005234158597887, + "epoch": 0.9992706053975201, + "frac_reward_zero_std": 0.4, + "grad_norm": 13.875, + "kl": 1.2183840798214078, + "learning_rate": 2.6563008972968486e-11, + "loss": 0.9409544944763184, + "num_tokens": 4304611.0, + "reward": 0.49500000178813935, + "reward_std": 0.3399057496339083, + "rewards/reward_correct/mean": 0.4, + "rewards/reward_correct/std": 0.3288348078727722, + "rewards/reward_format/mean": 0.09500000029802322, + "rewards/reward_format/std": 0.020000000298023225, + "step": 1370, + "step_time": 9.565368090569972 } ], - "logging_steps": 10, - "max_steps": 465, - "num_input_tokens_seen": 0, - "num_train_epochs": 3, + "logging_steps": 5, + "max_steps": 1371, + "num_input_tokens_seen": 4307996, + "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { @@ -520,7 +8247,7 @@ "attributes": {} } }, - "total_flos": 1.22293100454912e+16, + "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null