{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 28.571428571428573, "eval_steps": 50, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1246.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 444.0, "completions/min_terminated_length": 0.0, "entropy": 3.5187635719776154, "epoch": 0.07142857142857142, "frac_reward_zero_std": 0.875, "grad_norm": 0.013423021882772446, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0003, "num_tokens": 111872.0, "reward": 0.078125, "reward_std": 0.0646936446428299, "rewards/tree_correctness_reward/mean": 0.078125, "rewards/tree_correctness_reward/std": 0.27048972249031067, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.444161057472229, "sampling/importance_sampling_ratio/min": 4.906094994852858e-35, "sampling/sampling_logp_difference/max": 79.0, "sampling/sampling_logp_difference/mean": 8.43178939819336, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 799.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 789.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 780.0, "completions/min_terminated_length": 0.0, "entropy": 3.5843773782253265, "epoch": 0.14285714285714285, "frac_reward_zero_std": 0.75, "grad_norm": 0.01541836280375719, "kl": 0.0, "learning_rate": 4.166666666666667e-06, "loss": 0.001, "num_tokens": 194528.0, "reward": 0.15625, "reward_std": 0.1246790662407875, "rewards/tree_correctness_reward/mean": 0.15625, "rewards/tree_correctness_reward/std": 0.36596253514289856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.43784064054489136, "sampling/importance_sampling_ratio/min": 3.858942701415657e-34, "sampling/sampling_logp_difference/max": 76.9375, "sampling/sampling_logp_difference/mean": 8.033233642578125, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1462.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 877.0, "completions/min_terminated_length": 0.0, "entropy": 4.25638672709465, "epoch": 0.21428571428571427, "frac_reward_zero_std": 1.0, "grad_norm": 0.43403682112693787, "kl": 1.2273831204511225, "learning_rate": 8.333333333333334e-06, "loss": 0.0012, "num_tokens": 320256.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.4009392261505127, "sampling/importance_sampling_ratio/min": 9.165791500701421e-35, "sampling/sampling_logp_difference/max": 78.375, "sampling/sampling_logp_difference/mean": 8.916457176208496, "step": 3 }, { "clip_ratio/high_max": 0.0003793626674450934, "clip_ratio/high_mean": 0.0001896813337225467, "clip_ratio/low_mean": 0.0016122913657454774, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001801972699468024, "completions/clipped_ratio": 1.0, "completions/max_length": 659.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 642.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 625.0, "completions/min_terminated_length": 0.0, "entropy": 2.8162572011351585, "epoch": 0.2857142857142857, "frac_reward_zero_std": 0.875, "grad_norm": 0.006306707859039307, "kl": 0.00489562786242459, "learning_rate": 1.25e-05, "loss": 0.0001, "num_tokens": 393472.0, "reward": 0.015625, "reward_std": 0.04419417306780815, "rewards/tree_correctness_reward/mean": 0.015625, "rewards/tree_correctness_reward/std": 0.125, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5194122791290283, "sampling/importance_sampling_ratio/min": 2.823262612952882e-34, "sampling/sampling_logp_difference/max": 77.25, "sampling/sampling_logp_difference/mean": 6.9651288986206055, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 900.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 785.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 671.0, "completions/min_terminated_length": 0.0, "entropy": 3.2788542956113815, "epoch": 0.35714285714285715, "frac_reward_zero_std": 1.0, "grad_norm": 1.078719973564148, "kl": 0.6682980706973467, "learning_rate": 1.6666666666666667e-05, "loss": 0.0003, "num_tokens": 475872.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.4757689833641052, "sampling/importance_sampling_ratio/min": 6.705835965062432e-35, "sampling/sampling_logp_difference/max": 78.6875, "sampling/sampling_logp_difference/mean": 7.534091949462891, "step": 5 }, { "clip_ratio/high_max": 0.0003357437963131815, "clip_ratio/high_mean": 0.00016787189815659076, "clip_ratio/low_mean": 0.0009168388260150095, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010847107241716003, "completions/clipped_ratio": 1.0, "completions/max_length": 1210.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 994.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 778.0, "completions/min_terminated_length": 0.0, "entropy": 4.162048861384392, "epoch": 0.42857142857142855, "frac_reward_zero_std": 0.875, "grad_norm": 0.04752767086029053, "kl": 0.5828670512419194, "learning_rate": 2.0833333333333336e-05, "loss": 0.0003, "num_tokens": 571616.0, "reward": 0.015625, "reward_std": 0.04419417306780815, "rewards/tree_correctness_reward/mean": 0.015625, "rewards/tree_correctness_reward/std": 0.125, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.3944012522697449, "sampling/importance_sampling_ratio/min": 4.2868981798105594e-36, "sampling/sampling_logp_difference/max": 81.4375, "sampling/sampling_logp_difference/mean": 8.664558410644531, "step": 6 }, { "clip_ratio/high_max": 0.005552030488615856, "clip_ratio/high_mean": 0.002776015244307928, "clip_ratio/low_mean": 0.0011104060467914678, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0038864212692715228, "completions/clipped_ratio": 1.0, "completions/max_length": 788.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 784.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 781.0, "completions/min_terminated_length": 0.0, "entropy": 3.359421193599701, "epoch": 0.5, "frac_reward_zero_std": 0.75, "grad_norm": 0.01192376110702753, "kl": 0.009047224128153175, "learning_rate": 2.5e-05, "loss": 0.0039, "num_tokens": 653952.0, "reward": 0.125, "reward_std": 0.1157275140285492, "rewards/tree_correctness_reward/mean": 0.125, "rewards/tree_correctness_reward/std": 0.3333333432674408, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.4779437184333801, "sampling/importance_sampling_ratio/min": 8.525514564980869e-36, "sampling/sampling_logp_difference/max": 80.75, "sampling/sampling_logp_difference/mean": 7.502476692199707, "step": 7 }, { "clip_ratio/high_max": 0.002136075956514105, "clip_ratio/high_mean": 0.0010680379782570526, "clip_ratio/low_mean": 0.0020371835271362215, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003105221490841359, "completions/clipped_ratio": 1.0, "completions/max_length": 844.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 817.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 790.0, "completions/min_terminated_length": 0.0, "entropy": 3.448928102850914, "epoch": 0.5714285714285714, "frac_reward_zero_std": 0.75, "grad_norm": 0.012722174637019634, "kl": 0.007033233967376873, "learning_rate": 2.916666666666667e-05, "loss": 0.0021, "num_tokens": 738368.0, "reward": 0.0625, "reward_std": 0.10888782143592834, "rewards/tree_correctness_reward/mean": 0.0625, "rewards/tree_correctness_reward/std": 0.24397502839565277, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.48152869939804077, "sampling/importance_sampling_ratio/min": 5.1709859456893004e-36, "sampling/sampling_logp_difference/max": 81.25, "sampling/sampling_logp_difference/mean": 7.416894912719727, "step": 8 }, { "clip_ratio/high_max": 0.003405624200240709, "clip_ratio/high_mean": 0.0017693014597170986, "clip_ratio/low_mean": 0.003321697749925079, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005090999275125796, "completions/clipped_ratio": 1.0, "completions/max_length": 705.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 556.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 408.0, "completions/min_terminated_length": 0.0, "entropy": 1.9986578896641731, "epoch": 0.6428571428571429, "frac_reward_zero_std": 0.625, "grad_norm": 0.01282485295087099, "kl": 0.009839696518611163, "learning_rate": 3.3333333333333335e-05, "loss": -0.0019, "num_tokens": 806112.0, "reward": 0.15625, "reward_std": 0.1462521106004715, "rewards/tree_correctness_reward/mean": 0.15625, "rewards/tree_correctness_reward/std": 0.36596253514289856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6226313710212708, "sampling/importance_sampling_ratio/min": 1.822835416400275e-34, "sampling/sampling_logp_difference/max": 77.6875, "sampling/sampling_logp_difference/mean": 5.599445343017578, "step": 9 }, { "clip_ratio/high_max": 0.001603803102625534, "clip_ratio/high_mean": 0.000801901551312767, "clip_ratio/low_mean": 0.0026489038646104746, "clip_ratio/low_min": 0.00012617766333278269, "clip_ratio/region_mean": 0.0034508054013713263, "completions/clipped_ratio": 1.0, "completions/max_length": 743.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 549.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 356.0, "completions/min_terminated_length": 0.0, "entropy": 2.059837482869625, "epoch": 0.7142857142857143, "frac_reward_zero_std": 0.75, "grad_norm": 0.010698383674025536, "kl": 0.0066099292889703065, "learning_rate": 3.7500000000000003e-05, "loss": 0.0009, "num_tokens": 873408.0, "reward": 0.046875, "reward_std": 0.10205793380737305, "rewards/tree_correctness_reward/mean": 0.046875, "rewards/tree_correctness_reward/std": 0.21304203569889069, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5995035171508789, "sampling/importance_sampling_ratio/min": 3.820870344979068e-35, "sampling/sampling_logp_difference/max": 79.25, "sampling/sampling_logp_difference/mean": 5.982863426208496, "step": 10 }, { "clip_ratio/high_max": 0.0008791560103418306, "clip_ratio/high_mean": 0.0004395780051709153, "clip_ratio/low_mean": 0.0006793478387407959, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0011189258439117111, "completions/clipped_ratio": 1.0, "completions/max_length": 1041.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 911.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 782.0, "completions/min_terminated_length": 0.0, "entropy": 3.563076823949814, "epoch": 0.7857142857142857, "frac_reward_zero_std": 0.875, "grad_norm": 0.047095492482185364, "kl": 0.25519878830527887, "learning_rate": 4.166666666666667e-05, "loss": 0.0011, "num_tokens": 963872.0, "reward": 0.046875, "reward_std": 0.0646936446428299, "rewards/tree_correctness_reward/mean": 0.046875, "rewards/tree_correctness_reward/std": 0.21304203569889069, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.46671366691589355, "sampling/importance_sampling_ratio/min": 2.9756967944000553e-35, "sampling/sampling_logp_difference/max": 79.5, "sampling/sampling_logp_difference/mean": 7.607812881469727, "step": 11 }, { "clip_ratio/high_max": 0.002673796785529703, "clip_ratio/high_mean": 0.0013368983927648515, "clip_ratio/low_mean": 0.0012533422341221012, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0025902406123350374, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1398.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 748.0, "completions/min_terminated_length": 0.0, "entropy": 4.679965853691101, "epoch": 0.8571428571428571, "frac_reward_zero_std": 0.75, "grad_norm": 0.04233424365520477, "kl": 0.20231061801314354, "learning_rate": 4.5833333333333334e-05, "loss": 0.0025, "num_tokens": 1085472.0, "reward": 0.09375, "reward_std": 0.10888782143592834, "rewards/tree_correctness_reward/mean": 0.09375, "rewards/tree_correctness_reward/std": 0.29378482699394226, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.3159329891204834, "sampling/importance_sampling_ratio/min": 4.809766177996032e-37, "sampling/sampling_logp_difference/max": 83.625, "sampling/sampling_logp_difference/mean": 10.143003463745117, "step": 12 }, { "clip_ratio/high_max": 0.0010676361853256822, "clip_ratio/high_mean": 0.0005338180926628411, "clip_ratio/low_mean": 0.0008800784853519872, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0014138965852907859, "completions/clipped_ratio": 1.0, "completions/max_length": 1083.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 894.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 705.0, "completions/min_terminated_length": 0.0, "entropy": 3.5531027764081955, "epoch": 0.9285714285714286, "frac_reward_zero_std": 0.875, "grad_norm": 0.012566735967993736, "kl": 0.11298408970469609, "learning_rate": 5e-05, "loss": 0.0011, "num_tokens": 1174816.0, "reward": 0.046875, "reward_std": 0.0646936446428299, "rewards/tree_correctness_reward/mean": 0.046875, "rewards/tree_correctness_reward/std": 0.21304203569889069, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.4699520766735077, "sampling/importance_sampling_ratio/min": 3.625141007634431e-34, "sampling/sampling_logp_difference/max": 77.0, "sampling/sampling_logp_difference/mean": 7.565427780151367, "step": 13 }, { "clip_ratio/high_max": 0.004520939022768289, "clip_ratio/high_mean": 0.0022604695113841444, "clip_ratio/low_mean": 0.0028156725311418995, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005076142071629874, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1221.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 394.0, "completions/min_terminated_length": 0.0, "entropy": 3.3715325742959976, "epoch": 1.0, "frac_reward_zero_std": 0.75, "grad_norm": 0.02462988905608654, "kl": 0.08842741372063756, "learning_rate": 4.999918050947891e-05, "loss": 0.0008, "num_tokens": 1285088.0, "reward": 0.0625, "reward_std": 0.1157275140285492, "rewards/tree_correctness_reward/mean": 0.0625, "rewards/tree_correctness_reward/std": 0.24397502839565277, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.4569069445133209, "sampling/importance_sampling_ratio/min": 5.859494238273697e-36, "sampling/sampling_logp_difference/max": 81.125, "sampling/sampling_logp_difference/mean": 8.318171501159668, "step": 14 }, { "clip_ratio/high_max": 0.008296297790366225, "clip_ratio/high_mean": 0.004426545310707297, "clip_ratio/low_mean": 0.003727493527549086, "clip_ratio/low_min": 0.0010028980104834773, "clip_ratio/region_mean": 0.008154038798238616, "completions/clipped_ratio": 1.0, "completions/max_length": 898.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 656.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 415.0, "completions/min_terminated_length": 0.0, "entropy": 2.62962044775486, "epoch": 1.0714285714285714, "frac_reward_zero_std": 0.5, "grad_norm": 0.016132980585098267, "kl": 0.01394506765063852, "learning_rate": 4.999672209164081e-05, "loss": 0.003, "num_tokens": 1359232.0, "reward": 0.171875, "reward_std": 0.24039678275585175, "rewards/tree_correctness_reward/mean": 0.171875, "rewards/tree_correctness_reward/std": 0.38025420904159546, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5368093252182007, "sampling/importance_sampling_ratio/min": 9.165791500701421e-35, "sampling/sampling_logp_difference/max": 78.375, "sampling/sampling_logp_difference/mean": 6.779623508453369, "step": 15 }, { "clip_ratio/high_max": 0.0017701525357551873, "clip_ratio/high_mean": 0.0008850762678775936, "clip_ratio/low_mean": 0.0013956971633888315, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002280773431266425, "completions/clipped_ratio": 1.0, "completions/max_length": 691.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 575.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 459.0, "completions/min_terminated_length": 0.0, "entropy": 2.434723809361458, "epoch": 1.1428571428571428, "frac_reward_zero_std": 0.875, "grad_norm": 0.00967002660036087, "kl": 0.011154376261401922, "learning_rate": 4.99926249076577e-05, "loss": 0.0004, "num_tokens": 1428160.0, "reward": 0.03125, "reward_std": 0.0578637570142746, "rewards/tree_correctness_reward/mean": 0.03125, "rewards/tree_correctness_reward/std": 0.17536810040473938, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5727952718734741, "sampling/importance_sampling_ratio/min": 7.598708298374055e-35, "sampling/sampling_logp_difference/max": 78.5625, "sampling/sampling_logp_difference/mean": 6.210095405578613, "step": 16 }, { "clip_ratio/high_max": 0.00030984418117441237, "clip_ratio/high_mean": 0.00015492209058720618, "clip_ratio/low_mean": 0.0006196883914526552, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0007746104820398614, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1377.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 706.0, "completions/min_terminated_length": 0.0, "entropy": 4.384785816073418, "epoch": 1.2142857142857142, "frac_reward_zero_std": 0.875, "grad_norm": 0.007634434849023819, "kl": 0.02924974128836766, "learning_rate": 4.998688922613788e-05, "loss": 0.0006, "num_tokens": 1548416.0, "reward": 0.015625, "reward_std": 0.04419417306780815, "rewards/tree_correctness_reward/mean": 0.015625, "rewards/tree_correctness_reward/std": 0.125, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.3796389102935791, "sampling/importance_sampling_ratio/min": 2.6786371807851483e-33, "sampling/sampling_logp_difference/max": 75.0, "sampling/sampling_logp_difference/mean": 9.128366470336914, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1511.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 974.0, "completions/min_terminated_length": 0.0, "entropy": 4.88330240547657, "epoch": 1.2857142857142856, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020460712257772684, "kl": 0.039627177407965064, "learning_rate": 4.997951542310825e-05, "loss": 0.0, "num_tokens": 1677248.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.34160763025283813, "sampling/importance_sampling_ratio/min": 4.190093106797024e-32, "sampling/sampling_logp_difference/max": 72.25, "sampling/sampling_logp_difference/mean": 9.636415481567383, "step": 18 }, { "clip_ratio/high_max": 0.001982867979677394, "clip_ratio/high_mean": 0.000991433989838697, "clip_ratio/low_mean": 0.0005948604230070487, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015862944128457457, "completions/clipped_ratio": 1.0, "completions/max_length": 1182.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 866.0, "completions/min_terminated_length": 0.0, "entropy": 4.275759667158127, "epoch": 1.3571428571428572, "frac_reward_zero_std": 0.875, "grad_norm": 0.007551537360996008, "kl": 0.02920402679592371, "learning_rate": 4.997050398198977e-05, "loss": 0.0016, "num_tokens": 1774912.0, "reward": 0.078125, "reward_std": 0.0646936446428299, "rewards/tree_correctness_reward/mean": 0.078125, "rewards/tree_correctness_reward/std": 0.27048972249031067, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.42277079820632935, "sampling/importance_sampling_ratio/min": 2.4915206584065623e-34, "sampling/sampling_logp_difference/max": 77.375, "sampling/sampling_logp_difference/mean": 8.138193130493164, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 501.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 429.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 358.0, "completions/min_terminated_length": 0.0, "entropy": 1.2731713652610779, "epoch": 1.4285714285714286, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006374641670845449, "kl": 0.01359531978960149, "learning_rate": 4.995985549356568e-05, "loss": 0.0, "num_tokens": 1834528.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7000365257263184, "sampling/importance_sampling_ratio/min": 1.959732570142077e-33, "sampling/sampling_logp_difference/max": 75.3125, "sampling/sampling_logp_difference/mean": 4.597080707550049, "step": 20 }, { "clip_ratio/high_max": 0.005689519850420766, "clip_ratio/high_mean": 0.003206646848411765, "clip_ratio/low_mean": 0.0019015904981642962, "clip_ratio/low_min": 0.0002554495877120644, "clip_ratio/region_mean": 0.005108237295644358, "completions/clipped_ratio": 1.0, "completions/max_length": 734.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 564.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 395.0, "completions/min_terminated_length": 0.0, "entropy": 2.4604240506887436, "epoch": 1.5, "frac_reward_zero_std": 0.75, "grad_norm": 0.0392061285674572, "kl": 0.01607560072443448, "learning_rate": 4.9947570655942796e-05, "loss": -0.0012, "num_tokens": 1902784.0, "reward": 0.140625, "reward_std": 0.1315089464187622, "rewards/tree_correctness_reward/mean": 0.140625, "rewards/tree_correctness_reward/std": 0.3503824472427368, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5664194226264954, "sampling/importance_sampling_ratio/min": 1.0489692816875613e-33, "sampling/sampling_logp_difference/max": 75.9375, "sampling/sampling_logp_difference/mean": 6.324641227722168, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 791.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 764.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 737.0, "completions/min_terminated_length": 0.0, "entropy": 3.4119139462709427, "epoch": 1.5714285714285714, "frac_reward_zero_std": 1.0, "grad_norm": 0.000988368527032435, "kl": 0.018446799251250923, "learning_rate": 4.993365027450576e-05, "loss": 0.0, "num_tokens": 1983808.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.4793815016746521, "sampling/importance_sampling_ratio/min": 1.1389853770767086e-31, "sampling/sampling_logp_difference/max": 71.25, "sampling/sampling_logp_difference/mean": 7.364871978759766, "step": 22 }, { "clip_ratio/high_max": 0.004339850813266821, "clip_ratio/high_mean": 0.0023295648061321117, "clip_ratio/low_mean": 0.0023039309126033913, "clip_ratio/low_min": 0.000775396270910278, "clip_ratio/region_mean": 0.004633495722373482, "completions/clipped_ratio": 1.0, "completions/max_length": 1011.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 961.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 911.0, "completions/min_terminated_length": 0.0, "entropy": 4.313371479511261, "epoch": 1.6428571428571428, "frac_reward_zero_std": 0.5, "grad_norm": 0.020694196224212646, "kl": 0.02914598456118256, "learning_rate": 4.991809526186424e-05, "loss": 0.0004, "num_tokens": 2077440.0, "reward": 0.1875, "reward_std": 0.23827511072158813, "rewards/tree_correctness_reward/mean": 0.1875, "rewards/tree_correctness_reward/std": 0.39339789748191833, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.4050137400627136, "sampling/importance_sampling_ratio/min": 8.169380844367654e-34, "sampling/sampling_logp_difference/max": 76.1875, "sampling/sampling_logp_difference/mean": 8.365341186523438, "step": 23 }, { "clip_ratio/high_max": 0.0038992163026705384, "clip_ratio/high_mean": 0.0020967824093531817, "clip_ratio/low_mean": 0.0008987972432805691, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0029955796926515177, "completions/clipped_ratio": 1.0, "completions/max_length": 637.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 577.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 517.0, "completions/min_terminated_length": 0.0, "entropy": 2.046389639377594, "epoch": 1.7142857142857144, "frac_reward_zero_std": 0.75, "grad_norm": 0.009413642808794975, "kl": 0.009351684420835227, "learning_rate": 4.9900906637793046e-05, "loss": 0.0004, "num_tokens": 2146496.0, "reward": 0.140625, "reward_std": 0.10205793380737305, "rewards/tree_correctness_reward/mean": 0.140625, "rewards/tree_correctness_reward/std": 0.3503824472427368, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5990230441093445, "sampling/importance_sampling_ratio/min": 3.858942701415657e-34, "sampling/sampling_logp_difference/max": 76.9375, "sampling/sampling_logp_difference/mean": 5.876287937164307, "step": 24 }, { "clip_ratio/high_max": 0.00038723667967133224, "clip_ratio/high_mean": 0.00019361833983566612, "clip_ratio/low_mean": 0.0009293680304836016, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0011229863703192677, "completions/clipped_ratio": 1.0, "completions/max_length": 915.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 861.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 807.0, "completions/min_terminated_length": 0.0, "entropy": 3.3377607613801956, "epoch": 1.7857142857142856, "frac_reward_zero_std": 0.875, "grad_norm": 0.01472503412514925, "kl": 0.015872925461735576, "learning_rate": 4.988208552916535e-05, "loss": 0.0009, "num_tokens": 2233728.0, "reward": 0.140625, "reward_std": 0.04419417306780815, "rewards/tree_correctness_reward/mean": 0.140625, "rewards/tree_correctness_reward/std": 0.3503824472427368, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5184645652770996, "sampling/importance_sampling_ratio/min": 1.419625586416994e-34, "sampling/sampling_logp_difference/max": 77.9375, "sampling/sampling_logp_difference/mean": 6.856696605682373, "step": 25 }, { "clip_ratio/high_max": 0.00107441061118152, "clip_ratio/high_mean": 0.00053720530559076, "clip_ratio/low_mean": 0.0007827848894521594, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013199901877669618, "completions/clipped_ratio": 1.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 930.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 843.0, "completions/min_terminated_length": 0.0, "entropy": 3.5362871438264847, "epoch": 1.8571428571428572, "frac_reward_zero_std": 0.875, "grad_norm": 0.005407406948506832, "kl": 0.018181934778112918, "learning_rate": 4.986163316987876e-05, "loss": -0.0005, "num_tokens": 2325408.0, "reward": 0.046875, "reward_std": 0.0646936446428299, "rewards/tree_correctness_reward/mean": 0.046875, "rewards/tree_correctness_reward/std": 0.21304203569889069, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5000567436218262, "sampling/importance_sampling_ratio/min": 3.199175604441272e-34, "sampling/sampling_logp_difference/max": 77.125, "sampling/sampling_logp_difference/mean": 7.123032569885254, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 838.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 771.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 704.0, "completions/min_terminated_length": 0.0, "entropy": 2.8556013256311417, "epoch": 1.9285714285714286, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009442983428016305, "kl": 0.015790401957929134, "learning_rate": 4.983955090077444e-05, "loss": 0.0, "num_tokens": 2406880.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5401520729064941, "sampling/importance_sampling_ratio/min": 1.448057401441802e-32, "sampling/sampling_logp_difference/max": 73.3125, "sampling/sampling_logp_difference/mean": 6.6327667236328125, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 943.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 766.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 590.0, "completions/min_terminated_length": 0.0, "entropy": 3.279866673052311, "epoch": 2.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009597806492820382, "kl": 0.02093616808997467, "learning_rate": 4.981584016954921e-05, "loss": 0.0, "num_tokens": 2488064.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.4747501015663147, "sampling/importance_sampling_ratio/min": 5.614728103716021e-34, "sampling/sampling_logp_difference/max": 76.5625, "sampling/sampling_logp_difference/mean": 7.554892063140869, "step": 28 }, { "clip_ratio/high_max": 0.0034415192785672843, "clip_ratio/high_mean": 0.0017207596392836422, "clip_ratio/low_mean": 0.0006154541770229116, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0023362138454103842, "completions/clipped_ratio": 1.0, "completions/max_length": 1244.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 934.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 624.0, "completions/min_terminated_length": 0.0, "entropy": 3.70762899518013, "epoch": 2.0714285714285716, "frac_reward_zero_std": 0.875, "grad_norm": 0.009323753416538239, "kl": 0.028507737908512354, "learning_rate": 4.9790502530660635e-05, "loss": 0.0003, "num_tokens": 2579968.0, "reward": 0.078125, "reward_std": 0.0646936446428299, "rewards/tree_correctness_reward/mean": 0.078125, "rewards/tree_correctness_reward/std": 0.27048972249031067, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.4435414671897888, "sampling/importance_sampling_ratio/min": 2.3174752027302318e-35, "sampling/sampling_logp_difference/max": 79.75, "sampling/sampling_logp_difference/mean": 7.97567081451416, "step": 29 }, { "clip_ratio/high_max": 0.0006410256610251963, "clip_ratio/high_mean": 0.00032051283051259816, "clip_ratio/low_mean": 0.0009615384769858792, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0012820513074984774, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1414.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 780.0, "completions/min_terminated_length": 0.0, "entropy": 4.827475488185883, "epoch": 2.142857142857143, "frac_reward_zero_std": 0.875, "grad_norm": 0.00845959410071373, "kl": 0.028197300503961742, "learning_rate": 4.976353964522509e-05, "loss": 0.0021, "num_tokens": 2702592.0, "reward": 0.03125, "reward_std": 0.0578637570142746, "rewards/tree_correctness_reward/mean": 0.03125, "rewards/tree_correctness_reward/std": 0.17536810040473938, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.31858837604522705, "sampling/importance_sampling_ratio/min": 3.820870344979068e-35, "sampling/sampling_logp_difference/max": 79.25, "sampling/sampling_logp_difference/mean": 10.105037689208984, "step": 30 }, { "clip_ratio/high_max": 0.0013260923442430794, "clip_ratio/high_mean": 0.0006630461721215397, "clip_ratio/low_mean": 0.0018975946950376965, "clip_ratio/low_min": 0.0005344925739336759, "clip_ratio/region_mean": 0.0025606408962630667, "completions/clipped_ratio": 1.0, "completions/max_length": 877.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 865.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 854.0, "completions/min_terminated_length": 0.0, "entropy": 3.8935308009386063, "epoch": 2.2142857142857144, "frac_reward_zero_std": 0.75, "grad_norm": 0.008986471220850945, "kl": 0.018184444110374898, "learning_rate": 4.9734953280908904e-05, "loss": -0.0016, "num_tokens": 2790112.0, "reward": 0.0625, "reward_std": 0.10888782143592834, "rewards/tree_correctness_reward/mean": 0.0625, "rewards/tree_correctness_reward/std": 0.24397502839565277, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.4331040382385254, "sampling/importance_sampling_ratio/min": 3.625141007634431e-34, "sampling/sampling_logp_difference/max": 77.0, "sampling/sampling_logp_difference/mean": 8.074987411499023, "step": 31 }, { "clip_ratio/high_max": 0.0019667832530103624, "clip_ratio/high_mean": 0.0009833916265051812, "clip_ratio/low_mean": 0.0006555944128194824, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016389860393246636, "completions/clipped_ratio": 1.0, "completions/max_length": 985.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 850.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 715.0, "completions/min_terminated_length": 0.0, "entropy": 3.8113347440958023, "epoch": 2.2857142857142856, "frac_reward_zero_std": 0.875, "grad_norm": 0.009253383614122868, "kl": 0.019296301470603794, "learning_rate": 4.9704745311812454e-05, "loss": 0.0005, "num_tokens": 2876640.0, "reward": 0.0625, "reward_std": 0.06681530922651291, "rewards/tree_correctness_reward/mean": 0.0625, "rewards/tree_correctness_reward/std": 0.24397502839565277, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.44224536418914795, "sampling/importance_sampling_ratio/min": 6.840139179297504e-33, "sampling/sampling_logp_difference/max": 74.0625, "sampling/sampling_logp_difference/mean": 7.951718807220459, "step": 32 }, { "clip_ratio/high_max": 0.0066353652800899, "clip_ratio/high_mean": 0.0034998393966816366, "clip_ratio/low_mean": 0.0005758680672443006, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004075707453012001, "completions/clipped_ratio": 1.0, "completions/max_length": 772.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 717.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 663.0, "completions/min_terminated_length": 0.0, "entropy": 3.070578709244728, "epoch": 2.357142857142857, "frac_reward_zero_std": 0.625, "grad_norm": 0.017226265743374825, "kl": 0.011550676921615377, "learning_rate": 4.967291771834727e-05, "loss": -0.001, "num_tokens": 2954688.0, "reward": 0.265625, "reward_std": 0.17358146607875824, "rewards/tree_correctness_reward/mean": 0.265625, "rewards/tree_correctness_reward/std": 0.44515693187713623, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5175262689590454, "sampling/importance_sampling_ratio/min": 2.466939338135878e-35, "sampling/sampling_logp_difference/max": 79.6875, "sampling/sampling_logp_difference/mean": 6.93226432800293, "step": 33 }, { "clip_ratio/high_max": 0.0010359116131439805, "clip_ratio/high_mean": 0.0005179558065719903, "clip_ratio/low_mean": 0.001151012911577709, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016689687181496993, "completions/clipped_ratio": 1.0, "completions/max_length": 785.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 664.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 543.0, "completions/min_terminated_length": 0.0, "entropy": 2.799503982067108, "epoch": 2.4285714285714284, "frac_reward_zero_std": 0.875, "grad_norm": 0.007142703514546156, "kl": 0.010629668511683121, "learning_rate": 4.963947258710626e-05, "loss": -0.0004, "num_tokens": 3029312.0, "reward": 0.03125, "reward_std": 0.0578637570142746, "rewards/tree_correctness_reward/mean": 0.03125, "rewards/tree_correctness_reward/std": 0.17536810040473938, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5335917472839355, "sampling/importance_sampling_ratio/min": 1.1538028783762514e-36, "sampling/sampling_logp_difference/max": 82.75, "sampling/sampling_logp_difference/mean": 6.77097225189209, "step": 34 }, { "clip_ratio/high_max": 0.002393892384134233, "clip_ratio/high_mean": 0.0011969461920671165, "clip_ratio/low_mean": 0.0009057971183210611, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0021027433103881776, "completions/clipped_ratio": 1.0, "completions/max_length": 756.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 619.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 483.0, "completions/min_terminated_length": 0.0, "entropy": 2.25935235619545, "epoch": 2.5, "frac_reward_zero_std": 0.875, "grad_norm": 0.009381025098264217, "kl": 0.009392951847985387, "learning_rate": 4.960441211072686e-05, "loss": 0.0003, "num_tokens": 3101088.0, "reward": 0.0625, "reward_std": 0.06681530922651291, "rewards/tree_correctness_reward/mean": 0.0625, "rewards/tree_correctness_reward/std": 0.24397502839565277, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5960965156555176, "sampling/importance_sampling_ratio/min": 7.674422876742102e-34, "sampling/sampling_logp_difference/max": 76.25, "sampling/sampling_logp_difference/mean": 5.926796913146973, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1979.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1910.0, "completions/min_terminated_length": 0.0, "entropy": 6.025425761938095, "epoch": 2.571428571428571, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013711698120459914, "kl": 0.03309723176062107, "learning_rate": 4.956773858774731e-05, "loss": 0.0, "num_tokens": 3259872.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.20658665895462036, "sampling/importance_sampling_ratio/min": 5.559333506405371e-35, "sampling/sampling_logp_difference/max": 78.875, "sampling/sampling_logp_difference/mean": 11.953378677368164, "step": 36 }, { "clip_ratio/high_max": 0.0008230652092606761, "clip_ratio/high_mean": 0.00041153260463033803, "clip_ratio/low_mean": 0.003154664176690858, "clip_ratio/low_min": 0.0005370569415390491, "clip_ratio/region_mean": 0.003566196777683217, "completions/clipped_ratio": 1.0, "completions/max_length": 1108.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1019.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 931.0, "completions/min_terminated_length": 0.0, "entropy": 4.992978036403656, "epoch": 2.642857142857143, "frac_reward_zero_std": 0.75, "grad_norm": 0.011515953578054905, "kl": 0.024881580844521523, "learning_rate": 4.9529454422455976e-05, "loss": 0.0037, "num_tokens": 3357248.0, "reward": 0.046875, "reward_std": 0.10205793380737305, "rewards/tree_correctness_reward/mean": 0.046875, "rewards/tree_correctness_reward/std": 0.21304203569889069, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.35368722677230835, "sampling/importance_sampling_ratio/min": 5.976846944504e-34, "sampling/sampling_logp_difference/max": 76.5, "sampling/sampling_logp_difference/mean": 9.10731315612793, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 836.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 781.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 726.0, "completions/min_terminated_length": 0.0, "entropy": 3.225335255265236, "epoch": 2.7142857142857144, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005232641706243157, "kl": 0.011148024757858366, "learning_rate": 4.94895621247337e-05, "loss": 0.0, "num_tokens": 3439360.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5368466973304749, "sampling/importance_sampling_ratio/min": 2.4915206584065623e-34, "sampling/sampling_logp_difference/max": 77.375, "sampling/sampling_logp_difference/mean": 6.626387596130371, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 564.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 518.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 472.0, "completions/min_terminated_length": 0.0, "entropy": 2.415248617529869, "epoch": 2.7857142857142856, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004952172166667879, "kl": 0.011904435174074024, "learning_rate": 4.944806430988927e-05, "loss": 0.0, "num_tokens": 3504640.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5456819534301758, "sampling/importance_sampling_ratio/min": 1.1166220121487572e-33, "sampling/sampling_logp_difference/max": 75.875, "sampling/sampling_logp_difference/mean": 6.6875386238098145, "step": 39 }, { "clip_ratio/high_max": 0.0002164127363357693, "clip_ratio/high_mean": 0.00010820636816788465, "clip_ratio/low_mean": 0.0006925207562744617, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0008007271244423464, "completions/clipped_ratio": 1.0, "completions/max_length": 762.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 742.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 722.0, "completions/min_terminated_length": 0.0, "entropy": 2.82285126298666, "epoch": 2.857142857142857, "frac_reward_zero_std": 0.875, "grad_norm": 0.0077949827536940575, "kl": 0.009510507545201108, "learning_rate": 4.940496369848795e-05, "loss": 0.0003, "num_tokens": 3584256.0, "reward": 0.015625, "reward_std": 0.04419417306780815, "rewards/tree_correctness_reward/mean": 0.015625, "rewards/tree_correctness_reward/std": 0.125, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5809241533279419, "sampling/importance_sampling_ratio/min": 5.222510753034111e-35, "sampling/sampling_logp_difference/max": 78.9375, "sampling/sampling_logp_difference/mean": 6.038334846496582, "step": 40 }, { "clip_ratio/high_max": 0.0017187499906867743, "clip_ratio/high_mean": 0.0008593749953433871, "clip_ratio/low_mean": 0.001171875002910383, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00203124999825377, "completions/clipped_ratio": 1.0, "completions/max_length": 1573.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1086.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 600.0, "completions/min_terminated_length": 0.0, "entropy": 4.394650116562843, "epoch": 2.928571428571429, "frac_reward_zero_std": 0.75, "grad_norm": 0.015968069434165955, "kl": 0.021740168100222945, "learning_rate": 4.936026311617316e-05, "loss": 0.0007, "num_tokens": 3685920.0, "reward": 0.078125, "reward_std": 0.12255740165710449, "rewards/tree_correctness_reward/mean": 0.078125, "rewards/tree_correctness_reward/std": 0.27048972249031067, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.3845285177230835, "sampling/importance_sampling_ratio/min": 4.107822865311063e-34, "sampling/sampling_logp_difference/max": 76.875, "sampling/sampling_logp_difference/mean": 8.930509567260742, "step": 41 }, { "clip_ratio/high_max": 0.0027676569879986346, "clip_ratio/high_mean": 0.0013838284939993173, "clip_ratio/low_mean": 1.75168170244433e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0014013453110237606, "completions/clipped_ratio": 1.0, "completions/max_length": 892.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 567.0, "completions/min_terminated_length": 0.0, "entropy": 2.840691715478897, "epoch": 3.0, "frac_reward_zero_std": 0.875, "grad_norm": 0.04939565435051918, "kl": 0.013475928164552897, "learning_rate": 4.931396549348115e-05, "loss": 0.0014, "num_tokens": 3764736.0, "reward": 0.109375, "reward_std": 0.04419417306780815, "rewards/tree_correctness_reward/mean": 0.109375, "rewards/tree_correctness_reward/std": 0.3145764470100403, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5770444869995117, "sampling/importance_sampling_ratio/min": 2.1770663342316889e-35, "sampling/sampling_logp_difference/max": 79.8125, "sampling/sampling_logp_difference/mean": 6.156808853149414, "step": 42 }, { "clip_ratio/high_max": 0.0016521372672286816, "clip_ratio/high_mean": 0.0008260686336143408, "clip_ratio/low_mean": 0.0007030371270957403, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015291057279682718, "completions/clipped_ratio": 1.0, "completions/max_length": 889.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 827.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 766.0, "completions/min_terminated_length": 0.0, "entropy": 3.511796936392784, "epoch": 3.0714285714285716, "frac_reward_zero_std": 0.75, "grad_norm": 0.010950637981295586, "kl": 0.011213228281121701, "learning_rate": 4.926607386564898e-05, "loss": 0.0005, "num_tokens": 3849824.0, "reward": 0.125, "reward_std": 0.1157275140285492, "rewards/tree_correctness_reward/mean": 0.125, "rewards/tree_correctness_reward/std": 0.3333333432674408, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5019423961639404, "sampling/importance_sampling_ratio/min": 3.3386392847942014e-36, "sampling/sampling_logp_difference/max": 81.6875, "sampling/sampling_logp_difference/mean": 7.16123104095459, "step": 43 }, { "clip_ratio/high_max": 0.0025974025920731947, "clip_ratio/high_mean": 0.0012987012960365973, "clip_ratio/low_mean": 0.0010957792110275477, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002394480507064145, "completions/clipped_ratio": 1.0, "completions/max_length": 410.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 397.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 385.0, "completions/min_terminated_length": 0.0, "entropy": 1.3662154711782932, "epoch": 3.142857142857143, "frac_reward_zero_std": 0.875, "grad_norm": 0.008466017432510853, "kl": 0.014420989900827408, "learning_rate": 4.9216591372415445e-05, "loss": -0.0002, "num_tokens": 3907392.0, "reward": 0.078125, "reward_std": 0.0646936446428299, "rewards/tree_correctness_reward/mean": 0.078125, "rewards/tree_correctness_reward/std": 0.27048972249031067, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6666592359542847, "sampling/importance_sampling_ratio/min": 5.917879946392805e-35, "sampling/sampling_logp_difference/max": 78.8125, "sampling/sampling_logp_difference/mean": 5.179347991943359, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 969.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 902.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 835.0, "completions/min_terminated_length": 0.0, "entropy": 3.8994200080633163, "epoch": 3.2142857142857144, "frac_reward_zero_std": 1.0, "grad_norm": 0.00047840026672929525, "kl": 0.014965750684496015, "learning_rate": 4.916552125781528e-05, "loss": 0.0, "num_tokens": 3997248.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.4598158597946167, "sampling/importance_sampling_ratio/min": 2.7954085051878674e-35, "sampling/sampling_logp_difference/max": 79.5625, "sampling/sampling_logp_difference/mean": 7.75050687789917, "step": 45 }, { "clip_ratio/high_max": 0.003039675808395259, "clip_ratio/high_mean": 0.0015198379041976295, "clip_ratio/low_mean": 0.0002933020587079227, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0018131399629055522, "completions/clipped_ratio": 1.0, "completions/max_length": 665.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 625.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 586.0, "completions/min_terminated_length": 0.0, "entropy": 1.9509559720754623, "epoch": 3.2857142857142856, "frac_reward_zero_std": 0.875, "grad_norm": 0.005236838478595018, "kl": 0.011972436273936182, "learning_rate": 4.9112866869966475e-05, "loss": -0.0015, "num_tokens": 4069408.0, "reward": 0.09375, "reward_std": 0.0578637570142746, "rewards/tree_correctness_reward/mean": 0.09375, "rewards/tree_correctness_reward/std": 0.29378482699394226, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6656996011734009, "sampling/importance_sampling_ratio/min": 8.088783257874516e-35, "sampling/sampling_logp_difference/max": 78.5, "sampling/sampling_logp_difference/mean": 4.980671405792236, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 823.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 630.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 438.0, "completions/min_terminated_length": 0.0, "entropy": 2.5246231853961945, "epoch": 3.357142857142857, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003154308651573956, "kl": 0.010597107990179211, "learning_rate": 4.9058631660850765e-05, "loss": 0.0, "num_tokens": 4141888.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.589657187461853, "sampling/importance_sampling_ratio/min": 2.1770663342316889e-35, "sampling/sampling_logp_difference/max": 79.8125, "sampling/sampling_logp_difference/mean": 6.095928192138672, "step": 47 }, { "clip_ratio/high_max": 0.0016224605351453647, "clip_ratio/high_mean": 0.0008112302675726824, "clip_ratio/low_mean": 0.0006172404137032572, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001428470692189876, "completions/clipped_ratio": 1.0, "completions/max_length": 886.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 835.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 785.0, "completions/min_terminated_length": 0.0, "entropy": 4.2445837408304214, "epoch": 3.4285714285714284, "frac_reward_zero_std": 0.75, "grad_norm": 0.00991428829729557, "kl": 0.012052935257088393, "learning_rate": 4.900281918608732e-05, "loss": 0.0015, "num_tokens": 4227488.0, "reward": 0.25, "reward_std": 0.0883883461356163, "rewards/tree_correctness_reward/mean": 0.25, "rewards/tree_correctness_reward/std": 0.4364357888698578, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.4090440273284912, "sampling/importance_sampling_ratio/min": 6.175861782819467e-37, "sampling/sampling_logp_difference/max": 83.375, "sampling/sampling_logp_difference/mean": 8.462675094604492, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2029.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1408.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 788.0, "completions/min_terminated_length": 0.0, "entropy": 4.736899197101593, "epoch": 3.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009081251337192953, "kl": 0.019939493155106902, "learning_rate": 4.894543310469968e-05, "loss": 0.0, "num_tokens": 4349760.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.36505764722824097, "sampling/importance_sampling_ratio/min": 9.756933719611155e-35, "sampling/sampling_logp_difference/max": 78.3125, "sampling/sampling_logp_difference/mean": 9.400754928588867, "step": 49 }, { "clip_ratio/high_max": 0.0015074211405590177, "clip_ratio/high_mean": 0.0007537105702795088, "clip_ratio/low_mean": 0.000434833018516656, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0011885435887961648, "completions/clipped_ratio": 1.0, "completions/max_length": 815.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 677.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 539.0, "completions/min_terminated_length": 0.0, "entropy": 3.092482954263687, "epoch": 3.571428571428571, "frac_reward_zero_std": 0.875, "grad_norm": 0.01778266951441765, "kl": 0.011468878772575408, "learning_rate": 4.8886477178875826e-05, "loss": -0.0013, "num_tokens": 4425216.0, "reward": 0.0625, "reward_std": 0.06681530922651291, "rewards/tree_correctness_reward/mean": 0.0625, "rewards/tree_correctness_reward/std": 0.24397502839565277, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5152342319488525, "sampling/importance_sampling_ratio/min": 1.1769110281448947e-34, "sampling/sampling_logp_difference/max": 78.125, "sampling/sampling_logp_difference/mean": 7.034854888916016, "step": 50 }, { "epoch": 3.571428571428571, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 437.72, "eval_completions/max_terminated_length": 0.0, "eval_completions/mean_length": 401.1, "eval_completions/mean_terminated_length": 0.0, "eval_completions/min_length": 364.48, "eval_completions/min_terminated_length": 0.0, "eval_entropy": 0.8469550651311875, "eval_frac_reward_zero_std": 0.88, "eval_kl": 0.013830461800098419, "eval_loss": -0.0005663755582645535, "eval_num_tokens": 4425216.0, "eval_reward": 0.04, "eval_reward_std": 0.04680067300796509, "eval_rewards/tree_correctness_reward/mean": 0.04, "eval_rewards/tree_correctness_reward/std": 0.04680067300796509, "eval_runtime": 134.9035, "eval_samples_per_second": 0.185, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 0.8197375297546386, "eval_sampling/importance_sampling_ratio/min": 1.6823204313093521e-31, "eval_sampling/sampling_logp_difference/max": 73.59375, "eval_sampling/sampling_logp_difference/mean": 2.9100644731521608, "eval_steps_per_second": 0.03, "step": 50 }, { "clip_ratio/high_max": 0.0012489399523474276, "clip_ratio/high_mean": 0.0006244699761737138, "clip_ratio/low_mean": 0.00236281321849674, "clip_ratio/low_min": 0.0006214780078153126, "clip_ratio/region_mean": 0.0029872831946704537, "completions/clipped_ratio": 1.0, "completions/max_length": 988.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 845.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 703.0, "completions/min_terminated_length": 0.0, "entropy": 4.0951427817344666, "epoch": 3.642857142857143, "frac_reward_zero_std": 0.625, "grad_norm": 0.014702807180583477, "kl": 0.016870125487912446, "learning_rate": 4.882595527372152e-05, "loss": 0.0038, "num_tokens": 4511456.0, "reward": 0.078125, "reward_std": 0.15992169082164764, "rewards/tree_correctness_reward/mean": 0.078125, "rewards/tree_correctness_reward/std": 0.27048972249031067, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.43820780515670776, "sampling/importance_sampling_ratio/min": 6.57417089405873e-37, "sampling/sampling_logp_difference/max": 83.3125, "sampling/sampling_logp_difference/mean": 8.046345710754395, "step": 51 }, { "clip_ratio/high_max": 0.000964718870818615, "clip_ratio/high_mean": 0.0004823594354093075, "clip_ratio/low_mean": 0.000585722165851621, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010680816012609284, "completions/clipped_ratio": 1.0, "completions/max_length": 907.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 829.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 751.0, "completions/min_terminated_length": 0.0, "entropy": 3.921251520514488, "epoch": 3.7142857142857144, "frac_reward_zero_std": 0.875, "grad_norm": 0.006350706331431866, "kl": 0.01242197974352166, "learning_rate": 4.876387135700701e-05, "loss": 0.0, "num_tokens": 4596640.0, "reward": 0.046875, "reward_std": 0.0646936446428299, "rewards/tree_correctness_reward/mean": 0.046875, "rewards/tree_correctness_reward/std": 0.21304203569889069, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.46550291776657104, "sampling/importance_sampling_ratio/min": 1.3469032840202626e-33, "sampling/sampling_logp_difference/max": 75.6875, "sampling/sampling_logp_difference/mean": 7.6718034744262695, "step": 52 }, { "clip_ratio/high_max": 0.003396479136426933, "clip_ratio/high_mean": 0.0019218809538870119, "clip_ratio/low_mean": 0.0017201652444782667, "clip_ratio/low_min": 0.00025114809977822006, "clip_ratio/region_mean": 0.003642046245659003, "completions/clipped_ratio": 1.0, "completions/max_length": 871.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 728.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 585.0, "completions/min_terminated_length": 0.0, "entropy": 3.619505152106285, "epoch": 3.7857142857142856, "frac_reward_zero_std": 0.625, "grad_norm": 0.011479267850518227, "kl": 0.011398645845474675, "learning_rate": 4.870022949890676e-05, "loss": 0.0037, "num_tokens": 4675360.0, "reward": 0.140625, "reward_std": 0.17782479524612427, "rewards/tree_correctness_reward/mean": 0.140625, "rewards/tree_correctness_reward/std": 0.3503824472427368, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.4657965898513794, "sampling/importance_sampling_ratio/min": 1.6955009379010048e-35, "sampling/sampling_logp_difference/max": 80.0625, "sampling/sampling_logp_difference/mean": 7.693748950958252, "step": 53 }, { "clip_ratio/high_max": 0.0005296610179357231, "clip_ratio/high_mean": 0.00026483050896786153, "clip_ratio/low_mean": 0.00276180385844782, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0030266343674156815, "completions/clipped_ratio": 1.0, "completions/max_length": 1652.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1209.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 767.0, "completions/min_terminated_length": 0.0, "entropy": 3.9571817219257355, "epoch": 3.857142857142857, "frac_reward_zero_std": 0.875, "grad_norm": 0.012035323306918144, "kl": 0.018493804382160306, "learning_rate": 4.8635033871732755e-05, "loss": 0.0015, "num_tokens": 4784896.0, "reward": 0.015625, "reward_std": 0.04419417306780815, "rewards/tree_correctness_reward/mean": 0.015625, "rewards/tree_correctness_reward/std": 0.125, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.48889410495758057, "sampling/importance_sampling_ratio/min": 3.820870344979068e-35, "sampling/sampling_logp_difference/max": 79.25, "sampling/sampling_logp_difference/mean": 7.478582382202148, "step": 54 }, { "clip_ratio/high_max": 0.001768867892678827, "clip_ratio/high_mean": 0.0008844339463394135, "clip_ratio/low_mean": 0.00166357814669027, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0025480121075815987, "completions/clipped_ratio": 1.0, "completions/max_length": 937.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 839.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 742.0, "completions/min_terminated_length": 0.0, "entropy": 4.20221509039402, "epoch": 3.928571428571429, "frac_reward_zero_std": 0.75, "grad_norm": 0.011270307004451752, "kl": 0.01521630899515003, "learning_rate": 4.856828874966086e-05, "loss": -0.0012, "num_tokens": 4870752.0, "reward": 0.046875, "reward_std": 0.10205793380737305, "rewards/tree_correctness_reward/mean": 0.046875, "rewards/tree_correctness_reward/std": 0.21304203569889069, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.42717963457107544, "sampling/importance_sampling_ratio/min": 1.729458002261791e-33, "sampling/sampling_logp_difference/max": 75.4375, "sampling/sampling_logp_difference/mean": 8.156082153320312, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 578.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 472.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 367.0, "completions/min_terminated_length": 0.0, "entropy": 1.7483541369438171, "epoch": 4.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.00039491234929300845, "kl": 0.012441737460903823, "learning_rate": 4.8499998508450664e-05, "loss": 0.0, "num_tokens": 4933120.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6607979536056519, "sampling/importance_sampling_ratio/min": 2.2719725080459815e-37, "sampling/sampling_logp_difference/max": 84.375, "sampling/sampling_logp_difference/mean": 5.1545186042785645, "step": 56 }, { "clip_ratio/high_max": 0.0018892974112532102, "clip_ratio/high_mean": 0.0009446487056266051, "clip_ratio/low_mean": 0.001991421617276501, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0029360703338170424, "completions/clipped_ratio": 1.0, "completions/max_length": 612.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 590.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 569.0, "completions/min_terminated_length": 0.0, "entropy": 2.8661554902791977, "epoch": 4.071428571428571, "frac_reward_zero_std": 0.75, "grad_norm": 0.014024986885488033, "kl": 0.011203959118574858, "learning_rate": 4.8430167625158595e-05, "loss": 0.0006, "num_tokens": 5003040.0, "reward": 0.078125, "reward_std": 0.11100947856903076, "rewards/tree_correctness_reward/mean": 0.078125, "rewards/tree_correctness_reward/std": 0.27048972249031067, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5335410833358765, "sampling/importance_sampling_ratio/min": 2.1987591985594636e-34, "sampling/sampling_logp_difference/max": 77.5, "sampling/sampling_logp_difference/mean": 6.7769036293029785, "step": 57 }, { "clip_ratio/high_max": 0.0027071562508353963, "clip_ratio/high_mean": 0.0013535781254176982, "clip_ratio/low_mean": 0.001412429339325172, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002766007473837817, "completions/clipped_ratio": 1.0, "completions/max_length": 1008.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 769.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 531.0, "completions/min_terminated_length": 0.0, "entropy": 3.4341480508446693, "epoch": 4.142857142857143, "frac_reward_zero_std": 0.75, "grad_norm": 0.013395441696047783, "kl": 0.014455741504207253, "learning_rate": 4.8358800677844406e-05, "loss": 0.0018, "num_tokens": 5084416.0, "reward": 0.125, "reward_std": 0.1293872892856598, "rewards/tree_correctness_reward/mean": 0.125, "rewards/tree_correctness_reward/std": 0.3333333432674408, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.4870520830154419, "sampling/importance_sampling_ratio/min": 1.3336148713971936e-34, "sampling/sampling_logp_difference/max": 78.0, "sampling/sampling_logp_difference/mean": 7.405810356140137, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 708.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 659.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 611.0, "completions/min_terminated_length": 0.0, "entropy": 2.7121631503105164, "epoch": 4.214285714285714, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003867326013278216, "kl": 0.011796298203989863, "learning_rate": 4.828590234527106e-05, "loss": 0.0, "num_tokens": 5158752.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5777709484100342, "sampling/importance_sampling_ratio/min": 9.075362379071311e-36, "sampling/sampling_logp_difference/max": 80.6875, "sampling/sampling_logp_difference/mean": 6.13531494140625, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 678.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 542.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 406.0, "completions/min_terminated_length": 0.0, "entropy": 1.9547883048653603, "epoch": 4.285714285714286, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003418517007958144, "kl": 0.012483503785915673, "learning_rate": 4.821147740659794e-05, "loss": 0.0, "num_tokens": 5225568.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6516018509864807, "sampling/importance_sampling_ratio/min": 1.959732570142077e-33, "sampling/sampling_logp_difference/max": 75.3125, "sampling/sampling_logp_difference/mean": 5.2259697914123535, "step": 60 }, { "clip_ratio/high_max": 0.0006172839493956417, "clip_ratio/high_mean": 0.00030864197469782084, "clip_ratio/low_mean": 0.0011574074160307646, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0014660494052805007, "completions/clipped_ratio": 1.0, "completions/max_length": 661.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 533.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 405.0, "completions/min_terminated_length": 0.0, "entropy": 2.23892430216074, "epoch": 4.357142857142857, "frac_reward_zero_std": 0.875, "grad_norm": 0.005821478087455034, "kl": 0.014304787939181551, "learning_rate": 4.813553074106761e-05, "loss": 0.0004, "num_tokens": 5291808.0, "reward": 0.03125, "reward_std": 0.0578637570142746, "rewards/tree_correctness_reward/mean": 0.03125, "rewards/tree_correctness_reward/std": 0.17536810040473938, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6071267127990723, "sampling/importance_sampling_ratio/min": 2.1987591985594636e-34, "sampling/sampling_logp_difference/max": 77.5, "sampling/sampling_logp_difference/mean": 5.850136756896973, "step": 61 }, { "clip_ratio/high_max": 0.0018474842654541135, "clip_ratio/high_mean": 0.0009237421327270567, "clip_ratio/low_mean": 0.0014150943261483917, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002338836487979279, "completions/clipped_ratio": 1.0, "completions/max_length": 795.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 721.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 647.0, "completions/min_terminated_length": 0.0, "entropy": 2.81126207113266, "epoch": 4.428571428571429, "frac_reward_zero_std": 0.75, "grad_norm": 0.010938549414277077, "kl": 0.009499191452050582, "learning_rate": 4.805806732768585e-05, "loss": 0.0004, "num_tokens": 5370080.0, "reward": 0.0625, "reward_std": 0.10888782143592834, "rewards/tree_correctness_reward/mean": 0.0625, "rewards/tree_correctness_reward/std": 0.24397502839565277, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5852782130241394, "sampling/importance_sampling_ratio/min": 1.1652996780444926e-35, "sampling/sampling_logp_difference/max": 80.4375, "sampling/sampling_logp_difference/mean": 6.015608787536621, "step": 62 }, { "clip_ratio/high_max": 0.0014553687069565058, "clip_ratio/high_mean": 0.0008744964434299618, "clip_ratio/low_mean": 0.0011719190915755462, "clip_ratio/low_min": 0.00012583892385009676, "clip_ratio/region_mean": 0.0020464154913497623, "completions/clipped_ratio": 1.0, "completions/max_length": 773.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 759.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 745.0, "completions/min_terminated_length": 0.0, "entropy": 3.4914296120405197, "epoch": 4.5, "frac_reward_zero_std": 0.75, "grad_norm": 0.010757986456155777, "kl": 0.014272812113631517, "learning_rate": 4.7979092244895305e-05, "loss": 0.0015, "num_tokens": 5450784.0, "reward": 0.0625, "reward_std": 0.10888782143592834, "rewards/tree_correctness_reward/mean": 0.0625, "rewards/tree_correctness_reward/std": 0.24397502839565277, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.49289458990097046, "sampling/importance_sampling_ratio/min": 3.589375444066961e-35, "sampling/sampling_logp_difference/max": 79.3125, "sampling/sampling_logp_difference/mean": 7.297555923461914, "step": 63 }, { "clip_ratio/high_max": 0.0055389404296875, "clip_ratio/high_mean": 0.0029033031896688044, "clip_ratio/low_mean": 0.0016096878971438855, "clip_ratio/low_min": 0.000152587890625, "clip_ratio/region_mean": 0.004512991137744393, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1491.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 934.0, "completions/min_terminated_length": 0.0, "entropy": 5.136043608188629, "epoch": 4.571428571428571, "frac_reward_zero_std": 0.75, "grad_norm": 0.014029472135007381, "kl": 0.023238016525283456, "learning_rate": 4.789861067024253e-05, "loss": 0.0006, "num_tokens": 5578336.0, "reward": 0.109375, "reward_std": 0.10205793380737305, "rewards/tree_correctness_reward/mean": 0.109375, "rewards/tree_correctness_reward/std": 0.3145764470100403, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.31686344742774963, "sampling/importance_sampling_ratio/min": 8.610464551321452e-35, "sampling/sampling_logp_difference/max": 78.4375, "sampling/sampling_logp_difference/mean": 10.111202239990234, "step": 64 }, { "clip_ratio/high_max": 0.0007163323534769006, "clip_ratio/high_mean": 0.0003581661767384503, "clip_ratio/low_mean": 0.0007611031433043536, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0011192693200428039, "completions/clipped_ratio": 1.0, "completions/max_length": 808.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 753.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 698.0, "completions/min_terminated_length": 0.0, "entropy": 3.339474454522133, "epoch": 4.642857142857143, "frac_reward_zero_std": 0.875, "grad_norm": 0.007302358280867338, "kl": 0.01692346009076573, "learning_rate": 4.781662788003851e-05, "loss": 0.0009, "num_tokens": 5658656.0, "reward": 0.171875, "reward_std": 0.0646936446428299, "rewards/tree_correctness_reward/mean": 0.171875, "rewards/tree_correctness_reward/std": 0.38025420904159546, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5225365161895752, "sampling/importance_sampling_ratio/min": 7.138325476697025e-35, "sampling/sampling_logp_difference/max": 78.625, "sampling/sampling_logp_difference/mean": 6.876855850219727, "step": 65 }, { "clip_ratio/high_max": 0.0004740373478853144, "clip_ratio/high_mean": 0.0002370186739426572, "clip_ratio/low_mean": 0.0008569136407459155, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010939323146885727, "completions/clipped_ratio": 1.0, "completions/max_length": 857.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 753.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 649.0, "completions/min_terminated_length": 0.0, "entropy": 3.767613932490349, "epoch": 4.714285714285714, "frac_reward_zero_std": 0.875, "grad_norm": 0.006443016696721315, "kl": 0.02145662176189944, "learning_rate": 4.773314924901281e-05, "loss": -0.0031, "num_tokens": 5738976.0, "reward": 0.03125, "reward_std": 0.0578637570142746, "rewards/tree_correctness_reward/mean": 0.03125, "rewards/tree_correctness_reward/std": 0.17536810040473938, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.4544695019721985, "sampling/importance_sampling_ratio/min": 4.9549805217845695e-34, "sampling/sampling_logp_difference/max": 76.6875, "sampling/sampling_logp_difference/mean": 7.827626705169678, "step": 66 }, { "clip_ratio/high_max": 0.0027736686170101166, "clip_ratio/high_mean": 0.0013868343085050583, "clip_ratio/low_mean": 0.0005855522758793086, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001972386584384367, "completions/clipped_ratio": 1.0, "completions/max_length": 657.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 582.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 507.0, "completions/min_terminated_length": 0.0, "entropy": 2.4568098559975624, "epoch": 4.785714285714286, "frac_reward_zero_std": 0.875, "grad_norm": 0.00892623607069254, "kl": 0.011925855797016993, "learning_rate": 4.764818024996117e-05, "loss": -0.0001, "num_tokens": 5808352.0, "reward": 0.0625, "reward_std": 0.06681530922651291, "rewards/tree_correctness_reward/mean": 0.0625, "rewards/tree_correctness_reward/std": 0.24397502839565277, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5850483179092407, "sampling/importance_sampling_ratio/min": 1.419625586416994e-34, "sampling/sampling_logp_difference/max": 77.9375, "sampling/sampling_logp_difference/mean": 6.091662883758545, "step": 67 }, { "clip_ratio/high_max": 0.004917905738693662, "clip_ratio/high_mean": 0.002458952869346831, "clip_ratio/low_mean": 0.0008519206876371754, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0033108735606219852, "completions/clipped_ratio": 1.0, "completions/max_length": 807.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 684.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 562.0, "completions/min_terminated_length": 0.0, "entropy": 3.0002060011029243, "epoch": 4.857142857142857, "frac_reward_zero_std": 0.75, "grad_norm": 0.012368054129183292, "kl": 0.012778725184034556, "learning_rate": 4.756172645338675e-05, "loss": -0.0019, "num_tokens": 5884288.0, "reward": 0.140625, "reward_std": 0.10205793380737305, "rewards/tree_correctness_reward/mean": 0.140625, "rewards/tree_correctness_reward/std": 0.3503824472427368, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.535092294216156, "sampling/importance_sampling_ratio/min": 9.075362379071311e-36, "sampling/sampling_logp_difference/max": 80.6875, "sampling/sampling_logp_difference/mean": 6.747511386871338, "step": 68 }, { "clip_ratio/high_max": 0.00037153236917220056, "clip_ratio/high_mean": 0.00018576618458610028, "clip_ratio/low_mean": 0.0006811426719650626, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0008669088492752053, "completions/clipped_ratio": 1.0, "completions/max_length": 757.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 748.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 739.0, "completions/min_terminated_length": 0.0, "entropy": 3.3291787207126617, "epoch": 4.928571428571429, "frac_reward_zero_std": 0.875, "grad_norm": 0.007735888008028269, "kl": 0.011071905435528606, "learning_rate": 4.747379352713489e-05, "loss": 0.0016, "num_tokens": 5964288.0, "reward": 0.015625, "reward_std": 0.04419417306780815, "rewards/tree_correctness_reward/mean": 0.015625, "rewards/tree_correctness_reward/std": 0.125, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5214368104934692, "sampling/importance_sampling_ratio/min": 7.598708298374055e-35, "sampling/sampling_logp_difference/max": 78.5625, "sampling/sampling_logp_difference/mean": 6.898473739624023, "step": 69 }, { "clip_ratio/high_max": 0.0015096618226380087, "clip_ratio/high_mean": 0.00094064172662911, "clip_ratio/low_mean": 0.001875843259767862, "clip_ratio/low_min": 0.0007763198154862039, "clip_ratio/region_mean": 0.002816484990034951, "completions/clipped_ratio": 1.0, "completions/max_length": 925.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 773.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 621.0, "completions/min_terminated_length": 0.0, "entropy": 3.8110699504613876, "epoch": 5.0, "frac_reward_zero_std": 0.625, "grad_norm": 0.017148004844784737, "kl": 0.016000346862711012, "learning_rate": 4.738438723602154e-05, "loss": -0.0006, "num_tokens": 6045888.0, "reward": 0.109375, "reward_std": 0.1530819833278656, "rewards/tree_correctness_reward/mean": 0.109375, "rewards/tree_correctness_reward/std": 0.3145764470100403, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.44977352023124695, "sampling/importance_sampling_ratio/min": 7.449511149562219e-37, "sampling/sampling_logp_difference/max": 83.1875, "sampling/sampling_logp_difference/mean": 7.886619567871094, "step": 70 }, { "clip_ratio/high_max": 0.003355704597197473, "clip_ratio/high_mean": 0.0016778522985987365, "clip_ratio/low_mean": 0.0003408137708902359, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020186660694889724, "completions/clipped_ratio": 1.0, "completions/max_length": 774.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 685.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 596.0, "completions/min_terminated_length": 0.0, "entropy": 3.408262848854065, "epoch": 5.071428571428571, "frac_reward_zero_std": 0.875, "grad_norm": 0.008903984911739826, "kl": 0.015819466905668378, "learning_rate": 4.7293513441455364e-05, "loss": -0.0007, "num_tokens": 6121856.0, "reward": 0.109375, "reward_std": 0.04419417306780815, "rewards/tree_correctness_reward/mean": 0.109375, "rewards/tree_correctness_reward/std": 0.3145764470100403, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.48514968156814575, "sampling/importance_sampling_ratio/min": 7.674422876742102e-34, "sampling/sampling_logp_difference/max": 76.25, "sampling/sampling_logp_difference/mean": 7.419497489929199, "step": 71 }, { "clip_ratio/high_max": 0.000478560494229896, "clip_ratio/high_mean": 0.000239280247114948, "clip_ratio/low_mean": 0.0015792496105859755, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0018185298577009235, "completions/clipped_ratio": 1.0, "completions/max_length": 653.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 652.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 651.0, "completions/min_terminated_length": 0.0, "entropy": 3.167395517230034, "epoch": 5.142857142857143, "frac_reward_zero_std": 0.75, "grad_norm": 0.01227265689522028, "kl": 0.015939707052893937, "learning_rate": 4.7201178101053414e-05, "loss": -0.0029, "num_tokens": 6195712.0, "reward": 0.03125, "reward_std": 0.0883883461356163, "rewards/tree_correctness_reward/mean": 0.03125, "rewards/tree_correctness_reward/std": 0.17536810040473938, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5154479742050171, "sampling/importance_sampling_ratio/min": 1.0489692816875613e-33, "sampling/sampling_logp_difference/max": 75.9375, "sampling/sampling_logp_difference/mean": 6.99188756942749, "step": 72 }, { "clip_ratio/high_max": 0.0021694978058803827, "clip_ratio/high_mean": 0.0010847489029401913, "clip_ratio/low_mean": 0.0006000738540024031, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016848227569425944, "completions/clipped_ratio": 1.0, "completions/max_length": 807.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 742.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 677.0, "completions/min_terminated_length": 0.0, "entropy": 2.9740626141428947, "epoch": 5.214285714285714, "frac_reward_zero_std": 0.75, "grad_norm": 0.0185931995511055, "kl": 0.022571761859580874, "learning_rate": 4.710738726825059e-05, "loss": -0.001, "num_tokens": 6275328.0, "reward": 0.140625, "reward_std": 0.1315089464187622, "rewards/tree_correctness_reward/mean": 0.140625, "rewards/tree_correctness_reward/std": 0.3503824472427368, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5599408149719238, "sampling/importance_sampling_ratio/min": 1.6787733175152708e-36, "sampling/sampling_logp_difference/max": 82.375, "sampling/sampling_logp_difference/mean": 6.358048439025879, "step": 73 }, { "clip_ratio/high_max": 0.009188311523757875, "clip_ratio/high_mean": 0.004802180919796228, "clip_ratio/low_mean": 0.0017385618994012475, "clip_ratio/low_min": 0.00013868343376088887, "clip_ratio/region_mean": 0.0065407428264734335, "completions/clipped_ratio": 1.0, "completions/max_length": 1925.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1300.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 676.0, "completions/min_terminated_length": 0.0, "entropy": 4.720337927341461, "epoch": 5.285714285714286, "frac_reward_zero_std": 0.75, "grad_norm": 0.009382231160998344, "kl": 0.02783663384616375, "learning_rate": 4.701214709190277e-05, "loss": 0.0024, "num_tokens": 6390688.0, "reward": 0.109375, "reward_std": 0.12255740165710449, "rewards/tree_correctness_reward/mean": 0.109375, "rewards/tree_correctness_reward/std": 0.3145764470100403, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.3419308364391327, "sampling/importance_sampling_ratio/min": 2.0655430870101624e-34, "sampling/sampling_logp_difference/max": 77.5625, "sampling/sampling_logp_difference/mean": 9.803936958312988, "step": 74 }, { "clip_ratio/high_max": 0.000964964390732348, "clip_ratio/high_mean": 0.000482482195366174, "clip_ratio/low_mean": 0.000278355109912809, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000760837305278983, "completions/clipped_ratio": 1.0, "completions/max_length": 842.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 669.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 497.0, "completions/min_terminated_length": 0.0, "entropy": 3.174016147851944, "epoch": 5.357142857142857, "frac_reward_zero_std": 0.875, "grad_norm": 0.010106517933309078, "kl": 0.020186201378237456, "learning_rate": 4.69154638158837e-05, "loss": 0.0008, "num_tokens": 6465664.0, "reward": 0.046875, "reward_std": 0.0646936446428299, "rewards/tree_correctness_reward/mean": 0.046875, "rewards/tree_correctness_reward/std": 0.21304203569889069, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5207061767578125, "sampling/importance_sampling_ratio/min": 1.6086464555616602e-34, "sampling/sampling_logp_difference/max": 77.8125, "sampling/sampling_logp_difference/mean": 6.959237098693848, "step": 75 }, { "clip_ratio/high_max": 0.005356723777367733, "clip_ratio/high_mean": 0.0026783618886838667, "clip_ratio/low_mean": 0.0011612364251050167, "clip_ratio/low_min": 0.00012634770246222615, "clip_ratio/region_mean": 0.0038395983065129258, "completions/clipped_ratio": 1.0, "completions/max_length": 742.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 592.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 442.0, "completions/min_terminated_length": 0.0, "entropy": 2.392890691757202, "epoch": 5.428571428571429, "frac_reward_zero_std": 0.75, "grad_norm": 0.009935470297932625, "kl": 0.017821090936195105, "learning_rate": 4.6817343778675614e-05, "loss": 0.0032, "num_tokens": 6535680.0, "reward": 0.109375, "reward_std": 0.10205793380737305, "rewards/tree_correctness_reward/mean": 0.109375, "rewards/tree_correctness_reward/std": 0.3145764470100403, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5872933864593506, "sampling/importance_sampling_ratio/min": 9.660673761439827e-36, "sampling/sampling_logp_difference/max": 80.625, "sampling/sampling_logp_difference/mean": 6.1030449867248535, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 921.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 809.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 697.0, "completions/min_terminated_length": 0.0, "entropy": 3.652115747332573, "epoch": 5.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004807775258086622, "kl": 0.022480483981780708, "learning_rate": 4.671779341295378e-05, "loss": 0.0, "num_tokens": 6619584.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.4905499815940857, "sampling/importance_sampling_ratio/min": 1.0838975013979647e-36, "sampling/sampling_logp_difference/max": 82.8125, "sampling/sampling_logp_difference/mean": 7.284627914428711, "step": 77 }, { "clip_ratio/high_max": 0.00396728515625, "clip_ratio/high_mean": 0.001983642578125, "clip_ratio/low_mean": 0.00106048583984375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00304412841796875, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "entropy": 5.766772955656052, "epoch": 5.571428571428571, "frac_reward_zero_std": 0.75, "grad_norm": 0.010813243687152863, "kl": 0.03390377084724605, "learning_rate": 4.661681924516466e-05, "loss": 0.0027, "num_tokens": 6782784.0, "reward": 0.125, "reward_std": 0.1293872892856598, "rewards/tree_correctness_reward/mean": 0.125, "rewards/tree_correctness_reward/std": 0.3333333432674408, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.24853353202342987, "sampling/importance_sampling_ratio/min": 5.917879946392805e-35, "sampling/sampling_logp_difference/max": 78.8125, "sampling/sampling_logp_difference/mean": 11.392820358276367, "step": 78 }, { "clip_ratio/high_max": 0.0007874668517615646, "clip_ratio/high_mean": 0.0003937334258807823, "clip_ratio/low_mean": 0.00123300730047049, "clip_ratio/low_min": 0.00033156499557662755, "clip_ratio/region_mean": 0.0016267407263512723, "completions/clipped_ratio": 1.0, "completions/max_length": 754.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 667.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 580.0, "completions/min_terminated_length": 0.0, "entropy": 3.5397599041461945, "epoch": 5.642857142857143, "frac_reward_zero_std": 0.75, "grad_norm": 0.014571044594049454, "kl": 0.036117757728789, "learning_rate": 4.6514427895098134e-05, "loss": 0.001, "num_tokens": 6857600.0, "reward": 0.046875, "reward_std": 0.10205793380737305, "rewards/tree_correctness_reward/mean": 0.046875, "rewards/tree_correctness_reward/std": 0.21304203569889069, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.470174103975296, "sampling/importance_sampling_ratio/min": 2.713484435667519e-38, "sampling/sampling_logp_difference/max": 86.5, "sampling/sampling_logp_difference/mean": 7.647233009338379, "step": 79 }, { "clip_ratio/high_max": 0.0006377550889737904, "clip_ratio/high_mean": 0.0003188775444868952, "clip_ratio/low_mean": 0.0011957908136537299, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001514668358140625, "completions/clipped_ratio": 1.0, "completions/max_length": 682.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 635.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 588.0, "completions/min_terminated_length": 0.0, "entropy": 2.3500461876392365, "epoch": 5.714285714285714, "frac_reward_zero_std": 0.75, "grad_norm": 0.014862851239740849, "kl": 0.022990907251369208, "learning_rate": 4.641062607545347e-05, "loss": -0.0019, "num_tokens": 6930368.0, "reward": 0.0625, "reward_std": 0.1157275140285492, "rewards/tree_correctness_reward/mean": 0.0625, "rewards/tree_correctness_reward/std": 0.24397502839565277, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6159577965736389, "sampling/importance_sampling_ratio/min": 2.4915206584065623e-34, "sampling/sampling_logp_difference/max": 77.375, "sampling/sampling_logp_difference/mean": 5.629547595977783, "step": 80 }, { "clip_ratio/high_max": 0.0066337720199953765, "clip_ratio/high_mean": 0.0033168860099976882, "clip_ratio/low_mean": 0.0010416666937089758, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004358552694611717, "completions/clipped_ratio": 1.0, "completions/max_length": 938.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 754.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 570.0, "completions/min_terminated_length": 0.0, "entropy": 3.1865550503134727, "epoch": 5.785714285714286, "frac_reward_zero_std": 0.75, "grad_norm": 0.014303004369139671, "kl": 0.024169143056496978, "learning_rate": 4.630542059139924e-05, "loss": 0.002, "num_tokens": 7010752.0, "reward": 0.15625, "reward_std": 0.1246790662407875, "rewards/tree_correctness_reward/mean": 0.15625, "rewards/tree_correctness_reward/std": 0.36596253514289856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5240888595581055, "sampling/importance_sampling_ratio/min": 9.165791500701421e-35, "sampling/sampling_logp_difference/max": 78.375, "sampling/sampling_logp_difference/mean": 6.8953680992126465, "step": 81 }, { "clip_ratio/high_max": 0.0016149870934896171, "clip_ratio/high_mean": 0.0008074935467448086, "clip_ratio/low_mean": 0.00010093669334310107, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0009084302400879096, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1411.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 774.0, "completions/min_terminated_length": 0.0, "entropy": 4.92596247792244, "epoch": 5.857142857142857, "frac_reward_zero_std": 0.875, "grad_norm": 0.010589989833533764, "kl": 0.02689428662415594, "learning_rate": 4.6198818340127196e-05, "loss": -0.0013, "num_tokens": 7133184.0, "reward": 0.09375, "reward_std": 0.0578637570142746, "rewards/tree_correctness_reward/mean": 0.09375, "rewards/tree_correctness_reward/std": 0.29378482699394226, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.3394150733947754, "sampling/importance_sampling_ratio/min": 2.3405667503630854e-34, "sampling/sampling_logp_difference/max": 77.4375, "sampling/sampling_logp_difference/mean": 9.764121055603027, "step": 82 }, { "clip_ratio/high_max": 0.0007189771858975291, "clip_ratio/high_mean": 0.00035948859294876456, "clip_ratio/low_mean": 0.0033913345250766724, "clip_ratio/low_min": 0.00044116184290032834, "clip_ratio/region_mean": 0.0037508231253013946, "completions/clipped_ratio": 1.0, "completions/max_length": 1092.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 840.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 589.0, "completions/min_terminated_length": 0.0, "entropy": 3.6072495728731155, "epoch": 5.928571428571429, "frac_reward_zero_std": 0.75, "grad_norm": 0.018433451652526855, "kl": 0.06992799049476162, "learning_rate": 4.6090826310400116e-05, "loss": 0.0025, "num_tokens": 7219104.0, "reward": 0.03125, "reward_std": 0.0883883461356163, "rewards/tree_correctness_reward/mean": 0.03125, "rewards/tree_correctness_reward/std": 0.17536810040473938, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.4961131811141968, "sampling/importance_sampling_ratio/min": 4.608849322522735e-35, "sampling/sampling_logp_difference/max": 79.0625, "sampling/sampling_logp_difference/mean": 7.258965969085693, "step": 83 }, { "clip_ratio/high_max": 0.0016377005376853049, "clip_ratio/high_mean": 0.0008188502688426524, "clip_ratio/low_mean": 0.00038435829628724605, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0012032085651298985, "completions/clipped_ratio": 1.0, "completions/max_length": 935.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 818.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 702.0, "completions/min_terminated_length": 0.0, "entropy": 3.7802692502737045, "epoch": 6.0, "frac_reward_zero_std": 0.875, "grad_norm": 0.04603629559278488, "kl": 0.036175756715238094, "learning_rate": 4.5981451582093557e-05, "loss": 0.0, "num_tokens": 7303616.0, "reward": 0.078125, "reward_std": 0.0646936446428299, "rewards/tree_correctness_reward/mean": 0.078125, "rewards/tree_correctness_reward/std": 0.27048972249031067, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.4906505346298218, "sampling/importance_sampling_ratio/min": 1.959732570142077e-33, "sampling/sampling_logp_difference/max": 75.3125, "sampling/sampling_logp_difference/mean": 7.293757438659668, "step": 84 }, { "clip_ratio/high_max": 0.0006708407890982926, "clip_ratio/high_mean": 0.0003354203945491463, "clip_ratio/low_mean": 0.0005031305881857406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0008385509827348869, "completions/clipped_ratio": 1.0, "completions/max_length": 706.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 632.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 559.0, "completions/min_terminated_length": 0.0, "entropy": 2.4695974960923195, "epoch": 6.071428571428571, "frac_reward_zero_std": 0.875, "grad_norm": 0.007366519887000322, "kl": 0.048958684667013586, "learning_rate": 4.587070132573178e-05, "loss": -0.0011, "num_tokens": 7376224.0, "reward": 0.03125, "reward_std": 0.0578637570142746, "rewards/tree_correctness_reward/mean": 0.03125, "rewards/tree_correctness_reward/std": 0.17536810040473938, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6010376214981079, "sampling/importance_sampling_ratio/min": 7.523739969508069e-36, "sampling/sampling_logp_difference/max": 80.875, "sampling/sampling_logp_difference/mean": 5.8522233963012695, "step": 85 }, { "clip_ratio/high_max": 0.0007073901360854506, "clip_ratio/high_mean": 0.0003536950680427253, "clip_ratio/low_mean": 0.0014980026553530479, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0018516977233957732, "completions/clipped_ratio": 1.0, "completions/max_length": 1106.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 928.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 751.0, "completions/min_terminated_length": 0.0, "entropy": 4.572963684797287, "epoch": 6.142857142857143, "frac_reward_zero_std": 0.75, "grad_norm": 0.014835071749985218, "kl": 0.023276635678485036, "learning_rate": 4.57585828020176e-05, "loss": -0.0003, "num_tokens": 7467776.0, "reward": 0.03125, "reward_std": 0.0883883461356163, "rewards/tree_correctness_reward/mean": 0.03125, "rewards/tree_correctness_reward/std": 0.17536810040473938, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.3900577425956726, "sampling/importance_sampling_ratio/min": 2.4915206584065623e-34, "sampling/sampling_logp_difference/max": 77.375, "sampling/sampling_logp_difference/mean": 8.663357734680176, "step": 86 }, { "clip_ratio/high_max": 0.0011210762313567102, "clip_ratio/high_mean": 0.0005605381156783551, "clip_ratio/low_mean": 0.001611547137144953, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002172085252823308, "completions/clipped_ratio": 1.0, "completions/max_length": 1016.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 842.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 669.0, "completions/min_terminated_length": 0.0, "entropy": 3.561467692255974, "epoch": 6.214285714285714, "frac_reward_zero_std": 0.75, "grad_norm": 0.01637115143239498, "kl": 0.03449520713184029, "learning_rate": 4.5645103361356415e-05, "loss": 0.0002, "num_tokens": 7553824.0, "reward": 0.046875, "reward_std": 0.10205793380737305, "rewards/tree_correctness_reward/mean": 0.046875, "rewards/tree_correctness_reward/std": 0.21304203569889069, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5070828199386597, "sampling/importance_sampling_ratio/min": 1.9403981676294466e-34, "sampling/sampling_logp_difference/max": 77.625, "sampling/sampling_logp_difference/mean": 7.061343669891357, "step": 87 }, { "clip_ratio/high_max": 0.0009069920633919537, "clip_ratio/high_mean": 0.00045349603169597685, "clip_ratio/low_mean": 0.0006390171474777162, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001092513179173693, "completions/clipped_ratio": 1.0, "completions/max_length": 929.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 843.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 758.0, "completions/min_terminated_length": 0.0, "entropy": 3.844596207141876, "epoch": 6.285714285714286, "frac_reward_zero_std": 0.875, "grad_norm": 0.010715988464653492, "kl": 0.033867576508782804, "learning_rate": 4.55302704433743e-05, "loss": -0.0022, "num_tokens": 7639936.0, "reward": 0.015625, "reward_std": 0.04419417306780815, "rewards/tree_correctness_reward/mean": 0.015625, "rewards/tree_correctness_reward/std": 0.125, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.4739830493927002, "sampling/importance_sampling_ratio/min": 1.094697696541931e-35, "sampling/sampling_logp_difference/max": 80.5, "sampling/sampling_logp_difference/mean": 7.544139862060547, "step": 88 }, { "clip_ratio/high_max": 0.00012634770246222615, "clip_ratio/high_mean": 6.317385123111308e-05, "clip_ratio/low_mean": 0.0008423180333920754, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0009054918846231885, "completions/clipped_ratio": 1.0, "completions/max_length": 742.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 640.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 538.0, "completions/min_terminated_length": 0.0, "entropy": 2.7741521894931793, "epoch": 6.357142857142857, "frac_reward_zero_std": 0.875, "grad_norm": 0.013445493765175343, "kl": 0.029145828273613006, "learning_rate": 4.541409157643027e-05, "loss": -0.0008, "num_tokens": 7713024.0, "reward": 0.015625, "reward_std": 0.04419417306780815, "rewards/tree_correctness_reward/mean": 0.015625, "rewards/tree_correctness_reward/std": 0.125, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5612393617630005, "sampling/importance_sampling_ratio/min": 3.858942701415657e-34, "sampling/sampling_logp_difference/max": 76.9375, "sampling/sampling_logp_difference/mean": 6.401476860046387, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 704.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 696.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 689.0, "completions/min_terminated_length": 0.0, "entropy": 2.4043056070804596, "epoch": 6.428571428571429, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006551254773512483, "kl": 0.02599391050171107, "learning_rate": 4.529657437712276e-05, "loss": 0.0, "num_tokens": 7789728.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6312199831008911, "sampling/importance_sampling_ratio/min": 8.358120360582445e-38, "sampling/sampling_logp_difference/max": 85.375, "sampling/sampling_logp_difference/mean": 5.381896495819092, "step": 90 }, { "clip_ratio/high_max": 0.00929860316682607, "clip_ratio/high_mean": 0.005080490896943957, "clip_ratio/low_mean": 0.0023820855712983757, "clip_ratio/low_min": 0.0005760621497756802, "clip_ratio/region_mean": 0.007462576468242332, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1380.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 712.0, "completions/min_terminated_length": 0.0, "entropy": 4.543182924389839, "epoch": 6.5, "frac_reward_zero_std": 0.625, "grad_norm": 0.03275305777788162, "kl": 0.03492529003415257, "learning_rate": 4.517772654979023e-05, "loss": 0.0023, "num_tokens": 7910176.0, "reward": 0.203125, "reward_std": 0.1804211586713791, "rewards/tree_correctness_reward/mean": 0.203125, "rewards/tree_correctness_reward/std": 0.40550529956817627, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.3697096109390259, "sampling/importance_sampling_ratio/min": 4.563378985501584e-36, "sampling/sampling_logp_difference/max": 81.375, "sampling/sampling_logp_difference/mean": 9.444976806640625, "step": 91 }, { "clip_ratio/high_max": 0.0037020905656390823, "clip_ratio/high_mean": 0.0018510452828195412, "clip_ratio/low_mean": 0.003947081893784343, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005798127225716598, "completions/clipped_ratio": 1.0, "completions/max_length": 721.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 647.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 574.0, "completions/min_terminated_length": 0.0, "entropy": 2.9317656606435776, "epoch": 6.571428571428571, "frac_reward_zero_std": 0.5, "grad_norm": 0.018274588510394096, "kl": 0.054928607889451087, "learning_rate": 4.505755588600612e-05, "loss": 0.0001, "num_tokens": 7983744.0, "reward": 0.125, "reward_std": 0.2130674123764038, "rewards/tree_correctness_reward/mean": 0.125, "rewards/tree_correctness_reward/std": 0.3333333432674408, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5431411266326904, "sampling/importance_sampling_ratio/min": 2.3174752027302318e-35, "sampling/sampling_logp_difference/max": 79.75, "sampling/sampling_logp_difference/mean": 6.605520725250244, "step": 92 }, { "clip_ratio/high_max": 0.002148335537640378, "clip_ratio/high_mean": 0.001074167768820189, "clip_ratio/low_mean": 0.0011703514974215068, "clip_ratio/low_min": 0.00027292576851323247, "clip_ratio/region_mean": 0.0022445192371378653, "completions/clipped_ratio": 1.0, "completions/max_length": 786.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 736.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 687.0, "completions/min_terminated_length": 0.0, "entropy": 2.8584478348493576, "epoch": 6.642857142857143, "frac_reward_zero_std": 0.75, "grad_norm": 0.011851426213979721, "kl": 0.019605247303843498, "learning_rate": 4.493607026406802e-05, "loss": 0.0003, "num_tokens": 8063008.0, "reward": 0.09375, "reward_std": 0.1246790662407875, "rewards/tree_correctness_reward/mean": 0.09375, "rewards/tree_correctness_reward/std": 0.29378482699394226, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5827724933624268, "sampling/importance_sampling_ratio/min": 3.625141007634431e-34, "sampling/sampling_logp_difference/max": 77.0, "sampling/sampling_logp_difference/mean": 6.067327499389648, "step": 93 }, { "clip_ratio/high_max": 0.0011029412053176202, "clip_ratio/high_mean": 0.0005514706026588101, "clip_ratio/low_mean": 0.0008374183271371294, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013888889297959395, "completions/clipped_ratio": 1.0, "completions/max_length": 802.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 783.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 765.0, "completions/min_terminated_length": 0.0, "entropy": 3.592208430171013, "epoch": 6.714285714285714, "frac_reward_zero_std": 0.875, "grad_norm": 0.008097909390926361, "kl": 0.01901521551189944, "learning_rate": 4.481327764848118e-05, "loss": -0.0008, "num_tokens": 8145280.0, "reward": 0.171875, "reward_std": 0.0646936446428299, "rewards/tree_correctness_reward/mean": 0.171875, "rewards/tree_correctness_reward/std": 0.38025420904159546, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.49952632188796997, "sampling/importance_sampling_ratio/min": 2.0249872189461843e-36, "sampling/sampling_logp_difference/max": 82.1875, "sampling/sampling_logp_difference/mean": 7.187984466552734, "step": 94 }, { "clip_ratio/high_max": 0.0028635459020733833, "clip_ratio/high_mean": 0.0014317729510366917, "clip_ratio/low_mean": 0.0014006474521011114, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002832420403137803, "completions/clipped_ratio": 1.0, "completions/max_length": 831.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 666.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 502.0, "completions/min_terminated_length": 0.0, "entropy": 2.8841626793146133, "epoch": 6.785714285714286, "frac_reward_zero_std": 0.75, "grad_norm": 0.011033333837985992, "kl": 0.028880253026727587, "learning_rate": 4.4689186089436366e-05, "loss": 0.0024, "num_tokens": 8220064.0, "reward": 0.125, "reward_std": 0.0883883461356163, "rewards/tree_correctness_reward/mean": 0.125, "rewards/tree_correctness_reward/std": 0.3333333432674408, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5564569234848022, "sampling/importance_sampling_ratio/min": 1.3336148713971936e-34, "sampling/sampling_logp_difference/max": 78.0, "sampling/sampling_logp_difference/mean": 6.460968494415283, "step": 95 }, { "clip_ratio/high_max": 0.0027152449620189145, "clip_ratio/high_mean": 0.0013576224810094573, "clip_ratio/low_mean": 0.000715837290044874, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002073459771054331, "completions/clipped_ratio": 1.0, "completions/max_length": 786.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 709.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 633.0, "completions/min_terminated_length": 0.0, "entropy": 2.8617996647953987, "epoch": 6.857142857142857, "frac_reward_zero_std": 0.875, "grad_norm": 0.0210742074996233, "kl": 0.034816965693607926, "learning_rate": 4.4563803722282074e-05, "loss": 0.0011, "num_tokens": 8297600.0, "reward": 0.0625, "reward_std": 0.06681530922651291, "rewards/tree_correctness_reward/mean": 0.0625, "rewards/tree_correctness_reward/std": 0.24397502839565277, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5829811096191406, "sampling/importance_sampling_ratio/min": 2.823262612952882e-34, "sampling/sampling_logp_difference/max": 77.25, "sampling/sampling_logp_difference/mean": 6.060471534729004, "step": 96 }, { "clip_ratio/high_max": 0.002911938812758308, "clip_ratio/high_mean": 0.0017455744709877763, "clip_ratio/low_mean": 0.0027090720723208506, "clip_ratio/low_min": 0.0008234002671088092, "clip_ratio/region_mean": 0.004454646601516288, "completions/clipped_ratio": 1.0, "completions/max_length": 797.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 686.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 575.0, "completions/min_terminated_length": 0.0, "entropy": 2.6727165207266808, "epoch": 6.928571428571429, "frac_reward_zero_std": 0.5, "grad_norm": 0.029905669391155243, "kl": 0.04022192506818101, "learning_rate": 4.443713876699124e-05, "loss": 0.003, "num_tokens": 8373632.0, "reward": 0.171875, "reward_std": 0.21778544783592224, "rewards/tree_correctness_reward/mean": 0.171875, "rewards/tree_correctness_reward/std": 0.38025420904159546, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5767068862915039, "sampling/importance_sampling_ratio/min": 3.0053473766614753e-34, "sampling/sampling_logp_difference/max": 77.1875, "sampling/sampling_logp_difference/mean": 6.1859636306762695, "step": 97 }, { "clip_ratio/high_max": 0.001993534475332126, "clip_ratio/high_mean": 0.0011422127136029303, "clip_ratio/low_mean": 0.002243586077383952, "clip_ratio/low_min": 0.0003740026513696648, "clip_ratio/region_mean": 0.0033857987764349673, "completions/clipped_ratio": 1.0, "completions/max_length": 752.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 666.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 580.0, "completions/min_terminated_length": 0.0, "entropy": 2.502697393298149, "epoch": 7.0, "frac_reward_zero_std": 0.625, "grad_norm": 0.013924991711974144, "kl": 0.0489629739895463, "learning_rate": 4.430919952762226e-05, "loss": 0.0029, "num_tokens": 8448384.0, "reward": 0.09375, "reward_std": 0.1552036553621292, "rewards/tree_correctness_reward/mean": 0.09375, "rewards/tree_correctness_reward/std": 0.29378482699394226, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6012206673622131, "sampling/importance_sampling_ratio/min": 1.1769110281448947e-34, "sampling/sampling_logp_difference/max": 78.125, "sampling/sampling_logp_difference/mean": 5.826287269592285, "step": 98 }, { "clip_ratio/high_max": 0.006097078403399792, "clip_ratio/high_mean": 0.003048539201699896, "clip_ratio/low_mean": 0.0009113234264077619, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003959862646297552, "completions/clipped_ratio": 1.0, "completions/max_length": 772.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 611.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 450.0, "completions/min_terminated_length": 0.0, "entropy": 2.2854697704315186, "epoch": 7.071428571428571, "frac_reward_zero_std": 0.75, "grad_norm": 0.020640557631850243, "kl": 0.0364776166388765, "learning_rate": 4.417999439177466e-05, "loss": -0.0018, "num_tokens": 8519616.0, "reward": 0.140625, "reward_std": 0.10205793380737305, "rewards/tree_correctness_reward/mean": 0.140625, "rewards/tree_correctness_reward/std": 0.3503824472427368, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6112511157989502, "sampling/importance_sampling_ratio/min": 7.449511149562219e-37, "sampling/sampling_logp_difference/max": 83.1875, "sampling/sampling_logp_difference/mean": 5.764904975891113, "step": 99 }, { "clip_ratio/high_max": 0.000701508586644195, "clip_ratio/high_mean": 0.0003507542933220975, "clip_ratio/low_mean": 0.002276421604619827, "clip_ratio/low_min": 0.0004180601972620934, "clip_ratio/region_mean": 0.0026271759052178822, "completions/clipped_ratio": 1.0, "completions/max_length": 698.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 648.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 598.0, "completions/min_terminated_length": 0.0, "entropy": 2.5977519527077675, "epoch": 7.142857142857143, "frac_reward_zero_std": 0.75, "grad_norm": 0.13337484002113342, "kl": 0.9475500697735697, "learning_rate": 4.404953183003916e-05, "loss": -0.0004, "num_tokens": 8593216.0, "reward": 0.03125, "reward_std": 0.0883883461356163, "rewards/tree_correctness_reward/mean": 0.03125, "rewards/tree_correctness_reward/std": 0.17536810040473938, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5812804698944092, "sampling/importance_sampling_ratio/min": 1.2282168042440444e-36, "sampling/sampling_logp_difference/max": 82.6875, "sampling/sampling_logp_difference/mean": 6.097937107086182, "step": 100 }, { "epoch": 7.142857142857143, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 482.72, "eval_completions/max_terminated_length": 0.0, "eval_completions/mean_length": 452.14, "eval_completions/mean_terminated_length": 0.0, "eval_completions/min_length": 421.56, "eval_completions/min_terminated_length": 0.0, "eval_entropy": 0.810429340004921, "eval_frac_reward_zero_std": 0.92, "eval_kl": 0.06461213760077954, "eval_loss": 0.00012982486805412918, "eval_num_tokens": 8593216.0, "eval_reward": 0.035, "eval_reward_std": 0.03265853762626648, "eval_rewards/tree_correctness_reward/mean": 0.035, "eval_rewards/tree_correctness_reward/std": 0.03265853762626648, "eval_runtime": 141.2571, "eval_samples_per_second": 0.177, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 0.8277249264717103, "eval_sampling/importance_sampling_ratio/min": 3.849412337127256e-31, "eval_sampling/sampling_logp_difference/max": 73.4625, "eval_sampling/sampling_logp_difference/mean": 2.7684663105010987, "eval_steps_per_second": 0.028, "step": 100 }, { "clip_ratio/high_max": 0.0005626023048534989, "clip_ratio/high_mean": 0.00028130115242674947, "clip_ratio/low_mean": 0.0008950491119321669, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0011763502643589163, "completions/clipped_ratio": 1.0, "completions/max_length": 611.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 601.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 591.0, "completions/min_terminated_length": 0.0, "entropy": 2.1469287425279617, "epoch": 7.214285714285714, "frac_reward_zero_std": 0.875, "grad_norm": 0.009120475500822067, "kl": 0.03733851038850844, "learning_rate": 4.391782039544238e-05, "loss": 0.0017, "num_tokens": 8663808.0, "reward": 0.015625, "reward_std": 0.04419417306780815, "rewards/tree_correctness_reward/mean": 0.015625, "rewards/tree_correctness_reward/std": 0.125, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6272875070571899, "sampling/importance_sampling_ratio/min": 4.067295401679061e-35, "sampling/sampling_logp_difference/max": 79.1875, "sampling/sampling_logp_difference/mean": 5.515982627868652, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 936.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 759.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 582.0, "completions/min_terminated_length": 0.0, "entropy": 2.9964671432971954, "epoch": 7.285714285714286, "frac_reward_zero_std": 1.0, "grad_norm": 0.00042415031930431724, "kl": 0.02624429063871503, "learning_rate": 4.378486872288611e-05, "loss": 0.0, "num_tokens": 8744512.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5705947279930115, "sampling/importance_sampling_ratio/min": 1.5262413857920149e-33, "sampling/sampling_logp_difference/max": 75.5625, "sampling/sampling_logp_difference/mean": 6.239292144775391, "step": 102 }, { "clip_ratio/high_max": 0.0052806122112087905, "clip_ratio/high_mean": 0.003217965058865957, "clip_ratio/low_mean": 0.0027462715879664756, "clip_ratio/low_min": 0.000892366559128277, "clip_ratio/region_mean": 0.00596423665410839, "completions/clipped_ratio": 1.0, "completions/max_length": 700.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 668.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 637.0, "completions/min_terminated_length": 0.0, "entropy": 2.9085165560245514, "epoch": 7.357142857142857, "frac_reward_zero_std": 0.375, "grad_norm": 0.019966477528214455, "kl": 0.03235693974420428, "learning_rate": 4.365068552858115e-05, "loss": 0.004, "num_tokens": 8819424.0, "reward": 0.296875, "reward_std": 0.27564918994903564, "rewards/tree_correctness_reward/mean": 0.296875, "rewards/tree_correctness_reward/std": 0.4604927599430084, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5492045879364014, "sampling/importance_sampling_ratio/min": 1.038620201872273e-34, "sampling/sampling_logp_difference/max": 78.25, "sampling/sampling_logp_difference/mean": 6.559114456176758, "step": 103 }, { "clip_ratio/high_max": 0.003994269151007757, "clip_ratio/high_mean": 0.0019971345755038783, "clip_ratio/low_mean": 0.001872726577857975, "clip_ratio/low_min": 0.00030680224881507456, "clip_ratio/region_mean": 0.003869861175189726, "completions/clipped_ratio": 1.0, "completions/max_length": 743.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 728.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 713.0, "completions/min_terminated_length": 0.0, "entropy": 3.298244744539261, "epoch": 7.428571428571429, "frac_reward_zero_std": 0.625, "grad_norm": 0.016340700909495354, "kl": 0.03405057732015848, "learning_rate": 4.3515279609476e-05, "loss": 0.0053, "num_tokens": 8898144.0, "reward": 0.140625, "reward_std": 0.15992169082164764, "rewards/tree_correctness_reward/mean": 0.140625, "rewards/tree_correctness_reward/std": 0.3503824472427368, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5118764638900757, "sampling/importance_sampling_ratio/min": 9.854154449263851e-34, "sampling/sampling_logp_difference/max": 76.0, "sampling/sampling_logp_difference/mean": 7.03498649597168, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 850.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 825.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 801.0, "completions/min_terminated_length": 0.0, "entropy": 3.1484078764915466, "epoch": 7.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011089500039815903, "kl": 0.026959966053254902, "learning_rate": 4.337865984268001e-05, "loss": 0.0, "num_tokens": 8983104.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5634087920188904, "sampling/importance_sampling_ratio/min": 4.2868981798105594e-36, "sampling/sampling_logp_difference/max": 81.4375, "sampling/sampling_logp_difference/mean": 6.295953750610352, "step": 105 }, { "clip_ratio/high_max": 0.004571813900838606, "clip_ratio/high_mean": 0.0023722329206066206, "clip_ratio/low_mean": 0.002433168927382212, "clip_ratio/low_min": 0.00010444518557051197, "clip_ratio/region_mean": 0.004805401877092663, "completions/clipped_ratio": 1.0, "completions/max_length": 1496.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1110.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 724.0, "completions/min_terminated_length": 0.0, "entropy": 4.45740570127964, "epoch": 7.571428571428571, "frac_reward_zero_std": 0.625, "grad_norm": 0.013490360230207443, "kl": 0.026454119011759758, "learning_rate": 4.324083518488151e-05, "loss": 0.0032, "num_tokens": 9086272.0, "reward": 0.140625, "reward_std": 0.15992169082164764, "rewards/tree_correctness_reward/mean": 0.140625, "rewards/tree_correctness_reward/std": 0.3503824472427368, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.38894617557525635, "sampling/importance_sampling_ratio/min": 1.6955009379010048e-35, "sampling/sampling_logp_difference/max": 80.0625, "sampling/sampling_logp_difference/mean": 8.847061157226562, "step": 106 }, { "clip_ratio/high_max": 0.0031221735989674926, "clip_ratio/high_mean": 0.0015610867994837463, "clip_ratio/low_mean": 0.0028859106387244537, "clip_ratio/low_min": 0.0005668717640219256, "clip_ratio/region_mean": 0.004446997430932242, "completions/clipped_ratio": 1.0, "completions/max_length": 764.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 587.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 411.0, "completions/min_terminated_length": 0.0, "entropy": 2.0717884600162506, "epoch": 7.642857142857143, "frac_reward_zero_std": 0.625, "grad_norm": 0.0173050444573164, "kl": 0.037493481184355915, "learning_rate": 4.3101814671760546e-05, "loss": 0.0004, "num_tokens": 9156000.0, "reward": 0.09375, "reward_std": 0.16675157845020294, "rewards/tree_correctness_reward/mean": 0.09375, "rewards/tree_correctness_reward/std": 0.29378482699394226, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6436735391616821, "sampling/importance_sampling_ratio/min": 9.075362379071311e-36, "sampling/sampling_logp_difference/max": 80.6875, "sampling/sampling_logp_difference/mean": 5.338537693023682, "step": 107 }, { "clip_ratio/high_max": 0.0013812154502375051, "clip_ratio/high_mean": 0.0006906077251187526, "clip_ratio/low_mean": 0.0006042817767593078, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0012948895164299756, "completions/clipped_ratio": 1.0, "completions/max_length": 905.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 842.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 780.0, "completions/min_terminated_length": 0.0, "entropy": 3.9175513237714767, "epoch": 7.714285714285714, "frac_reward_zero_std": 0.875, "grad_norm": 0.008563239127397537, "kl": 0.05180044722510502, "learning_rate": 4.296160741739652e-05, "loss": 0.0013, "num_tokens": 9242048.0, "reward": 0.078125, "reward_std": 0.0646936446428299, "rewards/tree_correctness_reward/mean": 0.078125, "rewards/tree_correctness_reward/std": 0.27048972249031067, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.4644690454006195, "sampling/importance_sampling_ratio/min": 6.362320011142437e-34, "sampling/sampling_logp_difference/max": 76.4375, "sampling/sampling_logp_difference/mean": 7.678481578826904, "step": 108 }, { "clip_ratio/high_max": 0.0056275931710843, "clip_ratio/high_mean": 0.00281379658554215, "clip_ratio/low_mean": 0.002455871261190623, "clip_ratio/low_min": 0.0007209247924038209, "clip_ratio/region_mean": 0.005269667919492349, "completions/clipped_ratio": 1.0, "completions/max_length": 499.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 484.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 469.0, "completions/min_terminated_length": 0.0, "entropy": 1.981952540576458, "epoch": 7.785714285714286, "frac_reward_zero_std": 0.625, "grad_norm": 0.014496243558824062, "kl": 0.03448413545265794, "learning_rate": 4.2820222613670736e-05, "loss": -0.0022, "num_tokens": 9305152.0, "reward": 0.171875, "reward_std": 0.1804211586713791, "rewards/tree_correctness_reward/mean": 0.171875, "rewards/tree_correctness_reward/std": 0.38025420904159546, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6234554052352905, "sampling/importance_sampling_ratio/min": 2.0451645895225563e-35, "sampling/sampling_logp_difference/max": 79.875, "sampling/sampling_logp_difference/mean": 5.632316589355469, "step": 109 }, { "clip_ratio/high_max": 0.0019660595862660557, "clip_ratio/high_mean": 0.0009830297931330279, "clip_ratio/low_mean": 0.00031043047783896327, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0012934602709719911, "completions/clipped_ratio": 1.0, "completions/max_length": 1212.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 908.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 604.0, "completions/min_terminated_length": 0.0, "entropy": 3.948840707540512, "epoch": 7.857142857142857, "frac_reward_zero_std": 0.875, "grad_norm": 114.8449935913086, "kl": 15.54290193959605, "learning_rate": 4.267766952966369e-05, "loss": 0.0065, "num_tokens": 9395392.0, "reward": 0.078125, "reward_std": 0.0646936446428299, "rewards/tree_correctness_reward/mean": 0.078125, "rewards/tree_correctness_reward/std": 0.27048972249031067, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.45521819591522217, "sampling/importance_sampling_ratio/min": 3.4055047095468348e-34, "sampling/sampling_logp_difference/max": 77.0625, "sampling/sampling_logp_difference/mean": 7.817584037780762, "step": 110 }, { "clip_ratio/high_max": 0.00018638171604834497, "clip_ratio/high_mean": 9.319085802417248e-05, "clip_ratio/low_mean": 0.0009474403486819938, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010406312067061663, "completions/clipped_ratio": 1.0, "completions/max_length": 1006.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 852.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 699.0, "completions/min_terminated_length": 0.0, "entropy": 3.0074464827775955, "epoch": 7.928571428571429, "frac_reward_zero_std": 0.875, "grad_norm": 0.007773037068545818, "kl": 0.031861686147749424, "learning_rate": 4.253395751104748e-05, "loss": 0.0015, "num_tokens": 9482080.0, "reward": 0.015625, "reward_std": 0.04419417306780815, "rewards/tree_correctness_reward/mean": 0.015625, "rewards/tree_correctness_reward/std": 0.125, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5986474752426147, "sampling/importance_sampling_ratio/min": 6.362320011142437e-34, "sampling/sampling_logp_difference/max": 76.4375, "sampling/sampling_logp_difference/mean": 5.811089038848877, "step": 111 }, { "clip_ratio/high_max": 6.620762724196538e-05, "clip_ratio/high_mean": 3.310381362098269e-05, "clip_ratio/low_mean": 0.0009931143977155443, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001026218211336527, "completions/clipped_ratio": 1.0, "completions/max_length": 583.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 527.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 472.0, "completions/min_terminated_length": 0.0, "entropy": 1.9900961145758629, "epoch": 8.0, "frac_reward_zero_std": 0.875, "grad_norm": 0.014562413096427917, "kl": 0.060134623490739614, "learning_rate": 4.238909597947307e-05, "loss": -0.0003, "num_tokens": 9547968.0, "reward": 0.015625, "reward_std": 0.04419417306780815, "rewards/tree_correctness_reward/mean": 0.015625, "rewards/tree_correctness_reward/std": 0.125, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6425817012786865, "sampling/importance_sampling_ratio/min": 8.696261289635204e-34, "sampling/sampling_logp_difference/max": 76.125, "sampling/sampling_logp_difference/mean": 5.338327884674072, "step": 112 }, { "clip_ratio/high_max": 0.0024542182072764263, "clip_ratio/high_mean": 0.0013066931423963979, "clip_ratio/low_mean": 0.0020522415688901674, "clip_ratio/low_min": 0.00029600811831187457, "clip_ratio/region_mean": 0.0033589346712687984, "completions/clipped_ratio": 1.0, "completions/max_length": 739.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 664.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 589.0, "completions/min_terminated_length": 0.0, "entropy": 3.0523907095193863, "epoch": 8.071428571428571, "frac_reward_zero_std": 0.5, "grad_norm": 0.020286278799176216, "kl": 0.01962436578469351, "learning_rate": 4.224309443195261e-05, "loss": 0.0008, "num_tokens": 9622592.0, "reward": 0.1875, "reward_std": 0.19727617502212524, "rewards/tree_correctness_reward/mean": 0.1875, "rewards/tree_correctness_reward/std": 0.39339789748191833, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5337232947349548, "sampling/importance_sampling_ratio/min": 2.0861246614783664e-33, "sampling/sampling_logp_difference/max": 75.25, "sampling/sampling_logp_difference/mean": 6.745914936065674, "step": 113 }, { "clip_ratio/high_max": 0.005508956004632637, "clip_ratio/high_mean": 0.002912572439527139, "clip_ratio/low_mean": 0.0010151585302082822, "clip_ratio/low_min": 0.00029127829475328326, "clip_ratio/region_mean": 0.003927731042494997, "completions/clipped_ratio": 1.0, "completions/max_length": 751.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 672.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 593.0, "completions/min_terminated_length": 0.0, "entropy": 2.783567361533642, "epoch": 8.142857142857142, "frac_reward_zero_std": 0.625, "grad_norm": 0.012183724902570248, "kl": 0.021032700198702514, "learning_rate": 4.2095962440236844e-05, "loss": 0.0035, "num_tokens": 9697728.0, "reward": 0.203125, "reward_std": 0.16887323558330536, "rewards/tree_correctness_reward/mean": 0.203125, "rewards/tree_correctness_reward/std": 0.40550529956817627, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5608242750167847, "sampling/importance_sampling_ratio/min": 5.504485832522232e-36, "sampling/sampling_logp_difference/max": 81.1875, "sampling/sampling_logp_difference/mean": 6.375734329223633, "step": 114 }, { "clip_ratio/high_max": 0.0026908892978099175, "clip_ratio/high_mean": 0.0015662326804886106, "clip_ratio/low_mean": 0.001571751752635464, "clip_ratio/low_min": 0.00010190217290073633, "clip_ratio/region_mean": 0.003137984429486096, "completions/clipped_ratio": 1.0, "completions/max_length": 920.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 778.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 637.0, "completions/min_terminated_length": 0.0, "entropy": 3.5850853323936462, "epoch": 8.214285714285714, "frac_reward_zero_std": 0.75, "grad_norm": 0.016590217128396034, "kl": 0.027193782036192715, "learning_rate": 4.194770965018758e-05, "loss": 0.0011, "num_tokens": 9779680.0, "reward": 0.125, "reward_std": 0.0883883461356163, "rewards/tree_correctness_reward/mean": 0.125, "rewards/tree_correctness_reward/std": 0.3333333432674408, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.492942214012146, "sampling/importance_sampling_ratio/min": 4.244603898090227e-37, "sampling/sampling_logp_difference/max": 83.75, "sampling/sampling_logp_difference/mean": 7.2807135581970215, "step": 115 }, { "clip_ratio/high_max": 0.001795134761778172, "clip_ratio/high_mean": 0.000897567380889086, "clip_ratio/low_mean": 0.0012456037256924901, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0021431710993056186, "completions/clipped_ratio": 1.0, "completions/max_length": 853.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 759.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 666.0, "completions/min_terminated_length": 0.0, "entropy": 3.8796988874673843, "epoch": 8.285714285714286, "frac_reward_zero_std": 0.75, "grad_norm": 0.013858637772500515, "kl": 0.019802901311777532, "learning_rate": 4.179834578114531e-05, "loss": -0.0017, "num_tokens": 9860416.0, "reward": 0.078125, "reward_std": 0.11100947856903076, "rewards/tree_correctness_reward/mean": 0.078125, "rewards/tree_correctness_reward/std": 0.27048972249031067, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.4570200443267822, "sampling/importance_sampling_ratio/min": 8.696261289635204e-34, "sampling/sampling_logp_difference/max": 76.125, "sampling/sampling_logp_difference/mean": 7.789185523986816, "step": 116 }, { "clip_ratio/high_max": 0.0071011112740961835, "clip_ratio/high_mean": 0.004180822135822382, "clip_ratio/low_mean": 0.002795072599838022, "clip_ratio/low_min": 0.0006302099136519246, "clip_ratio/region_mean": 0.006975894793868065, "completions/clipped_ratio": 1.0, "completions/max_length": 743.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 557.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 372.0, "completions/min_terminated_length": 0.0, "entropy": 2.385706216096878, "epoch": 8.357142857142858, "frac_reward_zero_std": 0.5, "grad_norm": 0.019919894635677338, "kl": 0.026737659121863544, "learning_rate": 4.164788062529203e-05, "loss": 0.0027, "num_tokens": 9928224.0, "reward": 0.1875, "reward_std": 0.23356688022613525, "rewards/tree_correctness_reward/mean": 0.1875, "rewards/tree_correctness_reward/std": 0.39339789748191833, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5888185501098633, "sampling/importance_sampling_ratio/min": 3.199175604441272e-34, "sampling/sampling_logp_difference/max": 77.125, "sampling/sampling_logp_difference/mean": 6.107879638671875, "step": 117 }, { "clip_ratio/high_max": 0.0015453297091880813, "clip_ratio/high_mean": 0.0007726648545940407, "clip_ratio/low_mean": 0.0005580357319558971, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013307005865499377, "completions/clipped_ratio": 1.0, "completions/max_length": 736.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 732.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 728.0, "completions/min_terminated_length": 0.0, "entropy": 3.2919307351112366, "epoch": 8.428571428571429, "frac_reward_zero_std": 0.875, "grad_norm": 0.008156086318194866, "kl": 0.04727684520184994, "learning_rate": 4.149632404700925e-05, "loss": 0.0014, "num_tokens": 10007200.0, "reward": 0.0625, "reward_std": 0.06681530922651291, "rewards/tree_correctness_reward/mean": 0.0625, "rewards/tree_correctness_reward/std": 0.24397502839565277, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5338599681854248, "sampling/importance_sampling_ratio/min": 5.274548762960186e-34, "sampling/sampling_logp_difference/max": 76.625, "sampling/sampling_logp_difference/mean": 6.707066535949707, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 854.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 820.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 786.0, "completions/min_terminated_length": 0.0, "entropy": 2.8002987802028656, "epoch": 8.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.0002640416787471622, "kl": 0.0191013136645779, "learning_rate": 4.134368598223132e-05, "loss": 0.0, "num_tokens": 10091808.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6113249063491821, "sampling/importance_sampling_ratio/min": 1.5414491051834882e-32, "sampling/sampling_logp_difference/max": 73.25, "sampling/sampling_logp_difference/mean": 5.60654354095459, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 891.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 791.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 692.0, "completions/min_terminated_length": 0.0, "entropy": 3.3919677138328552, "epoch": 8.571428571428571, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008797376649454236, "kl": 0.034335985546931624, "learning_rate": 4.118997643779401e-05, "loss": 0.0, "num_tokens": 10174592.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.520865261554718, "sampling/importance_sampling_ratio/min": 6.29955056431282e-35, "sampling/sampling_logp_difference/max": 78.75, "sampling/sampling_logp_difference/mean": 6.891762733459473, "step": 120 }, { "clip_ratio/high_max": 0.001228632521815598, "clip_ratio/high_mean": 0.000614316260907799, "clip_ratio/low_mean": 0.0009081196767510846, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015224359376588836, "completions/clipped_ratio": 1.0, "completions/max_length": 852.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 718.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 585.0, "completions/min_terminated_length": 0.0, "entropy": 3.2409535944461823, "epoch": 8.642857142857142, "frac_reward_zero_std": 0.875, "grad_norm": 0.008725927211344242, "kl": 0.02664886775892228, "learning_rate": 4.10352054907785e-05, "loss": 0.0011, "num_tokens": 10252704.0, "reward": 0.0625, "reward_std": 0.06681530922651291, "rewards/tree_correctness_reward/mean": 0.0625, "rewards/tree_correctness_reward/std": 0.24397502839565277, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5229487419128418, "sampling/importance_sampling_ratio/min": 3.0352933707551616e-33, "sampling/sampling_logp_difference/max": 74.875, "sampling/sampling_logp_difference/mean": 6.8911051750183105, "step": 121 }, { "clip_ratio/high_max": 0.004082514213223476, "clip_ratio/high_mean": 0.002884109508158872, "clip_ratio/low_mean": 0.005067289908765815, "clip_ratio/low_min": 0.0003837736148852855, "clip_ratio/region_mean": 0.0079513995005982, "completions/clipped_ratio": 1.0, "completions/max_length": 850.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 616.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 383.0, "completions/min_terminated_length": 0.0, "entropy": 2.634142465889454, "epoch": 8.714285714285714, "frac_reward_zero_std": 0.5, "grad_norm": 0.021440010517835617, "kl": 0.029166525695472956, "learning_rate": 4.087938328785072e-05, "loss": -0.001, "num_tokens": 10324288.0, "reward": 0.1875, "reward_std": 0.2314550280570984, "rewards/tree_correctness_reward/mean": 0.1875, "rewards/tree_correctness_reward/std": 0.39339789748191833, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5672345757484436, "sampling/importance_sampling_ratio/min": 4.0271676663933825e-36, "sampling/sampling_logp_difference/max": 81.5, "sampling/sampling_logp_difference/mean": 6.413421630859375, "step": 122 }, { "clip_ratio/high_max": 0.0006038647261448205, "clip_ratio/high_mean": 0.0003019323630724102, "clip_ratio/low_mean": 0.0010567632816673722, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013586956447397824, "completions/clipped_ratio": 1.0, "completions/max_length": 853.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 737.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 621.0, "completions/min_terminated_length": 0.0, "entropy": 3.5065485388040543, "epoch": 8.785714285714286, "frac_reward_zero_std": 0.875, "grad_norm": 0.00665724603459239, "kl": 0.022800558945164084, "learning_rate": 4.072252004459611e-05, "loss": -0.0001, "num_tokens": 10403584.0, "reward": 0.015625, "reward_std": 0.04419417306780815, "rewards/tree_correctness_reward/mean": 0.015625, "rewards/tree_correctness_reward/std": 0.125, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.4917660355567932, "sampling/importance_sampling_ratio/min": 7.674422876742102e-34, "sampling/sampling_logp_difference/max": 76.25, "sampling/sampling_logp_difference/mean": 7.3193135261535645, "step": 123 }, { "clip_ratio/high_max": 0.001838235359173268, "clip_ratio/high_mean": 0.000919117679586634, "clip_ratio/low_mean": 0.0009191176723106764, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0018382353518973105, "completions/clipped_ratio": 1.0, "completions/max_length": 612.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 593.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 575.0, "completions/min_terminated_length": 0.0, "entropy": 2.6376986652612686, "epoch": 8.857142857142858, "frac_reward_zero_std": 0.875, "grad_norm": 0.008136834017932415, "kl": 0.03069052315549925, "learning_rate": 4.0564626044849973e-05, "loss": 0.0005, "num_tokens": 10473696.0, "reward": 0.078125, "reward_std": 0.0646936446428299, "rewards/tree_correctness_reward/mean": 0.078125, "rewards/tree_correctness_reward/std": 0.27048972249031067, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5669389367103577, "sampling/importance_sampling_ratio/min": 2.3405667503630854e-34, "sampling/sampling_logp_difference/max": 77.4375, "sampling/sampling_logp_difference/mean": 6.338266372680664, "step": 124 }, { "clip_ratio/high_max": 0.006936547695659101, "clip_ratio/high_mean": 0.003765052475500852, "clip_ratio/low_mean": 0.002197396257543005, "clip_ratio/low_min": 0.0008713735587662086, "clip_ratio/region_mean": 0.005962448758509709, "completions/clipped_ratio": 1.0, "completions/max_length": 824.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 602.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 381.0, "completions/min_terminated_length": 0.0, "entropy": 2.1483086198568344, "epoch": 8.928571428571429, "frac_reward_zero_std": 0.5, "grad_norm": 0.013117616064846516, "kl": 0.04030630737543106, "learning_rate": 4.0405711640023186e-05, "loss": 0.0014, "num_tokens": 10544384.0, "reward": 0.171875, "reward_std": 0.2472364604473114, "rewards/tree_correctness_reward/mean": 0.171875, "rewards/tree_correctness_reward/std": 0.38025420904159546, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6342247724533081, "sampling/importance_sampling_ratio/min": 2.0451645895225563e-35, "sampling/sampling_logp_difference/max": 79.875, "sampling/sampling_logp_difference/mean": 5.4921417236328125, "step": 125 }, { "clip_ratio/high_max": 0.0014615812979172915, "clip_ratio/high_mean": 0.0007307906489586458, "clip_ratio/low_mean": 0.0017051781578629743, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00243596880682162, "completions/clipped_ratio": 1.0, "completions/max_length": 754.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 601.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 449.0, "completions/min_terminated_length": 0.0, "entropy": 2.57478516548872, "epoch": 9.0, "frac_reward_zero_std": 0.875, "grad_norm": 0.006997971795499325, "kl": 0.029840038740076125, "learning_rate": 4.0245787248423614e-05, "loss": 0.0003, "num_tokens": 10615008.0, "reward": 0.046875, "reward_std": 0.0646936446428299, "rewards/tree_correctness_reward/mean": 0.046875, "rewards/tree_correctness_reward/std": 0.21304203569889069, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5640941262245178, "sampling/importance_sampling_ratio/min": 1.028373383751816e-35, "sampling/sampling_logp_difference/max": 80.5625, "sampling/sampling_logp_difference/mean": 6.425477981567383, "step": 126 }, { "clip_ratio/high_max": 0.0012840198760386556, "clip_ratio/high_mean": 0.0006420099380193278, "clip_ratio/low_mean": 0.001311377651290968, "clip_ratio/low_min": 0.0004450640990398824, "clip_ratio/region_mean": 0.0019533875965862535, "completions/clipped_ratio": 1.0, "completions/max_length": 1059.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 821.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 583.0, "completions/min_terminated_length": 0.0, "entropy": 3.572887271642685, "epoch": 9.071428571428571, "frac_reward_zero_std": 0.75, "grad_norm": 0.008025205694139004, "kl": 0.03837357531301677, "learning_rate": 4.008486335457312e-05, "loss": -0.002, "num_tokens": 10699680.0, "reward": 0.078125, "reward_std": 0.11100947856903076, "rewards/tree_correctness_reward/mean": 0.078125, "rewards/tree_correctness_reward/std": 0.27048972249031067, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.4937247633934021, "sampling/importance_sampling_ratio/min": 1.6955009379010048e-35, "sampling/sampling_logp_difference/max": 80.0625, "sampling/sampling_logp_difference/mean": 7.3149003982543945, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1106.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 925.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 744.0, "completions/min_terminated_length": 0.0, "entropy": 4.271390303969383, "epoch": 9.142857142857142, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004230639897286892, "kl": 0.025217054644599557, "learning_rate": 3.9922950508520127e-05, "loss": 0.0, "num_tokens": 10791008.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.4081822633743286, "sampling/importance_sampling_ratio/min": 4.906094994852858e-35, "sampling/sampling_logp_difference/max": 79.0, "sampling/sampling_logp_difference/mean": 8.474234580993652, "step": 128 }, { "clip_ratio/high_max": 0.000992693763691932, "clip_ratio/high_mean": 0.000496346881845966, "clip_ratio/low_mean": 0.0006353240132739302, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0011316708951198962, "completions/clipped_ratio": 1.0, "completions/max_length": 1312.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1049.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 787.0, "completions/min_terminated_length": 0.0, "entropy": 4.547907680273056, "epoch": 9.214285714285714, "frac_reward_zero_std": 0.875, "grad_norm": 0.0073422216810286045, "kl": 0.028595194802619517, "learning_rate": 3.976005932514807e-05, "loss": -0.0012, "num_tokens": 10890304.0, "reward": 0.0625, "reward_std": 0.06681530922651291, "rewards/tree_correctness_reward/mean": 0.0625, "rewards/tree_correctness_reward/std": 0.24397502839565277, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.4022846221923828, "sampling/importance_sampling_ratio/min": 1.1166220121487572e-33, "sampling/sampling_logp_difference/max": 75.875, "sampling/sampling_logp_difference/mean": 8.536362648010254, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 834.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 638.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 443.0, "completions/min_terminated_length": 0.0, "entropy": 2.6787070855498314, "epoch": 9.285714285714286, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003382624126970768, "kl": 0.03256513108499348, "learning_rate": 3.9596200483479385e-05, "loss": 0.0, "num_tokens": 10963296.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5652322769165039, "sampling/importance_sampling_ratio/min": 1.240455049987419e-35, "sampling/sampling_logp_difference/max": 80.375, "sampling/sampling_logp_difference/mean": 6.40300178527832, "step": 130 }, { "clip_ratio/high_max": 0.002012155076954514, "clip_ratio/high_mean": 0.001006077538477257, "clip_ratio/low_mean": 0.0009855453427007888, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001991622873902088, "completions/clipped_ratio": 1.0, "completions/max_length": 761.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 623.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 485.0, "completions/min_terminated_length": 0.0, "entropy": 3.0439442545175552, "epoch": 9.357142857142858, "frac_reward_zero_std": 0.75, "grad_norm": 0.009470962919294834, "kl": 0.02804613788612187, "learning_rate": 3.943138472597549e-05, "loss": -0.0012, "num_tokens": 11035296.0, "reward": 0.125, "reward_std": 0.1157275140285492, "rewards/tree_correctness_reward/mean": 0.125, "rewards/tree_correctness_reward/std": 0.3333333432674408, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.514110803604126, "sampling/importance_sampling_ratio/min": 1.0489692816875613e-33, "sampling/sampling_logp_difference/max": 75.9375, "sampling/sampling_logp_difference/mean": 7.078689098358154, "step": 131 }, { "clip_ratio/high_max": 0.004177922593953554, "clip_ratio/high_mean": 0.0021659317390003707, "clip_ratio/low_mean": 0.0016225866038439563, "clip_ratio/low_min": 7.200460822787136e-05, "clip_ratio/region_mean": 0.0037885182919126237, "completions/clipped_ratio": 1.0, "completions/max_length": 868.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 738.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 609.0, "completions/min_terminated_length": 0.0, "entropy": 3.6675124168395996, "epoch": 9.428571428571429, "frac_reward_zero_std": 0.625, "grad_norm": 0.03655102103948593, "kl": 0.032744683732744306, "learning_rate": 3.9265622857832454e-05, "loss": 0.0003, "num_tokens": 11114688.0, "reward": 0.296875, "reward_std": 0.1804211586713791, "rewards/tree_correctness_reward/mean": 0.296875, "rewards/tree_correctness_reward/std": 0.4604927599430084, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.4666104316711426, "sampling/importance_sampling_ratio/min": 2.1987591985594636e-34, "sampling/sampling_logp_difference/max": 77.5, "sampling/sampling_logp_difference/mean": 7.6631574630737305, "step": 132 }, { "clip_ratio/high_max": 0.0019091639551334083, "clip_ratio/high_mean": 0.0009545819775667042, "clip_ratio/low_mean": 0.0006280144589254633, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015825964364921674, "completions/clipped_ratio": 1.0, "completions/max_length": 1229.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 925.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 622.0, "completions/min_terminated_length": 0.0, "entropy": 3.7519738227128983, "epoch": 9.5, "frac_reward_zero_std": 0.875, "grad_norm": 0.007448675110936165, "kl": 0.02729408093728125, "learning_rate": 3.909892574627266e-05, "loss": 0.001, "num_tokens": 11206048.0, "reward": 0.0625, "reward_std": 0.06681530922651291, "rewards/tree_correctness_reward/mean": 0.0625, "rewards/tree_correctness_reward/std": 0.24397502839565277, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.4885122776031494, "sampling/importance_sampling_ratio/min": 2.823262612952882e-34, "sampling/sampling_logp_difference/max": 77.25, "sampling/sampling_logp_difference/mean": 7.348101615905762, "step": 133 }, { "clip_ratio/high_max": 0.002428212988888845, "clip_ratio/high_mean": 0.0012861111026722938, "clip_ratio/low_mean": 0.0006906485123181483, "clip_ratio/low_min": 0.000220416986849159, "clip_ratio/region_mean": 0.0019767596295423573, "completions/clipped_ratio": 1.0, "completions/max_length": 868.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 712.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 556.0, "completions/min_terminated_length": 0.0, "entropy": 3.4671861827373505, "epoch": 9.571428571428571, "frac_reward_zero_std": 0.75, "grad_norm": 2.565577268600464, "kl": 23.25146939320257, "learning_rate": 3.893130431983234e-05, "loss": 0.0051, "num_tokens": 11283744.0, "reward": 0.109375, "reward_std": 0.12255740165710449, "rewards/tree_correctness_reward/mean": 0.109375, "rewards/tree_correctness_reward/std": 0.3145764470100403, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.4970094561576843, "sampling/importance_sampling_ratio/min": 1.1769110281448947e-34, "sampling/sampling_logp_difference/max": 78.125, "sampling/sampling_logp_difference/mean": 7.24913215637207, "step": 134 }, { "clip_ratio/high_max": 0.0033487333566881716, "clip_ratio/high_mean": 0.0018560527241788805, "clip_ratio/low_mean": 0.0011622715828707442, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00301832432160154, "completions/clipped_ratio": 1.0, "completions/max_length": 525.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 434.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 344.0, "completions/min_terminated_length": 0.0, "entropy": 1.5809234604239464, "epoch": 9.642857142857142, "frac_reward_zero_std": 0.75, "grad_norm": 0.009322664700448513, "kl": 0.04065024247393012, "learning_rate": 3.876276956764509e-05, "loss": -0.0015, "num_tokens": 11343680.0, "reward": 0.125, "reward_std": 0.0883883461356163, "rewards/tree_correctness_reward/mean": 0.125, "rewards/tree_correctness_reward/std": 0.3333333432674408, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6574804782867432, "sampling/importance_sampling_ratio/min": 1.7123953114587469e-34, "sampling/sampling_logp_difference/max": 77.75, "sampling/sampling_logp_difference/mean": 5.250431060791016, "step": 135 }, { "clip_ratio/high_max": 0.0034330986090935767, "clip_ratio/high_mean": 0.0017165493045467883, "clip_ratio/low_mean": 0.0015845070156501606, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003301056265627267, "completions/clipped_ratio": 1.0, "completions/max_length": 508.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 431.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 355.0, "completions/min_terminated_length": 0.0, "entropy": 1.5319797992706299, "epoch": 9.714285714285714, "frac_reward_zero_std": 0.75, "grad_norm": 0.013598456047475338, "kl": 0.04008490592241287, "learning_rate": 3.859333253872147e-05, "loss": 0.0012, "num_tokens": 11403424.0, "reward": 0.0625, "reward_std": 0.10888782143592834, "rewards/tree_correctness_reward/mean": 0.0625, "rewards/tree_correctness_reward/std": 0.24397502839565277, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.667367696762085, "sampling/importance_sampling_ratio/min": 9.756933719611155e-35, "sampling/sampling_logp_difference/max": 78.3125, "sampling/sampling_logp_difference/mean": 5.09918212890625, "step": 136 }, { "clip_ratio/high_max": 0.003991768171545118, "clip_ratio/high_mean": 0.0020841609220951796, "clip_ratio/low_mean": 0.0033549795516591985, "clip_ratio/low_min": 0.0012073696561856195, "clip_ratio/region_mean": 0.005439140415546717, "completions/clipped_ratio": 1.0, "completions/max_length": 708.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 642.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 3.053026244044304, "epoch": 9.785714285714286, "frac_reward_zero_std": 0.5, "grad_norm": 0.015275397337973118, "kl": 0.03644077433273196, "learning_rate": 3.84230043412246e-05, "loss": 0.0006, "num_tokens": 11476640.0, "reward": 0.15625, "reward_std": 0.1767766922712326, "rewards/tree_correctness_reward/mean": 0.15625, "rewards/tree_correctness_reward/std": 0.36596253514289856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5145493745803833, "sampling/importance_sampling_ratio/min": 6.998167936122733e-37, "sampling/sampling_logp_difference/max": 83.25, "sampling/sampling_logp_difference/mean": 7.0188446044921875, "step": 137 }, { "clip_ratio/high_max": 0.0003629032289609313, "clip_ratio/high_mean": 0.00018145161448046565, "clip_ratio/low_mean": 0.000826612907985691, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010080645224661566, "completions/clipped_ratio": 1.0, "completions/max_length": 775.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 717.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 659.0, "completions/min_terminated_length": 0.0, "entropy": 3.463388279080391, "epoch": 9.857142857142858, "frac_reward_zero_std": 0.875, "grad_norm": 0.012470799498260021, "kl": 0.08722926233895123, "learning_rate": 3.8251796141741946e-05, "loss": -0.0007, "num_tokens": 11554656.0, "reward": 0.140625, "reward_std": 0.04419417306780815, "rewards/tree_correctness_reward/mean": 0.140625, "rewards/tree_correctness_reward/std": 0.3503824472427368, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.4998154044151306, "sampling/importance_sampling_ratio/min": 9.165791500701421e-35, "sampling/sampling_logp_difference/max": 78.375, "sampling/sampling_logp_difference/mean": 7.190260887145996, "step": 138 }, { "clip_ratio/high_max": 0.0030004068394191563, "clip_ratio/high_mean": 0.0015002034197095782, "clip_ratio/low_mean": 0.001067941397195682, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00256814481690526, "completions/clipped_ratio": 1.0, "completions/max_length": 1229.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 973.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 718.0, "completions/min_terminated_length": 0.0, "entropy": 3.756473571062088, "epoch": 9.928571428571429, "frac_reward_zero_std": 0.875, "grad_norm": 0.007258562836796045, "kl": 0.04099463438615203, "learning_rate": 3.807971916455325e-05, "loss": 0.0006, "num_tokens": 11649088.0, "reward": 0.09375, "reward_std": 0.0578637570142746, "rewards/tree_correctness_reward/mean": 0.09375, "rewards/tree_correctness_reward/std": 0.29378482699394226, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.4957939684391022, "sampling/importance_sampling_ratio/min": 2.5163465940007e-33, "sampling/sampling_logp_difference/max": 75.0625, "sampling/sampling_logp_difference/mean": 7.2105512619018555, "step": 139 }, { "clip_ratio/high_max": 0.0017143417135230266, "clip_ratio/high_mean": 0.0008571708567615133, "clip_ratio/low_mean": 0.00036735892172146123, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0012245297693880275, "completions/clipped_ratio": 1.0, "completions/max_length": 638.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 635.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 633.0, "completions/min_terminated_length": 0.0, "entropy": 2.723957121372223, "epoch": 10.0, "frac_reward_zero_std": 0.875, "grad_norm": 0.008778627961874008, "kl": 0.05729053367394954, "learning_rate": 3.7906784690894645e-05, "loss": -0.0006, "num_tokens": 11721888.0, "reward": 0.078125, "reward_std": 0.0646936446428299, "rewards/tree_correctness_reward/mean": 0.078125, "rewards/tree_correctness_reward/std": 0.27048972249031067, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5554947257041931, "sampling/importance_sampling_ratio/min": 2.9756967944000553e-35, "sampling/sampling_logp_difference/max": 79.5, "sampling/sampling_logp_difference/mean": 6.454415321350098, "step": 140 }, { "clip_ratio/high_max": 0.00028483071946538985, "clip_ratio/high_mean": 0.00014241535973269492, "clip_ratio/low_mean": 0.0007120768386812415, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0008544921838620212, "completions/clipped_ratio": 1.0, "completions/max_length": 768.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 686.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 604.0, "completions/min_terminated_length": 0.0, "entropy": 2.853634610772133, "epoch": 10.071428571428571, "frac_reward_zero_std": 0.875, "grad_norm": 0.032208096235990524, "kl": 0.12684023601468652, "learning_rate": 3.773300405821908e-05, "loss": 0.0014, "num_tokens": 11797920.0, "reward": 0.015625, "reward_std": 0.04419417306780815, "rewards/tree_correctness_reward/mean": 0.015625, "rewards/tree_correctness_reward/std": 0.125, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5643903017044067, "sampling/importance_sampling_ratio/min": 1.2779062104650084e-32, "sampling/sampling_logp_difference/max": 73.4375, "sampling/sampling_logp_difference/mean": 6.291780471801758, "step": 141 }, { "clip_ratio/high_max": 0.00131158758449601, "clip_ratio/high_mean": 0.000655793792248005, "clip_ratio/low_mean": 0.00022810218251834158, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0008838959747663466, "completions/clipped_ratio": 1.0, "completions/max_length": 1056.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 802.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 548.0, "completions/min_terminated_length": 0.0, "entropy": 3.352193146944046, "epoch": 10.142857142857142, "frac_reward_zero_std": 0.875, "grad_norm": 0.014726459048688412, "kl": 0.030956801841966808, "learning_rate": 3.755838865945305e-05, "loss": -0.0006, "num_tokens": 11881376.0, "reward": 0.046875, "reward_std": 0.0646936446428299, "rewards/tree_correctness_reward/mean": 0.046875, "rewards/tree_correctness_reward/std": 0.21304203569889069, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5132886171340942, "sampling/importance_sampling_ratio/min": 2.1069109648181553e-32, "sampling/sampling_logp_difference/max": 72.9375, "sampling/sampling_logp_difference/mean": 7.0329999923706055, "step": 142 }, { "clip_ratio/high_max": 0.0018664676426851656, "clip_ratio/high_mean": 0.0009332338213425828, "clip_ratio/low_mean": 0.00031996588222682476, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0012531997035694076, "completions/clipped_ratio": 1.0, "completions/max_length": 880.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 733.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 586.0, "completions/min_terminated_length": 0.0, "entropy": 3.175238385796547, "epoch": 10.214285714285714, "frac_reward_zero_std": 0.875, "grad_norm": 0.008291695266962051, "kl": 0.055907530477270484, "learning_rate": 3.7382949942249694e-05, "loss": -0.0007, "num_tokens": 11960416.0, "reward": 0.09375, "reward_std": 0.0578637570142746, "rewards/tree_correctness_reward/mean": 0.09375, "rewards/tree_correctness_reward/std": 0.29378482699394226, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5111576914787292, "sampling/importance_sampling_ratio/min": 6.840139179297504e-33, "sampling/sampling_logp_difference/max": 74.0625, "sampling/sampling_logp_difference/mean": 7.056779861450195, "step": 143 }, { "clip_ratio/high_max": 0.0031142611987888813, "clip_ratio/high_mean": 0.0015571305993944407, "clip_ratio/low_mean": 0.0009396477726113517, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024967783720057923, "completions/clipped_ratio": 1.0, "completions/max_length": 582.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 572.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 563.0, "completions/min_terminated_length": 0.0, "entropy": 2.302978679537773, "epoch": 10.285714285714286, "frac_reward_zero_std": 0.75, "grad_norm": 0.1271924376487732, "kl": 1.623211397905834, "learning_rate": 3.7206699408238264e-05, "loss": -0.0017, "num_tokens": 12029184.0, "reward": 0.109375, "reward_std": 0.12255740165710449, "rewards/tree_correctness_reward/mean": 0.109375, "rewards/tree_correctness_reward/std": 0.3145764470100403, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.614019513130188, "sampling/importance_sampling_ratio/min": 6.705835965062432e-35, "sampling/sampling_logp_difference/max": 78.6875, "sampling/sampling_logp_difference/mean": 5.662405490875244, "step": 144 }, { "clip_ratio/high_max": 0.0031147338122536894, "clip_ratio/high_mean": 0.0015573669061268447, "clip_ratio/low_mean": 0.0020559244621836115, "clip_ratio/low_min": 0.0005911150074098259, "clip_ratio/region_mean": 0.003613291317378753, "completions/clipped_ratio": 1.0, "completions/max_length": 730.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 696.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 663.0, "completions/min_terminated_length": 0.0, "entropy": 3.4328062683343887, "epoch": 10.357142857142858, "frac_reward_zero_std": 0.625, "grad_norm": 0.0840243473649025, "kl": 0.6134358146227896, "learning_rate": 3.702964861227013e-05, "loss": 0.0019, "num_tokens": 12105888.0, "reward": 0.15625, "reward_std": 0.19149437546730042, "rewards/tree_correctness_reward/mean": 0.15625, "rewards/tree_correctness_reward/std": 0.36596253514289856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.48596256971359253, "sampling/importance_sampling_ratio/min": 3.0053473766614753e-34, "sampling/sampling_logp_difference/max": 77.1875, "sampling/sampling_logp_difference/mean": 7.395161151885986, "step": 145 }, { "clip_ratio/high_max": 0.0045797414786648005, "clip_ratio/high_mean": 0.0022898707393324003, "clip_ratio/low_mean": 0.000996767237666063, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0032866379333427176, "completions/clipped_ratio": 1.0, "completions/max_length": 634.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 607.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 580.0, "completions/min_terminated_length": 0.0, "entropy": 2.455826297402382, "epoch": 10.428571428571429, "frac_reward_zero_std": 0.75, "grad_norm": 0.01650192402303219, "kl": 0.19298574014101177, "learning_rate": 3.685180916166121e-05, "loss": -0.0011, "num_tokens": 12176864.0, "reward": 0.109375, "reward_std": 0.10205793380737305, "rewards/tree_correctness_reward/mean": 0.109375, "rewards/tree_correctness_reward/std": 0.3145764470100403, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5811736583709717, "sampling/importance_sampling_ratio/min": 3.783174026200654e-36, "sampling/sampling_logp_difference/max": 81.5625, "sampling/sampling_logp_difference/mean": 6.13636589050293, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 734.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 649.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 565.0, "completions/min_terminated_length": 0.0, "entropy": 2.926171362400055, "epoch": 10.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009155573206953704, "kl": 0.045344250509515405, "learning_rate": 3.6673192715431015e-05, "loss": 0.0, "num_tokens": 12250560.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5480413436889648, "sampling/importance_sampling_ratio/min": 1.9792599190321352e-32, "sampling/sampling_logp_difference/max": 73.0, "sampling/sampling_logp_difference/mean": 6.544835090637207, "step": 147 }, { "clip_ratio/high_max": 0.0006569069228135049, "clip_ratio/high_mean": 0.00032845346140675247, "clip_ratio/low_mean": 0.0013138138310750946, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001642267292481847, "completions/clipped_ratio": 1.0, "completions/max_length": 994.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 830.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 666.0, "completions/min_terminated_length": 0.0, "entropy": 3.5975677520036697, "epoch": 10.571428571428571, "frac_reward_zero_std": 0.875, "grad_norm": 0.013835657387971878, "kl": 0.06447094457689673, "learning_rate": 3.649381098353834e-05, "loss": -0.0005, "num_tokens": 12335808.0, "reward": 0.015625, "reward_std": 0.04419417306780815, "rewards/tree_correctness_reward/mean": 0.015625, "rewards/tree_correctness_reward/std": 0.125, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.49293720722198486, "sampling/importance_sampling_ratio/min": 8.696261289635204e-34, "sampling/sampling_logp_difference/max": 76.125, "sampling/sampling_logp_difference/mean": 7.257169246673584, "step": 148 }, { "clip_ratio/high_max": 0.004340277868323028, "clip_ratio/high_mean": 0.002170138934161514, "clip_ratio/low_mean": 0.0006269290170166641, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002797067951178178, "completions/clipped_ratio": 1.0, "completions/max_length": 648.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 517.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 386.0, "completions/min_terminated_length": 0.0, "entropy": 1.9844420813024044, "epoch": 10.642857142857142, "frac_reward_zero_std": 0.75, "grad_norm": 0.011263075284659863, "kl": 0.0355567648075521, "learning_rate": 3.631367572611348e-05, "loss": 0.0011, "num_tokens": 12401024.0, "reward": 0.28125, "reward_std": 0.1246790662407875, "rewards/tree_correctness_reward/mean": 0.28125, "rewards/tree_correctness_reward/std": 0.4531635046005249, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6336819529533386, "sampling/importance_sampling_ratio/min": 6.036401529078488e-33, "sampling/sampling_logp_difference/max": 74.1875, "sampling/sampling_logp_difference/mean": 5.4877214431762695, "step": 149 }, { "clip_ratio/high_max": 0.0025522046780679375, "clip_ratio/high_mean": 0.001409792181220837, "clip_ratio/low_mean": 0.0015248228337441105, "clip_ratio/low_min": 0.00010026738164015114, "clip_ratio/region_mean": 0.002934615040430799, "completions/clipped_ratio": 1.0, "completions/max_length": 935.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 875.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 815.0, "completions/min_terminated_length": 0.0, "entropy": 3.9735279083251953, "epoch": 10.714285714285714, "frac_reward_zero_std": 0.75, "grad_norm": 0.01205904595553875, "kl": 0.03632840421050787, "learning_rate": 3.613279875268731e-05, "loss": 0.0017, "num_tokens": 12489152.0, "reward": 0.09375, "reward_std": 0.1293872892856598, "rewards/tree_correctness_reward/mean": 0.09375, "rewards/tree_correctness_reward/std": 0.29378482699394226, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.462913453578949, "sampling/importance_sampling_ratio/min": 1.8778695052055614e-31, "sampling/sampling_logp_difference/max": 70.75, "sampling/sampling_logp_difference/mean": 7.654515743255615, "step": 150 }, { "epoch": 10.714285714285714, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 455.16, "eval_completions/max_terminated_length": 0.0, "eval_completions/mean_length": 415.92, "eval_completions/mean_terminated_length": 0.0, "eval_completions/min_length": 376.68, "eval_completions/min_terminated_length": 0.0, "eval_entropy": 0.7118269783258439, "eval_frac_reward_zero_std": 0.88, "eval_kl": 0.0479753103107214, "eval_loss": 0.0007543526007793844, "eval_num_tokens": 12489152.0, "eval_reward": 0.055, "eval_reward_std": 0.042426406145095824, "eval_rewards/tree_correctness_reward/mean": 0.055, "eval_rewards/tree_correctness_reward/std": 0.042426406145095824, "eval_runtime": 137.5544, "eval_samples_per_second": 0.182, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 0.8368827080726624, "eval_sampling/importance_sampling_ratio/min": 3.872664992734431e-30, "eval_sampling/sampling_logp_difference/max": 70.60125, "eval_sampling/sampling_logp_difference/mean": 2.6490023374557494, "eval_steps_per_second": 0.029, "step": 150 }, { "clip_ratio/high_max": 0.0005317320465110242, "clip_ratio/high_mean": 0.0002658660232555121, "clip_ratio/low_mean": 0.001272063069336582, "clip_ratio/low_min": 3.109452882199548e-05, "clip_ratio/region_mean": 0.0015379290780401789, "completions/clipped_ratio": 1.0, "completions/max_length": 1005.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 801.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 597.0, "completions/min_terminated_length": 0.0, "entropy": 3.150477647781372, "epoch": 10.785714285714286, "frac_reward_zero_std": 0.75, "grad_norm": 0.019510814920067787, "kl": 0.0821337237721309, "learning_rate": 3.595119192141706e-05, "loss": -0.0049, "num_tokens": 12572544.0, "reward": 0.03125, "reward_std": 0.0883883461356163, "rewards/tree_correctness_reward/mean": 0.03125, "rewards/tree_correctness_reward/std": 0.17536810040473938, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5528907179832458, "sampling/importance_sampling_ratio/min": 3.473709233586696e-32, "sampling/sampling_logp_difference/max": 72.4375, "sampling/sampling_logp_difference/mean": 6.467709541320801, "step": 151 }, { "clip_ratio/high_max": 0.0025394574186066166, "clip_ratio/high_mean": 0.0013975928668514825, "clip_ratio/low_mean": 0.0015095116359589156, "clip_ratio/low_min": 0.00039359596848953515, "clip_ratio/region_mean": 0.002907104488258483, "completions/clipped_ratio": 1.0, "completions/max_length": 680.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 645.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 611.0, "completions/min_terminated_length": 0.0, "entropy": 3.2082372158765793, "epoch": 10.857142857142858, "frac_reward_zero_std": 0.625, "grad_norm": 0.01371091976761818, "kl": 0.03467380674555898, "learning_rate": 3.576886713830887e-05, "loss": -0.0004, "num_tokens": 12645984.0, "reward": 0.125, "reward_std": 0.1825428307056427, "rewards/tree_correctness_reward/mean": 0.125, "rewards/tree_correctness_reward/std": 0.3333333432674408, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.4990752637386322, "sampling/importance_sampling_ratio/min": 1.9792599190321352e-32, "sampling/sampling_logp_difference/max": 73.0, "sampling/sampling_logp_difference/mean": 7.227528095245361, "step": 152 }, { "clip_ratio/high_max": 0.005476693055243231, "clip_ratio/high_mean": 0.0027383465276216157, "clip_ratio/low_mean": 0.0015233524263749132, "clip_ratio/low_min": 0.0004300562941352837, "clip_ratio/region_mean": 0.0042616989521775395, "completions/clipped_ratio": 1.0, "completions/max_length": 680.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 542.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 405.0, "completions/min_terminated_length": 0.0, "entropy": 1.82301726937294, "epoch": 10.928571428571429, "frac_reward_zero_std": 0.625, "grad_norm": 0.016930660232901573, "kl": 0.036135215195827186, "learning_rate": 3.5585836356437264e-05, "loss": -0.0002, "num_tokens": 12712832.0, "reward": 0.15625, "reward_std": 0.17570312321186066, "rewards/tree_correctness_reward/mean": 0.15625, "rewards/tree_correctness_reward/std": 0.36596253514289856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6503637433052063, "sampling/importance_sampling_ratio/min": 1.8593425975169688e-32, "sampling/sampling_logp_difference/max": 73.0625, "sampling/sampling_logp_difference/mean": 5.2594990730285645, "step": 153 }, { "clip_ratio/high_max": 0.005593192632659338, "clip_ratio/high_mean": 0.0033082365334848873, "clip_ratio/low_mean": 0.0005127133408677764, "clip_ratio/low_min": 7.458234176738188e-05, "clip_ratio/region_mean": 0.003820949867076706, "completions/clipped_ratio": 1.0, "completions/max_length": 838.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 766.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 695.0, "completions/min_terminated_length": 0.0, "entropy": 3.674585059285164, "epoch": 11.0, "frac_reward_zero_std": 0.625, "grad_norm": 0.012330197729170322, "kl": 0.032320293597877026, "learning_rate": 3.540211157516149e-05, "loss": -0.0001, "num_tokens": 12794016.0, "reward": 0.28125, "reward_std": 0.1735912710428238, "rewards/tree_correctness_reward/mean": 0.28125, "rewards/tree_correctness_reward/std": 0.4531635046005249, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.4553881883621216, "sampling/importance_sampling_ratio/min": 1.1166220121487572e-33, "sampling/sampling_logp_difference/max": 75.875, "sampling/sampling_logp_difference/mean": 7.826248645782471, "step": 154 }, { "clip_ratio/high_max": 0.00025790921063162386, "clip_ratio/high_mean": 0.00012895460531581193, "clip_ratio/low_mean": 0.0006017881732987007, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0007307427786145126, "completions/clipped_ratio": 1.0, "completions/max_length": 727.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 634.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 542.0, "completions/min_terminated_length": 0.0, "entropy": 2.3082604855298996, "epoch": 11.071428571428571, "frac_reward_zero_std": 0.875, "grad_norm": 0.028648540377616882, "kl": 0.03582047135569155, "learning_rate": 3.521770483933891e-05, "loss": 0.0002, "num_tokens": 12866752.0, "reward": 0.03125, "reward_std": 0.0578637570142746, "rewards/tree_correctness_reward/mean": 0.03125, "rewards/tree_correctness_reward/std": 0.17536810040473938, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.627093493938446, "sampling/importance_sampling_ratio/min": 7.674422876742102e-34, "sampling/sampling_logp_difference/max": 76.25, "sampling/sampling_logp_difference/mean": 5.479762077331543, "step": 155 }, { "clip_ratio/high_max": 0.005449234886327758, "clip_ratio/high_mean": 0.0028036313742632046, "clip_ratio/low_mean": 0.0008507617931172717, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003654393178294413, "completions/clipped_ratio": 1.0, "completions/max_length": 791.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 598.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 406.0, "completions/min_terminated_length": 0.0, "entropy": 2.654262326657772, "epoch": 11.142857142857142, "frac_reward_zero_std": 0.75, "grad_norm": 0.012344290502369404, "kl": 0.038257374544627964, "learning_rate": 3.503262823853527e-05, "loss": -0.0004, "num_tokens": 12937184.0, "reward": 0.171875, "reward_std": 0.12255740165710449, "rewards/tree_correctness_reward/mean": 0.171875, "rewards/tree_correctness_reward/std": 0.38025420904159546, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5564864873886108, "sampling/importance_sampling_ratio/min": 4.148754129619672e-33, "sampling/sampling_logp_difference/max": 74.5625, "sampling/sampling_logp_difference/mean": 6.550540924072266, "step": 156 }, { "clip_ratio/high_max": 0.000292056065518409, "clip_ratio/high_mean": 0.0001460280327592045, "clip_ratio/low_mean": 0.001262099453015253, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0014081274857744575, "completions/clipped_ratio": 1.0, "completions/max_length": 1498.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1047.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 597.0, "completions/min_terminated_length": 0.0, "entropy": 3.8721549063920975, "epoch": 11.214285714285714, "frac_reward_zero_std": 0.875, "grad_norm": 0.006035434547811747, "kl": 0.03278359794057906, "learning_rate": 3.484689390623218e-05, "loss": -0.0002, "num_tokens": 13036352.0, "reward": 0.015625, "reward_std": 0.04419417306780815, "rewards/tree_correctness_reward/mean": 0.015625, "rewards/tree_correctness_reward/std": 0.125, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.46663910150527954, "sampling/importance_sampling_ratio/min": 1.3469032840202626e-33, "sampling/sampling_logp_difference/max": 75.6875, "sampling/sampling_logp_difference/mean": 7.798165321350098, "step": 157 }, { "clip_ratio/high_max": 0.0025557620720064733, "clip_ratio/high_mean": 0.0012778810360032367, "clip_ratio/low_mean": 0.0011617100244620815, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024395910750172334, "completions/clipped_ratio": 1.0, "completions/max_length": 650.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 594.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 538.0, "completions/min_terminated_length": 0.0, "entropy": 2.162627197802067, "epoch": 11.285714285714286, "frac_reward_zero_std": 0.75, "grad_norm": 0.013386757113039494, "kl": 0.0422228038078174, "learning_rate": 3.466051401903162e-05, "loss": 0.0016, "num_tokens": 13106496.0, "reward": 0.15625, "reward_std": 0.1246790662407875, "rewards/tree_correctness_reward/mean": 0.15625, "rewards/tree_correctness_reward/std": 0.36596253514289856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6274937391281128, "sampling/importance_sampling_ratio/min": 2.363888833789888e-33, "sampling/sampling_logp_difference/max": 75.125, "sampling/sampling_logp_difference/mean": 5.475372314453125, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1179.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 912.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 645.0, "completions/min_terminated_length": 0.0, "entropy": 3.427534803748131, "epoch": 11.357142857142858, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004483109514694661, "kl": 0.028303045546635985, "learning_rate": 3.447350079585767e-05, "loss": 0.0, "num_tokens": 13196992.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5217200517654419, "sampling/importance_sampling_ratio/min": 2.8513940753957304e-33, "sampling/sampling_logp_difference/max": 74.9375, "sampling/sampling_logp_difference/mean": 6.885860443115234, "step": 159 }, { "clip_ratio/high_max": 0.00209806757629849, "clip_ratio/high_mean": 0.0012610419216798618, "clip_ratio/low_mean": 0.0014334001825773157, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026944421260850504, "completions/clipped_ratio": 1.0, "completions/max_length": 737.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 707.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 678.0, "completions/min_terminated_length": 0.0, "entropy": 3.067272536456585, "epoch": 11.428571428571429, "frac_reward_zero_std": 0.75, "grad_norm": 0.015560557134449482, "kl": 0.028108153608627617, "learning_rate": 3.4285866497155414e-05, "loss": -0.0002, "num_tokens": 13274400.0, "reward": 0.0625, "reward_std": 0.1157275140285492, "rewards/tree_correctness_reward/mean": 0.0625, "rewards/tree_correctness_reward/std": 0.24397502839565277, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5371795296669006, "sampling/importance_sampling_ratio/min": 1.729458002261791e-33, "sampling/sampling_logp_difference/max": 75.4375, "sampling/sampling_logp_difference/mean": 6.713515758514404, "step": 160 }, { "clip_ratio/high_max": 0.008051194425206631, "clip_ratio/high_mean": 0.004673463001381606, "clip_ratio/low_mean": 0.0016786002815933898, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006352063290250953, "completions/clipped_ratio": 1.0, "completions/max_length": 581.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 495.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 410.0, "completions/min_terminated_length": 0.0, "entropy": 1.6211810484528542, "epoch": 11.5, "frac_reward_zero_std": 0.625, "grad_norm": 0.019389666616916656, "kl": 0.03819849621504545, "learning_rate": 3.409762342408719e-05, "loss": 0.004, "num_tokens": 13338240.0, "reward": 0.21875, "reward_std": 0.1462521106004715, "rewards/tree_correctness_reward/mean": 0.21875, "rewards/tree_correctness_reward/std": 0.4166666865348816, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6521129608154297, "sampling/importance_sampling_ratio/min": 1.419625586416994e-34, "sampling/sampling_logp_difference/max": 77.9375, "sampling/sampling_logp_difference/mean": 5.278217315673828, "step": 161 }, { "clip_ratio/high_max": 0.005565162355196662, "clip_ratio/high_mean": 0.002782581177598331, "clip_ratio/low_mean": 0.004731417455332121, "clip_ratio/low_min": 0.001742973196087405, "clip_ratio/region_mean": 0.007513998694776092, "completions/clipped_ratio": 1.0, "completions/max_length": 759.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 586.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 414.0, "completions/min_terminated_length": 0.0, "entropy": 1.8836063481867313, "epoch": 11.571428571428571, "frac_reward_zero_std": 0.5, "grad_norm": 0.023613467812538147, "kl": 0.03364666411653161, "learning_rate": 3.3908783917726124e-05, "loss": -0.0012, "num_tokens": 13407904.0, "reward": 0.109375, "reward_std": 0.19939783215522766, "rewards/tree_correctness_reward/mean": 0.109375, "rewards/tree_correctness_reward/std": 0.3145764470100403, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6601095199584961, "sampling/importance_sampling_ratio/min": 1.2652986294762563e-33, "sampling/sampling_logp_difference/max": 75.75, "sampling/sampling_logp_difference/mean": 5.126976013183594, "step": 162 }, { "clip_ratio/high_max": 0.003259408586018253, "clip_ratio/high_mean": 0.0016801889469206799, "clip_ratio/low_mean": 0.0016398555126215797, "clip_ratio/low_min": 0.00015145396173465997, "clip_ratio/region_mean": 0.0033200444268004503, "completions/clipped_ratio": 1.0, "completions/max_length": 930.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 774.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 619.0, "completions/min_terminated_length": 0.0, "entropy": 3.3991441428661346, "epoch": 11.642857142857142, "frac_reward_zero_std": 0.625, "grad_norm": 0.011383580975234509, "kl": 0.027087062248028815, "learning_rate": 3.3719360358247054e-05, "loss": 0.003, "num_tokens": 13489600.0, "reward": 0.203125, "reward_std": 0.15992169082164764, "rewards/tree_correctness_reward/mean": 0.203125, "rewards/tree_correctness_reward/std": 0.40550529956817627, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.4930904507637024, "sampling/importance_sampling_ratio/min": 9.952343492388037e-33, "sampling/sampling_logp_difference/max": 73.6875, "sampling/sampling_logp_difference/mean": 7.293552398681641, "step": 163 }, { "clip_ratio/high_max": 0.0024323181132785976, "clip_ratio/high_mean": 0.0012161590566392988, "clip_ratio/low_mean": 0.0009517766302451491, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002167935686884448, "completions/clipped_ratio": 1.0, "completions/max_length": 704.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 647.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 591.0, "completions/min_terminated_length": 0.0, "entropy": 2.716811217367649, "epoch": 11.714285714285714, "frac_reward_zero_std": 0.75, "grad_norm": 0.013003077358007431, "kl": 0.027676914469338953, "learning_rate": 3.35293651641149e-05, "loss": 0.0015, "num_tokens": 13563168.0, "reward": 0.25, "reward_std": 0.1293872892856598, "rewards/tree_correctness_reward/mean": 0.25, "rewards/tree_correctness_reward/std": 0.4364357888698578, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5543859004974365, "sampling/importance_sampling_ratio/min": 9.257121786836173e-34, "sampling/sampling_logp_difference/max": 76.0625, "sampling/sampling_logp_difference/mean": 6.473443984985352, "step": 164 }, { "clip_ratio/high_max": 0.00307377046556212, "clip_ratio/high_mean": 0.00153688523278106, "clip_ratio/low_mean": 0.0021223652802291326, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0036592505202861503, "completions/clipped_ratio": 1.0, "completions/max_length": 653.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 540.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 427.0, "completions/min_terminated_length": 0.0, "entropy": 1.90948536247015, "epoch": 11.785714285714286, "frac_reward_zero_std": 0.75, "grad_norm": 0.010771824046969414, "kl": 0.031589307240210474, "learning_rate": 3.333881079127052e-05, "loss": 0.0011, "num_tokens": 13629856.0, "reward": 0.0625, "reward_std": 0.10888782143592834, "rewards/tree_correctness_reward/mean": 0.0625, "rewards/tree_correctness_reward/std": 0.24397502839565277, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6521148681640625, "sampling/importance_sampling_ratio/min": 9.854154449263851e-34, "sampling/sampling_logp_difference/max": 76.0, "sampling/sampling_logp_difference/mean": 5.22401237487793, "step": 165 }, { "clip_ratio/high_max": 0.0024792435579001904, "clip_ratio/high_mean": 0.0012396217789500952, "clip_ratio/low_mean": 0.00020179888815619051, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0014414206671062857, "completions/clipped_ratio": 1.0, "completions/max_length": 1084.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 947.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 811.0, "completions/min_terminated_length": 0.0, "entropy": 4.364269733428955, "epoch": 11.857142857142858, "frac_reward_zero_std": 0.875, "grad_norm": 0.010578085668385029, "kl": 0.040434715221636, "learning_rate": 3.3147709732314084e-05, "loss": -0.0008, "num_tokens": 13722624.0, "reward": 0.09375, "reward_std": 0.0578637570142746, "rewards/tree_correctness_reward/mean": 0.09375, "rewards/tree_correctness_reward/std": 0.29378482699394226, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.4180676341056824, "sampling/importance_sampling_ratio/min": 1.8593425975169688e-32, "sampling/sampling_logp_difference/max": 73.0625, "sampling/sampling_logp_difference/mean": 8.292906761169434, "step": 166 }, { "clip_ratio/high_max": 0.0007780083105899394, "clip_ratio/high_mean": 0.0003890041552949697, "clip_ratio/low_mean": 0.0016208506785915233, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002009854833886493, "completions/clipped_ratio": 1.0, "completions/max_length": 898.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 690.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 482.0, "completions/min_terminated_length": 0.0, "entropy": 2.727604664862156, "epoch": 11.928571428571429, "frac_reward_zero_std": 0.875, "grad_norm": 0.005017708521336317, "kl": 0.028064887854270637, "learning_rate": 3.29560745156861e-05, "loss": 0.0005, "num_tokens": 13798912.0, "reward": 0.015625, "reward_std": 0.04419417306780815, "rewards/tree_correctness_reward/mean": 0.015625, "rewards/tree_correctness_reward/std": 0.125, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5755779147148132, "sampling/importance_sampling_ratio/min": 5.976846944504e-34, "sampling/sampling_logp_difference/max": 76.5, "sampling/sampling_logp_difference/mean": 6.219026565551758, "step": 167 }, { "clip_ratio/high_max": 0.0017361111094942316, "clip_ratio/high_mean": 0.0008680555547471158, "clip_ratio/low_mean": 0.0009864267558441497, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0018544823069532868, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1420.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 792.0, "completions/min_terminated_length": 0.0, "entropy": 4.716835483908653, "epoch": 12.0, "frac_reward_zero_std": 0.75, "grad_norm": 0.008977319113910198, "kl": 0.026214802404865623, "learning_rate": 3.276391770484606e-05, "loss": 0.0001, "num_tokens": 13921920.0, "reward": 0.109375, "reward_std": 0.1315089464187622, "rewards/tree_correctness_reward/mean": 0.109375, "rewards/tree_correctness_reward/std": 0.3145764470100403, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.3726658821105957, "sampling/importance_sampling_ratio/min": 7.067899417566791e-36, "sampling/sampling_logp_difference/max": 80.9375, "sampling/sampling_logp_difference/mean": 9.279924392700195, "step": 168 }, { "clip_ratio/high_max": 0.0014960106054786593, "clip_ratio/high_mean": 0.0007480053027393296, "clip_ratio/low_mean": 0.0011220079722988885, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0018700132750382181, "completions/clipped_ratio": 1.0, "completions/max_length": 989.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 870.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 752.0, "completions/min_terminated_length": 0.0, "entropy": 3.8945877701044083, "epoch": 12.071428571428571, "frac_reward_zero_std": 0.75, "grad_norm": 0.011262379586696625, "kl": 0.02788143919315189, "learning_rate": 3.2571251897448765e-05, "loss": -0.0006, "num_tokens": 14009760.0, "reward": 0.046875, "reward_std": 0.10205793380737305, "rewards/tree_correctness_reward/mean": 0.046875, "rewards/tree_correctness_reward/std": 0.21304203569889069, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.47217994928359985, "sampling/importance_sampling_ratio/min": 9.660673761439827e-36, "sampling/sampling_logp_difference/max": 80.625, "sampling/sampling_logp_difference/mean": 7.561606407165527, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 987.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 872.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 758.0, "completions/min_terminated_length": 0.0, "entropy": 3.3098011761903763, "epoch": 12.142857142857142, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007309013744816184, "kl": 0.03071731887757778, "learning_rate": 3.2378089724518465e-05, "loss": 0.0, "num_tokens": 14097728.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5338716506958008, "sampling/importance_sampling_ratio/min": 6.840139179297504e-33, "sampling/sampling_logp_difference/max": 74.0625, "sampling/sampling_logp_difference/mean": 6.704576015472412, "step": 170 }, { "clip_ratio/high_max": 0.0006708966102451086, "clip_ratio/high_mean": 0.0003354483051225543, "clip_ratio/low_mean": 0.0025515276502119377, "clip_ratio/low_min": 0.0004346986097516492, "clip_ratio/region_mean": 0.002886975955334492, "completions/clipped_ratio": 1.0, "completions/max_length": 647.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 528.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 410.0, "completions/min_terminated_length": 0.0, "entropy": 1.6784919872879982, "epoch": 12.214285714285714, "frac_reward_zero_std": 0.75, "grad_norm": 0.012306527234613895, "kl": 0.03451740217860788, "learning_rate": 3.218444384962071e-05, "loss": 0.0007, "num_tokens": 14163680.0, "reward": 0.03125, "reward_std": 0.0883883461356163, "rewards/tree_correctness_reward/mean": 0.03125, "rewards/tree_correctness_reward/std": 0.17536810040473938, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.668670117855072, "sampling/importance_sampling_ratio/min": 1.7123953114587469e-34, "sampling/sampling_logp_difference/max": 77.75, "sampling/sampling_logp_difference/mean": 5.008136749267578, "step": 171 }, { "clip_ratio/high_max": 0.004599731357302517, "clip_ratio/high_mean": 0.0026684582117013633, "clip_ratio/low_mean": 0.0006723517763020936, "clip_ratio/low_min": 4.3342581193428487e-05, "clip_ratio/region_mean": 0.003340809984365478, "completions/clipped_ratio": 1.0, "completions/max_length": 721.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 720.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 720.0, "completions/min_terminated_length": 0.0, "entropy": 3.3961315602064133, "epoch": 12.285714285714286, "frac_reward_zero_std": 0.625, "grad_norm": 0.012034837156534195, "kl": 0.02250248589552939, "learning_rate": 3.199032696803222e-05, "loss": 0.0004, "num_tokens": 14241920.0, "reward": 0.25, "reward_std": 0.17570312321186066, "rewards/tree_correctness_reward/mean": 0.25, "rewards/tree_correctness_reward/std": 0.4364357888698578, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.4888167977333069, "sampling/importance_sampling_ratio/min": 2.823262612952882e-34, "sampling/sampling_logp_difference/max": 77.25, "sampling/sampling_logp_difference/mean": 7.343169212341309, "step": 172 }, { "clip_ratio/high_max": 0.0015513654943788424, "clip_ratio/high_mean": 0.0007756827471894212, "clip_ratio/low_mean": 0.0022829401495982893, "clip_ratio/low_min": 0.0005605381156783551, "clip_ratio/region_mean": 0.0030586229331674986, "completions/clipped_ratio": 1.0, "completions/max_length": 669.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 644.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 619.0, "completions/min_terminated_length": 0.0, "entropy": 2.3205360174179077, "epoch": 12.357142857142858, "frac_reward_zero_std": 0.625, "grad_norm": 0.016968626528978348, "kl": 0.028704503551125526, "learning_rate": 3.1795751805908573e-05, "loss": -0.0025, "num_tokens": 14315264.0, "reward": 0.09375, "reward_std": 0.16675157845020294, "rewards/tree_correctness_reward/mean": 0.09375, "rewards/tree_correctness_reward/std": 0.29378482699394226, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.610810399055481, "sampling/importance_sampling_ratio/min": 2.5163465940007e-33, "sampling/sampling_logp_difference/max": 75.0625, "sampling/sampling_logp_difference/mean": 5.706564426422119, "step": 173 }, { "clip_ratio/high_max": 0.00031375500839203596, "clip_ratio/high_mean": 0.00015687750419601798, "clip_ratio/low_mean": 0.0009412650397280231, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001098142543924041, "completions/clipped_ratio": 1.0, "completions/max_length": 647.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 572.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 498.0, "completions/min_terminated_length": 0.0, "entropy": 2.3576813116669655, "epoch": 12.428571428571429, "frac_reward_zero_std": 0.875, "grad_norm": 0.004450065549463034, "kl": 0.0302806087420322, "learning_rate": 3.160073111944983e-05, "loss": -0.0007, "num_tokens": 14384032.0, "reward": 0.140625, "reward_std": 0.04419417306780815, "rewards/tree_correctness_reward/mean": 0.140625, "rewards/tree_correctness_reward/std": 0.3503824472427368, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.595444917678833, "sampling/importance_sampling_ratio/min": 1.729458002261791e-33, "sampling/sampling_logp_difference/max": 75.4375, "sampling/sampling_logp_difference/mean": 5.952648639678955, "step": 174 }, { "clip_ratio/high_max": 0.003181098640197888, "clip_ratio/high_mean": 0.0018218404438812286, "clip_ratio/low_mean": 0.0013910110137658194, "clip_ratio/low_min": 7.217090023914352e-05, "clip_ratio/region_mean": 0.003212851457647048, "completions/clipped_ratio": 1.0, "completions/max_length": 866.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 737.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 608.0, "completions/min_terminated_length": 0.0, "entropy": 3.379017136991024, "epoch": 12.5, "frac_reward_zero_std": 0.625, "grad_norm": 0.012382498942315578, "kl": 0.01943719497648999, "learning_rate": 3.1405277694064305e-05, "loss": 0.0005, "num_tokens": 14463328.0, "reward": 0.1875, "reward_std": 0.16675157845020294, "rewards/tree_correctness_reward/mean": 0.1875, "rewards/tree_correctness_reward/std": 0.39339789748191833, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.4958900213241577, "sampling/importance_sampling_ratio/min": 3.948096205583356e-38, "sampling/sampling_logp_difference/max": 86.125, "sampling/sampling_logp_difference/mean": 7.254941940307617, "step": 175 }, { "clip_ratio/high_max": 0.0018815820149029605, "clip_ratio/high_mean": 0.0011053400121454615, "clip_ratio/low_mean": 0.0009515224592178129, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020568624640873168, "completions/clipped_ratio": 1.0, "completions/max_length": 728.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 676.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 624.0, "completions/min_terminated_length": 0.0, "entropy": 2.845198839902878, "epoch": 12.571428571428571, "frac_reward_zero_std": 0.75, "grad_norm": 0.027791045606136322, "kl": 0.022636880865320563, "learning_rate": 3.120940434353038e-05, "loss": 0.0002, "num_tokens": 14538720.0, "reward": 0.125, "reward_std": 0.1157275140285492, "rewards/tree_correctness_reward/mean": 0.125, "rewards/tree_correctness_reward/std": 0.3333333432674408, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5571197867393494, "sampling/importance_sampling_ratio/min": 1.1056055867417628e-34, "sampling/sampling_logp_difference/max": 78.1875, "sampling/sampling_logp_difference/mean": 6.421689033508301, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1309.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 570.0, "completions/min_terminated_length": 0.0, "entropy": 3.6990578174591064, "epoch": 12.642857142857142, "frac_reward_zero_std": 1.0, "grad_norm": 0.00042879636748693883, "kl": 0.027255569119006395, "learning_rate": 3.101312390915634e-05, "loss": 0.0, "num_tokens": 14654624.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.4759170711040497, "sampling/importance_sampling_ratio/min": 1.9403981676294466e-34, "sampling/sampling_logp_difference/max": 77.625, "sampling/sampling_logp_difference/mean": 7.946482181549072, "step": 177 }, { "clip_ratio/high_max": 0.0016250000044237822, "clip_ratio/high_mean": 0.0008125000022118911, "clip_ratio/low_mean": 0.0013333333263290115, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0021458333139889874, "completions/clipped_ratio": 1.0, "completions/max_length": 750.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 680.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 610.0, "completions/min_terminated_length": 0.0, "entropy": 2.7229682877659798, "epoch": 12.714285714285714, "frac_reward_zero_std": 0.75, "grad_norm": 0.009208586998283863, "kl": 0.037115896004252136, "learning_rate": 3.0816449258938656e-05, "loss": 0.0018, "num_tokens": 14730272.0, "reward": 0.234375, "reward_std": 0.10205793380737305, "rewards/tree_correctness_reward/mean": 0.234375, "rewards/tree_correctness_reward/std": 0.42695629596710205, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5800920724868774, "sampling/importance_sampling_ratio/min": 2.4915206584065623e-34, "sampling/sampling_logp_difference/max": 77.375, "sampling/sampling_logp_difference/mean": 6.133828163146973, "step": 178 }, { "clip_ratio/high_max": 0.0016007339108909946, "clip_ratio/high_mean": 0.0009207624807459069, "clip_ratio/low_mean": 0.001777294703060761, "clip_ratio/low_min": 0.00016960651555564255, "clip_ratio/region_mean": 0.0026980571910826256, "completions/clipped_ratio": 1.0, "completions/max_length": 737.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 588.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 439.0, "completions/min_terminated_length": 0.0, "entropy": 2.439795434474945, "epoch": 12.785714285714286, "frac_reward_zero_std": 0.75, "grad_norm": 0.014131585136055946, "kl": 0.02913567249197513, "learning_rate": 3.061939328671824e-05, "loss": -0.0014, "num_tokens": 14800032.0, "reward": 0.109375, "reward_std": 0.12255740165710449, "rewards/tree_correctness_reward/mean": 0.109375, "rewards/tree_correctness_reward/std": 0.3145764470100403, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5888058543205261, "sampling/importance_sampling_ratio/min": 6.036401529078488e-33, "sampling/sampling_logp_difference/max": 74.1875, "sampling/sampling_logp_difference/mean": 6.05779504776001, "step": 179 }, { "clip_ratio/high_max": 0.006987972854403779, "clip_ratio/high_mean": 0.0037310881016310304, "clip_ratio/low_mean": 0.0038861708089825697, "clip_ratio/low_min": 0.0005932663843850605, "clip_ratio/region_mean": 0.007617258874233812, "completions/clipped_ratio": 1.0, "completions/max_length": 1435.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1047.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 659.0, "completions/min_terminated_length": 0.0, "entropy": 4.228788249194622, "epoch": 12.857142857142858, "frac_reward_zero_std": 0.5, "grad_norm": 0.021607959643006325, "kl": 0.044380172970704734, "learning_rate": 3.0421968911335196e-05, "loss": 0.0006, "num_tokens": 14899168.0, "reward": 0.25, "reward_std": 0.2041158676147461, "rewards/tree_correctness_reward/mean": 0.25, "rewards/tree_correctness_reward/std": 0.4364357888698578, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.4293631315231323, "sampling/importance_sampling_ratio/min": 7.209453331209461e-34, "sampling/sampling_logp_difference/max": 76.3125, "sampling/sampling_logp_difference/mean": 8.280280113220215, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1150.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 995.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 841.0, "completions/min_terminated_length": 0.0, "entropy": 4.185282461345196, "epoch": 12.928571428571429, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003255385672673583, "kl": 0.025569056975655258, "learning_rate": 3.0224189075781884e-05, "loss": 0.0, "num_tokens": 14995008.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.45240408182144165, "sampling/importance_sampling_ratio/min": 9.756933719611155e-35, "sampling/sampling_logp_difference/max": 78.3125, "sampling/sampling_logp_difference/mean": 7.8025031089782715, "step": 181 }, { "clip_ratio/high_max": 0.0021537161665037274, "clip_ratio/high_mean": 0.0010768580832518637, "clip_ratio/low_mean": 0.0006545608048327267, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00173141885898076, "completions/clipped_ratio": 1.0, "completions/max_length": 740.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 676.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 613.0, "completions/min_terminated_length": 0.0, "entropy": 3.048665329813957, "epoch": 13.0, "frac_reward_zero_std": 0.875, "grad_norm": 0.014822250232100487, "kl": 0.02501958835637197, "learning_rate": 3.002606674635432e-05, "loss": -0.0021, "num_tokens": 15070432.0, "reward": 0.0625, "reward_std": 0.06681530922651291, "rewards/tree_correctness_reward/mean": 0.0625, "rewards/tree_correctness_reward/std": 0.24397502839565277, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5363292694091797, "sampling/importance_sampling_ratio/min": 1.921254254894722e-35, "sampling/sampling_logp_difference/max": 79.9375, "sampling/sampling_logp_difference/mean": 6.71608829498291, "step": 182 }, { "clip_ratio/high_max": 0.0006957444711588323, "clip_ratio/high_mean": 0.00034787223557941616, "clip_ratio/low_mean": 0.0019095307361567393, "clip_ratio/low_min": 0.00021836180530954152, "clip_ratio/region_mean": 0.002257402964460198, "completions/clipped_ratio": 1.0, "completions/max_length": 1288.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1063.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 839.0, "completions/min_terminated_length": 0.0, "entropy": 4.65557274222374, "epoch": 13.071428571428571, "frac_reward_zero_std": 0.75, "grad_norm": 0.016191517934203148, "kl": 0.027342414134182036, "learning_rate": 2.9827614911802203e-05, "loss": -0.0011, "num_tokens": 15170624.0, "reward": 0.03125, "reward_std": 0.0883883461356163, "rewards/tree_correctness_reward/mean": 0.03125, "rewards/tree_correctness_reward/std": 0.17536810040473938, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.39652225375175476, "sampling/importance_sampling_ratio/min": 8.088783257874516e-35, "sampling/sampling_logp_difference/max": 78.5, "sampling/sampling_logp_difference/mean": 8.616994857788086, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 602.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 602.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 602.0, "completions/min_terminated_length": 0.0, "entropy": 2.4057064279913902, "epoch": 13.142857142857142, "frac_reward_zero_std": 1.0, "grad_norm": 0.0002082021237583831, "kl": 0.024940574658103287, "learning_rate": 2.9628846582477303e-05, "loss": 0.0, "num_tokens": 15241280.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5849063396453857, "sampling/importance_sampling_ratio/min": 3.589375444066961e-35, "sampling/sampling_logp_difference/max": 79.3125, "sampling/sampling_logp_difference/mean": 6.1050004959106445, "step": 184 }, { "clip_ratio/high_max": 0.004114529649086762, "clip_ratio/high_mean": 0.002057264824543381, "clip_ratio/low_mean": 0.0006201838332344778, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026774486614158377, "completions/clipped_ratio": 1.0, "completions/max_length": 718.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 549.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 381.0, "completions/min_terminated_length": 0.0, "entropy": 2.0768090412020683, "epoch": 13.214285714285714, "frac_reward_zero_std": 0.75, "grad_norm": 0.0090408343821764, "kl": 0.02835618262179196, "learning_rate": 2.9429774789480575e-05, "loss": 0.0016, "num_tokens": 15308576.0, "reward": 0.1875, "reward_std": 0.10888782143592834, "rewards/tree_correctness_reward/mean": 0.1875, "rewards/tree_correctness_reward/std": 0.39339789748191833, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6188526153564453, "sampling/importance_sampling_ratio/min": 4.608849322522735e-35, "sampling/sampling_logp_difference/max": 79.0625, "sampling/sampling_logp_difference/mean": 5.732775688171387, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 691.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 628.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 565.0, "completions/min_terminated_length": 0.0, "entropy": 2.0882799327373505, "epoch": 13.285714285714286, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013382836477831006, "kl": 0.02755960263311863, "learning_rate": 2.9230412583807793e-05, "loss": 0.0, "num_tokens": 15380896.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6454638242721558, "sampling/importance_sampling_ratio/min": 7.209453331209461e-34, "sampling/sampling_logp_difference/max": 76.3125, "sampling/sampling_logp_difference/mean": 5.233962059020996, "step": 186 }, { "clip_ratio/high_max": 0.0029592803912237287, "clip_ratio/high_mean": 0.0014796401956118643, "clip_ratio/low_mean": 0.0009469697251915932, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024266099208034575, "completions/clipped_ratio": 1.0, "completions/max_length": 1014.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 771.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 528.0, "completions/min_terminated_length": 0.0, "entropy": 3.373183622956276, "epoch": 13.357142857142858, "frac_reward_zero_std": 0.75, "grad_norm": 0.010952673852443695, "kl": 0.029659411986358464, "learning_rate": 2.9030773035493997e-05, "loss": 0.0009, "num_tokens": 15462368.0, "reward": 0.109375, "reward_std": 0.1315089464187622, "rewards/tree_correctness_reward/mean": 0.109375, "rewards/tree_correctness_reward/std": 0.3145764470100403, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5044841766357422, "sampling/importance_sampling_ratio/min": 8.610464551321452e-35, "sampling/sampling_logp_difference/max": 78.4375, "sampling/sampling_logp_difference/mean": 7.175005912780762, "step": 187 }, { "clip_ratio/high_max": 0.002643823972903192, "clip_ratio/high_mean": 0.001321911986451596, "clip_ratio/low_mean": 0.0011632825480774045, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024851945345290005, "completions/clipped_ratio": 1.0, "completions/max_length": 739.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 665.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 591.0, "completions/min_terminated_length": 0.0, "entropy": 3.0737359076738358, "epoch": 13.428571428571429, "frac_reward_zero_std": 0.75, "grad_norm": 0.01609029434621334, "kl": 0.030737996101379395, "learning_rate": 2.883086923275658e-05, "loss": 0.0036, "num_tokens": 15537056.0, "reward": 0.125, "reward_std": 0.1157275140285492, "rewards/tree_correctness_reward/mean": 0.125, "rewards/tree_correctness_reward/std": 0.3333333432674408, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5143834948539734, "sampling/importance_sampling_ratio/min": 3.199175604441272e-34, "sampling/sampling_logp_difference/max": 77.125, "sampling/sampling_logp_difference/mean": 7.012298583984375, "step": 188 }, { "clip_ratio/high_max": 0.0008668516238685697, "clip_ratio/high_mean": 0.00043342581193428487, "clip_ratio/low_mean": 0.0004117545067856554, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0008451803078060038, "completions/clipped_ratio": 1.0, "completions/max_length": 765.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 743.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 721.0, "completions/min_terminated_length": 0.0, "entropy": 3.0138007551431656, "epoch": 13.5, "frac_reward_zero_std": 0.875, "grad_norm": 0.008941834792494774, "kl": 0.02489829994738102, "learning_rate": 2.863071428113726e-05, "loss": -0.0012, "num_tokens": 15616736.0, "reward": 0.046875, "reward_std": 0.0646936446428299, "rewards/tree_correctness_reward/mean": 0.046875, "rewards/tree_correctness_reward/std": 0.21304203569889069, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5615396499633789, "sampling/importance_sampling_ratio/min": 1.3917519671963701e-36, "sampling/sampling_logp_difference/max": 82.5625, "sampling/sampling_logp_difference/mean": 6.346146106719971, "step": 189 }, { "clip_ratio/high_max": 0.00037726358277723193, "clip_ratio/high_mean": 0.00018863179138861597, "clip_ratio/low_mean": 0.001006036218313966, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001194668009702582, "completions/clipped_ratio": 1.0, "completions/max_length": 994.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 922.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 850.0, "completions/min_terminated_length": 0.0, "entropy": 3.851126864552498, "epoch": 13.571428571428571, "frac_reward_zero_std": 0.875, "grad_norm": 0.006888445932418108, "kl": 0.02941384818404913, "learning_rate": 2.843032130264289e-05, "loss": -0.0002, "num_tokens": 15707872.0, "reward": 0.015625, "reward_std": 0.04419417306780815, "rewards/tree_correctness_reward/mean": 0.015625, "rewards/tree_correctness_reward/std": 0.125, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.49413827061653137, "sampling/importance_sampling_ratio/min": 4.2868981798105594e-36, "sampling/sampling_logp_difference/max": 81.4375, "sampling/sampling_logp_difference/mean": 7.230227470397949, "step": 190 }, { "clip_ratio/high_max": 0.003675088875752408, "clip_ratio/high_mean": 0.0019402279031055514, "clip_ratio/low_mean": 0.002561283210525289, "clip_ratio/low_min": 0.0003765060246223584, "clip_ratio/region_mean": 0.0045015110954409465, "completions/clipped_ratio": 1.0, "completions/max_length": 913.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 890.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 867.0, "completions/min_terminated_length": 0.0, "entropy": 3.9888048470020294, "epoch": 13.642857142857142, "frac_reward_zero_std": 0.5, "grad_norm": 0.023377398028969765, "kl": 0.024544507963582873, "learning_rate": 2.8229703434885163e-05, "loss": 0.0018, "num_tokens": 15796960.0, "reward": 0.234375, "reward_std": 0.2109457403421402, "rewards/tree_correctness_reward/mean": 0.234375, "rewards/tree_correctness_reward/std": 0.42695629596710205, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.4573162794113159, "sampling/importance_sampling_ratio/min": 3.820870344979068e-35, "sampling/sampling_logp_difference/max": 79.25, "sampling/sampling_logp_difference/mean": 7.748763084411621, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 590.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 535.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 480.0, "completions/min_terminated_length": 0.0, "entropy": 1.4130274206399918, "epoch": 13.714285714285714, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003907468926627189, "kl": 0.03524069313425571, "learning_rate": 2.8028873830219372e-05, "loss": 0.0, "num_tokens": 15863328.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7080346345901489, "sampling/importance_sampling_ratio/min": 9.854154449263851e-34, "sampling/sampling_logp_difference/max": 76.0, "sampling/sampling_logp_difference/mean": 4.446960926055908, "step": 192 }, { "clip_ratio/high_max": 0.005395241372752935, "clip_ratio/high_mean": 0.002897941187256947, "clip_ratio/low_mean": 0.0010782616318465443, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003976202859121258, "completions/clipped_ratio": 1.0, "completions/max_length": 858.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 660.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 463.0, "completions/min_terminated_length": 0.0, "entropy": 2.415714241564274, "epoch": 13.785714285714286, "frac_reward_zero_std": 0.625, "grad_norm": 0.010011076927185059, "kl": 0.025214752182364464, "learning_rate": 2.782784565488211e-05, "loss": 0.0012, "num_tokens": 15937728.0, "reward": 0.203125, "reward_std": 0.189372718334198, "rewards/tree_correctness_reward/mean": 0.203125, "rewards/tree_correctness_reward/std": 0.40550529956817627, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6035638451576233, "sampling/importance_sampling_ratio/min": 4.107822865311063e-34, "sampling/sampling_logp_difference/max": 76.875, "sampling/sampling_logp_difference/mean": 5.838356971740723, "step": 193 }, { "clip_ratio/high_max": 0.0027389523456804454, "clip_ratio/high_mean": 0.0013694761728402227, "clip_ratio/low_mean": 0.0008012661419343203, "clip_ratio/low_min": 0.00016447369125671685, "clip_ratio/region_mean": 0.002170742300222628, "completions/clipped_ratio": 1.0, "completions/max_length": 950.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 740.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 530.0, "completions/min_terminated_length": 0.0, "entropy": 2.899347960948944, "epoch": 13.857142857142858, "frac_reward_zero_std": 0.75, "grad_norm": 0.01782345399260521, "kl": 0.025790081010200083, "learning_rate": 2.76266320881281e-05, "loss": 0.001, "num_tokens": 16017216.0, "reward": 0.140625, "reward_std": 0.1315089464187622, "rewards/tree_correctness_reward/mean": 0.140625, "rewards/tree_correctness_reward/std": 0.3503824472427368, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5609447956085205, "sampling/importance_sampling_ratio/min": 1.6955009379010048e-35, "sampling/sampling_logp_difference/max": 80.0625, "sampling/sampling_logp_difference/mean": 6.391366481781006, "step": 194 }, { "clip_ratio/high_max": 0.00697748466336634, "clip_ratio/high_mean": 0.0037050052851554938, "clip_ratio/low_mean": 0.0039289576598093845, "clip_ratio/low_min": 0.00041666667675599456, "clip_ratio/region_mean": 0.007633962974068709, "completions/clipped_ratio": 1.0, "completions/max_length": 900.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 739.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 578.0, "completions/min_terminated_length": 0.0, "entropy": 3.5911632776260376, "epoch": 13.928571428571429, "frac_reward_zero_std": 0.375, "grad_norm": 0.020882975310087204, "kl": 0.028505169437266886, "learning_rate": 2.7425246321366203e-05, "loss": -0.0001, "num_tokens": 16096640.0, "reward": 0.265625, "reward_std": 0.28930896520614624, "rewards/tree_correctness_reward/mean": 0.265625, "rewards/tree_correctness_reward/std": 0.44515693187713623, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.46753185987472534, "sampling/importance_sampling_ratio/min": 1.419625586416994e-34, "sampling/sampling_logp_difference/max": 77.9375, "sampling/sampling_logp_difference/mean": 7.673016548156738, "step": 195 }, { "clip_ratio/high_max": 0.0028170455334475264, "clip_ratio/high_mean": 0.0017812745063565671, "clip_ratio/low_mean": 0.0004901172251265962, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0022713917278451845, "completions/clipped_ratio": 1.0, "completions/max_length": 843.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 692.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 542.0, "completions/min_terminated_length": 0.0, "entropy": 2.9686704501509666, "epoch": 14.0, "frac_reward_zero_std": 0.75, "grad_norm": 0.0103598115965724, "kl": 0.026476859231479466, "learning_rate": 2.7223701557294575e-05, "loss": 0.0059, "num_tokens": 16173088.0, "reward": 0.15625, "reward_std": 0.1293872892856598, "rewards/tree_correctness_reward/mean": 0.15625, "rewards/tree_correctness_reward/std": 0.36596253514289856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5382677316665649, "sampling/importance_sampling_ratio/min": 2.3405667503630854e-34, "sampling/sampling_logp_difference/max": 77.4375, "sampling/sampling_logp_difference/mean": 6.725530624389648, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1462.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 877.0, "completions/min_terminated_length": 0.0, "entropy": 4.905087798833847, "epoch": 14.071428571428571, "frac_reward_zero_std": 1.0, "grad_norm": 0.001057412475347519, "kl": 0.029455579700879753, "learning_rate": 2.7022011009035107e-05, "loss": 0.0, "num_tokens": 16298816.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.35974109172821045, "sampling/importance_sampling_ratio/min": 5.917879946392805e-35, "sampling/sampling_logp_difference/max": 78.8125, "sampling/sampling_logp_difference/mean": 9.452171325683594, "step": 197 }, { "clip_ratio/high_max": 0.0007735148537904024, "clip_ratio/high_mean": 0.0003867574268952012, "clip_ratio/low_mean": 0.0006445957042160444, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010313531311112456, "completions/clipped_ratio": 1.0, "completions/max_length": 662.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 634.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 606.0, "completions/min_terminated_length": 0.0, "entropy": 2.113623693585396, "epoch": 14.142857142857142, "frac_reward_zero_std": 0.875, "grad_norm": 0.007259134668856859, "kl": 0.02433819102589041, "learning_rate": 2.68201878992672e-05, "loss": -0.0013, "num_tokens": 16371520.0, "reward": 0.046875, "reward_std": 0.0646936446428299, "rewards/tree_correctness_reward/mean": 0.046875, "rewards/tree_correctness_reward/std": 0.21304203569889069, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6477788686752319, "sampling/importance_sampling_ratio/min": 2.1343206057139347e-37, "sampling/sampling_logp_difference/max": 84.4375, "sampling/sampling_logp_difference/mean": 5.209725379943848, "step": 198 }, { "clip_ratio/high_max": 0.003833049369859509, "clip_ratio/high_mean": 0.0019165246849297546, "clip_ratio/low_mean": 0.0012510647320596036, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031675894242653158, "completions/clipped_ratio": 1.0, "completions/max_length": 828.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 707.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 587.0, "completions/min_terminated_length": 0.0, "entropy": 2.9889007061719894, "epoch": 14.214285714285714, "frac_reward_zero_std": 0.75, "grad_norm": 0.010898197069764137, "kl": 0.031768193119205534, "learning_rate": 2.6618245459360897e-05, "loss": -0.0003, "num_tokens": 16448928.0, "reward": 0.109375, "reward_std": 0.1315089464187622, "rewards/tree_correctness_reward/mean": 0.109375, "rewards/tree_correctness_reward/std": 0.3145764470100403, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5527850389480591, "sampling/importance_sampling_ratio/min": 7.067899417566791e-36, "sampling/sampling_logp_difference/max": 80.9375, "sampling/sampling_logp_difference/mean": 6.478890419006348, "step": 199 }, { "clip_ratio/high_max": 0.00035444235254544765, "clip_ratio/high_mean": 0.00017722117627272382, "clip_ratio/low_mean": 0.000827032134111505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010042533176601864, "completions/clipped_ratio": 1.0, "completions/max_length": 929.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 529.0, "completions/min_terminated_length": 0.0, "entropy": 2.8105850517749786, "epoch": 14.285714285714286, "frac_reward_zero_std": 0.875, "grad_norm": 0.008715618401765823, "kl": 0.03406154958065599, "learning_rate": 2.6416196928509408e-05, "loss": -0.0008, "num_tokens": 16527712.0, "reward": 0.03125, "reward_std": 0.0578637570142746, "rewards/tree_correctness_reward/mean": 0.03125, "rewards/tree_correctness_reward/std": 0.17536810040473938, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5645280480384827, "sampling/importance_sampling_ratio/min": 1.252815132018138e-34, "sampling/sampling_logp_difference/max": 78.0625, "sampling/sampling_logp_difference/mean": 6.338950157165527, "step": 200 }, { "epoch": 14.285714285714286, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 470.72, "eval_completions/max_terminated_length": 0.0, "eval_completions/mean_length": 432.34, "eval_completions/mean_terminated_length": 0.0, "eval_completions/min_length": 393.96, "eval_completions/min_terminated_length": 0.0, "eval_entropy": 0.7700299763679505, "eval_frac_reward_zero_std": 0.92, "eval_kl": 0.04062484011054039, "eval_loss": -0.0014642525929957628, "eval_num_tokens": 16527712.0, "eval_reward": 0.04, "eval_reward_std": 0.028284270763397217, "eval_rewards/tree_correctness_reward/mean": 0.04, "eval_rewards/tree_correctness_reward/std": 0.028284270763397217, "eval_runtime": 142.5231, "eval_samples_per_second": 0.175, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 0.8294742226600647, "eval_sampling/importance_sampling_ratio/min": 3.4608193420152635e-31, "eval_sampling/sampling_logp_difference/max": 73.6225, "eval_sampling/sampling_logp_difference/mean": 2.746130599975586, "eval_steps_per_second": 0.028, "step": 200 }, { "clip_ratio/high_max": 0.002604166671517305, "clip_ratio/high_mean": 0.0013020833357586525, "clip_ratio/low_mean": 0.00036015070509165525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016622340408503078, "completions/clipped_ratio": 1.0, "completions/max_length": 893.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 728.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 564.0, "completions/min_terminated_length": 0.0, "entropy": 2.8500606566667557, "epoch": 14.357142857142858, "frac_reward_zero_std": 0.875, "grad_norm": 0.009209881536662579, "kl": 0.024201573571190238, "learning_rate": 2.621405555286121e-05, "loss": 0.0016, "num_tokens": 16606464.0, "reward": 0.078125, "reward_std": 0.0646936446428299, "rewards/tree_correctness_reward/mean": 0.078125, "rewards/tree_correctness_reward/std": 0.27048972249031067, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5770840644836426, "sampling/importance_sampling_ratio/min": 9.756933719611155e-35, "sampling/sampling_logp_difference/max": 78.3125, "sampling/sampling_logp_difference/mean": 6.15873384475708, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1035.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 804.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 574.0, "completions/min_terminated_length": 0.0, "entropy": 3.1010611057281494, "epoch": 14.428571428571429, "frac_reward_zero_std": 1.0, "grad_norm": 0.00026591523783281446, "kl": 0.02304026298224926, "learning_rate": 2.6011834584651597e-05, "loss": 0.0, "num_tokens": 16690080.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.550147533416748, "sampling/importance_sampling_ratio/min": 3.820870344979068e-35, "sampling/sampling_logp_difference/max": 79.25, "sampling/sampling_logp_difference/mean": 6.507948875427246, "step": 202 }, { "clip_ratio/high_max": 0.003956718333938625, "clip_ratio/high_mean": 0.0019783591669693124, "clip_ratio/low_mean": 0.001413113706803415, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0033914728373929393, "completions/clipped_ratio": 1.0, "completions/max_length": 656.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 521.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 387.0, "completions/min_terminated_length": 0.0, "entropy": 1.980580911040306, "epoch": 14.5, "frac_reward_zero_std": 0.75, "grad_norm": 0.011515488848090172, "kl": 0.032972345943562686, "learning_rate": 2.5809547281333902e-05, "loss": 0.0017, "num_tokens": 16755584.0, "reward": 0.25, "reward_std": 0.1157275140285492, "rewards/tree_correctness_reward/mean": 0.25, "rewards/tree_correctness_reward/std": 0.4364357888698578, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6250667572021484, "sampling/importance_sampling_ratio/min": 4.4296417729802506e-39, "sampling/sampling_logp_difference/max": 88.3125, "sampling/sampling_logp_difference/mean": 5.643056869506836, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 826.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 633.0, "completions/min_terminated_length": 0.0, "entropy": 3.1859359741210938, "epoch": 14.571428571428571, "frac_reward_zero_std": 1.0, "grad_norm": 0.000218653935007751, "kl": 0.023515197914093733, "learning_rate": 2.560720690471033e-05, "loss": 0.0, "num_tokens": 16834400.0, "reward": 0.125, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.125, "rewards/tree_correctness_reward/std": 0.3333333432674408, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5276594161987305, "sampling/importance_sampling_ratio/min": 2.823262612952882e-34, "sampling/sampling_logp_difference/max": 77.25, "sampling/sampling_logp_difference/mean": 6.832664489746094, "step": 204 }, { "clip_ratio/high_max": 0.004514786909567192, "clip_ratio/high_mean": 0.002428852516459301, "clip_ratio/low_mean": 0.003914784647349734, "clip_ratio/low_min": 0.001542643949505873, "clip_ratio/region_mean": 0.006343637171084993, "completions/clipped_ratio": 1.0, "completions/max_length": 547.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 478.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 410.0, "completions/min_terminated_length": 0.0, "entropy": 1.7004540115594864, "epoch": 14.642857142857142, "frac_reward_zero_std": 0.375, "grad_norm": 0.021421676501631737, "kl": 0.036058435565792024, "learning_rate": 2.540482672006254e-05, "loss": 0.0017, "num_tokens": 16897152.0, "reward": 0.203125, "reward_std": 0.2867126166820526, "rewards/tree_correctness_reward/mean": 0.203125, "rewards/tree_correctness_reward/std": 0.40550529956817627, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6488189697265625, "sampling/importance_sampling_ratio/min": 9.756933719611155e-35, "sampling/sampling_logp_difference/max": 78.3125, "sampling/sampling_logp_difference/mean": 5.339080810546875, "step": 205 }, { "clip_ratio/high_max": 0.002428756473818794, "clip_ratio/high_mean": 0.001214378236909397, "clip_ratio/low_mean": 0.001011981839837972, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0022263600476435386, "completions/clipped_ratio": 1.0, "completions/max_length": 1068.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 727.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 386.0, "completions/min_terminated_length": 0.0, "entropy": 3.25674706697464, "epoch": 14.714285714285714, "frac_reward_zero_std": 0.875, "grad_norm": 0.009076106362044811, "kl": 0.03493415983393788, "learning_rate": 2.5202419995281968e-05, "loss": -0.0003, "num_tokens": 16975808.0, "reward": 0.421875, "reward_std": 0.0646936446428299, "rewards/tree_correctness_reward/mean": 0.421875, "rewards/tree_correctness_reward/std": 0.49776285886764526, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.4993954300880432, "sampling/importance_sampling_ratio/min": 2.9756967944000553e-35, "sampling/sampling_logp_difference/max": 79.5, "sampling/sampling_logp_difference/mean": 7.340535640716553, "step": 206 }, { "clip_ratio/high_max": 0.002774532651528716, "clip_ratio/high_mean": 0.001387266325764358, "clip_ratio/low_mean": 0.0009833574658841826, "clip_ratio/low_min": 4.8676010919734836e-05, "clip_ratio/region_mean": 0.0023706237770966254, "completions/clipped_ratio": 1.0, "completions/max_length": 668.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 655.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 642.0, "completions/min_terminated_length": 0.0, "entropy": 2.4219513535499573, "epoch": 14.785714285714286, "frac_reward_zero_std": 0.75, "grad_norm": 0.0073575968854129314, "kl": 0.026420122128911316, "learning_rate": 2.5e-05, "loss": -0.0035, "num_tokens": 17049856.0, "reward": 0.25, "reward_std": 0.0883883461356163, "rewards/tree_correctness_reward/mean": 0.25, "rewards/tree_correctness_reward/std": 0.4364357888698578, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6256164312362671, "sampling/importance_sampling_ratio/min": 4.608849322522735e-35, "sampling/sampling_logp_difference/max": 79.0625, "sampling/sampling_logp_difference/mean": 5.4917402267456055, "step": 207 }, { "clip_ratio/high_max": 0.0028541341744130477, "clip_ratio/high_mean": 0.0014270670872065239, "clip_ratio/low_mean": 0.0006864511742605828, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002113518257829128, "completions/clipped_ratio": 1.0, "completions/max_length": 640.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 610.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 581.0, "completions/min_terminated_length": 0.0, "entropy": 2.7840451151132584, "epoch": 14.857142857142858, "frac_reward_zero_std": 0.75, "grad_norm": 0.011082695797085762, "kl": 0.02726688567781821, "learning_rate": 2.4797580004718038e-05, "loss": 0.0011, "num_tokens": 17121056.0, "reward": 0.265625, "reward_std": 0.1315089464187622, "rewards/tree_correctness_reward/mean": 0.265625, "rewards/tree_correctness_reward/std": 0.44515693187713623, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.54203861951828, "sampling/importance_sampling_ratio/min": 1.0489692816875613e-33, "sampling/sampling_logp_difference/max": 75.9375, "sampling/sampling_logp_difference/mean": 6.665943145751953, "step": 208 }, { "clip_ratio/high_max": 0.003915313151082955, "clip_ratio/high_mean": 0.0019576565755414777, "clip_ratio/low_mean": 0.00174013924333849, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0036977958043280523, "completions/clipped_ratio": 1.0, "completions/max_length": 979.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 705.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 431.0, "completions/min_terminated_length": 0.0, "entropy": 2.8865185379981995, "epoch": 14.928571428571429, "frac_reward_zero_std": 0.75, "grad_norm": 0.01018757838755846, "kl": 0.03948140167631209, "learning_rate": 2.4595173279937464e-05, "loss": -0.0012, "num_tokens": 17198304.0, "reward": 0.140625, "reward_std": 0.10205793380737305, "rewards/tree_correctness_reward/mean": 0.140625, "rewards/tree_correctness_reward/std": 0.3503824472427368, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5716643333435059, "sampling/importance_sampling_ratio/min": 8.44140221881032e-37, "sampling/sampling_logp_difference/max": 83.0625, "sampling/sampling_logp_difference/mean": 6.2904438972473145, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 657.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 533.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 409.0, "completions/min_terminated_length": 0.0, "entropy": 2.163128189742565, "epoch": 15.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.0002546028990764171, "kl": 0.0319756242679432, "learning_rate": 2.4392793095289677e-05, "loss": 0.0, "num_tokens": 17264544.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6186825037002563, "sampling/importance_sampling_ratio/min": 2.3405667503630854e-34, "sampling/sampling_logp_difference/max": 77.4375, "sampling/sampling_logp_difference/mean": 5.685050010681152, "step": 210 }, { "clip_ratio/high_max": 0.0032922240716288798, "clip_ratio/high_mean": 0.0016461120358144399, "clip_ratio/low_mean": 0.0009145066687779035, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0025606187045923434, "completions/clipped_ratio": 1.0, "completions/max_length": 619.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 608.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 598.0, "completions/min_terminated_length": 0.0, "entropy": 2.316335365176201, "epoch": 15.071428571428571, "frac_reward_zero_std": 0.75, "grad_norm": 0.010139139369130135, "kl": 0.028835673234425485, "learning_rate": 2.419045271866611e-05, "loss": 0.001, "num_tokens": 17335616.0, "reward": 0.234375, "reward_std": 0.10205793380737305, "rewards/tree_correctness_reward/mean": 0.234375, "rewards/tree_correctness_reward/std": 0.42695629596710205, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6151121258735657, "sampling/importance_sampling_ratio/min": 7.138325476697025e-35, "sampling/sampling_logp_difference/max": 78.625, "sampling/sampling_logp_difference/mean": 5.648311138153076, "step": 211 }, { "clip_ratio/high_max": 0.005747168164816685, "clip_ratio/high_mean": 0.0028735840824083425, "clip_ratio/low_mean": 0.0021333621043595485, "clip_ratio/low_min": 0.00059010097902501, "clip_ratio/region_mean": 0.005006946172215976, "completions/clipped_ratio": 1.0, "completions/max_length": 919.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 656.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 393.0, "completions/min_terminated_length": 0.0, "entropy": 2.845491275191307, "epoch": 15.142857142857142, "frac_reward_zero_std": 0.625, "grad_norm": 0.03462747484445572, "kl": 0.04340207576751709, "learning_rate": 2.3988165415348416e-05, "loss": -0.0002, "num_tokens": 17409728.0, "reward": 0.171875, "reward_std": 0.16887323558330536, "rewards/tree_correctness_reward/mean": 0.171875, "rewards/tree_correctness_reward/std": 0.38025420904159546, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5521558523178101, "sampling/importance_sampling_ratio/min": 5.274548762960186e-34, "sampling/sampling_logp_difference/max": 76.625, "sampling/sampling_logp_difference/mean": 6.608832359313965, "step": 212 }, { "clip_ratio/high_max": 0.0034550330456113443, "clip_ratio/high_mean": 0.0017275165228056721, "clip_ratio/low_mean": 0.000876650166901527, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041666496894322, "completions/clipped_ratio": 1.0, "completions/max_length": 720.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 663.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 606.0, "completions/min_terminated_length": 0.0, "entropy": 3.0934078618884087, "epoch": 15.214285714285714, "frac_reward_zero_std": 0.75, "grad_norm": 0.009545124135911465, "kl": 0.029771902482025325, "learning_rate": 2.3785944447138802e-05, "loss": -0.0009, "num_tokens": 17484288.0, "reward": 0.140625, "reward_std": 0.12255740165710449, "rewards/tree_correctness_reward/mean": 0.140625, "rewards/tree_correctness_reward/std": 0.3503824472427368, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5301157832145691, "sampling/importance_sampling_ratio/min": 1.038620201872273e-34, "sampling/sampling_logp_difference/max": 78.25, "sampling/sampling_logp_difference/mean": 6.791543006896973, "step": 213 }, { "clip_ratio/high_max": 0.0026911954082606826, "clip_ratio/high_mean": 0.0013455977041303413, "clip_ratio/low_mean": 0.00016066838361439295, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015062660513649462, "completions/clipped_ratio": 1.0, "completions/max_length": 778.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 719.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 661.0, "completions/min_terminated_length": 0.0, "entropy": 3.1694808304309845, "epoch": 15.285714285714286, "frac_reward_zero_std": 0.875, "grad_norm": 0.01034119538962841, "kl": 0.022417416563257575, "learning_rate": 2.358380307149059e-05, "loss": -0.0002, "num_tokens": 17562464.0, "reward": 0.21875, "reward_std": 0.0578637570142746, "rewards/tree_correctness_reward/mean": 0.21875, "rewards/tree_correctness_reward/std": 0.4166666865348816, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.538644552230835, "sampling/importance_sampling_ratio/min": 8.696261289635204e-34, "sampling/sampling_logp_difference/max": 76.125, "sampling/sampling_logp_difference/mean": 6.649867057800293, "step": 214 }, { "clip_ratio/high_max": 0.0026956868532579392, "clip_ratio/high_mean": 0.0013478434266289696, "clip_ratio/low_mean": 0.00017472045146860182, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015225638780975714, "completions/clipped_ratio": 1.0, "completions/max_length": 850.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 738.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 626.0, "completions/min_terminated_length": 0.0, "entropy": 3.112804114818573, "epoch": 15.357142857142858, "frac_reward_zero_std": 0.875, "grad_norm": 0.0043120733462274075, "kl": 0.029794610920362175, "learning_rate": 2.338175454063911e-05, "loss": -0.0011, "num_tokens": 17641824.0, "reward": 0.09375, "reward_std": 0.0578637570142746, "rewards/tree_correctness_reward/mean": 0.09375, "rewards/tree_correctness_reward/std": 0.29378482699394226, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5412973761558533, "sampling/importance_sampling_ratio/min": 1.028373383751816e-35, "sampling/sampling_logp_difference/max": 80.5625, "sampling/sampling_logp_difference/mean": 6.6173834800720215, "step": 215 }, { "clip_ratio/high_max": 0.00212053582072258, "clip_ratio/high_mean": 0.00106026791036129, "clip_ratio/low_mean": 0.0004464285848371219, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015066964951984119, "completions/clipped_ratio": 1.0, "completions/max_length": 601.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 580.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 560.0, "completions/min_terminated_length": 0.0, "entropy": 2.1566902697086334, "epoch": 15.428571428571429, "frac_reward_zero_std": 0.875, "grad_norm": 0.007419189438223839, "kl": 0.026737020234577358, "learning_rate": 2.31798121007328e-05, "loss": -0.0013, "num_tokens": 17711104.0, "reward": 0.0625, "reward_std": 0.06681530922651291, "rewards/tree_correctness_reward/mean": 0.0625, "rewards/tree_correctness_reward/std": 0.24397502839565277, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6231087446212769, "sampling/importance_sampling_ratio/min": 2.7678293012408834e-36, "sampling/sampling_logp_difference/max": 81.875, "sampling/sampling_logp_difference/mean": 5.572293281555176, "step": 216 }, { "clip_ratio/high_max": 0.002445864345645532, "clip_ratio/high_mean": 0.0013713882508454844, "clip_ratio/low_mean": 0.004617940263415221, "clip_ratio/low_min": 0.0007658027461729944, "clip_ratio/region_mean": 0.0059893285142607056, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1234.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 421.0, "completions/min_terminated_length": 0.0, "entropy": 3.667554661631584, "epoch": 15.5, "frac_reward_zero_std": 0.625, "grad_norm": 0.017525849863886833, "kl": 0.03909668151754886, "learning_rate": 2.29779889909649e-05, "loss": 0.0005, "num_tokens": 17822240.0, "reward": 0.203125, "reward_std": 0.1530819982290268, "rewards/tree_correctness_reward/mean": 0.203125, "rewards/tree_correctness_reward/std": 0.40550529956817627, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.45436516404151917, "sampling/importance_sampling_ratio/min": 3.4055047095468348e-34, "sampling/sampling_logp_difference/max": 77.0625, "sampling/sampling_logp_difference/mean": 8.33371353149414, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 753.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 696.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 640.0, "completions/min_terminated_length": 0.0, "entropy": 2.5313060954213142, "epoch": 15.571428571428571, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007494184537790716, "kl": 0.032033045892603695, "learning_rate": 2.277629844270543e-05, "loss": 0.0, "num_tokens": 17898944.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6245624423027039, "sampling/importance_sampling_ratio/min": 1.2652986294762563e-33, "sampling/sampling_logp_difference/max": 75.75, "sampling/sampling_logp_difference/mean": 5.47308874130249, "step": 218 }, { "clip_ratio/high_max": 0.0008883779009920545, "clip_ratio/high_mean": 0.00044418895049602725, "clip_ratio/low_mean": 0.0011757942902477225, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016199832443817286, "completions/clipped_ratio": 1.0, "completions/max_length": 624.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 611.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 598.0, "completions/min_terminated_length": 0.0, "entropy": 2.4339858070015907, "epoch": 15.642857142857142, "frac_reward_zero_std": 0.75, "grad_norm": 0.00969514437019825, "kl": 0.024944556469563395, "learning_rate": 2.25747536786338e-05, "loss": 0.0002, "num_tokens": 17970176.0, "reward": 0.203125, "reward_std": 0.11100947856903076, "rewards/tree_correctness_reward/mean": 0.203125, "rewards/tree_correctness_reward/std": 0.40550529956817627, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5917245149612427, "sampling/importance_sampling_ratio/min": 5.119969264862197e-37, "sampling/sampling_logp_difference/max": 83.5625, "sampling/sampling_logp_difference/mean": 5.998866558074951, "step": 219 }, { "clip_ratio/high_max": 0.0023312612756853923, "clip_ratio/high_mean": 0.0012017578046652488, "clip_ratio/low_mean": 0.0011047833249904215, "clip_ratio/low_min": 0.00014450866729021072, "clip_ratio/region_mean": 0.0023065411296556704, "completions/clipped_ratio": 1.0, "completions/max_length": 865.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 731.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 597.0, "completions/min_terminated_length": 0.0, "entropy": 3.09745953977108, "epoch": 15.714285714285714, "frac_reward_zero_std": 0.75, "grad_norm": 0.0066674454137682915, "kl": 0.03201548790093511, "learning_rate": 2.2373367911871904e-05, "loss": -0.0009, "num_tokens": 18049088.0, "reward": 0.109375, "reward_std": 0.12255740165710449, "rewards/tree_correctness_reward/mean": 0.109375, "rewards/tree_correctness_reward/std": 0.3145764470100403, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.537007749080658, "sampling/importance_sampling_ratio/min": 3.0053473766614753e-34, "sampling/sampling_logp_difference/max": 77.1875, "sampling/sampling_logp_difference/mean": 6.701072692871094, "step": 220 }, { "clip_ratio/high_max": 9.904913167702034e-05, "clip_ratio/high_mean": 4.952456583851017e-05, "clip_ratio/low_mean": 0.001535261517346953, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015847860831854632, "completions/clipped_ratio": 1.0, "completions/max_length": 631.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 549.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 467.0, "completions/min_terminated_length": 0.0, "entropy": 1.7934300675988197, "epoch": 15.785714285714286, "frac_reward_zero_std": 0.875, "grad_norm": 0.007198211271315813, "kl": 0.036280740168876946, "learning_rate": 2.2172154345117894e-05, "loss": 0.0012, "num_tokens": 18116352.0, "reward": 0.015625, "reward_std": 0.04419417306780815, "rewards/tree_correctness_reward/mean": 0.015625, "rewards/tree_correctness_reward/std": 0.125, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6640059351921082, "sampling/importance_sampling_ratio/min": 2.3405667503630854e-34, "sampling/sampling_logp_difference/max": 77.4375, "sampling/sampling_logp_difference/mean": 5.0348100662231445, "step": 221 }, { "clip_ratio/high_max": 0.004329485007474432, "clip_ratio/high_mean": 0.002164742503737216, "clip_ratio/low_mean": 0.0018594582797959447, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004024200739877415, "completions/clipped_ratio": 1.0, "completions/max_length": 567.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 565.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 563.0, "completions/min_terminated_length": 0.0, "entropy": 2.231121025979519, "epoch": 15.857142857142858, "frac_reward_zero_std": 0.75, "grad_norm": 0.010057265870273113, "kl": 0.0349488704232499, "learning_rate": 2.1971126169780637e-05, "loss": 0.0032, "num_tokens": 18184640.0, "reward": 0.09375, "reward_std": 0.10888782143592834, "rewards/tree_correctness_reward/mean": 0.09375, "rewards/tree_correctness_reward/std": 0.29378482699394226, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6257569193840027, "sampling/importance_sampling_ratio/min": 2.823262612952882e-34, "sampling/sampling_logp_difference/max": 77.25, "sampling/sampling_logp_difference/mean": 5.536738395690918, "step": 222 }, { "clip_ratio/high_max": 0.0012673095770878717, "clip_ratio/high_mean": 0.0006336547885439359, "clip_ratio/low_mean": 0.0010045746912510367, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016382294797949726, "completions/clipped_ratio": 1.0, "completions/max_length": 1011.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 846.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 682.0, "completions/min_terminated_length": 0.0, "entropy": 3.601658582687378, "epoch": 15.928571428571429, "frac_reward_zero_std": 0.875, "grad_norm": 0.007927772589027882, "kl": 0.0297056520357728, "learning_rate": 2.177029656511485e-05, "loss": 0.0007, "num_tokens": 18270944.0, "reward": 0.0625, "reward_std": 0.06681530922651291, "rewards/tree_correctness_reward/mean": 0.0625, "rewards/tree_correctness_reward/std": 0.24397502839565277, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.502057671546936, "sampling/importance_sampling_ratio/min": 7.067899417566791e-36, "sampling/sampling_logp_difference/max": 80.9375, "sampling/sampling_logp_difference/mean": 7.148786544799805, "step": 223 }, { "clip_ratio/high_max": 0.000599680162849836, "clip_ratio/high_mean": 0.000299840081424918, "clip_ratio/low_mean": 0.0011993603329756297, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0014992004144005477, "completions/clipped_ratio": 1.0, "completions/max_length": 843.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 656.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 469.0, "completions/min_terminated_length": 0.0, "entropy": 2.7331860437989235, "epoch": 16.0, "frac_reward_zero_std": 0.875, "grad_norm": 0.01185515709221363, "kl": 0.03380422340705991, "learning_rate": 2.1569678697357127e-05, "loss": 0.0004, "num_tokens": 18345056.0, "reward": 0.03125, "reward_std": 0.0578637570142746, "rewards/tree_correctness_reward/mean": 0.03125, "rewards/tree_correctness_reward/std": 0.17536810040473938, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5504503846168518, "sampling/importance_sampling_ratio/min": 1.729458002261791e-33, "sampling/sampling_logp_difference/max": 75.4375, "sampling/sampling_logp_difference/mean": 6.588512420654297, "step": 224 }, { "clip_ratio/high_max": 0.0017388712440151721, "clip_ratio/high_mean": 0.0008694356220075861, "clip_ratio/low_mean": 0.0016891892155399546, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002558624815719668, "completions/clipped_ratio": 1.0, "completions/max_length": 700.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 664.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 629.0, "completions/min_terminated_length": 0.0, "entropy": 3.474219486117363, "epoch": 16.071428571428573, "frac_reward_zero_std": 0.625, "grad_norm": 0.0100250830873847, "kl": 0.03455896023660898, "learning_rate": 2.136928571886275e-05, "loss": 0.0028, "num_tokens": 18419712.0, "reward": 0.125, "reward_std": 0.16675157845020294, "rewards/tree_correctness_reward/mean": 0.125, "rewards/tree_correctness_reward/std": 0.3333333432674408, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.46491092443466187, "sampling/importance_sampling_ratio/min": 1.6955009379010048e-35, "sampling/sampling_logp_difference/max": 80.0625, "sampling/sampling_logp_difference/mean": 7.704167366027832, "step": 225 }, { "clip_ratio/high_max": 0.004565939387248363, "clip_ratio/high_mean": 0.0024582033474871423, "clip_ratio/low_mean": 0.0017645313528191764, "clip_ratio/low_min": 0.0004167236365901772, "clip_ratio/region_mean": 0.004222734629365732, "completions/clipped_ratio": 1.0, "completions/max_length": 752.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 643.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 535.0, "completions/min_terminated_length": 0.0, "entropy": 2.9054799899458885, "epoch": 16.142857142857142, "frac_reward_zero_std": 0.5, "grad_norm": 0.020815420895814896, "kl": 0.03865009103901684, "learning_rate": 2.1169130767243422e-05, "loss": 0.0016, "num_tokens": 18493024.0, "reward": 0.21875, "reward_std": 0.2540663480758667, "rewards/tree_correctness_reward/mean": 0.21875, "rewards/tree_correctness_reward/std": 0.4166666865348816, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5345680713653564, "sampling/importance_sampling_ratio/min": 4.9549805217845695e-34, "sampling/sampling_logp_difference/max": 76.6875, "sampling/sampling_logp_difference/mean": 6.7815961837768555, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 739.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 614.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 489.0, "completions/min_terminated_length": 0.0, "entropy": 2.4024314656853676, "epoch": 16.214285714285715, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011004662374034524, "kl": 0.03969264472834766, "learning_rate": 2.0969226964506006e-05, "loss": 0.0, "num_tokens": 18564448.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5836594104766846, "sampling/importance_sampling_ratio/min": 2.8513940753957304e-33, "sampling/sampling_logp_difference/max": 74.9375, "sampling/sampling_logp_difference/mean": 6.142618656158447, "step": 227 }, { "clip_ratio/high_max": 0.0009357686503790319, "clip_ratio/high_mean": 0.00046788432518951595, "clip_ratio/low_mean": 0.0015905454056337476, "clip_ratio/low_min": 0.0004309708674554713, "clip_ratio/region_mean": 0.0020584297308232635, "completions/clipped_ratio": 1.0, "completions/max_length": 1077.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 916.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 755.0, "completions/min_terminated_length": 0.0, "entropy": 3.7237849980592728, "epoch": 16.285714285714285, "frac_reward_zero_std": 0.75, "grad_norm": 0.012808295898139477, "kl": 0.02642447082325816, "learning_rate": 2.076958741619221e-05, "loss": 0.0006, "num_tokens": 18655200.0, "reward": 0.03125, "reward_std": 0.0883883461356163, "rewards/tree_correctness_reward/mean": 0.03125, "rewards/tree_correctness_reward/std": 0.17536810040473938, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.50248783826828, "sampling/importance_sampling_ratio/min": 4.0271676663933825e-36, "sampling/sampling_logp_difference/max": 81.5, "sampling/sampling_logp_difference/mean": 7.122012138366699, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 612.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 582.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 552.0, "completions/min_terminated_length": 0.0, "entropy": 2.2150024324655533, "epoch": 16.357142857142858, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004922721418552101, "kl": 0.036965787410736084, "learning_rate": 2.0570225210519434e-05, "loss": 0.0, "num_tokens": 18724576.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6145634055137634, "sampling/importance_sampling_ratio/min": 9.756933719611155e-35, "sampling/sampling_logp_difference/max": 78.3125, "sampling/sampling_logp_difference/mean": 5.680960655212402, "step": 229 }, { "clip_ratio/high_max": 0.00620228995103389, "clip_ratio/high_mean": 0.003101144975516945, "clip_ratio/low_mean": 0.001789122135960497, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004890267111477442, "completions/clipped_ratio": 1.0, "completions/max_length": 835.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 614.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 393.0, "completions/min_terminated_length": 0.0, "entropy": 2.348319798707962, "epoch": 16.428571428571427, "frac_reward_zero_std": 0.75, "grad_norm": 0.013621453195810318, "kl": 0.059258300345391035, "learning_rate": 2.03711534175227e-05, "loss": -0.0013, "num_tokens": 18796000.0, "reward": 0.140625, "reward_std": 0.12255740165710449, "rewards/tree_correctness_reward/mean": 0.140625, "rewards/tree_correctness_reward/std": 0.3503824472427368, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6051973700523376, "sampling/importance_sampling_ratio/min": 2.4915206584065623e-34, "sampling/sampling_logp_difference/max": 77.375, "sampling/sampling_logp_difference/mean": 5.866542339324951, "step": 230 }, { "clip_ratio/high_max": 0.0010066526738228276, "clip_ratio/high_mean": 0.0005033263369114138, "clip_ratio/low_mean": 0.00015318627993110567, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0006565126168425195, "completions/clipped_ratio": 1.0, "completions/max_length": 732.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 723.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 714.0, "completions/min_terminated_length": 0.0, "entropy": 3.006152391433716, "epoch": 16.5, "frac_reward_zero_std": 0.875, "grad_norm": 0.005610933527350426, "kl": 0.028114357963204384, "learning_rate": 2.0172385088197803e-05, "loss": -0.0005, "num_tokens": 18874400.0, "reward": 0.078125, "reward_std": 0.0646936446428299, "rewards/tree_correctness_reward/mean": 0.078125, "rewards/tree_correctness_reward/std": 0.27048972249031067, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.551728367805481, "sampling/importance_sampling_ratio/min": 3.820870344979068e-35, "sampling/sampling_logp_difference/max": 79.25, "sampling/sampling_logp_difference/mean": 6.494020938873291, "step": 231 }, { "clip_ratio/high_max": 0.003667840392154176, "clip_ratio/high_mean": 0.001833920196077088, "clip_ratio/low_mean": 0.0004890453801635886, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002322965574421687, "completions/clipped_ratio": 1.0, "completions/max_length": 714.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 676.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 639.0, "completions/min_terminated_length": 0.0, "entropy": 3.3086784332990646, "epoch": 16.571428571428573, "frac_reward_zero_std": 0.75, "grad_norm": 0.007496510166674852, "kl": 0.02617929526604712, "learning_rate": 1.9973933253645682e-05, "loss": 0.0003, "num_tokens": 18949824.0, "reward": 0.15625, "reward_std": 0.1246790662407875, "rewards/tree_correctness_reward/mean": 0.15625, "rewards/tree_correctness_reward/std": 0.36596253514289856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5065264105796814, "sampling/importance_sampling_ratio/min": 3.1363615599988335e-36, "sampling/sampling_logp_difference/max": 81.75, "sampling/sampling_logp_difference/mean": 7.113087177276611, "step": 232 }, { "clip_ratio/high_max": 0.00855450369999744, "clip_ratio/high_mean": 0.004627812770195305, "clip_ratio/low_mean": 0.004609926982084289, "clip_ratio/low_min": 0.00028003990519209765, "clip_ratio/region_mean": 0.009237739723175764, "completions/clipped_ratio": 1.0, "completions/max_length": 624.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 508.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 392.0, "completions/min_terminated_length": 0.0, "entropy": 1.8649873919785023, "epoch": 16.642857142857142, "frac_reward_zero_std": 0.375, "grad_norm": 0.021938301622867584, "kl": 0.041727349045686424, "learning_rate": 1.9775810924218125e-05, "loss": -0.0034, "num_tokens": 19014464.0, "reward": 0.28125, "reward_std": 0.2619796395301819, "rewards/tree_correctness_reward/mean": 0.28125, "rewards/tree_correctness_reward/std": 0.4531635046005249, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6290148496627808, "sampling/importance_sampling_ratio/min": 2.9756967944000553e-35, "sampling/sampling_logp_difference/max": 79.5, "sampling/sampling_logp_difference/mean": 5.603180885314941, "step": 233 }, { "clip_ratio/high_max": 0.0007900280761532485, "clip_ratio/high_mean": 0.0003950140380766243, "clip_ratio/low_mean": 0.0007900281107140472, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0011850421487906715, "completions/clipped_ratio": 1.0, "completions/max_length": 1009.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 860.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 712.0, "completions/min_terminated_length": 0.0, "entropy": 3.449104383587837, "epoch": 16.714285714285715, "frac_reward_zero_std": 0.875, "grad_norm": 45.21695327758789, "kl": 175.68681081407703, "learning_rate": 1.957803108866481e-05, "loss": 0.0859, "num_tokens": 19101664.0, "reward": 0.015625, "reward_std": 0.04419417306780815, "rewards/tree_correctness_reward/mean": 0.015625, "rewards/tree_correctness_reward/std": 0.125, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5242963433265686, "sampling/importance_sampling_ratio/min": 1.3469032840202626e-33, "sampling/sampling_logp_difference/max": 75.6875, "sampling/sampling_logp_difference/mean": 6.8191633224487305, "step": 234 }, { "clip_ratio/high_max": 0.00106232293182984, "clip_ratio/high_mean": 0.00053116146591492, "clip_ratio/low_mean": 0.0012836402165703475, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0018148016824852675, "completions/clipped_ratio": 1.0, "completions/max_length": 793.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 573.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 353.0, "completions/min_terminated_length": 0.0, "entropy": 1.951963759958744, "epoch": 16.785714285714285, "frac_reward_zero_std": 0.875, "grad_norm": 0.007190134841948748, "kl": 0.04375857312697917, "learning_rate": 1.9380606713281775e-05, "loss": -0.0018, "num_tokens": 19170464.0, "reward": 0.15625, "reward_std": 0.0578637570142746, "rewards/tree_correctness_reward/mean": 0.15625, "rewards/tree_correctness_reward/std": 0.36596253514289856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6573222875595093, "sampling/importance_sampling_ratio/min": 5.559333506405371e-35, "sampling/sampling_logp_difference/max": 78.875, "sampling/sampling_logp_difference/mean": 5.179974555969238, "step": 235 }, { "clip_ratio/high_max": 0.0024379432143177837, "clip_ratio/high_mean": 0.0012189716071588919, "clip_ratio/low_mean": 0.0002659574383869767, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0014849290455458686, "completions/clipped_ratio": 1.0, "completions/max_length": 1128.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 916.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 705.0, "completions/min_terminated_length": 0.0, "entropy": 3.745537295937538, "epoch": 16.857142857142858, "frac_reward_zero_std": 0.875, "grad_norm": 0.005346810910850763, "kl": 0.03170344606041908, "learning_rate": 1.9183550741061354e-05, "loss": 0.0013, "num_tokens": 19261248.0, "reward": 0.109375, "reward_std": 0.04419417306780815, "rewards/tree_correctness_reward/mean": 0.109375, "rewards/tree_correctness_reward/std": 0.3145764470100403, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.48804014921188354, "sampling/importance_sampling_ratio/min": 1.8048514720778033e-35, "sampling/sampling_logp_difference/max": 80.0, "sampling/sampling_logp_difference/mean": 7.326418399810791, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 979.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 799.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 619.0, "completions/min_terminated_length": 0.0, "entropy": 3.654198318719864, "epoch": 16.928571428571427, "frac_reward_zero_std": 1.0, "grad_norm": 0.00045317894546315074, "kl": 0.03085865033790469, "learning_rate": 1.8986876090843667e-05, "loss": 0.0, "num_tokens": 19344512.0, "reward": 0.125, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.125, "rewards/tree_correctness_reward/std": 0.3333333432674408, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.48427289724349976, "sampling/importance_sampling_ratio/min": 1.6408640419787877e-32, "sampling/sampling_logp_difference/max": 73.1875, "sampling/sampling_logp_difference/mean": 7.421727180480957, "step": 237 }, { "clip_ratio/high_max": 0.0034486417280277237, "clip_ratio/high_mean": 0.0017243208640138619, "clip_ratio/low_mean": 0.00029180815909057856, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020161290231044404, "completions/clipped_ratio": 1.0, "completions/max_length": 697.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 643.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 589.0, "completions/min_terminated_length": 0.0, "entropy": 2.7978850603103638, "epoch": 17.0, "frac_reward_zero_std": 0.875, "grad_norm": 0.006244925782084465, "kl": 0.02959261799696833, "learning_rate": 1.879059565646963e-05, "loss": -0.0006, "num_tokens": 19417792.0, "reward": 0.109375, "reward_std": 0.04419417306780815, "rewards/tree_correctness_reward/mean": 0.109375, "rewards/tree_correctness_reward/std": 0.3145764470100403, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.545297384262085, "sampling/importance_sampling_ratio/min": 2.0655430870101624e-34, "sampling/sampling_logp_difference/max": 77.5625, "sampling/sampling_logp_difference/mean": 6.624466896057129, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 660.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 640.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 620.0, "completions/min_terminated_length": 0.0, "entropy": 2.8174508810043335, "epoch": 17.071428571428573, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003795469820033759, "kl": 0.033359252964146435, "learning_rate": 1.859472230593569e-05, "loss": 0.0, "num_tokens": 19490880.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5660125017166138, "sampling/importance_sampling_ratio/min": 1.921254254894722e-35, "sampling/sampling_logp_difference/max": 79.9375, "sampling/sampling_logp_difference/mean": 6.30341911315918, "step": 239 }, { "clip_ratio/high_max": 0.0004101049853488803, "clip_ratio/high_mean": 0.00020505249267444015, "clip_ratio/low_mean": 0.0009842519648373127, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0011893044575117528, "completions/clipped_ratio": 1.0, "completions/max_length": 916.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 839.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 762.0, "completions/min_terminated_length": 0.0, "entropy": 3.8678529262542725, "epoch": 17.142857142857142, "frac_reward_zero_std": 0.875, "grad_norm": 0.006528239231556654, "kl": 0.03439014917239547, "learning_rate": 1.8399268880550174e-05, "loss": 0.0002, "num_tokens": 19576704.0, "reward": 0.015625, "reward_std": 0.04419417306780815, "rewards/tree_correctness_reward/mean": 0.015625, "rewards/tree_correctness_reward/std": 0.125, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.4757308065891266, "sampling/importance_sampling_ratio/min": 2.879805994260296e-32, "sampling/sampling_logp_difference/max": 72.625, "sampling/sampling_logp_difference/mean": 7.478569984436035, "step": 240 }, { "clip_ratio/high_max": 0.0009570572728989646, "clip_ratio/high_mean": 0.0004785286364494823, "clip_ratio/low_mean": 0.0003953062478103675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000873834880621871, "completions/clipped_ratio": 1.0, "completions/max_length": 751.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 726.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 702.0, "completions/min_terminated_length": 0.0, "entropy": 3.035512149333954, "epoch": 17.214285714285715, "frac_reward_zero_std": 0.875, "grad_norm": 0.0059369271621108055, "kl": 0.04879009793512523, "learning_rate": 1.820424819409143e-05, "loss": 0.0009, "num_tokens": 19655328.0, "reward": 0.046875, "reward_std": 0.0646936446428299, "rewards/tree_correctness_reward/mean": 0.046875, "rewards/tree_correctness_reward/std": 0.21304203569889069, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5527477264404297, "sampling/importance_sampling_ratio/min": 9.952343492388037e-33, "sampling/sampling_logp_difference/max": 73.6875, "sampling/sampling_logp_difference/mean": 6.442648410797119, "step": 241 }, { "clip_ratio/high_max": 0.00030424201395362616, "clip_ratio/high_mean": 0.00015212100697681308, "clip_ratio/low_mean": 0.0007171418656071182, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0008692628725839313, "completions/clipped_ratio": 1.0, "completions/max_length": 719.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 700.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 681.0, "completions/min_terminated_length": 0.0, "entropy": 3.2636193484067917, "epoch": 17.285714285714285, "frac_reward_zero_std": 0.875, "grad_norm": 0.0051572625525295734, "kl": 0.03030487464275211, "learning_rate": 1.8009673031967776e-05, "loss": 0.0014, "num_tokens": 19732256.0, "reward": 0.140625, "reward_std": 0.04419417306780815, "rewards/tree_correctness_reward/mean": 0.140625, "rewards/tree_correctness_reward/std": 0.3503824472427368, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5119635462760925, "sampling/importance_sampling_ratio/min": 3.553963389762514e-36, "sampling/sampling_logp_difference/max": 81.625, "sampling/sampling_logp_difference/mean": 7.024777412414551, "step": 242 }, { "clip_ratio/high_max": 0.0007874015718698502, "clip_ratio/high_mean": 0.0003937007859349251, "clip_ratio/low_mean": 0.0003198818885721266, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0007135826745070517, "completions/clipped_ratio": 1.0, "completions/max_length": 635.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 617.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 600.0, "completions/min_terminated_length": 0.0, "entropy": 2.4480661898851395, "epoch": 17.357142857142858, "frac_reward_zero_std": 0.875, "grad_norm": 0.006844939664006233, "kl": 0.03771034558303654, "learning_rate": 1.7815556150379298e-05, "loss": -0.001, "num_tokens": 19803904.0, "reward": 0.0625, "reward_std": 0.06681530922651291, "rewards/tree_correctness_reward/mean": 0.0625, "rewards/tree_correctness_reward/std": 0.24397502839565277, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5931791067123413, "sampling/importance_sampling_ratio/min": 3.589375444066961e-35, "sampling/sampling_logp_difference/max": 79.3125, "sampling/sampling_logp_difference/mean": 5.947538375854492, "step": 243 }, { "clip_ratio/high_max": 0.0008025682182051241, "clip_ratio/high_mean": 0.00040128410910256207, "clip_ratio/low_mean": 0.001554975959152216, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0019562600828066934, "completions/clipped_ratio": 1.0, "completions/max_length": 653.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 638.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 623.0, "completions/min_terminated_length": 0.0, "entropy": 2.5742162540555, "epoch": 17.428571428571427, "frac_reward_zero_std": 0.75, "grad_norm": 0.015347322449088097, "kl": 0.03885702439583838, "learning_rate": 1.7621910275481544e-05, "loss": 0.0011, "num_tokens": 19876864.0, "reward": 0.15625, "reward_std": 0.0883883461356163, "rewards/tree_correctness_reward/mean": 0.15625, "rewards/tree_correctness_reward/std": 0.36596253514289856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5952885150909424, "sampling/importance_sampling_ratio/min": 1.1166220121487572e-33, "sampling/sampling_logp_difference/max": 75.875, "sampling/sampling_logp_difference/mean": 5.9090776443481445, "step": 244 }, { "clip_ratio/high_max": 0.002593959987279959, "clip_ratio/high_mean": 0.0013505820315913297, "clip_ratio/low_mean": 0.0013219732063589618, "clip_ratio/low_min": 0.0002662024708115496, "clip_ratio/region_mean": 0.0026725552379502915, "completions/clipped_ratio": 1.0, "completions/max_length": 593.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 588.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 583.0, "completions/min_terminated_length": 0.0, "entropy": 2.7893759310245514, "epoch": 17.5, "frac_reward_zero_std": 0.625, "grad_norm": 0.009343489073216915, "kl": 0.03707574668806046, "learning_rate": 1.7428748102551237e-05, "loss": -0.0005, "num_tokens": 19946624.0, "reward": 0.140625, "reward_std": 0.1804211586713791, "rewards/tree_correctness_reward/mean": 0.140625, "rewards/tree_correctness_reward/std": 0.3503824472427368, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5488088130950928, "sampling/importance_sampling_ratio/min": 1.2779062104650084e-32, "sampling/sampling_logp_difference/max": 73.4375, "sampling/sampling_logp_difference/mean": 6.57177734375, "step": 245 }, { "clip_ratio/high_max": 0.0026885420411417726, "clip_ratio/high_mean": 0.0014730990678799571, "clip_ratio/low_mean": 0.001895647923447541, "clip_ratio/low_min": 0.00011042402911698446, "clip_ratio/region_mean": 0.003368746949490742, "completions/clipped_ratio": 1.0, "completions/max_length": 849.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 684.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 520.0, "completions/min_terminated_length": 0.0, "entropy": 3.2008379325270653, "epoch": 17.571428571428573, "frac_reward_zero_std": 0.625, "grad_norm": 0.010527659207582474, "kl": 0.04106716765090823, "learning_rate": 1.7236082295153946e-05, "loss": -0.0008, "num_tokens": 20022560.0, "reward": 0.140625, "reward_std": 0.19408093392848969, "rewards/tree_correctness_reward/mean": 0.140625, "rewards/tree_correctness_reward/std": 0.3503824472427368, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5193254351615906, "sampling/importance_sampling_ratio/min": 6.840139179297504e-33, "sampling/sampling_logp_difference/max": 74.0625, "sampling/sampling_logp_difference/mean": 6.953815460205078, "step": 246 }, { "clip_ratio/high_max": 0.003564687503967434, "clip_ratio/high_mean": 0.0018191084527643397, "clip_ratio/low_mean": 0.0005364669268601574, "clip_ratio/low_min": 0.00018382353300694376, "clip_ratio/region_mean": 0.002355575379624497, "completions/clipped_ratio": 1.0, "completions/max_length": 850.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 746.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 642.0, "completions/min_terminated_length": 0.0, "entropy": 3.204658232629299, "epoch": 17.642857142857142, "frac_reward_zero_std": 0.75, "grad_norm": 0.010997961275279522, "kl": 0.03525689570233226, "learning_rate": 1.704392548431391e-05, "loss": 0.0014, "num_tokens": 20102432.0, "reward": 0.171875, "reward_std": 0.12255740165710449, "rewards/tree_correctness_reward/mean": 0.171875, "rewards/tree_correctness_reward/std": 0.38025420904159546, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5360501408576965, "sampling/importance_sampling_ratio/min": 5.614728103716021e-34, "sampling/sampling_logp_difference/max": 76.5625, "sampling/sampling_logp_difference/mean": 6.686817646026611, "step": 247 }, { "clip_ratio/high_max": 0.0007661996351089329, "clip_ratio/high_mean": 0.0003830998175544664, "clip_ratio/low_mean": 0.0006020139990141615, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0009851138238445856, "completions/clipped_ratio": 1.0, "completions/max_length": 902.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 736.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 571.0, "completions/min_terminated_length": 0.0, "entropy": 2.9213257804512978, "epoch": 17.714285714285715, "frac_reward_zero_std": 0.875, "grad_norm": 0.00829508900642395, "kl": 0.04560802865307778, "learning_rate": 1.685229026768593e-05, "loss": -0.0011, "num_tokens": 20181696.0, "reward": 0.1875, "reward_std": 0.06681530922651291, "rewards/tree_correctness_reward/mean": 0.1875, "rewards/tree_correctness_reward/std": 0.39339789748191833, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5693761706352234, "sampling/importance_sampling_ratio/min": 2.3174752027302318e-35, "sampling/sampling_logp_difference/max": 79.75, "sampling/sampling_logp_difference/mean": 6.244451522827148, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 563.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 554.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 546.0, "completions/min_terminated_length": 0.0, "entropy": 2.228823371231556, "epoch": 17.785714285714285, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010893391445279121, "kl": 0.05176317773293704, "learning_rate": 1.666118920872949e-05, "loss": 0.0, "num_tokens": 20249312.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6012574434280396, "sampling/importance_sampling_ratio/min": 4.70115418990159e-33, "sampling/sampling_logp_difference/max": 74.4375, "sampling/sampling_logp_difference/mean": 5.88150691986084, "step": 249 }, { "clip_ratio/high_max": 0.0009328358210041188, "clip_ratio/high_mean": 0.0004664179105020594, "clip_ratio/low_mean": 0.00021200814080657437, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0006784260513086338, "completions/clipped_ratio": 1.0, "completions/max_length": 1908.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1322.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 737.0, "completions/min_terminated_length": 0.0, "entropy": 4.80486074090004, "epoch": 17.857142857142858, "frac_reward_zero_std": 0.875, "grad_norm": 0.005612866021692753, "kl": 0.03284882544539869, "learning_rate": 1.6470634835885097e-05, "loss": -0.0015, "num_tokens": 20366080.0, "reward": 0.078125, "reward_std": 0.0646936446428299, "rewards/tree_correctness_reward/mean": 0.078125, "rewards/tree_correctness_reward/std": 0.27048972249031067, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.35130560398101807, "sampling/importance_sampling_ratio/min": 5.559333506405371e-35, "sampling/sampling_logp_difference/max": 78.875, "sampling/sampling_logp_difference/mean": 9.559228897094727, "step": 250 }, { "epoch": 17.857142857142858, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 468.44, "eval_completions/max_terminated_length": 0.0, "eval_completions/mean_length": 426.72, "eval_completions/mean_terminated_length": 0.0, "eval_completions/min_length": 385.0, "eval_completions/min_terminated_length": 0.0, "eval_entropy": 0.8207133907079697, "eval_frac_reward_zero_std": 0.88, "eval_kl": 0.06911414787173271, "eval_loss": -0.0011072424240410328, "eval_num_tokens": 20366080.0, "eval_reward": 0.06, "eval_reward_std": 0.04680067300796509, "eval_rewards/tree_correctness_reward/mean": 0.06, "eval_rewards/tree_correctness_reward/std": 0.04680067300796509, "eval_runtime": 141.4126, "eval_samples_per_second": 0.177, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 0.8219875144958496, "eval_sampling/importance_sampling_ratio/min": 1.1796555997392654e-29, "eval_sampling/sampling_logp_difference/max": 69.865, "eval_sampling/sampling_logp_difference/mean": 2.8499904680252075, "eval_steps_per_second": 0.028, "step": 250 }, { "clip_ratio/high_max": 0.005798374600999523, "clip_ratio/high_mean": 0.0031066839983395766, "clip_ratio/low_mean": 0.002275746395753231, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0053824304031877546, "completions/clipped_ratio": 1.0, "completions/max_length": 587.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 503.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 420.0, "completions/min_terminated_length": 0.0, "entropy": 2.420454941689968, "epoch": 17.928571428571427, "frac_reward_zero_std": 0.5, "grad_norm": 0.015215441584587097, "kl": 0.04993023374117911, "learning_rate": 1.6280639641752942e-05, "loss": -0.0005, "num_tokens": 20430432.0, "reward": 0.25, "reward_std": 0.1767766922712326, "rewards/tree_correctness_reward/mean": 0.25, "rewards/tree_correctness_reward/std": 0.4364357888698578, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5610599517822266, "sampling/importance_sampling_ratio/min": 1.1166220121487572e-33, "sampling/sampling_logp_difference/max": 75.875, "sampling/sampling_logp_difference/mean": 6.494109153747559, "step": 251 }, { "clip_ratio/high_max": 0.003105983851128258, "clip_ratio/high_mean": 0.001552991925564129, "clip_ratio/low_mean": 0.0001267748484679032, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016797667740320321, "completions/clipped_ratio": 1.0, "completions/max_length": 766.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 629.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 493.0, "completions/min_terminated_length": 0.0, "entropy": 2.5906548723578453, "epoch": 18.0, "frac_reward_zero_std": 0.875, "grad_norm": 0.006621185690164566, "kl": 0.03670844866428524, "learning_rate": 1.6091216082273875e-05, "loss": -0.0, "num_tokens": 20502848.0, "reward": 0.09375, "reward_std": 0.0578637570142746, "rewards/tree_correctness_reward/mean": 0.09375, "rewards/tree_correctness_reward/std": 0.29378482699394226, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5653937458992004, "sampling/importance_sampling_ratio/min": 3.1676128923554724e-35, "sampling/sampling_logp_difference/max": 79.4375, "sampling/sampling_logp_difference/mean": 6.387720108032227, "step": 252 }, { "clip_ratio/high_max": 0.0010660697589628398, "clip_ratio/high_mean": 0.0005330348794814199, "clip_ratio/low_mean": 0.0009428125176782487, "clip_ratio/low_min": 0.000298538405331783, "clip_ratio/region_mean": 0.0014758473971596686, "completions/clipped_ratio": 1.0, "completions/max_length": 877.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 713.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 550.0, "completions/min_terminated_length": 0.0, "entropy": 3.099463775753975, "epoch": 18.071428571428573, "frac_reward_zero_std": 0.75, "grad_norm": 0.010263399221003056, "kl": 0.02882854244671762, "learning_rate": 1.5902376575912815e-05, "loss": -0.0036, "num_tokens": 20580640.0, "reward": 0.203125, "reward_std": 0.12255740165710449, "rewards/tree_correctness_reward/mean": 0.203125, "rewards/tree_correctness_reward/std": 0.40550529956817627, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5375943779945374, "sampling/importance_sampling_ratio/min": 1.6246755378554462e-33, "sampling/sampling_logp_difference/max": 75.5, "sampling/sampling_logp_difference/mean": 6.72574520111084, "step": 253 }, { "clip_ratio/high_max": 0.0008660508319735527, "clip_ratio/high_mean": 0.00043302541598677635, "clip_ratio/low_mean": 0.00041498267819406465, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000848008094180841, "completions/clipped_ratio": 1.0, "completions/max_length": 866.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 733.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 601.0, "completions/min_terminated_length": 0.0, "entropy": 3.1235202103853226, "epoch": 18.142857142857142, "frac_reward_zero_std": 0.875, "grad_norm": 0.009049593470990658, "kl": 0.030866706743836403, "learning_rate": 1.571413350284459e-05, "loss": 0.0002, "num_tokens": 20659712.0, "reward": 0.15625, "reward_std": 0.0578637570142746, "rewards/tree_correctness_reward/mean": 0.15625, "rewards/tree_correctness_reward/std": 0.36596253514289856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5342050790786743, "sampling/importance_sampling_ratio/min": 1.4337711226845796e-33, "sampling/sampling_logp_difference/max": 75.625, "sampling/sampling_logp_difference/mean": 6.754935264587402, "step": 254 }, { "clip_ratio/high_max": 0.0014974778750911355, "clip_ratio/high_mean": 0.0007487389375455678, "clip_ratio/low_mean": 0.00039407314034178853, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0011428120778873563, "completions/clipped_ratio": 1.0, "completions/max_length": 1064.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 928.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 793.0, "completions/min_terminated_length": 0.0, "entropy": 3.992030903697014, "epoch": 18.214285714285715, "frac_reward_zero_std": 0.875, "grad_norm": 0.008095666766166687, "kl": 0.03411577420774847, "learning_rate": 1.552649920414233e-05, "loss": 0.0001, "num_tokens": 20751264.0, "reward": 0.046875, "reward_std": 0.0646936446428299, "rewards/tree_correctness_reward/mean": 0.046875, "rewards/tree_correctness_reward/std": 0.21304203569889069, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.4695257246494293, "sampling/importance_sampling_ratio/min": 7.2812904542176e-33, "sampling/sampling_logp_difference/max": 74.0, "sampling/sampling_logp_difference/mean": 7.599581718444824, "step": 255 }, { "clip_ratio/high_max": 0.004049912444315851, "clip_ratio/high_mean": 0.0020249562221579254, "clip_ratio/low_mean": 0.0002189141814596951, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0022438704036176205, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1309.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 571.0, "completions/min_terminated_length": 0.0, "entropy": 3.9451248347759247, "epoch": 18.285714285714285, "frac_reward_zero_std": 0.875, "grad_norm": 0.005534321069717407, "kl": 0.0462150638923049, "learning_rate": 1.5339485980968382e-05, "loss": 0.0009, "num_tokens": 20867200.0, "reward": 0.109375, "reward_std": 0.04419417306780815, "rewards/tree_correctness_reward/mean": 0.109375, "rewards/tree_correctness_reward/std": 0.3145764470100403, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.44333183765411377, "sampling/importance_sampling_ratio/min": 3.625141007634431e-34, "sampling/sampling_logp_difference/max": 77.0, "sampling/sampling_logp_difference/mean": 8.348424911499023, "step": 256 }, { "clip_ratio/high_max": 0.0013950892825960182, "clip_ratio/high_mean": 0.0006975446412980091, "clip_ratio/low_mean": 0.00029894770341343246, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0009964923519873992, "completions/clipped_ratio": 1.0, "completions/max_length": 784.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 707.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 630.0, "completions/min_terminated_length": 0.0, "entropy": 2.611161358654499, "epoch": 18.357142857142858, "frac_reward_zero_std": 0.875, "grad_norm": 0.007340341340750456, "kl": 0.04326492524705827, "learning_rate": 1.5153106093767827e-05, "loss": 0.0002, "num_tokens": 20944576.0, "reward": 0.078125, "reward_std": 0.0646936446428299, "rewards/tree_correctness_reward/mean": 0.078125, "rewards/tree_correctness_reward/std": 0.27048972249031067, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6070224046707153, "sampling/importance_sampling_ratio/min": 6.425715707179229e-33, "sampling/sampling_logp_difference/max": 74.125, "sampling/sampling_logp_difference/mean": 5.728065490722656, "step": 257 }, { "clip_ratio/high_max": 0.0032152959320228547, "clip_ratio/high_mean": 0.0016076479660114273, "clip_ratio/low_mean": 0.0014035044951015152, "clip_ratio/low_min": 0.0001931993756443262, "clip_ratio/region_mean": 0.0030111524611129425, "completions/clipped_ratio": 1.0, "completions/max_length": 662.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 654.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 647.0, "completions/min_terminated_length": 0.0, "entropy": 2.7342216670513153, "epoch": 18.428571428571427, "frac_reward_zero_std": 0.75, "grad_norm": 0.010338971391320229, "kl": 0.05564419936854392, "learning_rate": 1.4967371761464738e-05, "loss": -0.0006, "num_tokens": 21018592.0, "reward": 0.0625, "reward_std": 0.10888782143592834, "rewards/tree_correctness_reward/mean": 0.0625, "rewards/tree_correctness_reward/std": 0.24397502839565277, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5738038420677185, "sampling/importance_sampling_ratio/min": 8.169380844367654e-34, "sampling/sampling_logp_difference/max": 76.1875, "sampling/sampling_logp_difference/mean": 6.187374114990234, "step": 258 }, { "clip_ratio/high_max": 0.010041291126981378, "clip_ratio/high_mean": 0.005196207377593964, "clip_ratio/low_mean": 0.0022009017411619425, "clip_ratio/low_min": 0.00046921923058107495, "clip_ratio/region_mean": 0.007397109089652076, "completions/clipped_ratio": 1.0, "completions/max_length": 1332.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 888.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 445.0, "completions/min_terminated_length": 0.0, "entropy": 3.7084835171699524, "epoch": 18.5, "frac_reward_zero_std": 0.75, "grad_norm": 0.02113219164311886, "kl": 0.04568651271983981, "learning_rate": 1.4782295160661103e-05, "loss": -0.0002, "num_tokens": 21107584.0, "reward": 0.109375, "reward_std": 0.10205793380737305, "rewards/tree_correctness_reward/mean": 0.109375, "rewards/tree_correctness_reward/std": 0.3145764470100403, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.45385921001434326, "sampling/importance_sampling_ratio/min": 2.879805994260296e-32, "sampling/sampling_logp_difference/max": 72.625, "sampling/sampling_logp_difference/mean": 7.974471569061279, "step": 259 }, { "clip_ratio/high_max": 0.0030655197479063645, "clip_ratio/high_mean": 0.0015327598739531823, "clip_ratio/low_mean": 0.00102946558399708, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0025622254725021776, "completions/clipped_ratio": 1.0, "completions/max_length": 706.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 694.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 683.0, "completions/min_terminated_length": 0.0, "entropy": 3.344369515776634, "epoch": 18.571428571428573, "frac_reward_zero_std": 0.75, "grad_norm": 0.019802767783403397, "kl": 0.07121360301971436, "learning_rate": 1.4597888424838518e-05, "loss": 0.0011, "num_tokens": 21184160.0, "reward": 0.109375, "reward_std": 0.10205793380737305, "rewards/tree_correctness_reward/mean": 0.109375, "rewards/tree_correctness_reward/std": 0.3145764470100403, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.49332767724990845, "sampling/importance_sampling_ratio/min": 8.696261289635204e-34, "sampling/sampling_logp_difference/max": 76.125, "sampling/sampling_logp_difference/mean": 7.31996488571167, "step": 260 }, { "clip_ratio/high_max": 0.00431964844756294, "clip_ratio/high_mean": 0.00215982422378147, "clip_ratio/low_mean": 0.0015103062705748016, "clip_ratio/low_min": 0.0004672897048294544, "clip_ratio/region_mean": 0.0036701304925372824, "completions/clipped_ratio": 1.0, "completions/max_length": 714.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 624.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 535.0, "completions/min_terminated_length": 0.0, "entropy": 2.6481492668390274, "epoch": 18.642857142857142, "frac_reward_zero_std": 0.625, "grad_norm": 0.012824798002839088, "kl": 0.04823589907027781, "learning_rate": 1.4414163643562755e-05, "loss": 0.0017, "num_tokens": 21256256.0, "reward": 0.203125, "reward_std": 0.1804211586713791, "rewards/tree_correctness_reward/mean": 0.203125, "rewards/tree_correctness_reward/std": 0.40550529956817627, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5638168454170227, "sampling/importance_sampling_ratio/min": 5.004353121229029e-33, "sampling/sampling_logp_difference/max": 74.375, "sampling/sampling_logp_difference/mean": 6.367963790893555, "step": 261 }, { "clip_ratio/high_max": 0.000723007702617906, "clip_ratio/high_mean": 0.000361503851308953, "clip_ratio/low_mean": 0.001124678678024793, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001486182529333746, "completions/clipped_ratio": 1.0, "completions/max_length": 623.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 506.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 389.0, "completions/min_terminated_length": 0.0, "entropy": 1.9481540471315384, "epoch": 18.714285714285715, "frac_reward_zero_std": 0.875, "grad_norm": 0.006499668583273888, "kl": 0.04095157841220498, "learning_rate": 1.4231132861691126e-05, "loss": 0.001, "num_tokens": 21320768.0, "reward": 0.15625, "reward_std": 0.0578637570142746, "rewards/tree_correctness_reward/mean": 0.15625, "rewards/tree_correctness_reward/std": 0.36596253514289856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6282984614372253, "sampling/importance_sampling_ratio/min": 4.70115418990159e-33, "sampling/sampling_logp_difference/max": 74.4375, "sampling/sampling_logp_difference/mean": 5.606186866760254, "step": 262 }, { "clip_ratio/high_max": 0.0014656616403954104, "clip_ratio/high_mean": 0.0007328308201977052, "clip_ratio/low_mean": 0.0011254187775193714, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0018582495977170765, "completions/clipped_ratio": 1.0, "completions/max_length": 597.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 591.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 585.0, "completions/min_terminated_length": 0.0, "entropy": 2.2110106348991394, "epoch": 18.785714285714285, "frac_reward_zero_std": 0.75, "grad_norm": 0.009593951515853405, "kl": 0.042904777568764985, "learning_rate": 1.4048808078582942e-05, "loss": -0.0022, "num_tokens": 21390720.0, "reward": 0.0625, "reward_std": 0.10888782143592834, "rewards/tree_correctness_reward/mean": 0.0625, "rewards/tree_correctness_reward/std": 0.24397502839565277, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6157020330429077, "sampling/importance_sampling_ratio/min": 9.854154449263851e-34, "sampling/sampling_logp_difference/max": 76.0, "sampling/sampling_logp_difference/mean": 5.671713352203369, "step": 263 }, { "clip_ratio/high_max": 0.0033437500533182174, "clip_ratio/high_mean": 0.0016718750266591087, "clip_ratio/low_mean": 0.0030624999963038135, "clip_ratio/low_min": 0.0003551136323949322, "clip_ratio/region_mean": 0.004734374921099516, "completions/clipped_ratio": 1.0, "completions/max_length": 440.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 407.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 375.0, "completions/min_terminated_length": 0.0, "entropy": 1.1100949253886938, "epoch": 18.857142857142858, "frac_reward_zero_std": 0.625, "grad_norm": 0.0142102325335145, "kl": 0.05439308285713196, "learning_rate": 1.3867201247312697e-05, "loss": 0.0014, "num_tokens": 21448928.0, "reward": 0.234375, "reward_std": 0.1804211586713791, "rewards/tree_correctness_reward/mean": 0.234375, "rewards/tree_correctness_reward/std": 0.42695629596710205, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.724484920501709, "sampling/importance_sampling_ratio/min": 7.2812904542176e-33, "sampling/sampling_logp_difference/max": 74.0, "sampling/sampling_logp_difference/mean": 4.338912487030029, "step": 264 }, { "clip_ratio/high_max": 0.0019009725074283779, "clip_ratio/high_mean": 0.0012101161701139063, "clip_ratio/low_mean": 0.0008616780614829622, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020717942315968685, "completions/clipped_ratio": 1.0, "completions/max_length": 792.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 727.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 662.0, "completions/min_terminated_length": 0.0, "entropy": 3.591612473130226, "epoch": 18.928571428571427, "frac_reward_zero_std": 0.75, "grad_norm": 0.007618461735546589, "kl": 0.03576836397405714, "learning_rate": 1.368632427388653e-05, "loss": 0.0011, "num_tokens": 21527584.0, "reward": 0.15625, "reward_std": 0.10888782143592834, "rewards/tree_correctness_reward/mean": 0.15625, "rewards/tree_correctness_reward/std": 0.36596253514289856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.47957873344421387, "sampling/importance_sampling_ratio/min": 9.854154449263851e-34, "sampling/sampling_logp_difference/max": 76.0, "sampling/sampling_logp_difference/mean": 7.486452102661133, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1204.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 953.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 703.0, "completions/min_terminated_length": 0.0, "entropy": 3.7803999334573746, "epoch": 19.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004085962427780032, "kl": 0.0365709497127682, "learning_rate": 1.3506189016461673e-05, "loss": 0.0, "num_tokens": 21620736.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.48384690284729004, "sampling/importance_sampling_ratio/min": 3.231053036772405e-33, "sampling/sampling_logp_difference/max": 74.8125, "sampling/sampling_logp_difference/mean": 7.417416572570801, "step": 266 }, { "clip_ratio/high_max": 0.0006960726714169141, "clip_ratio/high_mean": 0.00034803633570845705, "clip_ratio/low_mean": 0.000512895654537715, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0008609319902461721, "completions/clipped_ratio": 1.0, "completions/max_length": 853.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 766.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 680.0, "completions/min_terminated_length": 0.0, "entropy": 3.0617028027772903, "epoch": 19.071428571428573, "frac_reward_zero_std": 0.875, "grad_norm": 0.08250057697296143, "kl": 0.5407702573575079, "learning_rate": 1.3326807284568984e-05, "loss": -0.0005, "num_tokens": 21701920.0, "reward": 0.046875, "reward_std": 0.0646936446428299, "rewards/tree_correctness_reward/mean": 0.046875, "rewards/tree_correctness_reward/std": 0.21304203569889069, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5684400200843811, "sampling/importance_sampling_ratio/min": 1.3917519671963701e-36, "sampling/sampling_logp_difference/max": 82.5625, "sampling/sampling_logp_difference/mean": 6.251105308532715, "step": 267 }, { "clip_ratio/high_max": 0.005201274456339888, "clip_ratio/high_mean": 0.002699217679037247, "clip_ratio/low_mean": 0.0013534120953409001, "clip_ratio/low_min": 0.00023148147738538682, "clip_ratio/region_mean": 0.004052629788930062, "completions/clipped_ratio": 1.0, "completions/max_length": 810.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 722.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 634.0, "completions/min_terminated_length": 0.0, "entropy": 3.0990201085805893, "epoch": 19.142857142857142, "frac_reward_zero_std": 0.625, "grad_norm": 0.017006784677505493, "kl": 0.04612200893461704, "learning_rate": 1.3148190838338803e-05, "loss": 0.0018, "num_tokens": 21780256.0, "reward": 0.203125, "reward_std": 0.17358146607875824, "rewards/tree_correctness_reward/mean": 0.203125, "rewards/tree_correctness_reward/std": 0.40550529956817627, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5479715466499329, "sampling/importance_sampling_ratio/min": 3.8973941318215115e-33, "sampling/sampling_logp_difference/max": 74.625, "sampling/sampling_logp_difference/mean": 6.543178558349609, "step": 268 }, { "clip_ratio/high_max": 0.0032632743241265416, "clip_ratio/high_mean": 0.0016316371620632708, "clip_ratio/low_mean": 0.0016316371620632708, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0032632743241265416, "completions/clipped_ratio": 1.0, "completions/max_length": 880.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 722.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 565.0, "completions/min_terminated_length": 0.0, "entropy": 3.161627918481827, "epoch": 19.214285714285715, "frac_reward_zero_std": 0.75, "grad_norm": 0.008171661756932735, "kl": 0.039095761720091105, "learning_rate": 1.2970351387729873e-05, "loss": 0.0, "num_tokens": 21858624.0, "reward": 0.21875, "reward_std": 0.10888782143592834, "rewards/tree_correctness_reward/mean": 0.21875, "rewards/tree_correctness_reward/std": 0.4166666865348816, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5364316701889038, "sampling/importance_sampling_ratio/min": 2.8513940753957304e-33, "sampling/sampling_logp_difference/max": 74.9375, "sampling/sampling_logp_difference/mean": 6.718721389770508, "step": 269 }, { "clip_ratio/high_max": 0.00012617766333278269, "clip_ratio/high_mean": 6.308883166639134e-05, "clip_ratio/low_mean": 0.0006729475062456913, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0007360363306361251, "completions/clipped_ratio": 1.0, "completions/max_length": 743.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 715.0, "completions/min_terminated_length": 0.0, "entropy": 3.2086858451366425, "epoch": 19.285714285714285, "frac_reward_zero_std": 0.875, "grad_norm": 0.010293475352227688, "kl": 0.030657074064947665, "learning_rate": 1.2793300591761742e-05, "loss": 0.0011, "num_tokens": 21937408.0, "reward": 0.015625, "reward_std": 0.04419417306780815, "rewards/tree_correctness_reward/mean": 0.015625, "rewards/tree_correctness_reward/std": 0.125, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5357067584991455, "sampling/importance_sampling_ratio/min": 5.670674379810965e-33, "sampling/sampling_logp_difference/max": 74.25, "sampling/sampling_logp_difference/mean": 6.714910507202148, "step": 270 }, { "clip_ratio/high_max": 0.00038343558844644576, "clip_ratio/high_mean": 0.00019171779422322288, "clip_ratio/low_mean": 0.0005032591980125289, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0006949769922357518, "completions/clipped_ratio": 1.0, "completions/max_length": 656.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 654.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 652.0, "completions/min_terminated_length": 0.0, "entropy": 3.0782003700733185, "epoch": 19.357142857142858, "frac_reward_zero_std": 0.875, "grad_norm": 0.006098453886806965, "kl": 0.028177522821351886, "learning_rate": 1.2617050057750322e-05, "loss": 0.001, "num_tokens": 22011392.0, "reward": 0.03125, "reward_std": 0.0578637570142746, "rewards/tree_correctness_reward/mean": 0.03125, "rewards/tree_correctness_reward/std": 0.17536810040473938, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5166208148002625, "sampling/importance_sampling_ratio/min": 2.363888833789888e-33, "sampling/sampling_logp_difference/max": 75.125, "sampling/sampling_logp_difference/mean": 6.9937357902526855, "step": 271 }, { "clip_ratio/high_max": 0.0006933438744454179, "clip_ratio/high_mean": 0.00034667193722270895, "clip_ratio/low_mean": 0.000544770184205845, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000891442121428554, "completions/clipped_ratio": 1.0, "completions/max_length": 1014.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 822.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 631.0, "completions/min_terminated_length": 0.0, "entropy": 2.975179485976696, "epoch": 19.428571428571427, "frac_reward_zero_std": 0.875, "grad_norm": 0.013186449185013771, "kl": 0.03623701829928905, "learning_rate": 1.2441611340546957e-05, "loss": 0.002, "num_tokens": 22096160.0, "reward": 0.046875, "reward_std": 0.0646936446428299, "rewards/tree_correctness_reward/mean": 0.046875, "rewards/tree_correctness_reward/std": 0.21304203569889069, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5789433717727661, "sampling/importance_sampling_ratio/min": 2.3874431324522758e-32, "sampling/sampling_logp_difference/max": 72.8125, "sampling/sampling_logp_difference/mean": 6.083719253540039, "step": 272 }, { "clip_ratio/high_max": 0.00349262791860383, "clip_ratio/high_mean": 0.0018312324318685569, "clip_ratio/low_mean": 0.002986773873999482, "clip_ratio/low_min": 0.0005095108645036817, "clip_ratio/region_mean": 0.004818006305868039, "completions/clipped_ratio": 1.0, "completions/max_length": 418.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 393.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 368.0, "completions/min_terminated_length": 0.0, "entropy": 1.1033034231513739, "epoch": 19.5, "frac_reward_zero_std": 0.625, "grad_norm": 0.015344452112913132, "kl": 0.0404674286255613, "learning_rate": 1.2266995941780934e-05, "loss": -0.004, "num_tokens": 22153440.0, "reward": 0.15625, "reward_std": 0.16675157845020294, "rewards/tree_correctness_reward/mean": 0.15625, "rewards/tree_correctness_reward/std": 0.36596253514289856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7032126188278198, "sampling/importance_sampling_ratio/min": 3.231053036772405e-33, "sampling/sampling_logp_difference/max": 74.8125, "sampling/sampling_logp_difference/mean": 4.69811487197876, "step": 273 }, { "clip_ratio/high_max": 0.0011652542161755264, "clip_ratio/high_mean": 0.0005826271080877632, "clip_ratio/low_mean": 0.000503177954669809, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001085805066395551, "completions/clipped_ratio": 1.0, "completions/max_length": 622.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 606.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 590.0, "completions/min_terminated_length": 0.0, "entropy": 2.400308296084404, "epoch": 19.571428571428573, "frac_reward_zero_std": 0.875, "grad_norm": 0.009512593038380146, "kl": 0.034496058942750096, "learning_rate": 1.2093215309105352e-05, "loss": -0.0006, "num_tokens": 22224352.0, "reward": 0.171875, "reward_std": 0.0646936446428299, "rewards/tree_correctness_reward/mean": 0.171875, "rewards/tree_correctness_reward/std": 0.38025420904159546, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6120572090148926, "sampling/importance_sampling_ratio/min": 3.439437695788472e-33, "sampling/sampling_logp_difference/max": 74.75, "sampling/sampling_logp_difference/mean": 5.70483922958374, "step": 274 }, { "clip_ratio/high_max": 0.002912783034844324, "clip_ratio/high_mean": 0.001456391517422162, "clip_ratio/low_mean": 0.00014563915283360984, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016020306702557718, "completions/clipped_ratio": 1.0, "completions/max_length": 814.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 782.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 751.0, "completions/min_terminated_length": 0.0, "entropy": 4.118153899908066, "epoch": 19.642857142857142, "frac_reward_zero_std": 0.875, "grad_norm": 0.005975057836622, "kl": 0.03213500522542745, "learning_rate": 1.1920280835446748e-05, "loss": 0.0002, "num_tokens": 22306560.0, "reward": 0.09375, "reward_std": 0.0578637570142746, "rewards/tree_correctness_reward/mean": 0.09375, "rewards/tree_correctness_reward/std": 0.29378482699394226, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.422539621591568, "sampling/importance_sampling_ratio/min": 1.0489692816875613e-33, "sampling/sampling_logp_difference/max": 75.9375, "sampling/sampling_logp_difference/mean": 8.268775939941406, "step": 275 }, { "clip_ratio/high_max": 0.0018601191040943377, "clip_ratio/high_mean": 0.0009300595520471688, "clip_ratio/low_mean": 0.0002325148889212869, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0011625744409684557, "completions/clipped_ratio": 1.0, "completions/max_length": 709.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 690.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 672.0, "completions/min_terminated_length": 0.0, "entropy": 3.123047910630703, "epoch": 19.714285714285715, "frac_reward_zero_std": 0.875, "grad_norm": 0.005299247335642576, "kl": 0.03396000841166824, "learning_rate": 1.1748203858258055e-05, "loss": -0.0006, "num_tokens": 22382880.0, "reward": 0.078125, "reward_std": 0.0646936446428299, "rewards/tree_correctness_reward/mean": 0.078125, "rewards/tree_correctness_reward/std": 0.27048972249031067, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.526711106300354, "sampling/importance_sampling_ratio/min": 8.610464551321452e-35, "sampling/sampling_logp_difference/max": 78.4375, "sampling/sampling_logp_difference/mean": 6.829362392425537, "step": 276 }, { "clip_ratio/high_max": 0.007936845770018408, "clip_ratio/high_mean": 0.00406901299902529, "clip_ratio/low_mean": 0.0009110438513744157, "clip_ratio/low_min": 3.843788363155909e-05, "clip_ratio/region_mean": 0.004980056839485769, "completions/clipped_ratio": 1.0, "completions/max_length": 813.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 639.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 466.0, "completions/min_terminated_length": 0.0, "entropy": 2.755197837948799, "epoch": 19.785714285714285, "frac_reward_zero_std": 0.625, "grad_norm": 0.009685874916613102, "kl": 0.031454625190235674, "learning_rate": 1.1576995658775405e-05, "loss": 0.003, "num_tokens": 22455936.0, "reward": 0.25, "reward_std": 0.1825428307056427, "rewards/tree_correctness_reward/mean": 0.25, "rewards/tree_correctness_reward/std": 0.4364357888698578, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5593308806419373, "sampling/importance_sampling_ratio/min": 4.148754129619672e-33, "sampling/sampling_logp_difference/max": 74.5625, "sampling/sampling_logp_difference/mean": 6.467427730560303, "step": 277 }, { "clip_ratio/high_max": 0.003713348793098703, "clip_ratio/high_mean": 0.0018566743965493515, "clip_ratio/low_mean": 0.00012056327250320464, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001977237669052556, "completions/clipped_ratio": 1.0, "completions/max_length": 688.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 668.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 648.0, "completions/min_terminated_length": 0.0, "entropy": 3.3185091987252235, "epoch": 19.857142857142858, "frac_reward_zero_std": 0.875, "grad_norm": 0.004906622227281332, "kl": 0.03457023319788277, "learning_rate": 1.140666746127854e-05, "loss": -0.0009, "num_tokens": 22530816.0, "reward": 0.234375, "reward_std": 0.04419417306780815, "rewards/tree_correctness_reward/mean": 0.234375, "rewards/tree_correctness_reward/std": 0.42695629596710205, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.48349207639694214, "sampling/importance_sampling_ratio/min": 1.3469032840202626e-33, "sampling/sampling_logp_difference/max": 75.6875, "sampling/sampling_logp_difference/mean": 7.440810680389404, "step": 278 }, { "clip_ratio/high_max": 0.0025532420841045678, "clip_ratio/high_mean": 0.0012766210420522839, "clip_ratio/low_mean": 0.001292631806791178, "clip_ratio/low_min": 0.0001050420178216882, "clip_ratio/region_mean": 0.0025692528524814406, "completions/clipped_ratio": 1.0, "completions/max_length": 743.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 669.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 595.0, "completions/min_terminated_length": 0.0, "entropy": 2.8970746994018555, "epoch": 19.928571428571427, "frac_reward_zero_std": 0.75, "grad_norm": 0.014264358207583427, "kl": 0.0326030032010749, "learning_rate": 1.1237230432354912e-05, "loss": -0.0013, "num_tokens": 22605760.0, "reward": 0.0625, "reward_std": 0.1157275140285492, "rewards/tree_correctness_reward/mean": 0.0625, "rewards/tree_correctness_reward/std": 0.24397502839565277, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5533453226089478, "sampling/importance_sampling_ratio/min": 1.1166220121487572e-33, "sampling/sampling_logp_difference/max": 75.875, "sampling/sampling_logp_difference/mean": 6.496923446655273, "step": 279 }, { "clip_ratio/high_max": 0.0005796796176582575, "clip_ratio/high_mean": 0.00028983980882912874, "clip_ratio/low_mean": 0.00131745362887159, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016072934377007186, "completions/clipped_ratio": 1.0, "completions/max_length": 943.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 768.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 593.0, "completions/min_terminated_length": 0.0, "entropy": 3.2724734991788864, "epoch": 20.0, "frac_reward_zero_std": 0.875, "grad_norm": 0.009037709794938564, "kl": 0.03321816318202764, "learning_rate": 1.1068695680167664e-05, "loss": 0.0005, "num_tokens": 22687040.0, "reward": 0.015625, "reward_std": 0.04419417306780815, "rewards/tree_correctness_reward/mean": 0.015625, "rewards/tree_correctness_reward/std": 0.125, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5147531032562256, "sampling/importance_sampling_ratio/min": 1.3336148713971936e-34, "sampling/sampling_logp_difference/max": 78.0, "sampling/sampling_logp_difference/mean": 7.004648208618164, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 810.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 725.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 640.0, "completions/min_terminated_length": 0.0, "entropy": 3.1744881719350815, "epoch": 20.071428571428573, "frac_reward_zero_std": 1.0, "grad_norm": 0.00035028057754971087, "kl": 0.03308352699968964, "learning_rate": 1.0901074253727336e-05, "loss": 0.0, "num_tokens": 22765568.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5393441915512085, "sampling/importance_sampling_ratio/min": 6.29955056431282e-35, "sampling/sampling_logp_difference/max": 78.75, "sampling/sampling_logp_difference/mean": 6.6588897705078125, "step": 281 }, { "clip_ratio/high_max": 0.0004562043759506196, "clip_ratio/high_mean": 0.0002281021879753098, "clip_ratio/low_mean": 0.0012925791015732102, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00152068128954852, "completions/clipped_ratio": 1.0, "completions/max_length": 663.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 537.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 411.0, "completions/min_terminated_length": 0.0, "entropy": 2.085183121263981, "epoch": 20.142857142857142, "frac_reward_zero_std": 0.875, "grad_norm": 0.029463816434144974, "kl": 0.03980205627158284, "learning_rate": 1.0734377142167549e-05, "loss": -0.0011, "num_tokens": 22832064.0, "reward": 0.03125, "reward_std": 0.0578637570142746, "rewards/tree_correctness_reward/mean": 0.03125, "rewards/tree_correctness_reward/std": 0.17536810040473938, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6277563571929932, "sampling/importance_sampling_ratio/min": 7.209453331209461e-34, "sampling/sampling_logp_difference/max": 76.3125, "sampling/sampling_logp_difference/mean": 5.595057487487793, "step": 282 }, { "clip_ratio/high_max": 0.00018447461479809135, "clip_ratio/high_mean": 9.223730739904568e-05, "clip_ratio/low_mean": 0.0004980814628652297, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0005903187666262966, "completions/clipped_ratio": 1.0, "completions/max_length": 847.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 802.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 757.0, "completions/min_terminated_length": 0.0, "entropy": 3.287946864962578, "epoch": 20.214285714285715, "frac_reward_zero_std": 0.875, "grad_norm": 0.005722363945096731, "kl": 0.038288781652227044, "learning_rate": 1.0568615274024522e-05, "loss": -0.0012, "num_tokens": 22915520.0, "reward": 0.015625, "reward_std": 0.04419417306780815, "rewards/tree_correctness_reward/mean": 0.015625, "rewards/tree_correctness_reward/std": 0.125, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5395029783248901, "sampling/importance_sampling_ratio/min": 1.405619611404976e-35, "sampling/sampling_logp_difference/max": 80.25, "sampling/sampling_logp_difference/mean": 6.6403608322143555, "step": 283 }, { "clip_ratio/high_max": 0.0012828407270717435, "clip_ratio/high_mean": 0.0006414203635358717, "clip_ratio/low_mean": 0.00023091133152775, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0008723316950636217, "completions/clipped_ratio": 1.0, "completions/max_length": 799.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 704.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 609.0, "completions/min_terminated_length": 0.0, "entropy": 3.3953690975904465, "epoch": 20.285714285714285, "frac_reward_zero_std": 0.875, "grad_norm": 0.007122804410755634, "kl": 0.03260667261201888, "learning_rate": 1.0403799516520618e-05, "loss": -0.0009, "num_tokens": 22992704.0, "reward": 0.0625, "reward_std": 0.06681530922651291, "rewards/tree_correctness_reward/mean": 0.0625, "rewards/tree_correctness_reward/std": 0.24397502839565277, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.49202537536621094, "sampling/importance_sampling_ratio/min": 3.858942701415657e-34, "sampling/sampling_logp_difference/max": 76.9375, "sampling/sampling_logp_difference/mean": 7.344038009643555, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 859.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 679.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 499.0, "completions/min_terminated_length": 0.0, "entropy": 2.999907076358795, "epoch": 20.357142857142858, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003695541527122259, "kl": 0.034540616907179356, "learning_rate": 1.0239940674851941e-05, "loss": 0.0, "num_tokens": 23068288.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.537083625793457, "sampling/importance_sampling_ratio/min": 6.772654783300765e-34, "sampling/sampling_logp_difference/max": 76.375, "sampling/sampling_logp_difference/mean": 6.765527725219727, "step": 285 }, { "clip_ratio/high_max": 0.000978915166342631, "clip_ratio/high_mean": 0.0004894575831713155, "clip_ratio/low_mean": 0.0028041578698321246, "clip_ratio/low_min": 0.00013746334298048168, "clip_ratio/region_mean": 0.0032936154602793977, "completions/clipped_ratio": 1.0, "completions/max_length": 682.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 559.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 437.0, "completions/min_terminated_length": 0.0, "entropy": 2.1022243797779083, "epoch": 20.428571428571427, "frac_reward_zero_std": 0.75, "grad_norm": 0.024060776457190514, "kl": 0.04247980774380267, "learning_rate": 1.0077049491479872e-05, "loss": 0.0019, "num_tokens": 23136224.0, "reward": 0.03125, "reward_std": 0.0883883461356163, "rewards/tree_correctness_reward/mean": 0.03125, "rewards/tree_correctness_reward/std": 0.17536810040473938, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6398518085479736, "sampling/importance_sampling_ratio/min": 1.4337711226845796e-33, "sampling/sampling_logp_difference/max": 75.625, "sampling/sampling_logp_difference/mean": 5.35186767578125, "step": 286 }, { "clip_ratio/high_max": 0.004089365971594816, "clip_ratio/high_mean": 0.0023435960447386606, "clip_ratio/low_mean": 0.00026143769355257973, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002605033745567198, "completions/clipped_ratio": 1.0, "completions/max_length": 794.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 684.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 575.0, "completions/min_terminated_length": 0.0, "entropy": 2.6268244683742523, "epoch": 20.5, "frac_reward_zero_std": 0.75, "grad_norm": 0.008614555932581425, "kl": 0.03104138607159257, "learning_rate": 9.915136645426884e-06, "loss": -0.0018, "num_tokens": 23212160.0, "reward": 0.21875, "reward_std": 0.0883883461356163, "rewards/tree_correctness_reward/mean": 0.21875, "rewards/tree_correctness_reward/std": 0.4166666865348816, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5935471057891846, "sampling/importance_sampling_ratio/min": 2.466939338135878e-35, "sampling/sampling_logp_difference/max": 79.6875, "sampling/sampling_logp_difference/mean": 5.921877384185791, "step": 287 }, { "clip_ratio/high_max": 0.003439022613747511, "clip_ratio/high_mean": 0.0017434393303119577, "clip_ratio/low_mean": 0.0017151172887679422, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034585566190799, "completions/clipped_ratio": 1.0, "completions/max_length": 653.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 601.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 550.0, "completions/min_terminated_length": 0.0, "entropy": 2.502801187336445, "epoch": 20.571428571428573, "frac_reward_zero_std": 0.625, "grad_norm": 0.019848058000206947, "kl": 0.02786860184278339, "learning_rate": 9.754212751576386e-06, "loss": 0.0007, "num_tokens": 23282784.0, "reward": 0.1875, "reward_std": 0.1825428307056427, "rewards/tree_correctness_reward/mean": 0.1875, "rewards/tree_correctness_reward/std": 0.39339789748191833, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5774905681610107, "sampling/importance_sampling_ratio/min": 3.1676128923554724e-35, "sampling/sampling_logp_difference/max": 79.4375, "sampling/sampling_logp_difference/mean": 6.192251205444336, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 726.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 722.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 718.0, "completions/min_terminated_length": 0.0, "entropy": 2.838571324944496, "epoch": 20.642857142857142, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003423626476433128, "kl": 0.03331817011348903, "learning_rate": 9.594288359976817e-06, "loss": 0.0, "num_tokens": 23361120.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5795115232467651, "sampling/importance_sampling_ratio/min": 7.674422876742102e-34, "sampling/sampling_logp_difference/max": 76.25, "sampling/sampling_logp_difference/mean": 6.088791847229004, "step": 289 }, { "clip_ratio/high_max": 0.0023177070761448704, "clip_ratio/high_mean": 0.0012327305848884862, "clip_ratio/low_mean": 0.0010867118326132186, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0023194424757093657, "completions/clipped_ratio": 1.0, "completions/max_length": 846.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 721.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 596.0, "completions/min_terminated_length": 0.0, "entropy": 3.3964425772428513, "epoch": 20.714285714285715, "frac_reward_zero_std": 0.625, "grad_norm": 0.014369925484061241, "kl": 0.028744567651301622, "learning_rate": 9.43537395515003e-06, "loss": -0.001, "num_tokens": 23439392.0, "reward": 0.3125, "reward_std": 0.16675157845020294, "rewards/tree_correctness_reward/mean": 0.3125, "rewards/tree_correctness_reward/std": 0.467176616191864, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.49325865507125854, "sampling/importance_sampling_ratio/min": 1.921254254894722e-35, "sampling/sampling_logp_difference/max": 79.9375, "sampling/sampling_logp_difference/mean": 7.300381183624268, "step": 290 }, { "clip_ratio/high_max": 0.0058583022764651105, "clip_ratio/high_mean": 0.003123653201328125, "clip_ratio/low_mean": 0.0026210244759568013, "clip_ratio/low_min": 0.0003182281070621684, "clip_ratio/region_mean": 0.005744677589973435, "completions/clipped_ratio": 1.0, "completions/max_length": 491.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 486.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 482.0, "completions/min_terminated_length": 0.0, "entropy": 1.917329154908657, "epoch": 20.785714285714285, "frac_reward_zero_std": 0.5, "grad_norm": 0.09790107607841492, "kl": 0.04667193512432277, "learning_rate": 9.277479955403887e-06, "loss": 0.0021, "num_tokens": 23502656.0, "reward": 0.203125, "reward_std": 0.24039676785469055, "rewards/tree_correctness_reward/mean": 0.203125, "rewards/tree_correctness_reward/std": 0.40550529956817627, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6239532232284546, "sampling/importance_sampling_ratio/min": 2.9463390310530646e-36, "sampling/sampling_logp_difference/max": 81.8125, "sampling/sampling_logp_difference/mean": 5.649072647094727, "step": 291 }, { "clip_ratio/high_max": 0.002762430925940862, "clip_ratio/high_mean": 0.001381215462970431, "clip_ratio/low_mean": 0.0019279466068837792, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00330916206985421, "completions/clipped_ratio": 1.0, "completions/max_length": 957.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 750.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 543.0, "completions/min_terminated_length": 0.0, "entropy": 3.04830901324749, "epoch": 20.857142857142858, "frac_reward_zero_std": 0.75, "grad_norm": 0.01053124014288187, "kl": 0.03712528373580426, "learning_rate": 9.12061671214929e-06, "loss": 0.0024, "num_tokens": 23582784.0, "reward": 0.109375, "reward_std": 0.10205793380737305, "rewards/tree_correctness_reward/mean": 0.109375, "rewards/tree_correctness_reward/std": 0.3145764470100403, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5320740342140198, "sampling/importance_sampling_ratio/min": 4.70115418990159e-33, "sampling/sampling_logp_difference/max": 74.4375, "sampling/sampling_logp_difference/mean": 6.783917427062988, "step": 292 }, { "clip_ratio/high_max": 0.0023062395630404353, "clip_ratio/high_mean": 0.0011531197815202177, "clip_ratio/low_mean": 0.0018287884540768573, "clip_ratio/low_min": 0.0003541076584951952, "clip_ratio/region_mean": 0.0029819081919413293, "completions/clipped_ratio": 1.0, "completions/max_length": 706.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 684.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 663.0, "completions/min_terminated_length": 0.0, "entropy": 2.895677424967289, "epoch": 20.928571428571427, "frac_reward_zero_std": 0.625, "grad_norm": 0.015329074114561081, "kl": 0.030395982670597732, "learning_rate": 8.964794509221508e-06, "loss": 0.0009, "num_tokens": 23658720.0, "reward": 0.234375, "reward_std": 0.17358146607875824, "rewards/tree_correctness_reward/mean": 0.234375, "rewards/tree_correctness_reward/std": 0.42695629596710205, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5574125051498413, "sampling/importance_sampling_ratio/min": 3.4055047095468348e-34, "sampling/sampling_logp_difference/max": 77.0625, "sampling/sampling_logp_difference/mean": 6.415499687194824, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1271.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1011.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 752.0, "completions/min_terminated_length": 0.0, "entropy": 4.458953306078911, "epoch": 21.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005336848553270102, "kl": 0.03545710304751992, "learning_rate": 8.810023562206e-06, "loss": 0.0, "num_tokens": 23755584.0, "reward": 0.125, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.125, "rewards/tree_correctness_reward/std": 0.3333333432674408, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.41201311349868774, "sampling/importance_sampling_ratio/min": 3.2730868116823916e-38, "sampling/sampling_logp_difference/max": 86.3125, "sampling/sampling_logp_difference/mean": 8.409960746765137, "step": 294 }, { "clip_ratio/high_max": 0.0011075949296355247, "clip_ratio/high_mean": 0.0005537974648177624, "clip_ratio/low_mean": 0.0006329113821266219, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0011867088687722571, "completions/clipped_ratio": 1.0, "completions/max_length": 1291.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1040.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 790.0, "completions/min_terminated_length": 0.0, "entropy": 4.701723158359528, "epoch": 21.071428571428573, "frac_reward_zero_std": 0.875, "grad_norm": 0.010418850928544998, "kl": 0.0400402604136616, "learning_rate": 8.656314017768693e-06, "loss": 0.0023, "num_tokens": 23854304.0, "reward": 0.1875, "reward_std": 0.06681530922651291, "rewards/tree_correctness_reward/mean": 0.1875, "rewards/tree_correctness_reward/std": 0.39339789748191833, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.3879181742668152, "sampling/importance_sampling_ratio/min": 9.165791500701421e-35, "sampling/sampling_logp_difference/max": 78.375, "sampling/sampling_logp_difference/mean": 8.72014045715332, "step": 295 }, { "clip_ratio/high_max": 0.0004770992381963879, "clip_ratio/high_mean": 0.00023854961909819394, "clip_ratio/low_mean": 0.0004600599677360151, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000698609586834209, "completions/clipped_ratio": 1.0, "completions/max_length": 917.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 854.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 791.0, "completions/min_terminated_length": 0.0, "entropy": 3.6669458150863647, "epoch": 21.142857142857142, "frac_reward_zero_std": 0.875, "grad_norm": 0.0054246848449110985, "kl": 0.029907634132541716, "learning_rate": 8.503675952990756e-06, "loss": 0.0015, "num_tokens": 23941088.0, "reward": 0.015625, "reward_std": 0.04419417306780815, "rewards/tree_correctness_reward/mean": 0.015625, "rewards/tree_correctness_reward/std": 0.125, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5067466497421265, "sampling/importance_sampling_ratio/min": 4.518357081925638e-37, "sampling/sampling_logp_difference/max": 83.6875, "sampling/sampling_logp_difference/mean": 7.074636459350586, "step": 296 }, { "clip_ratio/high_max": 0.0012263193420949392, "clip_ratio/high_mean": 0.0006131596710474696, "clip_ratio/low_mean": 0.0005708727985620499, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0011840324696095195, "completions/clipped_ratio": 1.0, "completions/max_length": 739.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 719.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 699.0, "completions/min_terminated_length": 0.0, "entropy": 3.589129462838173, "epoch": 21.214285714285715, "frac_reward_zero_std": 0.875, "grad_norm": 0.011018096469342709, "kl": 0.02729221945628524, "learning_rate": 8.352119374707978e-06, "loss": -0.0013, "num_tokens": 24019232.0, "reward": 0.09375, "reward_std": 0.0578637570142746, "rewards/tree_correctness_reward/mean": 0.09375, "rewards/tree_correctness_reward/std": 0.29378482699394226, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.48019468784332275, "sampling/importance_sampling_ratio/min": 2.363888833789888e-33, "sampling/sampling_logp_difference/max": 75.125, "sampling/sampling_logp_difference/mean": 7.468961715698242, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1045.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 837.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 629.0, "completions/min_terminated_length": 0.0, "entropy": 3.669554218649864, "epoch": 21.285714285714285, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004845149233005941, "kl": 0.0345802316442132, "learning_rate": 8.20165421885469e-06, "loss": 0.0, "num_tokens": 24104928.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.48055025935173035, "sampling/importance_sampling_ratio/min": 4.416325663883611e-33, "sampling/sampling_logp_difference/max": 74.5, "sampling/sampling_logp_difference/mean": 7.471445083618164, "step": 298 }, { "clip_ratio/high_max": 0.00539417232357664, "clip_ratio/high_mean": 0.003055457757000113, "clip_ratio/low_mean": 0.0010745098297775257, "clip_ratio/low_min": 0.0005783844826510176, "clip_ratio/region_mean": 0.00412996761406248, "completions/clipped_ratio": 1.0, "completions/max_length": 654.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 643.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 632.0, "completions/min_terminated_length": 0.0, "entropy": 3.096943512558937, "epoch": 21.357142857142858, "frac_reward_zero_std": 0.625, "grad_norm": 0.011148538440465927, "kl": 0.025197796756401658, "learning_rate": 8.052290349812419e-06, "loss": 0.0004, "num_tokens": 24178208.0, "reward": 0.21875, "reward_std": 0.1872510462999344, "rewards/tree_correctness_reward/mean": 0.21875, "rewards/tree_correctness_reward/std": 0.4166666865348816, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5148020386695862, "sampling/importance_sampling_ratio/min": 4.906094994852858e-35, "sampling/sampling_logp_difference/max": 79.0, "sampling/sampling_logp_difference/mean": 7.036322593688965, "step": 299 }, { "clip_ratio/high_max": 0.0009656983966124244, "clip_ratio/high_mean": 0.0004828491983062122, "clip_ratio/low_mean": 0.00042490729174460284, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0009077564754988998, "completions/clipped_ratio": 1.0, "completions/max_length": 809.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 714.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 620.0, "completions/min_terminated_length": 0.0, "entropy": 3.0828917026519775, "epoch": 21.428571428571427, "frac_reward_zero_std": 0.875, "grad_norm": 0.007475872524082661, "kl": 0.03184617985971272, "learning_rate": 7.904037559763162e-06, "loss": -0.0001, "num_tokens": 24256064.0, "reward": 0.0625, "reward_std": 0.06681530922651291, "rewards/tree_correctness_reward/mean": 0.0625, "rewards/tree_correctness_reward/std": 0.24397502839565277, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5318372249603271, "sampling/importance_sampling_ratio/min": 1.1056055867417628e-34, "sampling/sampling_logp_difference/max": 78.1875, "sampling/sampling_logp_difference/mean": 6.780000686645508, "step": 300 }, { "epoch": 21.428571428571427, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 427.52, "eval_completions/max_terminated_length": 0.0, "eval_completions/mean_length": 401.54, "eval_completions/mean_terminated_length": 0.0, "eval_completions/min_length": 375.56, "eval_completions/min_terminated_length": 0.0, "eval_entropy": 0.6094792276620865, "eval_frac_reward_zero_std": 0.88, "eval_kl": 0.048574046418070796, "eval_loss": -0.0017979006515815854, "eval_num_tokens": 24256064.0, "eval_reward": 0.055, "eval_reward_std": 0.042426406145095824, "eval_rewards/tree_correctness_reward/mean": 0.055, "eval_rewards/tree_correctness_reward/std": 0.042426406145095824, "eval_runtime": 130.7931, "eval_samples_per_second": 0.191, "eval_sampling/importance_sampling_ratio/max": 1.9998392486572265, "eval_sampling/importance_sampling_ratio/mean": 0.8500382804870605, "eval_sampling/importance_sampling_ratio/min": 1.6478321673846992e-30, "eval_sampling/sampling_logp_difference/max": 73.05125, "eval_sampling/sampling_logp_difference/mean": 2.486447100639343, "eval_steps_per_second": 0.031, "step": 300 }, { "clip_ratio/high_max": 0.0025425296189496294, "clip_ratio/high_mean": 0.0012712648094748147, "clip_ratio/low_mean": 0.00011556952813407406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013868343376088887, "completions/clipped_ratio": 1.0, "completions/max_length": 676.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 673.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 671.0, "completions/min_terminated_length": 0.0, "entropy": 3.00010547041893, "epoch": 21.5, "frac_reward_zero_std": 0.875, "grad_norm": 0.00591428205370903, "kl": 0.026414246240165085, "learning_rate": 7.756905568047393e-06, "loss": -0.0019, "num_tokens": 24331296.0, "reward": 0.09375, "reward_std": 0.0578637570142746, "rewards/tree_correctness_reward/mean": 0.09375, "rewards/tree_correctness_reward/std": 0.29378482699394226, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5483303666114807, "sampling/importance_sampling_ratio/min": 9.660673761439827e-36, "sampling/sampling_logp_difference/max": 80.625, "sampling/sampling_logp_difference/mean": 6.544478416442871, "step": 301 }, { "clip_ratio/high_max": 0.0021086837296024896, "clip_ratio/high_mean": 0.0012285336160857696, "clip_ratio/low_mean": 0.00123186922428431, "clip_ratio/low_min": 0.00017730495892465115, "clip_ratio/region_mean": 0.0024604028367321007, "completions/clipped_ratio": 1.0, "completions/max_length": 897.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 801.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 705.0, "completions/min_terminated_length": 0.0, "entropy": 3.667951673269272, "epoch": 21.571428571428573, "frac_reward_zero_std": 0.75, "grad_norm": 0.020312171429395676, "kl": 0.03670086641795933, "learning_rate": 7.6109040205269375e-06, "loss": 0.0024, "num_tokens": 24414688.0, "reward": 0.09375, "reward_std": 0.1246790662407875, "rewards/tree_correctness_reward/mean": 0.09375, "rewards/tree_correctness_reward/std": 0.29378482699394226, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.47999224066734314, "sampling/importance_sampling_ratio/min": 3.8973941318215115e-33, "sampling/sampling_logp_difference/max": 74.625, "sampling/sampling_logp_difference/mean": 7.477461814880371, "step": 302 }, { "clip_ratio/high_max": 0.0026836577453650534, "clip_ratio/high_mean": 0.0015165183285716921, "clip_ratio/low_mean": 0.0008476647089992184, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002364183026656974, "completions/clipped_ratio": 1.0, "completions/max_length": 805.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 734.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 664.0, "completions/min_terminated_length": 0.0, "entropy": 2.692466974258423, "epoch": 21.642857142857142, "frac_reward_zero_std": 0.75, "grad_norm": 0.012543311342597008, "kl": 0.034981554257683456, "learning_rate": 7.466042488952521e-06, "loss": 0.0008, "num_tokens": 24493824.0, "reward": 0.09375, "reward_std": 0.1293872892856598, "rewards/tree_correctness_reward/mean": 0.09375, "rewards/tree_correctness_reward/std": 0.29378482699394226, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6047110557556152, "sampling/importance_sampling_ratio/min": 1.1166220121487572e-33, "sampling/sampling_logp_difference/max": 75.875, "sampling/sampling_logp_difference/mean": 5.751143932342529, "step": 303 }, { "clip_ratio/high_max": 0.0032094594789668918, "clip_ratio/high_mean": 0.0016047297394834459, "clip_ratio/low_mean": 8.445946150459349e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016891892009880394, "completions/clipped_ratio": 1.0, "completions/max_length": 766.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 568.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 370.0, "completions/min_terminated_length": 0.0, "entropy": 2.2137836441397667, "epoch": 21.714285714285715, "frac_reward_zero_std": 0.875, "grad_norm": 0.012719240970909595, "kl": 0.04544758575502783, "learning_rate": 7.3223304703363135e-06, "loss": -0.0006, "num_tokens": 24562304.0, "reward": 0.109375, "reward_std": 0.04419417306780815, "rewards/tree_correctness_reward/mean": 0.109375, "rewards/tree_correctness_reward/std": 0.3145764470100403, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6094276905059814, "sampling/importance_sampling_ratio/min": 6.036401529078488e-33, "sampling/sampling_logp_difference/max": 74.1875, "sampling/sampling_logp_difference/mean": 5.874734401702881, "step": 304 }, { "clip_ratio/high_max": 0.003189024530001916, "clip_ratio/high_mean": 0.001594512265000958, "clip_ratio/low_mean": 0.0006851193138572853, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0022796315715822857, "completions/clipped_ratio": 1.0, "completions/max_length": 737.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 690.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 643.0, "completions/min_terminated_length": 0.0, "entropy": 2.6421917900443077, "epoch": 21.785714285714285, "frac_reward_zero_std": 0.75, "grad_norm": 0.010678024962544441, "kl": 0.02847434754949063, "learning_rate": 7.179777386329276e-06, "loss": -0.0029, "num_tokens": 24638592.0, "reward": 0.125, "reward_std": 0.0883883461356163, "rewards/tree_correctness_reward/mean": 0.125, "rewards/tree_correctness_reward/std": 0.3333333432674408, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5820248126983643, "sampling/importance_sampling_ratio/min": 9.075362379071311e-36, "sampling/sampling_logp_difference/max": 80.6875, "sampling/sampling_logp_difference/mean": 6.08541202545166, "step": 305 }, { "clip_ratio/high_max": 0.0006553079947480001, "clip_ratio/high_mean": 0.00032765399737400003, "clip_ratio/low_mean": 0.0004505242504819762, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0007781782442179974, "completions/clipped_ratio": 1.0, "completions/max_length": 763.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 606.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 449.0, "completions/min_terminated_length": 0.0, "entropy": 2.4691090062260628, "epoch": 21.857142857142858, "frac_reward_zero_std": 0.875, "grad_norm": 0.005778017919510603, "kl": 0.0329372767591849, "learning_rate": 7.038392582603481e-06, "loss": 0.0026, "num_tokens": 24709504.0, "reward": 0.046875, "reward_std": 0.0646936446428299, "rewards/tree_correctness_reward/mean": 0.046875, "rewards/tree_correctness_reward/std": 0.21304203569889069, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5851432085037231, "sampling/importance_sampling_ratio/min": 3.553963389762514e-36, "sampling/sampling_logp_difference/max": 81.625, "sampling/sampling_logp_difference/mean": 6.113028049468994, "step": 306 }, { "clip_ratio/high_max": 0.004120954901736695, "clip_ratio/high_mean": 0.0023079694983607624, "clip_ratio/low_mean": 0.0015551458509435179, "clip_ratio/low_min": 0.0003629883867688477, "clip_ratio/region_mean": 0.0038631154147878988, "completions/clipped_ratio": 1.0, "completions/max_length": 947.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 882.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 818.0, "completions/min_terminated_length": 0.0, "entropy": 4.3296633660793304, "epoch": 21.928571428571427, "frac_reward_zero_std": 0.625, "grad_norm": 0.01924809068441391, "kl": 0.09181580890435725, "learning_rate": 6.898185328239468e-06, "loss": 0.0, "num_tokens": 24798112.0, "reward": 0.1875, "reward_std": 0.1552036553621292, "rewards/tree_correctness_reward/mean": 0.1875, "rewards/tree_correctness_reward/std": 0.39339789748191833, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.4158199429512024, "sampling/importance_sampling_ratio/min": 4.9549805217845695e-34, "sampling/sampling_logp_difference/max": 76.6875, "sampling/sampling_logp_difference/mean": 8.322710037231445, "step": 307 }, { "clip_ratio/high_max": 0.0024277889169752598, "clip_ratio/high_mean": 0.0012138944584876299, "clip_ratio/low_mean": 0.0015251494478434324, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0027390439063310623, "completions/clipped_ratio": 1.0, "completions/max_length": 615.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 558.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 502.0, "completions/min_terminated_length": 0.0, "entropy": 2.447858788073063, "epoch": 22.0, "frac_reward_zero_std": 0.75, "grad_norm": 0.012169688008725643, "kl": 0.03468403511215001, "learning_rate": 6.7591648151184935e-06, "loss": -0.0032, "num_tokens": 24865984.0, "reward": 0.234375, "reward_std": 0.10205793380737305, "rewards/tree_correctness_reward/mean": 0.234375, "rewards/tree_correctness_reward/std": 0.42695629596710205, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5677176713943481, "sampling/importance_sampling_ratio/min": 2.466939338135878e-35, "sampling/sampling_logp_difference/max": 79.6875, "sampling/sampling_logp_difference/mean": 6.365835189819336, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 797.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 726.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 655.0, "completions/min_terminated_length": 0.0, "entropy": 3.3167001008987427, "epoch": 22.071428571428573, "frac_reward_zero_std": 1.0, "grad_norm": 0.00025465787621214986, "kl": 0.02410870569292456, "learning_rate": 6.621340157319997e-06, "loss": 0.0, "num_tokens": 24944576.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.49533283710479736, "sampling/importance_sampling_ratio/min": 1.6086464555616602e-34, "sampling/sampling_logp_difference/max": 77.8125, "sampling/sampling_logp_difference/mean": 7.274279594421387, "step": 309 }, { "clip_ratio/high_max": 0.0024096385313896462, "clip_ratio/high_mean": 0.0012048192656948231, "clip_ratio/low_mean": 0.002522590340959141, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0037274096212058794, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 463.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 415.0, "completions/min_terminated_length": 0.0, "entropy": 1.6651964485645294, "epoch": 22.142857142857142, "frac_reward_zero_std": 0.625, "grad_norm": 0.011450446210801601, "kl": 0.039293885231018066, "learning_rate": 6.484720390524007e-06, "loss": -0.003, "num_tokens": 25006368.0, "reward": 0.078125, "reward_std": 0.1530819833278656, "rewards/tree_correctness_reward/mean": 0.078125, "rewards/tree_correctness_reward/std": 0.27048972249031067, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6431849598884583, "sampling/importance_sampling_ratio/min": 2.0655430870101624e-34, "sampling/sampling_logp_difference/max": 77.5625, "sampling/sampling_logp_difference/mean": 5.418754577636719, "step": 310 }, { "clip_ratio/high_max": 0.0001575100759509951, "clip_ratio/high_mean": 7.875503797549754e-05, "clip_ratio/low_mean": 0.0007560483791166916, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0008348034170921892, "completions/clipped_ratio": 1.0, "completions/max_length": 992.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 867.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 743.0, "completions/min_terminated_length": 0.0, "entropy": 3.897570066154003, "epoch": 22.214285714285715, "frac_reward_zero_std": 0.875, "grad_norm": 0.015682704746723175, "kl": 0.03304166591260582, "learning_rate": 6.349314471418849e-06, "loss": 0.0014, "num_tokens": 25094016.0, "reward": 0.140625, "reward_std": 0.04419417306780815, "rewards/tree_correctness_reward/mean": 0.140625, "rewards/tree_correctness_reward/std": 0.3503824472427368, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.4759749174118042, "sampling/importance_sampling_ratio/min": 4.563378985501584e-36, "sampling/sampling_logp_difference/max": 81.375, "sampling/sampling_logp_difference/mean": 7.50895881652832, "step": 311 }, { "clip_ratio/high_max": 0.0014492753471131437, "clip_ratio/high_mean": 0.0007246376735565718, "clip_ratio/low_mean": 0.00018115942111762706, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0009057971001311671, "completions/clipped_ratio": 1.0, "completions/max_length": 690.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 639.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 589.0, "completions/min_terminated_length": 0.0, "entropy": 2.6275454834103584, "epoch": 22.285714285714285, "frac_reward_zero_std": 0.875, "grad_norm": 0.00547413993626833, "kl": 0.0339380819350481, "learning_rate": 6.2151312771139e-06, "loss": 0.0006, "num_tokens": 25167072.0, "reward": 0.078125, "reward_std": 0.0646936446428299, "rewards/tree_correctness_reward/mean": 0.078125, "rewards/tree_correctness_reward/std": 0.27048972249031067, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5797343850135803, "sampling/importance_sampling_ratio/min": 6.362320011142437e-34, "sampling/sampling_logp_difference/max": 76.4375, "sampling/sampling_logp_difference/mean": 6.14002799987793, "step": 312 }, { "clip_ratio/high_max": 0.0030169309029588476, "clip_ratio/high_mean": 0.0015084654514794238, "clip_ratio/low_mean": 0.00018011526844929904, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001688580734480638, "completions/clipped_ratio": 1.0, "completions/max_length": 989.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 841.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 694.0, "completions/min_terminated_length": 0.0, "entropy": 3.5383784025907516, "epoch": 22.357142857142858, "frac_reward_zero_std": 0.875, "grad_norm": 0.008759263902902603, "kl": 0.03496837837155908, "learning_rate": 6.082179604557617e-06, "loss": -0.0003, "num_tokens": 25253056.0, "reward": 0.109375, "reward_std": 0.04419417306780815, "rewards/tree_correctness_reward/mean": 0.109375, "rewards/tree_correctness_reward/std": 0.3145764470100403, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5042128562927246, "sampling/importance_sampling_ratio/min": 5.274548762960186e-34, "sampling/sampling_logp_difference/max": 76.625, "sampling/sampling_logp_difference/mean": 7.118901252746582, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 860.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 832.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 804.0, "completions/min_terminated_length": 0.0, "entropy": 3.622985988855362, "epoch": 22.428571428571427, "frac_reward_zero_std": 1.0, "grad_norm": 0.00041419517947360873, "kl": 0.02818414306966588, "learning_rate": 5.950468169960846e-06, "loss": 0.0, "num_tokens": 25338432.0, "reward": 0.125, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.125, "rewards/tree_correctness_reward/std": 0.3333333432674408, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.513663649559021, "sampling/importance_sampling_ratio/min": 1.4815121963806348e-36, "sampling/sampling_logp_difference/max": 82.5, "sampling/sampling_logp_difference/mean": 6.9761810302734375, "step": 314 }, { "clip_ratio/high_max": 0.0010693138028727844, "clip_ratio/high_mean": 0.0007037063551251777, "clip_ratio/low_mean": 0.0015385579936264548, "clip_ratio/low_min": 0.00031046244839672, "clip_ratio/region_mean": 0.0022422643487516325, "completions/clipped_ratio": 1.0, "completions/max_length": 755.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 701.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 647.0, "completions/min_terminated_length": 0.0, "entropy": 2.866408832371235, "epoch": 22.5, "frac_reward_zero_std": 0.75, "grad_norm": 0.010817025788128376, "kl": 0.030553108430467546, "learning_rate": 5.820005608225346e-06, "loss": -0.0002, "num_tokens": 25415424.0, "reward": 0.0625, "reward_std": 0.1157275140285492, "rewards/tree_correctness_reward/mean": 0.0625, "rewards/tree_correctness_reward/std": 0.24397502839565277, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5735502243041992, "sampling/importance_sampling_ratio/min": 4.9549805217845695e-34, "sampling/sampling_logp_difference/max": 76.6875, "sampling/sampling_logp_difference/mean": 6.187777519226074, "step": 315 }, { "clip_ratio/high_max": 0.0006065389898139983, "clip_ratio/high_mean": 0.00030326949490699917, "clip_ratio/low_mean": 0.0014628962853748817, "clip_ratio/low_min": 0.0002473614877089858, "clip_ratio/region_mean": 0.0017661657802818809, "completions/clipped_ratio": 1.0, "completions/max_length": 758.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 723.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 688.0, "completions/min_terminated_length": 0.0, "entropy": 3.5647934526205063, "epoch": 22.571428571428573, "frac_reward_zero_std": 0.75, "grad_norm": 0.012187639251351357, "kl": 0.03345098940189928, "learning_rate": 5.690800472377747e-06, "loss": 0.0002, "num_tokens": 25493824.0, "reward": 0.046875, "reward_std": 0.10205793380737305, "rewards/tree_correctness_reward/mean": 0.046875, "rewards/tree_correctness_reward/std": 0.21304203569889069, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.47919416427612305, "sampling/importance_sampling_ratio/min": 1.84099845715941e-33, "sampling/sampling_logp_difference/max": 75.375, "sampling/sampling_logp_difference/mean": 7.491150856018066, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1401.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 755.0, "completions/min_terminated_length": 0.0, "entropy": 4.428362786769867, "epoch": 22.642857142857142, "frac_reward_zero_std": 1.0, "grad_norm": 0.00044392517884261906, "kl": 0.03065445157699287, "learning_rate": 5.562861233008774e-06, "loss": 0.0, "num_tokens": 25615648.0, "reward": 0.125, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.125, "rewards/tree_correctness_reward/std": 0.3333333432674408, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.39959412813186646, "sampling/importance_sampling_ratio/min": 2.3405667503630854e-34, "sampling/sampling_logp_difference/max": 77.4375, "sampling/sampling_logp_difference/mean": 8.965696334838867, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 851.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 777.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 703.0, "completions/min_terminated_length": 0.0, "entropy": 3.7499843388795853, "epoch": 22.714285714285715, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003046572091989219, "kl": 0.02724529185798019, "learning_rate": 5.436196277717928e-06, "loss": 0.0, "num_tokens": 25697504.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.46849268674850464, "sampling/importance_sampling_ratio/min": 2.363888833789888e-33, "sampling/sampling_logp_difference/max": 75.125, "sampling/sampling_logp_difference/mean": 7.616781711578369, "step": 318 }, { "clip_ratio/high_max": 0.004933086805976927, "clip_ratio/high_mean": 0.0025889964017551392, "clip_ratio/low_mean": 0.0018699776192079298, "clip_ratio/low_min": 0.0001686909527052194, "clip_ratio/region_mean": 0.0044589739554794505, "completions/clipped_ratio": 1.0, "completions/max_length": 741.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 689.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 638.0, "completions/min_terminated_length": 0.0, "entropy": 3.341927334666252, "epoch": 22.785714285714285, "frac_reward_zero_std": 0.625, "grad_norm": 0.014166045933961868, "kl": 0.03269073669798672, "learning_rate": 5.310813910563644e-06, "loss": 0.0001, "num_tokens": 25773760.0, "reward": 0.203125, "reward_std": 0.19408093392848969, "rewards/tree_correctness_reward/mean": 0.203125, "rewards/tree_correctness_reward/std": 0.40550529956817627, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.495296835899353, "sampling/importance_sampling_ratio/min": 4.329613164961045e-35, "sampling/sampling_logp_difference/max": 79.125, "sampling/sampling_logp_difference/mean": 7.295586109161377, "step": 319 }, { "clip_ratio/high_max": 0.002832861202477943, "clip_ratio/high_mean": 0.0014164306012389716, "clip_ratio/low_mean": 0.0001106586423702538, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015270892436092254, "completions/clipped_ratio": 1.0, "completions/max_length": 706.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 680.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 654.0, "completions/min_terminated_length": 0.0, "entropy": 2.6668814942240715, "epoch": 22.857142857142858, "frac_reward_zero_std": 0.875, "grad_norm": 0.010814397595822811, "kl": 0.0300122385378927, "learning_rate": 5.186722351518822e-06, "loss": 0.0019, "num_tokens": 25849408.0, "reward": 0.109375, "reward_std": 0.04419417306780815, "rewards/tree_correctness_reward/mean": 0.109375, "rewards/tree_correctness_reward/std": 0.3145764470100403, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5872267484664917, "sampling/importance_sampling_ratio/min": 1.3204573989724904e-35, "sampling/sampling_logp_difference/max": 80.3125, "sampling/sampling_logp_difference/mean": 5.98575496673584, "step": 320 }, { "clip_ratio/high_max": 0.003173536442773184, "clip_ratio/high_mean": 0.0017321170726063428, "clip_ratio/low_mean": 0.0017014670229400508, "clip_ratio/low_min": 0.0004622683409252204, "clip_ratio/region_mean": 0.003433584070080542, "completions/clipped_ratio": 1.0, "completions/max_length": 837.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 741.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 645.0, "completions/min_terminated_length": 0.0, "entropy": 3.5705398470163345, "epoch": 22.928571428571427, "frac_reward_zero_std": 0.5, "grad_norm": 0.02629125490784645, "kl": 0.03334061417263001, "learning_rate": 5.063929735931985e-06, "loss": 0.0005, "num_tokens": 25928960.0, "reward": 0.203125, "reward_std": 0.24039676785469055, "rewards/tree_correctness_reward/mean": 0.203125, "rewards/tree_correctness_reward/std": 0.40550529956817627, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.48446568846702576, "sampling/importance_sampling_ratio/min": 1.3336148713971936e-34, "sampling/sampling_logp_difference/max": 78.0, "sampling/sampling_logp_difference/mean": 7.425329685211182, "step": 321 }, { "clip_ratio/high_max": 0.0005798800048069097, "clip_ratio/high_mean": 0.00028994000240345486, "clip_ratio/low_mean": 0.001968201730051078, "clip_ratio/low_min": 0.0006238656787900254, "clip_ratio/region_mean": 0.00225814180157613, "completions/clipped_ratio": 1.0, "completions/max_length": 551.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 500.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 450.0, "completions/min_terminated_length": 0.0, "entropy": 2.046732872724533, "epoch": 23.0, "frac_reward_zero_std": 0.75, "grad_norm": 0.017939554527401924, "kl": 0.03287921636365354, "learning_rate": 4.94244411399388e-06, "loss": -0.0018, "num_tokens": 25993120.0, "reward": 0.1875, "reward_std": 0.10888782143592834, "rewards/tree_correctness_reward/mean": 0.1875, "rewards/tree_correctness_reward/std": 0.39339789748191833, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6150625348091125, "sampling/importance_sampling_ratio/min": 2.7678293012408834e-36, "sampling/sampling_logp_difference/max": 81.875, "sampling/sampling_logp_difference/mean": 5.745357513427734, "step": 322 }, { "clip_ratio/high_max": 0.001585623700520955, "clip_ratio/high_mean": 0.0007928118502604775, "clip_ratio/low_mean": 0.001701242053968599, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002494053911505034, "completions/clipped_ratio": 1.0, "completions/max_length": 946.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 744.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 543.0, "completions/min_terminated_length": 0.0, "entropy": 3.4495043605566025, "epoch": 23.071428571428573, "frac_reward_zero_std": 0.625, "grad_norm": 0.012410551309585571, "kl": 0.035199379781261086, "learning_rate": 4.8222734502097665e-06, "loss": 0.0008, "num_tokens": 26072896.0, "reward": 0.265625, "reward_std": 0.13258251547813416, "rewards/tree_correctness_reward/mean": 0.265625, "rewards/tree_correctness_reward/std": 0.44515693187713623, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.4865579605102539, "sampling/importance_sampling_ratio/min": 1.1166220121487572e-33, "sampling/sampling_logp_difference/max": 75.875, "sampling/sampling_logp_difference/mean": 7.41387939453125, "step": 323 }, { "clip_ratio/high_max": 0.0035889086866518483, "clip_ratio/high_mean": 0.001989434749702923, "clip_ratio/low_mean": 0.0019571839839045424, "clip_ratio/low_min": 0.0008451610410702415, "clip_ratio/region_mean": 0.003946618642657995, "completions/clipped_ratio": 1.0, "completions/max_length": 670.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 651.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 632.0, "completions/min_terminated_length": 0.0, "entropy": 3.1305430456995964, "epoch": 23.142857142857142, "frac_reward_zero_std": 0.5, "grad_norm": 0.018377134576439857, "kl": 0.027719255536794662, "learning_rate": 4.703425622877239e-06, "loss": 0.0039, "num_tokens": 26146688.0, "reward": 0.265625, "reward_std": 0.2519446909427643, "rewards/tree_correctness_reward/mean": 0.265625, "rewards/tree_correctness_reward/std": 0.44515693187713623, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5061472058296204, "sampling/importance_sampling_ratio/min": 5.976846944504e-34, "sampling/sampling_logp_difference/max": 76.5, "sampling/sampling_logp_difference/mean": 7.14011812210083, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 751.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 647.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 544.0, "completions/min_terminated_length": 0.0, "entropy": 2.903729349374771, "epoch": 23.214285714285715, "frac_reward_zero_std": 1.0, "grad_norm": 0.0002850657037924975, "kl": 0.02729220176115632, "learning_rate": 4.585908423569724e-06, "loss": 0.0, "num_tokens": 26220256.0, "reward": 0.125, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.125, "rewards/tree_correctness_reward/std": 0.3333333432674408, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5404461026191711, "sampling/importance_sampling_ratio/min": 3.4055047095468348e-34, "sampling/sampling_logp_difference/max": 77.0625, "sampling/sampling_logp_difference/mean": 6.675492286682129, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 872.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 780.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 688.0, "completions/min_terminated_length": 0.0, "entropy": 3.1217919811606407, "epoch": 23.285714285714285, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014142281142994761, "kl": 0.026132597122341394, "learning_rate": 4.469729556625704e-06, "loss": 0.0, "num_tokens": 26302304.0, "reward": 0.125, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.125, "rewards/tree_correctness_reward/std": 0.3333333432674408, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5486347079277039, "sampling/importance_sampling_ratio/min": 7.209453331209461e-34, "sampling/sampling_logp_difference/max": 76.3125, "sampling/sampling_logp_difference/mean": 6.518483638763428, "step": 326 }, { "clip_ratio/high_max": 0.0033926683754543774, "clip_ratio/high_mean": 0.0018256802250107285, "clip_ratio/low_mean": 0.0008564888739783783, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002682169095351128, "completions/clipped_ratio": 1.0, "completions/max_length": 604.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 509.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 415.0, "completions/min_terminated_length": 0.0, "entropy": 1.483363501727581, "epoch": 23.357142857142858, "frac_reward_zero_std": 0.75, "grad_norm": 0.009819873608648777, "kl": 0.042439315118826926, "learning_rate": 4.35489663864359e-06, "loss": 0.0003, "num_tokens": 26367040.0, "reward": 0.28125, "reward_std": 0.10888782143592834, "rewards/tree_correctness_reward/mean": 0.28125, "rewards/tree_correctness_reward/std": 0.4531635046005249, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6955311298370361, "sampling/importance_sampling_ratio/min": 1.1166220121487572e-33, "sampling/sampling_logp_difference/max": 75.875, "sampling/sampling_logp_difference/mean": 4.684072017669678, "step": 327 }, { "clip_ratio/high_max": 0.001878415234386921, "clip_ratio/high_mean": 0.0009392076171934605, "clip_ratio/low_mean": 0.0003130692130071111, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001252276822924614, "completions/clipped_ratio": 1.0, "completions/max_length": 549.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 545.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 541.0, "completions/min_terminated_length": 0.0, "entropy": 2.212411530315876, "epoch": 23.428571428571427, "frac_reward_zero_std": 0.875, "grad_norm": 0.007904840633273125, "kl": 0.03136044612620026, "learning_rate": 4.2414171979824e-06, "loss": 0.0043, "num_tokens": 26434048.0, "reward": 0.09375, "reward_std": 0.0578637570142746, "rewards/tree_correctness_reward/mean": 0.09375, "rewards/tree_correctness_reward/std": 0.29378482699394226, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5745662450790405, "sampling/importance_sampling_ratio/min": 1.1769110281448947e-34, "sampling/sampling_logp_difference/max": 78.125, "sampling/sampling_logp_difference/mean": 6.300686359405518, "step": 328 }, { "clip_ratio/high_max": 0.0025289998047810514, "clip_ratio/high_mean": 0.0012644999023905257, "clip_ratio/low_mean": 0.0017423934114049189, "clip_ratio/low_min": 0.0005129419223521836, "clip_ratio/region_mean": 0.0030068933574511902, "completions/clipped_ratio": 1.0, "completions/max_length": 792.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 664.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 537.0, "completions/min_terminated_length": 0.0, "entropy": 2.8462434858083725, "epoch": 23.5, "frac_reward_zero_std": 0.625, "grad_norm": 0.013252409175038338, "kl": 0.03154220583382994, "learning_rate": 4.129298674268225e-06, "loss": 0.0005, "num_tokens": 26508704.0, "reward": 0.0625, "reward_std": 0.1462521106004715, "rewards/tree_correctness_reward/mean": 0.0625, "rewards/tree_correctness_reward/std": 0.24397502839565277, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.564745306968689, "sampling/importance_sampling_ratio/min": 5.1709859456893004e-36, "sampling/sampling_logp_difference/max": 81.25, "sampling/sampling_logp_difference/mean": 6.324804306030273, "step": 329 }, { "clip_ratio/high_max": 0.0018861454009311274, "clip_ratio/high_mean": 0.0009430727004655637, "clip_ratio/low_mean": 0.00017146776008303277, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001114540467824554, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 656.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 584.0, "completions/min_terminated_length": 0.0, "entropy": 2.4487291499972343, "epoch": 23.571428571428573, "frac_reward_zero_std": 0.875, "grad_norm": 0.0060674515552818775, "kl": 0.035857322392985225, "learning_rate": 4.0185484179064425e-06, "loss": -0.0002, "num_tokens": 26582848.0, "reward": 0.078125, "reward_std": 0.0646936446428299, "rewards/tree_correctness_reward/mean": 0.078125, "rewards/tree_correctness_reward/std": 0.27048972249031067, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6112833023071289, "sampling/importance_sampling_ratio/min": 2.9463390310530646e-36, "sampling/sampling_logp_difference/max": 81.8125, "sampling/sampling_logp_difference/mean": 5.6887102127075195, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 713.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 667.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 622.0, "completions/min_terminated_length": 0.0, "entropy": 3.2786065340042114, "epoch": 23.642857142857142, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003228187852073461, "kl": 0.03406538243871182, "learning_rate": 3.90917368959989e-06, "loss": 0.0, "num_tokens": 26657696.0, "reward": 0.125, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.125, "rewards/tree_correctness_reward/std": 0.3333333432674408, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.4998883605003357, "sampling/importance_sampling_ratio/min": 9.075362379071311e-36, "sampling/sampling_logp_difference/max": 80.6875, "sampling/sampling_logp_difference/mean": 7.2237324714660645, "step": 331 }, { "clip_ratio/high_max": 0.0037712266203016043, "clip_ratio/high_mean": 0.0019404378690524027, "clip_ratio/low_mean": 0.00228736705139454, "clip_ratio/low_min": 0.000136661808937788, "clip_ratio/region_mean": 0.004227804933179868, "completions/clipped_ratio": 1.0, "completions/max_length": 686.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 628.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 570.0, "completions/min_terminated_length": 0.0, "entropy": 2.7188537791371346, "epoch": 23.714285714285715, "frac_reward_zero_std": 0.625, "grad_norm": 0.017408309504389763, "kl": 0.038082794402725995, "learning_rate": 3.8011816598728045e-06, "loss": -0.0034, "num_tokens": 26730016.0, "reward": 0.125, "reward_std": 0.1462521106004715, "rewards/tree_correctness_reward/mean": 0.125, "rewards/tree_correctness_reward/std": 0.3333333432674408, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5647954940795898, "sampling/importance_sampling_ratio/min": 5.1709859456893004e-36, "sampling/sampling_logp_difference/max": 81.25, "sampling/sampling_logp_difference/mean": 6.360941410064697, "step": 332 }, { "clip_ratio/high_max": 0.0007851895643398166, "clip_ratio/high_mean": 0.0003925947821699083, "clip_ratio/low_mean": 0.0021916235054959543, "clip_ratio/low_min": 0.0003337783782626502, "clip_ratio/region_mean": 0.002584218258562032, "completions/clipped_ratio": 1.0, "completions/max_length": 749.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 704.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 659.0, "completions/min_terminated_length": 0.0, "entropy": 2.568383567035198, "epoch": 23.785714285714285, "frac_reward_zero_std": 0.75, "grad_norm": 0.013391060754656792, "kl": 0.031790089095011353, "learning_rate": 3.694579408600771e-06, "loss": 0.002, "num_tokens": 26807200.0, "reward": 0.03125, "reward_std": 0.0883883461356163, "rewards/tree_correctness_reward/mean": 0.03125, "rewards/tree_correctness_reward/std": 0.17536810040473938, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6066774129867554, "sampling/importance_sampling_ratio/min": 1.511183624582218e-34, "sampling/sampling_logp_difference/max": 77.875, "sampling/sampling_logp_difference/mean": 5.728677272796631, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1231.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1107.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 983.0, "completions/min_terminated_length": 0.0, "entropy": 4.617639869451523, "epoch": 23.857142857142858, "frac_reward_zero_std": 1.0, "grad_norm": 0.00041508881258778274, "kl": 0.033007799996994436, "learning_rate": 3.5893739245465465e-06, "loss": 0.0, "num_tokens": 26910176.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.42088204622268677, "sampling/importance_sampling_ratio/min": 8.169380844367654e-34, "sampling/sampling_logp_difference/max": 76.1875, "sampling/sampling_logp_difference/mean": 8.25636100769043, "step": 334 }, { "clip_ratio/high_max": 0.002828666096320376, "clip_ratio/high_mean": 0.0015273529425030574, "clip_ratio/low_mean": 0.002064989661448635, "clip_ratio/low_min": 0.0005059488757979125, "clip_ratio/region_mean": 0.003592342654883396, "completions/clipped_ratio": 1.0, "completions/max_length": 749.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 651.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 553.0, "completions/min_terminated_length": 0.0, "entropy": 2.7579140663146973, "epoch": 23.928571428571427, "frac_reward_zero_std": 0.625, "grad_norm": 0.017120854929089546, "kl": 0.03383038518950343, "learning_rate": 3.4855721049018688e-06, "loss": -0.0008, "num_tokens": 26983968.0, "reward": 0.125, "reward_std": 0.1872510462999344, "rewards/tree_correctness_reward/mean": 0.125, "rewards/tree_correctness_reward/std": 0.3333333432674408, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5798553824424744, "sampling/importance_sampling_ratio/min": 1.511183624582218e-34, "sampling/sampling_logp_difference/max": 77.875, "sampling/sampling_logp_difference/mean": 6.124856948852539, "step": 335 }, { "clip_ratio/high_max": 0.0007375598361250013, "clip_ratio/high_mean": 0.00036877991806250066, "clip_ratio/low_mean": 0.0014178176079440163, "clip_ratio/low_min": 0.0003847064508590847, "clip_ratio/region_mean": 0.001786597526006517, "completions/clipped_ratio": 1.0, "completions/max_length": 1056.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 881.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 707.0, "completions/min_terminated_length": 0.0, "entropy": 4.0671223402023315, "epoch": 24.0, "frac_reward_zero_std": 0.75, "grad_norm": 0.012993636541068554, "kl": 0.030147228157147765, "learning_rate": 3.383180754835344e-06, "loss": -0.0021, "num_tokens": 27072512.0, "reward": 0.0625, "reward_std": 0.10888782143592834, "rewards/tree_correctness_reward/mean": 0.0625, "rewards/tree_correctness_reward/std": 0.24397502839565277, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.44404542446136475, "sampling/importance_sampling_ratio/min": 1.252815132018138e-34, "sampling/sampling_logp_difference/max": 78.0625, "sampling/sampling_logp_difference/mean": 7.93873405456543, "step": 336 }, { "clip_ratio/high_max": 0.0014970060292398557, "clip_ratio/high_mean": 0.0007485030146199279, "clip_ratio/low_mean": 0.000916916185815353, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001665419178607408, "completions/clipped_ratio": 1.0, "completions/max_length": 934.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 884.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 835.0, "completions/min_terminated_length": 0.0, "entropy": 4.2092956602573395, "epoch": 24.071428571428573, "frac_reward_zero_std": 0.75, "grad_norm": 0.009270807728171349, "kl": 0.031767552020028234, "learning_rate": 3.2822065870462217e-06, "loss": 0.0012, "num_tokens": 27161248.0, "reward": 0.09375, "reward_std": 0.10888782143592834, "rewards/tree_correctness_reward/mean": 0.09375, "rewards/tree_correctness_reward/std": 0.29378482699394226, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.43823450803756714, "sampling/importance_sampling_ratio/min": 1.419625586416994e-34, "sampling/sampling_logp_difference/max": 77.9375, "sampling/sampling_logp_difference/mean": 8.013327598571777, "step": 337 }, { "clip_ratio/high_max": 0.0011718942914740182, "clip_ratio/high_mean": 0.0005859471457370091, "clip_ratio/low_mean": 0.0008050898395595141, "clip_ratio/low_min": 0.00017705382197164, "clip_ratio/region_mean": 0.0013910369780205656, "completions/clipped_ratio": 1.0, "completions/max_length": 706.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 668.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 631.0, "completions/min_terminated_length": 0.0, "entropy": 2.6917623430490494, "epoch": 24.142857142857142, "frac_reward_zero_std": 0.75, "grad_norm": 0.01361438725143671, "kl": 0.029710760107263923, "learning_rate": 3.1826562213243843e-06, "loss": 0.0013, "num_tokens": 27236160.0, "reward": 0.109375, "reward_std": 0.10205793380737305, "rewards/tree_correctness_reward/mean": 0.109375, "rewards/tree_correctness_reward/std": 0.3145764470100403, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5770062208175659, "sampling/importance_sampling_ratio/min": 9.165791500701421e-35, "sampling/sampling_logp_difference/max": 78.375, "sampling/sampling_logp_difference/mean": 6.169223308563232, "step": 338 }, { "clip_ratio/high_max": 0.0009159106848528609, "clip_ratio/high_mean": 0.00045795534242643043, "clip_ratio/low_mean": 0.003249302171752788, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0037072575141792186, "completions/clipped_ratio": 1.0, "completions/max_length": 1433.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 986.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 540.0, "completions/min_terminated_length": 0.0, "entropy": 3.5103937089443207, "epoch": 24.214285714285715, "frac_reward_zero_std": 0.875, "grad_norm": 0.006596799474209547, "kl": 0.039347052574157715, "learning_rate": 3.08453618411631e-06, "loss": -0.001, "num_tokens": 27331424.0, "reward": 0.03125, "reward_std": 0.0578637570142746, "rewards/tree_correctness_reward/mean": 0.03125, "rewards/tree_correctness_reward/std": 0.17536810040473938, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5170261263847351, "sampling/importance_sampling_ratio/min": 9.756933719611155e-35, "sampling/sampling_logp_difference/max": 78.3125, "sampling/sampling_logp_difference/mean": 7.0343828201293945, "step": 339 }, { "clip_ratio/high_max": 0.00020371578284539282, "clip_ratio/high_mean": 0.00010185789142269641, "clip_ratio/low_mean": 0.0007130052254069597, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0008148631168296561, "completions/clipped_ratio": 1.0, "completions/max_length": 776.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 771.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 767.0, "completions/min_terminated_length": 0.0, "entropy": 3.315613269805908, "epoch": 24.285714285714285, "frac_reward_zero_std": 0.875, "grad_norm": 0.01088276132941246, "kl": 0.03437206568196416, "learning_rate": 2.98785290809723e-06, "loss": -0.0015, "num_tokens": 27412928.0, "reward": 0.015625, "reward_std": 0.04419417306780815, "rewards/tree_correctness_reward/mean": 0.015625, "rewards/tree_correctness_reward/std": 0.125, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5268118977546692, "sampling/importance_sampling_ratio/min": 1.4815121963806348e-36, "sampling/sampling_logp_difference/max": 82.5, "sampling/sampling_logp_difference/mean": 6.835104942321777, "step": 340 }, { "clip_ratio/high_max": 0.002498500471119769, "clip_ratio/high_mean": 0.0012492502355598845, "clip_ratio/low_mean": 0.000983167759841308, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00223241800267715, "completions/clipped_ratio": 1.0, "completions/max_length": 809.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 760.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 712.0, "completions/min_terminated_length": 0.0, "entropy": 2.872413530945778, "epoch": 24.357142857142858, "frac_reward_zero_std": 0.75, "grad_norm": 0.01257752813398838, "kl": 0.031583890900947154, "learning_rate": 2.892612731749414e-06, "loss": 0.0011, "num_tokens": 27493728.0, "reward": 0.21875, "reward_std": 0.10888782143592834, "rewards/tree_correctness_reward/mean": 0.21875, "rewards/tree_correctness_reward/std": 0.4166666865348816, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5797193050384521, "sampling/importance_sampling_ratio/min": 9.075362379071311e-36, "sampling/sampling_logp_difference/max": 80.6875, "sampling/sampling_logp_difference/mean": 6.0778703689575195, "step": 341 }, { "clip_ratio/high_max": 0.00546183627739083, "clip_ratio/high_mean": 0.002730918138695415, "clip_ratio/low_mean": 0.0019158677459927276, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0046467858701362275, "completions/clipped_ratio": 1.0, "completions/max_length": 565.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 552.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 540.0, "completions/min_terminated_length": 0.0, "entropy": 2.14389131963253, "epoch": 24.428571428571427, "frac_reward_zero_std": 0.75, "grad_norm": 0.02196558564901352, "kl": 0.03394889773335308, "learning_rate": 2.798821898946588e-06, "loss": 0.005, "num_tokens": 27561216.0, "reward": 0.125, "reward_std": 0.0883883461356163, "rewards/tree_correctness_reward/mean": 0.125, "rewards/tree_correctness_reward/std": 0.3333333432674408, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6152878999710083, "sampling/importance_sampling_ratio/min": 7.92996370415694e-37, "sampling/sampling_logp_difference/max": 83.125, "sampling/sampling_logp_difference/mean": 5.717930793762207, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 800.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 761.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 722.0, "completions/min_terminated_length": 0.0, "entropy": 3.265130177140236, "epoch": 24.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004712637746706605, "kl": 0.027243931428529322, "learning_rate": 2.7064865585446434e-06, "loss": 0.0, "num_tokens": 27642048.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5217603445053101, "sampling/importance_sampling_ratio/min": 1.028373383751816e-35, "sampling/sampling_logp_difference/max": 80.5625, "sampling/sampling_logp_difference/mean": 6.884984493255615, "step": 343 }, { "clip_ratio/high_max": 0.004040746083774138, "clip_ratio/high_mean": 0.002020373041887069, "clip_ratio/low_mean": 0.0009148421268037055, "clip_ratio/low_min": 0.00015318627993110567, "clip_ratio/region_mean": 0.0029352151414059335, "completions/clipped_ratio": 1.0, "completions/max_length": 627.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 619.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 612.0, "completions/min_terminated_length": 0.0, "entropy": 2.977713331580162, "epoch": 24.571428571428573, "frac_reward_zero_std": 0.75, "grad_norm": 0.009802260436117649, "kl": 0.03284119220916182, "learning_rate": 2.6156127639784618e-06, "loss": -0.0011, "num_tokens": 27713824.0, "reward": 0.21875, "reward_std": 0.10888782143592834, "rewards/tree_correctness_reward/mean": 0.21875, "rewards/tree_correctness_reward/std": 0.4166666865348816, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5067130923271179, "sampling/importance_sampling_ratio/min": 1.1056055867417628e-34, "sampling/sampling_logp_difference/max": 78.1875, "sampling/sampling_logp_difference/mean": 7.164061546325684, "step": 344 }, { "clip_ratio/high_max": 0.001518939228844829, "clip_ratio/high_mean": 0.0007594696144224145, "clip_ratio/low_mean": 0.0009477620005782228, "clip_ratio/low_min": 0.0003092463448410854, "clip_ratio/region_mean": 0.0017072316022677114, "completions/clipped_ratio": 1.0, "completions/max_length": 838.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 784.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 731.0, "completions/min_terminated_length": 0.0, "entropy": 3.928545966744423, "epoch": 24.642857142857142, "frac_reward_zero_std": 0.75, "grad_norm": 0.009492919780313969, "kl": 0.02797161415219307, "learning_rate": 2.52620647286512e-06, "loss": -0.0013, "num_tokens": 27796160.0, "reward": 0.0625, "reward_std": 0.10888782143592834, "rewards/tree_correctness_reward/mean": 0.0625, "rewards/tree_correctness_reward/std": 0.24397502839565277, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.44534832239151, "sampling/importance_sampling_ratio/min": 9.756933719611155e-35, "sampling/sampling_logp_difference/max": 78.3125, "sampling/sampling_logp_difference/mean": 7.9555768966674805, "step": 345 }, { "clip_ratio/high_max": 0.003501463041175157, "clip_ratio/high_mean": 0.001971192381461151, "clip_ratio/low_mean": 0.001088131386495661, "clip_ratio/low_min": 0.00030637255986221135, "clip_ratio/region_mean": 0.003059323767956812, "completions/clipped_ratio": 1.0, "completions/max_length": 714.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 707.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 701.0, "completions/min_terminated_length": 0.0, "entropy": 3.205912619829178, "epoch": 24.714285714285715, "frac_reward_zero_std": 0.625, "grad_norm": 0.015560809522867203, "kl": 0.03514812421053648, "learning_rate": 2.4382735466132565e-06, "loss": 0.0017, "num_tokens": 27873568.0, "reward": 0.375, "reward_std": 0.16675157845020294, "rewards/tree_correctness_reward/mean": 0.375, "rewards/tree_correctness_reward/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5119877457618713, "sampling/importance_sampling_ratio/min": 5.1709859456893004e-36, "sampling/sampling_logp_difference/max": 81.25, "sampling/sampling_logp_difference/mean": 7.051201820373535, "step": 346 }, { "clip_ratio/high_max": 0.0029404978413367644, "clip_ratio/high_mean": 0.0014702489206683822, "clip_ratio/low_mean": 0.0013701175994356163, "clip_ratio/low_min": 0.0002367424312978983, "clip_ratio/region_mean": 0.0028403665201039985, "completions/clipped_ratio": 1.0, "completions/max_length": 709.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 618.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 528.0, "completions/min_terminated_length": 0.0, "entropy": 2.3151541501283646, "epoch": 24.785714285714285, "frac_reward_zero_std": 0.625, "grad_norm": 0.034907467663288116, "kl": 0.02927974017802626, "learning_rate": 2.351819750038828e-06, "loss": 0.0004, "num_tokens": 27945280.0, "reward": 0.140625, "reward_std": 0.13258251547813416, "rewards/tree_correctness_reward/mean": 0.140625, "rewards/tree_correctness_reward/std": 0.3503824472427368, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6083064079284668, "sampling/importance_sampling_ratio/min": 6.772654783300765e-34, "sampling/sampling_logp_difference/max": 76.375, "sampling/sampling_logp_difference/mean": 5.765897274017334, "step": 347 }, { "clip_ratio/high_max": 0.002867879666155204, "clip_ratio/high_mean": 0.001433939833077602, "clip_ratio/low_mean": 6.592826684936881e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001499868099926971, "completions/clipped_ratio": 1.0, "completions/max_length": 948.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 891.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 834.0, "completions/min_terminated_length": 0.0, "entropy": 4.050565630197525, "epoch": 24.857142857142858, "frac_reward_zero_std": 0.875, "grad_norm": 0.010816297493875027, "kl": 0.03286199190188199, "learning_rate": 2.2668507509871954e-06, "loss": 0.0002, "num_tokens": 28034432.0, "reward": 0.109375, "reward_std": 0.04419417306780815, "rewards/tree_correctness_reward/mean": 0.109375, "rewards/tree_correctness_reward/std": 0.3145764470100403, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.46095389127731323, "sampling/importance_sampling_ratio/min": 1.5262413857920149e-33, "sampling/sampling_logp_difference/max": 75.5625, "sampling/sampling_logp_difference/mean": 7.722670555114746, "step": 348 }, { "clip_ratio/high_max": 0.0015408320759888738, "clip_ratio/high_mean": 0.0007704160379944369, "clip_ratio/low_mean": 0.00014445300621446222, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0009148690442088991, "completions/clipped_ratio": 1.0, "completions/max_length": 649.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 535.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 422.0, "completions/min_terminated_length": 0.0, "entropy": 2.025471419095993, "epoch": 24.928571428571427, "frac_reward_zero_std": 0.875, "grad_norm": 0.004489051643759012, "kl": 0.03640926326625049, "learning_rate": 2.183372119961499e-06, "loss": -0.0016, "num_tokens": 28100832.0, "reward": 0.09375, "reward_std": 0.0578637570142746, "rewards/tree_correctness_reward/mean": 0.09375, "rewards/tree_correctness_reward/std": 0.29378482699394226, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6393419504165649, "sampling/importance_sampling_ratio/min": 1.3336148713971936e-34, "sampling/sampling_logp_difference/max": 78.0, "sampling/sampling_logp_difference/mean": 5.402565002441406, "step": 349 }, { "clip_ratio/high_max": 0.0032608695764793083, "clip_ratio/high_mean": 0.0016304347882396542, "clip_ratio/low_mean": 0.003034420315088937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004664855110604549, "completions/clipped_ratio": 1.0, "completions/max_length": 817.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 581.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 345.0, "completions/min_terminated_length": 0.0, "entropy": 1.972697727382183, "epoch": 25.0, "frac_reward_zero_std": 0.625, "grad_norm": 0.012395664118230343, "kl": 0.05201729363761842, "learning_rate": 2.101389329757478e-06, "loss": -0.0003, "num_tokens": 28170144.0, "reward": 0.078125, "reward_std": 0.1530819982290268, "rewards/tree_correctness_reward/mean": 0.078125, "rewards/tree_correctness_reward/std": 0.27048972249031067, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6589673161506653, "sampling/importance_sampling_ratio/min": 2.3405667503630854e-34, "sampling/sampling_logp_difference/max": 77.4375, "sampling/sampling_logp_difference/mean": 5.170504570007324, "step": 350 }, { "epoch": 25.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 470.08, "eval_completions/max_terminated_length": 0.0, "eval_completions/mean_length": 433.24, "eval_completions/mean_terminated_length": 0.0, "eval_completions/min_length": 396.4, "eval_completions/min_terminated_length": 0.0, "eval_entropy": 0.8604605835676193, "eval_frac_reward_zero_std": 0.96, "eval_kl": 0.059366118311882016, "eval_loss": 0.0002705826482269913, "eval_num_tokens": 28170144.0, "eval_reward": 0.045, "eval_reward_std": 0.014142135381698609, "eval_rewards/tree_correctness_reward/mean": 0.045, "eval_rewards/tree_correctness_reward/std": 0.014142135381698609, "eval_runtime": 142.7397, "eval_samples_per_second": 0.175, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 0.8174767971038819, "eval_sampling/importance_sampling_ratio/min": 8.25803144792373e-31, "eval_sampling/sampling_logp_difference/max": 72.24, "eval_sampling/sampling_logp_difference/mean": 2.9412149667739866, "eval_steps_per_second": 0.028, "step": 350 }, { "clip_ratio/high_max": 0.002209596015745774, "clip_ratio/high_mean": 0.001104798007872887, "clip_ratio/low_mean": 0.000405844155466184, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001510642163339071, "completions/clipped_ratio": 1.0, "completions/max_length": 693.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 665.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 637.0, "completions/min_terminated_length": 0.0, "entropy": 3.258365735411644, "epoch": 25.071428571428573, "frac_reward_zero_std": 0.875, "grad_norm": 0.005559945944696665, "kl": 0.03061885794159025, "learning_rate": 2.020907755104698e-06, "loss": 0.0018, "num_tokens": 28244832.0, "reward": 0.09375, "reward_std": 0.0578637570142746, "rewards/tree_correctness_reward/mean": 0.09375, "rewards/tree_correctness_reward/std": 0.29378482699394226, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5082234740257263, "sampling/importance_sampling_ratio/min": 2.0451645895225563e-35, "sampling/sampling_logp_difference/max": 79.875, "sampling/sampling_logp_difference/mean": 7.094335079193115, "step": 351 }, { "clip_ratio/high_max": 0.002668539294973016, "clip_ratio/high_mean": 0.001334269647486508, "clip_ratio/low_mean": 0.0020014045003335923, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0033356741623720154, "completions/clipped_ratio": 1.0, "completions/max_length": 616.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 530.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 445.0, "completions/min_terminated_length": 0.0, "entropy": 2.1778449714183807, "epoch": 25.142857142857142, "frac_reward_zero_std": 0.75, "grad_norm": 0.011449098587036133, "kl": 0.05449135648086667, "learning_rate": 1.9419326723141534e-06, "loss": 0.0005, "num_tokens": 28310912.0, "reward": 0.09375, "reward_std": 0.10888782143592834, "rewards/tree_correctness_reward/mean": 0.09375, "rewards/tree_correctness_reward/std": 0.29378482699394226, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6080831289291382, "sampling/importance_sampling_ratio/min": 5.504485832522232e-36, "sampling/sampling_logp_difference/max": 81.1875, "sampling/sampling_logp_difference/mean": 5.803290367126465, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1362.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 677.0, "completions/min_terminated_length": 0.0, "entropy": 4.467976793646812, "epoch": 25.214285714285715, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005359721835702658, "kl": 0.030531332013197243, "learning_rate": 1.864469258932397e-06, "loss": 0.0, "num_tokens": 28430240.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.39048439264297485, "sampling/importance_sampling_ratio/min": 1.7123953114587469e-34, "sampling/sampling_logp_difference/max": 77.75, "sampling/sampling_logp_difference/mean": 9.016610145568848, "step": 353 }, { "clip_ratio/high_max": 0.005239153921138495, "clip_ratio/high_mean": 0.0027302356029395014, "clip_ratio/low_mean": 0.001792943206964992, "clip_ratio/low_min": 0.00037563827936537564, "clip_ratio/region_mean": 0.004523178809904493, "completions/clipped_ratio": 1.0, "completions/max_length": 706.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 555.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 405.0, "completions/min_terminated_length": 0.0, "entropy": 2.0136357843875885, "epoch": 25.285714285714285, "frac_reward_zero_std": 0.625, "grad_norm": 0.01485985703766346, "kl": 0.03807677293661982, "learning_rate": 1.7885225934020588e-06, "loss": 0.0019, "num_tokens": 28497920.0, "reward": 0.1875, "reward_std": 0.1825428307056427, "rewards/tree_correctness_reward/mean": 0.1875, "rewards/tree_correctness_reward/std": 0.39339789748191833, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6534005999565125, "sampling/importance_sampling_ratio/min": 3.2632478433947105e-32, "sampling/sampling_logp_difference/max": 72.5, "sampling/sampling_logp_difference/mean": 5.195428848266602, "step": 354 }, { "clip_ratio/high_max": 0.0024561485261074267, "clip_ratio/high_mean": 0.0012280742630537134, "clip_ratio/low_mean": 0.0021421114724944346, "clip_ratio/low_min": 0.0007411162878270261, "clip_ratio/region_mean": 0.0033701857610139996, "completions/clipped_ratio": 1.0, "completions/max_length": 738.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 709.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 681.0, "completions/min_terminated_length": 0.0, "entropy": 3.0162560269236565, "epoch": 25.357142857142858, "frac_reward_zero_std": 0.625, "grad_norm": 0.012989602982997894, "kl": 0.028844914631918073, "learning_rate": 1.7140976547289438e-06, "loss": 0.0035, "num_tokens": 28575456.0, "reward": 0.125, "reward_std": 0.16675157845020294, "rewards/tree_correctness_reward/mean": 0.125, "rewards/tree_correctness_reward/std": 0.3333333432674408, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5385817289352417, "sampling/importance_sampling_ratio/min": 3.625141007634431e-34, "sampling/sampling_logp_difference/max": 77.0, "sampling/sampling_logp_difference/mean": 6.6720662117004395, "step": 355 }, { "clip_ratio/high_max": 0.003594921210606117, "clip_ratio/high_mean": 0.002006986953347223, "clip_ratio/low_mean": 0.0023789536135154776, "clip_ratio/low_min": 0.0005719714536098763, "clip_ratio/region_mean": 0.004385940577776637, "completions/clipped_ratio": 1.0, "completions/max_length": 890.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 781.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 673.0, "completions/min_terminated_length": 0.0, "entropy": 3.8792046159505844, "epoch": 25.428571428571427, "frac_reward_zero_std": 0.5, "grad_norm": 0.019072920083999634, "kl": 0.028978185961022973, "learning_rate": 1.6411993221555927e-06, "loss": 0.0027, "num_tokens": 28657600.0, "reward": 0.25, "reward_std": 0.23356688022613525, "rewards/tree_correctness_reward/mean": 0.25, "rewards/tree_correctness_reward/std": 0.4364357888698578, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.4479881525039673, "sampling/importance_sampling_ratio/min": 1.6246755378554462e-33, "sampling/sampling_logp_difference/max": 75.5, "sampling/sampling_logp_difference/mean": 7.903661727905273, "step": 356 }, { "clip_ratio/high_max": 0.006490161511464976, "clip_ratio/high_mean": 0.003626870362495538, "clip_ratio/low_mean": 0.0021695470277336426, "clip_ratio/low_min": 0.00014302058843895793, "clip_ratio/region_mean": 0.005796417368401308, "completions/clipped_ratio": 1.0, "completions/max_length": 683.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 560.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 437.0, "completions/min_terminated_length": 0.0, "entropy": 2.3061829581856728, "epoch": 25.5, "frac_reward_zero_std": 0.5, "grad_norm": 0.01406443864107132, "kl": 0.02828279708046466, "learning_rate": 1.5698323748414124e-06, "loss": 0.0001, "num_tokens": 28725568.0, "reward": 0.28125, "reward_std": 0.2177756428718567, "rewards/tree_correctness_reward/mean": 0.28125, "rewards/tree_correctness_reward/std": 0.4531635046005249, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5963047742843628, "sampling/importance_sampling_ratio/min": 1.511183624582218e-34, "sampling/sampling_logp_difference/max": 77.875, "sampling/sampling_logp_difference/mean": 5.9892377853393555, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 898.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 823.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 748.0, "completions/min_terminated_length": 0.0, "entropy": 4.166168466210365, "epoch": 25.571428571428573, "frac_reward_zero_std": 1.0, "grad_norm": 0.00024534878320991993, "kl": 0.02678728278260678, "learning_rate": 1.5000014915493467e-06, "loss": 0.0, "num_tokens": 28810368.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.4312209486961365, "sampling/importance_sampling_ratio/min": 4.148754129619672e-33, "sampling/sampling_logp_difference/max": 74.5625, "sampling/sampling_logp_difference/mean": 8.141897201538086, "step": 358 }, { "clip_ratio/high_max": 0.0011017628603440244, "clip_ratio/high_mean": 0.0005508814301720122, "clip_ratio/low_mean": 0.0004256811043887865, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0009765625327418093, "completions/clipped_ratio": 1.0, "completions/max_length": 624.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 624.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 624.0, "completions/min_terminated_length": 0.0, "entropy": 1.7865494415163994, "epoch": 25.642857142857142, "frac_reward_zero_std": 0.875, "grad_norm": 0.005844090133905411, "kl": 0.040602551540359855, "learning_rate": 1.4317112503391432e-06, "loss": -0.0003, "num_tokens": 28882432.0, "reward": 0.0625, "reward_std": 0.06681530922651291, "rewards/tree_correctness_reward/mean": 0.0625, "rewards/tree_correctness_reward/std": 0.24397502839565277, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6983903646469116, "sampling/importance_sampling_ratio/min": 9.660673761439827e-36, "sampling/sampling_logp_difference/max": 80.625, "sampling/sampling_logp_difference/mean": 4.493856430053711, "step": 359 }, { "clip_ratio/high_max": 0.0012562813935801387, "clip_ratio/high_mean": 0.0006281406967900693, "clip_ratio/low_mean": 0.0004711055262305308, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010992462302965578, "completions/clipped_ratio": 1.0, "completions/max_length": 796.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 757.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 719.0, "completions/min_terminated_length": 0.0, "entropy": 3.0052345395088196, "epoch": 25.714285714285715, "frac_reward_zero_std": 0.875, "grad_norm": 0.01291694026440382, "kl": 0.03113581147044897, "learning_rate": 1.3649661282672476e-06, "loss": 0.0, "num_tokens": 28963040.0, "reward": 0.0625, "reward_std": 0.06681530922651291, "rewards/tree_correctness_reward/mean": 0.0625, "rewards/tree_correctness_reward/std": 0.24397502839565277, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5675064325332642, "sampling/importance_sampling_ratio/min": 1.7123953114587469e-34, "sampling/sampling_logp_difference/max": 77.75, "sampling/sampling_logp_difference/mean": 6.252553939819336, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 780.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 656.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 532.0, "completions/min_terminated_length": 0.0, "entropy": 2.7273360267281532, "epoch": 25.785714285714285, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004347563080955297, "kl": 0.03494859323836863, "learning_rate": 1.2997705010932393e-06, "loss": 0.0, "num_tokens": 29037152.0, "reward": 0.125, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.125, "rewards/tree_correctness_reward/std": 0.3333333432674408, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5564488172531128, "sampling/importance_sampling_ratio/min": 1.2652986294762563e-33, "sampling/sampling_logp_difference/max": 75.75, "sampling/sampling_logp_difference/mean": 6.479736328125, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 917.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 820.0, "completions/min_terminated_length": 0.0, "entropy": 3.9363171458244324, "epoch": 25.857142857142858, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006535943248309195, "kl": 0.035571441636420786, "learning_rate": 1.2361286429929952e-06, "loss": 0.0, "num_tokens": 29128000.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.48305249214172363, "sampling/importance_sampling_ratio/min": 3.1363615599988335e-36, "sampling/sampling_logp_difference/max": 81.75, "sampling/sampling_logp_difference/mean": 7.404021739959717, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 559.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 534.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 509.0, "completions/min_terminated_length": 0.0, "entropy": 2.098852977156639, "epoch": 25.928571428571427, "frac_reward_zero_std": 1.0, "grad_norm": 0.00035536312498152256, "kl": 0.032702949712984264, "learning_rate": 1.1740447262784781e-06, "loss": 0.0, "num_tokens": 29194304.0, "reward": 0.125, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.125, "rewards/tree_correctness_reward/std": 0.3333333432674408, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6041094064712524, "sampling/importance_sampling_ratio/min": 4.244603898090227e-37, "sampling/sampling_logp_difference/max": 83.75, "sampling/sampling_logp_difference/mean": 5.88153600692749, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 786.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 739.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 692.0, "completions/min_terminated_length": 0.0, "entropy": 3.4196005314588547, "epoch": 26.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005046993028372526, "kl": 0.03356144158169627, "learning_rate": 1.1135228211241828e-06, "loss": 0.0, "num_tokens": 29273728.0, "reward": 0.25, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.25, "rewards/tree_correctness_reward/std": 0.4364357888698578, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.48920661211013794, "sampling/importance_sampling_ratio/min": 5.222510753034111e-35, "sampling/sampling_logp_difference/max": 78.9375, "sampling/sampling_logp_difference/mean": 7.358584403991699, "step": 364 }, { "clip_ratio/high_max": 0.0006923929104232229, "clip_ratio/high_mean": 0.00034619645521161146, "clip_ratio/low_mean": 0.000392355981603032, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0007385524368146434, "completions/clipped_ratio": 1.0, "completions/max_length": 820.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 748.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 677.0, "completions/min_terminated_length": 0.0, "entropy": 3.1508950144052505, "epoch": 26.071428571428573, "frac_reward_zero_std": 0.875, "grad_norm": 0.006979056634008884, "kl": 0.03369669336825609, "learning_rate": 1.0545668953003241e-06, "loss": -0.0011, "num_tokens": 29353760.0, "reward": 0.046875, "reward_std": 0.0646936446428299, "rewards/tree_correctness_reward/mean": 0.046875, "rewards/tree_correctness_reward/std": 0.21304203569889069, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.542253315448761, "sampling/importance_sampling_ratio/min": 2.3405667503630854e-34, "sampling/sampling_logp_difference/max": 77.4375, "sampling/sampling_logp_difference/mean": 6.607968807220459, "step": 365 }, { "clip_ratio/high_max": 0.002056451565294992, "clip_ratio/high_mean": 0.001028225782647496, "clip_ratio/low_mean": 0.0002217741985077737, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0012499999938881956, "completions/clipped_ratio": 1.0, "completions/max_length": 775.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 754.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 733.0, "completions/min_terminated_length": 0.0, "entropy": 3.2356587648391724, "epoch": 26.142857142857142, "frac_reward_zero_std": 0.875, "grad_norm": 0.007185130380094051, "kl": 0.03757871745619923, "learning_rate": 9.97180813912682e-07, "loss": 0.0021, "num_tokens": 29434144.0, "reward": 0.078125, "reward_std": 0.0646936446428299, "rewards/tree_correctness_reward/mean": 0.078125, "rewards/tree_correctness_reward/std": 0.27048972249031067, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5350773334503174, "sampling/importance_sampling_ratio/min": 2.1987591985594636e-34, "sampling/sampling_logp_difference/max": 77.5, "sampling/sampling_logp_difference/mean": 6.699105739593506, "step": 366 }, { "clip_ratio/high_max": 0.004301075343391858, "clip_ratio/high_mean": 0.002150537671695929, "clip_ratio/low_mean": 0.0007392473125946708, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002889784977014642, "completions/clipped_ratio": 1.0, "completions/max_length": 930.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 805.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 680.0, "completions/min_terminated_length": 0.0, "entropy": 3.9962343722581863, "epoch": 26.214285714285715, "frac_reward_zero_std": 0.75, "grad_norm": 0.007993404753506184, "kl": 0.029904949944466352, "learning_rate": 9.413683391492456e-07, "loss": 0.0025, "num_tokens": 29517792.0, "reward": 0.171875, "reward_std": 0.11100947856903076, "rewards/tree_correctness_reward/mean": 0.171875, "rewards/tree_correctness_reward/std": 0.38025420904159546, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.43934065103530884, "sampling/importance_sampling_ratio/min": 4.107822865311063e-34, "sampling/sampling_logp_difference/max": 76.875, "sampling/sampling_logp_difference/mean": 8.043020248413086, "step": 367 }, { "clip_ratio/high_max": 0.0009458672866458073, "clip_ratio/high_mean": 0.00047293364332290366, "clip_ratio/low_mean": 0.0003274156042607501, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0008003492548596114, "completions/clipped_ratio": 1.0, "completions/max_length": 859.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 757.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 655.0, "completions/min_terminated_length": 0.0, "entropy": 3.436677947640419, "epoch": 26.285714285714285, "frac_reward_zero_std": 0.875, "grad_norm": 0.010743551887571812, "kl": 0.029503861675038934, "learning_rate": 8.871331300335323e-07, "loss": 0.0009, "num_tokens": 29598368.0, "reward": 0.203125, "reward_std": 0.0646936446428299, "rewards/tree_correctness_reward/mean": 0.203125, "rewards/tree_correctness_reward/std": 0.40550529956817627, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5015701651573181, "sampling/importance_sampling_ratio/min": 2.4915206584065623e-34, "sampling/sampling_logp_difference/max": 77.375, "sampling/sampling_logp_difference/mean": 7.175085067749023, "step": 368 }, { "clip_ratio/high_max": 0.0077698087188764475, "clip_ratio/high_mean": 0.003971230336901499, "clip_ratio/low_mean": 0.004038451348606031, "clip_ratio/low_min": 0.0017104805738199502, "clip_ratio/region_mean": 0.008009681663679658, "completions/clipped_ratio": 1.0, "completions/max_length": 366.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 364.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 362.0, "completions/min_terminated_length": 0.0, "entropy": 0.8892752826213837, "epoch": 26.357142857142858, "frac_reward_zero_std": 0.625, "grad_norm": 0.012944036163389683, "kl": 0.04952238011173904, "learning_rate": 8.344787421847217e-07, "loss": 0.0009, "num_tokens": 29653792.0, "reward": 0.28125, "reward_std": 0.17570312321186066, "rewards/tree_correctness_reward/mean": 0.28125, "rewards/tree_correctness_reward/std": 0.4531635046005249, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.723290205001831, "sampling/importance_sampling_ratio/min": 7.523739969508069e-36, "sampling/sampling_logp_difference/max": 80.875, "sampling/sampling_logp_difference/mean": 4.487146377563477, "step": 369 }, { "clip_ratio/high_max": 0.0017468789010308683, "clip_ratio/high_mean": 0.0008734394505154341, "clip_ratio/low_mean": 0.001957816944923252, "clip_ratio/low_min": 0.0006399232224794105, "clip_ratio/region_mean": 0.0028312563881627284, "completions/clipped_ratio": 1.0, "completions/max_length": 760.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 741.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 723.0, "completions/min_terminated_length": 0.0, "entropy": 3.559417486190796, "epoch": 26.428571428571427, "frac_reward_zero_std": 0.75, "grad_norm": 0.032933011651039124, "kl": 0.03421620489098132, "learning_rate": 7.834086275845587e-07, "loss": -0.0012, "num_tokens": 29733376.0, "reward": 0.078125, "reward_std": 0.11100947856903076, "rewards/tree_correctness_reward/mean": 0.078125, "rewards/tree_correctness_reward/std": 0.27048972249031067, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.47877657413482666, "sampling/importance_sampling_ratio/min": 2.5163465940007e-33, "sampling/sampling_logp_difference/max": 75.0625, "sampling/sampling_logp_difference/mean": 7.496694564819336, "step": 370 }, { "clip_ratio/high_max": 0.00040760869160294533, "clip_ratio/high_mean": 0.00020380434580147266, "clip_ratio/low_mean": 0.0004585597780533135, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0006623641238547862, "completions/clipped_ratio": 1.0, "completions/max_length": 920.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 753.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 587.0, "completions/min_terminated_length": 0.0, "entropy": 3.5352616906166077, "epoch": 26.5, "frac_reward_zero_std": 0.875, "grad_norm": 0.00902850367128849, "kl": 0.03158037440152839, "learning_rate": 7.339261343510206e-07, "loss": 0.0027, "num_tokens": 29813728.0, "reward": 0.171875, "reward_std": 0.0646936446428299, "rewards/tree_correctness_reward/mean": 0.171875, "rewards/tree_correctness_reward/std": 0.38025420904159546, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.4817163944244385, "sampling/importance_sampling_ratio/min": 1.0489692816875613e-33, "sampling/sampling_logp_difference/max": 75.9375, "sampling/sampling_logp_difference/mean": 7.445906639099121, "step": 371 }, { "clip_ratio/high_max": 0.00012335526116658002, "clip_ratio/high_mean": 6.167763058329001e-05, "clip_ratio/low_mean": 0.0006990131514612585, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0007606907820445485, "completions/clipped_ratio": 1.0, "completions/max_length": 760.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 640.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 521.0, "completions/min_terminated_length": 0.0, "entropy": 2.747797891497612, "epoch": 26.571428571428573, "frac_reward_zero_std": 0.875, "grad_norm": 0.010381992906332016, "kl": 0.03316591528709978, "learning_rate": 6.860345065188512e-07, "loss": 0.0006, "num_tokens": 29886848.0, "reward": 0.015625, "reward_std": 0.04419417306780815, "rewards/tree_correctness_reward/mean": 0.015625, "rewards/tree_correctness_reward/std": 0.125, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5618965029716492, "sampling/importance_sampling_ratio/min": 2.1987591985594636e-34, "sampling/sampling_logp_difference/max": 77.5, "sampling/sampling_logp_difference/mean": 6.401259422302246, "step": 372 }, { "clip_ratio/high_max": 0.0038684719074808527, "clip_ratio/high_mean": 0.0019342359537404263, "clip_ratio/low_mean": 9.066730854101479e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020249032550054835, "completions/clipped_ratio": 1.0, "completions/max_length": 650.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 583.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 517.0, "completions/min_terminated_length": 0.0, "entropy": 2.3472982048988342, "epoch": 26.642857142857142, "frac_reward_zero_std": 0.875, "grad_norm": 0.010912664234638214, "kl": 0.04039017343893647, "learning_rate": 6.397368838268497e-07, "loss": 0.0006, "num_tokens": 29956320.0, "reward": 0.109375, "reward_std": 0.04419417306780815, "rewards/tree_correctness_reward/mean": 0.109375, "rewards/tree_correctness_reward/std": 0.3145764470100403, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6107279658317566, "sampling/importance_sampling_ratio/min": 3.783174026200654e-36, "sampling/sampling_logp_difference/max": 81.5625, "sampling/sampling_logp_difference/mean": 5.742642402648926, "step": 373 }, { "clip_ratio/high_max": 0.001462305044697132, "clip_ratio/high_mean": 0.000731152522348566, "clip_ratio/low_mean": 0.000487435017930693, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0012185875348222908, "completions/clipped_ratio": 1.0, "completions/max_length": 766.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 671.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 577.0, "completions/min_terminated_length": 0.0, "entropy": 2.9075578302145004, "epoch": 26.714285714285715, "frac_reward_zero_std": 0.875, "grad_norm": 0.008210618048906326, "kl": 0.032968637882731855, "learning_rate": 5.950363015120503e-07, "loss": 0.0003, "num_tokens": 30031424.0, "reward": 0.0625, "reward_std": 0.06681530922651291, "rewards/tree_correctness_reward/mean": 0.0625, "rewards/tree_correctness_reward/std": 0.24397502839565277, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.550100564956665, "sampling/importance_sampling_ratio/min": 7.674422876742102e-34, "sampling/sampling_logp_difference/max": 76.25, "sampling/sampling_logp_difference/mean": 6.530878067016602, "step": 374 }, { "clip_ratio/high_max": 0.0006531350663863122, "clip_ratio/high_mean": 0.0003265675331931561, "clip_ratio/low_mean": 0.0002512057835701853, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0005777733167633414, "completions/clipped_ratio": 1.0, "completions/max_length": 622.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 571.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 521.0, "completions/min_terminated_length": 0.0, "entropy": 1.7613919824361801, "epoch": 26.785714285714285, "frac_reward_zero_std": 0.875, "grad_norm": 0.01607496850192547, "kl": 0.040298212436027825, "learning_rate": 5.519356901107358e-07, "loss": 0.0007, "num_tokens": 30100128.0, "reward": 0.015625, "reward_std": 0.04419417306780815, "rewards/tree_correctness_reward/mean": 0.015625, "rewards/tree_correctness_reward/std": 0.125, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6780560612678528, "sampling/importance_sampling_ratio/min": 5.274548762960186e-34, "sampling/sampling_logp_difference/max": 76.625, "sampling/sampling_logp_difference/mean": 4.820460319519043, "step": 375 }, { "clip_ratio/high_max": 0.005523989886569325, "clip_ratio/high_mean": 0.0027619949432846624, "clip_ratio/low_mean": 2.630471317388583e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0027882996564585483, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1321.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 594.0, "completions/min_terminated_length": 0.0, "entropy": 4.119924679398537, "epoch": 26.857142857142858, "frac_reward_zero_std": 0.875, "grad_norm": 0.0058745830319821835, "kl": 0.03999795438721776, "learning_rate": 5.104378752663008e-07, "loss": -0.0002, "num_tokens": 30216800.0, "reward": 0.109375, "reward_std": 0.04419417306780815, "rewards/tree_correctness_reward/mean": 0.109375, "rewards/tree_correctness_reward/std": 0.3145764470100403, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.42686283588409424, "sampling/importance_sampling_ratio/min": 2.9756967944000553e-35, "sampling/sampling_logp_difference/max": 79.5, "sampling/sampling_logp_difference/mean": 8.626762390136719, "step": 376 }, { "clip_ratio/high_max": 0.0034643846593098715, "clip_ratio/high_mean": 0.0020059231828781776, "clip_ratio/low_mean": 0.0014751898706890643, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034811130608431995, "completions/clipped_ratio": 1.0, "completions/max_length": 962.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 848.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 735.0, "completions/min_terminated_length": 0.0, "entropy": 3.252412900328636, "epoch": 26.928571428571427, "frac_reward_zero_std": 0.75, "grad_norm": 0.012022046372294426, "kl": 0.03955165296792984, "learning_rate": 4.7054557754402373e-07, "loss": -0.001, "num_tokens": 30303232.0, "reward": 0.1875, "reward_std": 0.1157275140285492, "rewards/tree_correctness_reward/mean": 0.1875, "rewards/tree_correctness_reward/std": 0.39339789748191833, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5512263774871826, "sampling/importance_sampling_ratio/min": 1.822835416400275e-34, "sampling/sampling_logp_difference/max": 77.6875, "sampling/sampling_logp_difference/mean": 6.467156410217285, "step": 377 }, { "clip_ratio/high_max": 0.002268145195557736, "clip_ratio/high_mean": 0.001134072597778868, "clip_ratio/low_mean": 0.0007770497322781011, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0019111223155050538, "completions/clipped_ratio": 1.0, "completions/max_length": 744.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 726.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 709.0, "completions/min_terminated_length": 0.0, "entropy": 3.5655471086502075, "epoch": 27.0, "frac_reward_zero_std": 0.875, "grad_norm": 0.00933501310646534, "kl": 0.028229808202013373, "learning_rate": 4.32261412252688e-07, "loss": -0.0009, "num_tokens": 30381856.0, "reward": 0.0625, "reward_std": 0.06681530922651291, "rewards/tree_correctness_reward/mean": 0.0625, "rewards/tree_correctness_reward/std": 0.24397502839565277, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.4834950566291809, "sampling/importance_sampling_ratio/min": 1.511183624582218e-34, "sampling/sampling_logp_difference/max": 77.875, "sampling/sampling_logp_difference/mean": 7.427951812744141, "step": 378 }, { "clip_ratio/high_max": 0.0016708815237507224, "clip_ratio/high_mean": 0.0008354407618753612, "clip_ratio/low_mean": 0.0004515895852819085, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0012870303471572697, "completions/clipped_ratio": 1.0, "completions/max_length": 881.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 786.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 692.0, "completions/min_terminated_length": 0.0, "entropy": 3.5652935206890106, "epoch": 27.071428571428573, "frac_reward_zero_std": 0.875, "grad_norm": 0.005314526613801718, "kl": 0.03398803609889001, "learning_rate": 3.9558788927314407e-07, "loss": 0.0006, "num_tokens": 30464320.0, "reward": 0.0625, "reward_std": 0.06681530922651291, "rewards/tree_correctness_reward/mean": 0.0625, "rewards/tree_correctness_reward/std": 0.24397502839565277, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.49967968463897705, "sampling/importance_sampling_ratio/min": 9.165791500701421e-35, "sampling/sampling_logp_difference/max": 78.375, "sampling/sampling_logp_difference/mean": 7.211946487426758, "step": 379 }, { "clip_ratio/high_max": 0.005557081181905232, "clip_ratio/high_mean": 0.002899352453823667, "clip_ratio/low_mean": 0.002115725845214911, "clip_ratio/low_min": 5.122950824443251e-05, "clip_ratio/region_mean": 0.005015078255382832, "completions/clipped_ratio": 1.0, "completions/max_length": 610.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 499.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 388.0, "completions/min_terminated_length": 0.0, "entropy": 1.5797580741345882, "epoch": 27.142857142857142, "frac_reward_zero_std": 0.5, "grad_norm": 0.017306851223111153, "kl": 0.038341411971487105, "learning_rate": 3.605274128937464e-07, "loss": 0.0015, "num_tokens": 30528384.0, "reward": 0.3125, "reward_std": 0.2130674123764038, "rewards/tree_correctness_reward/mean": 0.3125, "rewards/tree_correctness_reward/std": 0.467176616191864, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6933298707008362, "sampling/importance_sampling_ratio/min": 2.1987591985594636e-34, "sampling/sampling_logp_difference/max": 77.5, "sampling/sampling_logp_difference/mean": 4.686048984527588, "step": 380 }, { "clip_ratio/high_max": 0.0036904197768308222, "clip_ratio/high_mean": 0.0021326517744455487, "clip_ratio/low_mean": 0.001151452015619725, "clip_ratio/low_min": 0.0003789424226852134, "clip_ratio/region_mean": 0.003284103768237401, "completions/clipped_ratio": 1.0, "completions/max_length": 813.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 657.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 502.0, "completions/min_terminated_length": 0.0, "entropy": 2.889097273349762, "epoch": 27.214285714285715, "frac_reward_zero_std": 0.625, "grad_norm": 0.016170253977179527, "kl": 0.03723262331914157, "learning_rate": 3.270822816527325e-07, "loss": -0.003, "num_tokens": 30602592.0, "reward": 0.171875, "reward_std": 0.1804211586713791, "rewards/tree_correctness_reward/mean": 0.171875, "rewards/tree_correctness_reward/std": 0.38025420904159546, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5488963723182678, "sampling/importance_sampling_ratio/min": 6.998167936122733e-37, "sampling/sampling_logp_difference/max": 83.25, "sampling/sampling_logp_difference/mean": 6.5954203605651855, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 657.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 629.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 602.0, "completions/min_terminated_length": 0.0, "entropy": 2.712170548737049, "epoch": 27.285714285714285, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004793900006916374, "kl": 0.03126904892269522, "learning_rate": 2.9525468818755455e-07, "loss": 0.0, "num_tokens": 30675008.0, "reward": 0.125, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.125, "rewards/tree_correctness_reward/std": 0.3333333432674408, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5632752180099487, "sampling/importance_sampling_ratio/min": 1.6787733175152708e-36, "sampling/sampling_logp_difference/max": 82.375, "sampling/sampling_logp_difference/mean": 6.37312126159668, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 863.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 758.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 653.0, "completions/min_terminated_length": 0.0, "entropy": 3.433161422610283, "epoch": 27.357142857142858, "frac_reward_zero_std": 1.0, "grad_norm": 0.0002974590170197189, "kl": 0.02995622286107391, "learning_rate": 2.650467190910999e-07, "loss": 0.0, "num_tokens": 30755648.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5027236938476562, "sampling/importance_sampling_ratio/min": 1.6246755378554462e-33, "sampling/sampling_logp_difference/max": 75.5, "sampling/sampling_logp_difference/mean": 7.143535614013672, "step": 383 }, { "clip_ratio/high_max": 0.0008195965638151392, "clip_ratio/high_mean": 0.0006418527555069886, "clip_ratio/low_mean": 0.002135449554771185, "clip_ratio/low_min": 0.00022163119865581393, "clip_ratio/region_mean": 0.002777302317554131, "completions/clipped_ratio": 1.0, "completions/max_length": 705.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 655.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 606.0, "completions/min_terminated_length": 0.0, "entropy": 3.0131986886262894, "epoch": 27.428571428571427, "frac_reward_zero_std": 0.75, "grad_norm": 0.0178351029753685, "kl": 0.03579407965298742, "learning_rate": 2.3646035477491723e-07, "loss": 0.0003, "num_tokens": 30829728.0, "reward": 0.046875, "reward_std": 0.10205793380737305, "rewards/tree_correctness_reward/mean": 0.046875, "rewards/tree_correctness_reward/std": 0.21304203569889069, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5158674716949463, "sampling/importance_sampling_ratio/min": 1.028373383751816e-35, "sampling/sampling_logp_difference/max": 80.5625, "sampling/sampling_logp_difference/mean": 7.010845184326172, "step": 384 }, { "clip_ratio/high_max": 0.002733785855525639, "clip_ratio/high_mean": 0.0013668929277628195, "clip_ratio/low_mean": 0.00023567119205836207, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016025641198211815, "completions/clipped_ratio": 1.0, "completions/max_length": 1030.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 846.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 663.0, "completions/min_terminated_length": 0.0, "entropy": 3.9464381486177444, "epoch": 27.5, "frac_reward_zero_std": 0.875, "grad_norm": 0.008013339713215828, "kl": 0.0361758922226727, "learning_rate": 2.094974693393731e-07, "loss": 0.0026, "num_tokens": 30916032.0, "reward": 0.09375, "reward_std": 0.0578637570142746, "rewards/tree_correctness_reward/mean": 0.09375, "rewards/tree_correctness_reward/std": 0.29378482699394226, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.44916850328445435, "sampling/importance_sampling_ratio/min": 6.036401529078488e-33, "sampling/sampling_logp_difference/max": 74.1875, "sampling/sampling_logp_difference/mean": 7.898123264312744, "step": 385 }, { "clip_ratio/high_max": 0.005819198602694087, "clip_ratio/high_mean": 0.0032084410850075074, "clip_ratio/low_mean": 0.0008801711701380555, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004088612236955669, "completions/clipped_ratio": 1.0, "completions/max_length": 1102.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 916.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 730.0, "completions/min_terminated_length": 0.0, "entropy": 4.595413371920586, "epoch": 27.571428571428573, "frac_reward_zero_std": 0.625, "grad_norm": 0.01331446971744299, "kl": 0.030657698749564588, "learning_rate": 1.841598304507891e-07, "loss": 0.0011, "num_tokens": 31006784.0, "reward": 0.25, "reward_std": 0.1462521106004715, "rewards/tree_correctness_reward/mean": 0.25, "rewards/tree_correctness_reward/std": 0.4364357888698578, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.3817736506462097, "sampling/importance_sampling_ratio/min": 3.439437695788472e-33, "sampling/sampling_logp_difference/max": 74.75, "sampling/sampling_logp_difference/mean": 8.810517311096191, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 862.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 848.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 834.0, "completions/min_terminated_length": 0.0, "entropy": 3.9101418554782867, "epoch": 27.642857142857142, "frac_reward_zero_std": 1.0, "grad_norm": 0.00030644756043329835, "kl": 0.02690782700665295, "learning_rate": 1.6044909922555974e-07, "loss": 0.0, "num_tokens": 31093184.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.4763064682483673, "sampling/importance_sampling_ratio/min": 2.3405667503630854e-34, "sampling/sampling_logp_difference/max": 77.4375, "sampling/sampling_logp_difference/mean": 7.482927322387695, "step": 387 }, { "clip_ratio/high_max": 0.00362809601938352, "clip_ratio/high_mean": 0.00181404800969176, "clip_ratio/low_mean": 4.83746116515249e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0018624226213432848, "completions/clipped_ratio": 1.0, "completions/max_length": 808.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 727.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 646.0, "completions/min_terminated_length": 0.0, "entropy": 3.226002424955368, "epoch": 27.714285714285715, "frac_reward_zero_std": 0.875, "grad_norm": 0.030719775706529617, "kl": 0.031397583428770304, "learning_rate": 1.383668301212393e-07, "loss": -0.0004, "num_tokens": 31171840.0, "reward": 0.109375, "reward_std": 0.04419417306780815, "rewards/tree_correctness_reward/mean": 0.109375, "rewards/tree_correctness_reward/std": 0.3145764470100403, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5217556953430176, "sampling/importance_sampling_ratio/min": 9.756933719611155e-35, "sampling/sampling_logp_difference/max": 78.3125, "sampling/sampling_logp_difference/mean": 6.895229339599609, "step": 388 }, { "clip_ratio/high_max": 0.0008218160655815154, "clip_ratio/high_mean": 0.0005268204986350611, "clip_ratio/low_mean": 0.0021088761313876603, "clip_ratio/low_min": 0.0009065749327419326, "clip_ratio/region_mean": 0.002635696637298679, "completions/clipped_ratio": 1.0, "completions/max_length": 698.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 686.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 674.0, "completions/min_terminated_length": 0.0, "entropy": 2.893871396780014, "epoch": 27.785714285714285, "frac_reward_zero_std": 0.625, "grad_norm": 0.014170842245221138, "kl": 0.03879878832958639, "learning_rate": 1.1791447083465134e-07, "loss": 0.0059, "num_tokens": 31247872.0, "reward": 0.078125, "reward_std": 0.15992169082164764, "rewards/tree_correctness_reward/mean": 0.078125, "rewards/tree_correctness_reward/std": 0.27048972249031067, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5599030256271362, "sampling/importance_sampling_ratio/min": 1.405619611404976e-35, "sampling/sampling_logp_difference/max": 80.25, "sampling/sampling_logp_difference/mean": 6.389625072479248, "step": 389 }, { "clip_ratio/high_max": 0.010551989689702168, "clip_ratio/high_mean": 0.0057252760452684015, "clip_ratio/low_mean": 0.005955422566330526, "clip_ratio/low_min": 0.0008985623135231435, "clip_ratio/region_mean": 0.011680698531563394, "completions/clipped_ratio": 1.0, "completions/max_length": 626.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 517.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 408.0, "completions/min_terminated_length": 0.0, "entropy": 1.6573655046522617, "epoch": 27.857142857142858, "frac_reward_zero_std": 0.25, "grad_norm": 0.017347579821944237, "kl": 0.04419349820818752, "learning_rate": 9.90933622069562e-08, "loss": 0.0023, "num_tokens": 31313088.0, "reward": 0.265625, "reward_std": 0.3356248140335083, "rewards/tree_correctness_reward/mean": 0.265625, "rewards/tree_correctness_reward/std": 0.44515693187713623, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6770005226135254, "sampling/importance_sampling_ratio/min": 3.199175604441272e-34, "sampling/sampling_logp_difference/max": 77.125, "sampling/sampling_logp_difference/mean": 4.891195297241211, "step": 390 }, { "clip_ratio/high_max": 0.000217921900912188, "clip_ratio/high_mean": 0.000108960950456094, "clip_ratio/low_mean": 0.0009588563334546052, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010678172839106992, "completions/clipped_ratio": 1.0, "completions/max_length": 717.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 690.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 664.0, "completions/min_terminated_length": 0.0, "entropy": 2.8585730344057083, "epoch": 27.928571428571427, "frac_reward_zero_std": 0.875, "grad_norm": 0.008073284290730953, "kl": 0.029020402114838362, "learning_rate": 8.190473813576572e-08, "loss": -0.0005, "num_tokens": 31389408.0, "reward": 0.015625, "reward_std": 0.04419417306780815, "rewards/tree_correctness_reward/mean": 0.015625, "rewards/tree_correctness_reward/std": 0.125, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.565218448638916, "sampling/importance_sampling_ratio/min": 1.6086464555616602e-34, "sampling/sampling_logp_difference/max": 77.8125, "sampling/sampling_logp_difference/mean": 6.296497344970703, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 845.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 817.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 790.0, "completions/min_terminated_length": 0.0, "entropy": 3.0055589973926544, "epoch": 28.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006146320374682546, "kl": 0.033835752634331584, "learning_rate": 6.634972549423857e-08, "loss": 0.0, "num_tokens": 31473856.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5839071273803711, "sampling/importance_sampling_ratio/min": 1.038620201872273e-34, "sampling/sampling_logp_difference/max": 78.25, "sampling/sampling_logp_difference/mean": 6.017183303833008, "step": 392 }, { "clip_ratio/high_max": 0.0002470355830155313, "clip_ratio/high_mean": 0.00012351779150776565, "clip_ratio/low_mean": 0.0006175889357109554, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0007411067272187211, "completions/clipped_ratio": 1.0, "completions/max_length": 841.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 673.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 506.0, "completions/min_terminated_length": 0.0, "entropy": 3.0374721065163612, "epoch": 28.071428571428573, "frac_reward_zero_std": 0.875, "grad_norm": 0.004725766833871603, "kl": 0.030147752142511308, "learning_rate": 5.242934405720879e-08, "loss": -0.0004, "num_tokens": 31549088.0, "reward": 0.265625, "reward_std": 0.04419417306780815, "rewards/tree_correctness_reward/mean": 0.265625, "rewards/tree_correctness_reward/std": 0.44515693187713623, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5273865461349487, "sampling/importance_sampling_ratio/min": 1.4524234374972437e-38, "sampling/sampling_logp_difference/max": 87.125, "sampling/sampling_logp_difference/mean": 6.85051155090332, "step": 393 }, { "clip_ratio/high_max": 0.0015289373804989737, "clip_ratio/high_mean": 0.0008033852554945042, "clip_ratio/low_mean": 0.0008279908161057392, "clip_ratio/low_min": 0.0001252504989679437, "clip_ratio/region_mean": 0.0016313760952471057, "completions/clipped_ratio": 1.0, "completions/max_length": 998.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 900.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 803.0, "completions/min_terminated_length": 0.0, "entropy": 4.192389488220215, "epoch": 28.142857142857142, "frac_reward_zero_std": 0.75, "grad_norm": 0.008556331507861614, "kl": 0.03314753097947687, "learning_rate": 4.01445064343281e-08, "loss": 0.0006, "num_tokens": 31638848.0, "reward": 0.09375, "reward_std": 0.1246790662407875, "rewards/tree_correctness_reward/mean": 0.09375, "rewards/tree_correctness_reward/std": 0.29378482699394226, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.44594383239746094, "sampling/importance_sampling_ratio/min": 1.511183624582218e-34, "sampling/sampling_logp_difference/max": 77.875, "sampling/sampling_logp_difference/mean": 7.915619850158691, "step": 394 }, { "clip_ratio/high_max": 0.006900200998643413, "clip_ratio/high_mean": 0.00383236346533522, "clip_ratio/low_mean": 0.004507960242335685, "clip_ratio/low_min": 0.002306851005414501, "clip_ratio/region_mean": 0.008340323605807498, "completions/clipped_ratio": 1.0, "completions/max_length": 648.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 487.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 327.0, "completions/min_terminated_length": 0.0, "entropy": 1.4175974112004042, "epoch": 28.214285714285715, "frac_reward_zero_std": 0.5, "grad_norm": 0.01834862120449543, "kl": 0.03848658688366413, "learning_rate": 2.9496018010233274e-08, "loss": -0.0036, "num_tokens": 31702176.0, "reward": 0.234375, "reward_std": 0.23144522309303284, "rewards/tree_correctness_reward/mean": 0.234375, "rewards/tree_correctness_reward/std": 0.42695629596710205, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7209559679031372, "sampling/importance_sampling_ratio/min": 1.6246755378554462e-33, "sampling/sampling_logp_difference/max": 75.5, "sampling/sampling_logp_difference/mean": 4.338961124420166, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1045.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 887.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 730.0, "completions/min_terminated_length": 0.0, "entropy": 3.4747135937213898, "epoch": 28.285714285714285, "frac_reward_zero_std": 1.0, "grad_norm": 0.00047250805073417723, "kl": 0.03634030360262841, "learning_rate": 2.048457689174943e-08, "loss": 0.0, "num_tokens": 31791104.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5257115960121155, "sampling/importance_sampling_ratio/min": 6.362320011142437e-34, "sampling/sampling_logp_difference/max": 76.4375, "sampling/sampling_logp_difference/mean": 6.809136390686035, "step": 396 }, { "clip_ratio/high_max": 0.0020683943439507857, "clip_ratio/high_mean": 0.0011239960222155787, "clip_ratio/low_mean": 0.0013030444642936345, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024270404974231496, "completions/clipped_ratio": 1.0, "completions/max_length": 654.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 588.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 522.0, "completions/min_terminated_length": 0.0, "entropy": 2.5548413917422295, "epoch": 28.357142857142858, "frac_reward_zero_std": 0.75, "grad_norm": 0.018732797354459763, "kl": 0.04221606825012714, "learning_rate": 1.3110773862126669e-08, "loss": 0.0006, "num_tokens": 31860864.0, "reward": 0.09375, "reward_std": 0.1293872892856598, "rewards/tree_correctness_reward/mean": 0.09375, "rewards/tree_correctness_reward/std": 0.29378482699394226, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5737744569778442, "sampling/importance_sampling_ratio/min": 5.917879946392805e-35, "sampling/sampling_logp_difference/max": 78.8125, "sampling/sampling_logp_difference/mean": 6.24160099029541, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 839.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 794.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 749.0, "completions/min_terminated_length": 0.0, "entropy": 3.503511145710945, "epoch": 28.428571428571427, "frac_reward_zero_std": 1.0, "grad_norm": 0.00045655632857233286, "kl": 0.03290901647415012, "learning_rate": 7.375092342298828e-09, "loss": 0.0, "num_tokens": 31943808.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5099718570709229, "sampling/importance_sampling_ratio/min": 9.756933719611155e-35, "sampling/sampling_logp_difference/max": 78.3125, "sampling/sampling_logp_difference/mean": 7.042860984802246, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1343.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 639.0, "completions/min_terminated_length": 0.0, "entropy": 4.241745114326477, "epoch": 28.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015939692966639996, "kl": 0.03287532273679972, "learning_rate": 3.2779083591949478e-09, "loss": 0.0, "num_tokens": 32061920.0, "reward": 0.0, "reward_std": 0.0, "rewards/tree_correctness_reward/mean": 0.0, "rewards/tree_correctness_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.42086899280548096, "sampling/importance_sampling_ratio/min": 7.674422876742102e-34, "sampling/sampling_logp_difference/max": 76.25, "sampling/sampling_logp_difference/mean": 8.653843879699707, "step": 399 }, { "clip_ratio/high_max": 0.0032174556035897695, "clip_ratio/high_mean": 0.0016087278017948847, "clip_ratio/low_mean": 0.00024038462106545921, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0018491124210413545, "completions/clipped_ratio": 1.0, "completions/max_length": 845.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 707.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 569.0, "completions/min_terminated_length": 0.0, "entropy": 3.2500969916582108, "epoch": 28.571428571428573, "frac_reward_zero_std": 0.75, "grad_norm": 0.008967576548457146, "kl": 0.035135343787260354, "learning_rate": 8.194905210923143e-10, "loss": 0.0015, "num_tokens": 32139296.0, "reward": 0.1875, "reward_std": 0.1157275140285492, "rewards/tree_correctness_reward/mean": 0.1875, "rewards/tree_correctness_reward/std": 0.39339789748191833, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5091722011566162, "sampling/importance_sampling_ratio/min": 3.8973941318215115e-33, "sampling/sampling_logp_difference/max": 74.625, "sampling/sampling_logp_difference/mean": 7.119640827178955, "step": 400 }, { "epoch": 28.571428571428573, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 463.52, "eval_completions/max_terminated_length": 0.0, "eval_completions/mean_length": 423.1, "eval_completions/mean_terminated_length": 0.0, "eval_completions/min_length": 382.68, "eval_completions/min_terminated_length": 0.0, "eval_entropy": 0.7444057339429855, "eval_frac_reward_zero_std": 0.88, "eval_kl": 0.044948767423629764, "eval_loss": 0.0003816725220531225, "eval_num_tokens": 32139296.0, "eval_reward": 0.045, "eval_reward_std": 0.042426406145095824, "eval_rewards/tree_correctness_reward/mean": 0.045, "eval_rewards/tree_correctness_reward/std": 0.042426406145095824, "eval_runtime": 138.8839, "eval_samples_per_second": 0.18, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 0.8342835831642151, "eval_sampling/importance_sampling_ratio/min": 1.3929199464688169e-30, "eval_sampling/sampling_logp_difference/max": 73.3325, "eval_sampling/sampling_logp_difference/mean": 2.6887982940673827, "eval_steps_per_second": 0.029, "step": 400 }, { "epoch": 28.571428571428573, "step": 400, "total_flos": 0.0, "train_loss": 0.0005530451710296802, "train_runtime": 20175.9584, "train_samples_per_second": 1.269, "train_steps_per_second": 0.02 } ], "logging_steps": 1, "max_steps": 400, "num_input_tokens_seen": 32139296, "num_train_epochs": 29, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }