{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 200.0, "eval_steps": 10, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 346.8, "eval_completions/max_terminated_length": 0.0, "eval_completions/mean_length": 287.45, "eval_completions/mean_terminated_length": 0.0, "eval_completions/min_length": 233.0, "eval_completions/min_terminated_length": 0.0, "eval_entropy": 0.6234924793243408, "eval_frac_reward_zero_std": 0.3, "eval_kl": 0.0, "eval_loss": -0.00297290226444602, "eval_num_tokens": 0.0, "eval_reward": 0.06999999955296517, "eval_reward_std": 0.13499628007411957, "eval_rewards/num_nodes_reward/mean": 0.175, "eval_rewards/num_nodes_reward/std": 0.3672485947608948, "eval_rewards/tree_correctness_reward/mean": 0.025, "eval_rewards/tree_correctness_reward/std": 0.1, "eval_runtime": 67.7474, "eval_samples_per_second": 0.148, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 0.8931006908416748, "eval_sampling/importance_sampling_ratio/min": 2.217230369316485e-29, "eval_sampling/sampling_logp_difference/max": 66.69375, "eval_sampling/sampling_logp_difference/mean": 1.7906005144119264, "eval_steps_per_second": 0.015, "step": 0 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 422.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 351.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 308.0, "completions/min_terminated_length": 0.0, "entropy": 1.3188621550798416, "epoch": 0.5, "frac_reward_zero_std": 0.3125, "grad_norm": 0.021780600771307945, "kl": 0.0, "learning_rate": 8e-05, "loss": 0.001, "num_tokens": 109072.0, "reward": 0.11484375596046448, "reward_std": 0.17371003329753876, "rewards/num_nodes_reward/mean": 0.1640625, "rewards/num_nodes_reward/std": 0.371787428855896, "rewards/tree_correctness_reward/mean": 0.09375, "rewards/tree_correctness_reward/std": 0.29262590408325195, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7027766108512878, "sampling/importance_sampling_ratio/min": 6.489743268163787e-32, "sampling/sampling_logp_difference/max": 71.8125, "sampling/sampling_logp_difference/mean": 4.67726993560791, "step": 1 }, { "clip_ratio/high_max": 0.01576408391701989, "clip_ratio/high_mean": 0.0034574203455122188, "clip_ratio/low_mean": 0.008490052714478225, "clip_ratio/low_min": 0.0004770992381963879, "clip_ratio/region_mean": 0.01194747316185385, "completions/clipped_ratio": 1.0, "completions/max_length": 523.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 403.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 323.0, "completions/min_terminated_length": 0.0, "entropy": 1.5711934566497803, "epoch": 1.0, "frac_reward_zero_std": 0.3125, "grad_norm": 0.018054289743304253, "kl": 0.004996898700483143, "learning_rate": 8e-05, "loss": -0.0015, "num_tokens": 224832.0, "reward": 0.11328125, "reward_std": 0.1885376274585724, "rewards/num_nodes_reward/mean": 0.1953125, "rewards/num_nodes_reward/std": 0.3979988098144531, "rewards/tree_correctness_reward/mean": 0.078125, "rewards/tree_correctness_reward/std": 0.2694226801395416, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6691538095474243, "sampling/importance_sampling_ratio/min": 4.190093106797024e-32, "sampling/sampling_logp_difference/max": 72.25, "sampling/sampling_logp_difference/mean": 5.042572021484375, "step": 2 }, { "clip_ratio/high_max": 0.015893289004452527, "clip_ratio/high_mean": 0.003334009350510314, "clip_ratio/low_mean": 0.008109733171295375, "clip_ratio/low_min": 0.00020627063349820673, "clip_ratio/region_mean": 0.011443742667324841, "completions/clipped_ratio": 1.0, "completions/max_length": 567.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 358.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 298.0, "completions/min_terminated_length": 0.0, "entropy": 1.319297045469284, "epoch": 1.5, "frac_reward_zero_std": 0.3125, "grad_norm": 0.02184843271970749, "kl": 0.006913644319865853, "learning_rate": 8e-05, "loss": 0.0033, "num_tokens": 334896.0, "reward": 0.08515624701976776, "reward_std": 0.14007213711738586, "rewards/num_nodes_reward/mean": 0.2109375, "rewards/num_nodes_reward/std": 0.4095771610736847, "rewards/tree_correctness_reward/mean": 0.03125, "rewards/tree_correctness_reward/std": 0.1746762990951538, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7132927179336548, "sampling/importance_sampling_ratio/min": 1.6572140516790924e-31, "sampling/sampling_logp_difference/max": 70.875, "sampling/sampling_logp_difference/mean": 4.5152130126953125, "step": 3 }, { "clip_ratio/high_max": 0.022280368430074304, "clip_ratio/high_mean": 0.006657411402557045, "clip_ratio/low_mean": 0.0076666559325531125, "clip_ratio/low_min": 0.00017170330102089792, "clip_ratio/region_mean": 0.014324066694825888, "completions/clipped_ratio": 1.0, "completions/max_length": 447.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 374.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 314.0, "completions/min_terminated_length": 0.0, "entropy": 1.5237607955932617, "epoch": 2.0, "frac_reward_zero_std": 0.1875, "grad_norm": 0.017403028905391693, "kl": 0.008676016179379076, "learning_rate": 8e-05, "loss": 0.0037, "num_tokens": 446912.0, "reward": 0.17265625298023224, "reward_std": 0.23584818840026855, "rewards/num_nodes_reward/mean": 0.265625, "rewards/num_nodes_reward/std": 0.44340085983276367, "rewards/tree_correctness_reward/mean": 0.1328125, "rewards/tree_correctness_reward/std": 0.3407054841518402, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6790295839309692, "sampling/importance_sampling_ratio/min": 4.795307422964777e-31, "sampling/sampling_logp_difference/max": 69.8125, "sampling/sampling_logp_difference/mean": 4.935495376586914, "step": 4 }, { "clip_ratio/high_max": 0.03358583466615528, "clip_ratio/high_mean": 0.00806717797240708, "clip_ratio/low_mean": 0.008692011993844062, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01675919012632221, "completions/clipped_ratio": 1.0, "completions/max_length": 420.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 368.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 315.0, "completions/min_terminated_length": 0.0, "entropy": 1.3926227241754532, "epoch": 2.5, "frac_reward_zero_std": 0.1875, "grad_norm": 0.017051078379154205, "kl": 0.010294820589479059, "learning_rate": 8e-05, "loss": -0.0004, "num_tokens": 558144.0, "reward": 0.15156249701976776, "reward_std": 0.20689831674098969, "rewards/num_nodes_reward/mean": 0.3046875, "rewards/num_nodes_reward/std": 0.46208351850509644, "rewards/tree_correctness_reward/mean": 0.0859375, "rewards/tree_correctness_reward/std": 0.2813730239868164, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6956404447555542, "sampling/importance_sampling_ratio/min": 1.1503345399350276e-30, "sampling/sampling_logp_difference/max": 68.9375, "sampling/sampling_logp_difference/mean": 4.716602325439453, "step": 5 }, { "clip_ratio/high_max": 0.018164387089200318, "clip_ratio/high_mean": 0.005304208490997553, "clip_ratio/low_mean": 0.008202551864087582, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013506760005839169, "completions/clipped_ratio": 1.0, "completions/max_length": 486.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 369.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 259.0, "completions/min_terminated_length": 0.0, "entropy": 1.538826808333397, "epoch": 3.0, "frac_reward_zero_std": 0.1875, "grad_norm": 0.017105819657444954, "kl": 0.023768347105942667, "learning_rate": 8e-05, "loss": 0.0028, "num_tokens": 669568.0, "reward": 0.13749998807907104, "reward_std": 0.19299918413162231, "rewards/num_nodes_reward/mean": 0.203125, "rewards/num_nodes_reward/std": 0.40390563011169434, "rewards/tree_correctness_reward/mean": 0.109375, "rewards/tree_correctness_reward/std": 0.31333550810813904, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6788300275802612, "sampling/importance_sampling_ratio/min": 1.4770587748873233e-30, "sampling/sampling_logp_difference/max": 68.6875, "sampling/sampling_logp_difference/mean": 4.9538893699646, "step": 6 }, { "clip_ratio/high_max": 0.022460941690951586, "clip_ratio/high_mean": 0.007248806650750339, "clip_ratio/low_mean": 0.008241471601650119, "clip_ratio/low_min": 0.0006625441747019067, "clip_ratio/region_mean": 0.01549027836881578, "completions/clipped_ratio": 1.0, "completions/max_length": 432.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 360.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 283.0, "completions/min_terminated_length": 0.0, "entropy": 1.3705239295959473, "epoch": 3.5, "frac_reward_zero_std": 0.125, "grad_norm": 0.015557471662759781, "kl": 0.03367286967113614, "learning_rate": 8e-05, "loss": 0.0028, "num_tokens": 779856.0, "reward": 0.10781249403953552, "reward_std": 0.17730551958084106, "rewards/num_nodes_reward/mean": 0.25, "rewards/num_nodes_reward/std": 0.434714138507843, "rewards/tree_correctness_reward/mean": 0.046875, "rewards/tree_correctness_reward/std": 0.21220162510871887, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6987200975418091, "sampling/importance_sampling_ratio/min": 1.7640951702309984e-31, "sampling/sampling_logp_difference/max": 70.8125, "sampling/sampling_logp_difference/mean": 4.704798221588135, "step": 7 }, { "clip_ratio/high_max": 0.02262382413027808, "clip_ratio/high_mean": 0.005354439119400922, "clip_ratio/low_mean": 0.009192084369715303, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014546523103490472, "completions/clipped_ratio": 1.0, "completions/max_length": 777.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 433.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 344.0, "completions/min_terminated_length": 0.0, "entropy": 1.840475469827652, "epoch": 4.0, "frac_reward_zero_std": 0.1875, "grad_norm": 0.018008537590503693, "kl": 0.00984396249987185, "learning_rate": 8e-05, "loss": -0.0038, "num_tokens": 899504.0, "reward": 0.13828124105930328, "reward_std": 0.21586966514587402, "rewards/num_nodes_reward/mean": 0.2421875, "rewards/num_nodes_reward/std": 0.4300905168056488, "rewards/tree_correctness_reward/mean": 0.09375, "rewards/tree_correctness_reward/std": 0.29262590408325195, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6402052044868469, "sampling/importance_sampling_ratio/min": 1.2124436647516845e-31, "sampling/sampling_logp_difference/max": 71.1875, "sampling/sampling_logp_difference/mean": 5.440937042236328, "step": 8 }, { "clip_ratio/high_max": 0.022450002492405474, "clip_ratio/high_mean": 0.005561904996284284, "clip_ratio/low_mean": 0.009460534434765577, "clip_ratio/low_min": 0.0009722427348606288, "clip_ratio/region_mean": 0.015022439416497946, "completions/clipped_ratio": 1.0, "completions/max_length": 563.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 389.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 303.0, "completions/min_terminated_length": 0.0, "entropy": 1.5895740538835526, "epoch": 4.5, "frac_reward_zero_std": 0.1875, "grad_norm": 0.015666000545024872, "kl": 0.010837141540832818, "learning_rate": 8e-05, "loss": 0.0019, "num_tokens": 1013504.0, "reward": 0.15781250596046448, "reward_std": 0.1865069717168808, "rewards/num_nodes_reward/mean": 0.2890625, "rewards/num_nodes_reward/std": 0.45510825514793396, "rewards/tree_correctness_reward/mean": 0.1015625, "rewards/tree_correctness_reward/std": 0.3032590448856354, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6676886081695557, "sampling/importance_sampling_ratio/min": 1.9989817461156974e-31, "sampling/sampling_logp_difference/max": 70.6875, "sampling/sampling_logp_difference/mean": 5.096254825592041, "step": 9 }, { "clip_ratio/high_max": 0.025855677376966923, "clip_ratio/high_mean": 0.005809825808682945, "clip_ratio/low_mean": 0.008363057160750031, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014172883238643408, "completions/clipped_ratio": 1.0, "completions/max_length": 465.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 378.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 276.0, "completions/min_terminated_length": 0.0, "entropy": 1.3821395859122276, "epoch": 5.0, "frac_reward_zero_std": 0.25, "grad_norm": 0.019947737455368042, "kl": 0.01344757410697639, "learning_rate": 8e-05, "loss": 0.0026, "num_tokens": 1126032.0, "reward": 0.13124999403953552, "reward_std": 0.208847776055336, "rewards/num_nodes_reward/mean": 0.21875, "rewards/num_nodes_reward/std": 0.41502299904823303, "rewards/tree_correctness_reward/mean": 0.09375, "rewards/tree_correctness_reward/std": 0.29262590408325195, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7118703126907349, "sampling/importance_sampling_ratio/min": 5.784245648136424e-31, "sampling/sampling_logp_difference/max": 69.625, "sampling/sampling_logp_difference/mean": 4.494324207305908, "step": 10 }, { "epoch": 5.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 357.4, "eval_completions/max_terminated_length": 0.0, "eval_completions/mean_length": 275.25, "eval_completions/mean_terminated_length": 0.0, "eval_completions/min_length": 212.6, "eval_completions/min_terminated_length": 0.0, "eval_entropy": 0.5316240191459656, "eval_frac_reward_zero_std": 0.3, "eval_kl": 0.014366858638823032, "eval_loss": -7.962415111251175e-05, "eval_num_tokens": 1126032.0, "eval_reward": 0.075, "eval_reward_std": 0.13274334967136384, "eval_rewards/num_nodes_reward/mean": 0.1625, "eval_rewards/num_nodes_reward/std": 0.3014051020145416, "eval_rewards/tree_correctness_reward/mean": 0.0375, "eval_rewards/tree_correctness_reward/std": 0.11831300854682922, "eval_runtime": 44.0883, "eval_samples_per_second": 0.227, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 0.9107975840568543, "eval_sampling/importance_sampling_ratio/min": 5.429852051632438e-29, "eval_sampling/sampling_logp_difference/max": 65.6, "eval_sampling/sampling_logp_difference/mean": 1.5740029335021972, "eval_steps_per_second": 0.023, "step": 10 }, { "clip_ratio/high_max": 0.018841346842236817, "clip_ratio/high_mean": 0.004982051701517776, "clip_ratio/low_mean": 0.012567506812047213, "clip_ratio/low_min": 0.0016452233539894223, "clip_ratio/region_mean": 0.01754955854266882, "completions/clipped_ratio": 1.0, "completions/max_length": 510.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 360.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 294.0, "completions/min_terminated_length": 0.0, "entropy": 1.4188585132360458, "epoch": 5.5, "frac_reward_zero_std": 0.0625, "grad_norm": 0.018624259158968925, "kl": 0.017281539272516966, "learning_rate": 8e-05, "loss": 0.002, "num_tokens": 1236320.0, "reward": 0.12890625, "reward_std": 0.18777883052825928, "rewards/num_nodes_reward/mean": 0.265625, "rewards/num_nodes_reward/std": 0.44340085983276367, "rewards/tree_correctness_reward/mean": 0.0703125, "rewards/tree_correctness_reward/std": 0.2566775679588318, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7018622756004333, "sampling/importance_sampling_ratio/min": 8.696261289635204e-34, "sampling/sampling_logp_difference/max": 76.125, "sampling/sampling_logp_difference/mean": 4.650069713592529, "step": 11 }, { "clip_ratio/high_max": 0.022581592318601906, "clip_ratio/high_mean": 0.006918017199495807, "clip_ratio/low_mean": 0.008246128854807466, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015164146199822426, "completions/clipped_ratio": 1.0, "completions/max_length": 417.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 383.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 343.0, "completions/min_terminated_length": 0.0, "entropy": 1.67644602060318, "epoch": 6.0, "frac_reward_zero_std": 0.125, "grad_norm": 0.028291599825024605, "kl": 0.06424686429090798, "learning_rate": 8e-05, "loss": 0.0006, "num_tokens": 1349520.0, "reward": 0.21953123807907104, "reward_std": 0.24291223287582397, "rewards/num_nodes_reward/mean": 0.3671875, "rewards/num_nodes_reward/std": 0.4839322865009308, "rewards/tree_correctness_reward/mean": 0.15625, "rewards/tree_correctness_reward/std": 0.3645188808441162, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6442349553108215, "sampling/importance_sampling_ratio/min": 7.906125798828331e-31, "sampling/sampling_logp_difference/max": 69.3125, "sampling/sampling_logp_difference/mean": 5.417321681976318, "step": 12 }, { "clip_ratio/high_max": 0.01919092785101384, "clip_ratio/high_mean": 0.006002688372973353, "clip_ratio/low_mean": 0.006910990603500977, "clip_ratio/low_min": 0.00013796909479424357, "clip_ratio/region_mean": 0.012913679354824126, "completions/clipped_ratio": 1.0, "completions/max_length": 906.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 423.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 289.0, "completions/min_terminated_length": 0.0, "entropy": 1.8622514009475708, "epoch": 6.5, "frac_reward_zero_std": 0.25, "grad_norm": 0.01703224703669548, "kl": 0.06042225903365761, "learning_rate": 8e-05, "loss": -0.006, "num_tokens": 1467888.0, "reward": 0.16562500596046448, "reward_std": 0.16107617318630219, "rewards/num_nodes_reward/mean": 0.296875, "rewards/num_nodes_reward/std": 0.45867621898651123, "rewards/tree_correctness_reward/mean": 0.109375, "rewards/tree_correctness_reward/std": 0.31333550810813904, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6389739513397217, "sampling/importance_sampling_ratio/min": 2.411232316926477e-31, "sampling/sampling_logp_difference/max": 70.5, "sampling/sampling_logp_difference/mean": 5.496977806091309, "step": 13 }, { "clip_ratio/high_max": 0.019336953875608742, "clip_ratio/high_mean": 0.005215088982367888, "clip_ratio/low_mean": 0.008382412808714435, "clip_ratio/low_min": 0.00045399516238830984, "clip_ratio/region_mean": 0.013597502140328288, "completions/clipped_ratio": 1.0, "completions/max_length": 525.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 370.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 311.0, "completions/min_terminated_length": 0.0, "entropy": 1.5207564979791641, "epoch": 7.0, "frac_reward_zero_std": 0.1875, "grad_norm": 0.028219075873494148, "kl": 0.022900795098394156, "learning_rate": 8e-05, "loss": -0.001, "num_tokens": 1579392.0, "reward": 0.15859374403953552, "reward_std": 0.21873405575752258, "rewards/num_nodes_reward/mean": 0.2734375, "rewards/num_nodes_reward/std": 0.447474867105484, "rewards/tree_correctness_reward/mean": 0.109375, "rewards/tree_correctness_reward/std": 0.31333550810813904, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6861681938171387, "sampling/importance_sampling_ratio/min": 1.7640951702309984e-31, "sampling/sampling_logp_difference/max": 70.8125, "sampling/sampling_logp_difference/mean": 4.825418949127197, "step": 14 }, { "clip_ratio/high_max": 0.02266923850402236, "clip_ratio/high_mean": 0.00648190570063889, "clip_ratio/low_mean": 0.010190563509240746, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01667246944271028, "completions/clipped_ratio": 1.0, "completions/max_length": 430.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 380.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 319.0, "completions/min_terminated_length": 0.0, "entropy": 1.681582361459732, "epoch": 7.5, "frac_reward_zero_std": 0.1875, "grad_norm": 0.021219799295067787, "kl": 0.022493391996249557, "learning_rate": 8e-05, "loss": 0.0003, "num_tokens": 1692256.0, "reward": 0.18281251192092896, "reward_std": 0.18375007808208466, "rewards/num_nodes_reward/mean": 0.3359375, "rewards/num_nodes_reward/std": 0.47417303919792175, "rewards/tree_correctness_reward/mean": 0.1171875, "rewards/tree_correctness_reward/std": 0.322907418012619, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6578193306922913, "sampling/importance_sampling_ratio/min": 1.2633953733485285e-30, "sampling/sampling_logp_difference/max": 68.84375, "sampling/sampling_logp_difference/mean": 5.220351696014404, "step": 15 }, { "clip_ratio/high_max": 0.020125656621530652, "clip_ratio/high_mean": 0.004762223776197061, "clip_ratio/low_mean": 0.008477540744934231, "clip_ratio/low_min": 0.0003805175074376166, "clip_ratio/region_mean": 0.01323976437561214, "completions/clipped_ratio": 1.0, "completions/max_length": 657.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 420.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 326.0, "completions/min_terminated_length": 0.0, "entropy": 1.8913476169109344, "epoch": 8.0, "frac_reward_zero_std": 0.25, "grad_norm": 0.016144072636961937, "kl": 0.030348304193466902, "learning_rate": 8e-05, "loss": -0.0004, "num_tokens": 1810176.0, "reward": 0.13671875, "reward_std": 0.22697092592716217, "rewards/num_nodes_reward/mean": 0.21875, "rewards/num_nodes_reward/std": 0.41502299904823303, "rewards/tree_correctness_reward/mean": 0.1015625, "rewards/tree_correctness_reward/std": 0.3032590448856354, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6342688798904419, "sampling/importance_sampling_ratio/min": 1.069977863662251e-31, "sampling/sampling_logp_difference/max": 71.3125, "sampling/sampling_logp_difference/mean": 5.515870094299316, "step": 16 }, { "clip_ratio/high_max": 0.027173306443728507, "clip_ratio/high_mean": 0.007300559198483825, "clip_ratio/low_mean": 0.010123738087713718, "clip_ratio/low_min": 0.0012833665241487324, "clip_ratio/region_mean": 0.017424297519028187, "completions/clipped_ratio": 1.0, "completions/max_length": 409.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 346.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 278.0, "completions/min_terminated_length": 0.0, "entropy": 1.283915862441063, "epoch": 8.5, "frac_reward_zero_std": 0.0625, "grad_norm": 0.01777520589530468, "kl": 0.030220132553949952, "learning_rate": 8e-05, "loss": -0.0009, "num_tokens": 1918592.0, "reward": 0.12421874701976776, "reward_std": 0.20798474550247192, "rewards/num_nodes_reward/mean": 0.3046875, "rewards/num_nodes_reward/std": 0.46208351850509644, "rewards/tree_correctness_reward/mean": 0.046875, "rewards/tree_correctness_reward/std": 0.21220162510871887, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7224938869476318, "sampling/importance_sampling_ratio/min": 4.795307422964777e-31, "sampling/sampling_logp_difference/max": 69.8125, "sampling/sampling_logp_difference/mean": 4.336972713470459, "step": 17 }, { "clip_ratio/high_max": 0.03217295720241964, "clip_ratio/high_mean": 0.007231807510834187, "clip_ratio/low_mean": 0.009330775123089552, "clip_ratio/low_min": 0.0005053908098489046, "clip_ratio/region_mean": 0.016562582808546722, "completions/clipped_ratio": 1.0, "completions/max_length": 515.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 376.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 312.0, "completions/min_terminated_length": 0.0, "entropy": 1.4928106665611267, "epoch": 9.0, "frac_reward_zero_std": 0.0625, "grad_norm": 0.023119129240512848, "kl": 0.0510411246214062, "learning_rate": 8e-05, "loss": 0.0016, "num_tokens": 2030880.0, "reward": 0.14609375596046448, "reward_std": 0.20571982860565186, "rewards/num_nodes_reward/mean": 0.3046875, "rewards/num_nodes_reward/std": 0.46208351850509644, "rewards/tree_correctness_reward/mean": 0.078125, "rewards/tree_correctness_reward/std": 0.2694226801395416, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6927189826965332, "sampling/importance_sampling_ratio/min": 2.908501221957089e-31, "sampling/sampling_logp_difference/max": 70.3125, "sampling/sampling_logp_difference/mean": 4.737643718719482, "step": 18 }, { "clip_ratio/high_max": 0.021955122705549, "clip_ratio/high_mean": 0.007316044269828126, "clip_ratio/low_mean": 0.005574701877776533, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012890746351331472, "completions/clipped_ratio": 1.0, "completions/max_length": 461.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 373.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 253.0, "completions/min_terminated_length": 0.0, "entropy": 1.4320306181907654, "epoch": 9.5, "frac_reward_zero_std": 0.25, "grad_norm": 0.015888290479779243, "kl": 0.03786909068003297, "learning_rate": 8e-05, "loss": 0.0002, "num_tokens": 2142848.0, "reward": 0.20546874403953552, "reward_std": 0.21427880227565765, "rewards/num_nodes_reward/mean": 0.3203125, "rewards/num_nodes_reward/std": 0.4684300124645233, "rewards/tree_correctness_reward/mean": 0.15625, "rewards/tree_correctness_reward/std": 0.3645188808441162, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6895436644554138, "sampling/importance_sampling_ratio/min": 1.7640951702309984e-31, "sampling/sampling_logp_difference/max": 70.8125, "sampling/sampling_logp_difference/mean": 4.810562610626221, "step": 19 }, { "clip_ratio/high_max": 0.023927540984004736, "clip_ratio/high_mean": 0.007040566822979599, "clip_ratio/low_mean": 0.012243267381563783, "clip_ratio/low_min": 0.00143640668829903, "clip_ratio/region_mean": 0.01928383414633572, "completions/clipped_ratio": 1.0, "completions/max_length": 444.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 363.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 296.0, "completions/min_terminated_length": 0.0, "entropy": 1.3726564943790436, "epoch": 10.0, "frac_reward_zero_std": 0.0625, "grad_norm": 0.021175475791096687, "kl": 0.05874769762158394, "learning_rate": 8e-05, "loss": 0.001, "num_tokens": 2253536.0, "reward": 0.18125000596046448, "reward_std": 0.23291082680225372, "rewards/num_nodes_reward/mean": 0.3125, "rewards/num_nodes_reward/std": 0.4653336703777313, "rewards/tree_correctness_reward/mean": 0.125, "rewards/tree_correctness_reward/std": 0.3320184051990509, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6780282258987427, "sampling/importance_sampling_ratio/min": 5.784245648136424e-31, "sampling/sampling_logp_difference/max": 69.625, "sampling/sampling_logp_difference/mean": 5.027543544769287, "step": 20 }, { "epoch": 10.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 384.6, "eval_completions/max_terminated_length": 0.0, "eval_completions/mean_length": 306.0, "eval_completions/mean_terminated_length": 0.0, "eval_completions/min_length": 237.8, "eval_completions/min_terminated_length": 0.0, "eval_entropy": 0.4072810113430023, "eval_frac_reward_zero_std": 0.1, "eval_kl": 0.09232058078050613, "eval_loss": 0.0015283293323591352, "eval_num_tokens": 2253536.0, "eval_reward": 0.06500000283122062, "eval_reward_std": 0.1214054137468338, "eval_rewards/num_nodes_reward/mean": 0.1875, "eval_rewards/num_nodes_reward/std": 0.3961340427398682, "eval_rewards/tree_correctness_reward/mean": 0.0125, "eval_rewards/tree_correctness_reward/std": 0.05, "eval_runtime": 47.8391, "eval_samples_per_second": 0.209, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 0.9107956409454345, "eval_sampling/importance_sampling_ratio/min": 7.537469572015537e-29, "eval_sampling/sampling_logp_difference/max": 66.08125, "eval_sampling/sampling_logp_difference/mean": 1.5281862497329712, "eval_steps_per_second": 0.021, "step": 20 }, { "clip_ratio/high_max": 0.03137423109728843, "clip_ratio/high_mean": 0.008611803437815979, "clip_ratio/low_mean": 0.009795682155527174, "clip_ratio/low_min": 0.0014371609140653163, "clip_ratio/region_mean": 0.018407485680654645, "completions/clipped_ratio": 1.0, "completions/max_length": 545.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 414.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 329.0, "completions/min_terminated_length": 0.0, "entropy": 1.5305435210466385, "epoch": 10.5, "frac_reward_zero_std": 0.0625, "grad_norm": 0.018939631059765816, "kl": 0.08685059286653996, "learning_rate": 8e-05, "loss": 0.0037, "num_tokens": 2370752.0, "reward": 0.17890626192092896, "reward_std": 0.2206301987171173, "rewards/num_nodes_reward/mean": 0.359375, "rewards/num_nodes_reward/std": 0.481702595949173, "rewards/tree_correctness_reward/mean": 0.1015625, "rewards/tree_correctness_reward/std": 0.3032590448856354, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6620954871177673, "sampling/importance_sampling_ratio/min": 2.1279047943387423e-31, "sampling/sampling_logp_difference/max": 70.625, "sampling/sampling_logp_difference/mean": 5.183498859405518, "step": 21 }, { "clip_ratio/high_max": 0.02323276037350297, "clip_ratio/high_mean": 0.005185554357012734, "clip_ratio/low_mean": 0.008460916695185006, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013646471197716892, "completions/clipped_ratio": 1.0, "completions/max_length": 414.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 363.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 310.0, "completions/min_terminated_length": 0.0, "entropy": 1.4057644754648209, "epoch": 11.0, "frac_reward_zero_std": 0.3125, "grad_norm": 0.018324729055166245, "kl": 0.10620310809463263, "learning_rate": 8e-05, "loss": 0.0029, "num_tokens": 2481392.0, "reward": 0.1796875, "reward_std": 0.17310969531536102, "rewards/num_nodes_reward/mean": 0.2890625, "rewards/num_nodes_reward/std": 0.45510825514793396, "rewards/tree_correctness_reward/mean": 0.1328125, "rewards/tree_correctness_reward/std": 0.3407054841518402, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6882869005203247, "sampling/importance_sampling_ratio/min": 5.222510753034111e-35, "sampling/sampling_logp_difference/max": 78.9375, "sampling/sampling_logp_difference/mean": 4.8537678718566895, "step": 22 }, { "clip_ratio/high_max": 0.022023478290066123, "clip_ratio/high_mean": 0.005410779005615041, "clip_ratio/low_mean": 0.009220426843967289, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014631205587647855, "completions/clipped_ratio": 1.0, "completions/max_length": 581.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 413.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 278.0, "completions/min_terminated_length": 0.0, "entropy": 1.8622871041297913, "epoch": 11.5, "frac_reward_zero_std": 0.25, "grad_norm": 0.021750230342149734, "kl": 0.464962319470942, "learning_rate": 8e-05, "loss": 0.0016, "num_tokens": 2598384.0, "reward": 0.18437498807907104, "reward_std": 0.20676815509796143, "rewards/num_nodes_reward/mean": 0.3046875, "rewards/num_nodes_reward/std": 0.46208351850509644, "rewards/tree_correctness_reward/mean": 0.1328125, "rewards/tree_correctness_reward/std": 0.3407054841518402, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6263189315795898, "sampling/importance_sampling_ratio/min": 4.795307422964777e-31, "sampling/sampling_logp_difference/max": 69.8125, "sampling/sampling_logp_difference/mean": 5.631839275360107, "step": 23 }, { "clip_ratio/high_max": 0.030777871841564775, "clip_ratio/high_mean": 0.008760096359765157, "clip_ratio/low_mean": 0.008961622195784003, "clip_ratio/low_min": 0.0006249999860301614, "clip_ratio/region_mean": 0.017721718293614686, "completions/clipped_ratio": 1.0, "completions/max_length": 600.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 410.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 330.0, "completions/min_terminated_length": 0.0, "entropy": 1.7364970594644547, "epoch": 12.0, "frac_reward_zero_std": 0.125, "grad_norm": 0.016970500349998474, "kl": 0.08210289664566517, "learning_rate": 8e-05, "loss": 0.0036, "num_tokens": 2715024.0, "reward": 0.2070312649011612, "reward_std": 0.19383887946605682, "rewards/num_nodes_reward/mean": 0.3984375, "rewards/num_nodes_reward/std": 0.4915000796318054, "rewards/tree_correctness_reward/mean": 0.125, "rewards/tree_correctness_reward/std": 0.3320184051990509, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.652280330657959, "sampling/importance_sampling_ratio/min": 1.9567850724240038e-30, "sampling/sampling_logp_difference/max": 68.40625, "sampling/sampling_logp_difference/mean": 5.261103630065918, "step": 24 }, { "clip_ratio/high_max": 0.02150406240252778, "clip_ratio/high_mean": 0.006195713795023039, "clip_ratio/low_mean": 0.007727239513769746, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013922953396104276, "completions/clipped_ratio": 1.0, "completions/max_length": 417.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 367.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 316.0, "completions/min_terminated_length": 0.0, "entropy": 1.4891314059495926, "epoch": 12.5, "frac_reward_zero_std": 0.25, "grad_norm": 0.017552239820361137, "kl": 0.09481783490628004, "learning_rate": 8e-05, "loss": 0.0038, "num_tokens": 2826160.0, "reward": 0.17656250298023224, "reward_std": 0.21144869923591614, "rewards/num_nodes_reward/mean": 0.3515625, "rewards/num_nodes_reward/std": 0.4793342351913452, "rewards/tree_correctness_reward/mean": 0.1015625, "rewards/tree_correctness_reward/std": 0.3032590448856354, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6831129193305969, "sampling/importance_sampling_ratio/min": 3.4684842740447485e-29, "sampling/sampling_logp_difference/max": 65.53125, "sampling/sampling_logp_difference/mean": 4.840445518493652, "step": 25 }, { "clip_ratio/high_max": 0.022724946727976203, "clip_ratio/high_mean": 0.005343676311895251, "clip_ratio/low_mean": 0.010959626059047878, "clip_ratio/low_min": 0.0003662109375, "clip_ratio/region_mean": 0.016303302836604416, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 382.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 305.0, "completions/min_terminated_length": 0.0, "entropy": 1.4879240542650223, "epoch": 13.0, "frac_reward_zero_std": 0.1875, "grad_norm": 0.01598610356450081, "kl": 0.10678435303270817, "learning_rate": 8e-05, "loss": 0.0012, "num_tokens": 2939280.0, "reward": 0.10078124701976776, "reward_std": 0.17255054414272308, "rewards/num_nodes_reward/mean": 0.2265625, "rewards/num_nodes_reward/std": 0.4202519655227661, "rewards/tree_correctness_reward/mean": 0.046875, "rewards/tree_correctness_reward/std": 0.21220162510871887, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.685404896736145, "sampling/importance_sampling_ratio/min": 4.0550689701785223e-29, "sampling/sampling_logp_difference/max": 65.375, "sampling/sampling_logp_difference/mean": 4.785576820373535, "step": 26 }, { "clip_ratio/high_max": 0.024009671178646386, "clip_ratio/high_mean": 0.007962659641634673, "clip_ratio/low_mean": 0.008863541937898844, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016826201463118196, "completions/clipped_ratio": 1.0, "completions/max_length": 398.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 342.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 289.0, "completions/min_terminated_length": 0.0, "entropy": 1.2859250828623772, "epoch": 13.5, "frac_reward_zero_std": 0.1875, "grad_norm": 0.014577718451619148, "kl": 0.10450363904237747, "learning_rate": 8e-05, "loss": 0.0043, "num_tokens": 3047248.0, "reward": 0.2109375, "reward_std": 0.2359497845172882, "rewards/num_nodes_reward/mean": 0.375, "rewards/num_nodes_reward/std": 0.4860251843929291, "rewards/tree_correctness_reward/mean": 0.140625, "rewards/tree_correctness_reward/std": 0.3490002751350403, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7110545039176941, "sampling/importance_sampling_ratio/min": 7.04665266583478e-30, "sampling/sampling_logp_difference/max": 67.125, "sampling/sampling_logp_difference/mean": 4.514170169830322, "step": 27 }, { "clip_ratio/high_max": 0.02334557194262743, "clip_ratio/high_mean": 0.006384907435858622, "clip_ratio/low_mean": 0.010563333053141832, "clip_ratio/low_min": 0.00034818940912373364, "clip_ratio/region_mean": 0.01694824011065066, "completions/clipped_ratio": 1.0, "completions/max_length": 674.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 435.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 357.0, "completions/min_terminated_length": 0.0, "entropy": 2.1218643486499786, "epoch": 14.0, "frac_reward_zero_std": 0.0625, "grad_norm": 0.020889101549983025, "kl": 0.10312982834875584, "learning_rate": 8e-05, "loss": 0.0021, "num_tokens": 3167056.0, "reward": 0.18515625596046448, "reward_std": 0.18098938465118408, "rewards/num_nodes_reward/mean": 0.34375, "rewards/num_nodes_reward/std": 0.47682511806488037, "rewards/tree_correctness_reward/mean": 0.1171875, "rewards/tree_correctness_reward/std": 0.322907418012619, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5960564613342285, "sampling/importance_sampling_ratio/min": 1.4153580920397806e-28, "sampling/sampling_logp_difference/max": 64.125, "sampling/sampling_logp_difference/mean": 5.977953910827637, "step": 28 }, { "clip_ratio/high_max": 0.02468423533719033, "clip_ratio/high_mean": 0.007689093501539901, "clip_ratio/low_mean": 0.010837185312993824, "clip_ratio/low_min": 0.001011035667033866, "clip_ratio/region_mean": 0.018526279251091182, "completions/clipped_ratio": 1.0, "completions/max_length": 497.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 344.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 292.0, "completions/min_terminated_length": 0.0, "entropy": 1.2693349719047546, "epoch": 14.5, "frac_reward_zero_std": 0.125, "grad_norm": 0.019531767815351486, "kl": 0.18471159972250462, "learning_rate": 8e-05, "loss": 0.0015, "num_tokens": 3275280.0, "reward": 0.15625, "reward_std": 0.2003408819437027, "rewards/num_nodes_reward/mean": 0.3203125, "rewards/num_nodes_reward/std": 0.4684300124645233, "rewards/tree_correctness_reward/mean": 0.0859375, "rewards/tree_correctness_reward/std": 0.2813730239868164, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6971137523651123, "sampling/importance_sampling_ratio/min": 2.8754744950605667e-29, "sampling/sampling_logp_difference/max": 65.71875, "sampling/sampling_logp_difference/mean": 4.748955249786377, "step": 29 }, { "clip_ratio/high_max": 0.028233791352249682, "clip_ratio/high_mean": 0.009604192600818351, "clip_ratio/low_mean": 0.00898873881669715, "clip_ratio/low_min": 0.0010781507298815995, "clip_ratio/region_mean": 0.018592931097373366, "completions/clipped_ratio": 1.0, "completions/max_length": 827.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 476.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 269.0, "completions/min_terminated_length": 0.0, "entropy": 1.9841263145208359, "epoch": 15.0, "frac_reward_zero_std": 0.0625, "grad_norm": 0.01877565309405327, "kl": 0.10088046547025442, "learning_rate": 8e-05, "loss": 0.0088, "num_tokens": 3400416.0, "reward": 0.19374999403953552, "reward_std": 0.2513698935508728, "rewards/num_nodes_reward/mean": 0.390625, "rewards/num_nodes_reward/std": 0.4898075461387634, "rewards/tree_correctness_reward/mean": 0.109375, "rewards/tree_correctness_reward/std": 0.31333550810813904, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6178136467933655, "sampling/importance_sampling_ratio/min": 6.479982085325971e-29, "sampling/sampling_logp_difference/max": 64.90625, "sampling/sampling_logp_difference/mean": 5.760928630828857, "step": 30 }, { "epoch": 15.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 394.8, "eval_completions/max_terminated_length": 0.0, "eval_completions/mean_length": 280.95, "eval_completions/mean_terminated_length": 0.0, "eval_completions/min_length": 216.2, "eval_completions/min_terminated_length": 0.0, "eval_entropy": 0.43270275592803953, "eval_frac_reward_zero_std": 0.1, "eval_kl": 0.11525175720453262, "eval_loss": -0.0012721334351226687, "eval_num_tokens": 3400416.0, "eval_reward": 0.08750000074505807, "eval_reward_std": 0.1432440310716629, "eval_rewards/num_nodes_reward/mean": 0.2625, "eval_rewards/num_nodes_reward/std": 0.4365905404090881, "eval_rewards/tree_correctness_reward/mean": 0.0125, "eval_rewards/tree_correctness_reward/std": 0.05, "eval_runtime": 48.0365, "eval_samples_per_second": 0.208, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 0.8836063265800476, "eval_sampling/importance_sampling_ratio/min": 2.359868785450687e-28, "eval_sampling/sampling_logp_difference/max": 65.35625, "eval_sampling/sampling_logp_difference/mean": 1.9826534032821654, "eval_steps_per_second": 0.021, "step": 30 }, { "clip_ratio/high_max": 0.023760272772051394, "clip_ratio/high_mean": 0.006840807836852036, "clip_ratio/low_mean": 0.008243024989496917, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015083832549862564, "completions/clipped_ratio": 1.0, "completions/max_length": 416.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 348.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 250.0, "completions/min_terminated_length": 0.0, "entropy": 1.14379171282053, "epoch": 15.5, "frac_reward_zero_std": 0.25, "grad_norm": 0.017209049314260483, "kl": 0.16927079670131207, "learning_rate": 8e-05, "loss": 0.0003, "num_tokens": 3509152.0, "reward": 0.13437500596046448, "reward_std": 0.18341295421123505, "rewards/num_nodes_reward/mean": 0.265625, "rewards/num_nodes_reward/std": 0.44340085983276367, "rewards/tree_correctness_reward/mean": 0.078125, "rewards/tree_correctness_reward/std": 0.2694226801395416, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6916522979736328, "sampling/importance_sampling_ratio/min": 7.739234377113275e-30, "sampling/sampling_logp_difference/max": 67.03125, "sampling/sampling_logp_difference/mean": 4.918485641479492, "step": 31 }, { "clip_ratio/high_max": 0.021950071677565575, "clip_ratio/high_mean": 0.005826892127515748, "clip_ratio/low_mean": 0.010337555431760848, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01616444729734212, "completions/clipped_ratio": 1.0, "completions/max_length": 434.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 332.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 273.0, "completions/min_terminated_length": 0.0, "entropy": 1.255202367901802, "epoch": 16.0, "frac_reward_zero_std": 0.25, "grad_norm": 0.055969491600990295, "kl": 0.33255964796990156, "learning_rate": 8e-05, "loss": 0.0021, "num_tokens": 3615856.0, "reward": 0.21249999105930328, "reward_std": 0.21417073905467987, "rewards/num_nodes_reward/mean": 0.34375, "rewards/num_nodes_reward/std": 0.47682511806488037, "rewards/tree_correctness_reward/mean": 0.15625, "rewards/tree_correctness_reward/std": 0.3645188808441162, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6914284229278564, "sampling/importance_sampling_ratio/min": 3.434264486209825e-30, "sampling/sampling_logp_difference/max": 67.84375, "sampling/sampling_logp_difference/mean": 4.883679389953613, "step": 32 }, { "clip_ratio/high_max": 0.039684499613940716, "clip_ratio/high_mean": 0.010373118362622336, "clip_ratio/low_mean": 0.012662912718951702, "clip_ratio/low_min": 0.0008355614845640957, "clip_ratio/region_mean": 0.02303603116888553, "completions/clipped_ratio": 1.0, "completions/max_length": 374.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 330.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 268.0, "completions/min_terminated_length": 0.0, "entropy": 1.2601165771484375, "epoch": 16.5, "frac_reward_zero_std": 0.0625, "grad_norm": 0.023097224533557892, "kl": 0.5779337175190449, "learning_rate": 8e-05, "loss": -0.0009, "num_tokens": 3722224.0, "reward": 0.19609375298023224, "reward_std": 0.24019736051559448, "rewards/num_nodes_reward/mean": 0.34375, "rewards/num_nodes_reward/std": 0.47682511806488037, "rewards/tree_correctness_reward/mean": 0.1328125, "rewards/tree_correctness_reward/std": 0.3407054841518402, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6929054260253906, "sampling/importance_sampling_ratio/min": 2.6746079387988347e-30, "sampling/sampling_logp_difference/max": 68.09375, "sampling/sampling_logp_difference/mean": 4.866211414337158, "step": 33 }, { "clip_ratio/high_max": 0.02214621484745294, "clip_ratio/high_mean": 0.006370651186443865, "clip_ratio/low_mean": 0.009824704029597342, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016195355216041207, "completions/clipped_ratio": 1.0, "completions/max_length": 496.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 329.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 261.0, "completions/min_terminated_length": 0.0, "entropy": 1.3826522827148438, "epoch": 17.0, "frac_reward_zero_std": 0.1875, "grad_norm": 0.035209476947784424, "kl": 0.1160268122330308, "learning_rate": 8e-05, "loss": 0.0009, "num_tokens": 3828560.0, "reward": 0.23671874403953552, "reward_std": 0.2372259944677353, "rewards/num_nodes_reward/mean": 0.3515625, "rewards/num_nodes_reward/std": 0.4793342351913452, "rewards/tree_correctness_reward/mean": 0.1875, "rewards/tree_correctness_reward/std": 0.39184603095054626, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.685745894908905, "sampling/importance_sampling_ratio/min": 5.319093129561238e-30, "sampling/sampling_logp_difference/max": 67.40625, "sampling/sampling_logp_difference/mean": 4.903210639953613, "step": 34 }, { "clip_ratio/high_max": 0.02634466369636357, "clip_ratio/high_mean": 0.007860585872549564, "clip_ratio/low_mean": 0.010359672247432172, "clip_ratio/low_min": 0.00020424836839083582, "clip_ratio/region_mean": 0.018220258061774075, "completions/clipped_ratio": 1.0, "completions/max_length": 624.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 370.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 306.0, "completions/min_terminated_length": 0.0, "entropy": 1.582766279578209, "epoch": 17.5, "frac_reward_zero_std": 0.0625, "grad_norm": 0.018563592806458473, "kl": 0.12331247562542558, "learning_rate": 8e-05, "loss": 0.002, "num_tokens": 3940064.0, "reward": 0.17031249403953552, "reward_std": 0.20386356115341187, "rewards/num_nodes_reward/mean": 0.3671875, "rewards/num_nodes_reward/std": 0.4839322865009308, "rewards/tree_correctness_reward/mean": 0.0859375, "rewards/tree_correctness_reward/std": 0.2813730239868164, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6719623804092407, "sampling/importance_sampling_ratio/min": 7.1986098550006065e-31, "sampling/sampling_logp_difference/max": 69.40625, "sampling/sampling_logp_difference/mean": 5.04443359375, "step": 35 }, { "clip_ratio/high_max": 0.03328611503820866, "clip_ratio/high_mean": 0.00904201103548985, "clip_ratio/low_mean": 0.008888563315849751, "clip_ratio/low_min": 0.0007102272938936949, "clip_ratio/region_mean": 0.017930574482306838, "completions/clipped_ratio": 1.0, "completions/max_length": 319.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 286.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 255.0, "completions/min_terminated_length": 0.0, "entropy": 0.9203391894698143, "epoch": 18.0, "frac_reward_zero_std": 0.125, "grad_norm": 0.02274245396256447, "kl": 0.09651335515081882, "learning_rate": 8e-05, "loss": -0.0016, "num_tokens": 4040848.0, "reward": 0.13906250894069672, "reward_std": 0.18384268879890442, "rewards/num_nodes_reward/mean": 0.3359375, "rewards/num_nodes_reward/std": 0.47417303919792175, "rewards/tree_correctness_reward/mean": 0.0546875, "rewards/tree_correctness_reward/std": 0.22826264798641205, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7671911120414734, "sampling/importance_sampling_ratio/min": 2.4352582929827077e-30, "sampling/sampling_logp_difference/max": 68.1875, "sampling/sampling_logp_difference/mean": 3.8148961067199707, "step": 36 }, { "clip_ratio/high_max": 0.02016261813696474, "clip_ratio/high_mean": 0.005845085557666607, "clip_ratio/low_mean": 0.01172737724846229, "clip_ratio/low_min": 0.00040849673678167164, "clip_ratio/region_mean": 0.017572462325915694, "completions/clipped_ratio": 1.0, "completions/max_length": 396.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 320.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 278.0, "completions/min_terminated_length": 0.0, "entropy": 1.1303500458598137, "epoch": 18.5, "frac_reward_zero_std": 0.0625, "grad_norm": 0.02016027830541134, "kl": 0.10191036853939295, "learning_rate": 8e-05, "loss": -0.0006, "num_tokens": 4146000.0, "reward": 0.18046875298023224, "reward_std": 0.22489714622497559, "rewards/num_nodes_reward/mean": 0.3828125, "rewards/num_nodes_reward/std": 0.4879830479621887, "rewards/tree_correctness_reward/mean": 0.09375, "rewards/tree_correctness_reward/std": 0.29262590408325195, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7310328483581543, "sampling/importance_sampling_ratio/min": 6.157297243929202e-31, "sampling/sampling_logp_difference/max": 69.5625, "sampling/sampling_logp_difference/mean": 4.264334201812744, "step": 37 }, { "clip_ratio/high_max": 0.02499994623940438, "clip_ratio/high_mean": 0.006971912283916026, "clip_ratio/low_mean": 0.00942765100626275, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016399563290178776, "completions/clipped_ratio": 1.0, "completions/max_length": 471.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 316.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 268.0, "completions/min_terminated_length": 0.0, "entropy": 1.3286628648638725, "epoch": 19.0, "frac_reward_zero_std": 0.125, "grad_norm": 0.013679983094334602, "kl": 0.26765078864991665, "learning_rate": 8e-05, "loss": -0.0011, "num_tokens": 4250608.0, "reward": 0.19765625894069672, "reward_std": 0.20679643750190735, "rewards/num_nodes_reward/mean": 0.3125, "rewards/num_nodes_reward/std": 0.4653336703777313, "rewards/tree_correctness_reward/mean": 0.1484375, "rewards/tree_correctness_reward/std": 0.356930136680603, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7153570652008057, "sampling/importance_sampling_ratio/min": 1.1389853770767086e-31, "sampling/sampling_logp_difference/max": 71.25, "sampling/sampling_logp_difference/mean": 4.482044219970703, "step": 38 }, { "clip_ratio/high_max": 0.027657350292429328, "clip_ratio/high_mean": 0.010526920494157821, "clip_ratio/low_mean": 0.007367000973317772, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017893921118229628, "completions/clipped_ratio": 1.0, "completions/max_length": 377.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 318.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 277.0, "completions/min_terminated_length": 0.0, "entropy": 1.372626006603241, "epoch": 19.5, "frac_reward_zero_std": 0.0625, "grad_norm": 0.01881311647593975, "kl": 0.10195530578494072, "learning_rate": 8e-05, "loss": -0.0005, "num_tokens": 4355488.0, "reward": 0.25859373807907104, "reward_std": 0.27541470527648926, "rewards/num_nodes_reward/mean": 0.4609375, "rewards/num_nodes_reward/std": 0.5004304051399231, "rewards/tree_correctness_reward/mean": 0.171875, "rewards/tree_correctness_reward/std": 0.3787541687488556, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7058644890785217, "sampling/importance_sampling_ratio/min": 1.7816729014961325e-30, "sampling/sampling_logp_difference/max": 68.5, "sampling/sampling_logp_difference/mean": 4.5933427810668945, "step": 39 }, { "clip_ratio/high_max": 0.020776290213689208, "clip_ratio/high_mean": 0.006346785405185074, "clip_ratio/low_mean": 0.009560910548316315, "clip_ratio/low_min": 0.000830564764328301, "clip_ratio/region_mean": 0.01590769551694393, "completions/clipped_ratio": 1.0, "completions/max_length": 394.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 321.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 281.0, "completions/min_terminated_length": 0.0, "entropy": 1.3571174144744873, "epoch": 20.0, "frac_reward_zero_std": 0.125, "grad_norm": 0.020793592557311058, "kl": 0.07546079531311989, "learning_rate": 8e-05, "loss": -0.0004, "num_tokens": 4460816.0, "reward": 0.18359375, "reward_std": 0.19039751589298248, "rewards/num_nodes_reward/mean": 0.375, "rewards/num_nodes_reward/std": 0.4860251843929291, "rewards/tree_correctness_reward/mean": 0.1015625, "rewards/tree_correctness_reward/std": 0.3032590448856354, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7044184803962708, "sampling/importance_sampling_ratio/min": 3.328603530794073e-30, "sampling/sampling_logp_difference/max": 67.875, "sampling/sampling_logp_difference/mean": 4.642924785614014, "step": 40 }, { "epoch": 20.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 334.8, "eval_completions/max_terminated_length": 0.0, "eval_completions/mean_length": 269.0, "eval_completions/mean_terminated_length": 0.0, "eval_completions/min_length": 214.2, "eval_completions/min_terminated_length": 0.0, "eval_entropy": 0.512590229511261, "eval_frac_reward_zero_std": 0.3, "eval_kl": 0.08347582668066025, "eval_loss": 0.0019772450905293226, "eval_num_tokens": 4460816.0, "eval_reward": 0.1199999988079071, "eval_reward_std": 0.1598766639828682, "eval_rewards/num_nodes_reward/mean": 0.3125, "eval_rewards/num_nodes_reward/std": 0.45677800178527833, "eval_rewards/tree_correctness_reward/mean": 0.0375, "eval_rewards/tree_correctness_reward/std": 0.15, "eval_runtime": 42.3185, "eval_samples_per_second": 0.236, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 0.8967474222183227, "eval_sampling/importance_sampling_ratio/min": 4.261833468723189e-29, "eval_sampling/sampling_logp_difference/max": 66.5625, "eval_sampling/sampling_logp_difference/mean": 1.7886011600494385, "eval_steps_per_second": 0.024, "step": 40 }, { "clip_ratio/high_max": 0.024958890862762928, "clip_ratio/high_mean": 0.007247598812682554, "clip_ratio/low_mean": 0.0089869424700737, "clip_ratio/low_min": 0.0005296610179357231, "clip_ratio/region_mean": 0.016234541544690728, "completions/clipped_ratio": 1.0, "completions/max_length": 411.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 341.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 266.0, "completions/min_terminated_length": 0.0, "entropy": 1.4715371578931808, "epoch": 20.5, "frac_reward_zero_std": 0.125, "grad_norm": 0.016453370451927185, "kl": 0.06598789431154728, "learning_rate": 8e-05, "loss": 0.0026, "num_tokens": 4568624.0, "reward": 0.20546874403953552, "reward_std": 0.2402987778186798, "rewards/num_nodes_reward/mean": 0.375, "rewards/num_nodes_reward/std": 0.4860251843929291, "rewards/tree_correctness_reward/mean": 0.1328125, "rewards/tree_correctness_reward/std": 0.3407054841518402, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6859472990036011, "sampling/importance_sampling_ratio/min": 2.5923188203429674e-30, "sampling/sampling_logp_difference/max": 68.125, "sampling/sampling_logp_difference/mean": 4.892743110656738, "step": 41 }, { "clip_ratio/high_max": 0.029607935342937708, "clip_ratio/high_mean": 0.008916470338590443, "clip_ratio/low_mean": 0.00797058257739991, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01688705338165164, "completions/clipped_ratio": 1.0, "completions/max_length": 402.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 336.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 309.0, "completions/min_terminated_length": 0.0, "entropy": 1.304415225982666, "epoch": 21.0, "frac_reward_zero_std": 0.1875, "grad_norm": 0.026901695877313614, "kl": 0.18033620435744524, "learning_rate": 8e-05, "loss": -0.0014, "num_tokens": 4675856.0, "reward": 0.22187501192092896, "reward_std": 0.26757723093032837, "rewards/num_nodes_reward/mean": 0.375, "rewards/num_nodes_reward/std": 0.4860251843929291, "rewards/tree_correctness_reward/mean": 0.15625, "rewards/tree_correctness_reward/std": 0.3645188808441162, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6959612369537354, "sampling/importance_sampling_ratio/min": 5.662144510848715e-30, "sampling/sampling_logp_difference/max": 67.34375, "sampling/sampling_logp_difference/mean": 4.799839019775391, "step": 42 }, { "clip_ratio/high_max": 0.02922317571938038, "clip_ratio/high_mean": 0.008055589423747733, "clip_ratio/low_mean": 0.006867704272735864, "clip_ratio/low_min": 0.00020424836839083582, "clip_ratio/region_mean": 0.014923293958418071, "completions/clipped_ratio": 1.0, "completions/max_length": 694.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 378.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 301.0, "completions/min_terminated_length": 0.0, "entropy": 1.506517842411995, "epoch": 21.5, "frac_reward_zero_std": 0.125, "grad_norm": 0.013327512890100479, "kl": 0.07364828884601593, "learning_rate": 8e-05, "loss": 0.0057, "num_tokens": 4788416.0, "reward": 0.17265625298023224, "reward_std": 0.19572840631008148, "rewards/num_nodes_reward/mean": 0.375, "rewards/num_nodes_reward/std": 0.4860251843929291, "rewards/tree_correctness_reward/mean": 0.0859375, "rewards/tree_correctness_reward/std": 0.2813730239868164, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6737306714057922, "sampling/importance_sampling_ratio/min": 1.5239456710387979e-30, "sampling/sampling_logp_difference/max": 68.65625, "sampling/sampling_logp_difference/mean": 5.096302032470703, "step": 43 }, { "clip_ratio/high_max": 0.019377231248654425, "clip_ratio/high_mean": 0.005192502780118957, "clip_ratio/low_mean": 0.009289373003412038, "clip_ratio/low_min": 0.00041254126699641347, "clip_ratio/region_mean": 0.014481875929050148, "completions/clipped_ratio": 1.0, "completions/max_length": 604.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 366.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 269.0, "completions/min_terminated_length": 0.0, "entropy": 1.4528640136122704, "epoch": 22.0, "frac_reward_zero_std": 0.125, "grad_norm": 0.022027945145964622, "kl": 0.07306265132501721, "learning_rate": 8e-05, "loss": 0.0008, "num_tokens": 4899392.0, "reward": 0.21718749403953552, "reward_std": 0.26741713285446167, "rewards/num_nodes_reward/mean": 0.3046875, "rewards/num_nodes_reward/std": 0.46208351850509644, "rewards/tree_correctness_reward/mean": 0.1796875, "rewards/tree_correctness_reward/std": 0.3854354918003082, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6830634474754333, "sampling/importance_sampling_ratio/min": 1.587987769361992e-29, "sampling/sampling_logp_difference/max": 66.3125, "sampling/sampling_logp_difference/mean": 4.952033519744873, "step": 44 }, { "clip_ratio/high_max": 0.023605642607435584, "clip_ratio/high_mean": 0.006167254643514752, "clip_ratio/low_mean": 0.009567796834744513, "clip_ratio/low_min": 0.0005222259496804327, "clip_ratio/region_mean": 0.015735051245428622, "completions/clipped_ratio": 1.0, "completions/max_length": 374.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 342.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 292.0, "completions/min_terminated_length": 0.0, "entropy": 1.1843339204788208, "epoch": 22.5, "frac_reward_zero_std": 0.125, "grad_norm": 0.02136458270251751, "kl": 0.07532984064891934, "learning_rate": 8e-05, "loss": -0.0042, "num_tokens": 5007408.0, "reward": 0.15625, "reward_std": 0.17858564853668213, "rewards/num_nodes_reward/mean": 0.375, "rewards/num_nodes_reward/std": 0.4860251843929291, "rewards/tree_correctness_reward/mean": 0.0625, "rewards/tree_correctness_reward/std": 0.24301259219646454, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7063877582550049, "sampling/importance_sampling_ratio/min": 1.5723209311175096e-30, "sampling/sampling_logp_difference/max": 68.625, "sampling/sampling_logp_difference/mean": 4.6953840255737305, "step": 45 }, { "clip_ratio/high_max": 0.02377504063770175, "clip_ratio/high_mean": 0.006953375646844506, "clip_ratio/low_mean": 0.006827459670603275, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01378083520103246, "completions/clipped_ratio": 1.0, "completions/max_length": 400.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 337.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 282.0, "completions/min_terminated_length": 0.0, "entropy": 1.4082599580287933, "epoch": 23.0, "frac_reward_zero_std": 0.25, "grad_norm": 0.013320153579115868, "kl": 0.0853613680228591, "learning_rate": 8e-05, "loss": 0.0028, "num_tokens": 5114768.0, "reward": 0.31328123807907104, "reward_std": 0.24592912197113037, "rewards/num_nodes_reward/mean": 0.40625, "rewards/num_nodes_reward/std": 0.4930621087551117, "rewards/tree_correctness_reward/mean": 0.2734375, "rewards/tree_correctness_reward/std": 0.447474867105484, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6728501319885254, "sampling/importance_sampling_ratio/min": 4.274011319363809e-30, "sampling/sampling_logp_difference/max": 67.625, "sampling/sampling_logp_difference/mean": 5.122320652008057, "step": 46 }, { "clip_ratio/high_max": 0.032670305110514164, "clip_ratio/high_mean": 0.00797749770572409, "clip_ratio/low_mean": 0.009248764137737453, "clip_ratio/low_min": 0.00043302541598677635, "clip_ratio/region_mean": 0.01722626166883856, "completions/clipped_ratio": 1.0, "completions/max_length": 433.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 351.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 321.0, "completions/min_terminated_length": 0.0, "entropy": 1.5086831152439117, "epoch": 23.5, "frac_reward_zero_std": 0.125, "grad_norm": 0.017905812710523605, "kl": 0.07807782106101513, "learning_rate": 8e-05, "loss": 0.0029, "num_tokens": 5223872.0, "reward": 0.28203123807907104, "reward_std": 0.25505807995796204, "rewards/num_nodes_reward/mean": 0.4296875, "rewards/num_nodes_reward/std": 0.4969765841960907, "rewards/tree_correctness_reward/mean": 0.21875, "rewards/tree_correctness_reward/std": 0.41502299904823303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6686773300170898, "sampling/importance_sampling_ratio/min": 8.15709290233341e-31, "sampling/sampling_logp_difference/max": 69.28125, "sampling/sampling_logp_difference/mean": 5.150998115539551, "step": 47 }, { "clip_ratio/high_max": 0.02818808937445283, "clip_ratio/high_mean": 0.009505914116743952, "clip_ratio/low_mean": 0.009267379238735884, "clip_ratio/low_min": 0.0005621549644274637, "clip_ratio/region_mean": 0.018773293239064515, "completions/clipped_ratio": 1.0, "completions/max_length": 362.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 331.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 306.0, "completions/min_terminated_length": 0.0, "entropy": 1.1123817935585976, "epoch": 24.0, "frac_reward_zero_std": 0.0, "grad_norm": 0.016510041430592537, "kl": 0.08883423544466496, "learning_rate": 8e-05, "loss": -0.0009, "num_tokens": 5330464.0, "reward": 0.17500001192092896, "reward_std": 0.1919649839401245, "rewards/num_nodes_reward/mean": 0.4375, "rewards/num_nodes_reward/std": 0.49802759289741516, "rewards/tree_correctness_reward/mean": 0.0625, "rewards/tree_correctness_reward/std": 0.24301259219646454, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7282172441482544, "sampling/importance_sampling_ratio/min": 9.048082073307534e-30, "sampling/sampling_logp_difference/max": 66.875, "sampling/sampling_logp_difference/mean": 4.357303619384766, "step": 48 }, { "clip_ratio/high_max": 0.03161731082946062, "clip_ratio/high_mean": 0.00814559290301986, "clip_ratio/low_mean": 0.007802179316058755, "clip_ratio/low_min": 0.0001806358341127634, "clip_ratio/region_mean": 0.01594777253922075, "completions/clipped_ratio": 1.0, "completions/max_length": 392.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 331.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 284.0, "completions/min_terminated_length": 0.0, "entropy": 1.0186249688267708, "epoch": 24.5, "frac_reward_zero_std": 0.0625, "grad_norm": 0.01576230861246586, "kl": 0.08237889781594276, "learning_rate": 8e-05, "loss": 0.0025, "num_tokens": 5436976.0, "reward": 0.13671875, "reward_std": 0.2009136974811554, "rewards/num_nodes_reward/mean": 0.328125, "rewards/num_nodes_reward/std": 0.4713755249977112, "rewards/tree_correctness_reward/mean": 0.0546875, "rewards/tree_correctness_reward/std": 0.22826264798641205, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7501358985900879, "sampling/importance_sampling_ratio/min": 5.266615300158608e-31, "sampling/sampling_logp_difference/max": 69.71875, "sampling/sampling_logp_difference/mean": 4.040156364440918, "step": 49 }, { "clip_ratio/high_max": 0.025265710428357124, "clip_ratio/high_mean": 0.008964006090536714, "clip_ratio/low_mean": 0.006059841747628525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015023847809061408, "completions/clipped_ratio": 1.0, "completions/max_length": 377.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 334.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 294.0, "completions/min_terminated_length": 0.0, "entropy": 1.5162832140922546, "epoch": 25.0, "frac_reward_zero_std": 0.0625, "grad_norm": 0.01368473656475544, "kl": 0.1216685357503593, "learning_rate": 8e-05, "loss": 0.0034, "num_tokens": 5543904.0, "reward": 0.2906250059604645, "reward_std": 0.2603147625923157, "rewards/num_nodes_reward/mean": 0.53125, "rewards/num_nodes_reward/std": 0.5009832978248596, "rewards/tree_correctness_reward/mean": 0.1875, "rewards/tree_correctness_reward/std": 0.39184603095054626, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6750624179840088, "sampling/importance_sampling_ratio/min": 2.2877134394121284e-30, "sampling/sampling_logp_difference/max": 68.25, "sampling/sampling_logp_difference/mean": 5.027508735656738, "step": 50 }, { "epoch": 25.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 330.8, "eval_completions/max_terminated_length": 0.0, "eval_completions/mean_length": 273.475, "eval_completions/mean_terminated_length": 0.0, "eval_completions/min_length": 222.2, "eval_completions/min_terminated_length": 0.0, "eval_entropy": 0.4963851451873779, "eval_frac_reward_zero_std": 0.2, "eval_kl": 0.08878492712974548, "eval_loss": 0.0014535055961459875, "eval_num_tokens": 5543904.0, "eval_reward": 0.16499999836087226, "eval_reward_std": 0.20676954239606857, "eval_rewards/num_nodes_reward/mean": 0.375, "eval_rewards/num_nodes_reward/std": 0.48803112506866453, "eval_rewards/tree_correctness_reward/mean": 0.075, "eval_rewards/tree_correctness_reward/std": 0.20493902564048766, "eval_runtime": 41.1986, "eval_samples_per_second": 0.243, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 0.895362138748169, "eval_sampling/importance_sampling_ratio/min": 2.0050247899166646e-28, "eval_sampling/sampling_logp_difference/max": 64.80625, "eval_sampling/sampling_logp_difference/mean": 1.7994532585144043, "eval_steps_per_second": 0.024, "step": 50 }, { "clip_ratio/high_max": 0.026102284318767488, "clip_ratio/high_mean": 0.007913386914879084, "clip_ratio/low_mean": 0.009147425182163715, "clip_ratio/low_min": 0.000515109917614609, "clip_ratio/region_mean": 0.017060812329873443, "completions/clipped_ratio": 1.0, "completions/max_length": 451.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 335.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 255.0, "completions/min_terminated_length": 0.0, "entropy": 1.3594164103269577, "epoch": 25.5, "frac_reward_zero_std": 0.0, "grad_norm": 0.01686623878777027, "kl": 0.0903850244358182, "learning_rate": 8e-05, "loss": 0.0013, "num_tokens": 5650912.0, "reward": 0.17734375596046448, "reward_std": 0.24893349409103394, "rewards/num_nodes_reward/mean": 0.390625, "rewards/num_nodes_reward/std": 0.4898075461387634, "rewards/tree_correctness_reward/mean": 0.0859375, "rewards/tree_correctness_reward/std": 0.2813730239868164, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6972545385360718, "sampling/importance_sampling_ratio/min": 3.1943635805629367e-31, "sampling/sampling_logp_difference/max": 70.21875, "sampling/sampling_logp_difference/mean": 4.74014139175415, "step": 51 }, { "clip_ratio/high_max": 0.03163799655158073, "clip_ratio/high_mean": 0.008631394244730473, "clip_ratio/low_mean": 0.006990044581471011, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015621437807567418, "completions/clipped_ratio": 1.0, "completions/max_length": 489.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 357.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 289.0, "completions/min_terminated_length": 0.0, "entropy": 1.4162916988134384, "epoch": 26.0, "frac_reward_zero_std": 0.125, "grad_norm": 0.01663323864340782, "kl": 0.08927854150533676, "learning_rate": 8e-05, "loss": 0.0037, "num_tokens": 5760816.0, "reward": 0.25859376788139343, "reward_std": 0.2740044593811035, "rewards/num_nodes_reward/mean": 0.4609375, "rewards/num_nodes_reward/std": 0.5004304051399231, "rewards/tree_correctness_reward/mean": 0.171875, "rewards/tree_correctness_reward/std": 0.3787541687488556, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6846030950546265, "sampling/importance_sampling_ratio/min": 2.8754744950605667e-29, "sampling/sampling_logp_difference/max": 65.71875, "sampling/sampling_logp_difference/mean": 4.915652275085449, "step": 52 }, { "clip_ratio/high_max": 0.02919642929919064, "clip_ratio/high_mean": 0.00909939588746056, "clip_ratio/low_mean": 0.006390504102455452, "clip_ratio/low_min": 0.00017458100046496838, "clip_ratio/region_mean": 0.015489900717511773, "completions/clipped_ratio": 1.0, "completions/max_length": 366.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 328.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 273.0, "completions/min_terminated_length": 0.0, "entropy": 1.3649296462535858, "epoch": 26.5, "frac_reward_zero_std": 0.125, "grad_norm": 0.01446872390806675, "kl": 0.12801157543435693, "learning_rate": 8e-05, "loss": 0.0013, "num_tokens": 5866960.0, "reward": 0.2523437738418579, "reward_std": 0.2125542014837265, "rewards/num_nodes_reward/mean": 0.4765625, "rewards/num_nodes_reward/std": 0.5014128684997559, "rewards/tree_correctness_reward/mean": 0.15625, "rewards/tree_correctness_reward/std": 0.3645188808441162, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6927582025527954, "sampling/importance_sampling_ratio/min": 7.739234377113275e-30, "sampling/sampling_logp_difference/max": 67.03125, "sampling/sampling_logp_difference/mean": 4.809751033782959, "step": 53 }, { "clip_ratio/high_max": 0.022412333288230002, "clip_ratio/high_mean": 0.007199650252005085, "clip_ratio/low_mean": 0.010359178180806339, "clip_ratio/low_min": 0.0007470055861631408, "clip_ratio/region_mean": 0.01755882811266929, "completions/clipped_ratio": 1.0, "completions/max_length": 434.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 335.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 261.0, "completions/min_terminated_length": 0.0, "entropy": 1.3801551014184952, "epoch": 27.0, "frac_reward_zero_std": 0.0, "grad_norm": 0.017827318981289864, "kl": 0.10812216717749834, "learning_rate": 8e-05, "loss": 0.001, "num_tokens": 5973968.0, "reward": 0.22734376788139343, "reward_std": 0.24771972000598907, "rewards/num_nodes_reward/mean": 0.375, "rewards/num_nodes_reward/std": 0.4860251843929291, "rewards/tree_correctness_reward/mean": 0.1640625, "rewards/tree_correctness_reward/std": 0.371787428855896, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6830041408538818, "sampling/importance_sampling_ratio/min": 2.0189000747124148e-30, "sampling/sampling_logp_difference/max": 68.375, "sampling/sampling_logp_difference/mean": 4.974677085876465, "step": 54 }, { "clip_ratio/high_max": 0.02179216395597905, "clip_ratio/high_mean": 0.00514518961426802, "clip_ratio/low_mean": 0.007329581305384636, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012474770890548825, "completions/clipped_ratio": 1.0, "completions/max_length": 375.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 309.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 271.0, "completions/min_terminated_length": 0.0, "entropy": 1.1433220282196999, "epoch": 27.5, "frac_reward_zero_std": 0.1875, "grad_norm": 0.0161687433719635, "kl": 0.11182346381247044, "learning_rate": 8e-05, "loss": 0.0016, "num_tokens": 6077664.0, "reward": 0.23828125, "reward_std": 0.166175976395607, "rewards/num_nodes_reward/mean": 0.484375, "rewards/num_nodes_reward/std": 0.5017194747924805, "rewards/tree_correctness_reward/mean": 0.1328125, "rewards/tree_correctness_reward/std": 0.3407054841518402, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7335993051528931, "sampling/importance_sampling_ratio/min": 1.4013943476265886e-29, "sampling/sampling_logp_difference/max": 66.4375, "sampling/sampling_logp_difference/mean": 4.2973480224609375, "step": 55 }, { "clip_ratio/high_max": 0.03312312415800989, "clip_ratio/high_mean": 0.009022045298479497, "clip_ratio/low_mean": 0.007904802827397361, "clip_ratio/low_min": 0.0007787960930727422, "clip_ratio/region_mean": 0.016926848562434316, "completions/clipped_ratio": 1.0, "completions/max_length": 381.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 308.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 267.0, "completions/min_terminated_length": 0.0, "entropy": 1.2076809257268906, "epoch": 28.0, "frac_reward_zero_std": 0.0625, "grad_norm": 0.020576391369104385, "kl": 0.09819220937788486, "learning_rate": 8e-05, "loss": -0.0033, "num_tokens": 6181232.0, "reward": 0.24765625596046448, "reward_std": 0.21016791462898254, "rewards/num_nodes_reward/mean": 0.515625, "rewards/num_nodes_reward/std": 0.5017194747924805, "rewards/tree_correctness_reward/mean": 0.1328125, "rewards/tree_correctness_reward/std": 0.3407054841518402, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7208269834518433, "sampling/importance_sampling_ratio/min": 6.0273218931633014e-30, "sampling/sampling_logp_difference/max": 67.28125, "sampling/sampling_logp_difference/mean": 4.474984169006348, "step": 56 }, { "clip_ratio/high_max": 0.03603834635578096, "clip_ratio/high_mean": 0.010579350520856678, "clip_ratio/low_mean": 0.008179082709830254, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018758433405309916, "completions/clipped_ratio": 1.0, "completions/max_length": 307.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 283.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 253.0, "completions/min_terminated_length": 0.0, "entropy": 0.7046204619109631, "epoch": 28.5, "frac_reward_zero_std": 0.125, "grad_norm": 0.015703110024333, "kl": 0.2039838396012783, "learning_rate": 8e-05, "loss": -0.0005, "num_tokens": 6281648.0, "reward": 0.18671874701976776, "reward_std": 0.2075519859790802, "rewards/num_nodes_reward/mean": 0.421875, "rewards/num_nodes_reward/std": 0.4957992732524872, "rewards/tree_correctness_reward/mean": 0.0859375, "rewards/tree_correctness_reward/std": 0.2813730239868164, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.8078951835632324, "sampling/importance_sampling_ratio/min": 5.319093129561238e-30, "sampling/sampling_logp_difference/max": 67.40625, "sampling/sampling_logp_difference/mean": 3.298273801803589, "step": 57 }, { "clip_ratio/high_max": 0.031320654321461916, "clip_ratio/high_mean": 0.010095121571794152, "clip_ratio/low_mean": 0.006367274560034275, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016462396597489715, "completions/clipped_ratio": 1.0, "completions/max_length": 332.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 281.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 237.0, "completions/min_terminated_length": 0.0, "entropy": 0.9775640517473221, "epoch": 29.0, "frac_reward_zero_std": 0.0625, "grad_norm": 0.016515173017978668, "kl": 0.24808669555932283, "learning_rate": 8e-05, "loss": 0.0005, "num_tokens": 6381840.0, "reward": 0.2828125059604645, "reward_std": 0.23257049918174744, "rewards/num_nodes_reward/mean": 0.5234375, "rewards/num_nodes_reward/std": 0.5014128684997559, "rewards/tree_correctness_reward/mean": 0.1796875, "rewards/tree_correctness_reward/std": 0.3854354918003082, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7530981302261353, "sampling/importance_sampling_ratio/min": 1.0473915155374724e-30, "sampling/sampling_logp_difference/max": 69.03125, "sampling/sampling_logp_difference/mean": 4.0734052658081055, "step": 58 }, { "clip_ratio/high_max": 0.026522311265580356, "clip_ratio/high_mean": 0.006606486334931105, "clip_ratio/low_mean": 0.007358212023973465, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01396469830069691, "completions/clipped_ratio": 1.0, "completions/max_length": 344.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 297.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 265.0, "completions/min_terminated_length": 0.0, "entropy": 1.265642300248146, "epoch": 29.5, "frac_reward_zero_std": 0.1875, "grad_norm": 0.017903149127960205, "kl": 0.1256992220878601, "learning_rate": 8e-05, "loss": 0.0029, "num_tokens": 6484000.0, "reward": 0.24375000596046448, "reward_std": 0.18986675143241882, "rewards/num_nodes_reward/mean": 0.4296875, "rewards/num_nodes_reward/std": 0.4969765841960907, "rewards/tree_correctness_reward/mean": 0.1640625, "rewards/tree_correctness_reward/std": 0.371787428855896, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7110901474952698, "sampling/importance_sampling_ratio/min": 2.7595090997417772e-30, "sampling/sampling_logp_difference/max": 68.0625, "sampling/sampling_logp_difference/mean": 4.629883289337158, "step": 59 }, { "clip_ratio/high_max": 0.03863569814711809, "clip_ratio/high_mean": 0.012202476849779487, "clip_ratio/low_mean": 0.007711057201959193, "clip_ratio/low_min": 0.00021043770539108664, "clip_ratio/region_mean": 0.019913534400984645, "completions/clipped_ratio": 1.0, "completions/max_length": 348.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 299.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 272.0, "completions/min_terminated_length": 0.0, "entropy": 1.109927773475647, "epoch": 30.0, "frac_reward_zero_std": 0.0625, "grad_norm": 0.014393464662134647, "kl": 0.10836683120578527, "learning_rate": 8e-05, "loss": 0.0022, "num_tokens": 6586480.0, "reward": 0.24140626192092896, "reward_std": 0.18571873009204865, "rewards/num_nodes_reward/mean": 0.53125, "rewards/num_nodes_reward/std": 0.5009832978248596, "rewards/tree_correctness_reward/mean": 0.1171875, "rewards/tree_correctness_reward/std": 0.322907418012619, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7247623801231384, "sampling/importance_sampling_ratio/min": 4.8430893875347414e-30, "sampling/sampling_logp_difference/max": 67.5, "sampling/sampling_logp_difference/mean": 4.4581475257873535, "step": 60 }, { "epoch": 30.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 291.0, "eval_completions/max_terminated_length": 0.0, "eval_completions/mean_length": 243.475, "eval_completions/mean_terminated_length": 0.0, "eval_completions/min_length": 206.6, "eval_completions/min_terminated_length": 0.0, "eval_entropy": 0.44202286601066587, "eval_frac_reward_zero_std": 0.1, "eval_kl": 0.21619968861341476, "eval_loss": -0.0031181154772639275, "eval_num_tokens": 6586480.0, "eval_reward": 0.1650000035762787, "eval_reward_std": 0.19037772566080094, "eval_rewards/num_nodes_reward/mean": 0.4625, "eval_rewards/num_nodes_reward/std": 0.47500433325767516, "eval_rewards/tree_correctness_reward/mean": 0.0375, "eval_rewards/tree_correctness_reward/std": 0.15, "eval_runtime": 36.9602, "eval_samples_per_second": 0.271, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 0.910171902179718, "eval_sampling/importance_sampling_ratio/min": 1.816025560441625e-28, "eval_sampling/sampling_logp_difference/max": 64.5625, "eval_sampling/sampling_logp_difference/mean": 1.6427417516708374, "eval_steps_per_second": 0.027, "step": 60 }, { "clip_ratio/high_max": 0.03227627417072654, "clip_ratio/high_mean": 0.011085564619861543, "clip_ratio/low_mean": 0.007012205256614834, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01809776946902275, "completions/clipped_ratio": 1.0, "completions/max_length": 450.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 313.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 269.0, "completions/min_terminated_length": 0.0, "entropy": 1.2767343968153, "epoch": 30.5, "frac_reward_zero_std": 0.0625, "grad_norm": 0.014972602017223835, "kl": 0.1321253152564168, "learning_rate": 8e-05, "loss": -0.0011, "num_tokens": 6690720.0, "reward": 0.2679687738418579, "reward_std": 0.2380528301000595, "rewards/num_nodes_reward/mean": 0.546875, "rewards/num_nodes_reward/std": 0.4997538626194, "rewards/tree_correctness_reward/mean": 0.1484375, "rewards/tree_correctness_reward/std": 0.356930136680603, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7003626823425293, "sampling/importance_sampling_ratio/min": 6.416050539895483e-30, "sampling/sampling_logp_difference/max": 67.21875, "sampling/sampling_logp_difference/mean": 4.760910987854004, "step": 61 }, { "clip_ratio/high_max": 0.032294590724632144, "clip_ratio/high_mean": 0.010691975359804928, "clip_ratio/low_mean": 0.006633859622525051, "clip_ratio/low_min": 0.00045476193190552294, "clip_ratio/region_mean": 0.01732583437114954, "completions/clipped_ratio": 1.0, "completions/max_length": 316.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 282.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 263.0, "completions/min_terminated_length": 0.0, "entropy": 0.9472327083349228, "epoch": 31.0, "frac_reward_zero_std": 0.0625, "grad_norm": 0.014592193067073822, "kl": 0.16506292391568422, "learning_rate": 8e-05, "loss": 0.0009, "num_tokens": 6791056.0, "reward": 0.30859375, "reward_std": 0.21023596823215485, "rewards/num_nodes_reward/mean": 0.609375, "rewards/num_nodes_reward/std": 0.4898075461387634, "rewards/tree_correctness_reward/mean": 0.1796875, "rewards/tree_correctness_reward/std": 0.3854354918003082, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7614758610725403, "sampling/importance_sampling_ratio/min": 1.838229414177471e-30, "sampling/sampling_logp_difference/max": 68.46875, "sampling/sampling_logp_difference/mean": 3.9790267944335938, "step": 62 }, { "clip_ratio/high_max": 0.03611426963470876, "clip_ratio/high_mean": 0.012037943699397147, "clip_ratio/low_mean": 0.005005814411561005, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017043758183717728, "completions/clipped_ratio": 1.0, "completions/max_length": 304.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 271.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 229.0, "completions/min_terminated_length": 0.0, "entropy": 0.9895319566130638, "epoch": 31.5, "frac_reward_zero_std": 0.1875, "grad_norm": 0.015923891216516495, "kl": 0.153902149759233, "learning_rate": 8e-05, "loss": -0.0001, "num_tokens": 6889872.0, "reward": 0.28984373807907104, "reward_std": 0.2260342389345169, "rewards/num_nodes_reward/mean": 0.4921875, "rewards/num_nodes_reward/std": 0.5019033551216125, "rewards/tree_correctness_reward/mean": 0.203125, "rewards/tree_correctness_reward/std": 0.40390563011169434, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7451896071434021, "sampling/importance_sampling_ratio/min": 9.63163282792579e-30, "sampling/sampling_logp_difference/max": 66.8125, "sampling/sampling_logp_difference/mean": 4.206387996673584, "step": 63 }, { "clip_ratio/high_max": 0.028931546257808805, "clip_ratio/high_mean": 0.010689187329262495, "clip_ratio/low_mean": 0.006471950997365639, "clip_ratio/low_min": 0.00019654087373055518, "clip_ratio/region_mean": 0.017161138355731964, "completions/clipped_ratio": 1.0, "completions/max_length": 321.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 283.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 242.0, "completions/min_terminated_length": 0.0, "entropy": 1.118834227323532, "epoch": 32.0, "frac_reward_zero_std": 0.0625, "grad_norm": 0.015021177008748055, "kl": 0.224459920078516, "learning_rate": 8e-05, "loss": -0.0008, "num_tokens": 6990240.0, "reward": 0.26093751192092896, "reward_std": 0.18920685350894928, "rewards/num_nodes_reward/mean": 0.5234375, "rewards/num_nodes_reward/std": 0.5014128684997559, "rewards/tree_correctness_reward/mean": 0.1484375, "rewards/tree_correctness_reward/std": 0.356930136680603, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7282269597053528, "sampling/importance_sampling_ratio/min": 6.619718383830818e-30, "sampling/sampling_logp_difference/max": 67.1875, "sampling/sampling_logp_difference/mean": 4.429420471191406, "step": 64 }, { "clip_ratio/high_max": 0.020738491788506508, "clip_ratio/high_mean": 0.0061625529488082975, "clip_ratio/low_mean": 0.00917861913330853, "clip_ratio/low_min": 0.0009769790631253272, "clip_ratio/region_mean": 0.015341172693297267, "completions/clipped_ratio": 1.0, "completions/max_length": 295.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 272.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 244.0, "completions/min_terminated_length": 0.0, "entropy": 1.066385380923748, "epoch": 32.5, "frac_reward_zero_std": 0.0625, "grad_norm": 0.016138380393385887, "kl": 0.1674604406580329, "learning_rate": 8e-05, "loss": 0.0024, "num_tokens": 7089184.0, "reward": 0.21796876192092896, "reward_std": 0.19744589924812317, "rewards/num_nodes_reward/mean": 0.3984375, "rewards/num_nodes_reward/std": 0.4915000796318054, "rewards/tree_correctness_reward/mean": 0.140625, "rewards/tree_correctness_reward/std": 0.3490002751350403, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.733193576335907, "sampling/importance_sampling_ratio/min": 6.619718383830818e-30, "sampling/sampling_logp_difference/max": 67.1875, "sampling/sampling_logp_difference/mean": 4.378459930419922, "step": 65 }, { "clip_ratio/high_max": 0.02939399646129459, "clip_ratio/high_mean": 0.01225231442367658, "clip_ratio/low_mean": 0.006301198809524067, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018553513335064054, "completions/clipped_ratio": 1.0, "completions/max_length": 320.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 290.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 243.0, "completions/min_terminated_length": 0.0, "entropy": 1.1647237911820412, "epoch": 33.0, "frac_reward_zero_std": 0.0625, "grad_norm": 0.015179651789367199, "kl": 0.2552249114960432, "learning_rate": 8e-05, "loss": -0.0001, "num_tokens": 7190512.0, "reward": 0.30000001192092896, "reward_std": 0.2541177272796631, "rewards/num_nodes_reward/mean": 0.6171875, "rewards/num_nodes_reward/std": 0.4879830479621887, "rewards/tree_correctness_reward/mean": 0.1640625, "rewards/tree_correctness_reward/std": 0.371787428855896, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7201616168022156, "sampling/importance_sampling_ratio/min": 7.906125798828331e-31, "sampling/sampling_logp_difference/max": 69.3125, "sampling/sampling_logp_difference/mean": 4.514437675476074, "step": 66 }, { "clip_ratio/high_max": 0.0276826408226043, "clip_ratio/high_mean": 0.008986074768472463, "clip_ratio/low_mean": 0.008631912525743246, "clip_ratio/low_min": 0.00025510202976875007, "clip_ratio/region_mean": 0.017617986886762083, "completions/clipped_ratio": 1.0, "completions/max_length": 318.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 265.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 225.0, "completions/min_terminated_length": 0.0, "entropy": 0.8704263865947723, "epoch": 33.5, "frac_reward_zero_std": 0.0625, "grad_norm": 0.0162493996322155, "kl": 0.20874830335378647, "learning_rate": 8e-05, "loss": -0.0005, "num_tokens": 7288624.0, "reward": 0.22499999403953552, "reward_std": 0.20624345541000366, "rewards/num_nodes_reward/mean": 0.4765625, "rewards/num_nodes_reward/std": 0.5014128684997559, "rewards/tree_correctness_reward/mean": 0.1171875, "rewards/tree_correctness_reward/std": 0.322907418012619, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7643664479255676, "sampling/importance_sampling_ratio/min": 1.8965809537522895e-30, "sampling/sampling_logp_difference/max": 68.4375, "sampling/sampling_logp_difference/mean": 3.955047130584717, "step": 67 }, { "clip_ratio/high_max": 0.028447074233554304, "clip_ratio/high_mean": 0.009280722471885383, "clip_ratio/low_mean": 0.00481643111561425, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014097153558395803, "completions/clipped_ratio": 1.0, "completions/max_length": 353.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 283.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 225.0, "completions/min_terminated_length": 0.0, "entropy": 1.140500396490097, "epoch": 34.0, "frac_reward_zero_std": 0.125, "grad_norm": 0.017634915187954903, "kl": 0.17672116123139858, "learning_rate": 8e-05, "loss": 0.0, "num_tokens": 7389008.0, "reward": 0.24843749403953552, "reward_std": 0.1984369158744812, "rewards/num_nodes_reward/mean": 0.5546875, "rewards/num_nodes_reward/std": 0.4989531338214874, "rewards/tree_correctness_reward/mean": 0.1171875, "rewards/tree_correctness_reward/std": 0.322907418012619, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7276209592819214, "sampling/importance_sampling_ratio/min": 9.839332523444898e-31, "sampling/sampling_logp_difference/max": 69.09375, "sampling/sampling_logp_difference/mean": 4.417024612426758, "step": 68 }, { "clip_ratio/high_max": 0.02859858935698867, "clip_ratio/high_mean": 0.009705420001409948, "clip_ratio/low_mean": 0.005640895775286481, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015346316155046225, "completions/clipped_ratio": 1.0, "completions/max_length": 298.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 272.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 233.0, "completions/min_terminated_length": 0.0, "entropy": 1.0813162848353386, "epoch": 34.5, "frac_reward_zero_std": 0.0, "grad_norm": 0.016033802181482315, "kl": 0.18725919909775257, "learning_rate": 8e-05, "loss": -0.0022, "num_tokens": 7487968.0, "reward": 0.30156248807907104, "reward_std": 0.2536444067955017, "rewards/num_nodes_reward/mean": 0.5859375, "rewards/num_nodes_reward/std": 0.49449479579925537, "rewards/tree_correctness_reward/mean": 0.1796875, "rewards/tree_correctness_reward/std": 0.3854354918003082, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7419481873512268, "sampling/importance_sampling_ratio/min": 1.4174903165407597e-31, "sampling/sampling_logp_difference/max": 71.03125, "sampling/sampling_logp_difference/mean": 4.220781326293945, "step": 69 }, { "clip_ratio/high_max": 0.024476708262227476, "clip_ratio/high_mean": 0.008292077225632966, "clip_ratio/low_mean": 0.008074083249084651, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016366160474717617, "completions/clipped_ratio": 1.0, "completions/max_length": 346.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 294.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 263.0, "completions/min_terminated_length": 0.0, "entropy": 1.1830936595797539, "epoch": 35.0, "frac_reward_zero_std": 0.0, "grad_norm": 0.01757310889661312, "kl": 0.27536137215793133, "learning_rate": 8e-05, "loss": 0.0019, "num_tokens": 7589824.0, "reward": 0.16250000894069672, "reward_std": 0.19410589337348938, "rewards/num_nodes_reward/mean": 0.4140625, "rewards/num_nodes_reward/std": 0.49449479579925537, "rewards/tree_correctness_reward/mean": 0.0546875, "rewards/tree_correctness_reward/std": 0.22826264798641205, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7213863730430603, "sampling/importance_sampling_ratio/min": 7.501123232145174e-30, "sampling/sampling_logp_difference/max": 67.0625, "sampling/sampling_logp_difference/mean": 4.4570159912109375, "step": 70 }, { "epoch": 35.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 272.8, "eval_completions/max_terminated_length": 0.0, "eval_completions/mean_length": 226.175, "eval_completions/mean_terminated_length": 0.0, "eval_completions/min_length": 183.4, "eval_completions/min_terminated_length": 0.0, "eval_entropy": 0.4569208860397339, "eval_frac_reward_zero_std": 0.4, "eval_kl": 0.2542456328868866, "eval_loss": -0.003766710637137294, "eval_num_tokens": 7589824.0, "eval_reward": 0.0825000025331974, "eval_reward_std": 0.09089890271425247, "eval_rewards/num_nodes_reward/mean": 0.275, "eval_rewards/num_nodes_reward/std": 0.3725348174571991, "eval_rewards/tree_correctness_reward/mean": 0.0, "eval_rewards/tree_correctness_reward/std": 0.0, "eval_runtime": 35.0844, "eval_samples_per_second": 0.285, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 0.8982391953468323, "eval_sampling/importance_sampling_ratio/min": 1.8784613293715395e-28, "eval_sampling/sampling_logp_difference/max": 64.8375, "eval_sampling/sampling_logp_difference/mean": 1.7998093128204347, "eval_steps_per_second": 0.029, "step": 70 }, { "clip_ratio/high_max": 0.03507831378374249, "clip_ratio/high_mean": 0.011019864192348905, "clip_ratio/low_mean": 0.006814615073380992, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01783447980415076, "completions/clipped_ratio": 1.0, "completions/max_length": 312.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 273.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 239.0, "completions/min_terminated_length": 0.0, "entropy": 1.105099268257618, "epoch": 35.5, "frac_reward_zero_std": 0.0625, "grad_norm": 0.01603645645081997, "kl": 0.30066984705626965, "learning_rate": 8e-05, "loss": 0.0015, "num_tokens": 7688928.0, "reward": 0.21171876788139343, "reward_std": 0.19017577171325684, "rewards/num_nodes_reward/mean": 0.5234375, "rewards/num_nodes_reward/std": 0.5014128684997559, "rewards/tree_correctness_reward/mean": 0.078125, "rewards/tree_correctness_reward/std": 0.2694226801395416, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7329100370407104, "sampling/importance_sampling_ratio/min": 1.3875683899592226e-30, "sampling/sampling_logp_difference/max": 68.75, "sampling/sampling_logp_difference/mean": 4.338023662567139, "step": 71 }, { "clip_ratio/high_max": 0.030155991902574897, "clip_ratio/high_mean": 0.011614291172008961, "clip_ratio/low_mean": 0.005964636889984831, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017578928032889962, "completions/clipped_ratio": 1.0, "completions/max_length": 309.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 289.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 263.0, "completions/min_terminated_length": 0.0, "entropy": 1.1721753478050232, "epoch": 36.0, "frac_reward_zero_std": 0.0, "grad_norm": 0.01554561872035265, "kl": 0.20127884671092033, "learning_rate": 8e-05, "loss": 0.0028, "num_tokens": 7790112.0, "reward": 0.32500001788139343, "reward_std": 0.2806606590747833, "rewards/num_nodes_reward/mean": 0.5546875, "rewards/num_nodes_reward/std": 0.4989531338214874, "rewards/tree_correctness_reward/mean": 0.2265625, "rewards/tree_correctness_reward/std": 0.4202519655227661, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7124358415603638, "sampling/importance_sampling_ratio/min": 9.63163282792579e-30, "sampling/sampling_logp_difference/max": 66.8125, "sampling/sampling_logp_difference/mean": 4.6320061683654785, "step": 72 }, { "clip_ratio/high_max": 0.021465352387167513, "clip_ratio/high_mean": 0.00742116040783003, "clip_ratio/low_mean": 0.00633750410634093, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013758664368651807, "completions/clipped_ratio": 1.0, "completions/max_length": 305.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 273.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 238.0, "completions/min_terminated_length": 0.0, "entropy": 1.2467812150716782, "epoch": 36.5, "frac_reward_zero_std": 0.125, "grad_norm": 0.021203340962529182, "kl": 0.221596522256732, "learning_rate": 8e-05, "loss": 0.0018, "num_tokens": 7889184.0, "reward": 0.28984373807907104, "reward_std": 0.22398096323013306, "rewards/num_nodes_reward/mean": 0.4921875, "rewards/num_nodes_reward/std": 0.5019033551216125, "rewards/tree_correctness_reward/mean": 0.203125, "rewards/tree_correctness_reward/std": 0.40390563011169434, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.712213397026062, "sampling/importance_sampling_ratio/min": 1.6222316689370394e-30, "sampling/sampling_logp_difference/max": 68.59375, "sampling/sampling_logp_difference/mean": 4.634646415710449, "step": 73 }, { "clip_ratio/high_max": 0.026857965858653188, "clip_ratio/high_mean": 0.009398147522006184, "clip_ratio/low_mean": 0.006271943013416603, "clip_ratio/low_min": 0.00022321428696159273, "clip_ratio/region_mean": 0.015670091030187905, "completions/clipped_ratio": 1.0, "completions/max_length": 379.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 302.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 276.0, "completions/min_terminated_length": 0.0, "entropy": 1.0195468813180923, "epoch": 37.0, "frac_reward_zero_std": 0.0, "grad_norm": 0.019495781511068344, "kl": 0.23774315416812897, "learning_rate": 8e-05, "loss": -0.0025, "num_tokens": 7991968.0, "reward": 0.19999998807907104, "reward_std": 0.2050626128911972, "rewards/num_nodes_reward/mean": 0.484375, "rewards/num_nodes_reward/std": 0.5017194747924805, "rewards/tree_correctness_reward/mean": 0.078125, "rewards/tree_correctness_reward/std": 0.2694226801395416, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7300363779067993, "sampling/importance_sampling_ratio/min": 4.8430893875347414e-30, "sampling/sampling_logp_difference/max": 67.5, "sampling/sampling_logp_difference/mean": 4.368963718414307, "step": 74 }, { "clip_ratio/high_max": 0.03260693675838411, "clip_ratio/high_mean": 0.010830834798980504, "clip_ratio/low_mean": 0.004573418089421466, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015404253266751766, "completions/clipped_ratio": 1.0, "completions/max_length": 327.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 290.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 235.0, "completions/min_terminated_length": 0.0, "entropy": 0.9606436938047409, "epoch": 37.5, "frac_reward_zero_std": 0.0625, "grad_norm": 0.03323759883642197, "kl": 0.23395021259784698, "learning_rate": 8e-05, "loss": -0.0003, "num_tokens": 8093216.0, "reward": 0.21406249701976776, "reward_std": 0.20258957147598267, "rewards/num_nodes_reward/mean": 0.53125, "rewards/num_nodes_reward/std": 0.5009832978248596, "rewards/tree_correctness_reward/mean": 0.078125, "rewards/tree_correctness_reward/std": 0.2694226801395416, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7518829107284546, "sampling/importance_sampling_ratio/min": 3.4684842740447485e-29, "sampling/sampling_logp_difference/max": 65.53125, "sampling/sampling_logp_difference/mean": 4.064020156860352, "step": 75 }, { "clip_ratio/high_max": 0.03051688219420612, "clip_ratio/high_mean": 0.008431879163254052, "clip_ratio/low_mean": 0.005886364990146831, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014318244182504714, "completions/clipped_ratio": 1.0, "completions/max_length": 368.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 296.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 265.0, "completions/min_terminated_length": 0.0, "entropy": 0.9442616328597069, "epoch": 38.0, "frac_reward_zero_std": 0.125, "grad_norm": 0.019845349714159966, "kl": 0.277046550065279, "learning_rate": 8e-05, "loss": -0.0023, "num_tokens": 8195344.0, "reward": 0.29218751192092896, "reward_std": 0.24867869913578033, "rewards/num_nodes_reward/mean": 0.5, "rewards/num_nodes_reward/std": 0.5019646286964417, "rewards/tree_correctness_reward/mean": 0.203125, "rewards/tree_correctness_reward/std": 0.40390563011169434, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7583028674125671, "sampling/importance_sampling_ratio/min": 2.8754744950605667e-29, "sampling/sampling_logp_difference/max": 65.71875, "sampling/sampling_logp_difference/mean": 3.9542019367218018, "step": 76 }, { "clip_ratio/high_max": 0.027463858714327216, "clip_ratio/high_mean": 0.00826288282405585, "clip_ratio/low_mean": 0.0050172057817690074, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01328008878044784, "completions/clipped_ratio": 1.0, "completions/max_length": 347.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 284.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 249.0, "completions/min_terminated_length": 0.0, "entropy": 1.0572926253080368, "epoch": 38.5, "frac_reward_zero_std": 0.1875, "grad_norm": 0.014397574588656425, "kl": 0.23300139792263508, "learning_rate": 8e-05, "loss": 0.0003, "num_tokens": 8295888.0, "reward": 0.29218751192092896, "reward_std": 0.1975468397140503, "rewards/num_nodes_reward/mean": 0.5, "rewards/num_nodes_reward/std": 0.5019646286964417, "rewards/tree_correctness_reward/mean": 0.203125, "rewards/tree_correctness_reward/std": 0.40390563011169434, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7283005714416504, "sampling/importance_sampling_ratio/min": 3.891531412521503e-30, "sampling/sampling_logp_difference/max": 67.71875, "sampling/sampling_logp_difference/mean": 4.410280704498291, "step": 77 }, { "clip_ratio/high_max": 0.03049723827280104, "clip_ratio/high_mean": 0.011016786796972156, "clip_ratio/low_mean": 0.005011813394958153, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0160286002792418, "completions/clipped_ratio": 1.0, "completions/max_length": 305.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 286.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 257.0, "completions/min_terminated_length": 0.0, "entropy": 0.7359563186764717, "epoch": 39.0, "frac_reward_zero_std": 0.0625, "grad_norm": 0.015022535808384418, "kl": 0.25465079210698605, "learning_rate": 8e-05, "loss": -0.0015, "num_tokens": 8396656.0, "reward": 0.20546875894069672, "reward_std": 0.20139387249946594, "rewards/num_nodes_reward/mean": 0.484375, "rewards/num_nodes_reward/std": 0.5017194747924805, "rewards/tree_correctness_reward/mean": 0.0859375, "rewards/tree_correctness_reward/std": 0.2813730239868164, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7874658703804016, "sampling/importance_sampling_ratio/min": 3.060926502377367e-29, "sampling/sampling_logp_difference/max": 65.65625, "sampling/sampling_logp_difference/mean": 3.585594654083252, "step": 78 }, { "clip_ratio/high_max": 0.029865185148082674, "clip_ratio/high_mean": 0.01019700412871316, "clip_ratio/low_mean": 0.005959442118182778, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016156446421518922, "completions/clipped_ratio": 1.0, "completions/max_length": 397.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 302.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 269.0, "completions/min_terminated_length": 0.0, "entropy": 0.8961157202720642, "epoch": 39.5, "frac_reward_zero_std": 0.0625, "grad_norm": 0.021332310512661934, "kl": 0.4772935640066862, "learning_rate": 8e-05, "loss": -0.0043, "num_tokens": 8499440.0, "reward": 0.27656248211860657, "reward_std": 0.23717530071735382, "rewards/num_nodes_reward/mean": 0.5390625, "rewards/num_nodes_reward/std": 0.5004304051399231, "rewards/tree_correctness_reward/mean": 0.1640625, "rewards/tree_correctness_reward/std": 0.371787428855896, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7616719007492065, "sampling/importance_sampling_ratio/min": 5.662144510848715e-30, "sampling/sampling_logp_difference/max": 67.34375, "sampling/sampling_logp_difference/mean": 3.9454257488250732, "step": 79 }, { "clip_ratio/high_max": 0.023918193066492677, "clip_ratio/high_mean": 0.008147409360390157, "clip_ratio/low_mean": 0.0060304956859909, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014177904929965734, "completions/clipped_ratio": 1.0, "completions/max_length": 377.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 315.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 280.0, "completions/min_terminated_length": 0.0, "entropy": 1.0396562069654465, "epoch": 40.0, "frac_reward_zero_std": 0.125, "grad_norm": 0.019107449799776077, "kl": 0.2583164945244789, "learning_rate": 8e-05, "loss": -0.0002, "num_tokens": 8603968.0, "reward": 0.2796874940395355, "reward_std": 0.18496906757354736, "rewards/num_nodes_reward/mean": 0.5859375, "rewards/num_nodes_reward/std": 0.49449479579925537, "rewards/tree_correctness_reward/mean": 0.1484375, "rewards/tree_correctness_reward/std": 0.356930136680603, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7331088185310364, "sampling/importance_sampling_ratio/min": 5.319093129561238e-30, "sampling/sampling_logp_difference/max": 67.40625, "sampling/sampling_logp_difference/mean": 4.299379348754883, "step": 80 }, { "epoch": 40.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 313.0, "eval_completions/max_terminated_length": 0.0, "eval_completions/mean_length": 264.675, "eval_completions/mean_terminated_length": 0.0, "eval_completions/min_length": 221.8, "eval_completions/min_terminated_length": 0.0, "eval_entropy": 0.41943386793136594, "eval_frac_reward_zero_std": 0.1, "eval_kl": 0.2702707827091217, "eval_loss": -0.007201368920505047, "eval_num_tokens": 8603968.0, "eval_reward": 0.18000000417232515, "eval_reward_std": 0.16988532096147538, "eval_rewards/num_nodes_reward/mean": 0.5125, "eval_rewards/num_nodes_reward/std": 0.4766708672046661, "eval_rewards/tree_correctness_reward/mean": 0.0375, "eval_rewards/tree_correctness_reward/std": 0.11831300854682922, "eval_runtime": 39.1664, "eval_samples_per_second": 0.255, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 0.9068281412124634, "eval_sampling/importance_sampling_ratio/min": 6.605650897997465e-28, "eval_sampling/sampling_logp_difference/max": 63.5125, "eval_sampling/sampling_logp_difference/mean": 1.6323777914047242, "eval_steps_per_second": 0.026, "step": 80 }, { "clip_ratio/high_max": 0.03464826266281307, "clip_ratio/high_mean": 0.011272164876572788, "clip_ratio/low_mean": 0.004575380531605333, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015847545466385782, "completions/clipped_ratio": 1.0, "completions/max_length": 329.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 307.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 286.0, "completions/min_terminated_length": 0.0, "entropy": 0.7862470969557762, "epoch": 40.5, "frac_reward_zero_std": 0.0625, "grad_norm": 0.015833018347620964, "kl": 0.22909852303564548, "learning_rate": 8e-05, "loss": -0.0016, "num_tokens": 8707472.0, "reward": 0.20468750596046448, "reward_std": 0.14774280786514282, "rewards/num_nodes_reward/mean": 0.609375, "rewards/num_nodes_reward/std": 0.4898075461387634, "rewards/tree_correctness_reward/mean": 0.03125, "rewards/tree_correctness_reward/std": 0.1746762990951538, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7736379504203796, "sampling/importance_sampling_ratio/min": 2.239421836915376e-29, "sampling/sampling_logp_difference/max": 65.96875, "sampling/sampling_logp_difference/mean": 3.7569191455841064, "step": 81 }, { "clip_ratio/high_max": 0.020930869854055345, "clip_ratio/high_mean": 0.007385892502497882, "clip_ratio/low_mean": 0.0050201230915263295, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012406015885062516, "completions/clipped_ratio": 1.0, "completions/max_length": 329.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 311.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 287.0, "completions/min_terminated_length": 0.0, "entropy": 1.0000544488430023, "epoch": 41.0, "frac_reward_zero_std": 0.125, "grad_norm": 0.01584947109222412, "kl": 0.2602599784731865, "learning_rate": 8e-05, "loss": -0.0019, "num_tokens": 8811488.0, "reward": 0.28203123807907104, "reward_std": 0.16969555616378784, "rewards/num_nodes_reward/mean": 0.5390625, "rewards/num_nodes_reward/std": 0.5004304051399231, "rewards/tree_correctness_reward/mean": 0.171875, "rewards/tree_correctness_reward/std": 0.3787541687488556, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.728001594543457, "sampling/importance_sampling_ratio/min": 3.1269332749387515e-30, "sampling/sampling_logp_difference/max": 67.9375, "sampling/sampling_logp_difference/mean": 4.405168533325195, "step": 82 }, { "clip_ratio/high_max": 0.03089630091562867, "clip_ratio/high_mean": 0.010089446674101055, "clip_ratio/low_mean": 0.003915482433512807, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014004929340444505, "completions/clipped_ratio": 1.0, "completions/max_length": 328.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 299.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 272.0, "completions/min_terminated_length": 0.0, "entropy": 0.6921835169196129, "epoch": 41.5, "frac_reward_zero_std": 0.1875, "grad_norm": 0.03126024827361107, "kl": 0.2692830804735422, "learning_rate": 8e-05, "loss": -0.0002, "num_tokens": 8913904.0, "reward": 0.3101562261581421, "reward_std": 0.17464834451675415, "rewards/num_nodes_reward/mean": 0.6328125, "rewards/num_nodes_reward/std": 0.4839322865009308, "rewards/tree_correctness_reward/mean": 0.171875, "rewards/tree_correctness_reward/std": 0.3787541687488556, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.8008518218994141, "sampling/importance_sampling_ratio/min": 9.048082073307534e-30, "sampling/sampling_logp_difference/max": 66.875, "sampling/sampling_logp_difference/mean": 3.359869956970215, "step": 83 }, { "clip_ratio/high_max": 0.02215676160994917, "clip_ratio/high_mean": 0.006402329570846632, "clip_ratio/low_mean": 0.0035002359218196943, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00990256539080292, "completions/clipped_ratio": 1.0, "completions/max_length": 338.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 309.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 268.0, "completions/min_terminated_length": 0.0, "entropy": 0.7437123730778694, "epoch": 42.0, "frac_reward_zero_std": 0.4375, "grad_norm": 1.1232529878616333, "kl": 8.703314380720258, "learning_rate": 8e-05, "loss": 0.002, "num_tokens": 9017664.0, "reward": 0.3335937559604645, "reward_std": 0.13699379563331604, "rewards/num_nodes_reward/mean": 0.546875, "rewards/num_nodes_reward/std": 0.4997538626194, "rewards/tree_correctness_reward/mean": 0.2421875, "rewards/tree_correctness_reward/std": 0.4300905168056488, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7685422897338867, "sampling/importance_sampling_ratio/min": 2.1705219940814765e-29, "sampling/sampling_logp_difference/max": 66.0, "sampling/sampling_logp_difference/mean": 3.8508870601654053, "step": 84 }, { "clip_ratio/high_max": 0.03503464022651315, "clip_ratio/high_mean": 0.01157919003162533, "clip_ratio/low_mean": 0.0030823395936749876, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014661529450677335, "completions/clipped_ratio": 1.0, "completions/max_length": 368.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 331.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 292.0, "completions/min_terminated_length": 0.0, "entropy": 1.068261332809925, "epoch": 42.5, "frac_reward_zero_std": 0.25, "grad_norm": 0.02259916625916958, "kl": 0.24496312253177166, "learning_rate": 8e-05, "loss": 0.0061, "num_tokens": 9124160.0, "reward": 0.265625, "reward_std": 0.13531494140625, "rewards/num_nodes_reward/mean": 0.6484375, "rewards/num_nodes_reward/std": 0.4793342351913452, "rewards/tree_correctness_reward/mean": 0.1015625, "rewards/tree_correctness_reward/std": 0.3032590448856354, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7246466279029846, "sampling/importance_sampling_ratio/min": 3.891531412521503e-30, "sampling/sampling_logp_difference/max": 67.71875, "sampling/sampling_logp_difference/mean": 4.431977272033691, "step": 85 }, { "clip_ratio/high_max": 0.01617667154641822, "clip_ratio/high_mean": 0.004965607717167586, "clip_ratio/low_mean": 0.0052561165211955085, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010221724398434162, "completions/clipped_ratio": 1.0, "completions/max_length": 399.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 335.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 310.0, "completions/min_terminated_length": 0.0, "entropy": 0.9775459095835686, "epoch": 43.0, "frac_reward_zero_std": 0.4375, "grad_norm": 0.012475029565393925, "kl": 0.2512502409517765, "learning_rate": 8e-05, "loss": 0.0001, "num_tokens": 9231264.0, "reward": 0.3062500059604645, "reward_std": 0.1526554971933365, "rewards/num_nodes_reward/mean": 0.546875, "rewards/num_nodes_reward/std": 0.4997538626194, "rewards/tree_correctness_reward/mean": 0.203125, "rewards/tree_correctness_reward/std": 0.40390563011169434, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7483980655670166, "sampling/importance_sampling_ratio/min": 8.769700936908512e-30, "sampling/sampling_logp_difference/max": 66.90625, "sampling/sampling_logp_difference/mean": 4.048500061035156, "step": 86 }, { "clip_ratio/high_max": 0.024587219930253923, "clip_ratio/high_mean": 0.005566652747802436, "clip_ratio/low_mean": 0.006889881507959217, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012456534313969314, "completions/clipped_ratio": 1.0, "completions/max_length": 413.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 335.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 279.0, "completions/min_terminated_length": 0.0, "entropy": 0.9804441034793854, "epoch": 43.5, "frac_reward_zero_std": 0.25, "grad_norm": 0.016995273530483246, "kl": 0.2786136753857136, "learning_rate": 8e-05, "loss": -0.0015, "num_tokens": 9338384.0, "reward": 0.27031248807907104, "reward_std": 0.23455211520195007, "rewards/num_nodes_reward/mean": 0.4453125, "rewards/num_nodes_reward/std": 0.4989531338214874, "rewards/tree_correctness_reward/mean": 0.1953125, "rewards/tree_correctness_reward/std": 0.3979988098144531, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7602368593215942, "sampling/importance_sampling_ratio/min": 2.7322840393196706e-31, "sampling/sampling_logp_difference/max": 70.375, "sampling/sampling_logp_difference/mean": 3.8581480979919434, "step": 87 }, { "clip_ratio/high_max": 0.024316129158250988, "clip_ratio/high_mean": 0.006513459957204759, "clip_ratio/low_mean": 0.004421923018526286, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010935383092146367, "completions/clipped_ratio": 1.0, "completions/max_length": 379.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 333.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 270.0, "completions/min_terminated_length": 0.0, "entropy": 1.0260984152555466, "epoch": 44.0, "frac_reward_zero_std": 0.4375, "grad_norm": 0.015030437149107456, "kl": 0.21454177983105183, "learning_rate": 8e-05, "loss": 0.0009, "num_tokens": 9445184.0, "reward": 0.15781250596046448, "reward_std": 0.12872421741485596, "rewards/num_nodes_reward/mean": 0.34375, "rewards/num_nodes_reward/std": 0.47682511806488037, "rewards/tree_correctness_reward/mean": 0.078125, "rewards/tree_correctness_reward/std": 0.2694226801395416, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7513948678970337, "sampling/importance_sampling_ratio/min": 1.63839597639079e-29, "sampling/sampling_logp_difference/max": 66.28125, "sampling/sampling_logp_difference/mean": 4.016755104064941, "step": 88 }, { "clip_ratio/high_max": 0.02607972570694983, "clip_ratio/high_mean": 0.007015722920186818, "clip_ratio/low_mean": 0.006516193185234442, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01353191607631743, "completions/clipped_ratio": 1.0, "completions/max_length": 410.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 365.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 296.0, "completions/min_terminated_length": 0.0, "entropy": 1.568969875574112, "epoch": 44.5, "frac_reward_zero_std": 0.25, "grad_norm": 0.018031980842351913, "kl": 0.18580541387200356, "learning_rate": 8e-05, "loss": 0.0014, "num_tokens": 9556064.0, "reward": 0.23593750596046448, "reward_std": 0.2094469666481018, "rewards/num_nodes_reward/mean": 0.421875, "rewards/num_nodes_reward/std": 0.4957992732524872, "rewards/tree_correctness_reward/mean": 0.15625, "rewards/tree_correctness_reward/std": 0.3645188808441162, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6733278036117554, "sampling/importance_sampling_ratio/min": 3.3617703013855747e-29, "sampling/sampling_logp_difference/max": 65.5625, "sampling/sampling_logp_difference/mean": 5.01413631439209, "step": 89 }, { "clip_ratio/high_max": 0.027758668176829815, "clip_ratio/high_mean": 0.006532962783239782, "clip_ratio/low_mean": 0.005962536408333108, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012495498987846076, "completions/clipped_ratio": 1.0, "completions/max_length": 478.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 356.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 302.0, "completions/min_terminated_length": 0.0, "entropy": 1.233905851840973, "epoch": 45.0, "frac_reward_zero_std": 0.3125, "grad_norm": 0.015440949238836765, "kl": 0.19410163536667824, "learning_rate": 8e-05, "loss": 0.0007, "num_tokens": 9665840.0, "reward": 0.22734375298023224, "reward_std": 0.16023516654968262, "rewards/num_nodes_reward/mean": 0.4296875, "rewards/num_nodes_reward/std": 0.4969765841960907, "rewards/tree_correctness_reward/mean": 0.140625, "rewards/tree_correctness_reward/std": 0.3490002751350403, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7294955849647522, "sampling/importance_sampling_ratio/min": 2.537597470634223e-29, "sampling/sampling_logp_difference/max": 65.84375, "sampling/sampling_logp_difference/mean": 4.280447006225586, "step": 90 }, { "epoch": 45.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 370.2, "eval_completions/max_terminated_length": 0.0, "eval_completions/mean_length": 291.05, "eval_completions/mean_terminated_length": 0.0, "eval_completions/min_length": 226.0, "eval_completions/min_terminated_length": 0.0, "eval_entropy": 0.6334636330604553, "eval_frac_reward_zero_std": 0.3, "eval_kl": 0.23041727542877197, "eval_loss": 0.001776426681317389, "eval_num_tokens": 9665840.0, "eval_reward": 0.17875000089406967, "eval_reward_std": 0.1618502616882324, "eval_rewards/num_nodes_reward/mean": 0.45, "eval_rewards/num_nodes_reward/std": 0.4715925633907318, "eval_rewards/tree_correctness_reward/mean": 0.0625, "eval_rewards/tree_correctness_reward/std": 0.18062257766723633, "eval_runtime": 44.5883, "eval_samples_per_second": 0.224, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 0.8966295003890992, "eval_sampling/importance_sampling_ratio/min": 8.673089082202451e-28, "eval_sampling/sampling_logp_difference/max": 64.15625, "eval_sampling/sampling_logp_difference/mean": 1.7638072967529297, "eval_steps_per_second": 0.022, "step": 90 }, { "clip_ratio/high_max": 0.02904010470956564, "clip_ratio/high_mean": 0.007284593477379531, "clip_ratio/low_mean": 0.005042760240030475, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012327353586442769, "completions/clipped_ratio": 1.0, "completions/max_length": 454.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 355.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 293.0, "completions/min_terminated_length": 0.0, "entropy": 1.4535559266805649, "epoch": 45.5, "frac_reward_zero_std": 0.3125, "grad_norm": 0.016851935535669327, "kl": 0.2285276371985674, "learning_rate": 8e-05, "loss": 0.0026, "num_tokens": 9775472.0, "reward": 0.16015625, "reward_std": 0.15951424837112427, "rewards/num_nodes_reward/mean": 0.3515625, "rewards/num_nodes_reward/std": 0.4793342351913452, "rewards/tree_correctness_reward/mean": 0.078125, "rewards/tree_correctness_reward/std": 0.2694226801395416, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.695281982421875, "sampling/importance_sampling_ratio/min": 5.0466150436734006e-29, "sampling/sampling_logp_difference/max": 65.15625, "sampling/sampling_logp_difference/mean": 4.720391750335693, "step": 91 }, { "clip_ratio/high_max": 0.021848080097697675, "clip_ratio/high_mean": 0.0070141540199983865, "clip_ratio/low_mean": 0.006910390977282077, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01392454479355365, "completions/clipped_ratio": 1.0, "completions/max_length": 444.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 369.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 311.0, "completions/min_terminated_length": 0.0, "entropy": 1.50407774746418, "epoch": 46.0, "frac_reward_zero_std": 0.25, "grad_norm": 0.02013283036649227, "kl": 0.17355215921998024, "learning_rate": 8e-05, "loss": 0.0008, "num_tokens": 9886880.0, "reward": 0.24531251192092896, "reward_std": 0.2360229641199112, "rewards/num_nodes_reward/mean": 0.3984375, "rewards/num_nodes_reward/std": 0.4915000796318054, "rewards/tree_correctness_reward/mean": 0.1796875, "rewards/tree_correctness_reward/std": 0.3854354918003082, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6693640351295471, "sampling/importance_sampling_ratio/min": 8.064468122844547e-29, "sampling/sampling_logp_difference/max": 64.6875, "sampling/sampling_logp_difference/mean": 5.107875823974609, "step": 92 }, { "clip_ratio/high_max": 0.021611329808365554, "clip_ratio/high_mean": 0.0049472796090412885, "clip_ratio/low_mean": 0.005551315902266651, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010498595132958144, "completions/clipped_ratio": 1.0, "completions/max_length": 377.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 345.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 312.0, "completions/min_terminated_length": 0.0, "entropy": 1.1271354705095291, "epoch": 46.5, "frac_reward_zero_std": 0.3125, "grad_norm": 0.016716040670871735, "kl": 0.22168135456740856, "learning_rate": 8e-05, "loss": 0.0002, "num_tokens": 9995168.0, "reward": 0.15703125298023224, "reward_std": 0.1667753905057907, "rewards/num_nodes_reward/mean": 0.359375, "rewards/num_nodes_reward/std": 0.481702595949173, "rewards/tree_correctness_reward/mean": 0.0703125, "rewards/tree_correctness_reward/std": 0.2566775679588318, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7338093519210815, "sampling/importance_sampling_ratio/min": 3.060926502377367e-29, "sampling/sampling_logp_difference/max": 65.65625, "sampling/sampling_logp_difference/mean": 4.253211975097656, "step": 93 }, { "clip_ratio/high_max": 0.02012150385417044, "clip_ratio/high_mean": 0.004532157734502107, "clip_ratio/low_mean": 0.007024158694548532, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011556316749192774, "completions/clipped_ratio": 1.0, "completions/max_length": 448.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 381.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 324.0, "completions/min_terminated_length": 0.0, "entropy": 1.573821172118187, "epoch": 47.0, "frac_reward_zero_std": 0.3125, "grad_norm": 0.017013603821396828, "kl": 0.169479513540864, "learning_rate": 8e-05, "loss": 0.0028, "num_tokens": 10108080.0, "reward": 0.14140625298023224, "reward_std": 0.18449550867080688, "rewards/num_nodes_reward/mean": 0.2890625, "rewards/num_nodes_reward/std": 0.45510825514793396, "rewards/tree_correctness_reward/mean": 0.078125, "rewards/tree_correctness_reward/std": 0.2694226801395416, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6602554321289062, "sampling/importance_sampling_ratio/min": 7.04665266583478e-30, "sampling/sampling_logp_difference/max": 67.125, "sampling/sampling_logp_difference/mean": 5.218673229217529, "step": 94 }, { "clip_ratio/high_max": 0.027959934785030782, "clip_ratio/high_mean": 0.005800338345579803, "clip_ratio/low_mean": 0.005125154653796926, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010925492970272899, "completions/clipped_ratio": 1.0, "completions/max_length": 415.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 353.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 335.0, "completions/min_terminated_length": 0.0, "entropy": 1.2243845909833908, "epoch": 47.5, "frac_reward_zero_std": 0.5, "grad_norm": 0.014418534003198147, "kl": 0.23661457374691963, "learning_rate": 8e-05, "loss": 0.0019, "num_tokens": 10217488.0, "reward": 0.19218750298023224, "reward_std": 0.15112987160682678, "rewards/num_nodes_reward/mean": 0.3125, "rewards/num_nodes_reward/std": 0.4653336703777313, "rewards/tree_correctness_reward/mean": 0.140625, "rewards/tree_correctness_reward/std": 0.3490002751350403, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7199616432189941, "sampling/importance_sampling_ratio/min": 7.116867858635386e-29, "sampling/sampling_logp_difference/max": 64.8125, "sampling/sampling_logp_difference/mean": 4.4626145362854, "step": 95 }, { "clip_ratio/high_max": 0.023650136310607195, "clip_ratio/high_mean": 0.005285817343974486, "clip_ratio/low_mean": 0.0050571571046020836, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010342974273953587, "completions/clipped_ratio": 1.0, "completions/max_length": 459.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 374.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 304.0, "completions/min_terminated_length": 0.0, "entropy": 1.389024779200554, "epoch": 48.0, "frac_reward_zero_std": 0.4375, "grad_norm": 0.01602841541171074, "kl": 0.19089994952082634, "learning_rate": 8e-05, "loss": 0.0032, "num_tokens": 10329536.0, "reward": 0.19609373807907104, "reward_std": 0.14415645599365234, "rewards/num_nodes_reward/mean": 0.453125, "rewards/num_nodes_reward/std": 0.4997538626194, "rewards/tree_correctness_reward/mean": 0.0859375, "rewards/tree_correctness_reward/std": 0.2813730239868164, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6874412298202515, "sampling/importance_sampling_ratio/min": 2.082986710165573e-30, "sampling/sampling_logp_difference/max": 68.34375, "sampling/sampling_logp_difference/mean": 4.854156494140625, "step": 96 }, { "clip_ratio/high_max": 0.034998406074009836, "clip_ratio/high_mean": 0.008078919287072495, "clip_ratio/low_mean": 0.004862763860728592, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01294168340973556, "completions/clipped_ratio": 1.0, "completions/max_length": 445.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 382.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 319.0, "completions/min_terminated_length": 0.0, "entropy": 1.3083463311195374, "epoch": 48.5, "frac_reward_zero_std": 0.3125, "grad_norm": 0.02270352467894554, "kl": 0.196229986846447, "learning_rate": 8e-05, "loss": 0.0026, "num_tokens": 10442624.0, "reward": 0.21718749403953552, "reward_std": 0.1893259435892105, "rewards/num_nodes_reward/mean": 0.4140625, "rewards/num_nodes_reward/std": 0.49449479579925537, "rewards/tree_correctness_reward/mean": 0.1328125, "rewards/tree_correctness_reward/std": 0.3407054841518402, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6932873725891113, "sampling/importance_sampling_ratio/min": 5.319093129561238e-30, "sampling/sampling_logp_difference/max": 67.40625, "sampling/sampling_logp_difference/mean": 4.807107925415039, "step": 97 }, { "clip_ratio/high_max": 0.02135718910722062, "clip_ratio/high_mean": 0.006010888493619859, "clip_ratio/low_mean": 0.005597818657406606, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011608707718551159, "completions/clipped_ratio": 1.0, "completions/max_length": 399.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 346.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 308.0, "completions/min_terminated_length": 0.0, "entropy": 1.3539289981126785, "epoch": 49.0, "frac_reward_zero_std": 0.375, "grad_norm": 0.01520631741732359, "kl": 0.20834442414343357, "learning_rate": 8e-05, "loss": 0.0032, "num_tokens": 10551072.0, "reward": 0.23906250298023224, "reward_std": 0.14155416190624237, "rewards/num_nodes_reward/mean": 0.5234375, "rewards/num_nodes_reward/std": 0.5014128684997559, "rewards/tree_correctness_reward/mean": 0.1171875, "rewards/tree_correctness_reward/std": 0.322907418012619, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7116930484771729, "sampling/importance_sampling_ratio/min": 2.7595090997417772e-30, "sampling/sampling_logp_difference/max": 68.0625, "sampling/sampling_logp_difference/mean": 4.528259754180908, "step": 98 }, { "clip_ratio/high_max": 0.02598529658280313, "clip_ratio/high_mean": 0.006185364865814336, "clip_ratio/low_mean": 0.00691390092833899, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013099265983328223, "completions/clipped_ratio": 1.0, "completions/max_length": 360.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 334.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 306.0, "completions/min_terminated_length": 0.0, "entropy": 0.9947627410292625, "epoch": 49.5, "frac_reward_zero_std": 0.375, "grad_norm": 0.015323255211114883, "kl": 0.3318562489002943, "learning_rate": 8e-05, "loss": 0.0015, "num_tokens": 10658016.0, "reward": 0.19296875596046448, "reward_std": 0.13155752420425415, "rewards/num_nodes_reward/mean": 0.515625, "rewards/num_nodes_reward/std": 0.5017194747924805, "rewards/tree_correctness_reward/mean": 0.0546875, "rewards/tree_correctness_reward/std": 0.22826264798641205, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7589511871337891, "sampling/importance_sampling_ratio/min": 3.1269332749387515e-30, "sampling/sampling_logp_difference/max": 67.9375, "sampling/sampling_logp_difference/mean": 3.9300928115844727, "step": 99 }, { "clip_ratio/high_max": 0.026465349132195115, "clip_ratio/high_mean": 0.008018679567612708, "clip_ratio/low_mean": 0.003564299375284463, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011582979408558458, "completions/clipped_ratio": 1.0, "completions/max_length": 430.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 350.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 278.0, "completions/min_terminated_length": 0.0, "entropy": 1.436234787106514, "epoch": 50.0, "frac_reward_zero_std": 0.375, "grad_norm": 0.01692371442914009, "kl": 0.1756493579596281, "learning_rate": 8e-05, "loss": 0.0021, "num_tokens": 10767056.0, "reward": 0.26640623807907104, "reward_std": 0.16579538583755493, "rewards/num_nodes_reward/mean": 0.5234375, "rewards/num_nodes_reward/std": 0.5014128684997559, "rewards/tree_correctness_reward/mean": 0.15625, "rewards/tree_correctness_reward/std": 0.3645188808441162, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6866275072097778, "sampling/importance_sampling_ratio/min": 2.217327846950428e-30, "sampling/sampling_logp_difference/max": 68.28125, "sampling/sampling_logp_difference/mean": 4.879342079162598, "step": 100 }, { "epoch": 50.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 367.0, "eval_completions/max_terminated_length": 0.0, "eval_completions/mean_length": 297.0, "eval_completions/mean_terminated_length": 0.0, "eval_completions/min_length": 231.4, "eval_completions/min_terminated_length": 0.0, "eval_entropy": 0.6409168481826782, "eval_frac_reward_zero_std": 0.2, "eval_kl": 0.22275569438934326, "eval_loss": -0.0013155965134501457, "eval_num_tokens": 10767056.0, "eval_reward": 0.22624999582767485, "eval_reward_std": 0.20470716208219528, "eval_rewards/num_nodes_reward/mean": 0.55, "eval_rewards/num_nodes_reward/std": 0.40554115176200867, "eval_rewards/tree_correctness_reward/mean": 0.0875, "eval_rewards/tree_correctness_reward/std": 0.24893558621406556, "eval_runtime": 46.0419, "eval_samples_per_second": 0.217, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 0.8945022225379944, "eval_sampling/importance_sampling_ratio/min": 1.3949312080995529e-28, "eval_sampling/sampling_logp_difference/max": 64.9875, "eval_sampling/sampling_logp_difference/mean": 1.8098880290985107, "eval_steps_per_second": 0.022, "step": 100 }, { "clip_ratio/high_max": 0.025892633944749832, "clip_ratio/high_mean": 0.007311474299058318, "clip_ratio/low_mean": 0.007880628894781694, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01519210310652852, "completions/clipped_ratio": 1.0, "completions/max_length": 396.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 351.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 296.0, "completions/min_terminated_length": 0.0, "entropy": 1.1863634586334229, "epoch": 50.5, "frac_reward_zero_std": 0.25, "grad_norm": 0.01727943681180477, "kl": 0.19017930328845978, "learning_rate": 8e-05, "loss": 0.0013, "num_tokens": 10876144.0, "reward": 0.2460937649011612, "reward_std": 0.18753814697265625, "rewards/num_nodes_reward/mean": 0.546875, "rewards/num_nodes_reward/std": 0.4997538626194, "rewards/tree_correctness_reward/mean": 0.1171875, "rewards/tree_correctness_reward/std": 0.322907418012619, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7174276113510132, "sampling/importance_sampling_ratio/min": 8.238372224122146e-30, "sampling/sampling_logp_difference/max": 66.96875, "sampling/sampling_logp_difference/mean": 4.481776714324951, "step": 101 }, { "clip_ratio/high_max": 0.01629497029352933, "clip_ratio/high_mean": 0.00394011486787349, "clip_ratio/low_mean": 0.004311856668209657, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008251971448771656, "completions/clipped_ratio": 1.0, "completions/max_length": 424.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 360.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 295.0, "completions/min_terminated_length": 0.0, "entropy": 1.4277305901050568, "epoch": 51.0, "frac_reward_zero_std": 0.4375, "grad_norm": 0.016243265941739082, "kl": 0.23186054825782776, "learning_rate": 8e-05, "loss": 0.0016, "num_tokens": 10986384.0, "reward": 0.29296875, "reward_std": 0.12975817918777466, "rewards/num_nodes_reward/mean": 0.59375, "rewards/num_nodes_reward/std": 0.4930621087551117, "rewards/tree_correctness_reward/mean": 0.1640625, "rewards/tree_correctness_reward/std": 0.371787428855896, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6863697171211243, "sampling/importance_sampling_ratio/min": 4.274011319363809e-30, "sampling/sampling_logp_difference/max": 67.625, "sampling/sampling_logp_difference/mean": 4.894171714782715, "step": 102 }, { "clip_ratio/high_max": 0.029391401330940425, "clip_ratio/high_mean": 0.0073293532186653465, "clip_ratio/low_mean": 0.008625224989373237, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015954577829688787, "completions/clipped_ratio": 1.0, "completions/max_length": 637.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 379.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 317.0, "completions/min_terminated_length": 0.0, "entropy": 1.5343266874551773, "epoch": 51.5, "frac_reward_zero_std": 0.1875, "grad_norm": 0.05818008631467819, "kl": 0.19930880516767502, "learning_rate": 8e-05, "loss": 0.0026, "num_tokens": 11099136.0, "reward": 0.32109373807907104, "reward_std": 0.18611875176429749, "rewards/num_nodes_reward/mean": 0.6875, "rewards/num_nodes_reward/std": 0.4653336703777313, "rewards/tree_correctness_reward/mean": 0.1640625, "rewards/tree_correctness_reward/std": 0.371787428855896, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6635197401046753, "sampling/importance_sampling_ratio/min": 5.718563876253154e-29, "sampling/sampling_logp_difference/max": 65.03125, "sampling/sampling_logp_difference/mean": 5.186980724334717, "step": 103 }, { "clip_ratio/high_max": 0.018746584333712235, "clip_ratio/high_mean": 0.0033248991530854255, "clip_ratio/low_mean": 0.005997639411361888, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009322538680862635, "completions/clipped_ratio": 1.0, "completions/max_length": 459.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 367.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 312.0, "completions/min_terminated_length": 0.0, "entropy": 1.5288115888834, "epoch": 52.0, "frac_reward_zero_std": 0.4375, "grad_norm": 0.013581684790551662, "kl": 0.17932115867733955, "learning_rate": 8e-05, "loss": 0.0007, "num_tokens": 11210304.0, "reward": 0.28046876192092896, "reward_std": 0.15361686050891876, "rewards/num_nodes_reward/mean": 0.4609375, "rewards/num_nodes_reward/std": 0.5004304051399231, "rewards/tree_correctness_reward/mean": 0.203125, "rewards/tree_correctness_reward/std": 0.40390563011169434, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6956852674484253, "sampling/importance_sampling_ratio/min": 3.5083221349443985e-31, "sampling/sampling_logp_difference/max": 70.125, "sampling/sampling_logp_difference/mean": 4.68017578125, "step": 104 }, { "clip_ratio/high_max": 0.02387523732613772, "clip_ratio/high_mean": 0.005250764254014939, "clip_ratio/low_mean": 0.006207126716617495, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011457890970632434, "completions/clipped_ratio": 1.0, "completions/max_length": 362.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 313.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 246.0, "completions/min_terminated_length": 0.0, "entropy": 0.9807261228561401, "epoch": 52.5, "frac_reward_zero_std": 0.375, "grad_norm": 0.020294323563575745, "kl": 0.20384644903242588, "learning_rate": 8e-05, "loss": -0.0018, "num_tokens": 11314560.0, "reward": 0.3023437261581421, "reward_std": 0.17114359140396118, "rewards/num_nodes_reward/mean": 0.6796875, "rewards/num_nodes_reward/std": 0.4684300124645233, "rewards/tree_correctness_reward/mean": 0.140625, "rewards/tree_correctness_reward/std": 0.3490002751350403, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7626187801361084, "sampling/importance_sampling_ratio/min": 8.499887179496474e-30, "sampling/sampling_logp_difference/max": 66.9375, "sampling/sampling_logp_difference/mean": 3.85005521774292, "step": 105 }, { "clip_ratio/high_max": 0.03060229797847569, "clip_ratio/high_mean": 0.007901376578956842, "clip_ratio/low_mean": 0.00438190798740834, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012283284799195826, "completions/clipped_ratio": 1.0, "completions/max_length": 428.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 339.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 307.0, "completions/min_terminated_length": 0.0, "entropy": 1.3041435331106186, "epoch": 53.0, "frac_reward_zero_std": 0.375, "grad_norm": 0.016189055517315865, "kl": 0.212393119931221, "learning_rate": 8e-05, "loss": 0.0016, "num_tokens": 11422096.0, "reward": 0.2789062261581421, "reward_std": 0.1371718794107437, "rewards/num_nodes_reward/mean": 0.546875, "rewards/num_nodes_reward/std": 0.4997538626194, "rewards/tree_correctness_reward/mean": 0.1640625, "rewards/tree_correctness_reward/std": 0.371787428855896, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7179786562919617, "sampling/importance_sampling_ratio/min": 1.2367262089069031e-29, "sampling/sampling_logp_difference/max": 66.5625, "sampling/sampling_logp_difference/mean": 4.450005531311035, "step": 106 }, { "clip_ratio/high_max": 0.015035640506539494, "clip_ratio/high_mean": 0.003805186126555782, "clip_ratio/low_mean": 0.007183770154369995, "clip_ratio/low_min": 0.0003822630096692592, "clip_ratio/region_mean": 0.010988956200890243, "completions/clipped_ratio": 1.0, "completions/max_length": 408.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 349.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 310.0, "completions/min_terminated_length": 0.0, "entropy": 1.4260711520910263, "epoch": 53.5, "frac_reward_zero_std": 0.3125, "grad_norm": 0.014669189229607582, "kl": 0.1986222080886364, "learning_rate": 8e-05, "loss": -0.0001, "num_tokens": 11531008.0, "reward": 0.28984376788139343, "reward_std": 0.1511957198381424, "rewards/num_nodes_reward/mean": 0.546875, "rewards/num_nodes_reward/std": 0.4997538626194, "rewards/tree_correctness_reward/mean": 0.1796875, "rewards/tree_correctness_reward/std": 0.3854354918003082, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6834430694580078, "sampling/importance_sampling_ratio/min": 7.816349966256032e-29, "sampling/sampling_logp_difference/max": 64.71875, "sampling/sampling_logp_difference/mean": 4.919986248016357, "step": 107 }, { "clip_ratio/high_max": 0.01348037226125598, "clip_ratio/high_mean": 0.0028517855535028502, "clip_ratio/low_mean": 0.004930719354888424, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007782505068462342, "completions/clipped_ratio": 1.0, "completions/max_length": 430.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 346.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 286.0, "completions/min_terminated_length": 0.0, "entropy": 1.2396919429302216, "epoch": 54.0, "frac_reward_zero_std": 0.5, "grad_norm": 0.023526426404714584, "kl": 0.2266864962875843, "learning_rate": 8e-05, "loss": 0.0019, "num_tokens": 11639488.0, "reward": 0.21796874701976776, "reward_std": 0.11479286849498749, "rewards/num_nodes_reward/mean": 0.6171875, "rewards/num_nodes_reward/std": 0.4879830479621887, "rewards/tree_correctness_reward/mean": 0.046875, "rewards/tree_correctness_reward/std": 0.21220162510871887, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7252365350723267, "sampling/importance_sampling_ratio/min": 5.206811789961654e-29, "sampling/sampling_logp_difference/max": 65.125, "sampling/sampling_logp_difference/mean": 4.3245038986206055, "step": 108 }, { "clip_ratio/high_max": 0.029223773861303926, "clip_ratio/high_mean": 0.007116494263755158, "clip_ratio/low_mean": 0.00988276390125975, "clip_ratio/low_min": 0.00018656716565601528, "clip_ratio/region_mean": 0.016999258077703416, "completions/clipped_ratio": 1.0, "completions/max_length": 382.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 329.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 297.0, "completions/min_terminated_length": 0.0, "entropy": 1.2354019060730934, "epoch": 54.5, "frac_reward_zero_std": 0.1875, "grad_norm": 0.01705428585410118, "kl": 0.24366542883217335, "learning_rate": 8e-05, "loss": 0.0004, "num_tokens": 11745760.0, "reward": 0.2710937261581421, "reward_std": 0.23664113879203796, "rewards/num_nodes_reward/mean": 0.5390625, "rewards/num_nodes_reward/std": 0.5004304051399231, "rewards/tree_correctness_reward/mean": 0.15625, "rewards/tree_correctness_reward/std": 0.3645188808441162, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7179815769195557, "sampling/importance_sampling_ratio/min": 3.3617703013855747e-29, "sampling/sampling_logp_difference/max": 65.5625, "sampling/sampling_logp_difference/mean": 4.465206146240234, "step": 109 }, { "clip_ratio/high_max": 0.028188285185024142, "clip_ratio/high_mean": 0.005576092094997875, "clip_ratio/low_mean": 0.005123120819916949, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010699212725739926, "completions/clipped_ratio": 1.0, "completions/max_length": 415.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 331.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 219.0, "completions/min_terminated_length": 0.0, "entropy": 1.2415581792593002, "epoch": 55.0, "frac_reward_zero_std": 0.4375, "grad_norm": 0.013026142492890358, "kl": 0.20073599182069302, "learning_rate": 8e-05, "loss": -0.0004, "num_tokens": 11852304.0, "reward": 0.24453124403953552, "reward_std": 0.12546178698539734, "rewards/num_nodes_reward/mean": 0.5234375, "rewards/num_nodes_reward/std": 0.5014128684997559, "rewards/tree_correctness_reward/mean": 0.125, "rewards/tree_correctness_reward/std": 0.3320184051990509, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.712395191192627, "sampling/importance_sampling_ratio/min": 5.0466150436734006e-29, "sampling/sampling_logp_difference/max": 65.15625, "sampling/sampling_logp_difference/mean": 4.553300857543945, "step": 110 }, { "epoch": 55.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 310.2, "eval_completions/max_terminated_length": 0.0, "eval_completions/mean_length": 259.225, "eval_completions/mean_terminated_length": 0.0, "eval_completions/min_length": 200.4, "eval_completions/min_terminated_length": 0.0, "eval_entropy": 0.5872734546661377, "eval_frac_reward_zero_std": 0.6, "eval_kl": 0.2573328047990799, "eval_loss": 0.0010099844075739384, "eval_num_tokens": 11852304.0, "eval_reward": 0.30499999821186063, "eval_reward_std": 0.12337257117033004, "eval_rewards/num_nodes_reward/mean": 0.8125, "eval_rewards/num_nodes_reward/std": 0.2532795548439026, "eval_rewards/tree_correctness_reward/mean": 0.0875, "eval_rewards/tree_correctness_reward/std": 0.24893558621406556, "eval_runtime": 38.4144, "eval_samples_per_second": 0.26, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 0.9055965304374695, "eval_sampling/importance_sampling_ratio/min": 5.677477097932818e-28, "eval_sampling/sampling_logp_difference/max": 62.875, "eval_sampling/sampling_logp_difference/mean": 1.6251771688461303, "eval_steps_per_second": 0.026, "step": 110 }, { "clip_ratio/high_max": 0.01474824734032154, "clip_ratio/high_mean": 0.0022916618472663686, "clip_ratio/low_mean": 0.007419144327286631, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009710806421935558, "completions/clipped_ratio": 1.0, "completions/max_length": 364.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 310.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 255.0, "completions/min_terminated_length": 0.0, "entropy": 1.0934151262044907, "epoch": 55.5, "frac_reward_zero_std": 0.5, "grad_norm": 0.017199324443936348, "kl": 0.21641815453767776, "learning_rate": 8e-05, "loss": 0.0028, "num_tokens": 11956144.0, "reward": 0.31171873211860657, "reward_std": 0.1355111449956894, "rewards/num_nodes_reward/mean": 0.7109375, "rewards/num_nodes_reward/std": 0.45510825514793396, "rewards/tree_correctness_reward/mean": 0.140625, "rewards/tree_correctness_reward/std": 0.3490002751350403, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7484222650527954, "sampling/importance_sampling_ratio/min": 6.897904670286248e-29, "sampling/sampling_logp_difference/max": 64.84375, "sampling/sampling_logp_difference/mean": 4.062161445617676, "step": 111 }, { "clip_ratio/high_max": 0.027715622214600444, "clip_ratio/high_mean": 0.006707245949655771, "clip_ratio/low_mean": 0.0058434092497918755, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01255065540317446, "completions/clipped_ratio": 1.0, "completions/max_length": 370.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 327.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 291.0, "completions/min_terminated_length": 0.0, "entropy": 1.3096813037991524, "epoch": 56.0, "frac_reward_zero_std": 0.25, "grad_norm": 0.017104916274547577, "kl": 0.22319420613348484, "learning_rate": 8e-05, "loss": -0.0012, "num_tokens": 12062224.0, "reward": 0.27656251192092896, "reward_std": 0.15692192316055298, "rewards/num_nodes_reward/mean": 0.59375, "rewards/num_nodes_reward/std": 0.4930621087551117, "rewards/tree_correctness_reward/mean": 0.140625, "rewards/tree_correctness_reward/std": 0.3490002751350403, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6948609352111816, "sampling/importance_sampling_ratio/min": 8.15709290233341e-31, "sampling/sampling_logp_difference/max": 69.28125, "sampling/sampling_logp_difference/mean": 4.8250322341918945, "step": 112 }, { "clip_ratio/high_max": 0.019910678442101926, "clip_ratio/high_mean": 0.004871320961683523, "clip_ratio/low_mean": 0.005231493123574182, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010102814005222172, "completions/clipped_ratio": 1.0, "completions/max_length": 342.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 312.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 280.0, "completions/min_terminated_length": 0.0, "entropy": 1.3534389212727547, "epoch": 56.5, "frac_reward_zero_std": 0.375, "grad_norm": 0.014659112319350243, "kl": 0.2068516630679369, "learning_rate": 8e-05, "loss": 0.0007, "num_tokens": 12166336.0, "reward": 0.3812499940395355, "reward_std": 0.18766826391220093, "rewards/num_nodes_reward/mean": 0.7421875, "rewards/num_nodes_reward/std": 0.43914902210235596, "rewards/tree_correctness_reward/mean": 0.2265625, "rewards/tree_correctness_reward/std": 0.4202519655227661, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6945724487304688, "sampling/importance_sampling_ratio/min": 8.584580550654011e-29, "sampling/sampling_logp_difference/max": 64.625, "sampling/sampling_logp_difference/mean": 4.801627159118652, "step": 113 }, { "clip_ratio/high_max": 0.02917112340219319, "clip_ratio/high_mean": 0.00722770401625894, "clip_ratio/low_mean": 0.006975109514314681, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014202813152223825, "completions/clipped_ratio": 1.0, "completions/max_length": 378.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 305.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 264.0, "completions/min_terminated_length": 0.0, "entropy": 0.8770395293831825, "epoch": 57.0, "frac_reward_zero_std": 0.3125, "grad_norm": 0.017740435898303986, "kl": 0.27152837067842484, "learning_rate": 8e-05, "loss": 0.0003, "num_tokens": 12269568.0, "reward": 0.20468750596046448, "reward_std": 0.16998672485351562, "rewards/num_nodes_reward/mean": 0.4453125, "rewards/num_nodes_reward/std": 0.4989531338214874, "rewards/tree_correctness_reward/mean": 0.1015625, "rewards/tree_correctness_reward/std": 0.3032590448856354, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7782745361328125, "sampling/importance_sampling_ratio/min": 1.8565459107282552e-29, "sampling/sampling_logp_difference/max": 66.15625, "sampling/sampling_logp_difference/mean": 3.672184467315674, "step": 114 }, { "clip_ratio/high_max": 0.02637151617091149, "clip_ratio/high_mean": 0.005867535277502611, "clip_ratio/low_mean": 0.009605792700313032, "clip_ratio/low_min": 0.000327225134242326, "clip_ratio/region_mean": 0.015473327366635203, "completions/clipped_ratio": 1.0, "completions/max_length": 333.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 289.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 191.0, "completions/min_terminated_length": 0.0, "entropy": 1.1078726649284363, "epoch": 57.5, "frac_reward_zero_std": 0.125, "grad_norm": 0.024404821917414665, "kl": 0.25701846182346344, "learning_rate": 8e-05, "loss": -0.0008, "num_tokens": 12370768.0, "reward": 0.3687500059604645, "reward_std": 0.2377256155014038, "rewards/num_nodes_reward/mean": 0.6640625, "rewards/num_nodes_reward/std": 0.47417303919792175, "rewards/tree_correctness_reward/mean": 0.2421875, "rewards/tree_correctness_reward/std": 0.4300905168056488, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7349720001220703, "sampling/importance_sampling_ratio/min": 6.08737946060331e-29, "sampling/sampling_logp_difference/max": 64.96875, "sampling/sampling_logp_difference/mean": 4.312590599060059, "step": 115 }, { "clip_ratio/high_max": 0.024498727085301653, "clip_ratio/high_mean": 0.00482231207206496, "clip_ratio/low_mean": 0.0056475853198207915, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010469897300936282, "completions/clipped_ratio": 1.0, "completions/max_length": 368.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 309.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 276.0, "completions/min_terminated_length": 0.0, "entropy": 0.9653989523649216, "epoch": 58.0, "frac_reward_zero_std": 0.4375, "grad_norm": 0.022045670077204704, "kl": 0.3544970974326134, "learning_rate": 8e-05, "loss": -0.0003, "num_tokens": 12474496.0, "reward": 0.21015626192092896, "reward_std": 0.14307278394699097, "rewards/num_nodes_reward/mean": 0.5, "rewards/num_nodes_reward/std": 0.5019646286964417, "rewards/tree_correctness_reward/mean": 0.0859375, "rewards/tree_correctness_reward/std": 0.2813730239868164, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7588527798652649, "sampling/importance_sampling_ratio/min": 1.63839597639079e-29, "sampling/sampling_logp_difference/max": 66.28125, "sampling/sampling_logp_difference/mean": 3.947988510131836, "step": 116 }, { "clip_ratio/high_max": 0.017907760455273092, "clip_ratio/high_mean": 0.004551533900666982, "clip_ratio/low_mean": 0.006124536594143137, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01067607052391395, "completions/clipped_ratio": 1.0, "completions/max_length": 395.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 336.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 296.0, "completions/min_terminated_length": 0.0, "entropy": 1.2359319627285004, "epoch": 58.5, "frac_reward_zero_std": 0.375, "grad_norm": 0.014133713208138943, "kl": 0.30020686984062195, "learning_rate": 8e-05, "loss": -0.0006, "num_tokens": 12581632.0, "reward": 0.2789062559604645, "reward_std": 0.1521216630935669, "rewards/num_nodes_reward/mean": 0.546875, "rewards/num_nodes_reward/std": 0.4997538626194, "rewards/tree_correctness_reward/mean": 0.1640625, "rewards/tree_correctness_reward/std": 0.371787428855896, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7147839665412903, "sampling/importance_sampling_ratio/min": 1.9567850724240038e-30, "sampling/sampling_logp_difference/max": 68.40625, "sampling/sampling_logp_difference/mean": 4.5253801345825195, "step": 117 }, { "clip_ratio/high_max": 0.022573079564608634, "clip_ratio/high_mean": 0.0049083206977229565, "clip_ratio/low_mean": 0.007190555683337152, "clip_ratio/low_min": 0.0005352670705178753, "clip_ratio/region_mean": 0.012098876293748617, "completions/clipped_ratio": 1.0, "completions/max_length": 713.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 393.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 271.0, "completions/min_terminated_length": 0.0, "entropy": 1.7694540321826935, "epoch": 59.0, "frac_reward_zero_std": 0.1875, "grad_norm": 0.017449017614126205, "kl": 0.24595136009156704, "learning_rate": 8e-05, "loss": -0.0037, "num_tokens": 12696112.0, "reward": 0.21328124403953552, "reward_std": 0.22720548510551453, "rewards/num_nodes_reward/mean": 0.3828125, "rewards/num_nodes_reward/std": 0.4879830479621887, "rewards/tree_correctness_reward/mean": 0.140625, "rewards/tree_correctness_reward/std": 0.3490002751350403, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6438877582550049, "sampling/importance_sampling_ratio/min": 7.116867858635386e-29, "sampling/sampling_logp_difference/max": 64.8125, "sampling/sampling_logp_difference/mean": 5.410667419433594, "step": 118 }, { "clip_ratio/high_max": 0.013418294140137732, "clip_ratio/high_mean": 0.0023752823763061315, "clip_ratio/low_mean": 0.004871673503657803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007246955938171595, "completions/clipped_ratio": 1.0, "completions/max_length": 435.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 350.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 291.0, "completions/min_terminated_length": 0.0, "entropy": 1.532955840229988, "epoch": 59.5, "frac_reward_zero_std": 0.5, "grad_norm": 0.01398333441466093, "kl": 0.27546034567058086, "learning_rate": 8e-05, "loss": 0.0023, "num_tokens": 12805120.0, "reward": 0.32343751192092896, "reward_std": 0.12154465168714523, "rewards/num_nodes_reward/mean": 0.5859375, "rewards/num_nodes_reward/std": 0.49449479579925537, "rewards/tree_correctness_reward/mean": 0.2109375, "rewards/tree_correctness_reward/std": 0.4095771610736847, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6785688400268555, "sampling/importance_sampling_ratio/min": 2.966751838288923e-29, "sampling/sampling_logp_difference/max": 65.6875, "sampling/sampling_logp_difference/mean": 4.986217498779297, "step": 119 }, { "clip_ratio/high_max": 0.024467053008265793, "clip_ratio/high_mean": 0.005139640357811004, "clip_ratio/low_mean": 0.0067039254936389625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011843565851449966, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 572.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 266.0, "completions/min_terminated_length": 0.0, "entropy": 1.9483372867107391, "epoch": 60.0, "frac_reward_zero_std": 0.3125, "grad_norm": 0.015056724660098553, "kl": 0.409986887127161, "learning_rate": 8e-05, "loss": 0.0021, "num_tokens": 12942480.0, "reward": 0.2796875238418579, "reward_std": 0.20140421390533447, "rewards/num_nodes_reward/mean": 0.640625, "rewards/num_nodes_reward/std": 0.481702595949173, "rewards/tree_correctness_reward/mean": 0.125, "rewards/tree_correctness_reward/std": 0.3320184051990509, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6226392984390259, "sampling/importance_sampling_ratio/min": 3.1943635805629367e-31, "sampling/sampling_logp_difference/max": 70.21875, "sampling/sampling_logp_difference/mean": 5.839439392089844, "step": 120 }, { "epoch": 60.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 366.4, "eval_completions/max_terminated_length": 0.0, "eval_completions/mean_length": 283.575, "eval_completions/mean_terminated_length": 0.0, "eval_completions/min_length": 227.6, "eval_completions/min_terminated_length": 0.0, "eval_entropy": 0.548667711019516, "eval_frac_reward_zero_std": 0.7, "eval_kl": 0.44362484812736513, "eval_loss": 0.0006708790897391737, "eval_num_tokens": 12942480.0, "eval_reward": 0.29500000476837157, "eval_reward_std": 0.11441459953784942, "eval_rewards/num_nodes_reward/mean": 0.75, "eval_rewards/num_nodes_reward/std": 0.30149178504943847, "eval_rewards/tree_correctness_reward/mean": 0.1, "eval_rewards/tree_correctness_reward/std": 0.22606874108314515, "eval_runtime": 44.1245, "eval_samples_per_second": 0.227, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 0.9058596730232239, "eval_sampling/importance_sampling_ratio/min": 7.756031403335113e-29, "eval_sampling/sampling_logp_difference/max": 65.71875, "eval_sampling/sampling_logp_difference/mean": 1.5838570833206176, "eval_steps_per_second": 0.023, "step": 120 }, { "clip_ratio/high_max": 0.01775308366632089, "clip_ratio/high_mean": 0.0036749204809893854, "clip_ratio/low_mean": 0.005354416018235497, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009029336390085518, "completions/clipped_ratio": 1.0, "completions/max_length": 458.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 385.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 307.0, "completions/min_terminated_length": 0.0, "entropy": 1.8960187286138535, "epoch": 60.5, "frac_reward_zero_std": 0.375, "grad_norm": 0.01382515113800764, "kl": 0.2855394948273897, "learning_rate": 8e-05, "loss": 0.003, "num_tokens": 13056000.0, "reward": 0.3968749940395355, "reward_std": 0.18486711382865906, "rewards/num_nodes_reward/mean": 0.7578125, "rewards/num_nodes_reward/std": 0.4300905168056488, "rewards/tree_correctness_reward/mean": 0.2421875, "rewards/tree_correctness_reward/std": 0.4300905168056488, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.624099612236023, "sampling/importance_sampling_ratio/min": 8.238372224122146e-30, "sampling/sampling_logp_difference/max": 66.96875, "sampling/sampling_logp_difference/mean": 5.670686721801758, "step": 121 }, { "clip_ratio/high_max": 0.01846451486926526, "clip_ratio/high_mean": 0.003230636488297023, "clip_ratio/low_mean": 0.004973265720764175, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008203902340028435, "completions/clipped_ratio": 1.0, "completions/max_length": 548.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 395.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 325.0, "completions/min_terminated_length": 0.0, "entropy": 1.5677141696214676, "epoch": 61.0, "frac_reward_zero_std": 0.3125, "grad_norm": 0.020058678463101387, "kl": 0.9522172808647156, "learning_rate": 8e-05, "loss": -0.0016, "num_tokens": 13170704.0, "reward": 0.23124998807907104, "reward_std": 0.21289917826652527, "rewards/num_nodes_reward/mean": 0.4609375, "rewards/num_nodes_reward/std": 0.5004304051399231, "rewards/tree_correctness_reward/mean": 0.1328125, "rewards/tree_correctness_reward/std": 0.3407054841518402, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6629204750061035, "sampling/importance_sampling_ratio/min": 2.3838521382698358e-29, "sampling/sampling_logp_difference/max": 65.90625, "sampling/sampling_logp_difference/mean": 5.144073486328125, "step": 122 }, { "clip_ratio/high_max": 0.024506882997229695, "clip_ratio/high_mean": 0.005722499554394744, "clip_ratio/low_mean": 0.006454277609009296, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012176776945125312, "completions/clipped_ratio": 1.0, "completions/max_length": 702.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 435.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 353.0, "completions/min_terminated_length": 0.0, "entropy": 2.045535072684288, "epoch": 61.5, "frac_reward_zero_std": 0.25, "grad_norm": 0.018078317865729332, "kl": 0.3513330779969692, "learning_rate": 8e-05, "loss": -0.0002, "num_tokens": 13290592.0, "reward": 0.32421875, "reward_std": 0.21355412900447845, "rewards/num_nodes_reward/mean": 0.625, "rewards/num_nodes_reward/std": 0.4860251843929291, "rewards/tree_correctness_reward/mean": 0.1953125, "rewards/tree_correctness_reward/std": 0.3979988098144531, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6081721782684326, "sampling/importance_sampling_ratio/min": 2.8754744950605667e-29, "sampling/sampling_logp_difference/max": 65.71875, "sampling/sampling_logp_difference/mean": 5.846678256988525, "step": 123 }, { "clip_ratio/high_max": 0.02542744146194309, "clip_ratio/high_mean": 0.00597946101333946, "clip_ratio/low_mean": 0.006469933316111565, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012449394096620381, "completions/clipped_ratio": 1.0, "completions/max_length": 398.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 341.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 286.0, "completions/min_terminated_length": 0.0, "entropy": 1.1792194992303848, "epoch": 62.0, "frac_reward_zero_std": 0.375, "grad_norm": 0.015120038762688637, "kl": 0.3730519972741604, "learning_rate": 8e-05, "loss": 0.0018, "num_tokens": 13398384.0, "reward": 0.25468748807907104, "reward_std": 0.17189821600914001, "rewards/num_nodes_reward/mean": 0.59375, "rewards/num_nodes_reward/std": 0.4930621087551117, "rewards/tree_correctness_reward/mean": 0.109375, "rewards/tree_correctness_reward/std": 0.31333550810813904, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7318875789642334, "sampling/importance_sampling_ratio/min": 9.839332523444898e-31, "sampling/sampling_logp_difference/max": 69.09375, "sampling/sampling_logp_difference/mean": 4.256145477294922, "step": 124 }, { "clip_ratio/high_max": 0.01151478334213607, "clip_ratio/high_mean": 0.002099377103149891, "clip_ratio/low_mean": 0.004735176189569756, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006834553263615817, "completions/clipped_ratio": 1.0, "completions/max_length": 506.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 415.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 317.0, "completions/min_terminated_length": 0.0, "entropy": 1.8735380619764328, "epoch": 62.5, "frac_reward_zero_std": 0.4375, "grad_norm": 0.027063028886914253, "kl": 0.27873899415135384, "learning_rate": 8e-05, "loss": 0.0015, "num_tokens": 13515664.0, "reward": 0.3023437559604645, "reward_std": 0.17503343522548676, "rewards/num_nodes_reward/mean": 0.625, "rewards/num_nodes_reward/std": 0.4860251843929291, "rewards/tree_correctness_reward/mean": 0.1640625, "rewards/tree_correctness_reward/std": 0.371787428855896, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6190707683563232, "sampling/importance_sampling_ratio/min": 5.90009059760205e-29, "sampling/sampling_logp_difference/max": 65.0, "sampling/sampling_logp_difference/mean": 5.699274063110352, "step": 125 }, { "clip_ratio/high_max": 0.03100068517960608, "clip_ratio/high_mean": 0.007693292805925012, "clip_ratio/low_mean": 0.006126531807240099, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013819824438542128, "completions/clipped_ratio": 1.0, "completions/max_length": 497.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 386.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 286.0, "completions/min_terminated_length": 0.0, "entropy": 1.6767232120037079, "epoch": 63.0, "frac_reward_zero_std": 0.125, "grad_norm": 0.01864105835556984, "kl": 0.28530125692486763, "learning_rate": 8e-05, "loss": 0.0012, "num_tokens": 13629232.0, "reward": 0.33125001192092896, "reward_std": 0.2393726110458374, "rewards/num_nodes_reward/mean": 0.6484375, "rewards/num_nodes_reward/std": 0.4793342351913452, "rewards/tree_correctness_reward/mean": 0.1953125, "rewards/tree_correctness_reward/std": 0.3979988098144531, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6467541456222534, "sampling/importance_sampling_ratio/min": 7.1986098550006065e-31, "sampling/sampling_logp_difference/max": 69.40625, "sampling/sampling_logp_difference/mean": 5.373539924621582, "step": 126 }, { "clip_ratio/high_max": 0.026646867161616683, "clip_ratio/high_mean": 0.007369871833361685, "clip_ratio/low_mean": 0.005723825408495031, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013093697663862258, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 577.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 321.0, "completions/min_terminated_length": 0.0, "entropy": 2.0316274017095566, "epoch": 63.5, "frac_reward_zero_std": 0.1875, "grad_norm": 0.016446257010102272, "kl": 0.30697569623589516, "learning_rate": 8e-05, "loss": 0.0173, "num_tokens": 13767312.0, "reward": 0.23906248807907104, "reward_std": 0.18062299489974976, "rewards/num_nodes_reward/mean": 0.578125, "rewards/num_nodes_reward/std": 0.4957992732524872, "rewards/tree_correctness_reward/mean": 0.09375, "rewards/tree_correctness_reward/std": 0.29262590408325195, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.618274450302124, "sampling/importance_sampling_ratio/min": 3.2583394374284734e-29, "sampling/sampling_logp_difference/max": 65.59375, "sampling/sampling_logp_difference/mean": 5.8751983642578125, "step": 127 }, { "clip_ratio/high_max": 0.01942947180941701, "clip_ratio/high_mean": 0.0063385786197613925, "clip_ratio/low_mean": 0.006388126668753102, "clip_ratio/low_min": 0.0005323679579305463, "clip_ratio/region_mean": 0.012726705404929817, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 601.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 304.0, "completions/min_terminated_length": 0.0, "entropy": 2.378005266189575, "epoch": 64.0, "frac_reward_zero_std": 0.125, "grad_norm": 0.03017839416861534, "kl": 0.24480830691754818, "learning_rate": 8e-05, "loss": 0.0127, "num_tokens": 13908448.0, "reward": 0.3203125, "reward_std": 0.231505885720253, "rewards/num_nodes_reward/mean": 0.59375, "rewards/num_nodes_reward/std": 0.4930621087551117, "rewards/tree_correctness_reward/mean": 0.203125, "rewards/tree_correctness_reward/std": 0.40390563011169434, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5675022602081299, "sampling/importance_sampling_ratio/min": 6.157297243929202e-31, "sampling/sampling_logp_difference/max": 69.5625, "sampling/sampling_logp_difference/mean": 6.5157318115234375, "step": 128 }, { "clip_ratio/high_max": 0.022114855819381773, "clip_ratio/high_mean": 0.0053071286383783445, "clip_ratio/low_mean": 0.005236616503680125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010543745185714215, "completions/clipped_ratio": 1.0, "completions/max_length": 443.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 383.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 285.0, "completions/min_terminated_length": 0.0, "entropy": 1.5913755595684052, "epoch": 64.5, "frac_reward_zero_std": 0.25, "grad_norm": 0.022199204191565514, "kl": 0.3247086629271507, "learning_rate": 8e-05, "loss": 0.0008, "num_tokens": 14021712.0, "reward": 0.33906251192092896, "reward_std": 0.2187701314687729, "rewards/num_nodes_reward/mean": 0.65625, "rewards/num_nodes_reward/std": 0.47682511806488037, "rewards/tree_correctness_reward/mean": 0.203125, "rewards/tree_correctness_reward/std": 0.40390563011169434, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6516109704971313, "sampling/importance_sampling_ratio/min": 2.847105246053189e-30, "sampling/sampling_logp_difference/max": 68.03125, "sampling/sampling_logp_difference/mean": 5.3292341232299805, "step": 129 }, { "clip_ratio/high_max": 0.015176160610280931, "clip_ratio/high_mean": 0.004535657790256664, "clip_ratio/low_mean": 0.006615778140258044, "clip_ratio/low_min": 0.00029761905898340046, "clip_ratio/region_mean": 0.011151435901410878, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 811.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 344.0, "completions/min_terminated_length": 0.0, "entropy": 2.848235696554184, "epoch": 65.0, "frac_reward_zero_std": 0.25, "grad_norm": 0.015479855239391327, "kl": 0.29897816851735115, "learning_rate": 8e-05, "loss": 0.0183, "num_tokens": 14189760.0, "reward": 0.2679687738418579, "reward_std": 0.1860959529876709, "rewards/num_nodes_reward/mean": 0.4921875, "rewards/num_nodes_reward/std": 0.5019033551216125, "rewards/tree_correctness_reward/mean": 0.171875, "rewards/tree_correctness_reward/std": 0.3787541687488556, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5150429606437683, "sampling/importance_sampling_ratio/min": 8.238372224122146e-30, "sampling/sampling_logp_difference/max": 66.96875, "sampling/sampling_logp_difference/mean": 7.355480194091797, "step": 130 }, { "epoch": 65.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 360.0, "eval_completions/max_terminated_length": 0.0, "eval_completions/mean_length": 278.1, "eval_completions/mean_terminated_length": 0.0, "eval_completions/min_length": 225.2, "eval_completions/min_terminated_length": 0.0, "eval_entropy": 0.5526695013046264, "eval_frac_reward_zero_std": 0.6, "eval_kl": 0.4299846589565277, "eval_loss": -0.0018459494458511472, "eval_num_tokens": 14189760.0, "eval_reward": 0.3075000047683716, "eval_reward_std": 0.13013954609632492, "eval_rewards/num_nodes_reward/mean": 0.7625, "eval_rewards/num_nodes_reward/std": 0.29519179463386536, "eval_rewards/tree_correctness_reward/mean": 0.1125, "eval_rewards/tree_correctness_reward/std": 0.27606874108314516, "eval_runtime": 42.6851, "eval_samples_per_second": 0.234, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 0.901832389831543, "eval_sampling/importance_sampling_ratio/min": 5.660253578662536e-28, "eval_sampling/sampling_logp_difference/max": 63.0625, "eval_sampling/sampling_logp_difference/mean": 1.696182417869568, "eval_steps_per_second": 0.023, "step": 130 }, { "clip_ratio/high_max": 0.018856453360058367, "clip_ratio/high_mean": 0.00579915865091607, "clip_ratio/low_mean": 0.0044898916385136545, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01028905060957186, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 593.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 288.0, "completions/min_terminated_length": 0.0, "entropy": 2.294717848300934, "epoch": 65.5, "frac_reward_zero_std": 0.375, "grad_norm": 0.016361650079488754, "kl": 0.32100204564630985, "learning_rate": 8e-05, "loss": 0.0223, "num_tokens": 14329888.0, "reward": 0.2796875238418579, "reward_std": 0.13276483118534088, "rewards/num_nodes_reward/mean": 0.640625, "rewards/num_nodes_reward/std": 0.481702595949173, "rewards/tree_correctness_reward/mean": 0.125, "rewards/tree_correctness_reward/std": 0.3320184051990509, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.5756416320800781, "sampling/importance_sampling_ratio/min": 2.847105246053189e-30, "sampling/sampling_logp_difference/max": 68.03125, "sampling/sampling_logp_difference/mean": 6.444683074951172, "step": 131 }, { "clip_ratio/high_max": 0.012915871920995414, "clip_ratio/high_mean": 0.0034009291703114286, "clip_ratio/low_mean": 0.004754675785079598, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008155605115462095, "completions/clipped_ratio": 1.0, "completions/max_length": 394.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 333.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 296.0, "completions/min_terminated_length": 0.0, "entropy": 1.2312566488981247, "epoch": 66.0, "frac_reward_zero_std": 0.375, "grad_norm": 0.011694896034896374, "kl": 0.34927068650722504, "learning_rate": 8e-05, "loss": 0.0012, "num_tokens": 14436656.0, "reward": 0.3179687559604645, "reward_std": 0.14422079920768738, "rewards/num_nodes_reward/mean": 0.640625, "rewards/num_nodes_reward/std": 0.481702595949173, "rewards/tree_correctness_reward/mean": 0.1796875, "rewards/tree_correctness_reward/std": 0.3854354918003082, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7004845142364502, "sampling/importance_sampling_ratio/min": 1.6904042099061981e-29, "sampling/sampling_logp_difference/max": 66.25, "sampling/sampling_logp_difference/mean": 4.737485885620117, "step": 132 }, { "clip_ratio/high_max": 0.015496064210310578, "clip_ratio/high_mean": 0.004446731451025698, "clip_ratio/low_mean": 0.00710120250005275, "clip_ratio/low_min": 0.00020424836839083582, "clip_ratio/region_mean": 0.011547934031113982, "completions/clipped_ratio": 1.0, "completions/max_length": 807.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 416.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 306.0, "completions/min_terminated_length": 0.0, "entropy": 1.8244241774082184, "epoch": 66.5, "frac_reward_zero_std": 0.1875, "grad_norm": 0.032767049968242645, "kl": 0.5939013250172138, "learning_rate": 8e-05, "loss": -0.0005, "num_tokens": 14554112.0, "reward": 0.3609374761581421, "reward_std": 0.2509858310222626, "rewards/num_nodes_reward/mean": 0.65625, "rewards/num_nodes_reward/std": 0.47682511806488037, "rewards/tree_correctness_reward/mean": 0.234375, "rewards/tree_correctness_reward/std": 0.42527204751968384, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6159789562225342, "sampling/importance_sampling_ratio/min": 5.662144510848715e-30, "sampling/sampling_logp_difference/max": 67.34375, "sampling/sampling_logp_difference/mean": 5.828707218170166, "step": 133 }, { "clip_ratio/high_max": 0.03524530038703233, "clip_ratio/high_mean": 0.010053227539174259, "clip_ratio/low_mean": 0.006709231005515903, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016762458253651857, "completions/clipped_ratio": 1.0, "completions/max_length": 378.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 337.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 289.0, "completions/min_terminated_length": 0.0, "entropy": 1.2454154193401337, "epoch": 67.0, "frac_reward_zero_std": 0.0625, "grad_norm": 0.022423624992370605, "kl": 0.2514845635741949, "learning_rate": 8e-05, "loss": -0.0024, "num_tokens": 14661440.0, "reward": 0.31328123807907104, "reward_std": 0.2195829451084137, "rewards/num_nodes_reward/mean": 0.625, "rewards/num_nodes_reward/std": 0.4860251843929291, "rewards/tree_correctness_reward/mean": 0.1796875, "rewards/tree_correctness_reward/std": 0.3854354918003082, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6950277090072632, "sampling/importance_sampling_ratio/min": 3.2957636040558285e-31, "sampling/sampling_logp_difference/max": 70.1875, "sampling/sampling_logp_difference/mean": 4.813918113708496, "step": 134 }, { "clip_ratio/high_max": 0.026780626852996647, "clip_ratio/high_mean": 0.007992175815161318, "clip_ratio/low_mean": 0.007767322240397334, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015759498346596956, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 328.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 257.0, "completions/min_terminated_length": 0.0, "entropy": 1.1643287390470505, "epoch": 67.5, "frac_reward_zero_std": 0.125, "grad_norm": 0.01480638887733221, "kl": 0.2842021156102419, "learning_rate": 8e-05, "loss": 0.0013, "num_tokens": 14767632.0, "reward": 0.21953125298023224, "reward_std": 0.204127237200737, "rewards/num_nodes_reward/mean": 0.53125, "rewards/num_nodes_reward/std": 0.5009832978248596, "rewards/tree_correctness_reward/mean": 0.0859375, "rewards/tree_correctness_reward/std": 0.2813730239868164, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.71321702003479, "sampling/importance_sampling_ratio/min": 8.41602703635062e-31, "sampling/sampling_logp_difference/max": 69.25, "sampling/sampling_logp_difference/mean": 4.5719451904296875, "step": 135 }, { "clip_ratio/high_max": 0.023850938072428107, "clip_ratio/high_mean": 0.006416868447558954, "clip_ratio/low_mean": 0.006703388993628323, "clip_ratio/low_min": 0.00020627063349820673, "clip_ratio/region_mean": 0.013120257062837481, "completions/clipped_ratio": 1.0, "completions/max_length": 317.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 282.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 251.0, "completions/min_terminated_length": 0.0, "entropy": 0.9103499203920364, "epoch": 68.0, "frac_reward_zero_std": 0.3125, "grad_norm": 0.01405610702931881, "kl": 0.29504169896245, "learning_rate": 8e-05, "loss": -0.0013, "num_tokens": 14867968.0, "reward": 0.34687501192092896, "reward_std": 0.20048978924751282, "rewards/num_nodes_reward/mean": 0.609375, "rewards/num_nodes_reward/std": 0.4898075461387634, "rewards/tree_correctness_reward/mean": 0.234375, "rewards/tree_correctness_reward/std": 0.42527204751968384, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7677021622657776, "sampling/importance_sampling_ratio/min": 4.8913468724834555e-29, "sampling/sampling_logp_difference/max": 65.1875, "sampling/sampling_logp_difference/mean": 3.855882167816162, "step": 136 }, { "clip_ratio/high_max": 0.026445729192346334, "clip_ratio/high_mean": 0.006793641980038956, "clip_ratio/low_mean": 0.006571940146386623, "clip_ratio/low_min": 0.00015624999650754035, "clip_ratio/region_mean": 0.013365581980906427, "completions/clipped_ratio": 1.0, "completions/max_length": 400.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 317.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 255.0, "completions/min_terminated_length": 0.0, "entropy": 1.0649447441101074, "epoch": 68.5, "frac_reward_zero_std": 0.1875, "grad_norm": 0.01727861724793911, "kl": 0.2731939684599638, "learning_rate": 8e-05, "loss": 0.0012, "num_tokens": 14972688.0, "reward": 0.24375000596046448, "reward_std": 0.24859894812107086, "rewards/num_nodes_reward/mean": 0.484375, "rewards/num_nodes_reward/std": 0.5017194747924805, "rewards/tree_correctness_reward/mean": 0.140625, "rewards/tree_correctness_reward/std": 0.3490002751350403, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7368776202201843, "sampling/importance_sampling_ratio/min": 3.697744350921214e-32, "sampling/sampling_logp_difference/max": 72.375, "sampling/sampling_logp_difference/mean": 4.259332656860352, "step": 137 }, { "clip_ratio/high_max": 0.023571691708639264, "clip_ratio/high_mean": 0.006808046135120094, "clip_ratio/low_mean": 0.006276801199419424, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01308484748005867, "completions/clipped_ratio": 1.0, "completions/max_length": 367.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 313.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 277.0, "completions/min_terminated_length": 0.0, "entropy": 1.3223612755537033, "epoch": 69.0, "frac_reward_zero_std": 0.1875, "grad_norm": 0.01676156371831894, "kl": 0.2565441932529211, "learning_rate": 8e-05, "loss": -0.0055, "num_tokens": 15076976.0, "reward": 0.32109373807907104, "reward_std": 0.20282673835754395, "rewards/num_nodes_reward/mean": 0.6875, "rewards/num_nodes_reward/std": 0.4653336703777313, "rewards/tree_correctness_reward/mean": 0.1640625, "rewards/tree_correctness_reward/std": 0.371787428855896, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6960285902023315, "sampling/importance_sampling_ratio/min": 8.238372224122146e-30, "sampling/sampling_logp_difference/max": 66.96875, "sampling/sampling_logp_difference/mean": 4.781155109405518, "step": 138 }, { "clip_ratio/high_max": 0.018699733598623425, "clip_ratio/high_mean": 0.005454863596241921, "clip_ratio/low_mean": 0.006340822787024081, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011795686325058341, "completions/clipped_ratio": 1.0, "completions/max_length": 361.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 311.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 287.0, "completions/min_terminated_length": 0.0, "entropy": 1.396705113351345, "epoch": 69.5, "frac_reward_zero_std": 0.25, "grad_norm": 0.017130838707089424, "kl": 0.21293207630515099, "learning_rate": 8e-05, "loss": 0.0026, "num_tokens": 15180944.0, "reward": 0.390625, "reward_std": 0.21911324560642242, "rewards/num_nodes_reward/mean": 0.71875, "rewards/num_nodes_reward/std": 0.4513758420944214, "rewards/tree_correctness_reward/mean": 0.25, "rewards/tree_correctness_reward/std": 0.434714138507843, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6898590326309204, "sampling/importance_sampling_ratio/min": 1.6904042099061981e-29, "sampling/sampling_logp_difference/max": 66.25, "sampling/sampling_logp_difference/mean": 4.874538421630859, "step": 139 }, { "clip_ratio/high_max": 0.03299790224991739, "clip_ratio/high_mean": 0.009733997634612024, "clip_ratio/low_mean": 0.007610989734530449, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017344987485557795, "completions/clipped_ratio": 1.0, "completions/max_length": 341.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 298.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 272.0, "completions/min_terminated_length": 0.0, "entropy": 1.0240770429372787, "epoch": 70.0, "frac_reward_zero_std": 0.0625, "grad_norm": 0.017044972628355026, "kl": 0.2599506340920925, "learning_rate": 8e-05, "loss": -0.0014, "num_tokens": 15283232.0, "reward": 0.2828125059604645, "reward_std": 0.24034598469734192, "rewards/num_nodes_reward/mean": 0.6328125, "rewards/num_nodes_reward/std": 0.4839322865009308, "rewards/tree_correctness_reward/mean": 0.1328125, "rewards/tree_correctness_reward/std": 0.3407054841518402, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7433508634567261, "sampling/importance_sampling_ratio/min": 2.537597470634223e-29, "sampling/sampling_logp_difference/max": 65.84375, "sampling/sampling_logp_difference/mean": 4.169303894042969, "step": 140 }, { "epoch": 70.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 320.0, "eval_completions/max_terminated_length": 0.0, "eval_completions/mean_length": 241.725, "eval_completions/mean_terminated_length": 0.0, "eval_completions/min_length": 193.8, "eval_completions/min_terminated_length": 0.0, "eval_entropy": 0.5203133881092071, "eval_frac_reward_zero_std": 0.3, "eval_kl": 0.32426294684410095, "eval_loss": -0.0017597040859982371, "eval_num_tokens": 15283232.0, "eval_reward": 0.25875000059604647, "eval_reward_std": 0.1889177605509758, "eval_rewards/num_nodes_reward/mean": 0.6875, "eval_rewards/num_nodes_reward/std": 0.38062257766723634, "eval_rewards/tree_correctness_reward/mean": 0.075, "eval_rewards/tree_correctness_reward/std": 0.2683130085468292, "eval_runtime": 38.0469, "eval_samples_per_second": 0.263, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 0.9107447385787963, "eval_sampling/importance_sampling_ratio/min": 1.8312203361415406e-28, "eval_sampling/sampling_logp_difference/max": 65.25625, "eval_sampling/sampling_logp_difference/mean": 1.5877326726913452, "eval_steps_per_second": 0.026, "step": 140 }, { "clip_ratio/high_max": 0.025008225697092712, "clip_ratio/high_mean": 0.007777805905789137, "clip_ratio/low_mean": 0.008181380107998848, "clip_ratio/low_min": 0.00020559210679493845, "clip_ratio/region_mean": 0.015959185198880732, "completions/clipped_ratio": 1.0, "completions/max_length": 356.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 307.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 249.0, "completions/min_terminated_length": 0.0, "entropy": 1.0018879994750023, "epoch": 70.5, "frac_reward_zero_std": 0.0625, "grad_norm": 0.018320756033062935, "kl": 0.2679164446890354, "learning_rate": 8e-05, "loss": -0.0026, "num_tokens": 15386736.0, "reward": 0.2945312261581421, "reward_std": 0.22732296586036682, "rewards/num_nodes_reward/mean": 0.6171875, "rewards/num_nodes_reward/std": 0.4879830479621887, "rewards/tree_correctness_reward/mean": 0.15625, "rewards/tree_correctness_reward/std": 0.3645188808441162, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7513340711593628, "sampling/importance_sampling_ratio/min": 1.0914069705124871e-29, "sampling/sampling_logp_difference/max": 66.6875, "sampling/sampling_logp_difference/mean": 4.015472412109375, "step": 141 }, { "clip_ratio/high_max": 0.01900578266941011, "clip_ratio/high_mean": 0.005367098143324256, "clip_ratio/low_mean": 0.006058025319362059, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011425123200751841, "completions/clipped_ratio": 1.0, "completions/max_length": 364.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 291.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 261.0, "completions/min_terminated_length": 0.0, "entropy": 1.1259435713291168, "epoch": 71.0, "frac_reward_zero_std": 0.25, "grad_norm": 0.019754966720938683, "kl": 0.5063142795115709, "learning_rate": 8e-05, "loss": -0.0006, "num_tokens": 15488224.0, "reward": 0.3343749940395355, "reward_std": 0.20835553109645844, "rewards/num_nodes_reward/mean": 0.6953125, "rewards/num_nodes_reward/std": 0.46208351850509644, "rewards/tree_correctness_reward/mean": 0.1796875, "rewards/tree_correctness_reward/std": 0.3854354918003082, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7376000285148621, "sampling/importance_sampling_ratio/min": 1.2367262089069031e-29, "sampling/sampling_logp_difference/max": 66.5625, "sampling/sampling_logp_difference/mean": 4.247427940368652, "step": 142 }, { "clip_ratio/high_max": 0.03162862313911319, "clip_ratio/high_mean": 0.008634316793177277, "clip_ratio/low_mean": 0.005163151363376528, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013797467923723161, "completions/clipped_ratio": 1.0, "completions/max_length": 360.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 311.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 279.0, "completions/min_terminated_length": 0.0, "entropy": 1.340917892754078, "epoch": 71.5, "frac_reward_zero_std": 0.1875, "grad_norm": 0.01613469235599041, "kl": 0.26185661368072033, "learning_rate": 8e-05, "loss": -0.0007, "num_tokens": 15592176.0, "reward": 0.3695312738418579, "reward_std": 0.20178167521953583, "rewards/num_nodes_reward/mean": 0.703125, "rewards/num_nodes_reward/std": 0.45867621898651123, "rewards/tree_correctness_reward/mean": 0.2265625, "rewards/tree_correctness_reward/std": 0.4202519655227661, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6867132186889648, "sampling/importance_sampling_ratio/min": 7.501123232145174e-30, "sampling/sampling_logp_difference/max": 67.0625, "sampling/sampling_logp_difference/mean": 4.920871734619141, "step": 143 }, { "clip_ratio/high_max": 0.026571481372229755, "clip_ratio/high_mean": 0.007518760539824143, "clip_ratio/low_mean": 0.008646892732940614, "clip_ratio/low_min": 0.0008872954058460891, "clip_ratio/region_mean": 0.01616565336007625, "completions/clipped_ratio": 1.0, "completions/max_length": 354.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 315.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 234.0, "completions/min_terminated_length": 0.0, "entropy": 1.3336425051093102, "epoch": 72.0, "frac_reward_zero_std": 0.0625, "grad_norm": 0.016558421775698662, "kl": 0.24349545128643513, "learning_rate": 8e-05, "loss": -0.0001, "num_tokens": 15696736.0, "reward": 0.2835937440395355, "reward_std": 0.28353041410446167, "rewards/num_nodes_reward/mean": 0.5625, "rewards/num_nodes_reward/std": 0.49802759289741516, "rewards/tree_correctness_reward/mean": 0.1640625, "rewards/tree_correctness_reward/std": 0.371787428855896, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6859455108642578, "sampling/importance_sampling_ratio/min": 1.0578279305422649e-29, "sampling/sampling_logp_difference/max": 66.71875, "sampling/sampling_logp_difference/mean": 4.95201301574707, "step": 144 }, { "clip_ratio/high_max": 0.02230340742971748, "clip_ratio/high_mean": 0.006954966840567067, "clip_ratio/low_mean": 0.009589621040504426, "clip_ratio/low_min": 0.001383804701617919, "clip_ratio/region_mean": 0.016544587444514036, "completions/clipped_ratio": 1.0, "completions/max_length": 360.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 304.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 275.0, "completions/min_terminated_length": 0.0, "entropy": 1.4090240448713303, "epoch": 72.5, "frac_reward_zero_std": 0.0625, "grad_norm": 0.015551316551864147, "kl": 0.2347242869436741, "learning_rate": 8e-05, "loss": 0.0008, "num_tokens": 15799824.0, "reward": 0.3648437559604645, "reward_std": 0.27491357922554016, "rewards/num_nodes_reward/mean": 0.6328125, "rewards/num_nodes_reward/std": 0.4839322865009308, "rewards/tree_correctness_reward/mean": 0.25, "rewards/tree_correctness_reward/std": 0.434714138507843, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.688360869884491, "sampling/importance_sampling_ratio/min": 1.8965809537522895e-30, "sampling/sampling_logp_difference/max": 68.4375, "sampling/sampling_logp_difference/mean": 4.896355628967285, "step": 145 }, { "clip_ratio/high_max": 0.022798147285357118, "clip_ratio/high_mean": 0.006141383637441322, "clip_ratio/low_mean": 0.007906944782007486, "clip_ratio/low_min": 0.00039682540227659047, "clip_ratio/region_mean": 0.014048328041099012, "completions/clipped_ratio": 1.0, "completions/max_length": 346.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 298.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 268.0, "completions/min_terminated_length": 0.0, "entropy": 1.0978063344955444, "epoch": 73.0, "frac_reward_zero_std": 0.0625, "grad_norm": 0.017941683530807495, "kl": 0.2770458124577999, "learning_rate": 8e-05, "loss": 0.0002, "num_tokens": 15902160.0, "reward": 0.2867187559604645, "reward_std": 0.27752575278282166, "rewards/num_nodes_reward/mean": 0.5546875, "rewards/num_nodes_reward/std": 0.4989531338214874, "rewards/tree_correctness_reward/mean": 0.171875, "rewards/tree_correctness_reward/std": 0.3787541687488556, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7352327108383179, "sampling/importance_sampling_ratio/min": 2.5923188203429674e-30, "sampling/sampling_logp_difference/max": 68.125, "sampling/sampling_logp_difference/mean": 4.271994590759277, "step": 146 }, { "clip_ratio/high_max": 0.02003038430120796, "clip_ratio/high_mean": 0.0065869072277564555, "clip_ratio/low_mean": 0.006754421687219292, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013341328711248934, "completions/clipped_ratio": 1.0, "completions/max_length": 397.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 321.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 278.0, "completions/min_terminated_length": 0.0, "entropy": 1.3097603619098663, "epoch": 73.5, "frac_reward_zero_std": 0.0625, "grad_norm": 0.017453007400035858, "kl": 0.22119995765388012, "learning_rate": 8e-05, "loss": 0.0022, "num_tokens": 16007392.0, "reward": 0.3070312440395355, "reward_std": 0.19217300415039062, "rewards/num_nodes_reward/mean": 0.5859375, "rewards/num_nodes_reward/std": 0.49449479579925537, "rewards/tree_correctness_reward/mean": 0.1875, "rewards/tree_correctness_reward/std": 0.39184603095054626, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6986221075057983, "sampling/importance_sampling_ratio/min": 3.5083221349443985e-31, "sampling/sampling_logp_difference/max": 70.125, "sampling/sampling_logp_difference/mean": 4.771892547607422, "step": 147 }, { "clip_ratio/high_max": 0.022479014005512, "clip_ratio/high_mean": 0.005424479604698718, "clip_ratio/low_mean": 0.010020841553341597, "clip_ratio/low_min": 0.0014217901916708797, "clip_ratio/region_mean": 0.015445320983417332, "completions/clipped_ratio": 1.0, "completions/max_length": 370.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 302.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 267.0, "completions/min_terminated_length": 0.0, "entropy": 1.2613011598587036, "epoch": 74.0, "frac_reward_zero_std": 0.0, "grad_norm": 0.018936745822429657, "kl": 0.2640984486788511, "learning_rate": 8e-05, "loss": 0.0027, "num_tokens": 16110240.0, "reward": 0.2562499940395355, "reward_std": 0.30367380380630493, "rewards/num_nodes_reward/mean": 0.5078125, "rewards/num_nodes_reward/std": 0.5019033551216125, "rewards/tree_correctness_reward/mean": 0.1484375, "rewards/tree_correctness_reward/std": 0.356930136680603, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7034671902656555, "sampling/importance_sampling_ratio/min": 7.984904797113095e-30, "sampling/sampling_logp_difference/max": 67.0, "sampling/sampling_logp_difference/mean": 4.719974517822266, "step": 148 }, { "clip_ratio/high_max": 0.032902244944125414, "clip_ratio/high_mean": 0.00953541015041992, "clip_ratio/low_mean": 0.00589190746541135, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015427317470312119, "completions/clipped_ratio": 1.0, "completions/max_length": 344.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 292.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 255.0, "completions/min_terminated_length": 0.0, "entropy": 1.155761443078518, "epoch": 74.5, "frac_reward_zero_std": 0.1875, "grad_norm": 0.1440742462873459, "kl": 2.1975564286112785, "learning_rate": 8e-05, "loss": -0.0003, "num_tokens": 16211856.0, "reward": 0.34687501192092896, "reward_std": 0.20124612748622894, "rewards/num_nodes_reward/mean": 0.71875, "rewards/num_nodes_reward/std": 0.4513758420944214, "rewards/tree_correctness_reward/mean": 0.1875, "rewards/tree_correctness_reward/std": 0.39184603095054626, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7178699970245361, "sampling/importance_sampling_ratio/min": 1.4458794172967342e-29, "sampling/sampling_logp_difference/max": 66.40625, "sampling/sampling_logp_difference/mean": 4.548828125, "step": 149 }, { "clip_ratio/high_max": 0.021231766208074987, "clip_ratio/high_mean": 0.006271047081099823, "clip_ratio/low_mean": 0.008449049608316272, "clip_ratio/low_min": 0.0009356287191621959, "clip_ratio/region_mean": 0.014720096834935248, "completions/clipped_ratio": 1.0, "completions/max_length": 348.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 308.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 253.0, "completions/min_terminated_length": 0.0, "entropy": 1.473582461476326, "epoch": 75.0, "frac_reward_zero_std": 0.0625, "grad_norm": 0.019735658541321754, "kl": 0.22061245888471603, "learning_rate": 8e-05, "loss": 0.0013, "num_tokens": 16315488.0, "reward": 0.29296875, "reward_std": 0.2743210196495056, "rewards/num_nodes_reward/mean": 0.484375, "rewards/num_nodes_reward/std": 0.5017194747924805, "rewards/tree_correctness_reward/mean": 0.2109375, "rewards/tree_correctness_reward/std": 0.4095771610736847, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.672417402267456, "sampling/importance_sampling_ratio/min": 5.0466150436734006e-29, "sampling/sampling_logp_difference/max": 65.15625, "sampling/sampling_logp_difference/mean": 5.128747940063477, "step": 150 }, { "epoch": 75.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 292.2, "eval_completions/max_terminated_length": 0.0, "eval_completions/mean_length": 233.825, "eval_completions/mean_terminated_length": 0.0, "eval_completions/min_length": 179.0, "eval_completions/min_terminated_length": 0.0, "eval_entropy": 0.5566826701164246, "eval_frac_reward_zero_std": 0.3, "eval_kl": 0.34766939878463743, "eval_loss": 0.0010842689080163836, "eval_num_tokens": 16315488.0, "eval_reward": 0.2650000035762787, "eval_reward_std": 0.14464514702558517, "eval_rewards/num_nodes_reward/mean": 0.7375, "eval_rewards/num_nodes_reward/std": 0.3443817555904388, "eval_rewards/tree_correctness_reward/mean": 0.0625, "eval_rewards/tree_correctness_reward/std": 0.18662601709365845, "eval_runtime": 37.3097, "eval_samples_per_second": 0.268, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 0.9059570074081421, "eval_sampling/importance_sampling_ratio/min": 3.260098632846728e-27, "eval_sampling/sampling_logp_difference/max": 63.41875, "eval_sampling/sampling_logp_difference/mean": 1.6923516511917114, "eval_steps_per_second": 0.027, "step": 150 }, { "clip_ratio/high_max": 0.02863226947374642, "clip_ratio/high_mean": 0.005847451102454215, "clip_ratio/low_mean": 0.005829700909089297, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011677152418997139, "completions/clipped_ratio": 1.0, "completions/max_length": 322.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 275.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 241.0, "completions/min_terminated_length": 0.0, "entropy": 1.1844363063573837, "epoch": 75.5, "frac_reward_zero_std": 0.375, "grad_norm": 0.013581766746938229, "kl": 0.22165772877633572, "learning_rate": 8e-05, "loss": 0.0022, "num_tokens": 16414816.0, "reward": 0.390625, "reward_std": 0.14662916958332062, "rewards/num_nodes_reward/mean": 0.71875, "rewards/num_nodes_reward/std": 0.4513758420944214, "rewards/tree_correctness_reward/mean": 0.25, "rewards/tree_correctness_reward/std": 0.434714138507843, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7133288383483887, "sampling/importance_sampling_ratio/min": 1.5239456710387979e-30, "sampling/sampling_logp_difference/max": 68.65625, "sampling/sampling_logp_difference/mean": 4.624045372009277, "step": 151 }, { "clip_ratio/high_max": 0.022641431336523965, "clip_ratio/high_mean": 0.00480798880016664, "clip_ratio/low_mean": 0.005127322598127648, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009935311507433653, "completions/clipped_ratio": 1.0, "completions/max_length": 317.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 284.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 257.0, "completions/min_terminated_length": 0.0, "entropy": 1.0295504555106163, "epoch": 76.0, "frac_reward_zero_std": 0.375, "grad_norm": 0.014385302551090717, "kl": 0.28786694072186947, "learning_rate": 8e-05, "loss": -0.0007, "num_tokens": 16515360.0, "reward": 0.28515625, "reward_std": 0.19044578075408936, "rewards/num_nodes_reward/mean": 0.640625, "rewards/num_nodes_reward/std": 0.481702595949173, "rewards/tree_correctness_reward/mean": 0.1328125, "rewards/tree_correctness_reward/std": 0.3407054841518402, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7509303092956543, "sampling/importance_sampling_ratio/min": 4.409682927674699e-30, "sampling/sampling_logp_difference/max": 67.59375, "sampling/sampling_logp_difference/mean": 4.097110271453857, "step": 152 }, { "clip_ratio/high_max": 0.027152832597494125, "clip_ratio/high_mean": 0.0052112306002527475, "clip_ratio/low_mean": 0.00805140647571534, "clip_ratio/low_min": 0.0004448398540262133, "clip_ratio/region_mean": 0.013262636959552765, "completions/clipped_ratio": 1.0, "completions/max_length": 317.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 272.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 240.0, "completions/min_terminated_length": 0.0, "entropy": 1.039419487118721, "epoch": 76.5, "frac_reward_zero_std": 0.125, "grad_norm": 0.01750550977885723, "kl": 0.30384873785078526, "learning_rate": 8e-05, "loss": 0.0007, "num_tokens": 16614400.0, "reward": 0.265625, "reward_std": 0.19592693448066711, "rewards/num_nodes_reward/mean": 0.703125, "rewards/num_nodes_reward/std": 0.45867621898651123, "rewards/tree_correctness_reward/mean": 0.078125, "rewards/tree_correctness_reward/std": 0.2694226801395416, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7560471892356873, "sampling/importance_sampling_ratio/min": 1.2106200630011295e-28, "sampling/sampling_logp_difference/max": 64.28125, "sampling/sampling_logp_difference/mean": 4.04527473449707, "step": 153 }, { "clip_ratio/high_max": 0.026826681452803314, "clip_ratio/high_mean": 0.0070587074442300946, "clip_ratio/low_mean": 0.008837743662297726, "clip_ratio/low_min": 0.0008780942880548537, "clip_ratio/region_mean": 0.015896450961008668, "completions/clipped_ratio": 1.0, "completions/max_length": 299.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 271.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 236.0, "completions/min_terminated_length": 0.0, "entropy": 1.3848023265600204, "epoch": 77.0, "frac_reward_zero_std": 0.125, "grad_norm": 0.015993298962712288, "kl": 0.2515564914792776, "learning_rate": 8e-05, "loss": -0.0016, "num_tokens": 16713296.0, "reward": 0.38203126192092896, "reward_std": 0.27663442492485046, "rewards/num_nodes_reward/mean": 0.5625, "rewards/num_nodes_reward/std": 0.49802759289741516, "rewards/tree_correctness_reward/mean": 0.3046875, "rewards/tree_correctness_reward/std": 0.46208351850509644, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7001727819442749, "sampling/importance_sampling_ratio/min": 8.584580550654011e-29, "sampling/sampling_logp_difference/max": 64.625, "sampling/sampling_logp_difference/mean": 4.778580665588379, "step": 154 }, { "clip_ratio/high_max": 0.01858298690058291, "clip_ratio/high_mean": 0.005445582093670964, "clip_ratio/low_mean": 0.005288896907586604, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010734478826634586, "completions/clipped_ratio": 1.0, "completions/max_length": 295.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 266.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 244.0, "completions/min_terminated_length": 0.0, "entropy": 1.2209803760051727, "epoch": 77.5, "frac_reward_zero_std": 0.25, "grad_norm": 0.016118600964546204, "kl": 0.2783854827284813, "learning_rate": 8e-05, "loss": -0.0008, "num_tokens": 16811552.0, "reward": 0.2984375059604645, "reward_std": 0.1798608899116516, "rewards/num_nodes_reward/mean": 0.6484375, "rewards/num_nodes_reward/std": 0.4793342351913452, "rewards/tree_correctness_reward/mean": 0.1484375, "rewards/tree_correctness_reward/std": 0.356930136680603, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7170861959457397, "sampling/importance_sampling_ratio/min": 6.479982085325971e-29, "sampling/sampling_logp_difference/max": 64.90625, "sampling/sampling_logp_difference/mean": 4.572415351867676, "step": 155 }, { "clip_ratio/high_max": 0.02124451339477673, "clip_ratio/high_mean": 0.007155340164899826, "clip_ratio/low_mean": 0.008273939078208059, "clip_ratio/low_min": 0.00022893772984389216, "clip_ratio/region_mean": 0.015429279417730868, "completions/clipped_ratio": 1.0, "completions/max_length": 306.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 272.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 244.0, "completions/min_terminated_length": 0.0, "entropy": 1.0892489776015282, "epoch": 78.0, "frac_reward_zero_std": 0.0625, "grad_norm": 0.017177535220980644, "kl": 0.2594756484031677, "learning_rate": 8e-05, "loss": 0.0014, "num_tokens": 16910512.0, "reward": 0.3414062559604645, "reward_std": 0.2753470540046692, "rewards/num_nodes_reward/mean": 0.609375, "rewards/num_nodes_reward/std": 0.4898075461387634, "rewards/tree_correctness_reward/mean": 0.2265625, "rewards/tree_correctness_reward/std": 0.4202519655227661, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7464601993560791, "sampling/importance_sampling_ratio/min": 8.857084591457114e-29, "sampling/sampling_logp_difference/max": 64.59375, "sampling/sampling_logp_difference/mean": 4.178498268127441, "step": 156 }, { "clip_ratio/high_max": 0.024924379074946046, "clip_ratio/high_mean": 0.005452253230032511, "clip_ratio/low_mean": 0.007626071106642485, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013078324031084776, "completions/clipped_ratio": 1.0, "completions/max_length": 280.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 251.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 219.0, "completions/min_terminated_length": 0.0, "entropy": 1.1068796068429947, "epoch": 78.5, "frac_reward_zero_std": 0.25, "grad_norm": 0.023760147392749786, "kl": 0.287541925907135, "learning_rate": 8e-05, "loss": -0.0014, "num_tokens": 17006864.0, "reward": 0.30390626192092896, "reward_std": 0.20832616090774536, "rewards/num_nodes_reward/mean": 0.59375, "rewards/num_nodes_reward/std": 0.4930621087551117, "rewards/tree_correctness_reward/mean": 0.1796875, "rewards/tree_correctness_reward/std": 0.3854354918003082, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7417775988578796, "sampling/importance_sampling_ratio/min": 6.0273218931633014e-30, "sampling/sampling_logp_difference/max": 67.28125, "sampling/sampling_logp_difference/mean": 4.258414268493652, "step": 157 }, { "clip_ratio/high_max": 0.0233972916030325, "clip_ratio/high_mean": 0.00592993083409965, "clip_ratio/low_mean": 0.007428077980875969, "clip_ratio/low_min": 0.00022163119865581393, "clip_ratio/region_mean": 0.013358008931390941, "completions/clipped_ratio": 1.0, "completions/max_length": 339.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 280.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 232.0, "completions/min_terminated_length": 0.0, "entropy": 1.4145057797431946, "epoch": 79.0, "frac_reward_zero_std": 0.0625, "grad_norm": 0.01833372190594673, "kl": 0.27348776161670685, "learning_rate": 8e-05, "loss": -0.002, "num_tokens": 17106832.0, "reward": 0.36406248807907104, "reward_std": 0.27270838618278503, "rewards/num_nodes_reward/mean": 0.6484375, "rewards/num_nodes_reward/std": 0.4793342351913452, "rewards/tree_correctness_reward/mean": 0.2421875, "rewards/tree_correctness_reward/std": 0.4300905168056488, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6991637349128723, "sampling/importance_sampling_ratio/min": 5.718563876253154e-29, "sampling/sampling_logp_difference/max": 65.03125, "sampling/sampling_logp_difference/mean": 4.764675140380859, "step": 158 }, { "clip_ratio/high_max": 0.02243427385110408, "clip_ratio/high_mean": 0.005174358942895196, "clip_ratio/low_mean": 0.008067965391092002, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013242324232123792, "completions/clipped_ratio": 1.0, "completions/max_length": 306.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 266.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 212.0, "completions/min_terminated_length": 0.0, "entropy": 1.429254099726677, "epoch": 79.5, "frac_reward_zero_std": 0.3125, "grad_norm": 0.015116102062165737, "kl": 0.29301965422928333, "learning_rate": 8e-05, "loss": 0.0003, "num_tokens": 17205024.0, "reward": 0.36406248807907104, "reward_std": 0.1804610639810562, "rewards/num_nodes_reward/mean": 0.6484375, "rewards/num_nodes_reward/std": 0.4793342351913452, "rewards/tree_correctness_reward/mean": 0.2421875, "rewards/tree_correctness_reward/std": 0.4300905168056488, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6936541199684143, "sampling/importance_sampling_ratio/min": 1.2367262089069031e-29, "sampling/sampling_logp_difference/max": 66.5625, "sampling/sampling_logp_difference/mean": 4.86217737197876, "step": 159 }, { "clip_ratio/high_max": 0.027557278983294964, "clip_ratio/high_mean": 0.007052414701320231, "clip_ratio/low_mean": 0.007368742488324642, "clip_ratio/low_min": 0.00015508684737142175, "clip_ratio/region_mean": 0.014421157189644873, "completions/clipped_ratio": 1.0, "completions/max_length": 403.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 288.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 239.0, "completions/min_terminated_length": 0.0, "entropy": 1.5316922515630722, "epoch": 80.0, "frac_reward_zero_std": 0.125, "grad_norm": 0.017353909090161324, "kl": 0.30782008543610573, "learning_rate": 8e-05, "loss": -0.0002, "num_tokens": 17306032.0, "reward": 0.2593750059604645, "reward_std": 0.21053007245063782, "rewards/num_nodes_reward/mean": 0.609375, "rewards/num_nodes_reward/std": 0.4898075461387634, "rewards/tree_correctness_reward/mean": 0.109375, "rewards/tree_correctness_reward/std": 0.31333550810813904, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6862719058990479, "sampling/importance_sampling_ratio/min": 5.372093894082752e-29, "sampling/sampling_logp_difference/max": 65.09375, "sampling/sampling_logp_difference/mean": 4.934695243835449, "step": 160 }, { "epoch": 80.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 262.8, "eval_completions/max_terminated_length": 0.0, "eval_completions/mean_length": 209.125, "eval_completions/mean_terminated_length": 0.0, "eval_completions/min_length": 165.4, "eval_completions/min_terminated_length": 0.0, "eval_entropy": 0.6009063720703125, "eval_frac_reward_zero_std": 0.4, "eval_kl": 0.32209721207618713, "eval_loss": 0.0018569286912679672, "eval_num_tokens": 17306032.0, "eval_reward": 0.27999999821186067, "eval_reward_std": 0.1944136753678322, "eval_rewards/num_nodes_reward/mean": 0.7, "eval_rewards/num_nodes_reward/std": 0.38637164831161497, "eval_rewards/tree_correctness_reward/mean": 0.1, "eval_rewards/tree_correctness_reward/std": 0.3049390256404877, "eval_runtime": 34.071, "eval_samples_per_second": 0.294, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 0.9190220355987548, "eval_sampling/importance_sampling_ratio/min": 3.033344306308475e-27, "eval_sampling/sampling_logp_difference/max": 62.16875, "eval_sampling/sampling_logp_difference/mean": 1.524347710609436, "eval_steps_per_second": 0.029, "step": 160 }, { "clip_ratio/high_max": 0.026200907421298325, "clip_ratio/high_mean": 0.00796364905545488, "clip_ratio/low_mean": 0.004787230427609757, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012750879279337823, "completions/clipped_ratio": 1.0, "completions/max_length": 321.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 258.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 196.0, "completions/min_terminated_length": 0.0, "entropy": 1.3951140344142914, "epoch": 80.5, "frac_reward_zero_std": 0.25, "grad_norm": 0.016176912933588028, "kl": 0.2495391145348549, "learning_rate": 8e-05, "loss": 0.0009, "num_tokens": 17403280.0, "reward": 0.4296875, "reward_std": 0.20709669589996338, "rewards/num_nodes_reward/mean": 0.7578125, "rewards/num_nodes_reward/std": 0.4300905168056488, "rewards/tree_correctness_reward/mean": 0.2890625, "rewards/tree_correctness_reward/std": 0.45510825514793396, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6987972259521484, "sampling/importance_sampling_ratio/min": 9.335298413326433e-30, "sampling/sampling_logp_difference/max": 66.84375, "sampling/sampling_logp_difference/mean": 4.820359230041504, "step": 161 }, { "clip_ratio/high_max": 0.016303609998431057, "clip_ratio/high_mean": 0.003815754986135289, "clip_ratio/low_mean": 0.0043639598588924855, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008179714961443096, "completions/clipped_ratio": 1.0, "completions/max_length": 335.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 265.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 227.0, "completions/min_terminated_length": 0.0, "entropy": 1.1761988401412964, "epoch": 81.0, "frac_reward_zero_std": 0.4375, "grad_norm": 0.01723138988018036, "kl": 0.3086940422654152, "learning_rate": 8e-05, "loss": 0.0001, "num_tokens": 17501424.0, "reward": 0.25468751788139343, "reward_std": 0.17375174164772034, "rewards/num_nodes_reward/mean": 0.5390625, "rewards/num_nodes_reward/std": 0.5004304051399231, "rewards/tree_correctness_reward/mean": 0.1328125, "rewards/tree_correctness_reward/std": 0.3407054841518402, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7358721494674683, "sampling/importance_sampling_ratio/min": 9.048082073307534e-30, "sampling/sampling_logp_difference/max": 66.875, "sampling/sampling_logp_difference/mean": 4.297650337219238, "step": 162 }, { "clip_ratio/high_max": 0.02213797194417566, "clip_ratio/high_mean": 0.005945661949226633, "clip_ratio/low_mean": 0.005783340806374326, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011729002755600959, "completions/clipped_ratio": 1.0, "completions/max_length": 277.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 266.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 240.0, "completions/min_terminated_length": 0.0, "entropy": 1.3539886772632599, "epoch": 81.5, "frac_reward_zero_std": 0.3125, "grad_norm": 0.014390775933861732, "kl": 0.27391311898827553, "learning_rate": 8e-05, "loss": -0.001, "num_tokens": 17599632.0, "reward": 0.40000003576278687, "reward_std": 0.19197945296764374, "rewards/num_nodes_reward/mean": 0.6953125, "rewards/num_nodes_reward/std": 0.46208351850509644, "rewards/tree_correctness_reward/mean": 0.2734375, "rewards/tree_correctness_reward/std": 0.447474867105484, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7144193649291992, "sampling/importance_sampling_ratio/min": 2.333531109663677e-28, "sampling/sampling_logp_difference/max": 63.625, "sampling/sampling_logp_difference/mean": 4.605173587799072, "step": 163 }, { "clip_ratio/high_max": 0.022983363480307162, "clip_ratio/high_mean": 0.007259948702994734, "clip_ratio/low_mean": 0.008802653173916042, "clip_ratio/low_min": 0.0006820436829002574, "clip_ratio/region_mean": 0.016062601818703115, "completions/clipped_ratio": 1.0, "completions/max_length": 336.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 284.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 252.0, "completions/min_terminated_length": 0.0, "entropy": 1.509288340806961, "epoch": 82.0, "frac_reward_zero_std": 0.0625, "grad_norm": 0.023527711629867554, "kl": 0.3221615869551897, "learning_rate": 8e-05, "loss": 0.0009, "num_tokens": 17700176.0, "reward": 0.28593748807907104, "reward_std": 0.23411770164966583, "rewards/num_nodes_reward/mean": 0.625, "rewards/num_nodes_reward/std": 0.4860251843929291, "rewards/tree_correctness_reward/mean": 0.140625, "rewards/tree_correctness_reward/std": 0.3490002751350403, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6784208416938782, "sampling/importance_sampling_ratio/min": 2.4595237313441332e-29, "sampling/sampling_logp_difference/max": 65.875, "sampling/sampling_logp_difference/mean": 5.08829402923584, "step": 164 }, { "clip_ratio/high_max": 0.0196476464625448, "clip_ratio/high_mean": 0.00469570021959953, "clip_ratio/low_mean": 0.006024837726727128, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010720537742599845, "completions/clipped_ratio": 1.0, "completions/max_length": 334.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 270.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 243.0, "completions/min_terminated_length": 0.0, "entropy": 1.3038495182991028, "epoch": 82.5, "frac_reward_zero_std": 0.25, "grad_norm": 0.016648728400468826, "kl": 0.2886706832796335, "learning_rate": 8e-05, "loss": -0.0006, "num_tokens": 17798880.0, "reward": 0.32343751192092896, "reward_std": 0.22959469258785248, "rewards/num_nodes_reward/mean": 0.5859375, "rewards/num_nodes_reward/std": 0.49449479579925537, "rewards/tree_correctness_reward/mean": 0.2109375, "rewards/tree_correctness_reward/std": 0.4095771610736847, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7259455919265747, "sampling/importance_sampling_ratio/min": 1.7072478214393567e-28, "sampling/sampling_logp_difference/max": 63.9375, "sampling/sampling_logp_difference/mean": 4.437117576599121, "step": 165 }, { "clip_ratio/high_max": 0.02758353273384273, "clip_ratio/high_mean": 0.0078549844911322, "clip_ratio/low_mean": 0.007039969030302018, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014894953463226557, "completions/clipped_ratio": 1.0, "completions/max_length": 349.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 279.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 245.0, "completions/min_terminated_length": 0.0, "entropy": 1.332153894007206, "epoch": 83.0, "frac_reward_zero_std": 0.0625, "grad_norm": 0.020474927499890327, "kl": 0.306723952293396, "learning_rate": 8e-05, "loss": 0.0014, "num_tokens": 17898832.0, "reward": 0.3578124940395355, "reward_std": 0.2870534658432007, "rewards/num_nodes_reward/mean": 0.6640625, "rewards/num_nodes_reward/std": 0.47417303919792175, "rewards/tree_correctness_reward/mean": 0.2265625, "rewards/tree_correctness_reward/std": 0.4202519655227661, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7137020230293274, "sampling/importance_sampling_ratio/min": 1.63839597639079e-29, "sampling/sampling_logp_difference/max": 66.28125, "sampling/sampling_logp_difference/mean": 4.5988264083862305, "step": 166 }, { "clip_ratio/high_max": 0.021666851360350847, "clip_ratio/high_mean": 0.00654818257316947, "clip_ratio/low_mean": 0.006177016970468685, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012725199805572629, "completions/clipped_ratio": 1.0, "completions/max_length": 404.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 305.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 263.0, "completions/min_terminated_length": 0.0, "entropy": 1.6179018169641495, "epoch": 83.5, "frac_reward_zero_std": 0.125, "grad_norm": 0.017899855971336365, "kl": 0.2522517014294863, "learning_rate": 8e-05, "loss": 0.0041, "num_tokens": 18002032.0, "reward": 0.30546873807907104, "reward_std": 0.25698789954185486, "rewards/num_nodes_reward/mean": 0.671875, "rewards/num_nodes_reward/std": 0.4713755249977112, "rewards/tree_correctness_reward/mean": 0.1484375, "rewards/tree_correctness_reward/std": 0.356930136680603, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6693464517593384, "sampling/importance_sampling_ratio/min": 2.701258182483389e-29, "sampling/sampling_logp_difference/max": 65.78125, "sampling/sampling_logp_difference/mean": 5.169867992401123, "step": 167 }, { "clip_ratio/high_max": 0.03922474733553827, "clip_ratio/high_mean": 0.009278804354835302, "clip_ratio/low_mean": 0.008524733304511756, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017803537310101092, "completions/clipped_ratio": 1.0, "completions/max_length": 311.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 276.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 236.0, "completions/min_terminated_length": 0.0, "entropy": 1.373923547565937, "epoch": 84.0, "frac_reward_zero_std": 0.0625, "grad_norm": 0.01893892139196396, "kl": 0.28938567265868187, "learning_rate": 8e-05, "loss": -0.0003, "num_tokens": 18101488.0, "reward": 0.35234373807907104, "reward_std": 0.3096430003643036, "rewards/num_nodes_reward/mean": 0.5546875, "rewards/num_nodes_reward/std": 0.4989531338214874, "rewards/tree_correctness_reward/mean": 0.265625, "rewards/tree_correctness_reward/std": 0.44340085983276367, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.70479416847229, "sampling/importance_sampling_ratio/min": 7.116867858635386e-29, "sampling/sampling_logp_difference/max": 64.8125, "sampling/sampling_logp_difference/mean": 4.734579086303711, "step": 168 }, { "clip_ratio/high_max": 0.019284790963865817, "clip_ratio/high_mean": 0.0065988545247819275, "clip_ratio/low_mean": 0.00681968207936734, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01341853616759181, "completions/clipped_ratio": 1.0, "completions/max_length": 304.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 280.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 261.0, "completions/min_terminated_length": 0.0, "entropy": 1.3647940903902054, "epoch": 84.5, "frac_reward_zero_std": 0.0625, "grad_norm": 0.019097259268164635, "kl": 0.24355414882302284, "learning_rate": 8e-05, "loss": -0.0012, "num_tokens": 18201456.0, "reward": 0.375, "reward_std": 0.2934407591819763, "rewards/num_nodes_reward/mean": 0.59375, "rewards/num_nodes_reward/std": 0.4930621087551117, "rewards/tree_correctness_reward/mean": 0.28125, "rewards/tree_correctness_reward/std": 0.4513758420944214, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7063145637512207, "sampling/importance_sampling_ratio/min": 2.701258182483389e-29, "sampling/sampling_logp_difference/max": 65.78125, "sampling/sampling_logp_difference/mean": 4.72279167175293, "step": 169 }, { "clip_ratio/high_max": 0.029747829656116664, "clip_ratio/high_mean": 0.008977173885796219, "clip_ratio/low_mean": 0.007914574176538736, "clip_ratio/low_min": 0.00027056277031078935, "clip_ratio/region_mean": 0.016891747945919633, "completions/clipped_ratio": 1.0, "completions/max_length": 343.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 283.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 231.0, "completions/min_terminated_length": 0.0, "entropy": 1.2574879378080368, "epoch": 85.0, "frac_reward_zero_std": 0.0, "grad_norm": 0.018645018339157104, "kl": 0.30087384954094887, "learning_rate": 8e-05, "loss": -0.0014, "num_tokens": 18301840.0, "reward": 0.22734376788139343, "reward_std": 0.2344072461128235, "rewards/num_nodes_reward/mean": 0.5390625, "rewards/num_nodes_reward/std": 0.5004304051399231, "rewards/tree_correctness_reward/mean": 0.09375, "rewards/tree_correctness_reward/std": 0.29262590408325195, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7397449016571045, "sampling/importance_sampling_ratio/min": 2.103742028355895e-29, "sampling/sampling_logp_difference/max": 66.03125, "sampling/sampling_logp_difference/mean": 4.224130153656006, "step": 170 }, { "epoch": 85.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 301.0, "eval_completions/max_terminated_length": 0.0, "eval_completions/mean_length": 216.7, "eval_completions/mean_terminated_length": 0.0, "eval_completions/min_length": 166.6, "eval_completions/min_terminated_length": 0.0, "eval_entropy": 0.7499956011772155, "eval_frac_reward_zero_std": 0.2, "eval_kl": 0.34277952909469606, "eval_loss": 0.005831228569149971, "eval_num_tokens": 18301840.0, "eval_reward": 0.14125000163912774, "eval_reward_std": 0.14571774750947952, "eval_rewards/num_nodes_reward/mean": 0.4125, "eval_rewards/num_nodes_reward/std": 0.4376549541950226, "eval_rewards/tree_correctness_reward/mean": 0.025, "eval_rewards/tree_correctness_reward/std": 0.1, "eval_runtime": 38.1577, "eval_samples_per_second": 0.262, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 0.8980738759040833, "eval_sampling/importance_sampling_ratio/min": 1.5058171618015466e-28, "eval_sampling/sampling_logp_difference/max": 64.36875, "eval_sampling/sampling_logp_difference/mean": 1.8168490886688233, "eval_steps_per_second": 0.026, "step": 170 }, { "clip_ratio/high_max": 0.033081351895816624, "clip_ratio/high_mean": 0.009316105919424444, "clip_ratio/low_mean": 0.005388646386563778, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014704752364195883, "completions/clipped_ratio": 1.0, "completions/max_length": 325.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 286.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 224.0, "completions/min_terminated_length": 0.0, "entropy": 1.427287220954895, "epoch": 85.5, "frac_reward_zero_std": 0.0625, "grad_norm": 0.018717218190431595, "kl": 0.2852567285299301, "learning_rate": 8e-05, "loss": -0.0008, "num_tokens": 18402672.0, "reward": 0.23281249403953552, "reward_std": 0.2451014667749405, "rewards/num_nodes_reward/mean": 0.484375, "rewards/num_nodes_reward/std": 0.5017194747924805, "rewards/tree_correctness_reward/mean": 0.125, "rewards/tree_correctness_reward/std": 0.3320184051990509, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7078977823257446, "sampling/importance_sampling_ratio/min": 1.6222316689370394e-30, "sampling/sampling_logp_difference/max": 68.59375, "sampling/sampling_logp_difference/mean": 4.663632392883301, "step": 171 }, { "clip_ratio/high_max": 0.019984879298135638, "clip_ratio/high_mean": 0.00669236428802833, "clip_ratio/low_mean": 0.006738755677361041, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013431119848974049, "completions/clipped_ratio": 1.0, "completions/max_length": 359.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 280.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 219.0, "completions/min_terminated_length": 0.0, "entropy": 1.4355349093675613, "epoch": 86.0, "frac_reward_zero_std": 0.125, "grad_norm": 0.01684500463306904, "kl": 0.25394849479198456, "learning_rate": 8e-05, "loss": -0.0004, "num_tokens": 18502720.0, "reward": 0.3499999940395355, "reward_std": 0.2905277609825134, "rewards/num_nodes_reward/mean": 0.546875, "rewards/num_nodes_reward/std": 0.4997538626194, "rewards/tree_correctness_reward/mean": 0.265625, "rewards/tree_correctness_reward/std": 0.44340085983276367, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7014011144638062, "sampling/importance_sampling_ratio/min": 3.3617703013855747e-29, "sampling/sampling_logp_difference/max": 65.5625, "sampling/sampling_logp_difference/mean": 4.796557903289795, "step": 172 }, { "clip_ratio/high_max": 0.02659254998434335, "clip_ratio/high_mean": 0.007317429641261697, "clip_ratio/low_mean": 0.0074032716220244765, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01472070172894746, "completions/clipped_ratio": 1.0, "completions/max_length": 306.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 280.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 239.0, "completions/min_terminated_length": 0.0, "entropy": 1.4168401956558228, "epoch": 86.5, "frac_reward_zero_std": 0.125, "grad_norm": 0.04474812373518944, "kl": 0.26837258599698544, "learning_rate": 8e-05, "loss": -0.0007, "num_tokens": 18602688.0, "reward": 0.2992187440395355, "reward_std": 0.19436343014240265, "rewards/num_nodes_reward/mean": 0.578125, "rewards/num_nodes_reward/std": 0.4957992732524872, "rewards/tree_correctness_reward/mean": 0.1796875, "rewards/tree_correctness_reward/std": 0.3854354918003082, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7016750574111938, "sampling/importance_sampling_ratio/min": 6.479982085325971e-29, "sampling/sampling_logp_difference/max": 64.90625, "sampling/sampling_logp_difference/mean": 4.7917656898498535, "step": 173 }, { "clip_ratio/high_max": 0.026526062283664942, "clip_ratio/high_mean": 0.00648493118933402, "clip_ratio/low_mean": 0.006241094728466123, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012726025888696313, "completions/clipped_ratio": 1.0, "completions/max_length": 331.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 264.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 234.0, "completions/min_terminated_length": 0.0, "entropy": 1.3848496675491333, "epoch": 87.0, "frac_reward_zero_std": 0.1875, "grad_norm": 0.01434396579861641, "kl": 0.23133928515017033, "learning_rate": 8e-05, "loss": 0.0, "num_tokens": 18700720.0, "reward": 0.2890625, "reward_std": 0.24480833113193512, "rewards/num_nodes_reward/mean": 0.5078125, "rewards/num_nodes_reward/std": 0.5019033551216125, "rewards/tree_correctness_reward/mean": 0.1953125, "rewards/tree_correctness_reward/std": 0.3979988098144531, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7175954580307007, "sampling/importance_sampling_ratio/min": 7.1986098550006065e-31, "sampling/sampling_logp_difference/max": 69.40625, "sampling/sampling_logp_difference/mean": 4.579862594604492, "step": 174 }, { "clip_ratio/high_max": 0.028241605730727315, "clip_ratio/high_mean": 0.010801075608469546, "clip_ratio/low_mean": 0.005793687421828508, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01659476337954402, "completions/clipped_ratio": 1.0, "completions/max_length": 315.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 283.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 244.0, "completions/min_terminated_length": 0.0, "entropy": 1.4576978385448456, "epoch": 87.5, "frac_reward_zero_std": 0.0625, "grad_norm": 0.01825234293937683, "kl": 0.23876949213445187, "learning_rate": 8e-05, "loss": -0.0008, "num_tokens": 18801136.0, "reward": 0.3101562559604645, "reward_std": 0.23874546587467194, "rewards/num_nodes_reward/mean": 0.578125, "rewards/num_nodes_reward/std": 0.4957992732524872, "rewards/tree_correctness_reward/mean": 0.1953125, "rewards/tree_correctness_reward/std": 0.3979988098144531, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6967538595199585, "sampling/importance_sampling_ratio/min": 2.1491077972015863e-30, "sampling/sampling_logp_difference/max": 68.3125, "sampling/sampling_logp_difference/mean": 4.8037285804748535, "step": 175 }, { "clip_ratio/high_max": 0.019515096792019904, "clip_ratio/high_mean": 0.005960183567367494, "clip_ratio/low_mean": 0.005774337856564671, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011734521598555148, "completions/clipped_ratio": 1.0, "completions/max_length": 365.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 281.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 212.0, "completions/min_terminated_length": 0.0, "entropy": 1.4753764122724533, "epoch": 88.0, "frac_reward_zero_std": 0.125, "grad_norm": 0.020577091723680496, "kl": 0.2704933322966099, "learning_rate": 8e-05, "loss": -0.001, "num_tokens": 18901280.0, "reward": 0.3031250238418579, "reward_std": 0.2483447939157486, "rewards/num_nodes_reward/mean": 0.5546875, "rewards/num_nodes_reward/std": 0.4989531338214874, "rewards/tree_correctness_reward/mean": 0.1953125, "rewards/tree_correctness_reward/std": 0.3979988098144531, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.699561595916748, "sampling/importance_sampling_ratio/min": 4.549661547287001e-30, "sampling/sampling_logp_difference/max": 67.5625, "sampling/sampling_logp_difference/mean": 4.7967400550842285, "step": 176 }, { "clip_ratio/high_max": 0.03305169939994812, "clip_ratio/high_mean": 0.009873877745121717, "clip_ratio/low_mean": 0.004732866596896201, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014606744283810258, "completions/clipped_ratio": 1.0, "completions/max_length": 409.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 294.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 206.0, "completions/min_terminated_length": 0.0, "entropy": 1.5283161997795105, "epoch": 88.5, "frac_reward_zero_std": 0.0625, "grad_norm": 0.018892087042331696, "kl": 0.24232917465269566, "learning_rate": 8e-05, "loss": -0.0032, "num_tokens": 19003088.0, "reward": 0.3531250059604645, "reward_std": 0.2702024281024933, "rewards/num_nodes_reward/mean": 0.5390625, "rewards/num_nodes_reward/std": 0.5004304051399231, "rewards/tree_correctness_reward/mean": 0.2734375, "rewards/tree_correctness_reward/std": 0.447474867105484, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6801394820213318, "sampling/importance_sampling_ratio/min": 1.6737267850623815e-30, "sampling/sampling_logp_difference/max": 68.5625, "sampling/sampling_logp_difference/mean": 5.061403274536133, "step": 177 }, { "clip_ratio/high_max": 0.023890757584013045, "clip_ratio/high_mean": 0.007401508220937103, "clip_ratio/low_mean": 0.008079939114395529, "clip_ratio/low_min": 0.00021258502965793014, "clip_ratio/region_mean": 0.015481446869671345, "completions/clipped_ratio": 1.0, "completions/max_length": 294.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 268.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 215.0, "completions/min_terminated_length": 0.0, "entropy": 1.2177962809801102, "epoch": 89.0, "frac_reward_zero_std": 0.0625, "grad_norm": 0.01740807481110096, "kl": 0.2849873714148998, "learning_rate": 8e-05, "loss": -0.001, "num_tokens": 19101520.0, "reward": 0.31640625, "reward_std": 0.24487492442131042, "rewards/num_nodes_reward/mean": 0.671875, "rewards/num_nodes_reward/std": 0.4713755249977112, "rewards/tree_correctness_reward/mean": 0.1640625, "rewards/tree_correctness_reward/std": 0.371787428855896, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7281813621520996, "sampling/importance_sampling_ratio/min": 5.784245648136424e-31, "sampling/sampling_logp_difference/max": 69.625, "sampling/sampling_logp_difference/mean": 4.4136152267456055, "step": 178 }, { "clip_ratio/high_max": 0.012254519970156252, "clip_ratio/high_mean": 0.003975365121732466, "clip_ratio/low_mean": 0.00649744804832153, "clip_ratio/low_min": 0.00020627063349820673, "clip_ratio/region_mean": 0.010472813388332725, "completions/clipped_ratio": 1.0, "completions/max_length": 342.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 284.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 252.0, "completions/min_terminated_length": 0.0, "entropy": 1.6771805584430695, "epoch": 89.5, "frac_reward_zero_std": 0.1875, "grad_norm": 0.013857672922313213, "kl": 0.22496209479868412, "learning_rate": 8e-05, "loss": 0.0, "num_tokens": 19202064.0, "reward": 0.375, "reward_std": 0.24042272567749023, "rewards/num_nodes_reward/mean": 0.59375, "rewards/num_nodes_reward/std": 0.4930621087551117, "rewards/tree_correctness_reward/mean": 0.28125, "rewards/tree_correctness_reward/std": 0.4513758420944214, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6720218062400818, "sampling/importance_sampling_ratio/min": 7.501123232145174e-30, "sampling/sampling_logp_difference/max": 67.0625, "sampling/sampling_logp_difference/mean": 5.145228385925293, "step": 179 }, { "clip_ratio/high_max": 0.02050487359520048, "clip_ratio/high_mean": 0.006737605173839256, "clip_ratio/low_mean": 0.007323658501263708, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014061263645999134, "completions/clipped_ratio": 1.0, "completions/max_length": 335.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 290.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 229.0, "completions/min_terminated_length": 0.0, "entropy": 1.2350883930921555, "epoch": 90.0, "frac_reward_zero_std": 0.0625, "grad_norm": 0.017039036378264427, "kl": 0.23602011613547802, "learning_rate": 8e-05, "loss": 0.0004, "num_tokens": 19303312.0, "reward": 0.2953125238418579, "reward_std": 0.2303306609392166, "rewards/num_nodes_reward/mean": 0.65625, "rewards/num_nodes_reward/std": 0.47682511806488037, "rewards/tree_correctness_reward/mean": 0.140625, "rewards/tree_correctness_reward/std": 0.3490002751350403, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7330542802810669, "sampling/importance_sampling_ratio/min": 1.0252820466629127e-29, "sampling/sampling_logp_difference/max": 66.75, "sampling/sampling_logp_difference/mean": 4.322797775268555, "step": 180 }, { "epoch": 90.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 334.8, "eval_completions/max_terminated_length": 0.0, "eval_completions/mean_length": 244.15, "eval_completions/mean_terminated_length": 0.0, "eval_completions/min_length": 183.0, "eval_completions/min_terminated_length": 0.0, "eval_entropy": 0.7776754260063171, "eval_frac_reward_zero_std": 0.1, "eval_kl": 0.2631746083498001, "eval_loss": -0.0025983233936131, "eval_num_tokens": 19303312.0, "eval_reward": 0.15249999463558198, "eval_reward_std": 0.21977685987949372, "eval_rewards/num_nodes_reward/mean": 0.3625, "eval_rewards/num_nodes_reward/std": 0.44476498365402223, "eval_rewards/tree_correctness_reward/mean": 0.0625, "eval_rewards/tree_correctness_reward/std": 0.21831300854682922, "eval_runtime": 41.5751, "eval_samples_per_second": 0.241, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 0.884695303440094, "eval_sampling/importance_sampling_ratio/min": 7.729978770822245e-28, "eval_sampling/sampling_logp_difference/max": 64.39375, "eval_sampling/sampling_logp_difference/mean": 1.9868874549865723, "eval_steps_per_second": 0.024, "step": 180 }, { "clip_ratio/high_max": 0.018017688882537186, "clip_ratio/high_mean": 0.006531948136398569, "clip_ratio/low_mean": 0.007387098157778382, "clip_ratio/low_min": 0.0013130186998751014, "clip_ratio/region_mean": 0.013919046497903764, "completions/clipped_ratio": 1.0, "completions/max_length": 335.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 298.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 273.0, "completions/min_terminated_length": 0.0, "entropy": 1.409109115600586, "epoch": 90.5, "frac_reward_zero_std": 0.0625, "grad_norm": 0.01793644204735756, "kl": 0.20374773815274239, "learning_rate": 8e-05, "loss": -0.003, "num_tokens": 19405632.0, "reward": 0.31328123807907104, "reward_std": 0.276388943195343, "rewards/num_nodes_reward/mean": 0.625, "rewards/num_nodes_reward/std": 0.4860251843929291, "rewards/tree_correctness_reward/mean": 0.1796875, "rewards/tree_correctness_reward/std": 0.3854354918003082, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7087727785110474, "sampling/importance_sampling_ratio/min": 8.238372224122146e-30, "sampling/sampling_logp_difference/max": 66.96875, "sampling/sampling_logp_difference/mean": 4.6336164474487305, "step": 181 }, { "clip_ratio/high_max": 0.021266067982651293, "clip_ratio/high_mean": 0.006830809928942472, "clip_ratio/low_mean": 0.007364800258073956, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014195610652677715, "completions/clipped_ratio": 1.0, "completions/max_length": 332.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 275.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 218.0, "completions/min_terminated_length": 0.0, "entropy": 1.3035294562578201, "epoch": 91.0, "frac_reward_zero_std": 0.0625, "grad_norm": 0.017657192423939705, "kl": 0.24759200029075146, "learning_rate": 8e-05, "loss": 0.0004, "num_tokens": 19504976.0, "reward": 0.3257812559604645, "reward_std": 0.2755081057548523, "rewards/num_nodes_reward/mean": 0.484375, "rewards/num_nodes_reward/std": 0.5017194747924805, "rewards/tree_correctness_reward/mean": 0.2578125, "rewards/tree_correctness_reward/std": 0.43914902210235596, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7174037098884583, "sampling/importance_sampling_ratio/min": 7.739234377113275e-30, "sampling/sampling_logp_difference/max": 67.03125, "sampling/sampling_logp_difference/mean": 4.582976818084717, "step": 182 }, { "clip_ratio/high_max": 0.020621206029318273, "clip_ratio/high_mean": 0.0068034269934287295, "clip_ratio/low_mean": 0.007041032193228602, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01384445873554796, "completions/clipped_ratio": 1.0, "completions/max_length": 330.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 299.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 262.0, "completions/min_terminated_length": 0.0, "entropy": 1.5680060535669327, "epoch": 91.5, "frac_reward_zero_std": 0.125, "grad_norm": 0.016982441768050194, "kl": 0.22435804270207882, "learning_rate": 8e-05, "loss": -0.0002, "num_tokens": 19607392.0, "reward": 0.4937500059604645, "reward_std": 0.24584883451461792, "rewards/num_nodes_reward/mean": 0.6796875, "rewards/num_nodes_reward/std": 0.4684300124645233, "rewards/tree_correctness_reward/mean": 0.4140625, "rewards/tree_correctness_reward/std": 0.49449479579925537, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6777727603912354, "sampling/importance_sampling_ratio/min": 4.8430893875347414e-30, "sampling/sampling_logp_difference/max": 67.5, "sampling/sampling_logp_difference/mean": 5.073945999145508, "step": 183 }, { "clip_ratio/high_max": 0.023525719298049808, "clip_ratio/high_mean": 0.007351312611717731, "clip_ratio/low_mean": 0.0070830152835696936, "clip_ratio/low_min": 0.00016149871225934476, "clip_ratio/region_mean": 0.014434327487833798, "completions/clipped_ratio": 1.0, "completions/max_length": 387.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 310.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 268.0, "completions/min_terminated_length": 0.0, "entropy": 1.5578247904777527, "epoch": 92.0, "frac_reward_zero_std": 0.0625, "grad_norm": 0.021541016176342964, "kl": 0.24714725464582443, "learning_rate": 8e-05, "loss": 0.0029, "num_tokens": 19711200.0, "reward": 0.23750001192092896, "reward_std": 0.25867000222206116, "rewards/num_nodes_reward/mean": 0.5, "rewards/num_nodes_reward/std": 0.5019646286964417, "rewards/tree_correctness_reward/mean": 0.125, "rewards/tree_correctness_reward/std": 0.3320184051990509, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6873108148574829, "sampling/importance_sampling_ratio/min": 2.082986710165573e-30, "sampling/sampling_logp_difference/max": 68.34375, "sampling/sampling_logp_difference/mean": 4.918642997741699, "step": 184 }, { "clip_ratio/high_max": 0.023402475635521114, "clip_ratio/high_mean": 0.007462222449248657, "clip_ratio/low_mean": 0.004876815917668864, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012339038425125182, "completions/clipped_ratio": 1.0, "completions/max_length": 417.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 325.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 278.0, "completions/min_terminated_length": 0.0, "entropy": 1.7627083361148834, "epoch": 92.5, "frac_reward_zero_std": 0.1875, "grad_norm": 0.01580158807337284, "kl": 0.24765130691230297, "learning_rate": 8e-05, "loss": 0.0005, "num_tokens": 19816944.0, "reward": 0.27421873807907104, "reward_std": 0.191839799284935, "rewards/num_nodes_reward/mean": 0.5859375, "rewards/num_nodes_reward/std": 0.49449479579925537, "rewards/tree_correctness_reward/mean": 0.140625, "rewards/tree_correctness_reward/std": 0.3490002751350403, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6587610244750977, "sampling/importance_sampling_ratio/min": 1.3448773503633471e-30, "sampling/sampling_logp_difference/max": 68.78125, "sampling/sampling_logp_difference/mean": 5.274637699127197, "step": 185 }, { "clip_ratio/high_max": 0.02018162701278925, "clip_ratio/high_mean": 0.007052422457491048, "clip_ratio/low_mean": 0.006674901640508324, "clip_ratio/low_min": 0.00018996960716322064, "clip_ratio/region_mean": 0.013727323501370847, "completions/clipped_ratio": 1.0, "completions/max_length": 364.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 306.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 268.0, "completions/min_terminated_length": 0.0, "entropy": 1.5968713164329529, "epoch": 93.0, "frac_reward_zero_std": 0.125, "grad_norm": 0.016651958227157593, "kl": 0.2597046624869108, "learning_rate": 8e-05, "loss": 0.0001, "num_tokens": 19920288.0, "reward": 0.33671873807907104, "reward_std": 0.250440776348114, "rewards/num_nodes_reward/mean": 0.59375, "rewards/num_nodes_reward/std": 0.4930621087551117, "rewards/tree_correctness_reward/mean": 0.2265625, "rewards/tree_correctness_reward/std": 0.4202519655227661, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6781532764434814, "sampling/importance_sampling_ratio/min": 1.4316144454031779e-30, "sampling/sampling_logp_difference/max": 68.71875, "sampling/sampling_logp_difference/mean": 5.034563064575195, "step": 186 }, { "clip_ratio/high_max": 0.022600414929911494, "clip_ratio/high_mean": 0.007097787165548652, "clip_ratio/low_mean": 0.0056477578764315695, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012745545129291713, "completions/clipped_ratio": 1.0, "completions/max_length": 348.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 300.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 245.0, "completions/min_terminated_length": 0.0, "entropy": 1.3966753333806992, "epoch": 93.5, "frac_reward_zero_std": 0.1875, "grad_norm": 0.3811348080635071, "kl": 5.840161630883813, "learning_rate": 8e-05, "loss": 0.001, "num_tokens": 20022864.0, "reward": 0.30000001192092896, "reward_std": 0.22083806991577148, "rewards/num_nodes_reward/mean": 0.6171875, "rewards/num_nodes_reward/std": 0.4879830479621887, "rewards/tree_correctness_reward/mean": 0.1640625, "rewards/tree_correctness_reward/std": 0.371787428855896, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7187724709510803, "sampling/importance_sampling_ratio/min": 1.6737267850623815e-30, "sampling/sampling_logp_difference/max": 68.5625, "sampling/sampling_logp_difference/mean": 4.5109405517578125, "step": 187 }, { "clip_ratio/high_max": 0.029464378720149398, "clip_ratio/high_mean": 0.008312451711390167, "clip_ratio/low_mean": 0.006690647918730974, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015003100270405412, "completions/clipped_ratio": 1.0, "completions/max_length": 405.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 319.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 263.0, "completions/min_terminated_length": 0.0, "entropy": 1.68578739464283, "epoch": 94.0, "frac_reward_zero_std": 0.0625, "grad_norm": 0.021556168794631958, "kl": 0.2305986639112234, "learning_rate": 8e-05, "loss": -0.0003, "num_tokens": 20127904.0, "reward": 0.3359375, "reward_std": 0.22975577414035797, "rewards/num_nodes_reward/mean": 0.6640625, "rewards/num_nodes_reward/std": 0.47417303919792175, "rewards/tree_correctness_reward/mean": 0.1953125, "rewards/tree_correctness_reward/std": 0.3979988098144531, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6662974953651428, "sampling/importance_sampling_ratio/min": 1.0252820466629127e-29, "sampling/sampling_logp_difference/max": 66.75, "sampling/sampling_logp_difference/mean": 5.183964729309082, "step": 188 }, { "clip_ratio/high_max": 0.011418264650274068, "clip_ratio/high_mean": 0.004113806295208633, "clip_ratio/low_mean": 0.007494517078157514, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01160832354798913, "completions/clipped_ratio": 1.0, "completions/max_length": 388.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 321.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 251.0, "completions/min_terminated_length": 0.0, "entropy": 1.830347865819931, "epoch": 94.5, "frac_reward_zero_std": 0.125, "grad_norm": 0.023768307641148567, "kl": 0.20553884282708168, "learning_rate": 8e-05, "loss": -0.0003, "num_tokens": 20233200.0, "reward": 0.3460937738418579, "reward_std": 0.23252537846565247, "rewards/num_nodes_reward/mean": 0.625, "rewards/num_nodes_reward/std": 0.4860251843929291, "rewards/tree_correctness_reward/mean": 0.2265625, "rewards/tree_correctness_reward/std": 0.4202519655227661, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6603444218635559, "sampling/importance_sampling_ratio/min": 3.2583394374284734e-29, "sampling/sampling_logp_difference/max": 65.59375, "sampling/sampling_logp_difference/mean": 5.253490447998047, "step": 189 }, { "clip_ratio/high_max": 0.020226672990247607, "clip_ratio/high_mean": 0.006648910348303616, "clip_ratio/low_mean": 0.007492211647331715, "clip_ratio/low_min": 0.00038580247201025486, "clip_ratio/region_mean": 0.014141121762804687, "completions/clipped_ratio": 1.0, "completions/max_length": 348.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 285.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 220.0, "completions/min_terminated_length": 0.0, "entropy": 1.513109266757965, "epoch": 95.0, "frac_reward_zero_std": 0.0, "grad_norm": 0.019232146441936493, "kl": 0.2652201410382986, "learning_rate": 8e-05, "loss": -0.0007, "num_tokens": 20333856.0, "reward": 0.31562501192092896, "reward_std": 0.25090163946151733, "rewards/num_nodes_reward/mean": 0.6328125, "rewards/num_nodes_reward/std": 0.4839322865009308, "rewards/tree_correctness_reward/mean": 0.1796875, "rewards/tree_correctness_reward/std": 0.3854354918003082, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6976497769355774, "sampling/importance_sampling_ratio/min": 2.966751838288923e-29, "sampling/sampling_logp_difference/max": 65.6875, "sampling/sampling_logp_difference/mean": 4.804313659667969, "step": 190 }, { "epoch": 95.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 294.8, "eval_completions/max_terminated_length": 0.0, "eval_completions/mean_length": 226.525, "eval_completions/mean_terminated_length": 0.0, "eval_completions/min_length": 165.2, "eval_completions/min_terminated_length": 0.0, "eval_entropy": 0.8139205574989319, "eval_frac_reward_zero_std": 0.2, "eval_kl": 0.3276823401451111, "eval_loss": 0.0014853342436254025, "eval_num_tokens": 20333856.0, "eval_reward": 0.13875000402331353, "eval_reward_std": 0.1540715828537941, "eval_rewards/num_nodes_reward/mean": 0.375, "eval_rewards/num_nodes_reward/std": 0.43377997279167174, "eval_rewards/tree_correctness_reward/mean": 0.0375, "eval_rewards/tree_correctness_reward/std": 0.11831300854682922, "eval_runtime": 38.2158, "eval_samples_per_second": 0.262, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 0.8884372353553772, "eval_sampling/importance_sampling_ratio/min": 1.308484024046567e-27, "eval_sampling/sampling_logp_difference/max": 62.625, "eval_sampling/sampling_logp_difference/mean": 1.9602711915969848, "eval_steps_per_second": 0.026, "step": 190 }, { "clip_ratio/high_max": 0.020915500237606466, "clip_ratio/high_mean": 0.00587960088159889, "clip_ratio/low_mean": 0.00894976983545348, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01482937065884471, "completions/clipped_ratio": 1.0, "completions/max_length": 364.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 301.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 247.0, "completions/min_terminated_length": 0.0, "entropy": 1.4669725596904755, "epoch": 95.5, "frac_reward_zero_std": 0.125, "grad_norm": 0.018544264137744904, "kl": 0.22382770664989948, "learning_rate": 8e-05, "loss": 0.0027, "num_tokens": 20436512.0, "reward": 0.26875001192092896, "reward_std": 0.20410732924938202, "rewards/num_nodes_reward/mean": 0.6953125, "rewards/num_nodes_reward/std": 0.46208351850509644, "rewards/tree_correctness_reward/mean": 0.0859375, "rewards/tree_correctness_reward/std": 0.2813730239868164, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7178261280059814, "sampling/importance_sampling_ratio/min": 7.342781246671332e-29, "sampling/sampling_logp_difference/max": 64.78125, "sampling/sampling_logp_difference/mean": 4.523100852966309, "step": 191 }, { "clip_ratio/high_max": 0.02769683615770191, "clip_ratio/high_mean": 0.008401289291214198, "clip_ratio/low_mean": 0.004417364136315882, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01281865336932242, "completions/clipped_ratio": 1.0, "completions/max_length": 367.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 295.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 206.0, "completions/min_terminated_length": 0.0, "entropy": 1.7504733353853226, "epoch": 96.0, "frac_reward_zero_std": 0.125, "grad_norm": 0.01733982376754284, "kl": 0.2343532033264637, "learning_rate": 8e-05, "loss": 0.0002, "num_tokens": 20538400.0, "reward": 0.34062498807907104, "reward_std": 0.22420969605445862, "rewards/num_nodes_reward/mean": 0.625, "rewards/num_nodes_reward/std": 0.4860251843929291, "rewards/tree_correctness_reward/mean": 0.21875, "rewards/tree_correctness_reward/std": 0.41502299904823303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6695427894592285, "sampling/importance_sampling_ratio/min": 1.0252820466629127e-29, "sampling/sampling_logp_difference/max": 66.75, "sampling/sampling_logp_difference/mean": 5.149251461029053, "step": 192 }, { "clip_ratio/high_max": 0.019897880498319864, "clip_ratio/high_mean": 0.004961848899256438, "clip_ratio/low_mean": 0.005787328802398406, "clip_ratio/low_min": 0.00024900399148464203, "clip_ratio/region_mean": 0.010749178123660386, "completions/clipped_ratio": 1.0, "completions/max_length": 324.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 290.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 251.0, "completions/min_terminated_length": 0.0, "entropy": 1.429119423031807, "epoch": 96.5, "frac_reward_zero_std": 0.1875, "grad_norm": 0.019379908218979836, "kl": 0.24594655074179173, "learning_rate": 8e-05, "loss": 0.0011, "num_tokens": 20639696.0, "reward": 0.28984373807907104, "reward_std": 0.2510574758052826, "rewards/num_nodes_reward/mean": 0.65625, "rewards/num_nodes_reward/std": 0.47682511806488037, "rewards/tree_correctness_reward/mean": 0.1328125, "rewards/tree_correctness_reward/std": 0.3407054841518402, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7213782072067261, "sampling/importance_sampling_ratio/min": 5.542621749889874e-29, "sampling/sampling_logp_difference/max": 65.0625, "sampling/sampling_logp_difference/mean": 4.4773664474487305, "step": 193 }, { "clip_ratio/high_max": 0.027132830931805074, "clip_ratio/high_mean": 0.009992622071877122, "clip_ratio/low_mean": 0.00419604082708247, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014188662986271083, "completions/clipped_ratio": 1.0, "completions/max_length": 335.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 280.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 218.0, "completions/min_terminated_length": 0.0, "entropy": 1.6722991466522217, "epoch": 97.0, "frac_reward_zero_std": 0.0625, "grad_norm": 0.016174377873539925, "kl": 0.23446959257125854, "learning_rate": 8e-05, "loss": -0.0008, "num_tokens": 20739664.0, "reward": 0.36015623807907104, "reward_std": 0.19859999418258667, "rewards/num_nodes_reward/mean": 0.78125, "rewards/num_nodes_reward/std": 0.41502299904823303, "rewards/tree_correctness_reward/mean": 0.1796875, "rewards/tree_correctness_reward/std": 0.3854354918003082, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6783707141876221, "sampling/importance_sampling_ratio/min": 4.3165985204907103e-29, "sampling/sampling_logp_difference/max": 65.3125, "sampling/sampling_logp_difference/mean": 5.100043773651123, "step": 194 }, { "clip_ratio/high_max": 0.026676195906475186, "clip_ratio/high_mean": 0.008921720465878025, "clip_ratio/low_mean": 0.005481305968714878, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014403026085346937, "completions/clipped_ratio": 1.0, "completions/max_length": 357.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 284.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 232.0, "completions/min_terminated_length": 0.0, "entropy": 1.7699875235557556, "epoch": 97.5, "frac_reward_zero_std": 0.125, "grad_norm": 0.016524629667401314, "kl": 0.2245592325925827, "learning_rate": 8e-05, "loss": -0.0004, "num_tokens": 20840208.0, "reward": 0.3812499940395355, "reward_std": 0.2128911316394806, "rewards/num_nodes_reward/mean": 0.796875, "rewards/num_nodes_reward/std": 0.40390563011169434, "rewards/tree_correctness_reward/mean": 0.203125, "rewards/tree_correctness_reward/std": 0.40390563011169434, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6587755680084229, "sampling/importance_sampling_ratio/min": 1.587987769361992e-29, "sampling/sampling_logp_difference/max": 66.3125, "sampling/sampling_logp_difference/mean": 5.358015060424805, "step": 195 }, { "clip_ratio/high_max": 0.02138559950981289, "clip_ratio/high_mean": 0.006536108179716393, "clip_ratio/low_mean": 0.005903464130824432, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01243957260157913, "completions/clipped_ratio": 1.0, "completions/max_length": 303.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 275.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 230.0, "completions/min_terminated_length": 0.0, "entropy": 1.48151433467865, "epoch": 98.0, "frac_reward_zero_std": 0.0625, "grad_norm": 0.017531322315335274, "kl": 0.3049443420022726, "learning_rate": 8e-05, "loss": 0.002, "num_tokens": 20939584.0, "reward": 0.23750001192092896, "reward_std": 0.19987830519676208, "rewards/num_nodes_reward/mean": 0.609375, "rewards/num_nodes_reward/std": 0.4898075461387634, "rewards/tree_correctness_reward/mean": 0.078125, "rewards/tree_correctness_reward/std": 0.2694226801395416, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7117818593978882, "sampling/importance_sampling_ratio/min": 1.63839597639079e-29, "sampling/sampling_logp_difference/max": 66.28125, "sampling/sampling_logp_difference/mean": 4.636693954467773, "step": 196 }, { "clip_ratio/high_max": 0.022882161720190197, "clip_ratio/high_mean": 0.007847858127206564, "clip_ratio/low_mean": 0.0032058516080724075, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011053709895350039, "completions/clipped_ratio": 1.0, "completions/max_length": 326.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 280.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "entropy": 1.7840345948934555, "epoch": 98.5, "frac_reward_zero_std": 0.3125, "grad_norm": 0.016948217526078224, "kl": 0.2652716990560293, "learning_rate": 8e-05, "loss": -0.0007, "num_tokens": 21039648.0, "reward": 0.3984375298023224, "reward_std": 0.19357767701148987, "rewards/num_nodes_reward/mean": 0.7265625, "rewards/num_nodes_reward/std": 0.447474867105484, "rewards/tree_correctness_reward/mean": 0.2578125, "rewards/tree_correctness_reward/std": 0.43914902210235596, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6499286890029907, "sampling/importance_sampling_ratio/min": 1.4917765848206356e-29, "sampling/sampling_logp_difference/max": 66.375, "sampling/sampling_logp_difference/mean": 5.498523235321045, "step": 197 }, { "clip_ratio/high_max": 0.02140058355871588, "clip_ratio/high_mean": 0.005900571384700015, "clip_ratio/low_mean": 0.004518298548646271, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010418870369903743, "completions/clipped_ratio": 1.0, "completions/max_length": 274.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 251.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 217.0, "completions/min_terminated_length": 0.0, "entropy": 1.3060125038027763, "epoch": 99.0, "frac_reward_zero_std": 0.25, "grad_norm": 0.018743472173810005, "kl": 0.2703985311090946, "learning_rate": 8e-05, "loss": -0.0013, "num_tokens": 21135968.0, "reward": 0.2132812738418579, "reward_std": 0.14588363468647003, "rewards/num_nodes_reward/mean": 0.546875, "rewards/num_nodes_reward/std": 0.4997538626194, "rewards/tree_correctness_reward/mean": 0.0703125, "rewards/tree_correctness_reward/std": 0.2566775679588318, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7292590141296387, "sampling/importance_sampling_ratio/min": 6.619718383830818e-30, "sampling/sampling_logp_difference/max": 67.1875, "sampling/sampling_logp_difference/mean": 4.473660469055176, "step": 198 }, { "clip_ratio/high_max": 0.022005202597938478, "clip_ratio/high_mean": 0.005613323592115194, "clip_ratio/low_mean": 0.005146120500285178, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01075944397598505, "completions/clipped_ratio": 1.0, "completions/max_length": 328.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 273.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 232.0, "completions/min_terminated_length": 0.0, "entropy": 1.6402310281991959, "epoch": 99.5, "frac_reward_zero_std": 0.25, "grad_norm": 0.01818041503429413, "kl": 0.2540849316865206, "learning_rate": 8e-05, "loss": -0.0019, "num_tokens": 21235056.0, "reward": 0.2757812440395355, "reward_std": 0.1597360372543335, "rewards/num_nodes_reward/mean": 0.609375, "rewards/num_nodes_reward/std": 0.4898075461387634, "rewards/tree_correctness_reward/mean": 0.1328125, "rewards/tree_correctness_reward/std": 0.3407054841518402, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6811355352401733, "sampling/importance_sampling_ratio/min": 1.7816729014961325e-30, "sampling/sampling_logp_difference/max": 68.5, "sampling/sampling_logp_difference/mean": 5.078385353088379, "step": 199 }, { "clip_ratio/high_max": 0.02287518448429182, "clip_ratio/high_mean": 0.004679984020185657, "clip_ratio/low_mean": 0.004928496957290918, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009608480904716998, "completions/clipped_ratio": 1.0, "completions/max_length": 302.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 262.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 235.0, "completions/min_terminated_length": 0.0, "entropy": 1.4595999419689178, "epoch": 100.0, "frac_reward_zero_std": 0.3125, "grad_norm": 0.02159377746284008, "kl": 0.282560758292675, "learning_rate": 8e-05, "loss": 0.0015, "num_tokens": 21332832.0, "reward": 0.2710937261581421, "reward_std": 0.16946053504943848, "rewards/num_nodes_reward/mean": 0.59375, "rewards/num_nodes_reward/std": 0.4930621087551117, "rewards/tree_correctness_reward/mean": 0.1328125, "rewards/tree_correctness_reward/std": 0.3407054841518402, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7102721929550171, "sampling/importance_sampling_ratio/min": 2.103742028355895e-29, "sampling/sampling_logp_difference/max": 66.03125, "sampling/sampling_logp_difference/mean": 4.704805374145508, "step": 200 }, { "epoch": 100.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 248.8, "eval_completions/max_terminated_length": 0.0, "eval_completions/mean_length": 200.425, "eval_completions/mean_terminated_length": 0.0, "eval_completions/min_length": 159.0, "eval_completions/min_terminated_length": 0.0, "eval_entropy": 0.8354318499565124, "eval_frac_reward_zero_std": 0.4, "eval_kl": 0.36596168875694274, "eval_loss": -0.0030207892414182425, "eval_num_tokens": 21332832.0, "eval_reward": 0.10124999880790711, "eval_reward_std": 0.13291654139757156, "eval_rewards/num_nodes_reward/mean": 0.25, "eval_rewards/num_nodes_reward/std": 0.3095080256462097, "eval_rewards/tree_correctness_reward/mean": 0.0375, "eval_rewards/tree_correctness_reward/std": 0.11831300854682922, "eval_runtime": 30.2441, "eval_samples_per_second": 0.331, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 0.8926485657691956, "eval_sampling/importance_sampling_ratio/min": 3.8029870639423113e-28, "eval_sampling/sampling_logp_difference/max": 64.05, "eval_sampling/sampling_logp_difference/mean": 1.958072543144226, "eval_steps_per_second": 0.033, "step": 200 }, { "clip_ratio/high_max": 0.02365729189477861, "clip_ratio/high_mean": 0.007762351131532341, "clip_ratio/low_mean": 0.0038741017051506788, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011636452923994511, "completions/clipped_ratio": 1.0, "completions/max_length": 369.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 283.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 220.0, "completions/min_terminated_length": 0.0, "entropy": 1.7644225805997849, "epoch": 100.5, "frac_reward_zero_std": 0.1875, "grad_norm": 0.01773136295378208, "kl": 0.25113457441329956, "learning_rate": 8e-05, "loss": -0.0, "num_tokens": 21433184.0, "reward": 0.2671875059604645, "reward_std": 0.16184386610984802, "rewards/num_nodes_reward/mean": 0.6171875, "rewards/num_nodes_reward/std": 0.4879830479621887, "rewards/tree_correctness_reward/mean": 0.1171875, "rewards/tree_correctness_reward/std": 0.322907418012619, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6568517684936523, "sampling/importance_sampling_ratio/min": 1.5723209311175096e-30, "sampling/sampling_logp_difference/max": 68.625, "sampling/sampling_logp_difference/mean": 5.421789169311523, "step": 201 }, { "clip_ratio/high_max": 0.029644534224644303, "clip_ratio/high_mean": 0.009506475762464106, "clip_ratio/low_mean": 0.006011978373862803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015518454019911587, "completions/clipped_ratio": 1.0, "completions/max_length": 297.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 245.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 225.0, "completions/min_terminated_length": 0.0, "entropy": 1.623191237449646, "epoch": 101.0, "frac_reward_zero_std": 0.125, "grad_norm": 0.07896304875612259, "kl": 0.3002929035574198, "learning_rate": 8e-05, "loss": -0.0013, "num_tokens": 21528720.0, "reward": 0.36015623807907104, "reward_std": 0.21513843536376953, "rewards/num_nodes_reward/mean": 0.7265625, "rewards/num_nodes_reward/std": 0.447474867105484, "rewards/tree_correctness_reward/mean": 0.203125, "rewards/tree_correctness_reward/std": 0.40390563011169434, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.669711709022522, "sampling/importance_sampling_ratio/min": 1.5391306882548106e-29, "sampling/sampling_logp_difference/max": 66.34375, "sampling/sampling_logp_difference/mean": 5.282166481018066, "step": 202 }, { "clip_ratio/high_max": 0.023982305894605815, "clip_ratio/high_mean": 0.008395560609642416, "clip_ratio/low_mean": 0.005304125777911395, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013699686736799777, "completions/clipped_ratio": 1.0, "completions/max_length": 341.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 259.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 193.0, "completions/min_terminated_length": 0.0, "entropy": 1.5072964131832123, "epoch": 101.5, "frac_reward_zero_std": 0.0625, "grad_norm": 0.034010715782642365, "kl": 0.30211471393704414, "learning_rate": 8e-05, "loss": -0.0027, "num_tokens": 21626064.0, "reward": 0.31562501192092896, "reward_std": 0.23251399397850037, "rewards/num_nodes_reward/mean": 0.6875, "rewards/num_nodes_reward/std": 0.4653336703777313, "rewards/tree_correctness_reward/mean": 0.15625, "rewards/tree_correctness_reward/std": 0.3645188808441162, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7038673758506775, "sampling/importance_sampling_ratio/min": 3.434264486209825e-30, "sampling/sampling_logp_difference/max": 67.84375, "sampling/sampling_logp_difference/mean": 4.814756870269775, "step": 203 }, { "clip_ratio/high_max": 0.019030557479709387, "clip_ratio/high_mean": 0.005392763967392966, "clip_ratio/low_mean": 0.0032524616981390864, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008645225723739713, "completions/clipped_ratio": 1.0, "completions/max_length": 296.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 257.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 203.0, "completions/min_terminated_length": 0.0, "entropy": 1.5413818359375, "epoch": 102.0, "frac_reward_zero_std": 0.4375, "grad_norm": 0.01503215916454792, "kl": 0.25805808417499065, "learning_rate": 8e-05, "loss": 0.0027, "num_tokens": 21723104.0, "reward": 0.35624998807907104, "reward_std": 0.1225004494190216, "rewards/num_nodes_reward/mean": 0.8046875, "rewards/num_nodes_reward/std": 0.3979988098144531, "rewards/tree_correctness_reward/mean": 0.1640625, "rewards/tree_correctness_reward/std": 0.371787428855896, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7008283138275146, "sampling/importance_sampling_ratio/min": 8.238372224122146e-30, "sampling/sampling_logp_difference/max": 66.96875, "sampling/sampling_logp_difference/mean": 4.835282325744629, "step": 204 }, { "clip_ratio/high_max": 0.01860949647380039, "clip_ratio/high_mean": 0.00515569859999232, "clip_ratio/low_mean": 0.005530470923986286, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010686169378459454, "completions/clipped_ratio": 1.0, "completions/max_length": 285.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 235.0, "completions/min_terminated_length": 0.0, "entropy": 1.7681178152561188, "epoch": 102.5, "frac_reward_zero_std": 0.25, "grad_norm": 0.015823645517230034, "kl": 0.2827087752521038, "learning_rate": 8e-05, "loss": 0.0003, "num_tokens": 21820016.0, "reward": 0.37187498807907104, "reward_std": 0.17316080629825592, "rewards/num_nodes_reward/mean": 0.7109375, "rewards/num_nodes_reward/std": 0.45510825514793396, "rewards/tree_correctness_reward/mean": 0.2265625, "rewards/tree_correctness_reward/std": 0.4202519655227661, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.645512044429779, "sampling/importance_sampling_ratio/min": 3.060926502377367e-29, "sampling/sampling_logp_difference/max": 65.65625, "sampling/sampling_logp_difference/mean": 5.586297988891602, "step": 205 }, { "clip_ratio/high_max": 0.01854479289613664, "clip_ratio/high_mean": 0.004820860340259969, "clip_ratio/low_mean": 0.005860238612513058, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010681098909117281, "completions/clipped_ratio": 1.0, "completions/max_length": 322.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 264.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 214.0, "completions/min_terminated_length": 0.0, "entropy": 1.627752423286438, "epoch": 103.0, "frac_reward_zero_std": 0.25, "grad_norm": 0.01734820567071438, "kl": 0.26768303848803043, "learning_rate": 8e-05, "loss": -0.0009, "num_tokens": 21917984.0, "reward": 0.3226562738418579, "reward_std": 0.17262934148311615, "rewards/num_nodes_reward/mean": 0.8203125, "rewards/num_nodes_reward/std": 0.3854354918003082, "rewards/tree_correctness_reward/mean": 0.109375, "rewards/tree_correctness_reward/std": 0.31333550810813904, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6800965666770935, "sampling/importance_sampling_ratio/min": 4.9968252407126834e-30, "sampling/sampling_logp_difference/max": 67.46875, "sampling/sampling_logp_difference/mean": 5.1192708015441895, "step": 206 }, { "clip_ratio/high_max": 0.018181635532528162, "clip_ratio/high_mean": 0.0044434470910346135, "clip_ratio/low_mean": 0.004978103999746963, "clip_ratio/low_min": 0.0003196930920239538, "clip_ratio/region_mean": 0.009421551018022, "completions/clipped_ratio": 1.0, "completions/max_length": 391.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 268.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 206.0, "completions/min_terminated_length": 0.0, "entropy": 2.0927814841270447, "epoch": 103.5, "frac_reward_zero_std": 0.1875, "grad_norm": 0.019796239212155342, "kl": 0.2444443702697754, "learning_rate": 8e-05, "loss": 0.0019, "num_tokens": 22016448.0, "reward": 0.390625, "reward_std": 0.24367755651474, "rewards/num_nodes_reward/mean": 0.7734375, "rewards/num_nodes_reward/std": 0.4202519655227661, "rewards/tree_correctness_reward/mean": 0.2265625, "rewards/tree_correctness_reward/std": 0.4202519655227661, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.610334038734436, "sampling/importance_sampling_ratio/min": 1.7994259883065448e-29, "sampling/sampling_logp_difference/max": 66.1875, "sampling/sampling_logp_difference/mean": 6.014393329620361, "step": 207 }, { "clip_ratio/high_max": 0.024127913173288107, "clip_ratio/high_mean": 0.00557922397274524, "clip_ratio/low_mean": 0.005038623683503829, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010617847787216306, "completions/clipped_ratio": 1.0, "completions/max_length": 346.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 177.0, "completions/min_terminated_length": 0.0, "entropy": 1.7466848492622375, "epoch": 104.0, "frac_reward_zero_std": 0.25, "grad_norm": 0.018417060375213623, "kl": 0.2793179899454117, "learning_rate": 8e-05, "loss": -0.0024, "num_tokens": 22113424.0, "reward": 0.31562501192092896, "reward_std": 0.18356101214885712, "rewards/num_nodes_reward/mean": 0.6875, "rewards/num_nodes_reward/std": 0.4653336703777313, "rewards/tree_correctness_reward/mean": 0.15625, "rewards/tree_correctness_reward/std": 0.3645188808441162, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.659046471118927, "sampling/importance_sampling_ratio/min": 1.0252820466629127e-29, "sampling/sampling_logp_difference/max": 66.75, "sampling/sampling_logp_difference/mean": 5.414454936981201, "step": 208 }, { "clip_ratio/high_max": 0.028481169254519045, "clip_ratio/high_mean": 0.007777331629768014, "clip_ratio/low_mean": 0.005960670532658696, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013738001813180745, "completions/clipped_ratio": 1.0, "completions/max_length": 233.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 217.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 193.0, "completions/min_terminated_length": 0.0, "entropy": 1.5112451761960983, "epoch": 104.5, "frac_reward_zero_std": 0.25, "grad_norm": 0.019619986414909363, "kl": 0.29853687807917595, "learning_rate": 8e-05, "loss": -0.0023, "num_tokens": 22205424.0, "reward": 0.39140623807907104, "reward_std": 0.15881308913230896, "rewards/num_nodes_reward/mean": 0.6484375, "rewards/num_nodes_reward/std": 0.4793342351913452, "rewards/tree_correctness_reward/mean": 0.28125, "rewards/tree_correctness_reward/std": 0.4513758420944214, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.698881983757019, "sampling/importance_sampling_ratio/min": 1.0252820466629127e-29, "sampling/sampling_logp_difference/max": 66.75, "sampling/sampling_logp_difference/mean": 4.913680076599121, "step": 209 }, { "clip_ratio/high_max": 0.02157585381064564, "clip_ratio/high_mean": 0.005174745863769203, "clip_ratio/low_mean": 0.0046337274252437055, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009808473289012909, "completions/clipped_ratio": 1.0, "completions/max_length": 308.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 245.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 211.0, "completions/min_terminated_length": 0.0, "entropy": 1.5128308683633804, "epoch": 105.0, "frac_reward_zero_std": 0.375, "grad_norm": 0.02468598634004593, "kl": 0.3004851471632719, "learning_rate": 8e-05, "loss": -0.0017, "num_tokens": 22300944.0, "reward": 0.23515625298023224, "reward_std": 0.13479170203208923, "rewards/num_nodes_reward/mean": 0.65625, "rewards/num_nodes_reward/std": 0.47682511806488037, "rewards/tree_correctness_reward/mean": 0.0546875, "rewards/tree_correctness_reward/std": 0.22826264798641205, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6970168352127075, "sampling/importance_sampling_ratio/min": 1.1260519675679445e-29, "sampling/sampling_logp_difference/max": 66.65625, "sampling/sampling_logp_difference/mean": 4.9128499031066895, "step": 210 }, { "epoch": 105.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 221.2, "eval_completions/max_terminated_length": 0.0, "eval_completions/mean_length": 177.55, "eval_completions/mean_terminated_length": 0.0, "eval_completions/min_length": 134.0, "eval_completions/min_terminated_length": 0.0, "eval_entropy": 0.9048738241195678, "eval_frac_reward_zero_std": 0.4, "eval_kl": 1.5183478772640229, "eval_loss": 0.0005431682802736759, "eval_num_tokens": 22300944.0, "eval_reward": 0.17625000178813935, "eval_reward_std": 0.13030225932598113, "eval_rewards/num_nodes_reward/mean": 0.5, "eval_rewards/num_nodes_reward/std": 0.2947649836540222, "eval_rewards/tree_correctness_reward/mean": 0.0375, "eval_rewards/tree_correctness_reward/std": 0.15, "eval_runtime": 29.6022, "eval_samples_per_second": 0.338, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 0.88685622215271, "eval_sampling/importance_sampling_ratio/min": 1.5917373852411378e-27, "eval_sampling/sampling_logp_difference/max": 62.225, "eval_sampling/sampling_logp_difference/mean": 2.049546813964844, "eval_steps_per_second": 0.034, "step": 210 }, { "clip_ratio/high_max": 0.01763117016525939, "clip_ratio/high_mean": 0.005178008985240012, "clip_ratio/low_mean": 0.005545248161070049, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010723257320933044, "completions/clipped_ratio": 1.0, "completions/max_length": 296.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 233.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 215.0, "completions/min_terminated_length": 0.0, "entropy": 1.6476407498121262, "epoch": 105.5, "frac_reward_zero_std": 0.25, "grad_norm": 0.019710587337613106, "kl": 0.3016149625182152, "learning_rate": 8e-05, "loss": -0.0006, "num_tokens": 22394896.0, "reward": 0.40937501192092896, "reward_std": 0.19425983726978302, "rewards/num_nodes_reward/mean": 0.78125, "rewards/num_nodes_reward/std": 0.41502299904823303, "rewards/tree_correctness_reward/mean": 0.25, "rewards/tree_correctness_reward/std": 0.434714138507843, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6675548553466797, "sampling/importance_sampling_ratio/min": 5.784245648136424e-31, "sampling/sampling_logp_difference/max": 69.625, "sampling/sampling_logp_difference/mean": 5.317447662353516, "step": 211 }, { "clip_ratio/high_max": 0.015356820193119347, "clip_ratio/high_mean": 0.004445754624612164, "clip_ratio/low_mean": 0.004257053369656205, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008702807943336666, "completions/clipped_ratio": 1.0, "completions/max_length": 366.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 268.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 214.0, "completions/min_terminated_length": 0.0, "entropy": 1.729636862874031, "epoch": 106.0, "frac_reward_zero_std": 0.3125, "grad_norm": 0.019859829917550087, "kl": 0.2832673769444227, "learning_rate": 8e-05, "loss": 0.0014, "num_tokens": 22493424.0, "reward": 0.3109375238418579, "reward_std": 0.17016439139842987, "rewards/num_nodes_reward/mean": 0.78125, "rewards/num_nodes_reward/std": 0.41502299904823303, "rewards/tree_correctness_reward/mean": 0.109375, "rewards/tree_correctness_reward/std": 0.31333550810813904, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6737052202224731, "sampling/importance_sampling_ratio/min": 6.0273218931633014e-30, "sampling/sampling_logp_difference/max": 67.28125, "sampling/sampling_logp_difference/mean": 5.183588981628418, "step": 212 }, { "clip_ratio/high_max": 0.012216553674079478, "clip_ratio/high_mean": 0.00197416078299284, "clip_ratio/low_mean": 0.0032209535856964067, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005195114237722009, "completions/clipped_ratio": 1.0, "completions/max_length": 329.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 250.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 219.0, "completions/min_terminated_length": 0.0, "entropy": 1.696972906589508, "epoch": 106.5, "frac_reward_zero_std": 0.5625, "grad_norm": 0.011618039570748806, "kl": 0.27543069049715996, "learning_rate": 8e-05, "loss": 0.0009, "num_tokens": 22589584.0, "reward": 0.2984375059604645, "reward_std": 0.11503194272518158, "rewards/num_nodes_reward/mean": 0.6484375, "rewards/num_nodes_reward/std": 0.4793342351913452, "rewards/tree_correctness_reward/mean": 0.1484375, "rewards/tree_correctness_reward/std": 0.356930136680603, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6652777194976807, "sampling/importance_sampling_ratio/min": 1.4013943476265886e-29, "sampling/sampling_logp_difference/max": 66.4375, "sampling/sampling_logp_difference/mean": 5.332718849182129, "step": 213 }, { "clip_ratio/high_max": 0.015252115146722645, "clip_ratio/high_mean": 0.003415066887100693, "clip_ratio/low_mean": 0.006080952822230756, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009496019629295915, "completions/clipped_ratio": 1.0, "completions/max_length": 344.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 261.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 222.0, "completions/min_terminated_length": 0.0, "entropy": 1.7433791905641556, "epoch": 107.0, "frac_reward_zero_std": 0.3125, "grad_norm": 0.01836307905614376, "kl": 0.34765414893627167, "learning_rate": 8e-05, "loss": 0.0008, "num_tokens": 22687232.0, "reward": 0.2671875059604645, "reward_std": 0.17309916019439697, "rewards/num_nodes_reward/mean": 0.671875, "rewards/num_nodes_reward/std": 0.4713755249977112, "rewards/tree_correctness_reward/mean": 0.09375, "rewards/tree_correctness_reward/std": 0.29262590408325195, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6762189865112305, "sampling/importance_sampling_ratio/min": 3.3617703013855747e-29, "sampling/sampling_logp_difference/max": 65.5625, "sampling/sampling_logp_difference/mean": 5.11811637878418, "step": 214 }, { "clip_ratio/high_max": 0.017875692050438374, "clip_ratio/high_mean": 0.0037782827639603056, "clip_ratio/low_mean": 0.004266485921107233, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008044768706895411, "completions/clipped_ratio": 1.0, "completions/max_length": 272.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 252.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 229.0, "completions/min_terminated_length": 0.0, "entropy": 1.6619960367679596, "epoch": 107.5, "frac_reward_zero_std": 0.4375, "grad_norm": 0.016561802476644516, "kl": 0.3548151236027479, "learning_rate": 8e-05, "loss": 0.0001, "num_tokens": 22783616.0, "reward": 0.19609375298023224, "reward_std": 0.13331842422485352, "rewards/num_nodes_reward/mean": 0.5078125, "rewards/num_nodes_reward/std": 0.5019033551216125, "rewards/tree_correctness_reward/mean": 0.0625, "rewards/tree_correctness_reward/std": 0.24301259219646454, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6756213307380676, "sampling/importance_sampling_ratio/min": 1.0252820466629127e-29, "sampling/sampling_logp_difference/max": 66.75, "sampling/sampling_logp_difference/mean": 5.184983253479004, "step": 215 }, { "clip_ratio/high_max": 0.021470553474500775, "clip_ratio/high_mean": 0.0051959839765913785, "clip_ratio/low_mean": 0.005131017445819452, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010327001626137644, "completions/clipped_ratio": 1.0, "completions/max_length": 268.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 241.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 222.0, "completions/min_terminated_length": 0.0, "entropy": 1.7313107401132584, "epoch": 108.0, "frac_reward_zero_std": 0.375, "grad_norm": 0.01661527529358864, "kl": 0.3297047019004822, "learning_rate": 8e-05, "loss": 0.0003, "num_tokens": 22878592.0, "reward": 0.39453125, "reward_std": 0.16437053680419922, "rewards/num_nodes_reward/mean": 0.75, "rewards/num_nodes_reward/std": 0.434714138507843, "rewards/tree_correctness_reward/mean": 0.2421875, "rewards/tree_correctness_reward/std": 0.4300905168056488, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.661271333694458, "sampling/importance_sampling_ratio/min": 1.654721332212976e-28, "sampling/sampling_logp_difference/max": 63.96875, "sampling/sampling_logp_difference/mean": 5.406142711639404, "step": 216 }, { "clip_ratio/high_max": 0.017878302896860987, "clip_ratio/high_mean": 0.004980358855391387, "clip_ratio/low_mean": 0.0043703276896849275, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009350686508696526, "completions/clipped_ratio": 1.0, "completions/max_length": 273.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 241.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 218.0, "completions/min_terminated_length": 0.0, "entropy": 1.710324466228485, "epoch": 108.5, "frac_reward_zero_std": 0.4375, "grad_norm": 0.01461783703416586, "kl": 0.29373692721128464, "learning_rate": 8e-05, "loss": 0.0033, "num_tokens": 22973680.0, "reward": 0.39921873807907104, "reward_std": 0.17498353123664856, "rewards/num_nodes_reward/mean": 0.765625, "rewards/num_nodes_reward/std": 0.42527204751968384, "rewards/tree_correctness_reward/mean": 0.2421875, "rewards/tree_correctness_reward/std": 0.4300905168056488, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6582725048065186, "sampling/importance_sampling_ratio/min": 1.1986760774318029e-29, "sampling/sampling_logp_difference/max": 66.59375, "sampling/sampling_logp_difference/mean": 5.448356628417969, "step": 217 }, { "clip_ratio/high_max": 0.017033518408425152, "clip_ratio/high_mean": 0.0042141911399085075, "clip_ratio/low_mean": 0.005145577160874382, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009359768242575228, "completions/clipped_ratio": 1.0, "completions/max_length": 296.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 252.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 222.0, "completions/min_terminated_length": 0.0, "entropy": 1.7749605476856232, "epoch": 109.0, "frac_reward_zero_std": 0.375, "grad_norm": 0.020495356991887093, "kl": 0.28999666310846806, "learning_rate": 8e-05, "loss": 0.0048, "num_tokens": 23070128.0, "reward": 0.3414062559604645, "reward_std": 0.1651439219713211, "rewards/num_nodes_reward/mean": 0.71875, "rewards/num_nodes_reward/std": 0.4513758420944214, "rewards/tree_correctness_reward/mean": 0.1796875, "rewards/tree_correctness_reward/std": 0.3854354918003082, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.646104633808136, "sampling/importance_sampling_ratio/min": 3.4684842740447485e-29, "sampling/sampling_logp_difference/max": 65.53125, "sampling/sampling_logp_difference/mean": 5.608184814453125, "step": 218 }, { "clip_ratio/high_max": 0.02488150750286877, "clip_ratio/high_mean": 0.006595282713533379, "clip_ratio/low_mean": 0.005962333729257807, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012557616224512458, "completions/clipped_ratio": 1.0, "completions/max_length": 312.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 246.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 203.0, "completions/min_terminated_length": 0.0, "entropy": 1.8746763169765472, "epoch": 109.5, "frac_reward_zero_std": 0.25, "grad_norm": 0.016047973185777664, "kl": 0.33587760850787163, "learning_rate": 8e-05, "loss": -0.0028, "num_tokens": 23165808.0, "reward": 0.33984375, "reward_std": 0.17173998057842255, "rewards/num_nodes_reward/mean": 0.8046875, "rewards/num_nodes_reward/std": 0.3979988098144531, "rewards/tree_correctness_reward/mean": 0.140625, "rewards/tree_correctness_reward/std": 0.3490002751350403, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6326545476913452, "sampling/importance_sampling_ratio/min": 2.9374821418009058e-30, "sampling/sampling_logp_difference/max": 68.0, "sampling/sampling_logp_difference/mean": 5.77263069152832, "step": 219 }, { "clip_ratio/high_max": 0.026261603110469878, "clip_ratio/high_mean": 0.007093562715454027, "clip_ratio/low_mean": 0.004129385182750411, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01122294767992571, "completions/clipped_ratio": 1.0, "completions/max_length": 292.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 238.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 205.0, "completions/min_terminated_length": 0.0, "entropy": 1.9328104704618454, "epoch": 110.0, "frac_reward_zero_std": 0.375, "grad_norm": 0.013415747322142124, "kl": 0.2956479638814926, "learning_rate": 8e-05, "loss": -0.0021, "num_tokens": 23260480.0, "reward": 0.4585937261581421, "reward_std": 0.173549622297287, "rewards/num_nodes_reward/mean": 0.78125, "rewards/num_nodes_reward/std": 0.41502299904823303, "rewards/tree_correctness_reward/mean": 0.3203125, "rewards/tree_correctness_reward/std": 0.4684300124645233, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6147794723510742, "sampling/importance_sampling_ratio/min": 2.5125616226792233e-30, "sampling/sampling_logp_difference/max": 68.15625, "sampling/sampling_logp_difference/mean": 6.002937316894531, "step": 220 }, { "epoch": 110.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 216.8, "eval_completions/max_terminated_length": 0.0, "eval_completions/mean_length": 168.25, "eval_completions/mean_terminated_length": 0.0, "eval_completions/min_length": 107.2, "eval_completions/min_terminated_length": 0.0, "eval_entropy": 0.8647922277450562, "eval_frac_reward_zero_std": 0.4, "eval_kl": 0.4262888491153717, "eval_loss": -0.0035005835816264153, "eval_num_tokens": 23260480.0, "eval_reward": 0.09624999761581421, "eval_reward_std": 0.10588685870170593, "eval_rewards/num_nodes_reward/mean": 0.2625, "eval_rewards/num_nodes_reward/std": 0.24893558621406556, "eval_rewards/tree_correctness_reward/mean": 0.025, "eval_rewards/tree_correctness_reward/std": 0.1, "eval_runtime": 27.4585, "eval_samples_per_second": 0.364, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 0.8841562747955323, "eval_sampling/importance_sampling_ratio/min": 2.9539778009876897e-28, "eval_sampling/sampling_logp_difference/max": 64.26875, "eval_sampling/sampling_logp_difference/mean": 2.1010339736938475, "eval_steps_per_second": 0.036, "step": 220 }, { "clip_ratio/high_max": 0.022296695213299245, "clip_ratio/high_mean": 0.0058722691392176785, "clip_ratio/low_mean": 0.004871661309152842, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010743930411990732, "completions/clipped_ratio": 1.0, "completions/max_length": 335.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 235.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 191.0, "completions/min_terminated_length": 0.0, "entropy": 1.7310949563980103, "epoch": 110.5, "frac_reward_zero_std": 0.375, "grad_norm": 0.01729489117860794, "kl": 0.3009943924844265, "learning_rate": 8e-05, "loss": -0.0024, "num_tokens": 23354768.0, "reward": 0.29531246423721313, "reward_std": 0.15164124965667725, "rewards/num_nodes_reward/mean": 0.7109375, "rewards/num_nodes_reward/std": 0.45510825514793396, "rewards/tree_correctness_reward/mean": 0.1171875, "rewards/tree_correctness_reward/std": 0.322907418012619, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6520100831985474, "sampling/importance_sampling_ratio/min": 5.1554417464352396e-30, "sampling/sampling_logp_difference/max": 67.4375, "sampling/sampling_logp_difference/mean": 5.549160957336426, "step": 221 }, { "clip_ratio/high_max": 0.016373677586670965, "clip_ratio/high_mean": 0.0033352802201989107, "clip_ratio/low_mean": 0.0029067806754028425, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006242060859221965, "completions/clipped_ratio": 1.0, "completions/max_length": 291.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 241.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 196.0, "completions/min_terminated_length": 0.0, "entropy": 1.8843982815742493, "epoch": 111.0, "frac_reward_zero_std": 0.625, "grad_norm": 0.013192488811910152, "kl": 0.27887532114982605, "learning_rate": 8e-05, "loss": 0.0013, "num_tokens": 23449776.0, "reward": 0.2750000059604645, "reward_std": 0.1055653989315033, "rewards/num_nodes_reward/mean": 0.4609375, "rewards/num_nodes_reward/std": 0.5004304051399231, "rewards/tree_correctness_reward/mean": 0.1953125, "rewards/tree_correctness_reward/std": 0.3979988098144531, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6238988637924194, "sampling/importance_sampling_ratio/min": 1.4917765848206356e-29, "sampling/sampling_logp_difference/max": 66.375, "sampling/sampling_logp_difference/mean": 5.918201446533203, "step": 222 }, { "clip_ratio/high_max": 0.006656603538431227, "clip_ratio/high_mean": 0.0011121626303065568, "clip_ratio/low_mean": 0.0035958431544713676, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004708005755674094, "completions/clipped_ratio": 1.0, "completions/max_length": 254.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 208.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 145.0, "completions/min_terminated_length": 0.0, "entropy": 1.594870164990425, "epoch": 111.5, "frac_reward_zero_std": 0.6875, "grad_norm": 0.008953382261097431, "kl": 0.43903137370944023, "learning_rate": 8e-05, "loss": 0.0006, "num_tokens": 23540640.0, "reward": 0.30781251192092896, "reward_std": 0.07899300754070282, "rewards/num_nodes_reward/mean": 0.4609375, "rewards/num_nodes_reward/std": 0.5004304051399231, "rewards/tree_correctness_reward/mean": 0.2421875, "rewards/tree_correctness_reward/std": 0.4300905168056488, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6650223731994629, "sampling/importance_sampling_ratio/min": 1.4316144454031779e-30, "sampling/sampling_logp_difference/max": 68.71875, "sampling/sampling_logp_difference/mean": 5.4364423751831055, "step": 223 }, { "clip_ratio/high_max": 0.017639887082623318, "clip_ratio/high_mean": 0.0034713777458819095, "clip_ratio/low_mean": 0.004875898564932868, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00834727636538446, "completions/clipped_ratio": 1.0, "completions/max_length": 249.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 234.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 210.0, "completions/min_terminated_length": 0.0, "entropy": 1.898175984621048, "epoch": 112.0, "frac_reward_zero_std": 0.4375, "grad_norm": 0.01610950008034706, "kl": 0.2780996020883322, "learning_rate": 8e-05, "loss": -0.0025, "num_tokens": 23634832.0, "reward": 0.27265623211860657, "reward_std": 0.16880005598068237, "rewards/num_nodes_reward/mean": 0.6171875, "rewards/num_nodes_reward/std": 0.4879830479621887, "rewards/tree_correctness_reward/mean": 0.125, "rewards/tree_correctness_reward/std": 0.3320184051990509, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6158846020698547, "sampling/importance_sampling_ratio/min": 1.1868500964489711e-30, "sampling/sampling_logp_difference/max": 68.90625, "sampling/sampling_logp_difference/mean": 6.051201820373535, "step": 224 }, { "clip_ratio/high_max": 0.020430181932169944, "clip_ratio/high_mean": 0.003965519965277053, "clip_ratio/low_mean": 0.004639807928469963, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00860532812657766, "completions/clipped_ratio": 1.0, "completions/max_length": 286.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 233.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 205.0, "completions/min_terminated_length": 0.0, "entropy": 1.7464776635169983, "epoch": 112.5, "frac_reward_zero_std": 0.5, "grad_norm": 0.012889985926449299, "kl": 0.2915843240916729, "learning_rate": 8e-05, "loss": 0.0007, "num_tokens": 23728832.0, "reward": 0.3140625059604645, "reward_std": 0.15015074610710144, "rewards/num_nodes_reward/mean": 0.609375, "rewards/num_nodes_reward/std": 0.4898075461387634, "rewards/tree_correctness_reward/mean": 0.1875, "rewards/tree_correctness_reward/std": 0.39184603095054626, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6419916152954102, "sampling/importance_sampling_ratio/min": 1.6222316689370394e-30, "sampling/sampling_logp_difference/max": 68.59375, "sampling/sampling_logp_difference/mean": 5.708286285400391, "step": 225 }, { "clip_ratio/high_max": 0.015048911911435425, "clip_ratio/high_mean": 0.0029626130999531597, "clip_ratio/low_mean": 0.004700915451394394, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0076635286677628756, "completions/clipped_ratio": 1.0, "completions/max_length": 262.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 208.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 158.0, "completions/min_terminated_length": 0.0, "entropy": 1.4874525368213654, "epoch": 113.0, "frac_reward_zero_std": 0.5625, "grad_norm": 0.01365059707313776, "kl": 0.3284875750541687, "learning_rate": 8e-05, "loss": -0.0018, "num_tokens": 23819696.0, "reward": 0.2578125, "reward_std": 0.09992457181215286, "rewards/num_nodes_reward/mean": 0.421875, "rewards/num_nodes_reward/std": 0.4957992732524872, "rewards/tree_correctness_reward/mean": 0.1875, "rewards/tree_correctness_reward/std": 0.39184603095054626, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6798741221427917, "sampling/importance_sampling_ratio/min": 2.5923188203429674e-30, "sampling/sampling_logp_difference/max": 68.125, "sampling/sampling_logp_difference/mean": 5.2562665939331055, "step": 226 }, { "clip_ratio/high_max": 0.02127115847542882, "clip_ratio/high_mean": 0.003715199200087227, "clip_ratio/low_mean": 0.0027287347729725298, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006443934078561142, "completions/clipped_ratio": 1.0, "completions/max_length": 249.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 226.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 203.0, "completions/min_terminated_length": 0.0, "entropy": 1.6966571062803268, "epoch": 113.5, "frac_reward_zero_std": 0.625, "grad_norm": 0.012292533181607723, "kl": 0.30929306149482727, "learning_rate": 8e-05, "loss": 0.001, "num_tokens": 23912768.0, "reward": 0.19062501192092896, "reward_std": 0.1136297732591629, "rewards/num_nodes_reward/mean": 0.453125, "rewards/num_nodes_reward/std": 0.4997538626194, "rewards/tree_correctness_reward/mean": 0.078125, "rewards/tree_correctness_reward/std": 0.2694226801395416, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6479252576828003, "sampling/importance_sampling_ratio/min": 2.3105086634790934e-29, "sampling/sampling_logp_difference/max": 65.9375, "sampling/sampling_logp_difference/mean": 5.647585868835449, "step": 227 }, { "clip_ratio/high_max": 0.018677369342185557, "clip_ratio/high_mean": 0.00407354740309529, "clip_ratio/low_mean": 0.006137484102509916, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010211031534709036, "completions/clipped_ratio": 1.0, "completions/max_length": 262.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 209.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 159.0, "completions/min_terminated_length": 0.0, "entropy": 1.8193008601665497, "epoch": 114.0, "frac_reward_zero_std": 0.375, "grad_norm": 0.014283494092524052, "kl": 0.3047901876270771, "learning_rate": 8e-05, "loss": -0.0011, "num_tokens": 24003680.0, "reward": 0.34687501192092896, "reward_std": 0.18206271529197693, "rewards/num_nodes_reward/mean": 0.6640625, "rewards/num_nodes_reward/std": 0.47417303919792175, "rewards/tree_correctness_reward/mean": 0.2109375, "rewards/tree_correctness_reward/std": 0.4095771610736847, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6349354982376099, "sampling/importance_sampling_ratio/min": 4.3165985204907103e-29, "sampling/sampling_logp_difference/max": 65.3125, "sampling/sampling_logp_difference/mean": 5.820737838745117, "step": 228 }, { "clip_ratio/high_max": 0.014875913548166864, "clip_ratio/high_mean": 0.003043391412575147, "clip_ratio/low_mean": 0.004398082528496161, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007441474182996899, "completions/clipped_ratio": 1.0, "completions/max_length": 318.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 226.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 154.0, "completions/min_terminated_length": 0.0, "entropy": 1.9428502470254898, "epoch": 114.5, "frac_reward_zero_std": 0.5625, "grad_norm": 0.009856556542217731, "kl": 0.31634342670440674, "learning_rate": 8e-05, "loss": -0.0001, "num_tokens": 24096800.0, "reward": 0.3140625059604645, "reward_std": 0.11524613201618195, "rewards/num_nodes_reward/mean": 0.5546875, "rewards/num_nodes_reward/std": 0.4989531338214874, "rewards/tree_correctness_reward/mean": 0.2109375, "rewards/tree_correctness_reward/std": 0.4095771610736847, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6189867258071899, "sampling/importance_sampling_ratio/min": 1.587987769361992e-29, "sampling/sampling_logp_difference/max": 66.3125, "sampling/sampling_logp_difference/mean": 5.987451553344727, "step": 229 }, { "clip_ratio/high_max": 0.012516416725702584, "clip_ratio/high_mean": 0.0040140245837392285, "clip_ratio/low_mean": 0.003080238493566867, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007094263157341629, "completions/clipped_ratio": 1.0, "completions/max_length": 242.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 218.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 181.0, "completions/min_terminated_length": 0.0, "entropy": 1.7188843041658401, "epoch": 115.0, "frac_reward_zero_std": 0.5, "grad_norm": 0.026538850739598274, "kl": 0.311817467212677, "learning_rate": 8e-05, "loss": -0.0006, "num_tokens": 24188832.0, "reward": 0.38749998807907104, "reward_std": 0.14032793045043945, "rewards/num_nodes_reward/mean": 0.7265625, "rewards/num_nodes_reward/std": 0.447474867105484, "rewards/tree_correctness_reward/mean": 0.2421875, "rewards/tree_correctness_reward/std": 0.4300905168056488, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6400513648986816, "sampling/importance_sampling_ratio/min": 3.891531412521503e-30, "sampling/sampling_logp_difference/max": 67.71875, "sampling/sampling_logp_difference/mean": 5.743717670440674, "step": 230 }, { "epoch": 115.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 222.0, "eval_completions/max_terminated_length": 0.0, "eval_completions/mean_length": 155.45, "eval_completions/mean_terminated_length": 0.0, "eval_completions/min_length": 107.0, "eval_completions/min_terminated_length": 0.0, "eval_entropy": 0.8631210327148438, "eval_frac_reward_zero_std": 0.7, "eval_kl": 0.46145750880241393, "eval_loss": 0.0009953310946002603, "eval_num_tokens": 24188832.0, "eval_reward": 0.12249999940395355, "eval_reward_std": 0.07040631920099258, "eval_rewards/num_nodes_reward/mean": 0.35, "eval_rewards/num_nodes_reward/std": 0.19272227883338927, "eval_rewards/tree_correctness_reward/mean": 0.025, "eval_rewards/tree_correctness_reward/std": 0.1, "eval_runtime": 28.4558, "eval_samples_per_second": 0.351, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 0.88069748878479, "eval_sampling/importance_sampling_ratio/min": 1.0637411487598504e-27, "eval_sampling/sampling_logp_difference/max": 63.14375, "eval_sampling/sampling_logp_difference/mean": 2.1438560962677, "eval_steps_per_second": 0.035, "step": 230 }, { "clip_ratio/high_max": 0.017859499668702483, "clip_ratio/high_mean": 0.004722897778265178, "clip_ratio/low_mean": 0.006439798860810697, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011162696755491197, "completions/clipped_ratio": 1.0, "completions/max_length": 266.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 209.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 164.0, "completions/min_terminated_length": 0.0, "entropy": 1.512330025434494, "epoch": 115.5, "frac_reward_zero_std": 0.375, "grad_norm": 0.013762576505541801, "kl": 0.41820983216166496, "learning_rate": 8e-05, "loss": -0.0016, "num_tokens": 24279792.0, "reward": 0.19687500596046448, "reward_std": 0.13315501809120178, "rewards/num_nodes_reward/mean": 0.4375, "rewards/num_nodes_reward/std": 0.49802759289741516, "rewards/tree_correctness_reward/mean": 0.09375, "rewards/tree_correctness_reward/std": 0.29262590408325195, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6766320466995239, "sampling/importance_sampling_ratio/min": 3.655755457610972e-30, "sampling/sampling_logp_difference/max": 67.78125, "sampling/sampling_logp_difference/mean": 5.290630340576172, "step": 231 }, { "clip_ratio/high_max": 0.021260409499518573, "clip_ratio/high_mean": 0.0037085030926391482, "clip_ratio/low_mean": 0.0036982479687139858, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007406750810332596, "completions/clipped_ratio": 1.0, "completions/max_length": 224.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 195.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 125.0, "completions/min_terminated_length": 0.0, "entropy": 1.4242727756500244, "epoch": 116.0, "frac_reward_zero_std": 0.5625, "grad_norm": 0.012877022847533226, "kl": 0.37780167907476425, "learning_rate": 8e-05, "loss": -0.0008, "num_tokens": 24368960.0, "reward": 0.4664062559604645, "reward_std": 0.11869731545448303, "rewards/num_nodes_reward/mean": 0.8984375, "rewards/num_nodes_reward/std": 0.3032590448856354, "rewards/tree_correctness_reward/mean": 0.28125, "rewards/tree_correctness_reward/std": 0.4513758420944214, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6910234689712524, "sampling/importance_sampling_ratio/min": 3.060926502377367e-29, "sampling/sampling_logp_difference/max": 65.65625, "sampling/sampling_logp_difference/mean": 5.115143775939941, "step": 232 }, { "clip_ratio/high_max": 0.013787463889457285, "clip_ratio/high_mean": 0.002864149399101734, "clip_ratio/low_mean": 0.004312078221119009, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00717622775118798, "completions/clipped_ratio": 1.0, "completions/max_length": 255.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 227.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 185.0, "completions/min_terminated_length": 0.0, "entropy": 1.894801765680313, "epoch": 116.5, "frac_reward_zero_std": 0.5, "grad_norm": 0.01065799966454506, "kl": 0.3246583305299282, "learning_rate": 8e-05, "loss": -0.0012, "num_tokens": 24462176.0, "reward": 0.37109375, "reward_std": 0.1509796679019928, "rewards/num_nodes_reward/mean": 0.8359375, "rewards/num_nodes_reward/std": 0.371787428855896, "rewards/tree_correctness_reward/mean": 0.171875, "rewards/tree_correctness_reward/std": 0.3787541687488556, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6220229864120483, "sampling/importance_sampling_ratio/min": 2.3105086634790934e-29, "sampling/sampling_logp_difference/max": 65.9375, "sampling/sampling_logp_difference/mean": 5.977163791656494, "step": 233 }, { "clip_ratio/high_max": 0.02817815641174093, "clip_ratio/high_mean": 0.006164226128021255, "clip_ratio/low_mean": 0.0022780237486585975, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008442249963991344, "completions/clipped_ratio": 1.0, "completions/max_length": 250.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 208.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 177.0, "completions/min_terminated_length": 0.0, "entropy": 1.5044425576925278, "epoch": 117.0, "frac_reward_zero_std": 0.5625, "grad_norm": 0.01599426567554474, "kl": 0.3663487061858177, "learning_rate": 8e-05, "loss": 0.0011, "num_tokens": 24552992.0, "reward": 0.43437501788139343, "reward_std": 0.1203780472278595, "rewards/num_nodes_reward/mean": 0.8828125, "rewards/num_nodes_reward/std": 0.322907418012619, "rewards/tree_correctness_reward/mean": 0.2421875, "rewards/tree_correctness_reward/std": 0.4300905168056488, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6712648868560791, "sampling/importance_sampling_ratio/min": 6.0273218931633014e-30, "sampling/sampling_logp_difference/max": 67.28125, "sampling/sampling_logp_difference/mean": 5.364951133728027, "step": 234 }, { "clip_ratio/high_max": 0.016935364285018295, "clip_ratio/high_mean": 0.0036991748202126473, "clip_ratio/low_mean": 0.0041793630152824335, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078785379009787, "completions/clipped_ratio": 1.0, "completions/max_length": 290.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 231.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 171.0, "completions/min_terminated_length": 0.0, "entropy": 1.8445152342319489, "epoch": 117.5, "frac_reward_zero_std": 0.5, "grad_norm": 0.012167908251285553, "kl": 0.3594192713499069, "learning_rate": 8e-05, "loss": -0.0002, "num_tokens": 24646736.0, "reward": 0.3843750059604645, "reward_std": 0.15654028952121735, "rewards/num_nodes_reward/mean": 0.734375, "rewards/num_nodes_reward/std": 0.44340085983276367, "rewards/tree_correctness_reward/mean": 0.234375, "rewards/tree_correctness_reward/std": 0.42527204751968384, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6252297163009644, "sampling/importance_sampling_ratio/min": 7.27033814412959e-30, "sampling/sampling_logp_difference/max": 67.09375, "sampling/sampling_logp_difference/mean": 5.909942626953125, "step": 235 }, { "clip_ratio/high_max": 0.012455011019483209, "clip_ratio/high_mean": 0.0020777075696969405, "clip_ratio/low_mean": 0.002648540808877442, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004726248327642679, "completions/clipped_ratio": 1.0, "completions/max_length": 275.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 225.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 177.0, "completions/min_terminated_length": 0.0, "entropy": 1.8300661444664001, "epoch": 118.0, "frac_reward_zero_std": 0.5625, "grad_norm": 0.012442821636795998, "kl": 0.35591205954551697, "learning_rate": 8e-05, "loss": -0.0003, "num_tokens": 24739680.0, "reward": 0.37109375, "reward_std": 0.09200895577669144, "rewards/num_nodes_reward/mean": 0.8359375, "rewards/num_nodes_reward/std": 0.371787428855896, "rewards/tree_correctness_reward/mean": 0.171875, "rewards/tree_correctness_reward/std": 0.3787541687488556, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.624110996723175, "sampling/importance_sampling_ratio/min": 2.618149189631665e-29, "sampling/sampling_logp_difference/max": 65.8125, "sampling/sampling_logp_difference/mean": 5.957012176513672, "step": 236 }, { "clip_ratio/high_max": 0.015628253226168454, "clip_ratio/high_mean": 0.0028134425956523046, "clip_ratio/low_mean": 0.003910299245035276, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006723741855239496, "completions/clipped_ratio": 1.0, "completions/max_length": 280.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 240.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 218.0, "completions/min_terminated_length": 0.0, "entropy": 1.6736329048871994, "epoch": 118.5, "frac_reward_zero_std": 0.5, "grad_norm": 0.011593576520681381, "kl": 0.3900125212967396, "learning_rate": 8e-05, "loss": 0.0025, "num_tokens": 24834608.0, "reward": 0.34296876192092896, "reward_std": 0.1423397958278656, "rewards/num_nodes_reward/mean": 0.8515625, "rewards/num_nodes_reward/std": 0.356930136680603, "rewards/tree_correctness_reward/mean": 0.125, "rewards/tree_correctness_reward/std": 0.3320184051990509, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6563664674758911, "sampling/importance_sampling_ratio/min": 7.739234377113275e-30, "sampling/sampling_logp_difference/max": 67.03125, "sampling/sampling_logp_difference/mean": 5.462949752807617, "step": 237 }, { "clip_ratio/high_max": 0.019592621771153063, "clip_ratio/high_mean": 0.003681881535158027, "clip_ratio/low_mean": 0.004221287643304095, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007903169171186164, "completions/clipped_ratio": 1.0, "completions/max_length": 297.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 218.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 138.0, "completions/min_terminated_length": 0.0, "entropy": 1.8012602925300598, "epoch": 119.0, "frac_reward_zero_std": 0.625, "grad_norm": 0.011859124526381493, "kl": 0.4131884202361107, "learning_rate": 8e-05, "loss": -0.0, "num_tokens": 24926736.0, "reward": 0.3804687559604645, "reward_std": 0.11014176160097122, "rewards/num_nodes_reward/mean": 0.703125, "rewards/num_nodes_reward/std": 0.45867621898651123, "rewards/tree_correctness_reward/mean": 0.2421875, "rewards/tree_correctness_reward/std": 0.4300905168056488, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.632965087890625, "sampling/importance_sampling_ratio/min": 1.1617967007143025e-29, "sampling/sampling_logp_difference/max": 66.625, "sampling/sampling_logp_difference/mean": 5.8287200927734375, "step": 238 }, { "clip_ratio/high_max": 0.015099321492016315, "clip_ratio/high_mean": 0.003591347180190496, "clip_ratio/low_mean": 0.0030363293808477465, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0066276766010560095, "completions/clipped_ratio": 1.0, "completions/max_length": 261.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 235.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 197.0, "completions/min_terminated_length": 0.0, "entropy": 1.8149425387382507, "epoch": 119.5, "frac_reward_zero_std": 0.5625, "grad_norm": 0.012822012417018414, "kl": 0.38492316380143166, "learning_rate": 8e-05, "loss": 0.0002, "num_tokens": 25020976.0, "reward": 0.37109375, "reward_std": 0.12208747863769531, "rewards/num_nodes_reward/mean": 0.8359375, "rewards/num_nodes_reward/std": 0.371787428855896, "rewards/tree_correctness_reward/mean": 0.171875, "rewards/tree_correctness_reward/std": 0.3787541687488556, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6298648118972778, "sampling/importance_sampling_ratio/min": 1.63839597639079e-29, "sampling/sampling_logp_difference/max": 66.28125, "sampling/sampling_logp_difference/mean": 5.845816612243652, "step": 239 }, { "clip_ratio/high_max": 0.015785104362294078, "clip_ratio/high_mean": 0.002836609448422678, "clip_ratio/low_mean": 0.003608913073549047, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00644552253652364, "completions/clipped_ratio": 1.0, "completions/max_length": 266.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 231.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 191.0, "completions/min_terminated_length": 0.0, "entropy": 1.873578667640686, "epoch": 120.0, "frac_reward_zero_std": 0.5625, "grad_norm": 0.0106892678886652, "kl": 0.33703334257006645, "learning_rate": 8e-05, "loss": 0.0003, "num_tokens": 25114768.0, "reward": 0.4593749940395355, "reward_std": 0.10829973220825195, "rewards/num_nodes_reward/mean": 0.8203125, "rewards/num_nodes_reward/std": 0.3854354918003082, "rewards/tree_correctness_reward/mean": 0.3046875, "rewards/tree_correctness_reward/std": 0.46208351850509644, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6236026287078857, "sampling/importance_sampling_ratio/min": 1.1260519675679445e-29, "sampling/sampling_logp_difference/max": 66.65625, "sampling/sampling_logp_difference/mean": 5.919617652893066, "step": 240 }, { "epoch": 120.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 239.8, "eval_completions/max_terminated_length": 0.0, "eval_completions/mean_length": 163.35, "eval_completions/mean_terminated_length": 0.0, "eval_completions/min_length": 108.0, "eval_completions/min_terminated_length": 0.0, "eval_entropy": 0.937930679321289, "eval_frac_reward_zero_std": 0.6, "eval_kl": 0.4529753506183624, "eval_loss": -0.0012951086973771453, "eval_num_tokens": 25114768.0, "eval_reward": 0.19625000655651093, "eval_reward_std": 0.05984923988580704, "eval_rewards/num_nodes_reward/mean": 0.625, "eval_rewards/num_nodes_reward/std": 0.20493903160095214, "eval_rewards/tree_correctness_reward/mean": 0.0125, "eval_rewards/tree_correctness_reward/std": 0.05, "eval_runtime": 31.4155, "eval_samples_per_second": 0.318, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 0.8633377552032471, "eval_sampling/importance_sampling_ratio/min": 1.9417127404662576e-27, "eval_sampling/sampling_logp_difference/max": 61.91875, "eval_sampling/sampling_logp_difference/mean": 2.4152849674224854, "eval_steps_per_second": 0.032, "step": 240 }, { "clip_ratio/high_max": 0.01808754459489137, "clip_ratio/high_mean": 0.004199564253212884, "clip_ratio/low_mean": 0.005885387101443484, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01008495147107169, "completions/clipped_ratio": 1.0, "completions/max_length": 259.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 230.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 193.0, "completions/min_terminated_length": 0.0, "entropy": 1.7379372119903564, "epoch": 120.5, "frac_reward_zero_std": 0.3125, "grad_norm": 0.013561884872615337, "kl": 0.35896025225520134, "learning_rate": 8e-05, "loss": 0.0023, "num_tokens": 25208352.0, "reward": 0.4156250059604645, "reward_std": 0.2019832879304886, "rewards/num_nodes_reward/mean": 0.9296875, "rewards/num_nodes_reward/std": 0.2566775679588318, "rewards/tree_correctness_reward/mean": 0.1953125, "rewards/tree_correctness_reward/std": 0.3979988098144531, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6432969570159912, "sampling/importance_sampling_ratio/min": 2.8754744950605667e-29, "sampling/sampling_logp_difference/max": 65.71875, "sampling/sampling_logp_difference/mean": 5.643307685852051, "step": 241 }, { "clip_ratio/high_max": 0.008914263104088604, "clip_ratio/high_mean": 0.001493183670390863, "clip_ratio/low_mean": 0.0012410360504873097, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0027342198154656217, "completions/clipped_ratio": 1.0, "completions/max_length": 258.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 217.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 161.0, "completions/min_terminated_length": 0.0, "entropy": 1.7631020545959473, "epoch": 121.0, "frac_reward_zero_std": 0.8125, "grad_norm": 0.006721786689013243, "kl": 0.374761663377285, "learning_rate": 8e-05, "loss": -0.0007, "num_tokens": 25300272.0, "reward": 0.32734376192092896, "reward_std": 0.06928840279579163, "rewards/num_nodes_reward/mean": 0.7265625, "rewards/num_nodes_reward/std": 0.447474867105484, "rewards/tree_correctness_reward/mean": 0.15625, "rewards/tree_correctness_reward/std": 0.3645188808441162, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6327179670333862, "sampling/importance_sampling_ratio/min": 2.411232316926477e-31, "sampling/sampling_logp_difference/max": 70.5, "sampling/sampling_logp_difference/mean": 5.85317325592041, "step": 242 }, { "clip_ratio/high_max": 0.031974080484360456, "clip_ratio/high_mean": 0.008515525521943346, "clip_ratio/low_mean": 0.003176448793965392, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011691974417772144, "completions/clipped_ratio": 1.0, "completions/max_length": 272.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 226.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 174.0, "completions/min_terminated_length": 0.0, "entropy": 1.8811472356319427, "epoch": 121.5, "frac_reward_zero_std": 0.4375, "grad_norm": 0.012265964411199093, "kl": 0.342033788561821, "learning_rate": 8e-05, "loss": -0.002, "num_tokens": 25393408.0, "reward": 0.48749998211860657, "reward_std": 0.16112762689590454, "rewards/num_nodes_reward/mean": 0.8046875, "rewards/num_nodes_reward/std": 0.3979988098144531, "rewards/tree_correctness_reward/mean": 0.3515625, "rewards/tree_correctness_reward/std": 0.4793342351913452, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6137145757675171, "sampling/importance_sampling_ratio/min": 1.0914069705124871e-29, "sampling/sampling_logp_difference/max": 66.6875, "sampling/sampling_logp_difference/mean": 6.098541736602783, "step": 243 }, { "clip_ratio/high_max": 0.018799941171891987, "clip_ratio/high_mean": 0.003083861491177231, "clip_ratio/low_mean": 0.0030530126532539725, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006136874115327373, "completions/clipped_ratio": 1.0, "completions/max_length": 266.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 217.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 159.0, "completions/min_terminated_length": 0.0, "entropy": 1.6224970668554306, "epoch": 122.0, "frac_reward_zero_std": 0.5625, "grad_norm": 0.012454099021852016, "kl": 0.3447927236557007, "learning_rate": 8e-05, "loss": -0.0008, "num_tokens": 25485328.0, "reward": 0.38671875, "reward_std": 0.11360576003789902, "rewards/num_nodes_reward/mean": 0.8515625, "rewards/num_nodes_reward/std": 0.356930136680603, "rewards/tree_correctness_reward/mean": 0.1875, "rewards/tree_correctness_reward/std": 0.39184603095054626, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6705121994018555, "sampling/importance_sampling_ratio/min": 5.90009059760205e-29, "sampling/sampling_logp_difference/max": 65.0, "sampling/sampling_logp_difference/mean": 5.330760955810547, "step": 244 }, { "clip_ratio/high_max": 0.03214726026635617, "clip_ratio/high_mean": 0.007130042416974902, "clip_ratio/low_mean": 0.0018016947506112047, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00893173721851781, "completions/clipped_ratio": 1.0, "completions/max_length": 257.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 209.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 193.0, "completions/min_terminated_length": 0.0, "entropy": 1.5269181281328201, "epoch": 122.5, "frac_reward_zero_std": 0.5625, "grad_norm": 0.009826947934925556, "kl": 0.38187022879719734, "learning_rate": 8e-05, "loss": -0.0007, "num_tokens": 25576256.0, "reward": 0.38203126192092896, "reward_std": 0.11081285774707794, "rewards/num_nodes_reward/mean": 0.78125, "rewards/num_nodes_reward/std": 0.41502299904823303, "rewards/tree_correctness_reward/mean": 0.2109375, "rewards/tree_correctness_reward/std": 0.4095771610736847, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6734804511070251, "sampling/importance_sampling_ratio/min": 4.8430893875347414e-30, "sampling/sampling_logp_difference/max": 67.5, "sampling/sampling_logp_difference/mean": 5.285495281219482, "step": 245 }, { "clip_ratio/high_max": 0.01896539586596191, "clip_ratio/high_mean": 0.0034726119483821094, "clip_ratio/low_mean": 0.0032138254318851978, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006686437380267307, "completions/clipped_ratio": 1.0, "completions/max_length": 269.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 226.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 190.0, "completions/min_terminated_length": 0.0, "entropy": 1.6424100548028946, "epoch": 123.0, "frac_reward_zero_std": 0.625, "grad_norm": 0.009778701700270176, "kl": 0.31821757182478905, "learning_rate": 8e-05, "loss": 0.0, "num_tokens": 25669360.0, "reward": 0.3648437559604645, "reward_std": 0.11734872311353683, "rewards/num_nodes_reward/mean": 0.796875, "rewards/num_nodes_reward/std": 0.40390563011169434, "rewards/tree_correctness_reward/mean": 0.1796875, "rewards/tree_correctness_reward/std": 0.3854354918003082, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6629234552383423, "sampling/importance_sampling_ratio/min": 1.0914069705124871e-29, "sampling/sampling_logp_difference/max": 66.6875, "sampling/sampling_logp_difference/mean": 5.414827346801758, "step": 246 }, { "clip_ratio/high_max": 0.02460351533954963, "clip_ratio/high_mean": 0.006426895939512178, "clip_ratio/low_mean": 0.003344562108395621, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009771457844180986, "completions/clipped_ratio": 1.0, "completions/max_length": 276.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 237.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 196.0, "completions/min_terminated_length": 0.0, "entropy": 1.9673324674367905, "epoch": 123.5, "frac_reward_zero_std": 0.375, "grad_norm": 0.01396065391600132, "kl": 0.32167396508157253, "learning_rate": 8e-05, "loss": 0.0011, "num_tokens": 25763840.0, "reward": 0.3539062738418579, "reward_std": 0.15982075035572052, "rewards/num_nodes_reward/mean": 0.7421875, "rewards/num_nodes_reward/std": 0.43914902210235596, "rewards/tree_correctness_reward/mean": 0.1875, "rewards/tree_correctness_reward/std": 0.39184603095054626, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6165443062782288, "sampling/importance_sampling_ratio/min": 6.5544083878968715e-31, "sampling/sampling_logp_difference/max": 69.5, "sampling/sampling_logp_difference/mean": 5.98671817779541, "step": 247 }, { "clip_ratio/high_max": 0.029042671900242567, "clip_ratio/high_mean": 0.008521875890437514, "clip_ratio/low_mean": 0.003610850792028941, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012132727075368166, "completions/clipped_ratio": 1.0, "completions/max_length": 253.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 215.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 164.0, "completions/min_terminated_length": 0.0, "entropy": 1.5413777381181717, "epoch": 124.0, "frac_reward_zero_std": 0.25, "grad_norm": 0.012264092452824116, "kl": 0.36034897714853287, "learning_rate": 8e-05, "loss": 0.0, "num_tokens": 25855520.0, "reward": 0.4273437559604645, "reward_std": 0.17945042252540588, "rewards/num_nodes_reward/mean": 0.8046875, "rewards/num_nodes_reward/std": 0.3979988098144531, "rewards/tree_correctness_reward/mean": 0.265625, "rewards/tree_correctness_reward/std": 0.44340085983276367, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.680801510810852, "sampling/importance_sampling_ratio/min": 7.984904797113095e-30, "sampling/sampling_logp_difference/max": 67.0, "sampling/sampling_logp_difference/mean": 5.180078506469727, "step": 248 }, { "clip_ratio/high_max": 0.015676732175052166, "clip_ratio/high_mean": 0.002831995690939948, "clip_ratio/low_mean": 0.004092439980013296, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006924435670953244, "completions/clipped_ratio": 1.0, "completions/max_length": 261.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 215.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 182.0, "completions/min_terminated_length": 0.0, "entropy": 1.6639769226312637, "epoch": 124.5, "frac_reward_zero_std": 0.5625, "grad_norm": 0.011913917027413845, "kl": 0.3338594622910023, "learning_rate": 8e-05, "loss": -0.0014, "num_tokens": 25947216.0, "reward": 0.3765624761581421, "reward_std": 0.13466668128967285, "rewards/num_nodes_reward/mean": 0.671875, "rewards/num_nodes_reward/std": 0.4713755249977112, "rewards/tree_correctness_reward/mean": 0.25, "rewards/tree_correctness_reward/std": 0.434714138507843, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6585390567779541, "sampling/importance_sampling_ratio/min": 6.08737946060331e-29, "sampling/sampling_logp_difference/max": 64.96875, "sampling/sampling_logp_difference/mean": 5.4690141677856445, "step": 249 }, { "clip_ratio/high_max": 0.015565441688522696, "clip_ratio/high_mean": 0.0034602787636686116, "clip_ratio/low_mean": 0.005746219627326354, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009206498449202627, "completions/clipped_ratio": 1.0, "completions/max_length": 285.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 225.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 177.0, "completions/min_terminated_length": 0.0, "entropy": 1.7562279552221298, "epoch": 125.0, "frac_reward_zero_std": 0.375, "grad_norm": 0.011749785393476486, "kl": 0.29453151673078537, "learning_rate": 8e-05, "loss": -0.0, "num_tokens": 26040224.0, "reward": 0.41718748211860657, "reward_std": 0.19254179298877716, "rewards/num_nodes_reward/mean": 0.84375, "rewards/num_nodes_reward/std": 0.3645188808441162, "rewards/tree_correctness_reward/mean": 0.234375, "rewards/tree_correctness_reward/std": 0.42527204751968384, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6354606747627258, "sampling/importance_sampling_ratio/min": 1.1986760774318029e-29, "sampling/sampling_logp_difference/max": 66.59375, "sampling/sampling_logp_difference/mean": 5.800774097442627, "step": 250 }, { "epoch": 125.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 231.2, "eval_completions/max_terminated_length": 0.0, "eval_completions/mean_length": 167.625, "eval_completions/mean_terminated_length": 0.0, "eval_completions/min_length": 117.6, "eval_completions/min_terminated_length": 0.0, "eval_entropy": 0.907706904411316, "eval_frac_reward_zero_std": 0.6, "eval_kl": 0.43728540539741517, "eval_loss": -0.00021231843857094646, "eval_num_tokens": 26040224.0, "eval_reward": 0.17374999821186066, "eval_reward_std": 0.10810993164777756, "eval_rewards/num_nodes_reward/mean": 0.4625, "eval_rewards/num_nodes_reward/std": 0.302301824092865, "eval_rewards/tree_correctness_reward/mean": 0.05, "eval_rewards/tree_correctness_reward/std": 0.16831300854682923, "eval_runtime": 30.1156, "eval_samples_per_second": 0.332, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 0.8774518847465516, "eval_sampling/importance_sampling_ratio/min": 2.590189402960114e-27, "eval_sampling/sampling_logp_difference/max": 63.06875, "eval_sampling/sampling_logp_difference/mean": 2.2091002464294434, "eval_steps_per_second": 0.033, "step": 250 }, { "clip_ratio/high_max": 0.024656387569848448, "clip_ratio/high_mean": 0.004688583023380488, "clip_ratio/low_mean": 0.0065391327370889485, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011227715876884758, "completions/clipped_ratio": 1.0, "completions/max_length": 242.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 223.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 176.0, "completions/min_terminated_length": 0.0, "entropy": 1.629948377609253, "epoch": 125.5, "frac_reward_zero_std": 0.3125, "grad_norm": 0.014670688658952713, "kl": 0.34945010393857956, "learning_rate": 8e-05, "loss": 0.0027, "num_tokens": 26132912.0, "reward": 0.34062498807907104, "reward_std": 0.17713642120361328, "rewards/num_nodes_reward/mean": 0.7890625, "rewards/num_nodes_reward/std": 0.4095771610736847, "rewards/tree_correctness_reward/mean": 0.1484375, "rewards/tree_correctness_reward/std": 0.356930136680603, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6684303879737854, "sampling/importance_sampling_ratio/min": 3.930307529307671e-29, "sampling/sampling_logp_difference/max": 65.40625, "sampling/sampling_logp_difference/mean": 5.3497633934021, "step": 251 }, { "clip_ratio/high_max": 0.01839330338407308, "clip_ratio/high_mean": 0.004259333887603134, "clip_ratio/low_mean": 0.0061427677574101835, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010402101674117148, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 218.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 196.0, "completions/min_terminated_length": 0.0, "entropy": 1.6733917146921158, "epoch": 126.0, "frac_reward_zero_std": 0.3125, "grad_norm": 0.01641732081770897, "kl": 0.3487337790429592, "learning_rate": 8e-05, "loss": -0.0013, "num_tokens": 26225056.0, "reward": 0.38749998807907104, "reward_std": 0.2086641639471054, "rewards/num_nodes_reward/mean": 0.78125, "rewards/num_nodes_reward/std": 0.41502299904823303, "rewards/tree_correctness_reward/mean": 0.21875, "rewards/tree_correctness_reward/std": 0.41502299904823303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.648997962474823, "sampling/importance_sampling_ratio/min": 1.275983958642445e-29, "sampling/sampling_logp_difference/max": 66.53125, "sampling/sampling_logp_difference/mean": 5.60938835144043, "step": 252 }, { "clip_ratio/high_max": 0.023207228281535208, "clip_ratio/high_mean": 0.005522320425370708, "clip_ratio/low_mean": 0.004358886246336624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009881206817226484, "completions/clipped_ratio": 1.0, "completions/max_length": 252.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 216.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 183.0, "completions/min_terminated_length": 0.0, "entropy": 1.5075791776180267, "epoch": 126.5, "frac_reward_zero_std": 0.4375, "grad_norm": 0.012872000224888325, "kl": 0.3726794980466366, "learning_rate": 8e-05, "loss": -0.0012, "num_tokens": 26316880.0, "reward": 0.38984376192092896, "reward_std": 0.1401437520980835, "rewards/num_nodes_reward/mean": 0.84375, "rewards/num_nodes_reward/std": 0.3645188808441162, "rewards/tree_correctness_reward/mean": 0.1953125, "rewards/tree_correctness_reward/std": 0.3979988098144531, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6713494062423706, "sampling/importance_sampling_ratio/min": 6.28061342860165e-29, "sampling/sampling_logp_difference/max": 64.9375, "sampling/sampling_logp_difference/mean": 5.352904319763184, "step": 253 }, { "clip_ratio/high_max": 0.01595491199987009, "clip_ratio/high_mean": 0.002794264531985391, "clip_ratio/low_mean": 0.0039241708582267165, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006718435266520828, "completions/clipped_ratio": 1.0, "completions/max_length": 281.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 228.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 196.0, "completions/min_terminated_length": 0.0, "entropy": 1.9525563567876816, "epoch": 127.0, "frac_reward_zero_std": 0.5625, "grad_norm": 0.017581799998879433, "kl": 0.3385389819741249, "learning_rate": 8e-05, "loss": 0.0015, "num_tokens": 26410256.0, "reward": 0.4835937023162842, "reward_std": 0.12044055759906769, "rewards/num_nodes_reward/mean": 0.8828125, "rewards/num_nodes_reward/std": 0.322907418012619, "rewards/tree_correctness_reward/mean": 0.3125, "rewards/tree_correctness_reward/std": 0.4653336703777313, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.601538360118866, "sampling/importance_sampling_ratio/min": 3.2583394374284734e-29, "sampling/sampling_logp_difference/max": 65.59375, "sampling/sampling_logp_difference/mean": 6.2386474609375, "step": 254 }, { "clip_ratio/high_max": 0.020626159268431365, "clip_ratio/high_mean": 0.003666898322990164, "clip_ratio/low_mean": 0.005072325904620811, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008739223994780332, "completions/clipped_ratio": 1.0, "completions/max_length": 228.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 207.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 181.0, "completions/min_terminated_length": 0.0, "entropy": 1.476215898990631, "epoch": 127.5, "frac_reward_zero_std": 0.4375, "grad_norm": 0.04382946714758873, "kl": 0.5408997945487499, "learning_rate": 8e-05, "loss": 0.0001, "num_tokens": 26500896.0, "reward": 0.4867187440395355, "reward_std": 0.14702406525611877, "rewards/num_nodes_reward/mean": 0.8203125, "rewards/num_nodes_reward/std": 0.3854354918003082, "rewards/tree_correctness_reward/mean": 0.34375, "rewards/tree_correctness_reward/std": 0.47682511806488037, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6761458516120911, "sampling/importance_sampling_ratio/min": 2.3838521382698358e-29, "sampling/sampling_logp_difference/max": 65.90625, "sampling/sampling_logp_difference/mean": 5.284674167633057, "step": 255 }, { "clip_ratio/high_max": 0.025717246811836958, "clip_ratio/high_mean": 0.006516473134979606, "clip_ratio/low_mean": 0.004689787223469466, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011206260183826089, "completions/clipped_ratio": 1.0, "completions/max_length": 238.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 206.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 171.0, "completions/min_terminated_length": 0.0, "entropy": 1.4468669146299362, "epoch": 128.0, "frac_reward_zero_std": 0.3125, "grad_norm": 0.014245498925447464, "kl": 0.4425293244421482, "learning_rate": 8e-05, "loss": -0.0006, "num_tokens": 26591440.0, "reward": 0.40703123807907104, "reward_std": 0.17153030633926392, "rewards/num_nodes_reward/mean": 0.8828125, "rewards/num_nodes_reward/std": 0.322907418012619, "rewards/tree_correctness_reward/mean": 0.203125, "rewards/tree_correctness_reward/std": 0.40390563011169434, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6667509078979492, "sampling/importance_sampling_ratio/min": 3.1269332749387515e-30, "sampling/sampling_logp_difference/max": 67.9375, "sampling/sampling_logp_difference/mean": 5.424140930175781, "step": 256 }, { "clip_ratio/high_max": 0.021360022947192192, "clip_ratio/high_mean": 0.005456253100419417, "clip_ratio/low_mean": 0.006248256715480238, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011704509786795825, "completions/clipped_ratio": 1.0, "completions/max_length": 276.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 203.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 157.0, "completions/min_terminated_length": 0.0, "entropy": 1.4726988077163696, "epoch": 128.5, "frac_reward_zero_std": 0.25, "grad_norm": 0.011382337659597397, "kl": 0.3785170018672943, "learning_rate": 8e-05, "loss": 0.0003, "num_tokens": 26681584.0, "reward": 0.39531248807907104, "reward_std": 0.20351168513298035, "rewards/num_nodes_reward/mean": 0.734375, "rewards/num_nodes_reward/std": 0.44340085983276367, "rewards/tree_correctness_reward/mean": 0.25, "rewards/tree_correctness_reward/std": 0.434714138507843, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6896395683288574, "sampling/importance_sampling_ratio/min": 1.6904042099061981e-29, "sampling/sampling_logp_difference/max": 66.25, "sampling/sampling_logp_difference/mean": 5.114596366882324, "step": 257 }, { "clip_ratio/high_max": 0.009230605443008244, "clip_ratio/high_mean": 0.002176246591261588, "clip_ratio/low_mean": 0.005780531879281625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007956778397783637, "completions/clipped_ratio": 1.0, "completions/max_length": 221.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 201.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 176.0, "completions/min_terminated_length": 0.0, "entropy": 1.3636512905359268, "epoch": 129.0, "frac_reward_zero_std": 0.4375, "grad_norm": 0.011266940273344517, "kl": 0.35056428983807564, "learning_rate": 8e-05, "loss": 0.0023, "num_tokens": 26771472.0, "reward": 0.4453125, "reward_std": 0.16939949989318848, "rewards/num_nodes_reward/mean": 0.9375, "rewards/num_nodes_reward/std": 0.24301259219646454, "rewards/tree_correctness_reward/mean": 0.234375, "rewards/tree_correctness_reward/std": 0.42527204751968384, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6898045539855957, "sampling/importance_sampling_ratio/min": 1.5066409121314163e-28, "sampling/sampling_logp_difference/max": 64.0625, "sampling/sampling_logp_difference/mean": 5.095357894897461, "step": 258 }, { "clip_ratio/high_max": 0.023049293347867206, "clip_ratio/high_mean": 0.005393661085690837, "clip_ratio/low_mean": 0.00799722783267498, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013390889158472419, "completions/clipped_ratio": 1.0, "completions/max_length": 264.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 208.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 171.0, "completions/min_terminated_length": 0.0, "entropy": 1.518968641757965, "epoch": 129.5, "frac_reward_zero_std": 0.125, "grad_norm": 0.016031689941883087, "kl": 0.3549388758838177, "learning_rate": 8e-05, "loss": 0.0026, "num_tokens": 26862304.0, "reward": 0.41484373807907104, "reward_std": 0.243071049451828, "rewards/num_nodes_reward/mean": 0.78125, "rewards/num_nodes_reward/std": 0.41502299904823303, "rewards/tree_correctness_reward/mean": 0.2578125, "rewards/tree_correctness_reward/std": 0.43914902210235596, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.663368821144104, "sampling/importance_sampling_ratio/min": 4.3165985204907103e-29, "sampling/sampling_logp_difference/max": 65.3125, "sampling/sampling_logp_difference/mean": 5.480572700500488, "step": 259 }, { "clip_ratio/high_max": 0.017328122106846422, "clip_ratio/high_mean": 0.003667845405288972, "clip_ratio/low_mean": 0.005914052278967574, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0095818976406008, "completions/clipped_ratio": 1.0, "completions/max_length": 227.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 203.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 165.0, "completions/min_terminated_length": 0.0, "entropy": 1.6164878606796265, "epoch": 130.0, "frac_reward_zero_std": 0.375, "grad_norm": 0.011551850475370884, "kl": 0.33223435655236244, "learning_rate": 8e-05, "loss": -0.0022, "num_tokens": 26952528.0, "reward": 0.5171874761581421, "reward_std": 0.1946006566286087, "rewards/num_nodes_reward/mean": 0.8671875, "rewards/num_nodes_reward/std": 0.3407054841518402, "rewards/tree_correctness_reward/mean": 0.3671875, "rewards/tree_correctness_reward/std": 0.4839322865009308, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6433292627334595, "sampling/importance_sampling_ratio/min": 2.4352582929827077e-30, "sampling/sampling_logp_difference/max": 68.1875, "sampling/sampling_logp_difference/mean": 5.747242450714111, "step": 260 }, { "epoch": 130.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 208.2, "eval_completions/max_terminated_length": 0.0, "eval_completions/mean_length": 161.85, "eval_completions/mean_terminated_length": 0.0, "eval_completions/min_length": 123.0, "eval_completions/min_terminated_length": 0.0, "eval_entropy": 0.8822226405143738, "eval_frac_reward_zero_std": 0.5, "eval_kl": 0.556795597076416, "eval_loss": 2.3516482542618178e-05, "eval_num_tokens": 26952528.0, "eval_reward": 0.32499998807907104, "eval_reward_std": 0.14788837134838104, "eval_rewards/num_nodes_reward/mean": 0.7625, "eval_rewards/num_nodes_reward/std": 0.11831300854682922, "eval_rewards/tree_correctness_reward/mean": 0.1375, "eval_rewards/tree_correctness_reward/std": 0.25698786973953247, "eval_runtime": 28.5484, "eval_samples_per_second": 0.35, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 0.8697314739227295, "eval_sampling/importance_sampling_ratio/min": 3.220089698842448e-26, "eval_sampling/sampling_logp_difference/max": 61.1375, "eval_sampling/sampling_logp_difference/mean": 2.3184921026229857, "eval_steps_per_second": 0.035, "step": 260 }, { "clip_ratio/high_max": 0.01816109975334257, "clip_ratio/high_mean": 0.004647945228498429, "clip_ratio/low_mean": 0.007459521701093763, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012107467278838158, "completions/clipped_ratio": 1.0, "completions/max_length": 260.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 203.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 166.0, "completions/min_terminated_length": 0.0, "entropy": 1.4319838285446167, "epoch": 130.5, "frac_reward_zero_std": 0.3125, "grad_norm": 0.014133859425783157, "kl": 0.464911513030529, "learning_rate": 8e-05, "loss": 0.0013, "num_tokens": 27042704.0, "reward": 0.4007812440395355, "reward_std": 0.20144200325012207, "rewards/num_nodes_reward/mean": 0.7890625, "rewards/num_nodes_reward/std": 0.4095771610736847, "rewards/tree_correctness_reward/mean": 0.234375, "rewards/tree_correctness_reward/std": 0.42527204751968384, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6769293546676636, "sampling/importance_sampling_ratio/min": 1.9345652124047606e-28, "sampling/sampling_logp_difference/max": 63.8125, "sampling/sampling_logp_difference/mean": 5.312958717346191, "step": 261 }, { "clip_ratio/high_max": 0.03425805550068617, "clip_ratio/high_mean": 0.009544478263705969, "clip_ratio/low_mean": 0.005522388993995264, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015066867577843368, "completions/clipped_ratio": 1.0, "completions/max_length": 237.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 195.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 172.0, "completions/min_terminated_length": 0.0, "entropy": 1.475217655301094, "epoch": 131.0, "frac_reward_zero_std": 0.1875, "grad_norm": 0.011738593690097332, "kl": 0.33808913454413414, "learning_rate": 8e-05, "loss": 0.0015, "num_tokens": 27131824.0, "reward": 0.5406249761581421, "reward_std": 0.21771328151226044, "rewards/num_nodes_reward/mean": 0.890625, "rewards/num_nodes_reward/std": 0.31333550810813904, "rewards/tree_correctness_reward/mean": 0.390625, "rewards/tree_correctness_reward/std": 0.4898075461387634, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6636722087860107, "sampling/importance_sampling_ratio/min": 1.0354981420529148e-28, "sampling/sampling_logp_difference/max": 64.4375, "sampling/sampling_logp_difference/mean": 5.505125045776367, "step": 262 }, { "clip_ratio/high_max": 0.01696615543914959, "clip_ratio/high_mean": 0.003927611833205447, "clip_ratio/low_mean": 0.004773441673023626, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00870105333160609, "completions/clipped_ratio": 1.0, "completions/max_length": 270.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 204.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 123.0, "completions/min_terminated_length": 0.0, "entropy": 1.5604054629802704, "epoch": 131.5, "frac_reward_zero_std": 0.375, "grad_norm": 0.028527192771434784, "kl": 0.35327450558543205, "learning_rate": 8e-05, "loss": 0.0016, "num_tokens": 27222112.0, "reward": 0.44609373807907104, "reward_std": 0.15692488849163055, "rewards/num_nodes_reward/mean": 0.8125, "rewards/num_nodes_reward/std": 0.39184603095054626, "rewards/tree_correctness_reward/mean": 0.2890625, "rewards/tree_correctness_reward/std": 0.45510825514793396, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6690828800201416, "sampling/importance_sampling_ratio/min": 2.966751838288923e-29, "sampling/sampling_logp_difference/max": 65.6875, "sampling/sampling_logp_difference/mean": 5.38098669052124, "step": 263 }, { "clip_ratio/high_max": 0.01454741129418835, "clip_ratio/high_mean": 0.002930003662186209, "clip_ratio/low_mean": 0.006427550601074472, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009357554139569402, "completions/clipped_ratio": 1.0, "completions/max_length": 222.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 197.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 172.0, "completions/min_terminated_length": 0.0, "entropy": 1.5302696377038956, "epoch": 132.0, "frac_reward_zero_std": 0.375, "grad_norm": 0.012173857539892197, "kl": 0.350512333214283, "learning_rate": 8e-05, "loss": 0.0001, "num_tokens": 27311568.0, "reward": 0.4632812440395355, "reward_std": 0.19342269003391266, "rewards/num_nodes_reward/mean": 0.8515625, "rewards/num_nodes_reward/std": 0.356930136680603, "rewards/tree_correctness_reward/mean": 0.296875, "rewards/tree_correctness_reward/std": 0.45867621898651123, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6602640151977539, "sampling/importance_sampling_ratio/min": 8.584580550654011e-29, "sampling/sampling_logp_difference/max": 64.625, "sampling/sampling_logp_difference/mean": 5.524335861206055, "step": 264 }, { "clip_ratio/high_max": 0.01935220032464713, "clip_ratio/high_mean": 0.006029968644725159, "clip_ratio/low_mean": 0.006051427510101348, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012081395951099694, "completions/clipped_ratio": 1.0, "completions/max_length": 257.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 210.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 152.0, "completions/min_terminated_length": 0.0, "entropy": 1.7612916976213455, "epoch": 132.5, "frac_reward_zero_std": 0.25, "grad_norm": 0.014193153940141201, "kl": 0.2958354540169239, "learning_rate": 8e-05, "loss": 0.0012, "num_tokens": 27402608.0, "reward": 0.4359375238418579, "reward_std": 0.21139273047447205, "rewards/num_nodes_reward/mean": 0.8515625, "rewards/num_nodes_reward/std": 0.356930136680603, "rewards/tree_correctness_reward/mean": 0.2578125, "rewards/tree_correctness_reward/std": 0.43914902210235596, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6186585426330566, "sampling/importance_sampling_ratio/min": 3.060926502377367e-29, "sampling/sampling_logp_difference/max": 65.65625, "sampling/sampling_logp_difference/mean": 6.0582427978515625, "step": 265 }, { "clip_ratio/high_max": 0.02181376045336947, "clip_ratio/high_mean": 0.004142673446040135, "clip_ratio/low_mean": 0.005181470507523045, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009324143931735307, "completions/clipped_ratio": 1.0, "completions/max_length": 225.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 194.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 156.0, "completions/min_terminated_length": 0.0, "entropy": 1.442167341709137, "epoch": 133.0, "frac_reward_zero_std": 0.3125, "grad_norm": 0.013743567280471325, "kl": 0.34786032512784004, "learning_rate": 8e-05, "loss": 0.0002, "num_tokens": 27491584.0, "reward": 0.4976562559604645, "reward_std": 0.21136781573295593, "rewards/num_nodes_reward/mean": 0.9296875, "rewards/num_nodes_reward/std": 0.2566775679588318, "rewards/tree_correctness_reward/mean": 0.3125, "rewards/tree_correctness_reward/std": 0.4653336703777313, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6722066402435303, "sampling/importance_sampling_ratio/min": 2.103742028355895e-29, "sampling/sampling_logp_difference/max": 66.03125, "sampling/sampling_logp_difference/mean": 5.373363494873047, "step": 266 }, { "clip_ratio/high_max": 0.031373800709843636, "clip_ratio/high_mean": 0.008214229485020041, "clip_ratio/low_mean": 0.006778204871807247, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014992434298619628, "completions/clipped_ratio": 1.0, "completions/max_length": 230.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 192.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 159.0, "completions/min_terminated_length": 0.0, "entropy": 1.468655213713646, "epoch": 133.5, "frac_reward_zero_std": 0.1875, "grad_norm": 0.013552000746130943, "kl": 0.38289405032992363, "learning_rate": 8e-05, "loss": -0.0019, "num_tokens": 27580304.0, "reward": 0.5046875476837158, "reward_std": 0.20822423696517944, "rewards/num_nodes_reward/mean": 0.953125, "rewards/num_nodes_reward/std": 0.21220162510871887, "rewards/tree_correctness_reward/mean": 0.3125, "rewards/tree_correctness_reward/std": 0.4653336703777313, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6776779890060425, "sampling/importance_sampling_ratio/min": 1.8565459107282552e-29, "sampling/sampling_logp_difference/max": 66.15625, "sampling/sampling_logp_difference/mean": 5.298709392547607, "step": 267 }, { "clip_ratio/high_max": 0.018070768448524177, "clip_ratio/high_mean": 0.004153535584919155, "clip_ratio/low_mean": 0.007247960107633844, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011401495721656829, "completions/clipped_ratio": 1.0, "completions/max_length": 282.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 211.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 181.0, "completions/min_terminated_length": 0.0, "entropy": 1.8263465613126755, "epoch": 134.0, "frac_reward_zero_std": 0.3125, "grad_norm": 0.01449547428637743, "kl": 0.3077739290893078, "learning_rate": 8e-05, "loss": -0.0005, "num_tokens": 27671520.0, "reward": 0.4281250238418579, "reward_std": 0.18474993109703064, "rewards/num_nodes_reward/mean": 0.7890625, "rewards/num_nodes_reward/std": 0.4095771610736847, "rewards/tree_correctness_reward/mean": 0.2734375, "rewards/tree_correctness_reward/std": 0.447474867105484, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6207212805747986, "sampling/importance_sampling_ratio/min": 1.5391306882548106e-29, "sampling/sampling_logp_difference/max": 66.34375, "sampling/sampling_logp_difference/mean": 6.040550231933594, "step": 268 }, { "clip_ratio/high_max": 0.018807699962053448, "clip_ratio/high_mean": 0.004679734702222049, "clip_ratio/low_mean": 0.004393344715936109, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009073079359950498, "completions/clipped_ratio": 1.0, "completions/max_length": 274.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 200.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 155.0, "completions/min_terminated_length": 0.0, "entropy": 1.7303829342126846, "epoch": 134.5, "frac_reward_zero_std": 0.4375, "grad_norm": 0.012899080291390419, "kl": 0.3280996084213257, "learning_rate": 8e-05, "loss": 0.0011, "num_tokens": 27761360.0, "reward": 0.4593750238418579, "reward_std": 0.16673080623149872, "rewards/num_nodes_reward/mean": 0.8203125, "rewards/num_nodes_reward/std": 0.3854354918003082, "rewards/tree_correctness_reward/mean": 0.3046875, "rewards/tree_correctness_reward/std": 0.46208351850509644, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6189649105072021, "sampling/importance_sampling_ratio/min": 1.1617967007143025e-29, "sampling/sampling_logp_difference/max": 66.625, "sampling/sampling_logp_difference/mean": 6.093552589416504, "step": 269 }, { "clip_ratio/high_max": 0.02568976319162175, "clip_ratio/high_mean": 0.005712674777896609, "clip_ratio/low_mean": 0.004459422772924881, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010172097594477236, "completions/clipped_ratio": 1.0, "completions/max_length": 275.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 209.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 177.0, "completions/min_terminated_length": 0.0, "entropy": 1.641958937048912, "epoch": 135.0, "frac_reward_zero_std": 0.25, "grad_norm": 0.011590682901442051, "kl": 0.5251788645982742, "learning_rate": 8e-05, "loss": 0.0028, "num_tokens": 27852320.0, "reward": 0.5234375, "reward_std": 0.27633434534072876, "rewards/num_nodes_reward/mean": 0.90625, "rewards/num_nodes_reward/std": 0.29262590408325195, "rewards/tree_correctness_reward/mean": 0.359375, "rewards/tree_correctness_reward/std": 0.481702595949173, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6486040353775024, "sampling/importance_sampling_ratio/min": 2.1705219940814765e-29, "sampling/sampling_logp_difference/max": 66.0, "sampling/sampling_logp_difference/mean": 5.6754231452941895, "step": 270 }, { "epoch": 135.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 203.4, "eval_completions/max_terminated_length": 0.0, "eval_completions/mean_length": 142.975, "eval_completions/mean_terminated_length": 0.0, "eval_completions/min_length": 105.2, "eval_completions/min_terminated_length": 0.0, "eval_entropy": 0.8091652035713196, "eval_frac_reward_zero_std": 0.4, "eval_kl": 0.4407765865325928, "eval_loss": -0.0030600179452449083, "eval_num_tokens": 27852320.0, "eval_reward": 0.26749999523162843, "eval_reward_std": 0.13551807701587676, "eval_rewards/num_nodes_reward/mean": 0.6, "eval_rewards/num_nodes_reward/std": 0.20655910968780516, "eval_rewards/tree_correctness_reward/mean": 0.125, "eval_rewards/tree_correctness_reward/std": 0.2032795548439026, "eval_runtime": 26.02, "eval_samples_per_second": 0.384, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 0.8792747020721435, "eval_sampling/importance_sampling_ratio/min": 5.0736504718421995e-28, "eval_sampling/sampling_logp_difference/max": 63.85, "eval_sampling/sampling_logp_difference/mean": 2.2083408355712892, "eval_steps_per_second": 0.038, "step": 270 }, { "clip_ratio/high_max": 0.023022705514449626, "clip_ratio/high_mean": 0.004466372396564111, "clip_ratio/low_mean": 0.005558801494771615, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010025173658505082, "completions/clipped_ratio": 1.0, "completions/max_length": 375.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 221.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 160.0, "completions/min_terminated_length": 0.0, "entropy": 1.971728801727295, "epoch": 135.5, "frac_reward_zero_std": 0.25, "grad_norm": 0.01196969486773014, "kl": 0.3298344165086746, "learning_rate": 8e-05, "loss": 0.0006, "num_tokens": 27944736.0, "reward": 0.4023437201976776, "reward_std": 0.2264990359544754, "rewards/num_nodes_reward/mean": 0.8125, "rewards/num_nodes_reward/std": 0.39184603095054626, "rewards/tree_correctness_reward/mean": 0.2265625, "rewards/tree_correctness_reward/std": 0.4202519655227661, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6077896356582642, "sampling/importance_sampling_ratio/min": 1.2367262089069031e-29, "sampling/sampling_logp_difference/max": 66.5625, "sampling/sampling_logp_difference/mean": 6.1353759765625, "step": 271 }, { "clip_ratio/high_max": 0.021533466293476522, "clip_ratio/high_mean": 0.005206762041780166, "clip_ratio/low_mean": 0.00739994557807222, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012606707052327693, "completions/clipped_ratio": 1.0, "completions/max_length": 203.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 184.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 130.0, "completions/min_terminated_length": 0.0, "entropy": 1.2578584104776382, "epoch": 136.0, "frac_reward_zero_std": 0.3125, "grad_norm": 0.01214124821126461, "kl": 0.3593839667737484, "learning_rate": 8e-05, "loss": -0.0007, "num_tokens": 28032480.0, "reward": 0.49687498807907104, "reward_std": 0.22950084507465363, "rewards/num_nodes_reward/mean": 0.8359375, "rewards/num_nodes_reward/std": 0.371787428855896, "rewards/tree_correctness_reward/mean": 0.3515625, "rewards/tree_correctness_reward/std": 0.4793342351913452, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6963524222373962, "sampling/importance_sampling_ratio/min": 4.453622116576232e-29, "sampling/sampling_logp_difference/max": 65.28125, "sampling/sampling_logp_difference/mean": 5.106828689575195, "step": 272 }, { "clip_ratio/high_max": 0.022137976484373212, "clip_ratio/high_mean": 0.005252155242487788, "clip_ratio/low_mean": 0.006030655509675853, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011282810824923217, "completions/clipped_ratio": 1.0, "completions/max_length": 258.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 199.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 171.0, "completions/min_terminated_length": 0.0, "entropy": 1.526609182357788, "epoch": 136.5, "frac_reward_zero_std": 0.3125, "grad_norm": 0.014142234809696674, "kl": 0.3566073924303055, "learning_rate": 8e-05, "loss": -0.0007, "num_tokens": 28122144.0, "reward": 0.5562499761581421, "reward_std": 0.2299451231956482, "rewards/num_nodes_reward/mean": 0.9609375, "rewards/num_nodes_reward/std": 0.194504976272583, "rewards/tree_correctness_reward/mean": 0.3828125, "rewards/tree_correctness_reward/std": 0.4879830479621887, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.653571367263794, "sampling/importance_sampling_ratio/min": 7.739234377113275e-30, "sampling/sampling_logp_difference/max": 67.03125, "sampling/sampling_logp_difference/mean": 5.64471960067749, "step": 273 }, { "clip_ratio/high_max": 0.02166517381556332, "clip_ratio/high_mean": 0.004972031572833657, "clip_ratio/low_mean": 0.008195256581529975, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013167287688702345, "completions/clipped_ratio": 1.0, "completions/max_length": 240.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 205.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 142.0, "completions/min_terminated_length": 0.0, "entropy": 1.6952607482671738, "epoch": 137.0, "frac_reward_zero_std": 0.25, "grad_norm": 0.011941192671656609, "kl": 0.3412458375096321, "learning_rate": 8e-05, "loss": -0.0002, "num_tokens": 28212528.0, "reward": 0.45234373211860657, "reward_std": 0.2621420621871948, "rewards/num_nodes_reward/mean": 0.796875, "rewards/num_nodes_reward/std": 0.40390563011169434, "rewards/tree_correctness_reward/mean": 0.3046875, "rewards/tree_correctness_reward/std": 0.46208351850509644, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6407455205917358, "sampling/importance_sampling_ratio/min": 3.1269332749387515e-30, "sampling/sampling_logp_difference/max": 67.9375, "sampling/sampling_logp_difference/mean": 5.747024059295654, "step": 274 }, { "clip_ratio/high_max": 0.022762180131394416, "clip_ratio/high_mean": 0.0065135063414345495, "clip_ratio/low_mean": 0.007005097635556012, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013518604333512485, "completions/clipped_ratio": 1.0, "completions/max_length": 253.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 200.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 117.0, "completions/min_terminated_length": 0.0, "entropy": 1.4818222522735596, "epoch": 137.5, "frac_reward_zero_std": 0.1875, "grad_norm": 0.015233244746923447, "kl": 0.4333213269710541, "learning_rate": 8e-05, "loss": 0.0014, "num_tokens": 28302272.0, "reward": 0.4976562261581421, "reward_std": 0.27320218086242676, "rewards/num_nodes_reward/mean": 0.875, "rewards/num_nodes_reward/std": 0.3320184051990509, "rewards/tree_correctness_reward/mean": 0.3359375, "rewards/tree_correctness_reward/std": 0.47417303919792175, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6725389957427979, "sampling/importance_sampling_ratio/min": 3.8093847075572265e-29, "sampling/sampling_logp_difference/max": 65.4375, "sampling/sampling_logp_difference/mean": 5.3475751876831055, "step": 275 }, { "clip_ratio/high_max": 0.02601588366087526, "clip_ratio/high_mean": 0.0064709407452028245, "clip_ratio/low_mean": 0.005843143124366179, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012314083869569004, "completions/clipped_ratio": 1.0, "completions/max_length": 286.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 196.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 167.0, "completions/min_terminated_length": 0.0, "entropy": 1.406552255153656, "epoch": 138.0, "frac_reward_zero_std": 0.3125, "grad_norm": 0.014785257168114185, "kl": 1.1802620217204094, "learning_rate": 8e-05, "loss": 0.0016, "num_tokens": 28391536.0, "reward": 0.4593749940395355, "reward_std": 0.24755096435546875, "rewards/num_nodes_reward/mean": 0.765625, "rewards/num_nodes_reward/std": 0.42527204751968384, "rewards/tree_correctness_reward/mean": 0.328125, "rewards/tree_correctness_reward/std": 0.4713755249977112, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.678042471408844, "sampling/importance_sampling_ratio/min": 1.137272224775357e-28, "sampling/sampling_logp_difference/max": 64.34375, "sampling/sampling_logp_difference/mean": 5.277191162109375, "step": 276 }, { "clip_ratio/high_max": 0.017517729196697474, "clip_ratio/high_mean": 0.003415988918277435, "clip_ratio/low_mean": 0.00540128254215233, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008817271562293172, "completions/clipped_ratio": 1.0, "completions/max_length": 265.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 205.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 173.0, "completions/min_terminated_length": 0.0, "entropy": 1.5612072348594666, "epoch": 138.5, "frac_reward_zero_std": 0.375, "grad_norm": 0.010506180115044117, "kl": 0.31485839933156967, "learning_rate": 8e-05, "loss": 0.0011, "num_tokens": 28481920.0, "reward": 0.43671876192092896, "reward_std": 0.19460555911064148, "rewards/num_nodes_reward/mean": 0.890625, "rewards/num_nodes_reward/std": 0.31333550810813904, "rewards/tree_correctness_reward/mean": 0.2421875, "rewards/tree_correctness_reward/std": 0.4300905168056488, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6470571756362915, "sampling/importance_sampling_ratio/min": 1.1022820499163511e-28, "sampling/sampling_logp_difference/max": 64.375, "sampling/sampling_logp_difference/mean": 5.692469596862793, "step": 277 }, { "clip_ratio/high_max": 0.021193768829107285, "clip_ratio/high_mean": 0.005028896383009851, "clip_ratio/low_mean": 0.003072678991884459, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00810157525120303, "completions/clipped_ratio": 1.0, "completions/max_length": 307.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 212.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 172.0, "completions/min_terminated_length": 0.0, "entropy": 1.7173904180526733, "epoch": 139.0, "frac_reward_zero_std": 0.375, "grad_norm": 0.011158541776239872, "kl": 0.34404245391488075, "learning_rate": 8e-05, "loss": 0.0007, "num_tokens": 28573184.0, "reward": 0.609375, "reward_std": 0.21277354657649994, "rewards/num_nodes_reward/mean": 0.8828125, "rewards/num_nodes_reward/std": 0.322907418012619, "rewards/tree_correctness_reward/mean": 0.4921875, "rewards/tree_correctness_reward/std": 0.5019033551216125, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.626318097114563, "sampling/importance_sampling_ratio/min": 9.727604926320145e-29, "sampling/sampling_logp_difference/max": 64.5, "sampling/sampling_logp_difference/mean": 5.977054595947266, "step": 278 }, { "clip_ratio/high_max": 0.025871909805573523, "clip_ratio/high_mean": 0.006783321325201541, "clip_ratio/low_mean": 0.005973388615529984, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012756710057146847, "completions/clipped_ratio": 1.0, "completions/max_length": 205.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 182.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 166.0, "completions/min_terminated_length": 0.0, "entropy": 1.3482685089111328, "epoch": 139.5, "frac_reward_zero_std": 0.25, "grad_norm": 0.012485730461776257, "kl": 0.37552617862820625, "learning_rate": 8e-05, "loss": -0.0007, "num_tokens": 28660720.0, "reward": 0.610156238079071, "reward_std": 0.2140398919582367, "rewards/num_nodes_reward/mean": 0.921875, "rewards/num_nodes_reward/std": 0.2694226801395416, "rewards/tree_correctness_reward/mean": 0.4765625, "rewards/tree_correctness_reward/std": 0.5014128684997559, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6662806272506714, "sampling/importance_sampling_ratio/min": 3.3952676397964373e-28, "sampling/sampling_logp_difference/max": 63.25, "sampling/sampling_logp_difference/mean": 5.52789306640625, "step": 279 }, { "clip_ratio/high_max": 0.02780526370042935, "clip_ratio/high_mean": 0.0050757728604367, "clip_ratio/low_mean": 0.006337600148981437, "clip_ratio/low_min": 0.0002615062694530934, "clip_ratio/region_mean": 0.011413372762035578, "completions/clipped_ratio": 1.0, "completions/max_length": 239.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 202.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 181.0, "completions/min_terminated_length": 0.0, "entropy": 1.5730272084474564, "epoch": 140.0, "frac_reward_zero_std": 0.3125, "grad_norm": 0.011252875439822674, "kl": 0.4080684520304203, "learning_rate": 8e-05, "loss": -0.0003, "num_tokens": 28750800.0, "reward": 0.3804687261581421, "reward_std": 0.21953944861888885, "rewards/num_nodes_reward/mean": 0.7578125, "rewards/num_nodes_reward/std": 0.4300905168056488, "rewards/tree_correctness_reward/mean": 0.21875, "rewards/tree_correctness_reward/std": 0.41502299904823303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6425725221633911, "sampling/importance_sampling_ratio/min": 6.479982085325971e-29, "sampling/sampling_logp_difference/max": 64.90625, "sampling/sampling_logp_difference/mean": 5.773411750793457, "step": 280 }, { "epoch": 140.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 171.0, "eval_completions/max_terminated_length": 0.0, "eval_completions/mean_length": 133.45, "eval_completions/mean_terminated_length": 0.0, "eval_completions/min_length": 100.2, "eval_completions/min_terminated_length": 0.0, "eval_entropy": 0.679623830318451, "eval_frac_reward_zero_std": 0.6, "eval_kl": 0.539061677455902, "eval_loss": -0.0027198102325201035, "eval_num_tokens": 28750800.0, "eval_reward": 0.2762500047683716, "eval_reward_std": 0.12932011485099792, "eval_rewards/num_nodes_reward/mean": 0.6875, "eval_rewards/num_nodes_reward/std": 0.1532795548439026, "eval_rewards/tree_correctness_reward/mean": 0.1, "eval_rewards/tree_correctness_reward/std": 0.17636529207229615, "eval_runtime": 24.3198, "eval_samples_per_second": 0.411, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 0.8914477348327636, "eval_sampling/importance_sampling_ratio/min": 6.615339234415925e-27, "eval_sampling/sampling_logp_difference/max": 61.59375, "eval_sampling/sampling_logp_difference/mean": 2.0141668796539305, "eval_steps_per_second": 0.041, "step": 280 }, { "clip_ratio/high_max": 0.026345117250457406, "clip_ratio/high_mean": 0.006032728764694184, "clip_ratio/low_mean": 0.0029017704146099277, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008934499404858798, "completions/clipped_ratio": 1.0, "completions/max_length": 230.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 199.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 180.0, "completions/min_terminated_length": 0.0, "entropy": 1.4038500934839249, "epoch": 140.5, "frac_reward_zero_std": 0.4375, "grad_norm": 0.023313550278544426, "kl": 0.38379572704434395, "learning_rate": 8e-05, "loss": 0.0002, "num_tokens": 28840480.0, "reward": 0.5054687261581421, "reward_std": 0.18933714926242828, "rewards/num_nodes_reward/mean": 0.9375, "rewards/num_nodes_reward/std": 0.24301259219646454, "rewards/tree_correctness_reward/mean": 0.3203125, "rewards/tree_correctness_reward/std": 0.4684300124645233, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.679529070854187, "sampling/importance_sampling_ratio/min": 9.63163282792579e-30, "sampling/sampling_logp_difference/max": 66.8125, "sampling/sampling_logp_difference/mean": 5.271996021270752, "step": 281 }, { "clip_ratio/high_max": 0.019483601325191557, "clip_ratio/high_mean": 0.003854389076877851, "clip_ratio/low_mean": 0.005168210394913331, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009022599377203733, "completions/clipped_ratio": 1.0, "completions/max_length": 287.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 204.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 154.0, "completions/min_terminated_length": 0.0, "entropy": 1.7811193317174911, "epoch": 141.0, "frac_reward_zero_std": 0.4375, "grad_norm": 0.013548043556511402, "kl": 0.3976973704993725, "learning_rate": 8e-05, "loss": -0.0002, "num_tokens": 28930720.0, "reward": 0.4765625, "reward_std": 0.16641299426555634, "rewards/num_nodes_reward/mean": 0.859375, "rewards/num_nodes_reward/std": 0.3490002751350403, "rewards/tree_correctness_reward/mean": 0.3125, "rewards/tree_correctness_reward/std": 0.4653336703777313, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.615733802318573, "sampling/importance_sampling_ratio/min": 2.78700539843237e-29, "sampling/sampling_logp_difference/max": 65.75, "sampling/sampling_logp_difference/mean": 6.114956855773926, "step": 282 }, { "clip_ratio/high_max": 0.016556838294491172, "clip_ratio/high_mean": 0.0038581997068831697, "clip_ratio/low_mean": 0.005372343643102795, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00923054339364171, "completions/clipped_ratio": 1.0, "completions/max_length": 243.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 206.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 178.0, "completions/min_terminated_length": 0.0, "entropy": 1.68726846575737, "epoch": 141.5, "frac_reward_zero_std": 0.375, "grad_norm": 0.01072645466774702, "kl": 0.34269992262125015, "learning_rate": 8e-05, "loss": -0.0001, "num_tokens": 29021264.0, "reward": 0.5023437142372131, "reward_std": 0.19632481038570404, "rewards/num_nodes_reward/mean": 0.890625, "rewards/num_nodes_reward/std": 0.31333550810813904, "rewards/tree_correctness_reward/mean": 0.3359375, "rewards/tree_correctness_reward/std": 0.47417303919792175, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6283835172653198, "sampling/importance_sampling_ratio/min": 1.4013943476265886e-29, "sampling/sampling_logp_difference/max": 66.4375, "sampling/sampling_logp_difference/mean": 5.951109886169434, "step": 283 }, { "clip_ratio/high_max": 0.02013612270820886, "clip_ratio/high_mean": 0.004443193000042811, "clip_ratio/low_mean": 0.004580227585393004, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0090234205708839, "completions/clipped_ratio": 1.0, "completions/max_length": 216.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 182.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 168.0, "completions/min_terminated_length": 0.0, "entropy": 1.2570075690746307, "epoch": 142.0, "frac_reward_zero_std": 0.4375, "grad_norm": 0.010058355517685413, "kl": 0.4408813491463661, "learning_rate": 8e-05, "loss": 0.0002, "num_tokens": 29108800.0, "reward": 0.47265625, "reward_std": 0.20321674644947052, "rewards/num_nodes_reward/mean": 0.828125, "rewards/num_nodes_reward/std": 0.3787541687488556, "rewards/tree_correctness_reward/mean": 0.3203125, "rewards/tree_correctness_reward/std": 0.4684300124645233, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6787062883377075, "sampling/importance_sampling_ratio/min": 6.08737946060331e-29, "sampling/sampling_logp_difference/max": 64.96875, "sampling/sampling_logp_difference/mean": 5.345096588134766, "step": 284 }, { "clip_ratio/high_max": 0.021406202809885144, "clip_ratio/high_mean": 0.005671892897225916, "clip_ratio/low_mean": 0.006177507893880829, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011849401169456542, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 184.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 154.0, "completions/min_terminated_length": 0.0, "entropy": 1.3532821089029312, "epoch": 142.5, "frac_reward_zero_std": 0.3125, "grad_norm": 0.013194461353123188, "kl": 0.39156312867999077, "learning_rate": 8e-05, "loss": 0.0018, "num_tokens": 29196544.0, "reward": 0.5484374761581421, "reward_std": 0.21492403745651245, "rewards/num_nodes_reward/mean": 0.8984375, "rewards/num_nodes_reward/std": 0.3032590448856354, "rewards/tree_correctness_reward/mean": 0.3984375, "rewards/tree_correctness_reward/std": 0.4915000796318054, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6738592386245728, "sampling/importance_sampling_ratio/min": 6.685678625623819e-29, "sampling/sampling_logp_difference/max": 64.875, "sampling/sampling_logp_difference/mean": 5.39508581161499, "step": 285 }, { "clip_ratio/high_max": 0.039809594134567305, "clip_ratio/high_mean": 0.007061144162435085, "clip_ratio/low_mean": 0.0056987349526025355, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012759878823999316, "completions/clipped_ratio": 1.0, "completions/max_length": 267.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 182.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 154.0, "completions/min_terminated_length": 0.0, "entropy": 1.3077737540006638, "epoch": 143.0, "frac_reward_zero_std": 0.3125, "grad_norm": 0.018616918474435806, "kl": 0.38564640283584595, "learning_rate": 8e-05, "loss": 0.0017, "num_tokens": 29284032.0, "reward": 0.526562511920929, "reward_std": 0.20556065440177917, "rewards/num_nodes_reward/mean": 0.8984375, "rewards/num_nodes_reward/std": 0.3032590448856354, "rewards/tree_correctness_reward/mean": 0.3671875, "rewards/tree_correctness_reward/std": 0.4839322865009308, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6906293630599976, "sampling/importance_sampling_ratio/min": 2.78700539843237e-29, "sampling/sampling_logp_difference/max": 65.75, "sampling/sampling_logp_difference/mean": 5.183629035949707, "step": 286 }, { "clip_ratio/high_max": 0.03593875985825434, "clip_ratio/high_mean": 0.009035233990289271, "clip_ratio/low_mean": 0.004929980670567602, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013965214253403246, "completions/clipped_ratio": 1.0, "completions/max_length": 202.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 174.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 131.0, "completions/min_terminated_length": 0.0, "entropy": 1.2435108125209808, "epoch": 143.5, "frac_reward_zero_std": 0.3125, "grad_norm": 0.010237528011202812, "kl": 0.4473017416894436, "learning_rate": 8e-05, "loss": -0.0004, "num_tokens": 29370432.0, "reward": 0.6773437261581421, "reward_std": 0.220424622297287, "rewards/num_nodes_reward/mean": 0.9453125, "rewards/num_nodes_reward/std": 0.22826264798641205, "rewards/tree_correctness_reward/mean": 0.5625, "rewards/tree_correctness_reward/std": 0.49802759289741516, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6920973658561707, "sampling/importance_sampling_ratio/min": 5.48793925732239e-30, "sampling/sampling_logp_difference/max": 67.375, "sampling/sampling_logp_difference/mean": 5.190129280090332, "step": 287 }, { "clip_ratio/high_max": 0.02480855560861528, "clip_ratio/high_mean": 0.008265561453299597, "clip_ratio/low_mean": 0.005769440133008175, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014035001513548195, "completions/clipped_ratio": 1.0, "completions/max_length": 228.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 195.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 170.0, "completions/min_terminated_length": 0.0, "entropy": 1.5231626480817795, "epoch": 144.0, "frac_reward_zero_std": 0.1875, "grad_norm": 0.013437200337648392, "kl": 0.370336402207613, "learning_rate": 8e-05, "loss": 0.0013, "num_tokens": 29459632.0, "reward": 0.5968749523162842, "reward_std": 0.2769577205181122, "rewards/num_nodes_reward/mean": 0.96875, "rewards/num_nodes_reward/std": 0.1746762990951538, "rewards/tree_correctness_reward/mean": 0.4375, "rewards/tree_correctness_reward/std": 0.49802759289741516, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6480154395103455, "sampling/importance_sampling_ratio/min": 2.239421836915376e-29, "sampling/sampling_logp_difference/max": 65.96875, "sampling/sampling_logp_difference/mean": 5.728247165679932, "step": 288 }, { "clip_ratio/high_max": 0.02533687569666654, "clip_ratio/high_mean": 0.00622320527327247, "clip_ratio/low_mean": 0.0045776970946462825, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010800902848131955, "completions/clipped_ratio": 1.0, "completions/max_length": 224.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 180.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 140.0, "completions/min_terminated_length": 0.0, "entropy": 1.249767705798149, "epoch": 144.5, "frac_reward_zero_std": 0.375, "grad_norm": 0.01057086419314146, "kl": 0.836896512657404, "learning_rate": 8e-05, "loss": 0.0018, "num_tokens": 29546848.0, "reward": 0.47890621423721313, "reward_std": 0.1945556402206421, "rewards/num_nodes_reward/mean": 0.8125, "rewards/num_nodes_reward/std": 0.39184603095054626, "rewards/tree_correctness_reward/mean": 0.3359375, "rewards/tree_correctness_reward/std": 0.47417303919792175, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.679122269153595, "sampling/importance_sampling_ratio/min": 8.320461527346283e-29, "sampling/sampling_logp_difference/max": 64.65625, "sampling/sampling_logp_difference/mean": 5.351059913635254, "step": 289 }, { "clip_ratio/high_max": 0.032250113785266876, "clip_ratio/high_mean": 0.009459773194976151, "clip_ratio/low_mean": 0.005145936185726896, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014605709468014538, "completions/clipped_ratio": 1.0, "completions/max_length": 259.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 194.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 157.0, "completions/min_terminated_length": 0.0, "entropy": 1.532029151916504, "epoch": 145.0, "frac_reward_zero_std": 0.1875, "grad_norm": 0.01341152936220169, "kl": 0.4618722125887871, "learning_rate": 8e-05, "loss": 0.0015, "num_tokens": 29635808.0, "reward": 0.5765625238418579, "reward_std": 0.2640123665332794, "rewards/num_nodes_reward/mean": 0.9375, "rewards/num_nodes_reward/std": 0.24301259219646454, "rewards/tree_correctness_reward/mean": 0.421875, "rewards/tree_correctness_reward/std": 0.4957992732524872, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6463807821273804, "sampling/importance_sampling_ratio/min": 1.7994259883065448e-29, "sampling/sampling_logp_difference/max": 66.1875, "sampling/sampling_logp_difference/mean": 5.729488372802734, "step": 290 }, { "epoch": 145.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 175.0, "eval_completions/max_terminated_length": 0.0, "eval_completions/mean_length": 129.55, "eval_completions/mean_terminated_length": 0.0, "eval_completions/min_length": 101.8, "eval_completions/min_terminated_length": 0.0, "eval_entropy": 0.643105399608612, "eval_frac_reward_zero_std": 0.7, "eval_kl": 0.6451162576675415, "eval_loss": -0.0010791693348437548, "eval_num_tokens": 29635808.0, "eval_reward": 0.18749999701976777, "eval_reward_std": 0.10016381293535233, "eval_rewards/num_nodes_reward/mean": 0.45, "eval_rewards/num_nodes_reward/std": 0.30149178504943847, "eval_rewards/tree_correctness_reward/mean": 0.075, "eval_rewards/tree_correctness_reward/std": 0.1894427239894867, "eval_runtime": 24.3834, "eval_samples_per_second": 0.41, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 0.8907423496246338, "eval_sampling/importance_sampling_ratio/min": 2.059931241203754e-27, "eval_sampling/sampling_logp_difference/max": 62.41875, "eval_sampling/sampling_logp_difference/mean": 2.0502917766571045, "eval_steps_per_second": 0.041, "step": 290 }, { "clip_ratio/high_max": 0.017742031486704946, "clip_ratio/high_mean": 0.004272695994586684, "clip_ratio/low_mean": 0.005113868057378568, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0093865639064461, "completions/clipped_ratio": 1.0, "completions/max_length": 255.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 190.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 149.0, "completions/min_terminated_length": 0.0, "entropy": 1.5020641535520554, "epoch": 145.5, "frac_reward_zero_std": 0.1875, "grad_norm": 0.012238342314958572, "kl": 0.4286966063082218, "learning_rate": 8e-05, "loss": 0.0004, "num_tokens": 29724256.0, "reward": 0.49921876192092896, "reward_std": 0.26731011271476746, "rewards/num_nodes_reward/mean": 0.8984375, "rewards/num_nodes_reward/std": 0.3032590448856354, "rewards/tree_correctness_reward/mean": 0.328125, "rewards/tree_correctness_reward/std": 0.4713755249977112, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6484622955322266, "sampling/importance_sampling_ratio/min": 1.1617967007143025e-29, "sampling/sampling_logp_difference/max": 66.625, "sampling/sampling_logp_difference/mean": 5.725001811981201, "step": 291 }, { "clip_ratio/high_max": 0.025488721672445536, "clip_ratio/high_mean": 0.006657244768575765, "clip_ratio/low_mean": 0.004128302010940388, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010785546706756577, "completions/clipped_ratio": 1.0, "completions/max_length": 254.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 183.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 155.0, "completions/min_terminated_length": 0.0, "entropy": 1.5586745589971542, "epoch": 146.0, "frac_reward_zero_std": 0.375, "grad_norm": 0.010065633803606033, "kl": 0.37109317630529404, "learning_rate": 8e-05, "loss": 0.0014, "num_tokens": 29811888.0, "reward": 0.51171875, "reward_std": 0.18748408555984497, "rewards/num_nodes_reward/mean": 0.8125, "rewards/num_nodes_reward/std": 0.39184603095054626, "rewards/tree_correctness_reward/mean": 0.3828125, "rewards/tree_correctness_reward/std": 0.4879830479621887, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6436420679092407, "sampling/importance_sampling_ratio/min": 2.3838521382698358e-29, "sampling/sampling_logp_difference/max": 65.90625, "sampling/sampling_logp_difference/mean": 5.806795120239258, "step": 292 }, { "clip_ratio/high_max": 0.025016335130203515, "clip_ratio/high_mean": 0.00456063830642961, "clip_ratio/low_mean": 0.004043015782372095, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008603653957834467, "completions/clipped_ratio": 1.0, "completions/max_length": 217.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 183.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 153.0, "completions/min_terminated_length": 0.0, "entropy": 1.5724337995052338, "epoch": 146.5, "frac_reward_zero_std": 0.4375, "grad_norm": 0.009000114165246487, "kl": 0.3931497074663639, "learning_rate": 8e-05, "loss": 0.0002, "num_tokens": 29899488.0, "reward": 0.4625000059604645, "reward_std": 0.18267835676670074, "rewards/num_nodes_reward/mean": 0.8125, "rewards/num_nodes_reward/std": 0.39184603095054626, "rewards/tree_correctness_reward/mean": 0.3125, "rewards/tree_correctness_reward/std": 0.4653336703777313, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6293280124664307, "sampling/importance_sampling_ratio/min": 3.158090871777918e-29, "sampling/sampling_logp_difference/max": 65.625, "sampling/sampling_logp_difference/mean": 6.005258560180664, "step": 293 }, { "clip_ratio/high_max": 0.020906729740090668, "clip_ratio/high_mean": 0.003914748376701027, "clip_ratio/low_mean": 0.00263788983647828, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006552638253197074, "completions/clipped_ratio": 1.0, "completions/max_length": 233.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 173.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 149.0, "completions/min_terminated_length": 0.0, "entropy": 1.2085216343402863, "epoch": 147.0, "frac_reward_zero_std": 0.5625, "grad_norm": 0.00794070865958929, "kl": 0.4154396764934063, "learning_rate": 8e-05, "loss": -0.0011, "num_tokens": 29985760.0, "reward": 0.4625000059604645, "reward_std": 0.1406680792570114, "rewards/num_nodes_reward/mean": 0.8125, "rewards/num_nodes_reward/std": 0.39184603095054626, "rewards/tree_correctness_reward/mean": 0.3125, "rewards/tree_correctness_reward/std": 0.4653336703777313, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6949424743652344, "sampling/importance_sampling_ratio/min": 2.239421836915376e-29, "sampling/sampling_logp_difference/max": 65.96875, "sampling/sampling_logp_difference/mean": 5.113114833831787, "step": 294 }, { "clip_ratio/high_max": 0.02663096075411886, "clip_ratio/high_mean": 0.004912740900181234, "clip_ratio/low_mean": 0.002363187933951849, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007275928830495104, "completions/clipped_ratio": 1.0, "completions/max_length": 220.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 182.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 128.0, "completions/min_terminated_length": 0.0, "entropy": 1.5844163596630096, "epoch": 147.5, "frac_reward_zero_std": 0.625, "grad_norm": 0.008707955479621887, "kl": 0.6722125001251698, "learning_rate": 8e-05, "loss": -0.0004, "num_tokens": 30073184.0, "reward": 0.5499999523162842, "reward_std": 0.13016511499881744, "rewards/num_nodes_reward/mean": 0.8671875, "rewards/num_nodes_reward/std": 0.3407054841518402, "rewards/tree_correctness_reward/mean": 0.4140625, "rewards/tree_correctness_reward/std": 0.49449479579925537, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6281208395957947, "sampling/importance_sampling_ratio/min": 4.788094879269939e-28, "sampling/sampling_logp_difference/max": 62.90625, "sampling/sampling_logp_difference/mean": 6.019742012023926, "step": 295 }, { "clip_ratio/high_max": 0.022610104293562472, "clip_ratio/high_mean": 0.005115991909406148, "clip_ratio/low_mean": 0.005011117085814476, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010127109417226166, "completions/clipped_ratio": 1.0, "completions/max_length": 269.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 181.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 114.0, "completions/min_terminated_length": 0.0, "entropy": 1.4695009142160416, "epoch": 148.0, "frac_reward_zero_std": 0.375, "grad_norm": 0.012045902200043201, "kl": 0.4482559822499752, "learning_rate": 8e-05, "loss": -0.0019, "num_tokens": 30160512.0, "reward": 0.32499998807907104, "reward_std": 0.1809743344783783, "rewards/num_nodes_reward/mean": 0.6640625, "rewards/num_nodes_reward/std": 0.47417303919792175, "rewards/tree_correctness_reward/mean": 0.1796875, "rewards/tree_correctness_reward/std": 0.3854354918003082, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.654472827911377, "sampling/importance_sampling_ratio/min": 4.549661547287001e-30, "sampling/sampling_logp_difference/max": 67.5625, "sampling/sampling_logp_difference/mean": 5.665345668792725, "step": 296 }, { "clip_ratio/high_max": 0.02442181814694777, "clip_ratio/high_mean": 0.005840892503329087, "clip_ratio/low_mean": 0.004923800006508827, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010764692502561957, "completions/clipped_ratio": 1.0, "completions/max_length": 182.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 154.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 125.0, "completions/min_terminated_length": 0.0, "entropy": 1.040582723915577, "epoch": 148.5, "frac_reward_zero_std": 0.3125, "grad_norm": 0.010546150617301464, "kl": 0.5501969456672668, "learning_rate": 8e-05, "loss": -0.0003, "num_tokens": 30244384.0, "reward": 0.47187501192092896, "reward_std": 0.23945362865924835, "rewards/num_nodes_reward/mean": 0.7890625, "rewards/num_nodes_reward/std": 0.4095771610736847, "rewards/tree_correctness_reward/mean": 0.3359375, "rewards/tree_correctness_reward/std": 0.47417303919792175, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7114049196243286, "sampling/importance_sampling_ratio/min": 7.984904797113095e-30, "sampling/sampling_logp_difference/max": 67.0, "sampling/sampling_logp_difference/mean": 5.019596099853516, "step": 297 }, { "clip_ratio/high_max": 0.028666591038927436, "clip_ratio/high_mean": 0.007864236744353548, "clip_ratio/low_mean": 0.0035919283400289714, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011456165404524654, "completions/clipped_ratio": 1.0, "completions/max_length": 189.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 158.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 129.0, "completions/min_terminated_length": 0.0, "entropy": 1.1407744511961937, "epoch": 149.0, "frac_reward_zero_std": 0.375, "grad_norm": 0.013824915513396263, "kl": 0.4256378635764122, "learning_rate": 8e-05, "loss": 0.0012, "num_tokens": 30328816.0, "reward": 0.555468738079071, "reward_std": 0.19614484906196594, "rewards/num_nodes_reward/mean": 0.8671875, "rewards/num_nodes_reward/std": 0.3407054841518402, "rewards/tree_correctness_reward/mean": 0.421875, "rewards/tree_correctness_reward/std": 0.4957992732524872, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6818726062774658, "sampling/importance_sampling_ratio/min": 7.816349966256032e-29, "sampling/sampling_logp_difference/max": 64.71875, "sampling/sampling_logp_difference/mean": 5.386983394622803, "step": 298 }, { "clip_ratio/high_max": 0.03126078034983948, "clip_ratio/high_mean": 0.009523068350972608, "clip_ratio/low_mean": 0.0031620322115486488, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012685100547969341, "completions/clipped_ratio": 1.0, "completions/max_length": 182.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 161.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 126.0, "completions/min_terminated_length": 0.0, "entropy": 1.2237948328256607, "epoch": 149.5, "frac_reward_zero_std": 0.375, "grad_norm": 0.012663435190916061, "kl": 0.42006711289286613, "learning_rate": 8e-05, "loss": 0.0012, "num_tokens": 30413584.0, "reward": 0.542187511920929, "reward_std": 0.17690931260585785, "rewards/num_nodes_reward/mean": 0.859375, "rewards/num_nodes_reward/std": 0.3490002751350403, "rewards/tree_correctness_reward/mean": 0.40625, "rewards/tree_correctness_reward/std": 0.4930621087551117, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6678721904754639, "sampling/importance_sampling_ratio/min": 7.816349966256032e-29, "sampling/sampling_logp_difference/max": 64.71875, "sampling/sampling_logp_difference/mean": 5.5893168449401855, "step": 299 }, { "clip_ratio/high_max": 0.0156419996637851, "clip_ratio/high_mean": 0.005518190242582932, "clip_ratio/low_mean": 0.006011124904034659, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011529315321240574, "completions/clipped_ratio": 1.0, "completions/max_length": 212.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 173.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 123.0, "completions/min_terminated_length": 0.0, "entropy": 1.4055225402116776, "epoch": 150.0, "frac_reward_zero_std": 0.1875, "grad_norm": 0.010921507142484188, "kl": 0.4839504174888134, "learning_rate": 8e-05, "loss": 0.0007, "num_tokens": 30499968.0, "reward": 0.609375, "reward_std": 0.25246959924697876, "rewards/num_nodes_reward/mean": 0.8828125, "rewards/num_nodes_reward/std": 0.322907418012619, "rewards/tree_correctness_reward/mean": 0.4921875, "rewards/tree_correctness_reward/std": 0.5019033551216125, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6448041200637817, "sampling/importance_sampling_ratio/min": 3.3617703013855747e-29, "sampling/sampling_logp_difference/max": 65.5625, "sampling/sampling_logp_difference/mean": 5.848965644836426, "step": 300 }, { "epoch": 150.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 156.6, "eval_completions/max_terminated_length": 0.0, "eval_completions/mean_length": 117.5, "eval_completions/mean_terminated_length": 0.0, "eval_completions/min_length": 94.6, "eval_completions/min_terminated_length": 0.0, "eval_entropy": 0.49947993755340575, "eval_frac_reward_zero_std": 0.6, "eval_kl": 0.5897437930107117, "eval_loss": -9.491017408436164e-05, "eval_num_tokens": 30499968.0, "eval_reward": 0.19000000059604644, "eval_reward_std": 0.12009179294109344, "eval_rewards/num_nodes_reward/mean": 0.4875, "eval_rewards/num_nodes_reward/std": 0.3057490706443787, "eval_rewards/tree_correctness_reward/mean": 0.0625, "eval_rewards/tree_correctness_reward/std": 0.14893558621406555, "eval_runtime": 22.6559, "eval_samples_per_second": 0.441, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 0.9140603065490722, "eval_sampling/importance_sampling_ratio/min": 9.718463591320783e-27, "eval_sampling/sampling_logp_difference/max": 61.85, "eval_sampling/sampling_logp_difference/mean": 1.7179163217544555, "eval_steps_per_second": 0.044, "step": 300 }, { "clip_ratio/high_max": 0.03177051595412195, "clip_ratio/high_mean": 0.008704571810085326, "clip_ratio/low_mean": 0.004658132849726826, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013362704776227474, "completions/clipped_ratio": 1.0, "completions/max_length": 213.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 169.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 125.0, "completions/min_terminated_length": 0.0, "entropy": 1.334066852927208, "epoch": 150.5, "frac_reward_zero_std": 0.375, "grad_norm": 0.014517316594719887, "kl": 0.4370826594531536, "learning_rate": 8e-05, "loss": 0.0001, "num_tokens": 30585744.0, "reward": 0.578125, "reward_std": 0.18953277170658112, "rewards/num_nodes_reward/mean": 0.796875, "rewards/num_nodes_reward/std": 0.40390563011169434, "rewards/tree_correctness_reward/mean": 0.484375, "rewards/tree_correctness_reward/std": 0.5017194747924805, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6613467335700989, "sampling/importance_sampling_ratio/min": 4.8430893875347414e-30, "sampling/sampling_logp_difference/max": 67.5, "sampling/sampling_logp_difference/mean": 5.654513359069824, "step": 301 }, { "clip_ratio/high_max": 0.023745160549879074, "clip_ratio/high_mean": 0.007432055950630456, "clip_ratio/low_mean": 0.004156539507675916, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011588595574721694, "completions/clipped_ratio": 1.0, "completions/max_length": 209.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 166.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 119.0, "completions/min_terminated_length": 0.0, "entropy": 1.40232053399086, "epoch": 151.0, "frac_reward_zero_std": 0.3125, "grad_norm": 0.010078532621264458, "kl": 0.47707053646445274, "learning_rate": 8e-05, "loss": 0.0016, "num_tokens": 30671120.0, "reward": 0.5671875476837158, "reward_std": 0.21736112236976624, "rewards/num_nodes_reward/mean": 0.90625, "rewards/num_nodes_reward/std": 0.29262590408325195, "rewards/tree_correctness_reward/mean": 0.421875, "rewards/tree_correctness_reward/std": 0.4957992732524872, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.656900942325592, "sampling/importance_sampling_ratio/min": 5.319093129561238e-30, "sampling/sampling_logp_difference/max": 67.40625, "sampling/sampling_logp_difference/mean": 5.669802665710449, "step": 302 }, { "clip_ratio/high_max": 0.024324648082256317, "clip_ratio/high_mean": 0.005800963030196726, "clip_ratio/low_mean": 0.0035660035209730268, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009366966725792736, "completions/clipped_ratio": 1.0, "completions/max_length": 202.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 156.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 124.0, "completions/min_terminated_length": 0.0, "entropy": 1.2186279892921448, "epoch": 151.5, "frac_reward_zero_std": 0.4375, "grad_norm": 0.008663905784487724, "kl": 0.39370546489953995, "learning_rate": 8e-05, "loss": -0.0006, "num_tokens": 30755280.0, "reward": 0.586718738079071, "reward_std": 0.17919012904167175, "rewards/num_nodes_reward/mean": 0.84375, "rewards/num_nodes_reward/std": 0.3645188808441162, "rewards/tree_correctness_reward/mean": 0.4765625, "rewards/tree_correctness_reward/std": 0.5014128684997559, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6711858510971069, "sampling/importance_sampling_ratio/min": 7.816349966256032e-29, "sampling/sampling_logp_difference/max": 64.71875, "sampling/sampling_logp_difference/mean": 5.550451278686523, "step": 303 }, { "clip_ratio/high_max": 0.021192373242229223, "clip_ratio/high_mean": 0.005206649919273332, "clip_ratio/low_mean": 0.006721525453031063, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011928175343200564, "completions/clipped_ratio": 1.0, "completions/max_length": 190.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 161.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 133.0, "completions/min_terminated_length": 0.0, "entropy": 1.171515829861164, "epoch": 152.0, "frac_reward_zero_std": 0.25, "grad_norm": 0.013358689844608307, "kl": 0.4549235329031944, "learning_rate": 8e-05, "loss": 0.0013, "num_tokens": 30840096.0, "reward": 0.5492187738418579, "reward_std": 0.2402801215648651, "rewards/num_nodes_reward/mean": 0.9921875, "rewards/num_nodes_reward/std": 0.0883883461356163, "rewards/tree_correctness_reward/mean": 0.359375, "rewards/tree_correctness_reward/std": 0.481702595949173, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6956913471221924, "sampling/importance_sampling_ratio/min": 5.718563876253154e-29, "sampling/sampling_logp_difference/max": 65.03125, "sampling/sampling_logp_difference/mean": 5.16790771484375, "step": 304 }, { "clip_ratio/high_max": 0.02789597201626748, "clip_ratio/high_mean": 0.0066725395154207945, "clip_ratio/low_mean": 0.003829195396974683, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010501734854187816, "completions/clipped_ratio": 1.0, "completions/max_length": 168.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 157.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 144.0, "completions/min_terminated_length": 0.0, "entropy": 1.2845255583524704, "epoch": 152.5, "frac_reward_zero_std": 0.375, "grad_norm": 0.009989631362259388, "kl": 0.4044843427836895, "learning_rate": 8e-05, "loss": -0.0, "num_tokens": 30924368.0, "reward": 0.5390625, "reward_std": 0.20509308576583862, "rewards/num_nodes_reward/mean": 0.921875, "rewards/num_nodes_reward/std": 0.2694226801395416, "rewards/tree_correctness_reward/mean": 0.375, "rewards/tree_correctness_reward/std": 0.4860251843929291, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6647681593894958, "sampling/importance_sampling_ratio/min": 1.5066409121314163e-28, "sampling/sampling_logp_difference/max": 64.0625, "sampling/sampling_logp_difference/mean": 5.585759162902832, "step": 305 }, { "clip_ratio/high_max": 0.035146570298820734, "clip_ratio/high_mean": 0.008085156878223643, "clip_ratio/low_mean": 0.004348238464444876, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012433395138941705, "completions/clipped_ratio": 1.0, "completions/max_length": 190.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 170.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 141.0, "completions/min_terminated_length": 0.0, "entropy": 1.4398539364337921, "epoch": 153.0, "frac_reward_zero_std": 0.375, "grad_norm": 0.010194714181125164, "kl": 0.40519535541534424, "learning_rate": 8e-05, "loss": 0.0019, "num_tokens": 31010304.0, "reward": 0.6453125476837158, "reward_std": 0.2239047735929489, "rewards/num_nodes_reward/mean": 0.984375, "rewards/num_nodes_reward/std": 0.12450689822435379, "rewards/tree_correctness_reward/mean": 0.5, "rewards/tree_correctness_reward/std": 0.5019646286964417, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6400289535522461, "sampling/importance_sampling_ratio/min": 1.9154790660999508e-29, "sampling/sampling_logp_difference/max": 66.125, "sampling/sampling_logp_difference/mean": 5.8977766036987305, "step": 306 }, { "clip_ratio/high_max": 0.03687068563885987, "clip_ratio/high_mean": 0.00920431932900101, "clip_ratio/low_mean": 0.002837173582520336, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012041492969729006, "completions/clipped_ratio": 1.0, "completions/max_length": 215.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 178.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 136.0, "completions/min_terminated_length": 0.0, "entropy": 1.5998101234436035, "epoch": 153.5, "frac_reward_zero_std": 0.4375, "grad_norm": 0.01575559563934803, "kl": 0.4199545718729496, "learning_rate": 8e-05, "loss": 0.0007, "num_tokens": 31097280.0, "reward": 0.640625, "reward_std": 0.20301656424999237, "rewards/num_nodes_reward/mean": 0.96875, "rewards/num_nodes_reward/std": 0.1746762990951538, "rewards/tree_correctness_reward/mean": 0.5, "rewards/tree_correctness_reward/std": 0.5019646286964417, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6183018684387207, "sampling/importance_sampling_ratio/min": 1.8173558845136485e-28, "sampling/sampling_logp_difference/max": 63.875, "sampling/sampling_logp_difference/mean": 6.179808139801025, "step": 307 }, { "clip_ratio/high_max": 0.032010650960728526, "clip_ratio/high_mean": 0.008647306356579065, "clip_ratio/low_mean": 0.004803696210728958, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01345100230537355, "completions/clipped_ratio": 1.0, "completions/max_length": 219.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 169.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 133.0, "completions/min_terminated_length": 0.0, "entropy": 1.4974098429083824, "epoch": 154.0, "frac_reward_zero_std": 0.3125, "grad_norm": 0.011305311694741249, "kl": 0.4154079966247082, "learning_rate": 8e-05, "loss": -0.0015, "num_tokens": 31183152.0, "reward": 0.621874988079071, "reward_std": 0.19456961750984192, "rewards/num_nodes_reward/mean": 0.90625, "rewards/num_nodes_reward/std": 0.29262590408325195, "rewards/tree_correctness_reward/mean": 0.5, "rewards/tree_correctness_reward/std": 0.5019646286964417, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6360844373703003, "sampling/importance_sampling_ratio/min": 6.685678625623819e-29, "sampling/sampling_logp_difference/max": 64.875, "sampling/sampling_logp_difference/mean": 5.917601585388184, "step": 308 }, { "clip_ratio/high_max": 0.03559867653530091, "clip_ratio/high_mean": 0.008675011282321066, "clip_ratio/low_mean": 0.005384789372328669, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014059800305403769, "completions/clipped_ratio": 1.0, "completions/max_length": 233.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 161.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 118.0, "completions/min_terminated_length": 0.0, "entropy": 1.4806820452213287, "epoch": 154.5, "frac_reward_zero_std": 0.1875, "grad_norm": 0.03851074352860451, "kl": 0.4034973233938217, "learning_rate": 8e-05, "loss": 0.0029, "num_tokens": 31268000.0, "reward": 0.628125011920929, "reward_std": 0.23752129077911377, "rewards/num_nodes_reward/mean": 0.9453125, "rewards/num_nodes_reward/std": 0.22826264798641205, "rewards/tree_correctness_reward/mean": 0.4921875, "rewards/tree_correctness_reward/std": 0.5019033551216125, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.64024817943573, "sampling/importance_sampling_ratio/min": 1.9959748127583844e-28, "sampling/sampling_logp_difference/max": 63.78125, "sampling/sampling_logp_difference/mean": 5.914041519165039, "step": 309 }, { "clip_ratio/high_max": 0.021364455635193735, "clip_ratio/high_mean": 0.005731431221647654, "clip_ratio/low_mean": 0.0050960774315171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010827508638612926, "completions/clipped_ratio": 1.0, "completions/max_length": 207.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 164.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 133.0, "completions/min_terminated_length": 0.0, "entropy": 1.5851516872644424, "epoch": 155.0, "frac_reward_zero_std": 0.375, "grad_norm": 0.01312279049307108, "kl": 0.45511481165885925, "learning_rate": 8e-05, "loss": 0.0028, "num_tokens": 31353200.0, "reward": 0.48124998807907104, "reward_std": 0.17363451421260834, "rewards/num_nodes_reward/mean": 0.765625, "rewards/num_nodes_reward/std": 0.42527204751968384, "rewards/tree_correctness_reward/mean": 0.359375, "rewards/tree_correctness_reward/std": 0.481702595949173, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6122477054595947, "sampling/importance_sampling_ratio/min": 1.0354981420529148e-28, "sampling/sampling_logp_difference/max": 64.4375, "sampling/sampling_logp_difference/mean": 6.2821784019470215, "step": 310 }, { "epoch": 155.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 147.0, "eval_completions/max_terminated_length": 0.0, "eval_completions/mean_length": 107.7, "eval_completions/mean_terminated_length": 0.0, "eval_completions/min_length": 85.4, "eval_completions/min_terminated_length": 0.0, "eval_entropy": 0.4380474627017975, "eval_frac_reward_zero_std": 0.6, "eval_kl": 0.5463492631912231, "eval_loss": -0.00020257604774087667, "eval_num_tokens": 31353200.0, "eval_reward": 0.2587500035762787, "eval_reward_std": 0.11812961399555207, "eval_rewards/num_nodes_reward/mean": 0.6875, "eval_rewards/num_nodes_reward/std": 0.1532795548439026, "eval_rewards/tree_correctness_reward/mean": 0.075, "eval_rewards/tree_correctness_reward/std": 0.1894427239894867, "eval_runtime": 21.3566, "eval_samples_per_second": 0.468, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 0.9066449880599976, "eval_sampling/importance_sampling_ratio/min": 2.0115718625239196e-26, "eval_sampling/sampling_logp_difference/max": 60.20625, "eval_sampling/sampling_logp_difference/mean": 1.874459409713745, "eval_steps_per_second": 0.047, "step": 310 }, { "clip_ratio/high_max": 0.026608376909280196, "clip_ratio/high_mean": 0.003942876373912441, "clip_ratio/low_mean": 0.003298017443739809, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007240893799462356, "completions/clipped_ratio": 1.0, "completions/max_length": 194.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 168.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 119.0, "completions/min_terminated_length": 0.0, "entropy": 1.5250743925571442, "epoch": 155.5, "frac_reward_zero_std": 0.625, "grad_norm": 0.01308327354490757, "kl": 0.4251590482890606, "learning_rate": 8e-05, "loss": 0.0002, "num_tokens": 31438928.0, "reward": 0.5140625238418579, "reward_std": 0.09187361598014832, "rewards/num_nodes_reward/mean": 0.765625, "rewards/num_nodes_reward/std": 0.42527204751968384, "rewards/tree_correctness_reward/mean": 0.40625, "rewards/tree_correctness_reward/std": 0.4930621087551117, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6248466968536377, "sampling/importance_sampling_ratio/min": 1.0914069705124871e-29, "sampling/sampling_logp_difference/max": 66.6875, "sampling/sampling_logp_difference/mean": 6.091444969177246, "step": 311 }, { "clip_ratio/high_max": 0.03799840249121189, "clip_ratio/high_mean": 0.010932476376183331, "clip_ratio/low_mean": 0.004222424991894513, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015154901309870183, "completions/clipped_ratio": 1.0, "completions/max_length": 227.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 150.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 100.0, "completions/min_terminated_length": 0.0, "entropy": 1.2110442370176315, "epoch": 156.0, "frac_reward_zero_std": 0.3125, "grad_norm": 0.012152692303061485, "kl": 0.4233608841896057, "learning_rate": 8e-05, "loss": 0.0011, "num_tokens": 31522304.0, "reward": 0.553906261920929, "reward_std": 0.20736467838287354, "rewards/num_nodes_reward/mean": 0.8984375, "rewards/num_nodes_reward/std": 0.3032590448856354, "rewards/tree_correctness_reward/mean": 0.40625, "rewards/tree_correctness_reward/std": 0.4930621087551117, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6878225803375244, "sampling/importance_sampling_ratio/min": 9.048082073307534e-30, "sampling/sampling_logp_difference/max": 66.875, "sampling/sampling_logp_difference/mean": 5.283109664916992, "step": 312 }, { "clip_ratio/high_max": 0.037128034862689674, "clip_ratio/high_mean": 0.006904131078044884, "clip_ratio/low_mean": 0.0017401387740392238, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008644269677461125, "completions/clipped_ratio": 1.0, "completions/max_length": 191.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 140.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 110.0, "completions/min_terminated_length": 0.0, "entropy": 1.2141150832176208, "epoch": 156.5, "frac_reward_zero_std": 0.625, "grad_norm": 0.009928014129400253, "kl": 0.5416676700115204, "learning_rate": 8e-05, "loss": 0.0001, "num_tokens": 31604464.0, "reward": 0.48828125, "reward_std": 0.11471328139305115, "rewards/num_nodes_reward/mean": 0.7890625, "rewards/num_nodes_reward/std": 0.4095771610736847, "rewards/tree_correctness_reward/mean": 0.359375, "rewards/tree_correctness_reward/std": 0.481702595949173, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6743477582931519, "sampling/importance_sampling_ratio/min": 2.5628824030209753e-28, "sampling/sampling_logp_difference/max": 63.53125, "sampling/sampling_logp_difference/mean": 5.523463249206543, "step": 313 }, { "clip_ratio/high_max": 0.0366418999619782, "clip_ratio/high_mean": 0.010675784084014595, "clip_ratio/low_mean": 0.006301168788922951, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016976952785626054, "completions/clipped_ratio": 1.0, "completions/max_length": 204.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 153.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 120.0, "completions/min_terminated_length": 0.0, "entropy": 1.4612130969762802, "epoch": 157.0, "frac_reward_zero_std": 0.25, "grad_norm": 0.013412309810519218, "kl": 0.3993598595261574, "learning_rate": 8e-05, "loss": 0.0011, "num_tokens": 31688208.0, "reward": 0.6078125238418579, "reward_std": 0.20962680876255035, "rewards/num_nodes_reward/mean": 0.96875, "rewards/num_nodes_reward/std": 0.1746762990951538, "rewards/tree_correctness_reward/mean": 0.453125, "rewards/tree_correctness_reward/std": 0.4997538626194, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6372522115707397, "sampling/importance_sampling_ratio/min": 6.0273218931633014e-30, "sampling/sampling_logp_difference/max": 67.28125, "sampling/sampling_logp_difference/mean": 6.013955116271973, "step": 314 }, { "clip_ratio/high_max": 0.0337376871611923, "clip_ratio/high_mean": 0.0069356916937977076, "clip_ratio/low_mean": 0.003214751835912466, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010150443820748478, "completions/clipped_ratio": 1.0, "completions/max_length": 150.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 128.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 111.0, "completions/min_terminated_length": 0.0, "entropy": 0.9266814664006233, "epoch": 157.5, "frac_reward_zero_std": 0.5, "grad_norm": 0.009258735924959183, "kl": 0.5398026369512081, "learning_rate": 8e-05, "loss": 0.0005, "num_tokens": 31768800.0, "reward": 0.616406261920929, "reward_std": 0.127866730093956, "rewards/num_nodes_reward/mean": 0.90625, "rewards/num_nodes_reward/std": 0.29262590408325195, "rewards/tree_correctness_reward/mean": 0.4921875, "rewards/tree_correctness_reward/std": 0.5019033551216125, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7230396866798401, "sampling/importance_sampling_ratio/min": 6.685678625623819e-29, "sampling/sampling_logp_difference/max": 64.875, "sampling/sampling_logp_difference/mean": 4.935888290405273, "step": 315 }, { "clip_ratio/high_max": 0.03162668482400477, "clip_ratio/high_mean": 0.008520194693119265, "clip_ratio/low_mean": 0.003508303787384648, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012028498516883701, "completions/clipped_ratio": 1.0, "completions/max_length": 163.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 132.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 113.0, "completions/min_terminated_length": 0.0, "entropy": 1.0894809886813164, "epoch": 158.0, "frac_reward_zero_std": 0.5, "grad_norm": 0.008932342752814293, "kl": 0.459070798009634, "learning_rate": 8e-05, "loss": -0.0, "num_tokens": 31849936.0, "reward": 0.569531261920929, "reward_std": 0.13537189364433289, "rewards/num_nodes_reward/mean": 0.859375, "rewards/num_nodes_reward/std": 0.3490002751350403, "rewards/tree_correctness_reward/mean": 0.4453125, "rewards/tree_correctness_reward/std": 0.4989531338214874, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6944496035575867, "sampling/importance_sampling_ratio/min": 3.692182541718095e-29, "sampling/sampling_logp_difference/max": 65.46875, "sampling/sampling_logp_difference/mean": 5.307049751281738, "step": 316 }, { "clip_ratio/high_max": 0.030640010256320238, "clip_ratio/high_mean": 0.007783941982779652, "clip_ratio/low_mean": 0.0038791327679064125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011663074954412878, "completions/clipped_ratio": 1.0, "completions/max_length": 158.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 123.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 93.0, "completions/min_terminated_length": 0.0, "entropy": 0.9022008329629898, "epoch": 158.5, "frac_reward_zero_std": 0.375, "grad_norm": 0.024805167689919472, "kl": 0.5044564343988895, "learning_rate": 8e-05, "loss": -0.0004, "num_tokens": 31929840.0, "reward": 0.7046874761581421, "reward_std": 0.16497819125652313, "rewards/num_nodes_reward/mean": 0.9453125, "rewards/num_nodes_reward/std": 0.22826264798641205, "rewards/tree_correctness_reward/mean": 0.6015625, "rewards/tree_correctness_reward/std": 0.4915000796318054, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7266812324523926, "sampling/importance_sampling_ratio/min": 7.816349966256032e-29, "sampling/sampling_logp_difference/max": 64.71875, "sampling/sampling_logp_difference/mean": 4.933302879333496, "step": 317 }, { "clip_ratio/high_max": 0.03267290093936026, "clip_ratio/high_mean": 0.009542297339066863, "clip_ratio/low_mean": 0.007472494675312191, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017014792189002037, "completions/clipped_ratio": 1.0, "completions/max_length": 155.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 121.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 104.0, "completions/min_terminated_length": 0.0, "entropy": 0.9160272926092148, "epoch": 159.0, "frac_reward_zero_std": 0.25, "grad_norm": 0.01132299192249775, "kl": 0.5565587282180786, "learning_rate": 8e-05, "loss": -0.0005, "num_tokens": 32009568.0, "reward": 0.4960937798023224, "reward_std": 0.24763567745685577, "rewards/num_nodes_reward/mean": 0.796875, "rewards/num_nodes_reward/std": 0.40390563011169434, "rewards/tree_correctness_reward/mean": 0.3671875, "rewards/tree_correctness_reward/std": 0.4839322865009308, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7262495756149292, "sampling/importance_sampling_ratio/min": 5.372093894082752e-29, "sampling/sampling_logp_difference/max": 65.09375, "sampling/sampling_logp_difference/mean": 4.951010227203369, "step": 318 }, { "clip_ratio/high_max": 0.024162700166925788, "clip_ratio/high_mean": 0.0055057157296687365, "clip_ratio/low_mean": 0.004456592709175311, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009962308336980641, "completions/clipped_ratio": 1.0, "completions/max_length": 159.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 131.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 110.0, "completions/min_terminated_length": 0.0, "entropy": 1.1707621216773987, "epoch": 159.5, "frac_reward_zero_std": 0.5, "grad_norm": 0.013858642429113388, "kl": 0.4932952858507633, "learning_rate": 8e-05, "loss": -0.0003, "num_tokens": 32090464.0, "reward": 0.6195312738418579, "reward_std": 0.15046875178813934, "rewards/num_nodes_reward/mean": 0.84375, "rewards/num_nodes_reward/std": 0.3645188808441162, "rewards/tree_correctness_reward/mean": 0.5234375, "rewards/tree_correctness_reward/std": 0.5014128684997559, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6654311418533325, "sampling/importance_sampling_ratio/min": 5.372093894082752e-29, "sampling/sampling_logp_difference/max": 65.09375, "sampling/sampling_logp_difference/mean": 5.786745071411133, "step": 319 }, { "clip_ratio/high_max": 0.02872136642690748, "clip_ratio/high_mean": 0.008010824560187757, "clip_ratio/low_mean": 0.008361998479813337, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016372823272831738, "completions/clipped_ratio": 1.0, "completions/max_length": 149.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 127.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 104.0, "completions/min_terminated_length": 0.0, "entropy": 1.0482332408428192, "epoch": 160.0, "frac_reward_zero_std": 0.1875, "grad_norm": 0.014879890717566013, "kl": 0.5657174363732338, "learning_rate": 8e-05, "loss": 0.001, "num_tokens": 32170944.0, "reward": 0.5796874761581421, "reward_std": 0.27711760997772217, "rewards/num_nodes_reward/mean": 0.9296875, "rewards/num_nodes_reward/std": 0.2566775679588318, "rewards/tree_correctness_reward/mean": 0.4296875, "rewards/tree_correctness_reward/std": 0.4969765841960907, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.704620897769928, "sampling/importance_sampling_ratio/min": 2.966751838288923e-29, "sampling/sampling_logp_difference/max": 65.6875, "sampling/sampling_logp_difference/mean": 5.2028045654296875, "step": 320 }, { "epoch": 160.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 124.6, "eval_completions/max_terminated_length": 0.0, "eval_completions/mean_length": 93.8, "eval_completions/mean_terminated_length": 0.0, "eval_completions/min_length": 72.0, "eval_completions/min_terminated_length": 0.0, "eval_entropy": 0.48092647790908816, "eval_frac_reward_zero_std": 0.5, "eval_kl": 0.6253231048583985, "eval_loss": -0.0003426431503612548, "eval_num_tokens": 32170944.0, "eval_reward": 0.25000000596046446, "eval_reward_std": 0.12169951945543289, "eval_rewards/num_nodes_reward/mean": 0.6, "eval_rewards/num_nodes_reward/std": 0.2715925633907318, "eval_rewards/tree_correctness_reward/mean": 0.1, "eval_rewards/tree_correctness_reward/std": 0.21405572295188904, "eval_runtime": 19.742, "eval_samples_per_second": 0.507, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 0.905564033985138, "eval_sampling/importance_sampling_ratio/min": 4.2197206606138886e-26, "eval_sampling/sampling_logp_difference/max": 59.2125, "eval_sampling/sampling_logp_difference/mean": 1.925797438621521, "eval_steps_per_second": 0.051, "step": 320 }, { "clip_ratio/high_max": 0.031822524149902165, "clip_ratio/high_mean": 0.00645184816676192, "clip_ratio/low_mean": 0.004864998292759992, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011316846590489149, "completions/clipped_ratio": 1.0, "completions/max_length": 149.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 123.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 99.0, "completions/min_terminated_length": 0.0, "entropy": 0.9903005138039589, "epoch": 160.5, "frac_reward_zero_std": 0.4375, "grad_norm": 0.007375057321041822, "kl": 0.5315045490860939, "learning_rate": 8e-05, "loss": 0.0002, "num_tokens": 32250848.0, "reward": 0.49296873807907104, "reward_std": 0.18272764980793, "rewards/num_nodes_reward/mean": 0.8046875, "rewards/num_nodes_reward/std": 0.3979988098144531, "rewards/tree_correctness_reward/mean": 0.359375, "rewards/tree_correctness_reward/std": 0.481702595949173, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7160682678222656, "sampling/importance_sampling_ratio/min": 2.2617360524555665e-28, "sampling/sampling_logp_difference/max": 63.65625, "sampling/sampling_logp_difference/mean": 5.063522815704346, "step": 321 }, { "clip_ratio/high_max": 0.027579740504734218, "clip_ratio/high_mean": 0.0071552353037986904, "clip_ratio/low_mean": 0.00170594995142892, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00886118522612378, "completions/clipped_ratio": 1.0, "completions/max_length": 158.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 121.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 96.0, "completions/min_terminated_length": 0.0, "entropy": 0.9409227669239044, "epoch": 161.0, "frac_reward_zero_std": 0.5625, "grad_norm": 0.03942809998989105, "kl": 0.5495539382100105, "learning_rate": 8e-05, "loss": 0.0001, "num_tokens": 32330496.0, "reward": 0.624218761920929, "reward_std": 0.12301520258188248, "rewards/num_nodes_reward/mean": 0.859375, "rewards/num_nodes_reward/std": 0.3490002751350403, "rewards/tree_correctness_reward/mean": 0.5234375, "rewards/tree_correctness_reward/std": 0.5014128684997559, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7182135581970215, "sampling/importance_sampling_ratio/min": 4.1837909054240737e-29, "sampling/sampling_logp_difference/max": 65.34375, "sampling/sampling_logp_difference/mean": 5.076604843139648, "step": 322 }, { "clip_ratio/high_max": 0.028401459276210517, "clip_ratio/high_mean": 0.008259096284746192, "clip_ratio/low_mean": 0.005645842997182626, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013904939638450742, "completions/clipped_ratio": 1.0, "completions/max_length": 182.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 129.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 105.0, "completions/min_terminated_length": 0.0, "entropy": 1.16312725096941, "epoch": 161.5, "frac_reward_zero_std": 0.3125, "grad_norm": 0.009434161707758904, "kl": 0.5476006641983986, "learning_rate": 8e-05, "loss": -0.0013, "num_tokens": 32411200.0, "reward": 0.56640625, "reward_std": 0.23169314861297607, "rewards/num_nodes_reward/mean": 0.8671875, "rewards/num_nodes_reward/std": 0.3407054841518402, "rewards/tree_correctness_reward/mean": 0.4375, "rewards/tree_correctness_reward/std": 0.49802759289741516, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6790035963058472, "sampling/importance_sampling_ratio/min": 3.578585776920167e-29, "sampling/sampling_logp_difference/max": 65.5, "sampling/sampling_logp_difference/mean": 5.6049628257751465, "step": 323 }, { "clip_ratio/high_max": 0.031090240692719817, "clip_ratio/high_mean": 0.006580979155842215, "clip_ratio/low_mean": 0.004217152563796844, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010798131464980543, "completions/clipped_ratio": 1.0, "completions/max_length": 158.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 128.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 104.0, "completions/min_terminated_length": 0.0, "entropy": 1.0729616731405258, "epoch": 162.0, "frac_reward_zero_std": 0.5, "grad_norm": 0.011217009276151657, "kl": 0.558863490819931, "learning_rate": 8e-05, "loss": -0.0, "num_tokens": 32491760.0, "reward": 0.46953123807907104, "reward_std": 0.1816636323928833, "rewards/num_nodes_reward/mean": 0.78125, "rewards/num_nodes_reward/std": 0.41502299904823303, "rewards/tree_correctness_reward/mean": 0.3359375, "rewards/tree_correctness_reward/std": 0.47417303919792175, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6963933706283569, "sampling/importance_sampling_ratio/min": 2.192149796152425e-28, "sampling/sampling_logp_difference/max": 63.6875, "sampling/sampling_logp_difference/mean": 5.320058822631836, "step": 324 }, { "clip_ratio/high_max": 0.038859359570778906, "clip_ratio/high_mean": 0.009403446223586798, "clip_ratio/low_mean": 0.0062911686254665256, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01569461531471461, "completions/clipped_ratio": 1.0, "completions/max_length": 158.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 132.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 119.0, "completions/min_terminated_length": 0.0, "entropy": 1.2490892857313156, "epoch": 162.5, "frac_reward_zero_std": 0.3125, "grad_norm": 0.009694541804492474, "kl": 0.4980255290865898, "learning_rate": 8e-05, "loss": -0.0003, "num_tokens": 32572848.0, "reward": 0.4742187261581421, "reward_std": 0.24364253878593445, "rewards/num_nodes_reward/mean": 0.6875, "rewards/num_nodes_reward/std": 0.4653336703777313, "rewards/tree_correctness_reward/mean": 0.3828125, "rewards/tree_correctness_reward/std": 0.4879830479621887, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.656621515750885, "sampling/importance_sampling_ratio/min": 5.206811789961654e-29, "sampling/sampling_logp_difference/max": 65.125, "sampling/sampling_logp_difference/mean": 5.88887882232666, "step": 325 }, { "clip_ratio/high_max": 0.020662229042500257, "clip_ratio/high_mean": 0.004769068182213232, "clip_ratio/low_mean": 0.004069671413162723, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008838739362545311, "completions/clipped_ratio": 1.0, "completions/max_length": 162.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 124.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 96.0, "completions/min_terminated_length": 0.0, "entropy": 1.1091360747814178, "epoch": 163.0, "frac_reward_zero_std": 0.5, "grad_norm": 0.008326264098286629, "kl": 0.48695147037506104, "learning_rate": 8e-05, "loss": -0.0006, "num_tokens": 32652848.0, "reward": 0.56640625, "reward_std": 0.18159663677215576, "rewards/num_nodes_reward/mean": 0.8671875, "rewards/num_nodes_reward/std": 0.3407054841518402, "rewards/tree_correctness_reward/mean": 0.4375, "rewards/tree_correctness_reward/std": 0.49802759289741516, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6939294338226318, "sampling/importance_sampling_ratio/min": 1.7994259883065448e-29, "sampling/sampling_logp_difference/max": 66.1875, "sampling/sampling_logp_difference/mean": 5.373312950134277, "step": 326 }, { "clip_ratio/high_max": 0.034801841946318746, "clip_ratio/high_mean": 0.00886819459265098, "clip_ratio/low_mean": 0.005979289810056798, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014847484533675015, "completions/clipped_ratio": 1.0, "completions/max_length": 183.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 133.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 104.0, "completions/min_terminated_length": 0.0, "entropy": 1.3255197256803513, "epoch": 163.5, "frac_reward_zero_std": 0.375, "grad_norm": 0.010627230629324913, "kl": 0.4726017042994499, "learning_rate": 8e-05, "loss": -0.0008, "num_tokens": 32734112.0, "reward": 0.53515625, "reward_std": 0.25062263011932373, "rewards/num_nodes_reward/mean": 0.78125, "rewards/num_nodes_reward/std": 0.41502299904823303, "rewards/tree_correctness_reward/mean": 0.4296875, "rewards/tree_correctness_reward/std": 0.4969765841960907, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.658615231513977, "sampling/importance_sampling_ratio/min": 2.0390167891397317e-29, "sampling/sampling_logp_difference/max": 66.0625, "sampling/sampling_logp_difference/mean": 5.8250627517700195, "step": 327 }, { "clip_ratio/high_max": 0.028614893672056496, "clip_ratio/high_mean": 0.006039659376256168, "clip_ratio/low_mean": 0.004862587986281142, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01090224739164114, "completions/clipped_ratio": 1.0, "completions/max_length": 163.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 123.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 101.0, "completions/min_terminated_length": 0.0, "entropy": 1.0972390100359917, "epoch": 164.0, "frac_reward_zero_std": 0.4375, "grad_norm": 0.009899328462779522, "kl": 0.48334866389632225, "learning_rate": 8e-05, "loss": -0.0003, "num_tokens": 32813984.0, "reward": 0.551562488079071, "reward_std": 0.18406865000724792, "rewards/num_nodes_reward/mean": 0.8359375, "rewards/num_nodes_reward/std": 0.371787428855896, "rewards/tree_correctness_reward/mean": 0.4296875, "rewards/tree_correctness_reward/std": 0.4969765841960907, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6983877420425415, "sampling/importance_sampling_ratio/min": 6.218649491443252e-30, "sampling/sampling_logp_difference/max": 67.25, "sampling/sampling_logp_difference/mean": 5.3194732666015625, "step": 328 }, { "clip_ratio/high_max": 0.03715016762726009, "clip_ratio/high_mean": 0.0076788063161075115, "clip_ratio/low_mean": 0.004297291219700128, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0119760975940153, "completions/clipped_ratio": 1.0, "completions/max_length": 154.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 122.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 104.0, "completions/min_terminated_length": 0.0, "entropy": 1.004439391195774, "epoch": 164.5, "frac_reward_zero_std": 0.375, "grad_norm": 0.009567638859152794, "kl": 0.5280287861824036, "learning_rate": 8e-05, "loss": 0.0003, "num_tokens": 32893840.0, "reward": 0.56640625, "reward_std": 0.18890923261642456, "rewards/num_nodes_reward/mean": 0.7578125, "rewards/num_nodes_reward/std": 0.4300905168056488, "rewards/tree_correctness_reward/mean": 0.484375, "rewards/tree_correctness_reward/std": 0.5017194747924805, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.722152590751648, "sampling/importance_sampling_ratio/min": 1.0036392283569288e-28, "sampling/sampling_logp_difference/max": 64.46875, "sampling/sampling_logp_difference/mean": 4.941024303436279, "step": 329 }, { "clip_ratio/high_max": 0.0326322375331074, "clip_ratio/high_mean": 0.007821999432053417, "clip_ratio/low_mean": 0.007490315125323832, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015312314499169588, "completions/clipped_ratio": 1.0, "completions/max_length": 147.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 125.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 105.0, "completions/min_terminated_length": 0.0, "entropy": 1.0592318177223206, "epoch": 165.0, "frac_reward_zero_std": 0.3125, "grad_norm": 0.012771165929734707, "kl": 0.4928359165787697, "learning_rate": 8e-05, "loss": 0.0003, "num_tokens": 32974000.0, "reward": 0.612500011920929, "reward_std": 0.22167912125587463, "rewards/num_nodes_reward/mean": 0.875, "rewards/num_nodes_reward/std": 0.3320184051990509, "rewards/tree_correctness_reward/mean": 0.5, "rewards/tree_correctness_reward/std": 0.5019646286964417, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7076689004898071, "sampling/importance_sampling_ratio/min": 2.9041266150285263e-28, "sampling/sampling_logp_difference/max": 63.40625, "sampling/sampling_logp_difference/mean": 5.167263984680176, "step": 330 }, { "epoch": 165.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 142.6, "eval_completions/max_terminated_length": 0.0, "eval_completions/mean_length": 101.025, "eval_completions/mean_terminated_length": 0.0, "eval_completions/min_length": 77.2, "eval_completions/min_terminated_length": 0.0, "eval_entropy": 0.5889325737953186, "eval_frac_reward_zero_std": 0.3, "eval_kl": 0.6583130896091461, "eval_loss": -0.0007515263860113919, "eval_num_tokens": 32974000.0, "eval_reward": 0.24000000357627868, "eval_reward_std": 0.17548287212848662, "eval_rewards/num_nodes_reward/mean": 0.5375, "eval_rewards/num_nodes_reward/std": 0.30068174600601194, "eval_rewards/tree_correctness_reward/mean": 0.1125, "eval_rewards/tree_correctness_reward/std": 0.23837831020355224, "eval_runtime": 21.5868, "eval_samples_per_second": 0.463, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 0.9003038763999939, "eval_sampling/importance_sampling_ratio/min": 1.2003744840310913e-26, "eval_sampling/sampling_logp_difference/max": 60.31875, "eval_sampling/sampling_logp_difference/mean": 1.9468605041503906, "eval_steps_per_second": 0.046, "step": 330 }, { "clip_ratio/high_max": 0.026461431989446282, "clip_ratio/high_mean": 0.007768306939397007, "clip_ratio/low_mean": 0.005953439540462568, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013721746625378728, "completions/clipped_ratio": 1.0, "completions/max_length": 133.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 121.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 109.0, "completions/min_terminated_length": 0.0, "entropy": 0.9748139902949333, "epoch": 165.5, "frac_reward_zero_std": 0.3125, "grad_norm": 0.010637398809194565, "kl": 0.5461441688239574, "learning_rate": 8e-05, "loss": -0.0007, "num_tokens": 33053648.0, "reward": 0.6070312261581421, "reward_std": 0.21227845549583435, "rewards/num_nodes_reward/mean": 0.9296875, "rewards/num_nodes_reward/std": 0.2566775679588318, "rewards/tree_correctness_reward/mean": 0.46875, "rewards/tree_correctness_reward/std": 0.5009832978248596, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7225387096405029, "sampling/importance_sampling_ratio/min": 7.501123232145174e-30, "sampling/sampling_logp_difference/max": 67.0625, "sampling/sampling_logp_difference/mean": 4.972292423248291, "step": 331 }, { "clip_ratio/high_max": 0.03570980962831527, "clip_ratio/high_mean": 0.009044791630003601, "clip_ratio/low_mean": 0.0038512588653247803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01289605034980923, "completions/clipped_ratio": 1.0, "completions/max_length": 151.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 125.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 107.0, "completions/min_terminated_length": 0.0, "entropy": 1.167006015777588, "epoch": 166.0, "frac_reward_zero_std": 0.4375, "grad_norm": 0.011065209284424782, "kl": 0.5109198540449142, "learning_rate": 8e-05, "loss": 0.0003, "num_tokens": 33133872.0, "reward": 0.73046875, "reward_std": 0.18631845712661743, "rewards/num_nodes_reward/mean": 0.8671875, "rewards/num_nodes_reward/std": 0.3407054841518402, "rewards/tree_correctness_reward/mean": 0.671875, "rewards/tree_correctness_reward/std": 0.4713755249977112, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6789472103118896, "sampling/importance_sampling_ratio/min": 2.5628824030209753e-28, "sampling/sampling_logp_difference/max": 63.53125, "sampling/sampling_logp_difference/mean": 5.566969871520996, "step": 332 }, { "clip_ratio/high_max": 0.029434064752422273, "clip_ratio/high_mean": 0.007138894987292588, "clip_ratio/low_mean": 0.006158500298624858, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01329739554785192, "completions/clipped_ratio": 1.0, "completions/max_length": 141.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 113.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 100.0, "completions/min_terminated_length": 0.0, "entropy": 0.8507975339889526, "epoch": 166.5, "frac_reward_zero_std": 0.4375, "grad_norm": 0.010467317886650562, "kl": 0.5837590470910072, "learning_rate": 8e-05, "loss": 0.0013, "num_tokens": 33212576.0, "reward": 0.6492187976837158, "reward_std": 0.22341284155845642, "rewards/num_nodes_reward/mean": 0.796875, "rewards/num_nodes_reward/std": 0.40390563011169434, "rewards/tree_correctness_reward/mean": 0.5859375, "rewards/tree_correctness_reward/std": 0.49449479579925537, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7505720257759094, "sampling/importance_sampling_ratio/min": 6.28061342860165e-29, "sampling/sampling_logp_difference/max": 64.9375, "sampling/sampling_logp_difference/mean": 4.559021949768066, "step": 333 }, { "clip_ratio/high_max": 0.03375281940679997, "clip_ratio/high_mean": 0.009480272798100486, "clip_ratio/low_mean": 0.0037823714956175536, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013262644410133362, "completions/clipped_ratio": 1.0, "completions/max_length": 164.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 127.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 93.0, "completions/min_terminated_length": 0.0, "entropy": 1.185932070016861, "epoch": 167.0, "frac_reward_zero_std": 0.375, "grad_norm": 0.009737839922308922, "kl": 0.6043210849165916, "learning_rate": 8e-05, "loss": 0.0001, "num_tokens": 33292960.0, "reward": 0.663281261920929, "reward_std": 0.2044423371553421, "rewards/num_nodes_reward/mean": 0.8984375, "rewards/num_nodes_reward/std": 0.3032590448856354, "rewards/tree_correctness_reward/mean": 0.5625, "rewards/tree_correctness_reward/std": 0.49802759289741516, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.6953614950180054, "sampling/importance_sampling_ratio/min": 2.0593340560688357e-28, "sampling/sampling_logp_difference/max": 63.75, "sampling/sampling_logp_difference/mean": 5.295923233032227, "step": 334 }, { "clip_ratio/high_max": 0.025960725033655763, "clip_ratio/high_mean": 0.005364043754525483, "clip_ratio/low_mean": 0.004099610945559107, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009463654831051826, "completions/clipped_ratio": 1.0, "completions/max_length": 157.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 120.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 97.0, "completions/min_terminated_length": 0.0, "entropy": 1.0328280478715897, "epoch": 167.5, "frac_reward_zero_std": 0.5, "grad_norm": 0.010186145082116127, "kl": 0.5842746943235397, "learning_rate": 8e-05, "loss": 0.0005, "num_tokens": 33372464.0, "reward": 0.6468750238418579, "reward_std": 0.14785712957382202, "rewards/num_nodes_reward/mean": 0.84375, "rewards/num_nodes_reward/std": 0.3645188808441162, "rewards/tree_correctness_reward/mean": 0.5625, "rewards/tree_correctness_reward/std": 0.49802759289741516, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7207794785499573, "sampling/importance_sampling_ratio/min": 7.575866316337689e-29, "sampling/sampling_logp_difference/max": 64.75, "sampling/sampling_logp_difference/mean": 5.00062370300293, "step": 335 }, { "clip_ratio/high_max": 0.025416558724828064, "clip_ratio/high_mean": 0.007902851095423102, "clip_ratio/low_mean": 0.005268753331620246, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013171604718081653, "completions/clipped_ratio": 1.0, "completions/max_length": 189.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 129.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 98.0, "completions/min_terminated_length": 0.0, "entropy": 1.1428208351135254, "epoch": 168.0, "frac_reward_zero_std": 0.3125, "grad_norm": 0.01062654610723257, "kl": 0.46199511736631393, "learning_rate": 8e-05, "loss": 0.0005, "num_tokens": 33453216.0, "reward": 0.6460937261581421, "reward_std": 0.19696196913719177, "rewards/num_nodes_reward/mean": 0.9140625, "rewards/num_nodes_reward/std": 0.2813730239868164, "rewards/tree_correctness_reward/mean": 0.53125, "rewards/tree_correctness_reward/std": 0.5009832978248596, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.698479413986206, "sampling/importance_sampling_ratio/min": 9.138239066976054e-29, "sampling/sampling_logp_difference/max": 64.5625, "sampling/sampling_logp_difference/mean": 5.2765607833862305, "step": 336 }, { "clip_ratio/high_max": 0.03773256833665073, "clip_ratio/high_mean": 0.01092256820993498, "clip_ratio/low_mean": 0.0033639941830188036, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014286562334746122, "completions/clipped_ratio": 1.0, "completions/max_length": 141.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 116.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 104.0, "completions/min_terminated_length": 0.0, "entropy": 0.8985861241817474, "epoch": 168.5, "frac_reward_zero_std": 0.3125, "grad_norm": 0.010642734356224537, "kl": 0.7304801680147648, "learning_rate": 8e-05, "loss": 0.0007, "num_tokens": 33532288.0, "reward": 0.5890624523162842, "reward_std": 0.22137236595153809, "rewards/num_nodes_reward/mean": 0.796875, "rewards/num_nodes_reward/std": 0.40390563011169434, "rewards/tree_correctness_reward/mean": 0.5, "rewards/tree_correctness_reward/std": 0.5019646286964417, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7342652678489685, "sampling/importance_sampling_ratio/min": 1.7072478214393567e-28, "sampling/sampling_logp_difference/max": 63.9375, "sampling/sampling_logp_difference/mean": 4.863547325134277, "step": 337 }, { "clip_ratio/high_max": 0.03166947315912694, "clip_ratio/high_mean": 0.008824907999951392, "clip_ratio/low_mean": 0.007205935165984556, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01603084267117083, "completions/clipped_ratio": 1.0, "completions/max_length": 149.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 114.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 96.0, "completions/min_terminated_length": 0.0, "entropy": 0.8398830071091652, "epoch": 169.0, "frac_reward_zero_std": 0.375, "grad_norm": 0.012975513935089111, "kl": 0.5038297139108181, "learning_rate": 8e-05, "loss": -0.0003, "num_tokens": 33611072.0, "reward": 0.6796875, "reward_std": 0.2338915467262268, "rewards/num_nodes_reward/mean": 0.8984375, "rewards/num_nodes_reward/std": 0.3032590448856354, "rewards/tree_correctness_reward/mean": 0.5859375, "rewards/tree_correctness_reward/std": 0.49449479579925537, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7242422103881836, "sampling/importance_sampling_ratio/min": 1.5066409121314163e-28, "sampling/sampling_logp_difference/max": 64.0625, "sampling/sampling_logp_difference/mean": 5.035279273986816, "step": 338 }, { "clip_ratio/high_max": 0.03341680660378188, "clip_ratio/high_mean": 0.008814251283183694, "clip_ratio/low_mean": 0.0032922800091910176, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012106531532481313, "completions/clipped_ratio": 1.0, "completions/max_length": 158.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 116.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 94.0, "completions/min_terminated_length": 0.0, "entropy": 0.856178030371666, "epoch": 169.5, "frac_reward_zero_std": 0.4375, "grad_norm": 0.009036192670464516, "kl": 0.5310977920889854, "learning_rate": 8e-05, "loss": 0.0, "num_tokens": 33690080.0, "reward": 0.7124999761581421, "reward_std": 0.1898190975189209, "rewards/num_nodes_reward/mean": 0.953125, "rewards/num_nodes_reward/std": 0.21220162510871887, "rewards/tree_correctness_reward/mean": 0.609375, "rewards/tree_correctness_reward/std": 0.4898075461387634, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7235058546066284, "sampling/importance_sampling_ratio/min": 6.479982085325971e-29, "sampling/sampling_logp_difference/max": 64.90625, "sampling/sampling_logp_difference/mean": 5.051271438598633, "step": 339 }, { "clip_ratio/high_max": 0.046216202434152365, "clip_ratio/high_mean": 0.010653747769538313, "clip_ratio/low_mean": 0.0062521102372556925, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016905857948586345, "completions/clipped_ratio": 1.0, "completions/max_length": 178.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 114.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 92.0, "completions/min_terminated_length": 0.0, "entropy": 0.8152811229228973, "epoch": 170.0, "frac_reward_zero_std": 0.3125, "grad_norm": 0.007950558327138424, "kl": 0.6316078156232834, "learning_rate": 8e-05, "loss": 0.0002, "num_tokens": 33768816.0, "reward": 0.5679687261581421, "reward_std": 0.211870014667511, "rewards/num_nodes_reward/mean": 0.8359375, "rewards/num_nodes_reward/std": 0.371787428855896, "rewards/tree_correctness_reward/mean": 0.453125, "rewards/tree_correctness_reward/std": 0.4997538626194, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7265092134475708, "sampling/importance_sampling_ratio/min": 2.814775743412354e-28, "sampling/sampling_logp_difference/max": 63.4375, "sampling/sampling_logp_difference/mean": 5.0594024658203125, "step": 340 }, { "epoch": 170.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 120.2, "eval_completions/max_terminated_length": 0.0, "eval_completions/mean_length": 89.675, "eval_completions/mean_terminated_length": 0.0, "eval_completions/min_length": 71.4, "eval_completions/min_terminated_length": 0.0, "eval_entropy": 0.38342564702034, "eval_frac_reward_zero_std": 0.5, "eval_kl": 0.7163724541664124, "eval_loss": -0.00035697699058800936, "eval_num_tokens": 33768816.0, "eval_reward": 0.3162499964237213, "eval_reward_std": 0.1702295109629631, "eval_rewards/num_nodes_reward/mean": 0.675, "eval_rewards/num_nodes_reward/std": 0.2032795548439026, "eval_rewards/tree_correctness_reward/mean": 0.1625, "eval_rewards/tree_correctness_reward/std": 0.2427222788333893, "eval_runtime": 18.7259, "eval_samples_per_second": 0.534, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 0.9276837110519409, "eval_sampling/importance_sampling_ratio/min": 9.315301415306312e-27, "eval_sampling/sampling_logp_difference/max": 61.08125, "eval_sampling/sampling_logp_difference/mean": 1.5590992212295531, "eval_steps_per_second": 0.053, "step": 340 }, { "clip_ratio/high_max": 0.03367707342840731, "clip_ratio/high_mean": 0.008743811020394787, "clip_ratio/low_mean": 0.003916497364116367, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012660308129852638, "completions/clipped_ratio": 1.0, "completions/max_length": 157.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 116.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 96.0, "completions/min_terminated_length": 0.0, "entropy": 0.9117901846766472, "epoch": 170.5, "frac_reward_zero_std": 0.4375, "grad_norm": 0.009238265454769135, "kl": 0.6076523549854755, "learning_rate": 8e-05, "loss": 0.0001, "num_tokens": 33847824.0, "reward": 0.5640624761581421, "reward_std": 0.16333389282226562, "rewards/num_nodes_reward/mean": 0.859375, "rewards/num_nodes_reward/std": 0.3490002751350403, "rewards/tree_correctness_reward/mean": 0.4375, "rewards/tree_correctness_reward/std": 0.49802759289741516, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7005487084388733, "sampling/importance_sampling_ratio/min": 3.060926502377367e-29, "sampling/sampling_logp_difference/max": 65.65625, "sampling/sampling_logp_difference/mean": 5.393312454223633, "step": 341 }, { "clip_ratio/high_max": 0.02093664777930826, "clip_ratio/high_mean": 0.004507795267272741, "clip_ratio/low_mean": 0.0033472209615865722, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007855016272515059, "completions/clipped_ratio": 1.0, "completions/max_length": 147.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 116.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 92.0, "completions/min_terminated_length": 0.0, "entropy": 0.8918511942028999, "epoch": 171.0, "frac_reward_zero_std": 0.5625, "grad_norm": 0.0058958400040864944, "kl": 0.517019048333168, "learning_rate": 8e-05, "loss": 0.0004, "num_tokens": 33926800.0, "reward": 0.7632812261581421, "reward_std": 0.13365024328231812, "rewards/num_nodes_reward/mean": 0.921875, "rewards/num_nodes_reward/std": 0.2694226801395416, "rewards/tree_correctness_reward/mean": 0.6953125, "rewards/tree_correctness_reward/std": 0.46208351850509644, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7055119872093201, "sampling/importance_sampling_ratio/min": 6.752296422029552e-28, "sampling/sampling_logp_difference/max": 62.5625, "sampling/sampling_logp_difference/mean": 5.336289882659912, "step": 342 }, { "clip_ratio/high_max": 0.033752829069271684, "clip_ratio/high_mean": 0.010972686228342354, "clip_ratio/low_mean": 0.005234461976215243, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016207148088142276, "completions/clipped_ratio": 1.0, "completions/max_length": 126.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 98.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 88.0, "completions/min_terminated_length": 0.0, "entropy": 0.4645770937204361, "epoch": 171.5, "frac_reward_zero_std": 0.25, "grad_norm": 0.009488098323345184, "kl": 0.7052705511450768, "learning_rate": 8e-05, "loss": -0.0003, "num_tokens": 34003552.0, "reward": 0.590624988079071, "reward_std": 0.20732758939266205, "rewards/num_nodes_reward/mean": 0.875, "rewards/num_nodes_reward/std": 0.3320184051990509, "rewards/tree_correctness_reward/mean": 0.46875, "rewards/tree_correctness_reward/std": 0.5009832978248596, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7934492826461792, "sampling/importance_sampling_ratio/min": 7.816349966256032e-29, "sampling/sampling_logp_difference/max": 64.71875, "sampling/sampling_logp_difference/mean": 4.185587406158447, "step": 343 }, { "clip_ratio/high_max": 0.038479583570733666, "clip_ratio/high_mean": 0.00939895014744252, "clip_ratio/low_mean": 0.0038728598010493442, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013271809904836118, "completions/clipped_ratio": 1.0, "completions/max_length": 153.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 106.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 92.0, "completions/min_terminated_length": 0.0, "entropy": 0.698673814535141, "epoch": 172.0, "frac_reward_zero_std": 0.4375, "grad_norm": 0.015240144915878773, "kl": 0.6144898906350136, "learning_rate": 8e-05, "loss": 0.0002, "num_tokens": 34081264.0, "reward": 0.6890624761581421, "reward_std": 0.17618268728256226, "rewards/num_nodes_reward/mean": 0.9296875, "rewards/num_nodes_reward/std": 0.2566775679588318, "rewards/tree_correctness_reward/mean": 0.5859375, "rewards/tree_correctness_reward/std": 0.49449479579925537, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7376217842102051, "sampling/importance_sampling_ratio/min": 2.333531109663677e-28, "sampling/sampling_logp_difference/max": 63.625, "sampling/sampling_logp_difference/mean": 4.908658027648926, "step": 344 }, { "clip_ratio/high_max": 0.03479680645978078, "clip_ratio/high_mean": 0.008541994146071374, "clip_ratio/low_mean": 0.006965365304495208, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015507359406910837, "completions/clipped_ratio": 1.0, "completions/max_length": 139.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 112.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 88.0, "completions/min_terminated_length": 0.0, "entropy": 0.7566463276743889, "epoch": 172.5, "frac_reward_zero_std": 0.4375, "grad_norm": 0.008472113870084286, "kl": 0.5427013039588928, "learning_rate": 8e-05, "loss": -0.0013, "num_tokens": 34159808.0, "reward": 0.555468738079071, "reward_std": 0.18433701992034912, "rewards/num_nodes_reward/mean": 0.7578125, "rewards/num_nodes_reward/std": 0.4300905168056488, "rewards/tree_correctness_reward/mean": 0.46875, "rewards/tree_correctness_reward/std": 0.5009832978248596, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7151710391044617, "sampling/importance_sampling_ratio/min": 1.5544668082166554e-28, "sampling/sampling_logp_difference/max": 64.03125, "sampling/sampling_logp_difference/mean": 5.238584518432617, "step": 345 }, { "clip_ratio/high_max": 0.025557372369803488, "clip_ratio/high_mean": 0.00551136743160896, "clip_ratio/low_mean": 0.0028550304341479205, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008366397873032838, "completions/clipped_ratio": 1.0, "completions/max_length": 134.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 102.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 87.0, "completions/min_terminated_length": 0.0, "entropy": 0.6185888722538948, "epoch": 173.0, "frac_reward_zero_std": 0.625, "grad_norm": 0.007658608257770538, "kl": 0.6383940055966377, "learning_rate": 8e-05, "loss": 0.0005, "num_tokens": 34237072.0, "reward": 0.6499999761581421, "reward_std": 0.14298442006111145, "rewards/num_nodes_reward/mean": 0.890625, "rewards/num_nodes_reward/std": 0.31333550810813904, "rewards/tree_correctness_reward/mean": 0.546875, "rewards/tree_correctness_reward/std": 0.4997538626194, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7408756017684937, "sampling/importance_sampling_ratio/min": 1.9959748127583844e-28, "sampling/sampling_logp_difference/max": 63.78125, "sampling/sampling_logp_difference/mean": 5.007340908050537, "step": 346 }, { "clip_ratio/high_max": 0.0371067957021296, "clip_ratio/high_mean": 0.007481988752260804, "clip_ratio/low_mean": 0.006674528791336343, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014156517514493316, "completions/clipped_ratio": 1.0, "completions/max_length": 249.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 118.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 86.0, "completions/min_terminated_length": 0.0, "entropy": 0.8815940096974373, "epoch": 173.5, "frac_reward_zero_std": 0.5, "grad_norm": 0.01026577316224575, "kl": 0.5617185011506081, "learning_rate": 8e-05, "loss": 0.0013, "num_tokens": 34316400.0, "reward": 0.50390625, "reward_std": 0.1637399047613144, "rewards/num_nodes_reward/mean": 0.859375, "rewards/num_nodes_reward/std": 0.3490002751350403, "rewards/tree_correctness_reward/mean": 0.3515625, "rewards/tree_correctness_reward/std": 0.4793342351913452, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7196659445762634, "sampling/importance_sampling_ratio/min": 4.359610133382778e-28, "sampling/sampling_logp_difference/max": 63.0, "sampling/sampling_logp_difference/mean": 5.132627487182617, "step": 347 }, { "clip_ratio/high_max": 0.02750651224050671, "clip_ratio/high_mean": 0.0064050522341858596, "clip_ratio/low_mean": 0.0025058738683583215, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008910926058888435, "completions/clipped_ratio": 1.0, "completions/max_length": 145.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 104.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 88.0, "completions/min_terminated_length": 0.0, "entropy": 0.710379034280777, "epoch": 174.0, "frac_reward_zero_std": 0.5625, "grad_norm": 0.005436969455331564, "kl": 0.6118328124284744, "learning_rate": 8e-05, "loss": -0.0007, "num_tokens": 34393904.0, "reward": 0.7320312261581421, "reward_std": 0.1277744621038437, "rewards/num_nodes_reward/mean": 0.890625, "rewards/num_nodes_reward/std": 0.31333550810813904, "rewards/tree_correctness_reward/mean": 0.6640625, "rewards/tree_correctness_reward/std": 0.47417303919792175, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.741448163986206, "sampling/importance_sampling_ratio/min": 2.618149189631665e-29, "sampling/sampling_logp_difference/max": 65.8125, "sampling/sampling_logp_difference/mean": 4.90103006362915, "step": 348 }, { "clip_ratio/high_max": 0.055632391944527626, "clip_ratio/high_mean": 0.013455649372190237, "clip_ratio/low_mean": 0.008017624641070142, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.021473274566233158, "completions/clipped_ratio": 1.0, "completions/max_length": 144.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 107.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 85.0, "completions/min_terminated_length": 0.0, "entropy": 0.7353673875331879, "epoch": 174.5, "frac_reward_zero_std": 0.375, "grad_norm": 0.010856468230485916, "kl": 0.7137491405010223, "learning_rate": 8e-05, "loss": 0.0004, "num_tokens": 34471792.0, "reward": 0.5859375, "reward_std": 0.1771308183670044, "rewards/num_nodes_reward/mean": 0.8046875, "rewards/num_nodes_reward/std": 0.3979988098144531, "rewards/tree_correctness_reward/mean": 0.4921875, "rewards/tree_correctness_reward/std": 0.5019033551216125, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7222919464111328, "sampling/importance_sampling_ratio/min": 1.7072478214393567e-28, "sampling/sampling_logp_difference/max": 63.9375, "sampling/sampling_logp_difference/mean": 5.153779029846191, "step": 349 }, { "clip_ratio/high_max": 0.037717235973104835, "clip_ratio/high_mean": 0.008144962135702372, "clip_ratio/low_mean": 0.004389499343233183, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012534461682662368, "completions/clipped_ratio": 1.0, "completions/max_length": 122.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 100.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 85.0, "completions/min_terminated_length": 0.0, "entropy": 0.5906867682933807, "epoch": 175.0, "frac_reward_zero_std": 0.5, "grad_norm": 0.00733138108626008, "kl": 0.6462159529328346, "learning_rate": 8e-05, "loss": -0.0006, "num_tokens": 34548736.0, "reward": 0.6546875238418579, "reward_std": 0.17136690020561218, "rewards/num_nodes_reward/mean": 0.8515625, "rewards/num_nodes_reward/std": 0.356930136680603, "rewards/tree_correctness_reward/mean": 0.5703125, "rewards/tree_correctness_reward/std": 0.4969765841960907, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7556290626525879, "sampling/importance_sampling_ratio/min": 3.1895588216951936e-28, "sampling/sampling_logp_difference/max": 63.3125, "sampling/sampling_logp_difference/mean": 4.719051837921143, "step": 350 }, { "epoch": 175.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 109.2, "eval_completions/max_terminated_length": 0.0, "eval_completions/mean_length": 82.6, "eval_completions/mean_terminated_length": 0.0, "eval_completions/min_length": 70.4, "eval_completions/min_terminated_length": 0.0, "eval_entropy": 0.326774126291275, "eval_frac_reward_zero_std": 0.4, "eval_kl": 0.7047309398651123, "eval_loss": 0.0012167991371825337, "eval_num_tokens": 34548736.0, "eval_reward": 0.3199999928474426, "eval_reward_std": 0.1752820536494255, "eval_rewards/num_nodes_reward/mean": 0.6, "eval_rewards/num_nodes_reward/std": 0.20655910968780516, "eval_rewards/tree_correctness_reward/mean": 0.2, "eval_rewards/tree_correctness_reward/std": 0.3207825243473053, "eval_runtime": 17.9246, "eval_samples_per_second": 0.558, "eval_sampling/importance_sampling_ratio/max": 1.9589381217956543, "eval_sampling/importance_sampling_ratio/mean": 0.9363293409347534, "eval_sampling/importance_sampling_ratio/min": 2.1386177336427263e-26, "eval_sampling/sampling_logp_difference/max": 59.8625, "eval_sampling/sampling_logp_difference/mean": 1.4442716836929321, "eval_steps_per_second": 0.056, "step": 350 }, { "clip_ratio/high_max": 0.0279581529321149, "clip_ratio/high_mean": 0.006075543482438661, "clip_ratio/low_mean": 0.0033827392471721396, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009458282904233783, "completions/clipped_ratio": 1.0, "completions/max_length": 113.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 93.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 77.0, "completions/min_terminated_length": 0.0, "entropy": 0.48072775453329086, "epoch": 175.5, "frac_reward_zero_std": 0.625, "grad_norm": 0.009594643488526344, "kl": 0.6595404744148254, "learning_rate": 8e-05, "loss": 0.0002, "num_tokens": 34624880.0, "reward": 0.7124999761581421, "reward_std": 0.13177290558815002, "rewards/num_nodes_reward/mean": 0.84375, "rewards/num_nodes_reward/std": 0.3645188808441162, "rewards/tree_correctness_reward/mean": 0.65625, "rewards/tree_correctness_reward/std": 0.47682511806488037, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.783311128616333, "sampling/importance_sampling_ratio/min": 4.0550689701785223e-29, "sampling/sampling_logp_difference/max": 65.375, "sampling/sampling_logp_difference/mean": 4.356761932373047, "step": 351 }, { "clip_ratio/high_max": 0.03050704812631011, "clip_ratio/high_mean": 0.006498634407762438, "clip_ratio/low_mean": 0.003582978664780967, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01008161308709532, "completions/clipped_ratio": 1.0, "completions/max_length": 115.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 97.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 86.0, "completions/min_terminated_length": 0.0, "entropy": 0.5135119035840034, "epoch": 176.0, "frac_reward_zero_std": 0.625, "grad_norm": 0.0055729057639837265, "kl": 0.8189079388976097, "learning_rate": 8e-05, "loss": -0.0002, "num_tokens": 34701456.0, "reward": 0.524218738079071, "reward_std": 0.11162321269512177, "rewards/num_nodes_reward/mean": 0.78125, "rewards/num_nodes_reward/std": 0.41502299904823303, "rewards/tree_correctness_reward/mean": 0.4140625, "rewards/tree_correctness_reward/std": 0.49449479579925537, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7701988816261292, "sampling/importance_sampling_ratio/min": 3.3952676397964373e-28, "sampling/sampling_logp_difference/max": 63.25, "sampling/sampling_logp_difference/mean": 4.486443519592285, "step": 352 }, { "clip_ratio/high_max": 0.044811603147536516, "clip_ratio/high_mean": 0.00749943929258734, "clip_ratio/low_mean": 0.0031175240001175553, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010616963438224047, "completions/clipped_ratio": 1.0, "completions/max_length": 106.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 96.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 88.0, "completions/min_terminated_length": 0.0, "entropy": 0.5107113532721996, "epoch": 176.5, "frac_reward_zero_std": 0.6875, "grad_norm": 0.004945790860801935, "kl": 0.6301107183098793, "learning_rate": 8e-05, "loss": 0.0, "num_tokens": 34777872.0, "reward": 0.7992187738418579, "reward_std": 0.11441348493099213, "rewards/num_nodes_reward/mean": 0.96875, "rewards/num_nodes_reward/std": 0.1746762990951538, "rewards/tree_correctness_reward/mean": 0.7265625, "rewards/tree_correctness_reward/std": 0.447474867105484, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7697237730026245, "sampling/importance_sampling_ratio/min": 5.90009059760205e-29, "sampling/sampling_logp_difference/max": 65.0, "sampling/sampling_logp_difference/mean": 4.5610456466674805, "step": 353 }, { "clip_ratio/high_max": 0.021178370574489236, "clip_ratio/high_mean": 0.004577570565743372, "clip_ratio/low_mean": 0.004871479599387385, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009449050121475011, "completions/clipped_ratio": 1.0, "completions/max_length": 117.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 93.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 75.0, "completions/min_terminated_length": 0.0, "entropy": 0.44530239701271057, "epoch": 177.0, "frac_reward_zero_std": 0.625, "grad_norm": 0.007521794177591801, "kl": 1.107382945716381, "learning_rate": 8e-05, "loss": 0.0, "num_tokens": 34854000.0, "reward": 0.5687499642372131, "reward_std": 0.11857087165117264, "rewards/num_nodes_reward/mean": 0.8203125, "rewards/num_nodes_reward/std": 0.3854354918003082, "rewards/tree_correctness_reward/mean": 0.4609375, "rewards/tree_correctness_reward/std": 0.5004304051399231, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7962568998336792, "sampling/importance_sampling_ratio/min": 9.82453264707878e-28, "sampling/sampling_logp_difference/max": 62.1875, "sampling/sampling_logp_difference/mean": 4.1512274742126465, "step": 354 }, { "clip_ratio/high_max": 0.03705668624024838, "clip_ratio/high_mean": 0.008704323583515361, "clip_ratio/low_mean": 0.0037698288506362587, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012474152521463111, "completions/clipped_ratio": 1.0, "completions/max_length": 131.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 104.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 89.0, "completions/min_terminated_length": 0.0, "entropy": 0.7088726386427879, "epoch": 177.5, "frac_reward_zero_std": 0.5625, "grad_norm": 0.006596851162612438, "kl": 0.7108505219221115, "learning_rate": 8e-05, "loss": -0.0006, "num_tokens": 34931472.0, "reward": 0.7960937023162842, "reward_std": 0.14824533462524414, "rewards/num_nodes_reward/mean": 0.9765625, "rewards/num_nodes_reward/std": 0.15188287198543549, "rewards/tree_correctness_reward/mean": 0.71875, "rewards/tree_correctness_reward/std": 0.4513758420944214, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7356235980987549, "sampling/importance_sampling_ratio/min": 1.9345652124047606e-28, "sampling/sampling_logp_difference/max": 63.8125, "sampling/sampling_logp_difference/mean": 4.975739479064941, "step": 355 }, { "clip_ratio/high_max": 0.029305514181032777, "clip_ratio/high_mean": 0.005555245355935767, "clip_ratio/low_mean": 0.005665080971084535, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011220326530747116, "completions/clipped_ratio": 1.0, "completions/max_length": 120.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 104.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 89.0, "completions/min_terminated_length": 0.0, "entropy": 0.6670315191149712, "epoch": 178.0, "frac_reward_zero_std": 0.5, "grad_norm": 0.006496935151517391, "kl": 0.5876795686781406, "learning_rate": 8e-05, "loss": 0.0, "num_tokens": 35008960.0, "reward": 0.5914062261581421, "reward_std": 0.14991651475429535, "rewards/num_nodes_reward/mean": 0.859375, "rewards/num_nodes_reward/std": 0.3490002751350403, "rewards/tree_correctness_reward/mean": 0.4765625, "rewards/tree_correctness_reward/std": 0.5014128684997559, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7435545921325684, "sampling/importance_sampling_ratio/min": 1.2886982259115603e-28, "sampling/sampling_logp_difference/max": 64.21875, "sampling/sampling_logp_difference/mean": 4.835331439971924, "step": 356 }, { "clip_ratio/high_max": 0.01712330209556967, "clip_ratio/high_mean": 0.002716325492656324, "clip_ratio/low_mean": 0.004678240715293214, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007394566142465919, "completions/clipped_ratio": 1.0, "completions/max_length": 118.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 97.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 82.0, "completions/min_terminated_length": 0.0, "entropy": 0.5244022719562054, "epoch": 178.5, "frac_reward_zero_std": 0.625, "grad_norm": 0.006036782171577215, "kl": 0.7153104767203331, "learning_rate": 8e-05, "loss": 0.0, "num_tokens": 35085536.0, "reward": 0.7210937738418579, "reward_std": 0.12779612839221954, "rewards/num_nodes_reward/mean": 0.890625, "rewards/num_nodes_reward/std": 0.31333550810813904, "rewards/tree_correctness_reward/mean": 0.6484375, "rewards/tree_correctness_reward/std": 0.4793342351913452, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7861155867576599, "sampling/importance_sampling_ratio/min": 1.9959748127583844e-28, "sampling/sampling_logp_difference/max": 63.78125, "sampling/sampling_logp_difference/mean": 4.239538669586182, "step": 357 }, { "clip_ratio/high_max": 0.040509232436306775, "clip_ratio/high_mean": 0.011790821212343872, "clip_ratio/low_mean": 0.005143214264535345, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016934035695157945, "completions/clipped_ratio": 1.0, "completions/max_length": 128.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 103.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 87.0, "completions/min_terminated_length": 0.0, "entropy": 0.6262470856308937, "epoch": 179.0, "frac_reward_zero_std": 0.375, "grad_norm": 0.007949902676045895, "kl": 0.7603812366724014, "learning_rate": 8e-05, "loss": 0.0002, "num_tokens": 35162848.0, "reward": 0.577343761920929, "reward_std": 0.1561359167098999, "rewards/num_nodes_reward/mean": 0.8125, "rewards/num_nodes_reward/std": 0.39184603095054626, "rewards/tree_correctness_reward/mean": 0.4765625, "rewards/tree_correctness_reward/std": 0.5014128684997559, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7533783316612244, "sampling/importance_sampling_ratio/min": 5.319093129561238e-30, "sampling/sampling_logp_difference/max": 67.40625, "sampling/sampling_logp_difference/mean": 4.723906517028809, "step": 358 }, { "clip_ratio/high_max": 0.026061988319270313, "clip_ratio/high_mean": 0.0047464264498557895, "clip_ratio/low_mean": 0.004742512435768731, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009488938725553453, "completions/clipped_ratio": 1.0, "completions/max_length": 119.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 102.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 87.0, "completions/min_terminated_length": 0.0, "entropy": 0.5908772125840187, "epoch": 179.5, "frac_reward_zero_std": 0.5625, "grad_norm": 0.0198537465184927, "kl": 0.6611628904938698, "learning_rate": 8e-05, "loss": 0.0002, "num_tokens": 35240032.0, "reward": 0.7109375, "reward_std": 0.13458451628684998, "rewards/num_nodes_reward/mean": 0.875, "rewards/num_nodes_reward/std": 0.3320184051990509, "rewards/tree_correctness_reward/mean": 0.640625, "rewards/tree_correctness_reward/std": 0.481702595949173, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7663519978523254, "sampling/importance_sampling_ratio/min": 4.3165985204907103e-29, "sampling/sampling_logp_difference/max": 65.3125, "sampling/sampling_logp_difference/mean": 4.542418003082275, "step": 359 }, { "clip_ratio/high_max": 0.04122798587195575, "clip_ratio/high_mean": 0.011651547159999609, "clip_ratio/low_mean": 0.0036361841412144713, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015287731512216851, "completions/clipped_ratio": 1.0, "completions/max_length": 153.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 104.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 87.0, "completions/min_terminated_length": 0.0, "entropy": 0.6608068645000458, "epoch": 180.0, "frac_reward_zero_std": 0.4375, "grad_norm": 0.008273901417851448, "kl": 0.7010775431990623, "learning_rate": 8e-05, "loss": -0.0004, "num_tokens": 35317536.0, "reward": 0.715624988079071, "reward_std": 0.165480375289917, "rewards/num_nodes_reward/mean": 0.890625, "rewards/num_nodes_reward/std": 0.31333550810813904, "rewards/tree_correctness_reward/mean": 0.640625, "rewards/tree_correctness_reward/std": 0.481702595949173, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7547210454940796, "sampling/importance_sampling_ratio/min": 8.584580550654011e-29, "sampling/sampling_logp_difference/max": 64.625, "sampling/sampling_logp_difference/mean": 4.719609260559082, "step": 360 }, { "epoch": 180.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 103.2, "eval_completions/max_terminated_length": 0.0, "eval_completions/mean_length": 82.6, "eval_completions/mean_terminated_length": 0.0, "eval_completions/min_length": 69.4, "eval_completions/min_terminated_length": 0.0, "eval_entropy": 0.29487515687942506, "eval_frac_reward_zero_std": 0.5, "eval_kl": 0.7124163866043091, "eval_loss": 0.0010664049768820405, "eval_num_tokens": 35317536.0, "eval_reward": 0.3499999940395355, "eval_reward_std": 0.14287835210561753, "eval_rewards/num_nodes_reward/mean": 0.7, "eval_rewards/num_nodes_reward/std": 0.10327955484390258, "eval_rewards/tree_correctness_reward/mean": 0.2, "eval_rewards/tree_correctness_reward/std": 0.22006530165672303, "eval_runtime": 16.7931, "eval_samples_per_second": 0.595, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 0.9343437552452087, "eval_sampling/importance_sampling_ratio/min": 3.2322143350757396e-26, "eval_sampling/sampling_logp_difference/max": 60.14375, "eval_sampling/sampling_logp_difference/mean": 1.4897047519683837, "eval_steps_per_second": 0.06, "step": 360 }, { "epoch": 180.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 121.4, "eval_completions/max_terminated_length": 0.0, "eval_completions/mean_length": 84.55, "eval_completions/mean_terminated_length": 0.0, "eval_completions/min_length": 69.6, "eval_completions/min_terminated_length": 0.0, "eval_entropy": 0.3595031797885895, "eval_frac_reward_zero_std": 0.4, "eval_kl": 0.7100868344306945, "eval_loss": 0.00018776349315885454, "eval_num_tokens": 35317536.0, "eval_reward": 0.3549999952316284, "eval_reward_std": 0.18707678616046905, "eval_rewards/num_nodes_reward/mean": 0.6875, "eval_rewards/num_nodes_reward/std": 0.1532795548439026, "eval_rewards/tree_correctness_reward/mean": 0.2125, "eval_rewards/tree_correctness_reward/std": 0.25349844694137574, "eval_runtime": 25.4557, "eval_samples_per_second": 0.393, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 0.9297355651855469, "eval_sampling/importance_sampling_ratio/min": 2.0758027127548345e-26, "eval_sampling/sampling_logp_difference/max": 60.34375, "eval_sampling/sampling_logp_difference/mean": 1.529999351501465, "eval_steps_per_second": 0.039, "step": 360 }, { "clip_ratio/high_max": 0.027897030115127563, "clip_ratio/high_mean": 0.006824091688031331, "clip_ratio/low_mean": 0.0021184606484894175, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008942552580265328, "completions/clipped_ratio": 1.0, "completions/max_length": 172.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 113.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 82.0, "completions/min_terminated_length": 0.0, "entropy": 0.9123650342226028, "epoch": 180.5, "frac_reward_zero_std": 0.5625, "grad_norm": 0.0059018502943217754, "kl": 0.5765103027224541, "learning_rate": 8e-05, "loss": 0.0006, "num_tokens": 35396240.0, "reward": 0.694531261920929, "reward_std": 0.13458795845508575, "rewards/num_nodes_reward/mean": 0.875, "rewards/num_nodes_reward/std": 0.3320184051990509, "rewards/tree_correctness_reward/mean": 0.6171875, "rewards/tree_correctness_reward/std": 0.4879830479621887, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7269114255905151, "sampling/importance_sampling_ratio/min": 1.137272224775357e-28, "sampling/sampling_logp_difference/max": 64.34375, "sampling/sampling_logp_difference/mean": 4.984188079833984, "step": 361 }, { "clip_ratio/high_max": 0.03250980447046459, "clip_ratio/high_mean": 0.0067204706283519045, "clip_ratio/low_mean": 0.0012583184507093392, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007978788955369964, "completions/clipped_ratio": 1.0, "completions/max_length": 125.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 99.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 91.0, "completions/min_terminated_length": 0.0, "entropy": 0.5470959469676018, "epoch": 181.0, "frac_reward_zero_std": 0.625, "grad_norm": 0.006882414687424898, "kl": 0.6148336306214333, "learning_rate": 8e-05, "loss": 0.0001, "num_tokens": 35473040.0, "reward": 0.8726562261581421, "reward_std": 0.12485998123884201, "rewards/num_nodes_reward/mean": 0.9765625, "rewards/num_nodes_reward/std": 0.15188287198543549, "rewards/tree_correctness_reward/mean": 0.828125, "rewards/tree_correctness_reward/std": 0.3787541687488556, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7739870548248291, "sampling/importance_sampling_ratio/min": 1.9345652124047606e-28, "sampling/sampling_logp_difference/max": 63.8125, "sampling/sampling_logp_difference/mean": 4.492979049682617, "step": 362 }, { "clip_ratio/high_max": 0.042520994786173105, "clip_ratio/high_mean": 0.013180926674976945, "clip_ratio/low_mean": 0.0023049426890793256, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015485869254916906, "completions/clipped_ratio": 1.0, "completions/max_length": 115.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 97.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 82.0, "completions/min_terminated_length": 0.0, "entropy": 0.5171583853662014, "epoch": 181.5, "frac_reward_zero_std": 0.4375, "grad_norm": 0.008538903668522835, "kl": 0.6991748958826065, "learning_rate": 8e-05, "loss": 0.0003, "num_tokens": 35549664.0, "reward": 0.7039062976837158, "reward_std": 0.1550445556640625, "rewards/num_nodes_reward/mean": 0.9609375, "rewards/num_nodes_reward/std": 0.194504976272583, "rewards/tree_correctness_reward/mean": 0.59375, "rewards/tree_correctness_reward/std": 0.4930621087551117, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7825911045074463, "sampling/importance_sampling_ratio/min": 2.701258182483389e-29, "sampling/sampling_logp_difference/max": 65.78125, "sampling/sampling_logp_difference/mean": 4.368264675140381, "step": 363 }, { "clip_ratio/high_max": 0.03215591795742512, "clip_ratio/high_mean": 0.00862394942669198, "clip_ratio/low_mean": 0.004897460908978246, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01352141029201448, "completions/clipped_ratio": 1.0, "completions/max_length": 115.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 95.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 75.0, "completions/min_terminated_length": 0.0, "entropy": 0.4742717258632183, "epoch": 182.0, "frac_reward_zero_std": 0.4375, "grad_norm": 0.00609088409692049, "kl": 0.7207903489470482, "learning_rate": 8e-05, "loss": 0.0, "num_tokens": 35625968.0, "reward": 0.7554687261581421, "reward_std": 0.19694583117961884, "rewards/num_nodes_reward/mean": 0.859375, "rewards/num_nodes_reward/std": 0.3490002751350403, "rewards/tree_correctness_reward/mean": 0.7109375, "rewards/tree_correctness_reward/std": 0.45510825514793396, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7899937629699707, "sampling/importance_sampling_ratio/min": 7.816349966256032e-29, "sampling/sampling_logp_difference/max": 64.71875, "sampling/sampling_logp_difference/mean": 4.245706558227539, "step": 364 }, { "clip_ratio/high_max": 0.04116437386255711, "clip_ratio/high_mean": 0.007038761832518503, "clip_ratio/low_mean": 0.002450156731356401, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009488918585702777, "completions/clipped_ratio": 1.0, "completions/max_length": 141.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 98.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 84.0, "completions/min_terminated_length": 0.0, "entropy": 0.519985843449831, "epoch": 182.5, "frac_reward_zero_std": 0.6875, "grad_norm": 0.007319293450564146, "kl": 0.7247961163520813, "learning_rate": 8e-05, "loss": 0.0001, "num_tokens": 35702688.0, "reward": 0.667187511920929, "reward_std": 0.10678164660930634, "rewards/num_nodes_reward/mean": 0.875, "rewards/num_nodes_reward/std": 0.3320184051990509, "rewards/tree_correctness_reward/mean": 0.578125, "rewards/tree_correctness_reward/std": 0.4957992732524872, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7877151966094971, "sampling/importance_sampling_ratio/min": 1.5544668082166554e-28, "sampling/sampling_logp_difference/max": 64.03125, "sampling/sampling_logp_difference/mean": 4.240224838256836, "step": 365 }, { "clip_ratio/high_max": 0.039955109590664506, "clip_ratio/high_mean": 0.010667750291759148, "clip_ratio/low_mean": 0.005455432430608198, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01612318237312138, "completions/clipped_ratio": 1.0, "completions/max_length": 139.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 98.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 86.0, "completions/min_terminated_length": 0.0, "entropy": 0.5208862833678722, "epoch": 183.0, "frac_reward_zero_std": 0.4375, "grad_norm": 0.01826290600001812, "kl": 0.7288786843419075, "learning_rate": 8e-05, "loss": -0.0, "num_tokens": 35779472.0, "reward": 0.67578125, "reward_std": 0.20516584813594818, "rewards/num_nodes_reward/mean": 0.8671875, "rewards/num_nodes_reward/std": 0.3407054841518402, "rewards/tree_correctness_reward/mean": 0.59375, "rewards/tree_correctness_reward/std": 0.4930621087551117, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.769821047782898, "sampling/importance_sampling_ratio/min": 6.966637017400296e-28, "sampling/sampling_logp_difference/max": 62.53125, "sampling/sampling_logp_difference/mean": 4.564477443695068, "step": 366 }, { "clip_ratio/high_max": 0.025312797748483717, "clip_ratio/high_mean": 0.004072140480275266, "clip_ratio/low_mean": 0.005545366686419584, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00961750722490251, "completions/clipped_ratio": 1.0, "completions/max_length": 108.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 95.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 86.0, "completions/min_terminated_length": 0.0, "entropy": 0.4440036118030548, "epoch": 183.5, "frac_reward_zero_std": 0.625, "grad_norm": 0.006318064406514168, "kl": 0.7188501134514809, "learning_rate": 8e-05, "loss": -0.0005, "num_tokens": 35855840.0, "reward": 0.676562488079071, "reward_std": 0.10798124969005585, "rewards/num_nodes_reward/mean": 0.90625, "rewards/num_nodes_reward/std": 0.29262590408325195, "rewards/tree_correctness_reward/mean": 0.578125, "rewards/tree_correctness_reward/std": 0.4957992732524872, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.782090425491333, "sampling/importance_sampling_ratio/min": 6.752296422029552e-28, "sampling/sampling_logp_difference/max": 62.5625, "sampling/sampling_logp_difference/mean": 4.396368980407715, "step": 367 }, { "clip_ratio/high_max": 0.03373740799725056, "clip_ratio/high_mean": 0.008129282272420824, "clip_ratio/low_mean": 0.001835072907852009, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009964354889234528, "completions/clipped_ratio": 1.0, "completions/max_length": 135.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 101.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 83.0, "completions/min_terminated_length": 0.0, "entropy": 0.6326133534312248, "epoch": 184.0, "frac_reward_zero_std": 0.625, "grad_norm": 0.005434783175587654, "kl": 0.7549038082361221, "learning_rate": 8e-05, "loss": -0.0002, "num_tokens": 35932944.0, "reward": 0.7914062738418579, "reward_std": 0.12394934892654419, "rewards/num_nodes_reward/mean": 0.90625, "rewards/num_nodes_reward/std": 0.29262590408325195, "rewards/tree_correctness_reward/mean": 0.7421875, "rewards/tree_correctness_reward/std": 0.43914902210235596, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7454341650009155, "sampling/importance_sampling_ratio/min": 3.158090871777918e-29, "sampling/sampling_logp_difference/max": 65.625, "sampling/sampling_logp_difference/mean": 4.908700466156006, "step": 368 }, { "clip_ratio/high_max": 0.032388686086051166, "clip_ratio/high_mean": 0.006339760890114121, "clip_ratio/low_mean": 0.002383519873546902, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00872328071272932, "completions/clipped_ratio": 1.0, "completions/max_length": 118.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 99.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 86.0, "completions/min_terminated_length": 0.0, "entropy": 0.5307888388633728, "epoch": 184.5, "frac_reward_zero_std": 0.6875, "grad_norm": 0.005802895408123732, "kl": 0.6325005069375038, "learning_rate": 8e-05, "loss": 0.0004, "num_tokens": 36009744.0, "reward": 0.651562511920929, "reward_std": 0.10939201712608337, "rewards/num_nodes_reward/mean": 0.9140625, "rewards/num_nodes_reward/std": 0.2813730239868164, "rewards/tree_correctness_reward/mean": 0.5390625, "rewards/tree_correctness_reward/std": 0.5004304051399231, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7508459091186523, "sampling/importance_sampling_ratio/min": 1.275983958642445e-29, "sampling/sampling_logp_difference/max": 66.53125, "sampling/sampling_logp_difference/mean": 4.841769695281982, "step": 369 }, { "clip_ratio/high_max": 0.05045486451126635, "clip_ratio/high_mean": 0.01177544193342328, "clip_ratio/low_mean": 0.0037853018584428355, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015560743515379727, "completions/clipped_ratio": 1.0, "completions/max_length": 110.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 93.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 86.0, "completions/min_terminated_length": 0.0, "entropy": 0.4154197759926319, "epoch": 185.0, "frac_reward_zero_std": 0.5, "grad_norm": 0.008196084760129452, "kl": 0.9588414654135704, "learning_rate": 8e-05, "loss": 0.0002, "num_tokens": 36085872.0, "reward": 0.780468761920929, "reward_std": 0.16638565063476562, "rewards/num_nodes_reward/mean": 0.90625, "rewards/num_nodes_reward/std": 0.29262590408325195, "rewards/tree_correctness_reward/mean": 0.7265625, "rewards/tree_correctness_reward/std": 0.447474867105484, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7988976836204529, "sampling/importance_sampling_ratio/min": 1.371812093773456e-28, "sampling/sampling_logp_difference/max": 64.15625, "sampling/sampling_logp_difference/mean": 4.1529927253723145, "step": 370 }, { "epoch": 185.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 101.8, "eval_completions/max_terminated_length": 0.0, "eval_completions/mean_length": 80.325, "eval_completions/mean_terminated_length": 0.0, "eval_completions/min_length": 68.8, "eval_completions/min_terminated_length": 0.0, "eval_entropy": 0.262069308757782, "eval_frac_reward_zero_std": 0.7, "eval_kl": 0.7317587494850158, "eval_loss": 0.0001651317288633436, "eval_num_tokens": 36085872.0, "eval_reward": 0.3850000023841858, "eval_reward_std": 0.0869140475988388, "eval_rewards/num_nodes_reward/mean": 0.7, "eval_rewards/num_nodes_reward/std": 0.10327955484390258, "eval_rewards/tree_correctness_reward/mean": 0.25, "eval_rewards/tree_correctness_reward/std": 0.1894427239894867, "eval_runtime": 16.8337, "eval_samples_per_second": 0.594, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 0.9360223293304444, "eval_sampling/importance_sampling_ratio/min": 1.1238693089952993e-25, "eval_sampling/sampling_logp_difference/max": 59.7375, "eval_sampling/sampling_logp_difference/mean": 1.5069370031356812, "eval_steps_per_second": 0.059, "step": 370 }, { "clip_ratio/high_max": 0.02436972805298865, "clip_ratio/high_mean": 0.006899724394315854, "clip_ratio/low_mean": 0.0037577325783786364, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010657456878107041, "completions/clipped_ratio": 1.0, "completions/max_length": 145.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 105.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 84.0, "completions/min_terminated_length": 0.0, "entropy": 0.7117888033390045, "epoch": 185.5, "frac_reward_zero_std": 0.5, "grad_norm": 0.008038122206926346, "kl": 0.70255908370018, "learning_rate": 8e-05, "loss": -0.0004, "num_tokens": 36163552.0, "reward": 0.582812488079071, "reward_std": 0.1833796203136444, "rewards/num_nodes_reward/mean": 0.921875, "rewards/num_nodes_reward/std": 0.2694226801395416, "rewards/tree_correctness_reward/mean": 0.4375, "rewards/tree_correctness_reward/std": 0.49802759289741516, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7324906587600708, "sampling/importance_sampling_ratio/min": 7.816349966256032e-29, "sampling/sampling_logp_difference/max": 64.71875, "sampling/sampling_logp_difference/mean": 5.006816387176514, "step": 371 }, { "clip_ratio/high_max": 0.022743420209735632, "clip_ratio/high_mean": 0.005491425021318719, "clip_ratio/low_mean": 0.004853067519434262, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010344492562580854, "completions/clipped_ratio": 1.0, "completions/max_length": 95.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 86.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 77.0, "completions/min_terminated_length": 0.0, "entropy": 0.3063677288591862, "epoch": 186.0, "frac_reward_zero_std": 0.5625, "grad_norm": 0.009592382237315178, "kl": 0.780382551252842, "learning_rate": 8e-05, "loss": 0.0, "num_tokens": 36238688.0, "reward": 0.7421875, "reward_std": 0.14429199695587158, "rewards/num_nodes_reward/mean": 0.8515625, "rewards/num_nodes_reward/std": 0.356930136680603, "rewards/tree_correctness_reward/mean": 0.6953125, "rewards/tree_correctness_reward/std": 0.46208351850509644, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.8268502950668335, "sampling/importance_sampling_ratio/min": 4.497998792501695e-28, "sampling/sampling_logp_difference/max": 62.96875, "sampling/sampling_logp_difference/mean": 3.766963005065918, "step": 372 }, { "clip_ratio/high_max": 0.042124570813030005, "clip_ratio/high_mean": 0.008330369659233838, "clip_ratio/low_mean": 0.004249643905495759, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012580013601109385, "completions/clipped_ratio": 1.0, "completions/max_length": 114.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 97.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 89.0, "completions/min_terminated_length": 0.0, "entropy": 0.5176512636244297, "epoch": 186.5, "frac_reward_zero_std": 0.5625, "grad_norm": 0.010291434824466705, "kl": 0.6621697843074799, "learning_rate": 8e-05, "loss": 0.0003, "num_tokens": 36315328.0, "reward": 0.78515625, "reward_std": 0.1656482219696045, "rewards/num_nodes_reward/mean": 0.8671875, "rewards/num_nodes_reward/std": 0.3407054841518402, "rewards/tree_correctness_reward/mean": 0.75, "rewards/tree_correctness_reward/std": 0.434714138507843, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.748168408870697, "sampling/importance_sampling_ratio/min": 2.192149796152425e-28, "sampling/sampling_logp_difference/max": 63.6875, "sampling/sampling_logp_difference/mean": 4.898381233215332, "step": 373 }, { "clip_ratio/high_max": 0.021431234141346067, "clip_ratio/high_mean": 0.004978061631845776, "clip_ratio/low_mean": 0.0033283154625678435, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008306377159897238, "completions/clipped_ratio": 1.0, "completions/max_length": 105.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 92.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 86.0, "completions/min_terminated_length": 0.0, "entropy": 0.37720317393541336, "epoch": 187.0, "frac_reward_zero_std": 0.625, "grad_norm": 0.007131617050617933, "kl": 0.665986955165863, "learning_rate": 8e-05, "loss": 0.0001, "num_tokens": 36391232.0, "reward": 0.6671874523162842, "reward_std": 0.11378994584083557, "rewards/num_nodes_reward/mean": 0.9296875, "rewards/num_nodes_reward/std": 0.2566775679588318, "rewards/tree_correctness_reward/mean": 0.5546875, "rewards/tree_correctness_reward/std": 0.4989531338214874, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7962744235992432, "sampling/importance_sampling_ratio/min": 7.342781246671332e-29, "sampling/sampling_logp_difference/max": 64.78125, "sampling/sampling_logp_difference/mean": 4.21209716796875, "step": 374 }, { "clip_ratio/high_max": 0.027662408887408674, "clip_ratio/high_mean": 0.00640273500175681, "clip_ratio/low_mean": 0.0006165514932945371, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007019286465947516, "completions/clipped_ratio": 1.0, "completions/max_length": 98.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 90.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 84.0, "completions/min_terminated_length": 0.0, "entropy": 0.3610077239573002, "epoch": 187.5, "frac_reward_zero_std": 0.75, "grad_norm": 0.004398862365633249, "kl": 0.7623052671551704, "learning_rate": 8e-05, "loss": -0.0, "num_tokens": 36466912.0, "reward": 0.8179687261581421, "reward_std": 0.07991445064544678, "rewards/num_nodes_reward/mean": 0.921875, "rewards/num_nodes_reward/std": 0.2694226801395416, "rewards/tree_correctness_reward/mean": 0.7734375, "rewards/tree_correctness_reward/std": 0.4202519655227661, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7901061773300171, "sampling/importance_sampling_ratio/min": 5.958880211007578e-28, "sampling/sampling_logp_difference/max": 62.6875, "sampling/sampling_logp_difference/mean": 4.358028411865234, "step": 375 }, { "clip_ratio/high_max": 0.022393293620552868, "clip_ratio/high_mean": 0.0041137784501188435, "clip_ratio/low_mean": 0.0010024697839980945, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005116248132253531, "completions/clipped_ratio": 1.0, "completions/max_length": 106.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 91.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 79.0, "completions/min_terminated_length": 0.0, "entropy": 0.403529804199934, "epoch": 188.0, "frac_reward_zero_std": 0.8125, "grad_norm": 0.0033462378196418285, "kl": 0.8778767138719559, "learning_rate": 8e-05, "loss": 0.0001, "num_tokens": 36542800.0, "reward": 0.7789062261581421, "reward_std": 0.055972591042518616, "rewards/num_nodes_reward/mean": 0.9375, "rewards/num_nodes_reward/std": 0.24301259219646454, "rewards/tree_correctness_reward/mean": 0.7109375, "rewards/tree_correctness_reward/std": 0.45510825514793396, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7872514724731445, "sampling/importance_sampling_ratio/min": 1.1486043002716342e-27, "sampling/sampling_logp_difference/max": 62.03125, "sampling/sampling_logp_difference/mean": 4.340131759643555, "step": 376 }, { "clip_ratio/high_max": 0.029331887373700738, "clip_ratio/high_mean": 0.00570644426625222, "clip_ratio/low_mean": 0.0036902570645906962, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009396701294463128, "completions/clipped_ratio": 1.0, "completions/max_length": 124.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 95.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 78.0, "completions/min_terminated_length": 0.0, "entropy": 0.4706118740141392, "epoch": 188.5, "frac_reward_zero_std": 0.625, "grad_norm": 0.005480558145791292, "kl": 0.9544296860694885, "learning_rate": 8e-05, "loss": -0.0, "num_tokens": 36619136.0, "reward": 0.7007812261581421, "reward_std": 0.11519742757081985, "rewards/num_nodes_reward/mean": 0.859375, "rewards/num_nodes_reward/std": 0.3490002751350403, "rewards/tree_correctness_reward/mean": 0.6328125, "rewards/tree_correctness_reward/std": 0.4839322865009308, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7748527526855469, "sampling/importance_sampling_ratio/min": 1.9959748127583844e-28, "sampling/sampling_logp_difference/max": 63.78125, "sampling/sampling_logp_difference/mean": 4.512609958648682, "step": 377 }, { "clip_ratio/high_max": 0.02436950709670782, "clip_ratio/high_mean": 0.005416427447926253, "clip_ratio/low_mean": 0.0014884088523103856, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006904836278408766, "completions/clipped_ratio": 1.0, "completions/max_length": 111.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 88.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 83.0, "completions/min_terminated_length": 0.0, "entropy": 0.37028710544109344, "epoch": 189.0, "frac_reward_zero_std": 0.6875, "grad_norm": 0.005504610948264599, "kl": 0.7522167786955833, "learning_rate": 8e-05, "loss": -0.0, "num_tokens": 36694544.0, "reward": 0.745312511920929, "reward_std": 0.09361547976732254, "rewards/num_nodes_reward/mean": 0.8984375, "rewards/num_nodes_reward/std": 0.3032590448856354, "rewards/tree_correctness_reward/mean": 0.6796875, "rewards/tree_correctness_reward/std": 0.4684300124645233, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.8082362413406372, "sampling/importance_sampling_ratio/min": 2.4315951772377487e-27, "sampling/sampling_logp_difference/max": 61.28125, "sampling/sampling_logp_difference/mean": 4.0575408935546875, "step": 378 }, { "clip_ratio/high_max": 0.022257848759181798, "clip_ratio/high_mean": 0.0042109557034564205, "clip_ratio/low_mean": 0.0027452061694930308, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006956161872949451, "completions/clipped_ratio": 1.0, "completions/max_length": 120.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 90.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 77.0, "completions/min_terminated_length": 0.0, "entropy": 0.4397210665047169, "epoch": 189.5, "frac_reward_zero_std": 0.6875, "grad_norm": 0.004604096058756113, "kl": 0.6888674721121788, "learning_rate": 8e-05, "loss": 0.0005, "num_tokens": 36770288.0, "reward": 0.6656250357627869, "reward_std": 0.08979924768209457, "rewards/num_nodes_reward/mean": 0.90625, "rewards/num_nodes_reward/std": 0.29262590408325195, "rewards/tree_correctness_reward/mean": 0.5625, "rewards/tree_correctness_reward/std": 0.49802759289741516, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7889770865440369, "sampling/importance_sampling_ratio/min": 2.7281743775330376e-28, "sampling/sampling_logp_difference/max": 63.46875, "sampling/sampling_logp_difference/mean": 4.316176414489746, "step": 379 }, { "clip_ratio/high_max": 0.023623179527930915, "clip_ratio/high_mean": 0.003902257070876658, "clip_ratio/low_mean": 0.0033472160575911403, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007249473012052476, "completions/clipped_ratio": 1.0, "completions/max_length": 97.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 90.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 86.0, "completions/min_terminated_length": 0.0, "entropy": 0.34349144995212555, "epoch": 190.0, "frac_reward_zero_std": 0.75, "grad_norm": 0.005316847003996372, "kl": 0.7725635170936584, "learning_rate": 8e-05, "loss": -0.0001, "num_tokens": 36845952.0, "reward": 0.6773437261581421, "reward_std": 0.07859142869710922, "rewards/num_nodes_reward/mean": 0.78125, "rewards/num_nodes_reward/std": 0.41502299904823303, "rewards/tree_correctness_reward/mean": 0.6328125, "rewards/tree_correctness_reward/std": 0.4839322865009308, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7870017290115356, "sampling/importance_sampling_ratio/min": 2.2617360524555665e-28, "sampling/sampling_logp_difference/max": 63.65625, "sampling/sampling_logp_difference/mean": 4.3796586990356445, "step": 380 }, { "epoch": 190.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 85.8, "eval_completions/max_terminated_length": 0.0, "eval_completions/mean_length": 74.0, "eval_completions/mean_terminated_length": 0.0, "eval_completions/min_length": 67.6, "eval_completions/min_terminated_length": 0.0, "eval_entropy": 0.20707919597625732, "eval_frac_reward_zero_std": 0.7, "eval_kl": 0.9449100971221924, "eval_loss": 0.00042462439159862697, "eval_num_tokens": 36845952.0, "eval_reward": 0.32375000715255736, "eval_reward_std": 0.0711840644478798, "eval_rewards/num_nodes_reward/mean": 0.6125, "eval_rewards/num_nodes_reward/std": 0.20574907064437867, "eval_rewards/tree_correctness_reward/mean": 0.2, "eval_rewards/tree_correctness_reward/std": 0.1, "eval_runtime": 15.0018, "eval_samples_per_second": 0.667, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 0.9575300693511963, "eval_sampling/importance_sampling_ratio/min": 6.1732146819211e-26, "eval_sampling/sampling_logp_difference/max": 58.78125, "eval_sampling/sampling_logp_difference/mean": 1.108824372291565, "eval_steps_per_second": 0.067, "step": 380 }, { "clip_ratio/high_max": 0.0025684931315481663, "clip_ratio/high_mean": 0.0003210616414435208, "clip_ratio/low_mean": 0.0006421232828870416, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0009631849243305624, "completions/clipped_ratio": 1.0, "completions/max_length": 99.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 90.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 73.0, "completions/min_terminated_length": 0.0, "entropy": 0.3710038773715496, "epoch": 190.5, "frac_reward_zero_std": 0.9375, "grad_norm": 0.0026101444382220507, "kl": 0.6916875243186951, "learning_rate": 8e-05, "loss": -0.0001, "num_tokens": 36921648.0, "reward": 0.660937488079071, "reward_std": 0.03234682232141495, "rewards/num_nodes_reward/mean": 0.8359375, "rewards/num_nodes_reward/std": 0.371787428855896, "rewards/tree_correctness_reward/mean": 0.5859375, "rewards/tree_correctness_reward/std": 0.49449479579925537, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.78098464012146, "sampling/importance_sampling_ratio/min": 4.788094879269939e-28, "sampling/sampling_logp_difference/max": 62.90625, "sampling/sampling_logp_difference/mean": 4.481934547424316, "step": 381 }, { "clip_ratio/high_max": 0.03591069905087352, "clip_ratio/high_mean": 0.007468281139153987, "clip_ratio/low_mean": 0.0030008184839971364, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010469099681358784, "completions/clipped_ratio": 1.0, "completions/max_length": 102.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 83.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 73.0, "completions/min_terminated_length": 0.0, "entropy": 0.29182541370391846, "epoch": 191.0, "frac_reward_zero_std": 0.625, "grad_norm": 0.005270560272037983, "kl": 0.8865031152963638, "learning_rate": 8e-05, "loss": -0.0004, "num_tokens": 36996496.0, "reward": 0.746874988079071, "reward_std": 0.10992893576622009, "rewards/num_nodes_reward/mean": 0.8671875, "rewards/num_nodes_reward/std": 0.3407054841518402, "rewards/tree_correctness_reward/mean": 0.6953125, "rewards/tree_correctness_reward/std": 0.46208351850509644, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.833159327507019, "sampling/importance_sampling_ratio/min": 1.013639655704346e-27, "sampling/sampling_logp_difference/max": 62.15625, "sampling/sampling_logp_difference/mean": 3.6772990226745605, "step": 382 }, { "clip_ratio/high_max": 0.01929909596219659, "clip_ratio/high_mean": 0.0032467969867866486, "clip_ratio/low_mean": 0.0005562733276747167, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0038030703726690263, "completions/clipped_ratio": 1.0, "completions/max_length": 89.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 83.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 77.0, "completions/min_terminated_length": 0.0, "entropy": 0.2700684703886509, "epoch": 191.5, "frac_reward_zero_std": 0.875, "grad_norm": 0.0036089986097067595, "kl": 0.8284575343132019, "learning_rate": 8e-05, "loss": -0.0, "num_tokens": 37071280.0, "reward": 0.7554687261581421, "reward_std": 0.03756504878401756, "rewards/num_nodes_reward/mean": 0.8046875, "rewards/num_nodes_reward/std": 0.3979988098144531, "rewards/tree_correctness_reward/mean": 0.734375, "rewards/tree_correctness_reward/std": 0.44340085983276367, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.8236706256866455, "sampling/importance_sampling_ratio/min": 2.3567828137340154e-27, "sampling/sampling_logp_difference/max": 61.3125, "sampling/sampling_logp_difference/mean": 3.9352946281433105, "step": 383 }, { "clip_ratio/high_max": 0.032589396461844444, "clip_ratio/high_mean": 0.006821497634518892, "clip_ratio/low_mean": 0.0015323160914704204, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008353813522262499, "completions/clipped_ratio": 1.0, "completions/max_length": 93.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 85.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 80.0, "completions/min_terminated_length": 0.0, "entropy": 0.2940428741276264, "epoch": 192.0, "frac_reward_zero_std": 0.6875, "grad_norm": 0.014723536558449268, "kl": 0.8727653473615646, "learning_rate": 8e-05, "loss": 0.0002, "num_tokens": 37146352.0, "reward": 0.746874988079071, "reward_std": 0.10973204672336578, "rewards/num_nodes_reward/mean": 0.921875, "rewards/num_nodes_reward/std": 0.2694226801395416, "rewards/tree_correctness_reward/mean": 0.671875, "rewards/tree_correctness_reward/std": 0.4713755249977112, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.8031394481658936, "sampling/importance_sampling_ratio/min": 3.7289723175541984e-28, "sampling/sampling_logp_difference/max": 63.15625, "sampling/sampling_logp_difference/mean": 4.223554611206055, "step": 384 }, { "clip_ratio/high_max": 0.015415469417348504, "clip_ratio/high_mean": 0.0036762843155884184, "clip_ratio/low_mean": 0.0031136025208979845, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006789886887418106, "completions/clipped_ratio": 1.0, "completions/max_length": 94.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 82.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 72.0, "completions/min_terminated_length": 0.0, "entropy": 0.2723454646766186, "epoch": 192.5, "frac_reward_zero_std": 0.6875, "grad_norm": 0.0047114258632063866, "kl": 0.8367524519562721, "learning_rate": 8e-05, "loss": 0.0002, "num_tokens": 37220992.0, "reward": 0.76953125, "reward_std": 0.10802260786294937, "rewards/num_nodes_reward/mean": 0.796875, "rewards/num_nodes_reward/std": 0.40390563011169434, "rewards/tree_correctness_reward/mean": 0.7578125, "rewards/tree_correctness_reward/std": 0.4300905168056488, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.8327475786209106, "sampling/importance_sampling_ratio/min": 2.1458751884891414e-27, "sampling/sampling_logp_difference/max": 61.40625, "sampling/sampling_logp_difference/mean": 3.697751522064209, "step": 385 }, { "clip_ratio/high_max": 0.018417407642118633, "clip_ratio/high_mean": 0.002302175955264829, "clip_ratio/low_mean": 0.0017843906316556968, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004086566521436907, "completions/clipped_ratio": 1.0, "completions/max_length": 102.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 84.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 72.0, "completions/min_terminated_length": 0.0, "entropy": 0.2963908091187477, "epoch": 193.0, "frac_reward_zero_std": 0.8125, "grad_norm": 0.005321076139807701, "kl": 0.8752584233880043, "learning_rate": 8e-05, "loss": 0.0001, "num_tokens": 37295920.0, "reward": 0.7156250476837158, "reward_std": 0.0638899877667427, "rewards/num_nodes_reward/mean": 1.0, "rewards/num_nodes_reward/std": 0.0, "rewards/tree_correctness_reward/mean": 0.59375, "rewards/tree_correctness_reward/std": 0.4930621087551117, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.8262186050415039, "sampling/importance_sampling_ratio/min": 3.1895588216951936e-28, "sampling/sampling_logp_difference/max": 63.3125, "sampling/sampling_logp_difference/mean": 3.783421754837036, "step": 386 }, { "clip_ratio/high_max": 0.03564241586718708, "clip_ratio/high_mean": 0.006658761922153644, "clip_ratio/low_mean": 0.0018506498236092739, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0085094116802793, "completions/clipped_ratio": 1.0, "completions/max_length": 95.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 82.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 69.0, "completions/min_terminated_length": 0.0, "entropy": 0.27315158396959305, "epoch": 193.5, "frac_reward_zero_std": 0.6875, "grad_norm": 0.006161698140203953, "kl": 0.7671466022729874, "learning_rate": 8e-05, "loss": -0.0003, "num_tokens": 37370624.0, "reward": 0.6890624761581421, "reward_std": 0.10406197607517242, "rewards/num_nodes_reward/mean": 0.875, "rewards/num_nodes_reward/std": 0.3320184051990509, "rewards/tree_correctness_reward/mean": 0.609375, "rewards/tree_correctness_reward/std": 0.4898075461387634, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.8381950259208679, "sampling/importance_sampling_ratio/min": 3.290806162577846e-28, "sampling/sampling_logp_difference/max": 63.28125, "sampling/sampling_logp_difference/mean": 3.6331167221069336, "step": 387 }, { "clip_ratio/high_max": 0.029706281085964292, "clip_ratio/high_mean": 0.007191480821347795, "clip_ratio/low_mean": 0.0026766495648189448, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009868130349786952, "completions/clipped_ratio": 1.0, "completions/max_length": 94.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 82.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 77.0, "completions/min_terminated_length": 0.0, "entropy": 0.2541748434305191, "epoch": 194.0, "frac_reward_zero_std": 0.5625, "grad_norm": 0.007917685434222221, "kl": 0.7916299700737, "learning_rate": 8e-05, "loss": -0.0, "num_tokens": 37445296.0, "reward": 0.8289062976837158, "reward_std": 0.13502641022205353, "rewards/num_nodes_reward/mean": 0.921875, "rewards/num_nodes_reward/std": 0.2694226801395416, "rewards/tree_correctness_reward/mean": 0.7890625, "rewards/tree_correctness_reward/std": 0.4095771610736847, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.8359342217445374, "sampling/importance_sampling_ratio/min": 4.8430893875347414e-30, "sampling/sampling_logp_difference/max": 67.5, "sampling/sampling_logp_difference/mean": 3.7026376724243164, "step": 388 }, { "clip_ratio/high_max": 0.05419340170919895, "clip_ratio/high_mean": 0.01603995938785374, "clip_ratio/low_mean": 0.0037575441383523867, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.019797503598965704, "completions/clipped_ratio": 1.0, "completions/max_length": 111.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 89.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 79.0, "completions/min_terminated_length": 0.0, "entropy": 0.42293187975883484, "epoch": 194.5, "frac_reward_zero_std": 0.375, "grad_norm": 0.007536278106272221, "kl": 0.7875378355383873, "learning_rate": 8e-05, "loss": -0.0, "num_tokens": 37520864.0, "reward": 0.7054687142372131, "reward_std": 0.17952129244804382, "rewards/num_nodes_reward/mean": 0.875, "rewards/num_nodes_reward/std": 0.3320184051990509, "rewards/tree_correctness_reward/mean": 0.6328125, "rewards/tree_correctness_reward/std": 0.4839322865009308, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.7582734823226929, "sampling/importance_sampling_ratio/min": 1.6712093388374966e-27, "sampling/sampling_logp_difference/max": 61.65625, "sampling/sampling_logp_difference/mean": 4.905490875244141, "step": 389 }, { "clip_ratio/high_max": 0.02206170599674806, "clip_ratio/high_mean": 0.003682216069137212, "clip_ratio/low_mean": 0.000632073511951603, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004314289544709027, "completions/clipped_ratio": 1.0, "completions/max_length": 93.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 82.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 68.0, "completions/min_terminated_length": 0.0, "entropy": 0.2755807340145111, "epoch": 195.0, "frac_reward_zero_std": 0.8125, "grad_norm": 0.0036227735690772533, "kl": 0.9291605204343796, "learning_rate": 8e-05, "loss": -0.0, "num_tokens": 37595488.0, "reward": 0.649218738079071, "reward_std": 0.06465215235948563, "rewards/num_nodes_reward/mean": 0.796875, "rewards/num_nodes_reward/std": 0.40390563011169434, "rewards/tree_correctness_reward/mean": 0.5859375, "rewards/tree_correctness_reward/std": 0.49449479579925537, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.8191021680831909, "sampling/importance_sampling_ratio/min": 5.958880211007578e-28, "sampling/sampling_logp_difference/max": 62.6875, "sampling/sampling_logp_difference/mean": 3.954815626144409, "step": 390 }, { "epoch": 195.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 81.2, "eval_completions/max_terminated_length": 0.0, "eval_completions/mean_length": 70.55, "eval_completions/mean_terminated_length": 0.0, "eval_completions/min_length": 62.6, "eval_completions/min_terminated_length": 0.0, "eval_entropy": 0.20421577394008636, "eval_frac_reward_zero_std": 0.6, "eval_kl": 0.9790506005287171, "eval_loss": -8.885812712833285e-05, "eval_num_tokens": 37595488.0, "eval_reward": 0.23250000476837157, "eval_reward_std": 0.12473232448101043, "eval_rewards/num_nodes_reward/mean": 0.5125, "eval_rewards/num_nodes_reward/std": 0.30902862548828125, "eval_rewards/tree_correctness_reward/mean": 0.1125, "eval_rewards/tree_correctness_reward/std": 0.1707825243473053, "eval_runtime": 14.8563, "eval_samples_per_second": 0.673, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 0.9450402379035949, "eval_sampling/importance_sampling_ratio/min": 1.82532015410891e-25, "eval_sampling/sampling_logp_difference/max": 58.46875, "eval_sampling/sampling_logp_difference/mean": 1.3549765825271607, "eval_steps_per_second": 0.067, "step": 390 }, { "clip_ratio/high_max": 0.024604668840765953, "clip_ratio/high_mean": 0.003546216175891459, "clip_ratio/low_mean": 0.00018825300503522158, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0037344691809266806, "completions/clipped_ratio": 1.0, "completions/max_length": 88.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 80.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 71.0, "completions/min_terminated_length": 0.0, "entropy": 0.22626784816384315, "epoch": 195.5, "frac_reward_zero_std": 0.875, "grad_norm": 0.002924986183643341, "kl": 0.7221743166446686, "learning_rate": 8e-05, "loss": -0.0001, "num_tokens": 37669920.0, "reward": 0.6265624761581421, "reward_std": 0.030935922637581825, "rewards/num_nodes_reward/mean": 0.8125, "rewards/num_nodes_reward/std": 0.39184603095054626, "rewards/tree_correctness_reward/mean": 0.546875, "rewards/tree_correctness_reward/std": 0.4997538626194, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.8308653831481934, "sampling/importance_sampling_ratio/min": 6.148035420495571e-28, "sampling/sampling_logp_difference/max": 62.65625, "sampling/sampling_logp_difference/mean": 3.839251756668091, "step": 391 }, { "clip_ratio/high_max": 0.03919311729259789, "clip_ratio/high_mean": 0.007501703017624095, "clip_ratio/low_mean": 0.0015736665518488735, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009075369511265308, "completions/clipped_ratio": 1.0, "completions/max_length": 88.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 81.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 72.0, "completions/min_terminated_length": 0.0, "entropy": 0.25000043772161007, "epoch": 196.0, "frac_reward_zero_std": 0.75, "grad_norm": 0.004920801613479853, "kl": 0.7994618862867355, "learning_rate": 8e-05, "loss": -0.0002, "num_tokens": 37744416.0, "reward": 0.8226562738418579, "reward_std": 0.06850097328424454, "rewards/num_nodes_reward/mean": 0.8828125, "rewards/num_nodes_reward/std": 0.322907418012619, "rewards/tree_correctness_reward/mean": 0.796875, "rewards/tree_correctness_reward/std": 0.40390563011169434, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.8266158103942871, "sampling/importance_sampling_ratio/min": 2.2842721291068193e-27, "sampling/sampling_logp_difference/max": 61.34375, "sampling/sampling_logp_difference/mean": 3.892357349395752, "step": 392 }, { "clip_ratio/high_max": 0.02175403299042955, "clip_ratio/high_mean": 0.0038612416246905923, "clip_ratio/low_mean": 0.0011485877475934103, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005009829328628257, "completions/clipped_ratio": 1.0, "completions/max_length": 87.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 79.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 70.0, "completions/min_terminated_length": 0.0, "entropy": 0.25001101568341255, "epoch": 196.5, "frac_reward_zero_std": 0.75, "grad_norm": 0.00525604048743844, "kl": 0.8849917575716972, "learning_rate": 8e-05, "loss": -0.0001, "num_tokens": 37818688.0, "reward": 0.7093750238418579, "reward_std": 0.07513009011745453, "rewards/num_nodes_reward/mean": 0.796875, "rewards/num_nodes_reward/std": 0.40390563011169434, "rewards/tree_correctness_reward/mean": 0.671875, "rewards/tree_correctness_reward/std": 0.4713755249977112, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.8325746655464172, "sampling/importance_sampling_ratio/min": 2.1458751884891414e-27, "sampling/sampling_logp_difference/max": 61.40625, "sampling/sampling_logp_difference/mean": 3.737415313720703, "step": 393 }, { "clip_ratio/high_max": 0.01736833219183609, "clip_ratio/high_mean": 0.003359339665621519, "clip_ratio/low_mean": 0.0010754549293778837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004434794587723445, "completions/clipped_ratio": 1.0, "completions/max_length": 85.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 79.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 70.0, "completions/min_terminated_length": 0.0, "entropy": 0.2229394167661667, "epoch": 197.0, "frac_reward_zero_std": 0.75, "grad_norm": 0.004803083837032318, "kl": 0.7327466309070587, "learning_rate": 8e-05, "loss": 0.0, "num_tokens": 37892992.0, "reward": 0.72265625, "reward_std": 0.07533575594425201, "rewards/num_nodes_reward/mean": 0.859375, "rewards/num_nodes_reward/std": 0.3490002751350403, "rewards/tree_correctness_reward/mean": 0.6640625, "rewards/tree_correctness_reward/std": 0.47417303919792175, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.8238424062728882, "sampling/importance_sampling_ratio/min": 1.196873169873732e-26, "sampling/sampling_logp_difference/max": 59.6875, "sampling/sampling_logp_difference/mean": 3.961211919784546, "step": 394 }, { "clip_ratio/high_max": 0.007263103150762618, "clip_ratio/high_mean": 0.0016511714493390173, "clip_ratio/low_mean": 0.005032197885157075, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006683369378151838, "completions/clipped_ratio": 1.0, "completions/max_length": 83.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 72.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 68.0, "completions/min_terminated_length": 0.0, "entropy": 0.18092772364616394, "epoch": 197.5, "frac_reward_zero_std": 0.6875, "grad_norm": 0.0033364694099873304, "kl": 0.850004106760025, "learning_rate": 8e-05, "loss": -0.0001, "num_tokens": 37966416.0, "reward": 0.8031250238418579, "reward_std": 0.09831856191158295, "rewards/num_nodes_reward/mean": 0.9453125, "rewards/num_nodes_reward/std": 0.22826264798641205, "rewards/tree_correctness_reward/mean": 0.7421875, "rewards/tree_correctness_reward/std": 0.43914902210235596, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.8866138458251953, "sampling/importance_sampling_ratio/min": 7.972893139180348e-27, "sampling/sampling_logp_difference/max": 60.09375, "sampling/sampling_logp_difference/mean": 2.8824779987335205, "step": 395 }, { "clip_ratio/high_max": 0.019691155233886093, "clip_ratio/high_mean": 0.00385062880377518, "clip_ratio/low_mean": 0.0022057135647628456, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006056342390365899, "completions/clipped_ratio": 1.0, "completions/max_length": 98.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 78.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 71.0, "completions/min_terminated_length": 0.0, "entropy": 0.2254241481423378, "epoch": 198.0, "frac_reward_zero_std": 0.75, "grad_norm": 0.006684280000627041, "kl": 0.9299482479691505, "learning_rate": 8e-05, "loss": 0.0001, "num_tokens": 38040608.0, "reward": 0.6640625, "reward_std": 0.06977653503417969, "rewards/num_nodes_reward/mean": 0.8828125, "rewards/num_nodes_reward/std": 0.322907418012619, "rewards/tree_correctness_reward/mean": 0.5703125, "rewards/tree_correctness_reward/std": 0.4969765841960907, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.8431491851806641, "sampling/importance_sampling_ratio/min": 4.267582267292156e-27, "sampling/sampling_logp_difference/max": 60.71875, "sampling/sampling_logp_difference/mean": 3.5744214057922363, "step": 396 }, { "clip_ratio/high_max": 0.049030419904738665, "clip_ratio/high_mean": 0.012741899496177211, "clip_ratio/low_mean": 0.0036949885106878355, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016436888196039945, "completions/clipped_ratio": 1.0, "completions/max_length": 85.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 78.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 69.0, "completions/min_terminated_length": 0.0, "entropy": 0.21472573280334473, "epoch": 198.5, "frac_reward_zero_std": 0.5, "grad_norm": 0.008050273172557354, "kl": 1.043533816933632, "learning_rate": 8e-05, "loss": 0.0, "num_tokens": 38114768.0, "reward": 0.692187488079071, "reward_std": 0.15190069377422333, "rewards/num_nodes_reward/mean": 0.8671875, "rewards/num_nodes_reward/std": 0.3407054841518402, "rewards/tree_correctness_reward/mean": 0.6171875, "rewards/tree_correctness_reward/std": 0.4879830479621887, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.8212376832962036, "sampling/importance_sampling_ratio/min": 7.972893139180348e-27, "sampling/sampling_logp_difference/max": 60.09375, "sampling/sampling_logp_difference/mean": 4.016968727111816, "step": 397 }, { "clip_ratio/high_max": 0.02989498618990183, "clip_ratio/high_mean": 0.005211119074374437, "clip_ratio/low_mean": 0.001975535196834244, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00718665421300102, "completions/clipped_ratio": 1.0, "completions/max_length": 95.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 80.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 72.0, "completions/min_terminated_length": 0.0, "entropy": 0.248255280777812, "epoch": 199.0, "frac_reward_zero_std": 0.75, "grad_norm": 0.003393469611182809, "kl": 0.8070840090513229, "learning_rate": 8e-05, "loss": -0.0, "num_tokens": 38189216.0, "reward": 0.73046875, "reward_std": 0.08524449914693832, "rewards/num_nodes_reward/mean": 0.8671875, "rewards/num_nodes_reward/std": 0.3407054841518402, "rewards/tree_correctness_reward/mean": 0.671875, "rewards/tree_correctness_reward/std": 0.4713755249977112, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.8113921880722046, "sampling/importance_sampling_ratio/min": 3.7289723175541984e-28, "sampling/sampling_logp_difference/max": 63.15625, "sampling/sampling_logp_difference/mean": 4.147217750549316, "step": 398 }, { "clip_ratio/high_max": 0.020581876975484192, "clip_ratio/high_mean": 0.0036288216506363824, "clip_ratio/low_mean": 0.003044510362087749, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006673332012724131, "completions/clipped_ratio": 1.0, "completions/max_length": 86.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 78.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 70.0, "completions/min_terminated_length": 0.0, "entropy": 0.20714273303747177, "epoch": 199.5, "frac_reward_zero_std": 0.6875, "grad_norm": 0.0050604636780917645, "kl": 0.9331685900688171, "learning_rate": 8e-05, "loss": -0.0001, "num_tokens": 38263408.0, "reward": 0.7320312261581421, "reward_std": 0.11438354849815369, "rewards/num_nodes_reward/mean": 0.890625, "rewards/num_nodes_reward/std": 0.31333550810813904, "rewards/tree_correctness_reward/mean": 0.6640625, "rewards/tree_correctness_reward/std": 0.47417303919792175, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.8281363248825073, "sampling/importance_sampling_ratio/min": 7.41594636229363e-28, "sampling/sampling_logp_difference/max": 62.46875, "sampling/sampling_logp_difference/mean": 3.9100537300109863, "step": 399 }, { "clip_ratio/high_max": 0.041290885652415454, "clip_ratio/high_mean": 0.008128613088047132, "clip_ratio/low_mean": 0.0013543176901293918, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00948293082183227, "completions/clipped_ratio": 1.0, "completions/max_length": 88.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 76.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 67.0, "completions/min_terminated_length": 0.0, "entropy": 0.2007293663918972, "epoch": 200.0, "frac_reward_zero_std": 0.75, "grad_norm": 0.01022141519933939, "kl": 0.8350934460759163, "learning_rate": 8e-05, "loss": 0.0002, "num_tokens": 38337344.0, "reward": 0.7578125, "reward_std": 0.0626017227768898, "rewards/num_nodes_reward/mean": 0.8671875, "rewards/num_nodes_reward/std": 0.3407054841518402, "rewards/tree_correctness_reward/mean": 0.7109375, "rewards/tree_correctness_reward/std": 0.45510825514793396, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.8391262888908386, "sampling/importance_sampling_ratio/min": 2.48403073849663e-28, "sampling/sampling_logp_difference/max": 63.5625, "sampling/sampling_logp_difference/mean": 3.765017032623291, "step": 400 }, { "epoch": 200.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 78.4, "eval_completions/max_terminated_length": 0.0, "eval_completions/mean_length": 67.75, "eval_completions/mean_terminated_length": 0.0, "eval_completions/min_length": 58.8, "eval_completions/min_terminated_length": 0.0, "eval_entropy": 0.16210335195064546, "eval_frac_reward_zero_std": 0.7, "eval_kl": 1.034114944934845, "eval_loss": -2.3693830371485092e-05, "eval_num_tokens": 38337344.0, "eval_reward": 0.2200000047683716, "eval_reward_std": 0.09184188842773437, "eval_rewards/num_nodes_reward/mean": 0.5875, "eval_rewards/num_nodes_reward/std": 0.28390213251113894, "eval_rewards/tree_correctness_reward/mean": 0.0625, "eval_rewards/tree_correctness_reward/std": 0.14893558621406555, "eval_runtime": 14.535, "eval_samples_per_second": 0.688, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 0.9504938125610352, "eval_sampling/importance_sampling_ratio/min": 1.854839192056068e-26, "eval_sampling/sampling_logp_difference/max": 60.20625, "eval_sampling/sampling_logp_difference/mean": 1.281513524055481, "eval_steps_per_second": 0.069, "step": 400 }, { "epoch": 200.0, "step": 400, "total_flos": 0.0, "train_loss": 1.7647783374741266e-06, "train_runtime": 1084.2338, "train_samples_per_second": 47.222, "train_steps_per_second": 0.369 } ], "logging_steps": 1, "max_steps": 400, "num_input_tokens_seen": 38337344, "num_train_epochs": 200, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }