{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9404388714733543, "eval_steps": 500, "global_step": 300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0011156126391142606, "clip_ratio/high_mean": 0.0011156126391142606, "clip_ratio/low_mean": 0.0004950081609422341, "clip_ratio/low_min": 0.0004950081609422341, "clip_ratio/region_mean": 0.0016106208000564947, "completions/clipped_ratio": 0.05, "completions/max_length": 3211.9, "completions/max_terminated_length": 2960.1, "completions/mean_length": 1786.0, "completions/mean_terminated_length": 1665.2375244140626, "completions/min_length": 780.0, "completions/min_terminated_length": 780.0, "entropy": 0.28542927699163556, "epoch": 0.03134796238244514, "frac_reward_zero_std": 0.775, "grad_norm": 0.0, "learning_rate": 9.375000000000001e-07, "loss": -0.028166231513023377, "num_tokens": 153066.0, "reward": 0.807500034570694, "reward_std": 0.40052716955542567, "rewards/correctness_reward/mean": 0.7125, "rewards/correctness_reward/std": 0.39180393517017365, "rewards/format_reward/mean": 0.0950000025331974, "rewards/format_reward/std": 0.011700168624520302, "sampling/importance_sampling_ratio/max": 1.1340648874640464, "sampling/importance_sampling_ratio/mean": 0.3192014880478382, "sampling/importance_sampling_ratio/min": 0.0048722516017733145, "sampling/sampling_logp_difference/max": 0.82484130859375, "sampling/sampling_logp_difference/mean": 0.017703271098434926, "step": 10, "step_time": 32.390700388292316 }, { "clip_ratio/high_max": 0.0013198494911193849, "clip_ratio/high_mean": 0.0013198494911193849, "clip_ratio/low_mean": 0.0005266870924970135, "clip_ratio/low_min": 0.0005266870924970135, "clip_ratio/region_mean": 0.0018465365836163982, "completions/clipped_ratio": 0.1125, "completions/max_length": 3759.7, "completions/max_terminated_length": 2936.5, "completions/mean_length": 2046.6, "completions/mean_terminated_length": 1757.4464477539063, "completions/min_length": 914.5, "completions/min_terminated_length": 914.5, "entropy": 0.29361761510372164, "epoch": 0.06269592476489028, "frac_reward_zero_std": 0.8, "grad_norm": 0.15296021103858948, "learning_rate": 1.9791666666666666e-06, "loss": 0.01227574199438095, "num_tokens": 328184.0, "reward": 0.5512500315904617, "reward_std": 0.5255446821451187, "rewards/correctness_reward/mean": 0.4625, "rewards/correctness_reward/std": 0.518874603509903, "rewards/format_reward/mean": 0.0887500025331974, "rewards/format_reward/std": 0.022051936015486717, "sampling/importance_sampling_ratio/max": 1.5312834233045578, "sampling/importance_sampling_ratio/mean": 0.3868576504290104, "sampling/importance_sampling_ratio/min": 0.0028360410206005326, "sampling/sampling_logp_difference/max": 0.7048742353916169, "sampling/sampling_logp_difference/mean": 0.017818821407854558, "step": 20, "step_time": 37.95259270980314 }, { "clip_ratio/high_max": 0.0005335707101039588, "clip_ratio/high_mean": 0.0005335707101039588, "clip_ratio/low_mean": 0.0003907225793227553, "clip_ratio/low_min": 0.0003907225793227553, "clip_ratio/region_mean": 0.0009242932894267142, "completions/clipped_ratio": 0.2, "completions/max_length": 3884.0, "completions/max_terminated_length": 2609.1, "completions/mean_length": 2025.475, "completions/mean_terminated_length": 1500.2830017089843, "completions/min_length": 836.0, "completions/min_terminated_length": 836.0, "entropy": 0.2717891433276236, "epoch": 0.09404388714733543, "frac_reward_zero_std": 0.875, "grad_norm": 0.0, "learning_rate": 3.0208333333333334e-06, "loss": -0.009346835315227509, "num_tokens": 503624.0, "reward": 0.6175000309944153, "reward_std": 0.5193032801151276, "rewards/correctness_reward/mean": 0.5375, "rewards/correctness_reward/std": 0.5007855296134949, "rewards/format_reward/mean": 0.08000000268220901, "rewards/format_reward/std": 0.03648562915623188, "sampling/importance_sampling_ratio/max": 1.2546093359589576, "sampling/importance_sampling_ratio/mean": 0.3157872579991817, "sampling/importance_sampling_ratio/min": 0.0024478550085405004, "sampling/sampling_logp_difference/max": 1.171226930618286, "sampling/sampling_logp_difference/mean": 0.01727942144498229, "step": 30, "step_time": 29.273015804101306 }, { "clip_ratio/high_max": 0.0006550904829055071, "clip_ratio/high_mean": 0.0006550904829055071, "clip_ratio/low_mean": 0.00024783555418252947, "clip_ratio/low_min": 0.00024783555418252947, "clip_ratio/region_mean": 0.0009029260370880365, "completions/clipped_ratio": 0.15, "completions/max_length": 3851.1, "completions/max_terminated_length": 2804.8, "completions/mean_length": 2002.5, "completions/mean_terminated_length": 1608.4672912597657, "completions/min_length": 864.0, "completions/min_terminated_length": 864.0, "entropy": 0.2781035866588354, "epoch": 0.12539184952978055, "frac_reward_zero_std": 0.875, "grad_norm": 0.0, "learning_rate": 4.0625000000000005e-06, "loss": -0.025474557280540468, "num_tokens": 675168.0, "reward": 0.6575000196695328, "reward_std": 0.48194521963596343, "rewards/correctness_reward/mean": 0.575, "rewards/correctness_reward/std": 0.4662890374660492, "rewards/format_reward/mean": 0.0825000025331974, "rewards/format_reward/std": 0.03185652866959572, "sampling/importance_sampling_ratio/max": 1.3444975703954696, "sampling/importance_sampling_ratio/mean": 0.2994952708482742, "sampling/importance_sampling_ratio/min": 0.0006009162229020148, "sampling/sampling_logp_difference/max": 0.8522951543331146, "sampling/sampling_logp_difference/mean": 0.01767203528434038, "step": 40, "step_time": 28.375074741206483 }, { "clip_ratio/high_max": 0.0008747221669182182, "clip_ratio/high_mean": 0.0008747221669182182, "clip_ratio/low_mean": 0.0004933560645440594, "clip_ratio/low_min": 0.0004933560645440594, "clip_ratio/region_mean": 0.0013680782314622775, "completions/clipped_ratio": 0.0875, "completions/max_length": 3590.3, "completions/max_terminated_length": 2730.9, "completions/mean_length": 1847.5875, "completions/mean_terminated_length": 1597.0345336914063, "completions/min_length": 903.3, "completions/min_terminated_length": 903.3, "entropy": 0.2937802216038108, "epoch": 0.15673981191222572, "frac_reward_zero_std": 0.825, "grad_norm": 0.0, "learning_rate": 4.999985069252202e-06, "loss": -0.03807174563407898, "num_tokens": 832515.0, "reward": 0.6537500292062759, "reward_std": 0.4774915397167206, "rewards/correctness_reward/mean": 0.5625, "rewards/correctness_reward/std": 0.4700634628534317, "rewards/format_reward/mean": 0.09125000238418579, "rewards/format_reward/std": 0.017422835528850555, "sampling/importance_sampling_ratio/max": 1.1297938469797373, "sampling/importance_sampling_ratio/mean": 0.31722771739587186, "sampling/importance_sampling_ratio/min": 0.0008020710116397822, "sampling/sampling_logp_difference/max": 0.9012966513633728, "sampling/sampling_logp_difference/mean": 0.018274252861738206, "step": 50, "step_time": 40.12386009950569 }, { "clip_ratio/high_max": 0.0017888416070491076, "clip_ratio/high_mean": 0.0017888416070491076, "clip_ratio/low_mean": 0.0008694853982888162, "clip_ratio/low_min": 0.0008694853982888162, "clip_ratio/region_mean": 0.002658327005337924, "completions/clipped_ratio": 0.1, "completions/max_length": 3365.6, "completions/max_terminated_length": 3129.8, "completions/mean_length": 2091.725, "completions/mean_terminated_length": 1897.8851806640625, "completions/min_length": 982.9, "completions/min_terminated_length": 982.9, "entropy": 0.28510729074478147, "epoch": 0.18808777429467086, "frac_reward_zero_std": 0.7, "grad_norm": 0.1018686294555664, "learning_rate": 4.998193595299945e-06, "loss": 0.001230363454669714, "num_tokens": 1009715.0, "reward": 0.653750017285347, "reward_std": 0.5125159472227097, "rewards/correctness_reward/mean": 0.5625, "rewards/correctness_reward/std": 0.5083187103271485, "rewards/format_reward/mean": 0.09125000238418579, "rewards/format_reward/std": 0.016875660791993142, "sampling/importance_sampling_ratio/max": 0.9389026090502739, "sampling/importance_sampling_ratio/mean": 0.20821646079421044, "sampling/importance_sampling_ratio/min": 0.002376743620061461, "sampling/sampling_logp_difference/max": 0.9540291488170624, "sampling/sampling_logp_difference/mean": 0.018469406850636007, "step": 60, "step_time": 30.81253262050159 }, { "clip_ratio/high_max": 0.0011308694491162896, "clip_ratio/high_mean": 0.0011308694491162896, "clip_ratio/low_mean": 0.0007005490129813552, "clip_ratio/low_min": 0.0007005490129813552, "clip_ratio/region_mean": 0.001831418462097645, "completions/clipped_ratio": 0.15, "completions/max_length": 3438.3, "completions/max_terminated_length": 2581.2, "completions/mean_length": 1902.025, "completions/mean_terminated_length": 1513.3540649414062, "completions/min_length": 807.6, "completions/min_terminated_length": 807.6, "entropy": 0.2557149097323418, "epoch": 0.219435736677116, "frac_reward_zero_std": 0.775, "grad_norm": 0.0, "learning_rate": 4.9934184235060005e-06, "loss": 0.008044037222862243, "num_tokens": 1172149.0, "reward": 0.6600000157952308, "reward_std": 0.478184649348259, "rewards/correctness_reward/mean": 0.575, "rewards/correctness_reward/std": 0.4662890374660492, "rewards/format_reward/mean": 0.08500000312924386, "rewards/format_reward/std": 0.029669395834207534, "sampling/importance_sampling_ratio/max": 0.9751305520534516, "sampling/importance_sampling_ratio/mean": 0.2421271875500679, "sampling/importance_sampling_ratio/min": 0.0038532244869657005, "sampling/sampling_logp_difference/max": 0.918789267539978, "sampling/sampling_logp_difference/mean": 0.017296981997787952, "step": 70, "step_time": 26.532471208798235 }, { "clip_ratio/high_max": 0.001505734668171499, "clip_ratio/high_mean": 0.001505734668171499, "clip_ratio/low_mean": 0.0009694579581264407, "clip_ratio/low_min": 0.0009694579581264407, "clip_ratio/region_mean": 0.0024751926262979395, "completions/clipped_ratio": 0.15, "completions/max_length": 3744.2, "completions/max_terminated_length": 3042.2, "completions/mean_length": 1918.7875, "completions/mean_terminated_length": 1552.7396728515625, "completions/min_length": 779.0, "completions/min_terminated_length": 779.0, "entropy": 0.2795219488441944, "epoch": 0.2507836990595611, "frac_reward_zero_std": 0.7, "grad_norm": 0.00016649632016196847, "learning_rate": 4.9856652570591865e-06, "loss": 0.023329220712184906, "num_tokens": 1335446.0, "reward": 0.7600000262260437, "reward_std": 0.49029034972190855, "rewards/correctness_reward/mean": 0.675, "rewards/correctness_reward/std": 0.4792939692735672, "rewards/format_reward/mean": 0.08500000312924386, "rewards/format_reward/std": 0.029669395834207534, "sampling/importance_sampling_ratio/max": 1.1784333825111388, "sampling/importance_sampling_ratio/mean": 0.2728546090424061, "sampling/importance_sampling_ratio/min": 0.0026182837519627355, "sampling/sampling_logp_difference/max": 1.054230934381485, "sampling/sampling_logp_difference/mean": 0.018482295237481593, "step": 80, "step_time": 28.250651192403165 }, { "clip_ratio/high_max": 0.00106070198235102, "clip_ratio/high_mean": 0.00106070198235102, "clip_ratio/low_mean": 0.0006351321993861347, "clip_ratio/low_min": 0.0006351321993861347, "clip_ratio/region_mean": 0.0016958341817371546, "completions/clipped_ratio": 0.075, "completions/max_length": 3207.2, "completions/max_terminated_length": 2825.2, "completions/mean_length": 1676.925, "completions/mean_terminated_length": 1499.47861328125, "completions/min_length": 786.9, "completions/min_terminated_length": 786.9, "entropy": 0.24367527915164827, "epoch": 0.28213166144200624, "frac_reward_zero_std": 0.8, "grad_norm": 0.0, "learning_rate": 4.9749433558927755e-06, "loss": -0.031055212020874023, "num_tokens": 1480914.0, "reward": 0.7562500096857547, "reward_std": 0.3321071207523346, "rewards/correctness_reward/mean": 0.6625, "rewards/correctness_reward/std": 0.3311904460191727, "rewards/format_reward/mean": 0.09375000298023224, "rewards/format_reward/std": 0.015235702693462371, "sampling/importance_sampling_ratio/max": 0.9751467287540436, "sampling/importance_sampling_ratio/mean": 0.27101266086101533, "sampling/importance_sampling_ratio/min": 0.0010261921809387787, "sampling/sampling_logp_difference/max": 0.9967936933040619, "sampling/sampling_logp_difference/mean": 0.01640293300151825, "step": 90, "step_time": 23.365028806497868 }, { "clip_ratio/high_max": 0.0015215544030070306, "clip_ratio/high_mean": 0.0015215544030070306, "clip_ratio/low_mean": 0.0007961224560858682, "clip_ratio/low_min": 0.0007961224560858682, "clip_ratio/region_mean": 0.0023176768590928987, "completions/clipped_ratio": 0.1375, "completions/max_length": 3607.3, "completions/max_terminated_length": 2945.7, "completions/mean_length": 2053.875, "completions/mean_terminated_length": 1730.1625, "completions/min_length": 1006.1, "completions/min_terminated_length": 1006.1, "entropy": 0.2696403542533517, "epoch": 0.31347962382445144, "frac_reward_zero_std": 0.725, "grad_norm": 0.1218937337398529, "learning_rate": 4.961265525624965e-06, "loss": -0.030147406458854675, "num_tokens": 1656476.0, "reward": 0.6987500250339508, "reward_std": 0.49789137542247774, "rewards/correctness_reward/mean": 0.6125, "rewards/correctness_reward/std": 0.4847578853368759, "rewards/format_reward/mean": 0.08625000230967998, "rewards/format_reward/std": 0.022768060863018035, "sampling/importance_sampling_ratio/max": 1.5745819807052612, "sampling/importance_sampling_ratio/mean": 0.4060269758105278, "sampling/importance_sampling_ratio/min": 0.0030046395702811425, "sampling/sampling_logp_difference/max": 0.9531826674938202, "sampling/sampling_logp_difference/mean": 0.018279338628053664, "step": 100, "step_time": 26.72063476130279 }, { "clip_ratio/high_max": 0.0012467309192288668, "clip_ratio/high_mean": 0.0012467309192288668, "clip_ratio/low_mean": 0.0006530083628604189, "clip_ratio/low_min": 0.0006530083628604189, "clip_ratio/region_mean": 0.0018997392820892856, "completions/clipped_ratio": 0.1125, "completions/max_length": 3448.5, "completions/max_terminated_length": 2953.2, "completions/mean_length": 1943.05, "completions/mean_terminated_length": 1719.2406616210938, "completions/min_length": 780.8, "completions/min_terminated_length": 780.8, "entropy": 0.2528709888458252, "epoch": 0.3448275862068966, "frac_reward_zero_std": 0.775, "grad_norm": 0.0014118266990408301, "learning_rate": 4.944648102264594e-06, "loss": -0.0005479533225297928, "num_tokens": 1826378.0, "reward": 0.7762500166893005, "reward_std": 0.45516199469566343, "rewards/correctness_reward/mean": 0.6875, "rewards/correctness_reward/std": 0.4473332822322845, "rewards/format_reward/mean": 0.08875000327825547, "rewards/format_reward/std": 0.02394672892987728, "sampling/importance_sampling_ratio/max": 1.2980196245014668, "sampling/importance_sampling_ratio/mean": 0.27484212815761566, "sampling/importance_sampling_ratio/min": 0.003586299168330598, "sampling/sampling_logp_difference/max": 1.0450670838356018, "sampling/sampling_logp_difference/mean": 0.017865628376603125, "step": 110, "step_time": 26.002142286603338 }, { "clip_ratio/high_max": 0.001192069088574499, "clip_ratio/high_mean": 0.001192069088574499, "clip_ratio/low_mean": 0.0007645920908544212, "clip_ratio/low_min": 0.0007645920908544212, "clip_ratio/region_mean": 0.0019566611794289202, "completions/clipped_ratio": 0.0375, "completions/max_length": 3207.6, "completions/max_terminated_length": 2768.2, "completions/mean_length": 1762.2375, "completions/mean_terminated_length": 1658.0006103515625, "completions/min_length": 813.7, "completions/min_terminated_length": 813.7, "entropy": 0.25225072130560877, "epoch": 0.3761755485893417, "frac_reward_zero_std": 0.775, "grad_norm": 0.0, "learning_rate": 4.925110932700353e-06, "loss": 0.012529200315475464, "num_tokens": 1981719.0, "reward": 0.5975000187754631, "reward_std": 0.4746046274900436, "rewards/correctness_reward/mean": 0.5, "rewards/correctness_reward/std": 0.4734502792358398, "rewards/format_reward/mean": 0.0975000023841858, "rewards/format_reward/std": 0.00707106813788414, "sampling/importance_sampling_ratio/max": 1.3150353372097014, "sampling/importance_sampling_ratio/mean": 0.33656532913446424, "sampling/importance_sampling_ratio/min": 0.00034802594600478185, "sampling/sampling_logp_difference/max": 1.1183373570442199, "sampling/sampling_logp_difference/mean": 0.018386438302695752, "step": 120, "step_time": 23.834052741806953 }, { "clip_ratio/high_max": 0.0016922274080570787, "clip_ratio/high_mean": 0.0016922274080570787, "clip_ratio/low_mean": 0.0011557157937204464, "clip_ratio/low_min": 0.0011557157937204464, "clip_ratio/region_mean": 0.002847943201777525, "completions/clipped_ratio": 0.175, "completions/max_length": 3555.5, "completions/max_terminated_length": 2864.1, "completions/mean_length": 2111.65, "completions/mean_terminated_length": 1698.3780151367187, "completions/min_length": 916.4, "completions/min_terminated_length": 916.4, "entropy": 0.25562331043183806, "epoch": 0.40752351097178685, "frac_reward_zero_std": 0.675, "grad_norm": 0.004960106685757637, "learning_rate": 4.902677350996812e-06, "loss": 0.00046389871276915074, "num_tokens": 2162831.0, "reward": 0.6700000166893005, "reward_std": 0.49468653202056884, "rewards/correctness_reward/mean": 0.5875, "rewards/correctness_reward/std": 0.48815253674983977, "rewards/format_reward/mean": 0.0825000025331974, "rewards/format_reward/std": 0.027019719779491424, "sampling/importance_sampling_ratio/max": 1.0717660136520863, "sampling/importance_sampling_ratio/mean": 0.2801607523113489, "sampling/importance_sampling_ratio/min": 0.0024215662131609863, "sampling/sampling_logp_difference/max": 1.1462424278259278, "sampling/sampling_logp_difference/mean": 0.01845086943358183, "step": 130, "step_time": 26.717056876599962 }, { "clip_ratio/high_max": 0.0010852621460799127, "clip_ratio/high_mean": 0.0010852621460799127, "clip_ratio/low_mean": 0.0007938130234833807, "clip_ratio/low_min": 0.0007938130234833807, "clip_ratio/region_mean": 0.0018790751695632935, "completions/clipped_ratio": 0.1125, "completions/max_length": 3726.5, "completions/max_terminated_length": 3401.4, "completions/mean_length": 2116.0875, "completions/mean_terminated_length": 1885.794091796875, "completions/min_length": 872.7, "completions/min_terminated_length": 872.7, "entropy": 0.22774933548644186, "epoch": 0.438871473354232, "frac_reward_zero_std": 0.775, "grad_norm": 0.014324595220386982, "learning_rate": 4.877374150525563e-06, "loss": 0.008725580573081971, "num_tokens": 2344012.0, "reward": 0.6125000238418579, "reward_std": 0.4733792722225189, "rewards/correctness_reward/mean": 0.525, "rewards/correctness_reward/std": 0.47005562782287597, "rewards/format_reward/mean": 0.08750000298023224, "rewards/format_reward/std": 0.025587470084428788, "sampling/importance_sampling_ratio/max": 0.7013322688639164, "sampling/importance_sampling_ratio/mean": 0.16202473118901253, "sampling/importance_sampling_ratio/min": 4.9783244139689484e-05, "sampling/sampling_logp_difference/max": 0.9842534542083741, "sampling/sampling_logp_difference/mean": 0.017839640099555253, "step": 140, "step_time": 27.824079571482436 }, { "clip_ratio/high_max": 0.0004921408195514232, "clip_ratio/high_mean": 0.0004921408195514232, "clip_ratio/low_mean": 0.00043502082116901873, "clip_ratio/low_min": 0.00043502082116901873, "clip_ratio/region_mean": 0.0009271616407204419, "completions/clipped_ratio": 0.1, "completions/max_length": 3266.1, "completions/max_terminated_length": 2758.9, "completions/mean_length": 1879.1875, "completions/mean_terminated_length": 1675.7728637695313, "completions/min_length": 908.9, "completions/min_terminated_length": 908.9, "entropy": 0.2281673434190452, "epoch": 0.4702194357366771, "frac_reward_zero_std": 0.9, "grad_norm": 0.041722770780324936, "learning_rate": 4.849231551964771e-06, "loss": -0.010190128535032272, "num_tokens": 2506153.0, "reward": 0.6525000244379043, "reward_std": 0.49419236183166504, "rewards/correctness_reward/mean": 0.5625, "rewards/correctness_reward/std": 0.484385946393013, "rewards/format_reward/mean": 0.09000000208616257, "rewards/format_reward/std": 0.017969227209687234, "sampling/importance_sampling_ratio/max": 1.2248954117298125, "sampling/importance_sampling_ratio/mean": 0.3192861404269934, "sampling/importance_sampling_ratio/min": 0.0012205516845654075, "sampling/sampling_logp_difference/max": 1.283074104785919, "sampling/sampling_logp_difference/mean": 0.019163230992853643, "step": 150, "step_time": 23.953872213704745 }, { "clip_ratio/high_max": 0.0009587546926923096, "clip_ratio/high_mean": 0.0009587546926923096, "clip_ratio/low_mean": 0.0005863742553628981, "clip_ratio/low_min": 0.0005863742553628981, "clip_ratio/region_mean": 0.0015451289480552077, "completions/clipped_ratio": 0.05, "completions/max_length": 3210.4, "completions/max_terminated_length": 2931.4, "completions/mean_length": 1695.7875, "completions/mean_terminated_length": 1574.0583374023438, "completions/min_length": 790.5, "completions/min_terminated_length": 790.5, "entropy": 0.24640227360650896, "epoch": 0.5015673981191222, "frac_reward_zero_std": 0.85, "grad_norm": 0.0, "learning_rate": 4.818283167205341e-06, "loss": 0.0067611634731292725, "num_tokens": 2653232.0, "reward": 0.6825000226497651, "reward_std": 0.40002892911434174, "rewards/correctness_reward/mean": 0.5875, "rewards/correctness_reward/std": 0.39557052552700045, "rewards/format_reward/mean": 0.09500000178813935, "rewards/format_reward/std": 0.009258200973272323, "sampling/importance_sampling_ratio/max": 1.3108162432909012, "sampling/importance_sampling_ratio/mean": 0.2764928460121155, "sampling/importance_sampling_ratio/min": 0.0028696730931642377, "sampling/sampling_logp_difference/max": 1.1736707925796508, "sampling/sampling_logp_difference/mean": 0.020335575193166734, "step": 160, "step_time": 23.41124046130426 }, { "clip_ratio/high_max": 0.0005931107676587999, "clip_ratio/high_mean": 0.0005931107676587999, "clip_ratio/low_mean": 0.0003765023429878056, "clip_ratio/low_min": 0.0003765023429878056, "clip_ratio/region_mean": 0.0009696131106466055, "completions/clipped_ratio": 0.0625, "completions/max_length": 3371.2, "completions/max_terminated_length": 3064.9, "completions/mean_length": 1709.9625, "completions/mean_terminated_length": 1545.1994262695312, "completions/min_length": 713.1, "completions/min_terminated_length": 713.1, "entropy": 0.24880375619977713, "epoch": 0.5329153605015674, "frac_reward_zero_std": 0.925, "grad_norm": 0.0, "learning_rate": 4.784565959206822e-06, "loss": -0.001160919852554798, "num_tokens": 2799703.0, "reward": 0.7325000286102294, "reward_std": 0.4547621011734009, "rewards/correctness_reward/mean": 0.6375, "rewards/correctness_reward/std": 0.4473332822322845, "rewards/format_reward/mean": 0.09500000178813935, "rewards/format_reward/std": 0.009258200973272323, "sampling/importance_sampling_ratio/max": 1.17250913977623, "sampling/importance_sampling_ratio/mean": 0.2529589384794235, "sampling/importance_sampling_ratio/min": 0.00015078657834237673, "sampling/sampling_logp_difference/max": 1.1894066691398621, "sampling/sampling_logp_difference/mean": 0.020500292629003526, "step": 170, "step_time": 24.45955295699823 }, { "clip_ratio/high_max": 0.0013415943132713438, "clip_ratio/high_mean": 0.0013415943132713438, "clip_ratio/low_mean": 0.0006985285086557269, "clip_ratio/low_min": 0.0006985285086557269, "clip_ratio/region_mean": 0.002040122821927071, "completions/clipped_ratio": 0.1375, "completions/max_length": 3522.4, "completions/max_terminated_length": 2939.7, "completions/mean_length": 2022.45, "completions/mean_terminated_length": 1712.5793090820312, "completions/min_length": 846.8, "completions/min_terminated_length": 846.8, "entropy": 0.243143948353827, "epoch": 0.5642633228840125, "frac_reward_zero_std": 0.75, "grad_norm": 0.006009459961205721, "learning_rate": 4.748120197850989e-06, "loss": 0.009028572589159012, "num_tokens": 2971287.0, "reward": 0.5850000292062759, "reward_std": 0.48046458065509795, "rewards/correctness_reward/mean": 0.5, "rewards/correctness_reward/std": 0.47005562782287597, "rewards/format_reward/mean": 0.08500000312924386, "rewards/format_reward/std": 0.02912222109735012, "sampling/importance_sampling_ratio/max": 0.8864429581910371, "sampling/importance_sampling_ratio/mean": 0.17258851490914823, "sampling/importance_sampling_ratio/min": 0.0034461140439645986, "sampling/sampling_logp_difference/max": 1.2684314608573914, "sampling/sampling_logp_difference/mean": 0.020217716693878174, "step": 180, "step_time": 25.946074717494774 }, { "clip_ratio/high_max": 0.0011504234513267874, "clip_ratio/high_mean": 0.0011504234513267874, "clip_ratio/low_mean": 0.000597678602207452, "clip_ratio/low_min": 0.000597678602207452, "clip_ratio/region_mean": 0.0017481020535342396, "completions/clipped_ratio": 0.1125, "completions/max_length": 3410.6, "completions/max_terminated_length": 2938.4, "completions/mean_length": 1874.1125, "completions/mean_terminated_length": 1598.6345458984374, "completions/min_length": 808.0, "completions/min_terminated_length": 808.0, "entropy": 0.24114118572324514, "epoch": 0.5956112852664577, "frac_reward_zero_std": 0.8, "grad_norm": 0.0008628601208329201, "learning_rate": 4.708989411845826e-06, "loss": -0.010369437932968139, "num_tokens": 3132786.0, "reward": 0.6887500196695328, "reward_std": 0.39334752336144446, "rewards/correctness_reward/mean": 0.6, "rewards/correctness_reward/std": 0.3808682680130005, "rewards/format_reward/mean": 0.0887500025331974, "rewards/format_reward/std": 0.022051936015486717, "sampling/importance_sampling_ratio/max": 0.7798637529835105, "sampling/importance_sampling_ratio/mean": 0.14685697900131345, "sampling/importance_sampling_ratio/min": 0.0004774037337483605, "sampling/sampling_logp_difference/max": 1.060469424724579, "sampling/sampling_logp_difference/mean": 0.020691755786538125, "step": 190, "step_time": 25.352976691797085 }, { "clip_ratio/high_max": 0.0009019671357236803, "clip_ratio/high_mean": 0.0009019671357236803, "clip_ratio/low_mean": 0.0006230480590602383, "clip_ratio/low_min": 0.0006230480590602383, "clip_ratio/region_mean": 0.0015250151947839186, "completions/clipped_ratio": 0.125, "completions/max_length": 3415.6, "completions/max_terminated_length": 3109.2, "completions/mean_length": 1930.45, "completions/mean_terminated_length": 1665.1597778320313, "completions/min_length": 732.7, "completions/min_terminated_length": 732.7, "entropy": 0.21688514538109302, "epoch": 0.6269592476489029, "frac_reward_zero_std": 0.8, "grad_norm": 0.18304862082004547, "learning_rate": 4.667220336737355e-06, "loss": -0.016360899806022643, "num_tokens": 3303710.0, "reward": 0.6637500286102295, "reward_std": 0.475689572095871, "rewards/correctness_reward/mean": 0.575, "rewards/correctness_reward/std": 0.4633530914783478, "rewards/format_reward/mean": 0.0887500025331974, "rewards/format_reward/std": 0.021504761278629304, "sampling/importance_sampling_ratio/max": 1.1674106672406197, "sampling/importance_sampling_ratio/mean": 0.24802827090024948, "sampling/importance_sampling_ratio/min": 0.0016349365277619655, "sampling/sampling_logp_difference/max": 1.3856478929519653, "sampling/sampling_logp_difference/mean": 0.019175850413739683, "step": 200, "step_time": 25.75777603710885 }, { "clip_ratio/high_max": 0.000993343279697001, "clip_ratio/high_mean": 0.000993343279697001, "clip_ratio/low_mean": 0.0006324709567707032, "clip_ratio/low_min": 0.0006324709567707032, "clip_ratio/region_mean": 0.001625814236467704, "completions/clipped_ratio": 0.0875, "completions/max_length": 3539.8, "completions/max_terminated_length": 2975.4, "completions/mean_length": 1784.4875, "completions/mean_terminated_length": 1553.4416809082031, "completions/min_length": 633.5, "completions/min_terminated_length": 633.5, "entropy": 0.23482936546206473, "epoch": 0.658307210031348, "frac_reward_zero_std": 0.825, "grad_norm": 0.0, "learning_rate": 4.622862859091407e-06, "loss": -0.008858874440193176, "num_tokens": 3456473.0, "reward": 0.6787500202655792, "reward_std": 0.43741019666194914, "rewards/correctness_reward/mean": 0.5875, "rewards/correctness_reward/std": 0.43432835042476653, "rewards/format_reward/mean": 0.09125000238418579, "rewards/format_reward/std": 0.016875660791993142, "sampling/importance_sampling_ratio/max": 1.0203382909297942, "sampling/importance_sampling_ratio/mean": 0.29103893116116525, "sampling/importance_sampling_ratio/min": 0.003953566052996393, "sampling/sampling_logp_difference/max": 1.2329502701759338, "sampling/sampling_logp_difference/mean": 0.020628420822322368, "step": 210, "step_time": 25.793523767398437 }, { "clip_ratio/high_max": 0.001251060701906681, "clip_ratio/high_mean": 0.001251060701906681, "clip_ratio/low_mean": 0.0006381326355040073, "clip_ratio/low_min": 0.0006381326355040073, "clip_ratio/region_mean": 0.0018891933374106883, "completions/clipped_ratio": 0.1125, "completions/max_length": 3446.2, "completions/max_terminated_length": 3069.8, "completions/mean_length": 1926.675, "completions/mean_terminated_length": 1682.13515625, "completions/min_length": 898.2, "completions/min_terminated_length": 898.2, "entropy": 0.22549284966662525, "epoch": 0.6896551724137931, "frac_reward_zero_std": 0.825, "grad_norm": 3.696798785313149e-06, "learning_rate": 4.575969956911994e-06, "loss": -0.005991434305906295, "num_tokens": 3622915.0, "reward": 0.6400000154972076, "reward_std": 0.41457425951957705, "rewards/correctness_reward/mean": 0.55, "rewards/correctness_reward/std": 0.4107596904039383, "rewards/format_reward/mean": 0.09000000357627869, "rewards/format_reward/std": 0.023400337249040604, "sampling/importance_sampling_ratio/max": 0.8896690681576729, "sampling/importance_sampling_ratio/mean": 0.24411826990544797, "sampling/importance_sampling_ratio/min": 0.002280459411826996, "sampling/sampling_logp_difference/max": 1.1428457975387574, "sampling/sampling_logp_difference/mean": 0.020512748323380948, "step": 220, "step_time": 25.75643434979429 }, { "clip_ratio/high_max": 0.0009091435116715729, "clip_ratio/high_mean": 0.0009091435116715729, "clip_ratio/low_mean": 0.0005976294865831733, "clip_ratio/low_min": 0.0005976294865831733, "clip_ratio/region_mean": 0.0015067729982547462, "completions/clipped_ratio": 0.1, "completions/max_length": 3318.2, "completions/max_terminated_length": 2687.6, "completions/mean_length": 1789.6875, "completions/mean_terminated_length": 1559.130615234375, "completions/min_length": 777.7, "completions/min_terminated_length": 777.7, "entropy": 0.2324732830747962, "epoch": 0.7210031347962382, "frac_reward_zero_std": 0.85, "grad_norm": 0.0032448109705001116, "learning_rate": 4.526597636367449e-06, "loss": -0.00016784800682216882, "num_tokens": 3776114.0, "reward": 0.6912500277161598, "reward_std": 0.37446828186511993, "rewards/correctness_reward/mean": 0.6, "rewards/correctness_reward/std": 0.37370702624320984, "rewards/format_reward/mean": 0.09125000238418579, "rewards/format_reward/std": 0.016875660791993142, "sampling/importance_sampling_ratio/max": 0.9839991062879563, "sampling/importance_sampling_ratio/mean": 0.21926450338214637, "sampling/importance_sampling_ratio/min": 0.0017114538278203638, "sampling/sampling_logp_difference/max": 1.5830877304077149, "sampling/sampling_logp_difference/mean": 0.021970476023852825, "step": 230, "step_time": 24.127974103906308 }, { "clip_ratio/high_max": 0.0013581034494563938, "clip_ratio/high_mean": 0.0013581034494563938, "clip_ratio/low_mean": 0.0008850634156260639, "clip_ratio/low_min": 0.0008850634156260639, "clip_ratio/region_mean": 0.0022431668650824575, "completions/clipped_ratio": 0.15, "completions/max_length": 3789.7, "completions/max_terminated_length": 2807.3, "completions/mean_length": 1925.0, "completions/mean_terminated_length": 1500.732763671875, "completions/min_length": 748.7, "completions/min_terminated_length": 748.7, "entropy": 0.22191071659326553, "epoch": 0.7523510971786834, "frac_reward_zero_std": 0.775, "grad_norm": 4.677455365253991e-07, "learning_rate": 4.474804864899895e-06, "loss": -0.0005461292807012796, "num_tokens": 3942010.0, "reward": 0.6475000232458115, "reward_std": 0.4759311854839325, "rewards/correctness_reward/mean": 0.5625, "rewards/correctness_reward/std": 0.46835830211639407, "rewards/format_reward/mean": 0.08500000350177288, "rewards/format_reward/std": 0.028745562583208085, "sampling/importance_sampling_ratio/max": 0.9141103580594063, "sampling/importance_sampling_ratio/mean": 0.16915908381342887, "sampling/importance_sampling_ratio/min": 0.0004262479888181403, "sampling/sampling_logp_difference/max": 1.369935977458954, "sampling/sampling_logp_difference/mean": 0.021709233336150647, "step": 240, "step_time": 28.0425787657965 }, { "clip_ratio/high_max": 0.0013509833486750721, "clip_ratio/high_mean": 0.0013509833486750721, "clip_ratio/low_mean": 0.000884141051210463, "clip_ratio/low_min": 0.000884141051210463, "clip_ratio/region_mean": 0.002235124399885535, "completions/clipped_ratio": 0.0875, "completions/max_length": 3384.9, "completions/max_terminated_length": 2710.6, "completions/mean_length": 1782.0125, "completions/mean_terminated_length": 1553.994677734375, "completions/min_length": 802.1, "completions/min_terminated_length": 802.1, "entropy": 0.23113715164363385, "epoch": 0.7836990595611285, "frac_reward_zero_std": 0.8, "grad_norm": 0.0, "learning_rate": 4.420653500797952e-06, "loss": 0.022323785722255705, "num_tokens": 4094557.0, "reward": 0.6412500292062759, "reward_std": 0.4066828578710556, "rewards/correctness_reward/mean": 0.55, "rewards/correctness_reward/std": 0.39557836055755613, "rewards/format_reward/mean": 0.09125000312924385, "rewards/format_reward/std": 0.019864803180098534, "sampling/importance_sampling_ratio/max": 1.1005408063530921, "sampling/importance_sampling_ratio/mean": 0.2535096138715744, "sampling/importance_sampling_ratio/min": 0.0009224904251141197, "sampling/sampling_logp_difference/max": 1.2049122214317323, "sampling/sampling_logp_difference/mean": 0.02191030643880367, "step": 250, "step_time": 24.743671250795888 }, { "clip_ratio/high_max": 0.0014685526955872774, "clip_ratio/high_mean": 0.0014685526955872774, "clip_ratio/low_mean": 0.0008472580957459286, "clip_ratio/low_min": 0.0008472580957459286, "clip_ratio/region_mean": 0.002315810791333206, "completions/clipped_ratio": 0.125, "completions/max_length": 3402.3, "completions/max_terminated_length": 2836.0, "completions/mean_length": 1863.375, "completions/mean_terminated_length": 1562.3606079101562, "completions/min_length": 722.1, "completions/min_terminated_length": 722.1, "entropy": 0.21556257344782354, "epoch": 0.8150470219435737, "frac_reward_zero_std": 0.775, "grad_norm": 0.0, "learning_rate": 4.364208219316771e-06, "loss": 0.0002295381622388959, "num_tokens": 4254891.0, "reward": 0.6250000208616256, "reward_std": 0.34347863867878914, "rewards/correctness_reward/mean": 0.5375, "rewards/correctness_reward/std": 0.32950095534324647, "rewards/format_reward/mean": 0.08750000186264514, "rewards/format_reward/std": 0.01868535205721855, "sampling/importance_sampling_ratio/max": 0.6800905840471387, "sampling/importance_sampling_ratio/mean": 0.13554164883680642, "sampling/importance_sampling_ratio/min": 1.4798295291029718e-05, "sampling/sampling_logp_difference/max": 1.40857173204422, "sampling/sampling_logp_difference/mean": 0.021040080208331348, "step": 260, "step_time": 25.048790523895878 }, { "clip_ratio/high_max": 0.0016789503977634012, "clip_ratio/high_mean": 0.0016789503977634012, "clip_ratio/low_mean": 0.000876212227740325, "clip_ratio/low_min": 0.000876212227740325, "clip_ratio/region_mean": 0.0025551626255037262, "completions/clipped_ratio": 0.1, "completions/max_length": 3535.8, "completions/max_terminated_length": 3057.4, "completions/mean_length": 1948.575, "completions/mean_terminated_length": 1712.798828125, "completions/min_length": 866.7, "completions/min_terminated_length": 866.7, "entropy": 0.22944035436958074, "epoch": 0.8463949843260188, "frac_reward_zero_std": 0.75, "grad_norm": 0.0, "learning_rate": 4.305536435433656e-06, "loss": 0.002579879201948643, "num_tokens": 4422539.0, "reward": 0.6900000274181366, "reward_std": 0.5191955178976059, "rewards/correctness_reward/mean": 0.6, "rewards/correctness_reward/std": 0.5096440970897674, "rewards/format_reward/mean": 0.09000000208616257, "rewards/format_reward/std": 0.017969227209687234, "sampling/importance_sampling_ratio/max": 0.764763881266117, "sampling/importance_sampling_ratio/mean": 0.1653499912470579, "sampling/importance_sampling_ratio/min": 0.000246226538302885, "sampling/sampling_logp_difference/max": 1.2574076771736145, "sampling/sampling_logp_difference/mean": 0.021441210992634296, "step": 270, "step_time": 26.011382579595374 }, { "clip_ratio/high_max": 0.0019707939820364116, "clip_ratio/high_mean": 0.0019707939820364116, "clip_ratio/low_mean": 0.0011912819871213287, "clip_ratio/low_min": 0.0011912819871213287, "clip_ratio/region_mean": 0.0031620759691577405, "completions/clipped_ratio": 0.1, "completions/max_length": 3422.9, "completions/max_terminated_length": 3005.9, "completions/mean_length": 1993.475, "completions/mean_terminated_length": 1800.402392578125, "completions/min_length": 1008.6, "completions/min_terminated_length": 1008.6, "entropy": 0.23918197378516198, "epoch": 0.877742946708464, "frac_reward_zero_std": 0.7, "grad_norm": 0.0015258209314197302, "learning_rate": 4.244708223331519e-06, "loss": -0.010562650114297866, "num_tokens": 4593383.0, "reward": 0.6537500292062759, "reward_std": 0.5094130367040635, "rewards/correctness_reward/mean": 0.5625, "rewards/correctness_reward/std": 0.502475020289421, "rewards/format_reward/mean": 0.09125000238418579, "rewards/format_reward/std": 0.017422835528850555, "sampling/importance_sampling_ratio/max": 0.5649557963013649, "sampling/importance_sampling_ratio/mean": 0.13420482752844692, "sampling/importance_sampling_ratio/min": 0.0005196533978222729, "sampling/sampling_logp_difference/max": 1.2564236760139464, "sampling/sampling_logp_difference/mean": 0.022799196653068066, "step": 280, "step_time": 25.44951510770188 }, { "clip_ratio/high_max": 0.0010904775466769935, "clip_ratio/high_mean": 0.0010904775466769935, "clip_ratio/low_mean": 0.0004774234985234216, "clip_ratio/low_min": 0.0004774234985234216, "clip_ratio/region_mean": 0.001567901045200415, "completions/clipped_ratio": 0.1375, "completions/max_length": 3580.9, "completions/max_terminated_length": 2864.3, "completions/mean_length": 1920.1875, "completions/mean_terminated_length": 1563.3024047851563, "completions/min_length": 794.2, "completions/min_terminated_length": 794.2, "entropy": 0.22579809557646513, "epoch": 0.9090909090909091, "frac_reward_zero_std": 0.825, "grad_norm": 0.0, "learning_rate": 4.181796232706322e-06, "loss": 1.4840727089904249e-05, "num_tokens": 4757392.0, "reward": 0.6100000143051147, "reward_std": 0.5178730249404907, "rewards/correctness_reward/mean": 0.525, "rewards/correctness_reward/std": 0.5134106874465942, "rewards/format_reward/mean": 0.08500000201165676, "rewards/format_reward/std": 0.023861627280712127, "sampling/importance_sampling_ratio/max": 0.8167740240693092, "sampling/importance_sampling_ratio/mean": 0.1849126074463129, "sampling/importance_sampling_ratio/min": 0.0012310059119727157, "sampling/sampling_logp_difference/max": 1.2191795468330384, "sampling/sampling_logp_difference/mean": 0.020382146909832956, "step": 290, "step_time": 26.333028039905184 }, { "clip_ratio/high_max": 0.001048527710372582, "clip_ratio/high_mean": 0.001048527710372582, "clip_ratio/low_mean": 0.0008238868089392781, "clip_ratio/low_min": 0.0008238868089392781, "clip_ratio/region_mean": 0.0018724145193118602, "completions/clipped_ratio": 0.075, "completions/max_length": 3384.2, "completions/max_terminated_length": 2823.2, "completions/mean_length": 1690.5625, "completions/mean_terminated_length": 1498.3041870117188, "completions/min_length": 757.5, "completions/min_terminated_length": 757.5, "entropy": 0.24261436564847827, "epoch": 0.9404388714733543, "frac_reward_zero_std": 0.8, "grad_norm": 0.0, "learning_rate": 4.116875601998499e-06, "loss": 0.009490690380334853, "num_tokens": 4907345.0, "reward": 0.6925000309944153, "reward_std": 0.358241993188858, "rewards/correctness_reward/mean": 0.6, "rewards/correctness_reward/std": 0.3458927035331726, "rewards/format_reward/mean": 0.09250000268220901, "rewards/format_reward/std": 0.016329269111156463, "sampling/importance_sampling_ratio/max": 0.8124461621046066, "sampling/importance_sampling_ratio/mean": 0.16854456886649133, "sampling/importance_sampling_ratio/min": 0.0020139962414759795, "sampling/sampling_logp_difference/max": 1.3068999290466308, "sampling/sampling_logp_difference/mean": 0.021543380804359913, "step": 300, "step_time": 25.214923793493654 } ], "logging_steps": 10, "max_steps": 957, "num_input_tokens_seen": 4907345, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }