{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9898666666666667, "eval_steps": 500, "global_step": 928, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.29947916666666663, "completions/max_length": 3072.0, "completions/max_terminated_length": 2938.0, "completions/mean_length": 1414.4896240234375, "completions/mean_terminated_length": 705.8884887695312, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.0010666666666666667, "grad_norm": 0.2816614790650073, "learning_rate": 0.0, "loss": 0.0, "num_tokens": 597104.0, "reward": 0.1067708358168602, "reward_std": 0.13982701301574707, "rewards/accuracy_reward/mean": 0.1067708358168602, "rewards/accuracy_reward/std": 0.30922457575798035, "step": 1 }, { "clip_ratio/high_max": 0.00023427193826819348, "clip_ratio/high_mean": 8.713702288787317e-05, "clip_ratio/low_mean": 6.084873791678547e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000147985758616187, "epoch": 0.005333333333333333, "grad_norm": 0.3403144783935414, "learning_rate": 1.276595744680851e-07, "loss": 0.0002, "step": 5 }, { "clip_ratio/high_max": 0.00035072941445832837, "clip_ratio/high_mean": 0.0001276570524169074, "clip_ratio/low_mean": 6.77214897336853e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019537854147984036, "epoch": 0.010666666666666666, "grad_norm": 0.29502594592995424, "learning_rate": 2.872340425531915e-07, "loss": 0.0003, "step": 10 }, { "clip_ratio/high_max": 0.0003053014331271697, "clip_ratio/high_mean": 0.00010963930958496348, "clip_ratio/low_mean": 7.742258517282608e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001870618923476286, "epoch": 0.016, "grad_norm": 0.25501119406278083, "learning_rate": 4.468085106382979e-07, "loss": 0.0001, "step": 15 }, { "clip_ratio/high_max": 0.0002046515873644239, "clip_ratio/high_mean": 7.515828667692404e-05, "clip_ratio/low_mean": 5.8234314360561255e-05, "clip_ratio/low_min": 1.512584731244715e-06, "clip_ratio/region_mean": 0.00013339260206066684, "completions/clipped_ratio": 0.28385416666666663, "completions/max_length": 3072.0, "completions/max_terminated_length": 3036.0, "completions/mean_length": 1430.096435546875, "completions/mean_terminated_length": 779.3054809570312, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.021333333333333333, "grad_norm": 0.31067601933529315, "learning_rate": 6.063829787234043e-07, "loss": 0.0001, "num_tokens": 1199262.0, "reward": 0.1015625, "reward_std": 0.12178482115268707, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30246618390083313, "step": 20 }, { "clip_ratio/high_max": 0.00028137975841673323, "clip_ratio/high_mean": 0.00010598966223369643, "clip_ratio/low_mean": 0.00011780153508880175, "clip_ratio/low_min": 3.02516946248943e-06, "clip_ratio/region_mean": 0.00022379119561719563, "epoch": 0.02666666666666667, "grad_norm": 0.30680642846089784, "learning_rate": 7.659574468085106e-07, "loss": -0.0, "step": 25 }, { "clip_ratio/high_max": 0.0005625465481443826, "clip_ratio/high_mean": 0.00020616382387288467, "clip_ratio/low_mean": 0.00029264680256346765, "clip_ratio/low_min": 3.896484467986738e-05, "clip_ratio/region_mean": 0.000498810625549595, "epoch": 0.032, "grad_norm": 0.304807656455896, "learning_rate": 9.25531914893617e-07, "loss": -0.0005, "step": 30 }, { "clip_ratio/high_max": 0.000427068135513764, "clip_ratio/high_mean": 0.00016064831527273782, "clip_ratio/low_mean": 0.00020991870369471143, "clip_ratio/low_min": 2.6729267210612308e-05, "clip_ratio/region_mean": 0.0003705670187173382, "completions/clipped_ratio": 0.34114583333333337, "completions/max_length": 3072.0, "completions/max_terminated_length": 3047.0, "completions/mean_length": 1541.4140625, "completions/mean_terminated_length": 748.8972778320312, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.037333333333333336, "grad_norm": 0.2431337929575801, "learning_rate": 1.0851063829787236e-06, "loss": -0.0002, "num_tokens": 1844577.0, "reward": 0.0911458358168602, "reward_std": 0.14433756470680237, "rewards/accuracy_reward/mean": 0.0911458358168602, "rewards/accuracy_reward/std": 0.28819188475608826, "step": 35 }, { "clip_ratio/high_max": 0.0004196369342480466, "clip_ratio/high_mean": 0.00016710546381091263, "clip_ratio/low_mean": 0.00020526158239135838, "clip_ratio/low_min": 6.053779270587256e-06, "clip_ratio/region_mean": 0.00037236704679344257, "epoch": 0.042666666666666665, "grad_norm": 0.20497322794464387, "learning_rate": 1.2446808510638299e-06, "loss": -0.0004, "step": 40 }, { "clip_ratio/high_max": 0.0010627394708535576, "clip_ratio/high_mean": 0.0004215139170128168, "clip_ratio/low_mean": 0.0005447547513313111, "clip_ratio/low_min": 4.167848246652284e-05, "clip_ratio/region_mean": 0.0009662686702540668, "epoch": 0.048, "grad_norm": 0.22671969586309226, "learning_rate": 1.4042553191489362e-06, "loss": -0.0011, "step": 45 }, { "clip_ratio/high_max": 0.001229963479727303, "clip_ratio/high_mean": 0.0004789073033407476, "clip_ratio/low_mean": 0.0005524177042161682, "clip_ratio/low_min": 3.984793393101427e-05, "clip_ratio/region_mean": 0.001031325014673712, "completions/clipped_ratio": 0.26041666666666663, "completions/max_length": 3072.0, "completions/max_terminated_length": 3029.0, "completions/mean_length": 1326.1302490234375, "completions/mean_terminated_length": 711.3873291015625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.05333333333333334, "grad_norm": 0.47182650513985935, "learning_rate": 1.5638297872340427e-06, "loss": -0.0009, "num_tokens": 2407763.0, "reward": 0.1328125, "reward_std": 0.18042197823524475, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.3398147225379944, "step": 50 }, { "clip_ratio/high_max": 0.0008261045912149711, "clip_ratio/high_mean": 0.00035407191676313233, "clip_ratio/low_mean": 0.0005964687263485758, "clip_ratio/low_min": 5.501781743078027e-05, "clip_ratio/region_mean": 0.0009505406534572103, "epoch": 0.058666666666666666, "grad_norm": 0.41027478098454245, "learning_rate": 1.723404255319149e-06, "loss": -0.0006, "step": 55 }, { "clip_ratio/high_max": 0.0025908447601977968, "clip_ratio/high_mean": 0.001121942882082294, "clip_ratio/low_mean": 0.0016540326332687982, "clip_ratio/low_min": 0.0001467346191930119, "clip_ratio/region_mean": 0.002775975561416999, "epoch": 0.064, "grad_norm": 0.30594644983616515, "learning_rate": 1.8829787234042552e-06, "loss": -0.0018, "step": 60 }, { "clip_ratio/high_max": 0.0034809220624083537, "clip_ratio/high_mean": 0.001522595134747462, "clip_ratio/low_mean": 0.0022387951074051672, "clip_ratio/low_min": 0.0001903996741020819, "clip_ratio/region_mean": 0.0037613902255543507, "completions/clipped_ratio": 0.34895833333333337, "completions/max_length": 3072.0, "completions/max_terminated_length": 3066.0, "completions/mean_length": 1600.6614990234375, "completions/mean_terminated_length": 812.0240478515625, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.06933333333333333, "grad_norm": 0.22842058853919295, "learning_rate": 2.0425531914893617e-06, "loss": -0.002, "num_tokens": 3075454.0, "reward": 0.1041666716337204, "reward_std": 0.13982701301574707, "rewards/accuracy_reward/mean": 0.1041666641831398, "rewards/accuracy_reward/std": 0.30587515234947205, "step": 65 }, { "clip_ratio/high_max": 0.0007015500015313591, "clip_ratio/high_mean": 0.0002616708540244872, "clip_ratio/low_mean": 0.0002644213506755477, "clip_ratio/low_min": 7.896953684394248e-06, "clip_ratio/region_mean": 0.0005260922084062258, "epoch": 0.07466666666666667, "grad_norm": 0.18681419198896604, "learning_rate": 2.202127659574468e-06, "loss": -0.0004, "step": 70 }, { "clip_ratio/high_max": 0.002080931533328112, "clip_ratio/high_mean": 0.0008201811926028313, "clip_ratio/low_mean": 0.0010904583676847324, "clip_ratio/low_min": 4.8259160394081846e-05, "clip_ratio/region_mean": 0.0019106395659036935, "epoch": 0.08, "grad_norm": 0.172039067559396, "learning_rate": 2.3617021276595748e-06, "loss": -0.0013, "step": 75 }, { "clip_ratio/high_max": 0.0037232573748042343, "clip_ratio/high_mean": 0.0014595315532915266, "clip_ratio/low_mean": 0.0021300932897020176, "clip_ratio/low_min": 0.00011834550386993214, "clip_ratio/region_mean": 0.0035896248813060082, "epoch": 0.08533333333333333, "grad_norm": 0.2116910505265225, "learning_rate": 2.521276595744681e-06, "loss": -0.002, "step": 80 }, { "clip_ratio/high_max": 0.0010694109992073209, "clip_ratio/high_mean": 0.00043753299657964815, "clip_ratio/low_mean": 0.0003798306191811207, "clip_ratio/low_min": 1.5775153724462142e-05, "clip_ratio/region_mean": 0.0008173636091441949, "completions/clipped_ratio": 0.31510416666666663, "completions/max_length": 3072.0, "completions/max_terminated_length": 3023.0, "completions/mean_length": 1515.28125, "completions/mean_terminated_length": 799.072265625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.09066666666666667, "grad_norm": 0.23351514666768866, "learning_rate": 2.680851063829787e-06, "loss": -0.0002, "num_tokens": 3710743.0, "reward": 0.171875, "reward_std": 0.19395360350608826, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.377763956785202, "step": 85 }, { "clip_ratio/high_max": 0.004593905031470058, "clip_ratio/high_mean": 0.0018805787036399123, "clip_ratio/low_mean": 0.001913619466563432, "clip_ratio/low_min": 5.695932713933871e-05, "clip_ratio/region_mean": 0.003794198198011145, "epoch": 0.096, "grad_norm": 0.19397795991192338, "learning_rate": 2.8404255319148938e-06, "loss": -0.0015, "step": 90 }, { "clip_ratio/high_max": 0.008362967770153773, "clip_ratio/high_mean": 0.003460932776761183, "clip_ratio/low_mean": 0.0038727070839058796, "clip_ratio/low_min": 0.0001664591164626472, "clip_ratio/region_mean": 0.007333639878493159, "epoch": 0.10133333333333333, "grad_norm": 0.20191736807843347, "learning_rate": 3e-06, "loss": -0.0022, "step": 95 }, { "clip_ratio/high_max": 0.002381238556608878, "clip_ratio/high_mean": 0.0009815574549747908, "clip_ratio/low_mean": 0.0011737182126125845, "clip_ratio/low_min": 6.021586114002275e-05, "clip_ratio/region_mean": 0.0021552756376308933, "completions/clipped_ratio": 0.3671875, "completions/max_length": 3072.0, "completions/max_terminated_length": 3005.0, "completions/mean_length": 1630.8046875, "completions/mean_terminated_length": 794.5555419921875, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.10666666666666667, "grad_norm": 0.2213316279414877, "learning_rate": 2.999739604603311e-06, "loss": -0.0007, "num_tokens": 4390444.0, "reward": 0.1666666716337204, "reward_std": 0.18944305181503296, "rewards/accuracy_reward/mean": 0.1666666716337204, "rewards/accuracy_reward/std": 0.37316420674324036, "step": 100 }, { "clip_ratio/high_max": 0.002672465645127886, "clip_ratio/high_mean": 0.0011122859472038726, "clip_ratio/low_mean": 0.0014118726646643153, "clip_ratio/low_min": 0.00012129081751481863, "clip_ratio/region_mean": 0.0025241586349011415, "epoch": 0.112, "grad_norm": 0.171299848549443, "learning_rate": 2.9989585088209272e-06, "loss": -0.0015, "step": 105 }, { "clip_ratio/high_max": 0.005408756870701836, "clip_ratio/high_mean": 0.002290414190622414, "clip_ratio/low_mean": 0.0029679009343681175, "clip_ratio/low_min": 0.0002928155689005507, "clip_ratio/region_mean": 0.005258315193441376, "epoch": 0.11733333333333333, "grad_norm": 0.3192339126714297, "learning_rate": 2.9976569838445097e-06, "loss": -0.0022, "step": 110 }, { "clip_ratio/high_max": 0.002959687018301338, "clip_ratio/high_mean": 0.0012178346755945314, "clip_ratio/low_mean": 0.001600003359249058, "clip_ratio/low_min": 0.00017023872933350504, "clip_ratio/region_mean": 0.0028178380576719063, "completions/clipped_ratio": 0.27083333333333337, "completions/max_length": 3072.0, "completions/max_terminated_length": 3037.0, "completions/mean_length": 1387.096435546875, "completions/mean_terminated_length": 761.2750244140625, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.12266666666666666, "grad_norm": 0.3081525716803612, "learning_rate": 2.9958354815555427e-06, "loss": -0.0007, "num_tokens": 4975235.0, "reward": 0.1848958432674408, "reward_std": 0.18493251502513885, "rewards/accuracy_reward/mean": 0.1848958283662796, "rewards/accuracy_reward/std": 0.3887195289134979, "step": 115 }, { "clip_ratio/high_max": 0.0024702531072762213, "clip_ratio/high_mean": 0.001049470321481749, "clip_ratio/low_mean": 0.0014242714913052623, "clip_ratio/low_min": 6.489944407803706e-05, "clip_ratio/region_mean": 0.002473741802737095, "epoch": 0.128, "grad_norm": 0.3279697411733405, "learning_rate": 2.9934946343684403e-06, "loss": -0.0006, "step": 120 }, { "clip_ratio/high_max": 0.005486167162507627, "clip_ratio/high_mean": 0.002178969041574419, "clip_ratio/low_mean": 0.0036850320605026354, "clip_ratio/low_min": 0.00020167788316030055, "clip_ratio/region_mean": 0.005864001039458344, "epoch": 0.13333333333333333, "grad_norm": 0.2540554350889326, "learning_rate": 2.9906352550109787e-06, "loss": -0.0017, "step": 125 }, { "clip_ratio/high_max": 0.004726606464919314, "clip_ratio/high_mean": 0.0018281891659398752, "clip_ratio/low_mean": 0.003021188566890487, "clip_ratio/low_min": 0.0001661422076722374, "clip_ratio/region_mean": 0.004849377741948047, "completions/clipped_ratio": 0.25, "completions/max_length": 3072.0, "completions/max_terminated_length": 3053.0, "completions/mean_length": 1321.7708740234375, "completions/mean_terminated_length": 738.3611450195312, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.13866666666666666, "grad_norm": 5.73397949072503, "learning_rate": 2.9872583362421204e-06, "loss": -0.0007, "num_tokens": 5539384.0, "reward": 0.1953125, "reward_std": 0.21199581027030945, "rewards/accuracy_reward/mean": 0.1953125, "rewards/accuracy_reward/std": 0.39695829153060913, "step": 130 }, { "clip_ratio/high_max": 0.0022228933299629715, "clip_ratio/high_mean": 0.0009954358316463185, "clip_ratio/low_mean": 0.000893914346602287, "clip_ratio/low_min": 7.622620405527414e-05, "clip_ratio/region_mean": 0.0018893501621732867, "epoch": 0.144, "grad_norm": 0.4275048308885148, "learning_rate": 2.983365050507336e-06, "loss": -0.0, "step": 135 }, { "clip_ratio/high_max": 0.004837688934094331, "clip_ratio/high_mean": 0.0022902527128508153, "clip_ratio/low_mean": 0.003088120846405218, "clip_ratio/low_min": 0.00023968161467564643, "clip_ratio/region_mean": 0.005378373589701368, "epoch": 0.14933333333333335, "grad_norm": 0.33054544967220006, "learning_rate": 2.978956749531536e-06, "loss": -0.0009, "step": 140 }, { "clip_ratio/high_max": 0.0051443109987303615, "clip_ratio/high_mean": 0.002487977426835641, "clip_ratio/low_mean": 0.004273650143568375, "clip_ratio/low_min": 0.0003034086434126948, "clip_ratio/region_mean": 0.0067616275438922456, "completions/clipped_ratio": 0.25, "completions/max_length": 3072.0, "completions/max_terminated_length": 3038.0, "completions/mean_length": 1283.330810546875, "completions/mean_terminated_length": 687.107666015625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.15466666666666667, "grad_norm": 0.22439957516073739, "learning_rate": 2.9740349638497614e-06, "loss": -0.0012, "num_tokens": 6083402.0, "reward": 0.2604166865348816, "reward_std": 0.23905907571315765, "rewards/accuracy_reward/mean": 0.2604166567325592, "rewards/accuracy_reward/std": 0.4394345283508301, "step": 145 }, { "clip_ratio/high_max": 0.0021053287135146094, "clip_ratio/high_mean": 0.0008679544618416912, "clip_ratio/low_mean": 0.0009508397942681768, "clip_ratio/low_min": 5.001906974939629e-05, "clip_ratio/region_mean": 0.001818794251221334, "epoch": 0.16, "grad_norm": 0.3994941728782329, "learning_rate": 2.9686014022757936e-06, "loss": -0.0002, "step": 150 }, { "clip_ratio/high_max": 0.005103716309531592, "clip_ratio/high_mean": 0.0020372310698803632, "clip_ratio/low_mean": 0.0031260311388905393, "clip_ratio/low_min": 0.00021238998260741937, "clip_ratio/region_mean": 0.005163262256792223, "epoch": 0.16533333333333333, "grad_norm": 0.2513181360963142, "learning_rate": 2.9626579513088605e-06, "loss": -0.0017, "step": 155 }, { "clip_ratio/high_max": 0.007686942920554429, "clip_ratio/high_mean": 0.003016883104100998, "clip_ratio/low_mean": 0.0061866064152127365, "clip_ratio/low_min": 0.0004812538100850361, "clip_ratio/region_mean": 0.00920348943254794, "epoch": 0.17066666666666666, "grad_norm": 0.22474020545824827, "learning_rate": 2.9562066744786588e-06, "loss": -0.0027, "step": 160 }, { "clip_ratio/high_max": 0.0018754905231617158, "clip_ratio/high_mean": 0.0008160531715475372, "clip_ratio/low_mean": 0.0007908354472874635, "clip_ratio/low_min": 5.0958262727363034e-05, "clip_ratio/region_mean": 0.0016068885991444403, "completions/clipped_ratio": 0.1953125, "completions/max_length": 3072.0, "completions/max_terminated_length": 3061.0, "completions/mean_length": 1161.494873046875, "completions/mean_terminated_length": 697.7799682617188, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.176, "grad_norm": 0.22333365213995707, "learning_rate": 2.949249811628907e-06, "loss": -0.0005, "num_tokens": 6583749.0, "reward": 0.2630208432674408, "reward_std": 0.25710129737854004, "rewards/accuracy_reward/mean": 0.2630208432674408, "rewards/accuracy_reward/std": 0.4408479928970337, "step": 165 }, { "clip_ratio/high_max": 0.005517828748270404, "clip_ratio/high_mean": 0.0024326679747900927, "clip_ratio/low_mean": 0.0038463478718767873, "clip_ratio/low_min": 0.00023901773265606608, "clip_ratio/region_mean": 0.006279015801555943, "epoch": 0.18133333333333335, "grad_norm": 0.22537396270777144, "learning_rate": 2.9417897781396884e-06, "loss": -0.0022, "step": 170 }, { "clip_ratio/high_max": 0.007265804848793777, "clip_ratio/high_mean": 0.0032901522690735874, "clip_ratio/low_mean": 0.007969670142847463, "clip_ratio/low_min": 0.0005763530025433283, "clip_ratio/region_mean": 0.01125982245139312, "epoch": 0.18666666666666668, "grad_norm": 0.19549475242836123, "learning_rate": 2.933829164088841e-06, "loss": -0.0032, "step": 175 }, { "clip_ratio/high_max": 0.002655027662876819, "clip_ratio/high_mean": 0.001208155520271248, "clip_ratio/low_mean": 0.0024745872016637806, "clip_ratio/low_min": 0.00019687277726916365, "clip_ratio/region_mean": 0.0036827426403306164, "completions/clipped_ratio": 0.19791666666666663, "completions/max_length": 3072.0, "completions/max_terminated_length": 2731.0, "completions/mean_length": 1150.6380615234375, "completions/mean_terminated_length": 676.5357055664062, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.192, "grad_norm": 0.2895879493726039, "learning_rate": 2.925370733352704e-06, "loss": -0.0011, "num_tokens": 7076942.0, "reward": 0.2942708432674408, "reward_std": 0.26612240076065063, "rewards/accuracy_reward/mean": 0.2942708432674408, "rewards/accuracy_reward/std": 0.4563088119029999, "step": 180 }, { "clip_ratio/high_max": 0.004924267961541773, "clip_ratio/high_mean": 0.0024231519953900717, "clip_ratio/low_mean": 0.004012001022965705, "clip_ratio/low_min": 0.0002916377838118933, "clip_ratio/region_mean": 0.006435153077700306, "epoch": 0.19733333333333333, "grad_norm": 0.37202188041864, "learning_rate": 2.9164174226465136e-06, "loss": -0.002, "step": 185 }, { "clip_ratio/high_max": 0.006944523732818198, "clip_ratio/high_mean": 0.003516825514361699, "clip_ratio/low_mean": 0.00927322073112009, "clip_ratio/low_min": 0.00047458279004786164, "clip_ratio/region_mean": 0.0127900462866819, "epoch": 0.20266666666666666, "grad_norm": 0.28867080647619153, "learning_rate": 2.9069723405047926e-06, "loss": -0.0031, "step": 190 }, { "clip_ratio/high_max": 0.0035060103965406596, "clip_ratio/high_mean": 0.0017517634771138546, "clip_ratio/low_mean": 0.005468913222625815, "clip_ratio/low_min": 0.00023343396424024832, "clip_ratio/region_mean": 0.007220676737119902, "completions/clipped_ratio": 0.1953125, "completions/max_length": 3072.0, "completions/max_terminated_length": 2954.0, "completions/mean_length": 1127.6015625, "completions/mean_terminated_length": 655.6602172851562, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.208, "grad_norm": 0.32506265530024747, "learning_rate": 2.89703876620209e-06, "loss": -0.0016, "num_tokens": 7562192.0, "reward": 0.2890625, "reward_std": 0.24808019399642944, "rewards/accuracy_reward/mean": 0.2890625, "rewards/accuracy_reward/std": 0.4539184272289276, "step": 195 }, { "clip_ratio/high_max": 0.004065762441678089, "clip_ratio/high_mean": 0.0019107346002783742, "clip_ratio/low_mean": 0.004896866964895707, "clip_ratio/low_min": 0.0009746402207383653, "clip_ratio/region_mean": 0.006807601516629802, "epoch": 0.21333333333333335, "grad_norm": 0.2664440291322751, "learning_rate": 2.8866201486144333e-06, "loss": -0.002, "step": 200 }, { "clip_ratio/high_max": 0.006363331121247029, "clip_ratio/high_mean": 0.0029218152740213554, "clip_ratio/low_mean": 0.010967053815693361, "clip_ratio/low_min": 0.0021254538602079264, "clip_ratio/region_mean": 0.013888869067068299, "epoch": 0.21866666666666668, "grad_norm": 0.23523340995533082, "learning_rate": 2.875720105021903e-06, "loss": -0.0033, "step": 205 }, { "clip_ratio/high_max": 0.004465005223210028, "clip_ratio/high_mean": 0.002039889586330901, "clip_ratio/low_mean": 0.009209338776940967, "clip_ratio/low_min": 0.0015583258075821504, "clip_ratio/region_mean": 0.011249228355063678, "completions/clipped_ratio": 0.20833333333333337, "completions/max_length": 3072.0, "completions/max_terminated_length": 2861.0, "completions/mean_length": 1169.3099365234375, "completions/mean_terminated_length": 668.6019897460938, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.224, "grad_norm": 5042.709052506257, "learning_rate": 2.8643424198527314e-06, "loss": 0.0272, "num_tokens": 8059675.0, "reward": 0.3177083730697632, "reward_std": 0.27965402603149414, "rewards/accuracy_reward/mean": 0.3177083432674408, "rewards/accuracy_reward/std": 0.4661927819252014, "step": 210 }, { "clip_ratio/high_max": 0.002753507634224661, "clip_ratio/high_mean": 0.0012564170256155194, "clip_ratio/low_mean": 0.004680744556549143, "clip_ratio/low_min": 0.00017640313490119297, "clip_ratio/region_mean": 0.0059371615912823476, "epoch": 0.22933333333333333, "grad_norm": 931.7367400964915, "learning_rate": 2.852491043369377e-06, "loss": 8.6991, "step": 215 }, { "clip_ratio/high_max": 0.004147610625113885, "clip_ratio/high_mean": 0.001961183077582973, "clip_ratio/low_mean": 0.008271946925196972, "clip_ratio/low_min": 0.0006188505456520943, "clip_ratio/region_mean": 0.010233130234973941, "epoch": 0.23466666666666666, "grad_norm": 6.270733524001969, "learning_rate": 2.840170090297014e-06, "loss": 0.0218, "step": 220 }, { "clip_ratio/high_max": 0.002720723580750928, "clip_ratio/high_mean": 0.0012863226074841805, "clip_ratio/low_mean": 0.0075431024601130044, "clip_ratio/low_min": 0.0006347198552248301, "clip_ratio/region_mean": 0.00882942491571157, "completions/clipped_ratio": 0.171875, "completions/max_length": 3072.0, "completions/max_terminated_length": 2792.0, "completions/mean_length": 1064.9583740234375, "completions/mean_terminated_length": 648.4024658203125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.24, "grad_norm": 0.2871154514430502, "learning_rate": 2.827383838394926e-06, "loss": 0.0003, "num_tokens": 8519232.0, "reward": 0.3151041865348816, "reward_std": 0.22101688385009766, "rewards/accuracy_reward/mean": 0.3151041567325592, "rewards/accuracy_reward/std": 0.4651634097099304, "step": 225 }, { "clip_ratio/high_max": 0.002411592633507098, "clip_ratio/high_mean": 0.0010880456917675475, "clip_ratio/low_mean": 0.0015896487093186805, "clip_ratio/low_min": 0.00027685199511324755, "clip_ratio/region_mean": 0.0026776944181847286, "epoch": 0.24533333333333332, "grad_norm": 0.23697387697327743, "learning_rate": 2.8141367269712943e-06, "loss": -0.0005, "step": 230 }, { "clip_ratio/high_max": 0.005076965968328295, "clip_ratio/high_mean": 0.0023153819511207985, "clip_ratio/low_mean": 0.005947045352877467, "clip_ratio/low_min": 0.0011312170572637115, "clip_ratio/region_mean": 0.008262427346380719, "epoch": 0.25066666666666665, "grad_norm": 0.2956121394061923, "learning_rate": 2.800433355341898e-06, "loss": -0.0018, "step": 235 }, { "clip_ratio/high_max": 0.005890207088668831, "clip_ratio/high_mean": 0.0026698726497670577, "clip_ratio/low_mean": 0.01140370096618426, "clip_ratio/low_min": 0.0019671105066663584, "clip_ratio/region_mean": 0.014073573659243267, "epoch": 0.256, "grad_norm": 0.2040374756522376, "learning_rate": 2.786278481233259e-06, "loss": -0.0027, "step": 240 }, { "clip_ratio/high_max": 0.0016806751148578769, "clip_ratio/high_mean": 0.0008087534655942363, "clip_ratio/low_mean": 0.001999178370556365, "clip_ratio/low_min": 0.0004022091379738413, "clip_ratio/region_mean": 0.002807931821780585, "completions/clipped_ratio": 0.20833333333333337, "completions/max_length": 3072.0, "completions/max_terminated_length": 2855.0, "completions/mean_length": 1142.963623046875, "completions/mean_terminated_length": 635.3223876953125, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.2613333333333333, "grad_norm": 0.34767817276003316, "learning_rate": 2.7716770191307885e-06, "loss": -0.0005, "num_tokens": 9011932.0, "reward": 0.3567708432674408, "reward_std": 0.26612240076065063, "rewards/accuracy_reward/mean": 0.3567708432674408, "rewards/accuracy_reward/std": 0.47967129945755005, "step": 245 }, { "clip_ratio/high_max": 0.005238088621808856, "clip_ratio/high_mean": 0.0024511327544587402, "clip_ratio/low_mean": 0.013540269805889692, "clip_ratio/low_min": 0.0025036389124579728, "clip_ratio/region_mean": 0.015991402580038994, "epoch": 0.26666666666666666, "grad_norm": 0.492086661025711, "learning_rate": 2.7566340385725087e-06, "loss": -0.0025, "step": 250 }, { "clip_ratio/high_max": 0.007083323394363105, "clip_ratio/high_mean": 0.0033130540919501074, "clip_ratio/low_mean": 0.026311448449268936, "clip_ratio/low_min": 0.004475444972194964, "clip_ratio/region_mean": 0.029624502923252295, "epoch": 0.272, "grad_norm": 0.20497400381227068, "learning_rate": 2.74115476238894e-06, "loss": -0.0042, "step": 255 }, { "clip_ratio/high_max": 0.0023149752813878877, "clip_ratio/high_mean": 0.0010831557350229558, "clip_ratio/low_mean": 0.007571964882890825, "clip_ratio/low_min": 0.0012249861937561946, "clip_ratio/region_mean": 0.008655120634921331, "completions/clipped_ratio": 0.203125, "completions/max_length": 3072.0, "completions/max_terminated_length": 3018.0, "completions/mean_length": 1131.5443115234375, "completions/mean_terminated_length": 636.9183349609375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.2773333333333333, "grad_norm": 0.2766394417071501, "learning_rate": 2.725244564889764e-06, "loss": -0.0017, "num_tokens": 9502827.0, "reward": 0.3333333432674408, "reward_std": 0.23003798723220825, "rewards/accuracy_reward/mean": 0.3333333432674408, "rewards/accuracy_reward/std": 0.4720194935798645, "step": 260 }, { "clip_ratio/high_max": 0.003362817376910243, "clip_ratio/high_mean": 0.0014641431391282822, "clip_ratio/low_mean": 0.025800016580615193, "clip_ratio/low_min": 0.0014690757234347984, "clip_ratio/region_mean": 0.027264159741753245, "epoch": 0.2826666666666667, "grad_norm": 0.22864962485800833, "learning_rate": 2.7089089699979008e-06, "loss": -0.0033, "step": 265 }, { "clip_ratio/high_max": 0.005080301449197577, "clip_ratio/high_mean": 0.0021041881016572008, "clip_ratio/low_mean": 0.046076451375120085, "clip_ratio/low_min": 0.0025963570777093993, "clip_ratio/region_mean": 0.04818063929124037, "epoch": 0.288, "grad_norm": 0.21527476743149906, "learning_rate": 2.6921536493316326e-06, "loss": -0.0049, "step": 270 }, { "clip_ratio/high_max": 0.0029253067237732465, "clip_ratio/high_mean": 0.0012250600140305324, "clip_ratio/low_mean": 0.025427330315778818, "clip_ratio/low_min": 0.001569969744195987, "clip_ratio/region_mean": 0.026652389979972213, "completions/clipped_ratio": 0.17708333333333337, "completions/max_length": 3072.0, "completions/max_terminated_length": 3028.0, "completions/mean_length": 1028.244873046875, "completions/mean_terminated_length": 588.4494018554688, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.29333333333333333, "grad_norm": 0.37184042936371264, "learning_rate": 2.6749844202354553e-06, "loss": -0.0031, "num_tokens": 9945907.0, "reward": 0.4140625, "reward_std": 0.26612240076065063, "rewards/accuracy_reward/mean": 0.4140625, "rewards/accuracy_reward/std": 0.49320197105407715, "step": 275 }, { "clip_ratio/high_max": 0.00417593694437528, "clip_ratio/high_mean": 0.0019242878805016516, "clip_ratio/low_mean": 0.0711715504181484, "clip_ratio/low_min": 0.009762267696714844, "clip_ratio/region_mean": 0.07309583908318018, "epoch": 0.2986666666666667, "grad_norm": 0.3210219617484807, "learning_rate": 2.65740724376033e-06, "loss": -0.0052, "step": 280 }, { "clip_ratio/high_max": 0.008914283164995141, "clip_ratio/high_mean": 0.0037652082917702502, "clip_ratio/low_mean": 0.07137071263132384, "clip_ratio/low_min": 0.013413932779803872, "clip_ratio/region_mean": 0.07513592096802313, "epoch": 0.304, "grad_norm": 0.31230218100921864, "learning_rate": 2.6394282225940447e-06, "loss": -0.0072, "step": 285 }, { "clip_ratio/high_max": 0.006666170480275469, "clip_ratio/high_mean": 0.0027762708372847554, "clip_ratio/low_mean": 0.05619774387887446, "clip_ratio/low_min": 0.012247233820744441, "clip_ratio/region_mean": 0.05897401415168133, "completions/clipped_ratio": 0.15625, "completions/max_length": 3072.0, "completions/max_terminated_length": 2997.0, "completions/mean_length": 954.5807495117188, "completions/mean_terminated_length": 562.466064453125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.30933333333333335, "grad_norm": 0.71949769673446, "learning_rate": 2.621053598942398e-06, "loss": -0.0056, "num_tokens": 10362713.0, "reward": 0.4427083432674408, "reward_std": 0.24356961250305176, "rewards/accuracy_reward/mean": 0.4427083432674408, "rewards/accuracy_reward/std": 0.49735480546951294, "step": 290 }, { "clip_ratio/high_max": 0.003379093437706615, "clip_ratio/high_mean": 0.0015107715925296362, "clip_ratio/low_mean": 0.07282961994133075, "clip_ratio/low_min": 0.0024502106356521836, "clip_ratio/region_mean": 0.0743403908532855, "epoch": 0.31466666666666665, "grad_norm": 0.36610332163183695, "learning_rate": 2.6022897523619424e-06, "loss": -0.0047, "step": 295 }, { "clip_ratio/high_max": 0.005252830811514286, "clip_ratio/high_mean": 0.0025059831141334143, "clip_ratio/low_mean": 0.06780252321186708, "clip_ratio/low_min": 0.0030584968168113845, "clip_ratio/region_mean": 0.07030850623432343, "epoch": 0.32, "grad_norm": 0.34115664035432014, "learning_rate": 2.583143197545044e-06, "loss": -0.0064, "step": 300 }, { "clip_ratio/high_max": 0.0048096978782268705, "clip_ratio/high_mean": 0.002272353955231665, "clip_ratio/low_mean": 0.06703425145242363, "clip_ratio/low_min": 0.004013959509757114, "clip_ratio/region_mean": 0.06930660564357823, "completions/clipped_ratio": 0.20572916666666663, "completions/max_length": 3072.0, "completions/max_terminated_length": 2582.0, "completions/mean_length": 1106.526123046875, "completions/mean_terminated_length": 597.4360961914062, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.3253333333333333, "grad_norm": 0.9975554352535647, "learning_rate": 2.5636205820580173e-06, "loss": -0.0059, "num_tokens": 10844076.0, "reward": 0.3697916865348816, "reward_std": 0.25259074568748474, "rewards/accuracy_reward/mean": 0.3697916567325592, "rewards/accuracy_reward/std": 0.48337799310684204, "step": 305 }, { "clip_ratio/high_max": 0.0031820105456972668, "clip_ratio/high_mean": 0.0012596330894893981, "clip_ratio/low_mean": 0.09244121388269377, "clip_ratio/low_min": 0.0014196576996255317, "clip_ratio/region_mean": 0.09370084733416206, "epoch": 0.33066666666666666, "grad_norm": 0.605438009339709, "learning_rate": 2.5437286840331353e-06, "loss": -0.0045, "step": 310 }, { "clip_ratio/high_max": 0.009952538682500744, "clip_ratio/high_mean": 0.0037549236103586736, "clip_ratio/low_mean": 0.06841608674458258, "clip_ratio/low_min": 0.002268788730725646, "clip_ratio/region_mean": 0.07217101125716, "epoch": 0.336, "grad_norm": 0.5566446815028955, "learning_rate": 2.5234744098153e-06, "loss": -0.0066, "step": 315 }, { "clip_ratio/high_max": 0.007441161333554191, "clip_ratio/high_mean": 0.0029721927284299455, "clip_ratio/low_mean": 0.1140272500933861, "clip_ratio/low_min": 0.003387229223153554, "clip_ratio/region_mean": 0.11699944370720913, "epoch": 0.3413333333333333, "grad_norm": 0.24097582393782332, "learning_rate": 2.502864791564205e-06, "loss": -0.0076, "step": 320 }, { "clip_ratio/high_max": 0.001683332831453299, "clip_ratio/high_mean": 0.0007495411800846341, "clip_ratio/low_mean": 0.06396128333512933, "clip_ratio/low_min": 0.008440114792574605, "clip_ratio/region_mean": 0.06471082528951229, "completions/clipped_ratio": 0.16145833333333337, "completions/max_length": 3072.0, "completions/max_terminated_length": 2196.0, "completions/mean_length": 961.078125, "completions/mean_terminated_length": 554.6273193359375, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.3466666666666667, "grad_norm": 0.7251705444554711, "learning_rate": 2.48190698481281e-06, "loss": -0.0034, "num_tokens": 11265231.0, "reward": 0.4557291865348816, "reward_std": 0.26161181926727295, "rewards/accuracy_reward/mean": 0.4557291567325592, "rewards/accuracy_reward/std": 0.4986860156059265, "step": 325 }, { "clip_ratio/high_max": 0.007684722321937443, "clip_ratio/high_mean": 0.003086164402338909, "clip_ratio/low_mean": 0.08778525301277114, "clip_ratio/low_min": 0.013961980446765665, "clip_ratio/region_mean": 0.09087141594354761, "epoch": 0.352, "grad_norm": 0.4135959379743344, "learning_rate": 2.460608265982985e-06, "loss": -0.006, "step": 330 }, { "clip_ratio/high_max": 0.009860236405438627, "clip_ratio/high_mean": 0.003965331593644805, "clip_ratio/low_mean": 0.10978739592246711, "clip_ratio/low_min": 0.012376285332720727, "clip_ratio/region_mean": 0.113752728360123, "epoch": 0.35733333333333334, "grad_norm": 0.2664746507461905, "learning_rate": 2.4389760298591824e-06, "loss": -0.0074, "step": 335 }, { "clip_ratio/high_max": 0.0029835030671165443, "clip_ratio/high_mean": 0.001240652644492002, "clip_ratio/low_mean": 0.0533037496857105, "clip_ratio/low_min": 0.0014057877069717506, "clip_ratio/region_mean": 0.05454440365811024, "completions/clipped_ratio": 0.1875, "completions/max_length": 3072.0, "completions/max_terminated_length": 2941.0, "completions/mean_length": 1023.125, "completions/mean_terminated_length": 550.3076782226562, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.3626666666666667, "grad_norm": 0.7614908157776686, "learning_rate": 2.4170177870210112e-06, "loss": -0.0043, "num_tokens": 11712837.0, "reward": 0.4322916865348816, "reward_std": 0.23454853892326355, "rewards/accuracy_reward/mean": 0.4322916567325592, "rewards/accuracy_reward/std": 0.4960406720638275, "step": 340 }, { "clip_ratio/high_max": 0.004076514281041455, "clip_ratio/high_mean": 0.001811919272859086, "clip_ratio/low_mean": 0.098631420507445, "clip_ratio/low_min": 0.0012782284262357279, "clip_ratio/region_mean": 0.1004433399404661, "epoch": 0.368, "grad_norm": 0.6288263184759736, "learning_rate": 2.3947411612356092e-06, "loss": -0.0055, "step": 345 }, { "clip_ratio/high_max": 0.006100629337015562, "clip_ratio/high_mean": 0.002691696735564619, "clip_ratio/low_mean": 0.0987749217369128, "clip_ratio/low_min": 0.0017722182034049182, "clip_ratio/region_mean": 0.10146661846520147, "epoch": 0.37333333333333335, "grad_norm": 0.35500831341304645, "learning_rate": 2.3721538868107225e-06, "loss": -0.007, "step": 350 }, { "clip_ratio/high_max": 0.0030550648296411966, "clip_ratio/high_mean": 0.0013258675193355885, "clip_ratio/low_mean": 0.04249611052277942, "clip_ratio/low_min": 0.0013160976604922325, "clip_ratio/region_mean": 0.04382197831437225, "completions/clipped_ratio": 0.19791666666666663, "completions/max_length": 3072.0, "completions/max_terminated_length": 2822.0, "completions/mean_length": 1051.6171875, "completions/mean_terminated_length": 553.0811767578125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.37866666666666665, "grad_norm": 0.49527716898505153, "learning_rate": 2.3492638059093957e-06, "loss": -0.005, "num_tokens": 12170553.0, "reward": 0.4583333432674408, "reward_std": 0.24356964230537415, "rewards/accuracy_reward/mean": 0.4583333432674408, "rewards/accuracy_reward/std": 0.49891090393066406, "step": 355 }, { "clip_ratio/high_max": 0.0030610247978074766, "clip_ratio/high_mean": 0.0013327152694273536, "clip_ratio/low_mean": 0.13173791540011734, "clip_ratio/low_min": 0.008659371639441815, "clip_ratio/region_mean": 0.13307063087195276, "epoch": 0.384, "grad_norm": 0.7649356546798302, "learning_rate": 2.3260788658272246e-06, "loss": -0.005, "step": 360 }, { "clip_ratio/high_max": 0.004847329439326131, "clip_ratio/high_mean": 0.00225636003569889, "clip_ratio/low_mean": 0.10447387288149912, "clip_ratio/low_min": 0.004142179613700137, "clip_ratio/region_mean": 0.10673023308772826, "epoch": 0.3893333333333333, "grad_norm": 0.5075742072537861, "learning_rate": 2.302607116233101e-06, "loss": -0.0067, "step": 365 }, { "clip_ratio/high_max": 0.003597913720568613, "clip_ratio/high_mean": 0.0017043638324139466, "clip_ratio/low_mean": 0.04826350058042408, "clip_ratio/low_min": 0.002022504380374812, "clip_ratio/region_mean": 0.04996786487722602, "completions/clipped_ratio": 0.203125, "completions/max_length": 3072.0, "completions/max_terminated_length": 2560.0, "completions/mean_length": 1049.190185546875, "completions/mean_terminated_length": 533.5718994140625, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.39466666666666667, "grad_norm": 0.8449146113466531, "learning_rate": 2.278856706374422e-06, "loss": -0.0054, "num_tokens": 12632365.0, "reward": 0.4166666865348816, "reward_std": 0.23003801703453064, "rewards/accuracy_reward/mean": 0.4166666567325592, "rewards/accuracy_reward/std": 0.49364984035491943, "step": 370 }, { "clip_ratio/high_max": 0.0021007199358791693, "clip_ratio/high_mean": 0.0009402908016227229, "clip_ratio/low_mean": 0.14048349194490584, "clip_ratio/low_min": 0.007144192687337636, "clip_ratio/region_mean": 0.14142378216274665, "epoch": 0.4, "grad_norm": 0.3721784293588264, "learning_rate": 2.254835882247716e-06, "loss": -0.005, "step": 375 }, { "clip_ratio/high_max": 0.004206264461026876, "clip_ratio/high_mean": 0.0018912223782308502, "clip_ratio/low_mean": 0.07202427100091882, "clip_ratio/low_min": 0.004583687966805883, "clip_ratio/region_mean": 0.07391549248859519, "epoch": 0.4053333333333333, "grad_norm": 0.3944682692743166, "learning_rate": 2.230552983735686e-06, "loss": -0.0066, "step": 380 }, { "clip_ratio/high_max": 0.003962618076184299, "clip_ratio/high_mean": 0.0017866393171061645, "clip_ratio/low_mean": 0.09798001832059526, "clip_ratio/low_min": 0.005440683248161804, "clip_ratio/region_mean": 0.09976665657195553, "completions/clipped_ratio": 0.1796875, "completions/max_length": 3072.0, "completions/max_terminated_length": 1974.0, "completions/mean_length": 933.4323120117188, "completions/mean_terminated_length": 464.984130859375, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.4106666666666667, "grad_norm": 1.176694876215528, "learning_rate": 2.206016441711652e-06, "loss": -0.0056, "num_tokens": 13046537.0, "reward": 0.46875, "reward_std": 0.19846415519714355, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.4996735155582428, "step": 385 }, { "clip_ratio/high_max": 0.0018572545009192254, "clip_ratio/high_mean": 0.0007957926945891813, "clip_ratio/low_mean": 0.05896586015978755, "clip_ratio/low_min": 0.0025499404731817777, "clip_ratio/region_mean": 0.05976165267472879, "epoch": 0.416, "grad_norm": 0.5734786840603884, "learning_rate": 2.1812347751124072e-06, "loss": -0.0031, "step": 390 }, { "clip_ratio/high_max": 0.003554840175729623, "clip_ratio/high_mean": 0.0015769718615274541, "clip_ratio/low_mean": 0.07429686068680894, "clip_ratio/low_min": 0.0033054753057513153, "clip_ratio/region_mean": 0.07587383319751098, "epoch": 0.42133333333333334, "grad_norm": 0.2865179160110887, "learning_rate": 2.156216587980491e-06, "loss": -0.0042, "step": 395 }, { "clip_ratio/high_max": 0.004424673312223603, "clip_ratio/high_mean": 0.0019454615328641013, "clip_ratio/low_mean": 0.07805184166481922, "clip_ratio/low_min": 0.010520006919614388, "clip_ratio/region_mean": 0.07999730353549239, "epoch": 0.4266666666666667, "grad_norm": 0.3244262791031944, "learning_rate": 2.1309705664769195e-06, "loss": -0.0046, "step": 400 }, { "clip_ratio/high_max": 0.0013938009499725012, "clip_ratio/high_mean": 0.0005448016464242756, "clip_ratio/low_mean": 0.05232084265244339, "clip_ratio/low_min": 0.00010685719671528204, "clip_ratio/region_mean": 0.052865644360622355, "completions/clipped_ratio": 0.21875, "completions/max_length": 3072.0, "completions/max_terminated_length": 1818.0, "completions/mean_length": 1090.869873046875, "completions/mean_terminated_length": 536.1533203125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.432, "grad_norm": 0.7197172142348183, "learning_rate": 2.1055054758654056e-06, "loss": -0.0024, "num_tokens": 13521225.0, "reward": 0.4114583432674408, "reward_std": 0.21650633215904236, "rewards/accuracy_reward/mean": 0.4114583432674408, "rewards/accuracy_reward/std": 0.4927399158477783, "step": 405 }, { "clip_ratio/high_max": 0.0038134682337840784, "clip_ratio/high_mean": 0.0015407557248181547, "clip_ratio/low_mean": 0.07104751073620719, "clip_ratio/low_min": 0.0002996521205204772, "clip_ratio/region_mean": 0.07258826747547573, "epoch": 0.43733333333333335, "grad_norm": 0.5825063346280412, "learning_rate": 2.0798301574691106e-06, "loss": -0.0042, "step": 410 }, { "clip_ratio/high_max": 0.004845974740692327, "clip_ratio/high_mean": 0.002042640848094379, "clip_ratio/low_mean": 0.08078229681595986, "clip_ratio/low_min": 0.0004686584856244735, "clip_ratio/region_mean": 0.08282493733677257, "epoch": 0.44266666666666665, "grad_norm": 0.34869709297066837, "learning_rate": 2.053953525600994e-06, "loss": -0.0052, "step": 415 }, { "clip_ratio/high_max": 0.0018839577745893622, "clip_ratio/high_mean": 0.0007635463869064552, "clip_ratio/low_mean": 0.030383340324146958, "clip_ratio/low_min": 0.0020213471972965634, "clip_ratio/region_mean": 0.03114688619352819, "completions/clipped_ratio": 0.19270833333333337, "completions/max_length": 3072.0, "completions/max_terminated_length": 2786.0, "completions/mean_length": 1021.359375, "completions/mean_terminated_length": 531.8516235351562, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.448, "grad_norm": 0.38751935954093847, "learning_rate": 2.027884564468816e-06, "loss": -0.0027, "num_tokens": 13966500.0, "reward": 0.4401041865348816, "reward_std": 0.17140087485313416, "rewards/accuracy_reward/mean": 0.4401041567325592, "rewards/accuracy_reward/std": 0.49704715609550476, "step": 420 }, { "clip_ratio/high_max": 0.002864206872982322, "clip_ratio/high_mean": 0.001183286251125537, "clip_ratio/low_mean": 0.08100727885539527, "clip_ratio/low_min": 0.010261134334359667, "clip_ratio/region_mean": 0.08219056457965053, "epoch": 0.4533333333333333, "grad_norm": 0.22018703177975757, "learning_rate": 2.0016323250558765e-06, "loss": -0.0028, "step": 425 }, { "clip_ratio/high_max": 0.003963224481412908, "clip_ratio/high_mean": 0.0016329133370163618, "clip_ratio/low_mean": 0.031053547892952338, "clip_ratio/low_min": 0.002775615965219913, "clip_ratio/region_mean": 0.032686461017874535, "epoch": 0.45866666666666667, "grad_norm": 0.19544633096174197, "learning_rate": 1.9752059219785703e-06, "loss": -0.0036, "step": 430 }, { "clip_ratio/high_max": 0.002252693182663279, "clip_ratio/high_mean": 0.0009132299313705516, "clip_ratio/low_mean": 0.04574478757022007, "clip_ratio/low_min": 0.00508982240135083, "clip_ratio/region_mean": 0.04665801805304, "completions/clipped_ratio": 0.20052083333333337, "completions/max_length": 3072.0, "completions/max_terminated_length": 2650.0, "completions/mean_length": 1020.2943115234375, "completions/mean_terminated_length": 505.6970520019531, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.464, "grad_norm": 0.39901516514336943, "learning_rate": 1.948614530321848e-06, "loss": -0.0027, "num_tokens": 14410223.0, "reward": 0.4557291865348816, "reward_std": 0.20297470688819885, "rewards/accuracy_reward/mean": 0.4557291567325592, "rewards/accuracy_reward/std": 0.4986859858036041, "step": 435 }, { "clip_ratio/high_max": 0.002569176770703052, "clip_ratio/high_mean": 0.0010525892265832227, "clip_ratio/low_mean": 0.08931325557141463, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09036584526329534, "epoch": 0.4693333333333333, "grad_norm": 0.5332244422550945, "learning_rate": 1.921867382453679e-06, "loss": -0.0031, "step": 440 }, { "clip_ratio/high_max": 0.004141085335140815, "clip_ratio/high_mean": 0.0017072654023195355, "clip_ratio/low_mean": 0.06765320781350966, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.06936047223471178, "epoch": 0.4746666666666667, "grad_norm": 0.42905228899368286, "learning_rate": 1.8949737648196395e-06, "loss": -0.0041, "step": 445 }, { "clip_ratio/high_max": 0.0030101402818218047, "clip_ratio/high_mean": 0.001238053569841213, "clip_ratio/low_mean": 0.04169838173086191, "clip_ratio/low_min": 6.94540940457955e-06, "clip_ratio/region_mean": 0.04293643507176057, "completions/clipped_ratio": 0.21614583333333337, "completions/max_length": 3072.0, "completions/max_terminated_length": 1593.0, "completions/mean_length": 1065.265625, "completions/mean_terminated_length": 511.9136047363281, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.48, "grad_norm": 0.7661050740551064, "learning_rate": 1.8679430147187031e-06, "loss": -0.0032, "num_tokens": 14874806.0, "reward": 0.4817708432674408, "reward_std": 0.18944305181503296, "rewards/accuracy_reward/mean": 0.4817708432674408, "rewards/accuracy_reward/std": 0.5003194212913513, "step": 450 }, { "clip_ratio/high_max": 0.0018354151041421573, "clip_ratio/high_mean": 0.0007806268115928105, "clip_ratio/low_mean": 0.07441385543429532, "clip_ratio/low_min": 0.0001982575979127432, "clip_ratio/region_mean": 0.07519448148200354, "epoch": 0.48533333333333334, "grad_norm": 0.5550116515206406, "learning_rate": 1.840784517061398e-06, "loss": -0.0031, "step": 455 }, { "clip_ratio/high_max": 0.00441814328860346, "clip_ratio/high_mean": 0.0017888988276354213, "clip_ratio/low_mean": 0.03529077522398438, "clip_ratio/low_min": 0.00036053421790711583, "clip_ratio/region_mean": 0.037079673577136416, "epoch": 0.49066666666666664, "grad_norm": 0.48292145311781487, "learning_rate": 1.8135077011114185e-06, "loss": -0.0042, "step": 460 }, { "clip_ratio/high_max": 0.0042142630229136556, "clip_ratio/high_mean": 0.00173533724587287, "clip_ratio/low_mean": 0.0564227989903884, "clip_ratio/low_min": 0.00047756879139342344, "clip_ratio/region_mean": 0.05815813659610285, "completions/clipped_ratio": 0.17447916666666663, "completions/max_length": 3072.0, "completions/max_terminated_length": 2306.0, "completions/mean_length": 956.0651245117188, "completions/mean_terminated_length": 508.8486022949219, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.496, "grad_norm": 1.3138593428471037, "learning_rate": 1.7861220372118446e-06, "loss": -0.0037, "num_tokens": 15291711.0, "reward": 0.4791666865348816, "reward_std": 0.15786921977996826, "rewards/accuracy_reward/mean": 0.4791666567325592, "rewards/accuracy_reward/std": 0.5002175569534302, "step": 465 }, { "clip_ratio/high_max": 0.0014431175798563344, "clip_ratio/high_mean": 0.000561200064271361, "clip_ratio/low_mean": 0.0701663168198138, "clip_ratio/low_min": 0.005328421051126497, "clip_ratio/region_mean": 0.07072751648065605, "epoch": 0.5013333333333333, "grad_norm": 0.6723523651970409, "learning_rate": 1.7586370334970954e-06, "loss": -0.0029, "step": 470 }, { "clip_ratio/high_max": 0.002765353807990323, "clip_ratio/high_mean": 0.0011087034272350138, "clip_ratio/low_mean": 0.06183230506685504, "clip_ratio/low_min": 0.0046877538989065215, "clip_ratio/region_mean": 0.06294100870145485, "epoch": 0.5066666666666667, "grad_norm": 0.6676406787542302, "learning_rate": 1.7310622325917648e-06, "loss": -0.0038, "step": 475 }, { "clip_ratio/high_max": 0.0033429564977268456, "clip_ratio/high_mean": 0.0013738121360802324, "clip_ratio/low_mean": 0.08155342446152644, "clip_ratio/low_min": 0.006805185489065479, "clip_ratio/region_mean": 0.08292723593294796, "epoch": 0.512, "grad_norm": 0.3985374024935878, "learning_rate": 1.7034072082974805e-06, "loss": -0.0044, "step": 480 }, { "clip_ratio/high_max": 0.0011374687709576393, "clip_ratio/high_mean": 0.0004723400142211176, "clip_ratio/low_mean": 0.02250789805361819, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02298023812847987, "completions/clipped_ratio": 0.2109375, "completions/max_length": 3072.0, "completions/max_terminated_length": 1452.0, "completions/mean_length": 1018.0521240234375, "completions/mean_terminated_length": 468.9768981933594, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.5173333333333333, "grad_norm": 1.2905165763586464, "learning_rate": 1.6756815622689371e-06, "loss": -0.0012, "num_tokens": 15732473.0, "reward": 0.4453125, "reward_std": 0.16237975656986237, "rewards/accuracy_reward/mean": 0.4453125, "rewards/accuracy_reward/std": 0.4976486563682556, "step": 485 }, { "clip_ratio/high_max": 0.0028168514740173124, "clip_ratio/high_mean": 0.0011303566189326375, "clip_ratio/low_mean": 0.056221505152097964, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.057351861235019896, "epoch": 0.5226666666666666, "grad_norm": 0.40519495905709796, "learning_rate": 1.6478949206802629e-06, "loss": -0.002, "step": 490 }, { "clip_ratio/high_max": 0.0033902631028468024, "clip_ratio/high_mean": 0.0013365435979721951, "clip_ratio/low_mean": 0.046887436969700505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04822398088381306, "epoch": 0.528, "grad_norm": 0.33114448021641263, "learning_rate": 1.6200569308828705e-06, "loss": -0.0025, "step": 495 }, { "clip_ratio/high_max": 0.001350151717360859, "clip_ratio/high_mean": 0.000538574478650844, "clip_ratio/low_mean": 0.03427403083574063, "clip_ratio/low_min": 0.0012982267857296393, "clip_ratio/region_mean": 0.03481260505291175, "completions/clipped_ratio": 0.2578125, "completions/max_length": 3072.0, "completions/max_terminated_length": 1511.0, "completions/mean_length": 1130.609375, "completions/mean_terminated_length": 456.2315979003906, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5333333333333333, "grad_norm": 0.6251983134359662, "learning_rate": 1.5921772580559549e-06, "loss": -0.0024, "num_tokens": 16219472.0, "reward": 0.4583333432674408, "reward_std": 0.18944303691387177, "rewards/accuracy_reward/mean": 0.4583333432674408, "rewards/accuracy_reward/std": 0.49891090393066406, "step": 500 }, { "clip_ratio/high_max": 0.0023527391455900213, "clip_ratio/high_mean": 0.000962798686032329, "clip_ratio/low_mean": 0.08444502200868556, "clip_ratio/low_min": 0.0043283133090881165, "clip_ratio/region_mean": 0.08540782133818539, "epoch": 0.5386666666666666, "grad_norm": 1.0613051707001024, "learning_rate": 1.5642655818508029e-06, "loss": -0.0034, "step": 505 }, { "clip_ratio/high_max": 0.0034039687756376225, "clip_ratio/high_mean": 0.0014410186166969653, "clip_ratio/low_mean": 0.0713321166809692, "clip_ratio/low_min": 0.003797134505293798, "clip_ratio/region_mean": 0.07277313535942084, "epoch": 0.544, "grad_norm": 0.5308766846379346, "learning_rate": 1.5363315930300777e-06, "loss": -0.0045, "step": 510 }, { "clip_ratio/high_max": 0.0018221235108285328, "clip_ratio/high_mean": 0.0007714846894941729, "clip_ratio/low_mean": 0.037042277090517925, "clip_ratio/low_min": 0.0018676266225156723, "clip_ratio/region_mean": 0.037813761885718125, "completions/clipped_ratio": 0.2109375, "completions/max_length": 3072.0, "completions/max_terminated_length": 1611.0, "completions/mean_length": 1022.8203125, "completions/mean_terminated_length": 475.0198059082031, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.5493333333333333, "grad_norm": 0.4555142846733849, "learning_rate": 1.5083849901032472e-06, "loss": -0.0027, "num_tokens": 16667258.0, "reward": 0.4583333432674408, "reward_std": 0.20297470688819885, "rewards/accuracy_reward/mean": 0.4583333432674408, "rewards/accuracy_reward/std": 0.49891090393066406, "step": 515 }, { "clip_ratio/high_max": 0.0028346409020741704, "clip_ratio/high_mean": 0.001165486571881047, "clip_ratio/low_mean": 0.0909755569943627, "clip_ratio/low_min": 7.94659870734904e-05, "clip_ratio/region_mean": 0.09214104335651427, "epoch": 0.5546666666666666, "grad_norm": 0.626839616075161, "learning_rate": 1.4804354759593176e-06, "loss": -0.003, "step": 520 }, { "clip_ratio/high_max": 0.01591993395686586, "clip_ratio/high_mean": 0.005639828065841357, "clip_ratio/low_mean": 0.05858304468092683, "clip_ratio/low_min": 0.00014701207837788388, "clip_ratio/region_mean": 0.06422287253226386, "epoch": 0.56, "grad_norm": 0.3759378172501364, "learning_rate": 1.452492754498053e-06, "loss": -0.0039, "step": 525 }, { "clip_ratio/high_max": 0.004570779279583803, "clip_ratio/high_mean": 0.0017571643659721303, "clip_ratio/low_mean": 0.039327953054817046, "clip_ratio/low_min": 7.555199772468768e-05, "clip_ratio/region_mean": 0.041085118063142544, "completions/clipped_ratio": 0.30208333333333337, "completions/max_length": 3072.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 1256.8255615234375, "completions/mean_terminated_length": 471.1529846191406, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.5653333333333334, "grad_norm": 1.1346661690949225, "learning_rate": 1.4245665272608392e-06, "loss": -0.0031, "num_tokens": 17200267.0, "reward": 0.421875, "reward_std": 0.18493251502513885, "rewards/accuracy_reward/mean": 0.421875, "rewards/accuracy_reward/std": 0.49450308084487915, "step": 530 }, { "clip_ratio/high_max": 0.0014940402811589593, "clip_ratio/high_mean": 0.000641074164695965, "clip_ratio/low_mean": 0.09626583683711942, "clip_ratio/low_min": 0.00992904519325748, "clip_ratio/region_mean": 0.09690691181231159, "epoch": 0.5706666666666667, "grad_norm": 0.6025678725265979, "learning_rate": 1.396666490062369e-06, "loss": -0.004, "step": 535 }, { "clip_ratio/high_max": 0.002706727543409215, "clip_ratio/high_mean": 0.0011666997846987216, "clip_ratio/low_mean": 0.05861752019454798, "clip_ratio/low_min": 0.004863509997267102, "clip_ratio/region_mean": 0.05978421976515165, "epoch": 0.576, "grad_norm": 0.2911962819324454, "learning_rate": 1.368802329624314e-06, "loss": -0.0051, "step": 540 }, { "clip_ratio/high_max": 0.002708869335401687, "clip_ratio/high_mean": 0.001146126141793502, "clip_ratio/low_mean": 0.07105408472489216, "clip_ratio/low_min": 0.0077869487173302335, "clip_ratio/region_mean": 0.07220021089160582, "completions/clipped_ratio": 0.32552083333333337, "completions/max_length": 3072.0, "completions/max_terminated_length": 1732.0, "completions/mean_length": 1321.362060546875, "completions/mean_terminated_length": 476.4594421386719, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.5813333333333334, "grad_norm": 1.387241533090774, "learning_rate": 1.3409837202121548e-06, "loss": -0.0043, "num_tokens": 17758814.0, "reward": 0.4322916865348816, "reward_std": 0.15786921977996826, "rewards/accuracy_reward/mean": 0.4322916567325592, "rewards/accuracy_reward/std": 0.4960407018661499, "step": 545 }, { "clip_ratio/high_max": 0.0011825616422811436, "clip_ratio/high_mean": 0.0004538118826758364, "clip_ratio/low_mean": 0.05125583837159411, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.05170965068959958, "epoch": 0.5866666666666667, "grad_norm": 0.6347786581963231, "learning_rate": 1.313220320276336e-06, "loss": -0.0029, "step": 550 }, { "clip_ratio/high_max": 0.0026606321714098156, "clip_ratio/high_mean": 0.0010267326748589767, "clip_ratio/low_mean": 0.08060475674774352, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.08163148950552568, "epoch": 0.592, "grad_norm": 0.5375504405571101, "learning_rate": 1.285521769098911e-06, "loss": -0.0037, "step": 555 }, { "clip_ratio/high_max": 0.0032606150654828523, "clip_ratio/high_mean": 0.0012811373724389342, "clip_ratio/low_mean": 0.04086343426388339, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04214457105845213, "epoch": 0.5973333333333334, "grad_norm": 0.31265805580240597, "learning_rate": 1.257897683446842e-06, "loss": -0.0045, "step": 560 }, { "clip_ratio/high_max": 0.0009253688945136673, "clip_ratio/high_mean": 0.0003848460328299552, "clip_ratio/low_mean": 0.03965746309527276, "clip_ratio/low_min": 0.003099139089044911, "clip_ratio/region_mean": 0.04004230858863593, "completions/clipped_ratio": 0.2734375, "completions/max_length": 3072.0, "completions/max_terminated_length": 3071.0, "completions/mean_length": 1189.7421875, "completions/mean_terminated_length": 481.3656005859375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.6026666666666667, "grad_norm": 0.5290948061172572, "learning_rate": 1.2303576542331168e-06, "loss": -0.0024, "num_tokens": 18270491.0, "reward": 0.4739583432674408, "reward_std": 0.16237974166870117, "rewards/accuracy_reward/mean": 0.4739583432674408, "rewards/accuracy_reward/std": 0.49997276067733765, "step": 565 }, { "clip_ratio/high_max": 0.0023125247231291723, "clip_ratio/high_mean": 0.000941522240100312, "clip_ratio/low_mean": 0.10140650683897548, "clip_ratio/low_min": 0.012586363311856984, "clip_ratio/region_mean": 0.10234802895083703, "epoch": 0.608, "grad_norm": 0.33095762767387343, "learning_rate": 1.2029112431868455e-06, "loss": -0.0031, "step": 570 }, { "clip_ratio/high_max": 0.003593901906879182, "clip_ratio/high_mean": 0.0014164089910082111, "clip_ratio/low_mean": 0.025131099008285675, "clip_ratio/low_min": 0.0013168183235393372, "clip_ratio/region_mean": 0.02654750823676295, "epoch": 0.6133333333333333, "grad_norm": 0.2855994477990491, "learning_rate": 1.1755679795334832e-06, "loss": -0.0041, "step": 575 }, { "clip_ratio/high_max": 0.001284848817749662, "clip_ratio/high_mean": 0.0004980170889666624, "clip_ratio/low_mean": 0.046352222651216836, "clip_ratio/low_min": 0.0028331320638244504, "clip_ratio/region_mean": 0.04685023971142073, "completions/clipped_ratio": 0.30208333333333337, "completions/max_length": 3072.0, "completions/max_terminated_length": 2389.0, "completions/mean_length": 1271.3333740234375, "completions/mean_terminated_length": 491.9402770996094, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.6186666666666667, "grad_norm": 0.8476704932330884, "learning_rate": 1.1483373566863454e-06, "loss": -0.0035, "num_tokens": 18812710.0, "reward": 0.3984375, "reward_std": 0.20297470688819885, "rewards/accuracy_reward/mean": 0.3984375, "rewards/accuracy_reward/std": 0.49021512269973755, "step": 580 }, { "clip_ratio/high_max": 0.005480648097363883, "clip_ratio/high_mean": 0.0019806388021606836, "clip_ratio/low_mean": 0.10244042027788965, "clip_ratio/low_min": 0.004672814256218771, "clip_ratio/region_mean": 0.10442105827642081, "epoch": 0.624, "grad_norm": 0.7987440331085414, "learning_rate": 1.1212288289505494e-06, "loss": -0.0049, "step": 585 }, { "clip_ratio/high_max": 0.009626721819404337, "clip_ratio/high_mean": 0.0034381008950049364, "clip_ratio/low_mean": 0.07517182499250338, "clip_ratio/low_min": 0.004697832791589462, "clip_ratio/region_mean": 0.07860992621517653, "epoch": 0.6293333333333333, "grad_norm": 0.49539238928761664, "learning_rate": 1.0942518082405401e-06, "loss": -0.0062, "step": 590 }, { "clip_ratio/high_max": 0.001589591645006294, "clip_ratio/high_mean": 0.0006233605454099234, "clip_ratio/low_mean": 0.048352701233295645, "clip_ratio/low_min": 0.0016579951496169087, "clip_ratio/region_mean": 0.04897606225582649, "completions/clipped_ratio": 0.27864583333333337, "completions/max_length": 3072.0, "completions/max_terminated_length": 2747.0, "completions/mean_length": 1212.0208740234375, "completions/mean_terminated_length": 493.5451354980469, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.6346666666666667, "grad_norm": 0.611716803422045, "learning_rate": 1.0674156608123294e-06, "loss": -0.0034, "num_tokens": 19328853.0, "reward": 0.4036458432674408, "reward_std": 0.18042194843292236, "rewards/accuracy_reward/mean": 0.4036458432674408, "rewards/accuracy_reward/std": 0.4912680983543396, "step": 595 }, { "clip_ratio/high_max": 0.0012258633300007205, "clip_ratio/high_mean": 0.0005046791741960988, "clip_ratio/low_mean": 0.08033577839350983, "clip_ratio/low_min": 0.00012443318773875945, "clip_ratio/region_mean": 0.08084045782293288, "epoch": 0.64, "grad_norm": 0.5583239263921691, "learning_rate": 1.040729704011591e-06, "loss": -0.0034, "step": 600 }, { "clip_ratio/high_max": 0.0022564839349797695, "clip_ratio/high_mean": 0.0009434024989332102, "clip_ratio/low_mean": 0.06254989956523786, "clip_ratio/low_min": 0.0002681825077161193, "clip_ratio/region_mean": 0.06349330240095696, "epoch": 0.6453333333333333, "grad_norm": 0.5069417607560092, "learning_rate": 1.0142032030387342e-06, "loss": -0.0043, "step": 605 }, { "clip_ratio/high_max": 0.0017286340487771667, "clip_ratio/high_mean": 0.0007177196230259142, "clip_ratio/low_mean": 0.040949530374132334, "clip_ratio/low_min": 0.00015590756374876945, "clip_ratio/region_mean": 0.04166725016498276, "completions/clipped_ratio": 0.26041666666666663, "completions/max_length": 3072.0, "completions/max_terminated_length": 1748.0, "completions/mean_length": 1140.2734375, "completions/mean_terminated_length": 460.0880126953125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.6506666666666666, "grad_norm": 1.1682234596979921, "learning_rate": 9.878453677320847e-07, "loss": -0.003, "num_tokens": 19817574.0, "reward": 0.3984375, "reward_std": 0.16689030826091766, "rewards/accuracy_reward/mean": 0.3984375, "rewards/accuracy_reward/std": 0.49021512269973755, "step": 610 }, { "clip_ratio/high_max": 0.0011543855312083906, "clip_ratio/high_mean": 0.0004275740739331013, "clip_ratio/low_mean": 0.058845913647905944, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.059273487424889026, "epoch": 0.656, "grad_norm": 0.6970362405571762, "learning_rate": 9.616653493702824e-07, "loss": -0.0031, "step": 615 }, { "clip_ratio/high_max": 0.002095131428768582, "clip_ratio/high_mean": 0.0007953824442438418, "clip_ratio/low_mean": 0.06663740906724343, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.06743279226384402, "epoch": 0.6613333333333333, "grad_norm": 0.6921576912781968, "learning_rate": 9.356722374950166e-07, "loss": -0.0041, "step": 620 }, { "clip_ratio/high_max": 0.00282628009099426, "clip_ratio/high_mean": 0.0010506490101761302, "clip_ratio/low_mean": 0.031559368208399975, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.032610017383285596, "completions/clipped_ratio": 0.24739583333333337, "completions/max_length": 3072.0, "completions/max_terminated_length": 1802.0, "completions/mean_length": 1080.783935546875, "completions/mean_terminated_length": 426.2318420410156, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.6666666666666666, "grad_norm": 1.821220094752535, "learning_rate": 9.098750567551911e-07, "loss": -0.0038, "num_tokens": 20283565.0, "reward": 0.5104166865348816, "reward_std": 0.18493251502513885, "rewards/accuracy_reward/mean": 0.5104166865348816, "rewards/accuracy_reward/std": 0.5005436539649963, "step": 625 }, { "clip_ratio/high_max": 0.0010341222390707117, "clip_ratio/high_mean": 0.0004294376700045177, "clip_ratio/low_mean": 0.0585436765749364, "clip_ratio/low_min": 0.006355780504964059, "clip_ratio/region_mean": 0.05897311505725611, "epoch": 0.672, "grad_norm": 0.8826474630356834, "learning_rate": 8.842827637736218e-07, "loss": -0.003, "step": 630 }, { "clip_ratio/high_max": 0.002425370116270642, "clip_ratio/high_mean": 0.0010064826603183973, "clip_ratio/low_mean": 0.06992020507777852, "clip_ratio/low_min": 0.008430524062168843, "clip_ratio/region_mean": 0.07092668748600772, "epoch": 0.6773333333333333, "grad_norm": 0.9463482705116764, "learning_rate": 8.589042440373532e-07, "loss": -0.0039, "step": 635 }, { "clip_ratio/high_max": 0.003191851576775662, "clip_ratio/high_mean": 0.001318646853815153, "clip_ratio/low_mean": 0.05740836009172199, "clip_ratio/low_min": 0.00506230170467461, "clip_ratio/region_mean": 0.058727006714525486, "epoch": 0.6826666666666666, "grad_norm": 0.515171272509066, "learning_rate": 8.337483088126709e-07, "loss": -0.0047, "step": 640 }, { "clip_ratio/high_max": 0.0008373875309189316, "clip_ratio/high_mean": 0.00035896416600280647, "clip_ratio/low_mean": 0.02070066980222691, "clip_ratio/low_min": 0.00161740982603078, "clip_ratio/region_mean": 0.02105963398807944, "completions/clipped_ratio": 0.22395833333333337, "completions/max_length": 3072.0, "completions/max_terminated_length": 1746.0, "completions/mean_length": 1025.549560546875, "completions/mean_terminated_length": 434.9631042480469, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.688, "grad_norm": 0.34725403489367546, "learning_rate": 8.088236920858835e-07, "loss": -0.0026, "num_tokens": 20728430.0, "reward": 0.4661458432674408, "reward_std": 0.22552745044231415, "rewards/accuracy_reward/mean": 0.4661458432674408, "rewards/accuracy_reward/std": 0.49950337409973145, "step": 645 }, { "clip_ratio/high_max": 0.0026493671264688603, "clip_ratio/high_mean": 0.0011288010596672393, "clip_ratio/low_mean": 0.10914739753725371, "clip_ratio/low_min": 0.012434507167381526, "clip_ratio/region_mean": 0.11027619704227618, "epoch": 0.6933333333333334, "grad_norm": 0.5254226822504331, "learning_rate": 7.841390475309386e-07, "loss": -0.0039, "step": 650 }, { "clip_ratio/high_max": 0.004173288791207597, "clip_ratio/high_mean": 0.0017493430073045602, "clip_ratio/low_mean": 0.06364794335840998, "clip_ratio/low_min": 0.007780027126136702, "clip_ratio/region_mean": 0.06539728601874231, "epoch": 0.6986666666666667, "grad_norm": 0.7123947232508249, "learning_rate": 7.59702945504917e-07, "loss": -0.0053, "step": 655 }, { "clip_ratio/high_max": 0.0012025066319438337, "clip_ratio/high_mean": 0.0004982200040558382, "clip_ratio/low_mean": 0.01394026193611353, "clip_ratio/low_min": 0.0007624211653819657, "clip_ratio/region_mean": 0.014438482204877801, "completions/clipped_ratio": 0.3046875, "completions/max_length": 3072.0, "completions/max_terminated_length": 2899.0, "completions/mean_length": 1293.7005615234375, "completions/mean_terminated_length": 514.4456787109375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.704, "grad_norm": 0.3251619640881739, "learning_rate": 7.355238700724594e-07, "loss": -0.0027, "num_tokens": 21278380.0, "reward": 0.3880208432674408, "reward_std": 0.15335866808891296, "rewards/accuracy_reward/mean": 0.3880208432674408, "rewards/accuracy_reward/std": 0.4879350960254669, "step": 660 }, { "clip_ratio/high_max": 0.0012561851983264205, "clip_ratio/high_mean": 0.0004854951525430806, "clip_ratio/low_mean": 0.07820944947252428, "clip_ratio/low_min": 9.649656149122166e-05, "clip_ratio/region_mean": 0.07869494465203389, "epoch": 0.7093333333333334, "grad_norm": 0.6152071628560003, "learning_rate": 7.116102160601505e-07, "loss": -0.0032, "step": 665 }, { "clip_ratio/high_max": 0.0019434796606219607, "clip_ratio/high_mean": 0.0007754357723570137, "clip_ratio/low_mean": 0.06723471198747574, "clip_ratio/low_min": 0.00021677564218407496, "clip_ratio/region_mean": 0.06801014809243497, "epoch": 0.7146666666666667, "grad_norm": 0.4906789679106144, "learning_rate": 6.879702861418883e-07, "loss": -0.0044, "step": 670 }, { "clip_ratio/high_max": 0.001219000763467193, "clip_ratio/high_mean": 0.0004845326840040798, "clip_ratio/low_mean": 0.013759543100900374, "clip_ratio/low_min": 0.00010400774017398362, "clip_ratio/region_mean": 0.014244076124691673, "completions/clipped_ratio": 0.234375, "completions/max_length": 3072.0, "completions/max_terminated_length": 2660.0, "completions/mean_length": 1076.5, "completions/mean_terminated_length": 465.63262939453125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.72, "grad_norm": 0.7785897780538891, "learning_rate": 6.646122879562435e-07, "loss": -0.0026, "num_tokens": 21744496.0, "reward": 0.4505208432674408, "reward_std": 0.18042194843292236, "rewards/accuracy_reward/mean": 0.45406824350357056, "rewards/accuracy_reward/std": 0.4985404908657074, "step": 675 }, { "clip_ratio/high_max": 0.0012042383884363517, "clip_ratio/high_mean": 0.00047244138136193214, "clip_ratio/low_mean": 0.0767459373753809, "clip_ratio/low_min": 5.86071237194119e-05, "clip_ratio/region_mean": 0.0772183783359651, "epoch": 0.7253333333333334, "grad_norm": 0.7074885547534585, "learning_rate": 6.415443312568216e-07, "loss": -0.0039, "step": 680 }, { "clip_ratio/high_max": 0.002111398743454629, "clip_ratio/high_mean": 0.0008562820210954669, "clip_ratio/low_mean": 0.09287415779326694, "clip_ratio/low_min": 0.00015205146664811763, "clip_ratio/region_mean": 0.09373043903433427, "epoch": 0.7306666666666667, "grad_norm": 0.3605356465742621, "learning_rate": 6.187744250966031e-07, "loss": -0.0045, "step": 685 }, { "clip_ratio/high_max": 0.0014742940112228097, "clip_ratio/high_mean": 0.0006147716106397639, "clip_ratio/low_mean": 0.021137604957175427, "clip_ratio/low_min": 0.00010634768914314918, "clip_ratio/region_mean": 0.021752376643758, "completions/clipped_ratio": 0.20052083333333337, "completions/max_length": 3072.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 984.9791870117188, "completions/mean_terminated_length": 461.5244140625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.736, "grad_norm": 0.7239634632417756, "learning_rate": 5.963104750472507e-07, "loss": -0.0032, "num_tokens": 22179119.0, "reward": 0.4192708432674408, "reward_std": 0.15335866808891296, "rewards/accuracy_reward/mean": 0.4192708432674408, "rewards/accuracy_reward/std": 0.4940834939479828, "step": 690 }, { "clip_ratio/high_max": 0.0010224827206002374, "clip_ratio/high_mean": 0.00040267573954224647, "clip_ratio/low_mean": 0.030504032543785798, "clip_ratio/low_min": 0.00010436699285492068, "clip_ratio/region_mean": 0.030906708135580628, "epoch": 0.7413333333333333, "grad_norm": 0.4617268770849545, "learning_rate": 5.741602804543429e-07, "loss": -0.0018, "step": 695 }, { "clip_ratio/high_max": 0.0016983807364795212, "clip_ratio/high_mean": 0.0006651983926758476, "clip_ratio/low_mean": 0.05762567343790579, "clip_ratio/low_min": 0.00031765277963131667, "clip_ratio/region_mean": 0.05829087174715823, "epoch": 0.7466666666666667, "grad_norm": 0.36076571609516594, "learning_rate": 5.52331531729491e-07, "loss": -0.0023, "step": 700 }, { "clip_ratio/high_max": 0.0018128980687833972, "clip_ratio/high_mean": 0.0006981478734815027, "clip_ratio/low_mean": 0.03221005024479382, "clip_ratio/low_min": 0.0003271876135841012, "clip_ratio/region_mean": 0.03290819797966833, "completions/clipped_ratio": 0.21875, "completions/max_length": 3072.0, "completions/max_terminated_length": 2962.0, "completions/mean_length": 1076.75, "completions/mean_terminated_length": 518.0800170898438, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.752, "grad_norm": 1.1543222697396576, "learning_rate": 5.308318076802728e-07, "loss": -0.0025, "num_tokens": 22646093.0, "reward": 0.4244791865348816, "reward_std": 0.12178482860326767, "rewards/accuracy_reward/mean": 0.4244791567325592, "rewards/accuracy_reward/std": 0.49490854144096375, "step": 705 }, { "clip_ratio/high_max": 0.0007661541466404742, "clip_ratio/high_mean": 0.0002683004994651128, "clip_ratio/low_mean": 0.013549502629757626, "clip_ratio/low_min": 1.699819058558205e-05, "clip_ratio/region_mean": 0.01381780310105114, "epoch": 0.7573333333333333, "grad_norm": 0.3767914653500927, "learning_rate": 5.096685728789175e-07, "loss": -0.0015, "step": 710 }, { "clip_ratio/high_max": 0.0015036351468552312, "clip_ratio/high_mean": 0.000541127158908239, "clip_ratio/low_mean": 0.044184694035539, "clip_ratio/low_min": 5.921465999563225e-05, "clip_ratio/region_mean": 0.04472582101734588, "epoch": 0.7626666666666667, "grad_norm": 0.3810685982445513, "learning_rate": 4.888491750706547e-07, "loss": -0.0021, "step": 715 }, { "clip_ratio/high_max": 0.002082085819165513, "clip_ratio/high_mean": 0.0007421550518301956, "clip_ratio/low_mean": 0.03593454270012444, "clip_ratio/low_min": 8.35444443509914e-05, "clip_ratio/region_mean": 0.0366766979762815, "epoch": 0.768, "grad_norm": 0.4864696401109391, "learning_rate": 4.6838084262261776e-07, "loss": -0.0026, "step": 720 }, { "clip_ratio/high_max": 0.0006529347094328842, "clip_ratio/high_mean": 0.00026508466303312164, "clip_ratio/low_mean": 0.03661034831209235, "clip_ratio/low_min": 0.007676141597039532, "clip_ratio/region_mean": 0.036875433040381725, "completions/clipped_ratio": 0.24739583333333337, "completions/max_length": 3072.0, "completions/max_terminated_length": 2299.0, "completions/mean_length": 1120.9818115234375, "completions/mean_terminated_length": 479.64361572265625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.7733333333333333, "grad_norm": 1.0961482806867873, "learning_rate": 4.4827068201420486e-07, "loss": -0.0014, "num_tokens": 23136280.0, "reward": 0.3671875, "reward_std": 0.17591141164302826, "rewards/accuracy_reward/mean": 0.3671875, "rewards/accuracy_reward/std": 0.48266708850860596, "step": 725 }, { "clip_ratio/high_max": 0.00140850802681598, "clip_ratio/high_mean": 0.000588609564510989, "clip_ratio/low_mean": 0.04263099752542985, "clip_ratio/low_min": 0.009426032151532126, "clip_ratio/region_mean": 0.043219607146193086, "epoch": 0.7786666666666666, "grad_norm": 1.1589885530535626, "learning_rate": 4.2852567536974705e-07, "loss": -0.0027, "step": 730 }, { "clip_ratio/high_max": 0.002268770144019072, "clip_ratio/high_mean": 0.0009492890193087078, "clip_ratio/low_mean": 0.047327609898593435, "clip_ratio/low_min": 0.011026294236944522, "clip_ratio/region_mean": 0.04827689818457657, "epoch": 0.784, "grad_norm": 0.7510655129978862, "learning_rate": 4.0915267803436186e-07, "loss": -0.0034, "step": 735 }, { "clip_ratio/high_max": 0.0009182047278954997, "clip_ratio/high_mean": 0.00038086536630999035, "clip_ratio/low_mean": 0.01649121186649154, "clip_ratio/low_min": 0.0037041814470285318, "clip_ratio/region_mean": 0.01687207677352944, "completions/clipped_ratio": 0.20833333333333337, "completions/max_length": 3072.0, "completions/max_terminated_length": 2915.0, "completions/mean_length": 1064.4193115234375, "completions/mean_terminated_length": 536.1085815429688, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.7893333333333333, "grad_norm": 0.6443483444126676, "learning_rate": 3.901584161938172e-07, "loss": -0.0013, "num_tokens": 23602359.0, "reward": 0.46875, "reward_std": 0.17591141164302826, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.4996735155582428, "step": 740 }, { "clip_ratio/high_max": 0.0011816810975233238, "clip_ratio/high_mean": 0.00046524005733772356, "clip_ratio/low_mean": 0.0509467643731341, "clip_ratio/low_min": 0.001924342624261044, "clip_ratio/region_mean": 0.05141200426730848, "epoch": 0.7946666666666666, "grad_norm": 0.7836259858913197, "learning_rate": 3.715494845392418e-07, "loss": -0.0028, "step": 745 }, { "clip_ratio/high_max": 0.002083228582478114, "clip_ratio/high_mean": 0.0008276408498204546, "clip_ratio/low_mean": 0.030734749596513213, "clip_ratio/low_min": 0.0026941196778352606, "clip_ratio/region_mean": 0.0315623906170913, "epoch": 0.8, "grad_norm": 0.879199141290654, "learning_rate": 3.5333234397748987e-07, "loss": -0.0038, "step": 750 }, { "clip_ratio/high_max": 0.0013101882731461957, "clip_ratio/high_mean": 0.0005228189069839573, "clip_ratio/low_mean": 0.012684578620974208, "clip_ratio/low_min": 0.0011246469422076188, "clip_ratio/region_mean": 0.013207397388100617, "completions/clipped_ratio": 0.2265625, "completions/max_length": 3072.0, "completions/max_terminated_length": 2565.0, "completions/mean_length": 1091.4349365234375, "completions/mean_terminated_length": 511.26934814453125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.8053333333333333, "grad_norm": 0.4166627345899704, "learning_rate": 3.3551331938795246e-07, "loss": -0.002, "num_tokens": 24074345.0, "reward": 0.3697916865348816, "reward_std": 0.16237977147102356, "rewards/accuracy_reward/mean": 0.3727034032344818, "rewards/accuracy_reward/std": 0.4841598868370056, "step": 755 }, { "clip_ratio/high_max": 0.0009507575167845061, "clip_ratio/high_mean": 0.00038859400003730114, "clip_ratio/low_mean": 0.03011121982833629, "clip_ratio/low_min": 0.00019113730540993857, "clip_ratio/region_mean": 0.030499813807000464, "epoch": 0.8106666666666666, "grad_norm": 0.6932440311596509, "learning_rate": 3.1809859742659784e-07, "loss": -0.0015, "step": 760 }, { "clip_ratio/high_max": 0.002418382237192418, "clip_ratio/high_mean": 0.0009913336721183441, "clip_ratio/low_mean": 0.022852651399625758, "clip_ratio/low_min": 0.000419907653122209, "clip_ratio/region_mean": 0.023843985077292018, "epoch": 0.816, "grad_norm": 0.4499061325669949, "learning_rate": 3.0109422437800415e-07, "loss": -0.0028, "step": 765 }, { "clip_ratio/high_max": 0.005142432304637623, "clip_ratio/high_mean": 0.0018760693987815102, "clip_ratio/low_mean": 0.0046391781896772954, "clip_ratio/low_min": 0.00040681727914488875, "clip_ratio/region_mean": 0.006515247495531185, "completions/clipped_ratio": 0.265625, "completions/max_length": 3072.0, "completions/max_terminated_length": 3058.0, "completions/mean_length": 1178.190185546875, "completions/mean_terminated_length": 493.19500732421875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.8213333333333334, "grad_norm": 1.4523122298175635, "learning_rate": 2.8450610405612504e-07, "loss": -0.002, "num_tokens": 24579381.0, "reward": 0.4427083432674408, "reward_std": 0.16237977147102356, "rewards/accuracy_reward/mean": 0.4427083432674408, "rewards/accuracy_reward/std": 0.49735480546951294, "step": 770 }, { "clip_ratio/high_max": 0.0005670051381457597, "clip_ratio/high_mean": 0.00025447968607750227, "clip_ratio/low_mean": 0.03032345966803405, "clip_ratio/low_min": 0.0018923958687082632, "clip_ratio/region_mean": 0.030577939788713593, "epoch": 0.8266666666666667, "grad_norm": 0.6916537056842649, "learning_rate": 2.6833999575452256e-07, "loss": -0.0029, "step": 775 }, { "clip_ratio/high_max": 0.0010563003924744408, "clip_ratio/high_mean": 0.0004932881325657945, "clip_ratio/low_mean": 0.05850951334614365, "clip_ratio/low_min": 0.00395097134023672, "clip_ratio/region_mean": 0.059002801402562, "epoch": 0.832, "grad_norm": 0.5136694603209685, "learning_rate": 2.526015122467751e-07, "loss": -0.0033, "step": 780 }, { "clip_ratio/high_max": 0.00117076011165409, "clip_ratio/high_mean": 0.0005315767426054662, "clip_ratio/low_mean": 0.027362305362476035, "clip_ratio/low_min": 0.0016958194413746242, "clip_ratio/region_mean": 0.02789388249316289, "completions/clipped_ratio": 0.203125, "completions/max_length": 3072.0, "completions/max_terminated_length": 2450.0, "completions/mean_length": 1016.3333740234375, "completions/mean_terminated_length": 492.3398742675781, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.8373333333333334, "grad_norm": 1.511840478189623, "learning_rate": 2.372961178377585e-07, "loss": -0.0036, "num_tokens": 25022015.0, "reward": 0.4322916865348816, "reward_std": 0.18042194843292236, "rewards/accuracy_reward/mean": 0.4322916567325592, "rewards/accuracy_reward/std": 0.4960407018661499, "step": 785 }, { "clip_ratio/high_max": 0.0006613163771817199, "clip_ratio/high_mean": 0.00025725484051690727, "clip_ratio/low_mean": 0.007441935932615706, "clip_ratio/low_min": 4.295532562537119e-06, "clip_ratio/region_mean": 0.007699190771222675, "epoch": 0.8426666666666667, "grad_norm": 0.3218619567609056, "learning_rate": 2.2242912646647086e-07, "loss": -0.0013, "step": 790 }, { "clip_ratio/high_max": 0.0009924005279572158, "clip_ratio/high_mean": 0.0003942615909863889, "clip_ratio/low_mean": 0.041776556338777485, "clip_ratio/low_min": 6.872852318338119e-05, "clip_ratio/region_mean": 0.04217081745118776, "epoch": 0.848, "grad_norm": 0.6171485332039848, "learning_rate": 2.080056998610662e-07, "loss": -0.0025, "step": 795 }, { "clip_ratio/high_max": 0.0015996045062820485, "clip_ratio/high_mean": 0.0006319381845969474, "clip_ratio/low_mean": 0.044201145696251845, "clip_ratio/low_min": 7.302405429072678e-05, "clip_ratio/region_mean": 0.044833083586036084, "epoch": 0.8533333333333334, "grad_norm": 0.22073338816121932, "learning_rate": 1.9403084574673463e-07, "loss": -0.0032, "step": 800 }, { "clip_ratio/high_max": 0.0005480042753333691, "clip_ratio/high_mean": 0.00021893545331295173, "clip_ratio/low_mean": 0.00018941632918085817, "clip_ratio/low_min": 1.1344801168888808e-05, "clip_ratio/region_mean": 0.00040835177828739687, "completions/clipped_ratio": 0.21354166666666663, "completions/max_length": 3072.0, "completions/max_terminated_length": 3064.0, "completions/mean_length": 1176.796875, "completions/mean_terminated_length": 662.205322265625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.8586666666666667, "grad_norm": 0.9928063245696838, "learning_rate": 1.8050941610705053e-07, "loss": -0.0005, "num_tokens": 25527668.0, "reward": 0.421875, "reward_std": 0.19395360350608826, "rewards/accuracy_reward/mean": 0.421875, "rewards/accuracy_reward/std": 0.49450308084487915, "step": 805 }, { "clip_ratio/high_max": 0.0009670603182712512, "clip_ratio/high_mean": 0.0003949781339542824, "clip_ratio/low_mean": 0.01606227820625463, "clip_ratio/low_min": 0.0009993292473154725, "clip_ratio/region_mean": 0.016457256326202697, "epoch": 0.864, "grad_norm": 0.4069186615399281, "learning_rate": 1.6744610549939322e-07, "loss": -0.0037, "step": 810 }, { "clip_ratio/high_max": 0.0013157209751625486, "clip_ratio/high_mean": 0.000542721035071736, "clip_ratio/low_mean": 0.046080468874629334, "clip_ratio/low_min": 0.002244928914660704, "clip_ratio/region_mean": 0.046623189253273266, "epoch": 0.8693333333333333, "grad_norm": 0.469280708819203, "learning_rate": 1.5484544942502694e-07, "loss": -0.0048, "step": 815 }, { "clip_ratio/high_max": 0.0006278479527281888, "clip_ratio/high_mean": 0.0002539549294169774, "clip_ratio/low_mean": 0.012086308056041162, "clip_ratio/low_min": 0.000582591683632927, "clip_ratio/region_mean": 0.01234026287636425, "completions/clipped_ratio": 0.2109375, "completions/max_length": 3072.0, "completions/max_terminated_length": 3055.0, "completions/mean_length": 1143.8802490234375, "completions/mean_terminated_length": 628.4422607421875, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.8746666666666667, "grad_norm": 0.7885641135069424, "learning_rate": 1.4271182275440077e-07, "loss": -0.0014, "num_tokens": 26017921.0, "reward": 0.4296875, "reward_std": 0.16689030826091766, "rewards/accuracy_reward/mean": 0.4296875, "rewards/accuracy_reward/std": 0.49567729234695435, "step": 820 }, { "clip_ratio/high_max": 0.0006818332804868988, "clip_ratio/high_mean": 0.000264944165360248, "clip_ratio/low_mean": 0.0035559290565970515, "clip_ratio/low_min": 0.00023971155815161183, "clip_ratio/region_mean": 0.003820873308222872, "epoch": 0.88, "grad_norm": 0.3999132200719432, "learning_rate": 1.3104943820822195e-07, "loss": -0.0028, "step": 825 }, { "clip_ratio/high_max": 0.0008823359285088372, "clip_ratio/high_mean": 0.00033811768419127474, "clip_ratio/low_mean": 0.03006819811771493, "clip_ratio/low_min": 0.0024617789538751824, "clip_ratio/region_mean": 0.03040631610085711, "epoch": 0.8853333333333333, "grad_norm": 0.32461059649804497, "learning_rate": 1.19862344894824e-07, "loss": -0.0042, "step": 830 }, { "clip_ratio/high_max": 0.0005540510203900339, "clip_ratio/high_mean": 0.00022197982166289877, "clip_ratio/low_mean": 0.019960488699052804, "clip_ratio/low_min": 0.0017120031339800334, "clip_ratio/region_mean": 0.02018246848244871, "completions/clipped_ratio": 0.1875, "completions/max_length": 3072.0, "completions/max_terminated_length": 3015.0, "completions/mean_length": 1062.197998046875, "completions/mean_terminated_length": 598.3974609375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.8906666666666667, "grad_norm": 0.834530476888312, "learning_rate": 1.0915442690434158e-07, "loss": -0.0019, "num_tokens": 26481173.0, "reward": 0.4036458432674408, "reward_std": 0.15335866808891296, "rewards/accuracy_reward/mean": 0.4036458432674408, "rewards/accuracy_reward/std": 0.4912680983543396, "step": 835 }, { "clip_ratio/high_max": 0.000518893538901466, "clip_ratio/high_mean": 0.00019955981156272173, "clip_ratio/low_mean": 0.003260935986622826, "clip_ratio/low_min": 0.00030751415179111066, "clip_ratio/region_mean": 0.0034604957777446543, "epoch": 0.896, "grad_norm": 0.37603261720091047, "learning_rate": 9.89294019601783e-08, "loss": -0.0016, "step": 840 }, { "clip_ratio/high_max": 0.000798197544827417, "clip_ratio/high_mean": 0.00031140441062689207, "clip_ratio/low_mean": 0.019426193800200053, "clip_ratio/low_min": 0.0009656459093093872, "clip_ratio/region_mean": 0.01973759806523958, "epoch": 0.9013333333333333, "grad_norm": 0.27285501007195917, "learning_rate": 8.919082012823675e-08, "loss": -0.0027, "step": 845 }, { "clip_ratio/high_max": 0.0007069298061651352, "clip_ratio/high_mean": 0.00027325062021645865, "clip_ratio/low_mean": 0.020689685308434493, "clip_ratio/low_min": 0.000919325789436698, "clip_ratio/region_mean": 0.020962935676857343, "completions/clipped_ratio": 0.11458333333333337, "completions/max_length": 3072.0, "completions/max_terminated_length": 3011.0, "completions/mean_length": 936.3463745117188, "completions/mean_terminated_length": 659.9676513671875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.9066666666666666, "grad_norm": 0.562379178852045, "learning_rate": 7.994206258435576e-08, "loss": -0.0017, "num_tokens": 26901027.0, "reward": 0.4244791865348816, "reward_std": 0.19395360350608826, "rewards/accuracy_reward/mean": 0.4244791567325592, "rewards/accuracy_reward/std": 0.49490854144096375, "step": 850 }, { "clip_ratio/high_max": 0.0009260432932933326, "clip_ratio/high_mean": 0.00036914601573698744, "clip_ratio/low_mean": 0.00026559712919151936, "clip_ratio/low_min": 9.624268386687618e-07, "clip_ratio/region_mean": 0.000634743145928951, "epoch": 0.912, "grad_norm": 0.48181915914534645, "learning_rate": 7.118634044038774e-08, "loss": -0.0, "step": 855 }, { "clip_ratio/high_max": 0.0010694809024244024, "clip_ratio/high_mean": 0.0004341576189744956, "clip_ratio/low_mean": 0.000362517843200294, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0007966754636299811, "epoch": 0.9173333333333333, "grad_norm": 0.4011919703056612, "learning_rate": 6.292669362932102e-08, "loss": -0.0009, "step": 860 }, { "clip_ratio/high_max": 0.000974411667084496, "clip_ratio/high_mean": 0.00039414916211626404, "clip_ratio/low_mean": 0.0003663906626343305, "clip_ratio/low_min": 1.668322465775418e-06, "clip_ratio/region_mean": 0.0007605398319583401, "completions/clipped_ratio": 0.15104166666666663, "completions/max_length": 3072.0, "completions/max_terminated_length": 3053.0, "completions/mean_length": 981.7994995117188, "completions/mean_terminated_length": 609.9232788085938, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.9226666666666666, "grad_norm": 0.5640021538071135, "learning_rate": 5.516598984983279e-08, "loss": -0.0013, "num_tokens": 27327052.0, "reward": 0.4895833432674408, "reward_std": 0.18042194843292236, "rewards/accuracy_reward/mean": 0.4895833432674408, "rewards/accuracy_reward/std": 0.5005436539649963, "step": 865 }, { "clip_ratio/high_max": 0.0008023715844956314, "clip_ratio/high_mean": 0.0003100335631870621, "clip_ratio/low_mean": 0.0001792538990457615, "clip_ratio/low_min": 7.89223486208357e-06, "clip_ratio/region_mean": 0.0004892874651432066, "epoch": 0.928, "grad_norm": 0.5280427941792062, "learning_rate": 4.7906923570641695e-08, "loss": 0.0001, "step": 870 }, { "clip_ratio/high_max": 0.0007951498469992657, "clip_ratio/high_mean": 0.00031492638240706585, "clip_ratio/low_mean": 0.00025046017171916903, "clip_ratio/low_min": 1.3413453052635304e-05, "clip_ratio/region_mean": 0.0005653865586509709, "epoch": 0.9333333333333333, "grad_norm": 0.4545073012340712, "learning_rate": 4.115201509500582e-08, "loss": -0.0007, "step": 875 }, { "clip_ratio/high_max": 0.000962319417976687, "clip_ratio/high_mean": 0.00038963923243500175, "clip_ratio/low_mean": 0.00040830461132372874, "clip_ratio/low_min": 2.1305687914718875e-05, "clip_ratio/region_mean": 0.0007979438438724173, "epoch": 0.9386666666666666, "grad_norm": 0.4059957362785215, "learning_rate": 3.490360968568801e-08, "loss": -0.0014, "step": 880 }, { "clip_ratio/high_max": 0.0005873541019354889, "clip_ratio/high_mean": 0.00023833751642996504, "clip_ratio/low_mean": 0.00012138135189161403, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00035971886757124594, "completions/clipped_ratio": 0.1796875, "completions/max_length": 3072.0, "completions/max_terminated_length": 2990.0, "completions/mean_length": 1055.5546875, "completions/mean_terminated_length": 613.857177734375, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.944, "grad_norm": 0.8056897542766063, "learning_rate": 2.9163876750694986e-08, "loss": 0.0002, "num_tokens": 27783769.0, "reward": 0.4713541865348816, "reward_std": 0.19395360350608826, "rewards/accuracy_reward/mean": 0.4713541567325592, "rewards/accuracy_reward/std": 0.4998299777507782, "step": 885 }, { "clip_ratio/high_max": 0.000736053941091086, "clip_ratio/high_mean": 0.0003010868134879274, "clip_ratio/low_mean": 0.00019030355974791746, "clip_ratio/low_min": 4.070645263709593e-06, "clip_ratio/region_mean": 0.0004913903652095542, "epoch": 0.9493333333333334, "grad_norm": 0.5961867251741968, "learning_rate": 2.393480909007306e-08, "loss": -0.0003, "step": 890 }, { "clip_ratio/high_max": 0.0007939469470329641, "clip_ratio/high_mean": 0.00032171402617677816, "clip_ratio/low_mean": 0.00027457223243345654, "clip_ratio/low_min": 5.998684719088487e-06, "clip_ratio/region_mean": 0.0005962862574733663, "epoch": 0.9546666666666667, "grad_norm": 0.5404007766153449, "learning_rate": 1.9218222204019087e-08, "loss": -0.0009, "step": 895 }, { "clip_ratio/high_max": 0.0005185732868540072, "clip_ratio/high_mean": 0.00021296636309671159, "clip_ratio/low_mean": 0.0001345524555972588, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003475188213769798, "completions/clipped_ratio": 0.1953125, "completions/max_length": 3072.0, "completions/max_terminated_length": 2975.0, "completions/mean_length": 1077.4896240234375, "completions/mean_terminated_length": 593.3851318359375, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.96, "grad_norm": 0.8424493369425556, "learning_rate": 1.5015753662550813e-08, "loss": -0.0, "num_tokens": 28250181.0, "reward": 0.4505208432674408, "reward_std": 0.19846415519714355, "rewards/accuracy_reward/mean": 0.4505208432674408, "rewards/accuracy_reward/std": 0.4981948733329773, "step": 900 }, { "clip_ratio/high_max": 0.0006499291373074812, "clip_ratio/high_mean": 0.00026218581494958927, "clip_ratio/low_mean": 0.00013294762566147257, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003951334421344654, "epoch": 0.9653333333333334, "grad_norm": 0.6876019087734823, "learning_rate": 1.1328862536952033e-08, "loss": -0.0001, "step": 905 }, { "clip_ratio/high_max": 0.0007533465448432252, "clip_ratio/high_mean": 0.00030139338468870847, "clip_ratio/low_mean": 0.00016263060861092525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004640239933905832, "epoch": 0.9706666666666667, "grad_norm": 0.6253607085539371, "learning_rate": 8.158828893192471e-09, "loss": -0.0005, "step": 910 }, { "clip_ratio/high_max": 0.0006076685126572556, "clip_ratio/high_mean": 0.0002456966022919005, "clip_ratio/low_mean": 0.00013469792866089848, "clip_ratio/low_min": 8.737592906982172e-07, "clip_ratio/region_mean": 0.000380394533613071, "completions/clipped_ratio": 0.1640625, "completions/max_length": 3072.0, "completions/max_terminated_length": 2997.0, "completions/mean_length": 1020.3046875, "completions/mean_terminated_length": 617.635498046875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.976, "grad_norm": 0.5084297427864617, "learning_rate": 5.506753347496285e-09, "loss": -0.0001, "num_tokens": 28696392.0, "reward": 0.4088541865348816, "reward_std": 0.17140087485313416, "rewards/accuracy_reward/mean": 0.4088541567325592, "rewards/accuracy_reward/std": 0.4922636151313782, "step": 915 }, { "clip_ratio/high_max": 0.0008260932669145405, "clip_ratio/high_mean": 0.00031006573801732883, "clip_ratio/low_mean": 0.00016273604537673237, "clip_ratio/low_min": 6.924216359038838e-06, "clip_ratio/region_mean": 0.000472801781029375, "epoch": 0.9813333333333333, "grad_norm": 0.459028245342396, "learning_rate": 3.37355668421524e-09, "loss": 0.0004, "step": 920 }, { "clip_ratio/high_max": 0.0008105833506306225, "clip_ratio/high_mean": 0.00030842817943721457, "clip_ratio/low_mean": 0.000173703337736697, "clip_ratio/low_min": 7.1027836384018885e-06, "clip_ratio/region_mean": 0.000482131515468609, "epoch": 0.9866666666666667, "grad_norm": 0.4788814954808769, "learning_rate": 1.7599795361376015e-09, "loss": 0.0003, "step": 925 }, { "clip_ratio/high_max": 0.0007806502030689444, "clip_ratio/high_mean": 0.00028658854591867566, "clip_ratio/low_mean": 0.00016420157002509464, "clip_ratio/low_min": 1.5357820757344598e-05, "clip_ratio/region_mean": 0.0004507901177248641, "epoch": 0.9898666666666667, "step": 928, "total_flos": 0.0, "train_loss": 0.044432367682234554, "train_runtime": 19721.3907, "train_samples_per_second": 0.38, "train_steps_per_second": 0.048 } ], "logging_steps": 5, "max_steps": 937, "num_input_tokens_seen": 28696392, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }