{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9898666666666667, "eval_steps": 500, "global_step": 928, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01041666666666663, "completions/max_length": 3072.0, "completions/max_terminated_length": 2141.0, "completions/mean_length": 582.8854370117188, "completions/mean_terminated_length": 556.6842041015625, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.0010666666666666667, "grad_norm": 0.20422078818921574, "learning_rate": 0.0, "loss": 0.0, "num_tokens": 277768.0, "reward": 0.5208333730697632, "reward_std": 0.19846415519714355, "rewards/accuracy_reward/mean": 0.5208333134651184, "rewards/accuracy_reward/std": 0.5002174973487854, "step": 1 }, { "clip_ratio/high_max": 0.0007770849975941019, "clip_ratio/high_mean": 0.0003215046778564101, "clip_ratio/low_mean": 0.00011664532638633318, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004381500074828182, "epoch": 0.005333333333333333, "grad_norm": 0.2505908398291977, "learning_rate": 1.276595744680851e-07, "loss": 0.0005, "step": 5 }, { "clip_ratio/high_max": 0.0009432019895939447, "clip_ratio/high_mean": 0.0003980772502472973, "clip_ratio/low_mean": 0.0001918027855026594, "clip_ratio/low_min": 1.4629412362410221e-05, "clip_ratio/region_mean": 0.0005898800357954315, "epoch": 0.010666666666666666, "grad_norm": 0.21816153146040967, "learning_rate": 2.872340425531915e-07, "loss": 0.0006, "step": 10 }, { "clip_ratio/high_max": 0.001069825322520046, "clip_ratio/high_mean": 0.00044034483889845435, "clip_ratio/low_mean": 0.00027804817609649034, "clip_ratio/low_min": 2.929462971223984e-05, "clip_ratio/region_mean": 0.0007183930117662385, "epoch": 0.016, "grad_norm": 0.1803387135769063, "learning_rate": 4.468085106382979e-07, "loss": 0.0005, "step": 15 }, { "clip_ratio/high_max": 0.0008536569416719431, "clip_ratio/high_mean": 0.0003719475469097233, "clip_ratio/low_mean": 0.00027786920049948093, "clip_ratio/low_min": 2.4978651890705805e-05, "clip_ratio/region_mean": 0.000649816743680276, "completions/clipped_ratio": 0.0026041666666666297, "completions/max_length": 3072.0, "completions/max_terminated_length": 1857.0, "completions/mean_length": 586.8568115234375, "completions/mean_terminated_length": 580.3681640625, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.021333333333333333, "grad_norm": 0.20716971526126013, "learning_rate": 6.063829787234043e-07, "loss": 0.0004, "num_tokens": 556122.0, "reward": 0.4817708432674408, "reward_std": 0.21199578046798706, "rewards/accuracy_reward/mean": 0.4817708432674408, "rewards/accuracy_reward/std": 0.5003194212913513, "step": 20 }, { "clip_ratio/high_max": 0.0012744583305902778, "clip_ratio/high_mean": 0.0005582290830261627, "clip_ratio/low_mean": 0.00041245166776207045, "clip_ratio/low_min": 3.138680403935723e-05, "clip_ratio/region_mean": 0.0009706807515613037, "epoch": 0.02666666666666667, "grad_norm": 0.1418023896022033, "learning_rate": 7.659574468085106e-07, "loss": 0.0004, "step": 25 }, { "clip_ratio/high_max": 0.00220621647858934, "clip_ratio/high_mean": 0.0009707116578283603, "clip_ratio/low_mean": 0.0007946801438720285, "clip_ratio/low_min": 4.932650317641673e-05, "clip_ratio/region_mean": 0.0017653918050655193, "epoch": 0.032, "grad_norm": 0.12799174873341806, "learning_rate": 9.25531914893617e-07, "loss": -0.0003, "step": 30 }, { "clip_ratio/high_max": 0.0013123282908054535, "clip_ratio/high_mean": 0.0005811084596643923, "clip_ratio/low_mean": 0.0005872274301054858, "clip_ratio/low_min": 4.0462239485350435e-05, "clip_ratio/region_mean": 0.001168335893044059, "completions/clipped_ratio": 0.0078125, "completions/max_length": 3072.0, "completions/max_terminated_length": 2070.0, "completions/mean_length": 609.359375, "completions/mean_terminated_length": 589.968505859375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.037333333333333336, "grad_norm": 0.17689653918151418, "learning_rate": 1.0851063829787236e-06, "loss": -0.0001, "num_tokens": 843528.0, "reward": 0.4791666865348816, "reward_std": 0.18493250012397766, "rewards/accuracy_reward/mean": 0.4791666567325592, "rewards/accuracy_reward/std": 0.5002174973487854, "step": 35 }, { "clip_ratio/high_max": 0.0013572285231930436, "clip_ratio/high_mean": 0.0005922957489474356, "clip_ratio/low_mean": 0.0006135863384315599, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0012058820880611165, "epoch": 0.042666666666666665, "grad_norm": 0.12004654267996381, "learning_rate": 1.2446808510638299e-06, "loss": -0.0001, "step": 40 }, { "clip_ratio/high_max": 0.0034562830125651088, "clip_ratio/high_mean": 0.0015175073116552086, "clip_ratio/low_mean": 0.001512340485032837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0030298478131953743, "epoch": 0.048, "grad_norm": 0.12119808071569016, "learning_rate": 1.4042553191489362e-06, "loss": -0.001, "step": 45 }, { "clip_ratio/high_max": 0.003685563067847397, "clip_ratio/high_mean": 0.0016141347007760487, "clip_ratio/low_mean": 0.0017511527281840245, "clip_ratio/low_min": 3.322700649732724e-06, "clip_ratio/region_mean": 0.0033652874315521332, "completions/clipped_ratio": 0.0078125, "completions/max_length": 3072.0, "completions/max_terminated_length": 2637.0, "completions/mean_length": 569.2265625, "completions/mean_terminated_length": 549.5196533203125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.05333333333333334, "grad_norm": 0.18996260394758427, "learning_rate": 1.5638297872340427e-06, "loss": -0.0008, "num_tokens": 1116063.0, "reward": 0.5416666865348816, "reward_std": 0.23003801703453064, "rewards/accuracy_reward/mean": 0.5416666865348816, "rewards/accuracy_reward/std": 0.49891090393066406, "step": 50 }, { "clip_ratio/high_max": 0.0021796487155370413, "clip_ratio/high_mean": 0.0010463413024808688, "clip_ratio/low_mean": 0.0010470149699358445, "clip_ratio/low_min": 0.0001324586268310668, "clip_ratio/region_mean": 0.0020933562766913385, "epoch": 0.058666666666666666, "grad_norm": 0.1303487704568297, "learning_rate": 1.723404255319149e-06, "loss": -0.0003, "step": 55 }, { "clip_ratio/high_max": 0.006383144167921273, "clip_ratio/high_mean": 0.0031119083997509732, "clip_ratio/low_mean": 0.0033932545189600204, "clip_ratio/low_min": 0.0006400024649337865, "clip_ratio/region_mean": 0.006505162901521544, "epoch": 0.064, "grad_norm": 0.1207265479380106, "learning_rate": 1.8829787234042552e-06, "loss": -0.0015, "step": 60 }, { "clip_ratio/high_max": 0.0079313643007481, "clip_ratio/high_mean": 0.004010664290763089, "clip_ratio/low_mean": 0.004932637583260657, "clip_ratio/low_min": 0.001034680861630477, "clip_ratio/region_mean": 0.008943301859108032, "completions/clipped_ratio": 0.0078125, "completions/max_length": 3072.0, "completions/max_terminated_length": 3014.0, "completions/mean_length": 589.0521240234375, "completions/mean_terminated_length": 569.5012817382812, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.06933333333333333, "grad_norm": 0.14929742906369683, "learning_rate": 2.0425531914893617e-06, "loss": -0.0018, "num_tokens": 1395296.0, "reward": 0.5598958730697632, "reward_std": 0.18493250012397766, "rewards/accuracy_reward/mean": 0.5598958134651184, "rewards/accuracy_reward/std": 0.49704709649086, "step": 65 }, { "clip_ratio/high_max": 0.002393845061487809, "clip_ratio/high_mean": 0.001054112742349389, "clip_ratio/low_mean": 0.0008798149027370528, "clip_ratio/low_min": 6.669599752058275e-05, "clip_ratio/region_mean": 0.0019339276432219776, "epoch": 0.07466666666666667, "grad_norm": 0.12256898323965673, "learning_rate": 2.202127659574468e-06, "loss": -0.0003, "step": 70 }, { "clip_ratio/high_max": 0.007852430811180967, "clip_ratio/high_mean": 0.0035389320792091894, "clip_ratio/low_mean": 0.0036253775208024306, "clip_ratio/low_min": 0.0004405311545269797, "clip_ratio/region_mean": 0.0071643095470790286, "epoch": 0.08, "grad_norm": 0.11736574792478119, "learning_rate": 2.3617021276595748e-06, "loss": -0.0015, "step": 75 }, { "clip_ratio/high_max": 0.012781611752870958, "clip_ratio/high_mean": 0.00571825887018349, "clip_ratio/low_mean": 0.00634475286315137, "clip_ratio/low_min": 0.0010415775468572974, "clip_ratio/region_mean": 0.012063011784630363, "epoch": 0.08533333333333333, "grad_norm": 0.11354460859341908, "learning_rate": 2.521276595744681e-06, "loss": -0.0022, "step": 80 }, { "clip_ratio/high_max": 0.00263888179888454, "clip_ratio/high_mean": 0.0013344709606826655, "clip_ratio/low_mean": 0.0010649578884112999, "clip_ratio/low_min": 0.00021321549320418853, "clip_ratio/region_mean": 0.002399428844728391, "completions/clipped_ratio": 0.01302083333333337, "completions/max_length": 3072.0, "completions/max_terminated_length": 2549.0, "completions/mean_length": 589.546875, "completions/mean_terminated_length": 556.796875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.09066666666666667, "grad_norm": 0.25219952568991916, "learning_rate": 2.680851063829787e-06, "loss": -0.0003, "num_tokens": 1675103.0, "reward": 0.5416667461395264, "reward_std": 0.26161181926727295, "rewards/accuracy_reward/mean": 0.5416666865348816, "rewards/accuracy_reward/std": 0.49891090393066406, "step": 85 }, { "clip_ratio/high_max": 0.00987106283500907, "clip_ratio/high_mean": 0.00533320542817819, "clip_ratio/low_mean": 0.005614280872759991, "clip_ratio/low_min": 0.0013633494694659022, "clip_ratio/region_mean": 0.01094748623909254, "epoch": 0.096, "grad_norm": 0.15625358446159077, "learning_rate": 2.8404255319148938e-06, "loss": -0.002, "step": 90 }, { "clip_ratio/high_max": 0.01515841492509935, "clip_ratio/high_mean": 0.008317556266410975, "clip_ratio/low_mean": 0.009814491481665754, "clip_ratio/low_min": 0.0026495314290514217, "clip_ratio/region_mean": 0.018132047734980006, "epoch": 0.10133333333333333, "grad_norm": 0.1862259859600978, "learning_rate": 3e-06, "loss": -0.003, "step": 95 }, { "clip_ratio/high_max": 0.004873402533303306, "clip_ratio/high_mean": 0.002473862611577715, "clip_ratio/low_mean": 0.0029152705660635547, "clip_ratio/low_min": 0.0006941740171896527, "clip_ratio/region_mean": 0.005389133182143268, "completions/clipped_ratio": 0.0, "completions/max_length": 2971.0, "completions/max_terminated_length": 2971.0, "completions/mean_length": 538.2864990234375, "completions/mean_terminated_length": 538.2864990234375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.10666666666666667, "grad_norm": 0.13197221632587225, "learning_rate": 2.999739604603311e-06, "loss": -0.0009, "num_tokens": 1935277.0, "reward": 0.53125, "reward_std": 0.18042194843292236, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.4996735155582428, "step": 100 }, { "clip_ratio/high_max": 0.00826658782461891, "clip_ratio/high_mean": 0.003611915481633332, "clip_ratio/low_mean": 0.00472188308594923, "clip_ratio/low_min": 0.0005986868112813681, "clip_ratio/region_mean": 0.00833379861614958, "epoch": 0.112, "grad_norm": 0.21363603048601443, "learning_rate": 2.9989585088209272e-06, "loss": -0.0016, "step": 105 }, { "clip_ratio/high_max": 0.01345936935977079, "clip_ratio/high_mean": 0.00586280180614267, "clip_ratio/low_mean": 0.009280941534234444, "clip_ratio/low_min": 0.001012265373719856, "clip_ratio/region_mean": 0.015143743233056738, "epoch": 0.11733333333333333, "grad_norm": 0.12674305107187028, "learning_rate": 2.9976569838445097e-06, "loss": -0.0025, "step": 110 }, { "clip_ratio/high_max": 0.006462734596971132, "clip_ratio/high_mean": 0.0028225505014233933, "clip_ratio/low_mean": 0.004892997997421844, "clip_ratio/low_min": 0.0004995303239411442, "clip_ratio/region_mean": 0.007715548440592102, "completions/clipped_ratio": 0.01041666666666663, "completions/max_length": 3072.0, "completions/max_terminated_length": 2887.0, "completions/mean_length": 617.2708740234375, "completions/mean_terminated_length": 591.4315795898438, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.12266666666666666, "grad_norm": 0.1551976110248513, "learning_rate": 2.9958354815555427e-06, "loss": -0.0012, "num_tokens": 2224455.0, "reward": 0.4765625, "reward_std": 0.23905909061431885, "rewards/accuracy_reward/mean": 0.4765625, "rewards/accuracy_reward/std": 0.5001019835472107, "step": 115 }, { "clip_ratio/high_max": 0.006417330503245467, "clip_ratio/high_mean": 0.0030476127185465884, "clip_ratio/low_mean": 0.004166808969512203, "clip_ratio/low_min": 0.00021772524451080244, "clip_ratio/region_mean": 0.007214421745993604, "epoch": 0.128, "grad_norm": 0.13207036484949483, "learning_rate": 2.9934946343684403e-06, "loss": -0.0015, "step": 120 }, { "clip_ratio/high_max": 0.012147787163848988, "clip_ratio/high_mean": 0.005797360813085106, "clip_ratio/low_mean": 0.009507861722977396, "clip_ratio/low_min": 0.0005253114592051134, "clip_ratio/region_mean": 0.015305222445022082, "epoch": 0.13333333333333333, "grad_norm": 0.12310744686911018, "learning_rate": 2.9906352550109787e-06, "loss": -0.0027, "step": 125 }, { "clip_ratio/high_max": 0.008791072150233958, "clip_ratio/high_mean": 0.004209827898375806, "clip_ratio/low_mean": 0.007330978753225281, "clip_ratio/low_min": 0.0004689814173616469, "clip_ratio/region_mean": 0.011540806826269545, "completions/clipped_ratio": 0.0078125, "completions/max_length": 3072.0, "completions/max_terminated_length": 2325.0, "completions/mean_length": 636.7786865234375, "completions/mean_terminated_length": 617.6036987304688, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.13866666666666666, "grad_norm": 0.16057741798072203, "learning_rate": 2.9872583362421204e-06, "loss": -0.0019, "num_tokens": 2525567.0, "reward": 0.4401041865348816, "reward_std": 0.22552743554115295, "rewards/accuracy_reward/mean": 0.4401041567325592, "rewards/accuracy_reward/std": 0.49704715609550476, "step": 130 }, { "clip_ratio/high_max": 0.00555340334540233, "clip_ratio/high_mean": 0.002636409461410949, "clip_ratio/low_mean": 0.002953379235077591, "clip_ratio/low_min": 0.0004720784712844761, "clip_ratio/region_mean": 0.005589788664110529, "epoch": 0.144, "grad_norm": 0.1453811494026259, "learning_rate": 2.983365050507336e-06, "loss": -0.0014, "step": 135 }, { "clip_ratio/high_max": 0.01339928732631961, "clip_ratio/high_mean": 0.006330215063280775, "clip_ratio/low_mean": 0.00879853989899857, "clip_ratio/low_min": 0.0014712307834997772, "clip_ratio/region_mean": 0.015128754943725653, "epoch": 0.14933333333333335, "grad_norm": 0.14793143721427504, "learning_rate": 2.978956749531536e-06, "loss": -0.0029, "step": 140 }, { "clip_ratio/high_max": 0.013523715882911347, "clip_ratio/high_mean": 0.006345920295279939, "clip_ratio/low_mean": 0.009827413122548024, "clip_ratio/low_min": 0.0014740697108209134, "clip_ratio/region_mean": 0.01617333357062307, "completions/clipped_ratio": 0.0026041666666666297, "completions/max_length": 3072.0, "completions/max_terminated_length": 2122.0, "completions/mean_length": 564.1823120117188, "completions/mean_terminated_length": 557.6344604492188, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.15466666666666667, "grad_norm": 0.13743820558656808, "learning_rate": 2.9740349638497614e-06, "loss": -0.003, "num_tokens": 2793432.0, "reward": 0.5494791865348816, "reward_std": 0.17591141164302826, "rewards/accuracy_reward/mean": 0.5494791865348816, "rewards/accuracy_reward/std": 0.4981949031352997, "step": 145 }, { "clip_ratio/high_max": 0.00387724136307952, "clip_ratio/high_mean": 0.001668076281885078, "clip_ratio/low_mean": 0.0019100409018392383, "clip_ratio/low_min": 5.469538882607594e-05, "clip_ratio/region_mean": 0.0035781171822691247, "epoch": 0.16, "grad_norm": 0.13392332544942487, "learning_rate": 2.9686014022757936e-06, "loss": -0.0006, "step": 150 }, { "clip_ratio/high_max": 0.01081147320655873, "clip_ratio/high_mean": 0.004736997898726258, "clip_ratio/low_mean": 0.006456733815866755, "clip_ratio/low_min": 0.00025875126011669635, "clip_ratio/region_mean": 0.011193731782259419, "epoch": 0.16533333333333333, "grad_norm": 0.13737945211292066, "learning_rate": 2.9626579513088605e-06, "loss": -0.002, "step": 155 }, { "clip_ratio/high_max": 0.014356120993033983, "clip_ratio/high_mean": 0.0061812863765226215, "clip_ratio/low_mean": 0.009818395573529416, "clip_ratio/low_min": 0.0003344833443406969, "clip_ratio/region_mean": 0.015999681936227715, "epoch": 0.17066666666666666, "grad_norm": 0.14914710656105398, "learning_rate": 2.9562066744786588e-06, "loss": -0.0028, "step": 160 }, { "clip_ratio/high_max": 0.0028448135330108927, "clip_ratio/high_mean": 0.0012944873419655778, "clip_ratio/low_mean": 0.0012954239988630435, "clip_ratio/low_min": 5.0056873806170185e-05, "clip_ratio/region_mean": 0.0025899113332343406, "completions/clipped_ratio": 0.0, "completions/max_length": 1804.0, "completions/max_terminated_length": 1804.0, "completions/mean_length": 582.2057495117188, "completions/mean_terminated_length": 582.2057495117188, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.176, "grad_norm": 0.1353931901062059, "learning_rate": 2.949249811628907e-06, "loss": -0.0005, "num_tokens": 3071332.0, "reward": 0.4609375, "reward_std": 0.20297472178936005, "rewards/accuracy_reward/mean": 0.4609375, "rewards/accuracy_reward/std": 0.49912211298942566, "step": 165 }, { "clip_ratio/high_max": 0.010440215712878852, "clip_ratio/high_mean": 0.004886947046543355, "clip_ratio/low_mean": 0.006611856841300323, "clip_ratio/low_min": 0.0004581456611049362, "clip_ratio/region_mean": 0.011498803871654672, "epoch": 0.18133333333333335, "grad_norm": 0.14921583884942533, "learning_rate": 2.9417897781396884e-06, "loss": -0.0021, "step": 170 }, { "clip_ratio/high_max": 0.014254509836609941, "clip_ratio/high_mean": 0.0066168544100946745, "clip_ratio/low_mean": 0.011595801010844297, "clip_ratio/low_min": 0.0007852825801819563, "clip_ratio/region_mean": 0.01821265552134719, "epoch": 0.18666666666666668, "grad_norm": 0.172926818644771, "learning_rate": 2.933829164088841e-06, "loss": -0.0031, "step": 175 }, { "clip_ratio/high_max": 0.004638076349056064, "clip_ratio/high_mean": 0.002151022928455859, "clip_ratio/low_mean": 0.0033463413205026883, "clip_ratio/low_min": 0.0002793999319692375, "clip_ratio/region_mean": 0.00549736418061002, "completions/clipped_ratio": 0.0026041666666666297, "completions/max_length": 3072.0, "completions/max_terminated_length": 3003.0, "completions/mean_length": 614.3463745117188, "completions/mean_terminated_length": 607.9295043945312, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.192, "grad_norm": 0.18011208240370918, "learning_rate": 2.925370733352704e-06, "loss": -0.0006, "num_tokens": 3358589.0, "reward": 0.484375, "reward_std": 0.21650634706020355, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.5004078149795532, "step": 180 }, { "clip_ratio/high_max": 0.008613884902661084, "clip_ratio/high_mean": 0.004034369261171377, "clip_ratio/low_mean": 0.004355008427319262, "clip_ratio/low_min": 0.0007161269684729632, "clip_ratio/region_mean": 0.008389377645835338, "epoch": 0.19733333333333333, "grad_norm": 0.16974527980005297, "learning_rate": 2.9164174226465136e-06, "loss": -0.002, "step": 185 }, { "clip_ratio/high_max": 0.01658790694345953, "clip_ratio/high_mean": 0.007886763595161028, "clip_ratio/low_mean": 0.010219653432432096, "clip_ratio/low_min": 0.0016697083745384588, "clip_ratio/region_mean": 0.01810641702322755, "epoch": 0.20266666666666666, "grad_norm": 0.13895860279174874, "learning_rate": 2.9069723405047926e-06, "loss": -0.0036, "step": 190 }, { "clip_ratio/high_max": 0.008373217536609445, "clip_ratio/high_mean": 0.004043773262992545, "clip_ratio/low_mean": 0.005840962910906455, "clip_ratio/low_min": 0.0009006860136651085, "clip_ratio/region_mean": 0.009884736043932207, "completions/clipped_ratio": 0.01041666666666663, "completions/max_length": 3072.0, "completions/max_terminated_length": 2241.0, "completions/mean_length": 604.4479370117188, "completions/mean_terminated_length": 578.4736938476562, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.208, "grad_norm": 0.17144408911410977, "learning_rate": 2.89703876620209e-06, "loss": -0.0018, "num_tokens": 3642948.0, "reward": 0.4296875, "reward_std": 0.20297470688819885, "rewards/accuracy_reward/mean": 0.4296875, "rewards/accuracy_reward/std": 0.49567729234695435, "step": 195 }, { "clip_ratio/high_max": 0.007765444242249941, "clip_ratio/high_mean": 0.0035302122020766545, "clip_ratio/low_mean": 0.004095690787062267, "clip_ratio/low_min": 0.0002433282141282689, "clip_ratio/region_mean": 0.0076259030724031614, "epoch": 0.21333333333333335, "grad_norm": 0.16793300745843606, "learning_rate": 2.8866201486144333e-06, "loss": -0.002, "step": 200 }, { "clip_ratio/high_max": 0.013806802002363838, "clip_ratio/high_mean": 0.006351603633811464, "clip_ratio/low_mean": 0.010427920520442058, "clip_ratio/low_min": 0.0008336678583873436, "clip_ratio/region_mean": 0.016779524183948524, "epoch": 0.21866666666666668, "grad_norm": 0.17755312684983748, "learning_rate": 2.875720105021903e-06, "loss": -0.0035, "step": 205 }, { "clip_ratio/high_max": 0.009226404238688701, "clip_ratio/high_mean": 0.004352189291375907, "clip_ratio/low_mean": 0.009122176324035535, "clip_ratio/low_min": 0.000885023031696619, "clip_ratio/region_mean": 0.01347436560654387, "completions/clipped_ratio": 0.02604166666666663, "completions/max_length": 3072.0, "completions/max_terminated_length": 2130.0, "completions/mean_length": 626.3463745117188, "completions/mean_terminated_length": 560.9545288085938, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.224, "grad_norm": 0.2303902595097223, "learning_rate": 2.8643424198527314e-06, "loss": -0.0026, "num_tokens": 3931933.0, "reward": 0.4322916865348816, "reward_std": 0.24808019399642944, "rewards/accuracy_reward/mean": 0.4322916567325592, "rewards/accuracy_reward/std": 0.4960406720638275, "step": 210 }, { "clip_ratio/high_max": 0.005785166783971363, "clip_ratio/high_mean": 0.002684333728848287, "clip_ratio/low_mean": 0.0047546586905809814, "clip_ratio/low_min": 0.00045189225020294546, "clip_ratio/region_mean": 0.007438992471725215, "epoch": 0.22933333333333333, "grad_norm": 0.17564501363454268, "learning_rate": 2.852491043369377e-06, "loss": -0.0022, "step": 215 }, { "clip_ratio/high_max": 0.012353528670792003, "clip_ratio/high_mean": 0.00587688130508468, "clip_ratio/low_mean": 0.014300797165196855, "clip_ratio/low_min": 0.0013365065264224541, "clip_ratio/region_mean": 0.02017767846409697, "epoch": 0.23466666666666666, "grad_norm": 0.26866208033661154, "learning_rate": 2.840170090297014e-06, "loss": -0.0044, "step": 220 }, { "clip_ratio/high_max": 0.012385946845461149, "clip_ratio/high_mean": 0.005716409389424371, "clip_ratio/low_mean": 0.017489325674978316, "clip_ratio/low_min": 0.0017160692063043826, "clip_ratio/region_mean": 0.023205734915973154, "completions/clipped_ratio": 0.06510416666666663, "completions/max_length": 3072.0, "completions/max_terminated_length": 2765.0, "completions/mean_length": 735.84375, "completions/mean_terminated_length": 573.1587524414062, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.24, "grad_norm": 0.2454314674553111, "learning_rate": 2.827383838394926e-06, "loss": -0.0045, "num_tokens": 4265110.0, "reward": 0.4088541865348816, "reward_std": 0.20297470688819885, "rewards/accuracy_reward/mean": 0.4088541567325592, "rewards/accuracy_reward/std": 0.4922636151313782, "step": 225 }, { "clip_ratio/high_max": 0.004117720608883246, "clip_ratio/high_mean": 0.00174849127579364, "clip_ratio/low_mean": 0.005226423573049033, "clip_ratio/low_min": 0.00038029419629310724, "clip_ratio/region_mean": 0.006974914907004859, "epoch": 0.24533333333333332, "grad_norm": 0.19037731609580857, "learning_rate": 2.8141367269712943e-06, "loss": -0.0011, "step": 230 }, { "clip_ratio/high_max": 0.009471970945014618, "clip_ratio/high_mean": 0.004125609046968748, "clip_ratio/low_mean": 0.01374495918789762, "clip_ratio/low_min": 0.0009897807307424956, "clip_ratio/region_mean": 0.017870568158105015, "epoch": 0.25066666666666665, "grad_norm": 0.1783474017226303, "learning_rate": 2.800433355341898e-06, "loss": -0.003, "step": 235 }, { "clip_ratio/high_max": 0.01219106379430741, "clip_ratio/high_mean": 0.005159649508277653, "clip_ratio/low_mean": 0.02028500295782578, "clip_ratio/low_min": 0.001349572723847814, "clip_ratio/region_mean": 0.025444652368605603, "epoch": 0.256, "grad_norm": 0.1516919728348737, "learning_rate": 2.786278481233259e-06, "loss": -0.004, "step": 240 }, { "clip_ratio/high_max": 0.0030249825253122254, "clip_ratio/high_mean": 0.0013068737164303457, "clip_ratio/low_mean": 0.001975227545699454, "clip_ratio/low_min": 0.00019110942939732923, "clip_ratio/region_mean": 0.0032821012537624485, "completions/clipped_ratio": 0.08854166666666663, "completions/max_length": 3072.0, "completions/max_terminated_length": 2833.0, "completions/mean_length": 777.90625, "completions/mean_terminated_length": 555.0514526367188, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.2613333333333333, "grad_norm": 0.18625370230793184, "learning_rate": 2.7716770191307885e-06, "loss": -0.0007, "num_tokens": 4617628.0, "reward": 0.4817708432674408, "reward_std": 0.22552745044231415, "rewards/accuracy_reward/mean": 0.4817708432674408, "rewards/accuracy_reward/std": 0.5003194212913513, "step": 245 }, { "clip_ratio/high_max": 0.011308155176084255, "clip_ratio/high_mean": 0.004770742844993947, "clip_ratio/low_mean": 0.009703200111471234, "clip_ratio/low_min": 0.0008358004473848268, "clip_ratio/region_mean": 0.014473943018674617, "epoch": 0.26666666666666666, "grad_norm": 0.21585746485194893, "learning_rate": 2.7566340385725087e-06, "loss": -0.0028, "step": 250 }, { "clip_ratio/high_max": 0.016187353310670005, "clip_ratio/high_mean": 0.006700956767235766, "clip_ratio/low_mean": 0.016702650821389397, "clip_ratio/low_min": 0.00116436087701004, "clip_ratio/region_mean": 0.023403607569343877, "epoch": 0.272, "grad_norm": 0.13947480588694366, "learning_rate": 2.74115476238894e-06, "loss": -0.0038, "step": 255 }, { "clip_ratio/high_max": 0.004075065470351547, "clip_ratio/high_mean": 0.0017068392248347664, "clip_ratio/low_mean": 0.005115064689766769, "clip_ratio/low_min": 0.0002634377931826748, "clip_ratio/region_mean": 0.006821903928198481, "completions/clipped_ratio": 0.24479166666666663, "completions/max_length": 3072.0, "completions/max_terminated_length": 3065.0, "completions/mean_length": 1172.8958740234375, "completions/mean_terminated_length": 557.3241577148438, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.2773333333333333, "grad_norm": 0.19892034423297258, "learning_rate": 2.725244564889764e-06, "loss": -0.0011, "num_tokens": 5124402.0, "reward": 0.3020833432674408, "reward_std": 0.16237977147102356, "rewards/accuracy_reward/mean": 0.3020833432674408, "rewards/accuracy_reward/std": 0.4597601890563965, "step": 260 }, { "clip_ratio/high_max": 0.0041308092408144145, "clip_ratio/high_mean": 0.0017702964542877452, "clip_ratio/low_mean": 0.008503467499213001, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010273763901022904, "epoch": 0.2826666666666667, "grad_norm": 0.1756690348318295, "learning_rate": 2.7089089699979008e-06, "loss": -0.0017, "step": 265 }, { "clip_ratio/high_max": 0.006030686004669406, "clip_ratio/high_mean": 0.0025575936773748254, "clip_ratio/low_mean": 0.011701538000488653, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014259131784638158, "epoch": 0.288, "grad_norm": 0.14025081284975885, "learning_rate": 2.6921536493316326e-06, "loss": -0.0025, "step": 270 }, { "clip_ratio/high_max": 0.003235487905476475, "clip_ratio/high_mean": 0.0013571034478445654, "clip_ratio/low_mean": 0.006449986008419728, "clip_ratio/low_min": 0.0001343366207947838, "clip_ratio/region_mean": 0.00780708943539139, "completions/clipped_ratio": 0.27604166666666663, "completions/max_length": 3072.0, "completions/max_terminated_length": 1322.0, "completions/mean_length": 1196.393310546875, "completions/mean_terminated_length": 481.23382568359375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.29333333333333333, "grad_norm": 0.26976880976003487, "learning_rate": 2.6749844202354553e-06, "loss": -0.0015, "num_tokens": 5632051.0, "reward": 0.3984375, "reward_std": 0.21650634706020355, "rewards/accuracy_reward/mean": 0.3984375, "rewards/accuracy_reward/std": 0.49021512269973755, "step": 275 }, { "clip_ratio/high_max": 0.00409326254757616, "clip_ratio/high_mean": 0.001835159609618131, "clip_ratio/low_mean": 0.020099082329761587, "clip_ratio/low_min": 0.0015909251960692927, "clip_ratio/region_mean": 0.02193424207598582, "epoch": 0.2986666666666667, "grad_norm": 0.27976810465871765, "learning_rate": 2.65740724376033e-06, "loss": -0.0022, "step": 280 }, { "clip_ratio/high_max": 0.006464947311178549, "clip_ratio/high_mean": 0.0028551706513098908, "clip_ratio/low_mean": 0.026205192415363853, "clip_ratio/low_min": 0.0017754189007973763, "clip_ratio/region_mean": 0.029060362888594683, "epoch": 0.304, "grad_norm": 0.17808768104201175, "learning_rate": 2.6394282225940447e-06, "loss": -0.0034, "step": 285 }, { "clip_ratio/high_max": 0.004281097604143724, "clip_ratio/high_mean": 0.0018994767322737971, "clip_ratio/low_mean": 0.020333782026841617, "clip_ratio/low_min": 0.0013895674244849942, "clip_ratio/region_mean": 0.022233258849882986, "completions/clipped_ratio": 0.3046875, "completions/max_length": 3072.0, "completions/max_terminated_length": 2968.0, "completions/mean_length": 1296.2083740234375, "completions/mean_terminated_length": 518.0524291992188, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.30933333333333335, "grad_norm": 0.3273268027040314, "learning_rate": 2.621053598942398e-06, "loss": -0.0026, "num_tokens": 6180042.0, "reward": 0.421875, "reward_std": 0.20748524367809296, "rewards/accuracy_reward/mean": 0.421875, "rewards/accuracy_reward/std": 0.49450308084487915, "step": 290 }, { "clip_ratio/high_max": 0.0034333205695475043, "clip_ratio/high_mean": 0.0014623903042092934, "clip_ratio/low_mean": 0.01819981881512831, "clip_ratio/low_min": 0.0006374489210429601, "clip_ratio/region_mean": 0.01966220921567583, "epoch": 0.31466666666666665, "grad_norm": 0.3256651573583743, "learning_rate": 2.6022897523619424e-06, "loss": -0.0022, "step": 295 }, { "clip_ratio/high_max": 0.007133759096905124, "clip_ratio/high_mean": 0.0030467332453554263, "clip_ratio/low_mean": 0.030512772721340297, "clip_ratio/low_min": 0.0007334250636631623, "clip_ratio/region_mean": 0.03355950598597701, "epoch": 0.32, "grad_norm": 0.27098349680358913, "learning_rate": 2.583143197545044e-06, "loss": -0.0041, "step": 300 }, { "clip_ratio/high_max": 0.007178572006523609, "clip_ratio/high_mean": 0.002992426839773543, "clip_ratio/low_mean": 0.03104121607939305, "clip_ratio/low_min": 0.0010419698082841933, "clip_ratio/region_mean": 0.03403364310070174, "completions/clipped_ratio": 0.36197916666666663, "completions/max_length": 3072.0, "completions/max_terminated_length": 3034.0, "completions/mean_length": 1442.057373046875, "completions/mean_terminated_length": 517.3142700195312, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.3253333333333333, "grad_norm": 0.7645132121212074, "learning_rate": 2.5636205820580173e-06, "loss": -0.0042, "num_tokens": 6790249.0, "reward": 0.3932291865348816, "reward_std": 0.19395360350608826, "rewards/accuracy_reward/mean": 0.3932291567325592, "rewards/accuracy_reward/std": 0.4891042709350586, "step": 305 }, { "clip_ratio/high_max": 0.0026476537023881973, "clip_ratio/high_mean": 0.0011266059317904364, "clip_ratio/low_mean": 0.04991645668058027, "clip_ratio/low_min": 0.0007461817061994225, "clip_ratio/region_mean": 0.05104306318153249, "epoch": 0.33066666666666666, "grad_norm": 0.4113861087066626, "learning_rate": 2.5437286840331353e-06, "loss": -0.0031, "step": 310 }, { "clip_ratio/high_max": 0.0063183744670823215, "clip_ratio/high_mean": 0.0027117192576042726, "clip_ratio/low_mean": 0.06149723054331844, "clip_ratio/low_min": 0.0014339038054458797, "clip_ratio/region_mean": 0.06420894972688984, "epoch": 0.336, "grad_norm": 0.34905129720399947, "learning_rate": 2.5234744098153e-06, "loss": -0.0056, "step": 315 }, { "clip_ratio/high_max": 0.00849654933808779, "clip_ratio/high_mean": 0.0036011947062434047, "clip_ratio/low_mean": 0.062465655984487965, "clip_ratio/low_min": 0.0026254164869897066, "clip_ratio/region_mean": 0.06606685022852617, "epoch": 0.3413333333333333, "grad_norm": 0.23466922503663074, "learning_rate": 2.502864791564205e-06, "loss": -0.007, "step": 320 }, { "clip_ratio/high_max": 0.0019177518992819387, "clip_ratio/high_mean": 0.0008457156709710034, "clip_ratio/low_mean": 0.05387299727944992, "clip_ratio/low_min": 0.009935903730092833, "clip_ratio/region_mean": 0.05471871315530734, "completions/clipped_ratio": 0.31770833333333337, "completions/max_length": 3072.0, "completions/max_terminated_length": 2778.0, "completions/mean_length": 1309.018310546875, "completions/mean_terminated_length": 488.0877990722656, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.3466666666666667, "grad_norm": 0.5351934870580246, "learning_rate": 2.48190698481281e-06, "loss": -0.0027, "num_tokens": 7345013.0, "reward": 0.4036458432674408, "reward_std": 0.22552745044231415, "rewards/accuracy_reward/mean": 0.4036458432674408, "rewards/accuracy_reward/std": 0.4912680983543396, "step": 325 }, { "clip_ratio/high_max": 0.007272961941271206, "clip_ratio/high_mean": 0.003073033658665736, "clip_ratio/low_mean": 0.07307519497007889, "clip_ratio/low_min": 0.017213464618544096, "clip_ratio/region_mean": 0.07614822842733701, "epoch": 0.352, "grad_norm": 0.47146634038132695, "learning_rate": 2.460608265982985e-06, "loss": -0.0058, "step": 330 }, { "clip_ratio/high_max": 0.00920700101141847, "clip_ratio/high_mean": 0.003868434091373274, "clip_ratio/low_mean": 0.0743202656536596, "clip_ratio/low_min": 0.015867017867276446, "clip_ratio/region_mean": 0.07818869996190188, "epoch": 0.35733333333333334, "grad_norm": 0.27123292866876086, "learning_rate": 2.4389760298591824e-06, "loss": -0.0075, "step": 335 }, { "clip_ratio/high_max": 0.0026404164586892875, "clip_ratio/high_mean": 0.00110359310297099, "clip_ratio/low_mean": 0.061379383613848405, "clip_ratio/low_min": 0.004069715263631224, "clip_ratio/region_mean": 0.0624829759937711, "completions/clipped_ratio": 0.38541666666666663, "completions/max_length": 3072.0, "completions/max_terminated_length": 2080.0, "completions/mean_length": 1481.518310546875, "completions/mean_terminated_length": 484.09747314453125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.3626666666666667, "grad_norm": 1.562289899186608, "learning_rate": 2.4170177870210112e-06, "loss": -0.0031, "num_tokens": 7968642.0, "reward": 0.3880208432674408, "reward_std": 0.20748524367809296, "rewards/accuracy_reward/mean": 0.3880208432674408, "rewards/accuracy_reward/std": 0.4879350960254669, "step": 340 }, { "clip_ratio/high_max": 0.008549761213907913, "clip_ratio/high_mean": 0.0031626374567622407, "clip_ratio/low_mean": 0.07328167974228564, "clip_ratio/low_min": 0.001982791512273252, "clip_ratio/region_mean": 0.07644431694825471, "epoch": 0.368, "grad_norm": 0.6750991974156748, "learning_rate": 2.3947411612356092e-06, "loss": -0.0017, "step": 345 }, { "clip_ratio/high_max": 0.01168314833266777, "clip_ratio/high_mean": 0.004333287724148249, "clip_ratio/low_mean": 0.09327900885000417, "clip_ratio/low_min": 0.002434034391990281, "clip_ratio/region_mean": 0.09761229644973354, "epoch": 0.37333333333333335, "grad_norm": 0.4861096375557942, "learning_rate": 2.3721538868107225e-06, "loss": -0.0053, "step": 350 }, { "clip_ratio/high_max": 0.00515082522697412, "clip_ratio/high_mean": 0.0019643667242917216, "clip_ratio/low_mean": 0.03430071057707664, "clip_ratio/low_min": 0.0021023122725637223, "clip_ratio/region_mean": 0.03626507724475232, "completions/clipped_ratio": 0.37760416666666663, "completions/max_length": 3072.0, "completions/max_terminated_length": 2555.0, "completions/mean_length": 1463.1693115234375, "completions/mean_terminated_length": 487.10040283203125, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.37866666666666665, "grad_norm": 0.5229357237868048, "learning_rate": 2.3492638059093957e-06, "loss": -0.0035, "num_tokens": 8584394.0, "reward": 0.3958333432674408, "reward_std": 0.18042194843292236, "rewards/accuracy_reward/mean": 0.3958333432674408, "rewards/accuracy_reward/std": 0.4896669089794159, "step": 355 }, { "clip_ratio/high_max": 0.0030107965487331965, "clip_ratio/high_mean": 0.0014109951745922445, "clip_ratio/low_mean": 0.08581264366857795, "clip_ratio/low_min": 0.005743206990155158, "clip_ratio/region_mean": 0.0872236388244346, "epoch": 0.384, "grad_norm": 0.7375628723427412, "learning_rate": 2.3260788658272246e-06, "loss": -0.0029, "step": 360 }, { "clip_ratio/high_max": 0.004547903422280797, "clip_ratio/high_mean": 0.0020541730286822713, "clip_ratio/low_mean": 0.056422804619796804, "clip_ratio/low_min": 0.00408765841712011, "clip_ratio/region_mean": 0.058476977316604464, "epoch": 0.3893333333333333, "grad_norm": 0.39859333961813315, "learning_rate": 2.302607116233101e-06, "loss": -0.0046, "step": 365 }, { "clip_ratio/high_max": 0.0030406241141918144, "clip_ratio/high_mean": 0.0013973756429322749, "clip_ratio/low_mean": 0.04561135886933698, "clip_ratio/low_min": 0.0028684542965493167, "clip_ratio/region_mean": 0.04700873524891449, "completions/clipped_ratio": 0.47395833333333337, "completions/max_length": 3072.0, "completions/max_terminated_length": 3062.0, "completions/mean_length": 1721.127685546875, "completions/mean_terminated_length": 504.00494384765625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.39466666666666667, "grad_norm": 0.700109266656566, "learning_rate": 2.278856706374422e-06, "loss": -0.0041, "num_tokens": 9304230.0, "reward": 0.3489583432674408, "reward_std": 0.16689030826091766, "rewards/accuracy_reward/mean": 0.3489583432674408, "rewards/accuracy_reward/std": 0.47726258635520935, "step": 370 }, { "clip_ratio/high_max": 0.001531930253122482, "clip_ratio/high_mean": 0.0006243601214464434, "clip_ratio/low_mean": 0.11026522623342316, "clip_ratio/low_min": 0.007940051052719355, "clip_ratio/region_mean": 0.11088958535815437, "epoch": 0.4, "grad_norm": 1.9178109993399397, "learning_rate": 2.254835882247716e-06, "loss": -0.0046, "step": 375 }, { "clip_ratio/high_max": 0.003595319176019984, "clip_ratio/high_mean": 0.0014309171640888962, "clip_ratio/low_mean": 0.07121570940926177, "clip_ratio/low_min": 0.005534401349723339, "clip_ratio/region_mean": 0.07264662618999865, "epoch": 0.4053333333333333, "grad_norm": 0.5418999238019325, "learning_rate": 2.230552983735686e-06, "loss": -0.0053, "step": 380 }, { "clip_ratio/high_max": 0.003178618564925273, "clip_ratio/high_mean": 0.0012702006735707983, "clip_ratio/low_mean": 0.06126615319008124, "clip_ratio/low_min": 0.0049176108557730915, "clip_ratio/region_mean": 0.06253635384400695, "completions/clipped_ratio": 0.42447916666666663, "completions/max_length": 3072.0, "completions/max_terminated_length": 2713.0, "completions/mean_length": 1585.0625, "completions/mean_terminated_length": 488.36199951171875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.4106666666666667, "grad_norm": 1.6103866805401976, "learning_rate": 2.206016441711652e-06, "loss": -0.0051, "num_tokens": 9968628.0, "reward": 0.3932291865348816, "reward_std": 0.17140087485313416, "rewards/accuracy_reward/mean": 0.3932291567325592, "rewards/accuracy_reward/std": 0.4891042709350586, "step": 385 }, { "clip_ratio/high_max": 0.002138170040007026, "clip_ratio/high_mean": 0.0007969536171913205, "clip_ratio/low_mean": 0.10287776709374157, "clip_ratio/low_min": 0.005417244618729456, "clip_ratio/region_mean": 0.10367472118214209, "epoch": 0.416, "grad_norm": 0.3957921527449835, "learning_rate": 2.1812347751124072e-06, "loss": -0.005, "step": 390 }, { "clip_ratio/high_max": 0.004428159066446824, "clip_ratio/high_mean": 0.001690828337086714, "clip_ratio/low_mean": 0.08465427833798458, "clip_ratio/low_min": 0.0032509498683793935, "clip_ratio/region_mean": 0.08634510685078567, "epoch": 0.42133333333333334, "grad_norm": 0.6131360758648255, "learning_rate": 2.156216587980491e-06, "loss": -0.0067, "step": 395 }, { "clip_ratio/high_max": 0.005870185343519551, "clip_ratio/high_mean": 0.002232021604686452, "clip_ratio/low_mean": 0.06619590633708868, "clip_ratio/low_min": 0.0030933621295844206, "clip_ratio/region_mean": 0.06842792783063487, "epoch": 0.4266666666666667, "grad_norm": 0.32758982836857486, "learning_rate": 2.1309705664769195e-06, "loss": -0.0076, "step": 400 }, { "clip_ratio/high_max": 0.001104906623277202, "clip_ratio/high_mean": 0.00044814476480041775, "clip_ratio/low_mean": 0.06794686097150589, "clip_ratio/low_min": 0.00019315825193189085, "clip_ratio/region_mean": 0.06839500537562344, "completions/clipped_ratio": 0.42447916666666663, "completions/max_length": 3072.0, "completions/max_terminated_length": 2674.0, "completions/mean_length": 1586.6875, "completions/mean_terminated_length": 491.185546875, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.432, "grad_norm": 0.8587362676940098, "learning_rate": 2.1055054758654056e-06, "loss": -0.0023, "num_tokens": 10633710.0, "reward": 0.3489583432674408, "reward_std": 0.18042194843292236, "rewards/accuracy_reward/mean": 0.3489583432674408, "rewards/accuracy_reward/std": 0.47726258635520935, "step": 405 }, { "clip_ratio/high_max": 0.003434718288553995, "clip_ratio/high_mean": 0.0014203029940745183, "clip_ratio/low_mean": 0.060608451827556566, "clip_ratio/low_min": 0.0008279486210085452, "clip_ratio/region_mean": 0.062028754589528036, "epoch": 0.43733333333333335, "grad_norm": 0.6266230317341379, "learning_rate": 2.0798301574691106e-06, "loss": -0.0044, "step": 410 }, { "clip_ratio/high_max": 0.003587247554787609, "clip_ratio/high_mean": 0.001580799402199773, "clip_ratio/low_mean": 0.09860702547020991, "clip_ratio/low_min": 0.0018894157568865922, "clip_ratio/region_mean": 0.10018782324877976, "epoch": 0.44266666666666665, "grad_norm": 0.3911988364721312, "learning_rate": 2.053953525600994e-06, "loss": -0.0053, "step": 415 }, { "clip_ratio/high_max": 0.0012576851619996888, "clip_ratio/high_mean": 0.0005424946870334679, "clip_ratio/low_mean": 0.0458320612400712, "clip_ratio/low_min": 0.002284846782549721, "clip_ratio/region_mean": 0.04637455617350952, "completions/clipped_ratio": 0.38541666666666663, "completions/max_length": 3072.0, "completions/max_terminated_length": 1966.0, "completions/mean_length": 1465.979248046875, "completions/mean_terminated_length": 458.8135681152344, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.448, "grad_norm": 0.5474619957403304, "learning_rate": 2.027884564468816e-06, "loss": -0.0031, "num_tokens": 11249719.0, "reward": 0.375, "reward_std": 0.17591141164302826, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.4847545027732849, "step": 420 }, { "clip_ratio/high_max": 0.0021109833996888483, "clip_ratio/high_mean": 0.0008939137334891712, "clip_ratio/low_mean": 0.10148776352361892, "clip_ratio/low_min": 0.005028543881780934, "clip_ratio/region_mean": 0.10238167763873207, "epoch": 0.4533333333333333, "grad_norm": 0.7837241351607741, "learning_rate": 2.0016323250558765e-06, "loss": -0.0039, "step": 425 }, { "clip_ratio/high_max": 0.003409022702544462, "clip_ratio/high_mean": 0.0014325160533189773, "clip_ratio/low_mean": 0.07323483193558786, "clip_ratio/low_min": 0.005653746098687407, "clip_ratio/region_mean": 0.07466734660429211, "epoch": 0.45866666666666667, "grad_norm": 0.43296388436986644, "learning_rate": 1.9752059219785703e-06, "loss": -0.005, "step": 430 }, { "clip_ratio/high_max": 0.001807406998159422, "clip_ratio/high_mean": 0.0007510785388149088, "clip_ratio/low_mean": 0.05021302722773271, "clip_ratio/low_min": 0.003368189519005682, "clip_ratio/region_mean": 0.05096410649587142, "completions/clipped_ratio": 0.44270833333333337, "completions/max_length": 3072.0, "completions/max_terminated_length": 2862.0, "completions/mean_length": 1622.1302490234375, "completions/mean_terminated_length": 470.3644714355469, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.464, "grad_norm": 0.25861127050332183, "learning_rate": 1.948614530321848e-06, "loss": -0.0044, "num_tokens": 11924547.0, "reward": 0.375, "reward_std": 0.20748524367809296, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.4847545027732849, "step": 435 }, { "clip_ratio/high_max": 0.002239241869938269, "clip_ratio/high_mean": 0.0009120702129166603, "clip_ratio/low_mean": 0.14328454590868206, "clip_ratio/low_min": 0.012018040614748315, "clip_ratio/region_mean": 0.14419661713918686, "epoch": 0.4693333333333333, "grad_norm": 1.0458309257986962, "learning_rate": 1.921867382453679e-06, "loss": -0.0057, "step": 440 }, { "clip_ratio/high_max": 0.003923635012324666, "clip_ratio/high_mean": 0.0016028785108574083, "clip_ratio/low_mean": 0.07476992048868851, "clip_ratio/low_min": 0.006628038924463908, "clip_ratio/region_mean": 0.07637279902201044, "epoch": 0.4746666666666667, "grad_norm": 0.6769048357951881, "learning_rate": 1.8949737648196395e-06, "loss": -0.0056, "step": 445 }, { "clip_ratio/high_max": 0.002720684657106176, "clip_ratio/high_mean": 0.0011155187101621778, "clip_ratio/low_mean": 0.09470151053153586, "clip_ratio/low_min": 0.007887291998395085, "clip_ratio/region_mean": 0.09581702941677576, "completions/clipped_ratio": 0.40364583333333337, "completions/max_length": 3072.0, "completions/max_terminated_length": 2085.0, "completions/mean_length": 1523.987060546875, "completions/mean_terminated_length": 476.20526123046875, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.48, "grad_norm": 1.8839987866404113, "learning_rate": 1.8679430147187031e-06, "loss": -0.0043, "num_tokens": 12565279.0, "reward": 0.4322916865348816, "reward_std": 0.23454853892326355, "rewards/accuracy_reward/mean": 0.4322916567325592, "rewards/accuracy_reward/std": 0.4960406720638275, "step": 450 }, { "clip_ratio/high_max": 0.0015380638558781356, "clip_ratio/high_mean": 0.0007115959610700884, "clip_ratio/low_mean": 0.11563618403843065, "clip_ratio/low_min": 0.007571243058828258, "clip_ratio/region_mean": 0.11634777943527297, "epoch": 0.48533333333333334, "grad_norm": 0.67352645462372, "learning_rate": 1.840784517061398e-06, "loss": -0.0061, "step": 455 }, { "clip_ratio/high_max": 0.0034122977645893115, "clip_ratio/high_mean": 0.0016081294577816152, "clip_ratio/low_mean": 0.06903934945294168, "clip_ratio/low_min": 0.004265563493027003, "clip_ratio/region_mean": 0.07064747888398415, "epoch": 0.49066666666666664, "grad_norm": 0.4368960345570725, "learning_rate": 1.8135077011114185e-06, "loss": -0.0077, "step": 460 }, { "clip_ratio/high_max": 0.0031490640678384805, "clip_ratio/high_mean": 0.0015061335418067757, "clip_ratio/low_mean": 0.08518126579438104, "clip_ratio/low_min": 0.006091109849876375, "clip_ratio/region_mean": 0.086687398977665, "completions/clipped_ratio": 0.37760416666666663, "completions/max_length": 3072.0, "completions/max_terminated_length": 2769.0, "completions/mean_length": 1446.9505615234375, "completions/mean_terminated_length": 461.0418395996094, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.496, "grad_norm": 2.147792145133951, "learning_rate": 1.7861220372118446e-06, "loss": -0.0065, "num_tokens": 13170684.0, "reward": 0.4401041865348816, "reward_std": 0.19846415519714355, "rewards/accuracy_reward/mean": 0.4401041567325592, "rewards/accuracy_reward/std": 0.49704715609550476, "step": 465 }, { "clip_ratio/high_max": 0.0012672010763708386, "clip_ratio/high_mean": 0.0005224534447734186, "clip_ratio/low_mean": 0.08139123067807078, "clip_ratio/low_min": 0.009930310912386631, "clip_ratio/region_mean": 0.08191368389465196, "epoch": 0.5013333333333333, "grad_norm": 0.8466426305919128, "learning_rate": 1.7586370334970954e-06, "loss": -0.005, "step": 470 }, { "clip_ratio/high_max": 0.0030355606284501848, "clip_ratio/high_mean": 0.0012444653292732256, "clip_ratio/low_mean": 0.0753894673573086, "clip_ratio/low_min": 0.010303658125485527, "clip_ratio/region_mean": 0.07663393334132707, "epoch": 0.5066666666666667, "grad_norm": 0.5092879407223004, "learning_rate": 1.7310622325917648e-06, "loss": -0.0068, "step": 475 }, { "clip_ratio/high_max": 0.004052611529914429, "clip_ratio/high_mean": 0.001695779630153993, "clip_ratio/low_mean": 0.08801138708768122, "clip_ratio/low_min": 0.006428692897316068, "clip_ratio/region_mean": 0.08970716679250472, "epoch": 0.512, "grad_norm": 0.37298719704903815, "learning_rate": 1.7034072082974805e-06, "loss": -0.0075, "step": 480 }, { "clip_ratio/high_max": 0.0007599989851314604, "clip_ratio/high_mean": 0.00033319178799047223, "clip_ratio/low_mean": 0.0614012249458483, "clip_ratio/low_min": 2.225189236924052e-05, "clip_ratio/region_mean": 0.06173441708533574, "completions/clipped_ratio": 0.41666666666666663, "completions/max_length": 3072.0, "completions/max_terminated_length": 1929.0, "completions/mean_length": 1542.783935546875, "completions/mean_terminated_length": 450.48663330078125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.5173333333333333, "grad_norm": 0.8885138162027573, "learning_rate": 1.6756815622689371e-06, "loss": -0.0034, "num_tokens": 13812943.0, "reward": 0.4114583432674408, "reward_std": 0.19395360350608826, "rewards/accuracy_reward/mean": 0.4114583432674408, "rewards/accuracy_reward/std": 0.4927399158477783, "step": 485 }, { "clip_ratio/high_max": 0.018110496096414862, "clip_ratio/high_mean": 0.006254618212301466, "clip_ratio/low_mean": 0.10400115496377112, "clip_ratio/low_min": 0.0001223854022100568, "clip_ratio/region_mean": 0.11025577302425518, "epoch": 0.5226666666666666, "grad_norm": 0.6356014651126995, "learning_rate": 1.6478949206802629e-06, "loss": -0.0055, "step": 490 }, { "clip_ratio/high_max": 0.011979550187243148, "clip_ratio/high_mean": 0.004337472982479084, "clip_ratio/low_mean": 0.06878798435827775, "clip_ratio/low_min": 0.00016827993385959417, "clip_ratio/region_mean": 0.07312545729946578, "epoch": 0.528, "grad_norm": 0.4265852659553873, "learning_rate": 1.6200569308828705e-06, "loss": -0.0066, "step": 495 }, { "clip_ratio/high_max": 0.002959414072392974, "clip_ratio/high_mean": 0.0010892530955516123, "clip_ratio/low_mean": 0.04544788012763092, "clip_ratio/low_min": 0.0014603409530536738, "clip_ratio/region_mean": 0.046537132785715585, "completions/clipped_ratio": 0.390625, "completions/max_length": 3072.0, "completions/max_terminated_length": 1465.0, "completions/mean_length": 1470.8646240234375, "completions/mean_terminated_length": 444.4957580566406, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.5333333333333333, "grad_norm": 0.6548745763544179, "learning_rate": 1.5921772580559549e-06, "loss": -0.0038, "num_tokens": 14430600.0, "reward": 0.4192708432674408, "reward_std": 0.16689030826091766, "rewards/accuracy_reward/mean": 0.4192708432674408, "rewards/accuracy_reward/std": 0.4940834939479828, "step": 500 }, { "clip_ratio/high_max": 0.002207353398807754, "clip_ratio/high_mean": 0.0008845279810884676, "clip_ratio/low_mean": 0.10170576997825265, "clip_ratio/low_min": 0.004324663197621703, "clip_ratio/region_mean": 0.1025902980234605, "epoch": 0.5386666666666666, "grad_norm": 0.5331493589842593, "learning_rate": 1.5642655818508029e-06, "loss": -0.004, "step": 505 }, { "clip_ratio/high_max": 0.003438753176305909, "clip_ratio/high_mean": 0.0013314025385625428, "clip_ratio/low_mean": 0.056352202120251604, "clip_ratio/low_min": 0.0011719158268533648, "clip_ratio/region_mean": 0.05768360438778473, "epoch": 0.544, "grad_norm": 0.3520799318810492, "learning_rate": 1.5363315930300777e-06, "loss": -0.0052, "step": 510 }, { "clip_ratio/high_max": 0.0017432715534596356, "clip_ratio/high_mean": 0.0006973671067953546, "clip_ratio/low_mean": 0.03401082667676292, "clip_ratio/low_min": 0.0018350960686802863, "clip_ratio/region_mean": 0.034708193638539345, "completions/clipped_ratio": 0.38802083333333337, "completions/max_length": 3072.0, "completions/max_terminated_length": 2702.0, "completions/mean_length": 1462.9896240234375, "completions/mean_terminated_length": 442.8085021972656, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.5493333333333333, "grad_norm": 0.2588652167125558, "learning_rate": 1.5083849901032472e-06, "loss": -0.0034, "num_tokens": 15047411.0, "reward": 0.4635416865348816, "reward_std": 0.13531647622585297, "rewards/accuracy_reward/mean": 0.4635416567325592, "rewards/accuracy_reward/std": 0.4993196129798889, "step": 515 }, { "clip_ratio/high_max": 0.0016519001596861926, "clip_ratio/high_mean": 0.0007040223968488135, "clip_ratio/low_mean": 0.080545712030289, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.08124973371859597, "epoch": 0.5546666666666666, "grad_norm": 0.34513282441606774, "learning_rate": 1.4804354759593176e-06, "loss": -0.0028, "step": 520 }, { "clip_ratio/high_max": 0.0022881746012899384, "clip_ratio/high_mean": 0.0009753675197316624, "clip_ratio/low_mean": 0.04750497791173984, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04848034548849682, "epoch": 0.56, "grad_norm": 0.482493594740385, "learning_rate": 1.452492754498053e-06, "loss": -0.0036, "step": 525 }, { "clip_ratio/high_max": 0.0015535891247509426, "clip_ratio/high_mean": 0.0006678163757442235, "clip_ratio/low_mean": 0.03930277184081206, "clip_ratio/low_min": 4.792944673681632e-06, "clip_ratio/region_mean": 0.039970587718903515, "completions/clipped_ratio": 0.3828125, "completions/max_length": 3072.0, "completions/max_terminated_length": 2917.0, "completions/mean_length": 1464.5234375, "completions/mean_terminated_length": 467.4809875488281, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.5653333333333334, "grad_norm": 1.6651221115034198, "learning_rate": 1.4245665272608392e-06, "loss": -0.0026, "num_tokens": 15660176.0, "reward": 0.4453125, "reward_std": 0.18493250012397766, "rewards/accuracy_reward/mean": 0.4453125, "rewards/accuracy_reward/std": 0.4976486563682556, "step": 530 }, { "clip_ratio/high_max": 0.001495416688385376, "clip_ratio/high_mean": 0.0006163650574080748, "clip_ratio/low_mean": 0.09307174016867066, "clip_ratio/low_min": 0.00019889803588739595, "clip_ratio/region_mean": 0.09368810496573587, "epoch": 0.5706666666666667, "grad_norm": 1.0974070186862415, "learning_rate": 1.396666490062369e-06, "loss": -0.0046, "step": 535 }, { "clip_ratio/high_max": 0.0030711609573700117, "clip_ratio/high_mean": 0.0012838541652399726, "clip_ratio/low_mean": 0.10082828262129624, "clip_ratio/low_min": 0.0004763815042679198, "clip_ratio/region_mean": 0.10211213729003248, "epoch": 0.576, "grad_norm": 0.48064461184570917, "learning_rate": 1.368802329624314e-06, "loss": -0.0054, "step": 540 }, { "clip_ratio/high_max": 0.002941222076333361, "clip_ratio/high_mean": 0.0012450450004052982, "clip_ratio/low_mean": 0.04900269398385717, "clip_ratio/low_min": 0.0004875026934314519, "clip_ratio/region_mean": 0.05024773879799795, "completions/clipped_ratio": 0.41666666666666663, "completions/max_length": 3072.0, "completions/max_terminated_length": 2156.0, "completions/mean_length": 1558.276123046875, "completions/mean_terminated_length": 477.044677734375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.5813333333333334, "grad_norm": 1.5891735456175264, "learning_rate": 1.3409837202121548e-06, "loss": -0.0048, "num_tokens": 16309698.0, "reward": 0.3932291865348816, "reward_std": 0.14884811639785767, "rewards/accuracy_reward/mean": 0.3932291567325592, "rewards/accuracy_reward/std": 0.4891042709350586, "step": 545 }, { "clip_ratio/high_max": 0.0013002901267554988, "clip_ratio/high_mean": 0.000460951691320588, "clip_ratio/low_mean": 0.06695905850428971, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.06742001015300048, "epoch": 0.5866666666666667, "grad_norm": 0.8616262759223461, "learning_rate": 1.313220320276336e-06, "loss": -0.0028, "step": 550 }, { "clip_ratio/high_max": 0.00249036002514913, "clip_ratio/high_mean": 0.0008876022172898956, "clip_ratio/low_mean": 0.058310646297468335, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.05919824854136095, "epoch": 0.592, "grad_norm": 0.7681601715634485, "learning_rate": 1.285521769098911e-06, "loss": -0.0037, "step": 555 }, { "clip_ratio/high_max": 0.0031672357081333757, "clip_ratio/high_mean": 0.0011236331044528925, "clip_ratio/low_mean": 0.06991613125392178, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.07103976465477899, "epoch": 0.5973333333333334, "grad_norm": 0.4915506737157602, "learning_rate": 1.257897683446842e-06, "loss": -0.0044, "step": 560 }, { "clip_ratio/high_max": 0.0006101035837673408, "clip_ratio/high_mean": 0.0002448726332659135, "clip_ratio/low_mean": 0.039113096588562254, "clip_ratio/low_min": 0.0065984008321720465, "clip_ratio/region_mean": 0.03935796950024724, "completions/clipped_ratio": 0.39322916666666663, "completions/max_length": 3072.0, "completions/max_terminated_length": 2562.0, "completions/mean_length": 1499.4140625, "completions/mean_terminated_length": 480.2703857421875, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.6026666666666667, "grad_norm": 0.9738512866062131, "learning_rate": 1.2303576542331168e-06, "loss": -0.0028, "num_tokens": 16940289.0, "reward": 0.4270833730697632, "reward_std": 0.18042194843292236, "rewards/accuracy_reward/mean": 0.4270833432674408, "rewards/accuracy_reward/std": 0.49529990553855896, "step": 565 }, { "clip_ratio/high_max": 0.0016963833446880018, "clip_ratio/high_mean": 0.0007010168958686336, "clip_ratio/low_mean": 0.09518421710017719, "clip_ratio/low_min": 0.020347312714147848, "clip_ratio/region_mean": 0.09588523472646102, "epoch": 0.608, "grad_norm": 0.3281092981702191, "learning_rate": 1.2029112431868455e-06, "loss": -0.0047, "step": 570 }, { "clip_ratio/high_max": 0.002453617951596243, "clip_ratio/high_mean": 0.0010488400748272398, "clip_ratio/low_mean": 0.07386425042714109, "clip_ratio/low_min": 0.00857842579644057, "clip_ratio/region_mean": 0.0749130901062017, "epoch": 0.6133333333333333, "grad_norm": 0.7710053368163678, "learning_rate": 1.1755679795334832e-06, "loss": -0.0054, "step": 575 }, { "clip_ratio/high_max": 0.0010527579843255808, "clip_ratio/high_mean": 0.0004389151000623315, "clip_ratio/low_mean": 0.03251540275982734, "clip_ratio/low_min": 0.0039565700695675336, "clip_ratio/region_mean": 0.03295431772326083, "completions/clipped_ratio": 0.36197916666666663, "completions/max_length": 3072.0, "completions/max_terminated_length": 2867.0, "completions/mean_length": 1440.078125, "completions/mean_terminated_length": 514.2122192382812, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.6186666666666667, "grad_norm": 0.639734234583531, "learning_rate": 1.1483373566863454e-06, "loss": -0.003, "num_tokens": 17547306.0, "reward": 0.4166666865348816, "reward_std": 0.17140084505081177, "rewards/accuracy_reward/mean": 0.4166666567325592, "rewards/accuracy_reward/std": 0.49364984035491943, "step": 580 }, { "clip_ratio/high_max": 0.0019358875941179576, "clip_ratio/high_mean": 0.0007530939723437768, "clip_ratio/low_mean": 0.09050396416905641, "clip_ratio/low_min": 0.01571962712187087, "clip_ratio/region_mean": 0.09125705805263351, "epoch": 0.624, "grad_norm": 0.5629758986739761, "learning_rate": 1.1212288289505494e-06, "loss": -0.0032, "step": 585 }, { "clip_ratio/high_max": 0.0026918316245428285, "clip_ratio/high_mean": 0.0010609514944917465, "clip_ratio/low_mean": 0.04968106818860178, "clip_ratio/low_min": 0.0063101926651143005, "clip_ratio/region_mean": 0.050742019769313626, "epoch": 0.6293333333333333, "grad_norm": 0.4388288097077303, "learning_rate": 1.0942518082405401e-06, "loss": -0.0043, "step": 590 }, { "clip_ratio/high_max": 0.0013274217692014645, "clip_ratio/high_mean": 0.0005198746566065892, "clip_ratio/low_mean": 0.033042849905200455, "clip_ratio/low_min": 0.003937181099354347, "clip_ratio/region_mean": 0.03356272436935796, "completions/clipped_ratio": 0.3828125, "completions/max_length": 3072.0, "completions/max_terminated_length": 3005.0, "completions/mean_length": 1495.268310546875, "completions/mean_terminated_length": 517.2953491210938, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.6346666666666667, "grad_norm": 0.8959974753174467, "learning_rate": 1.0674156608123294e-06, "loss": -0.0031, "num_tokens": 18172216.0, "reward": 0.3697916865348816, "reward_std": 0.19846415519714355, "rewards/accuracy_reward/mean": 0.3697916567325592, "rewards/accuracy_reward/std": 0.48337799310684204, "step": 595 }, { "clip_ratio/high_max": 0.0010681611448035255, "clip_ratio/high_mean": 0.00043812101841922413, "clip_ratio/low_mean": 0.1413875142949564, "clip_ratio/low_min": 0.011108658365446899, "clip_ratio/region_mean": 0.14182563485233005, "epoch": 0.64, "grad_norm": 0.5779401941272009, "learning_rate": 1.040729704011591e-06, "loss": -0.0052, "step": 600 }, { "clip_ratio/high_max": 0.0018927435025034357, "clip_ratio/high_mean": 0.000789131971578172, "clip_ratio/low_mean": 0.051331201497850996, "clip_ratio/low_min": 0.0021838081109308407, "clip_ratio/region_mean": 0.05212033281377444, "epoch": 0.6453333333333333, "grad_norm": 0.3212979020812657, "learning_rate": 1.0142032030387342e-06, "loss": -0.0063, "step": 605 }, { "clip_ratio/high_max": 0.0014642723353972542, "clip_ratio/high_mean": 0.0006004483774404434, "clip_ratio/low_mean": 0.0844514217990195, "clip_ratio/low_min": 0.006942583447607831, "clip_ratio/region_mean": 0.08505186949799963, "completions/clipped_ratio": 0.4453125, "completions/max_length": 3072.0, "completions/max_terminated_length": 2720.0, "completions/mean_length": 1608.8099365234375, "completions/mean_terminated_length": 434.13616943359375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.6506666666666666, "grad_norm": 1.9809722402893128, "learning_rate": 9.878453677320847e-07, "loss": -0.0042, "num_tokens": 18840855.0, "reward": 0.3567708432674408, "reward_std": 0.18042194843292236, "rewards/accuracy_reward/mean": 0.3567708432674408, "rewards/accuracy_reward/std": 0.47967129945755005, "step": 610 }, { "clip_ratio/high_max": 0.0008442049128916552, "clip_ratio/high_mean": 0.0003902538691136215, "clip_ratio/low_mean": 0.09862613776404032, "clip_ratio/low_min": 0.0016155107969098026, "clip_ratio/region_mean": 0.09901639197364602, "epoch": 0.656, "grad_norm": 0.8472357185042572, "learning_rate": 9.616653493702824e-07, "loss": -0.0048, "step": 615 }, { "clip_ratio/high_max": 0.0019611586270457337, "clip_ratio/high_mean": 0.0008587868485165018, "clip_ratio/low_mean": 0.05741067223825667, "clip_ratio/low_min": 0.0026852591036004014, "clip_ratio/region_mean": 0.05826945892354161, "epoch": 0.6613333333333333, "grad_norm": 0.5117046686765813, "learning_rate": 9.356722374950166e-07, "loss": -0.0061, "step": 620 }, { "clip_ratio/high_max": 0.002149604147007267, "clip_ratio/high_mean": 0.0009095983814859209, "clip_ratio/low_mean": 0.09167227628931869, "clip_ratio/low_min": 0.006681809845031239, "clip_ratio/region_mean": 0.09258187450891456, "completions/clipped_ratio": 0.3046875, "completions/max_length": 3072.0, "completions/max_terminated_length": 2858.0, "completions/mean_length": 1257.174560546875, "completions/mean_terminated_length": 461.9138488769531, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.6666666666666666, "grad_norm": 2.425786140973197, "learning_rate": 9.098750567551911e-07, "loss": -0.0052, "num_tokens": 19374580.0, "reward": 0.4947916865348816, "reward_std": 0.18944305181503296, "rewards/accuracy_reward/mean": 0.4947916567325592, "rewards/accuracy_reward/std": 0.5006251335144043, "step": 625 }, { "clip_ratio/high_max": 0.000830297266929847, "clip_ratio/high_mean": 0.00033415651921586687, "clip_ratio/low_mean": 0.06716197767773338, "clip_ratio/low_min": 9.235515540240158e-05, "clip_ratio/region_mean": 0.06749613480951666, "epoch": 0.672, "grad_norm": 0.9484298187823065, "learning_rate": 8.842827637736218e-07, "loss": -0.004, "step": 630 }, { "clip_ratio/high_max": 0.0018289288962478167, "clip_ratio/high_mean": 0.0007641465137567139, "clip_ratio/low_mean": 0.11100487593212165, "clip_ratio/low_min": 0.00031114662961044813, "clip_ratio/region_mean": 0.11176902298202548, "epoch": 0.6773333333333333, "grad_norm": 0.8727976138927241, "learning_rate": 8.589042440373532e-07, "loss": -0.0057, "step": 635 }, { "clip_ratio/high_max": 0.0027268029616607237, "clip_ratio/high_mean": 0.001136750497698813, "clip_ratio/low_mean": 0.08060917256370885, "clip_ratio/low_min": 0.0003718588712217752, "clip_ratio/region_mean": 0.08174592283485253, "epoch": 0.6826666666666666, "grad_norm": 0.5766078331576147, "learning_rate": 8.337483088126709e-07, "loss": -0.0066, "step": 640 }, { "clip_ratio/high_max": 0.0005827984293773625, "clip_ratio/high_mean": 0.00025688486277886113, "clip_ratio/low_mean": 0.05894043798748498, "clip_ratio/low_min": 0.005774066379581199, "clip_ratio/region_mean": 0.05919732276574905, "completions/clipped_ratio": 0.328125, "completions/max_length": 3072.0, "completions/max_terminated_length": 3020.0, "completions/mean_length": 1318.158935546875, "completions/mean_terminated_length": 461.63177490234375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.688, "grad_norm": 0.9541467931677632, "learning_rate": 8.088236920858835e-07, "loss": -0.0043, "num_tokens": 19931807.0, "reward": 0.4505208432674408, "reward_std": 0.23003798723220825, "rewards/accuracy_reward/mean": 0.4505208432674408, "rewards/accuracy_reward/std": 0.4981948733329773, "step": 645 }, { "clip_ratio/high_max": 0.001986936363937275, "clip_ratio/high_mean": 0.000847096519828483, "clip_ratio/low_mean": 0.14992775106147746, "clip_ratio/low_min": 0.011248164915741654, "clip_ratio/region_mean": 0.15077484786647802, "epoch": 0.6933333333333334, "grad_norm": 0.8867605754079709, "learning_rate": 7.841390475309386e-07, "loss": -0.0069, "step": 650 }, { "clip_ratio/high_max": 0.00316539079631184, "clip_ratio/high_mean": 0.0013554359816453144, "clip_ratio/low_mean": 0.09539674111474597, "clip_ratio/low_min": 0.008034749040962196, "clip_ratio/region_mean": 0.09675217630315273, "epoch": 0.6986666666666667, "grad_norm": 0.585937135001212, "learning_rate": 7.59702945504917e-07, "loss": -0.0082, "step": 655 }, { "clip_ratio/high_max": 0.001076056590591179, "clip_ratio/high_mean": 0.00046281777099466126, "clip_ratio/low_mean": 0.03595257262679752, "clip_ratio/low_min": 0.0033627115863055224, "clip_ratio/region_mean": 0.036415390824072344, "completions/clipped_ratio": 0.40364583333333337, "completions/max_length": 3072.0, "completions/max_terminated_length": 3052.0, "completions/mean_length": 1557.5, "completions/mean_terminated_length": 532.4017333984375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.704, "grad_norm": 0.4021452838387261, "learning_rate": 7.355238700724594e-07, "loss": -0.0032, "num_tokens": 20583056.0, "reward": 0.328125, "reward_std": 0.16237977147102356, "rewards/accuracy_reward/mean": 0.328125, "rewards/accuracy_reward/std": 0.4701431691646576, "step": 660 }, { "clip_ratio/high_max": 0.001079712262708199, "clip_ratio/high_mean": 0.00043690134943972223, "clip_ratio/low_mean": 0.08768696378065215, "clip_ratio/low_min": 0.00046466768653772304, "clip_ratio/region_mean": 0.08812386506187977, "epoch": 0.7093333333333334, "grad_norm": 0.6927789997119669, "learning_rate": 7.116102160601505e-07, "loss": -0.0039, "step": 665 }, { "clip_ratio/high_max": 0.0055742011104484845, "clip_ratio/high_mean": 0.002020412324782228, "clip_ratio/low_mean": 0.04127223357927505, "clip_ratio/low_min": 0.00041812123090494423, "clip_ratio/region_mean": 0.043292645750898376, "epoch": 0.7146666666666667, "grad_norm": 0.7477229660302649, "learning_rate": 6.879702861418883e-07, "loss": -0.0051, "step": 670 }, { "clip_ratio/high_max": 0.0035863295405306416, "clip_ratio/high_mean": 0.0012852680132709794, "clip_ratio/low_mean": 0.010531688855007815, "clip_ratio/low_min": 0.00014545066296705044, "clip_ratio/region_mean": 0.011816956915754417, "completions/clipped_ratio": 0.3515625, "completions/max_length": 3072.0, "completions/max_terminated_length": 2841.0, "completions/mean_length": 1393.580810546875, "completions/mean_terminated_length": 483.5943603515625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.72, "grad_norm": 0.5860206669895351, "learning_rate": 6.646122879562435e-07, "loss": -0.0035, "num_tokens": 21170931.0, "reward": 0.4088541865348816, "reward_std": 0.14884811639785767, "rewards/accuracy_reward/mean": 0.4120734930038452, "rewards/accuracy_reward/std": 0.492855429649353, "step": 675 }, { "clip_ratio/high_max": 0.0009973094324777776, "clip_ratio/high_mean": 0.00039315426054145066, "clip_ratio/low_mean": 0.10007340766924244, "clip_ratio/low_min": 0.0002103800798067823, "clip_ratio/region_mean": 0.10046656222098135, "epoch": 0.7253333333333334, "grad_norm": 0.8074078225979295, "learning_rate": 6.415443312568216e-07, "loss": -0.0039, "step": 680 }, { "clip_ratio/high_max": 0.0016584670654083312, "clip_ratio/high_mean": 0.0006752162811153539, "clip_ratio/low_mean": 0.06889854084874969, "clip_ratio/low_min": 0.0004659711688873358, "clip_ratio/region_mean": 0.06957375722172401, "epoch": 0.7306666666666667, "grad_norm": 0.6769497813333211, "learning_rate": 6.187744250966031e-07, "loss": -0.0049, "step": 685 }, { "clip_ratio/high_max": 0.0012265518032108958, "clip_ratio/high_mean": 0.0005000920038128242, "clip_ratio/low_mean": 0.01433550868707698, "clip_ratio/low_min": 0.00033823276971816084, "clip_ratio/region_mean": 0.014835600765945856, "completions/clipped_ratio": 0.35677083333333337, "completions/max_length": 3072.0, "completions/max_terminated_length": 2846.0, "completions/mean_length": 1384.9349365234375, "completions/mean_terminated_length": 449.1943359375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.736, "grad_norm": 1.4413347113710488, "learning_rate": 5.963104750472507e-07, "loss": -0.0035, "num_tokens": 21759137.0, "reward": 0.40625, "reward_std": 0.17591141164302826, "rewards/accuracy_reward/mean": 0.40625, "rewards/accuracy_reward/std": 0.4917730391025543, "step": 690 }, { "clip_ratio/high_max": 0.0007489337052902556, "clip_ratio/high_mean": 0.000333998258020074, "clip_ratio/low_mean": 0.08691675030531769, "clip_ratio/low_min": 0.009981548228824976, "clip_ratio/region_mean": 0.08725074709636829, "epoch": 0.7413333333333333, "grad_norm": 0.7644870125789504, "learning_rate": 5.741602804543429e-07, "loss": -0.0044, "step": 695 }, { "clip_ratio/high_max": 0.0014169166475767269, "clip_ratio/high_mean": 0.0006242913379082893, "clip_ratio/low_mean": 0.12265115290538234, "clip_ratio/low_min": 0.01748713767156005, "clip_ratio/region_mean": 0.12327544433337607, "epoch": 0.7466666666666667, "grad_norm": 0.39357302166506536, "learning_rate": 5.52331531729491e-07, "loss": -0.0048, "step": 700 }, { "clip_ratio/high_max": 0.0014489660134131555, "clip_ratio/high_mean": 0.000661603920525522, "clip_ratio/low_mean": 0.03896075397333334, "clip_ratio/low_min": 0.00547465393319726, "clip_ratio/region_mean": 0.03962235820581554, "completions/clipped_ratio": 0.37239583333333337, "completions/max_length": 3072.0, "completions/max_terminated_length": 2528.0, "completions/mean_length": 1422.921875, "completions/mean_terminated_length": 444.4232482910156, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.752, "grad_norm": 1.9412485659664167, "learning_rate": 5.308318076802728e-07, "loss": -0.0044, "num_tokens": 22359041.0, "reward": 0.359375, "reward_std": 0.15786920487880707, "rewards/accuracy_reward/mean": 0.359375, "rewards/accuracy_reward/std": 0.48044323921203613, "step": 705 }, { "clip_ratio/high_max": 0.0007247125194680848, "clip_ratio/high_mean": 0.00027143629274632984, "clip_ratio/low_mean": 0.05364509501459906, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.05391653134358876, "epoch": 0.7573333333333333, "grad_norm": 0.7222728998679089, "learning_rate": 5.096685728789175e-07, "loss": -0.0032, "step": 710 }, { "clip_ratio/high_max": 0.0014950132370358915, "clip_ratio/high_mean": 0.0005910961566542028, "clip_ratio/low_mean": 0.10265407566794238, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.10324517158198887, "epoch": 0.7626666666666667, "grad_norm": 0.4633426798803331, "learning_rate": 4.888491750706547e-07, "loss": -0.0043, "step": 715 }, { "clip_ratio/high_max": 0.002107685565260908, "clip_ratio/high_mean": 0.0008398284502845854, "clip_ratio/low_mean": 0.0432122963889924, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.044052125114035334, "epoch": 0.768, "grad_norm": 0.41803527778600374, "learning_rate": 4.6838084262261776e-07, "loss": -0.005, "step": 720 }, { "clip_ratio/high_max": 0.00041984336608038577, "clip_ratio/high_mean": 0.00019650415636078832, "clip_ratio/low_mean": 0.04591512881638664, "clip_ratio/low_min": 6.778326842322712e-05, "clip_ratio/region_mean": 0.04611163287759155, "completions/clipped_ratio": 0.37760416666666663, "completions/max_length": 3072.0, "completions/max_terminated_length": 2920.0, "completions/mean_length": 1489.59375, "completions/mean_terminated_length": 529.5564575195312, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.7733333333333333, "grad_norm": 0.8659513679632751, "learning_rate": 4.4827068201420486e-07, "loss": -0.0031, "num_tokens": 22990775.0, "reward": 0.3697916865348816, "reward_std": 0.19846415519714355, "rewards/accuracy_reward/mean": 0.3697916567325592, "rewards/accuracy_reward/std": 0.48337799310684204, "step": 725 }, { "clip_ratio/high_max": 0.001005418406384706, "clip_ratio/high_mean": 0.00047414670361831666, "clip_ratio/low_mean": 0.11322668755210544, "clip_ratio/low_min": 0.00025525990640744566, "clip_ratio/region_mean": 0.11370083442614032, "epoch": 0.7786666666666666, "grad_norm": 0.4135469161651368, "learning_rate": 4.2852567536974705e-07, "loss": -0.0051, "step": 730 }, { "clip_ratio/high_max": 0.001429002207260055, "clip_ratio/high_mean": 0.0007002748416425675, "clip_ratio/low_mean": 0.048857789553039765, "clip_ratio/low_min": 0.00046797647955827416, "clip_ratio/region_mean": 0.04955806436651074, "epoch": 0.784, "grad_norm": 0.4551947793698413, "learning_rate": 4.0915267803436186e-07, "loss": -0.0058, "step": 735 }, { "clip_ratio/high_max": 0.0006173434506308694, "clip_ratio/high_mean": 0.00027514201754001987, "clip_ratio/low_mean": 0.025205485397964366, "clip_ratio/low_min": 0.0001580644282512367, "clip_ratio/region_mean": 0.025480627419983647, "completions/clipped_ratio": 0.34375, "completions/max_length": 3072.0, "completions/max_terminated_length": 2634.0, "completions/mean_length": 1392.3021240234375, "completions/mean_terminated_length": 512.4603271484375, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.7893333333333333, "grad_norm": 1.0212730742278187, "learning_rate": 3.901584161938172e-07, "loss": -0.0031, "num_tokens": 23582761.0, "reward": 0.390625, "reward_std": 0.21650634706020355, "rewards/accuracy_reward/mean": 0.390625, "rewards/accuracy_reward/std": 0.48852697014808655, "step": 740 }, { "clip_ratio/high_max": 0.0007365572171693202, "clip_ratio/high_mean": 0.00032287277390423695, "clip_ratio/low_mean": 0.09766255691401966, "clip_ratio/low_min": 9.147672917606542e-05, "clip_ratio/region_mean": 0.09798542929838731, "epoch": 0.7946666666666666, "grad_norm": 0.6815695249385727, "learning_rate": 3.715494845392418e-07, "loss": -0.0055, "step": 745 }, { "clip_ratio/high_max": 0.0015189904763246887, "clip_ratio/high_mean": 0.0006700149351672735, "clip_ratio/low_mean": 0.10994307422615748, "clip_ratio/low_min": 0.000157716667126806, "clip_ratio/region_mean": 0.11061308925236517, "epoch": 0.8, "grad_norm": 0.42485309519425085, "learning_rate": 3.5333234397748987e-07, "loss": -0.0061, "step": 750 }, { "clip_ratio/high_max": 0.0009807385106796573, "clip_ratio/high_mean": 0.0004041217109261197, "clip_ratio/low_mean": 0.032906752723215506, "clip_ratio/low_min": 7.219950043690915e-05, "clip_ratio/region_mean": 0.03331087448573271, "completions/clipped_ratio": 0.4296875, "completions/max_length": 3072.0, "completions/max_terminated_length": 3009.0, "completions/mean_length": 1602.557373046875, "completions/mean_terminated_length": 495.4429016113281, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.8053333333333333, "grad_norm": 1.5418792750257988, "learning_rate": 3.3551331938795246e-07, "loss": -0.0029, "num_tokens": 24251018.0, "reward": 0.3072916865348816, "reward_std": 0.17140087485313416, "rewards/accuracy_reward/mean": 0.3097112774848938, "rewards/accuracy_reward/std": 0.46298250555992126, "step": 755 }, { "clip_ratio/high_max": 0.000782373332140196, "clip_ratio/high_mean": 0.00031809655779397873, "clip_ratio/low_mean": 0.02570694254463888, "clip_ratio/low_min": 0.0012707602287264307, "clip_ratio/region_mean": 0.026025039234082215, "epoch": 0.8106666666666666, "grad_norm": 0.47629865778177605, "learning_rate": 3.1809859742659784e-07, "loss": -0.0035, "step": 760 }, { "clip_ratio/high_max": 0.001254349240434749, "clip_ratio/high_mean": 0.0005123108127008891, "clip_ratio/low_mean": 0.08953179615755288, "clip_ratio/low_min": 0.00363702202339482, "clip_ratio/region_mean": 0.09004410713714606, "epoch": 0.816, "grad_norm": 0.5912382827158039, "learning_rate": 3.0109422437800415e-07, "loss": -0.0047, "step": 765 }, { "clip_ratio/high_max": 0.0011374048554898763, "clip_ratio/high_mean": 0.0004622358239203095, "clip_ratio/low_mean": 0.05628275159278928, "clip_ratio/low_min": 0.002317304764164874, "clip_ratio/region_mean": 0.05674498771409162, "completions/clipped_ratio": 0.32291666666666663, "completions/max_length": 3072.0, "completions/max_terminated_length": 2898.0, "completions/mean_length": 1340.0677490234375, "completions/mean_terminated_length": 514.0692138671875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.8213333333333334, "grad_norm": 1.7047713081395146, "learning_rate": 2.8450610405612504e-07, "loss": -0.0029, "num_tokens": 24818215.0, "reward": 0.3984375, "reward_std": 0.18493250012397766, "rewards/accuracy_reward/mean": 0.3984375, "rewards/accuracy_reward/std": 0.49021512269973755, "step": 770 }, { "clip_ratio/high_max": 0.0008135521629810682, "clip_ratio/high_mean": 0.00032512739676349156, "clip_ratio/low_mean": 0.01960004798927457, "clip_ratio/low_min": 0.002381206746213138, "clip_ratio/region_mean": 0.019925175267690064, "epoch": 0.8266666666666667, "grad_norm": 0.6179039366076539, "learning_rate": 2.6833999575452256e-07, "loss": -0.0029, "step": 775 }, { "clip_ratio/high_max": 0.0012155366818660696, "clip_ratio/high_mean": 0.0004935097189445514, "clip_ratio/low_mean": 0.08312894122977924, "clip_ratio/low_min": 0.008879630558658392, "clip_ratio/region_mean": 0.08362245100802283, "epoch": 0.832, "grad_norm": 0.4915316548227809, "learning_rate": 2.526015122467751e-07, "loss": -0.0052, "step": 780 }, { "clip_ratio/high_max": 0.0013409282384600374, "clip_ratio/high_mean": 0.0005532631801543175, "clip_ratio/low_mean": 0.07124723757133325, "clip_ratio/low_min": 0.007868804596364498, "clip_ratio/region_mean": 0.07180050043730261, "completions/clipped_ratio": 0.22916666666666663, "completions/max_length": 3072.0, "completions/max_terminated_length": 2969.0, "completions/mean_length": 1131.59375, "completions/mean_terminated_length": 554.7162475585938, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.8373333333333334, "grad_norm": 1.3321336471330067, "learning_rate": 2.372961178377585e-07, "loss": -0.0044, "num_tokens": 25305109.0, "reward": 0.4270833432674408, "reward_std": 0.18944305181503296, "rewards/accuracy_reward/mean": 0.4270833432674408, "rewards/accuracy_reward/std": 0.49529990553855896, "step": 785 }, { "clip_ratio/high_max": 0.0006206725175616157, "clip_ratio/high_mean": 0.00026015501707661316, "clip_ratio/low_mean": 0.008438521537982524, "clip_ratio/low_min": 0.000779829935527232, "clip_ratio/region_mean": 0.00869867654701011, "epoch": 0.8426666666666667, "grad_norm": 0.6531696160217851, "learning_rate": 2.2242912646647086e-07, "loss": -0.0018, "step": 790 }, { "clip_ratio/high_max": 0.001018970776203787, "clip_ratio/high_mean": 0.0004580497719189225, "clip_ratio/low_mean": 0.03081533851709537, "clip_ratio/low_min": 0.0029533952358178793, "clip_ratio/region_mean": 0.03127338842064091, "epoch": 0.848, "grad_norm": 0.3751906016845087, "learning_rate": 2.080056998610662e-07, "loss": -0.0041, "step": 795 }, { "clip_ratio/high_max": 0.0015448625610588352, "clip_ratio/high_mean": 0.0006850655775679116, "clip_ratio/low_mean": 0.04506459243712015, "clip_ratio/low_min": 0.004078388711786829, "clip_ratio/region_mean": 0.045749658174827346, "epoch": 0.8533333333333334, "grad_norm": 0.4695474124723758, "learning_rate": 1.9403084574673463e-07, "loss": -0.0053, "step": 800 }, { "clip_ratio/high_max": 0.0005050916613072332, "clip_ratio/high_mean": 0.00020166085876098804, "clip_ratio/low_mean": 0.0018577357276058138, "clip_ratio/low_min": 0.00016674428952683228, "clip_ratio/region_mean": 0.002059396582944828, "completions/clipped_ratio": 0.33072916666666663, "completions/max_length": 3072.0, "completions/max_terminated_length": 3045.0, "completions/mean_length": 1399.46875, "completions/mean_terminated_length": 572.9649658203125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.8586666666666667, "grad_norm": 0.6037012055892774, "learning_rate": 1.8050941610705053e-07, "loss": -0.0014, "num_tokens": 25896268.0, "reward": 0.3932291865348816, "reward_std": 0.16689030826091766, "rewards/accuracy_reward/mean": 0.3932291567325592, "rewards/accuracy_reward/std": 0.4891042709350586, "step": 805 }, { "clip_ratio/high_max": 0.0009675620394773432, "clip_ratio/high_mean": 0.0003843064786906325, "clip_ratio/low_mean": 0.041351493953152386, "clip_ratio/low_min": 0.0031604571755451615, "clip_ratio/region_mean": 0.041735800434025805, "epoch": 0.864, "grad_norm": 0.551245538877653, "learning_rate": 1.6744610549939322e-07, "loss": -0.0036, "step": 810 }, { "clip_ratio/high_max": 0.0013038393575698138, "clip_ratio/high_mean": 0.0005415118384007655, "clip_ratio/low_mean": 0.05911404838752787, "clip_ratio/low_min": 0.00435080575698521, "clip_ratio/region_mean": 0.05965556106693839, "epoch": 0.8693333333333333, "grad_norm": 0.4368980079899866, "learning_rate": 1.5484544942502694e-07, "loss": -0.0042, "step": 815 }, { "clip_ratio/high_max": 0.000631665273795079, "clip_ratio/high_mean": 0.0002663155735490363, "clip_ratio/low_mean": 0.009732095834021948, "clip_ratio/low_min": 0.0006983812006637891, "clip_ratio/region_mean": 0.009998411218577984, "completions/clipped_ratio": 0.45833333333333337, "completions/max_length": 3072.0, "completions/max_terminated_length": 2926.0, "completions/mean_length": 1674.1796875, "completions/mean_terminated_length": 491.4086608886719, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.8746666666666667, "grad_norm": 1.1416194208715535, "learning_rate": 1.4271182275440077e-07, "loss": -0.0012, "num_tokens": 26590156.0, "reward": 0.375, "reward_std": 0.18493251502513885, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.4847545027732849, "step": 820 }, { "clip_ratio/high_max": 0.0005744179329667531, "clip_ratio/high_mean": 0.00024581232726177403, "clip_ratio/low_mean": 0.012403177239184516, "clip_ratio/low_min": 0.0010752059031801763, "clip_ratio/region_mean": 0.012648989548097234, "epoch": 0.88, "grad_norm": 0.6833069573238314, "learning_rate": 1.3104943820822195e-07, "loss": -0.003, "step": 825 }, { "clip_ratio/high_max": 0.0006950385980417195, "clip_ratio/high_mean": 0.00032368661154578147, "clip_ratio/low_mean": 0.027800189873278213, "clip_ratio/low_min": 0.0021251932398172356, "clip_ratio/region_mean": 0.028123876562790428, "epoch": 0.8853333333333333, "grad_norm": 0.46860752495394326, "learning_rate": 1.19862344894824e-07, "loss": -0.005, "step": 830 }, { "clip_ratio/high_max": 0.000570101764787978, "clip_ratio/high_mean": 0.0002450852077572563, "clip_ratio/low_mean": 0.014491262838419061, "clip_ratio/low_min": 0.0011566874243726488, "clip_ratio/region_mean": 0.014736347992015907, "completions/clipped_ratio": 0.45833333333333337, "completions/max_length": 3072.0, "completions/max_terminated_length": 2789.0, "completions/mean_length": 1646.0052490234375, "completions/mean_terminated_length": 439.3942565917969, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.8906666666666667, "grad_norm": 1.3190905108753403, "learning_rate": 1.0915442690434158e-07, "loss": -0.0026, "num_tokens": 27277590.0, "reward": 0.3776041865348816, "reward_std": 0.16237977147102356, "rewards/accuracy_reward/mean": 0.3776041567325592, "rewards/accuracy_reward/std": 0.4854203164577484, "step": 835 }, { "clip_ratio/high_max": 0.0004636599130662944, "clip_ratio/high_mean": 0.00018505510327031515, "clip_ratio/low_mean": 0.0023793916196609643, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002564446715655322, "epoch": 0.896, "grad_norm": 0.5144454176402644, "learning_rate": 9.89294019601783e-08, "loss": -0.0035, "step": 840 }, { "clip_ratio/high_max": 0.0006925672048510023, "clip_ratio/high_mean": 0.0002692331955131522, "clip_ratio/low_mean": 0.051089451654797814, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.05135868472113998, "epoch": 0.9013333333333333, "grad_norm": 0.3785744875642901, "learning_rate": 8.919082012823675e-08, "loss": -0.0055, "step": 845 }, { "clip_ratio/high_max": 0.0005612557333279256, "clip_ratio/high_mean": 0.00022511526822199812, "clip_ratio/low_mean": 0.05285697572344361, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.053082091317901356, "completions/clipped_ratio": 0.49739583333333337, "completions/max_length": 3072.0, "completions/max_terminated_length": 2202.0, "completions/mean_length": 1775.5599365234375, "completions/mean_terminated_length": 492.55438232421875, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.9066666666666666, "grad_norm": 2.0873724719217313, "learning_rate": 7.994206258435576e-08, "loss": -0.0034, "num_tokens": 28019702.0, "reward": 0.3229166865348816, "reward_std": 0.18944305181503296, "rewards/accuracy_reward/mean": 0.3229166567325592, "rewards/accuracy_reward/std": 0.4682011902332306, "step": 850 }, { "clip_ratio/high_max": 0.00029916751777818716, "clip_ratio/high_mean": 0.0001475981790918013, "clip_ratio/low_mean": 0.001186627710058019, "clip_ratio/low_min": 0.00011110947721135745, "clip_ratio/region_mean": 0.0013342259027012914, "epoch": 0.912, "grad_norm": 1.0407231217678266, "learning_rate": 7.118634044038774e-08, "loss": -0.0026, "step": 855 }, { "clip_ratio/high_max": 0.00039375339788421114, "clip_ratio/high_mean": 0.00018916932601769075, "clip_ratio/low_mean": 0.03520357923448501, "clip_ratio/low_min": 0.004140487466065679, "clip_ratio/region_mean": 0.03539274867489439, "epoch": 0.9173333333333333, "grad_norm": 0.45381290542031755, "learning_rate": 6.292669362932102e-08, "loss": -0.006, "step": 860 }, { "clip_ratio/high_max": 0.00035454770923024623, "clip_ratio/high_mean": 0.00016210461997161473, "clip_ratio/low_mean": 0.07062106373780352, "clip_ratio/low_min": 0.007939100095973118, "clip_ratio/region_mean": 0.07078316801207621, "completions/clipped_ratio": 0.44270833333333337, "completions/max_length": 3072.0, "completions/max_terminated_length": 1226.0, "completions/mean_length": 1589.7578125, "completions/mean_terminated_length": 412.27569580078125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.9226666666666666, "grad_norm": 4.141924692668672, "learning_rate": 5.516598984983279e-08, "loss": -0.0052, "num_tokens": 28679183.0, "reward": 0.421875, "reward_std": 0.21650634706020355, "rewards/accuracy_reward/mean": 0.421875, "rewards/accuracy_reward/std": 0.49450308084487915, "step": 865 }, { "clip_ratio/high_max": 0.000624493149234695, "clip_ratio/high_mean": 0.0002377218058654762, "clip_ratio/low_mean": 0.0001582941215019673, "clip_ratio/low_min": 1.0437911896588048e-05, "clip_ratio/region_mean": 0.00039601592551434804, "epoch": 0.928, "grad_norm": 2.2930601082340862, "learning_rate": 4.7906923570641695e-08, "loss": -0.0012, "step": 870 }, { "clip_ratio/high_max": 0.0006761565578017326, "clip_ratio/high_mean": 0.00026434433050326334, "clip_ratio/low_mean": 0.030481814167160337, "clip_ratio/low_min": 0.005796816466227028, "clip_ratio/region_mean": 0.030746158631654908, "epoch": 0.9333333333333333, "grad_norm": 0.8992889394471054, "learning_rate": 4.115201509500582e-08, "loss": -0.0045, "step": 875 }, { "clip_ratio/high_max": 0.000803725753303297, "clip_ratio/high_mean": 0.0003107299557996157, "clip_ratio/low_mean": 0.07673020867616742, "clip_ratio/low_min": 0.01428089960518264, "clip_ratio/region_mean": 0.07704093885442945, "epoch": 0.9386666666666666, "grad_norm": 0.7063355389077242, "learning_rate": 3.490360968568801e-08, "loss": -0.0057, "step": 880 }, { "clip_ratio/high_max": 0.000321737700551239, "clip_ratio/high_mean": 0.00013240401619896147, "clip_ratio/low_mean": 9.871758469444103e-05, "clip_ratio/low_min": 7.3583020366641e-06, "clip_ratio/region_mean": 0.00023112160295113426, "completions/clipped_ratio": 0.45572916666666663, "completions/max_length": 3072.0, "completions/max_terminated_length": 1561.0, "completions/mean_length": 1630.3203125, "completions/mean_terminated_length": 423.1722412109375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.944, "grad_norm": 1.3963210543511064, "learning_rate": 2.9163876750694986e-08, "loss": -0.0004, "num_tokens": 29356610.0, "reward": 0.4088541865348816, "reward_std": 0.18944305181503296, "rewards/accuracy_reward/mean": 0.4088541567325592, "rewards/accuracy_reward/std": 0.4922636151313782, "step": 885 }, { "clip_ratio/high_max": 0.00047408624395757216, "clip_ratio/high_mean": 0.0001940367142424293, "clip_ratio/low_mean": 0.0007887816320703677, "clip_ratio/low_min": 3.8721164401067654e-05, "clip_ratio/region_mean": 0.0009828183332047046, "epoch": 0.9493333333333334, "grad_norm": 0.8769065798401435, "learning_rate": 2.393480909007306e-08, "loss": -0.0024, "step": 890 }, { "clip_ratio/high_max": 0.00048624616024426357, "clip_ratio/high_mean": 0.00020660463337662804, "clip_ratio/low_mean": 0.010686800639496142, "clip_ratio/low_min": 0.00031385584370582367, "clip_ratio/region_mean": 0.01089340531725611, "epoch": 0.9546666666666667, "grad_norm": 0.6275720965009554, "learning_rate": 1.9218222204019087e-08, "loss": -0.0036, "step": 895 }, { "clip_ratio/high_max": 0.0004055778117162845, "clip_ratio/high_mean": 0.00016178353553186753, "clip_ratio/low_mean": 0.004519703159144228, "clip_ratio/low_min": 0.00015490951009269338, "clip_ratio/region_mean": 0.004681486522804335, "completions/clipped_ratio": 0.5026041666666667, "completions/max_length": 3072.0, "completions/max_terminated_length": 1485.0, "completions/mean_length": 1746.0703125, "completions/mean_terminated_length": 406.2565612792969, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.96, "grad_norm": 1.208676533660295, "learning_rate": 1.5015753662550813e-08, "loss": -0.0006, "num_tokens": 30079757.0, "reward": 0.3645833432674408, "reward_std": 0.18944305181503296, "rewards/accuracy_reward/mean": 0.3645833432674408, "rewards/accuracy_reward/std": 0.4819410443305969, "step": 900 }, { "clip_ratio/high_max": 0.0005658225985371246, "clip_ratio/high_mean": 0.00022134660496249127, "clip_ratio/low_mean": 0.00010698251877556686, "clip_ratio/low_min": 7.589406050101388e-06, "clip_ratio/region_mean": 0.00032832912474987095, "epoch": 0.9653333333333334, "grad_norm": 1.006027597401978, "learning_rate": 1.1328862536952033e-08, "loss": -0.0001, "step": 905 }, { "clip_ratio/high_max": 0.0005919153693866974, "clip_ratio/high_mean": 0.0002289530081952762, "clip_ratio/low_mean": 0.00044369055427750936, "clip_ratio/low_min": 5.2475231223070294e-05, "clip_ratio/region_mean": 0.0006726435575615142, "epoch": 0.9706666666666667, "grad_norm": 0.872969738517768, "learning_rate": 8.158828893192471e-09, "loss": -0.0008, "step": 910 }, { "clip_ratio/high_max": 0.00041595316642997206, "clip_ratio/high_mean": 0.0001653546375564474, "clip_ratio/low_mean": 0.0005628693277003549, "clip_ratio/low_min": 6.853139302620548e-05, "clip_ratio/region_mean": 0.0007282239555024717, "completions/clipped_ratio": 0.49479166666666663, "completions/max_length": 3072.0, "completions/max_terminated_length": 1542.0, "completions/mean_length": 1724.955810546875, "completions/mean_terminated_length": 405.685546875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.976, "grad_norm": 0.8603118662401141, "learning_rate": 5.506753347496285e-09, "loss": -0.0003, "num_tokens": 30796554.0, "reward": 0.3541666865348816, "reward_std": 0.16237977147102356, "rewards/accuracy_reward/mean": 0.3541666567325592, "rewards/accuracy_reward/std": 0.4788839817047119, "step": 915 }, { "clip_ratio/high_max": 0.000490554867155879, "clip_ratio/high_mean": 0.00018546660405718284, "clip_ratio/low_mean": 9.097458115547852e-05, "clip_ratio/low_min": 2.169093113479903e-06, "clip_ratio/region_mean": 0.00027644118495118164, "epoch": 0.9813333333333333, "grad_norm": 0.8176942638369722, "learning_rate": 3.37355668421524e-09, "loss": 0.0002, "step": 920 }, { "clip_ratio/high_max": 0.00045186152710812165, "clip_ratio/high_mean": 0.0001695981521379508, "clip_ratio/low_mean": 9.007803339500242e-05, "clip_ratio/low_min": 1.269035510631511e-06, "clip_ratio/region_mean": 0.0002596761846916706, "epoch": 0.9866666666666667, "grad_norm": 0.7952807414107727, "learning_rate": 1.7599795361376015e-09, "loss": 0.0, "step": 925 }, { "clip_ratio/high_max": 0.0005326888999661605, "clip_ratio/high_mean": 0.00019618853552098395, "clip_ratio/low_mean": 9.789040247672649e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002940789374861197, "epoch": 0.9898666666666667, "step": 928, "total_flos": 0.0, "train_loss": -0.003415438665073472, "train_runtime": 22035.1649, "train_samples_per_second": 0.34, "train_steps_per_second": 0.043 } ], "logging_steps": 5, "max_steps": 937, "num_input_tokens_seen": 30796554, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }