diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,15707 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.374772190851782, + "eval_steps": 100, + "global_step": 1566, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio": 0.0, + "completion_length": 562.7477951049805, + "epoch": 0.0019121030145499089, + "grad_norm": 0.022798627614974976, + "kl": 0.0, + "learning_rate": 1.5923566878980894e-08, + "loss": 0.002, + "num_tokens": 3752220.0, + "reward": 0.011439732770668343, + "reward_std": 0.019404857070185244, + "rewards/pure_accuracy_reward_math": 0.011439732537837699, + "step": 1 + }, + { + "clip_ratio": 0.0, + "epoch": 0.0038242060290998177, + "grad_norm": 0.02280641719698906, + "kl": 0.0, + "learning_rate": 3.184713375796179e-08, + "loss": 0.002, + "step": 2 + }, + { + "clip_ratio": 7.760171627069212e-05, + "epoch": 0.005736309043649726, + "grad_norm": 0.02249608002603054, + "kl": 0.00034177303314208984, + "learning_rate": 4.777070063694268e-08, + "loss": 0.002, + "step": 3 + }, + { + "clip_ratio": 7.010291557207893e-05, + "epoch": 0.0076484120581996355, + "grad_norm": 0.022546162828803062, + "kl": 0.0003476440906524658, + "learning_rate": 6.369426751592358e-08, + "loss": 0.002, + "step": 4 + }, + { + "clip_ratio": 6.121935876990392e-05, + "epoch": 0.009560515072749545, + "grad_norm": 0.022293007001280785, + "kl": 0.00034675002098083496, + "learning_rate": 7.961783439490447e-08, + "loss": 0.002, + "step": 5 + }, + { + "clip_ratio": 0.0, + "completion_length": 569.8786544799805, + "epoch": 0.011472618087299453, + "grad_norm": 0.04666038230061531, + "kl": 0.000458449125289917, + "learning_rate": 9.554140127388536e-08, + "loss": 0.0036, + "num_tokens": 7526881.0, + "reward": 0.010323661233996972, + "reward_std": 0.01923220051685348, + "rewards/pure_accuracy_reward_math": 0.01032366111758165, + "step": 6 + }, + { + "clip_ratio": 9.284320668712098e-05, + "epoch": 0.013384721101849363, + "grad_norm": 0.03701707720756531, + "kl": 0.0004444718360900879, + "learning_rate": 1.1146496815286625e-07, + "loss": 0.0037, + "step": 7 + }, + { + "clip_ratio": 0.00010049525423028172, + "epoch": 0.015296824116399271, + "grad_norm": 0.05443934351205826, + "kl": 0.0004649162292480469, + "learning_rate": 1.2738853503184715e-07, + "loss": 0.0037, + "step": 8 + }, + { + "clip_ratio": 9.395023369052069e-05, + "epoch": 0.01720892713094918, + "grad_norm": 0.0357414111495018, + "kl": 0.0004501640796661377, + "learning_rate": 1.4331210191082803e-07, + "loss": 0.0037, + "step": 9 + }, + { + "clip_ratio": 0.00010371651984542041, + "epoch": 0.01912103014549909, + "grad_norm": 0.05199029669165611, + "kl": 0.0004614591598510742, + "learning_rate": 1.5923566878980893e-07, + "loss": 0.0037, + "step": 10 + }, + { + "clip_ratio": 0.0, + "completion_length": 570.0694994926453, + "epoch": 0.021033133160048997, + "grad_norm": 0.022845298051834106, + "kl": 0.00035685300827026367, + "learning_rate": 1.751592356687898e-07, + "loss": 0.0025, + "num_tokens": 11302358.0, + "reward": 0.00948660756694153, + "reward_std": 0.017558093182742596, + "rewards/pure_accuracy_reward_math": 0.00948660756694153, + "step": 11 + }, + { + "clip_ratio": 7.08361723127382e-05, + "epoch": 0.022945236174598906, + "grad_norm": 0.02234972082078457, + "kl": 0.0003580451011657715, + "learning_rate": 1.9108280254777072e-07, + "loss": 0.0025, + "step": 12 + }, + { + "clip_ratio": 6.80922717606336e-05, + "epoch": 0.024857339189148814, + "grad_norm": 0.021554963663220406, + "kl": 0.00035765767097473145, + "learning_rate": 2.070063694267516e-07, + "loss": 0.0024, + "step": 13 + }, + { + "clip_ratio": 7.82350492158912e-05, + "epoch": 0.026769442203698725, + "grad_norm": 0.02103673666715622, + "kl": 0.000364154577255249, + "learning_rate": 2.229299363057325e-07, + "loss": 0.0025, + "step": 14 + }, + { + "clip_ratio": 7.339451894949889e-05, + "epoch": 0.028681545218248634, + "grad_norm": 0.023219415917992592, + "kl": 0.00036078691482543945, + "learning_rate": 2.3885350318471343e-07, + "loss": 0.0025, + "step": 15 + }, + { + "clip_ratio": 0.0, + "completion_length": 560.8797726631165, + "epoch": 0.030593648232798542, + "grad_norm": 0.024746257811784744, + "kl": 0.0003574490547180176, + "learning_rate": 2.547770700636943e-07, + "loss": 0.0041, + "num_tokens": 15044695.0, + "reward": 0.011160714813740924, + "reward_std": 0.0194911856087856, + "rewards/pure_accuracy_reward_math": 0.011160714755533263, + "step": 16 + }, + { + "clip_ratio": 9.0199953319825e-05, + "epoch": 0.032505751247348454, + "grad_norm": 0.02409624680876732, + "kl": 0.0003629624843597412, + "learning_rate": 2.707006369426752e-07, + "loss": 0.0042, + "step": 17 + }, + { + "clip_ratio": 8.157364351291108e-05, + "epoch": 0.03441785426189836, + "grad_norm": 0.023118698969483376, + "kl": 0.0003673136234283447, + "learning_rate": 2.8662420382165606e-07, + "loss": 0.0041, + "step": 18 + }, + { + "clip_ratio": 9.048881202033954e-05, + "epoch": 0.03632995727644827, + "grad_norm": 0.02316245064139366, + "kl": 0.00036725401878356934, + "learning_rate": 3.02547770700637e-07, + "loss": 0.0041, + "step": 19 + }, + { + "clip_ratio": 8.188984941170929e-05, + "epoch": 0.03824206029099818, + "grad_norm": 0.021714523434638977, + "kl": 0.0003698766231536865, + "learning_rate": 3.1847133757961787e-07, + "loss": 0.0041, + "step": 20 + }, + { + "clip_ratio": 0.0, + "completion_length": 554.6908774375916, + "epoch": 0.040154163305548086, + "grad_norm": 0.021168457344174385, + "kl": 0.000368267297744751, + "learning_rate": 3.3439490445859875e-07, + "loss": 0.0026, + "num_tokens": 18758275.0, + "reward": 0.010044643335277215, + "reward_std": 0.018202457285951823, + "rewards/pure_accuracy_reward_math": 0.010044643277069554, + "step": 21 + }, + { + "clip_ratio": 7.562077911416054e-05, + "epoch": 0.042066266320097995, + "grad_norm": 0.020001132041215897, + "kl": 0.00037425756454467773, + "learning_rate": 3.503184713375796e-07, + "loss": 0.0026, + "step": 22 + }, + { + "clip_ratio": 7.507880479806772e-05, + "epoch": 0.0439783693346479, + "grad_norm": 0.019386926665902138, + "kl": 0.0003781616687774658, + "learning_rate": 3.6624203821656055e-07, + "loss": 0.0026, + "step": 23 + }, + { + "clip_ratio": 7.805726602327923e-05, + "epoch": 0.04589047234919781, + "grad_norm": 0.018619129434227943, + "kl": 0.0003878176212310791, + "learning_rate": 3.8216560509554143e-07, + "loss": 0.0026, + "step": 24 + }, + { + "clip_ratio": 6.671031508176384e-05, + "epoch": 0.04780257536374772, + "grad_norm": 0.01833859272301197, + "kl": 0.00040024518966674805, + "learning_rate": 3.980891719745223e-07, + "loss": 0.0026, + "step": 25 + }, + { + "clip_ratio": 0.0, + "completion_length": 552.0828938484192, + "epoch": 0.04971467837829763, + "grad_norm": 0.02587960660457611, + "kl": 0.00041344761848449707, + "learning_rate": 4.140127388535032e-07, + "loss": 0.0024, + "num_tokens": 22468764.0, + "reward": 0.012276786321308464, + "reward_std": 0.022195036057382822, + "rewards/pure_accuracy_reward_math": 0.012276786204893142, + "step": 26 + }, + { + "clip_ratio": 9.613389988771814e-05, + "epoch": 0.05162678139284754, + "grad_norm": 0.02422533929347992, + "kl": 0.00043016672134399414, + "learning_rate": 4.2993630573248406e-07, + "loss": 0.0024, + "step": 27 + }, + { + "clip_ratio": 8.45099556840978e-05, + "epoch": 0.05353888440739745, + "grad_norm": 0.023998353630304337, + "kl": 0.0004411041736602783, + "learning_rate": 4.45859872611465e-07, + "loss": 0.0024, + "step": 28 + }, + { + "clip_ratio": 9.715859295056362e-05, + "epoch": 0.05545098742194736, + "grad_norm": 0.023024486377835274, + "kl": 0.0004749894142150879, + "learning_rate": 4.6178343949044587e-07, + "loss": 0.0024, + "step": 29 + }, + { + "clip_ratio": 9.816014483021718e-05, + "epoch": 0.05736309043649727, + "grad_norm": 0.022171439602971077, + "kl": 0.0005015134811401367, + "learning_rate": 4.777070063694269e-07, + "loss": 0.0024, + "step": 30 + }, + { + "clip_ratio": 0.0, + "completion_length": 549.791042804718, + "epoch": 0.059275193451047176, + "grad_norm": 0.027614159509539604, + "kl": 0.0005223453044891357, + "learning_rate": 4.936305732484077e-07, + "loss": 0.0029, + "num_tokens": 26170579.0, + "reward": 0.017299108090810478, + "reward_std": 0.03018019301816821, + "rewards/pure_accuracy_reward_math": 0.017299107741564512, + "step": 31 + }, + { + "clip_ratio": 0.00012569415866892086, + "epoch": 0.061187296465597084, + "grad_norm": 0.02653171494603157, + "kl": 0.0005522072315216064, + "learning_rate": 5.095541401273886e-07, + "loss": 0.0029, + "step": 32 + }, + { + "clip_ratio": 0.00012863677034147258, + "epoch": 0.06309939948014699, + "grad_norm": 0.0255680400878191, + "kl": 0.0005916953086853027, + "learning_rate": 5.254777070063695e-07, + "loss": 0.0029, + "step": 33 + }, + { + "clip_ratio": 0.00012797017114962728, + "epoch": 0.06501150249469691, + "grad_norm": 0.02455417811870575, + "kl": 0.0006306171417236328, + "learning_rate": 5.414012738853504e-07, + "loss": 0.0029, + "step": 34 + }, + { + "clip_ratio": 0.00012855784757448419, + "epoch": 0.06692360550924681, + "grad_norm": 0.024154040962457657, + "kl": 0.0006751418113708496, + "learning_rate": 5.573248407643312e-07, + "loss": 0.0029, + "step": 35 + }, + { + "clip_ratio": 0.0, + "completion_length": 552.728542804718, + "epoch": 0.06883570852379672, + "grad_norm": 0.023450903594493866, + "kl": 0.000738978385925293, + "learning_rate": 5.732484076433121e-07, + "loss": 0.0034, + "num_tokens": 29883398.0, + "reward": 0.018415179511066526, + "reward_std": 0.030997214023955166, + "rewards/pure_accuracy_reward_math": 0.01841517922002822, + "step": 36 + }, + { + "clip_ratio": 0.00012425195308196635, + "epoch": 0.07074781153834662, + "grad_norm": 0.023070134222507477, + "kl": 0.0007783770561218262, + "learning_rate": 5.89171974522293e-07, + "loss": 0.0034, + "step": 37 + }, + { + "clip_ratio": 0.00012334759713894528, + "epoch": 0.07265991455289654, + "grad_norm": 0.023447532206773758, + "kl": 0.0008447170257568359, + "learning_rate": 6.05095541401274e-07, + "loss": 0.0034, + "step": 38 + }, + { + "clip_ratio": 0.00012615493608336692, + "epoch": 0.07457201756744644, + "grad_norm": 0.024682210758328438, + "kl": 0.0009213089942932129, + "learning_rate": 6.210191082802549e-07, + "loss": 0.0034, + "step": 39 + }, + { + "clip_ratio": 0.00012461718182521508, + "epoch": 0.07648412058199636, + "grad_norm": 0.02555885910987854, + "kl": 0.000977635383605957, + "learning_rate": 6.369426751592357e-07, + "loss": 0.0033, + "step": 40 + }, + { + "clip_ratio": 0.0, + "completion_length": 527.030158996582, + "epoch": 0.07839622359654626, + "grad_norm": 0.059237755835056305, + "kl": 0.001125633716583252, + "learning_rate": 6.528662420382166e-07, + "loss": 0.0031, + "num_tokens": 33502406.0, + "reward": 0.024832590454025194, + "reward_std": 0.04194520629243925, + "rewards/pure_accuracy_reward_math": 0.02483259010477923, + "step": 41 + }, + { + "clip_ratio": 0.00016323180295785278, + "epoch": 0.08030832661109617, + "grad_norm": 0.029172642156481743, + "kl": 0.0011183619499206543, + "learning_rate": 6.687898089171975e-07, + "loss": 0.0031, + "step": 42 + }, + { + "clip_ratio": 0.0001751068371618203, + "epoch": 0.08222042962564609, + "grad_norm": 0.030453085899353027, + "kl": 0.0011813640594482422, + "learning_rate": 6.847133757961784e-07, + "loss": 0.0031, + "step": 43 + }, + { + "clip_ratio": 0.00018521026674989116, + "epoch": 0.08413253264019599, + "grad_norm": 0.03091653250157833, + "kl": 0.0012224912643432617, + "learning_rate": 7.006369426751592e-07, + "loss": 0.0031, + "step": 44 + }, + { + "clip_ratio": 0.00017049979595640252, + "epoch": 0.0860446356547459, + "grad_norm": 0.030593233183026314, + "kl": 0.0012733936309814453, + "learning_rate": 7.165605095541401e-07, + "loss": 0.0031, + "step": 45 + }, + { + "clip_ratio": 0.0, + "completion_length": 529.7360739707947, + "epoch": 0.0879567386692958, + "grad_norm": 0.03371572494506836, + "kl": 0.0012704133987426758, + "learning_rate": 7.324840764331211e-07, + "loss": 0.0039, + "num_tokens": 37133676.0, + "reward": 0.029017858760198578, + "reward_std": 0.04838265001308173, + "rewards/pure_accuracy_reward_math": 0.029017858061706647, + "step": 46 + }, + { + "clip_ratio": 0.000227557278265067, + "epoch": 0.08986884168384572, + "grad_norm": 0.033185359090566635, + "kl": 0.0012688040733337402, + "learning_rate": 7.48407643312102e-07, + "loss": 0.0039, + "step": 47 + }, + { + "clip_ratio": 0.0002238695693677073, + "epoch": 0.09178094469839562, + "grad_norm": 0.03329231217503548, + "kl": 0.0013200044631958008, + "learning_rate": 7.643312101910829e-07, + "loss": 0.0039, + "step": 48 + }, + { + "clip_ratio": 0.00021458888153347289, + "epoch": 0.09369304771294554, + "grad_norm": 0.03329336270689964, + "kl": 0.0013244152069091797, + "learning_rate": 7.802547770700637e-07, + "loss": 0.0039, + "step": 49 + }, + { + "clip_ratio": 0.0002193794426830209, + "epoch": 0.09560515072749544, + "grad_norm": 0.0323607362806797, + "kl": 0.0013269782066345215, + "learning_rate": 7.961783439490446e-07, + "loss": 0.0039, + "step": 50 + }, + { + "clip_ratio": 0.0, + "completion_length": 535.9352931976318, + "epoch": 0.09751725374204535, + "grad_norm": 0.030199358239769936, + "kl": 0.0013506412506103516, + "learning_rate": 8.121019108280255e-07, + "loss": 0.0042, + "num_tokens": 40789896.0, + "reward": 0.030970983498264104, + "reward_std": 0.0486878992523998, + "rewards/pure_accuracy_reward_math": 0.030970983090810478, + "step": 51 + }, + { + "clip_ratio": 0.00019589511845197194, + "epoch": 0.09942935675659526, + "grad_norm": 0.029786745086312294, + "kl": 0.001370549201965332, + "learning_rate": 8.280254777070064e-07, + "loss": 0.0042, + "step": 52 + }, + { + "clip_ratio": 0.00021279048064570816, + "epoch": 0.10134145977114517, + "grad_norm": 0.029834378510713577, + "kl": 0.0013399124145507812, + "learning_rate": 8.439490445859872e-07, + "loss": 0.0042, + "step": 53 + }, + { + "clip_ratio": 0.000190277668878025, + "epoch": 0.10325356278569509, + "grad_norm": 0.029410598799586296, + "kl": 0.00139617919921875, + "learning_rate": 8.598726114649681e-07, + "loss": 0.0042, + "step": 54 + }, + { + "clip_ratio": 0.00019459096591845082, + "epoch": 0.10516566580024499, + "grad_norm": 0.02935440093278885, + "kl": 0.0014204978942871094, + "learning_rate": 8.757961783439491e-07, + "loss": 0.0042, + "step": 55 + }, + { + "clip_ratio": 0.0, + "completion_length": 536.9548239707947, + "epoch": 0.1070777688147949, + "grad_norm": 0.02805081568658352, + "kl": 0.0014168024063110352, + "learning_rate": 8.9171974522293e-07, + "loss": 0.0048, + "num_tokens": 44444894.0, + "reward": 0.027901787223527208, + "reward_std": 0.04121451411629096, + "rewards/pure_accuracy_reward_math": 0.02790178669965826, + "step": 56 + }, + { + "clip_ratio": 0.00016821016617996065, + "epoch": 0.1089898718293448, + "grad_norm": 0.02779608964920044, + "kl": 0.0014551877975463867, + "learning_rate": 9.076433121019109e-07, + "loss": 0.0048, + "step": 57 + }, + { + "clip_ratio": 0.00018197509814399382, + "epoch": 0.11090197484389472, + "grad_norm": 0.02721741609275341, + "kl": 0.0014206171035766602, + "learning_rate": 9.235668789808917e-07, + "loss": 0.0048, + "step": 58 + }, + { + "clip_ratio": 0.00016919344039934003, + "epoch": 0.11281407785844462, + "grad_norm": 0.02676265314221382, + "kl": 0.0014575719833374023, + "learning_rate": 9.394904458598727e-07, + "loss": 0.0048, + "step": 59 + }, + { + "clip_ratio": 0.00017069062050723005, + "epoch": 0.11472618087299453, + "grad_norm": 0.027010478079319, + "kl": 0.0014843940734863281, + "learning_rate": 9.554140127388537e-07, + "loss": 0.0048, + "step": 60 + }, + { + "clip_ratio": 0.0, + "completion_length": 522.1998043060303, + "epoch": 0.11663828388754444, + "grad_norm": 0.030231643468141556, + "kl": 0.0015106201171875, + "learning_rate": 9.713375796178345e-07, + "loss": 0.0029, + "num_tokens": 48046694.0, + "reward": 0.02762276935391128, + "reward_std": 0.04623683576937765, + "rewards/pure_accuracy_reward_math": 0.02762276877183467, + "step": 61 + }, + { + "clip_ratio": 0.0001882643781527804, + "epoch": 0.11855038690209435, + "grad_norm": 0.030413959175348282, + "kl": 0.0015065670013427734, + "learning_rate": 9.872611464968155e-07, + "loss": 0.0029, + "step": 62 + }, + { + "clip_ratio": 0.00019050979824442038, + "epoch": 0.12046248991664425, + "grad_norm": 0.029997214674949646, + "kl": 0.0014984607696533203, + "learning_rate": 1.0031847133757962e-06, + "loss": 0.0029, + "step": 63 + }, + { + "clip_ratio": 0.0001963579389325787, + "epoch": 0.12237459293119417, + "grad_norm": 0.02927768975496292, + "kl": 0.0014634132385253906, + "learning_rate": 1.0191082802547772e-06, + "loss": 0.0029, + "step": 64 + }, + { + "clip_ratio": 0.0002130206620449826, + "epoch": 0.12428669594574408, + "grad_norm": 0.028719380497932434, + "kl": 0.0014470815658569336, + "learning_rate": 1.035031847133758e-06, + "loss": 0.0029, + "step": 65 + }, + { + "clip_ratio": 0.0, + "completion_length": 516.4283137321472, + "epoch": 0.12619879896029398, + "grad_norm": 0.031215306371450424, + "kl": 0.0014127492904663086, + "learning_rate": 1.050955414012739e-06, + "loss": 0.0038, + "num_tokens": 51628501.0, + "reward": 0.03487723405123688, + "reward_std": 0.05173706269124523, + "rewards/pure_accuracy_reward_math": 0.03487723323632963, + "step": 66 + }, + { + "clip_ratio": 0.00019433782460964721, + "epoch": 0.1281109019748439, + "grad_norm": 0.03108724020421505, + "kl": 0.0014324188232421875, + "learning_rate": 1.06687898089172e-06, + "loss": 0.0038, + "step": 67 + }, + { + "clip_ratio": 0.00020085336353758976, + "epoch": 0.13002300498939381, + "grad_norm": 0.030220478773117065, + "kl": 0.0014306306838989258, + "learning_rate": 1.0828025477707007e-06, + "loss": 0.0038, + "step": 68 + }, + { + "clip_ratio": 0.00021161197844321578, + "epoch": 0.1319351080039437, + "grad_norm": 0.030320733785629272, + "kl": 0.001450181007385254, + "learning_rate": 1.0987261146496817e-06, + "loss": 0.0038, + "step": 69 + }, + { + "clip_ratio": 0.00019352555551677142, + "epoch": 0.13384721101849362, + "grad_norm": 0.02980073168873787, + "kl": 0.0014796257019042969, + "learning_rate": 1.1146496815286625e-06, + "loss": 0.0038, + "step": 70 + }, + { + "clip_ratio": 0.0, + "completion_length": 525.2854599952698, + "epoch": 0.13575931403304353, + "grad_norm": 0.0338500440120697, + "kl": 0.0015006065368652344, + "learning_rate": 1.1305732484076435e-06, + "loss": 0.006, + "num_tokens": 55247180.0, + "reward": 0.03710937674622983, + "reward_std": 0.05426825548056513, + "rewards/pure_accuracy_reward_math": 0.037109375989530236, + "step": 71 + }, + { + "clip_ratio": 0.0002256608086668166, + "epoch": 0.13767141704759345, + "grad_norm": 0.03328324109315872, + "kl": 0.0015664100646972656, + "learning_rate": 1.1464968152866242e-06, + "loss": 0.006, + "step": 72 + }, + { + "clip_ratio": 0.0002166868289350532, + "epoch": 0.13958352006214333, + "grad_norm": 0.03267475962638855, + "kl": 0.0016113519668579102, + "learning_rate": 1.1624203821656052e-06, + "loss": 0.006, + "step": 73 + }, + { + "clip_ratio": 0.00024709346627105333, + "epoch": 0.14149562307669325, + "grad_norm": 0.032320376485586166, + "kl": 0.0017037391662597656, + "learning_rate": 1.178343949044586e-06, + "loss": 0.006, + "step": 74 + }, + { + "clip_ratio": 0.00021453456992048814, + "epoch": 0.14340772609124317, + "grad_norm": 0.0322573184967041, + "kl": 0.0017703771591186523, + "learning_rate": 1.194267515923567e-06, + "loss": 0.006, + "step": 75 + }, + { + "clip_ratio": 0.0, + "completion_length": 539.314199924469, + "epoch": 0.14531982910579308, + "grad_norm": 0.03833702206611633, + "kl": 0.0018039941787719727, + "learning_rate": 1.210191082802548e-06, + "loss": 0.0055, + "num_tokens": 58912938.0, + "reward": 0.04045759144355543, + "reward_std": 0.060838291130494326, + "rewards/pure_accuracy_reward_math": 0.040457590454025194, + "step": 76 + }, + { + "clip_ratio": 0.0002450900424548763, + "epoch": 0.147231932120343, + "grad_norm": 0.03705858439207077, + "kl": 0.0018303394317626953, + "learning_rate": 1.2261146496815287e-06, + "loss": 0.0055, + "step": 77 + }, + { + "clip_ratio": 0.0002520209266094753, + "epoch": 0.14914403513489288, + "grad_norm": 0.03624257072806358, + "kl": 0.0019118785858154297, + "learning_rate": 1.2420382165605097e-06, + "loss": 0.0055, + "step": 78 + }, + { + "clip_ratio": 0.00023157394139161624, + "epoch": 0.1510561381494428, + "grad_norm": 0.03626013919711113, + "kl": 0.001949906349182129, + "learning_rate": 1.2579617834394905e-06, + "loss": 0.0055, + "step": 79 + }, + { + "clip_ratio": 0.0002889583781211513, + "epoch": 0.1529682411639927, + "grad_norm": 0.03634464740753174, + "kl": 0.001984238624572754, + "learning_rate": 1.2738853503184715e-06, + "loss": 0.0055, + "step": 80 + }, + { + "clip_ratio": 0.0, + "completion_length": 531.2025918960571, + "epoch": 0.15488034417854263, + "grad_norm": 0.032439347356557846, + "kl": 0.0019190311431884766, + "learning_rate": 1.2898089171974522e-06, + "loss": 0.0067, + "num_tokens": 62551992.0, + "reward": 0.03766741280560382, + "reward_std": 0.0572711571585387, + "rewards/pure_accuracy_reward_math": 0.0376674119324889, + "step": 81 + }, + { + "clip_ratio": 0.00025730342139240747, + "epoch": 0.15679244719309252, + "grad_norm": 0.03198026493191719, + "kl": 0.001917123794555664, + "learning_rate": 1.3057324840764332e-06, + "loss": 0.0067, + "step": 82 + }, + { + "clip_ratio": 0.0002504205738205201, + "epoch": 0.15870455020764243, + "grad_norm": 0.02998184598982334, + "kl": 0.0019073486328125, + "learning_rate": 1.3216560509554142e-06, + "loss": 0.0067, + "step": 83 + }, + { + "clip_ratio": 0.00025362581419585695, + "epoch": 0.16061665322219235, + "grad_norm": 0.029601849615573883, + "kl": 0.0019354820251464844, + "learning_rate": 1.337579617834395e-06, + "loss": 0.0067, + "step": 84 + }, + { + "clip_ratio": 0.0003167184295307379, + "epoch": 0.16252875623674226, + "grad_norm": 0.030052170157432556, + "kl": 0.0019598007202148438, + "learning_rate": 1.353503184713376e-06, + "loss": 0.0067, + "step": 85 + }, + { + "clip_ratio": 0.0, + "completion_length": 527.9562168121338, + "epoch": 0.16444085925129218, + "grad_norm": 0.03331635147333145, + "kl": 0.002047300338745117, + "learning_rate": 1.3694267515923567e-06, + "loss": 0.0076, + "num_tokens": 66182275.0, + "reward": 0.04045759132714011, + "reward_std": 0.06074576545506716, + "rewards/pure_accuracy_reward_math": 0.04045759068685584, + "step": 86 + }, + { + "clip_ratio": 0.0002471263709367122, + "epoch": 0.16635296226584206, + "grad_norm": 0.03298444300889969, + "kl": 0.0020711421966552734, + "learning_rate": 1.3853503184713377e-06, + "loss": 0.0076, + "step": 87 + }, + { + "clip_ratio": 0.00024866302578629984, + "epoch": 0.16826506528039198, + "grad_norm": 0.03206898272037506, + "kl": 0.0020384788513183594, + "learning_rate": 1.4012738853503185e-06, + "loss": 0.0076, + "step": 88 + }, + { + "clip_ratio": 0.00026278120321876486, + "epoch": 0.1701771682949419, + "grad_norm": 0.03115510568022728, + "kl": 0.002008795738220215, + "learning_rate": 1.4171974522292995e-06, + "loss": 0.0076, + "step": 89 + }, + { + "clip_ratio": 0.000245522400405207, + "epoch": 0.1720892713094918, + "grad_norm": 0.030577220022678375, + "kl": 0.0019922256469726562, + "learning_rate": 1.4331210191082802e-06, + "loss": 0.0076, + "step": 90 + }, + { + "clip_ratio": 0.0, + "completion_length": 523.3948340415955, + "epoch": 0.1740013743240417, + "grad_norm": 0.0348118431866169, + "kl": 0.0019588470458984375, + "learning_rate": 1.4490445859872612e-06, + "loss": 0.0046, + "num_tokens": 69793534.0, + "reward": 0.04436384132714011, + "reward_std": 0.059376906079705805, + "rewards/pure_accuracy_reward_math": 0.044363840570440516, + "step": 91 + }, + { + "clip_ratio": 0.00021377558331892033, + "epoch": 0.1759134773385916, + "grad_norm": 0.03493114933371544, + "kl": 0.0019345283508300781, + "learning_rate": 1.4649681528662422e-06, + "loss": 0.0046, + "step": 92 + }, + { + "clip_ratio": 0.00023636125789039397, + "epoch": 0.17782558035314153, + "grad_norm": 0.03362264111638069, + "kl": 0.0019860267639160156, + "learning_rate": 1.480891719745223e-06, + "loss": 0.0046, + "step": 93 + }, + { + "clip_ratio": 0.00022836430440520417, + "epoch": 0.17973768336769144, + "grad_norm": 0.03336656093597412, + "kl": 0.002032160758972168, + "learning_rate": 1.496815286624204e-06, + "loss": 0.0045, + "step": 94 + }, + { + "clip_ratio": 0.00024139108904819295, + "epoch": 0.18164978638224133, + "grad_norm": 0.03235051408410072, + "kl": 0.0021082162857055664, + "learning_rate": 1.5127388535031847e-06, + "loss": 0.0045, + "step": 95 + }, + { + "clip_ratio": 0.0, + "completion_length": 530.8058252334595, + "epoch": 0.18356188939679124, + "grad_norm": 0.03482769802212715, + "kl": 0.0021872520446777344, + "learning_rate": 1.5286624203821657e-06, + "loss": 0.0075, + "num_tokens": 73427974.0, + "reward": 0.04101562709547579, + "reward_std": 0.06101094774203375, + "rewards/pure_accuracy_reward_math": 0.04101562616415322, + "step": 96 + }, + { + "clip_ratio": 0.00024072786442275174, + "epoch": 0.18547399241134116, + "grad_norm": 0.03345990553498268, + "kl": 0.002261519432067871, + "learning_rate": 1.5445859872611465e-06, + "loss": 0.0075, + "step": 97 + }, + { + "clip_ratio": 0.00024480573915752757, + "epoch": 0.18738609542589107, + "grad_norm": 0.03318383917212486, + "kl": 0.0022890567779541016, + "learning_rate": 1.5605095541401275e-06, + "loss": 0.0075, + "step": 98 + }, + { + "clip_ratio": 0.00027489714915418517, + "epoch": 0.189298198440441, + "grad_norm": 0.03230712562799454, + "kl": 0.0023267269134521484, + "learning_rate": 1.5764331210191083e-06, + "loss": 0.0074, + "step": 99 + }, + { + "clip_ratio": 0.00029621877195040724, + "epoch": 0.19121030145499088, + "grad_norm": 0.03260359168052673, + "kl": 0.002334117889404297, + "learning_rate": 1.5923566878980892e-06, + "loss": 0.0074, + "step": 100 + }, + { + "clip_ratio": 0.0, + "completion_length": 526.358283996582, + "epoch": 0.0019121030145499089, + "grad_norm": 0.30763256549835205, + "kl": 0.0024461746215820312, + "learning_rate": 1.6082802547770702e-06, + "loss": 0.0053, + "num_tokens": 3621800.0, + "reward": 0.0546875023865141, + "reward_std": 0.06997958500869572, + "rewards/pure_accuracy_reward_math": 0.054687501629814506, + "step": 101 + }, + { + "clip_ratio": 0.00028505406811518696, + "epoch": 0.0038242060290998177, + "grad_norm": 0.7424792647361755, + "kl": 0.005189061164855957, + "learning_rate": 1.624203821656051e-06, + "loss": 0.0054, + "step": 102 + }, + { + "clip_ratio": 0.000307778484739174, + "epoch": 0.005736309043649726, + "grad_norm": 0.5747273564338684, + "kl": 0.005206584930419922, + "learning_rate": 1.640127388535032e-06, + "loss": 0.0054, + "step": 103 + }, + { + "clip_ratio": 0.0003712488735345687, + "epoch": 0.0076484120581996355, + "grad_norm": 0.15304483473300934, + "kl": 0.0026189088821411133, + "learning_rate": 1.6560509554140127e-06, + "loss": 0.0053, + "step": 104 + }, + { + "clip_ratio": 0.00037476027159755176, + "epoch": 0.009560515072749545, + "grad_norm": 0.2118157148361206, + "kl": 0.00246584415435791, + "learning_rate": 1.6719745222929937e-06, + "loss": 0.0053, + "step": 105 + }, + { + "clip_ratio": 0.0, + "completion_length": 527.46431016922, + "epoch": 0.011472618087299453, + "grad_norm": 0.2036779820919037, + "kl": 0.0037467479705810547, + "learning_rate": 1.6878980891719745e-06, + "loss": 0.0067, + "num_tokens": 7244448.0, + "reward": 0.05161830596625805, + "reward_std": 0.06822534638922662, + "rewards/pure_accuracy_reward_math": 0.05161830474389717, + "step": 106 + }, + { + "clip_ratio": 0.0002751678786125922, + "epoch": 0.013384721101849363, + "grad_norm": 0.1858554631471634, + "kl": 0.0035070180892944336, + "learning_rate": 1.7038216560509555e-06, + "loss": 0.0067, + "step": 107 + }, + { + "clip_ratio": 0.0002901391828800115, + "epoch": 0.015296824116399271, + "grad_norm": 0.06319136172533035, + "kl": 0.0033702850341796875, + "learning_rate": 1.7197452229299363e-06, + "loss": 0.0067, + "step": 108 + }, + { + "clip_ratio": 0.00029408001091724145, + "epoch": 0.01720892713094918, + "grad_norm": 0.061827220022678375, + "kl": 0.00351715087890625, + "learning_rate": 1.7356687898089172e-06, + "loss": 0.0067, + "step": 109 + }, + { + "clip_ratio": 0.0002710100695253459, + "epoch": 0.01912103014549909, + "grad_norm": 0.13167870044708252, + "kl": 0.0036835670471191406, + "learning_rate": 1.7515923566878982e-06, + "loss": 0.0067, + "step": 110 + }, + { + "clip_ratio": 0.0, + "completion_length": 515.4391984939575, + "epoch": 0.021033133160048997, + "grad_norm": 0.034568388015031815, + "kl": 0.0024957656860351562, + "learning_rate": 1.767515923566879e-06, + "loss": 0.0068, + "num_tokens": 10824130.0, + "reward": 0.0468750023865141, + "reward_std": 0.06221334758447483, + "rewards/pure_accuracy_reward_math": 0.04687500116415322, + "step": 111 + }, + { + "clip_ratio": 0.00025272632768746917, + "epoch": 0.022945236174598906, + "grad_norm": 0.03421744704246521, + "kl": 0.002499222755432129, + "learning_rate": 1.78343949044586e-06, + "loss": 0.0068, + "step": 112 + }, + { + "clip_ratio": 0.00025192988658773174, + "epoch": 0.024857339189148814, + "grad_norm": 0.03444651514291763, + "kl": 0.002528548240661621, + "learning_rate": 1.7993630573248407e-06, + "loss": 0.0068, + "step": 113 + }, + { + "clip_ratio": 0.0002639102876287325, + "epoch": 0.026769442203698725, + "grad_norm": 0.033966146409511566, + "kl": 0.0025298595428466797, + "learning_rate": 1.8152866242038217e-06, + "loss": 0.0067, + "step": 114 + }, + { + "clip_ratio": 0.0002613060296994263, + "epoch": 0.028681545218248634, + "grad_norm": 0.03252725675702095, + "kl": 0.0025829076766967773, + "learning_rate": 1.8312101910828025e-06, + "loss": 0.0067, + "step": 115 + }, + { + "clip_ratio": 0.0, + "completion_length": 511.59126138687134, + "epoch": 0.030593648232798542, + "grad_norm": 0.04161737114191055, + "kl": 0.002777099609375, + "learning_rate": 1.8471337579617835e-06, + "loss": 0.0084, + "num_tokens": 14389817.0, + "reward": 0.04464285934227519, + "reward_std": 0.0631567623349838, + "rewards/pure_accuracy_reward_math": 0.044642858469160274, + "step": 116 + }, + { + "clip_ratio": 0.0002685248994680478, + "epoch": 0.032505751247348454, + "grad_norm": 0.03920653462409973, + "kl": 0.002690911293029785, + "learning_rate": 1.8630573248407643e-06, + "loss": 0.0084, + "step": 117 + }, + { + "clip_ratio": 0.00028247613772691693, + "epoch": 0.03441785426189836, + "grad_norm": 0.037915512919425964, + "kl": 0.0026444196701049805, + "learning_rate": 1.8789808917197455e-06, + "loss": 0.0084, + "step": 118 + }, + { + "clip_ratio": 0.00028578577973803476, + "epoch": 0.03632995727644827, + "grad_norm": 0.03727024793624878, + "kl": 0.002573251724243164, + "learning_rate": 1.8949044585987264e-06, + "loss": 0.0083, + "step": 119 + }, + { + "clip_ratio": 0.0003107314861381383, + "epoch": 0.03824206029099818, + "grad_norm": 0.03734543174505234, + "kl": 0.002534151077270508, + "learning_rate": 1.9108280254777074e-06, + "loss": 0.0083, + "step": 120 + }, + { + "clip_ratio": 0.0, + "completion_length": 511.96654319763184, + "epoch": 0.040154163305548086, + "grad_norm": 18.524425506591797, + "kl": 0.05213046073913574, + "learning_rate": 1.926751592356688e-06, + "loss": 0.0067, + "num_tokens": 17950273.0, + "reward": 0.044642859254963696, + "reward_std": 0.0572310917195864, + "rewards/pure_accuracy_reward_math": 0.04464285838184878, + "step": 121 + }, + { + "clip_ratio": 0.00024330438452579983, + "epoch": 0.042066266320097995, + "grad_norm": 0.06961806118488312, + "kl": 0.0025354623794555664, + "learning_rate": 1.942675159235669e-06, + "loss": 0.0047, + "step": 122 + }, + { + "clip_ratio": 0.00023799908234423128, + "epoch": 0.0439783693346479, + "grad_norm": 0.038592379540205, + "kl": 0.0024437904357910156, + "learning_rate": 1.95859872611465e-06, + "loss": 0.0047, + "step": 123 + }, + { + "clip_ratio": 0.00023513944393016573, + "epoch": 0.04589047234919781, + "grad_norm": 0.036785636097192764, + "kl": 0.002588033676147461, + "learning_rate": 1.974522292993631e-06, + "loss": 0.0047, + "step": 124 + }, + { + "clip_ratio": 0.0002449645085107477, + "epoch": 0.04780257536374772, + "grad_norm": 0.03537231311202049, + "kl": 0.002721548080444336, + "learning_rate": 1.9904458598726117e-06, + "loss": 0.0047, + "step": 125 + }, + { + "clip_ratio": 0.0, + "completion_length": 513.618884563446, + "epoch": 0.04971467837829763, + "grad_norm": 0.03746291249990463, + "kl": 0.0026862621307373047, + "learning_rate": 2.0063694267515925e-06, + "loss": 0.0063, + "num_tokens": 21522907.0, + "reward": 0.04492187732830644, + "reward_std": 0.061436392075847834, + "rewards/pure_accuracy_reward_math": 0.04492187616415322, + "step": 126 + }, + { + "clip_ratio": 0.0002821582585283977, + "epoch": 0.05162678139284754, + "grad_norm": 0.036032602190971375, + "kl": 0.0027321577072143555, + "learning_rate": 2.0222929936305737e-06, + "loss": 0.0063, + "step": 127 + }, + { + "clip_ratio": 0.0002675421079629814, + "epoch": 0.05353888440739745, + "grad_norm": 0.03723033517599106, + "kl": 0.002848386764526367, + "learning_rate": 2.0382165605095544e-06, + "loss": 0.0062, + "step": 128 + }, + { + "clip_ratio": 0.00030748845301786787, + "epoch": 0.05545098742194736, + "grad_norm": 0.03697400540113449, + "kl": 0.002881765365600586, + "learning_rate": 2.054140127388535e-06, + "loss": 0.0062, + "step": 129 + }, + { + "clip_ratio": 0.0003087153630758621, + "epoch": 0.05736309043649727, + "grad_norm": 0.03756724298000336, + "kl": 0.002836942672729492, + "learning_rate": 2.070063694267516e-06, + "loss": 0.0062, + "step": 130 + }, + { + "clip_ratio": 0.0, + "completion_length": 518.1615762710571, + "epoch": 0.059275193451047176, + "grad_norm": 0.039371710270643234, + "kl": 0.00270843505859375, + "learning_rate": 2.085987261146497e-06, + "loss": 0.0064, + "num_tokens": 25111362.0, + "reward": 0.05106027016881853, + "reward_std": 0.06736206263303757, + "rewards/pure_accuracy_reward_math": 0.051060269062872976, + "step": 131 + }, + { + "clip_ratio": 0.0002896036380661826, + "epoch": 0.061187296465597084, + "grad_norm": 0.03780793026089668, + "kl": 0.0027250051498413086, + "learning_rate": 2.101910828025478e-06, + "loss": 0.0064, + "step": 132 + }, + { + "clip_ratio": 0.0002853632216783808, + "epoch": 0.06309939948014699, + "grad_norm": 0.03720535710453987, + "kl": 0.0027070045471191406, + "learning_rate": 2.1178343949044587e-06, + "loss": 0.0064, + "step": 133 + }, + { + "clip_ratio": 0.0002896762144928289, + "epoch": 0.06501150249469691, + "grad_norm": 0.036468133330345154, + "kl": 0.0027469396591186523, + "learning_rate": 2.13375796178344e-06, + "loss": 0.0064, + "step": 134 + }, + { + "clip_ratio": 0.0003120482754184195, + "epoch": 0.06692360550924681, + "grad_norm": 0.03586801886558533, + "kl": 0.002748727798461914, + "learning_rate": 2.1496815286624207e-06, + "loss": 0.0063, + "step": 135 + }, + { + "clip_ratio": 0.0, + "completion_length": 528.0351796150208, + "epoch": 0.06883570852379672, + "grad_norm": 0.03092282824218273, + "kl": 0.002766728401184082, + "learning_rate": 2.1656050955414015e-06, + "loss": 0.0056, + "num_tokens": 28735680.0, + "reward": 0.04017857348662801, + "reward_std": 0.05289319949224591, + "rewards/pure_accuracy_reward_math": 0.040178572438890114, + "step": 136 + }, + { + "clip_ratio": 0.00020221989311153266, + "epoch": 0.07074781153834662, + "grad_norm": 0.030703941360116005, + "kl": 0.0028089284896850586, + "learning_rate": 2.1815286624203822e-06, + "loss": 0.0056, + "step": 137 + }, + { + "clip_ratio": 0.00019867721590571819, + "epoch": 0.07265991455289654, + "grad_norm": 0.030248478055000305, + "kl": 0.0027884244918823242, + "learning_rate": 2.1974522292993634e-06, + "loss": 0.0056, + "step": 138 + }, + { + "clip_ratio": 0.00021304549886735913, + "epoch": 0.07457201756744644, + "grad_norm": 0.029539138078689575, + "kl": 0.002767205238342285, + "learning_rate": 2.213375796178344e-06, + "loss": 0.0056, + "step": 139 + }, + { + "clip_ratio": 0.00021535260020755231, + "epoch": 0.07648412058199636, + "grad_norm": 0.02955791726708412, + "kl": 0.002725839614868164, + "learning_rate": 2.229299363057325e-06, + "loss": 0.0055, + "step": 140 + }, + { + "clip_ratio": 0.0, + "completion_length": 525.7212834358215, + "epoch": 0.07839622359654626, + "grad_norm": 0.060058850795030594, + "kl": 0.0032591819763183594, + "learning_rate": 2.245222929936306e-06, + "loss": 0.0071, + "num_tokens": 32349997.0, + "reward": 0.048828127211891115, + "reward_std": 0.056028691527899355, + "rewards/pure_accuracy_reward_math": 0.04882812628056854, + "step": 141 + }, + { + "clip_ratio": 0.00022036872547914754, + "epoch": 0.08030832661109617, + "grad_norm": 0.03533012047410011, + "kl": 0.002978205680847168, + "learning_rate": 2.261146496815287e-06, + "loss": 0.0071, + "step": 142 + }, + { + "clip_ratio": 0.0002158615123448726, + "epoch": 0.08222042962564609, + "grad_norm": 0.029908612370491028, + "kl": 0.002841353416442871, + "learning_rate": 2.2770700636942677e-06, + "loss": 0.0071, + "step": 143 + }, + { + "clip_ratio": 0.0002112481060976279, + "epoch": 0.08413253264019599, + "grad_norm": 0.028638474643230438, + "kl": 0.002796173095703125, + "learning_rate": 2.2929936305732485e-06, + "loss": 0.0071, + "step": 144 + }, + { + "clip_ratio": 0.00022246911356660348, + "epoch": 0.0860446356547459, + "grad_norm": 0.02828238159418106, + "kl": 0.0027240514755249023, + "learning_rate": 2.3089171974522297e-06, + "loss": 0.007, + "step": 145 + }, + { + "clip_ratio": 0.0, + "completion_length": 534.0318322181702, + "epoch": 0.0879567386692958, + "grad_norm": 3.060509443283081, + "kl": 0.022321224212646484, + "learning_rate": 2.3248407643312104e-06, + "loss": 0.0062, + "num_tokens": 35996663.0, + "reward": 0.04436384144355543, + "reward_std": 0.06130999844754115, + "rewards/pure_accuracy_reward_math": 0.04436384039581753, + "step": 146 + }, + { + "clip_ratio": 0.00023404289771633557, + "epoch": 0.08986884168384572, + "grad_norm": 0.28904739022254944, + "kl": 0.004893064498901367, + "learning_rate": 2.3407643312101912e-06, + "loss": 0.0055, + "step": 147 + }, + { + "clip_ratio": 0.00024259101735424338, + "epoch": 0.09178094469839562, + "grad_norm": 0.03826431185007095, + "kl": 0.0027625560760498047, + "learning_rate": 2.356687898089172e-06, + "loss": 0.0054, + "step": 148 + }, + { + "clip_ratio": 0.0002517821457672653, + "epoch": 0.09369304771294554, + "grad_norm": 0.03572425991296768, + "kl": 0.002875208854675293, + "learning_rate": 2.372611464968153e-06, + "loss": 0.0054, + "step": 149 + }, + { + "clip_ratio": 0.00024034848578935453, + "epoch": 0.09560515072749544, + "grad_norm": 0.036431849002838135, + "kl": 0.0031164884567260742, + "learning_rate": 2.388535031847134e-06, + "loss": 0.0054, + "step": 150 + }, + { + "clip_ratio": 0.0, + "completion_length": 538.2168254852295, + "epoch": 0.09751725374204535, + "grad_norm": 0.03362743556499481, + "kl": 0.002684354782104492, + "learning_rate": 2.4044585987261147e-06, + "loss": 0.0027, + "num_tokens": 39661060.0, + "reward": 0.05133928792201914, + "reward_std": 0.06672389659797773, + "rewards/pure_accuracy_reward_math": 0.05133928681607358, + "step": 151 + }, + { + "clip_ratio": 0.0002668876670099962, + "epoch": 0.09942935675659526, + "grad_norm": 0.033922772854566574, + "kl": 0.002791762351989746, + "learning_rate": 2.420382165605096e-06, + "loss": 0.0027, + "step": 152 + }, + { + "clip_ratio": 0.0002435101382616267, + "epoch": 0.10134145977114517, + "grad_norm": 0.03526493161916733, + "kl": 0.002907991409301758, + "learning_rate": 2.4363057324840767e-06, + "loss": 0.0027, + "step": 153 + }, + { + "clip_ratio": 0.00025345294346834635, + "epoch": 0.10325356278569509, + "grad_norm": 0.034125424921512604, + "kl": 0.0029108524322509766, + "learning_rate": 2.4522292993630575e-06, + "loss": 0.0027, + "step": 154 + }, + { + "clip_ratio": 0.0002378649581942227, + "epoch": 0.10516566580024499, + "grad_norm": 0.033436987549066544, + "kl": 0.002874612808227539, + "learning_rate": 2.4681528662420382e-06, + "loss": 0.0027, + "step": 155 + }, + { + "clip_ratio": 0.0, + "completion_length": 541.2424907684326, + "epoch": 0.1070777688147949, + "grad_norm": 0.031592246145009995, + "kl": 0.002785801887512207, + "learning_rate": 2.4840764331210194e-06, + "loss": 0.005, + "num_tokens": 43331425.0, + "reward": 0.044363841181620955, + "reward_std": 0.05607495462754741, + "rewards/pure_accuracy_reward_math": 0.044363840599544346, + "step": 156 + }, + { + "clip_ratio": 0.00019312051063025137, + "epoch": 0.1089898718293448, + "grad_norm": 0.030642936006188393, + "kl": 0.0027495622634887695, + "learning_rate": 2.5e-06, + "loss": 0.005, + "step": 157 + }, + { + "clip_ratio": 0.0002267159566713417, + "epoch": 0.11090197484389472, + "grad_norm": 0.03025418519973755, + "kl": 0.002672433853149414, + "learning_rate": 2.515923566878981e-06, + "loss": 0.0049, + "step": 158 + }, + { + "clip_ratio": 0.00023296605036193796, + "epoch": 0.11281407785844462, + "grad_norm": 0.03024701401591301, + "kl": 0.0026074647903442383, + "learning_rate": 2.531847133757962e-06, + "loss": 0.0049, + "step": 159 + }, + { + "clip_ratio": 0.00024551542321660236, + "epoch": 0.11472618087299453, + "grad_norm": 0.03065372072160244, + "kl": 0.0025725364685058594, + "learning_rate": 2.547770700636943e-06, + "loss": 0.0049, + "step": 160 + }, + { + "clip_ratio": 0.0, + "completion_length": 531.590705871582, + "epoch": 0.11663828388754444, + "grad_norm": 0.03286377340555191, + "kl": 0.002618551254272461, + "learning_rate": 2.5636942675159237e-06, + "loss": 0.0032, + "num_tokens": 46966882.0, + "reward": 0.0401785735739395, + "reward_std": 0.05864621384534985, + "rewards/pure_accuracy_reward_math": 0.04017857258440927, + "step": 161 + }, + { + "clip_ratio": 0.000249601399104904, + "epoch": 0.11855038690209435, + "grad_norm": 0.03168044239282608, + "kl": 0.0025817155838012695, + "learning_rate": 2.5796178343949045e-06, + "loss": 0.0032, + "step": 162 + }, + { + "clip_ratio": 0.0002426054838338132, + "epoch": 0.12046248991664425, + "grad_norm": 0.03161012753844261, + "kl": 0.0025763511657714844, + "learning_rate": 2.5955414012738857e-06, + "loss": 0.0032, + "step": 163 + }, + { + "clip_ratio": 0.0002400714004124893, + "epoch": 0.12237459293119417, + "grad_norm": 0.031408168375492096, + "kl": 0.002588987350463867, + "learning_rate": 2.6114649681528665e-06, + "loss": 0.0032, + "step": 164 + }, + { + "clip_ratio": 0.00024877328468164706, + "epoch": 0.12428669594574408, + "grad_norm": 0.030564049258828163, + "kl": 0.0026369094848632812, + "learning_rate": 2.6273885350318472e-06, + "loss": 0.0031, + "step": 165 + }, + { + "clip_ratio": 0.0, + "completion_length": 525.5862393379211, + "epoch": 0.12619879896029398, + "grad_norm": 0.03767310827970505, + "kl": 0.0026383399963378906, + "learning_rate": 2.6433121019108284e-06, + "loss": 0.0062, + "num_tokens": 50581511.0, + "reward": 0.04966518055880442, + "reward_std": 0.06985319027444348, + "rewards/pure_accuracy_reward_math": 0.04966517968568951, + "step": 166 + }, + { + "clip_ratio": 0.0002872111982696879, + "epoch": 0.1281109019748439, + "grad_norm": 0.03578091412782669, + "kl": 0.0027115345001220703, + "learning_rate": 2.659235668789809e-06, + "loss": 0.0062, + "step": 167 + }, + { + "clip_ratio": 0.0002957127134664006, + "epoch": 0.13002300498939381, + "grad_norm": 0.03471493721008301, + "kl": 0.0028066635131835938, + "learning_rate": 2.67515923566879e-06, + "loss": 0.0062, + "step": 168 + }, + { + "clip_ratio": 0.0003112256898702981, + "epoch": 0.1319351080039437, + "grad_norm": 0.035491716116666794, + "kl": 0.0028966665267944336, + "learning_rate": 2.6910828025477707e-06, + "loss": 0.0062, + "step": 169 + }, + { + "clip_ratio": 0.0003354581235726073, + "epoch": 0.13384721101849362, + "grad_norm": 0.03574714809656143, + "kl": 0.0029289722442626953, + "learning_rate": 2.707006369426752e-06, + "loss": 0.0061, + "step": 170 + }, + { + "clip_ratio": 0.0, + "completion_length": 527.8571667671204, + "epoch": 0.13575931403304353, + "grad_norm": 0.03648287057876587, + "kl": 0.0030307769775390625, + "learning_rate": 2.7229299363057327e-06, + "loss": 0.0061, + "num_tokens": 54209407.0, + "reward": 0.05161830587894656, + "reward_std": 0.06465821276651695, + "rewards/pure_accuracy_reward_math": 0.05161830494762398, + "step": 171 + }, + { + "clip_ratio": 0.0002587431810354701, + "epoch": 0.13767141704759345, + "grad_norm": 0.03615426644682884, + "kl": 0.0030341148376464844, + "learning_rate": 2.7388535031847135e-06, + "loss": 0.0061, + "step": 172 + }, + { + "clip_ratio": 0.0002548517101104153, + "epoch": 0.13958352006214333, + "grad_norm": 0.03565597161650658, + "kl": 0.002932310104370117, + "learning_rate": 2.7547770700636942e-06, + "loss": 0.0061, + "step": 173 + }, + { + "clip_ratio": 0.00027394448250106507, + "epoch": 0.14149562307669325, + "grad_norm": 0.035612594336271286, + "kl": 0.0029175281524658203, + "learning_rate": 2.7707006369426754e-06, + "loss": 0.0061, + "step": 174 + }, + { + "clip_ratio": 0.00027776476230201297, + "epoch": 0.14340772609124317, + "grad_norm": 0.036588992923498154, + "kl": 0.002942800521850586, + "learning_rate": 2.786624203821656e-06, + "loss": 0.006, + "step": 175 + }, + { + "clip_ratio": 0.0, + "completion_length": 529.2430500984192, + "epoch": 0.14531982910579308, + "grad_norm": 0.03312006592750549, + "kl": 0.0028772354125976562, + "learning_rate": 2.802547770700637e-06, + "loss": 0.0056, + "num_tokens": 57839070.0, + "reward": 0.04854910931317136, + "reward_std": 0.05881887051509693, + "rewards/pure_accuracy_reward_math": 0.048549108614679426, + "step": 176 + }, + { + "clip_ratio": 0.00022063881021949783, + "epoch": 0.147231932120343, + "grad_norm": 0.0327099934220314, + "kl": 0.002942681312561035, + "learning_rate": 2.818471337579618e-06, + "loss": 0.0056, + "step": 177 + }, + { + "clip_ratio": 0.00021944492368675128, + "epoch": 0.14914403513489288, + "grad_norm": 0.03261202201247215, + "kl": 0.002986431121826172, + "learning_rate": 2.834394904458599e-06, + "loss": 0.0056, + "step": 178 + }, + { + "clip_ratio": 0.0002127133307396889, + "epoch": 0.1510561381494428, + "grad_norm": 0.03220335766673088, + "kl": 0.002970457077026367, + "learning_rate": 2.8503184713375797e-06, + "loss": 0.0056, + "step": 179 + }, + { + "clip_ratio": 0.0001991192841614975, + "epoch": 0.1529682411639927, + "grad_norm": 0.03179548308253288, + "kl": 0.0029560327529907227, + "learning_rate": 2.8662420382165605e-06, + "loss": 0.0056, + "step": 180 + }, + { + "clip_ratio": 0.0, + "completion_length": 522.1275358200073, + "epoch": 0.15488034417854263, + "grad_norm": 0.030966561287641525, + "kl": 0.0029218196868896484, + "learning_rate": 2.8821656050955417e-06, + "loss": 0.0048, + "num_tokens": 61445599.0, + "reward": 0.04352678795112297, + "reward_std": 0.05598862626357004, + "rewards/pure_accuracy_reward_math": 0.043526787078008056, + "step": 181 + }, + { + "clip_ratio": 0.00021554413663693595, + "epoch": 0.15679244719309252, + "grad_norm": 0.030419446527957916, + "kl": 0.0029065608978271484, + "learning_rate": 2.8980891719745225e-06, + "loss": 0.0048, + "step": 182 + }, + { + "clip_ratio": 0.0002025423377176594, + "epoch": 0.15870455020764243, + "grad_norm": 0.030062729492783546, + "kl": 0.0028995275497436523, + "learning_rate": 2.9140127388535032e-06, + "loss": 0.0048, + "step": 183 + }, + { + "clip_ratio": 0.00023064417456453157, + "epoch": 0.16061665322219235, + "grad_norm": 0.029301613569259644, + "kl": 0.002888321876525879, + "learning_rate": 2.9299363057324844e-06, + "loss": 0.0048, + "step": 184 + }, + { + "clip_ratio": 0.0002338091023261768, + "epoch": 0.16252875623674226, + "grad_norm": 0.029127391055226326, + "kl": 0.0028772354125976562, + "learning_rate": 2.945859872611465e-06, + "loss": 0.0047, + "step": 185 + }, + { + "clip_ratio": 0.0, + "completion_length": 533.0510892868042, + "epoch": 0.16444085925129218, + "grad_norm": 0.036479271948337555, + "kl": 0.002923727035522461, + "learning_rate": 2.961783439490446e-06, + "loss": 0.0063, + "num_tokens": 65094142.0, + "reward": 0.05022321717115119, + "reward_std": 0.06538890500087291, + "rewards/pure_accuracy_reward_math": 0.050223215424921364, + "step": 186 + }, + { + "clip_ratio": 0.00026048495129771254, + "epoch": 0.16635296226584206, + "grad_norm": 0.036232370883226395, + "kl": 0.0029561519622802734, + "learning_rate": 2.9777070063694267e-06, + "loss": 0.0063, + "step": 187 + }, + { + "clip_ratio": 0.0002226464382033555, + "epoch": 0.16826506528039198, + "grad_norm": 0.03523917496204376, + "kl": 0.003048419952392578, + "learning_rate": 2.993630573248408e-06, + "loss": 0.0063, + "step": 188 + }, + { + "clip_ratio": 0.0002362887615845466, + "epoch": 0.1701771682949419, + "grad_norm": 0.03477315977215767, + "kl": 0.003025054931640625, + "learning_rate": 3.0095541401273887e-06, + "loss": 0.0062, + "step": 189 + }, + { + "clip_ratio": 0.00023160997727700305, + "epoch": 0.1720892713094918, + "grad_norm": 0.03342609107494354, + "kl": 0.0030221939086914062, + "learning_rate": 3.0254777070063695e-06, + "loss": 0.0062, + "step": 190 + }, + { + "clip_ratio": 0.0, + "completion_length": 529.7776494026184, + "epoch": 0.1740013743240417, + "grad_norm": 0.03668810427188873, + "kl": 0.0029752254486083984, + "learning_rate": 3.0414012738853503e-06, + "loss": 0.0066, + "num_tokens": 68728277.0, + "reward": 0.04994419863214716, + "reward_std": 0.06135626137256622, + "rewards/pure_accuracy_reward_math": 0.04994419787544757, + "step": 191 + }, + { + "clip_ratio": 0.0002391185845453947, + "epoch": 0.1759134773385916, + "grad_norm": 0.035618141293525696, + "kl": 0.0029642581939697266, + "learning_rate": 3.0573248407643314e-06, + "loss": 0.0066, + "step": 192 + }, + { + "clip_ratio": 0.00024402707180115613, + "epoch": 0.17782558035314153, + "grad_norm": 0.032588809728622437, + "kl": 0.002981424331665039, + "learning_rate": 3.0732484076433122e-06, + "loss": 0.0066, + "step": 193 + }, + { + "clip_ratio": 0.0002546731577126593, + "epoch": 0.17973768336769144, + "grad_norm": 0.0323190875351429, + "kl": 0.0030133724212646484, + "learning_rate": 3.089171974522293e-06, + "loss": 0.0066, + "step": 194 + }, + { + "clip_ratio": 0.0002784079450179888, + "epoch": 0.18164978638224133, + "grad_norm": 0.03181909769773483, + "kl": 0.002997159957885742, + "learning_rate": 3.105095541401274e-06, + "loss": 0.0065, + "step": 195 + }, + { + "clip_ratio": 0.0, + "completion_length": 536.7226796150208, + "epoch": 0.18356188939679124, + "grad_norm": 0.034835390746593475, + "kl": 0.003053426742553711, + "learning_rate": 3.121019108280255e-06, + "loss": 0.0053, + "num_tokens": 72383923.0, + "reward": 0.04352678789291531, + "reward_std": 0.06164911447558552, + "rewards/pure_accuracy_reward_math": 0.043526787078008056, + "step": 196 + }, + { + "clip_ratio": 0.00022759345233680506, + "epoch": 0.18547399241134116, + "grad_norm": 0.03316686674952507, + "kl": 0.003064870834350586, + "learning_rate": 3.1369426751592357e-06, + "loss": 0.0053, + "step": 197 + }, + { + "clip_ratio": 0.00024183520912401946, + "epoch": 0.18738609542589107, + "grad_norm": 0.0329214446246624, + "kl": 0.003040313720703125, + "learning_rate": 3.1528662420382165e-06, + "loss": 0.0053, + "step": 198 + }, + { + "clip_ratio": 0.0002539973459079192, + "epoch": 0.189298198440441, + "grad_norm": 0.031231405213475227, + "kl": 0.0030624866485595703, + "learning_rate": 3.1687898089171977e-06, + "loss": 0.0052, + "step": 199 + }, + { + "clip_ratio": 0.0002776768195076329, + "epoch": 0.19121030145499088, + "grad_norm": 0.031124714761972427, + "kl": 0.0030813217163085938, + "learning_rate": 3.1847133757961785e-06, + "loss": 0.0052, + "step": 200 + }, + { + "clip_ratio": 0.0, + "completion_length": 522.4048767089844, + "epoch": 0.1931224044695408, + "grad_norm": 0.03386811539530754, + "kl": 0.003122568130493164, + "learning_rate": 3.2006369426751592e-06, + "loss": 0.0052, + "num_tokens": 75984438.0, + "reward": 0.04771205616998486, + "reward_std": 0.06319682823959738, + "rewards/pure_accuracy_reward_math": 0.04771205471479334, + "step": 201 + }, + { + "clip_ratio": 0.00024403837670661233, + "epoch": 0.1950345074840907, + "grad_norm": 0.03252818062901497, + "kl": 0.003181934356689453, + "learning_rate": 3.2165605095541404e-06, + "loss": 0.0052, + "step": 202 + }, + { + "clip_ratio": 0.0002548924753114079, + "epoch": 0.19694661049864062, + "grad_norm": 0.03233063966035843, + "kl": 0.0032570362091064453, + "learning_rate": 3.232484076433121e-06, + "loss": 0.0052, + "step": 203 + }, + { + "clip_ratio": 0.0003048134046252926, + "epoch": 0.1988587135131905, + "grad_norm": 0.032457806169986725, + "kl": 0.0032837390899658203, + "learning_rate": 3.248407643312102e-06, + "loss": 0.0051, + "step": 204 + }, + { + "clip_ratio": 0.0003034327668842707, + "epoch": 0.20077081652774043, + "grad_norm": 0.03239855542778969, + "kl": 0.0032906532287597656, + "learning_rate": 3.2643312101910827e-06, + "loss": 0.0051, + "step": 205 + }, + { + "clip_ratio": 0.0, + "completion_length": 509.6657609939575, + "epoch": 0.20268291954229034, + "grad_norm": 0.0370325967669487, + "kl": 0.0033235549926757812, + "learning_rate": 3.280254777070064e-06, + "loss": 0.0075, + "num_tokens": 79548556.0, + "reward": 0.052176341792801395, + "reward_std": 0.06135626166360453, + "rewards/pure_accuracy_reward_math": 0.0521763407450635, + "step": 206 + }, + { + "clip_ratio": 0.00026798775621728055, + "epoch": 0.20459502255684026, + "grad_norm": 0.03616202250123024, + "kl": 0.0032608509063720703, + "learning_rate": 3.2961783439490447e-06, + "loss": 0.0075, + "step": 207 + }, + { + "clip_ratio": 0.0002652346859690624, + "epoch": 0.20650712557139017, + "grad_norm": 0.03537038713693619, + "kl": 0.0032129287719726562, + "learning_rate": 3.3121019108280255e-06, + "loss": 0.0074, + "step": 208 + }, + { + "clip_ratio": 0.00026950107780976396, + "epoch": 0.20841922858594006, + "grad_norm": 0.03502323478460312, + "kl": 0.0031485557556152344, + "learning_rate": 3.3280254777070063e-06, + "loss": 0.0074, + "step": 209 + }, + { + "clip_ratio": 0.00025725525091502277, + "epoch": 0.21033133160048997, + "grad_norm": 0.03380832076072693, + "kl": 0.0031027793884277344, + "learning_rate": 3.3439490445859875e-06, + "loss": 0.0074, + "step": 210 + }, + { + "clip_ratio": 0.0, + "completion_length": 511.36637449264526, + "epoch": 0.2122434346150399, + "grad_norm": 1.5961617231369019, + "kl": 0.007061004638671875, + "learning_rate": 3.3598726114649682e-06, + "loss": 0.0062, + "num_tokens": 83116585.0, + "reward": 0.05078125247382559, + "reward_std": 0.06568795558996499, + "rewards/pure_accuracy_reward_math": 0.05078125119325705, + "step": 211 + }, + { + "clip_ratio": 0.0002800602194383828, + "epoch": 0.2141555376295898, + "grad_norm": 0.04389820247888565, + "kl": 0.004379749298095703, + "learning_rate": 3.375796178343949e-06, + "loss": 0.0061, + "step": 212 + }, + { + "clip_ratio": 0.0002803218378630845, + "epoch": 0.2160676406441397, + "grad_norm": 0.04022788628935814, + "kl": 0.0043125152587890625, + "learning_rate": 3.39171974522293e-06, + "loss": 0.0061, + "step": 213 + }, + { + "clip_ratio": 0.0002704095267631601, + "epoch": 0.2179797436586896, + "grad_norm": 0.041697319597005844, + "kl": 0.004408597946166992, + "learning_rate": 3.407643312101911e-06, + "loss": 0.0061, + "step": 214 + }, + { + "clip_ratio": 0.0003097587871820906, + "epoch": 0.21989184667323952, + "grad_norm": 0.04933662340044975, + "kl": 0.004500150680541992, + "learning_rate": 3.4235668789808917e-06, + "loss": 0.006, + "step": 215 + }, + { + "clip_ratio": 0.0, + "completion_length": 520.8122463226318, + "epoch": 0.22180394968778944, + "grad_norm": 0.03384365886449814, + "kl": 0.0032858848571777344, + "learning_rate": 3.4394904458598725e-06, + "loss": 0.0069, + "num_tokens": 86710660.0, + "reward": 0.041015627270098776, + "reward_std": 0.05345123494043946, + "rewards/pure_accuracy_reward_math": 0.041015626047737896, + "step": 216 + }, + { + "clip_ratio": 0.00022953049290208583, + "epoch": 0.22371605270233935, + "grad_norm": 0.03259577602148056, + "kl": 0.003277301788330078, + "learning_rate": 3.4554140127388537e-06, + "loss": 0.0069, + "step": 217 + }, + { + "clip_ratio": 0.00024143920052210888, + "epoch": 0.22562815571688924, + "grad_norm": 0.031054330989718437, + "kl": 0.0031991004943847656, + "learning_rate": 3.4713375796178345e-06, + "loss": 0.0069, + "step": 218 + }, + { + "clip_ratio": 0.0002552373456978785, + "epoch": 0.22754025873143915, + "grad_norm": 0.031755171716213226, + "kl": 0.003099679946899414, + "learning_rate": 3.4872611464968152e-06, + "loss": 0.0069, + "step": 219 + }, + { + "clip_ratio": 0.0002681780064790473, + "epoch": 0.22945236174598907, + "grad_norm": 0.031188273802399635, + "kl": 0.003045320510864258, + "learning_rate": 3.5031847133757964e-06, + "loss": 0.0068, + "step": 220 + }, + { + "clip_ratio": 0.0, + "completion_length": 516.6825013160706, + "epoch": 0.23136446476053898, + "grad_norm": 0.03775335103273392, + "kl": 0.003011941909790039, + "learning_rate": 3.5191082802547772e-06, + "loss": 0.0063, + "num_tokens": 90291858.0, + "reward": 0.04715401996509172, + "reward_std": 0.06113734241807833, + "rewards/pure_accuracy_reward_math": 0.04715401915018447, + "step": 221 + }, + { + "clip_ratio": 0.0002582234144483664, + "epoch": 0.23327656777508887, + "grad_norm": 0.03602875769138336, + "kl": 0.002973794937133789, + "learning_rate": 3.535031847133758e-06, + "loss": 0.0063, + "step": 222 + }, + { + "clip_ratio": 0.0002264754746761355, + "epoch": 0.2351886707896388, + "grad_norm": 0.03449266403913498, + "kl": 0.002980470657348633, + "learning_rate": 3.5509554140127388e-06, + "loss": 0.0063, + "step": 223 + }, + { + "clip_ratio": 0.00025999376231311544, + "epoch": 0.2371007738041887, + "grad_norm": 0.0329199843108654, + "kl": 0.002971053123474121, + "learning_rate": 3.56687898089172e-06, + "loss": 0.0062, + "step": 224 + }, + { + "clip_ratio": 0.000296181439978227, + "epoch": 0.23901287681873862, + "grad_norm": 0.033409375697374344, + "kl": 0.0030214786529541016, + "learning_rate": 3.5828025477707007e-06, + "loss": 0.0062, + "step": 225 + }, + { + "clip_ratio": 0.0, + "completion_length": 530.4132494926453, + "epoch": 0.2409249798332885, + "grad_norm": 0.03549947962164879, + "kl": 0.004068970680236816, + "learning_rate": 3.5987261146496815e-06, + "loss": 0.0083, + "num_tokens": 93927655.0, + "reward": 0.039899555675219744, + "reward_std": 0.05890519870445132, + "rewards/pure_accuracy_reward_math": 0.03989955480210483, + "step": 226 + }, + { + "clip_ratio": 0.00024125495940552355, + "epoch": 0.24283708284783842, + "grad_norm": 0.033262889832258224, + "kl": 0.0040683746337890625, + "learning_rate": 3.6146496815286623e-06, + "loss": 0.0083, + "step": 227 + }, + { + "clip_ratio": 0.00024547909194438944, + "epoch": 0.24474918586238834, + "grad_norm": 0.03303634375333786, + "kl": 0.004040956497192383, + "learning_rate": 3.6305732484076435e-06, + "loss": 0.0083, + "step": 228 + }, + { + "clip_ratio": 0.0002773670349256463, + "epoch": 0.24666128887693825, + "grad_norm": 0.03389015421271324, + "kl": 0.00404667854309082, + "learning_rate": 3.6464968152866242e-06, + "loss": 0.0083, + "step": 229 + }, + { + "clip_ratio": 0.000270649900215858, + "epoch": 0.24857339189148817, + "grad_norm": 0.035877879709005356, + "kl": 0.0038802623748779297, + "learning_rate": 3.662420382165605e-06, + "loss": 0.0082, + "step": 230 + }, + { + "clip_ratio": 0.0, + "completion_length": 527.9338984489441, + "epoch": 0.25048549490603805, + "grad_norm": 0.032850634306669235, + "kl": 0.0030744075775146484, + "learning_rate": 3.678343949044586e-06, + "loss": 0.0064, + "num_tokens": 97554714.0, + "reward": 0.04743303795112297, + "reward_std": 0.061522720265202224, + "rewards/pure_accuracy_reward_math": 0.047433037019800395, + "step": 231 + }, + { + "clip_ratio": 0.00024459305313939694, + "epoch": 0.25239759792058797, + "grad_norm": 0.03185749799013138, + "kl": 0.00302886962890625, + "learning_rate": 3.694267515923567e-06, + "loss": 0.0064, + "step": 232 + }, + { + "clip_ratio": 0.00025332184179660544, + "epoch": 0.2543097009351379, + "grad_norm": 0.03135737404227257, + "kl": 0.002967357635498047, + "learning_rate": 3.7101910828025477e-06, + "loss": 0.0064, + "step": 233 + }, + { + "clip_ratio": 0.0002861271710798974, + "epoch": 0.2562218039496878, + "grad_norm": 0.030725885182619095, + "kl": 0.0029573440551757812, + "learning_rate": 3.7261146496815285e-06, + "loss": 0.0064, + "step": 234 + }, + { + "clip_ratio": 0.0002841630366674508, + "epoch": 0.2581339069642377, + "grad_norm": 0.030670415610074997, + "kl": 0.002954721450805664, + "learning_rate": 3.7420382165605097e-06, + "loss": 0.0063, + "step": 235 + }, + { + "clip_ratio": 0.0, + "completion_length": 531.4467325210571, + "epoch": 0.26004600997878763, + "grad_norm": 0.03534790128469467, + "kl": 0.003011465072631836, + "learning_rate": 3.757961783439491e-06, + "loss": 0.0041, + "num_tokens": 101193143.0, + "reward": 0.04631696638534777, + "reward_std": 0.0601075982558541, + "rewards/pure_accuracy_reward_math": 0.046316965454025194, + "step": 236 + }, + { + "clip_ratio": 0.00022260297603793333, + "epoch": 0.2619581129933375, + "grad_norm": 0.03438499942421913, + "kl": 0.0030508041381835938, + "learning_rate": 3.773885350318472e-06, + "loss": 0.0041, + "step": 237 + }, + { + "clip_ratio": 0.00024397839513312647, + "epoch": 0.2638702160078874, + "grad_norm": 0.032804593443870544, + "kl": 0.0030994415283203125, + "learning_rate": 3.789808917197453e-06, + "loss": 0.0041, + "step": 238 + }, + { + "clip_ratio": 0.0002508007286223801, + "epoch": 0.2657823190224373, + "grad_norm": 0.03402625024318695, + "kl": 0.0031244754791259766, + "learning_rate": 3.8057324840764336e-06, + "loss": 0.004, + "step": 239 + }, + { + "clip_ratio": 0.00025242620182552855, + "epoch": 0.26769442203698723, + "grad_norm": 0.03291900083422661, + "kl": 0.003187417984008789, + "learning_rate": 3.821656050955415e-06, + "loss": 0.004, + "step": 240 + }, + { + "clip_ratio": 0.0, + "completion_length": 536.9913763999939, + "epoch": 0.26960652505153715, + "grad_norm": 0.033690325915813446, + "kl": 0.003125429153442383, + "learning_rate": 3.837579617834396e-06, + "loss": 0.0089, + "num_tokens": 104860392.0, + "reward": 0.05496652069268748, + "reward_std": 0.07028483308386058, + "rewards/pure_accuracy_reward_math": 0.05496651929570362, + "step": 241 + }, + { + "clip_ratio": 0.0002661047830088137, + "epoch": 0.27151862806608706, + "grad_norm": 0.03227640688419342, + "kl": 0.0031244754791259766, + "learning_rate": 3.853503184713376e-06, + "loss": 0.009, + "step": 242 + }, + { + "clip_ratio": 0.00027503777869242185, + "epoch": 0.273430731080637, + "grad_norm": 0.03168897703289986, + "kl": 0.003157377243041992, + "learning_rate": 3.869426751592357e-06, + "loss": 0.0089, + "step": 243 + }, + { + "clip_ratio": 0.00029653536631712996, + "epoch": 0.2753428340951869, + "grad_norm": 0.03222280368208885, + "kl": 0.0031862258911132812, + "learning_rate": 3.885350318471338e-06, + "loss": 0.0089, + "step": 244 + }, + { + "clip_ratio": 0.0003081631187455969, + "epoch": 0.2772549371097368, + "grad_norm": 0.03176514804363251, + "kl": 0.0032341480255126953, + "learning_rate": 3.901273885350319e-06, + "loss": 0.0088, + "step": 245 + }, + { + "clip_ratio": 0.0, + "completion_length": 513.5616898536682, + "epoch": 0.27916704012428667, + "grad_norm": 0.037929438054561615, + "kl": 0.0035233497619628906, + "learning_rate": 3.9171974522293e-06, + "loss": 0.0075, + "num_tokens": 108427949.0, + "reward": 0.0544084852153901, + "reward_std": 0.0659469406818971, + "rewards/pure_accuracy_reward_math": 0.054408483527367935, + "step": 246 + }, + { + "clip_ratio": 0.0002633177949178389, + "epoch": 0.2810791431388366, + "grad_norm": 0.03561301901936531, + "kl": 0.0035467147827148438, + "learning_rate": 3.933121019108281e-06, + "loss": 0.0075, + "step": 247 + }, + { + "clip_ratio": 0.0003005996498472996, + "epoch": 0.2829912461533865, + "grad_norm": 0.035342708230018616, + "kl": 0.003578662872314453, + "learning_rate": 3.949044585987262e-06, + "loss": 0.0075, + "step": 248 + }, + { + "clip_ratio": 0.0003206986277177748, + "epoch": 0.2849033491679364, + "grad_norm": 0.03841444477438927, + "kl": 0.0036001205444335938, + "learning_rate": 3.964968152866243e-06, + "loss": 0.0075, + "step": 249 + }, + { + "clip_ratio": 0.00030761192169848073, + "epoch": 0.28681545218248633, + "grad_norm": 0.03515273705124855, + "kl": 0.003624439239501953, + "learning_rate": 3.980891719745223e-06, + "loss": 0.0074, + "step": 250 + }, + { + "clip_ratio": 0.0, + "completion_length": 504.73858308792114, + "epoch": 0.28872755519703625, + "grad_norm": 0.04030496999621391, + "kl": 0.003686189651489258, + "learning_rate": 3.996815286624204e-06, + "loss": 0.0081, + "num_tokens": 111975532.0, + "reward": 0.0647321458964143, + "reward_std": 0.07547981187235564, + "rewards/pure_accuracy_reward_math": 0.06473214420839213, + "step": 251 + }, + { + "clip_ratio": 0.00031485489739679906, + "epoch": 0.29063965821158616, + "grad_norm": 0.04058763012290001, + "kl": 0.003683328628540039, + "learning_rate": 4.012738853503185e-06, + "loss": 0.0081, + "step": 252 + }, + { + "clip_ratio": 0.0003329372994471669, + "epoch": 0.2925517612261361, + "grad_norm": 0.039948880672454834, + "kl": 0.003644227981567383, + "learning_rate": 4.0286624203821666e-06, + "loss": 0.0081, + "step": 253 + }, + { + "clip_ratio": 0.00031999613804600813, + "epoch": 0.294463864240686, + "grad_norm": 0.038771189749240875, + "kl": 0.003670930862426758, + "learning_rate": 4.044585987261147e-06, + "loss": 0.008, + "step": 254 + }, + { + "clip_ratio": 0.0003391868065136805, + "epoch": 0.29637596725523585, + "grad_norm": 0.03820183873176575, + "kl": 0.0036439895629882812, + "learning_rate": 4.060509554140128e-06, + "loss": 0.0079, + "step": 255 + }, + { + "clip_ratio": 0.0, + "completion_length": 507.980770111084, + "epoch": 0.29828807026978577, + "grad_norm": 0.0373733825981617, + "kl": 0.003556966781616211, + "learning_rate": 4.076433121019109e-06, + "loss": 0.0047, + "num_tokens": 115530899.0, + "reward": 0.05217634199652821, + "reward_std": 0.06624599173665047, + "rewards/pure_accuracy_reward_math": 0.05217634030850604, + "step": 256 + }, + { + "clip_ratio": 0.0002444871162765594, + "epoch": 0.3002001732843357, + "grad_norm": 0.03655192255973816, + "kl": 0.003623485565185547, + "learning_rate": 4.09235668789809e-06, + "loss": 0.0047, + "step": 257 + }, + { + "clip_ratio": 0.0002544127338524049, + "epoch": 0.3021122762988856, + "grad_norm": 0.035692181438207626, + "kl": 0.003640890121459961, + "learning_rate": 4.10828025477707e-06, + "loss": 0.0046, + "step": 258 + }, + { + "clip_ratio": 0.0002950017506577751, + "epoch": 0.3040243793134355, + "grad_norm": 0.03550735488533974, + "kl": 0.0036733150482177734, + "learning_rate": 4.124203821656051e-06, + "loss": 0.0046, + "step": 259 + }, + { + "clip_ratio": 0.0002894491571510116, + "epoch": 0.3059364823279854, + "grad_norm": 0.03471330925822258, + "kl": 0.00366973876953125, + "learning_rate": 4.140127388535032e-06, + "loss": 0.0045, + "step": 260 + }, + { + "clip_ratio": 0.0, + "completion_length": 517.2921543121338, + "epoch": 0.30784858534253534, + "grad_norm": 0.6632264852523804, + "kl": 0.007929325103759766, + "learning_rate": 4.156050955414014e-06, + "loss": 0.0041, + "num_tokens": 119123970.0, + "reward": 0.046875002153683454, + "reward_std": 0.06358220643596724, + "rewards/pure_accuracy_reward_math": 0.04687500122236088, + "step": 261 + }, + { + "clip_ratio": 0.00027907352409783925, + "epoch": 0.30976068835708526, + "grad_norm": 0.03735750913619995, + "kl": 0.0038709640502929688, + "learning_rate": 4.171974522292994e-06, + "loss": 0.004, + "step": 262 + }, + { + "clip_ratio": 0.000277261100677606, + "epoch": 0.31167279137163517, + "grad_norm": 0.03806532546877861, + "kl": 0.004002094268798828, + "learning_rate": 4.187898089171975e-06, + "loss": 0.004, + "step": 263 + }, + { + "clip_ratio": 0.00026404397090118437, + "epoch": 0.31358489438618503, + "grad_norm": 0.03587675094604492, + "kl": 0.00407719612121582, + "learning_rate": 4.203821656050956e-06, + "loss": 0.0039, + "step": 264 + }, + { + "clip_ratio": 0.0003132741497324787, + "epoch": 0.31549699740073495, + "grad_norm": 0.03516336902976036, + "kl": 0.004099607467651367, + "learning_rate": 4.219745222929937e-06, + "loss": 0.0039, + "step": 265 + }, + { + "clip_ratio": 0.0, + "completion_length": 528.596004486084, + "epoch": 0.31740910041528486, + "grad_norm": 0.038204919546842575, + "kl": 0.0035691261291503906, + "learning_rate": 4.2356687898089174e-06, + "loss": 0.006, + "num_tokens": 122758966.0, + "reward": 0.054966520925518125, + "reward_std": 0.06770737608894706, + "rewards/pure_accuracy_reward_math": 0.05496651912108064, + "step": 266 + }, + { + "clip_ratio": 0.00026713599251593223, + "epoch": 0.3193212034298348, + "grad_norm": 0.03804617002606392, + "kl": 0.003623485565185547, + "learning_rate": 4.251592356687898e-06, + "loss": 0.006, + "step": 267 + }, + { + "clip_ratio": 0.00027288361513910786, + "epoch": 0.3212333064443847, + "grad_norm": 0.03765474632382393, + "kl": 0.003659486770629883, + "learning_rate": 4.26751592356688e-06, + "loss": 0.006, + "step": 268 + }, + { + "clip_ratio": 0.0002754389876429286, + "epoch": 0.3231454094589346, + "grad_norm": 0.037356842309236526, + "kl": 0.0036840438842773438, + "learning_rate": 4.283439490445861e-06, + "loss": 0.0059, + "step": 269 + }, + { + "clip_ratio": 0.0002686067065269526, + "epoch": 0.3250575124734845, + "grad_norm": 0.03656876087188721, + "kl": 0.003694295883178711, + "learning_rate": 4.299363057324841e-06, + "loss": 0.0059, + "step": 270 + }, + { + "clip_ratio": 0.0, + "completion_length": 543.1746897697449, + "epoch": 0.32696961548803444, + "grad_norm": 0.03417838364839554, + "kl": 0.0035529136657714844, + "learning_rate": 4.315286624203822e-06, + "loss": 0.0076, + "num_tokens": 126443800.0, + "reward": 0.04882812697906047, + "reward_std": 0.05766273388871923, + "rewards/pure_accuracy_reward_math": 0.04882812616415322, + "step": 271 + }, + { + "clip_ratio": 0.0002270729566475893, + "epoch": 0.32888171850258435, + "grad_norm": 0.03328363224864006, + "kl": 0.0035278797149658203, + "learning_rate": 4.331210191082803e-06, + "loss": 0.0076, + "step": 272 + }, + { + "clip_ratio": 0.0002132950650661769, + "epoch": 0.3307938215171342, + "grad_norm": 0.03230879083275795, + "kl": 0.0034902095794677734, + "learning_rate": 4.347133757961784e-06, + "loss": 0.0076, + "step": 273 + }, + { + "clip_ratio": 0.0002096330554195447, + "epoch": 0.3327059245316841, + "grad_norm": 0.031601596623659134, + "kl": 0.003440380096435547, + "learning_rate": 4.3630573248407645e-06, + "loss": 0.0076, + "step": 274 + }, + { + "clip_ratio": 0.00027223577194490645, + "epoch": 0.33461802754623404, + "grad_norm": 0.033090248703956604, + "kl": 0.003412485122680664, + "learning_rate": 4.378980891719746e-06, + "loss": 0.0075, + "step": 275 + }, + { + "clip_ratio": 0.0, + "completion_length": 535.8942775726318, + "epoch": 0.33653013056078396, + "grad_norm": 0.03229549527168274, + "kl": 0.003350973129272461, + "learning_rate": 4.394904458598727e-06, + "loss": 0.0057, + "num_tokens": 130099677.0, + "reward": 0.04966518087894656, + "reward_std": 0.060705700190737844, + "rewards/pure_accuracy_reward_math": 0.049665179773001, + "step": 276 + }, + { + "clip_ratio": 0.00025271691475836633, + "epoch": 0.3384422335753339, + "grad_norm": 0.03214692696928978, + "kl": 0.0033435821533203125, + "learning_rate": 4.410828025477708e-06, + "loss": 0.0057, + "step": 277 + }, + { + "clip_ratio": 0.00023837689644778948, + "epoch": 0.3403543365898838, + "grad_norm": 0.03055053949356079, + "kl": 0.003403902053833008, + "learning_rate": 4.426751592356688e-06, + "loss": 0.0057, + "step": 278 + }, + { + "clip_ratio": 0.0002586998209039848, + "epoch": 0.3422664396044337, + "grad_norm": 0.030119990929961205, + "kl": 0.003477334976196289, + "learning_rate": 4.442675159235669e-06, + "loss": 0.0057, + "step": 279 + }, + { + "clip_ratio": 0.00026621688834893575, + "epoch": 0.3441785426189836, + "grad_norm": 0.030735207721590996, + "kl": 0.0035724639892578125, + "learning_rate": 4.45859872611465e-06, + "loss": 0.0056, + "step": 280 + }, + { + "clip_ratio": 0.0, + "completion_length": 542.7466769218445, + "epoch": 0.34609064563353353, + "grad_norm": 0.033374350517988205, + "kl": 0.003545999526977539, + "learning_rate": 4.474522292993631e-06, + "loss": 0.0036, + "num_tokens": 133773381.0, + "reward": 0.051339288300368935, + "reward_std": 0.06345581240020692, + "rewards/pure_accuracy_reward_math": 0.05133928690338507, + "step": 281 + }, + { + "clip_ratio": 0.0002734534241994879, + "epoch": 0.3480027486480834, + "grad_norm": 0.03312847390770912, + "kl": 0.0035567283630371094, + "learning_rate": 4.490445859872612e-06, + "loss": 0.0036, + "step": 282 + }, + { + "clip_ratio": 0.00022532319422907676, + "epoch": 0.3499148516626333, + "grad_norm": 0.03281605243682861, + "kl": 0.0035707950592041016, + "learning_rate": 4.506369426751593e-06, + "loss": 0.0035, + "step": 283 + }, + { + "clip_ratio": 0.0002544033526419298, + "epoch": 0.3518269546771832, + "grad_norm": 0.032299675047397614, + "kl": 0.003595113754272461, + "learning_rate": 4.522292993630574e-06, + "loss": 0.0035, + "step": 284 + }, + { + "clip_ratio": 0.00024219880805276262, + "epoch": 0.35373905769173314, + "grad_norm": 0.031959276646375656, + "kl": 0.0035622119903564453, + "learning_rate": 4.538216560509555e-06, + "loss": 0.0035, + "step": 285 + }, + { + "clip_ratio": 0.0, + "completion_length": 536.3122510910034, + "epoch": 0.35565116070628305, + "grad_norm": 0.035966720432043076, + "kl": 0.003755331039428711, + "learning_rate": 4.554140127388535e-06, + "loss": 0.0076, + "num_tokens": 137425032.0, + "reward": 0.05524553809664212, + "reward_std": 0.07191267621237785, + "rewards/pure_accuracy_reward_math": 0.055245536990696564, + "step": 286 + }, + { + "clip_ratio": 0.00029696975889237365, + "epoch": 0.35756326372083297, + "grad_norm": 0.03485076501965523, + "kl": 0.0036923885345458984, + "learning_rate": 4.570063694267516e-06, + "loss": 0.0076, + "step": 287 + }, + { + "clip_ratio": 0.0003252405772968814, + "epoch": 0.3594753667353829, + "grad_norm": 0.03465472534298897, + "kl": 0.003720998764038086, + "learning_rate": 4.585987261146497e-06, + "loss": 0.0076, + "step": 288 + }, + { + "clip_ratio": 0.0003269365803362234, + "epoch": 0.3613874697499328, + "grad_norm": 0.033384956419467926, + "kl": 0.003762483596801758, + "learning_rate": 4.601910828025479e-06, + "loss": 0.0075, + "step": 289 + }, + { + "clip_ratio": 0.0003269619904813226, + "epoch": 0.36329957276448266, + "grad_norm": 0.03343256562948227, + "kl": 0.0037889480590820312, + "learning_rate": 4.617834394904459e-06, + "loss": 0.0075, + "step": 290 + }, + { + "clip_ratio": 0.0, + "completion_length": 533.6155371665955, + "epoch": 0.3652116757790326, + "grad_norm": 0.035127099603414536, + "kl": 0.0037310123443603516, + "learning_rate": 4.63375796178344e-06, + "loss": 0.0084, + "num_tokens": 141070278.0, + "reward": 0.05580357421422377, + "reward_std": 0.06861072615720332, + "rewards/pure_accuracy_reward_math": 0.05580357281723991, + "step": 291 + }, + { + "clip_ratio": 0.00026876470258230256, + "epoch": 0.3671237787935825, + "grad_norm": 0.034193847328424454, + "kl": 0.0037539005279541016, + "learning_rate": 4.649681528662421e-06, + "loss": 0.0084, + "step": 292 + }, + { + "clip_ratio": 0.00024497293054537295, + "epoch": 0.3690358818081324, + "grad_norm": 0.033800724893808365, + "kl": 0.0037734508514404297, + "learning_rate": 4.665605095541402e-06, + "loss": 0.0084, + "step": 293 + }, + { + "clip_ratio": 0.0002538224067620831, + "epoch": 0.3709479848226823, + "grad_norm": 0.03376767784357071, + "kl": 0.003782033920288086, + "learning_rate": 4.6815286624203824e-06, + "loss": 0.0083, + "step": 294 + }, + { + "clip_ratio": 0.00027697558522277177, + "epoch": 0.37286008783723223, + "grad_norm": 0.03229675441980362, + "kl": 0.003787994384765625, + "learning_rate": 4.697452229299363e-06, + "loss": 0.0083, + "step": 295 + }, + { + "clip_ratio": 0.0, + "completion_length": 532.5739660263062, + "epoch": 0.37477219085178215, + "grad_norm": 0.035769619047641754, + "kl": 0.0037794113159179688, + "learning_rate": 4.713375796178344e-06, + "loss": 0.0057, + "num_tokens": 144715023.0, + "reward": 0.05915178812574595, + "reward_std": 0.07096926274243742, + "rewards/pure_accuracy_reward_math": 0.059151787019800395, + "step": 296 + }, + { + "clip_ratio": 0.00030428163654505624, + "epoch": 0.37668429386633207, + "grad_norm": 0.035648081451654434, + "kl": 0.003717660903930664, + "learning_rate": 4.729299363057326e-06, + "loss": 0.0057, + "step": 297 + }, + { + "clip_ratio": 0.00029741515106707084, + "epoch": 0.378596396880882, + "grad_norm": 0.03551783785223961, + "kl": 0.0036716461181640625, + "learning_rate": 4.745222929936306e-06, + "loss": 0.0057, + "step": 298 + }, + { + "clip_ratio": 0.0003008591765478741, + "epoch": 0.38050849989543184, + "grad_norm": 0.03452136367559433, + "kl": 0.0036542415618896484, + "learning_rate": 4.761146496815287e-06, + "loss": 0.0056, + "step": 299 + }, + { + "clip_ratio": 0.00032588979291858777, + "epoch": 0.38242060290998175, + "grad_norm": 0.03325437009334564, + "kl": 0.003694295883178711, + "learning_rate": 4.777070063694268e-06, + "loss": 0.0056, + "step": 300 + }, + { + "clip_ratio": 0.0, + "completion_length": 519.2416524887085, + "epoch": 0.38433270592453167, + "grad_norm": 0.04327908158302307, + "kl": 0.004815816879272461, + "learning_rate": 4.792993630573249e-06, + "loss": 0.0041, + "num_tokens": 148307505.0, + "reward": 0.05329241341678426, + "reward_std": 0.061954362492542714, + "rewards/pure_accuracy_reward_math": 0.0532924123108387, + "step": 301 + }, + { + "clip_ratio": 0.0002521659018839273, + "epoch": 0.3862448089390816, + "grad_norm": 0.041329506784677505, + "kl": 0.004758596420288086, + "learning_rate": 4.8089171974522295e-06, + "loss": 0.0041, + "step": 302 + }, + { + "clip_ratio": 0.0002661041191913682, + "epoch": 0.3881569119536315, + "grad_norm": 0.03914090245962143, + "kl": 0.0045318603515625, + "learning_rate": 4.82484076433121e-06, + "loss": 0.0041, + "step": 303 + }, + { + "clip_ratio": 0.0002647961523507547, + "epoch": 0.3900690149681814, + "grad_norm": 0.0363956093788147, + "kl": 0.0043642520904541016, + "learning_rate": 4.840764331210192e-06, + "loss": 0.004, + "step": 304 + }, + { + "clip_ratio": 0.00030025097066754824, + "epoch": 0.39198111798273133, + "grad_norm": 0.05623022839426994, + "kl": 0.00441288948059082, + "learning_rate": 4.856687898089173e-06, + "loss": 0.004, + "step": 305 + }, + { + "clip_ratio": 0.0, + "completion_length": 529.0496897697449, + "epoch": 0.39389322099728125, + "grad_norm": 0.03662995249032974, + "kl": 0.0038270950317382812, + "learning_rate": 4.872611464968153e-06, + "loss": 0.0077, + "num_tokens": 151936939.0, + "reward": 0.0560825914144516, + "reward_std": 0.061781705473549664, + "rewards/pure_accuracy_reward_math": 0.05608259071595967, + "step": 306 + }, + { + "clip_ratio": 0.00025576306325092446, + "epoch": 0.39580532401183116, + "grad_norm": 0.03553188219666481, + "kl": 0.00376129150390625, + "learning_rate": 4.888535031847134e-06, + "loss": 0.0076, + "step": 307 + }, + { + "clip_ratio": 0.00027371336784653977, + "epoch": 0.397717427026381, + "grad_norm": 0.035399794578552246, + "kl": 0.0036725997924804688, + "learning_rate": 4.904458598726115e-06, + "loss": 0.0076, + "step": 308 + }, + { + "clip_ratio": 0.0002955471370569285, + "epoch": 0.39962953004093094, + "grad_norm": 0.03487352281808853, + "kl": 0.003664731979370117, + "learning_rate": 4.920382165605096e-06, + "loss": 0.0076, + "step": 309 + }, + { + "clip_ratio": 0.00030850259520320833, + "epoch": 0.40154163305548085, + "grad_norm": 0.03433185815811157, + "kl": 0.003676176071166992, + "learning_rate": 4.9363057324840765e-06, + "loss": 0.0075, + "step": 310 + }, + { + "clip_ratio": 0.0, + "completion_length": 524.8312191963196, + "epoch": 0.40345373607003077, + "grad_norm": 0.03824182599782944, + "kl": 0.003762483596801758, + "learning_rate": 4.952229299363058e-06, + "loss": 0.0062, + "num_tokens": 155550782.0, + "reward": 0.05496652075089514, + "reward_std": 0.0689961050520651, + "rewards/pure_accuracy_reward_math": 0.0549665194703266, + "step": 311 + }, + { + "clip_ratio": 0.0002548059320588436, + "epoch": 0.4053658390845807, + "grad_norm": 0.036028265953063965, + "kl": 0.003760099411010742, + "learning_rate": 4.968152866242039e-06, + "loss": 0.0062, + "step": 312 + }, + { + "clip_ratio": 0.00029642158040132927, + "epoch": 0.4072779420991306, + "grad_norm": 0.03537724167108536, + "kl": 0.0038378238677978516, + "learning_rate": 4.98407643312102e-06, + "loss": 0.0062, + "step": 313 + }, + { + "clip_ratio": 0.00030970463706125884, + "epoch": 0.4091900451136805, + "grad_norm": 0.03521754965186119, + "kl": 0.003871440887451172, + "learning_rate": 5e-06, + "loss": 0.0062, + "step": 314 + }, + { + "clip_ratio": 0.000315766970174991, + "epoch": 0.4111021481282304, + "grad_norm": 0.034070126712322235, + "kl": 0.0037851333618164062, + "learning_rate": 4.999992129526286e-06, + "loss": 0.0061, + "step": 315 + }, + { + "clip_ratio": 0.0, + "completion_length": 528.3727917671204, + "epoch": 0.41301425114278034, + "grad_norm": 0.12440560013055801, + "kl": 0.005699872970581055, + "learning_rate": 4.999968518154701e-06, + "loss": 0.0041, + "num_tokens": 159174918.0, + "reward": 0.05050223457510583, + "reward_std": 0.06435916194459423, + "rewards/pure_accuracy_reward_math": 0.050502233527367935, + "step": 316 + }, + { + "clip_ratio": 0.0002532021657657424, + "epoch": 0.4149263541573302, + "grad_norm": 0.05440036952495575, + "kl": 0.005144357681274414, + "learning_rate": 4.99992916603391e-06, + "loss": 0.004, + "step": 317 + }, + { + "clip_ratio": 0.00025051761485883617, + "epoch": 0.4168384571718801, + "grad_norm": 0.051424141973257065, + "kl": 0.005103111267089844, + "learning_rate": 4.999874073411688e-06, + "loss": 0.004, + "step": 318 + }, + { + "clip_ratio": 0.0002561948363677402, + "epoch": 0.41875056018643003, + "grad_norm": 0.06930891424417496, + "kl": 0.004969120025634766, + "learning_rate": 4.9998032406349205e-06, + "loss": 0.0039, + "step": 319 + }, + { + "clip_ratio": 0.0002573228107394243, + "epoch": 0.42066266320097995, + "grad_norm": 0.06900722533464432, + "kl": 0.004853248596191406, + "learning_rate": 4.9997166681495975e-06, + "loss": 0.0039, + "step": 320 + }, + { + "clip_ratio": 0.0, + "completion_length": 517.6638069152832, + "epoch": 0.42257476621552986, + "grad_norm": 0.03829098492860794, + "kl": 0.0038361549377441406, + "learning_rate": 4.999614356500811e-06, + "loss": 0.0072, + "num_tokens": 162764497.0, + "reward": 0.06110491356230341, + "reward_std": 0.07393209857400507, + "rewards/pure_accuracy_reward_math": 0.06110491222352721, + "step": 321 + }, + { + "clip_ratio": 0.0002886460991931017, + "epoch": 0.4244868692300798, + "grad_norm": 0.03761793673038483, + "kl": 0.0038406848907470703, + "learning_rate": 4.999496306332755e-06, + "loss": 0.0072, + "step": 322 + }, + { + "clip_ratio": 0.00029219654425105546, + "epoch": 0.4263989722446297, + "grad_norm": 0.03714153915643692, + "kl": 0.003914356231689453, + "learning_rate": 4.999362518388718e-06, + "loss": 0.0071, + "step": 323 + }, + { + "clip_ratio": 0.0003099845329757045, + "epoch": 0.4283110752591796, + "grad_norm": 0.03610815480351448, + "kl": 0.0039288997650146484, + "learning_rate": 4.99921299351108e-06, + "loss": 0.0071, + "step": 324 + }, + { + "clip_ratio": 0.0003404705674370234, + "epoch": 0.4302231782737295, + "grad_norm": 0.03599926084280014, + "kl": 0.003935813903808594, + "learning_rate": 4.999047732641305e-06, + "loss": 0.007, + "step": 325 + }, + { + "clip_ratio": 0.0, + "completion_length": 510.4832811355591, + "epoch": 0.4321352812882794, + "grad_norm": 0.04078551381826401, + "kl": 0.003900766372680664, + "learning_rate": 4.998866736819938e-06, + "loss": 0.0063, + "num_tokens": 166324161.0, + "reward": 0.059151788242161274, + "reward_std": 0.07354671962093562, + "rewards/pure_accuracy_reward_math": 0.05915178725263104, + "step": 326 + }, + { + "clip_ratio": 0.00026936357801332633, + "epoch": 0.4340473843028293, + "grad_norm": 0.03855260834097862, + "kl": 0.003957986831665039, + "learning_rate": 4.998670007186599e-06, + "loss": 0.0063, + "step": 327 + }, + { + "clip_ratio": 0.0002843770836875592, + "epoch": 0.4359594873173792, + "grad_norm": 0.03724536672234535, + "kl": 0.0039751529693603516, + "learning_rate": 4.998457544979971e-06, + "loss": 0.0062, + "step": 328 + }, + { + "clip_ratio": 0.0003156123698886404, + "epoch": 0.43787159033192913, + "grad_norm": 0.03662634268403053, + "kl": 0.0040798187255859375, + "learning_rate": 4.998229351537797e-06, + "loss": 0.0062, + "step": 329 + }, + { + "clip_ratio": 0.0003457550078564964, + "epoch": 0.43978369334647904, + "grad_norm": 0.03598077967762947, + "kl": 0.004061460494995117, + "learning_rate": 4.997985428296869e-06, + "loss": 0.0061, + "step": 330 + }, + { + "clip_ratio": 0.0, + "completion_length": 528.4207811355591, + "epoch": 0.44169579636102896, + "grad_norm": 0.08678283542394638, + "kl": 0.008905410766601562, + "learning_rate": 4.997725776793021e-06, + "loss": 0.0058, + "num_tokens": 169950285.0, + "reward": 0.05636160948779434, + "reward_std": 0.07148723275167868, + "rewards/pure_accuracy_reward_math": 0.05636160867288709, + "step": 331 + }, + { + "clip_ratio": 0.00029096677934603576, + "epoch": 0.4436078993755789, + "grad_norm": 0.09512893110513687, + "kl": 0.007820606231689453, + "learning_rate": 4.997450398661117e-06, + "loss": 0.0058, + "step": 332 + }, + { + "clip_ratio": 0.00029938158724007735, + "epoch": 0.4455200023901288, + "grad_norm": 0.24316293001174927, + "kl": 0.007544517517089844, + "learning_rate": 4.9971592956350405e-06, + "loss": 0.0057, + "step": 333 + }, + { + "clip_ratio": 0.00032061134919558754, + "epoch": 0.4474321054046787, + "grad_norm": 0.07169396430253983, + "kl": 0.006528377532958984, + "learning_rate": 4.996852469547688e-06, + "loss": 0.0057, + "step": 334 + }, + { + "clip_ratio": 0.00034978831735088534, + "epoch": 0.44934420841922856, + "grad_norm": 0.06073050945997238, + "kl": 0.0060198307037353516, + "learning_rate": 4.996529922330954e-06, + "loss": 0.0056, + "step": 335 + }, + { + "clip_ratio": 0.0, + "completion_length": 535.8259167671204, + "epoch": 0.4512563114337785, + "grad_norm": 0.034031759947538376, + "kl": 0.0037636756896972656, + "learning_rate": 4.996191656015715e-06, + "loss": 0.0063, + "num_tokens": 173606605.0, + "reward": 0.05273437770665623, + "reward_std": 0.061655311612412333, + "rewards/pure_accuracy_reward_math": 0.05273437625146471, + "step": 336 + }, + { + "clip_ratio": 0.0002175188884052659, + "epoch": 0.4531684144483284, + "grad_norm": 0.03333257883787155, + "kl": 0.0038194656372070312, + "learning_rate": 4.995837672731827e-06, + "loss": 0.0063, + "step": 337 + }, + { + "clip_ratio": 0.00022021491247414815, + "epoch": 0.4550805174628783, + "grad_norm": 0.032678041607141495, + "kl": 0.0038101673126220703, + "learning_rate": 4.9954679747081e-06, + "loss": 0.0063, + "step": 338 + }, + { + "clip_ratio": 0.000264580338352971, + "epoch": 0.4569926204774282, + "grad_norm": 0.032030362635850906, + "kl": 0.0037910938262939453, + "learning_rate": 4.995082564272295e-06, + "loss": 0.0062, + "step": 339 + }, + { + "clip_ratio": 0.00027159255438391483, + "epoch": 0.45890472349197814, + "grad_norm": 0.031298909336328506, + "kl": 0.0038001537322998047, + "learning_rate": 4.994681443851102e-06, + "loss": 0.0062, + "step": 340 + }, + { + "clip_ratio": 0.0, + "completion_length": 527.6174931526184, + "epoch": 0.46081682650652805, + "grad_norm": 0.04015278443694115, + "kl": 0.004010200500488281, + "learning_rate": 4.994264615970126e-06, + "loss": 0.0062, + "num_tokens": 177226454.0, + "reward": 0.056361609895247966, + "reward_std": 0.06633232033345848, + "rewards/pure_accuracy_reward_math": 0.05636160867288709, + "step": 341 + }, + { + "clip_ratio": 0.00026669438159387937, + "epoch": 0.46272892952107797, + "grad_norm": 0.03813392296433449, + "kl": 0.0039997100830078125, + "learning_rate": 4.993832083253874e-06, + "loss": 0.0062, + "step": 342 + }, + { + "clip_ratio": 0.0003048689098363866, + "epoch": 0.46464103253562783, + "grad_norm": 0.03776548057794571, + "kl": 0.004065752029418945, + "learning_rate": 4.993383848425736e-06, + "loss": 0.0061, + "step": 343 + }, + { + "clip_ratio": 0.0003051352168768062, + "epoch": 0.46655313555017774, + "grad_norm": 0.03955227509140968, + "kl": 0.0041925907135009766, + "learning_rate": 4.992919914307969e-06, + "loss": 0.0061, + "step": 344 + }, + { + "clip_ratio": 0.00030118576887616655, + "epoch": 0.46846523856472766, + "grad_norm": 0.036648593842983246, + "kl": 0.00420832633972168, + "learning_rate": 4.992440283821676e-06, + "loss": 0.006, + "step": 345 + }, + { + "clip_ratio": 0.0, + "completion_length": 527.5131411552429, + "epoch": 0.4703773415792776, + "grad_norm": 13.381791114807129, + "kl": 0.1310877799987793, + "learning_rate": 4.991944959986793e-06, + "loss": 0.018, + "num_tokens": 180852413.0, + "reward": 0.06138393163564615, + "reward_std": 0.07144096971023828, + "rewards/pure_accuracy_reward_math": 0.061383930064039305, + "step": 346 + }, + { + "clip_ratio": 0.00030088673440786806, + "epoch": 0.4722894445938275, + "grad_norm": 1.359532356262207, + "kl": 0.01866316795349121, + "learning_rate": 4.991433945922068e-06, + "loss": 0.0135, + "step": 347 + }, + { + "clip_ratio": 0.0003527746957843192, + "epoch": 0.4742015476083774, + "grad_norm": 0.050763800740242004, + "kl": 0.005962371826171875, + "learning_rate": 4.9909072448450386e-06, + "loss": 0.013, + "step": 348 + }, + { + "clip_ratio": 0.0003426602560239189, + "epoch": 0.4761136506229273, + "grad_norm": 0.0476795993745327, + "kl": 0.006250858306884766, + "learning_rate": 4.990364860072014e-06, + "loss": 0.013, + "step": 349 + }, + { + "clip_ratio": 0.00033057811066328213, + "epoch": 0.47802575363747724, + "grad_norm": 0.04783082380890846, + "kl": 0.0066144466400146484, + "learning_rate": 4.989806795018054e-06, + "loss": 0.013, + "step": 350 + }, + { + "clip_ratio": 0.0, + "completion_length": 522.409900188446, + "epoch": 0.47993785665202715, + "grad_norm": 0.036505699157714844, + "kl": 0.0040128231048583984, + "learning_rate": 4.989233053196948e-06, + "loss": 0.0024, + "num_tokens": 184454394.0, + "reward": 0.04771205602446571, + "reward_std": 0.05920424917712808, + "rewards/pure_accuracy_reward_math": 0.047712054976727813, + "step": 351 + }, + { + "clip_ratio": 0.00023261837060317703, + "epoch": 0.481849959666577, + "grad_norm": 0.037214819341897964, + "kl": 0.004108428955078125, + "learning_rate": 4.988643638221193e-06, + "loss": 0.0024, + "step": 352 + }, + { + "clip_ratio": 0.0002573013600795093, + "epoch": 0.4837620626811269, + "grad_norm": 0.03702811896800995, + "kl": 0.004202127456665039, + "learning_rate": 4.9880385538019665e-06, + "loss": 0.0024, + "step": 353 + }, + { + "clip_ratio": 0.0002758479482167786, + "epoch": 0.48567416569567684, + "grad_norm": 0.03838437795639038, + "kl": 0.004250764846801758, + "learning_rate": 4.987417803749112e-06, + "loss": 0.0023, + "step": 354 + }, + { + "clip_ratio": 0.00024451872050690326, + "epoch": 0.48758626871022676, + "grad_norm": 0.035314518958330154, + "kl": 0.00424647331237793, + "learning_rate": 4.986781391971105e-06, + "loss": 0.0023, + "step": 355 + }, + { + "clip_ratio": 0.0, + "completion_length": 522.8789310455322, + "epoch": 0.48949837172477667, + "grad_norm": 0.038822874426841736, + "kl": 0.004703998565673828, + "learning_rate": 4.986129322475037e-06, + "loss": 0.006, + "num_tokens": 188061244.0, + "reward": 0.05887277075089514, + "reward_std": 0.0715272988891229, + "rewards/pure_accuracy_reward_math": 0.058872769062872976, + "step": 356 + }, + { + "clip_ratio": 0.0003040988601696881, + "epoch": 0.4914104747393266, + "grad_norm": 0.03750370442867279, + "kl": 0.004604816436767578, + "learning_rate": 4.985461599366583e-06, + "loss": 0.006, + "step": 357 + }, + { + "clip_ratio": 0.0003311016299676339, + "epoch": 0.4933225777538765, + "grad_norm": 0.03735021874308586, + "kl": 0.004613637924194336, + "learning_rate": 4.984778226849983e-06, + "loss": 0.0059, + "step": 358 + }, + { + "clip_ratio": 0.00031427563314423423, + "epoch": 0.4952346807684264, + "grad_norm": 0.037090424448251724, + "kl": 0.00463104248046875, + "learning_rate": 4.984079209228007e-06, + "loss": 0.0059, + "step": 359 + }, + { + "clip_ratio": 0.0003153682554284387, + "epoch": 0.49714678378297633, + "grad_norm": 0.03496375307440758, + "kl": 0.004604816436767578, + "learning_rate": 4.983364550901936e-06, + "loss": 0.0058, + "step": 360 + }, + { + "clip_ratio": 0.0, + "completion_length": 523.5016980171204, + "epoch": 0.4990588867975262, + "grad_norm": 1978.1619873046875, + "kl": 5.663617134094238, + "learning_rate": 4.982634256371529e-06, + "loss": 0.2313, + "num_tokens": 191670522.0, + "reward": 0.05943080599536188, + "reward_std": 0.06242607004242018, + "rewards/pure_accuracy_reward_math": 0.059430805064039305, + "step": 361 + }, + { + "clip_ratio": 0.0003008291907349303, + "epoch": 0.5009709898120761, + "grad_norm": 6.705481052398682, + "kl": 0.07292413711547852, + "learning_rate": 4.981888330234998e-06, + "loss": 0.0076, + "step": 362 + }, + { + "clip_ratio": 0.00038137949604788446, + "epoch": 0.502883092826626, + "grad_norm": 0.4056338369846344, + "kl": 0.013193130493164062, + "learning_rate": 4.981126777188976e-06, + "loss": 0.0053, + "step": 363 + }, + { + "clip_ratio": 0.00039371675529764616, + "epoch": 0.5047951958411759, + "grad_norm": 0.40032151341438293, + "kl": 0.009969472885131836, + "learning_rate": 4.980349602028489e-06, + "loss": 0.0052, + "step": 364 + }, + { + "clip_ratio": 0.0003270253398568457, + "epoch": 0.5067072988557259, + "grad_norm": 0.08224909007549286, + "kl": 0.010345458984375, + "learning_rate": 4.979556809646928e-06, + "loss": 0.0051, + "step": 365 + }, + { + "clip_ratio": 0.0, + "completion_length": 534.6082878112793, + "epoch": 0.5086194018702758, + "grad_norm": 0.036373648792505264, + "kl": 0.003941535949707031, + "learning_rate": 4.978748405036014e-06, + "loss": 0.0071, + "num_tokens": 195317270.0, + "reward": 0.05552455584984273, + "reward_std": 0.06775363947963342, + "rewards/pure_accuracy_reward_math": 0.05552455486031249, + "step": 366 + }, + { + "clip_ratio": 0.00027453447256675645, + "epoch": 0.5105315048848257, + "grad_norm": 0.03525104746222496, + "kl": 0.0039365291595458984, + "learning_rate": 4.977924393285767e-06, + "loss": 0.0072, + "step": 367 + }, + { + "clip_ratio": 0.0003015769660521528, + "epoch": 0.5124436078993756, + "grad_norm": 0.03737647458910942, + "kl": 0.0039522647857666016, + "learning_rate": 4.977084779584479e-06, + "loss": 0.0071, + "step": 368 + }, + { + "clip_ratio": 0.0002889172319555655, + "epoch": 0.5143557109139255, + "grad_norm": 0.03506501764059067, + "kl": 0.0039052963256835938, + "learning_rate": 4.976229569218676e-06, + "loss": 0.0071, + "step": 369 + }, + { + "clip_ratio": 0.0002910121094146234, + "epoch": 0.5162678139284754, + "grad_norm": 0.03558839485049248, + "kl": 0.003898143768310547, + "learning_rate": 4.975358767573085e-06, + "loss": 0.007, + "step": 370 + }, + { + "clip_ratio": 0.0, + "completion_length": 523.1417660713196, + "epoch": 0.5181799169430253, + "grad_norm": 9.403284072875977, + "kl": 0.0705575942993164, + "learning_rate": 4.974472380130605e-06, + "loss": 0.0078, + "num_tokens": 198926094.0, + "reward": 0.06305803885334171, + "reward_std": 0.0737193762906827, + "rewards/pure_accuracy_reward_math": 0.06305803733994253, + "step": 371 + }, + { + "clip_ratio": 0.00028168898450076085, + "epoch": 0.5200920199575753, + "grad_norm": 0.10174906253814697, + "kl": 0.005540609359741211, + "learning_rate": 4.9735704124722665e-06, + "loss": 0.0053, + "step": 372 + }, + { + "clip_ratio": 0.00026055807722968893, + "epoch": 0.5220041229721252, + "grad_norm": 0.036394841969013214, + "kl": 0.004784584045410156, + "learning_rate": 4.9726528702771985e-06, + "loss": 0.0052, + "step": 373 + }, + { + "clip_ratio": 0.0003154287535949152, + "epoch": 0.523916225986675, + "grad_norm": 0.03702308237552643, + "kl": 0.004788875579833984, + "learning_rate": 4.971719759322596e-06, + "loss": 0.0052, + "step": 374 + }, + { + "clip_ratio": 0.000301387064496339, + "epoch": 0.5258283290012249, + "grad_norm": 0.03516030311584473, + "kl": 0.004770994186401367, + "learning_rate": 4.97077108548368e-06, + "loss": 0.0051, + "step": 375 + }, + { + "clip_ratio": 0.0, + "completion_length": 520.7994132041931, + "epoch": 0.5277404320157748, + "grad_norm": 0.04183080792427063, + "kl": 0.006031513214111328, + "learning_rate": 4.969806854733658e-06, + "loss": 0.0091, + "num_tokens": 202522419.0, + "reward": 0.0638950924621895, + "reward_std": 0.07990403153235093, + "rewards/pure_accuracy_reward_math": 0.0638950903667137, + "step": 376 + }, + { + "clip_ratio": 0.00032519385399609746, + "epoch": 0.5296525350303247, + "grad_norm": 0.0407201424241066, + "kl": 0.005979061126708984, + "learning_rate": 4.968827073143694e-06, + "loss": 0.0091, + "step": 377 + }, + { + "clip_ratio": 0.00031682528469900717, + "epoch": 0.5315646380448746, + "grad_norm": 0.040043942630290985, + "kl": 0.005922555923461914, + "learning_rate": 4.967831746882863e-06, + "loss": 0.0091, + "step": 378 + }, + { + "clip_ratio": 0.00033513708405052967, + "epoch": 0.5334767410594246, + "grad_norm": 0.03983679041266441, + "kl": 0.005841970443725586, + "learning_rate": 4.966820882218118e-06, + "loss": 0.009, + "step": 379 + }, + { + "clip_ratio": 0.00034104771594911654, + "epoch": 0.5353888440739745, + "grad_norm": 0.03983955457806587, + "kl": 0.005755186080932617, + "learning_rate": 4.965794485514245e-06, + "loss": 0.0089, + "step": 380 + }, + { + "clip_ratio": 0.0, + "completion_length": 520.5067186355591, + "epoch": 0.5373009470885244, + "grad_norm": 0.034092146903276443, + "kl": 0.0043926239013671875, + "learning_rate": 4.964752563233826e-06, + "loss": 0.008, + "num_tokens": 206122403.0, + "reward": 0.055803573748562485, + "reward_std": 0.05980854749213904, + "rewards/pure_accuracy_reward_math": 0.05580357275903225, + "step": 381 + }, + { + "clip_ratio": 0.00025422318708478997, + "epoch": 0.5392130501030743, + "grad_norm": 0.03263320028781891, + "kl": 0.0043218135833740234, + "learning_rate": 4.9636951219372e-06, + "loss": 0.008, + "step": 382 + }, + { + "clip_ratio": 0.00025885856206286917, + "epoch": 0.5411251531176242, + "grad_norm": 0.032487623393535614, + "kl": 0.004242420196533203, + "learning_rate": 4.962622168282416e-06, + "loss": 0.008, + "step": 383 + }, + { + "clip_ratio": 0.0002850476581102157, + "epoch": 0.5430372561321741, + "grad_norm": 0.032427769154310226, + "kl": 0.004185199737548828, + "learning_rate": 4.961533709025199e-06, + "loss": 0.0079, + "step": 384 + }, + { + "clip_ratio": 0.00029774147623129466, + "epoch": 0.544949359146724, + "grad_norm": 0.031092027202248573, + "kl": 0.004144430160522461, + "learning_rate": 4.960429751018901e-06, + "loss": 0.0079, + "step": 385 + }, + { + "clip_ratio": 0.0, + "completion_length": 522.9258050918579, + "epoch": 0.546861462161274, + "grad_norm": 0.6398438811302185, + "kl": 0.013398170471191406, + "learning_rate": 4.959310301214458e-06, + "loss": 0.0048, + "num_tokens": 209727833.0, + "reward": 0.06668527127476409, + "reward_std": 0.07586519059259444, + "rewards/pure_accuracy_reward_math": 0.06668526941211894, + "step": 386 + }, + { + "clip_ratio": 0.0002956847454242961, + "epoch": 0.5487735651758239, + "grad_norm": 0.09603609144687653, + "kl": 0.006535530090332031, + "learning_rate": 4.958175366660352e-06, + "loss": 0.0045, + "step": 387 + }, + { + "clip_ratio": 0.00032585520455086225, + "epoch": 0.5506856681903738, + "grad_norm": 0.042251698672771454, + "kl": 0.004881858825683594, + "learning_rate": 4.95702495450256e-06, + "loss": 0.0045, + "step": 388 + }, + { + "clip_ratio": 0.00030688931195754776, + "epoch": 0.5525977712049237, + "grad_norm": 0.03725959733128548, + "kl": 0.00462651252746582, + "learning_rate": 4.955859071984512e-06, + "loss": 0.0044, + "step": 389 + }, + { + "clip_ratio": 0.0002833517196449975, + "epoch": 0.5545098742194736, + "grad_norm": 0.03557269275188446, + "kl": 0.004591941833496094, + "learning_rate": 4.954677726447049e-06, + "loss": 0.0044, + "step": 390 + }, + { + "clip_ratio": 0.0, + "completion_length": 529.50141954422, + "epoch": 0.5564219772340235, + "grad_norm": 0.03767434135079384, + "kl": 0.0041730403900146484, + "learning_rate": 4.953480925328369e-06, + "loss": 0.0053, + "num_tokens": 213359594.0, + "reward": 0.05636160998255946, + "reward_std": 0.06873711966909468, + "rewards/pure_accuracy_reward_math": 0.05636160829453729, + "step": 391 + }, + { + "clip_ratio": 0.0002943199858691514, + "epoch": 0.5583340802485733, + "grad_norm": 0.03691519424319267, + "kl": 0.004199981689453125, + "learning_rate": 4.952268676163984e-06, + "loss": 0.0053, + "step": 392 + }, + { + "clip_ratio": 0.00028674039270981666, + "epoch": 0.5602461832631233, + "grad_norm": 0.036044176667928696, + "kl": 0.004216432571411133, + "learning_rate": 4.951040986586676e-06, + "loss": 0.0053, + "step": 393 + }, + { + "clip_ratio": 0.0003071572371595721, + "epoch": 0.5621582862776732, + "grad_norm": 0.0358373187482357, + "kl": 0.004226207733154297, + "learning_rate": 4.949797864326442e-06, + "loss": 0.0053, + "step": 394 + }, + { + "clip_ratio": 0.000308680556543095, + "epoch": 0.5640703892922231, + "grad_norm": 0.0356404110789299, + "kl": 0.004263877868652344, + "learning_rate": 4.9485393172104525e-06, + "loss": 0.0052, + "step": 395 + }, + { + "clip_ratio": 0.0, + "completion_length": 528.1506924629211, + "epoch": 0.565982492306773, + "grad_norm": 0.03425108641386032, + "kl": 0.004232645034790039, + "learning_rate": 4.947265353162997e-06, + "loss": 0.0047, + "num_tokens": 216984490.0, + "reward": 0.05831473466241732, + "reward_std": 0.06912249873857945, + "rewards/pure_accuracy_reward_math": 0.058314733556471765, + "step": 396 + }, + { + "clip_ratio": 0.0002443079777663115, + "epoch": 0.5678945953213229, + "grad_norm": 0.03406741842627525, + "kl": 0.004246950149536133, + "learning_rate": 4.945975980205435e-06, + "loss": 0.0046, + "step": 397 + }, + { + "clip_ratio": 0.00025582832455484095, + "epoch": 0.5698066983358728, + "grad_norm": 0.033892109990119934, + "kl": 0.004239320755004883, + "learning_rate": 4.944671206456148e-06, + "loss": 0.0046, + "step": 398 + }, + { + "clip_ratio": 0.0002801110364885062, + "epoch": 0.5717188013504227, + "grad_norm": 0.03294463828206062, + "kl": 0.0042018890380859375, + "learning_rate": 4.943351040130485e-06, + "loss": 0.0046, + "step": 399 + }, + { + "clip_ratio": 0.00030015600407296006, + "epoch": 0.5736309043649727, + "grad_norm": 0.03228214010596275, + "kl": 0.004125118255615234, + "learning_rate": 4.942015489540715e-06, + "loss": 0.0045, + "step": 400 + }, + { + "clip_ratio": 0.0, + "completion_length": 527.8225684165955, + "epoch": 0.5755430073795226, + "grad_norm": 0.037567272782325745, + "kl": 0.005152702331542969, + "learning_rate": 4.94066456309597e-06, + "loss": 0.0071, + "num_tokens": 220604938.0, + "reward": 0.06166294886497781, + "reward_std": 0.07311507751001045, + "rewards/pure_accuracy_reward_math": 0.06166294764261693, + "step": 401 + }, + { + "clip_ratio": 0.0002694410874823916, + "epoch": 0.5774551103940725, + "grad_norm": 0.036373041570186615, + "kl": 0.005210161209106445, + "learning_rate": 4.939298269302194e-06, + "loss": 0.0071, + "step": 402 + }, + { + "clip_ratio": 0.0002891406058438406, + "epoch": 0.5793672134086224, + "grad_norm": 0.03582580015063286, + "kl": 0.0052187442779541016, + "learning_rate": 4.9379166167620915e-06, + "loss": 0.007, + "step": 403 + }, + { + "clip_ratio": 0.00030127688086167836, + "epoch": 0.5812793164231723, + "grad_norm": 0.035248763859272, + "kl": 0.005229949951171875, + "learning_rate": 4.93651961417507e-06, + "loss": 0.007, + "step": 404 + }, + { + "clip_ratio": 0.00031262176707969047, + "epoch": 0.5831914194377222, + "grad_norm": 0.03461577743291855, + "kl": 0.00519251823425293, + "learning_rate": 4.9351072703371885e-06, + "loss": 0.0069, + "step": 405 + }, + { + "clip_ratio": 0.0, + "completion_length": 534.0067219734192, + "epoch": 0.5851035224522722, + "grad_norm": 0.0363302007317543, + "kl": 0.004278659820556641, + "learning_rate": 4.933679594141096e-06, + "loss": 0.0041, + "num_tokens": 224253906.0, + "reward": 0.06222098533180542, + "reward_std": 0.07462272536940873, + "rewards/pure_accuracy_reward_math": 0.06222098329453729, + "step": 406 + }, + { + "clip_ratio": 0.0002887690876320903, + "epoch": 0.5870156254668221, + "grad_norm": 0.03538454696536064, + "kl": 0.004297971725463867, + "learning_rate": 4.932236594575986e-06, + "loss": 0.0041, + "step": 407 + }, + { + "clip_ratio": 0.00029836769689950415, + "epoch": 0.588927728481372, + "grad_norm": 0.03521309420466423, + "kl": 0.004305362701416016, + "learning_rate": 4.9307782807275304e-06, + "loss": 0.0041, + "step": 408 + }, + { + "clip_ratio": 0.0003077857980144927, + "epoch": 0.5908398314959219, + "grad_norm": 0.03468110039830208, + "kl": 0.004298210144042969, + "learning_rate": 4.929304661777823e-06, + "loss": 0.0041, + "step": 409 + }, + { + "clip_ratio": 0.00030735837987094783, + "epoch": 0.5927519345104717, + "grad_norm": 0.03504593297839165, + "kl": 0.004282474517822266, + "learning_rate": 4.9278157470053305e-06, + "loss": 0.004, + "step": 410 + }, + { + "clip_ratio": 0.0, + "completion_length": 531.0987973213196, + "epoch": 0.5946640375250216, + "grad_norm": 0.03893313929438591, + "kl": 0.004411935806274414, + "learning_rate": 4.926311545784823e-06, + "loss": 0.0081, + "num_tokens": 227887088.0, + "reward": 0.06138393160654232, + "reward_std": 0.07560620526783168, + "rewards/pure_accuracy_reward_math": 0.061383930034935474, + "step": 411 + }, + { + "clip_ratio": 0.0003015478255292692, + "epoch": 0.5965761405395715, + "grad_norm": 0.03745520859956741, + "kl": 0.004415750503540039, + "learning_rate": 4.924792067587321e-06, + "loss": 0.0081, + "step": 412 + }, + { + "clip_ratio": 0.00033068407248038056, + "epoch": 0.5984882435541214, + "grad_norm": 0.037219781428575516, + "kl": 0.004396915435791016, + "learning_rate": 4.923257321980036e-06, + "loss": 0.0081, + "step": 413 + }, + { + "clip_ratio": 0.00037280973344877566, + "epoch": 0.6004003465686714, + "grad_norm": 0.03754372149705887, + "kl": 0.0044384002685546875, + "learning_rate": 4.9217073186263075e-06, + "loss": 0.0081, + "step": 414 + }, + { + "clip_ratio": 0.0003646712993372603, + "epoch": 0.6023124495832213, + "grad_norm": 0.03602118790149689, + "kl": 0.004477262496948242, + "learning_rate": 4.920142067285544e-06, + "loss": 0.008, + "step": 415 + }, + { + "clip_ratio": 0.0, + "completion_length": 508.44282722473145, + "epoch": 0.6042245525977712, + "grad_norm": 0.039943527430295944, + "kl": 0.004469871520996094, + "learning_rate": 4.9185615778131614e-06, + "loss": 0.0078, + "num_tokens": 231443183.0, + "reward": 0.0705915211874526, + "reward_std": 0.07968511217040941, + "rewards/pure_accuracy_reward_math": 0.07059151926659979, + "step": 416 + }, + { + "clip_ratio": 0.00031770144798315414, + "epoch": 0.6061366556123211, + "grad_norm": 0.039055656641721725, + "kl": 0.004549264907836914, + "learning_rate": 4.916965860160521e-06, + "loss": 0.0078, + "step": 417 + }, + { + "clip_ratio": 0.00030108455553090607, + "epoch": 0.608048758626871, + "grad_norm": 0.03719799593091011, + "kl": 0.004551410675048828, + "learning_rate": 4.915354924374864e-06, + "loss": 0.0078, + "step": 418 + }, + { + "clip_ratio": 0.0003208976940527464, + "epoch": 0.6099608616414209, + "grad_norm": 0.03626833111047745, + "kl": 0.004576444625854492, + "learning_rate": 4.913728780599254e-06, + "loss": 0.0077, + "step": 419 + }, + { + "clip_ratio": 0.00030395733068644404, + "epoch": 0.6118729646559709, + "grad_norm": 0.035672470927238464, + "kl": 0.004616498947143555, + "learning_rate": 4.912087439072508e-06, + "loss": 0.0077, + "step": 420 + }, + { + "clip_ratio": 0.0, + "completion_length": 519.3401436805725, + "epoch": 0.6137850676705208, + "grad_norm": 0.035979609936475754, + "kl": 0.004936695098876953, + "learning_rate": 4.9104309101291345e-06, + "loss": 0.008, + "num_tokens": 235040570.0, + "reward": 0.0558035739522893, + "reward_std": 0.06414644059259444, + "rewards/pure_accuracy_reward_math": 0.05580357278813608, + "step": 421 + }, + { + "clip_ratio": 0.0002606460908509689, + "epoch": 0.6156971706850707, + "grad_norm": 0.034824173897504807, + "kl": 0.004873991012573242, + "learning_rate": 4.908759204199268e-06, + "loss": 0.008, + "step": 422 + }, + { + "clip_ratio": 0.0002711625579081556, + "epoch": 0.6176092736996206, + "grad_norm": 0.034011878073215485, + "kl": 0.00480341911315918, + "learning_rate": 4.907072331808602e-06, + "loss": 0.008, + "step": 423 + }, + { + "clip_ratio": 0.0002719364555332504, + "epoch": 0.6195213767141705, + "grad_norm": 0.0330798402428627, + "kl": 0.00470733642578125, + "learning_rate": 4.905370303578324e-06, + "loss": 0.0079, + "step": 424 + }, + { + "clip_ratio": 0.0003164075427548596, + "epoch": 0.6214334797287204, + "grad_norm": 0.03356935828924179, + "kl": 0.004645586013793945, + "learning_rate": 4.903653130225049e-06, + "loss": 0.0079, + "step": 425 + }, + { + "clip_ratio": 0.0, + "completion_length": 524.4051547050476, + "epoch": 0.6233455827432703, + "grad_norm": 0.037987031042575836, + "kl": 0.004395723342895508, + "learning_rate": 4.901920822560753e-06, + "loss": 0.004, + "num_tokens": 238650146.0, + "reward": 0.056082592491293326, + "reward_std": 0.06946781190345064, + "rewards/pure_accuracy_reward_math": 0.05608259033760987, + "step": 426 + }, + { + "clip_ratio": 0.0002752577877913609, + "epoch": 0.6252576857578201, + "grad_norm": 0.03711739555001259, + "kl": 0.0043413639068603516, + "learning_rate": 4.900173391492698e-06, + "loss": 0.004, + "step": 427 + }, + { + "clip_ratio": 0.0002780464546390249, + "epoch": 0.6271697887723701, + "grad_norm": 0.03583519160747528, + "kl": 0.004349231719970703, + "learning_rate": 4.898410848023374e-06, + "loss": 0.004, + "step": 428 + }, + { + "clip_ratio": 0.0002759867400072835, + "epoch": 0.62908189178692, + "grad_norm": 0.035115331411361694, + "kl": 0.0043909549713134766, + "learning_rate": 4.896633203250424e-06, + "loss": 0.0039, + "step": 429 + }, + { + "clip_ratio": 0.0002873923492074937, + "epoch": 0.6309939948014699, + "grad_norm": 0.03465187922120094, + "kl": 0.004460573196411133, + "learning_rate": 4.89484046836657e-06, + "loss": 0.0039, + "step": 430 + }, + { + "clip_ratio": 0.0, + "completion_length": 527.1116304397583, + "epoch": 0.6329060978160198, + "grad_norm": 0.03591939061880112, + "kl": 0.004395723342895508, + "learning_rate": 4.893032654659554e-06, + "loss": 0.0068, + "num_tokens": 242275198.0, + "reward": 0.05859375320142135, + "reward_std": 0.06461814750218764, + "rewards/pure_accuracy_reward_math": 0.05859375110594556, + "step": 431 + }, + { + "clip_ratio": 0.00021255032419276176, + "epoch": 0.6348182008305697, + "grad_norm": 0.03488593176007271, + "kl": 0.0043849945068359375, + "learning_rate": 4.891209773512054e-06, + "loss": 0.0068, + "step": 432 + }, + { + "clip_ratio": 0.00023523596212271514, + "epoch": 0.6367303038451196, + "grad_norm": 0.03410722687840462, + "kl": 0.004419565200805664, + "learning_rate": 4.889371836401621e-06, + "loss": 0.0067, + "step": 433 + }, + { + "clip_ratio": 0.00024576090385153293, + "epoch": 0.6386424068596696, + "grad_norm": 0.03335421159863472, + "kl": 0.004421710968017578, + "learning_rate": 4.887518854900603e-06, + "loss": 0.0067, + "step": 434 + }, + { + "clip_ratio": 0.0002828803910119859, + "epoch": 0.6405545098742195, + "grad_norm": 0.03240649402141571, + "kl": 0.004340171813964844, + "learning_rate": 4.885650840676074e-06, + "loss": 0.0066, + "step": 435 + }, + { + "clip_ratio": 0.0, + "completion_length": 532.2051043510437, + "epoch": 0.6424666128887694, + "grad_norm": 0.03588009625673294, + "kl": 0.0044574737548828125, + "learning_rate": 4.88376780548976e-06, + "loss": 0.0041, + "num_tokens": 245917009.0, + "reward": 0.05775669912691228, + "reward_std": 0.06611959752626717, + "rewards/pure_accuracy_reward_math": 0.05775669778813608, + "step": 436 + }, + { + "clip_ratio": 0.0002524082638899472, + "epoch": 0.6443787159033193, + "grad_norm": 0.03471923619508743, + "kl": 0.0044062137603759766, + "learning_rate": 4.881869761197963e-06, + "loss": 0.0041, + "step": 437 + }, + { + "clip_ratio": 0.0002889056303843063, + "epoch": 0.6462908189178692, + "grad_norm": 0.03379988297820091, + "kl": 0.004372119903564453, + "learning_rate": 4.879956719751491e-06, + "loss": 0.004, + "step": 438 + }, + { + "clip_ratio": 0.0003009145272017122, + "epoch": 0.6482029219324191, + "grad_norm": 0.03446533530950546, + "kl": 0.004400730133056641, + "learning_rate": 4.878028693195577e-06, + "loss": 0.004, + "step": 439 + }, + { + "clip_ratio": 0.00030466545126728306, + "epoch": 0.650115024946969, + "grad_norm": 0.03484022617340088, + "kl": 0.004462242126464844, + "learning_rate": 4.876085693669806e-06, + "loss": 0.0039, + "step": 440 + }, + { + "clip_ratio": 0.0, + "completion_length": 517.0904240608215, + "epoch": 0.652027127961519, + "grad_norm": 0.0366295725107193, + "kl": 0.004509925842285156, + "learning_rate": 4.8741277334080405e-06, + "loss": 0.0066, + "num_tokens": 249502673.0, + "reward": 0.05719866382423788, + "reward_std": 0.06594694149680436, + "rewards/pure_accuracy_reward_math": 0.057198662078008056, + "step": 441 + }, + { + "clip_ratio": 0.00023539985437537325, + "epoch": 0.6539392309760689, + "grad_norm": 0.03590084984898567, + "kl": 0.0045740604400634766, + "learning_rate": 4.87215482473834e-06, + "loss": 0.0066, + "step": 442 + }, + { + "clip_ratio": 0.00022167488214108744, + "epoch": 0.6558513339906188, + "grad_norm": 0.03433714434504509, + "kl": 0.004676342010498047, + "learning_rate": 4.870166980082885e-06, + "loss": 0.0066, + "step": 443 + }, + { + "clip_ratio": 0.0002476425726172238, + "epoch": 0.6577634370051687, + "grad_norm": 0.03389691188931465, + "kl": 0.004789113998413086, + "learning_rate": 4.868164211957899e-06, + "loss": 0.0065, + "step": 444 + }, + { + "clip_ratio": 0.00025810993128061455, + "epoch": 0.6596755400197185, + "grad_norm": 0.03417885676026344, + "kl": 0.004879474639892578, + "learning_rate": 4.866146532973569e-06, + "loss": 0.0064, + "step": 445 + }, + { + "clip_ratio": 0.0, + "completion_length": 520.3697214126587, + "epoch": 0.6615876430342684, + "grad_norm": 0.03560737892985344, + "kl": 0.00455927848815918, + "learning_rate": 4.864113955833967e-06, + "loss": 0.0056, + "num_tokens": 253104314.0, + "reward": 0.06584821722935885, + "reward_std": 0.07672227645525709, + "rewards/pure_accuracy_reward_math": 0.06584821565775201, + "step": 446 + }, + { + "clip_ratio": 0.00029780695723502504, + "epoch": 0.6634997460488183, + "grad_norm": 0.034836821258068085, + "kl": 0.0045278072357177734, + "learning_rate": 4.862066493336967e-06, + "loss": 0.0056, + "step": 447 + }, + { + "clip_ratio": 0.00030120932990485016, + "epoch": 0.6654118490633683, + "grad_norm": 0.03460467606782913, + "kl": 0.0045435428619384766, + "learning_rate": 4.860004158374172e-06, + "loss": 0.0055, + "step": 448 + }, + { + "clip_ratio": 0.000313081463019671, + "epoch": 0.6673239520779182, + "grad_norm": 0.03467562422156334, + "kl": 0.004552364349365234, + "learning_rate": 4.857926963930822e-06, + "loss": 0.0055, + "step": 449 + }, + { + "clip_ratio": 0.00031086072692687594, + "epoch": 0.6692360550924681, + "grad_norm": 0.03409102186560631, + "kl": 0.004626035690307617, + "learning_rate": 4.855834923085721e-06, + "loss": 0.0054, + "step": 450 + }, + { + "clip_ratio": 0.0, + "completion_length": 514.4771447181702, + "epoch": 0.671148158107018, + "grad_norm": 0.03815117105841637, + "kl": 0.005002737045288086, + "learning_rate": 4.853728049011151e-06, + "loss": 0.0091, + "num_tokens": 256687388.0, + "reward": 0.06556919938884676, + "reward_std": 0.07874169782735407, + "rewards/pure_accuracy_reward_math": 0.06556919787544757, + "step": 451 + }, + { + "clip_ratio": 0.0003133106871473501, + "epoch": 0.6730602611215679, + "grad_norm": 0.03761136531829834, + "kl": 0.005041837692260742, + "learning_rate": 4.851606354972791e-06, + "loss": 0.0091, + "step": 452 + }, + { + "clip_ratio": 0.00034106033973557714, + "epoch": 0.6749723641361178, + "grad_norm": 0.0372379869222641, + "kl": 0.0050508975982666016, + "learning_rate": 4.849469854329629e-06, + "loss": 0.0091, + "step": 453 + }, + { + "clip_ratio": 0.00033749614277667206, + "epoch": 0.6768844671506677, + "grad_norm": 0.03686762601137161, + "kl": 0.005095005035400391, + "learning_rate": 4.847318560533882e-06, + "loss": 0.009, + "step": 454 + }, + { + "clip_ratio": 0.00035140375177888927, + "epoch": 0.6787965701652177, + "grad_norm": 0.036469750106334686, + "kl": 0.005120754241943359, + "learning_rate": 4.845152487130914e-06, + "loss": 0.009, + "step": 455 + }, + { + "clip_ratio": 0.0, + "completion_length": 512.4866299629211, + "epoch": 0.6807086731797676, + "grad_norm": 0.037901297211647034, + "kl": 0.004809379577636719, + "learning_rate": 4.842971647759142e-06, + "loss": 0.0063, + "num_tokens": 260253700.0, + "reward": 0.05775669912691228, + "reward_std": 0.06710927549283952, + "rewards/pure_accuracy_reward_math": 0.05775669767172076, + "step": 456 + }, + { + "clip_ratio": 0.00026634283756266086, + "epoch": 0.6826207761943175, + "grad_norm": 0.03568252548575401, + "kl": 0.0047724246978759766, + "learning_rate": 4.840776056149957e-06, + "loss": 0.0063, + "step": 457 + }, + { + "clip_ratio": 0.00027518686636085476, + "epoch": 0.6845328792088674, + "grad_norm": 0.0351024754345417, + "kl": 0.004754543304443359, + "learning_rate": 4.838565726127636e-06, + "loss": 0.0063, + "step": 458 + }, + { + "clip_ratio": 0.0003387172891393675, + "epoch": 0.6864449822234173, + "grad_norm": 0.03477272391319275, + "kl": 0.004698753356933594, + "learning_rate": 4.836340671609255e-06, + "loss": 0.0062, + "step": 459 + }, + { + "clip_ratio": 0.0003592506114102889, + "epoch": 0.6883570852379672, + "grad_norm": 0.035812895745038986, + "kl": 0.004735708236694336, + "learning_rate": 4.834100906604601e-06, + "loss": 0.0062, + "step": 460 + }, + { + "clip_ratio": 0.0, + "completion_length": 536.1403703689575, + "epoch": 0.6902691882525172, + "grad_norm": 0.03566034138202667, + "kl": 0.004418611526489258, + "learning_rate": 4.831846445216082e-06, + "loss": 0.0056, + "num_tokens": 263902651.0, + "reward": 0.05161830614088103, + "reward_std": 0.06899610540131107, + "rewards/pure_accuracy_reward_math": 0.051618304976727813, + "step": 461 + }, + { + "clip_ratio": 0.00028340513017610647, + "epoch": 0.6921812912670671, + "grad_norm": 0.03495897352695465, + "kl": 0.004414081573486328, + "learning_rate": 4.829577301638642e-06, + "loss": 0.0056, + "step": 462 + }, + { + "clip_ratio": 0.0002825141077664739, + "epoch": 0.6940933942816169, + "grad_norm": 0.034486111253499985, + "kl": 0.004411220550537109, + "learning_rate": 4.827293490159668e-06, + "loss": 0.0056, + "step": 463 + }, + { + "clip_ratio": 0.00031019614829119746, + "epoch": 0.6960054972961668, + "grad_norm": 0.035884980112314224, + "kl": 0.004367351531982422, + "learning_rate": 4.824995025158903e-06, + "loss": 0.0055, + "step": 464 + }, + { + "clip_ratio": 0.0003045983889933268, + "epoch": 0.6979176003107167, + "grad_norm": 0.03378836810588837, + "kl": 0.004292488098144531, + "learning_rate": 4.822681921108355e-06, + "loss": 0.0055, + "step": 465 + }, + { + "clip_ratio": 0.0, + "completion_length": 525.3783731460571, + "epoch": 0.6998297033252666, + "grad_norm": 0.03726997971534729, + "kl": 0.0065157413482666016, + "learning_rate": 4.8203541925722016e-06, + "loss": 0.0017, + "num_tokens": 267508687.0, + "reward": 0.06724330646102317, + "reward_std": 0.07591145433252677, + "rewards/pure_accuracy_reward_math": 0.06724330500583164, + "step": 466 + }, + { + "clip_ratio": 0.00026273680936128585, + "epoch": 0.7017418063398165, + "grad_norm": 0.03638988733291626, + "kl": 0.0064983367919921875, + "learning_rate": 4.818011854206706e-06, + "loss": 0.0017, + "step": 467 + }, + { + "clip_ratio": 0.0002903113285128711, + "epoch": 0.7036539093543664, + "grad_norm": 0.0360158272087574, + "kl": 0.006509542465209961, + "learning_rate": 4.815654920760117e-06, + "loss": 0.0016, + "step": 468 + }, + { + "clip_ratio": 0.0002849762186087901, + "epoch": 0.7055660123689164, + "grad_norm": 0.03577370196580887, + "kl": 0.006470680236816406, + "learning_rate": 4.81328340707258e-06, + "loss": 0.0016, + "step": 469 + }, + { + "clip_ratio": 0.00031370155647891806, + "epoch": 0.7074781153834663, + "grad_norm": 0.03484919294714928, + "kl": 0.006468772888183594, + "learning_rate": 4.810897328076045e-06, + "loss": 0.0015, + "step": 470 + }, + { + "clip_ratio": 0.0, + "completion_length": 530.1677136421204, + "epoch": 0.7093902183980162, + "grad_norm": 0.04198005422949791, + "kl": 0.004724264144897461, + "learning_rate": 4.808496698794171e-06, + "loss": 0.0046, + "num_tokens": 271138708.0, + "reward": 0.07310268204309978, + "reward_std": 0.07646948879119009, + "rewards/pure_accuracy_reward_math": 0.07310267994762398, + "step": 471 + }, + { + "clip_ratio": 0.00028702764876697984, + "epoch": 0.7113023214125661, + "grad_norm": 0.04015243798494339, + "kl": 0.004670619964599609, + "learning_rate": 4.8060815343422265e-06, + "loss": 0.0045, + "step": 472 + }, + { + "clip_ratio": 0.0002947892680822406, + "epoch": 0.713214424427116, + "grad_norm": 0.0385352224111557, + "kl": 0.0046727657318115234, + "learning_rate": 4.803651849927004e-06, + "loss": 0.0045, + "step": 473 + }, + { + "clip_ratio": 0.00036661511779811917, + "epoch": 0.7151265274416659, + "grad_norm": 0.03803607076406479, + "kl": 0.00463414192199707, + "learning_rate": 4.801207660846717e-06, + "loss": 0.0044, + "step": 474 + }, + { + "clip_ratio": 0.00040073674449558894, + "epoch": 0.7170386304562159, + "grad_norm": 0.03870271518826485, + "kl": 0.00464320182800293, + "learning_rate": 4.798748982490908e-06, + "loss": 0.0044, + "step": 475 + }, + { + "clip_ratio": 0.0, + "completion_length": 539.262857913971, + "epoch": 0.7189507334707658, + "grad_norm": 0.0374424010515213, + "kl": 0.0045392513275146484, + "learning_rate": 4.796275830340344e-06, + "loss": 0.0081, + "num_tokens": 274802094.0, + "reward": 0.061941967433085665, + "reward_std": 0.07401842664694414, + "rewards/pure_accuracy_reward_math": 0.06194196522119455, + "step": 476 + }, + { + "clip_ratio": 0.00026828293908920386, + "epoch": 0.7208628364853157, + "grad_norm": 0.03758076950907707, + "kl": 0.004576683044433594, + "learning_rate": 4.793788219966931e-06, + "loss": 0.0081, + "step": 477 + }, + { + "clip_ratio": 0.0002991793934654652, + "epoch": 0.7227749394998656, + "grad_norm": 0.03570091351866722, + "kl": 0.0045130252838134766, + "learning_rate": 4.7912861670336065e-06, + "loss": 0.008, + "step": 478 + }, + { + "clip_ratio": 0.00031140293214093617, + "epoch": 0.7246870425144155, + "grad_norm": 0.034991368651390076, + "kl": 0.0044956207275390625, + "learning_rate": 4.788769687294243e-06, + "loss": 0.008, + "step": 479 + }, + { + "clip_ratio": 0.00034215352269484356, + "epoch": 0.7265991455289653, + "grad_norm": 0.03517301753163338, + "kl": 0.00450587272644043, + "learning_rate": 4.7862387965935504e-06, + "loss": 0.0079, + "step": 480 + }, + { + "clip_ratio": 0.0, + "completion_length": 535.2455615997314, + "epoch": 0.7285112485435152, + "grad_norm": 0.03517255187034607, + "kl": 0.004718780517578125, + "learning_rate": 4.783693510866977e-06, + "loss": 0.0066, + "num_tokens": 278455030.0, + "reward": 0.06222098530270159, + "reward_std": 0.069766862958204, + "rewards/pure_accuracy_reward_math": 0.062220983498264104, + "step": 481 + }, + { + "clip_ratio": 0.00026954136529866446, + "epoch": 0.7304233515580651, + "grad_norm": 0.03456445038318634, + "kl": 0.004766225814819336, + "learning_rate": 4.781133846140606e-06, + "loss": 0.0066, + "step": 482 + }, + { + "clip_ratio": 0.000250861422671278, + "epoch": 0.7323354545726151, + "grad_norm": 0.033632129430770874, + "kl": 0.004829883575439453, + "learning_rate": 4.778559818531055e-06, + "loss": 0.0066, + "step": 483 + }, + { + "clip_ratio": 0.0002590245896385568, + "epoch": 0.734247557587165, + "grad_norm": 0.03314875811338425, + "kl": 0.00486445426940918, + "learning_rate": 4.775971444245379e-06, + "loss": 0.0065, + "step": 484 + }, + { + "clip_ratio": 0.0002899982684425595, + "epoch": 0.7361596606017149, + "grad_norm": 0.03288432955741882, + "kl": 0.004921674728393555, + "learning_rate": 4.773368739580963e-06, + "loss": 0.0065, + "step": 485 + }, + { + "clip_ratio": 0.0, + "completion_length": 524.4258046150208, + "epoch": 0.7380717636162648, + "grad_norm": 0.08309170603752136, + "kl": 0.006993293762207031, + "learning_rate": 4.770751720925422e-06, + "loss": 0.0023, + "num_tokens": 282068152.0, + "reward": 0.06222098495345563, + "reward_std": 0.0712282478925772, + "rewards/pure_accuracy_reward_math": 0.06222098338184878, + "step": 486 + }, + { + "clip_ratio": 0.0002442373284452515, + "epoch": 0.7399838666308147, + "grad_norm": 0.042120546102523804, + "kl": 0.006081581115722656, + "learning_rate": 4.768120404756497e-06, + "loss": 0.0023, + "step": 487 + }, + { + "clip_ratio": 0.0002956131474434187, + "epoch": 0.7418959696453646, + "grad_norm": 0.036061204969882965, + "kl": 0.0057599544525146484, + "learning_rate": 4.765474807641951e-06, + "loss": 0.0022, + "step": 488 + }, + { + "clip_ratio": 0.00030389728723889675, + "epoch": 0.7438080726599146, + "grad_norm": 0.03613469749689102, + "kl": 0.005738019943237305, + "learning_rate": 4.762814946239468e-06, + "loss": 0.0022, + "step": 489 + }, + { + "clip_ratio": 0.00033159017920070255, + "epoch": 0.7457201756744645, + "grad_norm": 0.0360892117023468, + "kl": 0.00572967529296875, + "learning_rate": 4.760140837296542e-06, + "loss": 0.0021, + "step": 490 + }, + { + "clip_ratio": 0.0, + "completion_length": 550.3144750595093, + "epoch": 0.7476322786890144, + "grad_norm": 0.03636733815073967, + "kl": 0.004332542419433594, + "learning_rate": 4.757452497650377e-06, + "loss": 0.0072, + "num_tokens": 285770403.0, + "reward": 0.055803573777666315, + "reward_std": 0.07161362667102367, + "rewards/pure_accuracy_reward_math": 0.05580357278813608, + "step": 491 + }, + { + "clip_ratio": 0.00027637260956225873, + "epoch": 0.7495443817035643, + "grad_norm": 0.035727791488170624, + "kl": 0.004361629486083984, + "learning_rate": 4.754749944227777e-06, + "loss": 0.0072, + "step": 492 + }, + { + "clip_ratio": 0.0002587454115428045, + "epoch": 0.7514564847181142, + "grad_norm": 0.03512200713157654, + "kl": 0.0043697357177734375, + "learning_rate": 4.752033194045044e-06, + "loss": 0.0072, + "step": 493 + }, + { + "clip_ratio": 0.00025780797875540884, + "epoch": 0.7533685877326641, + "grad_norm": 0.033817108720541, + "kl": 0.0043947696685791016, + "learning_rate": 4.7493022642078654e-06, + "loss": 0.0071, + "step": 494 + }, + { + "clip_ratio": 0.00029674232627030506, + "epoch": 0.755280690747214, + "grad_norm": 0.03317062556743622, + "kl": 0.004454851150512695, + "learning_rate": 4.746557171911211e-06, + "loss": 0.0071, + "step": 495 + }, + { + "clip_ratio": 0.0, + "completion_length": 543.0692186355591, + "epoch": 0.757192793761764, + "grad_norm": 0.05020016431808472, + "kl": 0.0062062740325927734, + "learning_rate": 4.7437979344392236e-06, + "loss": 0.0059, + "num_tokens": 289451319.0, + "reward": 0.0616629492433276, + "reward_std": 0.07071027776692063, + "rewards/pure_accuracy_reward_math": 0.06166294778813608, + "step": 496 + }, + { + "clip_ratio": 0.00028460744590574905, + "epoch": 0.7591048967763139, + "grad_norm": 0.03948064520955086, + "kl": 0.0061266422271728516, + "learning_rate": 4.741024569165105e-06, + "loss": 0.0059, + "step": 497 + }, + { + "clip_ratio": 0.0002803450769306437, + "epoch": 0.7610169997908637, + "grad_norm": 0.03621263429522514, + "kl": 0.00614476203918457, + "learning_rate": 4.7382370935510165e-06, + "loss": 0.0059, + "step": 498 + }, + { + "clip_ratio": 0.0003022695020717947, + "epoch": 0.7629291028054136, + "grad_norm": 0.037622902542352676, + "kl": 0.006256580352783203, + "learning_rate": 4.73543552514796e-06, + "loss": 0.0058, + "step": 499 + }, + { + "clip_ratio": 0.00030265802058693225, + "epoch": 0.7648412058199635, + "grad_norm": 0.03813454508781433, + "kl": 0.006264209747314453, + "learning_rate": 4.732619881595672e-06, + "loss": 0.0057, + "step": 500 + }, + { + "clip_ratio": 0.0, + "completion_length": 543.3538174629211, + "epoch": 0.7667533088345134, + "grad_norm": 0.07500133663415909, + "kl": 0.005916118621826172, + "learning_rate": 4.729790180622512e-06, + "loss": 0.0072, + "num_tokens": 293127839.0, + "reward": 0.0513392879802268, + "reward_std": 0.06792009877972305, + "rewards/pure_accuracy_reward_math": 0.051339287048904225, + "step": 501 + }, + { + "clip_ratio": 0.0002826226679530919, + "epoch": 0.7686654118490633, + "grad_norm": 0.03498294949531555, + "kl": 0.0057086944580078125, + "learning_rate": 4.726946440045348e-06, + "loss": 0.0072, + "step": 502 + }, + { + "clip_ratio": 0.000292762170943206, + "epoch": 0.7705775148636133, + "grad_norm": 0.0338723324239254, + "kl": 0.0054700374603271484, + "learning_rate": 4.7240886777694495e-06, + "loss": 0.0071, + "step": 503 + }, + { + "clip_ratio": 0.00031638332251304746, + "epoch": 0.7724896178781632, + "grad_norm": 0.03360189124941826, + "kl": 0.00526118278503418, + "learning_rate": 4.721216911788371e-06, + "loss": 0.0071, + "step": 504 + }, + { + "clip_ratio": 0.0003445502737804418, + "epoch": 0.7744017208927131, + "grad_norm": 0.03321666270494461, + "kl": 0.005108356475830078, + "learning_rate": 4.71833116018384e-06, + "loss": 0.007, + "step": 505 + }, + { + "clip_ratio": 0.0, + "completion_length": 525.3041553497314, + "epoch": 0.776313823907263, + "grad_norm": 0.039082907140254974, + "kl": 0.0048329830169677734, + "learning_rate": 4.715431441125639e-06, + "loss": 0.0072, + "num_tokens": 296745449.0, + "reward": 0.056640627823071554, + "reward_std": 0.066464910923969, + "rewards/pure_accuracy_reward_math": 0.05664062636788003, + "step": 506 + }, + { + "clip_ratio": 0.0002697859709428485, + "epoch": 0.7782259269218129, + "grad_norm": 0.036139652132987976, + "kl": 0.0048868656158447266, + "learning_rate": 4.712517772871503e-06, + "loss": 0.0072, + "step": 507 + }, + { + "clip_ratio": 0.0002602223319172481, + "epoch": 0.7801380299363628, + "grad_norm": 0.03708622604608536, + "kl": 0.004920244216918945, + "learning_rate": 4.709590173766988e-06, + "loss": 0.0072, + "step": 508 + }, + { + "clip_ratio": 0.00030563702995323183, + "epoch": 0.7820501329509127, + "grad_norm": 0.03873802721500397, + "kl": 0.004922151565551758, + "learning_rate": 4.706648662245368e-06, + "loss": 0.0071, + "step": 509 + }, + { + "clip_ratio": 0.00027421732914945096, + "epoch": 0.7839622359654627, + "grad_norm": 0.0337008535861969, + "kl": 0.004686117172241211, + "learning_rate": 4.703693256827515e-06, + "loss": 0.0071, + "step": 510 + }, + { + "clip_ratio": 0.0, + "completion_length": 544.4595675468445, + "epoch": 0.7858743389800126, + "grad_norm": 0.032148003578186035, + "kl": 0.004284381866455078, + "learning_rate": 4.700723976121782e-06, + "loss": 0.0079, + "num_tokens": 300427724.0, + "reward": 0.05998884211294353, + "reward_std": 0.06822534691309556, + "rewards/pure_accuracy_reward_math": 0.059988840483129025, + "step": 511 + }, + { + "clip_ratio": 0.00023266997004611767, + "epoch": 0.7877864419945625, + "grad_norm": 0.03213036060333252, + "kl": 0.004235267639160156, + "learning_rate": 4.697740838823884e-06, + "loss": 0.0079, + "step": 512 + }, + { + "clip_ratio": 0.00023210655439243055, + "epoch": 0.7896985450091124, + "grad_norm": 0.03171762451529503, + "kl": 0.004268169403076172, + "learning_rate": 4.694743863716784e-06, + "loss": 0.0078, + "step": 513 + }, + { + "clip_ratio": 0.0002433597992990144, + "epoch": 0.7916106480236623, + "grad_norm": 0.030378276482224464, + "kl": 0.004282712936401367, + "learning_rate": 4.691733069670575e-06, + "loss": 0.0078, + "step": 514 + }, + { + "clip_ratio": 0.00024098603546462982, + "epoch": 0.7935227510382122, + "grad_norm": 0.030135801061987877, + "kl": 0.004299640655517578, + "learning_rate": 4.688708475642356e-06, + "loss": 0.0078, + "step": 515 + }, + { + "clip_ratio": 0.0, + "completion_length": 520.5064425468445, + "epoch": 0.795434854052762, + "grad_norm": 0.03758488968014717, + "kl": 0.004748344421386719, + "learning_rate": 4.685670100676117e-06, + "loss": 0.0056, + "num_tokens": 304030899.0, + "reward": 0.059151788300368935, + "reward_std": 0.06615966308163479, + "rewards/pure_accuracy_reward_math": 0.05915178684517741, + "step": 516 + }, + { + "clip_ratio": 0.00024922658519699326, + "epoch": 0.797346957067312, + "grad_norm": 0.03667794167995453, + "kl": 0.004762172698974609, + "learning_rate": 4.6826179639026185e-06, + "loss": 0.0056, + "step": 517 + }, + { + "clip_ratio": 0.00024439046995894387, + "epoch": 0.7992590600818619, + "grad_norm": 0.03566230833530426, + "kl": 0.004770755767822266, + "learning_rate": 4.679552084539271e-06, + "loss": 0.0055, + "step": 518 + }, + { + "clip_ratio": 0.00025443012202686077, + "epoch": 0.8011711630964118, + "grad_norm": 0.03555983304977417, + "kl": 0.004889011383056641, + "learning_rate": 4.676472481890012e-06, + "loss": 0.0055, + "step": 519 + }, + { + "clip_ratio": 0.0002555244412860702, + "epoch": 0.8030832661109617, + "grad_norm": 0.03477266803383827, + "kl": 0.004910707473754883, + "learning_rate": 4.673379175345187e-06, + "loss": 0.0054, + "step": 520 + }, + { + "clip_ratio": 0.0, + "completion_length": 530.2039861679077, + "epoch": 0.8049953691255116, + "grad_norm": 0.03352927044034004, + "kl": 0.004728078842163086, + "learning_rate": 4.670272184381426e-06, + "loss": 0.0064, + "num_tokens": 307666714.0, + "reward": 0.05106027063447982, + "reward_std": 0.061781705473549664, + "rewards/pure_accuracy_reward_math": 0.05106026888824999, + "step": 521 + }, + { + "clip_ratio": 0.00022480493561261028, + "epoch": 0.8069074721400615, + "grad_norm": 0.0328591950237751, + "kl": 0.004677772521972656, + "learning_rate": 4.667151528561522e-06, + "loss": 0.0064, + "step": 522 + }, + { + "clip_ratio": 0.0002208993353463029, + "epoch": 0.8088195751546114, + "grad_norm": 0.0323566235601902, + "kl": 0.004681825637817383, + "learning_rate": 4.664017227534308e-06, + "loss": 0.0064, + "step": 523 + }, + { + "clip_ratio": 0.0002261604544742113, + "epoch": 0.8107316781691614, + "grad_norm": 0.03178941085934639, + "kl": 0.004633665084838867, + "learning_rate": 4.6608693010345285e-06, + "loss": 0.0063, + "step": 524 + }, + { + "clip_ratio": 0.0002347389614101303, + "epoch": 0.8126437811837113, + "grad_norm": 0.03144075721502304, + "kl": 0.004633426666259766, + "learning_rate": 4.657707768882723e-06, + "loss": 0.0063, + "step": 525 + }, + { + "clip_ratio": 0.0, + "completion_length": 534.2078919410706, + "epoch": 0.8145558841982612, + "grad_norm": 36658.046875, + "kl": 696.0046517848969, + "learning_rate": 4.6545326509850965e-06, + "loss": 27.8583, + "num_tokens": 311314491.0, + "reward": 0.05747768114088103, + "reward_std": 0.06521624798187986, + "rewards/pure_accuracy_reward_math": 0.057477680093143135, + "step": 526 + }, + { + "clip_ratio": 0.0006453408203128674, + "epoch": 0.8164679872128111, + "grad_norm": 3234.42724609375, + "kl": 42.254658937454224, + "learning_rate": 4.651343967333394e-06, + "loss": 1.7021, + "step": 527 + }, + { + "clip_ratio": 0.0006781478184620937, + "epoch": 0.818380090227361, + "grad_norm": 430.01318359375, + "kl": 0.21270966529846191, + "learning_rate": 4.648141738004776e-06, + "loss": 0.256, + "step": 528 + }, + { + "clip_ratio": 0.0006916913723671314, + "epoch": 0.8202921932419109, + "grad_norm": 457.1385803222656, + "kl": 0.1541590690612793, + "learning_rate": 4.644925983161691e-06, + "loss": 0.3118, + "step": 529 + }, + { + "clip_ratio": 0.0007114471513887111, + "epoch": 0.8222042962564609, + "grad_norm": 61.02793884277344, + "kl": 1.6688117980957031, + "learning_rate": 4.641696723051753e-06, + "loss": 0.1081, + "step": 530 + }, + { + "clip_ratio": 0.0, + "completion_length": 544.7664904594421, + "epoch": 0.8241163992710108, + "grad_norm": 0.03665775805711746, + "kl": 0.0046710968017578125, + "learning_rate": 4.638453978007606e-06, + "loss": 0.0033, + "num_tokens": 315000186.0, + "reward": 0.05691964577999897, + "reward_std": 0.06766731111565605, + "rewards/pure_accuracy_reward_math": 0.056919643975561485, + "step": 531 + }, + { + "clip_ratio": 0.000247030089042255, + "epoch": 0.8260285022855607, + "grad_norm": 0.03543345257639885, + "kl": 0.004717826843261719, + "learning_rate": 4.635197768446799e-06, + "loss": 0.0033, + "step": 532 + }, + { + "clip_ratio": 0.00024415442914005325, + "epoch": 0.8279406053001105, + "grad_norm": 0.034531209617853165, + "kl": 0.004744768142700195, + "learning_rate": 4.631928114871667e-06, + "loss": 0.0032, + "step": 533 + }, + { + "clip_ratio": 0.0002580326566032909, + "epoch": 0.8298527083146604, + "grad_norm": 0.03323632851243019, + "kl": 0.004830360412597656, + "learning_rate": 4.628645037869183e-06, + "loss": 0.0032, + "step": 534 + }, + { + "clip_ratio": 0.00029695888167680096, + "epoch": 0.8317648113292103, + "grad_norm": 0.03470376506447792, + "kl": 0.0048847198486328125, + "learning_rate": 4.625348558110846e-06, + "loss": 0.0031, + "step": 535 + }, + { + "clip_ratio": 0.0, + "completion_length": 543.506443977356, + "epoch": 0.8336769143437602, + "grad_norm": 33.48581314086914, + "kl": 0.7041072845458984, + "learning_rate": 4.6220386963525425e-06, + "loss": 0.0349, + "num_tokens": 318683697.0, + "reward": 0.06333705675206147, + "reward_std": 0.0759915838134475, + "rewards/pure_accuracy_reward_math": 0.063337054773001, + "step": 536 + }, + { + "clip_ratio": 0.00030500417074108555, + "epoch": 0.8355890173583101, + "grad_norm": 5.391356468200684, + "kl": 0.12163639068603516, + "learning_rate": 4.6187154734344144e-06, + "loss": 0.0115, + "step": 537 + }, + { + "clip_ratio": 0.0003094891900445873, + "epoch": 0.8375011203728601, + "grad_norm": 0.24674992263317108, + "kl": 0.011260032653808594, + "learning_rate": 4.615378910280735e-06, + "loss": 0.007, + "step": 538 + }, + { + "clip_ratio": 0.0003443351265559613, + "epoch": 0.83941322338741, + "grad_norm": 0.040490083396434784, + "kl": 0.0068547725677490234, + "learning_rate": 4.61202902789977e-06, + "loss": 0.0068, + "step": 539 + }, + { + "clip_ratio": 0.0003249310258297555, + "epoch": 0.8413253264019599, + "grad_norm": 0.037383101880550385, + "kl": 0.006977081298828125, + "learning_rate": 4.608665847383646e-06, + "loss": 0.0068, + "step": 540 + }, + { + "clip_ratio": 0.0, + "completion_length": 528.8432207107544, + "epoch": 0.8432374294165098, + "grad_norm": 0.0408562608063221, + "kl": 0.005080223083496094, + "learning_rate": 4.6052893899082244e-06, + "loss": 0.0092, + "num_tokens": 322311955.0, + "reward": 0.07505580695578828, + "reward_std": 0.08672685426427051, + "rewards/pure_accuracy_reward_math": 0.07505580462748185, + "step": 541 + }, + { + "clip_ratio": 0.0003254984287082152, + "epoch": 0.8451495324310597, + "grad_norm": 0.03888032212853432, + "kl": 0.005081653594970703, + "learning_rate": 4.60189967673296e-06, + "loss": 0.0091, + "step": 542 + }, + { + "clip_ratio": 0.00032150591908930437, + "epoch": 0.8470616354456096, + "grad_norm": 0.03769301995635033, + "kl": 0.005054950714111328, + "learning_rate": 4.598496729200772e-06, + "loss": 0.0091, + "step": 543 + }, + { + "clip_ratio": 0.0003807161001532222, + "epoch": 0.8489737384601596, + "grad_norm": 0.03671475872397423, + "kl": 0.005011320114135742, + "learning_rate": 4.595080568737907e-06, + "loss": 0.009, + "step": 544 + }, + { + "clip_ratio": 0.00040073374452731514, + "epoch": 0.8508858414747095, + "grad_norm": 0.03656642884016037, + "kl": 0.004985332489013672, + "learning_rate": 4.591651216853808e-06, + "loss": 0.009, + "step": 545 + }, + { + "clip_ratio": 0.0, + "completion_length": 521.1850123405457, + "epoch": 0.8527979444892594, + "grad_norm": 0.04072614386677742, + "kl": 0.005250692367553711, + "learning_rate": 4.588208695140972e-06, + "loss": 0.008, + "num_tokens": 325915646.0, + "reward": 0.06891741379513405, + "reward_std": 0.07457646180409938, + "rewards/pure_accuracy_reward_math": 0.0689174119324889, + "step": 546 + }, + { + "clip_ratio": 0.0002774237623270892, + "epoch": 0.8547100475038093, + "grad_norm": 0.03891909867525101, + "kl": 0.005267620086669922, + "learning_rate": 4.5847530252748206e-06, + "loss": 0.008, + "step": 547 + }, + { + "clip_ratio": 0.0003099276901821213, + "epoch": 0.8566221505183592, + "grad_norm": 0.03776893764734268, + "kl": 0.005312681198120117, + "learning_rate": 4.581284229013561e-06, + "loss": 0.008, + "step": 548 + }, + { + "clip_ratio": 0.0003329096458060121, + "epoch": 0.8585342535329091, + "grad_norm": 0.03786613792181015, + "kl": 0.0053446292877197266, + "learning_rate": 4.57780232819805e-06, + "loss": 0.0079, + "step": 549 + }, + { + "clip_ratio": 0.0003465502328481307, + "epoch": 0.860446356547459, + "grad_norm": 0.03782954812049866, + "kl": 0.00535893440246582, + "learning_rate": 4.574307344751654e-06, + "loss": 0.0079, + "step": 550 + }, + { + "clip_ratio": 0.0, + "completion_length": 512.2042679786682, + "epoch": 0.8623584595620088, + "grad_norm": 0.04082540422677994, + "kl": 0.005150318145751953, + "learning_rate": 4.570799300680112e-06, + "loss": 0.0061, + "num_tokens": 329486142.0, + "reward": 0.06696428914438002, + "reward_std": 0.07865536911413074, + "rewards/pure_accuracy_reward_math": 0.06696428681607358, + "step": 551 + }, + { + "clip_ratio": 0.0002784457984148503, + "epoch": 0.8642705625765588, + "grad_norm": 0.039590511471033096, + "kl": 0.005137205123901367, + "learning_rate": 4.5672782180714005e-06, + "loss": 0.0061, + "step": 552 + }, + { + "clip_ratio": 0.0003210699376268167, + "epoch": 0.8661826655911087, + "grad_norm": 0.03983275964856148, + "kl": 0.005161285400390625, + "learning_rate": 4.56374411909559e-06, + "loss": 0.0061, + "step": 553 + }, + { + "clip_ratio": 0.00032905748116718314, + "epoch": 0.8680947686056586, + "grad_norm": 0.03924131765961647, + "kl": 0.0051097869873046875, + "learning_rate": 4.560197026004706e-06, + "loss": 0.006, + "step": 554 + }, + { + "clip_ratio": 0.00036174511694753164, + "epoch": 0.8700068716202085, + "grad_norm": 0.03864859789609909, + "kl": 0.0051233768463134766, + "learning_rate": 4.556636961132591e-06, + "loss": 0.0059, + "step": 555 + }, + { + "clip_ratio": 0.0, + "completion_length": 524.8490724563599, + "epoch": 0.8719189746347584, + "grad_norm": 0.03831901028752327, + "kl": 0.005173921585083008, + "learning_rate": 4.553063946894765e-06, + "loss": 0.0089, + "num_tokens": 333101169.0, + "reward": 0.05970982427243143, + "reward_std": 0.06925509037682787, + "rewards/pure_accuracy_reward_math": 0.05970982293365523, + "step": 556 + }, + { + "clip_ratio": 0.00024058804717697058, + "epoch": 0.8738310776493083, + "grad_norm": 0.03815346583724022, + "kl": 0.005152463912963867, + "learning_rate": 4.549478005788276e-06, + "loss": 0.0088, + "step": 557 + }, + { + "clip_ratio": 0.0002689754076072859, + "epoch": 0.8757431806638583, + "grad_norm": 0.03663227707147598, + "kl": 0.00511932373046875, + "learning_rate": 4.5458791603915695e-06, + "loss": 0.0088, + "step": 558 + }, + { + "clip_ratio": 0.0002769273295371022, + "epoch": 0.8776552836784082, + "grad_norm": 0.03534897044301033, + "kl": 0.005173921585083008, + "learning_rate": 4.5422674333643415e-06, + "loss": 0.0087, + "step": 559 + }, + { + "clip_ratio": 0.0003186316080245888, + "epoch": 0.8795673866929581, + "grad_norm": 0.03454131633043289, + "kl": 0.005182981491088867, + "learning_rate": 4.538642847447393e-06, + "loss": 0.0087, + "step": 560 + }, + { + "clip_ratio": 0.0, + "completion_length": 499.49025869369507, + "epoch": 0.881479489707508, + "grad_norm": 0.03870520368218422, + "kl": 0.005303621292114258, + "learning_rate": 4.53500542546249e-06, + "loss": 0.0063, + "num_tokens": 336621146.0, + "reward": 0.06724330663564615, + "reward_std": 0.07539348350837827, + "rewards/pure_accuracy_reward_math": 0.0672433050640393, + "step": 561 + }, + { + "clip_ratio": 0.0002930208739826412, + "epoch": 0.8833915927220579, + "grad_norm": 0.03670111671090126, + "kl": 0.005410432815551758, + "learning_rate": 4.5313551903122195e-06, + "loss": 0.0063, + "step": 562 + }, + { + "clip_ratio": 0.00033625421181682214, + "epoch": 0.8853036957366078, + "grad_norm": 0.03873737156391144, + "kl": 0.0054399967193603516, + "learning_rate": 4.5276921649798475e-06, + "loss": 0.0063, + "step": 563 + }, + { + "clip_ratio": 0.0003349392310383337, + "epoch": 0.8872157987511577, + "grad_norm": 0.038494061678647995, + "kl": 0.0053806304931640625, + "learning_rate": 4.524016372529168e-06, + "loss": 0.0062, + "step": 564 + }, + { + "clip_ratio": 0.00031196477385719845, + "epoch": 0.8891279017657077, + "grad_norm": 0.03559175133705139, + "kl": 0.005260467529296875, + "learning_rate": 4.520327836104363e-06, + "loss": 0.0061, + "step": 565 + }, + { + "clip_ratio": 0.0, + "completion_length": 521.2452793121338, + "epoch": 0.8910400047802576, + "grad_norm": 0.033526018261909485, + "kl": 0.0050280094146728516, + "learning_rate": 4.516626578929857e-06, + "loss": 0.0083, + "num_tokens": 340217537.0, + "reward": 0.05970982470898889, + "reward_std": 0.06920882686972618, + "rewards/pure_accuracy_reward_math": 0.059709822555305436, + "step": 566 + }, + { + "clip_ratio": 0.0002854210310374583, + "epoch": 0.8929521077948075, + "grad_norm": 0.03320698440074921, + "kl": 0.00494694709777832, + "learning_rate": 4.512912624310166e-06, + "loss": 0.0083, + "step": 567 + }, + { + "clip_ratio": 0.00028784406134718665, + "epoch": 0.8948642108093574, + "grad_norm": 0.0334990993142128, + "kl": 0.004927158355712891, + "learning_rate": 4.509185995629758e-06, + "loss": 0.0083, + "step": 568 + }, + { + "clip_ratio": 0.00028731861192454744, + "epoch": 0.8967763138239072, + "grad_norm": 0.032721105962991714, + "kl": 0.004916667938232422, + "learning_rate": 4.505446716352898e-06, + "loss": 0.0083, + "step": 569 + }, + { + "clip_ratio": 0.0003211342911981774, + "epoch": 0.8986884168384571, + "grad_norm": 0.031691305339336395, + "kl": 0.0050427913665771484, + "learning_rate": 4.501694810023506e-06, + "loss": 0.0082, + "step": 570 + }, + { + "clip_ratio": 0.0, + "completion_length": 513.3175444602966, + "epoch": 0.900600519853007, + "grad_norm": 0.039067283272743225, + "kl": 0.0051767826080322266, + "learning_rate": 4.497930300265005e-06, + "loss": 0.0062, + "num_tokens": 343792675.0, + "reward": 0.07254464668221772, + "reward_std": 0.07260330504504964, + "rewards/pure_accuracy_reward_math": 0.07254464394645765, + "step": 571 + }, + { + "clip_ratio": 0.000284439854624452, + "epoch": 0.902512622867557, + "grad_norm": 0.03746037185192108, + "kl": 0.0051670074462890625, + "learning_rate": 4.494153210780177e-06, + "loss": 0.0062, + "step": 572 + }, + { + "clip_ratio": 0.0002894837679718876, + "epoch": 0.9044247258821069, + "grad_norm": 0.0363248772919178, + "kl": 0.0051119327545166016, + "learning_rate": 4.490363565351007e-06, + "loss": 0.0061, + "step": 573 + }, + { + "clip_ratio": 0.00029392389137683494, + "epoch": 0.9063368288966568, + "grad_norm": 0.03513769805431366, + "kl": 0.005059242248535156, + "learning_rate": 4.486561387838539e-06, + "loss": 0.0061, + "step": 574 + }, + { + "clip_ratio": 0.0003296555175325011, + "epoch": 0.9082489319112067, + "grad_norm": 0.03513012453913689, + "kl": 0.005059242248535156, + "learning_rate": 4.482746702182725e-06, + "loss": 0.006, + "step": 575 + }, + { + "clip_ratio": 0.0, + "completion_length": 520.8926033973694, + "epoch": 0.9101610349257566, + "grad_norm": 0.049145400524139404, + "kl": 0.011604547500610352, + "learning_rate": 4.478919532402271e-06, + "loss": 0.0046, + "num_tokens": 347395370.0, + "reward": 0.07170759254950099, + "reward_std": 0.0817445982247591, + "rewards/pure_accuracy_reward_math": 0.07170759091968648, + "step": 576 + }, + { + "clip_ratio": 0.00030760892423131736, + "epoch": 0.9120731379403065, + "grad_norm": 0.04954507574439049, + "kl": 0.011447906494140625, + "learning_rate": 4.4750799025944866e-06, + "loss": 0.0045, + "step": 577 + }, + { + "clip_ratio": 0.0003202956161487691, + "epoch": 0.9139852409548564, + "grad_norm": 0.04883984476327896, + "kl": 0.010998249053955078, + "learning_rate": 4.471227836935139e-06, + "loss": 0.0045, + "step": 578 + }, + { + "clip_ratio": 0.0003312723312660637, + "epoch": 0.9158973439694064, + "grad_norm": 0.049066606909036636, + "kl": 0.010381698608398438, + "learning_rate": 4.467363359678291e-06, + "loss": 0.0044, + "step": 579 + }, + { + "clip_ratio": 0.00041312941800697445, + "epoch": 0.9178094469839563, + "grad_norm": 0.053418997675180435, + "kl": 0.009602546691894531, + "learning_rate": 4.463486495156157e-06, + "loss": 0.0043, + "step": 580 + }, + { + "clip_ratio": 0.0, + "completion_length": 539.5678267478943, + "epoch": 0.9197215499985062, + "grad_norm": 0.03747523948550224, + "kl": 0.004802227020263672, + "learning_rate": 4.459597267778945e-06, + "loss": 0.0041, + "num_tokens": 351065793.0, + "reward": 0.062220984895247966, + "reward_std": 0.07298868335783482, + "rewards/pure_accuracy_reward_math": 0.0622209832072258, + "step": 581 + }, + { + "clip_ratio": 0.0002890200073579763, + "epoch": 0.9216336530130561, + "grad_norm": 0.03557584062218666, + "kl": 0.004851579666137695, + "learning_rate": 4.455695702034705e-06, + "loss": 0.0041, + "step": 582 + }, + { + "clip_ratio": 0.00031045296407228307, + "epoch": 0.923545756027606, + "grad_norm": 0.034734807908535004, + "kl": 0.004895925521850586, + "learning_rate": 4.451781822489173e-06, + "loss": 0.0041, + "step": 583 + }, + { + "clip_ratio": 0.00032734786560695284, + "epoch": 0.9254578590421559, + "grad_norm": 0.03634972497820854, + "kl": 0.004976511001586914, + "learning_rate": 4.447855653785617e-06, + "loss": 0.004, + "step": 584 + }, + { + "clip_ratio": 0.00036698238614008005, + "epoch": 0.9273699620567059, + "grad_norm": 0.036671172827482224, + "kl": 0.004954338073730469, + "learning_rate": 4.4439172206446845e-06, + "loss": 0.0039, + "step": 585 + }, + { + "clip_ratio": 0.0, + "completion_length": 538.6261405944824, + "epoch": 0.9292820650712557, + "grad_norm": 0.03805253654718399, + "kl": 0.005060434341430664, + "learning_rate": 4.439966547864243e-06, + "loss": 0.0061, + "num_tokens": 354732057.0, + "reward": 0.06194196725846268, + "reward_std": 0.07766569184605032, + "rewards/pure_accuracy_reward_math": 0.06194196580327116, + "step": 586 + }, + { + "clip_ratio": 0.0002944122598478316, + "epoch": 0.9311941680858056, + "grad_norm": 0.03603314608335495, + "kl": 0.005051136016845703, + "learning_rate": 4.436003660319224e-06, + "loss": 0.0061, + "step": 587 + }, + { + "clip_ratio": 0.0003042620955966413, + "epoch": 0.9331062711003555, + "grad_norm": 0.035505130887031555, + "kl": 0.005032539367675781, + "learning_rate": 4.432028582961472e-06, + "loss": 0.006, + "step": 588 + }, + { + "clip_ratio": 0.00032173160303727855, + "epoch": 0.9350183741149054, + "grad_norm": 0.03633759915828705, + "kl": 0.00509190559387207, + "learning_rate": 4.428041340819579e-06, + "loss": 0.006, + "step": 589 + }, + { + "clip_ratio": 0.00038377046530513326, + "epoch": 0.9369304771294553, + "grad_norm": 0.03761395812034607, + "kl": 0.005148649215698242, + "learning_rate": 4.424041958998732e-06, + "loss": 0.0059, + "step": 590 + }, + { + "clip_ratio": 0.0, + "completion_length": 540.8948354721069, + "epoch": 0.9388425801440052, + "grad_norm": 0.04139011353254318, + "kl": 0.005031585693359375, + "learning_rate": 4.420030462680554e-06, + "loss": 0.007, + "num_tokens": 358409840.0, + "reward": 0.0714285749127157, + "reward_std": 0.07565246830927208, + "rewards/pure_accuracy_reward_math": 0.07142857275903225, + "step": 591 + }, + { + "clip_ratio": 0.0002982392526291733, + "epoch": 0.9407546831585551, + "grad_norm": 0.03948375955224037, + "kl": 0.005082845687866211, + "learning_rate": 4.416006877122948e-06, + "loss": 0.007, + "step": 592 + }, + { + "clip_ratio": 0.00033647330587882607, + "epoch": 0.9426667861731051, + "grad_norm": 0.041717879474163055, + "kl": 0.005113363265991211, + "learning_rate": 4.411971227659933e-06, + "loss": 0.0069, + "step": 593 + }, + { + "clip_ratio": 0.00036752876485479646, + "epoch": 0.944578889187655, + "grad_norm": 0.04109462723135948, + "kl": 0.005068063735961914, + "learning_rate": 4.407923539701486e-06, + "loss": 0.0069, + "step": 594 + }, + { + "clip_ratio": 0.0003528254699176614, + "epoch": 0.9464909922022049, + "grad_norm": 0.03620041161775589, + "kl": 0.0049245357513427734, + "learning_rate": 4.403863838733386e-06, + "loss": 0.0068, + "step": 595 + }, + { + "clip_ratio": 0.0, + "completion_length": 545.2444491386414, + "epoch": 0.9484030952167548, + "grad_norm": 42.05046463012695, + "kl": 0.3311493396759033, + "learning_rate": 4.399792150317048e-06, + "loss": 0.0203, + "num_tokens": 362096328.0, + "reward": 0.06026786071015522, + "reward_std": 0.07324766798410565, + "rewards/pure_accuracy_reward_math": 0.06026785832364112, + "step": 596 + }, + { + "clip_ratio": 0.0003009684866128737, + "epoch": 0.9503151982313047, + "grad_norm": 0.575372040271759, + "kl": 0.01551508903503418, + "learning_rate": 4.395708500089366e-06, + "loss": 0.0076, + "step": 597 + }, + { + "clip_ratio": 0.0003299758830053179, + "epoch": 0.9522273012458546, + "grad_norm": 0.052088066935539246, + "kl": 0.01082468032836914, + "learning_rate": 4.391612913762549e-06, + "loss": 0.0074, + "step": 598 + }, + { + "clip_ratio": 0.00032988658261956516, + "epoch": 0.9541394042604046, + "grad_norm": 0.046673182398080826, + "kl": 0.011472225189208984, + "learning_rate": 4.38750541712396e-06, + "loss": 0.0074, + "step": 599 + }, + { + "clip_ratio": 0.00031585949000145774, + "epoch": 0.9560515072749545, + "grad_norm": 0.04350757598876953, + "kl": 0.011662006378173828, + "learning_rate": 4.383386036035956e-06, + "loss": 0.0074, + "step": 600 + }, + { + "clip_ratio": 0.0, + "completion_length": 539.0309958457947, + "epoch": 0.9579636102895044, + "grad_norm": 0.04193362593650818, + "kl": 0.005011081695556641, + "learning_rate": 4.379254796435719e-06, + "loss": 0.0085, + "num_tokens": 365761119.0, + "reward": 0.06696428923169151, + "reward_std": 0.08311965479515493, + "rewards/pure_accuracy_reward_math": 0.06696428667055443, + "step": 601 + }, + { + "clip_ratio": 0.0003076634293392999, + "epoch": 0.9598757133040543, + "grad_norm": 0.04204736277461052, + "kl": 0.005095720291137695, + "learning_rate": 4.375111724335102e-06, + "loss": 0.0085, + "step": 602 + }, + { + "clip_ratio": 0.0002991189727481469, + "epoch": 0.9617878163186042, + "grad_norm": 0.041649866849184036, + "kl": 0.00509333610534668, + "learning_rate": 4.370956845820455e-06, + "loss": 0.0085, + "step": 603 + }, + { + "clip_ratio": 0.0003053998929090085, + "epoch": 0.963699919333154, + "grad_norm": 0.03969484567642212, + "kl": 0.005100727081298828, + "learning_rate": 4.366790187052468e-06, + "loss": 0.0084, + "step": 604 + }, + { + "clip_ratio": 0.0003063883330014505, + "epoch": 0.9656120223477039, + "grad_norm": 0.03833401948213577, + "kl": 0.005064487457275391, + "learning_rate": 4.362611774266005e-06, + "loss": 0.0083, + "step": 605 + }, + { + "clip_ratio": 0.0, + "completion_length": 534.4046006202698, + "epoch": 0.9675241253622539, + "grad_norm": 0.038279399275779724, + "kl": 0.005177021026611328, + "learning_rate": 4.358421633769934e-06, + "loss": 0.0061, + "num_tokens": 369412689.0, + "reward": 0.07087053885334171, + "reward_std": 0.08299326128326356, + "rewards/pure_accuracy_reward_math": 0.0708705369324889, + "step": 606 + }, + { + "clip_ratio": 0.00030927538728064974, + "epoch": 0.9694362283768038, + "grad_norm": 0.037665851414203644, + "kl": 0.005164146423339844, + "learning_rate": 4.35421979194697e-06, + "loss": 0.0061, + "step": 607 + }, + { + "clip_ratio": 0.0003293242310178357, + "epoch": 0.9713483313913537, + "grad_norm": 0.036888375878334045, + "kl": 0.005212306976318359, + "learning_rate": 4.3500062752535e-06, + "loss": 0.006, + "step": 608 + }, + { + "clip_ratio": 0.0003369250752029984, + "epoch": 0.9732604344059036, + "grad_norm": 0.03607965633273125, + "kl": 0.005278587341308594, + "learning_rate": 4.3457811102194225e-06, + "loss": 0.006, + "step": 609 + }, + { + "clip_ratio": 0.00034393194414406025, + "epoch": 0.9751725374204535, + "grad_norm": 0.036863330751657486, + "kl": 0.005379676818847656, + "learning_rate": 4.341544323447978e-06, + "loss": 0.0059, + "step": 610 + }, + { + "clip_ratio": 0.0, + "completion_length": 527.9905385971069, + "epoch": 0.9770846404350034, + "grad_norm": 0.03825363516807556, + "kl": 0.005227804183959961, + "learning_rate": 4.33729594161558e-06, + "loss": 0.0103, + "num_tokens": 373041503.0, + "reward": 0.07254464607103728, + "reward_std": 0.07848271250259131, + "rewards/pure_accuracy_reward_math": 0.07254464444122277, + "step": 611 + }, + { + "clip_ratio": 0.0002938344064205012, + "epoch": 0.9789967434495533, + "grad_norm": 0.037028077989816666, + "kl": 0.005240917205810547, + "learning_rate": 4.333035991471653e-06, + "loss": 0.0102, + "step": 612 + }, + { + "clip_ratio": 0.00029232190240691125, + "epoch": 0.9809088464641033, + "grad_norm": 0.03623189404606819, + "kl": 0.005187034606933594, + "learning_rate": 4.328764499838456e-06, + "loss": 0.0102, + "step": 613 + }, + { + "clip_ratio": 0.000318144969014611, + "epoch": 0.9828209494786532, + "grad_norm": 0.036878351122140884, + "kl": 0.005211830139160156, + "learning_rate": 4.324481493610919e-06, + "loss": 0.0101, + "step": 614 + }, + { + "clip_ratio": 0.0003371401809317831, + "epoch": 0.9847330524932031, + "grad_norm": 0.036278340965509415, + "kl": 0.0051462650299072266, + "learning_rate": 4.320186999756473e-06, + "loss": 0.0101, + "step": 615 + }, + { + "clip_ratio": 0.0, + "completion_length": 513.4927659034729, + "epoch": 0.986645155507753, + "grad_norm": 0.037584077566862106, + "kl": 0.005333662033081055, + "learning_rate": 4.315881045314878e-06, + "loss": 0.007, + "num_tokens": 376615645.0, + "reward": 0.07087053899886087, + "reward_std": 0.07342032523592934, + "rewards/pure_accuracy_reward_math": 0.0708705370198004, + "step": 616 + }, + { + "clip_ratio": 0.0002886684330292155, + "epoch": 0.9885572585223029, + "grad_norm": 0.035872798413038254, + "kl": 0.005288362503051758, + "learning_rate": 4.311563657398056e-06, + "loss": 0.007, + "step": 617 + }, + { + "clip_ratio": 0.0002961605097766551, + "epoch": 0.9904693615368528, + "grad_norm": 0.034989748150110245, + "kl": 0.0052263736724853516, + "learning_rate": 4.307234863189917e-06, + "loss": 0.007, + "step": 618 + }, + { + "clip_ratio": 0.0003532402791392997, + "epoch": 0.9923814645514027, + "grad_norm": 0.0338488332927227, + "kl": 0.005165576934814453, + "learning_rate": 4.302894689946189e-06, + "loss": 0.0069, + "step": 619 + }, + { + "clip_ratio": 0.00035387994120128496, + "epoch": 0.9942935675659527, + "grad_norm": 0.03370453417301178, + "kl": 0.005126953125, + "learning_rate": 4.298543164994249e-06, + "loss": 0.0069, + "step": 620 + }, + { + "clip_ratio": 0.0, + "completion_length": 526.433337688446, + "epoch": 1.00191210301455, + "grad_norm": 0.0355641208589077, + "kl": 0.004958152770996094, + "learning_rate": 4.294180315732946e-06, + "loss": 0.0063, + "num_tokens": 380233970.0, + "reward": 0.05412946696742438, + "reward_std": 0.06637858302565292, + "rewards/pure_accuracy_reward_math": 0.0541294657450635, + "step": 621 + }, + { + "clip_ratio": 0.0002793830541349962, + "epoch": 1.0038242060290998, + "grad_norm": 0.034697938710451126, + "kl": 0.004967689514160156, + "learning_rate": 4.289806169632434e-06, + "loss": 0.0063, + "step": 622 + }, + { + "clip_ratio": 0.00026950584020823953, + "epoch": 1.0057363090436497, + "grad_norm": 0.034267228096723557, + "kl": 0.005029439926147461, + "learning_rate": 4.285420754233992e-06, + "loss": 0.0062, + "step": 623 + }, + { + "clip_ratio": 0.0002694177366606709, + "epoch": 1.0076484120581997, + "grad_norm": 0.03245500102639198, + "kl": 0.005047798156738281, + "learning_rate": 4.2810240971498594e-06, + "loss": 0.0062, + "step": 624 + }, + { + "clip_ratio": 0.0002762260926942872, + "epoch": 1.0095605150727496, + "grad_norm": 0.03143523633480072, + "kl": 0.005035400390625, + "learning_rate": 4.276616226063055e-06, + "loss": 0.0061, + "step": 625 + }, + { + "clip_ratio": 0.0, + "completion_length": 528.094889163971, + "epoch": 1.0114726180872995, + "grad_norm": 0.03780335932970047, + "kl": 0.005240440368652344, + "learning_rate": 4.272197168727204e-06, + "loss": 0.0082, + "num_tokens": 383858818.0, + "reward": 0.06891741388244554, + "reward_std": 0.07891435397323221, + "rewards/pure_accuracy_reward_math": 0.06891741207800806, + "step": 626 + }, + { + "clip_ratio": 0.0002971897219481434, + "epoch": 1.0133847211018494, + "grad_norm": 0.03676832467317581, + "kl": 0.005240440368652344, + "learning_rate": 4.267766952966369e-06, + "loss": 0.0082, + "step": 627 + }, + { + "clip_ratio": 0.00032256075144232454, + "epoch": 1.0152968241163993, + "grad_norm": 0.03722486272454262, + "kl": 0.005322933197021484, + "learning_rate": 4.263325606674865e-06, + "loss": 0.0082, + "step": 628 + }, + { + "clip_ratio": 0.00031109488622860226, + "epoch": 1.0172089271309492, + "grad_norm": 0.036808740347623825, + "kl": 0.0054111480712890625, + "learning_rate": 4.258873157817093e-06, + "loss": 0.0081, + "step": 629 + }, + { + "clip_ratio": 0.00032292150183366175, + "epoch": 1.0191210301454992, + "grad_norm": 0.03518703579902649, + "kl": 0.005442619323730469, + "learning_rate": 4.254409634427356e-06, + "loss": 0.008, + "step": 630 + }, + { + "clip_ratio": 0.0, + "completion_length": 515.6958961486816, + "epoch": 1.021033133160049, + "grad_norm": 0.03399791195988655, + "kl": 0.005387306213378906, + "learning_rate": 4.249935064609692e-06, + "loss": 0.0031, + "num_tokens": 387438928.0, + "reward": 0.06250000285217538, + "reward_std": 0.06757478544022888, + "rewards/pure_accuracy_reward_math": 0.06250000145519152, + "step": 631 + }, + { + "clip_ratio": 0.0002553542814212051, + "epoch": 1.022945236174599, + "grad_norm": 0.03381386399269104, + "kl": 0.005375385284423828, + "learning_rate": 4.245449476537685e-06, + "loss": 0.0031, + "step": 632 + }, + { + "clip_ratio": 0.00023506408626872144, + "epoch": 1.024857339189149, + "grad_norm": 0.03337083011865616, + "kl": 0.00537109375, + "learning_rate": 4.2409528984543e-06, + "loss": 0.003, + "step": 633 + }, + { + "clip_ratio": 0.0002632986112871549, + "epoch": 1.0267694422036988, + "grad_norm": 0.03213095664978027, + "kl": 0.005321979522705078, + "learning_rate": 4.236445358671696e-06, + "loss": 0.003, + "step": 634 + }, + { + "clip_ratio": 0.00025607587781451, + "epoch": 1.0286815452182487, + "grad_norm": 0.03154142573475838, + "kl": 0.005255699157714844, + "learning_rate": 4.23192688557105e-06, + "loss": 0.0029, + "step": 635 + }, + { + "clip_ratio": 0.0, + "completion_length": 524.272346496582, + "epoch": 1.0305936482327986, + "grad_norm": 0.039318569004535675, + "kl": 0.005155801773071289, + "learning_rate": 4.2273975076023835e-06, + "loss": 0.0075, + "num_tokens": 391053556.0, + "reward": 0.06473214598372579, + "reward_std": 0.07401842583203688, + "rewards/pure_accuracy_reward_math": 0.06473214412108064, + "step": 636 + }, + { + "clip_ratio": 0.0003024499371804268, + "epoch": 1.0325057512473486, + "grad_norm": 0.03726111724972725, + "kl": 0.0050776004791259766, + "learning_rate": 4.222857253284376e-06, + "loss": 0.0075, + "step": 637 + }, + { + "clip_ratio": 0.0003151753968495541, + "epoch": 1.0344178542618985, + "grad_norm": 0.03595959022641182, + "kl": 0.005060434341430664, + "learning_rate": 4.218306151204188e-06, + "loss": 0.0074, + "step": 638 + }, + { + "clip_ratio": 0.0003387899199083222, + "epoch": 1.0363299572764482, + "grad_norm": 0.03628028184175491, + "kl": 0.005034923553466797, + "learning_rate": 4.213744230017283e-06, + "loss": 0.0074, + "step": 639 + }, + { + "clip_ratio": 0.00037899152403042535, + "epoch": 1.038242060290998, + "grad_norm": 0.03670131787657738, + "kl": 0.005095720291137695, + "learning_rate": 4.209171518447248e-06, + "loss": 0.0073, + "step": 640 + }, + { + "clip_ratio": 0.0, + "completion_length": 536.5907049179077, + "epoch": 1.040154163305548, + "grad_norm": 0.03938442841172218, + "kl": 0.0051763057708740234, + "learning_rate": 4.204588045285607e-06, + "loss": 0.0022, + "num_tokens": 394708581.0, + "reward": 0.06333705710130744, + "reward_std": 0.07792467664694414, + "rewards/pure_accuracy_reward_math": 0.06333705500583164, + "step": 641 + }, + { + "clip_ratio": 0.0002767174905216052, + "epoch": 1.042066266320098, + "grad_norm": 0.037835828959941864, + "kl": 0.005267143249511719, + "learning_rate": 4.1999938393916424e-06, + "loss": 0.0022, + "step": 642 + }, + { + "clip_ratio": 0.0003277845591469486, + "epoch": 1.0439783693346478, + "grad_norm": 0.03832162916660309, + "kl": 0.005464792251586914, + "learning_rate": 4.195388929692217e-06, + "loss": 0.0022, + "step": 643 + }, + { + "clip_ratio": 0.00035426640954483446, + "epoch": 1.0458904723491977, + "grad_norm": 0.03823033347725868, + "kl": 0.005482673645019531, + "learning_rate": 4.190773345181587e-06, + "loss": 0.0021, + "step": 644 + }, + { + "clip_ratio": 0.0003763593267649412, + "epoch": 1.0478025753637477, + "grad_norm": 0.036984797567129135, + "kl": 0.005467653274536133, + "learning_rate": 4.186147114921221e-06, + "loss": 0.002, + "step": 645 + }, + { + "clip_ratio": 0.0, + "completion_length": 528.9266424179077, + "epoch": 1.0497146783782976, + "grad_norm": 0.0355878509581089, + "kl": 0.005333423614501953, + "learning_rate": 4.18151026803962e-06, + "loss": 0.0056, + "num_tokens": 398334618.0, + "reward": 0.06305803850409575, + "reward_std": 0.06942774693015963, + "rewards/pure_accuracy_reward_math": 0.06305803699069656, + "step": 646 + }, + { + "clip_ratio": 0.00024814905674475085, + "epoch": 1.0516267813928475, + "grad_norm": 0.034741513431072235, + "kl": 0.005269289016723633, + "learning_rate": 4.176862833732127e-06, + "loss": 0.0056, + "step": 647 + }, + { + "clip_ratio": 0.00027503305113896204, + "epoch": 1.0535388844073974, + "grad_norm": 0.03375249356031418, + "kl": 0.005173683166503906, + "learning_rate": 4.1722048412607495e-06, + "loss": 0.0055, + "step": 648 + }, + { + "clip_ratio": 0.0002895867207826086, + "epoch": 1.0554509874219473, + "grad_norm": 0.0341072678565979, + "kl": 0.005132198333740234, + "learning_rate": 4.167536319953976e-06, + "loss": 0.0055, + "step": 649 + }, + { + "clip_ratio": 0.0003005371929134526, + "epoch": 1.0573630904364972, + "grad_norm": 0.033096957951784134, + "kl": 0.005170345306396484, + "learning_rate": 4.162857299206584e-06, + "loss": 0.0054, + "step": 650 + }, + { + "clip_ratio": 0.0, + "completion_length": 538.7528138160706, + "epoch": 1.0592751934510471, + "grad_norm": 0.03696604445576668, + "kl": 0.0052814483642578125, + "learning_rate": 4.158167808479461e-06, + "loss": 0.0097, + "num_tokens": 401997276.0, + "reward": 0.05943080657743849, + "reward_std": 0.07388583471765742, + "rewards/pure_accuracy_reward_math": 0.05943080494762398, + "step": 651 + }, + { + "clip_ratio": 0.00029416859939601636, + "epoch": 1.061187296465597, + "grad_norm": 0.03565770015120506, + "kl": 0.005290031433105469, + "learning_rate": 4.153467877299419e-06, + "loss": 0.0097, + "step": 652 + }, + { + "clip_ratio": 0.00029473524284640007, + "epoch": 1.063099399480147, + "grad_norm": 0.03546367585659027, + "kl": 0.005368709564208984, + "learning_rate": 4.148757535259004e-06, + "loss": 0.0096, + "step": 653 + }, + { + "clip_ratio": 0.00032781071104182047, + "epoch": 1.065011502494697, + "grad_norm": 0.03601039946079254, + "kl": 0.005382061004638672, + "learning_rate": 4.144036812016317e-06, + "loss": 0.0096, + "step": 654 + }, + { + "clip_ratio": 0.0003433626044397897, + "epoch": 1.0669236055092468, + "grad_norm": 0.035073794424533844, + "kl": 0.0053446292877197266, + "learning_rate": 4.139305737294818e-06, + "loss": 0.0095, + "step": 655 + }, + { + "clip_ratio": 0.0, + "completion_length": 520.1163725852966, + "epoch": 1.0688357085237967, + "grad_norm": 0.03852629289031029, + "kl": 0.005383491516113281, + "learning_rate": 4.134564340883148e-06, + "loss": 0.0083, + "num_tokens": 405593985.0, + "reward": 0.06445312793948688, + "reward_std": 0.07135464163729921, + "rewards/pure_accuracy_reward_math": 0.06445312654250301, + "step": 656 + }, + { + "clip_ratio": 0.0002591365355897324, + "epoch": 1.0707478115383466, + "grad_norm": 0.03745557367801666, + "kl": 0.0053327083587646484, + "learning_rate": 4.129812652634936e-06, + "loss": 0.0083, + "step": 657 + }, + { + "clip_ratio": 0.0003071958567772981, + "epoch": 1.0726599145528966, + "grad_norm": 0.037043727934360504, + "kl": 0.00532078742980957, + "learning_rate": 4.1250507024686115e-06, + "loss": 0.0083, + "step": 658 + }, + { + "clip_ratio": 0.00029935286954696494, + "epoch": 1.0745720175674465, + "grad_norm": 0.03582773730158806, + "kl": 0.005355358123779297, + "learning_rate": 4.120278520367217e-06, + "loss": 0.0082, + "step": 659 + }, + { + "clip_ratio": 0.0003111159166451216, + "epoch": 1.0764841205819964, + "grad_norm": 0.035313159227371216, + "kl": 0.005402326583862305, + "learning_rate": 4.115496136378219e-06, + "loss": 0.0081, + "step": 660 + }, + { + "clip_ratio": 0.0, + "completion_length": 509.2994108200073, + "epoch": 1.0783962235965463, + "grad_norm": 0.041104141622781754, + "kl": 0.005465507507324219, + "learning_rate": 4.110703580613321e-06, + "loss": 0.0074, + "num_tokens": 409156330.0, + "reward": 0.0641741098370403, + "reward_std": 0.08329231233801693, + "rewards/pure_accuracy_reward_math": 0.06417410826543346, + "step": 661 + }, + { + "clip_ratio": 0.0003218170786567498, + "epoch": 1.0803083266110962, + "grad_norm": 0.03970121592283249, + "kl": 0.005608558654785156, + "learning_rate": 4.105900883248269e-06, + "loss": 0.0074, + "step": 662 + }, + { + "clip_ratio": 0.00032362689415776913, + "epoch": 1.0822204296256461, + "grad_norm": 0.039676353335380554, + "kl": 0.005734920501708984, + "learning_rate": 4.101088074522667e-06, + "loss": 0.0074, + "step": 663 + }, + { + "clip_ratio": 0.000323468098201829, + "epoch": 1.084132532640196, + "grad_norm": 0.03883183002471924, + "kl": 0.005713939666748047, + "learning_rate": 4.096265184739781e-06, + "loss": 0.0073, + "step": 664 + }, + { + "clip_ratio": 0.00033196881122421473, + "epoch": 1.086044635654746, + "grad_norm": 0.037281692028045654, + "kl": 0.0056934356689453125, + "learning_rate": 4.091432244266354e-06, + "loss": 0.0072, + "step": 665 + }, + { + "clip_ratio": 0.0, + "completion_length": 522.48774766922, + "epoch": 1.0879567386692959, + "grad_norm": 0.037982553243637085, + "kl": 0.005854606628417969, + "learning_rate": 4.08658928353241e-06, + "loss": 0.0086, + "num_tokens": 412758914.0, + "reward": 0.06835937799769454, + "reward_std": 0.07526708859950304, + "rewards/pure_accuracy_reward_math": 0.06835937630967237, + "step": 666 + }, + { + "clip_ratio": 0.0002976899445457093, + "epoch": 1.0898688416838458, + "grad_norm": 0.03663322329521179, + "kl": 0.005788326263427734, + "learning_rate": 4.081736333031066e-06, + "loss": 0.0086, + "step": 667 + }, + { + "clip_ratio": 0.0002965517393818118, + "epoch": 1.0917809446983957, + "grad_norm": 0.03593512997031212, + "kl": 0.005764484405517578, + "learning_rate": 4.0768734233183376e-06, + "loss": 0.0085, + "step": 668 + }, + { + "clip_ratio": 0.0003466513953753747, + "epoch": 1.0936930477129456, + "grad_norm": 0.03643948212265968, + "kl": 0.005777835845947266, + "learning_rate": 4.072000585012947e-06, + "loss": 0.0085, + "step": 669 + }, + { + "clip_ratio": 0.00037185640462666925, + "epoch": 1.0956051507274955, + "grad_norm": 0.03601692244410515, + "kl": 0.0058193206787109375, + "learning_rate": 4.06711784879613e-06, + "loss": 0.0084, + "step": 670 + }, + { + "clip_ratio": 0.0, + "completion_length": 526.0530390739441, + "epoch": 1.0975172537420455, + "grad_norm": 0.03892623260617256, + "kl": 0.005596637725830078, + "learning_rate": 4.062225245411444e-06, + "loss": 0.007, + "num_tokens": 416383588.0, + "reward": 0.061104913387680426, + "reward_std": 0.07539348275167868, + "rewards/pure_accuracy_reward_math": 0.06110491187428124, + "step": 671 + }, + { + "clip_ratio": 0.0003017952032280391, + "epoch": 1.0994293567565951, + "grad_norm": 0.0375184491276741, + "kl": 0.0056912899017333984, + "learning_rate": 4.057322805664576e-06, + "loss": 0.007, + "step": 672 + }, + { + "clip_ratio": 0.0002928147856096075, + "epoch": 1.1013414597711453, + "grad_norm": 0.03731007128953934, + "kl": 0.0057830810546875, + "learning_rate": 4.0524105604231435e-06, + "loss": 0.0069, + "step": 673 + }, + { + "clip_ratio": 0.000317500726794151, + "epoch": 1.103253562785695, + "grad_norm": 0.03885798528790474, + "kl": 0.005819559097290039, + "learning_rate": 4.047488540616503e-06, + "loss": 0.0069, + "step": 674 + }, + { + "clip_ratio": 0.0003141532706649741, + "epoch": 1.105165665800245, + "grad_norm": 0.03583172708749771, + "kl": 0.005753278732299805, + "learning_rate": 4.042556777235558e-06, + "loss": 0.0068, + "step": 675 + }, + { + "clip_ratio": 0.0, + "completion_length": 523.9950060844421, + "epoch": 1.1070777688147948, + "grad_norm": 0.03652811422944069, + "kl": 0.005724668502807617, + "learning_rate": 4.037615301332559e-06, + "loss": 0.0088, + "num_tokens": 419993906.0, + "reward": 0.061383931315504014, + "reward_std": 0.07067021139664575, + "rewards/pure_accuracy_reward_math": 0.06138392974389717, + "step": 676 + }, + { + "clip_ratio": 0.00028260578790195723, + "epoch": 1.1089898718293447, + "grad_norm": 0.035632383078336716, + "kl": 0.0056421756744384766, + "learning_rate": 4.0326641440209114e-06, + "loss": 0.0088, + "step": 677 + }, + { + "clip_ratio": 0.0002882395116614589, + "epoch": 1.1109019748438946, + "grad_norm": 0.03453977406024933, + "kl": 0.005593061447143555, + "learning_rate": 4.027703336474979e-06, + "loss": 0.0087, + "step": 678 + }, + { + "clip_ratio": 0.000319835560901538, + "epoch": 1.1128140778584446, + "grad_norm": 0.03415689244866371, + "kl": 0.005594968795776367, + "learning_rate": 4.022732909929883e-06, + "loss": 0.0087, + "step": 679 + }, + { + "clip_ratio": 0.00033849146848297096, + "epoch": 1.1147261808729945, + "grad_norm": 0.03406994044780731, + "kl": 0.005631208419799805, + "learning_rate": 4.017752895681315e-06, + "loss": 0.0086, + "step": 680 + }, + { + "clip_ratio": 0.0, + "completion_length": 521.6057720184326, + "epoch": 1.1166382838875444, + "grad_norm": 0.06026715040206909, + "kl": 0.005751848220825195, + "learning_rate": 4.012763325085332e-06, + "loss": 0.0067, + "num_tokens": 423598941.0, + "reward": 0.07198661082657054, + "reward_std": 0.08763020345941186, + "rewards/pure_accuracy_reward_math": 0.07198660844005644, + "step": 681 + }, + { + "clip_ratio": 0.00031779767027728667, + "epoch": 1.1185503869020943, + "grad_norm": 2.6160011291503906, + "kl": 0.005651235580444336, + "learning_rate": 4.0077642295581605e-06, + "loss": 0.007, + "step": 682 + }, + { + "clip_ratio": 0.00035409004277653366, + "epoch": 1.1204624899166442, + "grad_norm": 6.490725994110107, + "kl": 0.04636049270629883, + "learning_rate": 4.002755640576002e-06, + "loss": 0.0083, + "step": 683 + }, + { + "clip_ratio": 0.000386831109835839, + "epoch": 1.1223745929311941, + "grad_norm": 0.13183599710464478, + "kl": 0.0063648223876953125, + "learning_rate": 3.997737589674828e-06, + "loss": 0.0067, + "step": 684 + }, + { + "clip_ratio": 0.00042002629169246575, + "epoch": 1.124286695945744, + "grad_norm": 61.113468170166016, + "kl": 0.00571751594543457, + "learning_rate": 3.992710108450192e-06, + "loss": 0.0205, + "step": 685 + }, + { + "clip_ratio": 0.0, + "completion_length": 534.679431438446, + "epoch": 1.126198798960294, + "grad_norm": 0.0341753326356411, + "kl": 0.006865501403808594, + "learning_rate": 3.987673228557017e-06, + "loss": 0.0032, + "num_tokens": 427249916.0, + "reward": 0.056919645285233855, + "reward_std": 0.06538890511728823, + "rewards/pure_accuracy_reward_math": 0.05691964429570362, + "step": 686 + }, + { + "clip_ratio": 0.00022898520234093667, + "epoch": 1.1281109019748439, + "grad_norm": 0.03356679156422615, + "kl": 0.006783246994018555, + "learning_rate": 3.982626981709412e-06, + "loss": 0.0032, + "step": 687 + }, + { + "clip_ratio": 0.00023695471924156664, + "epoch": 1.1300230049893938, + "grad_norm": 0.03283276781439781, + "kl": 0.006662845611572266, + "learning_rate": 3.977571399680457e-06, + "loss": 0.0031, + "step": 688 + }, + { + "clip_ratio": 0.000234549945901108, + "epoch": 1.1319351080039437, + "grad_norm": 0.032041046768426895, + "kl": 0.00657343864440918, + "learning_rate": 3.972506514302013e-06, + "loss": 0.0031, + "step": 689 + }, + { + "clip_ratio": 0.00026119674055280484, + "epoch": 1.1338472110184936, + "grad_norm": 0.03098335862159729, + "kl": 0.006501674652099609, + "learning_rate": 3.967432357464518e-06, + "loss": 0.003, + "step": 690 + }, + { + "clip_ratio": 0.0, + "completion_length": 533.4330596923828, + "epoch": 1.1357593140330435, + "grad_norm": 0.03648236393928528, + "kl": 0.005389690399169922, + "learning_rate": 3.962348961116786e-06, + "loss": 0.0075, + "num_tokens": 430894100.0, + "reward": 0.059151788300368935, + "reward_std": 0.06680402747588232, + "rewards/pure_accuracy_reward_math": 0.059151787078008056, + "step": 691 + }, + { + "clip_ratio": 0.00024069582485708452, + "epoch": 1.1376714170475934, + "grad_norm": 0.03502041473984718, + "kl": 0.005405902862548828, + "learning_rate": 3.957256357265806e-06, + "loss": 0.0075, + "step": 692 + }, + { + "clip_ratio": 0.00026108162376203836, + "epoch": 1.1395835200621434, + "grad_norm": 0.03438780456781387, + "kl": 0.0054416656494140625, + "learning_rate": 3.952154577976543e-06, + "loss": 0.0075, + "step": 693 + }, + { + "clip_ratio": 0.0002536772994972125, + "epoch": 1.1414956230766933, + "grad_norm": 0.03388332575559616, + "kl": 0.005480289459228516, + "learning_rate": 3.947043655371734e-06, + "loss": 0.0075, + "step": 694 + }, + { + "clip_ratio": 0.00027197748300977764, + "epoch": 1.1434077260912432, + "grad_norm": 0.03378571942448616, + "kl": 0.005473136901855469, + "learning_rate": 3.941923621631683e-06, + "loss": 0.0074, + "step": 695 + }, + { + "clip_ratio": 0.0, + "completion_length": 523.0050506591797, + "epoch": 1.145319829105793, + "grad_norm": 0.040138646960258484, + "kl": 0.005397796630859375, + "learning_rate": 3.936794508994062e-06, + "loss": 0.0033, + "num_tokens": 434502306.0, + "reward": 0.07142857456346974, + "reward_std": 0.08093377470504493, + "rewards/pure_accuracy_reward_math": 0.07142857316648588, + "step": 696 + }, + { + "clip_ratio": 0.00026038982610998573, + "epoch": 1.147231932120343, + "grad_norm": 0.03855022042989731, + "kl": 0.005437135696411133, + "learning_rate": 3.931656349753709e-06, + "loss": 0.0033, + "step": 697 + }, + { + "clip_ratio": 0.0002577857798655714, + "epoch": 1.149144035134893, + "grad_norm": 0.03805391117930412, + "kl": 0.005386829376220703, + "learning_rate": 3.9265091762624225e-06, + "loss": 0.0032, + "step": 698 + }, + { + "clip_ratio": 0.0002938498616913421, + "epoch": 1.1510561381494429, + "grad_norm": 0.03830750659108162, + "kl": 0.005461931228637695, + "learning_rate": 3.921353020928756e-06, + "loss": 0.0032, + "step": 699 + }, + { + "clip_ratio": 0.00026367085320089245, + "epoch": 1.1529682411639928, + "grad_norm": 0.03759397566318512, + "kl": 0.0055010318756103516, + "learning_rate": 3.916187916217818e-06, + "loss": 0.0031, + "step": 700 + }, + { + "clip_ratio": 0.0, + "completion_length": 532.7466740608215, + "epoch": 1.1548803441785427, + "grad_norm": 0.03618447855114937, + "kl": 0.0054166316986083984, + "learning_rate": 3.911013894651067e-06, + "loss": 0.0066, + "num_tokens": 438144462.0, + "reward": 0.06501116344588809, + "reward_std": 0.07457646209513769, + "rewards/pure_accuracy_reward_math": 0.06501116175786592, + "step": 701 + }, + { + "clip_ratio": 0.00028753443712048465, + "epoch": 1.1567924471930926, + "grad_norm": 0.035918354988098145, + "kl": 0.005413532257080078, + "learning_rate": 3.905830988806101e-06, + "loss": 0.0066, + "step": 702 + }, + { + "clip_ratio": 0.0002842856440565811, + "epoch": 1.1587045502076425, + "grad_norm": 0.03422370180487633, + "kl": 0.005442619323730469, + "learning_rate": 3.90063923131646e-06, + "loss": 0.0066, + "step": 703 + }, + { + "clip_ratio": 0.0002819241568090547, + "epoch": 1.1606166532221924, + "grad_norm": 0.03359530121088028, + "kl": 0.00537109375, + "learning_rate": 3.895438654871416e-06, + "loss": 0.0065, + "step": 704 + }, + { + "clip_ratio": 0.0003241457142166837, + "epoch": 1.1625287562367423, + "grad_norm": 0.033465541899204254, + "kl": 0.0053484439849853516, + "learning_rate": 3.890229292215773e-06, + "loss": 0.0065, + "step": 705 + }, + { + "clip_ratio": 0.0, + "completion_length": 526.7639741897583, + "epoch": 1.1644408592512923, + "grad_norm": 0.03731166943907738, + "kl": 0.00535893440246582, + "learning_rate": 3.885011176149647e-06, + "loss": 0.0071, + "num_tokens": 441760876.0, + "reward": 0.06612723506987095, + "reward_std": 0.06822534691309556, + "rewards/pure_accuracy_reward_math": 0.06612723367288709, + "step": 706 + }, + { + "clip_ratio": 0.00025104734473302415, + "epoch": 1.166352962265842, + "grad_norm": 0.03429851680994034, + "kl": 0.005263566970825195, + "learning_rate": 3.879784339528277e-06, + "loss": 0.0071, + "step": 707 + }, + { + "clip_ratio": 0.0002501190919019791, + "epoch": 1.168265065280392, + "grad_norm": 0.034958597272634506, + "kl": 0.0052831172943115234, + "learning_rate": 3.874548815261809e-06, + "loss": 0.0071, + "step": 708 + }, + { + "clip_ratio": 0.0002633173795629773, + "epoch": 1.1701771682949418, + "grad_norm": 0.032111622393131256, + "kl": 0.005318403244018555, + "learning_rate": 3.869304636315085e-06, + "loss": 0.007, + "step": 709 + }, + { + "clip_ratio": 0.00028521847832507774, + "epoch": 1.172089271309492, + "grad_norm": 0.03191748261451721, + "kl": 0.005407810211181641, + "learning_rate": 3.864051835707444e-06, + "loss": 0.007, + "step": 710 + }, + { + "clip_ratio": 0.0, + "completion_length": 522.3457269668579, + "epoch": 1.1740013743240416, + "grad_norm": 0.05126773193478584, + "kl": 0.01187896728515625, + "learning_rate": 3.85879044651251e-06, + "loss": 0.0066, + "num_tokens": 445370959.0, + "reward": 0.06863839653669856, + "reward_std": 0.07951865292852744, + "rewards/pure_accuracy_reward_math": 0.06863839438301511, + "step": 711 + }, + { + "clip_ratio": 0.00028669004558423694, + "epoch": 1.1759134773385915, + "grad_norm": 0.051731474697589874, + "kl": 0.011458396911621094, + "learning_rate": 3.853520501857981e-06, + "loss": 0.0066, + "step": 712 + }, + { + "clip_ratio": 0.0003143258599038745, + "epoch": 1.1778255803531414, + "grad_norm": 0.051190439611673355, + "kl": 0.010621786117553711, + "learning_rate": 3.848242034925429e-06, + "loss": 0.0065, + "step": 713 + }, + { + "clip_ratio": 0.00033165596249773444, + "epoch": 1.1797376833676914, + "grad_norm": 0.04840007424354553, + "kl": 0.009693622589111328, + "learning_rate": 3.842955078950079e-06, + "loss": 0.0064, + "step": 714 + }, + { + "clip_ratio": 0.00035113433239075675, + "epoch": 1.1816497863822413, + "grad_norm": 0.048264067620038986, + "kl": 0.008889198303222656, + "learning_rate": 3.837659667220612e-06, + "loss": 0.0063, + "step": 715 + }, + { + "clip_ratio": 0.0, + "completion_length": 547.5633645057678, + "epoch": 1.1835618893967912, + "grad_norm": 0.03458649665117264, + "kl": 0.005284786224365234, + "learning_rate": 3.832355833078945e-06, + "loss": 0.0047, + "num_tokens": 449069046.0, + "reward": 0.05691964572179131, + "reward_std": 0.06861072586616501, + "rewards/pure_accuracy_reward_math": 0.05691964415018447, + "step": 716 + }, + { + "clip_ratio": 0.0002876185501463624, + "epoch": 1.185473992411341, + "grad_norm": 0.033646877855062485, + "kl": 0.005215167999267578, + "learning_rate": 3.82704360992003e-06, + "loss": 0.0047, + "step": 717 + }, + { + "clip_ratio": 0.0003252235952686533, + "epoch": 1.187386095425891, + "grad_norm": 0.03455204889178276, + "kl": 0.0051419734954833984, + "learning_rate": 3.8217230311916365e-06, + "loss": 0.0046, + "step": 718 + }, + { + "clip_ratio": 0.0003351885409870192, + "epoch": 1.189298198440441, + "grad_norm": 0.033362697809934616, + "kl": 0.0050907135009765625, + "learning_rate": 3.816394130394142e-06, + "loss": 0.0046, + "step": 719 + }, + { + "clip_ratio": 0.00032723310141591355, + "epoch": 1.1912103014549908, + "grad_norm": 0.03211547061800957, + "kl": 0.0051004886627197266, + "learning_rate": 3.811056941080329e-06, + "loss": 0.0045, + "step": 720 + }, + { + "clip_ratio": 0.0, + "completion_length": 537.3167090415955, + "epoch": 1.1931224044695408, + "grad_norm": 0.03566175699234009, + "kl": 0.0053424835205078125, + "learning_rate": 3.805711496855161e-06, + "loss": 0.009, + "num_tokens": 452726381.0, + "reward": 0.06054687776486389, + "reward_std": 0.07264336961088702, + "rewards/pure_accuracy_reward_math": 0.06054687677533366, + "step": 721 + }, + { + "clip_ratio": 0.00029346574888222676, + "epoch": 1.1950345074840907, + "grad_norm": 0.03476826474070549, + "kl": 0.005379438400268555, + "learning_rate": 3.800357831375583e-06, + "loss": 0.009, + "step": 722 + }, + { + "clip_ratio": 0.00027920183202923, + "epoch": 1.1969466104986406, + "grad_norm": 0.03446114435791969, + "kl": 0.005425691604614258, + "learning_rate": 3.794995978350301e-06, + "loss": 0.009, + "step": 723 + }, + { + "clip_ratio": 0.00031396149876172785, + "epoch": 1.1988587135131905, + "grad_norm": 0.0340140238404274, + "kl": 0.005489826202392578, + "learning_rate": 3.7896259715395727e-06, + "loss": 0.0089, + "step": 724 + }, + { + "clip_ratio": 0.0002986833567888425, + "epoch": 1.2007708165277404, + "grad_norm": 0.03497212752699852, + "kl": 0.005522489547729492, + "learning_rate": 3.784247844754997e-06, + "loss": 0.0088, + "step": 725 + }, + { + "clip_ratio": 0.0, + "completion_length": 548.8044338226318, + "epoch": 1.2026829195422903, + "grad_norm": 0.04050953686237335, + "kl": 0.005362510681152344, + "learning_rate": 3.778861631859298e-06, + "loss": 0.0112, + "num_tokens": 456433388.0, + "reward": 0.06696428879513405, + "reward_std": 0.08140548242954537, + "rewards/pure_accuracy_reward_math": 0.06696428728173487, + "step": 726 + }, + { + "clip_ratio": 0.0003468562302373357, + "epoch": 1.2045950225568403, + "grad_norm": 0.03805195167660713, + "kl": 0.005377531051635742, + "learning_rate": 3.7734673667661133e-06, + "loss": 0.0112, + "step": 727 + }, + { + "clip_ratio": 0.00037477223943938043, + "epoch": 1.2065071255713902, + "grad_norm": 0.03666882589459419, + "kl": 0.005417585372924805, + "learning_rate": 3.7680650834397804e-06, + "loss": 0.0112, + "step": 728 + }, + { + "clip_ratio": 0.0003945930936311015, + "epoch": 1.20841922858594, + "grad_norm": 0.03651399165391922, + "kl": 0.005425453186035156, + "learning_rate": 3.762654815895122e-06, + "loss": 0.0111, + "step": 729 + }, + { + "clip_ratio": 0.0004650242010484362, + "epoch": 1.21033133160049, + "grad_norm": 0.03792130947113037, + "kl": 0.005422115325927734, + "learning_rate": 3.7572365981972335e-06, + "loss": 0.0111, + "step": 730 + }, + { + "clip_ratio": 0.0, + "completion_length": 528.6861305236816, + "epoch": 1.21224343461504, + "grad_norm": 0.0365571565926075, + "kl": 0.005487203598022461, + "learning_rate": 3.7518104644612663e-06, + "loss": 0.0098, + "num_tokens": 460061367.0, + "reward": 0.06417411062284373, + "reward_std": 0.07478918455308303, + "rewards/pure_accuracy_reward_math": 0.06417410852736793, + "step": 731 + }, + { + "clip_ratio": 0.0002798708824229834, + "epoch": 1.2141555376295898, + "grad_norm": 0.036456115543842316, + "kl": 0.005484342575073242, + "learning_rate": 3.746376448852216e-06, + "loss": 0.0098, + "step": 732 + }, + { + "clip_ratio": 0.0003001830394850913, + "epoch": 1.2160676406441397, + "grad_norm": 0.036120470613241196, + "kl": 0.005544900894165039, + "learning_rate": 3.740934585584702e-06, + "loss": 0.0098, + "step": 733 + }, + { + "clip_ratio": 0.00028155883609315424, + "epoch": 1.2179797436586897, + "grad_norm": 0.03475060313940048, + "kl": 0.005614042282104492, + "learning_rate": 3.735484908922759e-06, + "loss": 0.0097, + "step": 734 + }, + { + "clip_ratio": 0.00027523975251142474, + "epoch": 1.2198918466732396, + "grad_norm": 0.03388671204447746, + "kl": 0.005706310272216797, + "learning_rate": 3.730027453179617e-06, + "loss": 0.0096, + "step": 735 + }, + { + "clip_ratio": 0.0, + "completion_length": 518.6091203689575, + "epoch": 1.2218039496877895, + "grad_norm": 0.039098870009183884, + "kl": 0.005930900573730469, + "learning_rate": 3.7245622527174858e-06, + "loss": 0.0072, + "num_tokens": 463651718.0, + "reward": 0.06277902098372579, + "reward_std": 0.06552149687195197, + "rewards/pure_accuracy_reward_math": 0.06277901912108064, + "step": 736 + }, + { + "clip_ratio": 0.000267848483247235, + "epoch": 1.2237160527023394, + "grad_norm": 0.03896670043468475, + "kl": 0.005952358245849609, + "learning_rate": 3.719089341947337e-06, + "loss": 0.0072, + "step": 737 + }, + { + "clip_ratio": 0.00026333254504606884, + "epoch": 1.2256281557168893, + "grad_norm": 0.03838280960917473, + "kl": 0.005873680114746094, + "learning_rate": 3.7136087553286916e-06, + "loss": 0.0072, + "step": 738 + }, + { + "clip_ratio": 0.0002850479507969794, + "epoch": 1.2275402587314392, + "grad_norm": 0.03708336502313614, + "kl": 0.005741596221923828, + "learning_rate": 3.7081205273694005e-06, + "loss": 0.0071, + "step": 739 + }, + { + "clip_ratio": 0.00030947004142944934, + "epoch": 1.2294523617459892, + "grad_norm": 0.03616032376885414, + "kl": 0.005689144134521484, + "learning_rate": 3.702624692625427e-06, + "loss": 0.007, + "step": 740 + }, + { + "clip_ratio": 0.0, + "completion_length": 515.3027577400208, + "epoch": 1.231364464760539, + "grad_norm": 473.16009521484375, + "kl": 7.4117608070373535, + "learning_rate": 3.6971212857006277e-06, + "loss": 0.3027, + "num_tokens": 467231411.0, + "reward": 0.07003348527359776, + "reward_std": 0.07058388437144458, + "rewards/pure_accuracy_reward_math": 0.07003348364378326, + "step": 741 + }, + { + "clip_ratio": 0.00048789031319529386, + "epoch": 1.2332765677750888, + "grad_norm": 15.009349822998047, + "kl": 0.3277552127838135, + "learning_rate": 3.6916103412465405e-06, + "loss": 0.0207, + "step": 742 + }, + { + "clip_ratio": 0.0005436847095552366, + "epoch": 1.235188670789639, + "grad_norm": 34.010345458984375, + "kl": 0.01839423179626465, + "learning_rate": 3.6860918939621586e-06, + "loss": 0.0299, + "step": 743 + }, + { + "clip_ratio": 0.000597593801558105, + "epoch": 1.2371007738041886, + "grad_norm": 13.507566452026367, + "kl": 0.02814960479736328, + "learning_rate": 3.6805659785937176e-06, + "loss": 0.0188, + "step": 744 + }, + { + "clip_ratio": 0.0005609532486232638, + "epoch": 1.2390128768187387, + "grad_norm": 6.263442516326904, + "kl": 0.20073914527893066, + "learning_rate": 3.675032629934475e-06, + "loss": 0.0163, + "step": 745 + }, + { + "clip_ratio": 0.0, + "completion_length": 530.3340101242065, + "epoch": 1.2409249798332884, + "grad_norm": 0.051358480006456375, + "kl": 0.0063626766204833984, + "learning_rate": 3.6694918828244923e-06, + "loss": 0.0095, + "num_tokens": 470866344.0, + "reward": 0.06333705666474998, + "reward_std": 0.07530095760012046, + "rewards/pure_accuracy_reward_math": 0.06333705509314314, + "step": 746 + }, + { + "clip_ratio": 0.00029982604212364095, + "epoch": 1.2428370828478383, + "grad_norm": 0.03713027015328407, + "kl": 0.006081342697143555, + "learning_rate": 3.6639437721504108e-06, + "loss": 0.0095, + "step": 747 + }, + { + "clip_ratio": 0.0002941023938660692, + "epoch": 1.2447491858623883, + "grad_norm": 0.03500093147158623, + "kl": 0.006156444549560547, + "learning_rate": 3.65838833284524e-06, + "loss": 0.0095, + "step": 748 + }, + { + "clip_ratio": 0.0002858027814340858, + "epoch": 1.2466612888769382, + "grad_norm": 0.03525420278310776, + "kl": 0.006234169006347656, + "learning_rate": 3.652825599888129e-06, + "loss": 0.0094, + "step": 749 + }, + { + "clip_ratio": 0.0002950350276478275, + "epoch": 1.248573391891488, + "grad_norm": 0.03545543923974037, + "kl": 0.006281852722167969, + "learning_rate": 3.647255608304154e-06, + "loss": 0.0093, + "step": 750 + }, + { + "clip_ratio": 0.0, + "completion_length": 530.79438829422, + "epoch": 1.250485494906038, + "grad_norm": 0.03711007162928581, + "kl": 0.005670070648193359, + "learning_rate": 3.641678393164092e-06, + "loss": 0.0131, + "num_tokens": 474505191.0, + "reward": 0.07170759318978526, + "reward_std": 0.07251697574974969, + "rewards/pure_accuracy_reward_math": 0.0717075907450635, + "step": 751 + }, + { + "clip_ratio": 0.00029345202176500607, + "epoch": 1.252397597920588, + "grad_norm": 0.036423034965991974, + "kl": 0.005608320236206055, + "learning_rate": 3.636093989584204e-06, + "loss": 0.0131, + "step": 752 + }, + { + "clip_ratio": 0.00030187425932126644, + "epoch": 1.2543097009351378, + "grad_norm": 0.03613322973251343, + "kl": 0.005610466003417969, + "learning_rate": 3.630502432726012e-06, + "loss": 0.013, + "step": 753 + }, + { + "clip_ratio": 0.0003275847485610939, + "epoch": 1.2562218039496877, + "grad_norm": 0.03452349826693535, + "kl": 0.0057184696197509766, + "learning_rate": 3.6249037577960744e-06, + "loss": 0.013, + "step": 754 + }, + { + "clip_ratio": 0.00034663524741063156, + "epoch": 1.2581339069642377, + "grad_norm": 0.034864939749240875, + "kl": 0.005825996398925781, + "learning_rate": 3.619298000045773e-06, + "loss": 0.0129, + "step": 755 + }, + { + "clip_ratio": 0.0, + "completion_length": 495.8814425468445, + "epoch": 1.2600460099787876, + "grad_norm": 528.279052734375, + "kl": 9.193241596221924, + "learning_rate": 3.6136851947710804e-06, + "loss": 0.3749, + "num_tokens": 478011678.0, + "reward": 0.07979911071015522, + "reward_std": 0.07470905361697078, + "rewards/pure_accuracy_reward_math": 0.0797991082072258, + "step": 756 + }, + { + "clip_ratio": 0.00028275052295612113, + "epoch": 1.2619581129933375, + "grad_norm": 44.662696838378906, + "kl": 1.2635960578918457, + "learning_rate": 3.608065377312348e-06, + "loss": 0.057, + "step": 757 + }, + { + "clip_ratio": 0.00029553008619132015, + "epoch": 1.2638702160078874, + "grad_norm": 4.775911808013916, + "kl": 0.1474595069885254, + "learning_rate": 3.6024385830540758e-06, + "loss": 0.0123, + "step": 758 + }, + { + "clip_ratio": 0.00033371773997714627, + "epoch": 1.2657823190224373, + "grad_norm": 0.30982905626296997, + "kl": 0.01830148696899414, + "learning_rate": 3.5968048474246925e-06, + "loss": 0.0071, + "step": 759 + }, + { + "clip_ratio": 0.0003257711730952906, + "epoch": 1.2676944220369872, + "grad_norm": 0.05356259644031525, + "kl": 0.011959552764892578, + "learning_rate": 3.591164205896332e-06, + "loss": 0.0068, + "step": 760 + }, + { + "clip_ratio": 0.0, + "completion_length": 519.9149203300476, + "epoch": 1.2696065250515371, + "grad_norm": 0.04138460382819176, + "kl": 0.00600886344909668, + "learning_rate": 3.585516693984612e-06, + "loss": 0.0061, + "num_tokens": 481610981.0, + "reward": 0.07059152136207558, + "reward_std": 0.07616424100706354, + "rewards/pure_accuracy_reward_math": 0.07059151938301511, + "step": 761 + }, + { + "clip_ratio": 0.00029173931721970803, + "epoch": 1.271518628066087, + "grad_norm": 0.04057340323925018, + "kl": 0.0059850215911865234, + "learning_rate": 3.5798623472484074e-06, + "loss": 0.006, + "step": 762 + }, + { + "clip_ratio": 0.00031361054851686276, + "epoch": 1.273430731080637, + "grad_norm": 0.0383637472987175, + "kl": 0.005931377410888672, + "learning_rate": 3.5742012012896273e-06, + "loss": 0.006, + "step": 763 + }, + { + "clip_ratio": 0.000302841177983737, + "epoch": 1.275342834095187, + "grad_norm": 0.037009891122579575, + "kl": 0.005960226058959961, + "learning_rate": 3.5685332917529936e-06, + "loss": 0.0059, + "step": 764 + }, + { + "clip_ratio": 0.00032496250122449055, + "epoch": 1.2772549371097368, + "grad_norm": 0.036052413284778595, + "kl": 0.0060160160064697266, + "learning_rate": 3.5628586543258116e-06, + "loss": 0.0058, + "step": 765 + }, + { + "clip_ratio": 0.0, + "completion_length": 505.19645166397095, + "epoch": 1.2791670401242867, + "grad_norm": 0.039108723402023315, + "kl": 0.0060214996337890625, + "learning_rate": 3.5571773247377495e-06, + "loss": 0.0077, + "num_tokens": 485155493.0, + "reward": 0.06473214537254535, + "reward_std": 0.07595151849091053, + "rewards/pure_accuracy_reward_math": 0.06473214438301511, + "step": 766 + }, + { + "clip_ratio": 0.00031215936860462534, + "epoch": 1.2810791431388366, + "grad_norm": 0.03890209272503853, + "kl": 0.0060939788818359375, + "learning_rate": 3.5514893387606113e-06, + "loss": 0.0078, + "step": 767 + }, + { + "clip_ratio": 0.00029648321913100517, + "epoch": 1.2829912461533866, + "grad_norm": 0.038266174495220184, + "kl": 0.0061397552490234375, + "learning_rate": 3.5457947322081126e-06, + "loss": 0.0077, + "step": 768 + }, + { + "clip_ratio": 0.0002988063008615427, + "epoch": 1.2849033491679365, + "grad_norm": 0.03760776296257973, + "kl": 0.006152629852294922, + "learning_rate": 3.5400935409356534e-06, + "loss": 0.0076, + "step": 769 + }, + { + "clip_ratio": 0.00032748817852734646, + "epoch": 1.2868154521824864, + "grad_norm": 0.037058234214782715, + "kl": 0.006194591522216797, + "learning_rate": 3.5343858008400955e-06, + "loss": 0.0076, + "step": 770 + }, + { + "clip_ratio": 0.0, + "completion_length": 513.085681438446, + "epoch": 1.2887275551970363, + "grad_norm": 0.04272163286805153, + "kl": 0.006904125213623047, + "learning_rate": 3.5286715478595335e-06, + "loss": 0.0066, + "num_tokens": 488731916.0, + "reward": 0.06668527112924494, + "reward_std": 0.07779828266939148, + "rewards/pure_accuracy_reward_math": 0.0666852695576381, + "step": 771 + }, + { + "clip_ratio": 0.0002989328136209224, + "epoch": 1.2906396582115862, + "grad_norm": 0.039898019284009933, + "kl": 0.006760597229003906, + "learning_rate": 3.52295081797307e-06, + "loss": 0.0066, + "step": 772 + }, + { + "clip_ratio": 0.0003237332452385999, + "epoch": 1.2925517612261361, + "grad_norm": 0.0380416214466095, + "kl": 0.006653547286987305, + "learning_rate": 3.5172236472005866e-06, + "loss": 0.0065, + "step": 773 + }, + { + "clip_ratio": 0.0004160679777100995, + "epoch": 1.294463864240686, + "grad_norm": 0.03860335052013397, + "kl": 0.006639003753662109, + "learning_rate": 3.511490071602523e-06, + "loss": 0.0065, + "step": 774 + }, + { + "clip_ratio": 0.0004345110206713798, + "epoch": 1.2963759672552357, + "grad_norm": 0.0405069962143898, + "kl": 0.006697654724121094, + "learning_rate": 3.505750127279643e-06, + "loss": 0.0064, + "step": 775 + }, + { + "clip_ratio": 0.0, + "completion_length": 529.7695565223694, + "epoch": 1.2982880702697859, + "grad_norm": 0.040585048496723175, + "kl": 0.006101369857788086, + "learning_rate": 3.500003850372811e-06, + "loss": 0.0043, + "num_tokens": 492363370.0, + "reward": 0.07477678926079534, + "reward_std": 0.08466117118950933, + "rewards/pure_accuracy_reward_math": 0.07477678704890423, + "step": 776 + }, + { + "clip_ratio": 0.0003347315081327906, + "epoch": 1.3002001732843356, + "grad_norm": 0.039613205939531326, + "kl": 0.0060977935791015625, + "learning_rate": 3.4942512770627655e-06, + "loss": 0.0043, + "step": 777 + }, + { + "clip_ratio": 0.0003803396672310555, + "epoch": 1.3021122762988857, + "grad_norm": 0.03965132310986519, + "kl": 0.006110668182373047, + "learning_rate": 3.4884924435698875e-06, + "loss": 0.0042, + "step": 778 + }, + { + "clip_ratio": 0.00035469116983222193, + "epoch": 1.3040243793134354, + "grad_norm": 0.038701362907886505, + "kl": 0.005974292755126953, + "learning_rate": 3.482727386153974e-06, + "loss": 0.0041, + "step": 779 + }, + { + "clip_ratio": 0.00038596760680320585, + "epoch": 1.3059364823279855, + "grad_norm": 0.03767050802707672, + "kl": 0.0059070587158203125, + "learning_rate": 3.4769561411140123e-06, + "loss": 0.0041, + "step": 780 + }, + { + "clip_ratio": 0.0, + "completion_length": 528.3593993186951, + "epoch": 1.3078485853425352, + "grad_norm": 0.04520969092845917, + "kl": 0.015022039413452148, + "learning_rate": 3.471178744787948e-06, + "loss": 0.0107, + "num_tokens": 495988466.0, + "reward": 0.07449777098372579, + "reward_std": 0.08161820413079113, + "rewards/pure_accuracy_reward_math": 0.07449777016881853, + "step": 781 + }, + { + "clip_ratio": 0.00032587463357458546, + "epoch": 1.3097606883570854, + "grad_norm": 0.04337235167622566, + "kl": 0.01485586166381836, + "learning_rate": 3.465395233552458e-06, + "loss": 0.0107, + "step": 782 + }, + { + "clip_ratio": 0.00031156001216459117, + "epoch": 1.311672791371635, + "grad_norm": 0.04306100681424141, + "kl": 0.014668941497802734, + "learning_rate": 3.459605643822721e-06, + "loss": 0.0106, + "step": 783 + }, + { + "clip_ratio": 0.00031179932597069637, + "epoch": 1.313584894386185, + "grad_norm": 0.04292943701148033, + "kl": 0.014333724975585938, + "learning_rate": 3.4538100120521884e-06, + "loss": 0.0106, + "step": 784 + }, + { + "clip_ratio": 0.00034586368491318353, + "epoch": 1.315496997400735, + "grad_norm": 0.04207218065857887, + "kl": 0.013885498046875, + "learning_rate": 3.4480083747323527e-06, + "loss": 0.0105, + "step": 785 + }, + { + "clip_ratio": 0.0, + "completion_length": 521.3471217155457, + "epoch": 1.3174091004152848, + "grad_norm": 0.04057139530777931, + "kl": 0.006026268005371094, + "learning_rate": 3.4422007683925224e-06, + "loss": 0.0119, + "num_tokens": 499590878.0, + "reward": 0.08091518239234574, + "reward_std": 0.08763020328478888, + "rewards/pure_accuracy_reward_math": 0.08091518023866229, + "step": 786 + }, + { + "clip_ratio": 0.00030802900647586284, + "epoch": 1.3193212034298347, + "grad_norm": 0.039306215941905975, + "kl": 0.00603485107421875, + "learning_rate": 3.436387229599587e-06, + "loss": 0.0119, + "step": 787 + }, + { + "clip_ratio": 0.00034579116845634417, + "epoch": 1.3212333064443846, + "grad_norm": 0.03839893266558647, + "kl": 0.006104469299316406, + "learning_rate": 3.4305677949577915e-06, + "loss": 0.0118, + "step": 788 + }, + { + "clip_ratio": 0.00036078316020393686, + "epoch": 1.3231454094589346, + "grad_norm": 0.03700988367199898, + "kl": 0.006115436553955078, + "learning_rate": 3.4247425011084993e-06, + "loss": 0.0118, + "step": 789 + }, + { + "clip_ratio": 0.0003916456239494437, + "epoch": 1.3250575124734845, + "grad_norm": 0.03749685734510422, + "kl": 0.006115436553955078, + "learning_rate": 3.418911384729971e-06, + "loss": 0.0117, + "step": 790 + }, + { + "clip_ratio": 0.0, + "completion_length": 502.7112407684326, + "epoch": 1.3269696154880344, + "grad_norm": 0.03917763754725456, + "kl": 0.009302139282226562, + "learning_rate": 3.413074482537123e-06, + "loss": 0.0077, + "num_tokens": 503128079.0, + "reward": 0.07059152112924494, + "reward_std": 0.07702752505429089, + "rewards/pure_accuracy_reward_math": 0.07059151944122277, + "step": 791 + }, + { + "clip_ratio": 0.0002787132019079763, + "epoch": 1.3288817185025843, + "grad_norm": 0.03894754871726036, + "kl": 0.009203910827636719, + "learning_rate": 3.4072318312813044e-06, + "loss": 0.0077, + "step": 792 + }, + { + "clip_ratio": 0.00031091465683630304, + "epoch": 1.3307938215171342, + "grad_norm": 0.03774462640285492, + "kl": 0.008921146392822266, + "learning_rate": 3.4013834677500612e-06, + "loss": 0.0077, + "step": 793 + }, + { + "clip_ratio": 0.00030987418773520403, + "epoch": 1.3327059245316841, + "grad_norm": 0.03737964481115341, + "kl": 0.008791923522949219, + "learning_rate": 3.395529428766907e-06, + "loss": 0.0076, + "step": 794 + }, + { + "clip_ratio": 0.0003597256319380904, + "epoch": 1.334618027546234, + "grad_norm": 0.03793202340602875, + "kl": 0.008593559265136719, + "learning_rate": 3.3896697511910898e-06, + "loss": 0.0075, + "step": 795 + }, + { + "clip_ratio": 0.0, + "completion_length": 516.8552160263062, + "epoch": 1.336530130560784, + "grad_norm": 0.03877223655581474, + "kl": 0.005873441696166992, + "learning_rate": 3.3838044719173603e-06, + "loss": 0.0086, + "num_tokens": 506711636.0, + "reward": 0.06529018195578828, + "reward_std": 0.06942774722119793, + "rewards/pure_accuracy_reward_math": 0.06529017997672781, + "step": 796 + }, + { + "clip_ratio": 0.0002862633294853367, + "epoch": 1.3384422335753339, + "grad_norm": 0.0376199446618557, + "kl": 0.005820274353027344, + "learning_rate": 3.377933627875739e-06, + "loss": 0.0086, + "step": 797 + }, + { + "clip_ratio": 0.0002861461452994263, + "epoch": 1.3403543365898838, + "grad_norm": 0.036890070885419846, + "kl": 0.005822658538818359, + "learning_rate": 3.3720572560312854e-06, + "loss": 0.0086, + "step": 798 + }, + { + "clip_ratio": 0.0003201163677317709, + "epoch": 1.3422664396044337, + "grad_norm": 0.03669756278395653, + "kl": 0.005821704864501953, + "learning_rate": 3.366175393383863e-06, + "loss": 0.0085, + "step": 799 + }, + { + "clip_ratio": 0.0003494162402830625, + "epoch": 1.3441785426189836, + "grad_norm": 0.03721420839428902, + "kl": 0.005818843841552734, + "learning_rate": 3.360288076967909e-06, + "loss": 0.0084, + "step": 800 + }, + { + "clip_ratio": 0.0, + "completion_length": 505.6105146408081, + "epoch": 1.3460906456335335, + "grad_norm": 0.040034398436546326, + "kl": 0.006266117095947266, + "learning_rate": 3.3543953438521983e-06, + "loss": 0.0091, + "num_tokens": 510255728.0, + "reward": 0.0675223250000272, + "reward_std": 0.07577886182116345, + "rewards/pure_accuracy_reward_math": 0.06752232249709778, + "step": 801 + }, + { + "clip_ratio": 0.00027677676553139463, + "epoch": 1.3480027486480834, + "grad_norm": 0.038657769560813904, + "kl": 0.006215572357177734, + "learning_rate": 3.3484972311396114e-06, + "loss": 0.0091, + "step": 802 + }, + { + "clip_ratio": 0.0002909586188479807, + "epoch": 1.3499148516626334, + "grad_norm": 0.036970507353544235, + "kl": 0.006129741668701172, + "learning_rate": 3.342593775966901e-06, + "loss": 0.009, + "step": 803 + }, + { + "clip_ratio": 0.0003427068459700422, + "epoch": 1.3518269546771833, + "grad_norm": 0.03707785904407501, + "kl": 0.006056785583496094, + "learning_rate": 3.3366850155044595e-06, + "loss": 0.009, + "step": 804 + }, + { + "clip_ratio": 0.00038909467849634893, + "epoch": 1.3537390576917332, + "grad_norm": 0.03700149059295654, + "kl": 0.005985736846923828, + "learning_rate": 3.33077098695608e-06, + "loss": 0.0089, + "step": 805 + }, + { + "clip_ratio": 0.0, + "completion_length": 527.0212287902832, + "epoch": 1.355651160706283, + "grad_norm": 0.04373861476778984, + "kl": 0.005824565887451172, + "learning_rate": 3.3248517275587292e-06, + "loss": 0.0094, + "num_tokens": 513879112.0, + "reward": 0.0703125029685907, + "reward_std": 0.08085364429280162, + "rewards/pure_accuracy_reward_math": 0.07031250145519152, + "step": 806 + }, + { + "clip_ratio": 0.00031092700191948097, + "epoch": 1.357563263720833, + "grad_norm": 0.04273909702897072, + "kl": 0.0058460235595703125, + "learning_rate": 3.318927274582307e-06, + "loss": 0.0094, + "step": 807 + }, + { + "clip_ratio": 0.0003359753473546334, + "epoch": 1.359475366735383, + "grad_norm": 0.04217194393277168, + "kl": 0.005980014801025391, + "learning_rate": 3.312997665329414e-06, + "loss": 0.0093, + "step": 808 + }, + { + "clip_ratio": 0.0003392697701940506, + "epoch": 1.3613874697499329, + "grad_norm": 0.04189891368150711, + "kl": 0.0061492919921875, + "learning_rate": 3.3070629371351176e-06, + "loss": 0.0093, + "step": 809 + }, + { + "clip_ratio": 0.0003985974152556082, + "epoch": 1.3632995727644825, + "grad_norm": 0.04113880172371864, + "kl": 0.0062618255615234375, + "learning_rate": 3.3011231273667155e-06, + "loss": 0.0092, + "step": 810 + }, + { + "clip_ratio": 0.0, + "completion_length": 523.8002490997314, + "epoch": 1.3652116757790327, + "grad_norm": 0.039511535316705704, + "kl": 0.007502555847167969, + "learning_rate": 3.295178273423501e-06, + "loss": 0.0065, + "num_tokens": 517489928.0, + "reward": 0.06835937840514816, + "reward_std": 0.0761642413563095, + "rewards/pure_accuracy_reward_math": 0.06835937636788003, + "step": 811 + }, + { + "clip_ratio": 0.00033993283830113796, + "epoch": 1.3671237787935824, + "grad_norm": 0.03911852091550827, + "kl": 0.0074634552001953125, + "learning_rate": 3.2892284127365277e-06, + "loss": 0.0065, + "step": 812 + }, + { + "clip_ratio": 0.00029188678922764666, + "epoch": 1.3690358818081325, + "grad_norm": 0.038789719343185425, + "kl": 0.007461071014404297, + "learning_rate": 3.2832735827683733e-06, + "loss": 0.0064, + "step": 813 + }, + { + "clip_ratio": 0.00031692377649505943, + "epoch": 1.3709479848226822, + "grad_norm": 0.03795900195837021, + "kl": 0.007411956787109375, + "learning_rate": 3.2773138210129037e-06, + "loss": 0.0063, + "step": 814 + }, + { + "clip_ratio": 0.0003394908647464945, + "epoch": 1.3728600878372323, + "grad_norm": 0.03683575242757797, + "kl": 0.0073795318603515625, + "learning_rate": 3.2713491649950375e-06, + "loss": 0.0063, + "step": 815 + }, + { + "clip_ratio": 0.0, + "completion_length": 527.1018648147583, + "epoch": 1.374772190851782, + "grad_norm": 0.036948177963495255, + "kl": 0.0058441162109375, + "learning_rate": 3.26537965227051e-06, + "loss": 0.0062, + "num_tokens": 521113961.0, + "reward": 0.06333705675206147, + "reward_std": 0.07041122711962089, + "rewards/pure_accuracy_reward_math": 0.06333705494762398, + "step": 816 + }, + { + "clip_ratio": 0.0002517415915690435, + "epoch": 1.3766842938663322, + "grad_norm": 0.03634682297706604, + "kl": 0.005847454071044922, + "learning_rate": 3.2594053204256344e-06, + "loss": 0.0062, + "step": 817 + }, + { + "clip_ratio": 0.00027403954436522326, + "epoch": 1.3785963968808819, + "grad_norm": 0.034690070897340775, + "kl": 0.005870342254638672, + "learning_rate": 3.253426207077069e-06, + "loss": 0.0062, + "step": 818 + }, + { + "clip_ratio": 0.0002389855896467452, + "epoch": 1.3805084998954318, + "grad_norm": 0.034505974501371384, + "kl": 0.005900382995605469, + "learning_rate": 3.2474423498715772e-06, + "loss": 0.0061, + "step": 819 + }, + { + "clip_ratio": 0.000287152882663122, + "epoch": 1.3824206029099817, + "grad_norm": 0.03524321690201759, + "kl": 0.005913734436035156, + "learning_rate": 3.241453786485792e-06, + "loss": 0.0061, + "step": 820 + }, + { + "clip_ratio": 0.0, + "completion_length": 509.66520071029663, + "epoch": 1.3843327059245316, + "grad_norm": 0.039214182645082474, + "kl": 0.006892681121826172, + "learning_rate": 3.2354605546259777e-06, + "loss": 0.0032, + "num_tokens": 524677265.0, + "reward": 0.07979911041911691, + "reward_std": 0.07959878293331712, + "rewards/pure_accuracy_reward_math": 0.07979910867288709, + "step": 821 + }, + { + "clip_ratio": 0.0002965318878409562, + "epoch": 1.3862448089390815, + "grad_norm": 0.037640273571014404, + "kl": 0.0067348480224609375, + "learning_rate": 3.2294626920277928e-06, + "loss": 0.0031, + "step": 822 + }, + { + "clip_ratio": 0.00035153192868619954, + "epoch": 1.3881569119536314, + "grad_norm": 0.038182858377695084, + "kl": 0.006665706634521484, + "learning_rate": 3.2234602364560543e-06, + "loss": 0.0031, + "step": 823 + }, + { + "clip_ratio": 0.0003338070732752385, + "epoch": 1.3900690149681814, + "grad_norm": 0.038163840770721436, + "kl": 0.00667572021484375, + "learning_rate": 3.2174532257044957e-06, + "loss": 0.003, + "step": 824 + }, + { + "clip_ratio": 0.0003418834434683049, + "epoch": 1.3919811179827313, + "grad_norm": 0.03628409281373024, + "kl": 0.0067596435546875, + "learning_rate": 3.2114416975955347e-06, + "loss": 0.003, + "step": 825 + }, + { + "clip_ratio": 0.0, + "completion_length": 519.1027045249939, + "epoch": 1.3938932209972812, + "grad_norm": 0.037393856793642044, + "kl": 0.005987644195556641, + "learning_rate": 3.20542568998003e-06, + "loss": 0.0097, + "num_tokens": 528270425.0, + "reward": 0.07784598556463607, + "reward_std": 0.0774529695045203, + "rewards/pure_accuracy_reward_math": 0.07784598329453729, + "step": 826 + }, + { + "clip_ratio": 0.0002753000243274073, + "epoch": 1.395805324011831, + "grad_norm": 0.03632253408432007, + "kl": 0.00603485107421875, + "learning_rate": 3.199405240737045e-06, + "loss": 0.0097, + "step": 827 + }, + { + "clip_ratio": 0.00028145005671831314, + "epoch": 1.397717427026381, + "grad_norm": 0.035320475697517395, + "kl": 0.0060482025146484375, + "learning_rate": 3.1933803877736103e-06, + "loss": 0.0097, + "step": 828 + }, + { + "clip_ratio": 0.00029773840276448027, + "epoch": 1.399629530040931, + "grad_norm": 0.03532904013991356, + "kl": 0.006001472473144531, + "learning_rate": 3.187351169024483e-06, + "loss": 0.0096, + "step": 829 + }, + { + "clip_ratio": 0.0003131672060590063, + "epoch": 1.4015416330554809, + "grad_norm": 0.03497399017214775, + "kl": 0.0059299468994140625, + "learning_rate": 3.181317622451909e-06, + "loss": 0.0095, + "step": 830 + }, + { + "clip_ratio": 0.0, + "completion_length": 519.5547099113464, + "epoch": 1.4034537360700308, + "grad_norm": 0.03596203401684761, + "kl": 0.005957126617431641, + "learning_rate": 3.1752797860453854e-06, + "loss": 0.0099, + "num_tokens": 531863545.0, + "reward": 0.06584821754950099, + "reward_std": 0.07359298237133771, + "rewards/pure_accuracy_reward_math": 0.06584821580327116, + "step": 831 + }, + { + "clip_ratio": 0.0002871401754873659, + "epoch": 1.4053658390845807, + "grad_norm": 0.03569914028048515, + "kl": 0.005918025970458984, + "learning_rate": 3.169237697821417e-06, + "loss": 0.0099, + "step": 832 + }, + { + "clip_ratio": 0.0002649255456503852, + "epoch": 1.4072779420991306, + "grad_norm": 0.035189539194107056, + "kl": 0.005944252014160156, + "learning_rate": 3.163191395823281e-06, + "loss": 0.0098, + "step": 833 + }, + { + "clip_ratio": 0.0002522150609252094, + "epoch": 1.4091900451136805, + "grad_norm": 0.03371162712574005, + "kl": 0.006028652191162109, + "learning_rate": 3.1571409181207867e-06, + "loss": 0.0098, + "step": 834 + }, + { + "clip_ratio": 0.00028182740913962334, + "epoch": 1.4111021481282304, + "grad_norm": 0.03411802276968956, + "kl": 0.006129264831542969, + "learning_rate": 3.151086302810035e-06, + "loss": 0.0097, + "step": 835 + }, + { + "clip_ratio": 0.0, + "completion_length": 509.0455017089844, + "epoch": 1.4130142511427803, + "grad_norm": 0.042647283524274826, + "kl": 0.006505012512207031, + "learning_rate": 3.1450275880131782e-06, + "loss": 0.0051, + "num_tokens": 535420068.0, + "reward": 0.06919643201399595, + "reward_std": 0.06989945442182943, + "rewards/pure_accuracy_reward_math": 0.06919642980210483, + "step": 836 + }, + { + "clip_ratio": 0.0002792542761653749, + "epoch": 1.4149263541573303, + "grad_norm": 0.03879564628005028, + "kl": 0.006262302398681641, + "learning_rate": 3.1389648118781795e-06, + "loss": 0.0051, + "step": 837 + }, + { + "clip_ratio": 0.00032867032479089175, + "epoch": 1.4168384571718802, + "grad_norm": 0.03632555902004242, + "kl": 0.006078004837036133, + "learning_rate": 3.132898012578577e-06, + "loss": 0.005, + "step": 838 + }, + { + "clip_ratio": 0.0003705890379706034, + "epoch": 1.41875056018643, + "grad_norm": 0.03687159717082977, + "kl": 0.0058705806732177734, + "learning_rate": 3.1268272283132374e-06, + "loss": 0.005, + "step": 839 + }, + { + "clip_ratio": 0.00039090512018447043, + "epoch": 1.42066266320098, + "grad_norm": 0.03681857883930206, + "kl": 0.005755186080932617, + "learning_rate": 3.1207524973061183e-06, + "loss": 0.0049, + "step": 840 + }, + { + "clip_ratio": 0.0, + "completion_length": 528.0865178108215, + "epoch": 1.42257476621553, + "grad_norm": 0.077212393283844, + "kl": 0.006708621978759766, + "learning_rate": 3.1146738578060293e-06, + "loss": 0.0034, + "num_tokens": 539042994.0, + "reward": 0.05468750235741027, + "reward_std": 0.06221334764268249, + "rewards/pure_accuracy_reward_math": 0.05468750130967237, + "step": 841 + }, + { + "clip_ratio": 0.00023407521496210393, + "epoch": 1.4244868692300798, + "grad_norm": 0.03766750544309616, + "kl": 0.005887508392333984, + "learning_rate": 3.108591348086388e-06, + "loss": 0.0034, + "step": 842 + }, + { + "clip_ratio": 0.00021864835269980176, + "epoch": 1.4263989722446297, + "grad_norm": 0.03435171768069267, + "kl": 0.0057353973388671875, + "learning_rate": 3.102505006444981e-06, + "loss": 0.0033, + "step": 843 + }, + { + "clip_ratio": 0.0002327330819866802, + "epoch": 1.4283110752591797, + "grad_norm": 0.03385370597243309, + "kl": 0.005730628967285156, + "learning_rate": 3.096414871203721e-06, + "loss": 0.0033, + "step": 844 + }, + { + "clip_ratio": 0.00025595308994752486, + "epoch": 1.4302231782737296, + "grad_norm": 0.0320701077580452, + "kl": 0.005660533905029297, + "learning_rate": 3.0903209807084085e-06, + "loss": 0.0032, + "step": 845 + }, + { + "clip_ratio": 0.0, + "completion_length": 532.2009177207947, + "epoch": 1.4321352812882795, + "grad_norm": 0.035687774419784546, + "kl": 0.006323099136352539, + "learning_rate": 3.0842233733284866e-06, + "loss": 0.0055, + "num_tokens": 542686090.0, + "reward": 0.06389509252039716, + "reward_std": 0.06839800346642733, + "rewards/pure_accuracy_reward_math": 0.06389509059954435, + "step": 846 + }, + { + "clip_ratio": 0.0002455309293054597, + "epoch": 1.4340473843028292, + "grad_norm": 0.03433489799499512, + "kl": 0.006294965744018555, + "learning_rate": 3.078122087456802e-06, + "loss": 0.0055, + "step": 847 + }, + { + "clip_ratio": 0.0003179283777399178, + "epoch": 1.4359594873173793, + "grad_norm": 0.03377856686711311, + "kl": 0.00630497932434082, + "learning_rate": 3.072017161509364e-06, + "loss": 0.0054, + "step": 848 + }, + { + "clip_ratio": 0.00030606188772708265, + "epoch": 1.437871590331929, + "grad_norm": 0.03379327058792114, + "kl": 0.006325483322143555, + "learning_rate": 3.065908633925099e-06, + "loss": 0.0054, + "step": 849 + }, + { + "clip_ratio": 0.00029904921905199444, + "epoch": 1.4397836933464792, + "grad_norm": 0.03319833427667618, + "kl": 0.006340742111206055, + "learning_rate": 3.0597965431656125e-06, + "loss": 0.0053, + "step": 850 + }, + { + "clip_ratio": 0.0, + "completion_length": 520.9991841316223, + "epoch": 1.00191210301455, + "grad_norm": 0.03730909898877144, + "kl": 0.005851268768310547, + "learning_rate": 3.0536809277149433e-06, + "loss": 0.0058, + "num_tokens": 3602593.0, + "reward": 0.061662948777666315, + "reward_std": 0.0712745109340176, + "rewards/pure_accuracy_reward_math": 0.06166294767172076, + "step": 851 + }, + { + "clip_ratio": 0.0002445870232463676, + "epoch": 1.0038242060290998, + "grad_norm": 0.036420926451683044, + "kl": 0.005807399749755859, + "learning_rate": 3.047561826079324e-06, + "loss": 0.0057, + "step": 852 + }, + { + "clip_ratio": 0.0002342841784184202, + "epoch": 1.0057363090436497, + "grad_norm": 0.03534744307398796, + "kl": 0.005809783935546875, + "learning_rate": 3.041439276786937e-06, + "loss": 0.0057, + "step": 853 + }, + { + "clip_ratio": 0.0003130897791834286, + "epoch": 1.0076484120581997, + "grad_norm": 0.03456578403711319, + "kl": 0.005836963653564453, + "learning_rate": 3.0353133183876745e-06, + "loss": 0.0056, + "step": 854 + }, + { + "clip_ratio": 0.0003235736477336104, + "epoch": 1.0095605150727496, + "grad_norm": 0.03683493658900261, + "kl": 0.00588226318359375, + "learning_rate": 3.0291839894528907e-06, + "loss": 0.0056, + "step": 855 + }, + { + "clip_ratio": 0.0, + "completion_length": 529.2422127723694, + "epoch": 1.0114726180872995, + "grad_norm": 3.6328346729278564, + "kl": 0.07409882545471191, + "learning_rate": 3.023051328575164e-06, + "loss": 0.0092, + "num_tokens": 7231613.0, + "reward": 0.06696428847499192, + "reward_std": 0.07320140569936484, + "rewards/pure_accuracy_reward_math": 0.06696428725263104, + "step": 856 + }, + { + "clip_ratio": 0.0002944787788692338, + "epoch": 1.0133847211018494, + "grad_norm": 0.23805810511112213, + "kl": 0.01258087158203125, + "learning_rate": 3.016915374368052e-06, + "loss": 0.0068, + "step": 857 + }, + { + "clip_ratio": 0.000328014534943577, + "epoch": 1.0152968241163993, + "grad_norm": 0.038860052824020386, + "kl": 0.008163928985595703, + "learning_rate": 3.0107761654658464e-06, + "loss": 0.0066, + "step": 858 + }, + { + "clip_ratio": 0.00033978425187797257, + "epoch": 1.0172089271309492, + "grad_norm": 0.037539608776569366, + "kl": 0.008237600326538086, + "learning_rate": 3.0046337405233334e-06, + "loss": 0.0065, + "step": 859 + }, + { + "clip_ratio": 0.0003289994185706746, + "epoch": 1.0191210301454992, + "grad_norm": 0.03649570420384407, + "kl": 0.008342981338500977, + "learning_rate": 2.9984881382155484e-06, + "loss": 0.0065, + "step": 860 + }, + { + "clip_ratio": 0.0, + "completion_length": 539.7709541320801, + "epoch": 1.021033133160049, + "grad_norm": 0.03506062552332878, + "kl": 0.0056056976318359375, + "learning_rate": 2.9923393972375337e-06, + "loss": 0.0075, + "num_tokens": 10898500.0, + "reward": 0.06389509155997075, + "reward_std": 0.07427741104038432, + "rewards/pure_accuracy_reward_math": 0.06389509086147882, + "step": 861 + }, + { + "clip_ratio": 0.00025894983372154456, + "epoch": 1.022945236174599, + "grad_norm": 0.03387964144349098, + "kl": 0.005673408508300781, + "learning_rate": 2.986187556304091e-06, + "loss": 0.0075, + "step": 862 + }, + { + "clip_ratio": 0.00026048227840647087, + "epoch": 1.024857339189149, + "grad_norm": 0.0339200459420681, + "kl": 0.005715370178222656, + "learning_rate": 2.9800326541495427e-06, + "loss": 0.0074, + "step": 863 + }, + { + "clip_ratio": 0.000286817725225319, + "epoch": 1.0267694422036988, + "grad_norm": 0.033578090369701385, + "kl": 0.0057220458984375, + "learning_rate": 2.973874729527486e-06, + "loss": 0.0074, + "step": 864 + }, + { + "clip_ratio": 0.00031288620994018856, + "epoch": 1.0286815452182487, + "grad_norm": 0.03253786265850067, + "kl": 0.005726814270019531, + "learning_rate": 2.967713821210547e-06, + "loss": 0.0073, + "step": 865 + }, + { + "clip_ratio": 0.0, + "completion_length": 532.484959602356, + "epoch": 1.0305936482327986, + "grad_norm": 0.040393006056547165, + "kl": 0.005712032318115234, + "learning_rate": 2.961549967990139e-06, + "loss": 0.0094, + "num_tokens": 14539070.0, + "reward": 0.0700334852153901, + "reward_std": 0.07968511193757877, + "rewards/pure_accuracy_reward_math": 0.07003348364378326, + "step": 866 + }, + { + "clip_ratio": 0.00034418605622477116, + "epoch": 1.0325057512473486, + "grad_norm": 0.03829828277230263, + "kl": 0.00571441650390625, + "learning_rate": 2.95538320867622e-06, + "loss": 0.0094, + "step": 867 + }, + { + "clip_ratio": 0.0003270462358386794, + "epoch": 1.0344178542618985, + "grad_norm": 0.03763904795050621, + "kl": 0.005820035934448242, + "learning_rate": 2.949213582097042e-06, + "loss": 0.0094, + "step": 868 + }, + { + "clip_ratio": 0.00039861036464117205, + "epoch": 1.0363299572764482, + "grad_norm": 0.03893045708537102, + "kl": 0.005897045135498047, + "learning_rate": 2.9430411270989112e-06, + "loss": 0.0093, + "step": 869 + }, + { + "clip_ratio": 0.0004073582798014286, + "epoch": 1.038242060290998, + "grad_norm": 0.03808417171239853, + "kl": 0.0059051513671875, + "learning_rate": 2.9368658825459452e-06, + "loss": 0.0092, + "step": 870 + }, + { + "clip_ratio": 0.0, + "completion_length": 518.7159852981567, + "epoch": 1.040154163305548, + "grad_norm": 0.03680076450109482, + "kl": 0.006183147430419922, + "learning_rate": 2.9306878873198227e-06, + "loss": 0.0073, + "num_tokens": 18123716.0, + "reward": 0.06975446810247377, + "reward_std": 0.07255704078124836, + "rewards/pure_accuracy_reward_math": 0.06975446600699797, + "step": 871 + }, + { + "clip_ratio": 0.00025267474336487794, + "epoch": 1.042066266320098, + "grad_norm": 0.036574870347976685, + "kl": 0.006196498870849609, + "learning_rate": 2.9245071803195435e-06, + "loss": 0.0072, + "step": 872 + }, + { + "clip_ratio": 0.0002888958638322947, + "epoch": 1.0439783693346478, + "grad_norm": 0.03539302200078964, + "kl": 0.006276130676269531, + "learning_rate": 2.9183238004611815e-06, + "loss": 0.0072, + "step": 873 + }, + { + "clip_ratio": 0.00027933804358326597, + "epoch": 1.0458904723491977, + "grad_norm": 0.03457676246762276, + "kl": 0.00629425048828125, + "learning_rate": 2.912137786677639e-06, + "loss": 0.0071, + "step": 874 + }, + { + "clip_ratio": 0.00026495220328115465, + "epoch": 1.0478025753637477, + "grad_norm": 0.034882258623838425, + "kl": 0.006371974945068359, + "learning_rate": 2.905949177918403e-06, + "loss": 0.0071, + "step": 875 + }, + { + "clip_ratio": 0.0, + "completion_length": 516.4989104270935, + "epoch": 1.0497146783782976, + "grad_norm": 0.04403652995824814, + "kl": 0.0064754486083984375, + "learning_rate": 2.8997580131493004e-06, + "loss": 0.0104, + "num_tokens": 21706672.0, + "reward": 0.07421875311410986, + "reward_std": 0.08282060426427051, + "rewards/pure_accuracy_reward_math": 0.07421875130967237, + "step": 876 + }, + { + "clip_ratio": 0.00034863107299543117, + "epoch": 1.0516267813928475, + "grad_norm": 0.040730468928813934, + "kl": 0.006359100341796875, + "learning_rate": 2.89356433135225e-06, + "loss": 0.0104, + "step": 877 + }, + { + "clip_ratio": 0.0003696895219036378, + "epoch": 1.0535388844073974, + "grad_norm": 0.040028344839811325, + "kl": 0.006321430206298828, + "learning_rate": 2.8873681715250197e-06, + "loss": 0.0104, + "step": 878 + }, + { + "clip_ratio": 0.00041197048278718285, + "epoch": 1.0554509874219473, + "grad_norm": 0.04009086638689041, + "kl": 0.0062351226806640625, + "learning_rate": 2.881169572680981e-06, + "loss": 0.0103, + "step": 879 + }, + { + "clip_ratio": 0.0004460485272943515, + "epoch": 1.0573630904364972, + "grad_norm": 0.03965138643980026, + "kl": 0.006242275238037109, + "learning_rate": 2.87496857384886e-06, + "loss": 0.0102, + "step": 880 + }, + { + "clip_ratio": 0.0, + "completion_length": 524.4285945892334, + "epoch": 1.0592751934510471, + "grad_norm": 0.03920762613415718, + "kl": 0.005979061126708984, + "learning_rate": 2.868765214072495e-06, + "loss": 0.0082, + "num_tokens": 25317588.0, + "reward": 0.07338170023285784, + "reward_std": 0.0805021328269504, + "rewards/pure_accuracy_reward_math": 0.07338169755530544, + "step": 881 + }, + { + "clip_ratio": 0.0003169273815046836, + "epoch": 1.061187296465597, + "grad_norm": 0.03858224302530289, + "kl": 0.006028175354003906, + "learning_rate": 2.8625595324105925e-06, + "loss": 0.0082, + "step": 882 + }, + { + "clip_ratio": 0.0003076135093351695, + "epoch": 1.063099399480147, + "grad_norm": 0.03754101321101189, + "kl": 0.006089687347412109, + "learning_rate": 2.8563515679364733e-06, + "loss": 0.0081, + "step": 883 + }, + { + "clip_ratio": 0.0003307215861809709, + "epoch": 1.065011502494697, + "grad_norm": 0.03692120686173439, + "kl": 0.006084442138671875, + "learning_rate": 2.850141359737836e-06, + "loss": 0.008, + "step": 884 + }, + { + "clip_ratio": 0.0003362660154380137, + "epoch": 1.0669236055092468, + "grad_norm": 0.03691774606704712, + "kl": 0.006087303161621094, + "learning_rate": 2.843928946916504e-06, + "loss": 0.008, + "step": 885 + }, + { + "clip_ratio": 0.0, + "completion_length": 541.91938829422, + "epoch": 1.0688357085237967, + "grad_norm": 0.03421162813901901, + "kl": 0.005934238433837891, + "learning_rate": 2.8377143685881835e-06, + "loss": 0.0048, + "num_tokens": 28991667.0, + "reward": 0.06138393090805039, + "reward_std": 0.05770279868738726, + "rewards/pure_accuracy_reward_math": 0.06138392991852015, + "step": 886 + }, + { + "clip_ratio": 0.00021627708133564738, + "epoch": 1.0707478115383466, + "grad_norm": 0.0331665463745594, + "kl": 0.005833148956298828, + "learning_rate": 2.8314976638822145e-06, + "loss": 0.0048, + "step": 887 + }, + { + "clip_ratio": 0.00023772416773226723, + "epoch": 1.0726599145528966, + "grad_norm": 0.03265010192990303, + "kl": 0.00572967529296875, + "learning_rate": 2.825278871941325e-06, + "loss": 0.0048, + "step": 888 + }, + { + "clip_ratio": 0.000255867875353033, + "epoch": 1.0745720175674465, + "grad_norm": 0.031934551894664764, + "kl": 0.0056514739990234375, + "learning_rate": 2.819058031921387e-06, + "loss": 0.0047, + "step": 889 + }, + { + "clip_ratio": 0.0002752940895334177, + "epoch": 1.0764841205819964, + "grad_norm": 0.03180062025785446, + "kl": 0.005589008331298828, + "learning_rate": 2.812835182991166e-06, + "loss": 0.0047, + "step": 890 + }, + { + "clip_ratio": 0.0, + "completion_length": 541.6253051757812, + "epoch": 1.0783962235965463, + "grad_norm": 0.0352044515311718, + "kl": 0.006504535675048828, + "learning_rate": 2.8066103643320774e-06, + "loss": 0.005, + "num_tokens": 32662984.0, + "reward": 0.07003348544822074, + "reward_std": 0.07148103549843654, + "rewards/pure_accuracy_reward_math": 0.07003348341095261, + "step": 891 + }, + { + "clip_ratio": 0.0002908879878305015, + "epoch": 1.0803083266110962, + "grad_norm": 0.03477974981069565, + "kl": 0.006473064422607422, + "learning_rate": 2.800383615137939e-06, + "loss": 0.0049, + "step": 892 + }, + { + "clip_ratio": 0.00027559091887496834, + "epoch": 1.0822204296256461, + "grad_norm": 0.03371204808354378, + "kl": 0.006519317626953125, + "learning_rate": 2.7941549746147234e-06, + "loss": 0.0049, + "step": 893 + }, + { + "clip_ratio": 0.00026331023877901316, + "epoch": 1.084132532640196, + "grad_norm": 0.03233867511153221, + "kl": 0.00655364990234375, + "learning_rate": 2.7879244819803104e-06, + "loss": 0.0048, + "step": 894 + }, + { + "clip_ratio": 0.0003059378379361988, + "epoch": 1.086044635654746, + "grad_norm": 0.032591916620731354, + "kl": 0.006562709808349609, + "learning_rate": 2.781692176464244e-06, + "loss": 0.0048, + "step": 895 + }, + { + "clip_ratio": 0.0, + "completion_length": 538.9467296600342, + "epoch": 1.0879567386692959, + "grad_norm": 0.0399605967104435, + "kl": 0.007935047149658203, + "learning_rate": 2.7754580973074817e-06, + "loss": 0.0078, + "num_tokens": 36327265.0, + "reward": 0.06640625328873284, + "reward_std": 0.07582512497901917, + "rewards/pure_accuracy_reward_math": 0.06640625142608769, + "step": 896 + }, + { + "clip_ratio": 0.00029080147635340836, + "epoch": 1.0898688416838458, + "grad_norm": 0.036669787019491196, + "kl": 0.007892131805419922, + "learning_rate": 2.769222283762148e-06, + "loss": 0.0077, + "step": 897 + }, + { + "clip_ratio": 0.0003202801690349588, + "epoch": 1.0917809446983957, + "grad_norm": 0.036093369126319885, + "kl": 0.007870197296142578, + "learning_rate": 2.7629847750912885e-06, + "loss": 0.0077, + "step": 898 + }, + { + "clip_ratio": 0.00034906711715620986, + "epoch": 1.0936930477129456, + "grad_norm": 0.036899976432323456, + "kl": 0.007824897766113281, + "learning_rate": 2.756745610568622e-06, + "loss": 0.0076, + "step": 899 + }, + { + "clip_ratio": 0.0003909627172333785, + "epoch": 1.0956051507274955, + "grad_norm": 0.03607386723160744, + "kl": 0.00782632827758789, + "learning_rate": 2.7505048294782914e-06, + "loss": 0.0076, + "step": 900 + }, + { + "clip_ratio": 0.0, + "completion_length": 519.9687776565552, + "epoch": 1.0975172537420455, + "grad_norm": 0.04138408601284027, + "kl": 0.006854534149169922, + "learning_rate": 2.7442624711146206e-06, + "loss": 0.0105, + "num_tokens": 39926261.0, + "reward": 0.07561384263681248, + "reward_std": 0.08660046180011705, + "rewards/pure_accuracy_reward_math": 0.07561384089058265, + "step": 901 + }, + { + "clip_ratio": 0.0003407098130878694, + "epoch": 1.0994293567565951, + "grad_norm": 0.04008745029568672, + "kl": 0.006922245025634766, + "learning_rate": 2.7380185747818628e-06, + "loss": 0.0105, + "step": 902 + }, + { + "clip_ratio": 0.0003345158028196238, + "epoch": 1.1013414597711453, + "grad_norm": 0.039206936955451965, + "kl": 0.006981372833251953, + "learning_rate": 2.7317731797939566e-06, + "loss": 0.0104, + "step": 903 + }, + { + "clip_ratio": 0.0003512224284918375, + "epoch": 1.103253562785695, + "grad_norm": 0.03816502168774605, + "kl": 0.006984233856201172, + "learning_rate": 2.7255263254742746e-06, + "loss": 0.0103, + "step": 904 + }, + { + "clip_ratio": 0.00038539456500075175, + "epoch": 1.105165665800245, + "grad_norm": 0.03802499175071716, + "kl": 0.006890773773193359, + "learning_rate": 2.71927805115538e-06, + "loss": 0.0103, + "step": 905 + }, + { + "clip_ratio": 0.0, + "completion_length": 522.6635279655457, + "epoch": 1.1070777688147948, + "grad_norm": 0.03780652955174446, + "kl": 0.005947589874267578, + "learning_rate": 2.713028396178776e-06, + "loss": 0.0044, + "num_tokens": 43530039.0, + "reward": 0.0691964318684768, + "reward_std": 0.0774129043566063, + "rewards/pure_accuracy_reward_math": 0.06919642988941632, + "step": 906 + }, + { + "clip_ratio": 0.0002883933650537074, + "epoch": 1.1089898718293447, + "grad_norm": 0.03706151619553566, + "kl": 0.005948543548583984, + "learning_rate": 2.706777399894656e-06, + "loss": 0.0044, + "step": 907 + }, + { + "clip_ratio": 0.0003032470573316459, + "epoch": 1.1109019748438946, + "grad_norm": 0.03684515878558159, + "kl": 0.005936622619628906, + "learning_rate": 2.700525101661665e-06, + "loss": 0.0044, + "step": 908 + }, + { + "clip_ratio": 0.0003385747261290817, + "epoch": 1.1128140778584446, + "grad_norm": 0.03632361814379692, + "kl": 0.005986690521240234, + "learning_rate": 2.6942715408466406e-06, + "loss": 0.0043, + "step": 909 + }, + { + "clip_ratio": 0.00035084231319615355, + "epoch": 1.1147261808729945, + "grad_norm": 0.0364714041352272, + "kl": 0.005983829498291016, + "learning_rate": 2.6880167568243716e-06, + "loss": 0.0042, + "step": 910 + }, + { + "clip_ratio": 0.0, + "completion_length": 524.6629705429077, + "epoch": 1.1166382838875444, + "grad_norm": 0.037073228508234024, + "kl": 0.006183624267578125, + "learning_rate": 2.681760788977349e-06, + "loss": 0.0075, + "num_tokens": 47140667.0, + "reward": 0.06166294956346974, + "reward_std": 0.07140090485336259, + "rewards/pure_accuracy_reward_math": 0.061662947526201606, + "step": 911 + }, + { + "clip_ratio": 0.00026335007953548484, + "epoch": 1.1185503869020943, + "grad_norm": 0.03628791868686676, + "kl": 0.006221771240234375, + "learning_rate": 2.6755036766955172e-06, + "loss": 0.0075, + "step": 912 + }, + { + "clip_ratio": 0.00029098790395210017, + "epoch": 1.1204624899166442, + "grad_norm": 0.03659017011523247, + "kl": 0.006258964538574219, + "learning_rate": 2.6692454593760255e-06, + "loss": 0.0075, + "step": 913 + }, + { + "clip_ratio": 0.00033703100632465066, + "epoch": 1.1223745929311941, + "grad_norm": 0.0357106551527977, + "kl": 0.006211757659912109, + "learning_rate": 2.6629861764229824e-06, + "loss": 0.0074, + "step": 914 + }, + { + "clip_ratio": 0.0003104925490902133, + "epoch": 1.124286695945744, + "grad_norm": 0.03461490571498871, + "kl": 0.006183624267578125, + "learning_rate": 2.6567258672472064e-06, + "loss": 0.0073, + "step": 915 + }, + { + "clip_ratio": 0.0, + "completion_length": 519.3962297439575, + "epoch": 1.126198798960294, + "grad_norm": 0.038919847458601, + "kl": 0.0060977935791015625, + "learning_rate": 2.650464571265975e-06, + "loss": 0.0062, + "num_tokens": 50733111.0, + "reward": 0.06584821734577417, + "reward_std": 0.07367311330744997, + "rewards/pure_accuracy_reward_math": 0.06584821583237499, + "step": 916 + }, + { + "clip_ratio": 0.0002951280029606096, + "epoch": 1.1281109019748439, + "grad_norm": 0.038201622664928436, + "kl": 0.0060329437255859375, + "learning_rate": 2.6442023279027805e-06, + "loss": 0.0061, + "step": 917 + }, + { + "clip_ratio": 0.00029004437487856194, + "epoch": 1.1300230049893938, + "grad_norm": 0.03696547448635101, + "kl": 0.006039619445800781, + "learning_rate": 2.6379391765870828e-06, + "loss": 0.0061, + "step": 918 + }, + { + "clip_ratio": 0.0003163389113183257, + "epoch": 1.1319351080039437, + "grad_norm": 0.03571280464529991, + "kl": 0.006005764007568359, + "learning_rate": 2.6316751567540527e-06, + "loss": 0.006, + "step": 919 + }, + { + "clip_ratio": 0.0003592208154259424, + "epoch": 1.1338472110184936, + "grad_norm": 0.03568287193775177, + "kl": 0.005993366241455078, + "learning_rate": 2.625410307844335e-06, + "loss": 0.006, + "step": 920 + }, + { + "clip_ratio": 0.0, + "completion_length": 538.2659268379211, + "epoch": 1.1357593140330435, + "grad_norm": 0.03899242356419563, + "kl": 0.005813121795654297, + "learning_rate": 2.6191446693037924e-06, + "loss": 0.0071, + "num_tokens": 54398312.0, + "reward": 0.07226562857977115, + "reward_std": 0.07861530320951715, + "rewards/pure_accuracy_reward_math": 0.07226562648429535, + "step": 921 + }, + { + "clip_ratio": 0.00029711308371815903, + "epoch": 1.1376714170475934, + "grad_norm": 0.038164544850587845, + "kl": 0.0058841705322265625, + "learning_rate": 2.6128782805832605e-06, + "loss": 0.0071, + "step": 922 + }, + { + "clip_ratio": 0.0003027216810664868, + "epoch": 1.1395835200621434, + "grad_norm": 0.03706645965576172, + "kl": 0.005882740020751953, + "learning_rate": 2.606611181138295e-06, + "loss": 0.007, + "step": 923 + }, + { + "clip_ratio": 0.00032618250162386175, + "epoch": 1.1414956230766933, + "grad_norm": 0.036637816578149796, + "kl": 0.005909442901611328, + "learning_rate": 2.600343410428931e-06, + "loss": 0.007, + "step": 924 + }, + { + "clip_ratio": 0.00032713054685018506, + "epoch": 1.1434077260912432, + "grad_norm": 0.036758605390787125, + "kl": 0.005947589874267578, + "learning_rate": 2.5940750079194275e-06, + "loss": 0.0069, + "step": 925 + }, + { + "clip_ratio": 0.0, + "completion_length": 542.0072803497314, + "epoch": 1.145319829105793, + "grad_norm": 0.03791532665491104, + "kl": 0.0061702728271484375, + "learning_rate": 2.5878060130780225e-06, + "loss": 0.0074, + "num_tokens": 58073722.0, + "reward": 0.06835937863797881, + "reward_std": 0.07715391897363588, + "rewards/pure_accuracy_reward_math": 0.06835937636788003, + "step": 926 + }, + { + "clip_ratio": 0.00030884258325158953, + "epoch": 1.147231932120343, + "grad_norm": 0.03749171644449234, + "kl": 0.006160736083984375, + "learning_rate": 2.581536465376684e-06, + "loss": 0.0074, + "step": 927 + }, + { + "clip_ratio": 0.000279198229350186, + "epoch": 1.149144035134893, + "grad_norm": 0.03681938722729683, + "kl": 0.006136417388916016, + "learning_rate": 2.575266404290859e-06, + "loss": 0.0073, + "step": 928 + }, + { + "clip_ratio": 0.0002930849948370451, + "epoch": 1.1510561381494429, + "grad_norm": 0.035750068724155426, + "kl": 0.006227970123291016, + "learning_rate": 2.5689958692992284e-06, + "loss": 0.0072, + "step": 929 + }, + { + "clip_ratio": 0.00028936977611238035, + "epoch": 1.1529682411639928, + "grad_norm": 0.03503425419330597, + "kl": 0.006281375885009766, + "learning_rate": 2.562724899883458e-06, + "loss": 0.0072, + "step": 930 + }, + { + "clip_ratio": 0.0, + "completion_length": 531.6188879013062, + "epoch": 1.1548803441785427, + "grad_norm": 0.05187267065048218, + "kl": 0.007277965545654297, + "learning_rate": 2.5564535355279464e-06, + "loss": 0.0072, + "num_tokens": 61714268.0, + "reward": 0.07505580713041127, + "reward_std": 0.08531173289520666, + "rewards/pure_accuracy_reward_math": 0.07505580491852015, + "step": 931 + }, + { + "clip_ratio": 0.00033635866333270314, + "epoch": 1.1567924471930926, + "grad_norm": 0.039655230939388275, + "kl": 0.0072231292724609375, + "learning_rate": 2.550181815719581e-06, + "loss": 0.0072, + "step": 932 + }, + { + "clip_ratio": 0.00035109808851530033, + "epoch": 1.1587045502076425, + "grad_norm": 0.038757406175136566, + "kl": 0.007157802581787109, + "learning_rate": 2.5439097799474867e-06, + "loss": 0.0072, + "step": 933 + }, + { + "clip_ratio": 0.00037538493586453114, + "epoch": 1.1606166532221924, + "grad_norm": 0.03841486573219299, + "kl": 0.007115840911865234, + "learning_rate": 2.537637467702777e-06, + "loss": 0.0071, + "step": 934 + }, + { + "clip_ratio": 0.0003936579208243529, + "epoch": 1.1625287562367423, + "grad_norm": 0.038453541696071625, + "kl": 0.0070896148681640625, + "learning_rate": 2.531364918478308e-06, + "loss": 0.007, + "step": 935 + }, + { + "clip_ratio": 0.0, + "completion_length": 547.6250252723694, + "epoch": 1.1644408592512923, + "grad_norm": 0.03738933801651001, + "kl": 0.00615692138671875, + "learning_rate": 2.5250921717684247e-06, + "loss": 0.0061, + "num_tokens": 65415044.0, + "reward": 0.07561384260770865, + "reward_std": 0.07745296956272796, + "rewards/pure_accuracy_reward_math": 0.07561384062864818, + "step": 936 + }, + { + "clip_ratio": 0.0002929231292227996, + "epoch": 1.166352962265842, + "grad_norm": 0.03690778836607933, + "kl": 0.006189823150634766, + "learning_rate": 2.5188192670687186e-06, + "loss": 0.0061, + "step": 937 + }, + { + "clip_ratio": 0.000294325235870474, + "epoch": 1.168265065280392, + "grad_norm": 0.03613179549574852, + "kl": 0.006130695343017578, + "learning_rate": 2.512546243875776e-06, + "loss": 0.0061, + "step": 938 + }, + { + "clip_ratio": 0.00031920797795237377, + "epoch": 1.1701771682949418, + "grad_norm": 0.03461304306983948, + "kl": 0.006014347076416016, + "learning_rate": 2.5062731416869267e-06, + "loss": 0.006, + "step": 939 + }, + { + "clip_ratio": 0.00037188214912475814, + "epoch": 1.172089271309492, + "grad_norm": 0.03454398363828659, + "kl": 0.005980968475341797, + "learning_rate": 2.5e-06, + "loss": 0.0059, + "step": 940 + }, + { + "clip_ratio": 0.0, + "completion_length": 532.1423244476318, + "epoch": 1.1740013743240416, + "grad_norm": 0.03934042155742645, + "kl": 0.006266117095947266, + "learning_rate": 2.493726858313074e-06, + "loss": 0.0078, + "num_tokens": 69057654.0, + "reward": 0.07477678928989917, + "reward_std": 0.08299326134147123, + "rewards/pure_accuracy_reward_math": 0.07477678690338507, + "step": 941 + }, + { + "clip_ratio": 0.00031629414758072016, + "epoch": 1.1759134773385915, + "grad_norm": 0.03872406855225563, + "kl": 0.0062713623046875, + "learning_rate": 2.4874537561242253e-06, + "loss": 0.0078, + "step": 942 + }, + { + "clip_ratio": 0.0003434862284166229, + "epoch": 1.1778255803531414, + "grad_norm": 0.03723340108990669, + "kl": 0.00623321533203125, + "learning_rate": 2.481180732931282e-06, + "loss": 0.0077, + "step": 943 + }, + { + "clip_ratio": 0.00034986940886483353, + "epoch": 1.1797376833676914, + "grad_norm": 0.03732794523239136, + "kl": 0.006276607513427734, + "learning_rate": 2.4749078282315757e-06, + "loss": 0.0076, + "step": 944 + }, + { + "clip_ratio": 0.0003579597876637308, + "epoch": 1.1816497863822413, + "grad_norm": 0.03668594732880592, + "kl": 0.006198883056640625, + "learning_rate": 2.468635081521693e-06, + "loss": 0.0076, + "step": 945 + }, + { + "clip_ratio": 0.0, + "completion_length": 528.1718993186951, + "epoch": 1.1835618893967912, + "grad_norm": 0.03715552017092705, + "kl": 0.006759166717529297, + "learning_rate": 2.462362532297224e-06, + "loss": 0.0079, + "num_tokens": 72682654.0, + "reward": 0.06891741449362598, + "reward_std": 0.08248148870188743, + "rewards/pure_accuracy_reward_math": 0.06891741199069656, + "step": 946 + }, + { + "clip_ratio": 0.0003075862115053951, + "epoch": 1.185473992411341, + "grad_norm": 0.03616279736161232, + "kl": 0.006741523742675781, + "learning_rate": 2.456090220052514e-06, + "loss": 0.0079, + "step": 947 + }, + { + "clip_ratio": 0.00027696539024191225, + "epoch": 1.187386095425891, + "grad_norm": 0.03556762635707855, + "kl": 0.006789684295654297, + "learning_rate": 2.44981818428042e-06, + "loss": 0.0079, + "step": 948 + }, + { + "clip_ratio": 0.0002739789470638243, + "epoch": 1.189298198440441, + "grad_norm": 0.03486724570393562, + "kl": 0.006869316101074219, + "learning_rate": 2.4435464644720544e-06, + "loss": 0.0078, + "step": 949 + }, + { + "clip_ratio": 0.00031816330425726846, + "epoch": 1.1912103014549908, + "grad_norm": 0.03446395695209503, + "kl": 0.006869316101074219, + "learning_rate": 2.4372751001165427e-06, + "loss": 0.0077, + "step": 950 + }, + { + "clip_ratio": 0.0, + "completion_length": 528.6573901176453, + "epoch": 1.1931224044695408, + "grad_norm": 0.03734345734119415, + "kl": 0.006131649017333984, + "learning_rate": 2.4310041307007716e-06, + "loss": 0.0062, + "num_tokens": 76305578.0, + "reward": 0.07114955657743849, + "reward_std": 0.07526708883233368, + "rewards/pure_accuracy_reward_math": 0.07114955488941632, + "step": 951 + }, + { + "clip_ratio": 0.00029005661951941875, + "epoch": 1.1950345074840907, + "grad_norm": 0.036443449556827545, + "kl": 0.006079196929931641, + "learning_rate": 2.4247335957091418e-06, + "loss": 0.0062, + "step": 952 + }, + { + "clip_ratio": 0.0002579906781647878, + "epoch": 1.1969466104986406, + "grad_norm": 0.034940823912620544, + "kl": 0.006037235260009766, + "learning_rate": 2.4184635346233166e-06, + "loss": 0.0061, + "step": 953 + }, + { + "clip_ratio": 0.00032199256943954424, + "epoch": 1.1988587135131905, + "grad_norm": 0.03445851802825928, + "kl": 0.006024360656738281, + "learning_rate": 2.4121939869219784e-06, + "loss": 0.0061, + "step": 954 + }, + { + "clip_ratio": 0.0003193520489048751, + "epoch": 1.2007708165277404, + "grad_norm": 0.03448885306715965, + "kl": 0.005992889404296875, + "learning_rate": 2.405924992080573e-06, + "loss": 0.006, + "step": 955 + }, + { + "clip_ratio": 0.0, + "completion_length": 519.4358487129211, + "epoch": 1.2026829195422903, + "grad_norm": 0.11665105819702148, + "kl": 0.008374214172363281, + "learning_rate": 2.3996565895710692e-06, + "loss": 0.0065, + "num_tokens": 79904712.0, + "reward": 0.07366071760770865, + "reward_std": 0.08458104060264304, + "rewards/pure_accuracy_reward_math": 0.07366071591968648, + "step": 956 + }, + { + "clip_ratio": 0.00031160829769305565, + "epoch": 1.2045950225568403, + "grad_norm": 0.04096413403749466, + "kl": 0.006944179534912109, + "learning_rate": 2.3933888188617054e-06, + "loss": 0.0064, + "step": 957 + }, + { + "clip_ratio": 0.00032232171946589006, + "epoch": 1.2065071255713902, + "grad_norm": 0.04049144312739372, + "kl": 0.006976127624511719, + "learning_rate": 2.3871217194167407e-06, + "loss": 0.0063, + "step": 958 + }, + { + "clip_ratio": 0.0003416440970340773, + "epoch": 1.20841922858594, + "grad_norm": 0.039766065776348114, + "kl": 0.007042884826660156, + "learning_rate": 2.380855330696208e-06, + "loss": 0.0063, + "step": 959 + }, + { + "clip_ratio": 0.0003523347779150754, + "epoch": 1.21033133160049, + "grad_norm": 0.03884311020374298, + "kl": 0.007153987884521484, + "learning_rate": 2.3745896921556656e-06, + "loss": 0.0062, + "step": 960 + }, + { + "clip_ratio": 0.0, + "completion_length": 530.392322063446, + "epoch": 1.21224343461504, + "grad_norm": 0.04043371230363846, + "kl": 0.008221149444580078, + "learning_rate": 2.368324843245948e-06, + "loss": 0.0086, + "num_tokens": 83540930.0, + "reward": 0.07952009316068143, + "reward_std": 0.08836089639225975, + "rewards/pure_accuracy_reward_math": 0.0795200911234133, + "step": 961 + }, + { + "clip_ratio": 0.0003234188988017195, + "epoch": 1.2141555376295898, + "grad_norm": 0.039239391684532166, + "kl": 0.008275985717773438, + "learning_rate": 2.362060823412919e-06, + "loss": 0.0086, + "step": 962 + }, + { + "clip_ratio": 0.00033211900500873526, + "epoch": 1.2160676406441397, + "grad_norm": 0.03923904523253441, + "kl": 0.008409500122070312, + "learning_rate": 2.355797672097219e-06, + "loss": 0.0086, + "step": 963 + }, + { + "clip_ratio": 0.00036667373893806143, + "epoch": 1.2179797436586897, + "grad_norm": 0.038865529000759125, + "kl": 0.008434295654296875, + "learning_rate": 2.349535428734026e-06, + "loss": 0.0085, + "step": 964 + }, + { + "clip_ratio": 0.0003816600048480723, + "epoch": 1.2198918466732396, + "grad_norm": 0.037728771567344666, + "kl": 0.00834512710571289, + "learning_rate": 2.343274132752795e-06, + "loss": 0.0084, + "step": 965 + }, + { + "clip_ratio": 0.0, + "completion_length": 535.4799346923828, + "epoch": 1.2218039496877895, + "grad_norm": 0.03813539817929268, + "kl": 0.005985260009765625, + "learning_rate": 2.3370138235770184e-06, + "loss": 0.0088, + "num_tokens": 87187574.0, + "reward": 0.060267860419116914, + "reward_std": 0.07384576939512044, + "rewards/pure_accuracy_reward_math": 0.060267858498264104, + "step": 966 + }, + { + "clip_ratio": 0.0002719826344446119, + "epoch": 1.2237160527023394, + "grad_norm": 0.03676025941967964, + "kl": 0.006021976470947266, + "learning_rate": 2.330754540623975e-06, + "loss": 0.0088, + "step": 967 + }, + { + "clip_ratio": 0.0002730399019696961, + "epoch": 1.2256281557168893, + "grad_norm": 0.03579593822360039, + "kl": 0.006060123443603516, + "learning_rate": 2.324496323304484e-06, + "loss": 0.0088, + "step": 968 + }, + { + "clip_ratio": 0.0002800920712502375, + "epoch": 1.2275402587314392, + "grad_norm": 0.0353357158601284, + "kl": 0.0061092376708984375, + "learning_rate": 2.318239211022651e-06, + "loss": 0.0087, + "step": 969 + }, + { + "clip_ratio": 0.0003294056899108, + "epoch": 1.2294523617459892, + "grad_norm": 0.03521355986595154, + "kl": 0.006182193756103516, + "learning_rate": 2.3119832431756284e-06, + "loss": 0.0086, + "step": 970 + }, + { + "clip_ratio": 0.0, + "completion_length": 513.8870182037354, + "epoch": 1.231364464760539, + "grad_norm": 0.03882085531949997, + "kl": 0.006420135498046875, + "learning_rate": 2.3057284591533598e-06, + "loss": 0.0093, + "num_tokens": 90758753.0, + "reward": 0.07505580718861893, + "reward_std": 0.07715391827514395, + "rewards/pure_accuracy_reward_math": 0.0750558051513508, + "step": 971 + }, + { + "clip_ratio": 0.0003045887907546785, + "epoch": 1.2332765677750888, + "grad_norm": 0.03775356709957123, + "kl": 0.006350040435791016, + "learning_rate": 2.299474898338336e-06, + "loss": 0.0093, + "step": 972 + }, + { + "clip_ratio": 0.0003195773986703898, + "epoch": 1.235188670789639, + "grad_norm": 0.03639310225844383, + "kl": 0.006343841552734375, + "learning_rate": 2.2932226001053444e-06, + "loss": 0.0092, + "step": 973 + }, + { + "clip_ratio": 0.0003582680616318612, + "epoch": 1.2371007738041886, + "grad_norm": 0.036272380501031876, + "kl": 0.006300926208496094, + "learning_rate": 2.286971603821226e-06, + "loss": 0.0092, + "step": 974 + }, + { + "clip_ratio": 0.0003946863821511215, + "epoch": 1.2390128768187387, + "grad_norm": 0.03584066033363342, + "kl": 0.006391048431396484, + "learning_rate": 2.280721948844621e-06, + "loss": 0.0091, + "step": 975 + }, + { + "clip_ratio": 0.0, + "completion_length": 522.3044323921204, + "epoch": 1.2409249798332884, + "grad_norm": 0.038236722350120544, + "kl": 0.006694316864013672, + "learning_rate": 2.274473674525726e-06, + "loss": 0.0094, + "num_tokens": 94365488.0, + "reward": 0.06556919953436591, + "reward_std": 0.07405849196948111, + "rewards/pure_accuracy_reward_math": 0.06556919802096672, + "step": 976 + }, + { + "clip_ratio": 0.00029697347130763774, + "epoch": 1.2428370828478383, + "grad_norm": 0.0369977168738842, + "kl": 0.006660938262939453, + "learning_rate": 2.268226820206044e-06, + "loss": 0.0094, + "step": 977 + }, + { + "clip_ratio": 0.000319464833580696, + "epoch": 1.2447491858623883, + "grad_norm": 0.03550850227475166, + "kl": 0.006519794464111328, + "learning_rate": 2.261981425218138e-06, + "loss": 0.0094, + "step": 978 + }, + { + "clip_ratio": 0.0003469139706453461, + "epoch": 1.2466612888769382, + "grad_norm": 0.03525082767009735, + "kl": 0.006406307220458984, + "learning_rate": 2.2557375288853803e-06, + "loss": 0.0093, + "step": 979 + }, + { + "clip_ratio": 0.0003654695393606744, + "epoch": 1.248573391891488, + "grad_norm": 0.0355265848338604, + "kl": 0.006331443786621094, + "learning_rate": 2.2494951705217095e-06, + "loss": 0.0092, + "step": 980 + }, + { + "clip_ratio": 0.0, + "completion_length": 516.76704454422, + "epoch": 1.250485494906038, + "grad_norm": 0.03745350241661072, + "kl": 0.0065135955810546875, + "learning_rate": 2.2432543894313797e-06, + "loss": 0.0042, + "num_tokens": 97952525.0, + "reward": 0.06501116385334171, + "reward_std": 0.07316133996937424, + "rewards/pure_accuracy_reward_math": 0.06501116222352721, + "step": 981 + }, + { + "clip_ratio": 0.00029299165072416145, + "epoch": 1.252397597920588, + "grad_norm": 0.03690091893076897, + "kl": 0.006426095962524414, + "learning_rate": 2.2370152249087114e-06, + "loss": 0.0042, + "step": 982 + }, + { + "clip_ratio": 0.0003187885846500649, + "epoch": 1.2543097009351378, + "grad_norm": 0.03645962476730347, + "kl": 0.006396055221557617, + "learning_rate": 2.2307777162378523e-06, + "loss": 0.0042, + "step": 983 + }, + { + "clip_ratio": 0.00033352292155086616, + "epoch": 1.2562218039496877, + "grad_norm": 0.03598187491297722, + "kl": 0.006333351135253906, + "learning_rate": 2.2245419026925187e-06, + "loss": 0.0041, + "step": 984 + }, + { + "clip_ratio": 0.0003533332319989313, + "epoch": 1.2581339069642377, + "grad_norm": 0.03577181696891785, + "kl": 0.006278276443481445, + "learning_rate": 2.218307823535757e-06, + "loss": 0.004, + "step": 985 + }, + { + "clip_ratio": 0.0, + "completion_length": 522.8172650337219, + "epoch": 1.2600460099787876, + "grad_norm": 0.03590444475412369, + "kl": 0.005995273590087891, + "learning_rate": 2.2120755180196904e-06, + "loss": 0.0045, + "num_tokens": 101560026.0, + "reward": 0.06054687811410986, + "reward_std": 0.06865079078124836, + "rewards/pure_accuracy_reward_math": 0.06054687619325705, + "step": 986 + }, + { + "clip_ratio": 0.00024842098838462334, + "epoch": 1.2619581129933375, + "grad_norm": 0.03513624891638756, + "kl": 0.0059719085693359375, + "learning_rate": 2.2058450253852783e-06, + "loss": 0.0045, + "step": 987 + }, + { + "clip_ratio": 0.000271169978702801, + "epoch": 1.2638702160078874, + "grad_norm": 0.03392768278717995, + "kl": 0.005938529968261719, + "learning_rate": 2.1996163848620612e-06, + "loss": 0.0044, + "step": 988 + }, + { + "clip_ratio": 0.0002971922116898895, + "epoch": 1.2657823190224373, + "grad_norm": 0.03286145627498627, + "kl": 0.0060443878173828125, + "learning_rate": 2.1933896356679226e-06, + "loss": 0.0044, + "step": 989 + }, + { + "clip_ratio": 0.0003229031350429068, + "epoch": 1.2676944220369872, + "grad_norm": 0.032496001571416855, + "kl": 0.006091594696044922, + "learning_rate": 2.1871648170088347e-06, + "loss": 0.0043, + "step": 990 + }, + { + "clip_ratio": 0.0, + "completion_length": 535.8125224113464, + "epoch": 1.2696065250515371, + "grad_norm": 0.21526122093200684, + "kl": 0.007075309753417969, + "learning_rate": 2.1809419680786143e-06, + "loss": 0.0072, + "num_tokens": 105223050.0, + "reward": 0.07421875381260179, + "reward_std": 0.08054219774203375, + "rewards/pure_accuracy_reward_math": 0.07421875130967237, + "step": 991 + }, + { + "clip_ratio": 0.00032863151136552915, + "epoch": 1.271518628066087, + "grad_norm": 0.03788222745060921, + "kl": 0.006428241729736328, + "learning_rate": 2.1747211280586758e-06, + "loss": 0.0072, + "step": 992 + }, + { + "clip_ratio": 0.00034688404628013814, + "epoch": 1.273430731080637, + "grad_norm": 0.03719337284564972, + "kl": 0.0064296722412109375, + "learning_rate": 2.168502336117787e-06, + "loss": 0.0071, + "step": 993 + }, + { + "clip_ratio": 0.00034599834629034376, + "epoch": 1.275342834095187, + "grad_norm": 0.036535993218421936, + "kl": 0.006348133087158203, + "learning_rate": 2.1622856314118178e-06, + "loss": 0.0071, + "step": 994 + }, + { + "clip_ratio": 0.00036459101005448247, + "epoch": 1.2772549371097368, + "grad_norm": 0.03548647463321686, + "kl": 0.006353855133056641, + "learning_rate": 2.156071053083496e-06, + "loss": 0.007, + "step": 995 + }, + { + "clip_ratio": 0.0, + "completion_length": 529.536018371582, + "epoch": 1.2791670401242867, + "grad_norm": 0.03945273160934448, + "kl": 0.006157398223876953, + "learning_rate": 2.1498586402621646e-06, + "loss": 0.0062, + "num_tokens": 108847859.0, + "reward": 0.07366071807336994, + "reward_std": 0.072430647269357, + "rewards/pure_accuracy_reward_math": 0.07366071533760987, + "step": 996 + }, + { + "clip_ratio": 0.0002439655858097467, + "epoch": 1.2810791431388366, + "grad_norm": 0.03839760273694992, + "kl": 0.006161689758300781, + "learning_rate": 2.1436484320635275e-06, + "loss": 0.0061, + "step": 997 + }, + { + "clip_ratio": 0.0002514519866281262, + "epoch": 1.2829912461533866, + "grad_norm": 0.03733210638165474, + "kl": 0.0061798095703125, + "learning_rate": 2.1374404675894083e-06, + "loss": 0.0061, + "step": 998 + }, + { + "clip_ratio": 0.0002774860670342605, + "epoch": 1.2849033491679365, + "grad_norm": 0.03640332072973251, + "kl": 0.006183147430419922, + "learning_rate": 2.131234785927505e-06, + "loss": 0.006, + "step": 999 + }, + { + "clip_ratio": 0.0002877332713069336, + "epoch": 1.2868154521824864, + "grad_norm": 0.03559413552284241, + "kl": 0.006213665008544922, + "learning_rate": 2.1250314261511414e-06, + "loss": 0.0059, + "step": 1000 + }, + { + "clip_ratio": 0.0, + "completion_length": 528.9492444992065, + "epoch": 1.2887275551970363, + "grad_norm": 0.04216492921113968, + "kl": 0.0073282718658447266, + "learning_rate": 2.1188304273190196e-06, + "loss": 0.0102, + "num_tokens": 112482213.0, + "reward": 0.0772879500000272, + "reward_std": 0.07908701087580994, + "rewards/pure_accuracy_reward_math": 0.07728794772992842, + "step": 1001 + }, + { + "clip_ratio": 0.0003075964003755871, + "epoch": 1.2906396582115862, + "grad_norm": 0.039000045508146286, + "kl": 0.007200002670288086, + "learning_rate": 2.1126318284749807e-06, + "loss": 0.0102, + "step": 1002 + }, + { + "clip_ratio": 0.0003138856436635251, + "epoch": 1.2925517612261361, + "grad_norm": 0.036585696041584015, + "kl": 0.00716710090637207, + "learning_rate": 2.106435668647751e-06, + "loss": 0.0101, + "step": 1003 + }, + { + "clip_ratio": 0.00033263966838603665, + "epoch": 1.294463864240686, + "grad_norm": 0.03634057566523552, + "kl": 0.007274150848388672, + "learning_rate": 2.1002419868507005e-06, + "loss": 0.01, + "step": 1004 + }, + { + "clip_ratio": 0.00035104663936635916, + "epoch": 1.2963759672552357, + "grad_norm": 0.03524275869131088, + "kl": 0.0072422027587890625, + "learning_rate": 2.0940508220815978e-06, + "loss": 0.01, + "step": 1005 + }, + { + "clip_ratio": 0.0, + "completion_length": 519.5226221084595, + "epoch": 1.2982880702697859, + "grad_norm": 0.04047563299536705, + "kl": 0.006965160369873047, + "learning_rate": 2.087862213322362e-06, + "loss": 0.0078, + "num_tokens": 116078946.0, + "reward": 0.06752232470898889, + "reward_std": 0.08269421081058681, + "rewards/pure_accuracy_reward_math": 0.0675223229045514, + "step": 1006 + }, + { + "clip_ratio": 0.00033451643105308904, + "epoch": 1.3002001732843356, + "grad_norm": 0.03818976879119873, + "kl": 0.0069293975830078125, + "learning_rate": 2.0816761995388198e-06, + "loss": 0.0078, + "step": 1007 + }, + { + "clip_ratio": 0.0003828123747666723, + "epoch": 1.3021122762988857, + "grad_norm": 0.03969357907772064, + "kl": 0.006967067718505859, + "learning_rate": 2.075492819680457e-06, + "loss": 0.0078, + "step": 1008 + }, + { + "clip_ratio": 0.0003832018163620887, + "epoch": 1.3040243793134354, + "grad_norm": 0.040100231766700745, + "kl": 0.007086753845214844, + "learning_rate": 2.0693121126801778e-06, + "loss": 0.0077, + "step": 1009 + }, + { + "clip_ratio": 0.0003569153510625256, + "epoch": 1.3059364823279855, + "grad_norm": 0.037368252873420715, + "kl": 0.007195472717285156, + "learning_rate": 2.063134117454055e-06, + "loss": 0.0076, + "step": 1010 + }, + { + "clip_ratio": 0.0, + "completion_length": 514.7126340866089, + "epoch": 1.3078485853425352, + "grad_norm": 0.0401712991297245, + "kl": 0.00678253173828125, + "learning_rate": 2.0569588729010896e-06, + "loss": 0.0063, + "num_tokens": 119662772.0, + "reward": 0.0705915214784909, + "reward_std": 0.08484002540353686, + "rewards/pure_accuracy_reward_math": 0.0705915190919768, + "step": 1011 + }, + { + "clip_ratio": 0.0003401347770477514, + "epoch": 1.3097606883570854, + "grad_norm": 0.03972383588552475, + "kl": 0.006781578063964844, + "learning_rate": 2.0507864179029592e-06, + "loss": 0.0062, + "step": 1012 + }, + { + "clip_ratio": 0.00040657852025560715, + "epoch": 1.311672791371635, + "grad_norm": 0.04063359647989273, + "kl": 0.006711006164550781, + "learning_rate": 2.044616791323781e-06, + "loss": 0.0062, + "step": 1013 + }, + { + "clip_ratio": 0.0004189488300880839, + "epoch": 1.313584894386185, + "grad_norm": 0.03818094730377197, + "kl": 0.006552696228027344, + "learning_rate": 2.0384500320098604e-06, + "loss": 0.0061, + "step": 1014 + }, + { + "clip_ratio": 0.000448550158978378, + "epoch": 1.315496997400735, + "grad_norm": 0.03749743476510048, + "kl": 0.0064678192138671875, + "learning_rate": 2.032286178789454e-06, + "loss": 0.006, + "step": 1015 + }, + { + "clip_ratio": 0.0, + "completion_length": 529.0069990158081, + "epoch": 1.3174091004152848, + "grad_norm": 0.03775123134255409, + "kl": 0.006552696228027344, + "learning_rate": 2.0261252704725143e-06, + "loss": 0.0047, + "num_tokens": 123299241.0, + "reward": 0.06919643163564615, + "reward_std": 0.0781373989302665, + "rewards/pure_accuracy_reward_math": 0.06919642994762398, + "step": 1016 + }, + { + "clip_ratio": 0.0003128642913452495, + "epoch": 1.3193212034298347, + "grad_norm": 0.03666616231203079, + "kl": 0.006560325622558594, + "learning_rate": 2.0199673458504577e-06, + "loss": 0.0047, + "step": 1017 + }, + { + "clip_ratio": 0.00030665075905744743, + "epoch": 1.3212333064443846, + "grad_norm": 0.035805702209472656, + "kl": 0.006537437438964844, + "learning_rate": 2.01381244369591e-06, + "loss": 0.0046, + "step": 1018 + }, + { + "clip_ratio": 0.0003063842187316368, + "epoch": 1.3231454094589346, + "grad_norm": 0.03492369130253792, + "kl": 0.006512641906738281, + "learning_rate": 2.0076606027624676e-06, + "loss": 0.0046, + "step": 1019 + }, + { + "clip_ratio": 0.00033027163379983904, + "epoch": 1.3250575124734845, + "grad_norm": 0.03507117182016373, + "kl": 0.006590366363525391, + "learning_rate": 2.0015118617844516e-06, + "loss": 0.0045, + "step": 1020 + }, + { + "clip_ratio": 0.0, + "completion_length": 536.10493516922, + "epoch": 1.3269696154880344, + "grad_norm": 0.04077515751123428, + "kl": 0.006287097930908203, + "learning_rate": 1.9953662594766675e-06, + "loss": 0.007, + "num_tokens": 126958737.0, + "reward": 0.0756138427532278, + "reward_std": 0.08067478984594345, + "rewards/pure_accuracy_reward_math": 0.07561384083237499, + "step": 1021 + }, + { + "clip_ratio": 0.0003038725464534764, + "epoch": 1.3288817185025843, + "grad_norm": 0.03825462609529495, + "kl": 0.0063266754150390625, + "learning_rate": 1.9892238345341544e-06, + "loss": 0.007, + "step": 1022 + }, + { + "clip_ratio": 0.0003366774006963169, + "epoch": 1.3307938215171342, + "grad_norm": 0.03734288364648819, + "kl": 0.006364345550537109, + "learning_rate": 1.983084625631949e-06, + "loss": 0.0069, + "step": 1023 + }, + { + "clip_ratio": 0.0003749641306853846, + "epoch": 1.3327059245316841, + "grad_norm": 0.03799683600664139, + "kl": 0.006411075592041016, + "learning_rate": 1.9769486714248367e-06, + "loss": 0.0068, + "step": 1024 + }, + { + "clip_ratio": 0.0003729545476289786, + "epoch": 1.334618027546234, + "grad_norm": 0.03601997718214989, + "kl": 0.006434917449951172, + "learning_rate": 1.9708160105471105e-06, + "loss": 0.0068, + "step": 1025 + }, + { + "clip_ratio": 0.0, + "completion_length": 529.7709493637085, + "epoch": 1.336530130560784, + "grad_norm": 0.04102141782641411, + "kl": 0.006857395172119141, + "learning_rate": 1.964686681612327e-06, + "loss": 0.0055, + "num_tokens": 130592668.0, + "reward": 0.06556919959257357, + "reward_std": 0.06470447563333437, + "rewards/pure_accuracy_reward_math": 0.0655691981955897, + "step": 1026 + }, + { + "clip_ratio": 0.00021823535962539609, + "epoch": 1.3384422335753339, + "grad_norm": 0.03428492322564125, + "kl": 0.006598472595214844, + "learning_rate": 1.9585607232130636e-06, + "loss": 0.0054, + "step": 1027 + }, + { + "clip_ratio": 0.00024637427833340553, + "epoch": 1.3403543365898838, + "grad_norm": 0.032555270940065384, + "kl": 0.006415843963623047, + "learning_rate": 1.952438173920677e-06, + "loss": 0.0054, + "step": 1028 + }, + { + "clip_ratio": 0.0002563797440870985, + "epoch": 1.3422664396044337, + "grad_norm": 0.03202388435602188, + "kl": 0.006371498107910156, + "learning_rate": 1.946319072285058e-06, + "loss": 0.0053, + "step": 1029 + }, + { + "clip_ratio": 0.0002687414232696028, + "epoch": 1.3441785426189836, + "grad_norm": 0.03169838339090347, + "kl": 0.006340980529785156, + "learning_rate": 1.9402034568343888e-06, + "loss": 0.0053, + "step": 1030 + }, + { + "clip_ratio": 0.0, + "completion_length": 549.2184953689575, + "epoch": 1.3460906456335335, + "grad_norm": 0.054084766656160355, + "kl": 0.006264686584472656, + "learning_rate": 1.9340913660749015e-06, + "loss": 0.0071, + "num_tokens": 134289567.0, + "reward": 0.06668527112924494, + "reward_std": 0.07140090392204002, + "rewards/pure_accuracy_reward_math": 0.06668526903376915, + "step": 1031 + }, + { + "clip_ratio": 0.00022883353369707038, + "epoch": 1.3480027486480834, + "grad_norm": 0.03612653911113739, + "kl": 0.006344318389892578, + "learning_rate": 1.9279828384906373e-06, + "loss": 0.0071, + "step": 1032 + }, + { + "clip_ratio": 0.0002760976024376305, + "epoch": 1.3499148516626334, + "grad_norm": 0.036703869700431824, + "kl": 0.006397724151611328, + "learning_rate": 1.921877912543198e-06, + "loss": 0.0071, + "step": 1033 + }, + { + "clip_ratio": 0.00027991523592163503, + "epoch": 1.3518269546771833, + "grad_norm": 0.036445919424295425, + "kl": 0.006428718566894531, + "learning_rate": 1.9157766266715142e-06, + "loss": 0.007, + "step": 1034 + }, + { + "clip_ratio": 0.0003110420944381076, + "epoch": 1.3537390576917332, + "grad_norm": 0.032879918813705444, + "kl": 0.006253242492675781, + "learning_rate": 1.909679019291592e-06, + "loss": 0.0069, + "step": 1035 + }, + { + "clip_ratio": 0.0, + "completion_length": 525.200918674469, + "epoch": 1.355651160706283, + "grad_norm": 0.0374806709587574, + "kl": 0.006623744964599609, + "learning_rate": 1.9035851287962797e-06, + "loss": 0.0088, + "num_tokens": 137901395.0, + "reward": 0.07170759295695461, + "reward_std": 0.0834249026956968, + "rewards/pure_accuracy_reward_math": 0.0717075907450635, + "step": 1036 + }, + { + "clip_ratio": 0.0002719677876825699, + "epoch": 1.357563263720833, + "grad_norm": 0.03692527487874031, + "kl": 0.006625652313232422, + "learning_rate": 1.8974949935550202e-06, + "loss": 0.0088, + "step": 1037 + }, + { + "clip_ratio": 0.0003176050505544481, + "epoch": 1.359475366735383, + "grad_norm": 0.03605135530233383, + "kl": 0.006484031677246094, + "learning_rate": 1.8914086519136133e-06, + "loss": 0.0088, + "step": 1038 + }, + { + "clip_ratio": 0.0003420261080577802, + "epoch": 1.3613874697499329, + "grad_norm": 0.03582129627466202, + "kl": 0.006468296051025391, + "learning_rate": 1.8853261421939718e-06, + "loss": 0.0087, + "step": 1039 + }, + { + "clip_ratio": 0.00034158617637558564, + "epoch": 1.3632995727644825, + "grad_norm": 0.0346604622900486, + "kl": 0.006458282470703125, + "learning_rate": 1.8792475026938823e-06, + "loss": 0.0086, + "step": 1040 + }, + { + "clip_ratio": 0.0, + "completion_length": 525.6152620315552, + "epoch": 1.3652116757790327, + "grad_norm": 0.03809192404150963, + "kl": 0.006644248962402344, + "learning_rate": 1.8731727716867632e-06, + "loss": 0.0098, + "num_tokens": 141517968.0, + "reward": 0.07477678963914514, + "reward_std": 0.0749618403497152, + "rewards/pure_accuracy_reward_math": 0.07477678678696975, + "step": 1041 + }, + { + "clip_ratio": 0.0002677642194726104, + "epoch": 1.3671237787935824, + "grad_norm": 0.0377020426094532, + "kl": 0.0066089630126953125, + "learning_rate": 1.8671019874214237e-06, + "loss": 0.0098, + "step": 1042 + }, + { + "clip_ratio": 0.0002758102658617645, + "epoch": 1.3690358818081325, + "grad_norm": 0.03678804636001587, + "kl": 0.006642341613769531, + "learning_rate": 1.8610351881218211e-06, + "loss": 0.0098, + "step": 1043 + }, + { + "clip_ratio": 0.0002790037015074631, + "epoch": 1.3709479848226822, + "grad_norm": 0.03615477308630943, + "kl": 0.006649971008300781, + "learning_rate": 1.8549724119868235e-06, + "loss": 0.0097, + "step": 1044 + }, + { + "clip_ratio": 0.0002795595634097481, + "epoch": 1.3728600878372323, + "grad_norm": 0.03598296642303467, + "kl": 0.006653785705566406, + "learning_rate": 1.8489136971899658e-06, + "loss": 0.0096, + "step": 1045 + }, + { + "clip_ratio": 0.0, + "completion_length": 539.382839679718, + "epoch": 1.374772190851782, + "grad_norm": 0.03458879515528679, + "kl": 0.0064601898193359375, + "learning_rate": 1.8428590818792135e-06, + "loss": 0.0038, + "num_tokens": 145187116.0, + "reward": 0.06584821731667034, + "reward_std": 0.07200520334299654, + "rewards/pure_accuracy_reward_math": 0.06584821562864818, + "step": 1046 + }, + { + "clip_ratio": 0.00023162108237784196, + "epoch": 1.3766842938663322, + "grad_norm": 0.03385276347398758, + "kl": 0.006392478942871094, + "learning_rate": 1.836808604176719e-06, + "loss": 0.0038, + "step": 1047 + }, + { + "clip_ratio": 0.00026906593984676874, + "epoch": 1.3785963968808819, + "grad_norm": 0.0331512950360775, + "kl": 0.0062427520751953125, + "learning_rate": 1.8307623021785837e-06, + "loss": 0.0037, + "step": 1048 + }, + { + "clip_ratio": 0.00025022312701139526, + "epoch": 1.3805084998954318, + "grad_norm": 0.032765790820121765, + "kl": 0.006190299987792969, + "learning_rate": 1.8247202139546155e-06, + "loss": 0.0037, + "step": 1049 + }, + { + "clip_ratio": 0.0002507307134465009, + "epoch": 1.3824206029099817, + "grad_norm": 0.0325283482670784, + "kl": 0.006188869476318359, + "learning_rate": 1.8186823775480917e-06, + "loss": 0.0036, + "step": 1050 + }, + { + "clip_ratio": 0.0, + "completion_length": 539.5159296989441, + "epoch": 1.3843327059245316, + "grad_norm": 0.03628634661436081, + "kl": 0.007945537567138672, + "learning_rate": 1.8126488309755178e-06, + "loss": 0.0101, + "num_tokens": 148852261.0, + "reward": 0.06194196696742438, + "reward_std": 0.06792009872151539, + "rewards/pure_accuracy_reward_math": 0.06194196580327116, + "step": 1051 + }, + { + "clip_ratio": 0.00025563780241100176, + "epoch": 1.3862448089390815, + "grad_norm": 0.035264719277620316, + "kl": 0.007953643798828125, + "learning_rate": 1.80661961222639e-06, + "loss": 0.0101, + "step": 1052 + }, + { + "clip_ratio": 0.0002401949207069265, + "epoch": 1.3881569119536314, + "grad_norm": 0.034110233187675476, + "kl": 0.007923126220703125, + "learning_rate": 1.8005947592629551e-06, + "loss": 0.0101, + "step": 1053 + }, + { + "clip_ratio": 0.00026547102737595196, + "epoch": 1.3900690149681814, + "grad_norm": 0.03364601358771324, + "kl": 0.00788116455078125, + "learning_rate": 1.7945743100199706e-06, + "loss": 0.01, + "step": 1054 + }, + { + "clip_ratio": 0.0002951583905996813, + "epoch": 1.3919811179827313, + "grad_norm": 0.03397928550839424, + "kl": 0.007859230041503906, + "learning_rate": 1.788558302404466e-06, + "loss": 0.0099, + "step": 1055 + }, + { + "clip_ratio": 0.0, + "completion_length": 530.25337266922, + "epoch": 1.3938932209972812, + "grad_norm": 0.03863634541630745, + "kl": 0.006538867950439453, + "learning_rate": 1.7825467742955052e-06, + "loss": 0.0066, + "num_tokens": 152486009.0, + "reward": 0.06780134289874695, + "reward_std": 0.06736206321511418, + "rewards/pure_accuracy_reward_math": 0.06780134057044052, + "step": 1056 + }, + { + "clip_ratio": 0.00027592373527340897, + "epoch": 1.395805324011831, + "grad_norm": 0.036583587527275085, + "kl": 0.0065402984619140625, + "learning_rate": 1.7765397635439468e-06, + "loss": 0.0066, + "step": 1057 + }, + { + "clip_ratio": 0.0002849266509201698, + "epoch": 1.397717427026381, + "grad_norm": 0.03605053946375847, + "kl": 0.006500244140625, + "learning_rate": 1.7705373079722083e-06, + "loss": 0.0065, + "step": 1058 + }, + { + "clip_ratio": 0.0003116865132142266, + "epoch": 1.399629530040931, + "grad_norm": 0.03675729036331177, + "kl": 0.006489276885986328, + "learning_rate": 1.7645394453740227e-06, + "loss": 0.0064, + "step": 1059 + }, + { + "clip_ratio": 0.0003249485117748918, + "epoch": 1.4015416330554809, + "grad_norm": 0.03623329848051071, + "kl": 0.006478786468505859, + "learning_rate": 1.7585462135142083e-06, + "loss": 0.0064, + "step": 1060 + }, + { + "clip_ratio": 0.0, + "completion_length": 520.029598236084, + "epoch": 1.4034537360700308, + "grad_norm": 0.03506990894675255, + "kl": 0.006392955780029297, + "learning_rate": 1.752557650128423e-06, + "loss": 0.0096, + "num_tokens": 156082643.0, + "reward": 0.06194196664728224, + "reward_std": 0.07560620515141636, + "rewards/pure_accuracy_reward_math": 0.061941966181620955, + "step": 1061 + }, + { + "clip_ratio": 0.0002744606111662051, + "epoch": 1.4053658390845807, + "grad_norm": 0.03450053185224533, + "kl": 0.006424903869628906, + "learning_rate": 1.7465737929229317e-06, + "loss": 0.0096, + "step": 1062 + }, + { + "clip_ratio": 0.00027279697263793423, + "epoch": 1.4072779420991306, + "grad_norm": 0.033764585852622986, + "kl": 0.006496906280517578, + "learning_rate": 1.7405946795743665e-06, + "loss": 0.0096, + "step": 1063 + }, + { + "clip_ratio": 0.000298209258943416, + "epoch": 1.4091900451136805, + "grad_norm": 0.03335048630833626, + "kl": 0.0065898895263671875, + "learning_rate": 1.7346203477294916e-06, + "loss": 0.0095, + "step": 1064 + }, + { + "clip_ratio": 0.00030832760762677935, + "epoch": 1.4111021481282304, + "grad_norm": 0.03299354016780853, + "kl": 0.006653308868408203, + "learning_rate": 1.7286508350049627e-06, + "loss": 0.0094, + "step": 1065 + }, + { + "clip_ratio": 0.0, + "completion_length": 525.4023675918579, + "epoch": 1.4130142511427803, + "grad_norm": 0.04127517342567444, + "kl": 0.010558605194091797, + "learning_rate": 1.722686178987097e-06, + "loss": 0.0076, + "num_tokens": 159696133.0, + "reward": 0.06640625282307155, + "reward_std": 0.07264956791186705, + "rewards/pure_accuracy_reward_math": 0.06640625101863407, + "step": 1066 + }, + { + "clip_ratio": 0.00030437137564831573, + "epoch": 1.4149263541573303, + "grad_norm": 0.039496634155511856, + "kl": 0.010538101196289062, + "learning_rate": 1.7167264172316273e-06, + "loss": 0.0076, + "step": 1067 + }, + { + "clip_ratio": 0.0003244270092181978, + "epoch": 1.4168384571718802, + "grad_norm": 0.039376117289066315, + "kl": 0.010515689849853516, + "learning_rate": 1.7107715872634731e-06, + "loss": 0.0075, + "step": 1068 + }, + { + "clip_ratio": 0.0003491952173817481, + "epoch": 1.41875056018643, + "grad_norm": 0.03863466531038284, + "kl": 0.01038360595703125, + "learning_rate": 1.7048217265764993e-06, + "loss": 0.0075, + "step": 1069 + }, + { + "clip_ratio": 0.00037865171140083476, + "epoch": 1.42066266320098, + "grad_norm": 0.03795957565307617, + "kl": 0.010157585144042969, + "learning_rate": 1.6988768726332856e-06, + "loss": 0.0074, + "step": 1070 + }, + { + "clip_ratio": 0.0, + "completion_length": 512.8691644668579, + "epoch": 1.42257476621553, + "grad_norm": 0.04360206797719002, + "kl": 0.0067138671875, + "learning_rate": 1.6929370628648828e-06, + "loss": 0.0086, + "num_tokens": 163268528.0, + "reward": 0.08565848623402417, + "reward_std": 0.08861368341604248, + "rewards/pure_accuracy_reward_math": 0.08565848384751007, + "step": 1071 + }, + { + "clip_ratio": 0.00031944918799808875, + "epoch": 1.4244868692300798, + "grad_norm": 0.04292250797152519, + "kl": 0.006737709045410156, + "learning_rate": 1.6870023346705866e-06, + "loss": 0.0085, + "step": 1072 + }, + { + "clip_ratio": 0.00031442818647064996, + "epoch": 1.4263989722446297, + "grad_norm": 0.04044810310006142, + "kl": 0.006873607635498047, + "learning_rate": 1.6810727254176937e-06, + "loss": 0.0085, + "step": 1073 + }, + { + "clip_ratio": 0.0003650832475727839, + "epoch": 1.4283110752591797, + "grad_norm": 0.04156485199928284, + "kl": 0.006984233856201172, + "learning_rate": 1.6751482724412716e-06, + "loss": 0.0084, + "step": 1074 + }, + { + "clip_ratio": 0.0003947964444250829, + "epoch": 1.4302231782737296, + "grad_norm": 0.04023054987192154, + "kl": 0.007004737854003906, + "learning_rate": 1.669229013043921e-06, + "loss": 0.0083, + "step": 1075 + }, + { + "clip_ratio": 0.0, + "completion_length": 512.7343969345093, + "epoch": 1.4321352812882795, + "grad_norm": 0.03780645504593849, + "kl": 0.006886005401611328, + "learning_rate": 1.6633149844955415e-06, + "loss": 0.0094, + "num_tokens": 166836260.0, + "reward": 0.0797991111758165, + "reward_std": 0.08157813875004649, + "rewards/pure_accuracy_reward_math": 0.07979910867288709, + "step": 1076 + }, + { + "clip_ratio": 0.0002608302990552147, + "epoch": 1.4340473843028292, + "grad_norm": 0.03681138530373573, + "kl": 0.006786823272705078, + "learning_rate": 1.6574062240330996e-06, + "loss": 0.0093, + "step": 1077 + }, + { + "clip_ratio": 0.00031450060896531795, + "epoch": 1.4359594873173793, + "grad_norm": 0.036778852343559265, + "kl": 0.0066986083984375, + "learning_rate": 1.651502768860389e-06, + "loss": 0.0093, + "step": 1078 + }, + { + "clip_ratio": 0.0003176571812559814, + "epoch": 1.437871590331929, + "grad_norm": 0.03592304140329361, + "kl": 0.006758213043212891, + "learning_rate": 1.6456046561478023e-06, + "loss": 0.0092, + "step": 1079 + }, + { + "clip_ratio": 0.0003236016519281293, + "epoch": 1.4397836933464792, + "grad_norm": 0.03520684316754341, + "kl": 0.006850242614746094, + "learning_rate": 1.6397119230320919e-06, + "loss": 0.0092, + "step": 1080 + }, + { + "clip_ratio": 0.0, + "completion_length": 508.80498933792114, + "epoch": 1.4416957963610288, + "grad_norm": 0.04630957916378975, + "kl": 0.01150655746459961, + "learning_rate": 1.633824606616138e-06, + "loss": 0.008, + "num_tokens": 170392081.0, + "reward": 0.07589286129223183, + "reward_std": 0.08140548272058368, + "rewards/pure_accuracy_reward_math": 0.07589285844005644, + "step": 1081 + }, + { + "clip_ratio": 0.00028873196572476445, + "epoch": 1.443607899375579, + "grad_norm": 0.04534924402832985, + "kl": 0.01107931137084961, + "learning_rate": 1.6279427439687154e-06, + "loss": 0.008, + "step": 1082 + }, + { + "clip_ratio": 0.000319909158235987, + "epoch": 1.4455200023901287, + "grad_norm": 0.044707395136356354, + "kl": 0.010364532470703125, + "learning_rate": 1.622066372124262e-06, + "loss": 0.0079, + "step": 1083 + }, + { + "clip_ratio": 0.0003388643909829625, + "epoch": 1.4474321054046788, + "grad_norm": 0.038643479347229004, + "kl": 0.009525775909423828, + "learning_rate": 1.6161955280826399e-06, + "loss": 0.0078, + "step": 1084 + }, + { + "clip_ratio": 0.0003223289492098047, + "epoch": 1.4493442084192285, + "grad_norm": 0.12098709493875504, + "kl": 0.010370254516601562, + "learning_rate": 1.6103302488089104e-06, + "loss": 0.0078, + "step": 1085 + }, + { + "clip_ratio": 0.0, + "completion_length": 520.3169894218445, + "epoch": 1.4512563114337784, + "grad_norm": 0.03693209961056709, + "kl": 0.006680965423583984, + "learning_rate": 1.6044705712330932e-06, + "loss": 0.0059, + "num_tokens": 173992817.0, + "reward": 0.07031250311410986, + "reward_std": 0.07530715462053195, + "rewards/pure_accuracy_reward_math": 0.07031250142608769, + "step": 1086 + }, + { + "clip_ratio": 0.0002918191117657898, + "epoch": 1.4531684144483283, + "grad_norm": 0.03641385957598686, + "kl": 0.0065898895263671875, + "learning_rate": 1.5986165322499398e-06, + "loss": 0.0059, + "step": 1087 + }, + { + "clip_ratio": 0.0002921736467840219, + "epoch": 1.4550805174628783, + "grad_norm": 0.03598758950829506, + "kl": 0.006548881530761719, + "learning_rate": 1.5927681687186964e-06, + "loss": 0.0058, + "step": 1088 + }, + { + "clip_ratio": 0.0003169650843233285, + "epoch": 1.4569926204774282, + "grad_norm": 0.036268141120672226, + "kl": 0.006561756134033203, + "learning_rate": 1.5869255174628778e-06, + "loss": 0.0058, + "step": 1089 + }, + { + "clip_ratio": 0.0003259218068478731, + "epoch": 1.458904723491978, + "grad_norm": 0.03529893979430199, + "kl": 0.006597042083740234, + "learning_rate": 1.5810886152700302e-06, + "loss": 0.0057, + "step": 1090 + }, + { + "clip_ratio": 0.0, + "completion_length": 533.391206741333, + "epoch": 1.460816826506528, + "grad_norm": 0.04034799709916115, + "kl": 0.006509304046630859, + "learning_rate": 1.5752574988915004e-06, + "loss": 0.0066, + "num_tokens": 177633359.0, + "reward": 0.07477678920258768, + "reward_std": 0.0747891838545911, + "rewards/pure_accuracy_reward_math": 0.07477678699069656, + "step": 1091 + }, + { + "clip_ratio": 0.0002679697158214367, + "epoch": 1.462728929521078, + "grad_norm": 0.039328683167696, + "kl": 0.006606101989746094, + "learning_rate": 1.5694322050422096e-06, + "loss": 0.0066, + "step": 1092 + }, + { + "clip_ratio": 0.0002975759220475993, + "epoch": 1.4646410325356278, + "grad_norm": 0.03947217017412186, + "kl": 0.00665283203125, + "learning_rate": 1.5636127704004133e-06, + "loss": 0.0065, + "step": 1093 + }, + { + "clip_ratio": 0.0003127538088278925, + "epoch": 1.4665531355501777, + "grad_norm": 0.03733786940574646, + "kl": 0.006627559661865234, + "learning_rate": 1.5577992316074783e-06, + "loss": 0.0064, + "step": 1094 + }, + { + "clip_ratio": 0.00035554791872982605, + "epoch": 1.4684652385647277, + "grad_norm": 0.03660706803202629, + "kl": 0.0065364837646484375, + "learning_rate": 1.5519916252676482e-06, + "loss": 0.0064, + "step": 1095 + }, + { + "clip_ratio": 0.0, + "completion_length": 530.1163763999939, + "epoch": 1.4703773415792776, + "grad_norm": 0.06871657073497772, + "kl": 0.010003089904785156, + "learning_rate": 1.5461899879478133e-06, + "loss": 0.0057, + "num_tokens": 181268648.0, + "reward": 0.0744977711874526, + "reward_std": 0.08333237702026963, + "rewards/pure_accuracy_reward_math": 0.0744977695576381, + "step": 1096 + }, + { + "clip_ratio": 0.00032988911306119917, + "epoch": 1.4722894445938275, + "grad_norm": 0.04868275299668312, + "kl": 0.009030342102050781, + "learning_rate": 1.5403943561772789e-06, + "loss": 0.0057, + "step": 1097 + }, + { + "clip_ratio": 0.0003833602018517013, + "epoch": 1.4742015476083774, + "grad_norm": 0.04073934629559517, + "kl": 0.00842428207397461, + "learning_rate": 1.5346047664475422e-06, + "loss": 0.0056, + "step": 1098 + }, + { + "clip_ratio": 0.00040459603366116426, + "epoch": 1.4761136506229273, + "grad_norm": 0.04011493921279907, + "kl": 0.008179187774658203, + "learning_rate": 1.5288212552120524e-06, + "loss": 0.0055, + "step": 1099 + }, + { + "clip_ratio": 0.0004078742092019638, + "epoch": 1.4780257536374772, + "grad_norm": 0.03785649687051773, + "kl": 0.008193016052246094, + "learning_rate": 1.5230438588859881e-06, + "loss": 0.0054, + "step": 1100 + }, + { + "clip_ratio": 0.0, + "completion_length": 541.5837321281433, + "epoch": 1.4799378566520272, + "grad_norm": 0.04047717526555061, + "kl": 0.007642269134521484, + "learning_rate": 1.517272613846027e-06, + "loss": 0.0051, + "num_tokens": 184939348.0, + "reward": 0.06863839572179131, + "reward_std": 0.07131457631476223, + "rewards/pure_accuracy_reward_math": 0.06863839420839213, + "step": 1101 + }, + { + "clip_ratio": 0.00026072144959243815, + "epoch": 1.481849959666577, + "grad_norm": 0.037731293588876724, + "kl": 0.007551670074462891, + "learning_rate": 1.511507556430114e-06, + "loss": 0.0051, + "step": 1102 + }, + { + "clip_ratio": 0.00029216510773721893, + "epoch": 1.483762062681127, + "grad_norm": 0.03771767392754555, + "kl": 0.007477760314941406, + "learning_rate": 1.5057487229372347e-06, + "loss": 0.0051, + "step": 1103 + }, + { + "clip_ratio": 0.0003181908435294645, + "epoch": 1.485674165695677, + "grad_norm": 0.03619125112891197, + "kl": 0.0074062347412109375, + "learning_rate": 1.4999961496271889e-06, + "loss": 0.005, + "step": 1104 + }, + { + "clip_ratio": 0.0003646736843165854, + "epoch": 1.4875862687102268, + "grad_norm": 0.035048868507146835, + "kl": 0.007380008697509766, + "learning_rate": 1.4942498727203578e-06, + "loss": 0.0049, + "step": 1105 + }, + { + "clip_ratio": 0.0, + "completion_length": 541.8585615158081, + "epoch": 1.4894983717247767, + "grad_norm": 0.0386812798678875, + "kl": 0.006747245788574219, + "learning_rate": 1.4885099283974774e-06, + "loss": 0.0071, + "num_tokens": 188614221.0, + "reward": 0.07198661062284373, + "reward_std": 0.08140548341907561, + "rewards/pure_accuracy_reward_math": 0.07198660864378326, + "step": 1106 + }, + { + "clip_ratio": 0.0003357146362077401, + "epoch": 1.4914104747393266, + "grad_norm": 0.03723128139972687, + "kl": 0.006694316864013672, + "learning_rate": 1.482776352799414e-06, + "loss": 0.0071, + "step": 1107 + }, + { + "clip_ratio": 0.0003692662889989151, + "epoch": 1.4933225777538766, + "grad_norm": 0.038370903581380844, + "kl": 0.006665706634521484, + "learning_rate": 1.4770491820269317e-06, + "loss": 0.007, + "step": 1108 + }, + { + "clip_ratio": 0.00040588962588117283, + "epoch": 1.4952346807684265, + "grad_norm": 0.037489671260118484, + "kl": 0.006663322448730469, + "learning_rate": 1.4713284521404678e-06, + "loss": 0.0069, + "step": 1109 + }, + { + "clip_ratio": 0.00039138679812822375, + "epoch": 1.4971467837829764, + "grad_norm": 0.03641659393906593, + "kl": 0.006697654724121094, + "learning_rate": 1.465614199159905e-06, + "loss": 0.0069, + "step": 1110 + }, + { + "clip_ratio": 0.0, + "completion_length": 520.476583480835, + "epoch": 1.4990588867975263, + "grad_norm": 1.8961507081985474, + "kl": 0.03508758544921875, + "learning_rate": 1.4599064590643472e-06, + "loss": 0.0056, + "num_tokens": 192212657.0, + "reward": 0.0753348250000272, + "reward_std": 0.07783834805013612, + "rewards/pure_accuracy_reward_math": 0.07533482302096672, + "step": 1111 + }, + { + "clip_ratio": 0.00029740781877762856, + "epoch": 1.500970989812076, + "grad_norm": 0.08476530015468597, + "kl": 0.011601448059082031, + "learning_rate": 1.4542052677918885e-06, + "loss": 0.0047, + "step": 1112 + }, + { + "clip_ratio": 0.0003210891072171762, + "epoch": 1.5028830928266261, + "grad_norm": 0.04907820373773575, + "kl": 0.010628223419189453, + "learning_rate": 1.4485106612393897e-06, + "loss": 0.0046, + "step": 1113 + }, + { + "clip_ratio": 0.00033912417364945213, + "epoch": 1.5047951958411758, + "grad_norm": 0.04438456520438194, + "kl": 0.010659217834472656, + "learning_rate": 1.4428226752622509e-06, + "loss": 0.0046, + "step": 1114 + }, + { + "clip_ratio": 0.0003756833369834567, + "epoch": 1.506707298855726, + "grad_norm": 0.0422808900475502, + "kl": 0.010442733764648438, + "learning_rate": 1.437141345674189e-06, + "loss": 0.0045, + "step": 1115 + }, + { + "clip_ratio": 0.0, + "completion_length": 535.0778713226318, + "epoch": 1.5086194018702757, + "grad_norm": 0.048265133053064346, + "kl": 0.007592678070068359, + "learning_rate": 1.4314667082470064e-06, + "loss": 0.0086, + "num_tokens": 195861088.0, + "reward": 0.07142857479630038, + "reward_std": 0.08346496871672571, + "rewards/pure_accuracy_reward_math": 0.07142857287544757, + "step": 1116 + }, + { + "clip_ratio": 0.0003429410510875641, + "epoch": 1.5105315048848258, + "grad_norm": 0.04287589713931084, + "kl": 0.007152557373046875, + "learning_rate": 1.4257987987103727e-06, + "loss": 0.0085, + "step": 1117 + }, + { + "clip_ratio": 0.0003726668836634417, + "epoch": 1.5124436078993755, + "grad_norm": 0.0397462397813797, + "kl": 0.006825447082519531, + "learning_rate": 1.420137652751593e-06, + "loss": 0.0085, + "step": 1118 + }, + { + "clip_ratio": 0.0003763367328133427, + "epoch": 1.5143557109139256, + "grad_norm": 0.03851110488176346, + "kl": 0.006707668304443359, + "learning_rate": 1.4144833060153887e-06, + "loss": 0.0084, + "step": 1119 + }, + { + "clip_ratio": 0.0003624607439292049, + "epoch": 1.5162678139284753, + "grad_norm": 0.03720558434724808, + "kl": 0.00676727294921875, + "learning_rate": 1.408835794103669e-06, + "loss": 0.0083, + "step": 1120 + }, + { + "clip_ratio": 0.0, + "completion_length": 524.7569994926453, + "epoch": 1.5181799169430255, + "grad_norm": 0.03832938149571419, + "kl": 0.008425712585449219, + "learning_rate": 1.4031951525753088e-06, + "loss": 0.0071, + "num_tokens": 199475701.0, + "reward": 0.08565848635043949, + "reward_std": 0.08179086120799184, + "rewards/pure_accuracy_reward_math": 0.08565848338184878, + "step": 1121 + }, + { + "clip_ratio": 0.00028257126655262255, + "epoch": 1.5200920199575751, + "grad_norm": 0.038414496928453445, + "kl": 0.008458137512207031, + "learning_rate": 1.3975614169459253e-06, + "loss": 0.0071, + "step": 1122 + }, + { + "clip_ratio": 0.0003134008442202685, + "epoch": 1.5220041229721253, + "grad_norm": 0.03928304836153984, + "kl": 0.008496284484863281, + "learning_rate": 1.391934622687652e-06, + "loss": 0.0071, + "step": 1123 + }, + { + "clip_ratio": 0.00030222541431612626, + "epoch": 1.523916225986675, + "grad_norm": 0.038087427616119385, + "kl": 0.008494377136230469, + "learning_rate": 1.38631480522892e-06, + "loss": 0.007, + "step": 1124 + }, + { + "clip_ratio": 0.0002927070846396873, + "epoch": 1.525828329001225, + "grad_norm": 0.03641984984278679, + "kl": 0.008457183837890625, + "learning_rate": 1.3807019999542287e-06, + "loss": 0.0069, + "step": 1125 + }, + { + "clip_ratio": 0.0, + "completion_length": 531.1537666320801, + "epoch": 1.5277404320157748, + "grad_norm": 0.040940940380096436, + "kl": 0.006596565246582031, + "learning_rate": 1.3750962422039269e-06, + "loss": 0.0058, + "num_tokens": 203109136.0, + "reward": 0.07254464621655643, + "reward_std": 0.08217623952077702, + "rewards/pure_accuracy_reward_math": 0.07254464400466532, + "step": 1126 + }, + { + "clip_ratio": 0.00031519718078243386, + "epoch": 1.5296525350303247, + "grad_norm": 0.038493506610393524, + "kl": 0.006714344024658203, + "learning_rate": 1.369497567273989e-06, + "loss": 0.0058, + "step": 1127 + }, + { + "clip_ratio": 0.0003513000764314711, + "epoch": 1.5315646380448746, + "grad_norm": 0.039495162665843964, + "kl": 0.006772041320800781, + "learning_rate": 1.3639060104157964e-06, + "loss": 0.0057, + "step": 1128 + }, + { + "clip_ratio": 0.00033387296190312554, + "epoch": 1.5334767410594246, + "grad_norm": 0.03875305503606796, + "kl": 0.006872653961181641, + "learning_rate": 1.3583216068359078e-06, + "loss": 0.0057, + "step": 1129 + }, + { + "clip_ratio": 0.00036185752793471693, + "epoch": 1.5353888440739745, + "grad_norm": 0.03817266598343849, + "kl": 0.006899356842041016, + "learning_rate": 1.3527443916958466e-06, + "loss": 0.0056, + "step": 1130 + }, + { + "clip_ratio": 0.0, + "completion_length": 537.4143671989441, + "epoch": 1.5373009470885244, + "grad_norm": 0.035565100610256195, + "kl": 0.006679058074951172, + "learning_rate": 1.3471744001118718e-06, + "loss": 0.0091, + "num_tokens": 206769717.0, + "reward": 0.07533482497092336, + "reward_std": 0.07436373975360766, + "rewards/pure_accuracy_reward_math": 0.07533482293365523, + "step": 1131 + }, + { + "clip_ratio": 0.00028060592541123697, + "epoch": 1.5392130501030743, + "grad_norm": 0.036901701241731644, + "kl": 0.006720542907714844, + "learning_rate": 1.3416116671547613e-06, + "loss": 0.0091, + "step": 1132 + }, + { + "clip_ratio": 0.00034766932589036514, + "epoch": 1.5411251531176242, + "grad_norm": 0.03489091992378235, + "kl": 0.006618499755859375, + "learning_rate": 1.3360562278495899e-06, + "loss": 0.009, + "step": 1133 + }, + { + "clip_ratio": 0.0003513962886927402, + "epoch": 1.5430372561321741, + "grad_norm": 0.035007573664188385, + "kl": 0.0066070556640625, + "learning_rate": 1.3305081171755092e-06, + "loss": 0.009, + "step": 1134 + }, + { + "clip_ratio": 0.00036896456708745973, + "epoch": 1.544949359146724, + "grad_norm": 0.03363417461514473, + "kl": 0.006587028503417969, + "learning_rate": 1.3249673700655246e-06, + "loss": 0.0089, + "step": 1135 + }, + { + "clip_ratio": 0.0, + "completion_length": 531.2251925468445, + "epoch": 1.546861462161274, + "grad_norm": 0.037738338112831116, + "kl": 0.006687164306640625, + "learning_rate": 1.3194340214062828e-06, + "loss": 0.0066, + "num_tokens": 210404892.0, + "reward": 0.07477678978466429, + "reward_std": 0.08492635452421382, + "rewards/pure_accuracy_reward_math": 0.07477678699069656, + "step": 1136 + }, + { + "clip_ratio": 0.0003166603274848967, + "epoch": 1.5487735651758239, + "grad_norm": 0.03711307421326637, + "kl": 0.0067272186279296875, + "learning_rate": 1.3139081060378423e-06, + "loss": 0.0066, + "step": 1137 + }, + { + "clip_ratio": 0.00032532861348499864, + "epoch": 1.5506856681903738, + "grad_norm": 0.0381547249853611, + "kl": 0.006831169128417969, + "learning_rate": 1.3083896587534606e-06, + "loss": 0.0065, + "step": 1138 + }, + { + "clip_ratio": 0.0003168874280845557, + "epoch": 1.5525977712049237, + "grad_norm": 0.03702245280146599, + "kl": 0.0068492889404296875, + "learning_rate": 1.3028787142993723e-06, + "loss": 0.0064, + "step": 1139 + }, + { + "clip_ratio": 0.00031372528076190065, + "epoch": 1.5545098742194736, + "grad_norm": 0.035462986677885056, + "kl": 0.0068511962890625, + "learning_rate": 1.297375307374574e-06, + "loss": 0.0063, + "step": 1140 + }, + { + "clip_ratio": 0.0, + "completion_length": 528.9913792610168, + "epoch": 1.5564219772340235, + "grad_norm": 0.0402364507317543, + "kl": 0.006835460662841797, + "learning_rate": 1.2918794726306003e-06, + "loss": 0.0099, + "num_tokens": 214034825.0, + "reward": 0.07310268151923083, + "reward_std": 0.07917333993827924, + "rewards/pure_accuracy_reward_math": 0.07310268000583164, + "step": 1141 + }, + { + "clip_ratio": 0.0003137970834359294, + "epoch": 1.5583340802485734, + "grad_norm": 0.03920648992061615, + "kl": 0.006829738616943359, + "learning_rate": 1.2863912446713084e-06, + "loss": 0.0098, + "step": 1142 + }, + { + "clip_ratio": 0.00032378236608110456, + "epoch": 1.5602461832631231, + "grad_norm": 0.03806397691369057, + "kl": 0.006905078887939453, + "learning_rate": 1.2809106580526636e-06, + "loss": 0.0098, + "step": 1143 + }, + { + "clip_ratio": 0.0003143088524097948, + "epoch": 1.5621582862776733, + "grad_norm": 0.03801356628537178, + "kl": 0.006966590881347656, + "learning_rate": 1.2754377472825153e-06, + "loss": 0.0097, + "step": 1144 + }, + { + "clip_ratio": 0.00035796050920566813, + "epoch": 1.564070389292223, + "grad_norm": 0.036964964121580124, + "kl": 0.006992816925048828, + "learning_rate": 1.2699725468203832e-06, + "loss": 0.0096, + "step": 1145 + }, + { + "clip_ratio": 0.0, + "completion_length": 538.6370244026184, + "epoch": 1.565982492306773, + "grad_norm": 0.045449208468198776, + "kl": 0.007224559783935547, + "learning_rate": 1.2645150910772413e-06, + "loss": 0.0043, + "num_tokens": 217697304.0, + "reward": 0.07393973600119352, + "reward_std": 0.08620888477889821, + "rewards/pure_accuracy_reward_math": 0.07393973361467943, + "step": 1146 + }, + { + "clip_ratio": 0.0003596847872131548, + "epoch": 1.5678945953213228, + "grad_norm": 0.03882161155343056, + "kl": 0.006949901580810547, + "learning_rate": 1.2590654144152992e-06, + "loss": 0.0043, + "step": 1147 + }, + { + "clip_ratio": 0.0004527134210547956, + "epoch": 1.569806698335873, + "grad_norm": 0.03764580935239792, + "kl": 0.00691986083984375, + "learning_rate": 1.2536235511477852e-06, + "loss": 0.0043, + "step": 1148 + }, + { + "clip_ratio": 0.0005161078099717997, + "epoch": 1.5717188013504226, + "grad_norm": 0.03833252564072609, + "kl": 0.006892681121826172, + "learning_rate": 1.2481895355387341e-06, + "loss": 0.0042, + "step": 1149 + }, + { + "clip_ratio": 0.0005320426059824968, + "epoch": 1.5736309043649728, + "grad_norm": 0.03876457363367081, + "kl": 0.006943702697753906, + "learning_rate": 1.2427634018027673e-06, + "loss": 0.0041, + "step": 1150 + }, + { + "clip_ratio": 0.0, + "completion_length": 529.9707288742065, + "epoch": 1.5755430073795225, + "grad_norm": 0.03937402740120888, + "kl": 0.007305145263671875, + "learning_rate": 1.2373451841048781e-06, + "loss": 0.0078, + "num_tokens": 221325451.0, + "reward": 0.08258928963914514, + "reward_std": 0.08058846154017374, + "rewards/pure_accuracy_reward_math": 0.08258928655413911, + "step": 1151 + }, + { + "clip_ratio": 0.0002857717965980555, + "epoch": 1.5774551103940726, + "grad_norm": 0.03863917291164398, + "kl": 0.007287502288818359, + "learning_rate": 1.2319349165602202e-06, + "loss": 0.0078, + "step": 1152 + }, + { + "clip_ratio": 0.0002796752659151025, + "epoch": 1.5793672134086223, + "grad_norm": 0.03722836822271347, + "kl": 0.007286548614501953, + "learning_rate": 1.2265326332338875e-06, + "loss": 0.0077, + "step": 1153 + }, + { + "clip_ratio": 0.00034041513032434523, + "epoch": 1.5812793164231724, + "grad_norm": 0.03688417002558708, + "kl": 0.007335662841796875, + "learning_rate": 1.2211383681407022e-06, + "loss": 0.0076, + "step": 1154 + }, + { + "clip_ratio": 0.0003595712430524145, + "epoch": 1.5831914194377221, + "grad_norm": 0.037124987691640854, + "kl": 0.007359981536865234, + "learning_rate": 1.2157521552450035e-06, + "loss": 0.0076, + "step": 1155 + }, + { + "clip_ratio": 0.0, + "completion_length": 540.098798751831, + "epoch": 1.5851035224522723, + "grad_norm": 0.03577388823032379, + "kl": 0.0069561004638671875, + "learning_rate": 1.210374028460428e-06, + "loss": 0.0065, + "num_tokens": 224996253.0, + "reward": 0.06863839607103728, + "reward_std": 0.07376563857542351, + "rewards/pure_accuracy_reward_math": 0.06863839426659979, + "step": 1156 + }, + { + "clip_ratio": 0.00025091522741149674, + "epoch": 1.587015625466822, + "grad_norm": 0.03386949375271797, + "kl": 0.006894588470458984, + "learning_rate": 1.2050040216497e-06, + "loss": 0.0065, + "step": 1157 + }, + { + "clip_ratio": 0.00029767470277874963, + "epoch": 1.588927728481372, + "grad_norm": 0.033231545239686966, + "kl": 0.0068531036376953125, + "learning_rate": 1.1996421686244179e-06, + "loss": 0.0064, + "step": 1158 + }, + { + "clip_ratio": 0.00030627386024661973, + "epoch": 1.5908398314959218, + "grad_norm": 0.0327543206512928, + "kl": 0.006781578063964844, + "learning_rate": 1.1942885031448397e-06, + "loss": 0.0064, + "step": 1159 + }, + { + "clip_ratio": 0.00032285955057886895, + "epoch": 1.5927519345104717, + "grad_norm": 0.03283894062042236, + "kl": 0.006725788116455078, + "learning_rate": 1.1889430589196727e-06, + "loss": 0.0063, + "step": 1160 + }, + { + "clip_ratio": 0.0, + "completion_length": 540.7405333518982, + "epoch": 1.5946640375250216, + "grad_norm": 0.04240734875202179, + "kl": 0.006897449493408203, + "learning_rate": 1.183605869605858e-06, + "loss": 0.0064, + "num_tokens": 228663991.0, + "reward": 0.08091518227593042, + "reward_std": 0.08951703325146809, + "rewards/pure_accuracy_reward_math": 0.08091518018045463, + "step": 1161 + }, + { + "clip_ratio": 0.00035278943187222467, + "epoch": 1.5965761405395715, + "grad_norm": 0.04050403833389282, + "kl": 0.006961345672607422, + "learning_rate": 1.1782769688083647e-06, + "loss": 0.0064, + "step": 1162 + }, + { + "clip_ratio": 0.00034535837551175064, + "epoch": 1.5984882435541214, + "grad_norm": 0.03872028365731239, + "kl": 0.007065296173095703, + "learning_rate": 1.1729563900799695e-06, + "loss": 0.0063, + "step": 1163 + }, + { + "clip_ratio": 0.00037939938943054585, + "epoch": 1.6004003465686714, + "grad_norm": 0.039447493851184845, + "kl": 0.007191181182861328, + "learning_rate": 1.1676441669210543e-06, + "loss": 0.0063, + "step": 1164 + }, + { + "clip_ratio": 0.00037003348657549395, + "epoch": 1.6023124495832213, + "grad_norm": 0.03724885359406471, + "kl": 0.0071163177490234375, + "learning_rate": 1.1623403327793881e-06, + "loss": 0.0061, + "step": 1165 + }, + { + "clip_ratio": 0.0, + "completion_length": 531.3211750984192, + "epoch": 1.6042245525977712, + "grad_norm": 0.9447879791259766, + "kl": 0.03227043151855469, + "learning_rate": 1.1570449210499213e-06, + "loss": 0.0085, + "num_tokens": 232302082.0, + "reward": 0.07756696781143546, + "reward_std": 0.0780110054765828, + "rewards/pure_accuracy_reward_math": 0.07756696577416733, + "step": 1166 + }, + { + "clip_ratio": 0.00036849399879201883, + "epoch": 1.606136655612321, + "grad_norm": 0.26742058992385864, + "kl": 0.011518478393554688, + "learning_rate": 1.1517579650745713e-06, + "loss": 0.0079, + "step": 1167 + }, + { + "clip_ratio": 0.00029733346730154153, + "epoch": 1.608048758626871, + "grad_norm": 0.3907225728034973, + "kl": 0.017581462860107422, + "learning_rate": 1.1464794981420187e-06, + "loss": 0.0079, + "step": 1168 + }, + { + "clip_ratio": 0.0003680569542439116, + "epoch": 1.609960861641421, + "grad_norm": 0.1778813600540161, + "kl": 0.010699748992919922, + "learning_rate": 1.1412095534874912e-06, + "loss": 0.0077, + "step": 1169 + }, + { + "clip_ratio": 0.0003726620370798628, + "epoch": 1.6118729646559709, + "grad_norm": 0.2035137563943863, + "kl": 0.01429891586303711, + "learning_rate": 1.135948164292557e-06, + "loss": 0.0077, + "step": 1170 + }, + { + "clip_ratio": 0.0, + "completion_length": 519.0362968444824, + "epoch": 1.6137850676705208, + "grad_norm": 0.040138401091098785, + "kl": 0.008060932159423828, + "learning_rate": 1.130695363684916e-06, + "loss": 0.0096, + "num_tokens": 235898380.0, + "reward": 0.0630580390279647, + "reward_std": 0.07195894001051784, + "rewards/pure_accuracy_reward_math": 0.06305803687428124, + "step": 1171 + }, + { + "clip_ratio": 0.0002708259837049809, + "epoch": 1.6156971706850707, + "grad_norm": 0.03859123960137367, + "kl": 0.008191585540771484, + "learning_rate": 1.1254511847381922e-06, + "loss": 0.0096, + "step": 1172 + }, + { + "clip_ratio": 0.00029455311903348047, + "epoch": 1.6176092736996206, + "grad_norm": 0.03898981586098671, + "kl": 0.008168697357177734, + "learning_rate": 1.1202156604717234e-06, + "loss": 0.0095, + "step": 1173 + }, + { + "clip_ratio": 0.0003440694692926627, + "epoch": 1.6195213767141705, + "grad_norm": 0.0370321087539196, + "kl": 0.00800466537475586, + "learning_rate": 1.1149888238503537e-06, + "loss": 0.0094, + "step": 1174 + }, + { + "clip_ratio": 0.00040963905792068545, + "epoch": 1.6214334797287204, + "grad_norm": 0.03698049858212471, + "kl": 0.007803440093994141, + "learning_rate": 1.109770707784229e-06, + "loss": 0.0094, + "step": 1175 + }, + { + "clip_ratio": 0.0, + "completion_length": 525.937527179718, + "epoch": 1.6233455827432703, + "grad_norm": 0.039002615958452225, + "kl": 0.007039546966552734, + "learning_rate": 1.1045613451285837e-06, + "loss": 0.0074, + "num_tokens": 239513448.0, + "reward": 0.06584821754950099, + "reward_std": 0.07595151895657182, + "rewards/pure_accuracy_reward_math": 0.06584821516298689, + "step": 1176 + }, + { + "clip_ratio": 0.0003209126220440339, + "epoch": 1.6252576857578203, + "grad_norm": 0.038693126291036606, + "kl": 0.0069637298583984375, + "learning_rate": 1.0993607686835408e-06, + "loss": 0.0074, + "step": 1177 + }, + { + "clip_ratio": 0.0003234959946212257, + "epoch": 1.62716978877237, + "grad_norm": 0.03805870935320854, + "kl": 0.006987094879150391, + "learning_rate": 1.0941690111939002e-06, + "loss": 0.0073, + "step": 1178 + }, + { + "clip_ratio": 0.0003316311403978034, + "epoch": 1.62908189178692, + "grad_norm": 0.03687576577067375, + "kl": 0.0070285797119140625, + "learning_rate": 1.0889861053489341e-06, + "loss": 0.0072, + "step": 1179 + }, + { + "clip_ratio": 0.00033663610071243966, + "epoch": 1.6309939948014698, + "grad_norm": 0.03717907890677452, + "kl": 0.007116794586181641, + "learning_rate": 1.0838120837821814e-06, + "loss": 0.0071, + "step": 1180 + }, + { + "clip_ratio": 0.0, + "completion_length": 514.2112393379211, + "epoch": 1.63290609781602, + "grad_norm": 0.04346395656466484, + "kl": 0.007472515106201172, + "learning_rate": 1.0786469790712441e-06, + "loss": 0.0059, + "num_tokens": 243092265.0, + "reward": 0.07700893233413808, + "reward_std": 0.07526089128805324, + "rewards/pure_accuracy_reward_math": 0.07700893029686995, + "step": 1181 + }, + { + "clip_ratio": 0.0002878125141592136, + "epoch": 1.6348182008305696, + "grad_norm": 0.03890342637896538, + "kl": 0.007323265075683594, + "learning_rate": 1.0734908237375783e-06, + "loss": 0.0059, + "step": 1182 + }, + { + "clip_ratio": 0.00031910790164602076, + "epoch": 1.6367303038451197, + "grad_norm": 0.03748926892876625, + "kl": 0.007243156433105469, + "learning_rate": 1.0683436502462915e-06, + "loss": 0.0058, + "step": 1183 + }, + { + "clip_ratio": 0.00036283263597169935, + "epoch": 1.6386424068596694, + "grad_norm": 0.037570755928754807, + "kl": 0.007138252258300781, + "learning_rate": 1.0632054910059391e-06, + "loss": 0.0058, + "step": 1184 + }, + { + "clip_ratio": 0.00039574184188495565, + "epoch": 1.6405545098742196, + "grad_norm": 0.038306284695863724, + "kl": 0.007193088531494141, + "learning_rate": 1.0580763783683187e-06, + "loss": 0.0057, + "step": 1185 + }, + { + "clip_ratio": 0.0, + "completion_length": 518.925525188446, + "epoch": 1.6424666128887693, + "grad_norm": 0.04251728951931, + "kl": 0.007372379302978516, + "learning_rate": 1.0529563446282665e-06, + "loss": 0.01, + "num_tokens": 246686482.0, + "reward": 0.08537946754950099, + "reward_std": 0.08939063869183883, + "rewards/pure_accuracy_reward_math": 0.08537946551223285, + "step": 1186 + }, + { + "clip_ratio": 0.0003136689152256622, + "epoch": 1.6443787159033194, + "grad_norm": 0.04087135195732117, + "kl": 0.007419109344482422, + "learning_rate": 1.0478454220234568e-06, + "loss": 0.0099, + "step": 1187 + }, + { + "clip_ratio": 0.0003467907941399062, + "epoch": 1.646290818917869, + "grad_norm": 0.039666056632995605, + "kl": 0.007442951202392578, + "learning_rate": 1.0427436427341939e-06, + "loss": 0.0099, + "step": 1188 + }, + { + "clip_ratio": 0.00038431568484043055, + "epoch": 1.6482029219324192, + "grad_norm": 0.0389142706990242, + "kl": 0.007426738739013672, + "learning_rate": 1.0376510388832147e-06, + "loss": 0.0098, + "step": 1189 + }, + { + "clip_ratio": 0.000490980125164242, + "epoch": 1.650115024946969, + "grad_norm": 0.03956843912601471, + "kl": 0.007406711578369141, + "learning_rate": 1.0325676425354828e-06, + "loss": 0.0097, + "step": 1190 + }, + { + "clip_ratio": 0.0, + "completion_length": 508.4835596084595, + "epoch": 1.652027127961519, + "grad_norm": 0.04898946359753609, + "kl": 0.008952617645263672, + "learning_rate": 1.0274934856979876e-06, + "loss": 0.0069, + "num_tokens": 250241299.0, + "reward": 0.07868303955183364, + "reward_std": 0.08381028211442754, + "rewards/pure_accuracy_reward_math": 0.07868303728173487, + "step": 1191 + }, + { + "clip_ratio": 0.0002854310730526777, + "epoch": 1.6539392309760688, + "grad_norm": 0.04304199293255806, + "kl": 0.008716106414794922, + "learning_rate": 1.0224286003195437e-06, + "loss": 0.0069, + "step": 1192 + }, + { + "clip_ratio": 0.00029722766299755676, + "epoch": 1.655851333990619, + "grad_norm": 0.039751190692186356, + "kl": 0.008554935455322266, + "learning_rate": 1.017373018290588e-06, + "loss": 0.0068, + "step": 1193 + }, + { + "clip_ratio": 0.00036785421832519205, + "epoch": 1.6577634370051686, + "grad_norm": 0.039316095411777496, + "kl": 0.00851297378540039, + "learning_rate": 1.0123267714429826e-06, + "loss": 0.0067, + "step": 1194 + }, + { + "clip_ratio": 0.0003976103018885624, + "epoch": 1.6596755400197185, + "grad_norm": 0.03880908712744713, + "kl": 0.008470535278320312, + "learning_rate": 1.0072898915498094e-06, + "loss": 0.0067, + "step": 1195 + }, + { + "clip_ratio": 0.0, + "completion_length": 514.2179379463196, + "epoch": 1.6615876430342684, + "grad_norm": 0.04073133319616318, + "kl": 0.0076427459716796875, + "learning_rate": 1.0022624103251727e-06, + "loss": 0.0095, + "num_tokens": 253820892.0, + "reward": 0.08593750416184776, + "reward_std": 0.08978221646975726, + "rewards/pure_accuracy_reward_math": 0.08593750165891834, + "step": 1196 + }, + { + "clip_ratio": 0.0003768215759691884, + "epoch": 1.6634997460488183, + "grad_norm": 0.039870597422122955, + "kl": 0.007634639739990234, + "learning_rate": 9.972443594239997e-07, + "loss": 0.0095, + "step": 1197 + }, + { + "clip_ratio": 0.00033531371116168884, + "epoch": 1.6654118490633683, + "grad_norm": 0.039165791124105453, + "kl": 0.007609367370605469, + "learning_rate": 9.922357704418394e-07, + "loss": 0.0094, + "step": 1198 + }, + { + "clip_ratio": 0.0003830786464504854, + "epoch": 1.6673239520779182, + "grad_norm": 0.0393473282456398, + "kl": 0.0076847076416015625, + "learning_rate": 9.872366749146684e-07, + "loss": 0.0094, + "step": 1199 + }, + { + "clip_ratio": 0.0003766370310813727, + "epoch": 1.669236055092468, + "grad_norm": 0.037378448992967606, + "kl": 0.007641792297363281, + "learning_rate": 9.822471043186846e-07, + "loss": 0.0093, + "step": 1200 + }, + { + "clip_ratio": 0.0, + "completion_length": 502.35381841659546, + "epoch": 1.671148158107018, + "grad_norm": 0.051170479506254196, + "kl": 0.008347511291503906, + "learning_rate": 9.772670900701172e-07, + "loss": 0.0074, + "num_tokens": 257360516.0, + "reward": 0.08537946784053929, + "reward_std": 0.09248606633627787, + "rewards/pure_accuracy_reward_math": 0.0853794660361018, + "step": 1201 + }, + { + "clip_ratio": 0.00036896339207714846, + "epoch": 1.673060261121568, + "grad_norm": 0.04540196433663368, + "kl": 0.008112430572509766, + "learning_rate": 9.722966635250222e-07, + "loss": 0.0074, + "step": 1202 + }, + { + "clip_ratio": 0.00040850058093155894, + "epoch": 1.6749723641361178, + "grad_norm": 0.0428830124437809, + "kl": 0.007869243621826172, + "learning_rate": 9.673358559790892e-07, + "loss": 0.0073, + "step": 1203 + }, + { + "clip_ratio": 0.0004735397765216476, + "epoch": 1.6768844671506677, + "grad_norm": 0.04445512220263481, + "kl": 0.007699012756347656, + "learning_rate": 9.623846986674417e-07, + "loss": 0.0072, + "step": 1204 + }, + { + "clip_ratio": 0.00047387216932293086, + "epoch": 1.6787965701652177, + "grad_norm": 0.04317403957247734, + "kl": 0.0076007843017578125, + "learning_rate": 9.574432227644432e-07, + "loss": 0.0071, + "step": 1205 + }, + { + "clip_ratio": 0.0, + "completion_length": 511.88367557525635, + "epoch": 1.6807086731797676, + "grad_norm": 0.041338611394166946, + "kl": 0.007639884948730469, + "learning_rate": 9.525114593834975e-07, + "loss": 0.0077, + "num_tokens": 260924667.0, + "reward": 0.07617187869618647, + "reward_std": 0.08037573983892798, + "rewards/pure_accuracy_reward_math": 0.0761718759604264, + "step": 1206 + }, + { + "clip_ratio": 0.00029646307336861355, + "epoch": 1.6826207761943175, + "grad_norm": 0.040457833558321, + "kl": 0.007670402526855469, + "learning_rate": 9.475894395768579e-07, + "loss": 0.0077, + "step": 1207 + }, + { + "clip_ratio": 0.0003306309376966965, + "epoch": 1.6845328792088674, + "grad_norm": 0.03946809470653534, + "kl": 0.0076751708984375, + "learning_rate": 9.426771943354249e-07, + "loss": 0.0076, + "step": 1208 + }, + { + "clip_ratio": 0.0003582578942200598, + "epoch": 1.6864449822234173, + "grad_norm": 0.04006471857428551, + "kl": 0.007700443267822266, + "learning_rate": 9.377747545885569e-07, + "loss": 0.0075, + "step": 1209 + }, + { + "clip_ratio": 0.00040392828321955676, + "epoch": 1.6883570852379672, + "grad_norm": 0.04037889465689659, + "kl": 0.007681369781494141, + "learning_rate": 9.328821512038716e-07, + "loss": 0.0074, + "step": 1210 + }, + { + "clip_ratio": 0.0, + "completion_length": 533.6010298728943, + "epoch": 1.6902691882525172, + "grad_norm": 0.03628333657979965, + "kl": 0.006788730621337891, + "learning_rate": 9.279994149870539e-07, + "loss": 0.0073, + "num_tokens": 264564517.0, + "reward": 0.06110491382423788, + "reward_std": 0.06693661888130009, + "rewards/pure_accuracy_reward_math": 0.06110491219442338, + "step": 1211 + }, + { + "clip_ratio": 0.0002594580842014693, + "epoch": 1.692181291267067, + "grad_norm": 0.034194085747003555, + "kl": 0.006678581237792969, + "learning_rate": 9.231265766816619e-07, + "loss": 0.0073, + "step": 1212 + }, + { + "clip_ratio": 0.0003170226998463477, + "epoch": 1.6940933942816168, + "grad_norm": 0.035113800317049026, + "kl": 0.006625652313232422, + "learning_rate": 9.182636669689335e-07, + "loss": 0.0073, + "step": 1213 + }, + { + "clip_ratio": 0.0003448430217076748, + "epoch": 1.696005497296167, + "grad_norm": 0.03626548498868942, + "kl": 0.006573200225830078, + "learning_rate": 9.134107164675898e-07, + "loss": 0.0072, + "step": 1214 + }, + { + "clip_ratio": 0.00033195262278695736, + "epoch": 1.6979176003107166, + "grad_norm": 0.03465663269162178, + "kl": 0.006582736968994141, + "learning_rate": 9.085677557336465e-07, + "loss": 0.0071, + "step": 1215 + }, + { + "clip_ratio": 0.0, + "completion_length": 527.8440546989441, + "epoch": 1.6998297033252667, + "grad_norm": 0.038788389414548874, + "kl": 0.009612560272216797, + "learning_rate": 9.037348152602199e-07, + "loss": 0.0052, + "num_tokens": 268179390.0, + "reward": 0.07756696798605844, + "reward_std": 0.0852254037745297, + "rewards/pure_accuracy_reward_math": 0.07756696571595967, + "step": 1216 + }, + { + "clip_ratio": 0.00027092215094626226, + "epoch": 1.7017418063398164, + "grad_norm": 0.038229282945394516, + "kl": 0.009754657745361328, + "learning_rate": 8.989119254773343e-07, + "loss": 0.0052, + "step": 1217 + }, + { + "clip_ratio": 0.00027246196253827293, + "epoch": 1.7036539093543666, + "grad_norm": 0.03782220929861069, + "kl": 0.009780406951904297, + "learning_rate": 8.940991167517313e-07, + "loss": 0.0051, + "step": 1218 + }, + { + "clip_ratio": 0.0003069629718197575, + "epoch": 1.7055660123689163, + "grad_norm": 0.03707100450992584, + "kl": 0.00977468490600586, + "learning_rate": 8.892964193866799e-07, + "loss": 0.005, + "step": 1219 + }, + { + "clip_ratio": 0.0003035257008150438, + "epoch": 1.7074781153834664, + "grad_norm": 0.03552490472793579, + "kl": 0.009665966033935547, + "learning_rate": 8.845038636217818e-07, + "loss": 0.0049, + "step": 1220 + }, + { + "clip_ratio": 0.0, + "completion_length": 529.9601240158081, + "epoch": 1.709390218398016, + "grad_norm": 0.04051567241549492, + "kl": 0.007312297821044922, + "learning_rate": 8.797214796327843e-07, + "loss": 0.0079, + "num_tokens": 271808667.0, + "reward": 0.08733259368455037, + "reward_std": 0.08496641932288185, + "rewards/pure_accuracy_reward_math": 0.0873325903667137, + "step": 1221 + }, + { + "clip_ratio": 0.00033132852740891394, + "epoch": 1.7113023214125662, + "grad_norm": 0.03887411206960678, + "kl": 0.007235527038574219, + "learning_rate": 8.749492975313897e-07, + "loss": 0.0079, + "step": 1222 + }, + { + "clip_ratio": 0.0003587238066984355, + "epoch": 1.713214424427116, + "grad_norm": 0.04010055959224701, + "kl": 0.007251739501953125, + "learning_rate": 8.701873473650643e-07, + "loss": 0.0079, + "step": 1223 + }, + { + "clip_ratio": 0.0003504625653079074, + "epoch": 1.715126527441666, + "grad_norm": 0.039550576359033585, + "kl": 0.007262229919433594, + "learning_rate": 8.654356591168522e-07, + "loss": 0.0078, + "step": 1224 + }, + { + "clip_ratio": 0.0003497420942721874, + "epoch": 1.7170386304562157, + "grad_norm": 0.03883340209722519, + "kl": 0.007348537445068359, + "learning_rate": 8.60694262705182e-07, + "loss": 0.0077, + "step": 1225 + }, + { + "clip_ratio": 0.0, + "completion_length": 530.5396447181702, + "epoch": 1.7189507334707659, + "grad_norm": 0.037610165774822235, + "kl": 0.007049083709716797, + "learning_rate": 8.559631879836838e-07, + "loss": 0.0065, + "num_tokens": 275440789.0, + "reward": 0.07896205675206147, + "reward_std": 0.07938606152310967, + "rewards/pure_accuracy_reward_math": 0.07896205494762398, + "step": 1226 + }, + { + "clip_ratio": 0.0002787316387298233, + "epoch": 1.7208628364853156, + "grad_norm": 0.03763109818100929, + "kl": 0.007136821746826172, + "learning_rate": 8.512424647409964e-07, + "loss": 0.0065, + "step": 1227 + }, + { + "clip_ratio": 0.0003178273858566172, + "epoch": 1.7227749394998657, + "grad_norm": 0.037824735045433044, + "kl": 0.007121562957763672, + "learning_rate": 8.465321227005823e-07, + "loss": 0.0065, + "step": 1228 + }, + { + "clip_ratio": 0.0002866029928725311, + "epoch": 1.7246870425144154, + "grad_norm": 0.03616493567824364, + "kl": 0.00708770751953125, + "learning_rate": 8.418321915205399e-07, + "loss": 0.0064, + "step": 1229 + }, + { + "clip_ratio": 0.00031164622902224437, + "epoch": 1.7265991455289653, + "grad_norm": 0.03562076762318611, + "kl": 0.007038593292236328, + "learning_rate": 8.371427007934174e-07, + "loss": 0.0063, + "step": 1230 + }, + { + "clip_ratio": 0.0, + "completion_length": 536.3178272247314, + "epoch": 1.7285112485435152, + "grad_norm": 0.03759186714887619, + "kl": 0.006800651550292969, + "learning_rate": 8.324636800460242e-07, + "loss": 0.0071, + "num_tokens": 279097568.0, + "reward": 0.07728794903960079, + "reward_std": 0.07732657541055232, + "rewards/pure_accuracy_reward_math": 0.07728794822469354, + "step": 1231 + }, + { + "clip_ratio": 0.00028705537579298834, + "epoch": 1.7304233515580651, + "grad_norm": 0.036786679178476334, + "kl": 0.006786346435546875, + "learning_rate": 8.277951587392505e-07, + "loss": 0.0071, + "step": 1232 + }, + { + "clip_ratio": 0.000303516245821811, + "epoch": 1.732335454572615, + "grad_norm": 0.03563455864787102, + "kl": 0.0068149566650390625, + "learning_rate": 8.231371662678741e-07, + "loss": 0.0071, + "step": 1233 + }, + { + "clip_ratio": 0.0003096325264095867, + "epoch": 1.734247557587165, + "grad_norm": 0.03413652628660202, + "kl": 0.006861209869384766, + "learning_rate": 8.184897319603813e-07, + "loss": 0.007, + "step": 1234 + }, + { + "clip_ratio": 0.0003550405467649398, + "epoch": 1.736159660601715, + "grad_norm": 0.03433661162853241, + "kl": 0.006935596466064453, + "learning_rate": 8.138528850787792e-07, + "loss": 0.0069, + "step": 1235 + }, + { + "clip_ratio": 0.0, + "completion_length": 516.8069453239441, + "epoch": 1.7380717636162648, + "grad_norm": 0.2546544671058655, + "kl": 0.012326240539550781, + "learning_rate": 8.092266548184139e-07, + "loss": 0.011, + "num_tokens": 282683384.0, + "reward": 0.07477678873692639, + "reward_std": 0.08165826951153576, + "rewards/pure_accuracy_reward_math": 0.07477678751456551, + "step": 1236 + }, + { + "clip_ratio": 0.00030172572752462656, + "epoch": 1.7399838666308147, + "grad_norm": 0.042716413736343384, + "kl": 0.0078887939453125, + "learning_rate": 8.046110703077839e-07, + "loss": 0.0108, + "step": 1237 + }, + { + "clip_ratio": 0.00029401268267292835, + "epoch": 1.7418959696453646, + "grad_norm": 0.038783252239227295, + "kl": 0.007707118988037109, + "learning_rate": 8.000061606083579e-07, + "loss": 0.0107, + "step": 1238 + }, + { + "clip_ratio": 0.00028625389199987694, + "epoch": 1.7438080726599146, + "grad_norm": 0.0381159707903862, + "kl": 0.007790088653564453, + "learning_rate": 7.954119547143935e-07, + "loss": 0.0107, + "step": 1239 + }, + { + "clip_ratio": 0.00034677153644224745, + "epoch": 1.7457201756744645, + "grad_norm": 0.038590554147958755, + "kl": 0.007785797119140625, + "learning_rate": 7.90828481552752e-07, + "loss": 0.0106, + "step": 1240 + }, + { + "clip_ratio": 0.0, + "completion_length": 517.8047132492065, + "epoch": 1.7476322786890144, + "grad_norm": 0.03943649306893349, + "kl": 0.007458209991455078, + "learning_rate": 7.862557699827167e-07, + "loss": 0.0092, + "num_tokens": 286269120.0, + "reward": 0.06640625282307155, + "reward_std": 0.07607791275950149, + "rewards/pure_accuracy_reward_math": 0.06640625130967237, + "step": 1241 + }, + { + "clip_ratio": 0.00031282668544463377, + "epoch": 1.7495443817035643, + "grad_norm": 0.0388050340116024, + "kl": 0.007348060607910156, + "learning_rate": 7.816938487958131e-07, + "loss": 0.0092, + "step": 1242 + }, + { + "clip_ratio": 0.0003194147345197962, + "epoch": 1.7514564847181142, + "grad_norm": 0.038322921842336655, + "kl": 0.007298946380615234, + "learning_rate": 7.771427467156256e-07, + "loss": 0.0091, + "step": 1243 + }, + { + "clip_ratio": 0.0003203335651846828, + "epoch": 1.7533685877326641, + "grad_norm": 0.037499312311410904, + "kl": 0.007254600524902344, + "learning_rate": 7.726024923976169e-07, + "loss": 0.009, + "step": 1244 + }, + { + "clip_ratio": 0.00032696440513291236, + "epoch": 1.755280690747214, + "grad_norm": 0.03671669587492943, + "kl": 0.007252693176269531, + "learning_rate": 7.680731144289505e-07, + "loss": 0.009, + "step": 1245 + }, + { + "clip_ratio": 0.0, + "completion_length": 514.8644180297852, + "epoch": 1.757192793761764, + "grad_norm": 0.04826434701681137, + "kl": 0.0094451904296875, + "learning_rate": 7.635546413283054e-07, + "loss": 0.0078, + "num_tokens": 289848950.0, + "reward": 0.07421875323052518, + "reward_std": 0.07818366138963029, + "rewards/pure_accuracy_reward_math": 0.074218751717126, + "step": 1246 + }, + { + "clip_ratio": 0.000299703156713349, + "epoch": 1.7591048967763139, + "grad_norm": 0.03791136294603348, + "kl": 0.009324073791503906, + "learning_rate": 7.590471015457002e-07, + "loss": 0.0077, + "step": 1247 + }, + { + "clip_ratio": 0.00030542989918558305, + "epoch": 1.7610169997908636, + "grad_norm": 0.03703403100371361, + "kl": 0.009335517883300781, + "learning_rate": 7.545505234623152e-07, + "loss": 0.0077, + "step": 1248 + }, + { + "clip_ratio": 0.0002983629839832247, + "epoch": 1.7629291028054137, + "grad_norm": 0.0363752581179142, + "kl": 0.009361743927001953, + "learning_rate": 7.500649353903092e-07, + "loss": 0.0076, + "step": 1249 + }, + { + "clip_ratio": 0.0002923785563098136, + "epoch": 1.7648412058199634, + "grad_norm": 0.03587965667247772, + "kl": 0.009373664855957031, + "learning_rate": 7.455903655726437e-07, + "loss": 0.0075, + "step": 1250 + }, + { + "clip_ratio": 0.0, + "completion_length": 510.6543188095093, + "epoch": 1.7667533088345135, + "grad_norm": 0.03651593253016472, + "kl": 0.008678436279296875, + "learning_rate": 7.411268421829076e-07, + "loss": 0.0059, + "num_tokens": 293408275.0, + "reward": 0.07031250264844857, + "reward_std": 0.07401842658873647, + "rewards/pure_accuracy_reward_math": 0.07031250160071068, + "step": 1251 + }, + { + "clip_ratio": 0.000244510552590782, + "epoch": 1.7686654118490632, + "grad_norm": 0.03525623679161072, + "kl": 0.008609294891357422, + "learning_rate": 7.366743933251349e-07, + "loss": 0.0059, + "step": 1252 + }, + { + "clip_ratio": 0.000242228649824483, + "epoch": 1.7705775148636134, + "grad_norm": 0.035115260630846024, + "kl": 0.008548259735107422, + "learning_rate": 7.322330470336314e-07, + "loss": 0.0058, + "step": 1253 + }, + { + "clip_ratio": 0.0002641637478291159, + "epoch": 1.772489617878163, + "grad_norm": 0.03518166393041611, + "kl": 0.008442401885986328, + "learning_rate": 7.278028312727961e-07, + "loss": 0.0058, + "step": 1254 + }, + { + "clip_ratio": 0.0002555919315909705, + "epoch": 1.7744017208927132, + "grad_norm": 0.03385892137885094, + "kl": 0.00841379165649414, + "learning_rate": 7.233837739369462e-07, + "loss": 0.0057, + "step": 1255 + }, + { + "clip_ratio": 0.0, + "completion_length": 513.7271451950073, + "epoch": 1.776313823907263, + "grad_norm": 0.03341628611087799, + "kl": 0.006855964660644531, + "learning_rate": 7.189759028501417e-07, + "loss": 0.0062, + "num_tokens": 296984393.0, + "reward": 0.06556919915601611, + "reward_std": 0.06311669771093875, + "rewards/pure_accuracy_reward_math": 0.06556919775903225, + "step": 1256 + }, + { + "clip_ratio": 0.0002122660096688378, + "epoch": 1.778225926921813, + "grad_norm": 0.03227659687399864, + "kl": 0.006803989410400391, + "learning_rate": 7.145792457660083e-07, + "loss": 0.0062, + "step": 1257 + }, + { + "clip_ratio": 0.00023682935608348998, + "epoch": 1.7801380299363627, + "grad_norm": 0.03206360712647438, + "kl": 0.006758213043212891, + "learning_rate": 7.101938303675674e-07, + "loss": 0.0062, + "step": 1258 + }, + { + "clip_ratio": 0.0002413284565250251, + "epoch": 1.7820501329509129, + "grad_norm": 0.031279318034648895, + "kl": 0.006762981414794922, + "learning_rate": 7.058196842670548e-07, + "loss": 0.0061, + "step": 1259 + }, + { + "clip_ratio": 0.0002680151189338176, + "epoch": 1.7839622359654626, + "grad_norm": 0.031049314886331558, + "kl": 0.006676197052001953, + "learning_rate": 7.014568350057516e-07, + "loss": 0.0061, + "step": 1260 + }, + { + "clip_ratio": 0.0, + "completion_length": 532.2553224563599, + "epoch": 1.7858743389800127, + "grad_norm": 0.03635333850979805, + "kl": 0.007339000701904297, + "learning_rate": 6.971053100538116e-07, + "loss": 0.0066, + "num_tokens": 300622928.0, + "reward": 0.0711495568684768, + "reward_std": 0.07668221119092777, + "rewards/pure_accuracy_reward_math": 0.07114955512224697, + "step": 1261 + }, + { + "clip_ratio": 0.00025942773436327116, + "epoch": 1.7877864419945624, + "grad_norm": 0.03595859929919243, + "kl": 0.007373332977294922, + "learning_rate": 6.927651368100843e-07, + "loss": 0.0065, + "step": 1262 + }, + { + "clip_ratio": 0.00026420129074722354, + "epoch": 1.7896985450091125, + "grad_norm": 0.034778136759996414, + "kl": 0.00739288330078125, + "learning_rate": 6.884363426019444e-07, + "loss": 0.0065, + "step": 1263 + }, + { + "clip_ratio": 0.0002875854173112202, + "epoch": 1.7916106480236622, + "grad_norm": 0.035560280084609985, + "kl": 0.007449150085449219, + "learning_rate": 6.841189546851224e-07, + "loss": 0.0064, + "step": 1264 + }, + { + "clip_ratio": 0.00026737677507071567, + "epoch": 1.7935227510382123, + "grad_norm": 0.03407442197203636, + "kl": 0.007452964782714844, + "learning_rate": 6.79813000243528e-07, + "loss": 0.0064, + "step": 1265 + }, + { + "clip_ratio": 0.0, + "completion_length": 523.543550491333, + "epoch": 1.795434854052762, + "grad_norm": 0.03908964619040489, + "kl": 0.008809566497802734, + "learning_rate": 6.755185063890818e-07, + "loss": 0.0074, + "num_tokens": 304236988.0, + "reward": 0.0747767890279647, + "reward_std": 0.07865536957979202, + "rewards/pure_accuracy_reward_math": 0.07477678745635785, + "step": 1266 + }, + { + "clip_ratio": 0.0002752643416670253, + "epoch": 1.797346957067312, + "grad_norm": 0.0380408875644207, + "kl": 0.00884389877319336, + "learning_rate": 6.71235500161545e-07, + "loss": 0.0074, + "step": 1267 + }, + { + "clip_ratio": 0.0002959408872698077, + "epoch": 1.7992590600818619, + "grad_norm": 0.03713267296552658, + "kl": 0.008931636810302734, + "learning_rate": 6.669640085283479e-07, + "loss": 0.0073, + "step": 1268 + }, + { + "clip_ratio": 0.0003134474755484007, + "epoch": 1.8011711630964118, + "grad_norm": 0.03684492036700249, + "kl": 0.008975982666015625, + "learning_rate": 6.627040583844199e-07, + "loss": 0.0073, + "step": 1269 + }, + { + "clip_ratio": 0.0003336208075666036, + "epoch": 1.8030832661109617, + "grad_norm": 0.0364052951335907, + "kl": 0.009007453918457031, + "learning_rate": 6.584556765520231e-07, + "loss": 0.0072, + "step": 1270 + }, + { + "clip_ratio": 0.0, + "completion_length": 532.5468997955322, + "epoch": 1.8049953691255116, + "grad_norm": 0.03688374161720276, + "kl": 0.006972789764404297, + "learning_rate": 6.542188897805782e-07, + "loss": 0.0076, + "num_tokens": 307881200.0, + "reward": 0.06082589610014111, + "reward_std": 0.06925509008578956, + "rewards/pure_accuracy_reward_math": 0.06082589423749596, + "step": 1271 + }, + { + "clip_ratio": 0.0002535940801635661, + "epoch": 1.8069074721400615, + "grad_norm": 0.03543318435549736, + "kl": 0.006913661956787109, + "learning_rate": 6.499937247465002e-07, + "loss": 0.0076, + "step": 1272 + }, + { + "clip_ratio": 0.00029529011806062044, + "epoch": 1.8088195751546114, + "grad_norm": 0.034321434795856476, + "kl": 0.006764411926269531, + "learning_rate": 6.457802080530304e-07, + "loss": 0.0075, + "step": 1273 + }, + { + "clip_ratio": 0.00032198404306882367, + "epoch": 1.8107316781691614, + "grad_norm": 0.03342648968100548, + "kl": 0.006732940673828125, + "learning_rate": 6.415783662300662e-07, + "loss": 0.0075, + "step": 1274 + }, + { + "clip_ratio": 0.000381207836142039, + "epoch": 1.8126437811837113, + "grad_norm": 0.034588467329740524, + "kl": 0.006687164306640625, + "learning_rate": 6.373882257339964e-07, + "loss": 0.0074, + "step": 1275 + }, + { + "clip_ratio": 0.0, + "completion_length": 528.7452836036682, + "epoch": 1.8145558841982612, + "grad_norm": 0.039650533348321915, + "kl": 0.012791156768798828, + "learning_rate": 6.33209812947532e-07, + "loss": 0.0068, + "num_tokens": 311509399.0, + "reward": 0.06919643239234574, + "reward_std": 0.07131457643117756, + "rewards/pure_accuracy_reward_math": 0.06919642988941632, + "step": 1276 + }, + { + "clip_ratio": 0.00028128568749252736, + "epoch": 1.816467987212811, + "grad_norm": 0.039305564016103745, + "kl": 0.012639522552490234, + "learning_rate": 6.290431541795456e-07, + "loss": 0.0068, + "step": 1277 + }, + { + "clip_ratio": 0.00027201296376233586, + "epoch": 1.818380090227361, + "grad_norm": 0.038404785096645355, + "kl": 0.012586116790771484, + "learning_rate": 6.248882756648988e-07, + "loss": 0.0067, + "step": 1278 + }, + { + "clip_ratio": 0.00027703067632955936, + "epoch": 1.820292193241911, + "grad_norm": 0.037614692002534866, + "kl": 0.01236581802368164, + "learning_rate": 6.207452035642814e-07, + "loss": 0.0066, + "step": 1279 + }, + { + "clip_ratio": 0.000309511864088563, + "epoch": 1.8222042962564609, + "grad_norm": 0.03737355023622513, + "kl": 0.012206554412841797, + "learning_rate": 6.166139639640454e-07, + "loss": 0.0065, + "step": 1280 + }, + { + "clip_ratio": 0.0, + "completion_length": 526.473795413971, + "epoch": 1.8241163992710108, + "grad_norm": 0.03713076934218407, + "kl": 0.007002353668212891, + "learning_rate": 6.124945828760406e-07, + "loss": 0.0059, + "num_tokens": 315129533.0, + "reward": 0.06445312840514816, + "reward_std": 0.06921502435579896, + "rewards/pure_accuracy_reward_math": 0.0644531259604264, + "step": 1281 + }, + { + "clip_ratio": 0.00024346445911760384, + "epoch": 1.8260285022855607, + "grad_norm": 0.03588669002056122, + "kl": 0.006989955902099609, + "learning_rate": 6.083870862374513e-07, + "loss": 0.0059, + "step": 1282 + }, + { + "clip_ratio": 0.0002329723478737833, + "epoch": 1.8279406053001104, + "grad_norm": 0.03526683151721954, + "kl": 0.007010459899902344, + "learning_rate": 6.042914999106342e-07, + "loss": 0.0058, + "step": 1283 + }, + { + "clip_ratio": 0.00023291378442991117, + "epoch": 1.8298527083146605, + "grad_norm": 0.03384559601545334, + "kl": 0.007075786590576172, + "learning_rate": 6.002078496829514e-07, + "loss": 0.0058, + "step": 1284 + }, + { + "clip_ratio": 0.0002458733478647446, + "epoch": 1.8317648113292102, + "grad_norm": 0.03377237543463707, + "kl": 0.0071315765380859375, + "learning_rate": 5.961361612666139e-07, + "loss": 0.0057, + "step": 1285 + }, + { + "clip_ratio": 0.0, + "completion_length": 525.0859618186951, + "epoch": 1.8336769143437603, + "grad_norm": 0.0914173573255539, + "kl": 0.012554645538330078, + "learning_rate": 5.920764602985141e-07, + "loss": 0.0058, + "num_tokens": 318747025.0, + "reward": 0.06612723506987095, + "reward_std": 0.06865079142153263, + "rewards/pure_accuracy_reward_math": 0.06612723355647177, + "step": 1286 + }, + { + "clip_ratio": 0.00025586230526641884, + "epoch": 1.83558901735831, + "grad_norm": 0.04225718230009079, + "kl": 0.010876655578613281, + "learning_rate": 5.88028772340068e-07, + "loss": 0.0057, + "step": 1287 + }, + { + "clip_ratio": 0.00024814432106268214, + "epoch": 1.8375011203728602, + "grad_norm": 0.03636258468031883, + "kl": 0.010531425476074219, + "learning_rate": 5.839931228770526e-07, + "loss": 0.0057, + "step": 1288 + }, + { + "clip_ratio": 0.0002984523198108491, + "epoch": 1.8394132233874099, + "grad_norm": 0.03610241040587425, + "kl": 0.010416984558105469, + "learning_rate": 5.799695373194461e-07, + "loss": 0.0056, + "step": 1289 + }, + { + "clip_ratio": 0.00032527196299270145, + "epoch": 1.84132532640196, + "grad_norm": 0.034912850707769394, + "kl": 0.010428428649902344, + "learning_rate": 5.759580410012691e-07, + "loss": 0.0055, + "step": 1290 + }, + { + "clip_ratio": 0.0, + "completion_length": 520.4793767929077, + "epoch": 1.8432374294165097, + "grad_norm": 0.04220513626933098, + "kl": 0.009058475494384766, + "learning_rate": 5.719586591804222e-07, + "loss": 0.0071, + "num_tokens": 322345307.0, + "reward": 0.07366071786964312, + "reward_std": 0.07878176297526807, + "rewards/pure_accuracy_reward_math": 0.07366071542492136, + "step": 1291 + }, + { + "clip_ratio": 0.00030183524040694465, + "epoch": 1.8451495324310598, + "grad_norm": 0.03849344700574875, + "kl": 0.009106636047363281, + "learning_rate": 5.679714170385283e-07, + "loss": 0.0071, + "step": 1292 + }, + { + "clip_ratio": 0.00035880112773156725, + "epoch": 1.8470616354456095, + "grad_norm": 0.037096235901117325, + "kl": 0.009167194366455078, + "learning_rate": 5.63996339680776e-07, + "loss": 0.0071, + "step": 1293 + }, + { + "clip_ratio": 0.00040293739141361584, + "epoch": 1.8489737384601597, + "grad_norm": 0.03884498402476311, + "kl": 0.009192943572998047, + "learning_rate": 5.600334521357581e-07, + "loss": 0.007, + "step": 1294 + }, + { + "clip_ratio": 0.00038201194092835067, + "epoch": 1.8508858414747094, + "grad_norm": 0.03875093162059784, + "kl": 0.009291648864746094, + "learning_rate": 5.560827793553159e-07, + "loss": 0.0069, + "step": 1295 + }, + { + "clip_ratio": 0.0, + "completion_length": 518.3301024436951, + "epoch": 1.8527979444892595, + "grad_norm": 0.04254430532455444, + "kl": 0.008441925048828125, + "learning_rate": 5.52144346214383e-07, + "loss": 0.0063, + "num_tokens": 325938766.0, + "reward": 0.07840402127476409, + "reward_std": 0.08084744628285989, + "rewards/pure_accuracy_reward_math": 0.07840401929570362, + "step": 1296 + }, + { + "clip_ratio": 0.0002986583057804637, + "epoch": 1.8547100475038092, + "grad_norm": 0.041676584631204605, + "kl": 0.008450508117675781, + "learning_rate": 5.482181775108278e-07, + "loss": 0.0062, + "step": 1297 + }, + { + "clip_ratio": 0.00031948441494478175, + "epoch": 1.8566221505183593, + "grad_norm": 0.03955300524830818, + "kl": 0.008507251739501953, + "learning_rate": 5.443042979652957e-07, + "loss": 0.0062, + "step": 1298 + }, + { + "clip_ratio": 0.0003085145480667961, + "epoch": 1.858534253532909, + "grad_norm": 0.03848061338067055, + "kl": 0.008501052856445312, + "learning_rate": 5.404027322210556e-07, + "loss": 0.0061, + "step": 1299 + }, + { + "clip_ratio": 0.0003855731235944404, + "epoch": 1.8604463565474592, + "grad_norm": 0.04076399654150009, + "kl": 0.00849771499633789, + "learning_rate": 5.365135048438438e-07, + "loss": 0.006, + "step": 1300 + }, + { + "clip_ratio": 0.0, + "completion_length": 529.5170464515686, + "epoch": 1.8623584595620088, + "grad_norm": 0.14906181395053864, + "kl": 0.007767677307128906, + "learning_rate": 5.326366403217093e-07, + "loss": 0.0084, + "num_tokens": 329571311.0, + "reward": 0.07254464630386792, + "reward_std": 0.08418946416350082, + "rewards/pure_accuracy_reward_math": 0.07254464438301511, + "step": 1301 + }, + { + "clip_ratio": 0.00028383656763253384, + "epoch": 1.8642705625765588, + "grad_norm": 0.04550671949982643, + "kl": 0.008212089538574219, + "learning_rate": 5.287721630648615e-07, + "loss": 0.0083, + "step": 1302 + }, + { + "clip_ratio": 0.0003281467976989916, + "epoch": 1.8661826655911087, + "grad_norm": 0.05260877683758736, + "kl": 0.008829593658447266, + "learning_rate": 5.249200974055132e-07, + "loss": 0.0083, + "step": 1303 + }, + { + "clip_ratio": 0.00036754867960553383, + "epoch": 1.8680947686056586, + "grad_norm": 0.0511869452893734, + "kl": 0.008836746215820312, + "learning_rate": 5.210804675977299e-07, + "loss": 0.0082, + "step": 1304 + }, + { + "clip_ratio": 0.0004018283953541868, + "epoch": 1.8700068716202085, + "grad_norm": 0.044321924448013306, + "kl": 0.008379459381103516, + "learning_rate": 5.172532978172753e-07, + "loss": 0.0081, + "step": 1305 + }, + { + "clip_ratio": 0.0, + "completion_length": 512.9788198471069, + "epoch": 1.8719189746347584, + "grad_norm": 0.04202428087592125, + "kl": 0.0076198577880859375, + "learning_rate": 5.134386121614615e-07, + "loss": 0.0072, + "num_tokens": 333143795.0, + "reward": 0.07421875317231752, + "reward_std": 0.07986396714113653, + "rewards/pure_accuracy_reward_math": 0.074218751717126, + "step": 1306 + }, + { + "clip_ratio": 0.00027569573836672134, + "epoch": 1.8738310776493083, + "grad_norm": 0.040443304926157, + "kl": 0.007631778717041016, + "learning_rate": 5.096364346489935e-07, + "loss": 0.0072, + "step": 1307 + }, + { + "clip_ratio": 0.00027392168607320855, + "epoch": 1.8757431806638583, + "grad_norm": 0.040238041430711746, + "kl": 0.007664203643798828, + "learning_rate": 5.058467892198241e-07, + "loss": 0.0071, + "step": 1308 + }, + { + "clip_ratio": 0.0003170029604007141, + "epoch": 1.8776552836784082, + "grad_norm": 0.039109617471694946, + "kl": 0.007664203643798828, + "learning_rate": 5.02069699734995e-07, + "loss": 0.007, + "step": 1309 + }, + { + "clip_ratio": 0.0003183572773082233, + "epoch": 1.879567386692958, + "grad_norm": 0.03724955767393112, + "kl": 0.007700443267822266, + "learning_rate": 4.983051899764946e-07, + "loss": 0.007, + "step": 1310 + }, + { + "clip_ratio": 0.0, + "completion_length": 505.4592852592468, + "epoch": 1.881479489707508, + "grad_norm": 0.03964386135339737, + "kl": 0.007820606231689453, + "learning_rate": 4.945532836471026e-07, + "loss": 0.0074, + "num_tokens": 336685165.0, + "reward": 0.0848214327415917, + "reward_std": 0.07835631881607696, + "rewards/pure_accuracy_reward_math": 0.08482142965658568, + "step": 1311 + }, + { + "clip_ratio": 0.0002873320136700386, + "epoch": 1.883391592722058, + "grad_norm": 0.03871289640665054, + "kl": 0.007764339447021484, + "learning_rate": 4.908140043702426e-07, + "loss": 0.0074, + "step": 1312 + }, + { + "clip_ratio": 0.0003113469839775007, + "epoch": 1.8853036957366078, + "grad_norm": 0.03769771382212639, + "kl": 0.007766246795654297, + "learning_rate": 4.870873756898345e-07, + "loss": 0.0074, + "step": 1313 + }, + { + "clip_ratio": 0.00034381698696961394, + "epoch": 1.8872157987511577, + "grad_norm": 0.03724011033773422, + "kl": 0.007775783538818359, + "learning_rate": 4.833734210701435e-07, + "loss": 0.0073, + "step": 1314 + }, + { + "clip_ratio": 0.0003651243675335536, + "epoch": 1.8891279017657077, + "grad_norm": 0.03757576644420624, + "kl": 0.007784366607666016, + "learning_rate": 4.796721638956376e-07, + "loss": 0.0072, + "step": 1315 + }, + { + "clip_ratio": 0.0, + "completion_length": 527.5703339576721, + "epoch": 1.8910400047802576, + "grad_norm": 0.03592124208807945, + "kl": 0.007517337799072266, + "learning_rate": 4.7598362747083293e-07, + "loss": 0.008, + "num_tokens": 340304225.0, + "reward": 0.06501116388244554, + "reward_std": 0.0762443722342141, + "rewards/pure_accuracy_reward_math": 0.06501116219442338, + "step": 1316 + }, + { + "clip_ratio": 0.00026663288446115985, + "epoch": 1.8929521077948075, + "grad_norm": 0.03529619425535202, + "kl": 0.007477283477783203, + "learning_rate": 4.7230783502015346e-07, + "loss": 0.008, + "step": 1317 + }, + { + "clip_ratio": 0.00025462434007295087, + "epoch": 1.8948642108093574, + "grad_norm": 0.03387421742081642, + "kl": 0.007337093353271484, + "learning_rate": 4.6864480968778103e-07, + "loss": 0.008, + "step": 1318 + }, + { + "clip_ratio": 0.00031681645646131074, + "epoch": 1.8967763138239073, + "grad_norm": 0.033014364540576935, + "kl": 0.007318019866943359, + "learning_rate": 4.649945745375109e-07, + "loss": 0.0079, + "step": 1319 + }, + { + "clip_ratio": 0.00037019279989181086, + "epoch": 1.898688416838457, + "grad_norm": 0.033140987157821655, + "kl": 0.007157325744628906, + "learning_rate": 4.613571525526081e-07, + "loss": 0.0078, + "step": 1320 + }, + { + "clip_ratio": 0.0, + "completion_length": 523.3727917671204, + "epoch": 1.9006005198530072, + "grad_norm": 0.03997303172945976, + "kl": 0.007628440856933594, + "learning_rate": 4.577325666356586e-07, + "loss": 0.0118, + "num_tokens": 343915401.0, + "reward": 0.08816964740981348, + "reward_std": 0.08973595389397815, + "rewards/pure_accuracy_reward_math": 0.08816964426659979, + "step": 1321 + }, + { + "clip_ratio": 0.0003053776546835252, + "epoch": 1.9025126228675568, + "grad_norm": 0.039738208055496216, + "kl": 0.007574558258056641, + "learning_rate": 4.541208396084304e-07, + "loss": 0.0117, + "step": 1322 + }, + { + "clip_ratio": 0.00030029478972437573, + "epoch": 1.904424725882107, + "grad_norm": 0.038392502814531326, + "kl": 0.007514476776123047, + "learning_rate": 4.5052199421172475e-07, + "loss": 0.0117, + "step": 1323 + }, + { + "clip_ratio": 0.0003343055576010556, + "epoch": 1.9063368288966567, + "grad_norm": 0.037236347794532776, + "kl": 0.007477760314941406, + "learning_rate": 4.4693605310523636e-07, + "loss": 0.0116, + "step": 1324 + }, + { + "clip_ratio": 0.00032557199602933906, + "epoch": 1.9082489319112068, + "grad_norm": 0.03678731992840767, + "kl": 0.007478237152099609, + "learning_rate": 4.43363038867409e-07, + "loss": 0.0115, + "step": 1325 + }, + { + "clip_ratio": 0.0, + "completion_length": 513.3047099113464, + "epoch": 1.9101610349257565, + "grad_norm": 0.11113768815994263, + "kl": 0.013922691345214844, + "learning_rate": 4.39802973995295e-07, + "loss": 0.0093, + "num_tokens": 347490901.0, + "reward": 0.09486607549479231, + "reward_std": 0.09372853260720149, + "rewards/pure_accuracy_reward_math": 0.09486607305007055, + "step": 1326 + }, + { + "clip_ratio": 0.00036943193325100765, + "epoch": 1.9120731379403066, + "grad_norm": 0.055216722190380096, + "kl": 0.013732433319091797, + "learning_rate": 4.362558809044107e-07, + "loss": 0.0093, + "step": 1327 + }, + { + "clip_ratio": 0.0004000666916681439, + "epoch": 1.9139852409548563, + "grad_norm": 0.045698132365942, + "kl": 0.013063907623291016, + "learning_rate": 4.327217819286e-07, + "loss": 0.0092, + "step": 1328 + }, + { + "clip_ratio": 0.0004443397794489101, + "epoch": 1.9158973439694065, + "grad_norm": 0.04273562505841255, + "kl": 0.012539863586425781, + "learning_rate": 4.292006993198888e-07, + "loss": 0.009, + "step": 1329 + }, + { + "clip_ratio": 0.0004470848766686686, + "epoch": 1.9178094469839562, + "grad_norm": 0.04232070967555046, + "kl": 0.012142658233642578, + "learning_rate": 4.2569265524834756e-07, + "loss": 0.0089, + "step": 1330 + }, + { + "clip_ratio": 0.0, + "completion_length": 518.7550463676453, + "epoch": 1.9197215499985063, + "grad_norm": 0.03724661469459534, + "kl": 0.007449150085449219, + "learning_rate": 4.221976718019505e-07, + "loss": 0.007, + "num_tokens": 351086731.0, + "reward": 0.06919643189758062, + "reward_std": 0.07200520270271227, + "rewards/pure_accuracy_reward_math": 0.06919642974389717, + "step": 1331 + }, + { + "clip_ratio": 0.00027471570277270985, + "epoch": 1.921633653013056, + "grad_norm": 0.03599303960800171, + "kl": 0.007382869720458984, + "learning_rate": 4.187157709864392e-07, + "loss": 0.007, + "step": 1332 + }, + { + "clip_ratio": 0.0002737036326720954, + "epoch": 1.9235457560276061, + "grad_norm": 0.03614535927772522, + "kl": 0.007375240325927734, + "learning_rate": 4.152469747251794e-07, + "loss": 0.0069, + "step": 1333 + }, + { + "clip_ratio": 0.00030229948259830053, + "epoch": 1.9254578590421558, + "grad_norm": 0.03546711429953575, + "kl": 0.0072498321533203125, + "learning_rate": 4.117913048590283e-07, + "loss": 0.0069, + "step": 1334 + }, + { + "clip_ratio": 0.00030038867771509103, + "epoch": 1.927369962056706, + "grad_norm": 0.03401359170675278, + "kl": 0.007149219512939453, + "learning_rate": 4.0834878314619244e-07, + "loss": 0.0068, + "step": 1335 + }, + { + "clip_ratio": 0.0, + "completion_length": 526.2182154655457, + "epoch": 1.9292820650712557, + "grad_norm": 0.04080551117658615, + "kl": 0.006867885589599609, + "learning_rate": 4.049194312620927e-07, + "loss": 0.0092, + "num_tokens": 354708525.0, + "reward": 0.07756696798605844, + "reward_std": 0.08467356563778594, + "rewards/pure_accuracy_reward_math": 0.07756696530850604, + "step": 1336 + }, + { + "clip_ratio": 0.0002796990767137686, + "epoch": 1.9311941680858056, + "grad_norm": 0.038895782083272934, + "kl": 0.006824970245361328, + "learning_rate": 4.015032707992286e-07, + "loss": 0.0092, + "step": 1337 + }, + { + "clip_ratio": 0.00032694752422912643, + "epoch": 1.9331062711003555, + "grad_norm": 0.03889061138033867, + "kl": 0.006866931915283203, + "learning_rate": 3.9810032326704106e-07, + "loss": 0.0091, + "step": 1338 + }, + { + "clip_ratio": 0.0003511786251237936, + "epoch": 1.9350183741149054, + "grad_norm": 0.03880919888615608, + "kl": 0.006947994232177734, + "learning_rate": 3.9471061009177693e-07, + "loss": 0.009, + "step": 1339 + }, + { + "clip_ratio": 0.000323922223401496, + "epoch": 1.9369304771294553, + "grad_norm": 0.036964643746614456, + "kl": 0.007033824920654297, + "learning_rate": 3.91334152616355e-07, + "loss": 0.0089, + "step": 1340 + }, + { + "clip_ratio": 0.0, + "completion_length": 527.7076120376587, + "epoch": 1.9388425801440052, + "grad_norm": 0.04040682688355446, + "kl": 0.007448673248291016, + "learning_rate": 3.879709721002317e-07, + "loss": 0.0052, + "num_tokens": 358339045.0, + "reward": 0.07896205660654232, + "reward_std": 0.08278053888352588, + "rewards/pure_accuracy_reward_math": 0.07896205550059676, + "step": 1341 + }, + { + "clip_ratio": 0.00029579239503618737, + "epoch": 1.9407546831585551, + "grad_norm": 0.03910582885146141, + "kl": 0.007539272308349609, + "learning_rate": 3.8462108971926564e-07, + "loss": 0.0052, + "step": 1342 + }, + { + "clip_ratio": 0.0003078770084812277, + "epoch": 1.942666786173105, + "grad_norm": 0.03942732512950897, + "kl": 0.007628440856933594, + "learning_rate": 3.8128452656558623e-07, + "loss": 0.0051, + "step": 1343 + }, + { + "clip_ratio": 0.0003229538778555252, + "epoch": 1.944578889187655, + "grad_norm": 0.03747202083468437, + "kl": 0.007678031921386719, + "learning_rate": 3.779613036474583e-07, + "loss": 0.005, + "step": 1344 + }, + { + "clip_ratio": 0.000363169818285769, + "epoch": 1.946490992202205, + "grad_norm": 0.036778781563043594, + "kl": 0.0076923370361328125, + "learning_rate": 3.746514418891545e-07, + "loss": 0.0049, + "step": 1345 + }, + { + "clip_ratio": 0.0, + "completion_length": 532.7960658073425, + "epoch": 1.9484030952167548, + "grad_norm": 0.040943268686532974, + "kl": 0.011704444885253906, + "learning_rate": 3.713549621308174e-07, + "loss": 0.005, + "num_tokens": 361980918.0, + "reward": 0.07059152092551813, + "reward_std": 0.07973137585213408, + "rewards/pure_accuracy_reward_math": 0.07059151900466532, + "step": 1346 + }, + { + "clip_ratio": 0.00029914512055029263, + "epoch": 1.9503151982313047, + "grad_norm": 0.04052672162652016, + "kl": 0.0114288330078125, + "learning_rate": 3.6807188512833406e-07, + "loss": 0.005, + "step": 1347 + }, + { + "clip_ratio": 0.000334167169853572, + "epoch": 1.9522273012458546, + "grad_norm": 0.04054692015051842, + "kl": 0.011135578155517578, + "learning_rate": 3.648022315532007e-07, + "loss": 0.0049, + "step": 1348 + }, + { + "clip_ratio": 0.00035840429575273447, + "epoch": 1.9541394042604046, + "grad_norm": 0.03996079042553902, + "kl": 0.010680675506591797, + "learning_rate": 3.615460219923955e-07, + "loss": 0.0048, + "step": 1349 + }, + { + "clip_ratio": 0.00034668986540964397, + "epoch": 1.9560515072749545, + "grad_norm": 0.037566084414720535, + "kl": 0.010373115539550781, + "learning_rate": 3.5830327694824777e-07, + "loss": 0.0047, + "step": 1350 + }, + { + "clip_ratio": 0.0, + "completion_length": 534.6453948020935, + "epoch": 1.9579636102895044, + "grad_norm": 0.03812556713819504, + "kl": 0.007121086120605469, + "learning_rate": 3.5507401683830933e-07, + "loss": 0.0114, + "num_tokens": 365629991.0, + "reward": 0.07672991411527619, + "reward_std": 0.07831625349353999, + "rewards/pure_accuracy_reward_math": 0.07672991178696975, + "step": 1351 + }, + { + "clip_ratio": 0.0003128355612602718, + "epoch": 1.9598757133040543, + "grad_norm": 0.03631382808089256, + "kl": 0.007141590118408203, + "learning_rate": 3.518582619952257e-07, + "loss": 0.0114, + "step": 1352 + }, + { + "clip_ratio": 0.00033067399391484287, + "epoch": 1.9617878163186042, + "grad_norm": 0.03752359002828598, + "kl": 0.007140636444091797, + "learning_rate": 3.486560326666072e-07, + "loss": 0.0113, + "step": 1353 + }, + { + "clip_ratio": 0.00037038392605381887, + "epoch": 1.9636999193331541, + "grad_norm": 0.03724711388349533, + "kl": 0.007131099700927734, + "learning_rate": 3.4546734901490466e-07, + "loss": 0.0112, + "step": 1354 + }, + { + "clip_ratio": 0.00040464663743478013, + "epoch": 1.9656120223477038, + "grad_norm": 0.034875430166721344, + "kl": 0.007108211517333984, + "learning_rate": 3.42292231117278e-07, + "loss": 0.0112, + "step": 1355 + }, + { + "clip_ratio": 0.0, + "completion_length": 519.9101786613464, + "epoch": 1.967524125362254, + "grad_norm": 0.04123640060424805, + "kl": 0.007243156433105469, + "learning_rate": 3.3913069896547217e-07, + "loss": 0.0069, + "num_tokens": 369229613.0, + "reward": 0.08007812878349796, + "reward_std": 0.085311732836999, + "rewards/pure_accuracy_reward_math": 0.0800781263387762, + "step": 1356 + }, + { + "clip_ratio": 0.00033138683619426956, + "epoch": 1.9694362283768037, + "grad_norm": 0.04048166796565056, + "kl": 0.007332801818847656, + "learning_rate": 3.3598277246569307e-07, + "loss": 0.0069, + "step": 1357 + }, + { + "clip_ratio": 0.0003668193609200898, + "epoch": 1.9713483313913538, + "grad_norm": 0.042313288897275925, + "kl": 0.007485866546630859, + "learning_rate": 3.3284847143847834e-07, + "loss": 0.0068, + "step": 1358 + }, + { + "clip_ratio": 0.0003713441701620468, + "epoch": 1.9732604344059035, + "grad_norm": 0.04199962690472603, + "kl": 0.007598400115966797, + "learning_rate": 3.2972781561857433e-07, + "loss": 0.0067, + "step": 1359 + }, + { + "clip_ratio": 0.0003367169608736731, + "epoch": 1.9751725374204536, + "grad_norm": 0.03874565288424492, + "kl": 0.007636547088623047, + "learning_rate": 3.266208246548136e-07, + "loss": 0.0066, + "step": 1360 + }, + { + "clip_ratio": 0.0, + "completion_length": 516.4445023536682, + "epoch": 1.9770846404350033, + "grad_norm": 0.040357448160648346, + "kl": 0.007414817810058594, + "learning_rate": 3.2352751810998896e-07, + "loss": 0.0055, + "num_tokens": 372817046.0, + "reward": 0.08258928993018344, + "reward_std": 0.09080576250562444, + "rewards/pure_accuracy_reward_math": 0.08258928690338507, + "step": 1361 + }, + { + "clip_ratio": 0.00038423701278134104, + "epoch": 1.9789967434495535, + "grad_norm": 0.03990958258509636, + "kl": 0.007411479949951172, + "learning_rate": 3.2044791546072985e-07, + "loss": 0.0055, + "step": 1362 + }, + { + "clip_ratio": 0.00044172884827275993, + "epoch": 1.9809088464641031, + "grad_norm": 0.042212970554828644, + "kl": 0.007319450378417969, + "learning_rate": 3.173820360973823e-07, + "loss": 0.0054, + "step": 1363 + }, + { + "clip_ratio": 0.00042502668532051757, + "epoch": 1.9828209494786533, + "grad_norm": 0.03946436941623688, + "kl": 0.0072727203369140625, + "learning_rate": 3.1432989932388416e-07, + "loss": 0.0053, + "step": 1364 + }, + { + "clip_ratio": 0.00040032099315112646, + "epoch": 1.984733052493203, + "grad_norm": 0.03701746463775635, + "kl": 0.007288455963134766, + "learning_rate": 3.1129152435764473e-07, + "loss": 0.0052, + "step": 1365 + }, + { + "clip_ratio": 0.0, + "completion_length": 519.9707279205322, + "epoch": 1.9866451555077531, + "grad_norm": 0.03677362576127052, + "kl": 0.00740814208984375, + "learning_rate": 3.0826693032942586e-07, + "loss": 0.008, + "num_tokens": 376414405.0, + "reward": 0.07087053926079534, + "reward_std": 0.07741290412377566, + "rewards/pure_accuracy_reward_math": 0.07087053710711189, + "step": 1366 + }, + { + "clip_ratio": 0.0002998853265978596, + "epoch": 1.9885572585223028, + "grad_norm": 0.03619634732604027, + "kl": 0.0074787139892578125, + "learning_rate": 3.0525613628321656e-07, + "loss": 0.0079, + "step": 1367 + }, + { + "clip_ratio": 0.00031987275491474065, + "epoch": 1.990469361536853, + "grad_norm": 0.03580261766910553, + "kl": 0.007512092590332031, + "learning_rate": 3.022591611761169e-07, + "loss": 0.0079, + "step": 1368 + }, + { + "clip_ratio": 0.00029055258056587263, + "epoch": 1.9923814645514026, + "grad_norm": 0.03512256592512131, + "kl": 0.007531166076660156, + "learning_rate": 2.9927602387821916e-07, + "loss": 0.0078, + "step": 1369 + }, + { + "clip_ratio": 0.0003325358438814874, + "epoch": 1.9942935675659528, + "grad_norm": 0.03404110670089722, + "kl": 0.007470130920410156, + "learning_rate": 2.963067431724856e-07, + "loss": 0.0077, + "step": 1370 + }, + { + "clip_ratio": 0.0, + "completion_length": 524.95845079422, + "epoch": 2.0019121030145497, + "grad_norm": 0.03709035739302635, + "kl": 0.007386684417724609, + "learning_rate": 2.9335133775463266e-07, + "loss": 0.011, + "num_tokens": 380027444.0, + "reward": 0.07198661039001308, + "reward_std": 0.07208533387165517, + "rewards/pure_accuracy_reward_math": 0.07198660876019858, + "step": 1371 + }, + { + "clip_ratio": 0.0002751371110321088, + "epoch": 2.0038242060291, + "grad_norm": 0.03661485016345978, + "kl": 0.007431507110595703, + "learning_rate": 2.9040982623301264e-07, + "loss": 0.011, + "step": 1372 + }, + { + "clip_ratio": 0.0003175289227783651, + "epoch": 2.0057363090436495, + "grad_norm": 0.036799393594264984, + "kl": 0.007405281066894531, + "learning_rate": 2.874822271284977e-07, + "loss": 0.0109, + "step": 1373 + }, + { + "clip_ratio": 0.0003284543961399322, + "epoch": 2.0076484120581997, + "grad_norm": 0.036977026611566544, + "kl": 0.007386684417724609, + "learning_rate": 2.8456855887436074e-07, + "loss": 0.0108, + "step": 1374 + }, + { + "clip_ratio": 0.00032697250054525284, + "epoch": 2.0095605150727494, + "grad_norm": 0.03594314306974411, + "kl": 0.00739288330078125, + "learning_rate": 2.816688398161613e-07, + "loss": 0.0108, + "step": 1375 + }, + { + "clip_ratio": 0.0, + "completion_length": 524.5270891189575, + "epoch": 2.0114726180872995, + "grad_norm": 15.976890563964844, + "kl": 0.4394536018371582, + "learning_rate": 2.7878308821162964e-07, + "loss": 0.0259, + "num_tokens": 383639505.0, + "reward": 0.08286830733413808, + "reward_std": 0.08972975501092151, + "rewards/pure_accuracy_reward_math": 0.08286830488941632, + "step": 1376 + }, + { + "clip_ratio": 0.0003084787746274742, + "epoch": 2.013384721101849, + "grad_norm": 1.2859545946121216, + "kl": 0.04446220397949219, + "learning_rate": 2.759113222305512e-07, + "loss": 0.0102, + "step": 1377 + }, + { + "clip_ratio": 0.00034848380650487343, + "epoch": 2.0152968241163993, + "grad_norm": 0.0618804506957531, + "kl": 0.009487152099609375, + "learning_rate": 2.730535599546524e-07, + "loss": 0.0087, + "step": 1378 + }, + { + "clip_ratio": 0.000346398171132023, + "epoch": 2.017208927130949, + "grad_norm": 0.039353594183921814, + "kl": 0.008243560791015625, + "learning_rate": 2.702098193774891e-07, + "loss": 0.0087, + "step": 1379 + }, + { + "clip_ratio": 0.000389314118024231, + "epoch": 2.019121030145499, + "grad_norm": 0.03626256063580513, + "kl": 0.0083465576171875, + "learning_rate": 2.6738011840432817e-07, + "loss": 0.0086, + "step": 1380 + }, + { + "clip_ratio": 0.0, + "completion_length": 504.881441116333, + "epoch": 2.021033133160049, + "grad_norm": 0.03991848975419998, + "kl": 0.00807046890258789, + "learning_rate": 2.6456447485204014e-07, + "loss": 0.0078, + "num_tokens": 387180856.0, + "reward": 0.07700893218861893, + "reward_std": 0.0893906393321231, + "rewards/pure_accuracy_reward_math": 0.07700893026776612, + "step": 1381 + }, + { + "clip_ratio": 0.00029079897933570464, + "epoch": 2.022945236174599, + "grad_norm": 0.03955512493848801, + "kl": 0.008087635040283203, + "learning_rate": 2.617629064489838e-07, + "loss": 0.0078, + "step": 1382 + }, + { + "clip_ratio": 0.00034119405472665676, + "epoch": 2.0248573391891487, + "grad_norm": 0.04050750657916069, + "kl": 0.008031845092773438, + "learning_rate": 2.5897543083489544e-07, + "loss": 0.0077, + "step": 1383 + }, + { + "clip_ratio": 0.0003633832532159431, + "epoch": 2.026769442203699, + "grad_norm": 0.03760417178273201, + "kl": 0.007889270782470703, + "learning_rate": 2.562020655607772e-07, + "loss": 0.0076, + "step": 1384 + }, + { + "clip_ratio": 0.00040043183099669477, + "epoch": 2.0286815452182485, + "grad_norm": 0.036376822739839554, + "kl": 0.007742404937744141, + "learning_rate": 2.534428280887891e-07, + "loss": 0.0076, + "step": 1385 + }, + { + "clip_ratio": 0.0, + "completion_length": 521.2332820892334, + "epoch": 2.0305936482327986, + "grad_norm": 0.03659322112798691, + "kl": 0.0079498291015625, + "learning_rate": 2.50697735792135e-07, + "loss": 0.0074, + "num_tokens": 390784592.0, + "reward": 0.0678013424621895, + "reward_std": 0.07990403228905052, + "rewards/pure_accuracy_reward_math": 0.06780134083237499, + "step": 1386 + }, + { + "clip_ratio": 0.0003029348101790674, + "epoch": 2.0325057512473483, + "grad_norm": 0.03603421524167061, + "kl": 0.0077915191650390625, + "learning_rate": 2.47966805954957e-07, + "loss": 0.0073, + "step": 1387 + }, + { + "clip_ratio": 0.0002788126068935526, + "epoch": 2.0344178542618985, + "grad_norm": 0.035584706813097, + "kl": 0.00768280029296875, + "learning_rate": 2.4525005577222373e-07, + "loss": 0.0073, + "step": 1388 + }, + { + "clip_ratio": 0.00033219700696918153, + "epoch": 2.036329957276448, + "grad_norm": 0.033913753926754, + "kl": 0.007656097412109375, + "learning_rate": 2.42547502349624e-07, + "loss": 0.0072, + "step": 1389 + }, + { + "clip_ratio": 0.00034793876449157324, + "epoch": 2.0382420602909983, + "grad_norm": 0.033490557223558426, + "kl": 0.007609367370605469, + "learning_rate": 2.398591627034588e-07, + "loss": 0.0072, + "step": 1390 + }, + { + "clip_ratio": 0.0, + "completion_length": 534.8217334747314, + "epoch": 2.040154163305548, + "grad_norm": 0.04065319523215294, + "kl": 0.007349491119384766, + "learning_rate": 2.3718505376053246e-07, + "loss": 0.0094, + "num_tokens": 394433277.0, + "reward": 0.07589286056463607, + "reward_std": 0.09050671145087108, + "rewards/pure_accuracy_reward_math": 0.07589285823632963, + "step": 1391 + }, + { + "clip_ratio": 0.00032872594630362073, + "epoch": 2.042066266320098, + "grad_norm": 0.0390729084610939, + "kl": 0.007353305816650391, + "learning_rate": 2.345251923580491e-07, + "loss": 0.0094, + "step": 1392 + }, + { + "clip_ratio": 0.00038015836332760955, + "epoch": 2.043978369334648, + "grad_norm": 0.037973206490278244, + "kl": 0.007381916046142578, + "learning_rate": 2.3187959524350352e-07, + "loss": 0.0093, + "step": 1393 + }, + { + "clip_ratio": 0.00041672343576237836, + "epoch": 2.045890472349198, + "grad_norm": 0.037547629326581955, + "kl": 0.007441043853759766, + "learning_rate": 2.2924827907457841e-07, + "loss": 0.0092, + "step": 1394 + }, + { + "clip_ratio": 0.00047711057584365335, + "epoch": 2.0478025753637477, + "grad_norm": 0.037767618894577026, + "kl": 0.007452487945556641, + "learning_rate": 2.266312604190374e-07, + "loss": 0.0091, + "step": 1395 + }, + { + "clip_ratio": 0.0, + "completion_length": 520.9163165092468, + "epoch": 2.049714678378298, + "grad_norm": 0.039165694266557693, + "kl": 0.007717609405517578, + "learning_rate": 2.2402855575462152e-07, + "loss": 0.0071, + "num_tokens": 398030605.0, + "reward": 0.07840402194415219, + "reward_std": 0.08072105259634554, + "rewards/pure_accuracy_reward_math": 0.07840401885914616, + "step": 1396 + }, + { + "clip_ratio": 0.0002864374472437703, + "epoch": 2.0516267813928475, + "grad_norm": 0.03918104246258736, + "kl": 0.007798194885253906, + "learning_rate": 2.2144018146894542e-07, + "loss": 0.007, + "step": 1397 + }, + { + "clip_ratio": 0.00028412381868747616, + "epoch": 2.0535388844073976, + "grad_norm": 0.03787809982895851, + "kl": 0.007855415344238281, + "learning_rate": 2.1886615385939502e-07, + "loss": 0.007, + "step": 1398 + }, + { + "clip_ratio": 0.0002802736350417945, + "epoch": 2.0554509874219473, + "grad_norm": 0.03685666248202324, + "kl": 0.007898807525634766, + "learning_rate": 2.1630648913302354e-07, + "loss": 0.0069, + "step": 1399 + }, + { + "clip_ratio": 0.0003048399971703475, + "epoch": 2.0573630904364975, + "grad_norm": 0.03653446584939957, + "kl": 0.0079193115234375, + "learning_rate": 2.1376120340645014e-07, + "loss": 0.0068, + "step": 1400 + }, + { + "clip_ratio": 0.0, + "completion_length": 523.7120804786682, + "epoch": 2.059275193451047, + "grad_norm": 0.041400156915187836, + "kl": 0.0076904296875, + "learning_rate": 2.1123031270575827e-07, + "loss": 0.0112, + "num_tokens": 401639357.0, + "reward": 0.08398437922005542, + "reward_std": 0.08836089540272951, + "rewards/pure_accuracy_reward_math": 0.08398437665891834, + "step": 1401 + }, + { + "clip_ratio": 0.0003276587292475597, + "epoch": 2.0611872964655973, + "grad_norm": 0.04058953374624252, + "kl": 0.007676601409912109, + "learning_rate": 2.0871383296639487e-07, + "loss": 0.0112, + "step": 1402 + }, + { + "clip_ratio": 0.00033817819053183484, + "epoch": 2.063099399480147, + "grad_norm": 0.040160875767469406, + "kl": 0.007659435272216797, + "learning_rate": 2.062117800330693e-07, + "loss": 0.0112, + "step": 1403 + }, + { + "clip_ratio": 0.00034579052078242967, + "epoch": 2.065011502494697, + "grad_norm": 0.03876737132668495, + "kl": 0.007627964019775391, + "learning_rate": 2.0372416965965675e-07, + "loss": 0.0111, + "step": 1404 + }, + { + "clip_ratio": 0.00035969930786450277, + "epoch": 2.066923605509247, + "grad_norm": 0.03797266259789467, + "kl": 0.007703304290771484, + "learning_rate": 2.0125101750909315e-07, + "loss": 0.011, + "step": 1405 + }, + { + "clip_ratio": 0.0, + "completion_length": 514.2500252723694, + "epoch": 2.068835708523797, + "grad_norm": 0.05333253741264343, + "kl": 0.010094165802001953, + "learning_rate": 1.9879233915328312e-07, + "loss": 0.0065, + "num_tokens": 405215041.0, + "reward": 0.08231027176952921, + "reward_std": 0.08208991179708391, + "rewards/pure_accuracy_reward_math": 0.08231026903376915, + "step": 1406 + }, + { + "clip_ratio": 0.0002884399551135175, + "epoch": 2.0707478115383466, + "grad_norm": 0.04066501557826996, + "kl": 0.009914398193359375, + "learning_rate": 1.9634815007299634e-07, + "loss": 0.0065, + "step": 1407 + }, + { + "clip_ratio": 0.0003325861029566113, + "epoch": 2.0726599145528963, + "grad_norm": 0.03939688578248024, + "kl": 0.00982666015625, + "learning_rate": 1.9391846565777418e-07, + "loss": 0.0064, + "step": 1408 + }, + { + "clip_ratio": 0.0003743518978467364, + "epoch": 2.0745720175674465, + "grad_norm": 0.03857440873980522, + "kl": 0.009755611419677734, + "learning_rate": 1.9150330120583012e-07, + "loss": 0.0063, + "step": 1409 + }, + { + "clip_ratio": 0.0004666026043196325, + "epoch": 2.076484120581996, + "grad_norm": 0.03952641412615776, + "kl": 0.0096588134765625, + "learning_rate": 1.891026719239547e-07, + "loss": 0.0062, + "step": 1410 + }, + { + "clip_ratio": 0.0, + "completion_length": 516.8532605171204, + "epoch": 2.0783962235965463, + "grad_norm": 0.04142899066209793, + "kl": 0.008448123931884766, + "learning_rate": 1.8671659292742007e-07, + "loss": 0.0099, + "num_tokens": 408804459.0, + "reward": 0.08286830742144957, + "reward_std": 0.08260788215557113, + "rewards/pure_accuracy_reward_math": 0.08286830509314314, + "step": 1411 + }, + { + "clip_ratio": 0.0003487231184635675, + "epoch": 2.080308326611096, + "grad_norm": 0.040530916303396225, + "kl": 0.008367538452148438, + "learning_rate": 1.8434507923988375e-07, + "loss": 0.0099, + "step": 1412 + }, + { + "clip_ratio": 0.0003221970002869057, + "epoch": 2.082220429625646, + "grad_norm": 0.03941330686211586, + "kl": 0.008350849151611328, + "learning_rate": 1.8198814579329426e-07, + "loss": 0.0098, + "step": 1413 + }, + { + "clip_ratio": 0.00037204451541583694, + "epoch": 2.084132532640196, + "grad_norm": 0.03861032798886299, + "kl": 0.008304595947265625, + "learning_rate": 1.7964580742779847e-07, + "loss": 0.0097, + "step": 1414 + }, + { + "clip_ratio": 0.0003590778907209824, + "epoch": 2.086044635654746, + "grad_norm": 0.03945469483733177, + "kl": 0.008287906646728516, + "learning_rate": 1.7731807889164537e-07, + "loss": 0.0096, + "step": 1415 + }, + { + "clip_ratio": 0.0, + "completion_length": 529.592381477356, + "epoch": 2.0879567386692957, + "grad_norm": 0.03833872824907303, + "kl": 0.0077228546142578125, + "learning_rate": 1.7500497484109703e-07, + "loss": 0.0109, + "num_tokens": 412432506.0, + "reward": 0.07449777142028324, + "reward_std": 0.08200978167587891, + "rewards/pure_accuracy_reward_math": 0.07449776885914616, + "step": 1416 + }, + { + "clip_ratio": 0.0002795722035671133, + "epoch": 2.089868841683846, + "grad_norm": 0.03684116527438164, + "kl": 0.007727146148681641, + "learning_rate": 1.7270650984033245e-07, + "loss": 0.0108, + "step": 1417 + }, + { + "clip_ratio": 0.00033119657558700055, + "epoch": 2.0917809446983955, + "grad_norm": 0.03667665645480156, + "kl": 0.007739067077636719, + "learning_rate": 1.7042269836135882e-07, + "loss": 0.0108, + "step": 1418 + }, + { + "clip_ratio": 0.00036255177064958843, + "epoch": 2.0936930477129456, + "grad_norm": 0.037857044488191605, + "kl": 0.007757663726806641, + "learning_rate": 1.6815355478391886e-07, + "loss": 0.0107, + "step": 1419 + }, + { + "clip_ratio": 0.0003589615364489873, + "epoch": 2.0956051507274953, + "grad_norm": 0.0360855907201767, + "kl": 0.007729053497314453, + "learning_rate": 1.6589909339539968e-07, + "loss": 0.0106, + "step": 1420 + }, + { + "clip_ratio": 0.0, + "completion_length": 523.7469544410706, + "epoch": 2.0975172537420455, + "grad_norm": 0.041348401457071304, + "kl": 0.007639408111572266, + "learning_rate": 1.6365932839074532e-07, + "loss": 0.0099, + "num_tokens": 416048915.0, + "reward": 0.07979911076836288, + "reward_std": 0.08175079576903954, + "rewards/pure_accuracy_reward_math": 0.07979910861467943, + "step": 1421 + }, + { + "clip_ratio": 0.00028084742956480113, + "epoch": 2.099429356756595, + "grad_norm": 0.03983917832374573, + "kl": 0.007691860198974609, + "learning_rate": 1.6143427387236455e-07, + "loss": 0.0099, + "step": 1422 + }, + { + "clip_ratio": 0.00032101355429858813, + "epoch": 2.1013414597711453, + "grad_norm": 0.04035898670554161, + "kl": 0.007829666137695312, + "learning_rate": 1.592239438500434e-07, + "loss": 0.0098, + "step": 1423 + }, + { + "clip_ratio": 0.00036129408920260175, + "epoch": 2.103253562785695, + "grad_norm": 0.03893222287297249, + "kl": 0.0079498291015625, + "learning_rate": 1.570283522408586e-07, + "loss": 0.0097, + "step": 1424 + }, + { + "clip_ratio": 0.0003233651194136655, + "epoch": 2.105165665800245, + "grad_norm": 0.03798089176416397, + "kl": 0.008071422576904297, + "learning_rate": 1.5484751286908655e-07, + "loss": 0.0097, + "step": 1425 + }, + { + "clip_ratio": 0.0, + "completion_length": 515.3281455039978, + "epoch": 2.107077768814795, + "grad_norm": 0.04489213973283768, + "kl": 0.00823831558227539, + "learning_rate": 1.5268143946611802e-07, + "loss": 0.01, + "num_tokens": 419628171.0, + "reward": 0.07952009321888909, + "reward_std": 0.0892580482759513, + "rewards/pure_accuracy_reward_math": 0.07952009089058265, + "step": 1426 + }, + { + "clip_ratio": 0.0003507794546067089, + "epoch": 2.108989871829345, + "grad_norm": 0.04182901233434677, + "kl": 0.008199691772460938, + "learning_rate": 1.5053014567037171e-07, + "loss": 0.01, + "step": 1427 + }, + { + "clip_ratio": 0.0004634781105323782, + "epoch": 2.1109019748438946, + "grad_norm": 0.04111779108643532, + "kl": 0.008260250091552734, + "learning_rate": 1.483936450272097e-07, + "loss": 0.0099, + "step": 1428 + }, + { + "clip_ratio": 0.0005032591409417364, + "epoch": 2.1128140778584448, + "grad_norm": 0.04071485623717308, + "kl": 0.008274078369140625, + "learning_rate": 1.4627195098884856e-07, + "loss": 0.0098, + "step": 1429 + }, + { + "clip_ratio": 0.0005640338476382567, + "epoch": 2.1147261808729945, + "grad_norm": 0.041747044771909714, + "kl": 0.008271217346191406, + "learning_rate": 1.441650769142791e-07, + "loss": 0.0097, + "step": 1430 + }, + { + "clip_ratio": 0.0, + "completion_length": 527.8217334747314, + "epoch": 2.1166382838875446, + "grad_norm": 0.04057304188609123, + "kl": 0.00798797607421875, + "learning_rate": 1.4207303606917856e-07, + "loss": 0.0057, + "num_tokens": 423255484.0, + "reward": 0.08761161076836288, + "reward_std": 0.09866452467394993, + "rewards/pure_accuracy_reward_math": 0.08761160855647177, + "step": 1431 + }, + { + "clip_ratio": 0.0003497144300581567, + "epoch": 2.1185503869020943, + "grad_norm": 0.03972388803958893, + "kl": 0.007953643798828125, + "learning_rate": 1.3999584162582874e-07, + "loss": 0.0057, + "step": 1432 + }, + { + "clip_ratio": 0.00037741022566706306, + "epoch": 2.1204624899166444, + "grad_norm": 0.03924018144607544, + "kl": 0.00795888900756836, + "learning_rate": 1.3793350666303328e-07, + "loss": 0.0056, + "step": 1433 + }, + { + "clip_ratio": 0.0003785647801350933, + "epoch": 2.122374592931194, + "grad_norm": 0.03913624957203865, + "kl": 0.007895946502685547, + "learning_rate": 1.3588604416603424e-07, + "loss": 0.0055, + "step": 1434 + }, + { + "clip_ratio": 0.0003937934675377619, + "epoch": 2.1242866959457443, + "grad_norm": 0.03699544072151184, + "kl": 0.00783538818359375, + "learning_rate": 1.3385346702643188e-07, + "loss": 0.0054, + "step": 1435 + }, + { + "clip_ratio": 0.0, + "completion_length": 533.7888078689575, + "epoch": 2.126198798960294, + "grad_norm": 0.042676378041505814, + "kl": 0.010451793670654297, + "learning_rate": 1.3183578804210173e-07, + "loss": 0.0098, + "num_tokens": 426903267.0, + "reward": 0.07645089671132155, + "reward_std": 0.08488008996937424, + "rewards/pure_accuracy_reward_math": 0.07645089426659979, + "step": 1436 + }, + { + "clip_ratio": 0.00036263700505401175, + "epoch": 2.128110901974844, + "grad_norm": 0.03884616866707802, + "kl": 0.010242462158203125, + "learning_rate": 1.2983301991711578e-07, + "loss": 0.0098, + "step": 1437 + }, + { + "clip_ratio": 0.0003990789759313884, + "epoch": 2.130023004989394, + "grad_norm": 0.0399676114320755, + "kl": 0.01007843017578125, + "learning_rate": 1.278451752616608e-07, + "loss": 0.0097, + "step": 1438 + }, + { + "clip_ratio": 0.0004171350746560165, + "epoch": 2.131935108003944, + "grad_norm": 0.039714373648166656, + "kl": 0.010037422180175781, + "learning_rate": 1.258722665919604e-07, + "loss": 0.0097, + "step": 1439 + }, + { + "clip_ratio": 0.00039808801824392503, + "epoch": 2.1338472110184936, + "grad_norm": 0.03794709965586662, + "kl": 0.009942054748535156, + "learning_rate": 1.2391430633019452e-07, + "loss": 0.0096, + "step": 1440 + }, + { + "clip_ratio": 0.0, + "completion_length": 525.7826709747314, + "epoch": 2.1357593140330433, + "grad_norm": 0.05131447687745094, + "kl": 0.00860595703125, + "learning_rate": 1.2197130680442399e-07, + "loss": 0.0073, + "num_tokens": 430520032.0, + "reward": 0.07282366428989917, + "reward_std": 0.0797313749208115, + "rewards/pure_accuracy_reward_math": 0.07282366172876209, + "step": 1441 + }, + { + "clip_ratio": 0.0003007381984616586, + "epoch": 2.1376714170475934, + "grad_norm": 0.03815394267439842, + "kl": 0.008358001708984375, + "learning_rate": 1.2004328024850938e-07, + "loss": 0.0073, + "step": 1442 + }, + { + "clip_ratio": 0.0003256684682355626, + "epoch": 2.139583520062143, + "grad_norm": 0.03841105103492737, + "kl": 0.008275985717773438, + "learning_rate": 1.1813023880203722e-07, + "loss": 0.0072, + "step": 1443 + }, + { + "clip_ratio": 0.00034418403180325186, + "epoch": 2.1414956230766933, + "grad_norm": 0.041511572897434235, + "kl": 0.008276939392089844, + "learning_rate": 1.1623219451024098e-07, + "loss": 0.0071, + "step": 1444 + }, + { + "clip_ratio": 0.00032526867431670325, + "epoch": 2.143407726091243, + "grad_norm": 0.03922862559556961, + "kl": 0.008294105529785156, + "learning_rate": 1.1434915932392682e-07, + "loss": 0.007, + "step": 1445 + }, + { + "clip_ratio": 0.0, + "completion_length": 526.7310523986816, + "epoch": 2.145319829105793, + "grad_norm": 0.04134941101074219, + "kl": 0.008166313171386719, + "learning_rate": 1.1248114509939817e-07, + "loss": 0.0067, + "num_tokens": 434141592.0, + "reward": 0.08342634307336994, + "reward_std": 0.08578344061970711, + "rewards/pure_accuracy_reward_math": 0.08342634132714011, + "step": 1446 + }, + { + "clip_ratio": 0.00029539940015865795, + "epoch": 2.147231932120343, + "grad_norm": 0.04034848138689995, + "kl": 0.008122920989990234, + "learning_rate": 1.1062816359838024e-07, + "loss": 0.0066, + "step": 1447 + }, + { + "clip_ratio": 0.0003565281184592095, + "epoch": 2.149144035134893, + "grad_norm": 0.04018424078822136, + "kl": 0.00803232192993164, + "learning_rate": 1.0879022648794645e-07, + "loss": 0.0066, + "step": 1448 + }, + { + "clip_ratio": 0.0003515161848781645, + "epoch": 2.1510561381494426, + "grad_norm": 0.03917380049824715, + "kl": 0.007886886596679688, + "learning_rate": 1.0696734534044629e-07, + "loss": 0.0065, + "step": 1449 + }, + { + "clip_ratio": 0.0004228238227028669, + "epoch": 2.1529682411639928, + "grad_norm": 0.038036227226257324, + "kl": 0.00785064697265625, + "learning_rate": 1.0515953163342973e-07, + "loss": 0.0064, + "step": 1450 + }, + { + "clip_ratio": 0.0, + "completion_length": 544.0078330039978, + "epoch": 2.1548803441785425, + "grad_norm": 0.03814779594540596, + "kl": 0.008002758026123047, + "learning_rate": 1.0336679674957716e-07, + "loss": 0.0113, + "num_tokens": 437824108.0, + "reward": 0.07533482514554635, + "reward_std": 0.07659588241949677, + "rewards/pure_accuracy_reward_math": 0.07533482287544757, + "step": 1451 + }, + { + "clip_ratio": 0.0002914705042371679, + "epoch": 2.1567924471930926, + "grad_norm": 0.03763413056731224, + "kl": 0.00798654556274414, + "learning_rate": 1.0158915197662628e-07, + "loss": 0.0113, + "step": 1452 + }, + { + "clip_ratio": 0.0002916823746659247, + "epoch": 2.1587045502076423, + "grad_norm": 0.036225125193595886, + "kl": 0.008030414581298828, + "learning_rate": 9.982660850730269e-08, + "loss": 0.0112, + "step": 1453 + }, + { + "clip_ratio": 0.0002708278207137482, + "epoch": 2.1606166532221924, + "grad_norm": 0.03529945760965347, + "kl": 0.00803375244140625, + "learning_rate": 9.807917743924838e-08, + "loss": 0.0112, + "step": 1454 + }, + { + "clip_ratio": 0.0002930295025862506, + "epoch": 2.162528756236742, + "grad_norm": 0.03426925837993622, + "kl": 0.007987022399902344, + "learning_rate": 9.634686977495089e-08, + "loss": 0.0111, + "step": 1455 + }, + { + "clip_ratio": 0.0, + "completion_length": 517.6585068702698, + "epoch": 2.1644408592512923, + "grad_norm": 0.038425736129283905, + "kl": 0.008115291595458984, + "learning_rate": 9.462969642167613e-08, + "loss": 0.0052, + "num_tokens": 441407888.0, + "reward": 0.07617187869618647, + "reward_std": 0.0740246243076399, + "rewards/pure_accuracy_reward_math": 0.07617187630967237, + "step": 1456 + }, + { + "clip_ratio": 0.00023060813538222646, + "epoch": 2.166352962265842, + "grad_norm": 0.03851727396249771, + "kl": 0.008001327514648438, + "learning_rate": 9.292766819139847e-08, + "loss": 0.0052, + "step": 1457 + }, + { + "clip_ratio": 0.0002378168165932948, + "epoch": 2.168265065280392, + "grad_norm": 0.040155645459890366, + "kl": 0.007994651794433594, + "learning_rate": 9.12407958007322e-08, + "loss": 0.0051, + "step": 1458 + }, + { + "clip_ratio": 0.0002497726611068174, + "epoch": 2.170177168294942, + "grad_norm": 0.0425233468413353, + "kl": 0.007935047149658203, + "learning_rate": 8.956908987086538e-08, + "loss": 0.005, + "step": 1459 + }, + { + "clip_ratio": 0.00030142679486289126, + "epoch": 2.172089271309492, + "grad_norm": 0.03647738695144653, + "kl": 0.007966041564941406, + "learning_rate": 8.791256092749223e-08, + "loss": 0.0049, + "step": 1460 + }, + { + "clip_ratio": 0.0, + "completion_length": 520.2968997955322, + "epoch": 2.1740013743240416, + "grad_norm": 0.22045741975307465, + "kl": 0.022356510162353516, + "learning_rate": 8.627121940074645e-08, + "loss": 0.0122, + "num_tokens": 445010628.0, + "reward": 0.08705357578583062, + "reward_std": 0.08814817463280633, + "rewards/pure_accuracy_reward_math": 0.08705357281723991, + "step": 1461 + }, + { + "clip_ratio": 0.00031046926528688346, + "epoch": 2.1759134773385918, + "grad_norm": 0.06329243630170822, + "kl": 0.015823841094970703, + "learning_rate": 8.464507562513657e-08, + "loss": 0.0119, + "step": 1462 + }, + { + "clip_ratio": 0.0003438202776351318, + "epoch": 2.1778255803531414, + "grad_norm": 0.05041000247001648, + "kl": 0.014271736145019531, + "learning_rate": 8.303413983948017e-08, + "loss": 0.0118, + "step": 1463 + }, + { + "clip_ratio": 0.0003563892260558532, + "epoch": 2.1797376833676916, + "grad_norm": 0.04660080000758171, + "kl": 0.013462543487548828, + "learning_rate": 8.143842218683862e-08, + "loss": 0.0117, + "step": 1464 + }, + { + "clip_ratio": 0.0004125210731444895, + "epoch": 2.1816497863822413, + "grad_norm": 0.04536700248718262, + "kl": 0.012927532196044922, + "learning_rate": 7.985793271445636e-08, + "loss": 0.0116, + "step": 1465 + }, + { + "clip_ratio": 0.0, + "completion_length": 517.6127443313599, + "epoch": 2.1835618893967914, + "grad_norm": 0.08454474061727524, + "kl": 0.010744094848632812, + "learning_rate": 7.829268137369311e-08, + "loss": 0.0075, + "num_tokens": 448601372.0, + "reward": 0.0750558071595151, + "reward_std": 0.0813654173980467, + "rewards/pure_accuracy_reward_math": 0.07505580488941632, + "step": 1466 + }, + { + "clip_ratio": 0.00028517025145902153, + "epoch": 2.185473992411341, + "grad_norm": 0.04138394817709923, + "kl": 0.009669780731201172, + "learning_rate": 7.674267801996427e-08, + "loss": 0.0075, + "step": 1467 + }, + { + "clip_ratio": 0.00027802770790685827, + "epoch": 2.1873860954258912, + "grad_norm": 0.03745463490486145, + "kl": 0.009511947631835938, + "learning_rate": 7.52079324126792e-08, + "loss": 0.0074, + "step": 1468 + }, + { + "clip_ratio": 0.0003267590287805433, + "epoch": 2.189298198440441, + "grad_norm": 0.036841075867414474, + "kl": 0.00956106185913086, + "learning_rate": 7.368845421517779e-08, + "loss": 0.0073, + "step": 1469 + }, + { + "clip_ratio": 0.0003443693621534294, + "epoch": 2.191210301454991, + "grad_norm": 0.0362345427274704, + "kl": 0.009715557098388672, + "learning_rate": 7.21842529946698e-08, + "loss": 0.0072, + "step": 1470 + }, + { + "clip_ratio": 0.0, + "completion_length": 499.83763551712036, + "epoch": 2.1931224044695408, + "grad_norm": 0.0431695282459259, + "kl": 0.008378028869628906, + "learning_rate": 7.0695338222177e-08, + "loss": 0.0093, + "num_tokens": 452124382.0, + "reward": 0.07756696839351207, + "reward_std": 0.08685944566968828, + "rewards/pure_accuracy_reward_math": 0.07756696530850604, + "step": 1471 + }, + { + "clip_ratio": 0.0003288618632950602, + "epoch": 2.195034507484091, + "grad_norm": 0.042445823550224304, + "kl": 0.008408546447753906, + "learning_rate": 6.922171927247062e-08, + "loss": 0.0092, + "step": 1472 + }, + { + "clip_ratio": 0.0003429904774066017, + "epoch": 2.1969466104986406, + "grad_norm": 0.04231419414281845, + "kl": 0.008434295654296875, + "learning_rate": 6.776340542401422e-08, + "loss": 0.0092, + "step": 1473 + }, + { + "clip_ratio": 0.00035230960349963425, + "epoch": 2.1988587135131903, + "grad_norm": 0.04162426292896271, + "kl": 0.008434295654296875, + "learning_rate": 6.632040585890398e-08, + "loss": 0.0091, + "step": 1474 + }, + { + "clip_ratio": 0.000348456743722636, + "epoch": 2.2007708165277404, + "grad_norm": 0.04009128361940384, + "kl": 0.008394718170166016, + "learning_rate": 6.489272966281269e-08, + "loss": 0.009, + "step": 1475 + }, + { + "clip_ratio": 0.0, + "completion_length": 511.53015899658203, + "epoch": 2.2026829195422906, + "grad_norm": 0.03803718462586403, + "kl": 0.008605003356933594, + "learning_rate": 6.348038582493e-08, + "loss": 0.0064, + "num_tokens": 455697798.0, + "reward": 0.06863839633297175, + "reward_std": 0.0772402475704439, + "rewards/pure_accuracy_reward_math": 0.06863839423749596, + "step": 1476 + }, + { + "clip_ratio": 0.0002735381897878142, + "epoch": 2.2045950225568403, + "grad_norm": 0.036724258214235306, + "kl": 0.008575439453125, + "learning_rate": 6.208338323790891e-08, + "loss": 0.0064, + "step": 1477 + }, + { + "clip_ratio": 0.000271568493644736, + "epoch": 2.20650712557139, + "grad_norm": 0.03627302870154381, + "kl": 0.008494853973388672, + "learning_rate": 6.070173069780638e-08, + "loss": 0.0063, + "step": 1478 + }, + { + "clip_ratio": 0.0003129301562694309, + "epoch": 2.20841922858594, + "grad_norm": 0.035685960203409195, + "kl": 0.008512496948242188, + "learning_rate": 5.933543690403082e-08, + "loss": 0.0063, + "step": 1479 + }, + { + "clip_ratio": 0.0003575469975203305, + "epoch": 2.21033133160049, + "grad_norm": 0.03495527431368828, + "kl": 0.008492469787597656, + "learning_rate": 5.7984510459285215e-08, + "loss": 0.0062, + "step": 1480 + }, + { + "clip_ratio": 0.0, + "completion_length": 527.403482913971, + "epoch": 2.21224343461504, + "grad_norm": 0.041989997029304504, + "kl": 0.008183956146240234, + "learning_rate": 5.6648959869514965e-08, + "loss": 0.0075, + "num_tokens": 459321180.0, + "reward": 0.07617187898722477, + "reward_std": 0.0817908609751612, + "rewards/pure_accuracy_reward_math": 0.07617187630967237, + "step": 1481 + }, + { + "clip_ratio": 0.0003129412224893713, + "epoch": 2.2141555376295896, + "grad_norm": 0.04108978435397148, + "kl": 0.00823974609375, + "learning_rate": 5.532879354385234e-08, + "loss": 0.0075, + "step": 1482 + }, + { + "clip_ratio": 0.0003202799926498301, + "epoch": 2.2160676406441397, + "grad_norm": 0.03990933671593666, + "kl": 0.00827646255493164, + "learning_rate": 5.4024019794565176e-08, + "loss": 0.0075, + "step": 1483 + }, + { + "clip_ratio": 0.0003925440155398974, + "epoch": 2.2179797436586894, + "grad_norm": 0.039193831384181976, + "kl": 0.008234977722167969, + "learning_rate": 5.273464683700352e-08, + "loss": 0.0074, + "step": 1484 + }, + { + "clip_ratio": 0.0004001183214654702, + "epoch": 2.2198918466732396, + "grad_norm": 0.039878588169813156, + "kl": 0.00826406478881836, + "learning_rate": 5.1460682789547526e-08, + "loss": 0.0073, + "step": 1485 + }, + { + "clip_ratio": 0.0, + "completion_length": 531.470449924469, + "epoch": 2.2218039496877893, + "grad_norm": 0.04079683497548103, + "kl": 0.011513710021972656, + "learning_rate": 5.020213567355825e-08, + "loss": 0.0091, + "num_tokens": 462957626.0, + "reward": 0.06752232459257357, + "reward_std": 0.07320140459341928, + "rewards/pure_accuracy_reward_math": 0.0675223229045514, + "step": 1486 + }, + { + "clip_ratio": 0.0002717390548241383, + "epoch": 2.2237160527023394, + "grad_norm": 0.037311483174562454, + "kl": 0.011410713195800781, + "learning_rate": 4.8959013413324705e-08, + "loss": 0.009, + "step": 1487 + }, + { + "clip_ratio": 0.0002951391629721911, + "epoch": 2.225628155716889, + "grad_norm": 0.035728756338357925, + "kl": 0.011387348175048828, + "learning_rate": 4.773132383601664e-08, + "loss": 0.009, + "step": 1488 + }, + { + "clip_ratio": 0.00030970129540719427, + "epoch": 2.2275402587314392, + "grad_norm": 0.03630708530545235, + "kl": 0.011130332946777344, + "learning_rate": 4.6519074671631805e-08, + "loss": 0.0089, + "step": 1489 + }, + { + "clip_ratio": 0.00035198272149727927, + "epoch": 2.229452361745989, + "grad_norm": 0.035501569509506226, + "kl": 0.010982990264892578, + "learning_rate": 4.5322273552951265e-08, + "loss": 0.0088, + "step": 1490 + }, + { + "clip_ratio": 0.0, + "completion_length": 516.0912661552429, + "epoch": 2.231364464760539, + "grad_norm": 0.039065275341272354, + "kl": 0.008381366729736328, + "learning_rate": 4.4140928015488085e-08, + "loss": 0.0067, + "num_tokens": 466540145.0, + "reward": 0.08007812951109372, + "reward_std": 0.07346039032563567, + "rewards/pure_accuracy_reward_math": 0.08007812619325705, + "step": 1491 + }, + { + "clip_ratio": 0.0002747246091985289, + "epoch": 2.2332765677750888, + "grad_norm": 0.03766880929470062, + "kl": 0.008387088775634766, + "learning_rate": 4.297504549744119e-08, + "loss": 0.0067, + "step": 1492 + }, + { + "clip_ratio": 0.0002486348788579562, + "epoch": 2.235188670789639, + "grad_norm": 0.03599947690963745, + "kl": 0.0084991455078125, + "learning_rate": 4.182463333964909e-08, + "loss": 0.0066, + "step": 1493 + }, + { + "clip_ratio": 0.0002674886795261955, + "epoch": 2.2371007738041886, + "grad_norm": 0.0361332893371582, + "kl": 0.008679389953613281, + "learning_rate": 4.068969878554263e-08, + "loss": 0.0066, + "step": 1494 + }, + { + "clip_ratio": 0.00031218544620514876, + "epoch": 2.2390128768187387, + "grad_norm": 0.035462211817502975, + "kl": 0.008719921112060547, + "learning_rate": 3.957024898110007e-08, + "loss": 0.0065, + "step": 1495 + }, + { + "clip_ratio": 0.0, + "completion_length": 507.05945777893066, + "epoch": 2.2409249798332884, + "grad_norm": 0.10880274325609207, + "kl": 0.012134075164794922, + "learning_rate": 3.846629097480126e-08, + "loss": 0.0046, + "num_tokens": 470091662.0, + "reward": 0.07952009330620058, + "reward_std": 0.08660046098520979, + "rewards/pure_accuracy_reward_math": 0.0795200907450635, + "step": 1496 + }, + { + "clip_ratio": 0.00034633993402621854, + "epoch": 2.2428370828478386, + "grad_norm": 0.04444468766450882, + "kl": 0.010071754455566406, + "learning_rate": 3.737783171758408e-08, + "loss": 0.0045, + "step": 1497 + }, + { + "clip_ratio": 0.00040814166391101026, + "epoch": 2.2447491858623883, + "grad_norm": 0.050679393112659454, + "kl": 0.009745597839355469, + "learning_rate": 3.630487806280086e-08, + "loss": 0.0044, + "step": 1498 + }, + { + "clip_ratio": 0.00040935890626769833, + "epoch": 2.2466612888769384, + "grad_norm": 0.04249563813209534, + "kl": 0.009531974792480469, + "learning_rate": 3.524743676617426e-08, + "loss": 0.0044, + "step": 1499 + }, + { + "clip_ratio": 0.00041069585563491273, + "epoch": 2.248573391891488, + "grad_norm": 0.04013880342245102, + "kl": 0.009422779083251953, + "learning_rate": 3.42055144857556e-08, + "loss": 0.0042, + "step": 1500 + }, + { + "clip_ratio": 0.0, + "completion_length": 530.4908156394958, + "epoch": 2.250485494906038, + "grad_norm": 0.04119328781962395, + "kl": 0.00858306884765625, + "learning_rate": 3.3179117781882154e-08, + "loss": 0.0064, + "num_tokens": 473729421.0, + "reward": 0.08175223629223183, + "reward_std": 0.080375739664305, + "rewards/pure_accuracy_reward_math": 0.08175223390571773, + "step": 1501 + }, + { + "clip_ratio": 0.00027040669908728887, + "epoch": 2.252397597920588, + "grad_norm": 0.03726639971137047, + "kl": 0.008556365966796875, + "learning_rate": 3.216825311713689e-08, + "loss": 0.0064, + "step": 1502 + }, + { + "clip_ratio": 0.0003022322244419229, + "epoch": 2.254309700935138, + "grad_norm": 0.03740008547902107, + "kl": 0.008624553680419922, + "learning_rate": 3.11729268563063e-08, + "loss": 0.0063, + "step": 1503 + }, + { + "clip_ratio": 0.0002972338604081415, + "epoch": 2.2562218039496877, + "grad_norm": 0.036019936203956604, + "kl": 0.008683204650878906, + "learning_rate": 3.019314526634232e-08, + "loss": 0.0062, + "step": 1504 + }, + { + "clip_ratio": 0.0003317092545103151, + "epoch": 2.258133906964238, + "grad_norm": 0.035242002457380295, + "kl": 0.008699893951416016, + "learning_rate": 2.922891451632076e-08, + "loss": 0.0062, + "step": 1505 + }, + { + "clip_ratio": 0.0, + "completion_length": 516.8340096473694, + "epoch": 2.2600460099787876, + "grad_norm": 0.04786042869091034, + "kl": 0.0166015625, + "learning_rate": 2.8280240677403813e-08, + "loss": 0.0117, + "num_tokens": 477311002.0, + "reward": 0.08593750389991328, + "reward_std": 0.09509739134227857, + "rewards/pure_accuracy_reward_math": 0.08593750139698386, + "step": 1506 + }, + { + "clip_ratio": 0.0003771551589011324, + "epoch": 2.2619581129933373, + "grad_norm": 0.04542854428291321, + "kl": 0.016517162322998047, + "learning_rate": 2.7347129722801736e-08, + "loss": 0.0117, + "step": 1507 + }, + { + "clip_ratio": 0.00043879733209450933, + "epoch": 2.2638702160078874, + "grad_norm": 0.04336082562804222, + "kl": 0.016106605529785156, + "learning_rate": 2.6429587527734835e-08, + "loss": 0.0116, + "step": 1508 + }, + { + "clip_ratio": 0.0005006881825977416, + "epoch": 2.2657823190224375, + "grad_norm": 0.04397574067115784, + "kl": 0.015746116638183594, + "learning_rate": 2.5527619869396003e-08, + "loss": 0.0115, + "step": 1509 + }, + { + "clip_ratio": 0.0005348546662844456, + "epoch": 2.2676944220369872, + "grad_norm": 0.043936342000961304, + "kl": 0.015500068664550781, + "learning_rate": 2.464123242691574e-08, + "loss": 0.0114, + "step": 1510 + }, + { + "clip_ratio": 0.0, + "completion_length": 526.8474016189575, + "epoch": 2.269606525051537, + "grad_norm": 0.04165401682257652, + "kl": 0.008256912231445312, + "learning_rate": 2.377043078132496e-08, + "loss": 0.0079, + "num_tokens": 480935151.0, + "reward": 0.08342634345171973, + "reward_std": 0.09024772583507001, + "rewards/pure_accuracy_reward_math": 0.08342634071595967, + "step": 1511 + }, + { + "clip_ratio": 0.0003286536882569635, + "epoch": 2.271518628066087, + "grad_norm": 0.04013460502028465, + "kl": 0.008354663848876953, + "learning_rate": 2.291522041552141e-08, + "loss": 0.0079, + "step": 1512 + }, + { + "clip_ratio": 0.00034448601985559435, + "epoch": 2.273430731080637, + "grad_norm": 0.03929148614406586, + "kl": 0.008509159088134766, + "learning_rate": 2.207560671423331e-08, + "loss": 0.0078, + "step": 1513 + }, + { + "clip_ratio": 0.00038580430322099346, + "epoch": 2.275342834095187, + "grad_norm": 0.04108521342277527, + "kl": 0.008730888366699219, + "learning_rate": 2.1251594963986876e-08, + "loss": 0.0077, + "step": 1514 + }, + { + "clip_ratio": 0.00038072799372912414, + "epoch": 2.2772549371097366, + "grad_norm": 0.038887783885002136, + "kl": 0.008725643157958984, + "learning_rate": 2.0443190353072185e-08, + "loss": 0.0076, + "step": 1515 + }, + { + "clip_ratio": 0.0, + "completion_length": 519.4051609039307, + "epoch": 2.2791670401242867, + "grad_norm": 0.03783741220831871, + "kl": 0.008581161499023438, + "learning_rate": 1.9650397971510972e-08, + "loss": 0.0064, + "num_tokens": 484530587.0, + "reward": 0.08231027124566026, + "reward_std": 0.08037574036279693, + "rewards/pure_accuracy_reward_math": 0.08231026897556148, + "step": 1516 + }, + { + "clip_ratio": 0.0002746778108644321, + "epoch": 2.2810791431388364, + "grad_norm": 0.03765445947647095, + "kl": 0.008580207824707031, + "learning_rate": 1.8873222811024717e-08, + "loss": 0.0063, + "step": 1517 + }, + { + "clip_ratio": 0.00031986788579274616, + "epoch": 2.2829912461533866, + "grad_norm": 0.03684096038341522, + "kl": 0.008593082427978516, + "learning_rate": 1.8111669765003005e-08, + "loss": 0.0063, + "step": 1518 + }, + { + "clip_ratio": 0.0003354349921380617, + "epoch": 2.2849033491679362, + "grad_norm": 0.03599463030695915, + "kl": 0.008591175079345703, + "learning_rate": 1.73657436284716e-08, + "loss": 0.0062, + "step": 1519 + }, + { + "clip_ratio": 0.0003505910435706028, + "epoch": 2.2868154521824864, + "grad_norm": 0.035750966519117355, + "kl": 0.00874948501586914, + "learning_rate": 1.6635449098064972e-08, + "loss": 0.0061, + "step": 1520 + }, + { + "clip_ratio": 0.0, + "completion_length": 521.2455606460571, + "epoch": 2.288727555197036, + "grad_norm": 0.03890154883265495, + "kl": 0.008922100067138672, + "learning_rate": 1.5920790771993822e-08, + "loss": 0.0078, + "num_tokens": 488136255.0, + "reward": 0.07952009289874695, + "reward_std": 0.07556614064378664, + "rewards/pure_accuracy_reward_math": 0.07952009068685584, + "step": 1521 + }, + { + "clip_ratio": 0.00024827225587387147, + "epoch": 2.290639658211586, + "grad_norm": 0.037810854613780975, + "kl": 0.008934974670410156, + "learning_rate": 1.5221773150017882e-08, + "loss": 0.0078, + "step": 1522 + }, + { + "clip_ratio": 0.0002384709360967463, + "epoch": 2.292551761226136, + "grad_norm": 0.0364384800195694, + "kl": 0.008936882019042969, + "learning_rate": 1.4538400633417049e-08, + "loss": 0.0077, + "step": 1523 + }, + { + "clip_ratio": 0.0002599185108635993, + "epoch": 2.294463864240686, + "grad_norm": 0.035106074064970016, + "kl": 0.008829116821289062, + "learning_rate": 1.387067752496335e-08, + "loss": 0.0076, + "step": 1524 + }, + { + "clip_ratio": 0.0003290796867077006, + "epoch": 2.2963759672552357, + "grad_norm": 0.03489363566040993, + "kl": 0.0086822509765625, + "learning_rate": 1.3218608028895131e-08, + "loss": 0.0076, + "step": 1525 + }, + { + "clip_ratio": 0.0, + "completion_length": 517.0547122955322, + "epoch": 2.298288070269786, + "grad_norm": 0.040062014013528824, + "kl": 0.008834362030029297, + "learning_rate": 1.2582196250888745e-08, + "loss": 0.0071, + "num_tokens": 491722139.0, + "reward": 0.08621652179863304, + "reward_std": 0.08020308247068897, + "rewards/pure_accuracy_reward_math": 0.08621651906287298, + "step": 1526 + }, + { + "clip_ratio": 0.00031514769625573535, + "epoch": 2.3002001732843356, + "grad_norm": 0.03938477113842964, + "kl": 0.008733272552490234, + "learning_rate": 1.1961446198033855e-08, + "loss": 0.0071, + "step": 1527 + }, + { + "clip_ratio": 0.00030386562087869606, + "epoch": 2.3021122762988857, + "grad_norm": 0.03844742849469185, + "kl": 0.008654594421386719, + "learning_rate": 1.1356361778808167e-08, + "loss": 0.007, + "step": 1528 + }, + { + "clip_ratio": 0.00034510965764411594, + "epoch": 2.3040243793134354, + "grad_norm": 0.03755528852343559, + "kl": 0.00861358642578125, + "learning_rate": 1.076694680305218e-08, + "loss": 0.007, + "step": 1529 + }, + { + "clip_ratio": 0.00035207756366162357, + "epoch": 2.3059364823279855, + "grad_norm": 0.03696778416633606, + "kl": 0.008616447448730469, + "learning_rate": 1.0193204981946426e-08, + "loss": 0.0069, + "step": 1530 + }, + { + "clip_ratio": 0.0, + "completion_length": 516.7249145507812, + "epoch": 2.3078485853425352, + "grad_norm": 0.045076508074998856, + "kl": 0.014521598815917969, + "learning_rate": 9.63513992798676e-09, + "loss": 0.0065, + "num_tokens": 495305537.0, + "reward": 0.07505580713041127, + "reward_std": 0.07844264624873176, + "rewards/pure_accuracy_reward_math": 0.07505580480210483, + "step": 1531 + }, + { + "clip_ratio": 0.0003054732096074986, + "epoch": 2.3097606883570854, + "grad_norm": 0.041828691959381104, + "kl": 0.01419973373413086, + "learning_rate": 9.092755154961886e-09, + "loss": 0.0065, + "step": 1532 + }, + { + "clip_ratio": 0.00030572324658351135, + "epoch": 2.311672791371635, + "grad_norm": 0.03949357569217682, + "kl": 0.013697624206542969, + "learning_rate": 8.566054077932262e-09, + "loss": 0.0064, + "step": 1533 + }, + { + "clip_ratio": 0.0003279060996987937, + "epoch": 2.313584894386185, + "grad_norm": 0.038545649498701096, + "kl": 0.01345968246459961, + "learning_rate": 8.055040013207061e-09, + "loss": 0.0063, + "step": 1534 + }, + { + "clip_ratio": 0.00033917763732915773, + "epoch": 2.315496997400735, + "grad_norm": 0.03716408833861351, + "kl": 0.01330709457397461, + "learning_rate": 7.559716178325016e-09, + "loss": 0.0062, + "step": 1535 + }, + { + "clip_ratio": 0.0, + "completion_length": 519.2921552658081, + "epoch": 2.317409100415285, + "grad_norm": 0.041162386536598206, + "kl": 0.008297443389892578, + "learning_rate": 7.080085692032224e-09, + "loss": 0.0079, + "num_tokens": 498900584.0, + "reward": 0.08928571816068143, + "reward_std": 0.08428199036279693, + "rewards/pure_accuracy_reward_math": 0.08928571571595967, + "step": 1536 + }, + { + "clip_ratio": 0.00029752771973790004, + "epoch": 2.3193212034298347, + "grad_norm": 0.03933210298418999, + "kl": 0.008346080780029297, + "learning_rate": 6.616151574264374e-09, + "loss": 0.0079, + "step": 1537 + }, + { + "clip_ratio": 0.0003302163729017593, + "epoch": 2.321233306444385, + "grad_norm": 0.038146842271089554, + "kl": 0.008320331573486328, + "learning_rate": 6.1679167461262124e-09, + "loss": 0.0078, + "step": 1538 + }, + { + "clip_ratio": 0.0003326926421891585, + "epoch": 2.3231454094589346, + "grad_norm": 0.038072116672992706, + "kl": 0.008330345153808594, + "learning_rate": 5.735384029874336e-09, + "loss": 0.0077, + "step": 1539 + }, + { + "clip_ratio": 0.00038002995881925017, + "epoch": 2.3250575124734847, + "grad_norm": 0.037320397794246674, + "kl": 0.008296012878417969, + "learning_rate": 5.31855614889859e-09, + "loss": 0.0076, + "step": 1540 + }, + { + "clip_ratio": 0.0, + "completion_length": 520.1487407684326, + "epoch": 2.3269696154880344, + "grad_norm": 0.03688493371009827, + "kl": 0.008476734161376953, + "learning_rate": 4.917435727704867e-09, + "loss": 0.0024, + "num_tokens": 502500281.0, + "reward": 0.0811942005821038, + "reward_std": 0.0787416979437694, + "rewards/pure_accuracy_reward_math": 0.08119419842842035, + "step": 1541 + }, + { + "clip_ratio": 0.00028201957394458077, + "epoch": 2.3288817185025845, + "grad_norm": 0.03607385605573654, + "kl": 0.008441448211669922, + "learning_rate": 4.53202529190011e-09, + "loss": 0.0023, + "step": 1542 + }, + { + "clip_ratio": 0.0002742231245633775, + "epoch": 2.330793821517134, + "grad_norm": 0.03572804853320122, + "kl": 0.00852060317993164, + "learning_rate": 4.162327268173727e-09, + "loss": 0.0023, + "step": 1543 + }, + { + "clip_ratio": 0.0003046261713848253, + "epoch": 2.332705924531684, + "grad_norm": 0.034965962171554565, + "kl": 0.00861501693725586, + "learning_rate": 3.80834398428509e-09, + "loss": 0.0022, + "step": 1544 + }, + { + "clip_ratio": 0.0003226917802976459, + "epoch": 2.334618027546234, + "grad_norm": 0.034803807735443115, + "kl": 0.008724212646484375, + "learning_rate": 3.470077669046612e-09, + "loss": 0.0021, + "step": 1545 + }, + { + "clip_ratio": 0.0, + "completion_length": 538.0273699760437, + "epoch": 2.336530130560784, + "grad_norm": 0.034996818751096725, + "kl": 0.008575439453125, + "learning_rate": 3.147530452311809e-09, + "loss": 0.0064, + "num_tokens": 506159719.0, + "reward": 0.06891741408617236, + "reward_std": 0.07063014718005434, + "rewards/pure_accuracy_reward_math": 0.06891741210711189, + "step": 1546 + }, + { + "clip_ratio": 0.00023073077210256088, + "epoch": 2.338442233575334, + "grad_norm": 0.03347066789865494, + "kl": 0.008565902709960938, + "learning_rate": 2.8407043649597567e-09, + "loss": 0.0063, + "step": 1547 + }, + { + "clip_ratio": 0.000268154504112772, + "epoch": 2.3403543365898836, + "grad_norm": 0.03273630142211914, + "kl": 0.008545398712158203, + "learning_rate": 2.549601338883989e-09, + "loss": 0.0063, + "step": 1548 + }, + { + "clip_ratio": 0.00029292683666426456, + "epoch": 2.3422664396044337, + "grad_norm": 0.032376162707805634, + "kl": 0.008570671081542969, + "learning_rate": 2.2742232069794533e-09, + "loss": 0.0063, + "step": 1549 + }, + { + "clip_ratio": 0.0003443536306235728, + "epoch": 2.344178542618984, + "grad_norm": 0.031950000673532486, + "kl": 0.008484363555908203, + "learning_rate": 2.01457170313113e-09, + "loss": 0.0062, + "step": 1550 + }, + { + "clip_ratio": 0.0, + "completion_length": 520.7207255363464, + "epoch": 2.3460906456335335, + "grad_norm": 0.04171088710427284, + "kl": 0.009114742279052734, + "learning_rate": 1.7706484622034837e-09, + "loss": 0.005, + "num_tokens": 509757966.0, + "reward": 0.07672991443541832, + "reward_std": 0.08149181143380702, + "rewards/pure_accuracy_reward_math": 0.07672991228173487, + "step": 1551 + }, + { + "clip_ratio": 0.0003305982788788242, + "epoch": 2.3480027486480832, + "grad_norm": 0.04123101010918617, + "kl": 0.009046554565429688, + "learning_rate": 1.5424550200293653e-09, + "loss": 0.005, + "step": 1552 + }, + { + "clip_ratio": 0.0003486324259256435, + "epoch": 2.3499148516626334, + "grad_norm": 0.039809513837099075, + "kl": 0.008966445922851562, + "learning_rate": 1.3299928134014039e-09, + "loss": 0.0049, + "step": 1553 + }, + { + "clip_ratio": 0.0003954665013452541, + "epoch": 2.351826954677183, + "grad_norm": 0.0393875353038311, + "kl": 0.008915901184082031, + "learning_rate": 1.1332631800620164e-09, + "loss": 0.0049, + "step": 1554 + }, + { + "clip_ratio": 0.0004334128346954458, + "epoch": 2.353739057691733, + "grad_norm": 0.03990260884165764, + "kl": 0.008862972259521484, + "learning_rate": 9.522673586956355e-10, + "loss": 0.0047, + "step": 1555 + }, + { + "clip_ratio": 0.0, + "completion_length": 518.488025188446, + "epoch": 2.355651160706283, + "grad_norm": 0.04300679266452789, + "kl": 0.009171009063720703, + "learning_rate": 7.870064889206608e-10, + "loss": 0.0082, + "num_tokens": 513350767.0, + "reward": 0.07728794994181953, + "reward_std": 0.08290693227900192, + "rewards/pure_accuracy_reward_math": 0.07728794743889011, + "step": 1556 + }, + { + "clip_ratio": 0.000295089724772879, + "epoch": 2.357563263720833, + "grad_norm": 0.04144243150949478, + "kl": 0.009136676788330078, + "learning_rate": 6.374816112819648e-10, + "loss": 0.0082, + "step": 1557 + }, + { + "clip_ratio": 0.0003283331608940898, + "epoch": 2.3594753667353827, + "grad_norm": 0.039357006549835205, + "kl": 0.009202003479003906, + "learning_rate": 5.036936672447868e-10, + "loss": 0.0081, + "step": 1558 + }, + { + "clip_ratio": 0.00036647373104869985, + "epoch": 2.361387469749933, + "grad_norm": 0.03904441371560097, + "kl": 0.009307384490966797, + "learning_rate": 3.8564349918890356e-10, + "loss": 0.008, + "step": 1559 + }, + { + "clip_ratio": 0.0004084905730792343, + "epoch": 2.3632995727644825, + "grad_norm": 0.03901646286249161, + "kl": 0.00932168960571289, + "learning_rate": 2.833318504030791e-10, + "loss": 0.0079, + "step": 1560 + }, + { + "clip_ratio": 0.0, + "completion_length": 527.474356174469, + "epoch": 2.3652116757790327, + "grad_norm": 5.391517162322998, + "kl": 0.0942845344543457, + "learning_rate": 1.9675936507979056e-10, + "loss": 0.0081, + "num_tokens": 516974751.0, + "reward": 0.06975446754950099, + "reward_std": 0.06989945453824475, + "rewards/pure_accuracy_reward_math": 0.06975446597789414, + "step": 1561 + }, + { + "clip_ratio": 0.0002886794856635788, + "epoch": 2.3671237787935824, + "grad_norm": 0.1764528900384903, + "kl": 0.013553619384765625, + "learning_rate": 1.2592658831245274e-10, + "loss": 0.0049, + "step": 1562 + }, + { + "clip_ratio": 0.00028670978349509824, + "epoch": 2.3690358818081325, + "grad_norm": 0.03846847265958786, + "kl": 0.009183406829833984, + "learning_rate": 7.083396609097737e-11, + "loss": 0.0047, + "step": 1563 + }, + { + "clip_ratio": 0.0002776476591748178, + "epoch": 2.370947984822682, + "grad_norm": 0.035545963793992996, + "kl": 0.008979320526123047, + "learning_rate": 3.148184529927489e-11, + "loss": 0.0046, + "step": 1564 + }, + { + "clip_ratio": 0.00032522391097700165, + "epoch": 2.3728600878372323, + "grad_norm": 0.1538141518831253, + "kl": 0.009156227111816406, + "learning_rate": 7.870473713589288e-12, + "loss": 0.0046, + "step": 1565 + }, + { + "clip_ratio": 0.0, + "completion_length": 530.6135845184326, + "epoch": 2.374772190851782, + "grad_norm": 0.0368269719183445, + "kl": 0.008574485778808594, + "learning_rate": 0.0, + "loss": 0.0087, + "num_tokens": 520611370.0, + "reward": 0.07142857427243143, + "reward_std": 0.07900068280287087, + "rewards/pure_accuracy_reward_math": 0.07142857293365523, + "step": 1566 + }, + { + "epoch": 2.374772190851782, + "step": 1566, + "total_flos": 0.0, + "train_loss": 0.003398028112404372, + "train_runtime": 273585.6306, + "train_samples_per_second": 1.028, + "train_steps_per_second": 0.006 + } + ], + "logging_steps": 1, + "max_steps": 1566, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}