diff --git "a/trainer_state.json" "b/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/trainer_state.json"
@@ -0,0 +1,15707 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.374772190851782,
+  "eval_steps": 100,
+  "global_step": 1566,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 562.7477951049805,
+      "epoch": 0.0019121030145499089,
+      "grad_norm": 0.022798627614974976,
+      "kl": 0.0,
+      "learning_rate": 1.5923566878980894e-08,
+      "loss": 0.002,
+      "num_tokens": 3752220.0,
+      "reward": 0.011439732770668343,
+      "reward_std": 0.019404857070185244,
+      "rewards/pure_accuracy_reward_math": 0.011439732537837699,
+      "step": 1
+    },
+    {
+      "clip_ratio": 0.0,
+      "epoch": 0.0038242060290998177,
+      "grad_norm": 0.02280641719698906,
+      "kl": 0.0,
+      "learning_rate": 3.184713375796179e-08,
+      "loss": 0.002,
+      "step": 2
+    },
+    {
+      "clip_ratio": 7.760171627069212e-05,
+      "epoch": 0.005736309043649726,
+      "grad_norm": 0.02249608002603054,
+      "kl": 0.00034177303314208984,
+      "learning_rate": 4.777070063694268e-08,
+      "loss": 0.002,
+      "step": 3
+    },
+    {
+      "clip_ratio": 7.010291557207893e-05,
+      "epoch": 0.0076484120581996355,
+      "grad_norm": 0.022546162828803062,
+      "kl": 0.0003476440906524658,
+      "learning_rate": 6.369426751592358e-08,
+      "loss": 0.002,
+      "step": 4
+    },
+    {
+      "clip_ratio": 6.121935876990392e-05,
+      "epoch": 0.009560515072749545,
+      "grad_norm": 0.022293007001280785,
+      "kl": 0.00034675002098083496,
+      "learning_rate": 7.961783439490447e-08,
+      "loss": 0.002,
+      "step": 5
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 569.8786544799805,
+      "epoch": 0.011472618087299453,
+      "grad_norm": 0.04666038230061531,
+      "kl": 0.000458449125289917,
+      "learning_rate": 9.554140127388536e-08,
+      "loss": 0.0036,
+      "num_tokens": 7526881.0,
+      "reward": 0.010323661233996972,
+      "reward_std": 0.01923220051685348,
+      "rewards/pure_accuracy_reward_math": 0.01032366111758165,
+      "step": 6
+    },
+    {
+      "clip_ratio": 9.284320668712098e-05,
+      "epoch": 0.013384721101849363,
+      "grad_norm": 0.03701707720756531,
+      "kl": 0.0004444718360900879,
+      "learning_rate": 1.1146496815286625e-07,
+      "loss": 0.0037,
+      "step": 7
+    },
+    {
+      "clip_ratio": 0.00010049525423028172,
+      "epoch": 0.015296824116399271,
+      "grad_norm": 0.05443934351205826,
+      "kl": 0.0004649162292480469,
+      "learning_rate": 1.2738853503184715e-07,
+      "loss": 0.0037,
+      "step": 8
+    },
+    {
+      "clip_ratio": 9.395023369052069e-05,
+      "epoch": 0.01720892713094918,
+      "grad_norm": 0.0357414111495018,
+      "kl": 0.0004501640796661377,
+      "learning_rate": 1.4331210191082803e-07,
+      "loss": 0.0037,
+      "step": 9
+    },
+    {
+      "clip_ratio": 0.00010371651984542041,
+      "epoch": 0.01912103014549909,
+      "grad_norm": 0.05199029669165611,
+      "kl": 0.0004614591598510742,
+      "learning_rate": 1.5923566878980893e-07,
+      "loss": 0.0037,
+      "step": 10
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 570.0694994926453,
+      "epoch": 0.021033133160048997,
+      "grad_norm": 0.022845298051834106,
+      "kl": 0.00035685300827026367,
+      "learning_rate": 1.751592356687898e-07,
+      "loss": 0.0025,
+      "num_tokens": 11302358.0,
+      "reward": 0.00948660756694153,
+      "reward_std": 0.017558093182742596,
+      "rewards/pure_accuracy_reward_math": 0.00948660756694153,
+      "step": 11
+    },
+    {
+      "clip_ratio": 7.08361723127382e-05,
+      "epoch": 0.022945236174598906,
+      "grad_norm": 0.02234972082078457,
+      "kl": 0.0003580451011657715,
+      "learning_rate": 1.9108280254777072e-07,
+      "loss": 0.0025,
+      "step": 12
+    },
+    {
+      "clip_ratio": 6.80922717606336e-05,
+      "epoch": 0.024857339189148814,
+      "grad_norm": 0.021554963663220406,
+      "kl": 0.00035765767097473145,
+      "learning_rate": 2.070063694267516e-07,
+      "loss": 0.0024,
+      "step": 13
+    },
+    {
+      "clip_ratio": 7.82350492158912e-05,
+      "epoch": 0.026769442203698725,
+      "grad_norm": 0.02103673666715622,
+      "kl": 0.000364154577255249,
+      "learning_rate": 2.229299363057325e-07,
+      "loss": 0.0025,
+      "step": 14
+    },
+    {
+      "clip_ratio": 7.339451894949889e-05,
+      "epoch": 0.028681545218248634,
+      "grad_norm": 0.023219415917992592,
+      "kl": 0.00036078691482543945,
+      "learning_rate": 2.3885350318471343e-07,
+      "loss": 0.0025,
+      "step": 15
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 560.8797726631165,
+      "epoch": 0.030593648232798542,
+      "grad_norm": 0.024746257811784744,
+      "kl": 0.0003574490547180176,
+      "learning_rate": 2.547770700636943e-07,
+      "loss": 0.0041,
+      "num_tokens": 15044695.0,
+      "reward": 0.011160714813740924,
+      "reward_std": 0.0194911856087856,
+      "rewards/pure_accuracy_reward_math": 0.011160714755533263,
+      "step": 16
+    },
+    {
+      "clip_ratio": 9.0199953319825e-05,
+      "epoch": 0.032505751247348454,
+      "grad_norm": 0.02409624680876732,
+      "kl": 0.0003629624843597412,
+      "learning_rate": 2.707006369426752e-07,
+      "loss": 0.0042,
+      "step": 17
+    },
+    {
+      "clip_ratio": 8.157364351291108e-05,
+      "epoch": 0.03441785426189836,
+      "grad_norm": 0.023118698969483376,
+      "kl": 0.0003673136234283447,
+      "learning_rate": 2.8662420382165606e-07,
+      "loss": 0.0041,
+      "step": 18
+    },
+    {
+      "clip_ratio": 9.048881202033954e-05,
+      "epoch": 0.03632995727644827,
+      "grad_norm": 0.02316245064139366,
+      "kl": 0.00036725401878356934,
+      "learning_rate": 3.02547770700637e-07,
+      "loss": 0.0041,
+      "step": 19
+    },
+    {
+      "clip_ratio": 8.188984941170929e-05,
+      "epoch": 0.03824206029099818,
+      "grad_norm": 0.021714523434638977,
+      "kl": 0.0003698766231536865,
+      "learning_rate": 3.1847133757961787e-07,
+      "loss": 0.0041,
+      "step": 20
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 554.6908774375916,
+      "epoch": 0.040154163305548086,
+      "grad_norm": 0.021168457344174385,
+      "kl": 0.000368267297744751,
+      "learning_rate": 3.3439490445859875e-07,
+      "loss": 0.0026,
+      "num_tokens": 18758275.0,
+      "reward": 0.010044643335277215,
+      "reward_std": 0.018202457285951823,
+      "rewards/pure_accuracy_reward_math": 0.010044643277069554,
+      "step": 21
+    },
+    {
+      "clip_ratio": 7.562077911416054e-05,
+      "epoch": 0.042066266320097995,
+      "grad_norm": 0.020001132041215897,
+      "kl": 0.00037425756454467773,
+      "learning_rate": 3.503184713375796e-07,
+      "loss": 0.0026,
+      "step": 22
+    },
+    {
+      "clip_ratio": 7.507880479806772e-05,
+      "epoch": 0.0439783693346479,
+      "grad_norm": 0.019386926665902138,
+      "kl": 0.0003781616687774658,
+      "learning_rate": 3.6624203821656055e-07,
+      "loss": 0.0026,
+      "step": 23
+    },
+    {
+      "clip_ratio": 7.805726602327923e-05,
+      "epoch": 0.04589047234919781,
+      "grad_norm": 0.018619129434227943,
+      "kl": 0.0003878176212310791,
+      "learning_rate": 3.8216560509554143e-07,
+      "loss": 0.0026,
+      "step": 24
+    },
+    {
+      "clip_ratio": 6.671031508176384e-05,
+      "epoch": 0.04780257536374772,
+      "grad_norm": 0.01833859272301197,
+      "kl": 0.00040024518966674805,
+      "learning_rate": 3.980891719745223e-07,
+      "loss": 0.0026,
+      "step": 25
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 552.0828938484192,
+      "epoch": 0.04971467837829763,
+      "grad_norm": 0.02587960660457611,
+      "kl": 0.00041344761848449707,
+      "learning_rate": 4.140127388535032e-07,
+      "loss": 0.0024,
+      "num_tokens": 22468764.0,
+      "reward": 0.012276786321308464,
+      "reward_std": 0.022195036057382822,
+      "rewards/pure_accuracy_reward_math": 0.012276786204893142,
+      "step": 26
+    },
+    {
+      "clip_ratio": 9.613389988771814e-05,
+      "epoch": 0.05162678139284754,
+      "grad_norm": 0.02422533929347992,
+      "kl": 0.00043016672134399414,
+      "learning_rate": 4.2993630573248406e-07,
+      "loss": 0.0024,
+      "step": 27
+    },
+    {
+      "clip_ratio": 8.45099556840978e-05,
+      "epoch": 0.05353888440739745,
+      "grad_norm": 0.023998353630304337,
+      "kl": 0.0004411041736602783,
+      "learning_rate": 4.45859872611465e-07,
+      "loss": 0.0024,
+      "step": 28
+    },
+    {
+      "clip_ratio": 9.715859295056362e-05,
+      "epoch": 0.05545098742194736,
+      "grad_norm": 0.023024486377835274,
+      "kl": 0.0004749894142150879,
+      "learning_rate": 4.6178343949044587e-07,
+      "loss": 0.0024,
+      "step": 29
+    },
+    {
+      "clip_ratio": 9.816014483021718e-05,
+      "epoch": 0.05736309043649727,
+      "grad_norm": 0.022171439602971077,
+      "kl": 0.0005015134811401367,
+      "learning_rate": 4.777070063694269e-07,
+      "loss": 0.0024,
+      "step": 30
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 549.791042804718,
+      "epoch": 0.059275193451047176,
+      "grad_norm": 0.027614159509539604,
+      "kl": 0.0005223453044891357,
+      "learning_rate": 4.936305732484077e-07,
+      "loss": 0.0029,
+      "num_tokens": 26170579.0,
+      "reward": 0.017299108090810478,
+      "reward_std": 0.03018019301816821,
+      "rewards/pure_accuracy_reward_math": 0.017299107741564512,
+      "step": 31
+    },
+    {
+      "clip_ratio": 0.00012569415866892086,
+      "epoch": 0.061187296465597084,
+      "grad_norm": 0.02653171494603157,
+      "kl": 0.0005522072315216064,
+      "learning_rate": 5.095541401273886e-07,
+      "loss": 0.0029,
+      "step": 32
+    },
+    {
+      "clip_ratio": 0.00012863677034147258,
+      "epoch": 0.06309939948014699,
+      "grad_norm": 0.0255680400878191,
+      "kl": 0.0005916953086853027,
+      "learning_rate": 5.254777070063695e-07,
+      "loss": 0.0029,
+      "step": 33
+    },
+    {
+      "clip_ratio": 0.00012797017114962728,
+      "epoch": 0.06501150249469691,
+      "grad_norm": 0.02455417811870575,
+      "kl": 0.0006306171417236328,
+      "learning_rate": 5.414012738853504e-07,
+      "loss": 0.0029,
+      "step": 34
+    },
+    {
+      "clip_ratio": 0.00012855784757448419,
+      "epoch": 0.06692360550924681,
+      "grad_norm": 0.024154040962457657,
+      "kl": 0.0006751418113708496,
+      "learning_rate": 5.573248407643312e-07,
+      "loss": 0.0029,
+      "step": 35
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 552.728542804718,
+      "epoch": 0.06883570852379672,
+      "grad_norm": 0.023450903594493866,
+      "kl": 0.000738978385925293,
+      "learning_rate": 5.732484076433121e-07,
+      "loss": 0.0034,
+      "num_tokens": 29883398.0,
+      "reward": 0.018415179511066526,
+      "reward_std": 0.030997214023955166,
+      "rewards/pure_accuracy_reward_math": 0.01841517922002822,
+      "step": 36
+    },
+    {
+      "clip_ratio": 0.00012425195308196635,
+      "epoch": 0.07074781153834662,
+      "grad_norm": 0.023070134222507477,
+      "kl": 0.0007783770561218262,
+      "learning_rate": 5.89171974522293e-07,
+      "loss": 0.0034,
+      "step": 37
+    },
+    {
+      "clip_ratio": 0.00012334759713894528,
+      "epoch": 0.07265991455289654,
+      "grad_norm": 0.023447532206773758,
+      "kl": 0.0008447170257568359,
+      "learning_rate": 6.05095541401274e-07,
+      "loss": 0.0034,
+      "step": 38
+    },
+    {
+      "clip_ratio": 0.00012615493608336692,
+      "epoch": 0.07457201756744644,
+      "grad_norm": 0.024682210758328438,
+      "kl": 0.0009213089942932129,
+      "learning_rate": 6.210191082802549e-07,
+      "loss": 0.0034,
+      "step": 39
+    },
+    {
+      "clip_ratio": 0.00012461718182521508,
+      "epoch": 0.07648412058199636,
+      "grad_norm": 0.02555885910987854,
+      "kl": 0.000977635383605957,
+      "learning_rate": 6.369426751592357e-07,
+      "loss": 0.0033,
+      "step": 40
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 527.030158996582,
+      "epoch": 0.07839622359654626,
+      "grad_norm": 0.059237755835056305,
+      "kl": 0.001125633716583252,
+      "learning_rate": 6.528662420382166e-07,
+      "loss": 0.0031,
+      "num_tokens": 33502406.0,
+      "reward": 0.024832590454025194,
+      "reward_std": 0.04194520629243925,
+      "rewards/pure_accuracy_reward_math": 0.02483259010477923,
+      "step": 41
+    },
+    {
+      "clip_ratio": 0.00016323180295785278,
+      "epoch": 0.08030832661109617,
+      "grad_norm": 0.029172642156481743,
+      "kl": 0.0011183619499206543,
+      "learning_rate": 6.687898089171975e-07,
+      "loss": 0.0031,
+      "step": 42
+    },
+    {
+      "clip_ratio": 0.0001751068371618203,
+      "epoch": 0.08222042962564609,
+      "grad_norm": 0.030453085899353027,
+      "kl": 0.0011813640594482422,
+      "learning_rate": 6.847133757961784e-07,
+      "loss": 0.0031,
+      "step": 43
+    },
+    {
+      "clip_ratio": 0.00018521026674989116,
+      "epoch": 0.08413253264019599,
+      "grad_norm": 0.03091653250157833,
+      "kl": 0.0012224912643432617,
+      "learning_rate": 7.006369426751592e-07,
+      "loss": 0.0031,
+      "step": 44
+    },
+    {
+      "clip_ratio": 0.00017049979595640252,
+      "epoch": 0.0860446356547459,
+      "grad_norm": 0.030593233183026314,
+      "kl": 0.0012733936309814453,
+      "learning_rate": 7.165605095541401e-07,
+      "loss": 0.0031,
+      "step": 45
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 529.7360739707947,
+      "epoch": 0.0879567386692958,
+      "grad_norm": 0.03371572494506836,
+      "kl": 0.0012704133987426758,
+      "learning_rate": 7.324840764331211e-07,
+      "loss": 0.0039,
+      "num_tokens": 37133676.0,
+      "reward": 0.029017858760198578,
+      "reward_std": 0.04838265001308173,
+      "rewards/pure_accuracy_reward_math": 0.029017858061706647,
+      "step": 46
+    },
+    {
+      "clip_ratio": 0.000227557278265067,
+      "epoch": 0.08986884168384572,
+      "grad_norm": 0.033185359090566635,
+      "kl": 0.0012688040733337402,
+      "learning_rate": 7.48407643312102e-07,
+      "loss": 0.0039,
+      "step": 47
+    },
+    {
+      "clip_ratio": 0.0002238695693677073,
+      "epoch": 0.09178094469839562,
+      "grad_norm": 0.03329231217503548,
+      "kl": 0.0013200044631958008,
+      "learning_rate": 7.643312101910829e-07,
+      "loss": 0.0039,
+      "step": 48
+    },
+    {
+      "clip_ratio": 0.00021458888153347289,
+      "epoch": 0.09369304771294554,
+      "grad_norm": 0.03329336270689964,
+      "kl": 0.0013244152069091797,
+      "learning_rate": 7.802547770700637e-07,
+      "loss": 0.0039,
+      "step": 49
+    },
+    {
+      "clip_ratio": 0.0002193794426830209,
+      "epoch": 0.09560515072749544,
+      "grad_norm": 0.0323607362806797,
+      "kl": 0.0013269782066345215,
+      "learning_rate": 7.961783439490446e-07,
+      "loss": 0.0039,
+      "step": 50
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 535.9352931976318,
+      "epoch": 0.09751725374204535,
+      "grad_norm": 0.030199358239769936,
+      "kl": 0.0013506412506103516,
+      "learning_rate": 8.121019108280255e-07,
+      "loss": 0.0042,
+      "num_tokens": 40789896.0,
+      "reward": 0.030970983498264104,
+      "reward_std": 0.0486878992523998,
+      "rewards/pure_accuracy_reward_math": 0.030970983090810478,
+      "step": 51
+    },
+    {
+      "clip_ratio": 0.00019589511845197194,
+      "epoch": 0.09942935675659526,
+      "grad_norm": 0.029786745086312294,
+      "kl": 0.001370549201965332,
+      "learning_rate": 8.280254777070064e-07,
+      "loss": 0.0042,
+      "step": 52
+    },
+    {
+      "clip_ratio": 0.00021279048064570816,
+      "epoch": 0.10134145977114517,
+      "grad_norm": 0.029834378510713577,
+      "kl": 0.0013399124145507812,
+      "learning_rate": 8.439490445859872e-07,
+      "loss": 0.0042,
+      "step": 53
+    },
+    {
+      "clip_ratio": 0.000190277668878025,
+      "epoch": 0.10325356278569509,
+      "grad_norm": 0.029410598799586296,
+      "kl": 0.00139617919921875,
+      "learning_rate": 8.598726114649681e-07,
+      "loss": 0.0042,
+      "step": 54
+    },
+    {
+      "clip_ratio": 0.00019459096591845082,
+      "epoch": 0.10516566580024499,
+      "grad_norm": 0.02935440093278885,
+      "kl": 0.0014204978942871094,
+      "learning_rate": 8.757961783439491e-07,
+      "loss": 0.0042,
+      "step": 55
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 536.9548239707947,
+      "epoch": 0.1070777688147949,
+      "grad_norm": 0.02805081568658352,
+      "kl": 0.0014168024063110352,
+      "learning_rate": 8.9171974522293e-07,
+      "loss": 0.0048,
+      "num_tokens": 44444894.0,
+      "reward": 0.027901787223527208,
+      "reward_std": 0.04121451411629096,
+      "rewards/pure_accuracy_reward_math": 0.02790178669965826,
+      "step": 56
+    },
+    {
+      "clip_ratio": 0.00016821016617996065,
+      "epoch": 0.1089898718293448,
+      "grad_norm": 0.02779608964920044,
+      "kl": 0.0014551877975463867,
+      "learning_rate": 9.076433121019109e-07,
+      "loss": 0.0048,
+      "step": 57
+    },
+    {
+      "clip_ratio": 0.00018197509814399382,
+      "epoch": 0.11090197484389472,
+      "grad_norm": 0.02721741609275341,
+      "kl": 0.0014206171035766602,
+      "learning_rate": 9.235668789808917e-07,
+      "loss": 0.0048,
+      "step": 58
+    },
+    {
+      "clip_ratio": 0.00016919344039934003,
+      "epoch": 0.11281407785844462,
+      "grad_norm": 0.02676265314221382,
+      "kl": 0.0014575719833374023,
+      "learning_rate": 9.394904458598727e-07,
+      "loss": 0.0048,
+      "step": 59
+    },
+    {
+      "clip_ratio": 0.00017069062050723005,
+      "epoch": 0.11472618087299453,
+      "grad_norm": 0.027010478079319,
+      "kl": 0.0014843940734863281,
+      "learning_rate": 9.554140127388537e-07,
+      "loss": 0.0048,
+      "step": 60
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 522.1998043060303,
+      "epoch": 0.11663828388754444,
+      "grad_norm": 0.030231643468141556,
+      "kl": 0.0015106201171875,
+      "learning_rate": 9.713375796178345e-07,
+      "loss": 0.0029,
+      "num_tokens": 48046694.0,
+      "reward": 0.02762276935391128,
+      "reward_std": 0.04623683576937765,
+      "rewards/pure_accuracy_reward_math": 0.02762276877183467,
+      "step": 61
+    },
+    {
+      "clip_ratio": 0.0001882643781527804,
+      "epoch": 0.11855038690209435,
+      "grad_norm": 0.030413959175348282,
+      "kl": 0.0015065670013427734,
+      "learning_rate": 9.872611464968155e-07,
+      "loss": 0.0029,
+      "step": 62
+    },
+    {
+      "clip_ratio": 0.00019050979824442038,
+      "epoch": 0.12046248991664425,
+      "grad_norm": 0.029997214674949646,
+      "kl": 0.0014984607696533203,
+      "learning_rate": 1.0031847133757962e-06,
+      "loss": 0.0029,
+      "step": 63
+    },
+    {
+      "clip_ratio": 0.0001963579389325787,
+      "epoch": 0.12237459293119417,
+      "grad_norm": 0.02927768975496292,
+      "kl": 0.0014634132385253906,
+      "learning_rate": 1.0191082802547772e-06,
+      "loss": 0.0029,
+      "step": 64
+    },
+    {
+      "clip_ratio": 0.0002130206620449826,
+      "epoch": 0.12428669594574408,
+      "grad_norm": 0.028719380497932434,
+      "kl": 0.0014470815658569336,
+      "learning_rate": 1.035031847133758e-06,
+      "loss": 0.0029,
+      "step": 65
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 516.4283137321472,
+      "epoch": 0.12619879896029398,
+      "grad_norm": 0.031215306371450424,
+      "kl": 0.0014127492904663086,
+      "learning_rate": 1.050955414012739e-06,
+      "loss": 0.0038,
+      "num_tokens": 51628501.0,
+      "reward": 0.03487723405123688,
+      "reward_std": 0.05173706269124523,
+      "rewards/pure_accuracy_reward_math": 0.03487723323632963,
+      "step": 66
+    },
+    {
+      "clip_ratio": 0.00019433782460964721,
+      "epoch": 0.1281109019748439,
+      "grad_norm": 0.03108724020421505,
+      "kl": 0.0014324188232421875,
+      "learning_rate": 1.06687898089172e-06,
+      "loss": 0.0038,
+      "step": 67
+    },
+    {
+      "clip_ratio": 0.00020085336353758976,
+      "epoch": 0.13002300498939381,
+      "grad_norm": 0.030220478773117065,
+      "kl": 0.0014306306838989258,
+      "learning_rate": 1.0828025477707007e-06,
+      "loss": 0.0038,
+      "step": 68
+    },
+    {
+      "clip_ratio": 0.00021161197844321578,
+      "epoch": 0.1319351080039437,
+      "grad_norm": 0.030320733785629272,
+      "kl": 0.001450181007385254,
+      "learning_rate": 1.0987261146496817e-06,
+      "loss": 0.0038,
+      "step": 69
+    },
+    {
+      "clip_ratio": 0.00019352555551677142,
+      "epoch": 0.13384721101849362,
+      "grad_norm": 0.02980073168873787,
+      "kl": 0.0014796257019042969,
+      "learning_rate": 1.1146496815286625e-06,
+      "loss": 0.0038,
+      "step": 70
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 525.2854599952698,
+      "epoch": 0.13575931403304353,
+      "grad_norm": 0.0338500440120697,
+      "kl": 0.0015006065368652344,
+      "learning_rate": 1.1305732484076435e-06,
+      "loss": 0.006,
+      "num_tokens": 55247180.0,
+      "reward": 0.03710937674622983,
+      "reward_std": 0.05426825548056513,
+      "rewards/pure_accuracy_reward_math": 0.037109375989530236,
+      "step": 71
+    },
+    {
+      "clip_ratio": 0.0002256608086668166,
+      "epoch": 0.13767141704759345,
+      "grad_norm": 0.03328324109315872,
+      "kl": 0.0015664100646972656,
+      "learning_rate": 1.1464968152866242e-06,
+      "loss": 0.006,
+      "step": 72
+    },
+    {
+      "clip_ratio": 0.0002166868289350532,
+      "epoch": 0.13958352006214333,
+      "grad_norm": 0.03267475962638855,
+      "kl": 0.0016113519668579102,
+      "learning_rate": 1.1624203821656052e-06,
+      "loss": 0.006,
+      "step": 73
+    },
+    {
+      "clip_ratio": 0.00024709346627105333,
+      "epoch": 0.14149562307669325,
+      "grad_norm": 0.032320376485586166,
+      "kl": 0.0017037391662597656,
+      "learning_rate": 1.178343949044586e-06,
+      "loss": 0.006,
+      "step": 74
+    },
+    {
+      "clip_ratio": 0.00021453456992048814,
+      "epoch": 0.14340772609124317,
+      "grad_norm": 0.0322573184967041,
+      "kl": 0.0017703771591186523,
+      "learning_rate": 1.194267515923567e-06,
+      "loss": 0.006,
+      "step": 75
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 539.314199924469,
+      "epoch": 0.14531982910579308,
+      "grad_norm": 0.03833702206611633,
+      "kl": 0.0018039941787719727,
+      "learning_rate": 1.210191082802548e-06,
+      "loss": 0.0055,
+      "num_tokens": 58912938.0,
+      "reward": 0.04045759144355543,
+      "reward_std": 0.060838291130494326,
+      "rewards/pure_accuracy_reward_math": 0.040457590454025194,
+      "step": 76
+    },
+    {
+      "clip_ratio": 0.0002450900424548763,
+      "epoch": 0.147231932120343,
+      "grad_norm": 0.03705858439207077,
+      "kl": 0.0018303394317626953,
+      "learning_rate": 1.2261146496815287e-06,
+      "loss": 0.0055,
+      "step": 77
+    },
+    {
+      "clip_ratio": 0.0002520209266094753,
+      "epoch": 0.14914403513489288,
+      "grad_norm": 0.03624257072806358,
+      "kl": 0.0019118785858154297,
+      "learning_rate": 1.2420382165605097e-06,
+      "loss": 0.0055,
+      "step": 78
+    },
+    {
+      "clip_ratio": 0.00023157394139161624,
+      "epoch": 0.1510561381494428,
+      "grad_norm": 0.03626013919711113,
+      "kl": 0.001949906349182129,
+      "learning_rate": 1.2579617834394905e-06,
+      "loss": 0.0055,
+      "step": 79
+    },
+    {
+      "clip_ratio": 0.0002889583781211513,
+      "epoch": 0.1529682411639927,
+      "grad_norm": 0.03634464740753174,
+      "kl": 0.001984238624572754,
+      "learning_rate": 1.2738853503184715e-06,
+      "loss": 0.0055,
+      "step": 80
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 531.2025918960571,
+      "epoch": 0.15488034417854263,
+      "grad_norm": 0.032439347356557846,
+      "kl": 0.0019190311431884766,
+      "learning_rate": 1.2898089171974522e-06,
+      "loss": 0.0067,
+      "num_tokens": 62551992.0,
+      "reward": 0.03766741280560382,
+      "reward_std": 0.0572711571585387,
+      "rewards/pure_accuracy_reward_math": 0.0376674119324889,
+      "step": 81
+    },
+    {
+      "clip_ratio": 0.00025730342139240747,
+      "epoch": 0.15679244719309252,
+      "grad_norm": 0.03198026493191719,
+      "kl": 0.001917123794555664,
+      "learning_rate": 1.3057324840764332e-06,
+      "loss": 0.0067,
+      "step": 82
+    },
+    {
+      "clip_ratio": 0.0002504205738205201,
+      "epoch": 0.15870455020764243,
+      "grad_norm": 0.02998184598982334,
+      "kl": 0.0019073486328125,
+      "learning_rate": 1.3216560509554142e-06,
+      "loss": 0.0067,
+      "step": 83
+    },
+    {
+      "clip_ratio": 0.00025362581419585695,
+      "epoch": 0.16061665322219235,
+      "grad_norm": 0.029601849615573883,
+      "kl": 0.0019354820251464844,
+      "learning_rate": 1.337579617834395e-06,
+      "loss": 0.0067,
+      "step": 84
+    },
+    {
+      "clip_ratio": 0.0003167184295307379,
+      "epoch": 0.16252875623674226,
+      "grad_norm": 0.030052170157432556,
+      "kl": 0.0019598007202148438,
+      "learning_rate": 1.353503184713376e-06,
+      "loss": 0.0067,
+      "step": 85
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 527.9562168121338,
+      "epoch": 0.16444085925129218,
+      "grad_norm": 0.03331635147333145,
+      "kl": 0.002047300338745117,
+      "learning_rate": 1.3694267515923567e-06,
+      "loss": 0.0076,
+      "num_tokens": 66182275.0,
+      "reward": 0.04045759132714011,
+      "reward_std": 0.06074576545506716,
+      "rewards/pure_accuracy_reward_math": 0.04045759068685584,
+      "step": 86
+    },
+    {
+      "clip_ratio": 0.0002471263709367122,
+      "epoch": 0.16635296226584206,
+      "grad_norm": 0.03298444300889969,
+      "kl": 0.0020711421966552734,
+      "learning_rate": 1.3853503184713377e-06,
+      "loss": 0.0076,
+      "step": 87
+    },
+    {
+      "clip_ratio": 0.00024866302578629984,
+      "epoch": 0.16826506528039198,
+      "grad_norm": 0.03206898272037506,
+      "kl": 0.0020384788513183594,
+      "learning_rate": 1.4012738853503185e-06,
+      "loss": 0.0076,
+      "step": 88
+    },
+    {
+      "clip_ratio": 0.00026278120321876486,
+      "epoch": 0.1701771682949419,
+      "grad_norm": 0.03115510568022728,
+      "kl": 0.002008795738220215,
+      "learning_rate": 1.4171974522292995e-06,
+      "loss": 0.0076,
+      "step": 89
+    },
+    {
+      "clip_ratio": 0.000245522400405207,
+      "epoch": 0.1720892713094918,
+      "grad_norm": 0.030577220022678375,
+      "kl": 0.0019922256469726562,
+      "learning_rate": 1.4331210191082802e-06,
+      "loss": 0.0076,
+      "step": 90
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 523.3948340415955,
+      "epoch": 0.1740013743240417,
+      "grad_norm": 0.0348118431866169,
+      "kl": 0.0019588470458984375,
+      "learning_rate": 1.4490445859872612e-06,
+      "loss": 0.0046,
+      "num_tokens": 69793534.0,
+      "reward": 0.04436384132714011,
+      "reward_std": 0.059376906079705805,
+      "rewards/pure_accuracy_reward_math": 0.044363840570440516,
+      "step": 91
+    },
+    {
+      "clip_ratio": 0.00021377558331892033,
+      "epoch": 0.1759134773385916,
+      "grad_norm": 0.03493114933371544,
+      "kl": 0.0019345283508300781,
+      "learning_rate": 1.4649681528662422e-06,
+      "loss": 0.0046,
+      "step": 92
+    },
+    {
+      "clip_ratio": 0.00023636125789039397,
+      "epoch": 0.17782558035314153,
+      "grad_norm": 0.03362264111638069,
+      "kl": 0.0019860267639160156,
+      "learning_rate": 1.480891719745223e-06,
+      "loss": 0.0046,
+      "step": 93
+    },
+    {
+      "clip_ratio": 0.00022836430440520417,
+      "epoch": 0.17973768336769144,
+      "grad_norm": 0.03336656093597412,
+      "kl": 0.002032160758972168,
+      "learning_rate": 1.496815286624204e-06,
+      "loss": 0.0045,
+      "step": 94
+    },
+    {
+      "clip_ratio": 0.00024139108904819295,
+      "epoch": 0.18164978638224133,
+      "grad_norm": 0.03235051408410072,
+      "kl": 0.0021082162857055664,
+      "learning_rate": 1.5127388535031847e-06,
+      "loss": 0.0045,
+      "step": 95
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 530.8058252334595,
+      "epoch": 0.18356188939679124,
+      "grad_norm": 0.03482769802212715,
+      "kl": 0.0021872520446777344,
+      "learning_rate": 1.5286624203821657e-06,
+      "loss": 0.0075,
+      "num_tokens": 73427974.0,
+      "reward": 0.04101562709547579,
+      "reward_std": 0.06101094774203375,
+      "rewards/pure_accuracy_reward_math": 0.04101562616415322,
+      "step": 96
+    },
+    {
+      "clip_ratio": 0.00024072786442275174,
+      "epoch": 0.18547399241134116,
+      "grad_norm": 0.03345990553498268,
+      "kl": 0.002261519432067871,
+      "learning_rate": 1.5445859872611465e-06,
+      "loss": 0.0075,
+      "step": 97
+    },
+    {
+      "clip_ratio": 0.00024480573915752757,
+      "epoch": 0.18738609542589107,
+      "grad_norm": 0.03318383917212486,
+      "kl": 0.0022890567779541016,
+      "learning_rate": 1.5605095541401275e-06,
+      "loss": 0.0075,
+      "step": 98
+    },
+    {
+      "clip_ratio": 0.00027489714915418517,
+      "epoch": 0.189298198440441,
+      "grad_norm": 0.03230712562799454,
+      "kl": 0.0023267269134521484,
+      "learning_rate": 1.5764331210191083e-06,
+      "loss": 0.0074,
+      "step": 99
+    },
+    {
+      "clip_ratio": 0.00029621877195040724,
+      "epoch": 0.19121030145499088,
+      "grad_norm": 0.03260359168052673,
+      "kl": 0.002334117889404297,
+      "learning_rate": 1.5923566878980892e-06,
+      "loss": 0.0074,
+      "step": 100
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 526.358283996582,
+      "epoch": 0.0019121030145499089,
+      "grad_norm": 0.30763256549835205,
+      "kl": 0.0024461746215820312,
+      "learning_rate": 1.6082802547770702e-06,
+      "loss": 0.0053,
+      "num_tokens": 3621800.0,
+      "reward": 0.0546875023865141,
+      "reward_std": 0.06997958500869572,
+      "rewards/pure_accuracy_reward_math": 0.054687501629814506,
+      "step": 101
+    },
+    {
+      "clip_ratio": 0.00028505406811518696,
+      "epoch": 0.0038242060290998177,
+      "grad_norm": 0.7424792647361755,
+      "kl": 0.005189061164855957,
+      "learning_rate": 1.624203821656051e-06,
+      "loss": 0.0054,
+      "step": 102
+    },
+    {
+      "clip_ratio": 0.000307778484739174,
+      "epoch": 0.005736309043649726,
+      "grad_norm": 0.5747273564338684,
+      "kl": 0.005206584930419922,
+      "learning_rate": 1.640127388535032e-06,
+      "loss": 0.0054,
+      "step": 103
+    },
+    {
+      "clip_ratio": 0.0003712488735345687,
+      "epoch": 0.0076484120581996355,
+      "grad_norm": 0.15304483473300934,
+      "kl": 0.0026189088821411133,
+      "learning_rate": 1.6560509554140127e-06,
+      "loss": 0.0053,
+      "step": 104
+    },
+    {
+      "clip_ratio": 0.00037476027159755176,
+      "epoch": 0.009560515072749545,
+      "grad_norm": 0.2118157148361206,
+      "kl": 0.00246584415435791,
+      "learning_rate": 1.6719745222929937e-06,
+      "loss": 0.0053,
+      "step": 105
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 527.46431016922,
+      "epoch": 0.011472618087299453,
+      "grad_norm": 0.2036779820919037,
+      "kl": 0.0037467479705810547,
+      "learning_rate": 1.6878980891719745e-06,
+      "loss": 0.0067,
+      "num_tokens": 7244448.0,
+      "reward": 0.05161830596625805,
+      "reward_std": 0.06822534638922662,
+      "rewards/pure_accuracy_reward_math": 0.05161830474389717,
+      "step": 106
+    },
+    {
+      "clip_ratio": 0.0002751678786125922,
+      "epoch": 0.013384721101849363,
+      "grad_norm": 0.1858554631471634,
+      "kl": 0.0035070180892944336,
+      "learning_rate": 1.7038216560509555e-06,
+      "loss": 0.0067,
+      "step": 107
+    },
+    {
+      "clip_ratio": 0.0002901391828800115,
+      "epoch": 0.015296824116399271,
+      "grad_norm": 0.06319136172533035,
+      "kl": 0.0033702850341796875,
+      "learning_rate": 1.7197452229299363e-06,
+      "loss": 0.0067,
+      "step": 108
+    },
+    {
+      "clip_ratio": 0.00029408001091724145,
+      "epoch": 0.01720892713094918,
+      "grad_norm": 0.061827220022678375,
+      "kl": 0.00351715087890625,
+      "learning_rate": 1.7356687898089172e-06,
+      "loss": 0.0067,
+      "step": 109
+    },
+    {
+      "clip_ratio": 0.0002710100695253459,
+      "epoch": 0.01912103014549909,
+      "grad_norm": 0.13167870044708252,
+      "kl": 0.0036835670471191406,
+      "learning_rate": 1.7515923566878982e-06,
+      "loss": 0.0067,
+      "step": 110
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 515.4391984939575,
+      "epoch": 0.021033133160048997,
+      "grad_norm": 0.034568388015031815,
+      "kl": 0.0024957656860351562,
+      "learning_rate": 1.767515923566879e-06,
+      "loss": 0.0068,
+      "num_tokens": 10824130.0,
+      "reward": 0.0468750023865141,
+      "reward_std": 0.06221334758447483,
+      "rewards/pure_accuracy_reward_math": 0.04687500116415322,
+      "step": 111
+    },
+    {
+      "clip_ratio": 0.00025272632768746917,
+      "epoch": 0.022945236174598906,
+      "grad_norm": 0.03421744704246521,
+      "kl": 0.002499222755432129,
+      "learning_rate": 1.78343949044586e-06,
+      "loss": 0.0068,
+      "step": 112
+    },
+    {
+      "clip_ratio": 0.00025192988658773174,
+      "epoch": 0.024857339189148814,
+      "grad_norm": 0.03444651514291763,
+      "kl": 0.002528548240661621,
+      "learning_rate": 1.7993630573248407e-06,
+      "loss": 0.0068,
+      "step": 113
+    },
+    {
+      "clip_ratio": 0.0002639102876287325,
+      "epoch": 0.026769442203698725,
+      "grad_norm": 0.033966146409511566,
+      "kl": 0.0025298595428466797,
+      "learning_rate": 1.8152866242038217e-06,
+      "loss": 0.0067,
+      "step": 114
+    },
+    {
+      "clip_ratio": 0.0002613060296994263,
+      "epoch": 0.028681545218248634,
+      "grad_norm": 0.03252725675702095,
+      "kl": 0.0025829076766967773,
+      "learning_rate": 1.8312101910828025e-06,
+      "loss": 0.0067,
+      "step": 115
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 511.59126138687134,
+      "epoch": 0.030593648232798542,
+      "grad_norm": 0.04161737114191055,
+      "kl": 0.002777099609375,
+      "learning_rate": 1.8471337579617835e-06,
+      "loss": 0.0084,
+      "num_tokens": 14389817.0,
+      "reward": 0.04464285934227519,
+      "reward_std": 0.0631567623349838,
+      "rewards/pure_accuracy_reward_math": 0.044642858469160274,
+      "step": 116
+    },
+    {
+      "clip_ratio": 0.0002685248994680478,
+      "epoch": 0.032505751247348454,
+      "grad_norm": 0.03920653462409973,
+      "kl": 0.002690911293029785,
+      "learning_rate": 1.8630573248407643e-06,
+      "loss": 0.0084,
+      "step": 117
+    },
+    {
+      "clip_ratio": 0.00028247613772691693,
+      "epoch": 0.03441785426189836,
+      "grad_norm": 0.037915512919425964,
+      "kl": 0.0026444196701049805,
+      "learning_rate": 1.8789808917197455e-06,
+      "loss": 0.0084,
+      "step": 118
+    },
+    {
+      "clip_ratio": 0.00028578577973803476,
+      "epoch": 0.03632995727644827,
+      "grad_norm": 0.03727024793624878,
+      "kl": 0.002573251724243164,
+      "learning_rate": 1.8949044585987264e-06,
+      "loss": 0.0083,
+      "step": 119
+    },
+    {
+      "clip_ratio": 0.0003107314861381383,
+      "epoch": 0.03824206029099818,
+      "grad_norm": 0.03734543174505234,
+      "kl": 0.002534151077270508,
+      "learning_rate": 1.9108280254777074e-06,
+      "loss": 0.0083,
+      "step": 120
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 511.96654319763184,
+      "epoch": 0.040154163305548086,
+      "grad_norm": 18.524425506591797,
+      "kl": 0.05213046073913574,
+      "learning_rate": 1.926751592356688e-06,
+      "loss": 0.0067,
+      "num_tokens": 17950273.0,
+      "reward": 0.044642859254963696,
+      "reward_std": 0.0572310917195864,
+      "rewards/pure_accuracy_reward_math": 0.04464285838184878,
+      "step": 121
+    },
+    {
+      "clip_ratio": 0.00024330438452579983,
+      "epoch": 0.042066266320097995,
+      "grad_norm": 0.06961806118488312,
+      "kl": 0.0025354623794555664,
+      "learning_rate": 1.942675159235669e-06,
+      "loss": 0.0047,
+      "step": 122
+    },
+    {
+      "clip_ratio": 0.00023799908234423128,
+      "epoch": 0.0439783693346479,
+      "grad_norm": 0.038592379540205,
+      "kl": 0.0024437904357910156,
+      "learning_rate": 1.95859872611465e-06,
+      "loss": 0.0047,
+      "step": 123
+    },
+    {
+      "clip_ratio": 0.00023513944393016573,
+      "epoch": 0.04589047234919781,
+      "grad_norm": 0.036785636097192764,
+      "kl": 0.002588033676147461,
+      "learning_rate": 1.974522292993631e-06,
+      "loss": 0.0047,
+      "step": 124
+    },
+    {
+      "clip_ratio": 0.0002449645085107477,
+      "epoch": 0.04780257536374772,
+      "grad_norm": 0.03537231311202049,
+      "kl": 0.002721548080444336,
+      "learning_rate": 1.9904458598726117e-06,
+      "loss": 0.0047,
+      "step": 125
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 513.618884563446,
+      "epoch": 0.04971467837829763,
+      "grad_norm": 0.03746291249990463,
+      "kl": 0.0026862621307373047,
+      "learning_rate": 2.0063694267515925e-06,
+      "loss": 0.0063,
+      "num_tokens": 21522907.0,
+      "reward": 0.04492187732830644,
+      "reward_std": 0.061436392075847834,
+      "rewards/pure_accuracy_reward_math": 0.04492187616415322,
+      "step": 126
+    },
+    {
+      "clip_ratio": 0.0002821582585283977,
+      "epoch": 0.05162678139284754,
+      "grad_norm": 0.036032602190971375,
+      "kl": 0.0027321577072143555,
+      "learning_rate": 2.0222929936305737e-06,
+      "loss": 0.0063,
+      "step": 127
+    },
+    {
+      "clip_ratio": 0.0002675421079629814,
+      "epoch": 0.05353888440739745,
+      "grad_norm": 0.03723033517599106,
+      "kl": 0.002848386764526367,
+      "learning_rate": 2.0382165605095544e-06,
+      "loss": 0.0062,
+      "step": 128
+    },
+    {
+      "clip_ratio": 0.00030748845301786787,
+      "epoch": 0.05545098742194736,
+      "grad_norm": 0.03697400540113449,
+      "kl": 0.002881765365600586,
+      "learning_rate": 2.054140127388535e-06,
+      "loss": 0.0062,
+      "step": 129
+    },
+    {
+      "clip_ratio": 0.0003087153630758621,
+      "epoch": 0.05736309043649727,
+      "grad_norm": 0.03756724298000336,
+      "kl": 0.002836942672729492,
+      "learning_rate": 2.070063694267516e-06,
+      "loss": 0.0062,
+      "step": 130
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 518.1615762710571,
+      "epoch": 0.059275193451047176,
+      "grad_norm": 0.039371710270643234,
+      "kl": 0.00270843505859375,
+      "learning_rate": 2.085987261146497e-06,
+      "loss": 0.0064,
+      "num_tokens": 25111362.0,
+      "reward": 0.05106027016881853,
+      "reward_std": 0.06736206263303757,
+      "rewards/pure_accuracy_reward_math": 0.051060269062872976,
+      "step": 131
+    },
+    {
+      "clip_ratio": 0.0002896036380661826,
+      "epoch": 0.061187296465597084,
+      "grad_norm": 0.03780793026089668,
+      "kl": 0.0027250051498413086,
+      "learning_rate": 2.101910828025478e-06,
+      "loss": 0.0064,
+      "step": 132
+    },
+    {
+      "clip_ratio": 0.0002853632216783808,
+      "epoch": 0.06309939948014699,
+      "grad_norm": 0.03720535710453987,
+      "kl": 0.0027070045471191406,
+      "learning_rate": 2.1178343949044587e-06,
+      "loss": 0.0064,
+      "step": 133
+    },
+    {
+      "clip_ratio": 0.0002896762144928289,
+      "epoch": 0.06501150249469691,
+      "grad_norm": 0.036468133330345154,
+      "kl": 0.0027469396591186523,
+      "learning_rate": 2.13375796178344e-06,
+      "loss": 0.0064,
+      "step": 134
+    },
+    {
+      "clip_ratio": 0.0003120482754184195,
+      "epoch": 0.06692360550924681,
+      "grad_norm": 0.03586801886558533,
+      "kl": 0.002748727798461914,
+      "learning_rate": 2.1496815286624207e-06,
+      "loss": 0.0063,
+      "step": 135
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 528.0351796150208,
+      "epoch": 0.06883570852379672,
+      "grad_norm": 0.03092282824218273,
+      "kl": 0.002766728401184082,
+      "learning_rate": 2.1656050955414015e-06,
+      "loss": 0.0056,
+      "num_tokens": 28735680.0,
+      "reward": 0.04017857348662801,
+      "reward_std": 0.05289319949224591,
+      "rewards/pure_accuracy_reward_math": 0.040178572438890114,
+      "step": 136
+    },
+    {
+      "clip_ratio": 0.00020221989311153266,
+      "epoch": 0.07074781153834662,
+      "grad_norm": 0.030703941360116005,
+      "kl": 0.0028089284896850586,
+      "learning_rate": 2.1815286624203822e-06,
+      "loss": 0.0056,
+      "step": 137
+    },
+    {
+      "clip_ratio": 0.00019867721590571819,
+      "epoch": 0.07265991455289654,
+      "grad_norm": 0.030248478055000305,
+      "kl": 0.0027884244918823242,
+      "learning_rate": 2.1974522292993634e-06,
+      "loss": 0.0056,
+      "step": 138
+    },
+    {
+      "clip_ratio": 0.00021304549886735913,
+      "epoch": 0.07457201756744644,
+      "grad_norm": 0.029539138078689575,
+      "kl": 0.002767205238342285,
+      "learning_rate": 2.213375796178344e-06,
+      "loss": 0.0056,
+      "step": 139
+    },
+    {
+      "clip_ratio": 0.00021535260020755231,
+      "epoch": 0.07648412058199636,
+      "grad_norm": 0.02955791726708412,
+      "kl": 0.002725839614868164,
+      "learning_rate": 2.229299363057325e-06,
+      "loss": 0.0055,
+      "step": 140
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 525.7212834358215,
+      "epoch": 0.07839622359654626,
+      "grad_norm": 0.060058850795030594,
+      "kl": 0.0032591819763183594,
+      "learning_rate": 2.245222929936306e-06,
+      "loss": 0.0071,
+      "num_tokens": 32349997.0,
+      "reward": 0.048828127211891115,
+      "reward_std": 0.056028691527899355,
+      "rewards/pure_accuracy_reward_math": 0.04882812628056854,
+      "step": 141
+    },
+    {
+      "clip_ratio": 0.00022036872547914754,
+      "epoch": 0.08030832661109617,
+      "grad_norm": 0.03533012047410011,
+      "kl": 0.002978205680847168,
+      "learning_rate": 2.261146496815287e-06,
+      "loss": 0.0071,
+      "step": 142
+    },
+    {
+      "clip_ratio": 0.0002158615123448726,
+      "epoch": 0.08222042962564609,
+      "grad_norm": 0.029908612370491028,
+      "kl": 0.002841353416442871,
+      "learning_rate": 2.2770700636942677e-06,
+      "loss": 0.0071,
+      "step": 143
+    },
+    {
+      "clip_ratio": 0.0002112481060976279,
+      "epoch": 0.08413253264019599,
+      "grad_norm": 0.028638474643230438,
+      "kl": 0.002796173095703125,
+      "learning_rate": 2.2929936305732485e-06,
+      "loss": 0.0071,
+      "step": 144
+    },
+    {
+      "clip_ratio": 0.00022246911356660348,
+      "epoch": 0.0860446356547459,
+      "grad_norm": 0.02828238159418106,
+      "kl": 0.0027240514755249023,
+      "learning_rate": 2.3089171974522297e-06,
+      "loss": 0.007,
+      "step": 145
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 534.0318322181702,
+      "epoch": 0.0879567386692958,
+      "grad_norm": 3.060509443283081,
+      "kl": 0.022321224212646484,
+      "learning_rate": 2.3248407643312104e-06,
+      "loss": 0.0062,
+      "num_tokens": 35996663.0,
+      "reward": 0.04436384144355543,
+      "reward_std": 0.06130999844754115,
+      "rewards/pure_accuracy_reward_math": 0.04436384039581753,
+      "step": 146
+    },
+    {
+      "clip_ratio": 0.00023404289771633557,
+      "epoch": 0.08986884168384572,
+      "grad_norm": 0.28904739022254944,
+      "kl": 0.004893064498901367,
+      "learning_rate": 2.3407643312101912e-06,
+      "loss": 0.0055,
+      "step": 147
+    },
+    {
+      "clip_ratio": 0.00024259101735424338,
+      "epoch": 0.09178094469839562,
+      "grad_norm": 0.03826431185007095,
+      "kl": 0.0027625560760498047,
+      "learning_rate": 2.356687898089172e-06,
+      "loss": 0.0054,
+      "step": 148
+    },
+    {
+      "clip_ratio": 0.0002517821457672653,
+      "epoch": 0.09369304771294554,
+      "grad_norm": 0.03572425991296768,
+      "kl": 0.002875208854675293,
+      "learning_rate": 2.372611464968153e-06,
+      "loss": 0.0054,
+      "step": 149
+    },
+    {
+      "clip_ratio": 0.00024034848578935453,
+      "epoch": 0.09560515072749544,
+      "grad_norm": 0.036431849002838135,
+      "kl": 0.0031164884567260742,
+      "learning_rate": 2.388535031847134e-06,
+      "loss": 0.0054,
+      "step": 150
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 538.2168254852295,
+      "epoch": 0.09751725374204535,
+      "grad_norm": 0.03362743556499481,
+      "kl": 0.002684354782104492,
+      "learning_rate": 2.4044585987261147e-06,
+      "loss": 0.0027,
+      "num_tokens": 39661060.0,
+      "reward": 0.05133928792201914,
+      "reward_std": 0.06672389659797773,
+      "rewards/pure_accuracy_reward_math": 0.05133928681607358,
+      "step": 151
+    },
+    {
+      "clip_ratio": 0.0002668876670099962,
+      "epoch": 0.09942935675659526,
+      "grad_norm": 0.033922772854566574,
+      "kl": 0.002791762351989746,
+      "learning_rate": 2.420382165605096e-06,
+      "loss": 0.0027,
+      "step": 152
+    },
+    {
+      "clip_ratio": 0.0002435101382616267,
+      "epoch": 0.10134145977114517,
+      "grad_norm": 0.03526493161916733,
+      "kl": 0.002907991409301758,
+      "learning_rate": 2.4363057324840767e-06,
+      "loss": 0.0027,
+      "step": 153
+    },
+    {
+      "clip_ratio": 0.00025345294346834635,
+      "epoch": 0.10325356278569509,
+      "grad_norm": 0.034125424921512604,
+      "kl": 0.0029108524322509766,
+      "learning_rate": 2.4522292993630575e-06,
+      "loss": 0.0027,
+      "step": 154
+    },
+    {
+      "clip_ratio": 0.0002378649581942227,
+      "epoch": 0.10516566580024499,
+      "grad_norm": 0.033436987549066544,
+      "kl": 0.002874612808227539,
+      "learning_rate": 2.4681528662420382e-06,
+      "loss": 0.0027,
+      "step": 155
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 541.2424907684326,
+      "epoch": 0.1070777688147949,
+      "grad_norm": 0.031592246145009995,
+      "kl": 0.002785801887512207,
+      "learning_rate": 2.4840764331210194e-06,
+      "loss": 0.005,
+      "num_tokens": 43331425.0,
+      "reward": 0.044363841181620955,
+      "reward_std": 0.05607495462754741,
+      "rewards/pure_accuracy_reward_math": 0.044363840599544346,
+      "step": 156
+    },
+    {
+      "clip_ratio": 0.00019312051063025137,
+      "epoch": 0.1089898718293448,
+      "grad_norm": 0.030642936006188393,
+      "kl": 0.0027495622634887695,
+      "learning_rate": 2.5e-06,
+      "loss": 0.005,
+      "step": 157
+    },
+    {
+      "clip_ratio": 0.0002267159566713417,
+      "epoch": 0.11090197484389472,
+      "grad_norm": 0.03025418519973755,
+      "kl": 0.002672433853149414,
+      "learning_rate": 2.515923566878981e-06,
+      "loss": 0.0049,
+      "step": 158
+    },
+    {
+      "clip_ratio": 0.00023296605036193796,
+      "epoch": 0.11281407785844462,
+      "grad_norm": 0.03024701401591301,
+      "kl": 0.0026074647903442383,
+      "learning_rate": 2.531847133757962e-06,
+      "loss": 0.0049,
+      "step": 159
+    },
+    {
+      "clip_ratio": 0.00024551542321660236,
+      "epoch": 0.11472618087299453,
+      "grad_norm": 0.03065372072160244,
+      "kl": 0.0025725364685058594,
+      "learning_rate": 2.547770700636943e-06,
+      "loss": 0.0049,
+      "step": 160
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 531.590705871582,
+      "epoch": 0.11663828388754444,
+      "grad_norm": 0.03286377340555191,
+      "kl": 0.002618551254272461,
+      "learning_rate": 2.5636942675159237e-06,
+      "loss": 0.0032,
+      "num_tokens": 46966882.0,
+      "reward": 0.0401785735739395,
+      "reward_std": 0.05864621384534985,
+      "rewards/pure_accuracy_reward_math": 0.04017857258440927,
+      "step": 161
+    },
+    {
+      "clip_ratio": 0.000249601399104904,
+      "epoch": 0.11855038690209435,
+      "grad_norm": 0.03168044239282608,
+      "kl": 0.0025817155838012695,
+      "learning_rate": 2.5796178343949045e-06,
+      "loss": 0.0032,
+      "step": 162
+    },
+    {
+      "clip_ratio": 0.0002426054838338132,
+      "epoch": 0.12046248991664425,
+      "grad_norm": 0.03161012753844261,
+      "kl": 0.0025763511657714844,
+      "learning_rate": 2.5955414012738857e-06,
+      "loss": 0.0032,
+      "step": 163
+    },
+    {
+      "clip_ratio": 0.0002400714004124893,
+      "epoch": 0.12237459293119417,
+      "grad_norm": 0.031408168375492096,
+      "kl": 0.002588987350463867,
+      "learning_rate": 2.6114649681528665e-06,
+      "loss": 0.0032,
+      "step": 164
+    },
+    {
+      "clip_ratio": 0.00024877328468164706,
+      "epoch": 0.12428669594574408,
+      "grad_norm": 0.030564049258828163,
+      "kl": 0.0026369094848632812,
+      "learning_rate": 2.6273885350318472e-06,
+      "loss": 0.0031,
+      "step": 165
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 525.5862393379211,
+      "epoch": 0.12619879896029398,
+      "grad_norm": 0.03767310827970505,
+      "kl": 0.0026383399963378906,
+      "learning_rate": 2.6433121019108284e-06,
+      "loss": 0.0062,
+      "num_tokens": 50581511.0,
+      "reward": 0.04966518055880442,
+      "reward_std": 0.06985319027444348,
+      "rewards/pure_accuracy_reward_math": 0.04966517968568951,
+      "step": 166
+    },
+    {
+      "clip_ratio": 0.0002872111982696879,
+      "epoch": 0.1281109019748439,
+      "grad_norm": 0.03578091412782669,
+      "kl": 0.0027115345001220703,
+      "learning_rate": 2.659235668789809e-06,
+      "loss": 0.0062,
+      "step": 167
+    },
+    {
+      "clip_ratio": 0.0002957127134664006,
+      "epoch": 0.13002300498939381,
+      "grad_norm": 0.03471493721008301,
+      "kl": 0.0028066635131835938,
+      "learning_rate": 2.67515923566879e-06,
+      "loss": 0.0062,
+      "step": 168
+    },
+    {
+      "clip_ratio": 0.0003112256898702981,
+      "epoch": 0.1319351080039437,
+      "grad_norm": 0.035491716116666794,
+      "kl": 0.0028966665267944336,
+      "learning_rate": 2.6910828025477707e-06,
+      "loss": 0.0062,
+      "step": 169
+    },
+    {
+      "clip_ratio": 0.0003354581235726073,
+      "epoch": 0.13384721101849362,
+      "grad_norm": 0.03574714809656143,
+      "kl": 0.0029289722442626953,
+      "learning_rate": 2.707006369426752e-06,
+      "loss": 0.0061,
+      "step": 170
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 527.8571667671204,
+      "epoch": 0.13575931403304353,
+      "grad_norm": 0.03648287057876587,
+      "kl": 0.0030307769775390625,
+      "learning_rate": 2.7229299363057327e-06,
+      "loss": 0.0061,
+      "num_tokens": 54209407.0,
+      "reward": 0.05161830587894656,
+      "reward_std": 0.06465821276651695,
+      "rewards/pure_accuracy_reward_math": 0.05161830494762398,
+      "step": 171
+    },
+    {
+      "clip_ratio": 0.0002587431810354701,
+      "epoch": 0.13767141704759345,
+      "grad_norm": 0.03615426644682884,
+      "kl": 0.0030341148376464844,
+      "learning_rate": 2.7388535031847135e-06,
+      "loss": 0.0061,
+      "step": 172
+    },
+    {
+      "clip_ratio": 0.0002548517101104153,
+      "epoch": 0.13958352006214333,
+      "grad_norm": 0.03565597161650658,
+      "kl": 0.002932310104370117,
+      "learning_rate": 2.7547770700636942e-06,
+      "loss": 0.0061,
+      "step": 173
+    },
+    {
+      "clip_ratio": 0.00027394448250106507,
+      "epoch": 0.14149562307669325,
+      "grad_norm": 0.035612594336271286,
+      "kl": 0.0029175281524658203,
+      "learning_rate": 2.7707006369426754e-06,
+      "loss": 0.0061,
+      "step": 174
+    },
+    {
+      "clip_ratio": 0.00027776476230201297,
+      "epoch": 0.14340772609124317,
+      "grad_norm": 0.036588992923498154,
+      "kl": 0.002942800521850586,
+      "learning_rate": 2.786624203821656e-06,
+      "loss": 0.006,
+      "step": 175
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 529.2430500984192,
+      "epoch": 0.14531982910579308,
+      "grad_norm": 0.03312006592750549,
+      "kl": 0.0028772354125976562,
+      "learning_rate": 2.802547770700637e-06,
+      "loss": 0.0056,
+      "num_tokens": 57839070.0,
+      "reward": 0.04854910931317136,
+      "reward_std": 0.05881887051509693,
+      "rewards/pure_accuracy_reward_math": 0.048549108614679426,
+      "step": 176
+    },
+    {
+      "clip_ratio": 0.00022063881021949783,
+      "epoch": 0.147231932120343,
+      "grad_norm": 0.0327099934220314,
+      "kl": 0.002942681312561035,
+      "learning_rate": 2.818471337579618e-06,
+      "loss": 0.0056,
+      "step": 177
+    },
+    {
+      "clip_ratio": 0.00021944492368675128,
+      "epoch": 0.14914403513489288,
+      "grad_norm": 0.03261202201247215,
+      "kl": 0.002986431121826172,
+      "learning_rate": 2.834394904458599e-06,
+      "loss": 0.0056,
+      "step": 178
+    },
+    {
+      "clip_ratio": 0.0002127133307396889,
+      "epoch": 0.1510561381494428,
+      "grad_norm": 0.03220335766673088,
+      "kl": 0.002970457077026367,
+      "learning_rate": 2.8503184713375797e-06,
+      "loss": 0.0056,
+      "step": 179
+    },
+    {
+      "clip_ratio": 0.0001991192841614975,
+      "epoch": 0.1529682411639927,
+      "grad_norm": 0.03179548308253288,
+      "kl": 0.0029560327529907227,
+      "learning_rate": 2.8662420382165605e-06,
+      "loss": 0.0056,
+      "step": 180
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 522.1275358200073,
+      "epoch": 0.15488034417854263,
+      "grad_norm": 0.030966561287641525,
+      "kl": 0.0029218196868896484,
+      "learning_rate": 2.8821656050955417e-06,
+      "loss": 0.0048,
+      "num_tokens": 61445599.0,
+      "reward": 0.04352678795112297,
+      "reward_std": 0.05598862626357004,
+      "rewards/pure_accuracy_reward_math": 0.043526787078008056,
+      "step": 181
+    },
+    {
+      "clip_ratio": 0.00021554413663693595,
+      "epoch": 0.15679244719309252,
+      "grad_norm": 0.030419446527957916,
+      "kl": 0.0029065608978271484,
+      "learning_rate": 2.8980891719745225e-06,
+      "loss": 0.0048,
+      "step": 182
+    },
+    {
+      "clip_ratio": 0.0002025423377176594,
+      "epoch": 0.15870455020764243,
+      "grad_norm": 0.030062729492783546,
+      "kl": 0.0028995275497436523,
+      "learning_rate": 2.9140127388535032e-06,
+      "loss": 0.0048,
+      "step": 183
+    },
+    {
+      "clip_ratio": 0.00023064417456453157,
+      "epoch": 0.16061665322219235,
+      "grad_norm": 0.029301613569259644,
+      "kl": 0.002888321876525879,
+      "learning_rate": 2.9299363057324844e-06,
+      "loss": 0.0048,
+      "step": 184
+    },
+    {
+      "clip_ratio": 0.0002338091023261768,
+      "epoch": 0.16252875623674226,
+      "grad_norm": 0.029127391055226326,
+      "kl": 0.0028772354125976562,
+      "learning_rate": 2.945859872611465e-06,
+      "loss": 0.0047,
+      "step": 185
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 533.0510892868042,
+      "epoch": 0.16444085925129218,
+      "grad_norm": 0.036479271948337555,
+      "kl": 0.002923727035522461,
+      "learning_rate": 2.961783439490446e-06,
+      "loss": 0.0063,
+      "num_tokens": 65094142.0,
+      "reward": 0.05022321717115119,
+      "reward_std": 0.06538890500087291,
+      "rewards/pure_accuracy_reward_math": 0.050223215424921364,
+      "step": 186
+    },
+    {
+      "clip_ratio": 0.00026048495129771254,
+      "epoch": 0.16635296226584206,
+      "grad_norm": 0.036232370883226395,
+      "kl": 0.0029561519622802734,
+      "learning_rate": 2.9777070063694267e-06,
+      "loss": 0.0063,
+      "step": 187
+    },
+    {
+      "clip_ratio": 0.0002226464382033555,
+      "epoch": 0.16826506528039198,
+      "grad_norm": 0.03523917496204376,
+      "kl": 0.003048419952392578,
+      "learning_rate": 2.993630573248408e-06,
+      "loss": 0.0063,
+      "step": 188
+    },
+    {
+      "clip_ratio": 0.0002362887615845466,
+      "epoch": 0.1701771682949419,
+      "grad_norm": 0.03477315977215767,
+      "kl": 0.003025054931640625,
+      "learning_rate": 3.0095541401273887e-06,
+      "loss": 0.0062,
+      "step": 189
+    },
+    {
+      "clip_ratio": 0.00023160997727700305,
+      "epoch": 0.1720892713094918,
+      "grad_norm": 0.03342609107494354,
+      "kl": 0.0030221939086914062,
+      "learning_rate": 3.0254777070063695e-06,
+      "loss": 0.0062,
+      "step": 190
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 529.7776494026184,
+      "epoch": 0.1740013743240417,
+      "grad_norm": 0.03668810427188873,
+      "kl": 0.0029752254486083984,
+      "learning_rate": 3.0414012738853503e-06,
+      "loss": 0.0066,
+      "num_tokens": 68728277.0,
+      "reward": 0.04994419863214716,
+      "reward_std": 0.06135626137256622,
+      "rewards/pure_accuracy_reward_math": 0.04994419787544757,
+      "step": 191
+    },
+    {
+      "clip_ratio": 0.0002391185845453947,
+      "epoch": 0.1759134773385916,
+      "grad_norm": 0.035618141293525696,
+      "kl": 0.0029642581939697266,
+      "learning_rate": 3.0573248407643314e-06,
+      "loss": 0.0066,
+      "step": 192
+    },
+    {
+      "clip_ratio": 0.00024402707180115613,
+      "epoch": 0.17782558035314153,
+      "grad_norm": 0.032588809728622437,
+      "kl": 0.002981424331665039,
+      "learning_rate": 3.0732484076433122e-06,
+      "loss": 0.0066,
+      "step": 193
+    },
+    {
+      "clip_ratio": 0.0002546731577126593,
+      "epoch": 0.17973768336769144,
+      "grad_norm": 0.0323190875351429,
+      "kl": 0.0030133724212646484,
+      "learning_rate": 3.089171974522293e-06,
+      "loss": 0.0066,
+      "step": 194
+    },
+    {
+      "clip_ratio": 0.0002784079450179888,
+      "epoch": 0.18164978638224133,
+      "grad_norm": 0.03181909769773483,
+      "kl": 0.002997159957885742,
+      "learning_rate": 3.105095541401274e-06,
+      "loss": 0.0065,
+      "step": 195
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 536.7226796150208,
+      "epoch": 0.18356188939679124,
+      "grad_norm": 0.034835390746593475,
+      "kl": 0.003053426742553711,
+      "learning_rate": 3.121019108280255e-06,
+      "loss": 0.0053,
+      "num_tokens": 72383923.0,
+      "reward": 0.04352678789291531,
+      "reward_std": 0.06164911447558552,
+      "rewards/pure_accuracy_reward_math": 0.043526787078008056,
+      "step": 196
+    },
+    {
+      "clip_ratio": 0.00022759345233680506,
+      "epoch": 0.18547399241134116,
+      "grad_norm": 0.03316686674952507,
+      "kl": 0.003064870834350586,
+      "learning_rate": 3.1369426751592357e-06,
+      "loss": 0.0053,
+      "step": 197
+    },
+    {
+      "clip_ratio": 0.00024183520912401946,
+      "epoch": 0.18738609542589107,
+      "grad_norm": 0.0329214446246624,
+      "kl": 0.003040313720703125,
+      "learning_rate": 3.1528662420382165e-06,
+      "loss": 0.0053,
+      "step": 198
+    },
+    {
+      "clip_ratio": 0.0002539973459079192,
+      "epoch": 0.189298198440441,
+      "grad_norm": 0.031231405213475227,
+      "kl": 0.0030624866485595703,
+      "learning_rate": 3.1687898089171977e-06,
+      "loss": 0.0052,
+      "step": 199
+    },
+    {
+      "clip_ratio": 0.0002776768195076329,
+      "epoch": 0.19121030145499088,
+      "grad_norm": 0.031124714761972427,
+      "kl": 0.0030813217163085938,
+      "learning_rate": 3.1847133757961785e-06,
+      "loss": 0.0052,
+      "step": 200
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 522.4048767089844,
+      "epoch": 0.1931224044695408,
+      "grad_norm": 0.03386811539530754,
+      "kl": 0.003122568130493164,
+      "learning_rate": 3.2006369426751592e-06,
+      "loss": 0.0052,
+      "num_tokens": 75984438.0,
+      "reward": 0.04771205616998486,
+      "reward_std": 0.06319682823959738,
+      "rewards/pure_accuracy_reward_math": 0.04771205471479334,
+      "step": 201
+    },
+    {
+      "clip_ratio": 0.00024403837670661233,
+      "epoch": 0.1950345074840907,
+      "grad_norm": 0.03252818062901497,
+      "kl": 0.003181934356689453,
+      "learning_rate": 3.2165605095541404e-06,
+      "loss": 0.0052,
+      "step": 202
+    },
+    {
+      "clip_ratio": 0.0002548924753114079,
+      "epoch": 0.19694661049864062,
+      "grad_norm": 0.03233063966035843,
+      "kl": 0.0032570362091064453,
+      "learning_rate": 3.232484076433121e-06,
+      "loss": 0.0052,
+      "step": 203
+    },
+    {
+      "clip_ratio": 0.0003048134046252926,
+      "epoch": 0.1988587135131905,
+      "grad_norm": 0.032457806169986725,
+      "kl": 0.0032837390899658203,
+      "learning_rate": 3.248407643312102e-06,
+      "loss": 0.0051,
+      "step": 204
+    },
+    {
+      "clip_ratio": 0.0003034327668842707,
+      "epoch": 0.20077081652774043,
+      "grad_norm": 0.03239855542778969,
+      "kl": 0.0032906532287597656,
+      "learning_rate": 3.2643312101910827e-06,
+      "loss": 0.0051,
+      "step": 205
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 509.6657609939575,
+      "epoch": 0.20268291954229034,
+      "grad_norm": 0.0370325967669487,
+      "kl": 0.0033235549926757812,
+      "learning_rate": 3.280254777070064e-06,
+      "loss": 0.0075,
+      "num_tokens": 79548556.0,
+      "reward": 0.052176341792801395,
+      "reward_std": 0.06135626166360453,
+      "rewards/pure_accuracy_reward_math": 0.0521763407450635,
+      "step": 206
+    },
+    {
+      "clip_ratio": 0.00026798775621728055,
+      "epoch": 0.20459502255684026,
+      "grad_norm": 0.03616202250123024,
+      "kl": 0.0032608509063720703,
+      "learning_rate": 3.2961783439490447e-06,
+      "loss": 0.0075,
+      "step": 207
+    },
+    {
+      "clip_ratio": 0.0002652346859690624,
+      "epoch": 0.20650712557139017,
+      "grad_norm": 0.03537038713693619,
+      "kl": 0.0032129287719726562,
+      "learning_rate": 3.3121019108280255e-06,
+      "loss": 0.0074,
+      "step": 208
+    },
+    {
+      "clip_ratio": 0.00026950107780976396,
+      "epoch": 0.20841922858594006,
+      "grad_norm": 0.03502323478460312,
+      "kl": 0.0031485557556152344,
+      "learning_rate": 3.3280254777070063e-06,
+      "loss": 0.0074,
+      "step": 209
+    },
+    {
+      "clip_ratio": 0.00025725525091502277,
+      "epoch": 0.21033133160048997,
+      "grad_norm": 0.03380832076072693,
+      "kl": 0.0031027793884277344,
+      "learning_rate": 3.3439490445859875e-06,
+      "loss": 0.0074,
+      "step": 210
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 511.36637449264526,
+      "epoch": 0.2122434346150399,
+      "grad_norm": 1.5961617231369019,
+      "kl": 0.007061004638671875,
+      "learning_rate": 3.3598726114649682e-06,
+      "loss": 0.0062,
+      "num_tokens": 83116585.0,
+      "reward": 0.05078125247382559,
+      "reward_std": 0.06568795558996499,
+      "rewards/pure_accuracy_reward_math": 0.05078125119325705,
+      "step": 211
+    },
+    {
+      "clip_ratio": 0.0002800602194383828,
+      "epoch": 0.2141555376295898,
+      "grad_norm": 0.04389820247888565,
+      "kl": 0.004379749298095703,
+      "learning_rate": 3.375796178343949e-06,
+      "loss": 0.0061,
+      "step": 212
+    },
+    {
+      "clip_ratio": 0.0002803218378630845,
+      "epoch": 0.2160676406441397,
+      "grad_norm": 0.04022788628935814,
+      "kl": 0.0043125152587890625,
+      "learning_rate": 3.39171974522293e-06,
+      "loss": 0.0061,
+      "step": 213
+    },
+    {
+      "clip_ratio": 0.0002704095267631601,
+      "epoch": 0.2179797436586896,
+      "grad_norm": 0.041697319597005844,
+      "kl": 0.004408597946166992,
+      "learning_rate": 3.407643312101911e-06,
+      "loss": 0.0061,
+      "step": 214
+    },
+    {
+      "clip_ratio": 0.0003097587871820906,
+      "epoch": 0.21989184667323952,
+      "grad_norm": 0.04933662340044975,
+      "kl": 0.004500150680541992,
+      "learning_rate": 3.4235668789808917e-06,
+      "loss": 0.006,
+      "step": 215
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 520.8122463226318,
+      "epoch": 0.22180394968778944,
+      "grad_norm": 0.03384365886449814,
+      "kl": 0.0032858848571777344,
+      "learning_rate": 3.4394904458598725e-06,
+      "loss": 0.0069,
+      "num_tokens": 86710660.0,
+      "reward": 0.041015627270098776,
+      "reward_std": 0.05345123494043946,
+      "rewards/pure_accuracy_reward_math": 0.041015626047737896,
+      "step": 216
+    },
+    {
+      "clip_ratio": 0.00022953049290208583,
+      "epoch": 0.22371605270233935,
+      "grad_norm": 0.03259577602148056,
+      "kl": 0.003277301788330078,
+      "learning_rate": 3.4554140127388537e-06,
+      "loss": 0.0069,
+      "step": 217
+    },
+    {
+      "clip_ratio": 0.00024143920052210888,
+      "epoch": 0.22562815571688924,
+      "grad_norm": 0.031054330989718437,
+      "kl": 0.0031991004943847656,
+      "learning_rate": 3.4713375796178345e-06,
+      "loss": 0.0069,
+      "step": 218
+    },
+    {
+      "clip_ratio": 0.0002552373456978785,
+      "epoch": 0.22754025873143915,
+      "grad_norm": 0.031755171716213226,
+      "kl": 0.003099679946899414,
+      "learning_rate": 3.4872611464968152e-06,
+      "loss": 0.0069,
+      "step": 219
+    },
+    {
+      "clip_ratio": 0.0002681780064790473,
+      "epoch": 0.22945236174598907,
+      "grad_norm": 0.031188273802399635,
+      "kl": 0.003045320510864258,
+      "learning_rate": 3.5031847133757964e-06,
+      "loss": 0.0068,
+      "step": 220
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 516.6825013160706,
+      "epoch": 0.23136446476053898,
+      "grad_norm": 0.03775335103273392,
+      "kl": 0.003011941909790039,
+      "learning_rate": 3.5191082802547772e-06,
+      "loss": 0.0063,
+      "num_tokens": 90291858.0,
+      "reward": 0.04715401996509172,
+      "reward_std": 0.06113734241807833,
+      "rewards/pure_accuracy_reward_math": 0.04715401915018447,
+      "step": 221
+    },
+    {
+      "clip_ratio": 0.0002582234144483664,
+      "epoch": 0.23327656777508887,
+      "grad_norm": 0.03602875769138336,
+      "kl": 0.002973794937133789,
+      "learning_rate": 3.535031847133758e-06,
+      "loss": 0.0063,
+      "step": 222
+    },
+    {
+      "clip_ratio": 0.0002264754746761355,
+      "epoch": 0.2351886707896388,
+      "grad_norm": 0.03449266403913498,
+      "kl": 0.002980470657348633,
+      "learning_rate": 3.5509554140127388e-06,
+      "loss": 0.0063,
+      "step": 223
+    },
+    {
+      "clip_ratio": 0.00025999376231311544,
+      "epoch": 0.2371007738041887,
+      "grad_norm": 0.0329199843108654,
+      "kl": 0.002971053123474121,
+      "learning_rate": 3.56687898089172e-06,
+      "loss": 0.0062,
+      "step": 224
+    },
+    {
+      "clip_ratio": 0.000296181439978227,
+      "epoch": 0.23901287681873862,
+      "grad_norm": 0.033409375697374344,
+      "kl": 0.0030214786529541016,
+      "learning_rate": 3.5828025477707007e-06,
+      "loss": 0.0062,
+      "step": 225
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 530.4132494926453,
+      "epoch": 0.2409249798332885,
+      "grad_norm": 0.03549947962164879,
+      "kl": 0.004068970680236816,
+      "learning_rate": 3.5987261146496815e-06,
+      "loss": 0.0083,
+      "num_tokens": 93927655.0,
+      "reward": 0.039899555675219744,
+      "reward_std": 0.05890519870445132,
+      "rewards/pure_accuracy_reward_math": 0.03989955480210483,
+      "step": 226
+    },
+    {
+      "clip_ratio": 0.00024125495940552355,
+      "epoch": 0.24283708284783842,
+      "grad_norm": 0.033262889832258224,
+      "kl": 0.0040683746337890625,
+      "learning_rate": 3.6146496815286623e-06,
+      "loss": 0.0083,
+      "step": 227
+    },
+    {
+      "clip_ratio": 0.00024547909194438944,
+      "epoch": 0.24474918586238834,
+      "grad_norm": 0.03303634375333786,
+      "kl": 0.004040956497192383,
+      "learning_rate": 3.6305732484076435e-06,
+      "loss": 0.0083,
+      "step": 228
+    },
+    {
+      "clip_ratio": 0.0002773670349256463,
+      "epoch": 0.24666128887693825,
+      "grad_norm": 0.03389015421271324,
+      "kl": 0.00404667854309082,
+      "learning_rate": 3.6464968152866242e-06,
+      "loss": 0.0083,
+      "step": 229
+    },
+    {
+      "clip_ratio": 0.000270649900215858,
+      "epoch": 0.24857339189148817,
+      "grad_norm": 0.035877879709005356,
+      "kl": 0.0038802623748779297,
+      "learning_rate": 3.662420382165605e-06,
+      "loss": 0.0082,
+      "step": 230
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 527.9338984489441,
+      "epoch": 0.25048549490603805,
+      "grad_norm": 0.032850634306669235,
+      "kl": 0.0030744075775146484,
+      "learning_rate": 3.678343949044586e-06,
+      "loss": 0.0064,
+      "num_tokens": 97554714.0,
+      "reward": 0.04743303795112297,
+      "reward_std": 0.061522720265202224,
+      "rewards/pure_accuracy_reward_math": 0.047433037019800395,
+      "step": 231
+    },
+    {
+      "clip_ratio": 0.00024459305313939694,
+      "epoch": 0.25239759792058797,
+      "grad_norm": 0.03185749799013138,
+      "kl": 0.00302886962890625,
+      "learning_rate": 3.694267515923567e-06,
+      "loss": 0.0064,
+      "step": 232
+    },
+    {
+      "clip_ratio": 0.00025332184179660544,
+      "epoch": 0.2543097009351379,
+      "grad_norm": 0.03135737404227257,
+      "kl": 0.002967357635498047,
+      "learning_rate": 3.7101910828025477e-06,
+      "loss": 0.0064,
+      "step": 233
+    },
+    {
+      "clip_ratio": 0.0002861271710798974,
+      "epoch": 0.2562218039496878,
+      "grad_norm": 0.030725885182619095,
+      "kl": 0.0029573440551757812,
+      "learning_rate": 3.7261146496815285e-06,
+      "loss": 0.0064,
+      "step": 234
+    },
+    {
+      "clip_ratio": 0.0002841630366674508,
+      "epoch": 0.2581339069642377,
+      "grad_norm": 0.030670415610074997,
+      "kl": 0.002954721450805664,
+      "learning_rate": 3.7420382165605097e-06,
+      "loss": 0.0063,
+      "step": 235
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 531.4467325210571,
+      "epoch": 0.26004600997878763,
+      "grad_norm": 0.03534790128469467,
+      "kl": 0.003011465072631836,
+      "learning_rate": 3.757961783439491e-06,
+      "loss": 0.0041,
+      "num_tokens": 101193143.0,
+      "reward": 0.04631696638534777,
+      "reward_std": 0.0601075982558541,
+      "rewards/pure_accuracy_reward_math": 0.046316965454025194,
+      "step": 236
+    },
+    {
+      "clip_ratio": 0.00022260297603793333,
+      "epoch": 0.2619581129933375,
+      "grad_norm": 0.03438499942421913,
+      "kl": 0.0030508041381835938,
+      "learning_rate": 3.773885350318472e-06,
+      "loss": 0.0041,
+      "step": 237
+    },
+    {
+      "clip_ratio": 0.00024397839513312647,
+      "epoch": 0.2638702160078874,
+      "grad_norm": 0.032804593443870544,
+      "kl": 0.0030994415283203125,
+      "learning_rate": 3.789808917197453e-06,
+      "loss": 0.0041,
+      "step": 238
+    },
+    {
+      "clip_ratio": 0.0002508007286223801,
+      "epoch": 0.2657823190224373,
+      "grad_norm": 0.03402625024318695,
+      "kl": 0.0031244754791259766,
+      "learning_rate": 3.8057324840764336e-06,
+      "loss": 0.004,
+      "step": 239
+    },
+    {
+      "clip_ratio": 0.00025242620182552855,
+      "epoch": 0.26769442203698723,
+      "grad_norm": 0.03291900083422661,
+      "kl": 0.003187417984008789,
+      "learning_rate": 3.821656050955415e-06,
+      "loss": 0.004,
+      "step": 240
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 536.9913763999939,
+      "epoch": 0.26960652505153715,
+      "grad_norm": 0.033690325915813446,
+      "kl": 0.003125429153442383,
+      "learning_rate": 3.837579617834396e-06,
+      "loss": 0.0089,
+      "num_tokens": 104860392.0,
+      "reward": 0.05496652069268748,
+      "reward_std": 0.07028483308386058,
+      "rewards/pure_accuracy_reward_math": 0.05496651929570362,
+      "step": 241
+    },
+    {
+      "clip_ratio": 0.0002661047830088137,
+      "epoch": 0.27151862806608706,
+      "grad_norm": 0.03227640688419342,
+      "kl": 0.0031244754791259766,
+      "learning_rate": 3.853503184713376e-06,
+      "loss": 0.009,
+      "step": 242
+    },
+    {
+      "clip_ratio": 0.00027503777869242185,
+      "epoch": 0.273430731080637,
+      "grad_norm": 0.03168897703289986,
+      "kl": 0.003157377243041992,
+      "learning_rate": 3.869426751592357e-06,
+      "loss": 0.0089,
+      "step": 243
+    },
+    {
+      "clip_ratio": 0.00029653536631712996,
+      "epoch": 0.2753428340951869,
+      "grad_norm": 0.03222280368208885,
+      "kl": 0.0031862258911132812,
+      "learning_rate": 3.885350318471338e-06,
+      "loss": 0.0089,
+      "step": 244
+    },
+    {
+      "clip_ratio": 0.0003081631187455969,
+      "epoch": 0.2772549371097368,
+      "grad_norm": 0.03176514804363251,
+      "kl": 0.0032341480255126953,
+      "learning_rate": 3.901273885350319e-06,
+      "loss": 0.0088,
+      "step": 245
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 513.5616898536682,
+      "epoch": 0.27916704012428667,
+      "grad_norm": 0.037929438054561615,
+      "kl": 0.0035233497619628906,
+      "learning_rate": 3.9171974522293e-06,
+      "loss": 0.0075,
+      "num_tokens": 108427949.0,
+      "reward": 0.0544084852153901,
+      "reward_std": 0.0659469406818971,
+      "rewards/pure_accuracy_reward_math": 0.054408483527367935,
+      "step": 246
+    },
+    {
+      "clip_ratio": 0.0002633177949178389,
+      "epoch": 0.2810791431388366,
+      "grad_norm": 0.03561301901936531,
+      "kl": 0.0035467147827148438,
+      "learning_rate": 3.933121019108281e-06,
+      "loss": 0.0075,
+      "step": 247
+    },
+    {
+      "clip_ratio": 0.0003005996498472996,
+      "epoch": 0.2829912461533865,
+      "grad_norm": 0.035342708230018616,
+      "kl": 0.003578662872314453,
+      "learning_rate": 3.949044585987262e-06,
+      "loss": 0.0075,
+      "step": 248
+    },
+    {
+      "clip_ratio": 0.0003206986277177748,
+      "epoch": 0.2849033491679364,
+      "grad_norm": 0.03841444477438927,
+      "kl": 0.0036001205444335938,
+      "learning_rate": 3.964968152866243e-06,
+      "loss": 0.0075,
+      "step": 249
+    },
+    {
+      "clip_ratio": 0.00030761192169848073,
+      "epoch": 0.28681545218248633,
+      "grad_norm": 0.03515273705124855,
+      "kl": 0.003624439239501953,
+      "learning_rate": 3.980891719745223e-06,
+      "loss": 0.0074,
+      "step": 250
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 504.73858308792114,
+      "epoch": 0.28872755519703625,
+      "grad_norm": 0.04030496999621391,
+      "kl": 0.003686189651489258,
+      "learning_rate": 3.996815286624204e-06,
+      "loss": 0.0081,
+      "num_tokens": 111975532.0,
+      "reward": 0.0647321458964143,
+      "reward_std": 0.07547981187235564,
+      "rewards/pure_accuracy_reward_math": 0.06473214420839213,
+      "step": 251
+    },
+    {
+      "clip_ratio": 0.00031485489739679906,
+      "epoch": 0.29063965821158616,
+      "grad_norm": 0.04058763012290001,
+      "kl": 0.003683328628540039,
+      "learning_rate": 4.012738853503185e-06,
+      "loss": 0.0081,
+      "step": 252
+    },
+    {
+      "clip_ratio": 0.0003329372994471669,
+      "epoch": 0.2925517612261361,
+      "grad_norm": 0.039948880672454834,
+      "kl": 0.003644227981567383,
+      "learning_rate": 4.0286624203821666e-06,
+      "loss": 0.0081,
+      "step": 253
+    },
+    {
+      "clip_ratio": 0.00031999613804600813,
+      "epoch": 0.294463864240686,
+      "grad_norm": 0.038771189749240875,
+      "kl": 0.003670930862426758,
+      "learning_rate": 4.044585987261147e-06,
+      "loss": 0.008,
+      "step": 254
+    },
+    {
+      "clip_ratio": 0.0003391868065136805,
+      "epoch": 0.29637596725523585,
+      "grad_norm": 0.03820183873176575,
+      "kl": 0.0036439895629882812,
+      "learning_rate": 4.060509554140128e-06,
+      "loss": 0.0079,
+      "step": 255
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 507.980770111084,
+      "epoch": 0.29828807026978577,
+      "grad_norm": 0.0373733825981617,
+      "kl": 0.003556966781616211,
+      "learning_rate": 4.076433121019109e-06,
+      "loss": 0.0047,
+      "num_tokens": 115530899.0,
+      "reward": 0.05217634199652821,
+      "reward_std": 0.06624599173665047,
+      "rewards/pure_accuracy_reward_math": 0.05217634030850604,
+      "step": 256
+    },
+    {
+      "clip_ratio": 0.0002444871162765594,
+      "epoch": 0.3002001732843357,
+      "grad_norm": 0.03655192255973816,
+      "kl": 0.003623485565185547,
+      "learning_rate": 4.09235668789809e-06,
+      "loss": 0.0047,
+      "step": 257
+    },
+    {
+      "clip_ratio": 0.0002544127338524049,
+      "epoch": 0.3021122762988856,
+      "grad_norm": 0.035692181438207626,
+      "kl": 0.003640890121459961,
+      "learning_rate": 4.10828025477707e-06,
+      "loss": 0.0046,
+      "step": 258
+    },
+    {
+      "clip_ratio": 0.0002950017506577751,
+      "epoch": 0.3040243793134355,
+      "grad_norm": 0.03550735488533974,
+      "kl": 0.0036733150482177734,
+      "learning_rate": 4.124203821656051e-06,
+      "loss": 0.0046,
+      "step": 259
+    },
+    {
+      "clip_ratio": 0.0002894491571510116,
+      "epoch": 0.3059364823279854,
+      "grad_norm": 0.03471330925822258,
+      "kl": 0.00366973876953125,
+      "learning_rate": 4.140127388535032e-06,
+      "loss": 0.0045,
+      "step": 260
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 517.2921543121338,
+      "epoch": 0.30784858534253534,
+      "grad_norm": 0.6632264852523804,
+      "kl": 0.007929325103759766,
+      "learning_rate": 4.156050955414014e-06,
+      "loss": 0.0041,
+      "num_tokens": 119123970.0,
+      "reward": 0.046875002153683454,
+      "reward_std": 0.06358220643596724,
+      "rewards/pure_accuracy_reward_math": 0.04687500122236088,
+      "step": 261
+    },
+    {
+      "clip_ratio": 0.00027907352409783925,
+      "epoch": 0.30976068835708526,
+      "grad_norm": 0.03735750913619995,
+      "kl": 0.0038709640502929688,
+      "learning_rate": 4.171974522292994e-06,
+      "loss": 0.004,
+      "step": 262
+    },
+    {
+      "clip_ratio": 0.000277261100677606,
+      "epoch": 0.31167279137163517,
+      "grad_norm": 0.03806532546877861,
+      "kl": 0.004002094268798828,
+      "learning_rate": 4.187898089171975e-06,
+      "loss": 0.004,
+      "step": 263
+    },
+    {
+      "clip_ratio": 0.00026404397090118437,
+      "epoch": 0.31358489438618503,
+      "grad_norm": 0.03587675094604492,
+      "kl": 0.00407719612121582,
+      "learning_rate": 4.203821656050956e-06,
+      "loss": 0.0039,
+      "step": 264
+    },
+    {
+      "clip_ratio": 0.0003132741497324787,
+      "epoch": 0.31549699740073495,
+      "grad_norm": 0.03516336902976036,
+      "kl": 0.004099607467651367,
+      "learning_rate": 4.219745222929937e-06,
+      "loss": 0.0039,
+      "step": 265
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 528.596004486084,
+      "epoch": 0.31740910041528486,
+      "grad_norm": 0.038204919546842575,
+      "kl": 0.0035691261291503906,
+      "learning_rate": 4.2356687898089174e-06,
+      "loss": 0.006,
+      "num_tokens": 122758966.0,
+      "reward": 0.054966520925518125,
+      "reward_std": 0.06770737608894706,
+      "rewards/pure_accuracy_reward_math": 0.05496651912108064,
+      "step": 266
+    },
+    {
+      "clip_ratio": 0.00026713599251593223,
+      "epoch": 0.3193212034298348,
+      "grad_norm": 0.03804617002606392,
+      "kl": 0.003623485565185547,
+      "learning_rate": 4.251592356687898e-06,
+      "loss": 0.006,
+      "step": 267
+    },
+    {
+      "clip_ratio": 0.00027288361513910786,
+      "epoch": 0.3212333064443847,
+      "grad_norm": 0.03765474632382393,
+      "kl": 0.003659486770629883,
+      "learning_rate": 4.26751592356688e-06,
+      "loss": 0.006,
+      "step": 268
+    },
+    {
+      "clip_ratio": 0.0002754389876429286,
+      "epoch": 0.3231454094589346,
+      "grad_norm": 0.037356842309236526,
+      "kl": 0.0036840438842773438,
+      "learning_rate": 4.283439490445861e-06,
+      "loss": 0.0059,
+      "step": 269
+    },
+    {
+      "clip_ratio": 0.0002686067065269526,
+      "epoch": 0.3250575124734845,
+      "grad_norm": 0.03656876087188721,
+      "kl": 0.003694295883178711,
+      "learning_rate": 4.299363057324841e-06,
+      "loss": 0.0059,
+      "step": 270
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 543.1746897697449,
+      "epoch": 0.32696961548803444,
+      "grad_norm": 0.03417838364839554,
+      "kl": 0.0035529136657714844,
+      "learning_rate": 4.315286624203822e-06,
+      "loss": 0.0076,
+      "num_tokens": 126443800.0,
+      "reward": 0.04882812697906047,
+      "reward_std": 0.05766273388871923,
+      "rewards/pure_accuracy_reward_math": 0.04882812616415322,
+      "step": 271
+    },
+    {
+      "clip_ratio": 0.0002270729566475893,
+      "epoch": 0.32888171850258435,
+      "grad_norm": 0.03328363224864006,
+      "kl": 0.0035278797149658203,
+      "learning_rate": 4.331210191082803e-06,
+      "loss": 0.0076,
+      "step": 272
+    },
+    {
+      "clip_ratio": 0.0002132950650661769,
+      "epoch": 0.3307938215171342,
+      "grad_norm": 0.03230879083275795,
+      "kl": 0.0034902095794677734,
+      "learning_rate": 4.347133757961784e-06,
+      "loss": 0.0076,
+      "step": 273
+    },
+    {
+      "clip_ratio": 0.0002096330554195447,
+      "epoch": 0.3327059245316841,
+      "grad_norm": 0.031601596623659134,
+      "kl": 0.003440380096435547,
+      "learning_rate": 4.3630573248407645e-06,
+      "loss": 0.0076,
+      "step": 274
+    },
+    {
+      "clip_ratio": 0.00027223577194490645,
+      "epoch": 0.33461802754623404,
+      "grad_norm": 0.033090248703956604,
+      "kl": 0.003412485122680664,
+      "learning_rate": 4.378980891719746e-06,
+      "loss": 0.0075,
+      "step": 275
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 535.8942775726318,
+      "epoch": 0.33653013056078396,
+      "grad_norm": 0.03229549527168274,
+      "kl": 0.003350973129272461,
+      "learning_rate": 4.394904458598727e-06,
+      "loss": 0.0057,
+      "num_tokens": 130099677.0,
+      "reward": 0.04966518087894656,
+      "reward_std": 0.060705700190737844,
+      "rewards/pure_accuracy_reward_math": 0.049665179773001,
+      "step": 276
+    },
+    {
+      "clip_ratio": 0.00025271691475836633,
+      "epoch": 0.3384422335753339,
+      "grad_norm": 0.03214692696928978,
+      "kl": 0.0033435821533203125,
+      "learning_rate": 4.410828025477708e-06,
+      "loss": 0.0057,
+      "step": 277
+    },
+    {
+      "clip_ratio": 0.00023837689644778948,
+      "epoch": 0.3403543365898838,
+      "grad_norm": 0.03055053949356079,
+      "kl": 0.003403902053833008,
+      "learning_rate": 4.426751592356688e-06,
+      "loss": 0.0057,
+      "step": 278
+    },
+    {
+      "clip_ratio": 0.0002586998209039848,
+      "epoch": 0.3422664396044337,
+      "grad_norm": 0.030119990929961205,
+      "kl": 0.003477334976196289,
+      "learning_rate": 4.442675159235669e-06,
+      "loss": 0.0057,
+      "step": 279
+    },
+    {
+      "clip_ratio": 0.00026621688834893575,
+      "epoch": 0.3441785426189836,
+      "grad_norm": 0.030735207721590996,
+      "kl": 0.0035724639892578125,
+      "learning_rate": 4.45859872611465e-06,
+      "loss": 0.0056,
+      "step": 280
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 542.7466769218445,
+      "epoch": 0.34609064563353353,
+      "grad_norm": 0.033374350517988205,
+      "kl": 0.003545999526977539,
+      "learning_rate": 4.474522292993631e-06,
+      "loss": 0.0036,
+      "num_tokens": 133773381.0,
+      "reward": 0.051339288300368935,
+      "reward_std": 0.06345581240020692,
+      "rewards/pure_accuracy_reward_math": 0.05133928690338507,
+      "step": 281
+    },
+    {
+      "clip_ratio": 0.0002734534241994879,
+      "epoch": 0.3480027486480834,
+      "grad_norm": 0.03312847390770912,
+      "kl": 0.0035567283630371094,
+      "learning_rate": 4.490445859872612e-06,
+      "loss": 0.0036,
+      "step": 282
+    },
+    {
+      "clip_ratio": 0.00022532319422907676,
+      "epoch": 0.3499148516626333,
+      "grad_norm": 0.03281605243682861,
+      "kl": 0.0035707950592041016,
+      "learning_rate": 4.506369426751593e-06,
+      "loss": 0.0035,
+      "step": 283
+    },
+    {
+      "clip_ratio": 0.0002544033526419298,
+      "epoch": 0.3518269546771832,
+      "grad_norm": 0.032299675047397614,
+      "kl": 0.003595113754272461,
+      "learning_rate": 4.522292993630574e-06,
+      "loss": 0.0035,
+      "step": 284
+    },
+    {
+      "clip_ratio": 0.00024219880805276262,
+      "epoch": 0.35373905769173314,
+      "grad_norm": 0.031959276646375656,
+      "kl": 0.0035622119903564453,
+      "learning_rate": 4.538216560509555e-06,
+      "loss": 0.0035,
+      "step": 285
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 536.3122510910034,
+      "epoch": 0.35565116070628305,
+      "grad_norm": 0.035966720432043076,
+      "kl": 0.003755331039428711,
+      "learning_rate": 4.554140127388535e-06,
+      "loss": 0.0076,
+      "num_tokens": 137425032.0,
+      "reward": 0.05524553809664212,
+      "reward_std": 0.07191267621237785,
+      "rewards/pure_accuracy_reward_math": 0.055245536990696564,
+      "step": 286
+    },
+    {
+      "clip_ratio": 0.00029696975889237365,
+      "epoch": 0.35756326372083297,
+      "grad_norm": 0.03485076501965523,
+      "kl": 0.0036923885345458984,
+      "learning_rate": 4.570063694267516e-06,
+      "loss": 0.0076,
+      "step": 287
+    },
+    {
+      "clip_ratio": 0.0003252405772968814,
+      "epoch": 0.3594753667353829,
+      "grad_norm": 0.03465472534298897,
+      "kl": 0.003720998764038086,
+      "learning_rate": 4.585987261146497e-06,
+      "loss": 0.0076,
+      "step": 288
+    },
+    {
+      "clip_ratio": 0.0003269365803362234,
+      "epoch": 0.3613874697499328,
+      "grad_norm": 0.033384956419467926,
+      "kl": 0.003762483596801758,
+      "learning_rate": 4.601910828025479e-06,
+      "loss": 0.0075,
+      "step": 289
+    },
+    {
+      "clip_ratio": 0.0003269619904813226,
+      "epoch": 0.36329957276448266,
+      "grad_norm": 0.03343256562948227,
+      "kl": 0.0037889480590820312,
+      "learning_rate": 4.617834394904459e-06,
+      "loss": 0.0075,
+      "step": 290
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 533.6155371665955,
+      "epoch": 0.3652116757790326,
+      "grad_norm": 0.035127099603414536,
+      "kl": 0.0037310123443603516,
+      "learning_rate": 4.63375796178344e-06,
+      "loss": 0.0084,
+      "num_tokens": 141070278.0,
+      "reward": 0.05580357421422377,
+      "reward_std": 0.06861072615720332,
+      "rewards/pure_accuracy_reward_math": 0.05580357281723991,
+      "step": 291
+    },
+    {
+      "clip_ratio": 0.00026876470258230256,
+      "epoch": 0.3671237787935825,
+      "grad_norm": 0.034193847328424454,
+      "kl": 0.0037539005279541016,
+      "learning_rate": 4.649681528662421e-06,
+      "loss": 0.0084,
+      "step": 292
+    },
+    {
+      "clip_ratio": 0.00024497293054537295,
+      "epoch": 0.3690358818081324,
+      "grad_norm": 0.033800724893808365,
+      "kl": 0.0037734508514404297,
+      "learning_rate": 4.665605095541402e-06,
+      "loss": 0.0084,
+      "step": 293
+    },
+    {
+      "clip_ratio": 0.0002538224067620831,
+      "epoch": 0.3709479848226823,
+      "grad_norm": 0.03376767784357071,
+      "kl": 0.003782033920288086,
+      "learning_rate": 4.6815286624203824e-06,
+      "loss": 0.0083,
+      "step": 294
+    },
+    {
+      "clip_ratio": 0.00027697558522277177,
+      "epoch": 0.37286008783723223,
+      "grad_norm": 0.03229675441980362,
+      "kl": 0.003787994384765625,
+      "learning_rate": 4.697452229299363e-06,
+      "loss": 0.0083,
+      "step": 295
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 532.5739660263062,
+      "epoch": 0.37477219085178215,
+      "grad_norm": 0.035769619047641754,
+      "kl": 0.0037794113159179688,
+      "learning_rate": 4.713375796178344e-06,
+      "loss": 0.0057,
+      "num_tokens": 144715023.0,
+      "reward": 0.05915178812574595,
+      "reward_std": 0.07096926274243742,
+      "rewards/pure_accuracy_reward_math": 0.059151787019800395,
+      "step": 296
+    },
+    {
+      "clip_ratio": 0.00030428163654505624,
+      "epoch": 0.37668429386633207,
+      "grad_norm": 0.035648081451654434,
+      "kl": 0.003717660903930664,
+      "learning_rate": 4.729299363057326e-06,
+      "loss": 0.0057,
+      "step": 297
+    },
+    {
+      "clip_ratio": 0.00029741515106707084,
+      "epoch": 0.378596396880882,
+      "grad_norm": 0.03551783785223961,
+      "kl": 0.0036716461181640625,
+      "learning_rate": 4.745222929936306e-06,
+      "loss": 0.0057,
+      "step": 298
+    },
+    {
+      "clip_ratio": 0.0003008591765478741,
+      "epoch": 0.38050849989543184,
+      "grad_norm": 0.03452136367559433,
+      "kl": 0.0036542415618896484,
+      "learning_rate": 4.761146496815287e-06,
+      "loss": 0.0056,
+      "step": 299
+    },
+    {
+      "clip_ratio": 0.00032588979291858777,
+      "epoch": 0.38242060290998175,
+      "grad_norm": 0.03325437009334564,
+      "kl": 0.003694295883178711,
+      "learning_rate": 4.777070063694268e-06,
+      "loss": 0.0056,
+      "step": 300
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 519.2416524887085,
+      "epoch": 0.38433270592453167,
+      "grad_norm": 0.04327908158302307,
+      "kl": 0.004815816879272461,
+      "learning_rate": 4.792993630573249e-06,
+      "loss": 0.0041,
+      "num_tokens": 148307505.0,
+      "reward": 0.05329241341678426,
+      "reward_std": 0.061954362492542714,
+      "rewards/pure_accuracy_reward_math": 0.0532924123108387,
+      "step": 301
+    },
+    {
+      "clip_ratio": 0.0002521659018839273,
+      "epoch": 0.3862448089390816,
+      "grad_norm": 0.041329506784677505,
+      "kl": 0.004758596420288086,
+      "learning_rate": 4.8089171974522295e-06,
+      "loss": 0.0041,
+      "step": 302
+    },
+    {
+      "clip_ratio": 0.0002661041191913682,
+      "epoch": 0.3881569119536315,
+      "grad_norm": 0.03914090245962143,
+      "kl": 0.0045318603515625,
+      "learning_rate": 4.82484076433121e-06,
+      "loss": 0.0041,
+      "step": 303
+    },
+    {
+      "clip_ratio": 0.0002647961523507547,
+      "epoch": 0.3900690149681814,
+      "grad_norm": 0.0363956093788147,
+      "kl": 0.0043642520904541016,
+      "learning_rate": 4.840764331210192e-06,
+      "loss": 0.004,
+      "step": 304
+    },
+    {
+      "clip_ratio": 0.00030025097066754824,
+      "epoch": 0.39198111798273133,
+      "grad_norm": 0.05623022839426994,
+      "kl": 0.00441288948059082,
+      "learning_rate": 4.856687898089173e-06,
+      "loss": 0.004,
+      "step": 305
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 529.0496897697449,
+      "epoch": 0.39389322099728125,
+      "grad_norm": 0.03662995249032974,
+      "kl": 0.0038270950317382812,
+      "learning_rate": 4.872611464968153e-06,
+      "loss": 0.0077,
+      "num_tokens": 151936939.0,
+      "reward": 0.0560825914144516,
+      "reward_std": 0.061781705473549664,
+      "rewards/pure_accuracy_reward_math": 0.05608259071595967,
+      "step": 306
+    },
+    {
+      "clip_ratio": 0.00025576306325092446,
+      "epoch": 0.39580532401183116,
+      "grad_norm": 0.03553188219666481,
+      "kl": 0.00376129150390625,
+      "learning_rate": 4.888535031847134e-06,
+      "loss": 0.0076,
+      "step": 307
+    },
+    {
+      "clip_ratio": 0.00027371336784653977,
+      "epoch": 0.397717427026381,
+      "grad_norm": 0.035399794578552246,
+      "kl": 0.0036725997924804688,
+      "learning_rate": 4.904458598726115e-06,
+      "loss": 0.0076,
+      "step": 308
+    },
+    {
+      "clip_ratio": 0.0002955471370569285,
+      "epoch": 0.39962953004093094,
+      "grad_norm": 0.03487352281808853,
+      "kl": 0.003664731979370117,
+      "learning_rate": 4.920382165605096e-06,
+      "loss": 0.0076,
+      "step": 309
+    },
+    {
+      "clip_ratio": 0.00030850259520320833,
+      "epoch": 0.40154163305548085,
+      "grad_norm": 0.03433185815811157,
+      "kl": 0.003676176071166992,
+      "learning_rate": 4.9363057324840765e-06,
+      "loss": 0.0075,
+      "step": 310
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 524.8312191963196,
+      "epoch": 0.40345373607003077,
+      "grad_norm": 0.03824182599782944,
+      "kl": 0.003762483596801758,
+      "learning_rate": 4.952229299363058e-06,
+      "loss": 0.0062,
+      "num_tokens": 155550782.0,
+      "reward": 0.05496652075089514,
+      "reward_std": 0.0689961050520651,
+      "rewards/pure_accuracy_reward_math": 0.0549665194703266,
+      "step": 311
+    },
+    {
+      "clip_ratio": 0.0002548059320588436,
+      "epoch": 0.4053658390845807,
+      "grad_norm": 0.036028265953063965,
+      "kl": 0.003760099411010742,
+      "learning_rate": 4.968152866242039e-06,
+      "loss": 0.0062,
+      "step": 312
+    },
+    {
+      "clip_ratio": 0.00029642158040132927,
+      "epoch": 0.4072779420991306,
+      "grad_norm": 0.03537724167108536,
+      "kl": 0.0038378238677978516,
+      "learning_rate": 4.98407643312102e-06,
+      "loss": 0.0062,
+      "step": 313
+    },
+    {
+      "clip_ratio": 0.00030970463706125884,
+      "epoch": 0.4091900451136805,
+      "grad_norm": 0.03521754965186119,
+      "kl": 0.003871440887451172,
+      "learning_rate": 5e-06,
+      "loss": 0.0062,
+      "step": 314
+    },
+    {
+      "clip_ratio": 0.000315766970174991,
+      "epoch": 0.4111021481282304,
+      "grad_norm": 0.034070126712322235,
+      "kl": 0.0037851333618164062,
+      "learning_rate": 4.999992129526286e-06,
+      "loss": 0.0061,
+      "step": 315
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 528.3727917671204,
+      "epoch": 0.41301425114278034,
+      "grad_norm": 0.12440560013055801,
+      "kl": 0.005699872970581055,
+      "learning_rate": 4.999968518154701e-06,
+      "loss": 0.0041,
+      "num_tokens": 159174918.0,
+      "reward": 0.05050223457510583,
+      "reward_std": 0.06435916194459423,
+      "rewards/pure_accuracy_reward_math": 0.050502233527367935,
+      "step": 316
+    },
+    {
+      "clip_ratio": 0.0002532021657657424,
+      "epoch": 0.4149263541573302,
+      "grad_norm": 0.05440036952495575,
+      "kl": 0.005144357681274414,
+      "learning_rate": 4.99992916603391e-06,
+      "loss": 0.004,
+      "step": 317
+    },
+    {
+      "clip_ratio": 0.00025051761485883617,
+      "epoch": 0.4168384571718801,
+      "grad_norm": 0.051424141973257065,
+      "kl": 0.005103111267089844,
+      "learning_rate": 4.999874073411688e-06,
+      "loss": 0.004,
+      "step": 318
+    },
+    {
+      "clip_ratio": 0.0002561948363677402,
+      "epoch": 0.41875056018643003,
+      "grad_norm": 0.06930891424417496,
+      "kl": 0.004969120025634766,
+      "learning_rate": 4.9998032406349205e-06,
+      "loss": 0.0039,
+      "step": 319
+    },
+    {
+      "clip_ratio": 0.0002573228107394243,
+      "epoch": 0.42066266320097995,
+      "grad_norm": 0.06900722533464432,
+      "kl": 0.004853248596191406,
+      "learning_rate": 4.9997166681495975e-06,
+      "loss": 0.0039,
+      "step": 320
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 517.6638069152832,
+      "epoch": 0.42257476621552986,
+      "grad_norm": 0.03829098492860794,
+      "kl": 0.0038361549377441406,
+      "learning_rate": 4.999614356500811e-06,
+      "loss": 0.0072,
+      "num_tokens": 162764497.0,
+      "reward": 0.06110491356230341,
+      "reward_std": 0.07393209857400507,
+      "rewards/pure_accuracy_reward_math": 0.06110491222352721,
+      "step": 321
+    },
+    {
+      "clip_ratio": 0.0002886460991931017,
+      "epoch": 0.4244868692300798,
+      "grad_norm": 0.03761793673038483,
+      "kl": 0.0038406848907470703,
+      "learning_rate": 4.999496306332755e-06,
+      "loss": 0.0072,
+      "step": 322
+    },
+    {
+      "clip_ratio": 0.00029219654425105546,
+      "epoch": 0.4263989722446297,
+      "grad_norm": 0.03714153915643692,
+      "kl": 0.003914356231689453,
+      "learning_rate": 4.999362518388718e-06,
+      "loss": 0.0071,
+      "step": 323
+    },
+    {
+      "clip_ratio": 0.0003099845329757045,
+      "epoch": 0.4283110752591796,
+      "grad_norm": 0.03610815480351448,
+      "kl": 0.0039288997650146484,
+      "learning_rate": 4.99921299351108e-06,
+      "loss": 0.0071,
+      "step": 324
+    },
+    {
+      "clip_ratio": 0.0003404705674370234,
+      "epoch": 0.4302231782737295,
+      "grad_norm": 0.03599926084280014,
+      "kl": 0.003935813903808594,
+      "learning_rate": 4.999047732641305e-06,
+      "loss": 0.007,
+      "step": 325
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 510.4832811355591,
+      "epoch": 0.4321352812882794,
+      "grad_norm": 0.04078551381826401,
+      "kl": 0.003900766372680664,
+      "learning_rate": 4.998866736819938e-06,
+      "loss": 0.0063,
+      "num_tokens": 166324161.0,
+      "reward": 0.059151788242161274,
+      "reward_std": 0.07354671962093562,
+      "rewards/pure_accuracy_reward_math": 0.05915178725263104,
+      "step": 326
+    },
+    {
+      "clip_ratio": 0.00026936357801332633,
+      "epoch": 0.4340473843028293,
+      "grad_norm": 0.03855260834097862,
+      "kl": 0.003957986831665039,
+      "learning_rate": 4.998670007186599e-06,
+      "loss": 0.0063,
+      "step": 327
+    },
+    {
+      "clip_ratio": 0.0002843770836875592,
+      "epoch": 0.4359594873173792,
+      "grad_norm": 0.03724536672234535,
+      "kl": 0.0039751529693603516,
+      "learning_rate": 4.998457544979971e-06,
+      "loss": 0.0062,
+      "step": 328
+    },
+    {
+      "clip_ratio": 0.0003156123698886404,
+      "epoch": 0.43787159033192913,
+      "grad_norm": 0.03662634268403053,
+      "kl": 0.0040798187255859375,
+      "learning_rate": 4.998229351537797e-06,
+      "loss": 0.0062,
+      "step": 329
+    },
+    {
+      "clip_ratio": 0.0003457550078564964,
+      "epoch": 0.43978369334647904,
+      "grad_norm": 0.03598077967762947,
+      "kl": 0.004061460494995117,
+      "learning_rate": 4.997985428296869e-06,
+      "loss": 0.0061,
+      "step": 330
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 528.4207811355591,
+      "epoch": 0.44169579636102896,
+      "grad_norm": 0.08678283542394638,
+      "kl": 0.008905410766601562,
+      "learning_rate": 4.997725776793021e-06,
+      "loss": 0.0058,
+      "num_tokens": 169950285.0,
+      "reward": 0.05636160948779434,
+      "reward_std": 0.07148723275167868,
+      "rewards/pure_accuracy_reward_math": 0.05636160867288709,
+      "step": 331
+    },
+    {
+      "clip_ratio": 0.00029096677934603576,
+      "epoch": 0.4436078993755789,
+      "grad_norm": 0.09512893110513687,
+      "kl": 0.007820606231689453,
+      "learning_rate": 4.997450398661117e-06,
+      "loss": 0.0058,
+      "step": 332
+    },
+    {
+      "clip_ratio": 0.00029938158724007735,
+      "epoch": 0.4455200023901288,
+      "grad_norm": 0.24316293001174927,
+      "kl": 0.007544517517089844,
+      "learning_rate": 4.9971592956350405e-06,
+      "loss": 0.0057,
+      "step": 333
+    },
+    {
+      "clip_ratio": 0.00032061134919558754,
+      "epoch": 0.4474321054046787,
+      "grad_norm": 0.07169396430253983,
+      "kl": 0.006528377532958984,
+      "learning_rate": 4.996852469547688e-06,
+      "loss": 0.0057,
+      "step": 334
+    },
+    {
+      "clip_ratio": 0.00034978831735088534,
+      "epoch": 0.44934420841922856,
+      "grad_norm": 0.06073050945997238,
+      "kl": 0.0060198307037353516,
+      "learning_rate": 4.996529922330954e-06,
+      "loss": 0.0056,
+      "step": 335
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 535.8259167671204,
+      "epoch": 0.4512563114337785,
+      "grad_norm": 0.034031759947538376,
+      "kl": 0.0037636756896972656,
+      "learning_rate": 4.996191656015715e-06,
+      "loss": 0.0063,
+      "num_tokens": 173606605.0,
+      "reward": 0.05273437770665623,
+      "reward_std": 0.061655311612412333,
+      "rewards/pure_accuracy_reward_math": 0.05273437625146471,
+      "step": 336
+    },
+    {
+      "clip_ratio": 0.0002175188884052659,
+      "epoch": 0.4531684144483284,
+      "grad_norm": 0.03333257883787155,
+      "kl": 0.0038194656372070312,
+      "learning_rate": 4.995837672731827e-06,
+      "loss": 0.0063,
+      "step": 337
+    },
+    {
+      "clip_ratio": 0.00022021491247414815,
+      "epoch": 0.4550805174628783,
+      "grad_norm": 0.032678041607141495,
+      "kl": 0.0038101673126220703,
+      "learning_rate": 4.9954679747081e-06,
+      "loss": 0.0063,
+      "step": 338
+    },
+    {
+      "clip_ratio": 0.000264580338352971,
+      "epoch": 0.4569926204774282,
+      "grad_norm": 0.032030362635850906,
+      "kl": 0.0037910938262939453,
+      "learning_rate": 4.995082564272295e-06,
+      "loss": 0.0062,
+      "step": 339
+    },
+    {
+      "clip_ratio": 0.00027159255438391483,
+      "epoch": 0.45890472349197814,
+      "grad_norm": 0.031298909336328506,
+      "kl": 0.0038001537322998047,
+      "learning_rate": 4.994681443851102e-06,
+      "loss": 0.0062,
+      "step": 340
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 527.6174931526184,
+      "epoch": 0.46081682650652805,
+      "grad_norm": 0.04015278443694115,
+      "kl": 0.004010200500488281,
+      "learning_rate": 4.994264615970126e-06,
+      "loss": 0.0062,
+      "num_tokens": 177226454.0,
+      "reward": 0.056361609895247966,
+      "reward_std": 0.06633232033345848,
+      "rewards/pure_accuracy_reward_math": 0.05636160867288709,
+      "step": 341
+    },
+    {
+      "clip_ratio": 0.00026669438159387937,
+      "epoch": 0.46272892952107797,
+      "grad_norm": 0.03813392296433449,
+      "kl": 0.0039997100830078125,
+      "learning_rate": 4.993832083253874e-06,
+      "loss": 0.0062,
+      "step": 342
+    },
+    {
+      "clip_ratio": 0.0003048689098363866,
+      "epoch": 0.46464103253562783,
+      "grad_norm": 0.03776548057794571,
+      "kl": 0.004065752029418945,
+      "learning_rate": 4.993383848425736e-06,
+      "loss": 0.0061,
+      "step": 343
+    },
+    {
+      "clip_ratio": 0.0003051352168768062,
+      "epoch": 0.46655313555017774,
+      "grad_norm": 0.03955227509140968,
+      "kl": 0.0041925907135009766,
+      "learning_rate": 4.992919914307969e-06,
+      "loss": 0.0061,
+      "step": 344
+    },
+    {
+      "clip_ratio": 0.00030118576887616655,
+      "epoch": 0.46846523856472766,
+      "grad_norm": 0.036648593842983246,
+      "kl": 0.00420832633972168,
+      "learning_rate": 4.992440283821676e-06,
+      "loss": 0.006,
+      "step": 345
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 527.5131411552429,
+      "epoch": 0.4703773415792776,
+      "grad_norm": 13.381791114807129,
+      "kl": 0.1310877799987793,
+      "learning_rate": 4.991944959986793e-06,
+      "loss": 0.018,
+      "num_tokens": 180852413.0,
+      "reward": 0.06138393163564615,
+      "reward_std": 0.07144096971023828,
+      "rewards/pure_accuracy_reward_math": 0.061383930064039305,
+      "step": 346
+    },
+    {
+      "clip_ratio": 0.00030088673440786806,
+      "epoch": 0.4722894445938275,
+      "grad_norm": 1.359532356262207,
+      "kl": 0.01866316795349121,
+      "learning_rate": 4.991433945922068e-06,
+      "loss": 0.0135,
+      "step": 347
+    },
+    {
+      "clip_ratio": 0.0003527746957843192,
+      "epoch": 0.4742015476083774,
+      "grad_norm": 0.050763800740242004,
+      "kl": 0.005962371826171875,
+      "learning_rate": 4.9909072448450386e-06,
+      "loss": 0.013,
+      "step": 348
+    },
+    {
+      "clip_ratio": 0.0003426602560239189,
+      "epoch": 0.4761136506229273,
+      "grad_norm": 0.0476795993745327,
+      "kl": 0.006250858306884766,
+      "learning_rate": 4.990364860072014e-06,
+      "loss": 0.013,
+      "step": 349
+    },
+    {
+      "clip_ratio": 0.00033057811066328213,
+      "epoch": 0.47802575363747724,
+      "grad_norm": 0.04783082380890846,
+      "kl": 0.0066144466400146484,
+      "learning_rate": 4.989806795018054e-06,
+      "loss": 0.013,
+      "step": 350
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 522.409900188446,
+      "epoch": 0.47993785665202715,
+      "grad_norm": 0.036505699157714844,
+      "kl": 0.0040128231048583984,
+      "learning_rate": 4.989233053196948e-06,
+      "loss": 0.0024,
+      "num_tokens": 184454394.0,
+      "reward": 0.04771205602446571,
+      "reward_std": 0.05920424917712808,
+      "rewards/pure_accuracy_reward_math": 0.047712054976727813,
+      "step": 351
+    },
+    {
+      "clip_ratio": 0.00023261837060317703,
+      "epoch": 0.481849959666577,
+      "grad_norm": 0.037214819341897964,
+      "kl": 0.004108428955078125,
+      "learning_rate": 4.988643638221193e-06,
+      "loss": 0.0024,
+      "step": 352
+    },
+    {
+      "clip_ratio": 0.0002573013600795093,
+      "epoch": 0.4837620626811269,
+      "grad_norm": 0.03702811896800995,
+      "kl": 0.004202127456665039,
+      "learning_rate": 4.9880385538019665e-06,
+      "loss": 0.0024,
+      "step": 353
+    },
+    {
+      "clip_ratio": 0.0002758479482167786,
+      "epoch": 0.48567416569567684,
+      "grad_norm": 0.03838437795639038,
+      "kl": 0.004250764846801758,
+      "learning_rate": 4.987417803749112e-06,
+      "loss": 0.0023,
+      "step": 354
+    },
+    {
+      "clip_ratio": 0.00024451872050690326,
+      "epoch": 0.48758626871022676,
+      "grad_norm": 0.035314518958330154,
+      "kl": 0.00424647331237793,
+      "learning_rate": 4.986781391971105e-06,
+      "loss": 0.0023,
+      "step": 355
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 522.8789310455322,
+      "epoch": 0.48949837172477667,
+      "grad_norm": 0.038822874426841736,
+      "kl": 0.004703998565673828,
+      "learning_rate": 4.986129322475037e-06,
+      "loss": 0.006,
+      "num_tokens": 188061244.0,
+      "reward": 0.05887277075089514,
+      "reward_std": 0.0715272988891229,
+      "rewards/pure_accuracy_reward_math": 0.058872769062872976,
+      "step": 356
+    },
+    {
+      "clip_ratio": 0.0003040988601696881,
+      "epoch": 0.4914104747393266,
+      "grad_norm": 0.03750370442867279,
+      "kl": 0.004604816436767578,
+      "learning_rate": 4.985461599366583e-06,
+      "loss": 0.006,
+      "step": 357
+    },
+    {
+      "clip_ratio": 0.0003311016299676339,
+      "epoch": 0.4933225777538765,
+      "grad_norm": 0.03735021874308586,
+      "kl": 0.004613637924194336,
+      "learning_rate": 4.984778226849983e-06,
+      "loss": 0.0059,
+      "step": 358
+    },
+    {
+      "clip_ratio": 0.00031427563314423423,
+      "epoch": 0.4952346807684264,
+      "grad_norm": 0.037090424448251724,
+      "kl": 0.00463104248046875,
+      "learning_rate": 4.984079209228007e-06,
+      "loss": 0.0059,
+      "step": 359
+    },
+    {
+      "clip_ratio": 0.0003153682554284387,
+      "epoch": 0.49714678378297633,
+      "grad_norm": 0.03496375307440758,
+      "kl": 0.004604816436767578,
+      "learning_rate": 4.983364550901936e-06,
+      "loss": 0.0058,
+      "step": 360
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 523.5016980171204,
+      "epoch": 0.4990588867975262,
+      "grad_norm": 1978.1619873046875,
+      "kl": 5.663617134094238,
+      "learning_rate": 4.982634256371529e-06,
+      "loss": 0.2313,
+      "num_tokens": 191670522.0,
+      "reward": 0.05943080599536188,
+      "reward_std": 0.06242607004242018,
+      "rewards/pure_accuracy_reward_math": 0.059430805064039305,
+      "step": 361
+    },
+    {
+      "clip_ratio": 0.0003008291907349303,
+      "epoch": 0.5009709898120761,
+      "grad_norm": 6.705481052398682,
+      "kl": 0.07292413711547852,
+      "learning_rate": 4.981888330234998e-06,
+      "loss": 0.0076,
+      "step": 362
+    },
+    {
+      "clip_ratio": 0.00038137949604788446,
+      "epoch": 0.502883092826626,
+      "grad_norm": 0.4056338369846344,
+      "kl": 0.013193130493164062,
+      "learning_rate": 4.981126777188976e-06,
+      "loss": 0.0053,
+      "step": 363
+    },
+    {
+      "clip_ratio": 0.00039371675529764616,
+      "epoch": 0.5047951958411759,
+      "grad_norm": 0.40032151341438293,
+      "kl": 0.009969472885131836,
+      "learning_rate": 4.980349602028489e-06,
+      "loss": 0.0052,
+      "step": 364
+    },
+    {
+      "clip_ratio": 0.0003270253398568457,
+      "epoch": 0.5067072988557259,
+      "grad_norm": 0.08224909007549286,
+      "kl": 0.010345458984375,
+      "learning_rate": 4.979556809646928e-06,
+      "loss": 0.0051,
+      "step": 365
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 534.6082878112793,
+      "epoch": 0.5086194018702758,
+      "grad_norm": 0.036373648792505264,
+      "kl": 0.003941535949707031,
+      "learning_rate": 4.978748405036014e-06,
+      "loss": 0.0071,
+      "num_tokens": 195317270.0,
+      "reward": 0.05552455584984273,
+      "reward_std": 0.06775363947963342,
+      "rewards/pure_accuracy_reward_math": 0.05552455486031249,
+      "step": 366
+    },
+    {
+      "clip_ratio": 0.00027453447256675645,
+      "epoch": 0.5105315048848257,
+      "grad_norm": 0.03525104746222496,
+      "kl": 0.0039365291595458984,
+      "learning_rate": 4.977924393285767e-06,
+      "loss": 0.0072,
+      "step": 367
+    },
+    {
+      "clip_ratio": 0.0003015769660521528,
+      "epoch": 0.5124436078993756,
+      "grad_norm": 0.03737647458910942,
+      "kl": 0.0039522647857666016,
+      "learning_rate": 4.977084779584479e-06,
+      "loss": 0.0071,
+      "step": 368
+    },
+    {
+      "clip_ratio": 0.0002889172319555655,
+      "epoch": 0.5143557109139255,
+      "grad_norm": 0.03506501764059067,
+      "kl": 0.0039052963256835938,
+      "learning_rate": 4.976229569218676e-06,
+      "loss": 0.0071,
+      "step": 369
+    },
+    {
+      "clip_ratio": 0.0002910121094146234,
+      "epoch": 0.5162678139284754,
+      "grad_norm": 0.03558839485049248,
+      "kl": 0.003898143768310547,
+      "learning_rate": 4.975358767573085e-06,
+      "loss": 0.007,
+      "step": 370
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 523.1417660713196,
+      "epoch": 0.5181799169430253,
+      "grad_norm": 9.403284072875977,
+      "kl": 0.0705575942993164,
+      "learning_rate": 4.974472380130605e-06,
+      "loss": 0.0078,
+      "num_tokens": 198926094.0,
+      "reward": 0.06305803885334171,
+      "reward_std": 0.0737193762906827,
+      "rewards/pure_accuracy_reward_math": 0.06305803733994253,
+      "step": 371
+    },
+    {
+      "clip_ratio": 0.00028168898450076085,
+      "epoch": 0.5200920199575753,
+      "grad_norm": 0.10174906253814697,
+      "kl": 0.005540609359741211,
+      "learning_rate": 4.9735704124722665e-06,
+      "loss": 0.0053,
+      "step": 372
+    },
+    {
+      "clip_ratio": 0.00026055807722968893,
+      "epoch": 0.5220041229721252,
+      "grad_norm": 0.036394841969013214,
+      "kl": 0.004784584045410156,
+      "learning_rate": 4.9726528702771985e-06,
+      "loss": 0.0052,
+      "step": 373
+    },
+    {
+      "clip_ratio": 0.0003154287535949152,
+      "epoch": 0.523916225986675,
+      "grad_norm": 0.03702308237552643,
+      "kl": 0.004788875579833984,
+      "learning_rate": 4.971719759322596e-06,
+      "loss": 0.0052,
+      "step": 374
+    },
+    {
+      "clip_ratio": 0.000301387064496339,
+      "epoch": 0.5258283290012249,
+      "grad_norm": 0.03516030311584473,
+      "kl": 0.004770994186401367,
+      "learning_rate": 4.97077108548368e-06,
+      "loss": 0.0051,
+      "step": 375
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 520.7994132041931,
+      "epoch": 0.5277404320157748,
+      "grad_norm": 0.04183080792427063,
+      "kl": 0.006031513214111328,
+      "learning_rate": 4.969806854733658e-06,
+      "loss": 0.0091,
+      "num_tokens": 202522419.0,
+      "reward": 0.0638950924621895,
+      "reward_std": 0.07990403153235093,
+      "rewards/pure_accuracy_reward_math": 0.0638950903667137,
+      "step": 376
+    },
+    {
+      "clip_ratio": 0.00032519385399609746,
+      "epoch": 0.5296525350303247,
+      "grad_norm": 0.0407201424241066,
+      "kl": 0.005979061126708984,
+      "learning_rate": 4.968827073143694e-06,
+      "loss": 0.0091,
+      "step": 377
+    },
+    {
+      "clip_ratio": 0.00031682528469900717,
+      "epoch": 0.5315646380448746,
+      "grad_norm": 0.040043942630290985,
+      "kl": 0.005922555923461914,
+      "learning_rate": 4.967831746882863e-06,
+      "loss": 0.0091,
+      "step": 378
+    },
+    {
+      "clip_ratio": 0.00033513708405052967,
+      "epoch": 0.5334767410594246,
+      "grad_norm": 0.03983679041266441,
+      "kl": 0.005841970443725586,
+      "learning_rate": 4.966820882218118e-06,
+      "loss": 0.009,
+      "step": 379
+    },
+    {
+      "clip_ratio": 0.00034104771594911654,
+      "epoch": 0.5353888440739745,
+      "grad_norm": 0.03983955457806587,
+      "kl": 0.005755186080932617,
+      "learning_rate": 4.965794485514245e-06,
+      "loss": 0.0089,
+      "step": 380
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 520.5067186355591,
+      "epoch": 0.5373009470885244,
+      "grad_norm": 0.034092146903276443,
+      "kl": 0.0043926239013671875,
+      "learning_rate": 4.964752563233826e-06,
+      "loss": 0.008,
+      "num_tokens": 206122403.0,
+      "reward": 0.055803573748562485,
+      "reward_std": 0.05980854749213904,
+      "rewards/pure_accuracy_reward_math": 0.05580357275903225,
+      "step": 381
+    },
+    {
+      "clip_ratio": 0.00025422318708478997,
+      "epoch": 0.5392130501030743,
+      "grad_norm": 0.03263320028781891,
+      "kl": 0.0043218135833740234,
+      "learning_rate": 4.9636951219372e-06,
+      "loss": 0.008,
+      "step": 382
+    },
+    {
+      "clip_ratio": 0.00025885856206286917,
+      "epoch": 0.5411251531176242,
+      "grad_norm": 0.032487623393535614,
+      "kl": 0.004242420196533203,
+      "learning_rate": 4.962622168282416e-06,
+      "loss": 0.008,
+      "step": 383
+    },
+    {
+      "clip_ratio": 0.0002850476581102157,
+      "epoch": 0.5430372561321741,
+      "grad_norm": 0.032427769154310226,
+      "kl": 0.004185199737548828,
+      "learning_rate": 4.961533709025199e-06,
+      "loss": 0.0079,
+      "step": 384
+    },
+    {
+      "clip_ratio": 0.00029774147623129466,
+      "epoch": 0.544949359146724,
+      "grad_norm": 0.031092027202248573,
+      "kl": 0.004144430160522461,
+      "learning_rate": 4.960429751018901e-06,
+      "loss": 0.0079,
+      "step": 385
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 522.9258050918579,
+      "epoch": 0.546861462161274,
+      "grad_norm": 0.6398438811302185,
+      "kl": 0.013398170471191406,
+      "learning_rate": 4.959310301214458e-06,
+      "loss": 0.0048,
+      "num_tokens": 209727833.0,
+      "reward": 0.06668527127476409,
+      "reward_std": 0.07586519059259444,
+      "rewards/pure_accuracy_reward_math": 0.06668526941211894,
+      "step": 386
+    },
+    {
+      "clip_ratio": 0.0002956847454242961,
+      "epoch": 0.5487735651758239,
+      "grad_norm": 0.09603609144687653,
+      "kl": 0.006535530090332031,
+      "learning_rate": 4.958175366660352e-06,
+      "loss": 0.0045,
+      "step": 387
+    },
+    {
+      "clip_ratio": 0.00032585520455086225,
+      "epoch": 0.5506856681903738,
+      "grad_norm": 0.042251698672771454,
+      "kl": 0.004881858825683594,
+      "learning_rate": 4.95702495450256e-06,
+      "loss": 0.0045,
+      "step": 388
+    },
+    {
+      "clip_ratio": 0.00030688931195754776,
+      "epoch": 0.5525977712049237,
+      "grad_norm": 0.03725959733128548,
+      "kl": 0.00462651252746582,
+      "learning_rate": 4.955859071984512e-06,
+      "loss": 0.0044,
+      "step": 389
+    },
+    {
+      "clip_ratio": 0.0002833517196449975,
+      "epoch": 0.5545098742194736,
+      "grad_norm": 0.03557269275188446,
+      "kl": 0.004591941833496094,
+      "learning_rate": 4.954677726447049e-06,
+      "loss": 0.0044,
+      "step": 390
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 529.50141954422,
+      "epoch": 0.5564219772340235,
+      "grad_norm": 0.03767434135079384,
+      "kl": 0.0041730403900146484,
+      "learning_rate": 4.953480925328369e-06,
+      "loss": 0.0053,
+      "num_tokens": 213359594.0,
+      "reward": 0.05636160998255946,
+      "reward_std": 0.06873711966909468,
+      "rewards/pure_accuracy_reward_math": 0.05636160829453729,
+      "step": 391
+    },
+    {
+      "clip_ratio": 0.0002943199858691514,
+      "epoch": 0.5583340802485733,
+      "grad_norm": 0.03691519424319267,
+      "kl": 0.004199981689453125,
+      "learning_rate": 4.952268676163984e-06,
+      "loss": 0.0053,
+      "step": 392
+    },
+    {
+      "clip_ratio": 0.00028674039270981666,
+      "epoch": 0.5602461832631233,
+      "grad_norm": 0.036044176667928696,
+      "kl": 0.004216432571411133,
+      "learning_rate": 4.951040986586676e-06,
+      "loss": 0.0053,
+      "step": 393
+    },
+    {
+      "clip_ratio": 0.0003071572371595721,
+      "epoch": 0.5621582862776732,
+      "grad_norm": 0.0358373187482357,
+      "kl": 0.004226207733154297,
+      "learning_rate": 4.949797864326442e-06,
+      "loss": 0.0053,
+      "step": 394
+    },
+    {
+      "clip_ratio": 0.000308680556543095,
+      "epoch": 0.5640703892922231,
+      "grad_norm": 0.0356404110789299,
+      "kl": 0.004263877868652344,
+      "learning_rate": 4.9485393172104525e-06,
+      "loss": 0.0052,
+      "step": 395
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 528.1506924629211,
+      "epoch": 0.565982492306773,
+      "grad_norm": 0.03425108641386032,
+      "kl": 0.004232645034790039,
+      "learning_rate": 4.947265353162997e-06,
+      "loss": 0.0047,
+      "num_tokens": 216984490.0,
+      "reward": 0.05831473466241732,
+      "reward_std": 0.06912249873857945,
+      "rewards/pure_accuracy_reward_math": 0.058314733556471765,
+      "step": 396
+    },
+    {
+      "clip_ratio": 0.0002443079777663115,
+      "epoch": 0.5678945953213229,
+      "grad_norm": 0.03406741842627525,
+      "kl": 0.004246950149536133,
+      "learning_rate": 4.945975980205435e-06,
+      "loss": 0.0046,
+      "step": 397
+    },
+    {
+      "clip_ratio": 0.00025582832455484095,
+      "epoch": 0.5698066983358728,
+      "grad_norm": 0.033892109990119934,
+      "kl": 0.004239320755004883,
+      "learning_rate": 4.944671206456148e-06,
+      "loss": 0.0046,
+      "step": 398
+    },
+    {
+      "clip_ratio": 0.0002801110364885062,
+      "epoch": 0.5717188013504227,
+      "grad_norm": 0.03294463828206062,
+      "kl": 0.0042018890380859375,
+      "learning_rate": 4.943351040130485e-06,
+      "loss": 0.0046,
+      "step": 399
+    },
+    {
+      "clip_ratio": 0.00030015600407296006,
+      "epoch": 0.5736309043649727,
+      "grad_norm": 0.03228214010596275,
+      "kl": 0.004125118255615234,
+      "learning_rate": 4.942015489540715e-06,
+      "loss": 0.0045,
+      "step": 400
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 527.8225684165955,
+      "epoch": 0.5755430073795226,
+      "grad_norm": 0.037567272782325745,
+      "kl": 0.005152702331542969,
+      "learning_rate": 4.94066456309597e-06,
+      "loss": 0.0071,
+      "num_tokens": 220604938.0,
+      "reward": 0.06166294886497781,
+      "reward_std": 0.07311507751001045,
+      "rewards/pure_accuracy_reward_math": 0.06166294764261693,
+      "step": 401
+    },
+    {
+      "clip_ratio": 0.0002694410874823916,
+      "epoch": 0.5774551103940725,
+      "grad_norm": 0.036373041570186615,
+      "kl": 0.005210161209106445,
+      "learning_rate": 4.939298269302194e-06,
+      "loss": 0.0071,
+      "step": 402
+    },
+    {
+      "clip_ratio": 0.0002891406058438406,
+      "epoch": 0.5793672134086224,
+      "grad_norm": 0.03582580015063286,
+      "kl": 0.0052187442779541016,
+      "learning_rate": 4.9379166167620915e-06,
+      "loss": 0.007,
+      "step": 403
+    },
+    {
+      "clip_ratio": 0.00030127688086167836,
+      "epoch": 0.5812793164231723,
+      "grad_norm": 0.035248763859272,
+      "kl": 0.005229949951171875,
+      "learning_rate": 4.93651961417507e-06,
+      "loss": 0.007,
+      "step": 404
+    },
+    {
+      "clip_ratio": 0.00031262176707969047,
+      "epoch": 0.5831914194377222,
+      "grad_norm": 0.03461577743291855,
+      "kl": 0.00519251823425293,
+      "learning_rate": 4.9351072703371885e-06,
+      "loss": 0.0069,
+      "step": 405
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 534.0067219734192,
+      "epoch": 0.5851035224522722,
+      "grad_norm": 0.0363302007317543,
+      "kl": 0.004278659820556641,
+      "learning_rate": 4.933679594141096e-06,
+      "loss": 0.0041,
+      "num_tokens": 224253906.0,
+      "reward": 0.06222098533180542,
+      "reward_std": 0.07462272536940873,
+      "rewards/pure_accuracy_reward_math": 0.06222098329453729,
+      "step": 406
+    },
+    {
+      "clip_ratio": 0.0002887690876320903,
+      "epoch": 0.5870156254668221,
+      "grad_norm": 0.03538454696536064,
+      "kl": 0.004297971725463867,
+      "learning_rate": 4.932236594575986e-06,
+      "loss": 0.0041,
+      "step": 407
+    },
+    {
+      "clip_ratio": 0.00029836769689950415,
+      "epoch": 0.588927728481372,
+      "grad_norm": 0.03521309420466423,
+      "kl": 0.004305362701416016,
+      "learning_rate": 4.9307782807275304e-06,
+      "loss": 0.0041,
+      "step": 408
+    },
+    {
+      "clip_ratio": 0.0003077857980144927,
+      "epoch": 0.5908398314959219,
+      "grad_norm": 0.03468110039830208,
+      "kl": 0.004298210144042969,
+      "learning_rate": 4.929304661777823e-06,
+      "loss": 0.0041,
+      "step": 409
+    },
+    {
+      "clip_ratio": 0.00030735837987094783,
+      "epoch": 0.5927519345104717,
+      "grad_norm": 0.03504593297839165,
+      "kl": 0.004282474517822266,
+      "learning_rate": 4.9278157470053305e-06,
+      "loss": 0.004,
+      "step": 410
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 531.0987973213196,
+      "epoch": 0.5946640375250216,
+      "grad_norm": 0.03893313929438591,
+      "kl": 0.004411935806274414,
+      "learning_rate": 4.926311545784823e-06,
+      "loss": 0.0081,
+      "num_tokens": 227887088.0,
+      "reward": 0.06138393160654232,
+      "reward_std": 0.07560620526783168,
+      "rewards/pure_accuracy_reward_math": 0.061383930034935474,
+      "step": 411
+    },
+    {
+      "clip_ratio": 0.0003015478255292692,
+      "epoch": 0.5965761405395715,
+      "grad_norm": 0.03745520859956741,
+      "kl": 0.004415750503540039,
+      "learning_rate": 4.924792067587321e-06,
+      "loss": 0.0081,
+      "step": 412
+    },
+    {
+      "clip_ratio": 0.00033068407248038056,
+      "epoch": 0.5984882435541214,
+      "grad_norm": 0.037219781428575516,
+      "kl": 0.004396915435791016,
+      "learning_rate": 4.923257321980036e-06,
+      "loss": 0.0081,
+      "step": 413
+    },
+    {
+      "clip_ratio": 0.00037280973344877566,
+      "epoch": 0.6004003465686714,
+      "grad_norm": 0.03754372149705887,
+      "kl": 0.0044384002685546875,
+      "learning_rate": 4.9217073186263075e-06,
+      "loss": 0.0081,
+      "step": 414
+    },
+    {
+      "clip_ratio": 0.0003646712993372603,
+      "epoch": 0.6023124495832213,
+      "grad_norm": 0.03602118790149689,
+      "kl": 0.004477262496948242,
+      "learning_rate": 4.920142067285544e-06,
+      "loss": 0.008,
+      "step": 415
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 508.44282722473145,
+      "epoch": 0.6042245525977712,
+      "grad_norm": 0.039943527430295944,
+      "kl": 0.004469871520996094,
+      "learning_rate": 4.9185615778131614e-06,
+      "loss": 0.0078,
+      "num_tokens": 231443183.0,
+      "reward": 0.0705915211874526,
+      "reward_std": 0.07968511217040941,
+      "rewards/pure_accuracy_reward_math": 0.07059151926659979,
+      "step": 416
+    },
+    {
+      "clip_ratio": 0.00031770144798315414,
+      "epoch": 0.6061366556123211,
+      "grad_norm": 0.039055656641721725,
+      "kl": 0.004549264907836914,
+      "learning_rate": 4.916965860160521e-06,
+      "loss": 0.0078,
+      "step": 417
+    },
+    {
+      "clip_ratio": 0.00030108455553090607,
+      "epoch": 0.608048758626871,
+      "grad_norm": 0.03719799593091011,
+      "kl": 0.004551410675048828,
+      "learning_rate": 4.915354924374864e-06,
+      "loss": 0.0078,
+      "step": 418
+    },
+    {
+      "clip_ratio": 0.0003208976940527464,
+      "epoch": 0.6099608616414209,
+      "grad_norm": 0.03626833111047745,
+      "kl": 0.004576444625854492,
+      "learning_rate": 4.913728780599254e-06,
+      "loss": 0.0077,
+      "step": 419
+    },
+    {
+      "clip_ratio": 0.00030395733068644404,
+      "epoch": 0.6118729646559709,
+      "grad_norm": 0.035672470927238464,
+      "kl": 0.004616498947143555,
+      "learning_rate": 4.912087439072508e-06,
+      "loss": 0.0077,
+      "step": 420
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 519.3401436805725,
+      "epoch": 0.6137850676705208,
+      "grad_norm": 0.035979609936475754,
+      "kl": 0.004936695098876953,
+      "learning_rate": 4.9104309101291345e-06,
+      "loss": 0.008,
+      "num_tokens": 235040570.0,
+      "reward": 0.0558035739522893,
+      "reward_std": 0.06414644059259444,
+      "rewards/pure_accuracy_reward_math": 0.05580357278813608,
+      "step": 421
+    },
+    {
+      "clip_ratio": 0.0002606460908509689,
+      "epoch": 0.6156971706850707,
+      "grad_norm": 0.034824173897504807,
+      "kl": 0.004873991012573242,
+      "learning_rate": 4.908759204199268e-06,
+      "loss": 0.008,
+      "step": 422
+    },
+    {
+      "clip_ratio": 0.0002711625579081556,
+      "epoch": 0.6176092736996206,
+      "grad_norm": 0.034011878073215485,
+      "kl": 0.00480341911315918,
+      "learning_rate": 4.907072331808602e-06,
+      "loss": 0.008,
+      "step": 423
+    },
+    {
+      "clip_ratio": 0.0002719364555332504,
+      "epoch": 0.6195213767141705,
+      "grad_norm": 0.0330798402428627,
+      "kl": 0.00470733642578125,
+      "learning_rate": 4.905370303578324e-06,
+      "loss": 0.0079,
+      "step": 424
+    },
+    {
+      "clip_ratio": 0.0003164075427548596,
+      "epoch": 0.6214334797287204,
+      "grad_norm": 0.03356935828924179,
+      "kl": 0.004645586013793945,
+      "learning_rate": 4.903653130225049e-06,
+      "loss": 0.0079,
+      "step": 425
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 524.4051547050476,
+      "epoch": 0.6233455827432703,
+      "grad_norm": 0.037987031042575836,
+      "kl": 0.004395723342895508,
+      "learning_rate": 4.901920822560753e-06,
+      "loss": 0.004,
+      "num_tokens": 238650146.0,
+      "reward": 0.056082592491293326,
+      "reward_std": 0.06946781190345064,
+      "rewards/pure_accuracy_reward_math": 0.05608259033760987,
+      "step": 426
+    },
+    {
+      "clip_ratio": 0.0002752577877913609,
+      "epoch": 0.6252576857578201,
+      "grad_norm": 0.03711739555001259,
+      "kl": 0.0043413639068603516,
+      "learning_rate": 4.900173391492698e-06,
+      "loss": 0.004,
+      "step": 427
+    },
+    {
+      "clip_ratio": 0.0002780464546390249,
+      "epoch": 0.6271697887723701,
+      "grad_norm": 0.03583519160747528,
+      "kl": 0.004349231719970703,
+      "learning_rate": 4.898410848023374e-06,
+      "loss": 0.004,
+      "step": 428
+    },
+    {
+      "clip_ratio": 0.0002759867400072835,
+      "epoch": 0.62908189178692,
+      "grad_norm": 0.035115331411361694,
+      "kl": 0.0043909549713134766,
+      "learning_rate": 4.896633203250424e-06,
+      "loss": 0.0039,
+      "step": 429
+    },
+    {
+      "clip_ratio": 0.0002873923492074937,
+      "epoch": 0.6309939948014699,
+      "grad_norm": 0.03465187922120094,
+      "kl": 0.004460573196411133,
+      "learning_rate": 4.89484046836657e-06,
+      "loss": 0.0039,
+      "step": 430
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 527.1116304397583,
+      "epoch": 0.6329060978160198,
+      "grad_norm": 0.03591939061880112,
+      "kl": 0.004395723342895508,
+      "learning_rate": 4.893032654659554e-06,
+      "loss": 0.0068,
+      "num_tokens": 242275198.0,
+      "reward": 0.05859375320142135,
+      "reward_std": 0.06461814750218764,
+      "rewards/pure_accuracy_reward_math": 0.05859375110594556,
+      "step": 431
+    },
+    {
+      "clip_ratio": 0.00021255032419276176,
+      "epoch": 0.6348182008305697,
+      "grad_norm": 0.03488593176007271,
+      "kl": 0.0043849945068359375,
+      "learning_rate": 4.891209773512054e-06,
+      "loss": 0.0068,
+      "step": 432
+    },
+    {
+      "clip_ratio": 0.00023523596212271514,
+      "epoch": 0.6367303038451196,
+      "grad_norm": 0.03410722687840462,
+      "kl": 0.004419565200805664,
+      "learning_rate": 4.889371836401621e-06,
+      "loss": 0.0067,
+      "step": 433
+    },
+    {
+      "clip_ratio": 0.00024576090385153293,
+      "epoch": 0.6386424068596696,
+      "grad_norm": 0.03335421159863472,
+      "kl": 0.004421710968017578,
+      "learning_rate": 4.887518854900603e-06,
+      "loss": 0.0067,
+      "step": 434
+    },
+    {
+      "clip_ratio": 0.0002828803910119859,
+      "epoch": 0.6405545098742195,
+      "grad_norm": 0.03240649402141571,
+      "kl": 0.004340171813964844,
+      "learning_rate": 4.885650840676074e-06,
+      "loss": 0.0066,
+      "step": 435
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 532.2051043510437,
+      "epoch": 0.6424666128887694,
+      "grad_norm": 0.03588009625673294,
+      "kl": 0.0044574737548828125,
+      "learning_rate": 4.88376780548976e-06,
+      "loss": 0.0041,
+      "num_tokens": 245917009.0,
+      "reward": 0.05775669912691228,
+      "reward_std": 0.06611959752626717,
+      "rewards/pure_accuracy_reward_math": 0.05775669778813608,
+      "step": 436
+    },
+    {
+      "clip_ratio": 0.0002524082638899472,
+      "epoch": 0.6443787159033193,
+      "grad_norm": 0.03471923619508743,
+      "kl": 0.0044062137603759766,
+      "learning_rate": 4.881869761197963e-06,
+      "loss": 0.0041,
+      "step": 437
+    },
+    {
+      "clip_ratio": 0.0002889056303843063,
+      "epoch": 0.6462908189178692,
+      "grad_norm": 0.03379988297820091,
+      "kl": 0.004372119903564453,
+      "learning_rate": 4.879956719751491e-06,
+      "loss": 0.004,
+      "step": 438
+    },
+    {
+      "clip_ratio": 0.0003009145272017122,
+      "epoch": 0.6482029219324191,
+      "grad_norm": 0.03446533530950546,
+      "kl": 0.004400730133056641,
+      "learning_rate": 4.878028693195577e-06,
+      "loss": 0.004,
+      "step": 439
+    },
+    {
+      "clip_ratio": 0.00030466545126728306,
+      "epoch": 0.650115024946969,
+      "grad_norm": 0.03484022617340088,
+      "kl": 0.004462242126464844,
+      "learning_rate": 4.876085693669806e-06,
+      "loss": 0.0039,
+      "step": 440
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 517.0904240608215,
+      "epoch": 0.652027127961519,
+      "grad_norm": 0.0366295725107193,
+      "kl": 0.004509925842285156,
+      "learning_rate": 4.8741277334080405e-06,
+      "loss": 0.0066,
+      "num_tokens": 249502673.0,
+      "reward": 0.05719866382423788,
+      "reward_std": 0.06594694149680436,
+      "rewards/pure_accuracy_reward_math": 0.057198662078008056,
+      "step": 441
+    },
+    {
+      "clip_ratio": 0.00023539985437537325,
+      "epoch": 0.6539392309760689,
+      "grad_norm": 0.03590084984898567,
+      "kl": 0.0045740604400634766,
+      "learning_rate": 4.87215482473834e-06,
+      "loss": 0.0066,
+      "step": 442
+    },
+    {
+      "clip_ratio": 0.00022167488214108744,
+      "epoch": 0.6558513339906188,
+      "grad_norm": 0.03433714434504509,
+      "kl": 0.004676342010498047,
+      "learning_rate": 4.870166980082885e-06,
+      "loss": 0.0066,
+      "step": 443
+    },
+    {
+      "clip_ratio": 0.0002476425726172238,
+      "epoch": 0.6577634370051687,
+      "grad_norm": 0.03389691188931465,
+      "kl": 0.004789113998413086,
+      "learning_rate": 4.868164211957899e-06,
+      "loss": 0.0065,
+      "step": 444
+    },
+    {
+      "clip_ratio": 0.00025810993128061455,
+      "epoch": 0.6596755400197185,
+      "grad_norm": 0.03417885676026344,
+      "kl": 0.004879474639892578,
+      "learning_rate": 4.866146532973569e-06,
+      "loss": 0.0064,
+      "step": 445
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 520.3697214126587,
+      "epoch": 0.6615876430342684,
+      "grad_norm": 0.03560737892985344,
+      "kl": 0.00455927848815918,
+      "learning_rate": 4.864113955833967e-06,
+      "loss": 0.0056,
+      "num_tokens": 253104314.0,
+      "reward": 0.06584821722935885,
+      "reward_std": 0.07672227645525709,
+      "rewards/pure_accuracy_reward_math": 0.06584821565775201,
+      "step": 446
+    },
+    {
+      "clip_ratio": 0.00029780695723502504,
+      "epoch": 0.6634997460488183,
+      "grad_norm": 0.034836821258068085,
+      "kl": 0.0045278072357177734,
+      "learning_rate": 4.862066493336967e-06,
+      "loss": 0.0056,
+      "step": 447
+    },
+    {
+      "clip_ratio": 0.00030120932990485016,
+      "epoch": 0.6654118490633683,
+      "grad_norm": 0.03460467606782913,
+      "kl": 0.0045435428619384766,
+      "learning_rate": 4.860004158374172e-06,
+      "loss": 0.0055,
+      "step": 448
+    },
+    {
+      "clip_ratio": 0.000313081463019671,
+      "epoch": 0.6673239520779182,
+      "grad_norm": 0.03467562422156334,
+      "kl": 0.004552364349365234,
+      "learning_rate": 4.857926963930822e-06,
+      "loss": 0.0055,
+      "step": 449
+    },
+    {
+      "clip_ratio": 0.00031086072692687594,
+      "epoch": 0.6692360550924681,
+      "grad_norm": 0.03409102186560631,
+      "kl": 0.004626035690307617,
+      "learning_rate": 4.855834923085721e-06,
+      "loss": 0.0054,
+      "step": 450
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 514.4771447181702,
+      "epoch": 0.671148158107018,
+      "grad_norm": 0.03815117105841637,
+      "kl": 0.005002737045288086,
+      "learning_rate": 4.853728049011151e-06,
+      "loss": 0.0091,
+      "num_tokens": 256687388.0,
+      "reward": 0.06556919938884676,
+      "reward_std": 0.07874169782735407,
+      "rewards/pure_accuracy_reward_math": 0.06556919787544757,
+      "step": 451
+    },
+    {
+      "clip_ratio": 0.0003133106871473501,
+      "epoch": 0.6730602611215679,
+      "grad_norm": 0.03761136531829834,
+      "kl": 0.005041837692260742,
+      "learning_rate": 4.851606354972791e-06,
+      "loss": 0.0091,
+      "step": 452
+    },
+    {
+      "clip_ratio": 0.00034106033973557714,
+      "epoch": 0.6749723641361178,
+      "grad_norm": 0.0372379869222641,
+      "kl": 0.0050508975982666016,
+      "learning_rate": 4.849469854329629e-06,
+      "loss": 0.0091,
+      "step": 453
+    },
+    {
+      "clip_ratio": 0.00033749614277667206,
+      "epoch": 0.6768844671506677,
+      "grad_norm": 0.03686762601137161,
+      "kl": 0.005095005035400391,
+      "learning_rate": 4.847318560533882e-06,
+      "loss": 0.009,
+      "step": 454
+    },
+    {
+      "clip_ratio": 0.00035140375177888927,
+      "epoch": 0.6787965701652177,
+      "grad_norm": 0.036469750106334686,
+      "kl": 0.005120754241943359,
+      "learning_rate": 4.845152487130914e-06,
+      "loss": 0.009,
+      "step": 455
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 512.4866299629211,
+      "epoch": 0.6807086731797676,
+      "grad_norm": 0.037901297211647034,
+      "kl": 0.004809379577636719,
+      "learning_rate": 4.842971647759142e-06,
+      "loss": 0.0063,
+      "num_tokens": 260253700.0,
+      "reward": 0.05775669912691228,
+      "reward_std": 0.06710927549283952,
+      "rewards/pure_accuracy_reward_math": 0.05775669767172076,
+      "step": 456
+    },
+    {
+      "clip_ratio": 0.00026634283756266086,
+      "epoch": 0.6826207761943175,
+      "grad_norm": 0.03568252548575401,
+      "kl": 0.0047724246978759766,
+      "learning_rate": 4.840776056149957e-06,
+      "loss": 0.0063,
+      "step": 457
+    },
+    {
+      "clip_ratio": 0.00027518686636085476,
+      "epoch": 0.6845328792088674,
+      "grad_norm": 0.0351024754345417,
+      "kl": 0.004754543304443359,
+      "learning_rate": 4.838565726127636e-06,
+      "loss": 0.0063,
+      "step": 458
+    },
+    {
+      "clip_ratio": 0.0003387172891393675,
+      "epoch": 0.6864449822234173,
+      "grad_norm": 0.03477272391319275,
+      "kl": 0.004698753356933594,
+      "learning_rate": 4.836340671609255e-06,
+      "loss": 0.0062,
+      "step": 459
+    },
+    {
+      "clip_ratio": 0.0003592506114102889,
+      "epoch": 0.6883570852379672,
+      "grad_norm": 0.035812895745038986,
+      "kl": 0.004735708236694336,
+      "learning_rate": 4.834100906604601e-06,
+      "loss": 0.0062,
+      "step": 460
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 536.1403703689575,
+      "epoch": 0.6902691882525172,
+      "grad_norm": 0.03566034138202667,
+      "kl": 0.004418611526489258,
+      "learning_rate": 4.831846445216082e-06,
+      "loss": 0.0056,
+      "num_tokens": 263902651.0,
+      "reward": 0.05161830614088103,
+      "reward_std": 0.06899610540131107,
+      "rewards/pure_accuracy_reward_math": 0.051618304976727813,
+      "step": 461
+    },
+    {
+      "clip_ratio": 0.00028340513017610647,
+      "epoch": 0.6921812912670671,
+      "grad_norm": 0.03495897352695465,
+      "kl": 0.004414081573486328,
+      "learning_rate": 4.829577301638642e-06,
+      "loss": 0.0056,
+      "step": 462
+    },
+    {
+      "clip_ratio": 0.0002825141077664739,
+      "epoch": 0.6940933942816169,
+      "grad_norm": 0.034486111253499985,
+      "kl": 0.004411220550537109,
+      "learning_rate": 4.827293490159668e-06,
+      "loss": 0.0056,
+      "step": 463
+    },
+    {
+      "clip_ratio": 0.00031019614829119746,
+      "epoch": 0.6960054972961668,
+      "grad_norm": 0.035884980112314224,
+      "kl": 0.004367351531982422,
+      "learning_rate": 4.824995025158903e-06,
+      "loss": 0.0055,
+      "step": 464
+    },
+    {
+      "clip_ratio": 0.0003045983889933268,
+      "epoch": 0.6979176003107167,
+      "grad_norm": 0.03378836810588837,
+      "kl": 0.004292488098144531,
+      "learning_rate": 4.822681921108355e-06,
+      "loss": 0.0055,
+      "step": 465
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 525.3783731460571,
+      "epoch": 0.6998297033252666,
+      "grad_norm": 0.03726997971534729,
+      "kl": 0.0065157413482666016,
+      "learning_rate": 4.8203541925722016e-06,
+      "loss": 0.0017,
+      "num_tokens": 267508687.0,
+      "reward": 0.06724330646102317,
+      "reward_std": 0.07591145433252677,
+      "rewards/pure_accuracy_reward_math": 0.06724330500583164,
+      "step": 466
+    },
+    {
+      "clip_ratio": 0.00026273680936128585,
+      "epoch": 0.7017418063398165,
+      "grad_norm": 0.03638988733291626,
+      "kl": 0.0064983367919921875,
+      "learning_rate": 4.818011854206706e-06,
+      "loss": 0.0017,
+      "step": 467
+    },
+    {
+      "clip_ratio": 0.0002903113285128711,
+      "epoch": 0.7036539093543664,
+      "grad_norm": 0.0360158272087574,
+      "kl": 0.006509542465209961,
+      "learning_rate": 4.815654920760117e-06,
+      "loss": 0.0016,
+      "step": 468
+    },
+    {
+      "clip_ratio": 0.0002849762186087901,
+      "epoch": 0.7055660123689164,
+      "grad_norm": 0.03577370196580887,
+      "kl": 0.006470680236816406,
+      "learning_rate": 4.81328340707258e-06,
+      "loss": 0.0016,
+      "step": 469
+    },
+    {
+      "clip_ratio": 0.00031370155647891806,
+      "epoch": 0.7074781153834663,
+      "grad_norm": 0.03484919294714928,
+      "kl": 0.006468772888183594,
+      "learning_rate": 4.810897328076045e-06,
+      "loss": 0.0015,
+      "step": 470
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 530.1677136421204,
+      "epoch": 0.7093902183980162,
+      "grad_norm": 0.04198005422949791,
+      "kl": 0.004724264144897461,
+      "learning_rate": 4.808496698794171e-06,
+      "loss": 0.0046,
+      "num_tokens": 271138708.0,
+      "reward": 0.07310268204309978,
+      "reward_std": 0.07646948879119009,
+      "rewards/pure_accuracy_reward_math": 0.07310267994762398,
+      "step": 471
+    },
+    {
+      "clip_ratio": 0.00028702764876697984,
+      "epoch": 0.7113023214125661,
+      "grad_norm": 0.04015243798494339,
+      "kl": 0.004670619964599609,
+      "learning_rate": 4.8060815343422265e-06,
+      "loss": 0.0045,
+      "step": 472
+    },
+    {
+      "clip_ratio": 0.0002947892680822406,
+      "epoch": 0.713214424427116,
+      "grad_norm": 0.0385352224111557,
+      "kl": 0.0046727657318115234,
+      "learning_rate": 4.803651849927004e-06,
+      "loss": 0.0045,
+      "step": 473
+    },
+    {
+      "clip_ratio": 0.00036661511779811917,
+      "epoch": 0.7151265274416659,
+      "grad_norm": 0.03803607076406479,
+      "kl": 0.00463414192199707,
+      "learning_rate": 4.801207660846717e-06,
+      "loss": 0.0044,
+      "step": 474
+    },
+    {
+      "clip_ratio": 0.00040073674449558894,
+      "epoch": 0.7170386304562159,
+      "grad_norm": 0.03870271518826485,
+      "kl": 0.00464320182800293,
+      "learning_rate": 4.798748982490908e-06,
+      "loss": 0.0044,
+      "step": 475
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 539.262857913971,
+      "epoch": 0.7189507334707658,
+      "grad_norm": 0.0374424010515213,
+      "kl": 0.0045392513275146484,
+      "learning_rate": 4.796275830340344e-06,
+      "loss": 0.0081,
+      "num_tokens": 274802094.0,
+      "reward": 0.061941967433085665,
+      "reward_std": 0.07401842664694414,
+      "rewards/pure_accuracy_reward_math": 0.06194196522119455,
+      "step": 476
+    },
+    {
+      "clip_ratio": 0.00026828293908920386,
+      "epoch": 0.7208628364853157,
+      "grad_norm": 0.03758076950907707,
+      "kl": 0.004576683044433594,
+      "learning_rate": 4.793788219966931e-06,
+      "loss": 0.0081,
+      "step": 477
+    },
+    {
+      "clip_ratio": 0.0002991793934654652,
+      "epoch": 0.7227749394998656,
+      "grad_norm": 0.03570091351866722,
+      "kl": 0.0045130252838134766,
+      "learning_rate": 4.7912861670336065e-06,
+      "loss": 0.008,
+      "step": 478
+    },
+    {
+      "clip_ratio": 0.00031140293214093617,
+      "epoch": 0.7246870425144155,
+      "grad_norm": 0.034991368651390076,
+      "kl": 0.0044956207275390625,
+      "learning_rate": 4.788769687294243e-06,
+      "loss": 0.008,
+      "step": 479
+    },
+    {
+      "clip_ratio": 0.00034215352269484356,
+      "epoch": 0.7265991455289653,
+      "grad_norm": 0.03517301753163338,
+      "kl": 0.00450587272644043,
+      "learning_rate": 4.7862387965935504e-06,
+      "loss": 0.0079,
+      "step": 480
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 535.2455615997314,
+      "epoch": 0.7285112485435152,
+      "grad_norm": 0.03517255187034607,
+      "kl": 0.004718780517578125,
+      "learning_rate": 4.783693510866977e-06,
+      "loss": 0.0066,
+      "num_tokens": 278455030.0,
+      "reward": 0.06222098530270159,
+      "reward_std": 0.069766862958204,
+      "rewards/pure_accuracy_reward_math": 0.062220983498264104,
+      "step": 481
+    },
+    {
+      "clip_ratio": 0.00026954136529866446,
+      "epoch": 0.7304233515580651,
+      "grad_norm": 0.03456445038318634,
+      "kl": 0.004766225814819336,
+      "learning_rate": 4.781133846140606e-06,
+      "loss": 0.0066,
+      "step": 482
+    },
+    {
+      "clip_ratio": 0.000250861422671278,
+      "epoch": 0.7323354545726151,
+      "grad_norm": 0.033632129430770874,
+      "kl": 0.004829883575439453,
+      "learning_rate": 4.778559818531055e-06,
+      "loss": 0.0066,
+      "step": 483
+    },
+    {
+      "clip_ratio": 0.0002590245896385568,
+      "epoch": 0.734247557587165,
+      "grad_norm": 0.03314875811338425,
+      "kl": 0.00486445426940918,
+      "learning_rate": 4.775971444245379e-06,
+      "loss": 0.0065,
+      "step": 484
+    },
+    {
+      "clip_ratio": 0.0002899982684425595,
+      "epoch": 0.7361596606017149,
+      "grad_norm": 0.03288432955741882,
+      "kl": 0.004921674728393555,
+      "learning_rate": 4.773368739580963e-06,
+      "loss": 0.0065,
+      "step": 485
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 524.4258046150208,
+      "epoch": 0.7380717636162648,
+      "grad_norm": 0.08309170603752136,
+      "kl": 0.006993293762207031,
+      "learning_rate": 4.770751720925422e-06,
+      "loss": 0.0023,
+      "num_tokens": 282068152.0,
+      "reward": 0.06222098495345563,
+      "reward_std": 0.0712282478925772,
+      "rewards/pure_accuracy_reward_math": 0.06222098338184878,
+      "step": 486
+    },
+    {
+      "clip_ratio": 0.0002442373284452515,
+      "epoch": 0.7399838666308147,
+      "grad_norm": 0.042120546102523804,
+      "kl": 0.006081581115722656,
+      "learning_rate": 4.768120404756497e-06,
+      "loss": 0.0023,
+      "step": 487
+    },
+    {
+      "clip_ratio": 0.0002956131474434187,
+      "epoch": 0.7418959696453646,
+      "grad_norm": 0.036061204969882965,
+      "kl": 0.0057599544525146484,
+      "learning_rate": 4.765474807641951e-06,
+      "loss": 0.0022,
+      "step": 488
+    },
+    {
+      "clip_ratio": 0.00030389728723889675,
+      "epoch": 0.7438080726599146,
+      "grad_norm": 0.03613469749689102,
+      "kl": 0.005738019943237305,
+      "learning_rate": 4.762814946239468e-06,
+      "loss": 0.0022,
+      "step": 489
+    },
+    {
+      "clip_ratio": 0.00033159017920070255,
+      "epoch": 0.7457201756744645,
+      "grad_norm": 0.0360892117023468,
+      "kl": 0.00572967529296875,
+      "learning_rate": 4.760140837296542e-06,
+      "loss": 0.0021,
+      "step": 490
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 550.3144750595093,
+      "epoch": 0.7476322786890144,
+      "grad_norm": 0.03636733815073967,
+      "kl": 0.004332542419433594,
+      "learning_rate": 4.757452497650377e-06,
+      "loss": 0.0072,
+      "num_tokens": 285770403.0,
+      "reward": 0.055803573777666315,
+      "reward_std": 0.07161362667102367,
+      "rewards/pure_accuracy_reward_math": 0.05580357278813608,
+      "step": 491
+    },
+    {
+      "clip_ratio": 0.00027637260956225873,
+      "epoch": 0.7495443817035643,
+      "grad_norm": 0.035727791488170624,
+      "kl": 0.004361629486083984,
+      "learning_rate": 4.754749944227777e-06,
+      "loss": 0.0072,
+      "step": 492
+    },
+    {
+      "clip_ratio": 0.0002587454115428045,
+      "epoch": 0.7514564847181142,
+      "grad_norm": 0.03512200713157654,
+      "kl": 0.0043697357177734375,
+      "learning_rate": 4.752033194045044e-06,
+      "loss": 0.0072,
+      "step": 493
+    },
+    {
+      "clip_ratio": 0.00025780797875540884,
+      "epoch": 0.7533685877326641,
+      "grad_norm": 0.033817108720541,
+      "kl": 0.0043947696685791016,
+      "learning_rate": 4.7493022642078654e-06,
+      "loss": 0.0071,
+      "step": 494
+    },
+    {
+      "clip_ratio": 0.00029674232627030506,
+      "epoch": 0.755280690747214,
+      "grad_norm": 0.03317062556743622,
+      "kl": 0.004454851150512695,
+      "learning_rate": 4.746557171911211e-06,
+      "loss": 0.0071,
+      "step": 495
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 543.0692186355591,
+      "epoch": 0.757192793761764,
+      "grad_norm": 0.05020016431808472,
+      "kl": 0.0062062740325927734,
+      "learning_rate": 4.7437979344392236e-06,
+      "loss": 0.0059,
+      "num_tokens": 289451319.0,
+      "reward": 0.0616629492433276,
+      "reward_std": 0.07071027776692063,
+      "rewards/pure_accuracy_reward_math": 0.06166294778813608,
+      "step": 496
+    },
+    {
+      "clip_ratio": 0.00028460744590574905,
+      "epoch": 0.7591048967763139,
+      "grad_norm": 0.03948064520955086,
+      "kl": 0.0061266422271728516,
+      "learning_rate": 4.741024569165105e-06,
+      "loss": 0.0059,
+      "step": 497
+    },
+    {
+      "clip_ratio": 0.0002803450769306437,
+      "epoch": 0.7610169997908637,
+      "grad_norm": 0.03621263429522514,
+      "kl": 0.00614476203918457,
+      "learning_rate": 4.7382370935510165e-06,
+      "loss": 0.0059,
+      "step": 498
+    },
+    {
+      "clip_ratio": 0.0003022695020717947,
+      "epoch": 0.7629291028054136,
+      "grad_norm": 0.037622902542352676,
+      "kl": 0.006256580352783203,
+      "learning_rate": 4.73543552514796e-06,
+      "loss": 0.0058,
+      "step": 499
+    },
+    {
+      "clip_ratio": 0.00030265802058693225,
+      "epoch": 0.7648412058199635,
+      "grad_norm": 0.03813454508781433,
+      "kl": 0.006264209747314453,
+      "learning_rate": 4.732619881595672e-06,
+      "loss": 0.0057,
+      "step": 500
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 543.3538174629211,
+      "epoch": 0.7667533088345134,
+      "grad_norm": 0.07500133663415909,
+      "kl": 0.005916118621826172,
+      "learning_rate": 4.729790180622512e-06,
+      "loss": 0.0072,
+      "num_tokens": 293127839.0,
+      "reward": 0.0513392879802268,
+      "reward_std": 0.06792009877972305,
+      "rewards/pure_accuracy_reward_math": 0.051339287048904225,
+      "step": 501
+    },
+    {
+      "clip_ratio": 0.0002826226679530919,
+      "epoch": 0.7686654118490633,
+      "grad_norm": 0.03498294949531555,
+      "kl": 0.0057086944580078125,
+      "learning_rate": 4.726946440045348e-06,
+      "loss": 0.0072,
+      "step": 502
+    },
+    {
+      "clip_ratio": 0.000292762170943206,
+      "epoch": 0.7705775148636133,
+      "grad_norm": 0.0338723324239254,
+      "kl": 0.0054700374603271484,
+      "learning_rate": 4.7240886777694495e-06,
+      "loss": 0.0071,
+      "step": 503
+    },
+    {
+      "clip_ratio": 0.00031638332251304746,
+      "epoch": 0.7724896178781632,
+      "grad_norm": 0.03360189124941826,
+      "kl": 0.00526118278503418,
+      "learning_rate": 4.721216911788371e-06,
+      "loss": 0.0071,
+      "step": 504
+    },
+    {
+      "clip_ratio": 0.0003445502737804418,
+      "epoch": 0.7744017208927131,
+      "grad_norm": 0.03321666270494461,
+      "kl": 0.005108356475830078,
+      "learning_rate": 4.71833116018384e-06,
+      "loss": 0.007,
+      "step": 505
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 525.3041553497314,
+      "epoch": 0.776313823907263,
+      "grad_norm": 0.039082907140254974,
+      "kl": 0.0048329830169677734,
+      "learning_rate": 4.715431441125639e-06,
+      "loss": 0.0072,
+      "num_tokens": 296745449.0,
+      "reward": 0.056640627823071554,
+      "reward_std": 0.066464910923969,
+      "rewards/pure_accuracy_reward_math": 0.05664062636788003,
+      "step": 506
+    },
+    {
+      "clip_ratio": 0.0002697859709428485,
+      "epoch": 0.7782259269218129,
+      "grad_norm": 0.036139652132987976,
+      "kl": 0.0048868656158447266,
+      "learning_rate": 4.712517772871503e-06,
+      "loss": 0.0072,
+      "step": 507
+    },
+    {
+      "clip_ratio": 0.0002602223319172481,
+      "epoch": 0.7801380299363628,
+      "grad_norm": 0.03708622604608536,
+      "kl": 0.004920244216918945,
+      "learning_rate": 4.709590173766988e-06,
+      "loss": 0.0072,
+      "step": 508
+    },
+    {
+      "clip_ratio": 0.00030563702995323183,
+      "epoch": 0.7820501329509127,
+      "grad_norm": 0.03873802721500397,
+      "kl": 0.004922151565551758,
+      "learning_rate": 4.706648662245368e-06,
+      "loss": 0.0071,
+      "step": 509
+    },
+    {
+      "clip_ratio": 0.00027421732914945096,
+      "epoch": 0.7839622359654627,
+      "grad_norm": 0.0337008535861969,
+      "kl": 0.004686117172241211,
+      "learning_rate": 4.703693256827515e-06,
+      "loss": 0.0071,
+      "step": 510
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 544.4595675468445,
+      "epoch": 0.7858743389800126,
+      "grad_norm": 0.032148003578186035,
+      "kl": 0.004284381866455078,
+      "learning_rate": 4.700723976121782e-06,
+      "loss": 0.0079,
+      "num_tokens": 300427724.0,
+      "reward": 0.05998884211294353,
+      "reward_std": 0.06822534691309556,
+      "rewards/pure_accuracy_reward_math": 0.059988840483129025,
+      "step": 511
+    },
+    {
+      "clip_ratio": 0.00023266997004611767,
+      "epoch": 0.7877864419945625,
+      "grad_norm": 0.03213036060333252,
+      "kl": 0.004235267639160156,
+      "learning_rate": 4.697740838823884e-06,
+      "loss": 0.0079,
+      "step": 512
+    },
+    {
+      "clip_ratio": 0.00023210655439243055,
+      "epoch": 0.7896985450091124,
+      "grad_norm": 0.03171762451529503,
+      "kl": 0.004268169403076172,
+      "learning_rate": 4.694743863716784e-06,
+      "loss": 0.0078,
+      "step": 513
+    },
+    {
+      "clip_ratio": 0.0002433597992990144,
+      "epoch": 0.7916106480236623,
+      "grad_norm": 0.030378276482224464,
+      "kl": 0.004282712936401367,
+      "learning_rate": 4.691733069670575e-06,
+      "loss": 0.0078,
+      "step": 514
+    },
+    {
+      "clip_ratio": 0.00024098603546462982,
+      "epoch": 0.7935227510382122,
+      "grad_norm": 0.030135801061987877,
+      "kl": 0.004299640655517578,
+      "learning_rate": 4.688708475642356e-06,
+      "loss": 0.0078,
+      "step": 515
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 520.5064425468445,
+      "epoch": 0.795434854052762,
+      "grad_norm": 0.03758488968014717,
+      "kl": 0.004748344421386719,
+      "learning_rate": 4.685670100676117e-06,
+      "loss": 0.0056,
+      "num_tokens": 304030899.0,
+      "reward": 0.059151788300368935,
+      "reward_std": 0.06615966308163479,
+      "rewards/pure_accuracy_reward_math": 0.05915178684517741,
+      "step": 516
+    },
+    {
+      "clip_ratio": 0.00024922658519699326,
+      "epoch": 0.797346957067312,
+      "grad_norm": 0.03667794167995453,
+      "kl": 0.004762172698974609,
+      "learning_rate": 4.6826179639026185e-06,
+      "loss": 0.0056,
+      "step": 517
+    },
+    {
+      "clip_ratio": 0.00024439046995894387,
+      "epoch": 0.7992590600818619,
+      "grad_norm": 0.03566230833530426,
+      "kl": 0.004770755767822266,
+      "learning_rate": 4.679552084539271e-06,
+      "loss": 0.0055,
+      "step": 518
+    },
+    {
+      "clip_ratio": 0.00025443012202686077,
+      "epoch": 0.8011711630964118,
+      "grad_norm": 0.03555983304977417,
+      "kl": 0.004889011383056641,
+      "learning_rate": 4.676472481890012e-06,
+      "loss": 0.0055,
+      "step": 519
+    },
+    {
+      "clip_ratio": 0.0002555244412860702,
+      "epoch": 0.8030832661109617,
+      "grad_norm": 0.03477266803383827,
+      "kl": 0.004910707473754883,
+      "learning_rate": 4.673379175345187e-06,
+      "loss": 0.0054,
+      "step": 520
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 530.2039861679077,
+      "epoch": 0.8049953691255116,
+      "grad_norm": 0.03352927044034004,
+      "kl": 0.004728078842163086,
+      "learning_rate": 4.670272184381426e-06,
+      "loss": 0.0064,
+      "num_tokens": 307666714.0,
+      "reward": 0.05106027063447982,
+      "reward_std": 0.061781705473549664,
+      "rewards/pure_accuracy_reward_math": 0.05106026888824999,
+      "step": 521
+    },
+    {
+      "clip_ratio": 0.00022480493561261028,
+      "epoch": 0.8069074721400615,
+      "grad_norm": 0.0328591950237751,
+      "kl": 0.004677772521972656,
+      "learning_rate": 4.667151528561522e-06,
+      "loss": 0.0064,
+      "step": 522
+    },
+    {
+      "clip_ratio": 0.0002208993353463029,
+      "epoch": 0.8088195751546114,
+      "grad_norm": 0.0323566235601902,
+      "kl": 0.004681825637817383,
+      "learning_rate": 4.664017227534308e-06,
+      "loss": 0.0064,
+      "step": 523
+    },
+    {
+      "clip_ratio": 0.0002261604544742113,
+      "epoch": 0.8107316781691614,
+      "grad_norm": 0.03178941085934639,
+      "kl": 0.004633665084838867,
+      "learning_rate": 4.6608693010345285e-06,
+      "loss": 0.0063,
+      "step": 524
+    },
+    {
+      "clip_ratio": 0.0002347389614101303,
+      "epoch": 0.8126437811837113,
+      "grad_norm": 0.03144075721502304,
+      "kl": 0.004633426666259766,
+      "learning_rate": 4.657707768882723e-06,
+      "loss": 0.0063,
+      "step": 525
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 534.2078919410706,
+      "epoch": 0.8145558841982612,
+      "grad_norm": 36658.046875,
+      "kl": 696.0046517848969,
+      "learning_rate": 4.6545326509850965e-06,
+      "loss": 27.8583,
+      "num_tokens": 311314491.0,
+      "reward": 0.05747768114088103,
+      "reward_std": 0.06521624798187986,
+      "rewards/pure_accuracy_reward_math": 0.057477680093143135,
+      "step": 526
+    },
+    {
+      "clip_ratio": 0.0006453408203128674,
+      "epoch": 0.8164679872128111,
+      "grad_norm": 3234.42724609375,
+      "kl": 42.254658937454224,
+      "learning_rate": 4.651343967333394e-06,
+      "loss": 1.7021,
+      "step": 527
+    },
+    {
+      "clip_ratio": 0.0006781478184620937,
+      "epoch": 0.818380090227361,
+      "grad_norm": 430.01318359375,
+      "kl": 0.21270966529846191,
+      "learning_rate": 4.648141738004776e-06,
+      "loss": 0.256,
+      "step": 528
+    },
+    {
+      "clip_ratio": 0.0006916913723671314,
+      "epoch": 0.8202921932419109,
+      "grad_norm": 457.1385803222656,
+      "kl": 0.1541590690612793,
+      "learning_rate": 4.644925983161691e-06,
+      "loss": 0.3118,
+      "step": 529
+    },
+    {
+      "clip_ratio": 0.0007114471513887111,
+      "epoch": 0.8222042962564609,
+      "grad_norm": 61.02793884277344,
+      "kl": 1.6688117980957031,
+      "learning_rate": 4.641696723051753e-06,
+      "loss": 0.1081,
+      "step": 530
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 544.7664904594421,
+      "epoch": 0.8241163992710108,
+      "grad_norm": 0.03665775805711746,
+      "kl": 0.0046710968017578125,
+      "learning_rate": 4.638453978007606e-06,
+      "loss": 0.0033,
+      "num_tokens": 315000186.0,
+      "reward": 0.05691964577999897,
+      "reward_std": 0.06766731111565605,
+      "rewards/pure_accuracy_reward_math": 0.056919643975561485,
+      "step": 531
+    },
+    {
+      "clip_ratio": 0.000247030089042255,
+      "epoch": 0.8260285022855607,
+      "grad_norm": 0.03543345257639885,
+      "kl": 0.004717826843261719,
+      "learning_rate": 4.635197768446799e-06,
+      "loss": 0.0033,
+      "step": 532
+    },
+    {
+      "clip_ratio": 0.00024415442914005325,
+      "epoch": 0.8279406053001105,
+      "grad_norm": 0.034531209617853165,
+      "kl": 0.004744768142700195,
+      "learning_rate": 4.631928114871667e-06,
+      "loss": 0.0032,
+      "step": 533
+    },
+    {
+      "clip_ratio": 0.0002580326566032909,
+      "epoch": 0.8298527083146604,
+      "grad_norm": 0.03323632851243019,
+      "kl": 0.004830360412597656,
+      "learning_rate": 4.628645037869183e-06,
+      "loss": 0.0032,
+      "step": 534
+    },
+    {
+      "clip_ratio": 0.00029695888167680096,
+      "epoch": 0.8317648113292103,
+      "grad_norm": 0.03470376506447792,
+      "kl": 0.0048847198486328125,
+      "learning_rate": 4.625348558110846e-06,
+      "loss": 0.0031,
+      "step": 535
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 543.506443977356,
+      "epoch": 0.8336769143437602,
+      "grad_norm": 33.48581314086914,
+      "kl": 0.7041072845458984,
+      "learning_rate": 4.6220386963525425e-06,
+      "loss": 0.0349,
+      "num_tokens": 318683697.0,
+      "reward": 0.06333705675206147,
+      "reward_std": 0.0759915838134475,
+      "rewards/pure_accuracy_reward_math": 0.063337054773001,
+      "step": 536
+    },
+    {
+      "clip_ratio": 0.00030500417074108555,
+      "epoch": 0.8355890173583101,
+      "grad_norm": 5.391356468200684,
+      "kl": 0.12163639068603516,
+      "learning_rate": 4.6187154734344144e-06,
+      "loss": 0.0115,
+      "step": 537
+    },
+    {
+      "clip_ratio": 0.0003094891900445873,
+      "epoch": 0.8375011203728601,
+      "grad_norm": 0.24674992263317108,
+      "kl": 0.011260032653808594,
+      "learning_rate": 4.615378910280735e-06,
+      "loss": 0.007,
+      "step": 538
+    },
+    {
+      "clip_ratio": 0.0003443351265559613,
+      "epoch": 0.83941322338741,
+      "grad_norm": 0.040490083396434784,
+      "kl": 0.0068547725677490234,
+      "learning_rate": 4.61202902789977e-06,
+      "loss": 0.0068,
+      "step": 539
+    },
+    {
+      "clip_ratio": 0.0003249310258297555,
+      "epoch": 0.8413253264019599,
+      "grad_norm": 0.037383101880550385,
+      "kl": 0.006977081298828125,
+      "learning_rate": 4.608665847383646e-06,
+      "loss": 0.0068,
+      "step": 540
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 528.8432207107544,
+      "epoch": 0.8432374294165098,
+      "grad_norm": 0.0408562608063221,
+      "kl": 0.005080223083496094,
+      "learning_rate": 4.6052893899082244e-06,
+      "loss": 0.0092,
+      "num_tokens": 322311955.0,
+      "reward": 0.07505580695578828,
+      "reward_std": 0.08672685426427051,
+      "rewards/pure_accuracy_reward_math": 0.07505580462748185,
+      "step": 541
+    },
+    {
+      "clip_ratio": 0.0003254984287082152,
+      "epoch": 0.8451495324310597,
+      "grad_norm": 0.03888032212853432,
+      "kl": 0.005081653594970703,
+      "learning_rate": 4.60189967673296e-06,
+      "loss": 0.0091,
+      "step": 542
+    },
+    {
+      "clip_ratio": 0.00032150591908930437,
+      "epoch": 0.8470616354456096,
+      "grad_norm": 0.03769301995635033,
+      "kl": 0.005054950714111328,
+      "learning_rate": 4.598496729200772e-06,
+      "loss": 0.0091,
+      "step": 543
+    },
+    {
+      "clip_ratio": 0.0003807161001532222,
+      "epoch": 0.8489737384601596,
+      "grad_norm": 0.03671475872397423,
+      "kl": 0.005011320114135742,
+      "learning_rate": 4.595080568737907e-06,
+      "loss": 0.009,
+      "step": 544
+    },
+    {
+      "clip_ratio": 0.00040073374452731514,
+      "epoch": 0.8508858414747095,
+      "grad_norm": 0.03656642884016037,
+      "kl": 0.004985332489013672,
+      "learning_rate": 4.591651216853808e-06,
+      "loss": 0.009,
+      "step": 545
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 521.1850123405457,
+      "epoch": 0.8527979444892594,
+      "grad_norm": 0.04072614386677742,
+      "kl": 0.005250692367553711,
+      "learning_rate": 4.588208695140972e-06,
+      "loss": 0.008,
+      "num_tokens": 325915646.0,
+      "reward": 0.06891741379513405,
+      "reward_std": 0.07457646180409938,
+      "rewards/pure_accuracy_reward_math": 0.0689174119324889,
+      "step": 546
+    },
+    {
+      "clip_ratio": 0.0002774237623270892,
+      "epoch": 0.8547100475038093,
+      "grad_norm": 0.03891909867525101,
+      "kl": 0.005267620086669922,
+      "learning_rate": 4.5847530252748206e-06,
+      "loss": 0.008,
+      "step": 547
+    },
+    {
+      "clip_ratio": 0.0003099276901821213,
+      "epoch": 0.8566221505183592,
+      "grad_norm": 0.03776893764734268,
+      "kl": 0.005312681198120117,
+      "learning_rate": 4.581284229013561e-06,
+      "loss": 0.008,
+      "step": 548
+    },
+    {
+      "clip_ratio": 0.0003329096458060121,
+      "epoch": 0.8585342535329091,
+      "grad_norm": 0.03786613792181015,
+      "kl": 0.0053446292877197266,
+      "learning_rate": 4.57780232819805e-06,
+      "loss": 0.0079,
+      "step": 549
+    },
+    {
+      "clip_ratio": 0.0003465502328481307,
+      "epoch": 0.860446356547459,
+      "grad_norm": 0.03782954812049866,
+      "kl": 0.00535893440246582,
+      "learning_rate": 4.574307344751654e-06,
+      "loss": 0.0079,
+      "step": 550
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 512.2042679786682,
+      "epoch": 0.8623584595620088,
+      "grad_norm": 0.04082540422677994,
+      "kl": 0.005150318145751953,
+      "learning_rate": 4.570799300680112e-06,
+      "loss": 0.0061,
+      "num_tokens": 329486142.0,
+      "reward": 0.06696428914438002,
+      "reward_std": 0.07865536911413074,
+      "rewards/pure_accuracy_reward_math": 0.06696428681607358,
+      "step": 551
+    },
+    {
+      "clip_ratio": 0.0002784457984148503,
+      "epoch": 0.8642705625765588,
+      "grad_norm": 0.039590511471033096,
+      "kl": 0.005137205123901367,
+      "learning_rate": 4.5672782180714005e-06,
+      "loss": 0.0061,
+      "step": 552
+    },
+    {
+      "clip_ratio": 0.0003210699376268167,
+      "epoch": 0.8661826655911087,
+      "grad_norm": 0.03983275964856148,
+      "kl": 0.005161285400390625,
+      "learning_rate": 4.56374411909559e-06,
+      "loss": 0.0061,
+      "step": 553
+    },
+    {
+      "clip_ratio": 0.00032905748116718314,
+      "epoch": 0.8680947686056586,
+      "grad_norm": 0.03924131765961647,
+      "kl": 0.0051097869873046875,
+      "learning_rate": 4.560197026004706e-06,
+      "loss": 0.006,
+      "step": 554
+    },
+    {
+      "clip_ratio": 0.00036174511694753164,
+      "epoch": 0.8700068716202085,
+      "grad_norm": 0.03864859789609909,
+      "kl": 0.0051233768463134766,
+      "learning_rate": 4.556636961132591e-06,
+      "loss": 0.0059,
+      "step": 555
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 524.8490724563599,
+      "epoch": 0.8719189746347584,
+      "grad_norm": 0.03831901028752327,
+      "kl": 0.005173921585083008,
+      "learning_rate": 4.553063946894765e-06,
+      "loss": 0.0089,
+      "num_tokens": 333101169.0,
+      "reward": 0.05970982427243143,
+      "reward_std": 0.06925509037682787,
+      "rewards/pure_accuracy_reward_math": 0.05970982293365523,
+      "step": 556
+    },
+    {
+      "clip_ratio": 0.00024058804717697058,
+      "epoch": 0.8738310776493083,
+      "grad_norm": 0.03815346583724022,
+      "kl": 0.005152463912963867,
+      "learning_rate": 4.549478005788276e-06,
+      "loss": 0.0088,
+      "step": 557
+    },
+    {
+      "clip_ratio": 0.0002689754076072859,
+      "epoch": 0.8757431806638583,
+      "grad_norm": 0.03663227707147598,
+      "kl": 0.00511932373046875,
+      "learning_rate": 4.5458791603915695e-06,
+      "loss": 0.0088,
+      "step": 558
+    },
+    {
+      "clip_ratio": 0.0002769273295371022,
+      "epoch": 0.8776552836784082,
+      "grad_norm": 0.03534897044301033,
+      "kl": 0.005173921585083008,
+      "learning_rate": 4.5422674333643415e-06,
+      "loss": 0.0087,
+      "step": 559
+    },
+    {
+      "clip_ratio": 0.0003186316080245888,
+      "epoch": 0.8795673866929581,
+      "grad_norm": 0.03454131633043289,
+      "kl": 0.005182981491088867,
+      "learning_rate": 4.538642847447393e-06,
+      "loss": 0.0087,
+      "step": 560
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 499.49025869369507,
+      "epoch": 0.881479489707508,
+      "grad_norm": 0.03870520368218422,
+      "kl": 0.005303621292114258,
+      "learning_rate": 4.53500542546249e-06,
+      "loss": 0.0063,
+      "num_tokens": 336621146.0,
+      "reward": 0.06724330663564615,
+      "reward_std": 0.07539348350837827,
+      "rewards/pure_accuracy_reward_math": 0.0672433050640393,
+      "step": 561
+    },
+    {
+      "clip_ratio": 0.0002930208739826412,
+      "epoch": 0.8833915927220579,
+      "grad_norm": 0.03670111671090126,
+      "kl": 0.005410432815551758,
+      "learning_rate": 4.5313551903122195e-06,
+      "loss": 0.0063,
+      "step": 562
+    },
+    {
+      "clip_ratio": 0.00033625421181682214,
+      "epoch": 0.8853036957366078,
+      "grad_norm": 0.03873737156391144,
+      "kl": 0.0054399967193603516,
+      "learning_rate": 4.5276921649798475e-06,
+      "loss": 0.0063,
+      "step": 563
+    },
+    {
+      "clip_ratio": 0.0003349392310383337,
+      "epoch": 0.8872157987511577,
+      "grad_norm": 0.038494061678647995,
+      "kl": 0.0053806304931640625,
+      "learning_rate": 4.524016372529168e-06,
+      "loss": 0.0062,
+      "step": 564
+    },
+    {
+      "clip_ratio": 0.00031196477385719845,
+      "epoch": 0.8891279017657077,
+      "grad_norm": 0.03559175133705139,
+      "kl": 0.005260467529296875,
+      "learning_rate": 4.520327836104363e-06,
+      "loss": 0.0061,
+      "step": 565
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 521.2452793121338,
+      "epoch": 0.8910400047802576,
+      "grad_norm": 0.033526018261909485,
+      "kl": 0.0050280094146728516,
+      "learning_rate": 4.516626578929857e-06,
+      "loss": 0.0083,
+      "num_tokens": 340217537.0,
+      "reward": 0.05970982470898889,
+      "reward_std": 0.06920882686972618,
+      "rewards/pure_accuracy_reward_math": 0.059709822555305436,
+      "step": 566
+    },
+    {
+      "clip_ratio": 0.0002854210310374583,
+      "epoch": 0.8929521077948075,
+      "grad_norm": 0.03320698440074921,
+      "kl": 0.00494694709777832,
+      "learning_rate": 4.512912624310166e-06,
+      "loss": 0.0083,
+      "step": 567
+    },
+    {
+      "clip_ratio": 0.00028784406134718665,
+      "epoch": 0.8948642108093574,
+      "grad_norm": 0.0334990993142128,
+      "kl": 0.004927158355712891,
+      "learning_rate": 4.509185995629758e-06,
+      "loss": 0.0083,
+      "step": 568
+    },
+    {
+      "clip_ratio": 0.00028731861192454744,
+      "epoch": 0.8967763138239072,
+      "grad_norm": 0.032721105962991714,
+      "kl": 0.004916667938232422,
+      "learning_rate": 4.505446716352898e-06,
+      "loss": 0.0083,
+      "step": 569
+    },
+    {
+      "clip_ratio": 0.0003211342911981774,
+      "epoch": 0.8986884168384571,
+      "grad_norm": 0.031691305339336395,
+      "kl": 0.0050427913665771484,
+      "learning_rate": 4.501694810023506e-06,
+      "loss": 0.0082,
+      "step": 570
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 513.3175444602966,
+      "epoch": 0.900600519853007,
+      "grad_norm": 0.039067283272743225,
+      "kl": 0.0051767826080322266,
+      "learning_rate": 4.497930300265005e-06,
+      "loss": 0.0062,
+      "num_tokens": 343792675.0,
+      "reward": 0.07254464668221772,
+      "reward_std": 0.07260330504504964,
+      "rewards/pure_accuracy_reward_math": 0.07254464394645765,
+      "step": 571
+    },
+    {
+      "clip_ratio": 0.000284439854624452,
+      "epoch": 0.902512622867557,
+      "grad_norm": 0.03746037185192108,
+      "kl": 0.0051670074462890625,
+      "learning_rate": 4.494153210780177e-06,
+      "loss": 0.0062,
+      "step": 572
+    },
+    {
+      "clip_ratio": 0.0002894837679718876,
+      "epoch": 0.9044247258821069,
+      "grad_norm": 0.0363248772919178,
+      "kl": 0.0051119327545166016,
+      "learning_rate": 4.490363565351007e-06,
+      "loss": 0.0061,
+      "step": 573
+    },
+    {
+      "clip_ratio": 0.00029392389137683494,
+      "epoch": 0.9063368288966568,
+      "grad_norm": 0.03513769805431366,
+      "kl": 0.005059242248535156,
+      "learning_rate": 4.486561387838539e-06,
+      "loss": 0.0061,
+      "step": 574
+    },
+    {
+      "clip_ratio": 0.0003296555175325011,
+      "epoch": 0.9082489319112067,
+      "grad_norm": 0.03513012453913689,
+      "kl": 0.005059242248535156,
+      "learning_rate": 4.482746702182725e-06,
+      "loss": 0.006,
+      "step": 575
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 520.8926033973694,
+      "epoch": 0.9101610349257566,
+      "grad_norm": 0.049145400524139404,
+      "kl": 0.011604547500610352,
+      "learning_rate": 4.478919532402271e-06,
+      "loss": 0.0046,
+      "num_tokens": 347395370.0,
+      "reward": 0.07170759254950099,
+      "reward_std": 0.0817445982247591,
+      "rewards/pure_accuracy_reward_math": 0.07170759091968648,
+      "step": 576
+    },
+    {
+      "clip_ratio": 0.00030760892423131736,
+      "epoch": 0.9120731379403065,
+      "grad_norm": 0.04954507574439049,
+      "kl": 0.011447906494140625,
+      "learning_rate": 4.4750799025944866e-06,
+      "loss": 0.0045,
+      "step": 577
+    },
+    {
+      "clip_ratio": 0.0003202956161487691,
+      "epoch": 0.9139852409548564,
+      "grad_norm": 0.04883984476327896,
+      "kl": 0.010998249053955078,
+      "learning_rate": 4.471227836935139e-06,
+      "loss": 0.0045,
+      "step": 578
+    },
+    {
+      "clip_ratio": 0.0003312723312660637,
+      "epoch": 0.9158973439694064,
+      "grad_norm": 0.049066606909036636,
+      "kl": 0.010381698608398438,
+      "learning_rate": 4.467363359678291e-06,
+      "loss": 0.0044,
+      "step": 579
+    },
+    {
+      "clip_ratio": 0.00041312941800697445,
+      "epoch": 0.9178094469839563,
+      "grad_norm": 0.053418997675180435,
+      "kl": 0.009602546691894531,
+      "learning_rate": 4.463486495156157e-06,
+      "loss": 0.0043,
+      "step": 580
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 539.5678267478943,
+      "epoch": 0.9197215499985062,
+      "grad_norm": 0.03747523948550224,
+      "kl": 0.004802227020263672,
+      "learning_rate": 4.459597267778945e-06,
+      "loss": 0.0041,
+      "num_tokens": 351065793.0,
+      "reward": 0.062220984895247966,
+      "reward_std": 0.07298868335783482,
+      "rewards/pure_accuracy_reward_math": 0.0622209832072258,
+      "step": 581
+    },
+    {
+      "clip_ratio": 0.0002890200073579763,
+      "epoch": 0.9216336530130561,
+      "grad_norm": 0.03557584062218666,
+      "kl": 0.004851579666137695,
+      "learning_rate": 4.455695702034705e-06,
+      "loss": 0.0041,
+      "step": 582
+    },
+    {
+      "clip_ratio": 0.00031045296407228307,
+      "epoch": 0.923545756027606,
+      "grad_norm": 0.034734807908535004,
+      "kl": 0.004895925521850586,
+      "learning_rate": 4.451781822489173e-06,
+      "loss": 0.0041,
+      "step": 583
+    },
+    {
+      "clip_ratio": 0.00032734786560695284,
+      "epoch": 0.9254578590421559,
+      "grad_norm": 0.03634972497820854,
+      "kl": 0.004976511001586914,
+      "learning_rate": 4.447855653785617e-06,
+      "loss": 0.004,
+      "step": 584
+    },
+    {
+      "clip_ratio": 0.00036698238614008005,
+      "epoch": 0.9273699620567059,
+      "grad_norm": 0.036671172827482224,
+      "kl": 0.004954338073730469,
+      "learning_rate": 4.4439172206446845e-06,
+      "loss": 0.0039,
+      "step": 585
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 538.6261405944824,
+      "epoch": 0.9292820650712557,
+      "grad_norm": 0.03805253654718399,
+      "kl": 0.005060434341430664,
+      "learning_rate": 4.439966547864243e-06,
+      "loss": 0.0061,
+      "num_tokens": 354732057.0,
+      "reward": 0.06194196725846268,
+      "reward_std": 0.07766569184605032,
+      "rewards/pure_accuracy_reward_math": 0.06194196580327116,
+      "step": 586
+    },
+    {
+      "clip_ratio": 0.0002944122598478316,
+      "epoch": 0.9311941680858056,
+      "grad_norm": 0.03603314608335495,
+      "kl": 0.005051136016845703,
+      "learning_rate": 4.436003660319224e-06,
+      "loss": 0.0061,
+      "step": 587
+    },
+    {
+      "clip_ratio": 0.0003042620955966413,
+      "epoch": 0.9331062711003555,
+      "grad_norm": 0.035505130887031555,
+      "kl": 0.005032539367675781,
+      "learning_rate": 4.432028582961472e-06,
+      "loss": 0.006,
+      "step": 588
+    },
+    {
+      "clip_ratio": 0.00032173160303727855,
+      "epoch": 0.9350183741149054,
+      "grad_norm": 0.03633759915828705,
+      "kl": 0.00509190559387207,
+      "learning_rate": 4.428041340819579e-06,
+      "loss": 0.006,
+      "step": 589
+    },
+    {
+      "clip_ratio": 0.00038377046530513326,
+      "epoch": 0.9369304771294553,
+      "grad_norm": 0.03761395812034607,
+      "kl": 0.005148649215698242,
+      "learning_rate": 4.424041958998732e-06,
+      "loss": 0.0059,
+      "step": 590
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 540.8948354721069,
+      "epoch": 0.9388425801440052,
+      "grad_norm": 0.04139011353254318,
+      "kl": 0.005031585693359375,
+      "learning_rate": 4.420030462680554e-06,
+      "loss": 0.007,
+      "num_tokens": 358409840.0,
+      "reward": 0.0714285749127157,
+      "reward_std": 0.07565246830927208,
+      "rewards/pure_accuracy_reward_math": 0.07142857275903225,
+      "step": 591
+    },
+    {
+      "clip_ratio": 0.0002982392526291733,
+      "epoch": 0.9407546831585551,
+      "grad_norm": 0.03948375955224037,
+      "kl": 0.005082845687866211,
+      "learning_rate": 4.416006877122948e-06,
+      "loss": 0.007,
+      "step": 592
+    },
+    {
+      "clip_ratio": 0.00033647330587882607,
+      "epoch": 0.9426667861731051,
+      "grad_norm": 0.041717879474163055,
+      "kl": 0.005113363265991211,
+      "learning_rate": 4.411971227659933e-06,
+      "loss": 0.0069,
+      "step": 593
+    },
+    {
+      "clip_ratio": 0.00036752876485479646,
+      "epoch": 0.944578889187655,
+      "grad_norm": 0.04109462723135948,
+      "kl": 0.005068063735961914,
+      "learning_rate": 4.407923539701486e-06,
+      "loss": 0.0069,
+      "step": 594
+    },
+    {
+      "clip_ratio": 0.0003528254699176614,
+      "epoch": 0.9464909922022049,
+      "grad_norm": 0.03620041161775589,
+      "kl": 0.0049245357513427734,
+      "learning_rate": 4.403863838733386e-06,
+      "loss": 0.0068,
+      "step": 595
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 545.2444491386414,
+      "epoch": 0.9484030952167548,
+      "grad_norm": 42.05046463012695,
+      "kl": 0.3311493396759033,
+      "learning_rate": 4.399792150317048e-06,
+      "loss": 0.0203,
+      "num_tokens": 362096328.0,
+      "reward": 0.06026786071015522,
+      "reward_std": 0.07324766798410565,
+      "rewards/pure_accuracy_reward_math": 0.06026785832364112,
+      "step": 596
+    },
+    {
+      "clip_ratio": 0.0003009684866128737,
+      "epoch": 0.9503151982313047,
+      "grad_norm": 0.575372040271759,
+      "kl": 0.01551508903503418,
+      "learning_rate": 4.395708500089366e-06,
+      "loss": 0.0076,
+      "step": 597
+    },
+    {
+      "clip_ratio": 0.0003299758830053179,
+      "epoch": 0.9522273012458546,
+      "grad_norm": 0.052088066935539246,
+      "kl": 0.01082468032836914,
+      "learning_rate": 4.391612913762549e-06,
+      "loss": 0.0074,
+      "step": 598
+    },
+    {
+      "clip_ratio": 0.00032988658261956516,
+      "epoch": 0.9541394042604046,
+      "grad_norm": 0.046673182398080826,
+      "kl": 0.011472225189208984,
+      "learning_rate": 4.38750541712396e-06,
+      "loss": 0.0074,
+      "step": 599
+    },
+    {
+      "clip_ratio": 0.00031585949000145774,
+      "epoch": 0.9560515072749545,
+      "grad_norm": 0.04350757598876953,
+      "kl": 0.011662006378173828,
+      "learning_rate": 4.383386036035956e-06,
+      "loss": 0.0074,
+      "step": 600
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 539.0309958457947,
+      "epoch": 0.9579636102895044,
+      "grad_norm": 0.04193362593650818,
+      "kl": 0.005011081695556641,
+      "learning_rate": 4.379254796435719e-06,
+      "loss": 0.0085,
+      "num_tokens": 365761119.0,
+      "reward": 0.06696428923169151,
+      "reward_std": 0.08311965479515493,
+      "rewards/pure_accuracy_reward_math": 0.06696428667055443,
+      "step": 601
+    },
+    {
+      "clip_ratio": 0.0003076634293392999,
+      "epoch": 0.9598757133040543,
+      "grad_norm": 0.04204736277461052,
+      "kl": 0.005095720291137695,
+      "learning_rate": 4.375111724335102e-06,
+      "loss": 0.0085,
+      "step": 602
+    },
+    {
+      "clip_ratio": 0.0002991189727481469,
+      "epoch": 0.9617878163186042,
+      "grad_norm": 0.041649866849184036,
+      "kl": 0.00509333610534668,
+      "learning_rate": 4.370956845820455e-06,
+      "loss": 0.0085,
+      "step": 603
+    },
+    {
+      "clip_ratio": 0.0003053998929090085,
+      "epoch": 0.963699919333154,
+      "grad_norm": 0.03969484567642212,
+      "kl": 0.005100727081298828,
+      "learning_rate": 4.366790187052468e-06,
+      "loss": 0.0084,
+      "step": 604
+    },
+    {
+      "clip_ratio": 0.0003063883330014505,
+      "epoch": 0.9656120223477039,
+      "grad_norm": 0.03833401948213577,
+      "kl": 0.005064487457275391,
+      "learning_rate": 4.362611774266005e-06,
+      "loss": 0.0083,
+      "step": 605
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 534.4046006202698,
+      "epoch": 0.9675241253622539,
+      "grad_norm": 0.038279399275779724,
+      "kl": 0.005177021026611328,
+      "learning_rate": 4.358421633769934e-06,
+      "loss": 0.0061,
+      "num_tokens": 369412689.0,
+      "reward": 0.07087053885334171,
+      "reward_std": 0.08299326128326356,
+      "rewards/pure_accuracy_reward_math": 0.0708705369324889,
+      "step": 606
+    },
+    {
+      "clip_ratio": 0.00030927538728064974,
+      "epoch": 0.9694362283768038,
+      "grad_norm": 0.037665851414203644,
+      "kl": 0.005164146423339844,
+      "learning_rate": 4.35421979194697e-06,
+      "loss": 0.0061,
+      "step": 607
+    },
+    {
+      "clip_ratio": 0.0003293242310178357,
+      "epoch": 0.9713483313913537,
+      "grad_norm": 0.036888375878334045,
+      "kl": 0.005212306976318359,
+      "learning_rate": 4.3500062752535e-06,
+      "loss": 0.006,
+      "step": 608
+    },
+    {
+      "clip_ratio": 0.0003369250752029984,
+      "epoch": 0.9732604344059036,
+      "grad_norm": 0.03607965633273125,
+      "kl": 0.005278587341308594,
+      "learning_rate": 4.3457811102194225e-06,
+      "loss": 0.006,
+      "step": 609
+    },
+    {
+      "clip_ratio": 0.00034393194414406025,
+      "epoch": 0.9751725374204535,
+      "grad_norm": 0.036863330751657486,
+      "kl": 0.005379676818847656,
+      "learning_rate": 4.341544323447978e-06,
+      "loss": 0.0059,
+      "step": 610
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 527.9905385971069,
+      "epoch": 0.9770846404350034,
+      "grad_norm": 0.03825363516807556,
+      "kl": 0.005227804183959961,
+      "learning_rate": 4.33729594161558e-06,
+      "loss": 0.0103,
+      "num_tokens": 373041503.0,
+      "reward": 0.07254464607103728,
+      "reward_std": 0.07848271250259131,
+      "rewards/pure_accuracy_reward_math": 0.07254464444122277,
+      "step": 611
+    },
+    {
+      "clip_ratio": 0.0002938344064205012,
+      "epoch": 0.9789967434495533,
+      "grad_norm": 0.037028077989816666,
+      "kl": 0.005240917205810547,
+      "learning_rate": 4.333035991471653e-06,
+      "loss": 0.0102,
+      "step": 612
+    },
+    {
+      "clip_ratio": 0.00029232190240691125,
+      "epoch": 0.9809088464641033,
+      "grad_norm": 0.03623189404606819,
+      "kl": 0.005187034606933594,
+      "learning_rate": 4.328764499838456e-06,
+      "loss": 0.0102,
+      "step": 613
+    },
+    {
+      "clip_ratio": 0.000318144969014611,
+      "epoch": 0.9828209494786532,
+      "grad_norm": 0.036878351122140884,
+      "kl": 0.005211830139160156,
+      "learning_rate": 4.324481493610919e-06,
+      "loss": 0.0101,
+      "step": 614
+    },
+    {
+      "clip_ratio": 0.0003371401809317831,
+      "epoch": 0.9847330524932031,
+      "grad_norm": 0.036278340965509415,
+      "kl": 0.0051462650299072266,
+      "learning_rate": 4.320186999756473e-06,
+      "loss": 0.0101,
+      "step": 615
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 513.4927659034729,
+      "epoch": 0.986645155507753,
+      "grad_norm": 0.037584077566862106,
+      "kl": 0.005333662033081055,
+      "learning_rate": 4.315881045314878e-06,
+      "loss": 0.007,
+      "num_tokens": 376615645.0,
+      "reward": 0.07087053899886087,
+      "reward_std": 0.07342032523592934,
+      "rewards/pure_accuracy_reward_math": 0.0708705370198004,
+      "step": 616
+    },
+    {
+      "clip_ratio": 0.0002886684330292155,
+      "epoch": 0.9885572585223029,
+      "grad_norm": 0.035872798413038254,
+      "kl": 0.005288362503051758,
+      "learning_rate": 4.311563657398056e-06,
+      "loss": 0.007,
+      "step": 617
+    },
+    {
+      "clip_ratio": 0.0002961605097766551,
+      "epoch": 0.9904693615368528,
+      "grad_norm": 0.034989748150110245,
+      "kl": 0.0052263736724853516,
+      "learning_rate": 4.307234863189917e-06,
+      "loss": 0.007,
+      "step": 618
+    },
+    {
+      "clip_ratio": 0.0003532402791392997,
+      "epoch": 0.9923814645514027,
+      "grad_norm": 0.0338488332927227,
+      "kl": 0.005165576934814453,
+      "learning_rate": 4.302894689946189e-06,
+      "loss": 0.0069,
+      "step": 619
+    },
+    {
+      "clip_ratio": 0.00035387994120128496,
+      "epoch": 0.9942935675659527,
+      "grad_norm": 0.03370453417301178,
+      "kl": 0.005126953125,
+      "learning_rate": 4.298543164994249e-06,
+      "loss": 0.0069,
+      "step": 620
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 526.433337688446,
+      "epoch": 1.00191210301455,
+      "grad_norm": 0.0355641208589077,
+      "kl": 0.004958152770996094,
+      "learning_rate": 4.294180315732946e-06,
+      "loss": 0.0063,
+      "num_tokens": 380233970.0,
+      "reward": 0.05412946696742438,
+      "reward_std": 0.06637858302565292,
+      "rewards/pure_accuracy_reward_math": 0.0541294657450635,
+      "step": 621
+    },
+    {
+      "clip_ratio": 0.0002793830541349962,
+      "epoch": 1.0038242060290998,
+      "grad_norm": 0.034697938710451126,
+      "kl": 0.004967689514160156,
+      "learning_rate": 4.289806169632434e-06,
+      "loss": 0.0063,
+      "step": 622
+    },
+    {
+      "clip_ratio": 0.00026950584020823953,
+      "epoch": 1.0057363090436497,
+      "grad_norm": 0.034267228096723557,
+      "kl": 0.005029439926147461,
+      "learning_rate": 4.285420754233992e-06,
+      "loss": 0.0062,
+      "step": 623
+    },
+    {
+      "clip_ratio": 0.0002694177366606709,
+      "epoch": 1.0076484120581997,
+      "grad_norm": 0.03245500102639198,
+      "kl": 0.005047798156738281,
+      "learning_rate": 4.2810240971498594e-06,
+      "loss": 0.0062,
+      "step": 624
+    },
+    {
+      "clip_ratio": 0.0002762260926942872,
+      "epoch": 1.0095605150727496,
+      "grad_norm": 0.03143523633480072,
+      "kl": 0.005035400390625,
+      "learning_rate": 4.276616226063055e-06,
+      "loss": 0.0061,
+      "step": 625
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 528.094889163971,
+      "epoch": 1.0114726180872995,
+      "grad_norm": 0.03780335932970047,
+      "kl": 0.005240440368652344,
+      "learning_rate": 4.272197168727204e-06,
+      "loss": 0.0082,
+      "num_tokens": 383858818.0,
+      "reward": 0.06891741388244554,
+      "reward_std": 0.07891435397323221,
+      "rewards/pure_accuracy_reward_math": 0.06891741207800806,
+      "step": 626
+    },
+    {
+      "clip_ratio": 0.0002971897219481434,
+      "epoch": 1.0133847211018494,
+      "grad_norm": 0.03676832467317581,
+      "kl": 0.005240440368652344,
+      "learning_rate": 4.267766952966369e-06,
+      "loss": 0.0082,
+      "step": 627
+    },
+    {
+      "clip_ratio": 0.00032256075144232454,
+      "epoch": 1.0152968241163993,
+      "grad_norm": 0.03722486272454262,
+      "kl": 0.005322933197021484,
+      "learning_rate": 4.263325606674865e-06,
+      "loss": 0.0082,
+      "step": 628
+    },
+    {
+      "clip_ratio": 0.00031109488622860226,
+      "epoch": 1.0172089271309492,
+      "grad_norm": 0.036808740347623825,
+      "kl": 0.0054111480712890625,
+      "learning_rate": 4.258873157817093e-06,
+      "loss": 0.0081,
+      "step": 629
+    },
+    {
+      "clip_ratio": 0.00032292150183366175,
+      "epoch": 1.0191210301454992,
+      "grad_norm": 0.03518703579902649,
+      "kl": 0.005442619323730469,
+      "learning_rate": 4.254409634427356e-06,
+      "loss": 0.008,
+      "step": 630
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 515.6958961486816,
+      "epoch": 1.021033133160049,
+      "grad_norm": 0.03399791195988655,
+      "kl": 0.005387306213378906,
+      "learning_rate": 4.249935064609692e-06,
+      "loss": 0.0031,
+      "num_tokens": 387438928.0,
+      "reward": 0.06250000285217538,
+      "reward_std": 0.06757478544022888,
+      "rewards/pure_accuracy_reward_math": 0.06250000145519152,
+      "step": 631
+    },
+    {
+      "clip_ratio": 0.0002553542814212051,
+      "epoch": 1.022945236174599,
+      "grad_norm": 0.03381386399269104,
+      "kl": 0.005375385284423828,
+      "learning_rate": 4.245449476537685e-06,
+      "loss": 0.0031,
+      "step": 632
+    },
+    {
+      "clip_ratio": 0.00023506408626872144,
+      "epoch": 1.024857339189149,
+      "grad_norm": 0.03337083011865616,
+      "kl": 0.00537109375,
+      "learning_rate": 4.2409528984543e-06,
+      "loss": 0.003,
+      "step": 633
+    },
+    {
+      "clip_ratio": 0.0002632986112871549,
+      "epoch": 1.0267694422036988,
+      "grad_norm": 0.03213095664978027,
+      "kl": 0.005321979522705078,
+      "learning_rate": 4.236445358671696e-06,
+      "loss": 0.003,
+      "step": 634
+    },
+    {
+      "clip_ratio": 0.00025607587781451,
+      "epoch": 1.0286815452182487,
+      "grad_norm": 0.03154142573475838,
+      "kl": 0.005255699157714844,
+      "learning_rate": 4.23192688557105e-06,
+      "loss": 0.0029,
+      "step": 635
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 524.272346496582,
+      "epoch": 1.0305936482327986,
+      "grad_norm": 0.039318569004535675,
+      "kl": 0.005155801773071289,
+      "learning_rate": 4.2273975076023835e-06,
+      "loss": 0.0075,
+      "num_tokens": 391053556.0,
+      "reward": 0.06473214598372579,
+      "reward_std": 0.07401842583203688,
+      "rewards/pure_accuracy_reward_math": 0.06473214412108064,
+      "step": 636
+    },
+    {
+      "clip_ratio": 0.0003024499371804268,
+      "epoch": 1.0325057512473486,
+      "grad_norm": 0.03726111724972725,
+      "kl": 0.0050776004791259766,
+      "learning_rate": 4.222857253284376e-06,
+      "loss": 0.0075,
+      "step": 637
+    },
+    {
+      "clip_ratio": 0.0003151753968495541,
+      "epoch": 1.0344178542618985,
+      "grad_norm": 0.03595959022641182,
+      "kl": 0.005060434341430664,
+      "learning_rate": 4.218306151204188e-06,
+      "loss": 0.0074,
+      "step": 638
+    },
+    {
+      "clip_ratio": 0.0003387899199083222,
+      "epoch": 1.0363299572764482,
+      "grad_norm": 0.03628028184175491,
+      "kl": 0.005034923553466797,
+      "learning_rate": 4.213744230017283e-06,
+      "loss": 0.0074,
+      "step": 639
+    },
+    {
+      "clip_ratio": 0.00037899152403042535,
+      "epoch": 1.038242060290998,
+      "grad_norm": 0.03670131787657738,
+      "kl": 0.005095720291137695,
+      "learning_rate": 4.209171518447248e-06,
+      "loss": 0.0073,
+      "step": 640
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 536.5907049179077,
+      "epoch": 1.040154163305548,
+      "grad_norm": 0.03938442841172218,
+      "kl": 0.0051763057708740234,
+      "learning_rate": 4.204588045285607e-06,
+      "loss": 0.0022,
+      "num_tokens": 394708581.0,
+      "reward": 0.06333705710130744,
+      "reward_std": 0.07792467664694414,
+      "rewards/pure_accuracy_reward_math": 0.06333705500583164,
+      "step": 641
+    },
+    {
+      "clip_ratio": 0.0002767174905216052,
+      "epoch": 1.042066266320098,
+      "grad_norm": 0.037835828959941864,
+      "kl": 0.005267143249511719,
+      "learning_rate": 4.1999938393916424e-06,
+      "loss": 0.0022,
+      "step": 642
+    },
+    {
+      "clip_ratio": 0.0003277845591469486,
+      "epoch": 1.0439783693346478,
+      "grad_norm": 0.03832162916660309,
+      "kl": 0.005464792251586914,
+      "learning_rate": 4.195388929692217e-06,
+      "loss": 0.0022,
+      "step": 643
+    },
+    {
+      "clip_ratio": 0.00035426640954483446,
+      "epoch": 1.0458904723491977,
+      "grad_norm": 0.03823033347725868,
+      "kl": 0.005482673645019531,
+      "learning_rate": 4.190773345181587e-06,
+      "loss": 0.0021,
+      "step": 644
+    },
+    {
+      "clip_ratio": 0.0003763593267649412,
+      "epoch": 1.0478025753637477,
+      "grad_norm": 0.036984797567129135,
+      "kl": 0.005467653274536133,
+      "learning_rate": 4.186147114921221e-06,
+      "loss": 0.002,
+      "step": 645
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 528.9266424179077,
+      "epoch": 1.0497146783782976,
+      "grad_norm": 0.0355878509581089,
+      "kl": 0.005333423614501953,
+      "learning_rate": 4.18151026803962e-06,
+      "loss": 0.0056,
+      "num_tokens": 398334618.0,
+      "reward": 0.06305803850409575,
+      "reward_std": 0.06942774693015963,
+      "rewards/pure_accuracy_reward_math": 0.06305803699069656,
+      "step": 646
+    },
+    {
+      "clip_ratio": 0.00024814905674475085,
+      "epoch": 1.0516267813928475,
+      "grad_norm": 0.034741513431072235,
+      "kl": 0.005269289016723633,
+      "learning_rate": 4.176862833732127e-06,
+      "loss": 0.0056,
+      "step": 647
+    },
+    {
+      "clip_ratio": 0.00027503305113896204,
+      "epoch": 1.0535388844073974,
+      "grad_norm": 0.03375249356031418,
+      "kl": 0.005173683166503906,
+      "learning_rate": 4.1722048412607495e-06,
+      "loss": 0.0055,
+      "step": 648
+    },
+    {
+      "clip_ratio": 0.0002895867207826086,
+      "epoch": 1.0554509874219473,
+      "grad_norm": 0.0341072678565979,
+      "kl": 0.005132198333740234,
+      "learning_rate": 4.167536319953976e-06,
+      "loss": 0.0055,
+      "step": 649
+    },
+    {
+      "clip_ratio": 0.0003005371929134526,
+      "epoch": 1.0573630904364972,
+      "grad_norm": 0.033096957951784134,
+      "kl": 0.005170345306396484,
+      "learning_rate": 4.162857299206584e-06,
+      "loss": 0.0054,
+      "step": 650
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 538.7528138160706,
+      "epoch": 1.0592751934510471,
+      "grad_norm": 0.03696604445576668,
+      "kl": 0.0052814483642578125,
+      "learning_rate": 4.158167808479461e-06,
+      "loss": 0.0097,
+      "num_tokens": 401997276.0,
+      "reward": 0.05943080657743849,
+      "reward_std": 0.07388583471765742,
+      "rewards/pure_accuracy_reward_math": 0.05943080494762398,
+      "step": 651
+    },
+    {
+      "clip_ratio": 0.00029416859939601636,
+      "epoch": 1.061187296465597,
+      "grad_norm": 0.03565770015120506,
+      "kl": 0.005290031433105469,
+      "learning_rate": 4.153467877299419e-06,
+      "loss": 0.0097,
+      "step": 652
+    },
+    {
+      "clip_ratio": 0.00029473524284640007,
+      "epoch": 1.063099399480147,
+      "grad_norm": 0.03546367585659027,
+      "kl": 0.005368709564208984,
+      "learning_rate": 4.148757535259004e-06,
+      "loss": 0.0096,
+      "step": 653
+    },
+    {
+      "clip_ratio": 0.00032781071104182047,
+      "epoch": 1.065011502494697,
+      "grad_norm": 0.03601039946079254,
+      "kl": 0.005382061004638672,
+      "learning_rate": 4.144036812016317e-06,
+      "loss": 0.0096,
+      "step": 654
+    },
+    {
+      "clip_ratio": 0.0003433626044397897,
+      "epoch": 1.0669236055092468,
+      "grad_norm": 0.035073794424533844,
+      "kl": 0.0053446292877197266,
+      "learning_rate": 4.139305737294818e-06,
+      "loss": 0.0095,
+      "step": 655
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 520.1163725852966,
+      "epoch": 1.0688357085237967,
+      "grad_norm": 0.03852629289031029,
+      "kl": 0.005383491516113281,
+      "learning_rate": 4.134564340883148e-06,
+      "loss": 0.0083,
+      "num_tokens": 405593985.0,
+      "reward": 0.06445312793948688,
+      "reward_std": 0.07135464163729921,
+      "rewards/pure_accuracy_reward_math": 0.06445312654250301,
+      "step": 656
+    },
+    {
+      "clip_ratio": 0.0002591365355897324,
+      "epoch": 1.0707478115383466,
+      "grad_norm": 0.03745557367801666,
+      "kl": 0.0053327083587646484,
+      "learning_rate": 4.129812652634936e-06,
+      "loss": 0.0083,
+      "step": 657
+    },
+    {
+      "clip_ratio": 0.0003071958567772981,
+      "epoch": 1.0726599145528966,
+      "grad_norm": 0.037043727934360504,
+      "kl": 0.00532078742980957,
+      "learning_rate": 4.1250507024686115e-06,
+      "loss": 0.0083,
+      "step": 658
+    },
+    {
+      "clip_ratio": 0.00029935286954696494,
+      "epoch": 1.0745720175674465,
+      "grad_norm": 0.03582773730158806,
+      "kl": 0.005355358123779297,
+      "learning_rate": 4.120278520367217e-06,
+      "loss": 0.0082,
+      "step": 659
+    },
+    {
+      "clip_ratio": 0.0003111159166451216,
+      "epoch": 1.0764841205819964,
+      "grad_norm": 0.035313159227371216,
+      "kl": 0.005402326583862305,
+      "learning_rate": 4.115496136378219e-06,
+      "loss": 0.0081,
+      "step": 660
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 509.2994108200073,
+      "epoch": 1.0783962235965463,
+      "grad_norm": 0.041104141622781754,
+      "kl": 0.005465507507324219,
+      "learning_rate": 4.110703580613321e-06,
+      "loss": 0.0074,
+      "num_tokens": 409156330.0,
+      "reward": 0.0641741098370403,
+      "reward_std": 0.08329231233801693,
+      "rewards/pure_accuracy_reward_math": 0.06417410826543346,
+      "step": 661
+    },
+    {
+      "clip_ratio": 0.0003218170786567498,
+      "epoch": 1.0803083266110962,
+      "grad_norm": 0.03970121592283249,
+      "kl": 0.005608558654785156,
+      "learning_rate": 4.105900883248269e-06,
+      "loss": 0.0074,
+      "step": 662
+    },
+    {
+      "clip_ratio": 0.00032362689415776913,
+      "epoch": 1.0822204296256461,
+      "grad_norm": 0.039676353335380554,
+      "kl": 0.005734920501708984,
+      "learning_rate": 4.101088074522667e-06,
+      "loss": 0.0074,
+      "step": 663
+    },
+    {
+      "clip_ratio": 0.000323468098201829,
+      "epoch": 1.084132532640196,
+      "grad_norm": 0.03883183002471924,
+      "kl": 0.005713939666748047,
+      "learning_rate": 4.096265184739781e-06,
+      "loss": 0.0073,
+      "step": 664
+    },
+    {
+      "clip_ratio": 0.00033196881122421473,
+      "epoch": 1.086044635654746,
+      "grad_norm": 0.037281692028045654,
+      "kl": 0.0056934356689453125,
+      "learning_rate": 4.091432244266354e-06,
+      "loss": 0.0072,
+      "step": 665
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 522.48774766922,
+      "epoch": 1.0879567386692959,
+      "grad_norm": 0.037982553243637085,
+      "kl": 0.005854606628417969,
+      "learning_rate": 4.08658928353241e-06,
+      "loss": 0.0086,
+      "num_tokens": 412758914.0,
+      "reward": 0.06835937799769454,
+      "reward_std": 0.07526708859950304,
+      "rewards/pure_accuracy_reward_math": 0.06835937630967237,
+      "step": 666
+    },
+    {
+      "clip_ratio": 0.0002976899445457093,
+      "epoch": 1.0898688416838458,
+      "grad_norm": 0.03663322329521179,
+      "kl": 0.005788326263427734,
+      "learning_rate": 4.081736333031066e-06,
+      "loss": 0.0086,
+      "step": 667
+    },
+    {
+      "clip_ratio": 0.0002965517393818118,
+      "epoch": 1.0917809446983957,
+      "grad_norm": 0.03593512997031212,
+      "kl": 0.005764484405517578,
+      "learning_rate": 4.0768734233183376e-06,
+      "loss": 0.0085,
+      "step": 668
+    },
+    {
+      "clip_ratio": 0.0003466513953753747,
+      "epoch": 1.0936930477129456,
+      "grad_norm": 0.03643948212265968,
+      "kl": 0.005777835845947266,
+      "learning_rate": 4.072000585012947e-06,
+      "loss": 0.0085,
+      "step": 669
+    },
+    {
+      "clip_ratio": 0.00037185640462666925,
+      "epoch": 1.0956051507274955,
+      "grad_norm": 0.03601692244410515,
+      "kl": 0.0058193206787109375,
+      "learning_rate": 4.06711784879613e-06,
+      "loss": 0.0084,
+      "step": 670
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 526.0530390739441,
+      "epoch": 1.0975172537420455,
+      "grad_norm": 0.03892623260617256,
+      "kl": 0.005596637725830078,
+      "learning_rate": 4.062225245411444e-06,
+      "loss": 0.007,
+      "num_tokens": 416383588.0,
+      "reward": 0.061104913387680426,
+      "reward_std": 0.07539348275167868,
+      "rewards/pure_accuracy_reward_math": 0.06110491187428124,
+      "step": 671
+    },
+    {
+      "clip_ratio": 0.0003017952032280391,
+      "epoch": 1.0994293567565951,
+      "grad_norm": 0.0375184491276741,
+      "kl": 0.0056912899017333984,
+      "learning_rate": 4.057322805664576e-06,
+      "loss": 0.007,
+      "step": 672
+    },
+    {
+      "clip_ratio": 0.0002928147856096075,
+      "epoch": 1.1013414597711453,
+      "grad_norm": 0.03731007128953934,
+      "kl": 0.0057830810546875,
+      "learning_rate": 4.0524105604231435e-06,
+      "loss": 0.0069,
+      "step": 673
+    },
+    {
+      "clip_ratio": 0.000317500726794151,
+      "epoch": 1.103253562785695,
+      "grad_norm": 0.03885798528790474,
+      "kl": 0.005819559097290039,
+      "learning_rate": 4.047488540616503e-06,
+      "loss": 0.0069,
+      "step": 674
+    },
+    {
+      "clip_ratio": 0.0003141532706649741,
+      "epoch": 1.105165665800245,
+      "grad_norm": 0.03583172708749771,
+      "kl": 0.005753278732299805,
+      "learning_rate": 4.042556777235558e-06,
+      "loss": 0.0068,
+      "step": 675
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 523.9950060844421,
+      "epoch": 1.1070777688147948,
+      "grad_norm": 0.03652811422944069,
+      "kl": 0.005724668502807617,
+      "learning_rate": 4.037615301332559e-06,
+      "loss": 0.0088,
+      "num_tokens": 419993906.0,
+      "reward": 0.061383931315504014,
+      "reward_std": 0.07067021139664575,
+      "rewards/pure_accuracy_reward_math": 0.06138392974389717,
+      "step": 676
+    },
+    {
+      "clip_ratio": 0.00028260578790195723,
+      "epoch": 1.1089898718293447,
+      "grad_norm": 0.035632383078336716,
+      "kl": 0.0056421756744384766,
+      "learning_rate": 4.0326641440209114e-06,
+      "loss": 0.0088,
+      "step": 677
+    },
+    {
+      "clip_ratio": 0.0002882395116614589,
+      "epoch": 1.1109019748438946,
+      "grad_norm": 0.03453977406024933,
+      "kl": 0.005593061447143555,
+      "learning_rate": 4.027703336474979e-06,
+      "loss": 0.0087,
+      "step": 678
+    },
+    {
+      "clip_ratio": 0.000319835560901538,
+      "epoch": 1.1128140778584446,
+      "grad_norm": 0.03415689244866371,
+      "kl": 0.005594968795776367,
+      "learning_rate": 4.022732909929883e-06,
+      "loss": 0.0087,
+      "step": 679
+    },
+    {
+      "clip_ratio": 0.00033849146848297096,
+      "epoch": 1.1147261808729945,
+      "grad_norm": 0.03406994044780731,
+      "kl": 0.005631208419799805,
+      "learning_rate": 4.017752895681315e-06,
+      "loss": 0.0086,
+      "step": 680
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 521.6057720184326,
+      "epoch": 1.1166382838875444,
+      "grad_norm": 0.06026715040206909,
+      "kl": 0.005751848220825195,
+      "learning_rate": 4.012763325085332e-06,
+      "loss": 0.0067,
+      "num_tokens": 423598941.0,
+      "reward": 0.07198661082657054,
+      "reward_std": 0.08763020345941186,
+      "rewards/pure_accuracy_reward_math": 0.07198660844005644,
+      "step": 681
+    },
+    {
+      "clip_ratio": 0.00031779767027728667,
+      "epoch": 1.1185503869020943,
+      "grad_norm": 2.6160011291503906,
+      "kl": 0.005651235580444336,
+      "learning_rate": 4.0077642295581605e-06,
+      "loss": 0.007,
+      "step": 682
+    },
+    {
+      "clip_ratio": 0.00035409004277653366,
+      "epoch": 1.1204624899166442,
+      "grad_norm": 6.490725994110107,
+      "kl": 0.04636049270629883,
+      "learning_rate": 4.002755640576002e-06,
+      "loss": 0.0083,
+      "step": 683
+    },
+    {
+      "clip_ratio": 0.000386831109835839,
+      "epoch": 1.1223745929311941,
+      "grad_norm": 0.13183599710464478,
+      "kl": 0.0063648223876953125,
+      "learning_rate": 3.997737589674828e-06,
+      "loss": 0.0067,
+      "step": 684
+    },
+    {
+      "clip_ratio": 0.00042002629169246575,
+      "epoch": 1.124286695945744,
+      "grad_norm": 61.113468170166016,
+      "kl": 0.00571751594543457,
+      "learning_rate": 3.992710108450192e-06,
+      "loss": 0.0205,
+      "step": 685
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 534.679431438446,
+      "epoch": 1.126198798960294,
+      "grad_norm": 0.0341753326356411,
+      "kl": 0.006865501403808594,
+      "learning_rate": 3.987673228557017e-06,
+      "loss": 0.0032,
+      "num_tokens": 427249916.0,
+      "reward": 0.056919645285233855,
+      "reward_std": 0.06538890511728823,
+      "rewards/pure_accuracy_reward_math": 0.05691964429570362,
+      "step": 686
+    },
+    {
+      "clip_ratio": 0.00022898520234093667,
+      "epoch": 1.1281109019748439,
+      "grad_norm": 0.03356679156422615,
+      "kl": 0.006783246994018555,
+      "learning_rate": 3.982626981709412e-06,
+      "loss": 0.0032,
+      "step": 687
+    },
+    {
+      "clip_ratio": 0.00023695471924156664,
+      "epoch": 1.1300230049893938,
+      "grad_norm": 0.03283276781439781,
+      "kl": 0.006662845611572266,
+      "learning_rate": 3.977571399680457e-06,
+      "loss": 0.0031,
+      "step": 688
+    },
+    {
+      "clip_ratio": 0.000234549945901108,
+      "epoch": 1.1319351080039437,
+      "grad_norm": 0.032041046768426895,
+      "kl": 0.00657343864440918,
+      "learning_rate": 3.972506514302013e-06,
+      "loss": 0.0031,
+      "step": 689
+    },
+    {
+      "clip_ratio": 0.00026119674055280484,
+      "epoch": 1.1338472110184936,
+      "grad_norm": 0.03098335862159729,
+      "kl": 0.006501674652099609,
+      "learning_rate": 3.967432357464518e-06,
+      "loss": 0.003,
+      "step": 690
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 533.4330596923828,
+      "epoch": 1.1357593140330435,
+      "grad_norm": 0.03648236393928528,
+      "kl": 0.005389690399169922,
+      "learning_rate": 3.962348961116786e-06,
+      "loss": 0.0075,
+      "num_tokens": 430894100.0,
+      "reward": 0.059151788300368935,
+      "reward_std": 0.06680402747588232,
+      "rewards/pure_accuracy_reward_math": 0.059151787078008056,
+      "step": 691
+    },
+    {
+      "clip_ratio": 0.00024069582485708452,
+      "epoch": 1.1376714170475934,
+      "grad_norm": 0.03502041473984718,
+      "kl": 0.005405902862548828,
+      "learning_rate": 3.957256357265806e-06,
+      "loss": 0.0075,
+      "step": 692
+    },
+    {
+      "clip_ratio": 0.00026108162376203836,
+      "epoch": 1.1395835200621434,
+      "grad_norm": 0.03438780456781387,
+      "kl": 0.0054416656494140625,
+      "learning_rate": 3.952154577976543e-06,
+      "loss": 0.0075,
+      "step": 693
+    },
+    {
+      "clip_ratio": 0.0002536772994972125,
+      "epoch": 1.1414956230766933,
+      "grad_norm": 0.03388332575559616,
+      "kl": 0.005480289459228516,
+      "learning_rate": 3.947043655371734e-06,
+      "loss": 0.0075,
+      "step": 694
+    },
+    {
+      "clip_ratio": 0.00027197748300977764,
+      "epoch": 1.1434077260912432,
+      "grad_norm": 0.03378571942448616,
+      "kl": 0.005473136901855469,
+      "learning_rate": 3.941923621631683e-06,
+      "loss": 0.0074,
+      "step": 695
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 523.0050506591797,
+      "epoch": 1.145319829105793,
+      "grad_norm": 0.040138646960258484,
+      "kl": 0.005397796630859375,
+      "learning_rate": 3.936794508994062e-06,
+      "loss": 0.0033,
+      "num_tokens": 434502306.0,
+      "reward": 0.07142857456346974,
+      "reward_std": 0.08093377470504493,
+      "rewards/pure_accuracy_reward_math": 0.07142857316648588,
+      "step": 696
+    },
+    {
+      "clip_ratio": 0.00026038982610998573,
+      "epoch": 1.147231932120343,
+      "grad_norm": 0.03855022042989731,
+      "kl": 0.005437135696411133,
+      "learning_rate": 3.931656349753709e-06,
+      "loss": 0.0033,
+      "step": 697
+    },
+    {
+      "clip_ratio": 0.0002577857798655714,
+      "epoch": 1.149144035134893,
+      "grad_norm": 0.03805391117930412,
+      "kl": 0.005386829376220703,
+      "learning_rate": 3.9265091762624225e-06,
+      "loss": 0.0032,
+      "step": 698
+    },
+    {
+      "clip_ratio": 0.0002938498616913421,
+      "epoch": 1.1510561381494429,
+      "grad_norm": 0.03830750659108162,
+      "kl": 0.005461931228637695,
+      "learning_rate": 3.921353020928756e-06,
+      "loss": 0.0032,
+      "step": 699
+    },
+    {
+      "clip_ratio": 0.00026367085320089245,
+      "epoch": 1.1529682411639928,
+      "grad_norm": 0.03759397566318512,
+      "kl": 0.0055010318756103516,
+      "learning_rate": 3.916187916217818e-06,
+      "loss": 0.0031,
+      "step": 700
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 532.7466740608215,
+      "epoch": 1.1548803441785427,
+      "grad_norm": 0.03618447855114937,
+      "kl": 0.0054166316986083984,
+      "learning_rate": 3.911013894651067e-06,
+      "loss": 0.0066,
+      "num_tokens": 438144462.0,
+      "reward": 0.06501116344588809,
+      "reward_std": 0.07457646209513769,
+      "rewards/pure_accuracy_reward_math": 0.06501116175786592,
+      "step": 701
+    },
+    {
+      "clip_ratio": 0.00028753443712048465,
+      "epoch": 1.1567924471930926,
+      "grad_norm": 0.035918354988098145,
+      "kl": 0.005413532257080078,
+      "learning_rate": 3.905830988806101e-06,
+      "loss": 0.0066,
+      "step": 702
+    },
+    {
+      "clip_ratio": 0.0002842856440565811,
+      "epoch": 1.1587045502076425,
+      "grad_norm": 0.03422370180487633,
+      "kl": 0.005442619323730469,
+      "learning_rate": 3.90063923131646e-06,
+      "loss": 0.0066,
+      "step": 703
+    },
+    {
+      "clip_ratio": 0.0002819241568090547,
+      "epoch": 1.1606166532221924,
+      "grad_norm": 0.03359530121088028,
+      "kl": 0.00537109375,
+      "learning_rate": 3.895438654871416e-06,
+      "loss": 0.0065,
+      "step": 704
+    },
+    {
+      "clip_ratio": 0.0003241457142166837,
+      "epoch": 1.1625287562367423,
+      "grad_norm": 0.033465541899204254,
+      "kl": 0.0053484439849853516,
+      "learning_rate": 3.890229292215773e-06,
+      "loss": 0.0065,
+      "step": 705
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 526.7639741897583,
+      "epoch": 1.1644408592512923,
+      "grad_norm": 0.03731166943907738,
+      "kl": 0.00535893440246582,
+      "learning_rate": 3.885011176149647e-06,
+      "loss": 0.0071,
+      "num_tokens": 441760876.0,
+      "reward": 0.06612723506987095,
+      "reward_std": 0.06822534691309556,
+      "rewards/pure_accuracy_reward_math": 0.06612723367288709,
+      "step": 706
+    },
+    {
+      "clip_ratio": 0.00025104734473302415,
+      "epoch": 1.166352962265842,
+      "grad_norm": 0.03429851680994034,
+      "kl": 0.005263566970825195,
+      "learning_rate": 3.879784339528277e-06,
+      "loss": 0.0071,
+      "step": 707
+    },
+    {
+      "clip_ratio": 0.0002501190919019791,
+      "epoch": 1.168265065280392,
+      "grad_norm": 0.034958597272634506,
+      "kl": 0.0052831172943115234,
+      "learning_rate": 3.874548815261809e-06,
+      "loss": 0.0071,
+      "step": 708
+    },
+    {
+      "clip_ratio": 0.0002633173795629773,
+      "epoch": 1.1701771682949418,
+      "grad_norm": 0.032111622393131256,
+      "kl": 0.005318403244018555,
+      "learning_rate": 3.869304636315085e-06,
+      "loss": 0.007,
+      "step": 709
+    },
+    {
+      "clip_ratio": 0.00028521847832507774,
+      "epoch": 1.172089271309492,
+      "grad_norm": 0.03191748261451721,
+      "kl": 0.005407810211181641,
+      "learning_rate": 3.864051835707444e-06,
+      "loss": 0.007,
+      "step": 710
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 522.3457269668579,
+      "epoch": 1.1740013743240416,
+      "grad_norm": 0.05126773193478584,
+      "kl": 0.01187896728515625,
+      "learning_rate": 3.85879044651251e-06,
+      "loss": 0.0066,
+      "num_tokens": 445370959.0,
+      "reward": 0.06863839653669856,
+      "reward_std": 0.07951865292852744,
+      "rewards/pure_accuracy_reward_math": 0.06863839438301511,
+      "step": 711
+    },
+    {
+      "clip_ratio": 0.00028669004558423694,
+      "epoch": 1.1759134773385915,
+      "grad_norm": 0.051731474697589874,
+      "kl": 0.011458396911621094,
+      "learning_rate": 3.853520501857981e-06,
+      "loss": 0.0066,
+      "step": 712
+    },
+    {
+      "clip_ratio": 0.0003143258599038745,
+      "epoch": 1.1778255803531414,
+      "grad_norm": 0.051190439611673355,
+      "kl": 0.010621786117553711,
+      "learning_rate": 3.848242034925429e-06,
+      "loss": 0.0065,
+      "step": 713
+    },
+    {
+      "clip_ratio": 0.00033165596249773444,
+      "epoch": 1.1797376833676914,
+      "grad_norm": 0.04840007424354553,
+      "kl": 0.009693622589111328,
+      "learning_rate": 3.842955078950079e-06,
+      "loss": 0.0064,
+      "step": 714
+    },
+    {
+      "clip_ratio": 0.00035113433239075675,
+      "epoch": 1.1816497863822413,
+      "grad_norm": 0.048264067620038986,
+      "kl": 0.008889198303222656,
+      "learning_rate": 3.837659667220612e-06,
+      "loss": 0.0063,
+      "step": 715
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 547.5633645057678,
+      "epoch": 1.1835618893967912,
+      "grad_norm": 0.03458649665117264,
+      "kl": 0.005284786224365234,
+      "learning_rate": 3.832355833078945e-06,
+      "loss": 0.0047,
+      "num_tokens": 449069046.0,
+      "reward": 0.05691964572179131,
+      "reward_std": 0.06861072586616501,
+      "rewards/pure_accuracy_reward_math": 0.05691964415018447,
+      "step": 716
+    },
+    {
+      "clip_ratio": 0.0002876185501463624,
+      "epoch": 1.185473992411341,
+      "grad_norm": 0.033646877855062485,
+      "kl": 0.005215167999267578,
+      "learning_rate": 3.82704360992003e-06,
+      "loss": 0.0047,
+      "step": 717
+    },
+    {
+      "clip_ratio": 0.0003252235952686533,
+      "epoch": 1.187386095425891,
+      "grad_norm": 0.03455204889178276,
+      "kl": 0.0051419734954833984,
+      "learning_rate": 3.8217230311916365e-06,
+      "loss": 0.0046,
+      "step": 718
+    },
+    {
+      "clip_ratio": 0.0003351885409870192,
+      "epoch": 1.189298198440441,
+      "grad_norm": 0.033362697809934616,
+      "kl": 0.0050907135009765625,
+      "learning_rate": 3.816394130394142e-06,
+      "loss": 0.0046,
+      "step": 719
+    },
+    {
+      "clip_ratio": 0.00032723310141591355,
+      "epoch": 1.1912103014549908,
+      "grad_norm": 0.03211547061800957,
+      "kl": 0.0051004886627197266,
+      "learning_rate": 3.811056941080329e-06,
+      "loss": 0.0045,
+      "step": 720
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 537.3167090415955,
+      "epoch": 1.1931224044695408,
+      "grad_norm": 0.03566175699234009,
+      "kl": 0.0053424835205078125,
+      "learning_rate": 3.805711496855161e-06,
+      "loss": 0.009,
+      "num_tokens": 452726381.0,
+      "reward": 0.06054687776486389,
+      "reward_std": 0.07264336961088702,
+      "rewards/pure_accuracy_reward_math": 0.06054687677533366,
+      "step": 721
+    },
+    {
+      "clip_ratio": 0.00029346574888222676,
+      "epoch": 1.1950345074840907,
+      "grad_norm": 0.03476826474070549,
+      "kl": 0.005379438400268555,
+      "learning_rate": 3.800357831375583e-06,
+      "loss": 0.009,
+      "step": 722
+    },
+    {
+      "clip_ratio": 0.00027920183202923,
+      "epoch": 1.1969466104986406,
+      "grad_norm": 0.03446114435791969,
+      "kl": 0.005425691604614258,
+      "learning_rate": 3.794995978350301e-06,
+      "loss": 0.009,
+      "step": 723
+    },
+    {
+      "clip_ratio": 0.00031396149876172785,
+      "epoch": 1.1988587135131905,
+      "grad_norm": 0.0340140238404274,
+      "kl": 0.005489826202392578,
+      "learning_rate": 3.7896259715395727e-06,
+      "loss": 0.0089,
+      "step": 724
+    },
+    {
+      "clip_ratio": 0.0002986833567888425,
+      "epoch": 1.2007708165277404,
+      "grad_norm": 0.03497212752699852,
+      "kl": 0.005522489547729492,
+      "learning_rate": 3.784247844754997e-06,
+      "loss": 0.0088,
+      "step": 725
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 548.8044338226318,
+      "epoch": 1.2026829195422903,
+      "grad_norm": 0.04050953686237335,
+      "kl": 0.005362510681152344,
+      "learning_rate": 3.778861631859298e-06,
+      "loss": 0.0112,
+      "num_tokens": 456433388.0,
+      "reward": 0.06696428879513405,
+      "reward_std": 0.08140548242954537,
+      "rewards/pure_accuracy_reward_math": 0.06696428728173487,
+      "step": 726
+    },
+    {
+      "clip_ratio": 0.0003468562302373357,
+      "epoch": 1.2045950225568403,
+      "grad_norm": 0.03805195167660713,
+      "kl": 0.005377531051635742,
+      "learning_rate": 3.7734673667661133e-06,
+      "loss": 0.0112,
+      "step": 727
+    },
+    {
+      "clip_ratio": 0.00037477223943938043,
+      "epoch": 1.2065071255713902,
+      "grad_norm": 0.03666882589459419,
+      "kl": 0.005417585372924805,
+      "learning_rate": 3.7680650834397804e-06,
+      "loss": 0.0112,
+      "step": 728
+    },
+    {
+      "clip_ratio": 0.0003945930936311015,
+      "epoch": 1.20841922858594,
+      "grad_norm": 0.03651399165391922,
+      "kl": 0.005425453186035156,
+      "learning_rate": 3.762654815895122e-06,
+      "loss": 0.0111,
+      "step": 729
+    },
+    {
+      "clip_ratio": 0.0004650242010484362,
+      "epoch": 1.21033133160049,
+      "grad_norm": 0.03792130947113037,
+      "kl": 0.005422115325927734,
+      "learning_rate": 3.7572365981972335e-06,
+      "loss": 0.0111,
+      "step": 730
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 528.6861305236816,
+      "epoch": 1.21224343461504,
+      "grad_norm": 0.0365571565926075,
+      "kl": 0.005487203598022461,
+      "learning_rate": 3.7518104644612663e-06,
+      "loss": 0.0098,
+      "num_tokens": 460061367.0,
+      "reward": 0.06417411062284373,
+      "reward_std": 0.07478918455308303,
+      "rewards/pure_accuracy_reward_math": 0.06417410852736793,
+      "step": 731
+    },
+    {
+      "clip_ratio": 0.0002798708824229834,
+      "epoch": 1.2141555376295898,
+      "grad_norm": 0.036456115543842316,
+      "kl": 0.005484342575073242,
+      "learning_rate": 3.746376448852216e-06,
+      "loss": 0.0098,
+      "step": 732
+    },
+    {
+      "clip_ratio": 0.0003001830394850913,
+      "epoch": 1.2160676406441397,
+      "grad_norm": 0.036120470613241196,
+      "kl": 0.005544900894165039,
+      "learning_rate": 3.740934585584702e-06,
+      "loss": 0.0098,
+      "step": 733
+    },
+    {
+      "clip_ratio": 0.00028155883609315424,
+      "epoch": 1.2179797436586897,
+      "grad_norm": 0.03475060313940048,
+      "kl": 0.005614042282104492,
+      "learning_rate": 3.735484908922759e-06,
+      "loss": 0.0097,
+      "step": 734
+    },
+    {
+      "clip_ratio": 0.00027523975251142474,
+      "epoch": 1.2198918466732396,
+      "grad_norm": 0.03388671204447746,
+      "kl": 0.005706310272216797,
+      "learning_rate": 3.730027453179617e-06,
+      "loss": 0.0096,
+      "step": 735
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 518.6091203689575,
+      "epoch": 1.2218039496877895,
+      "grad_norm": 0.039098870009183884,
+      "kl": 0.005930900573730469,
+      "learning_rate": 3.7245622527174858e-06,
+      "loss": 0.0072,
+      "num_tokens": 463651718.0,
+      "reward": 0.06277902098372579,
+      "reward_std": 0.06552149687195197,
+      "rewards/pure_accuracy_reward_math": 0.06277901912108064,
+      "step": 736
+    },
+    {
+      "clip_ratio": 0.000267848483247235,
+      "epoch": 1.2237160527023394,
+      "grad_norm": 0.03896670043468475,
+      "kl": 0.005952358245849609,
+      "learning_rate": 3.719089341947337e-06,
+      "loss": 0.0072,
+      "step": 737
+    },
+    {
+      "clip_ratio": 0.00026333254504606884,
+      "epoch": 1.2256281557168893,
+      "grad_norm": 0.03838280960917473,
+      "kl": 0.005873680114746094,
+      "learning_rate": 3.7136087553286916e-06,
+      "loss": 0.0072,
+      "step": 738
+    },
+    {
+      "clip_ratio": 0.0002850479507969794,
+      "epoch": 1.2275402587314392,
+      "grad_norm": 0.03708336502313614,
+      "kl": 0.005741596221923828,
+      "learning_rate": 3.7081205273694005e-06,
+      "loss": 0.0071,
+      "step": 739
+    },
+    {
+      "clip_ratio": 0.00030947004142944934,
+      "epoch": 1.2294523617459892,
+      "grad_norm": 0.03616032376885414,
+      "kl": 0.005689144134521484,
+      "learning_rate": 3.702624692625427e-06,
+      "loss": 0.007,
+      "step": 740
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 515.3027577400208,
+      "epoch": 1.231364464760539,
+      "grad_norm": 473.16009521484375,
+      "kl": 7.4117608070373535,
+      "learning_rate": 3.6971212857006277e-06,
+      "loss": 0.3027,
+      "num_tokens": 467231411.0,
+      "reward": 0.07003348527359776,
+      "reward_std": 0.07058388437144458,
+      "rewards/pure_accuracy_reward_math": 0.07003348364378326,
+      "step": 741
+    },
+    {
+      "clip_ratio": 0.00048789031319529386,
+      "epoch": 1.2332765677750888,
+      "grad_norm": 15.009349822998047,
+      "kl": 0.3277552127838135,
+      "learning_rate": 3.6916103412465405e-06,
+      "loss": 0.0207,
+      "step": 742
+    },
+    {
+      "clip_ratio": 0.0005436847095552366,
+      "epoch": 1.235188670789639,
+      "grad_norm": 34.010345458984375,
+      "kl": 0.01839423179626465,
+      "learning_rate": 3.6860918939621586e-06,
+      "loss": 0.0299,
+      "step": 743
+    },
+    {
+      "clip_ratio": 0.000597593801558105,
+      "epoch": 1.2371007738041886,
+      "grad_norm": 13.507566452026367,
+      "kl": 0.02814960479736328,
+      "learning_rate": 3.6805659785937176e-06,
+      "loss": 0.0188,
+      "step": 744
+    },
+    {
+      "clip_ratio": 0.0005609532486232638,
+      "epoch": 1.2390128768187387,
+      "grad_norm": 6.263442516326904,
+      "kl": 0.20073914527893066,
+      "learning_rate": 3.675032629934475e-06,
+      "loss": 0.0163,
+      "step": 745
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 530.3340101242065,
+      "epoch": 1.2409249798332884,
+      "grad_norm": 0.051358480006456375,
+      "kl": 0.0063626766204833984,
+      "learning_rate": 3.6694918828244923e-06,
+      "loss": 0.0095,
+      "num_tokens": 470866344.0,
+      "reward": 0.06333705666474998,
+      "reward_std": 0.07530095760012046,
+      "rewards/pure_accuracy_reward_math": 0.06333705509314314,
+      "step": 746
+    },
+    {
+      "clip_ratio": 0.00029982604212364095,
+      "epoch": 1.2428370828478383,
+      "grad_norm": 0.03713027015328407,
+      "kl": 0.006081342697143555,
+      "learning_rate": 3.6639437721504108e-06,
+      "loss": 0.0095,
+      "step": 747
+    },
+    {
+      "clip_ratio": 0.0002941023938660692,
+      "epoch": 1.2447491858623883,
+      "grad_norm": 0.03500093147158623,
+      "kl": 0.006156444549560547,
+      "learning_rate": 3.65838833284524e-06,
+      "loss": 0.0095,
+      "step": 748
+    },
+    {
+      "clip_ratio": 0.0002858027814340858,
+      "epoch": 1.2466612888769382,
+      "grad_norm": 0.03525420278310776,
+      "kl": 0.006234169006347656,
+      "learning_rate": 3.652825599888129e-06,
+      "loss": 0.0094,
+      "step": 749
+    },
+    {
+      "clip_ratio": 0.0002950350276478275,
+      "epoch": 1.248573391891488,
+      "grad_norm": 0.03545543923974037,
+      "kl": 0.006281852722167969,
+      "learning_rate": 3.647255608304154e-06,
+      "loss": 0.0093,
+      "step": 750
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 530.79438829422,
+      "epoch": 1.250485494906038,
+      "grad_norm": 0.03711007162928581,
+      "kl": 0.005670070648193359,
+      "learning_rate": 3.641678393164092e-06,
+      "loss": 0.0131,
+      "num_tokens": 474505191.0,
+      "reward": 0.07170759318978526,
+      "reward_std": 0.07251697574974969,
+      "rewards/pure_accuracy_reward_math": 0.0717075907450635,
+      "step": 751
+    },
+    {
+      "clip_ratio": 0.00029345202176500607,
+      "epoch": 1.252397597920588,
+      "grad_norm": 0.036423034965991974,
+      "kl": 0.005608320236206055,
+      "learning_rate": 3.636093989584204e-06,
+      "loss": 0.0131,
+      "step": 752
+    },
+    {
+      "clip_ratio": 0.00030187425932126644,
+      "epoch": 1.2543097009351378,
+      "grad_norm": 0.03613322973251343,
+      "kl": 0.005610466003417969,
+      "learning_rate": 3.630502432726012e-06,
+      "loss": 0.013,
+      "step": 753
+    },
+    {
+      "clip_ratio": 0.0003275847485610939,
+      "epoch": 1.2562218039496877,
+      "grad_norm": 0.03452349826693535,
+      "kl": 0.0057184696197509766,
+      "learning_rate": 3.6249037577960744e-06,
+      "loss": 0.013,
+      "step": 754
+    },
+    {
+      "clip_ratio": 0.00034663524741063156,
+      "epoch": 1.2581339069642377,
+      "grad_norm": 0.034864939749240875,
+      "kl": 0.005825996398925781,
+      "learning_rate": 3.619298000045773e-06,
+      "loss": 0.0129,
+      "step": 755
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 495.8814425468445,
+      "epoch": 1.2600460099787876,
+      "grad_norm": 528.279052734375,
+      "kl": 9.193241596221924,
+      "learning_rate": 3.6136851947710804e-06,
+      "loss": 0.3749,
+      "num_tokens": 478011678.0,
+      "reward": 0.07979911071015522,
+      "reward_std": 0.07470905361697078,
+      "rewards/pure_accuracy_reward_math": 0.0797991082072258,
+      "step": 756
+    },
+    {
+      "clip_ratio": 0.00028275052295612113,
+      "epoch": 1.2619581129933375,
+      "grad_norm": 44.662696838378906,
+      "kl": 1.2635960578918457,
+      "learning_rate": 3.608065377312348e-06,
+      "loss": 0.057,
+      "step": 757
+    },
+    {
+      "clip_ratio": 0.00029553008619132015,
+      "epoch": 1.2638702160078874,
+      "grad_norm": 4.775911808013916,
+      "kl": 0.1474595069885254,
+      "learning_rate": 3.6024385830540758e-06,
+      "loss": 0.0123,
+      "step": 758
+    },
+    {
+      "clip_ratio": 0.00033371773997714627,
+      "epoch": 1.2657823190224373,
+      "grad_norm": 0.30982905626296997,
+      "kl": 0.01830148696899414,
+      "learning_rate": 3.5968048474246925e-06,
+      "loss": 0.0071,
+      "step": 759
+    },
+    {
+      "clip_ratio": 0.0003257711730952906,
+      "epoch": 1.2676944220369872,
+      "grad_norm": 0.05356259644031525,
+      "kl": 0.011959552764892578,
+      "learning_rate": 3.591164205896332e-06,
+      "loss": 0.0068,
+      "step": 760
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 519.9149203300476,
+      "epoch": 1.2696065250515371,
+      "grad_norm": 0.04138460382819176,
+      "kl": 0.00600886344909668,
+      "learning_rate": 3.585516693984612e-06,
+      "loss": 0.0061,
+      "num_tokens": 481610981.0,
+      "reward": 0.07059152136207558,
+      "reward_std": 0.07616424100706354,
+      "rewards/pure_accuracy_reward_math": 0.07059151938301511,
+      "step": 761
+    },
+    {
+      "clip_ratio": 0.00029173931721970803,
+      "epoch": 1.271518628066087,
+      "grad_norm": 0.04057340323925018,
+      "kl": 0.0059850215911865234,
+      "learning_rate": 3.5798623472484074e-06,
+      "loss": 0.006,
+      "step": 762
+    },
+    {
+      "clip_ratio": 0.00031361054851686276,
+      "epoch": 1.273430731080637,
+      "grad_norm": 0.0383637472987175,
+      "kl": 0.005931377410888672,
+      "learning_rate": 3.5742012012896273e-06,
+      "loss": 0.006,
+      "step": 763
+    },
+    {
+      "clip_ratio": 0.000302841177983737,
+      "epoch": 1.275342834095187,
+      "grad_norm": 0.037009891122579575,
+      "kl": 0.005960226058959961,
+      "learning_rate": 3.5685332917529936e-06,
+      "loss": 0.0059,
+      "step": 764
+    },
+    {
+      "clip_ratio": 0.00032496250122449055,
+      "epoch": 1.2772549371097368,
+      "grad_norm": 0.036052413284778595,
+      "kl": 0.0060160160064697266,
+      "learning_rate": 3.5628586543258116e-06,
+      "loss": 0.0058,
+      "step": 765
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 505.19645166397095,
+      "epoch": 1.2791670401242867,
+      "grad_norm": 0.039108723402023315,
+      "kl": 0.0060214996337890625,
+      "learning_rate": 3.5571773247377495e-06,
+      "loss": 0.0077,
+      "num_tokens": 485155493.0,
+      "reward": 0.06473214537254535,
+      "reward_std": 0.07595151849091053,
+      "rewards/pure_accuracy_reward_math": 0.06473214438301511,
+      "step": 766
+    },
+    {
+      "clip_ratio": 0.00031215936860462534,
+      "epoch": 1.2810791431388366,
+      "grad_norm": 0.03890209272503853,
+      "kl": 0.0060939788818359375,
+      "learning_rate": 3.5514893387606113e-06,
+      "loss": 0.0078,
+      "step": 767
+    },
+    {
+      "clip_ratio": 0.00029648321913100517,
+      "epoch": 1.2829912461533866,
+      "grad_norm": 0.038266174495220184,
+      "kl": 0.0061397552490234375,
+      "learning_rate": 3.5457947322081126e-06,
+      "loss": 0.0077,
+      "step": 768
+    },
+    {
+      "clip_ratio": 0.0002988063008615427,
+      "epoch": 1.2849033491679365,
+      "grad_norm": 0.03760776296257973,
+      "kl": 0.006152629852294922,
+      "learning_rate": 3.5400935409356534e-06,
+      "loss": 0.0076,
+      "step": 769
+    },
+    {
+      "clip_ratio": 0.00032748817852734646,
+      "epoch": 1.2868154521824864,
+      "grad_norm": 0.037058234214782715,
+      "kl": 0.006194591522216797,
+      "learning_rate": 3.5343858008400955e-06,
+      "loss": 0.0076,
+      "step": 770
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 513.085681438446,
+      "epoch": 1.2887275551970363,
+      "grad_norm": 0.04272163286805153,
+      "kl": 0.006904125213623047,
+      "learning_rate": 3.5286715478595335e-06,
+      "loss": 0.0066,
+      "num_tokens": 488731916.0,
+      "reward": 0.06668527112924494,
+      "reward_std": 0.07779828266939148,
+      "rewards/pure_accuracy_reward_math": 0.0666852695576381,
+      "step": 771
+    },
+    {
+      "clip_ratio": 0.0002989328136209224,
+      "epoch": 1.2906396582115862,
+      "grad_norm": 0.039898019284009933,
+      "kl": 0.006760597229003906,
+      "learning_rate": 3.52295081797307e-06,
+      "loss": 0.0066,
+      "step": 772
+    },
+    {
+      "clip_ratio": 0.0003237332452385999,
+      "epoch": 1.2925517612261361,
+      "grad_norm": 0.0380416214466095,
+      "kl": 0.006653547286987305,
+      "learning_rate": 3.5172236472005866e-06,
+      "loss": 0.0065,
+      "step": 773
+    },
+    {
+      "clip_ratio": 0.0004160679777100995,
+      "epoch": 1.294463864240686,
+      "grad_norm": 0.03860335052013397,
+      "kl": 0.006639003753662109,
+      "learning_rate": 3.511490071602523e-06,
+      "loss": 0.0065,
+      "step": 774
+    },
+    {
+      "clip_ratio": 0.0004345110206713798,
+      "epoch": 1.2963759672552357,
+      "grad_norm": 0.0405069962143898,
+      "kl": 0.006697654724121094,
+      "learning_rate": 3.505750127279643e-06,
+      "loss": 0.0064,
+      "step": 775
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 529.7695565223694,
+      "epoch": 1.2982880702697859,
+      "grad_norm": 0.040585048496723175,
+      "kl": 0.006101369857788086,
+      "learning_rate": 3.500003850372811e-06,
+      "loss": 0.0043,
+      "num_tokens": 492363370.0,
+      "reward": 0.07477678926079534,
+      "reward_std": 0.08466117118950933,
+      "rewards/pure_accuracy_reward_math": 0.07477678704890423,
+      "step": 776
+    },
+    {
+      "clip_ratio": 0.0003347315081327906,
+      "epoch": 1.3002001732843356,
+      "grad_norm": 0.039613205939531326,
+      "kl": 0.0060977935791015625,
+      "learning_rate": 3.4942512770627655e-06,
+      "loss": 0.0043,
+      "step": 777
+    },
+    {
+      "clip_ratio": 0.0003803396672310555,
+      "epoch": 1.3021122762988857,
+      "grad_norm": 0.03965132310986519,
+      "kl": 0.006110668182373047,
+      "learning_rate": 3.4884924435698875e-06,
+      "loss": 0.0042,
+      "step": 778
+    },
+    {
+      "clip_ratio": 0.00035469116983222193,
+      "epoch": 1.3040243793134354,
+      "grad_norm": 0.038701362907886505,
+      "kl": 0.005974292755126953,
+      "learning_rate": 3.482727386153974e-06,
+      "loss": 0.0041,
+      "step": 779
+    },
+    {
+      "clip_ratio": 0.00038596760680320585,
+      "epoch": 1.3059364823279855,
+      "grad_norm": 0.03767050802707672,
+      "kl": 0.0059070587158203125,
+      "learning_rate": 3.4769561411140123e-06,
+      "loss": 0.0041,
+      "step": 780
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 528.3593993186951,
+      "epoch": 1.3078485853425352,
+      "grad_norm": 0.04520969092845917,
+      "kl": 0.015022039413452148,
+      "learning_rate": 3.471178744787948e-06,
+      "loss": 0.0107,
+      "num_tokens": 495988466.0,
+      "reward": 0.07449777098372579,
+      "reward_std": 0.08161820413079113,
+      "rewards/pure_accuracy_reward_math": 0.07449777016881853,
+      "step": 781
+    },
+    {
+      "clip_ratio": 0.00032587463357458546,
+      "epoch": 1.3097606883570854,
+      "grad_norm": 0.04337235167622566,
+      "kl": 0.01485586166381836,
+      "learning_rate": 3.465395233552458e-06,
+      "loss": 0.0107,
+      "step": 782
+    },
+    {
+      "clip_ratio": 0.00031156001216459117,
+      "epoch": 1.311672791371635,
+      "grad_norm": 0.04306100681424141,
+      "kl": 0.014668941497802734,
+      "learning_rate": 3.459605643822721e-06,
+      "loss": 0.0106,
+      "step": 783
+    },
+    {
+      "clip_ratio": 0.00031179932597069637,
+      "epoch": 1.313584894386185,
+      "grad_norm": 0.04292943701148033,
+      "kl": 0.014333724975585938,
+      "learning_rate": 3.4538100120521884e-06,
+      "loss": 0.0106,
+      "step": 784
+    },
+    {
+      "clip_ratio": 0.00034586368491318353,
+      "epoch": 1.315496997400735,
+      "grad_norm": 0.04207218065857887,
+      "kl": 0.013885498046875,
+      "learning_rate": 3.4480083747323527e-06,
+      "loss": 0.0105,
+      "step": 785
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 521.3471217155457,
+      "epoch": 1.3174091004152848,
+      "grad_norm": 0.04057139530777931,
+      "kl": 0.006026268005371094,
+      "learning_rate": 3.4422007683925224e-06,
+      "loss": 0.0119,
+      "num_tokens": 499590878.0,
+      "reward": 0.08091518239234574,
+      "reward_std": 0.08763020328478888,
+      "rewards/pure_accuracy_reward_math": 0.08091518023866229,
+      "step": 786
+    },
+    {
+      "clip_ratio": 0.00030802900647586284,
+      "epoch": 1.3193212034298347,
+      "grad_norm": 0.039306215941905975,
+      "kl": 0.00603485107421875,
+      "learning_rate": 3.436387229599587e-06,
+      "loss": 0.0119,
+      "step": 787
+    },
+    {
+      "clip_ratio": 0.00034579116845634417,
+      "epoch": 1.3212333064443846,
+      "grad_norm": 0.03839893266558647,
+      "kl": 0.006104469299316406,
+      "learning_rate": 3.4305677949577915e-06,
+      "loss": 0.0118,
+      "step": 788
+    },
+    {
+      "clip_ratio": 0.00036078316020393686,
+      "epoch": 1.3231454094589346,
+      "grad_norm": 0.03700988367199898,
+      "kl": 0.006115436553955078,
+      "learning_rate": 3.4247425011084993e-06,
+      "loss": 0.0118,
+      "step": 789
+    },
+    {
+      "clip_ratio": 0.0003916456239494437,
+      "epoch": 1.3250575124734845,
+      "grad_norm": 0.03749685734510422,
+      "kl": 0.006115436553955078,
+      "learning_rate": 3.418911384729971e-06,
+      "loss": 0.0117,
+      "step": 790
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 502.7112407684326,
+      "epoch": 1.3269696154880344,
+      "grad_norm": 0.03917763754725456,
+      "kl": 0.009302139282226562,
+      "learning_rate": 3.413074482537123e-06,
+      "loss": 0.0077,
+      "num_tokens": 503128079.0,
+      "reward": 0.07059152112924494,
+      "reward_std": 0.07702752505429089,
+      "rewards/pure_accuracy_reward_math": 0.07059151944122277,
+      "step": 791
+    },
+    {
+      "clip_ratio": 0.0002787132019079763,
+      "epoch": 1.3288817185025843,
+      "grad_norm": 0.03894754871726036,
+      "kl": 0.009203910827636719,
+      "learning_rate": 3.4072318312813044e-06,
+      "loss": 0.0077,
+      "step": 792
+    },
+    {
+      "clip_ratio": 0.00031091465683630304,
+      "epoch": 1.3307938215171342,
+      "grad_norm": 0.03774462640285492,
+      "kl": 0.008921146392822266,
+      "learning_rate": 3.4013834677500612e-06,
+      "loss": 0.0077,
+      "step": 793
+    },
+    {
+      "clip_ratio": 0.00030987418773520403,
+      "epoch": 1.3327059245316841,
+      "grad_norm": 0.03737964481115341,
+      "kl": 0.008791923522949219,
+      "learning_rate": 3.395529428766907e-06,
+      "loss": 0.0076,
+      "step": 794
+    },
+    {
+      "clip_ratio": 0.0003597256319380904,
+      "epoch": 1.334618027546234,
+      "grad_norm": 0.03793202340602875,
+      "kl": 0.008593559265136719,
+      "learning_rate": 3.3896697511910898e-06,
+      "loss": 0.0075,
+      "step": 795
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 516.8552160263062,
+      "epoch": 1.336530130560784,
+      "grad_norm": 0.03877223655581474,
+      "kl": 0.005873441696166992,
+      "learning_rate": 3.3838044719173603e-06,
+      "loss": 0.0086,
+      "num_tokens": 506711636.0,
+      "reward": 0.06529018195578828,
+      "reward_std": 0.06942774722119793,
+      "rewards/pure_accuracy_reward_math": 0.06529017997672781,
+      "step": 796
+    },
+    {
+      "clip_ratio": 0.0002862633294853367,
+      "epoch": 1.3384422335753339,
+      "grad_norm": 0.0376199446618557,
+      "kl": 0.005820274353027344,
+      "learning_rate": 3.377933627875739e-06,
+      "loss": 0.0086,
+      "step": 797
+    },
+    {
+      "clip_ratio": 0.0002861461452994263,
+      "epoch": 1.3403543365898838,
+      "grad_norm": 0.036890070885419846,
+      "kl": 0.005822658538818359,
+      "learning_rate": 3.3720572560312854e-06,
+      "loss": 0.0086,
+      "step": 798
+    },
+    {
+      "clip_ratio": 0.0003201163677317709,
+      "epoch": 1.3422664396044337,
+      "grad_norm": 0.03669756278395653,
+      "kl": 0.005821704864501953,
+      "learning_rate": 3.366175393383863e-06,
+      "loss": 0.0085,
+      "step": 799
+    },
+    {
+      "clip_ratio": 0.0003494162402830625,
+      "epoch": 1.3441785426189836,
+      "grad_norm": 0.03721420839428902,
+      "kl": 0.005818843841552734,
+      "learning_rate": 3.360288076967909e-06,
+      "loss": 0.0084,
+      "step": 800
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 505.6105146408081,
+      "epoch": 1.3460906456335335,
+      "grad_norm": 0.040034398436546326,
+      "kl": 0.006266117095947266,
+      "learning_rate": 3.3543953438521983e-06,
+      "loss": 0.0091,
+      "num_tokens": 510255728.0,
+      "reward": 0.0675223250000272,
+      "reward_std": 0.07577886182116345,
+      "rewards/pure_accuracy_reward_math": 0.06752232249709778,
+      "step": 801
+    },
+    {
+      "clip_ratio": 0.00027677676553139463,
+      "epoch": 1.3480027486480834,
+      "grad_norm": 0.038657769560813904,
+      "kl": 0.006215572357177734,
+      "learning_rate": 3.3484972311396114e-06,
+      "loss": 0.0091,
+      "step": 802
+    },
+    {
+      "clip_ratio": 0.0002909586188479807,
+      "epoch": 1.3499148516626334,
+      "grad_norm": 0.036970507353544235,
+      "kl": 0.006129741668701172,
+      "learning_rate": 3.342593775966901e-06,
+      "loss": 0.009,
+      "step": 803
+    },
+    {
+      "clip_ratio": 0.0003427068459700422,
+      "epoch": 1.3518269546771833,
+      "grad_norm": 0.03707785904407501,
+      "kl": 0.006056785583496094,
+      "learning_rate": 3.3366850155044595e-06,
+      "loss": 0.009,
+      "step": 804
+    },
+    {
+      "clip_ratio": 0.00038909467849634893,
+      "epoch": 1.3537390576917332,
+      "grad_norm": 0.03700149059295654,
+      "kl": 0.005985736846923828,
+      "learning_rate": 3.33077098695608e-06,
+      "loss": 0.0089,
+      "step": 805
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 527.0212287902832,
+      "epoch": 1.355651160706283,
+      "grad_norm": 0.04373861476778984,
+      "kl": 0.005824565887451172,
+      "learning_rate": 3.3248517275587292e-06,
+      "loss": 0.0094,
+      "num_tokens": 513879112.0,
+      "reward": 0.0703125029685907,
+      "reward_std": 0.08085364429280162,
+      "rewards/pure_accuracy_reward_math": 0.07031250145519152,
+      "step": 806
+    },
+    {
+      "clip_ratio": 0.00031092700191948097,
+      "epoch": 1.357563263720833,
+      "grad_norm": 0.04273909702897072,
+      "kl": 0.0058460235595703125,
+      "learning_rate": 3.318927274582307e-06,
+      "loss": 0.0094,
+      "step": 807
+    },
+    {
+      "clip_ratio": 0.0003359753473546334,
+      "epoch": 1.359475366735383,
+      "grad_norm": 0.04217194393277168,
+      "kl": 0.005980014801025391,
+      "learning_rate": 3.312997665329414e-06,
+      "loss": 0.0093,
+      "step": 808
+    },
+    {
+      "clip_ratio": 0.0003392697701940506,
+      "epoch": 1.3613874697499329,
+      "grad_norm": 0.04189891368150711,
+      "kl": 0.0061492919921875,
+      "learning_rate": 3.3070629371351176e-06,
+      "loss": 0.0093,
+      "step": 809
+    },
+    {
+      "clip_ratio": 0.0003985974152556082,
+      "epoch": 1.3632995727644825,
+      "grad_norm": 0.04113880172371864,
+      "kl": 0.0062618255615234375,
+      "learning_rate": 3.3011231273667155e-06,
+      "loss": 0.0092,
+      "step": 810
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 523.8002490997314,
+      "epoch": 1.3652116757790327,
+      "grad_norm": 0.039511535316705704,
+      "kl": 0.007502555847167969,
+      "learning_rate": 3.295178273423501e-06,
+      "loss": 0.0065,
+      "num_tokens": 517489928.0,
+      "reward": 0.06835937840514816,
+      "reward_std": 0.0761642413563095,
+      "rewards/pure_accuracy_reward_math": 0.06835937636788003,
+      "step": 811
+    },
+    {
+      "clip_ratio": 0.00033993283830113796,
+      "epoch": 1.3671237787935824,
+      "grad_norm": 0.03911852091550827,
+      "kl": 0.0074634552001953125,
+      "learning_rate": 3.2892284127365277e-06,
+      "loss": 0.0065,
+      "step": 812
+    },
+    {
+      "clip_ratio": 0.00029188678922764666,
+      "epoch": 1.3690358818081325,
+      "grad_norm": 0.038789719343185425,
+      "kl": 0.007461071014404297,
+      "learning_rate": 3.2832735827683733e-06,
+      "loss": 0.0064,
+      "step": 813
+    },
+    {
+      "clip_ratio": 0.00031692377649505943,
+      "epoch": 1.3709479848226822,
+      "grad_norm": 0.03795900195837021,
+      "kl": 0.007411956787109375,
+      "learning_rate": 3.2773138210129037e-06,
+      "loss": 0.0063,
+      "step": 814
+    },
+    {
+      "clip_ratio": 0.0003394908647464945,
+      "epoch": 1.3728600878372323,
+      "grad_norm": 0.03683575242757797,
+      "kl": 0.0073795318603515625,
+      "learning_rate": 3.2713491649950375e-06,
+      "loss": 0.0063,
+      "step": 815
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 527.1018648147583,
+      "epoch": 1.374772190851782,
+      "grad_norm": 0.036948177963495255,
+      "kl": 0.0058441162109375,
+      "learning_rate": 3.26537965227051e-06,
+      "loss": 0.0062,
+      "num_tokens": 521113961.0,
+      "reward": 0.06333705675206147,
+      "reward_std": 0.07041122711962089,
+      "rewards/pure_accuracy_reward_math": 0.06333705494762398,
+      "step": 816
+    },
+    {
+      "clip_ratio": 0.0002517415915690435,
+      "epoch": 1.3766842938663322,
+      "grad_norm": 0.03634682297706604,
+      "kl": 0.005847454071044922,
+      "learning_rate": 3.2594053204256344e-06,
+      "loss": 0.0062,
+      "step": 817
+    },
+    {
+      "clip_ratio": 0.00027403954436522326,
+      "epoch": 1.3785963968808819,
+      "grad_norm": 0.034690070897340775,
+      "kl": 0.005870342254638672,
+      "learning_rate": 3.253426207077069e-06,
+      "loss": 0.0062,
+      "step": 818
+    },
+    {
+      "clip_ratio": 0.0002389855896467452,
+      "epoch": 1.3805084998954318,
+      "grad_norm": 0.034505974501371384,
+      "kl": 0.005900382995605469,
+      "learning_rate": 3.2474423498715772e-06,
+      "loss": 0.0061,
+      "step": 819
+    },
+    {
+      "clip_ratio": 0.000287152882663122,
+      "epoch": 1.3824206029099817,
+      "grad_norm": 0.03524321690201759,
+      "kl": 0.005913734436035156,
+      "learning_rate": 3.241453786485792e-06,
+      "loss": 0.0061,
+      "step": 820
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 509.66520071029663,
+      "epoch": 1.3843327059245316,
+      "grad_norm": 0.039214182645082474,
+      "kl": 0.006892681121826172,
+      "learning_rate": 3.2354605546259777e-06,
+      "loss": 0.0032,
+      "num_tokens": 524677265.0,
+      "reward": 0.07979911041911691,
+      "reward_std": 0.07959878293331712,
+      "rewards/pure_accuracy_reward_math": 0.07979910867288709,
+      "step": 821
+    },
+    {
+      "clip_ratio": 0.0002965318878409562,
+      "epoch": 1.3862448089390815,
+      "grad_norm": 0.037640273571014404,
+      "kl": 0.0067348480224609375,
+      "learning_rate": 3.2294626920277928e-06,
+      "loss": 0.0031,
+      "step": 822
+    },
+    {
+      "clip_ratio": 0.00035153192868619954,
+      "epoch": 1.3881569119536314,
+      "grad_norm": 0.038182858377695084,
+      "kl": 0.006665706634521484,
+      "learning_rate": 3.2234602364560543e-06,
+      "loss": 0.0031,
+      "step": 823
+    },
+    {
+      "clip_ratio": 0.0003338070732752385,
+      "epoch": 1.3900690149681814,
+      "grad_norm": 0.038163840770721436,
+      "kl": 0.00667572021484375,
+      "learning_rate": 3.2174532257044957e-06,
+      "loss": 0.003,
+      "step": 824
+    },
+    {
+      "clip_ratio": 0.0003418834434683049,
+      "epoch": 1.3919811179827313,
+      "grad_norm": 0.03628409281373024,
+      "kl": 0.0067596435546875,
+      "learning_rate": 3.2114416975955347e-06,
+      "loss": 0.003,
+      "step": 825
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 519.1027045249939,
+      "epoch": 1.3938932209972812,
+      "grad_norm": 0.037393856793642044,
+      "kl": 0.005987644195556641,
+      "learning_rate": 3.20542568998003e-06,
+      "loss": 0.0097,
+      "num_tokens": 528270425.0,
+      "reward": 0.07784598556463607,
+      "reward_std": 0.0774529695045203,
+      "rewards/pure_accuracy_reward_math": 0.07784598329453729,
+      "step": 826
+    },
+    {
+      "clip_ratio": 0.0002753000243274073,
+      "epoch": 1.395805324011831,
+      "grad_norm": 0.03632253408432007,
+      "kl": 0.00603485107421875,
+      "learning_rate": 3.199405240737045e-06,
+      "loss": 0.0097,
+      "step": 827
+    },
+    {
+      "clip_ratio": 0.00028145005671831314,
+      "epoch": 1.397717427026381,
+      "grad_norm": 0.035320475697517395,
+      "kl": 0.0060482025146484375,
+      "learning_rate": 3.1933803877736103e-06,
+      "loss": 0.0097,
+      "step": 828
+    },
+    {
+      "clip_ratio": 0.00029773840276448027,
+      "epoch": 1.399629530040931,
+      "grad_norm": 0.03532904013991356,
+      "kl": 0.006001472473144531,
+      "learning_rate": 3.187351169024483e-06,
+      "loss": 0.0096,
+      "step": 829
+    },
+    {
+      "clip_ratio": 0.0003131672060590063,
+      "epoch": 1.4015416330554809,
+      "grad_norm": 0.03497399017214775,
+      "kl": 0.0059299468994140625,
+      "learning_rate": 3.181317622451909e-06,
+      "loss": 0.0095,
+      "step": 830
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 519.5547099113464,
+      "epoch": 1.4034537360700308,
+      "grad_norm": 0.03596203401684761,
+      "kl": 0.005957126617431641,
+      "learning_rate": 3.1752797860453854e-06,
+      "loss": 0.0099,
+      "num_tokens": 531863545.0,
+      "reward": 0.06584821754950099,
+      "reward_std": 0.07359298237133771,
+      "rewards/pure_accuracy_reward_math": 0.06584821580327116,
+      "step": 831
+    },
+    {
+      "clip_ratio": 0.0002871401754873659,
+      "epoch": 1.4053658390845807,
+      "grad_norm": 0.03569914028048515,
+      "kl": 0.005918025970458984,
+      "learning_rate": 3.169237697821417e-06,
+      "loss": 0.0099,
+      "step": 832
+    },
+    {
+      "clip_ratio": 0.0002649255456503852,
+      "epoch": 1.4072779420991306,
+      "grad_norm": 0.035189539194107056,
+      "kl": 0.005944252014160156,
+      "learning_rate": 3.163191395823281e-06,
+      "loss": 0.0098,
+      "step": 833
+    },
+    {
+      "clip_ratio": 0.0002522150609252094,
+      "epoch": 1.4091900451136805,
+      "grad_norm": 0.03371162712574005,
+      "kl": 0.006028652191162109,
+      "learning_rate": 3.1571409181207867e-06,
+      "loss": 0.0098,
+      "step": 834
+    },
+    {
+      "clip_ratio": 0.00028182740913962334,
+      "epoch": 1.4111021481282304,
+      "grad_norm": 0.03411802276968956,
+      "kl": 0.006129264831542969,
+      "learning_rate": 3.151086302810035e-06,
+      "loss": 0.0097,
+      "step": 835
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 509.0455017089844,
+      "epoch": 1.4130142511427803,
+      "grad_norm": 0.042647283524274826,
+      "kl": 0.006505012512207031,
+      "learning_rate": 3.1450275880131782e-06,
+      "loss": 0.0051,
+      "num_tokens": 535420068.0,
+      "reward": 0.06919643201399595,
+      "reward_std": 0.06989945442182943,
+      "rewards/pure_accuracy_reward_math": 0.06919642980210483,
+      "step": 836
+    },
+    {
+      "clip_ratio": 0.0002792542761653749,
+      "epoch": 1.4149263541573303,
+      "grad_norm": 0.03879564628005028,
+      "kl": 0.006262302398681641,
+      "learning_rate": 3.1389648118781795e-06,
+      "loss": 0.0051,
+      "step": 837
+    },
+    {
+      "clip_ratio": 0.00032867032479089175,
+      "epoch": 1.4168384571718802,
+      "grad_norm": 0.03632555902004242,
+      "kl": 0.006078004837036133,
+      "learning_rate": 3.132898012578577e-06,
+      "loss": 0.005,
+      "step": 838
+    },
+    {
+      "clip_ratio": 0.0003705890379706034,
+      "epoch": 1.41875056018643,
+      "grad_norm": 0.03687159717082977,
+      "kl": 0.0058705806732177734,
+      "learning_rate": 3.1268272283132374e-06,
+      "loss": 0.005,
+      "step": 839
+    },
+    {
+      "clip_ratio": 0.00039090512018447043,
+      "epoch": 1.42066266320098,
+      "grad_norm": 0.03681857883930206,
+      "kl": 0.005755186080932617,
+      "learning_rate": 3.1207524973061183e-06,
+      "loss": 0.0049,
+      "step": 840
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 528.0865178108215,
+      "epoch": 1.42257476621553,
+      "grad_norm": 0.077212393283844,
+      "kl": 0.006708621978759766,
+      "learning_rate": 3.1146738578060293e-06,
+      "loss": 0.0034,
+      "num_tokens": 539042994.0,
+      "reward": 0.05468750235741027,
+      "reward_std": 0.06221334764268249,
+      "rewards/pure_accuracy_reward_math": 0.05468750130967237,
+      "step": 841
+    },
+    {
+      "clip_ratio": 0.00023407521496210393,
+      "epoch": 1.4244868692300798,
+      "grad_norm": 0.03766750544309616,
+      "kl": 0.005887508392333984,
+      "learning_rate": 3.108591348086388e-06,
+      "loss": 0.0034,
+      "step": 842
+    },
+    {
+      "clip_ratio": 0.00021864835269980176,
+      "epoch": 1.4263989722446297,
+      "grad_norm": 0.03435171768069267,
+      "kl": 0.0057353973388671875,
+      "learning_rate": 3.102505006444981e-06,
+      "loss": 0.0033,
+      "step": 843
+    },
+    {
+      "clip_ratio": 0.0002327330819866802,
+      "epoch": 1.4283110752591797,
+      "grad_norm": 0.03385370597243309,
+      "kl": 0.005730628967285156,
+      "learning_rate": 3.096414871203721e-06,
+      "loss": 0.0033,
+      "step": 844
+    },
+    {
+      "clip_ratio": 0.00025595308994752486,
+      "epoch": 1.4302231782737296,
+      "grad_norm": 0.0320701077580452,
+      "kl": 0.005660533905029297,
+      "learning_rate": 3.0903209807084085e-06,
+      "loss": 0.0032,
+      "step": 845
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 532.2009177207947,
+      "epoch": 1.4321352812882795,
+      "grad_norm": 0.035687774419784546,
+      "kl": 0.006323099136352539,
+      "learning_rate": 3.0842233733284866e-06,
+      "loss": 0.0055,
+      "num_tokens": 542686090.0,
+      "reward": 0.06389509252039716,
+      "reward_std": 0.06839800346642733,
+      "rewards/pure_accuracy_reward_math": 0.06389509059954435,
+      "step": 846
+    },
+    {
+      "clip_ratio": 0.0002455309293054597,
+      "epoch": 1.4340473843028292,
+      "grad_norm": 0.03433489799499512,
+      "kl": 0.006294965744018555,
+      "learning_rate": 3.078122087456802e-06,
+      "loss": 0.0055,
+      "step": 847
+    },
+    {
+      "clip_ratio": 0.0003179283777399178,
+      "epoch": 1.4359594873173793,
+      "grad_norm": 0.03377856686711311,
+      "kl": 0.00630497932434082,
+      "learning_rate": 3.072017161509364e-06,
+      "loss": 0.0054,
+      "step": 848
+    },
+    {
+      "clip_ratio": 0.00030606188772708265,
+      "epoch": 1.437871590331929,
+      "grad_norm": 0.03379327058792114,
+      "kl": 0.006325483322143555,
+      "learning_rate": 3.065908633925099e-06,
+      "loss": 0.0054,
+      "step": 849
+    },
+    {
+      "clip_ratio": 0.00029904921905199444,
+      "epoch": 1.4397836933464792,
+      "grad_norm": 0.03319833427667618,
+      "kl": 0.006340742111206055,
+      "learning_rate": 3.0597965431656125e-06,
+      "loss": 0.0053,
+      "step": 850
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 520.9991841316223,
+      "epoch": 1.00191210301455,
+      "grad_norm": 0.03730909898877144,
+      "kl": 0.005851268768310547,
+      "learning_rate": 3.0536809277149433e-06,
+      "loss": 0.0058,
+      "num_tokens": 3602593.0,
+      "reward": 0.061662948777666315,
+      "reward_std": 0.0712745109340176,
+      "rewards/pure_accuracy_reward_math": 0.06166294767172076,
+      "step": 851
+    },
+    {
+      "clip_ratio": 0.0002445870232463676,
+      "epoch": 1.0038242060290998,
+      "grad_norm": 0.036420926451683044,
+      "kl": 0.005807399749755859,
+      "learning_rate": 3.047561826079324e-06,
+      "loss": 0.0057,
+      "step": 852
+    },
+    {
+      "clip_ratio": 0.0002342841784184202,
+      "epoch": 1.0057363090436497,
+      "grad_norm": 0.03534744307398796,
+      "kl": 0.005809783935546875,
+      "learning_rate": 3.041439276786937e-06,
+      "loss": 0.0057,
+      "step": 853
+    },
+    {
+      "clip_ratio": 0.0003130897791834286,
+      "epoch": 1.0076484120581997,
+      "grad_norm": 0.03456578403711319,
+      "kl": 0.005836963653564453,
+      "learning_rate": 3.0353133183876745e-06,
+      "loss": 0.0056,
+      "step": 854
+    },
+    {
+      "clip_ratio": 0.0003235736477336104,
+      "epoch": 1.0095605150727496,
+      "grad_norm": 0.03683493658900261,
+      "kl": 0.00588226318359375,
+      "learning_rate": 3.0291839894528907e-06,
+      "loss": 0.0056,
+      "step": 855
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 529.2422127723694,
+      "epoch": 1.0114726180872995,
+      "grad_norm": 3.6328346729278564,
+      "kl": 0.07409882545471191,
+      "learning_rate": 3.023051328575164e-06,
+      "loss": 0.0092,
+      "num_tokens": 7231613.0,
+      "reward": 0.06696428847499192,
+      "reward_std": 0.07320140569936484,
+      "rewards/pure_accuracy_reward_math": 0.06696428725263104,
+      "step": 856
+    },
+    {
+      "clip_ratio": 0.0002944787788692338,
+      "epoch": 1.0133847211018494,
+      "grad_norm": 0.23805810511112213,
+      "kl": 0.01258087158203125,
+      "learning_rate": 3.016915374368052e-06,
+      "loss": 0.0068,
+      "step": 857
+    },
+    {
+      "clip_ratio": 0.000328014534943577,
+      "epoch": 1.0152968241163993,
+      "grad_norm": 0.038860052824020386,
+      "kl": 0.008163928985595703,
+      "learning_rate": 3.0107761654658464e-06,
+      "loss": 0.0066,
+      "step": 858
+    },
+    {
+      "clip_ratio": 0.00033978425187797257,
+      "epoch": 1.0172089271309492,
+      "grad_norm": 0.037539608776569366,
+      "kl": 0.008237600326538086,
+      "learning_rate": 3.0046337405233334e-06,
+      "loss": 0.0065,
+      "step": 859
+    },
+    {
+      "clip_ratio": 0.0003289994185706746,
+      "epoch": 1.0191210301454992,
+      "grad_norm": 0.03649570420384407,
+      "kl": 0.008342981338500977,
+      "learning_rate": 2.9984881382155484e-06,
+      "loss": 0.0065,
+      "step": 860
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 539.7709541320801,
+      "epoch": 1.021033133160049,
+      "grad_norm": 0.03506062552332878,
+      "kl": 0.0056056976318359375,
+      "learning_rate": 2.9923393972375337e-06,
+      "loss": 0.0075,
+      "num_tokens": 10898500.0,
+      "reward": 0.06389509155997075,
+      "reward_std": 0.07427741104038432,
+      "rewards/pure_accuracy_reward_math": 0.06389509086147882,
+      "step": 861
+    },
+    {
+      "clip_ratio": 0.00025894983372154456,
+      "epoch": 1.022945236174599,
+      "grad_norm": 0.03387964144349098,
+      "kl": 0.005673408508300781,
+      "learning_rate": 2.986187556304091e-06,
+      "loss": 0.0075,
+      "step": 862
+    },
+    {
+      "clip_ratio": 0.00026048227840647087,
+      "epoch": 1.024857339189149,
+      "grad_norm": 0.0339200459420681,
+      "kl": 0.005715370178222656,
+      "learning_rate": 2.9800326541495427e-06,
+      "loss": 0.0074,
+      "step": 863
+    },
+    {
+      "clip_ratio": 0.000286817725225319,
+      "epoch": 1.0267694422036988,
+      "grad_norm": 0.033578090369701385,
+      "kl": 0.0057220458984375,
+      "learning_rate": 2.973874729527486e-06,
+      "loss": 0.0074,
+      "step": 864
+    },
+    {
+      "clip_ratio": 0.00031288620994018856,
+      "epoch": 1.0286815452182487,
+      "grad_norm": 0.03253786265850067,
+      "kl": 0.005726814270019531,
+      "learning_rate": 2.967713821210547e-06,
+      "loss": 0.0073,
+      "step": 865
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 532.484959602356,
+      "epoch": 1.0305936482327986,
+      "grad_norm": 0.040393006056547165,
+      "kl": 0.005712032318115234,
+      "learning_rate": 2.961549967990139e-06,
+      "loss": 0.0094,
+      "num_tokens": 14539070.0,
+      "reward": 0.0700334852153901,
+      "reward_std": 0.07968511193757877,
+      "rewards/pure_accuracy_reward_math": 0.07003348364378326,
+      "step": 866
+    },
+    {
+      "clip_ratio": 0.00034418605622477116,
+      "epoch": 1.0325057512473486,
+      "grad_norm": 0.03829828277230263,
+      "kl": 0.00571441650390625,
+      "learning_rate": 2.95538320867622e-06,
+      "loss": 0.0094,
+      "step": 867
+    },
+    {
+      "clip_ratio": 0.0003270462358386794,
+      "epoch": 1.0344178542618985,
+      "grad_norm": 0.03763904795050621,
+      "kl": 0.005820035934448242,
+      "learning_rate": 2.949213582097042e-06,
+      "loss": 0.0094,
+      "step": 868
+    },
+    {
+      "clip_ratio": 0.00039861036464117205,
+      "epoch": 1.0363299572764482,
+      "grad_norm": 0.03893045708537102,
+      "kl": 0.005897045135498047,
+      "learning_rate": 2.9430411270989112e-06,
+      "loss": 0.0093,
+      "step": 869
+    },
+    {
+      "clip_ratio": 0.0004073582798014286,
+      "epoch": 1.038242060290998,
+      "grad_norm": 0.03808417171239853,
+      "kl": 0.0059051513671875,
+      "learning_rate": 2.9368658825459452e-06,
+      "loss": 0.0092,
+      "step": 870
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 518.7159852981567,
+      "epoch": 1.040154163305548,
+      "grad_norm": 0.03680076450109482,
+      "kl": 0.006183147430419922,
+      "learning_rate": 2.9306878873198227e-06,
+      "loss": 0.0073,
+      "num_tokens": 18123716.0,
+      "reward": 0.06975446810247377,
+      "reward_std": 0.07255704078124836,
+      "rewards/pure_accuracy_reward_math": 0.06975446600699797,
+      "step": 871
+    },
+    {
+      "clip_ratio": 0.00025267474336487794,
+      "epoch": 1.042066266320098,
+      "grad_norm": 0.036574870347976685,
+      "kl": 0.006196498870849609,
+      "learning_rate": 2.9245071803195435e-06,
+      "loss": 0.0072,
+      "step": 872
+    },
+    {
+      "clip_ratio": 0.0002888958638322947,
+      "epoch": 1.0439783693346478,
+      "grad_norm": 0.03539302200078964,
+      "kl": 0.006276130676269531,
+      "learning_rate": 2.9183238004611815e-06,
+      "loss": 0.0072,
+      "step": 873
+    },
+    {
+      "clip_ratio": 0.00027933804358326597,
+      "epoch": 1.0458904723491977,
+      "grad_norm": 0.03457676246762276,
+      "kl": 0.00629425048828125,
+      "learning_rate": 2.912137786677639e-06,
+      "loss": 0.0071,
+      "step": 874
+    },
+    {
+      "clip_ratio": 0.00026495220328115465,
+      "epoch": 1.0478025753637477,
+      "grad_norm": 0.034882258623838425,
+      "kl": 0.006371974945068359,
+      "learning_rate": 2.905949177918403e-06,
+      "loss": 0.0071,
+      "step": 875
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 516.4989104270935,
+      "epoch": 1.0497146783782976,
+      "grad_norm": 0.04403652995824814,
+      "kl": 0.0064754486083984375,
+      "learning_rate": 2.8997580131493004e-06,
+      "loss": 0.0104,
+      "num_tokens": 21706672.0,
+      "reward": 0.07421875311410986,
+      "reward_std": 0.08282060426427051,
+      "rewards/pure_accuracy_reward_math": 0.07421875130967237,
+      "step": 876
+    },
+    {
+      "clip_ratio": 0.00034863107299543117,
+      "epoch": 1.0516267813928475,
+      "grad_norm": 0.040730468928813934,
+      "kl": 0.006359100341796875,
+      "learning_rate": 2.89356433135225e-06,
+      "loss": 0.0104,
+      "step": 877
+    },
+    {
+      "clip_ratio": 0.0003696895219036378,
+      "epoch": 1.0535388844073974,
+      "grad_norm": 0.040028344839811325,
+      "kl": 0.006321430206298828,
+      "learning_rate": 2.8873681715250197e-06,
+      "loss": 0.0104,
+      "step": 878
+    },
+    {
+      "clip_ratio": 0.00041197048278718285,
+      "epoch": 1.0554509874219473,
+      "grad_norm": 0.04009086638689041,
+      "kl": 0.0062351226806640625,
+      "learning_rate": 2.881169572680981e-06,
+      "loss": 0.0103,
+      "step": 879
+    },
+    {
+      "clip_ratio": 0.0004460485272943515,
+      "epoch": 1.0573630904364972,
+      "grad_norm": 0.03965138643980026,
+      "kl": 0.006242275238037109,
+      "learning_rate": 2.87496857384886e-06,
+      "loss": 0.0102,
+      "step": 880
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 524.4285945892334,
+      "epoch": 1.0592751934510471,
+      "grad_norm": 0.03920762613415718,
+      "kl": 0.005979061126708984,
+      "learning_rate": 2.868765214072495e-06,
+      "loss": 0.0082,
+      "num_tokens": 25317588.0,
+      "reward": 0.07338170023285784,
+      "reward_std": 0.0805021328269504,
+      "rewards/pure_accuracy_reward_math": 0.07338169755530544,
+      "step": 881
+    },
+    {
+      "clip_ratio": 0.0003169273815046836,
+      "epoch": 1.061187296465597,
+      "grad_norm": 0.03858224302530289,
+      "kl": 0.006028175354003906,
+      "learning_rate": 2.8625595324105925e-06,
+      "loss": 0.0082,
+      "step": 882
+    },
+    {
+      "clip_ratio": 0.0003076135093351695,
+      "epoch": 1.063099399480147,
+      "grad_norm": 0.03754101321101189,
+      "kl": 0.006089687347412109,
+      "learning_rate": 2.8563515679364733e-06,
+      "loss": 0.0081,
+      "step": 883
+    },
+    {
+      "clip_ratio": 0.0003307215861809709,
+      "epoch": 1.065011502494697,
+      "grad_norm": 0.03692120686173439,
+      "kl": 0.006084442138671875,
+      "learning_rate": 2.850141359737836e-06,
+      "loss": 0.008,
+      "step": 884
+    },
+    {
+      "clip_ratio": 0.0003362660154380137,
+      "epoch": 1.0669236055092468,
+      "grad_norm": 0.03691774606704712,
+      "kl": 0.006087303161621094,
+      "learning_rate": 2.843928946916504e-06,
+      "loss": 0.008,
+      "step": 885
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 541.91938829422,
+      "epoch": 1.0688357085237967,
+      "grad_norm": 0.03421162813901901,
+      "kl": 0.005934238433837891,
+      "learning_rate": 2.8377143685881835e-06,
+      "loss": 0.0048,
+      "num_tokens": 28991667.0,
+      "reward": 0.06138393090805039,
+      "reward_std": 0.05770279868738726,
+      "rewards/pure_accuracy_reward_math": 0.06138392991852015,
+      "step": 886
+    },
+    {
+      "clip_ratio": 0.00021627708133564738,
+      "epoch": 1.0707478115383466,
+      "grad_norm": 0.0331665463745594,
+      "kl": 0.005833148956298828,
+      "learning_rate": 2.8314976638822145e-06,
+      "loss": 0.0048,
+      "step": 887
+    },
+    {
+      "clip_ratio": 0.00023772416773226723,
+      "epoch": 1.0726599145528966,
+      "grad_norm": 0.03265010192990303,
+      "kl": 0.00572967529296875,
+      "learning_rate": 2.825278871941325e-06,
+      "loss": 0.0048,
+      "step": 888
+    },
+    {
+      "clip_ratio": 0.000255867875353033,
+      "epoch": 1.0745720175674465,
+      "grad_norm": 0.031934551894664764,
+      "kl": 0.0056514739990234375,
+      "learning_rate": 2.819058031921387e-06,
+      "loss": 0.0047,
+      "step": 889
+    },
+    {
+      "clip_ratio": 0.0002752940895334177,
+      "epoch": 1.0764841205819964,
+      "grad_norm": 0.03180062025785446,
+      "kl": 0.005589008331298828,
+      "learning_rate": 2.812835182991166e-06,
+      "loss": 0.0047,
+      "step": 890
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 541.6253051757812,
+      "epoch": 1.0783962235965463,
+      "grad_norm": 0.0352044515311718,
+      "kl": 0.006504535675048828,
+      "learning_rate": 2.8066103643320774e-06,
+      "loss": 0.005,
+      "num_tokens": 32662984.0,
+      "reward": 0.07003348544822074,
+      "reward_std": 0.07148103549843654,
+      "rewards/pure_accuracy_reward_math": 0.07003348341095261,
+      "step": 891
+    },
+    {
+      "clip_ratio": 0.0002908879878305015,
+      "epoch": 1.0803083266110962,
+      "grad_norm": 0.03477974981069565,
+      "kl": 0.006473064422607422,
+      "learning_rate": 2.800383615137939e-06,
+      "loss": 0.0049,
+      "step": 892
+    },
+    {
+      "clip_ratio": 0.00027559091887496834,
+      "epoch": 1.0822204296256461,
+      "grad_norm": 0.03371204808354378,
+      "kl": 0.006519317626953125,
+      "learning_rate": 2.7941549746147234e-06,
+      "loss": 0.0049,
+      "step": 893
+    },
+    {
+      "clip_ratio": 0.00026331023877901316,
+      "epoch": 1.084132532640196,
+      "grad_norm": 0.03233867511153221,
+      "kl": 0.00655364990234375,
+      "learning_rate": 2.7879244819803104e-06,
+      "loss": 0.0048,
+      "step": 894
+    },
+    {
+      "clip_ratio": 0.0003059378379361988,
+      "epoch": 1.086044635654746,
+      "grad_norm": 0.032591916620731354,
+      "kl": 0.006562709808349609,
+      "learning_rate": 2.781692176464244e-06,
+      "loss": 0.0048,
+      "step": 895
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 538.9467296600342,
+      "epoch": 1.0879567386692959,
+      "grad_norm": 0.0399605967104435,
+      "kl": 0.007935047149658203,
+      "learning_rate": 2.7754580973074817e-06,
+      "loss": 0.0078,
+      "num_tokens": 36327265.0,
+      "reward": 0.06640625328873284,
+      "reward_std": 0.07582512497901917,
+      "rewards/pure_accuracy_reward_math": 0.06640625142608769,
+      "step": 896
+    },
+    {
+      "clip_ratio": 0.00029080147635340836,
+      "epoch": 1.0898688416838458,
+      "grad_norm": 0.036669787019491196,
+      "kl": 0.007892131805419922,
+      "learning_rate": 2.769222283762148e-06,
+      "loss": 0.0077,
+      "step": 897
+    },
+    {
+      "clip_ratio": 0.0003202801690349588,
+      "epoch": 1.0917809446983957,
+      "grad_norm": 0.036093369126319885,
+      "kl": 0.007870197296142578,
+      "learning_rate": 2.7629847750912885e-06,
+      "loss": 0.0077,
+      "step": 898
+    },
+    {
+      "clip_ratio": 0.00034906711715620986,
+      "epoch": 1.0936930477129456,
+      "grad_norm": 0.036899976432323456,
+      "kl": 0.007824897766113281,
+      "learning_rate": 2.756745610568622e-06,
+      "loss": 0.0076,
+      "step": 899
+    },
+    {
+      "clip_ratio": 0.0003909627172333785,
+      "epoch": 1.0956051507274955,
+      "grad_norm": 0.03607386723160744,
+      "kl": 0.00782632827758789,
+      "learning_rate": 2.7505048294782914e-06,
+      "loss": 0.0076,
+      "step": 900
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 519.9687776565552,
+      "epoch": 1.0975172537420455,
+      "grad_norm": 0.04138408601284027,
+      "kl": 0.006854534149169922,
+      "learning_rate": 2.7442624711146206e-06,
+      "loss": 0.0105,
+      "num_tokens": 39926261.0,
+      "reward": 0.07561384263681248,
+      "reward_std": 0.08660046180011705,
+      "rewards/pure_accuracy_reward_math": 0.07561384089058265,
+      "step": 901
+    },
+    {
+      "clip_ratio": 0.0003407098130878694,
+      "epoch": 1.0994293567565951,
+      "grad_norm": 0.04008745029568672,
+      "kl": 0.006922245025634766,
+      "learning_rate": 2.7380185747818628e-06,
+      "loss": 0.0105,
+      "step": 902
+    },
+    {
+      "clip_ratio": 0.0003345158028196238,
+      "epoch": 1.1013414597711453,
+      "grad_norm": 0.039206936955451965,
+      "kl": 0.006981372833251953,
+      "learning_rate": 2.7317731797939566e-06,
+      "loss": 0.0104,
+      "step": 903
+    },
+    {
+      "clip_ratio": 0.0003512224284918375,
+      "epoch": 1.103253562785695,
+      "grad_norm": 0.03816502168774605,
+      "kl": 0.006984233856201172,
+      "learning_rate": 2.7255263254742746e-06,
+      "loss": 0.0103,
+      "step": 904
+    },
+    {
+      "clip_ratio": 0.00038539456500075175,
+      "epoch": 1.105165665800245,
+      "grad_norm": 0.03802499175071716,
+      "kl": 0.006890773773193359,
+      "learning_rate": 2.71927805115538e-06,
+      "loss": 0.0103,
+      "step": 905
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 522.6635279655457,
+      "epoch": 1.1070777688147948,
+      "grad_norm": 0.03780652955174446,
+      "kl": 0.005947589874267578,
+      "learning_rate": 2.713028396178776e-06,
+      "loss": 0.0044,
+      "num_tokens": 43530039.0,
+      "reward": 0.0691964318684768,
+      "reward_std": 0.0774129043566063,
+      "rewards/pure_accuracy_reward_math": 0.06919642988941632,
+      "step": 906
+    },
+    {
+      "clip_ratio": 0.0002883933650537074,
+      "epoch": 1.1089898718293447,
+      "grad_norm": 0.03706151619553566,
+      "kl": 0.005948543548583984,
+      "learning_rate": 2.706777399894656e-06,
+      "loss": 0.0044,
+      "step": 907
+    },
+    {
+      "clip_ratio": 0.0003032470573316459,
+      "epoch": 1.1109019748438946,
+      "grad_norm": 0.03684515878558159,
+      "kl": 0.005936622619628906,
+      "learning_rate": 2.700525101661665e-06,
+      "loss": 0.0044,
+      "step": 908
+    },
+    {
+      "clip_ratio": 0.0003385747261290817,
+      "epoch": 1.1128140778584446,
+      "grad_norm": 0.03632361814379692,
+      "kl": 0.005986690521240234,
+      "learning_rate": 2.6942715408466406e-06,
+      "loss": 0.0043,
+      "step": 909
+    },
+    {
+      "clip_ratio": 0.00035084231319615355,
+      "epoch": 1.1147261808729945,
+      "grad_norm": 0.0364714041352272,
+      "kl": 0.005983829498291016,
+      "learning_rate": 2.6880167568243716e-06,
+      "loss": 0.0042,
+      "step": 910
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 524.6629705429077,
+      "epoch": 1.1166382838875444,
+      "grad_norm": 0.037073228508234024,
+      "kl": 0.006183624267578125,
+      "learning_rate": 2.681760788977349e-06,
+      "loss": 0.0075,
+      "num_tokens": 47140667.0,
+      "reward": 0.06166294956346974,
+      "reward_std": 0.07140090485336259,
+      "rewards/pure_accuracy_reward_math": 0.061662947526201606,
+      "step": 911
+    },
+    {
+      "clip_ratio": 0.00026335007953548484,
+      "epoch": 1.1185503869020943,
+      "grad_norm": 0.03628791868686676,
+      "kl": 0.006221771240234375,
+      "learning_rate": 2.6755036766955172e-06,
+      "loss": 0.0075,
+      "step": 912
+    },
+    {
+      "clip_ratio": 0.00029098790395210017,
+      "epoch": 1.1204624899166442,
+      "grad_norm": 0.03659017011523247,
+      "kl": 0.006258964538574219,
+      "learning_rate": 2.6692454593760255e-06,
+      "loss": 0.0075,
+      "step": 913
+    },
+    {
+      "clip_ratio": 0.00033703100632465066,
+      "epoch": 1.1223745929311941,
+      "grad_norm": 0.0357106551527977,
+      "kl": 0.006211757659912109,
+      "learning_rate": 2.6629861764229824e-06,
+      "loss": 0.0074,
+      "step": 914
+    },
+    {
+      "clip_ratio": 0.0003104925490902133,
+      "epoch": 1.124286695945744,
+      "grad_norm": 0.03461490571498871,
+      "kl": 0.006183624267578125,
+      "learning_rate": 2.6567258672472064e-06,
+      "loss": 0.0073,
+      "step": 915
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 519.3962297439575,
+      "epoch": 1.126198798960294,
+      "grad_norm": 0.038919847458601,
+      "kl": 0.0060977935791015625,
+      "learning_rate": 2.650464571265975e-06,
+      "loss": 0.0062,
+      "num_tokens": 50733111.0,
+      "reward": 0.06584821734577417,
+      "reward_std": 0.07367311330744997,
+      "rewards/pure_accuracy_reward_math": 0.06584821583237499,
+      "step": 916
+    },
+    {
+      "clip_ratio": 0.0002951280029606096,
+      "epoch": 1.1281109019748439,
+      "grad_norm": 0.038201622664928436,
+      "kl": 0.0060329437255859375,
+      "learning_rate": 2.6442023279027805e-06,
+      "loss": 0.0061,
+      "step": 917
+    },
+    {
+      "clip_ratio": 0.00029004437487856194,
+      "epoch": 1.1300230049893938,
+      "grad_norm": 0.03696547448635101,
+      "kl": 0.006039619445800781,
+      "learning_rate": 2.6379391765870828e-06,
+      "loss": 0.0061,
+      "step": 918
+    },
+    {
+      "clip_ratio": 0.0003163389113183257,
+      "epoch": 1.1319351080039437,
+      "grad_norm": 0.03571280464529991,
+      "kl": 0.006005764007568359,
+      "learning_rate": 2.6316751567540527e-06,
+      "loss": 0.006,
+      "step": 919
+    },
+    {
+      "clip_ratio": 0.0003592208154259424,
+      "epoch": 1.1338472110184936,
+      "grad_norm": 0.03568287193775177,
+      "kl": 0.005993366241455078,
+      "learning_rate": 2.625410307844335e-06,
+      "loss": 0.006,
+      "step": 920
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 538.2659268379211,
+      "epoch": 1.1357593140330435,
+      "grad_norm": 0.03899242356419563,
+      "kl": 0.005813121795654297,
+      "learning_rate": 2.6191446693037924e-06,
+      "loss": 0.0071,
+      "num_tokens": 54398312.0,
+      "reward": 0.07226562857977115,
+      "reward_std": 0.07861530320951715,
+      "rewards/pure_accuracy_reward_math": 0.07226562648429535,
+      "step": 921
+    },
+    {
+      "clip_ratio": 0.00029711308371815903,
+      "epoch": 1.1376714170475934,
+      "grad_norm": 0.038164544850587845,
+      "kl": 0.0058841705322265625,
+      "learning_rate": 2.6128782805832605e-06,
+      "loss": 0.0071,
+      "step": 922
+    },
+    {
+      "clip_ratio": 0.0003027216810664868,
+      "epoch": 1.1395835200621434,
+      "grad_norm": 0.03706645965576172,
+      "kl": 0.005882740020751953,
+      "learning_rate": 2.606611181138295e-06,
+      "loss": 0.007,
+      "step": 923
+    },
+    {
+      "clip_ratio": 0.00032618250162386175,
+      "epoch": 1.1414956230766933,
+      "grad_norm": 0.036637816578149796,
+      "kl": 0.005909442901611328,
+      "learning_rate": 2.600343410428931e-06,
+      "loss": 0.007,
+      "step": 924
+    },
+    {
+      "clip_ratio": 0.00032713054685018506,
+      "epoch": 1.1434077260912432,
+      "grad_norm": 0.036758605390787125,
+      "kl": 0.005947589874267578,
+      "learning_rate": 2.5940750079194275e-06,
+      "loss": 0.0069,
+      "step": 925
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 542.0072803497314,
+      "epoch": 1.145319829105793,
+      "grad_norm": 0.03791532665491104,
+      "kl": 0.0061702728271484375,
+      "learning_rate": 2.5878060130780225e-06,
+      "loss": 0.0074,
+      "num_tokens": 58073722.0,
+      "reward": 0.06835937863797881,
+      "reward_std": 0.07715391897363588,
+      "rewards/pure_accuracy_reward_math": 0.06835937636788003,
+      "step": 926
+    },
+    {
+      "clip_ratio": 0.00030884258325158953,
+      "epoch": 1.147231932120343,
+      "grad_norm": 0.03749171644449234,
+      "kl": 0.006160736083984375,
+      "learning_rate": 2.581536465376684e-06,
+      "loss": 0.0074,
+      "step": 927
+    },
+    {
+      "clip_ratio": 0.000279198229350186,
+      "epoch": 1.149144035134893,
+      "grad_norm": 0.03681938722729683,
+      "kl": 0.006136417388916016,
+      "learning_rate": 2.575266404290859e-06,
+      "loss": 0.0073,
+      "step": 928
+    },
+    {
+      "clip_ratio": 0.0002930849948370451,
+      "epoch": 1.1510561381494429,
+      "grad_norm": 0.035750068724155426,
+      "kl": 0.006227970123291016,
+      "learning_rate": 2.5689958692992284e-06,
+      "loss": 0.0072,
+      "step": 929
+    },
+    {
+      "clip_ratio": 0.00028936977611238035,
+      "epoch": 1.1529682411639928,
+      "grad_norm": 0.03503425419330597,
+      "kl": 0.006281375885009766,
+      "learning_rate": 2.562724899883458e-06,
+      "loss": 0.0072,
+      "step": 930
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 531.6188879013062,
+      "epoch": 1.1548803441785427,
+      "grad_norm": 0.05187267065048218,
+      "kl": 0.007277965545654297,
+      "learning_rate": 2.5564535355279464e-06,
+      "loss": 0.0072,
+      "num_tokens": 61714268.0,
+      "reward": 0.07505580713041127,
+      "reward_std": 0.08531173289520666,
+      "rewards/pure_accuracy_reward_math": 0.07505580491852015,
+      "step": 931
+    },
+    {
+      "clip_ratio": 0.00033635866333270314,
+      "epoch": 1.1567924471930926,
+      "grad_norm": 0.039655230939388275,
+      "kl": 0.0072231292724609375,
+      "learning_rate": 2.550181815719581e-06,
+      "loss": 0.0072,
+      "step": 932
+    },
+    {
+      "clip_ratio": 0.00035109808851530033,
+      "epoch": 1.1587045502076425,
+      "grad_norm": 0.038757406175136566,
+      "kl": 0.007157802581787109,
+      "learning_rate": 2.5439097799474867e-06,
+      "loss": 0.0072,
+      "step": 933
+    },
+    {
+      "clip_ratio": 0.00037538493586453114,
+      "epoch": 1.1606166532221924,
+      "grad_norm": 0.03841486573219299,
+      "kl": 0.007115840911865234,
+      "learning_rate": 2.537637467702777e-06,
+      "loss": 0.0071,
+      "step": 934
+    },
+    {
+      "clip_ratio": 0.0003936579208243529,
+      "epoch": 1.1625287562367423,
+      "grad_norm": 0.038453541696071625,
+      "kl": 0.0070896148681640625,
+      "learning_rate": 2.531364918478308e-06,
+      "loss": 0.007,
+      "step": 935
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 547.6250252723694,
+      "epoch": 1.1644408592512923,
+      "grad_norm": 0.03738933801651001,
+      "kl": 0.00615692138671875,
+      "learning_rate": 2.5250921717684247e-06,
+      "loss": 0.0061,
+      "num_tokens": 65415044.0,
+      "reward": 0.07561384260770865,
+      "reward_std": 0.07745296956272796,
+      "rewards/pure_accuracy_reward_math": 0.07561384062864818,
+      "step": 936
+    },
+    {
+      "clip_ratio": 0.0002929231292227996,
+      "epoch": 1.166352962265842,
+      "grad_norm": 0.03690778836607933,
+      "kl": 0.006189823150634766,
+      "learning_rate": 2.5188192670687186e-06,
+      "loss": 0.0061,
+      "step": 937
+    },
+    {
+      "clip_ratio": 0.000294325235870474,
+      "epoch": 1.168265065280392,
+      "grad_norm": 0.03613179549574852,
+      "kl": 0.006130695343017578,
+      "learning_rate": 2.512546243875776e-06,
+      "loss": 0.0061,
+      "step": 938
+    },
+    {
+      "clip_ratio": 0.00031920797795237377,
+      "epoch": 1.1701771682949418,
+      "grad_norm": 0.03461304306983948,
+      "kl": 0.006014347076416016,
+      "learning_rate": 2.5062731416869267e-06,
+      "loss": 0.006,
+      "step": 939
+    },
+    {
+      "clip_ratio": 0.00037188214912475814,
+      "epoch": 1.172089271309492,
+      "grad_norm": 0.03454398363828659,
+      "kl": 0.005980968475341797,
+      "learning_rate": 2.5e-06,
+      "loss": 0.0059,
+      "step": 940
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 532.1423244476318,
+      "epoch": 1.1740013743240416,
+      "grad_norm": 0.03934042155742645,
+      "kl": 0.006266117095947266,
+      "learning_rate": 2.493726858313074e-06,
+      "loss": 0.0078,
+      "num_tokens": 69057654.0,
+      "reward": 0.07477678928989917,
+      "reward_std": 0.08299326134147123,
+      "rewards/pure_accuracy_reward_math": 0.07477678690338507,
+      "step": 941
+    },
+    {
+      "clip_ratio": 0.00031629414758072016,
+      "epoch": 1.1759134773385915,
+      "grad_norm": 0.03872406855225563,
+      "kl": 0.0062713623046875,
+      "learning_rate": 2.4874537561242253e-06,
+      "loss": 0.0078,
+      "step": 942
+    },
+    {
+      "clip_ratio": 0.0003434862284166229,
+      "epoch": 1.1778255803531414,
+      "grad_norm": 0.03723340108990669,
+      "kl": 0.00623321533203125,
+      "learning_rate": 2.481180732931282e-06,
+      "loss": 0.0077,
+      "step": 943
+    },
+    {
+      "clip_ratio": 0.00034986940886483353,
+      "epoch": 1.1797376833676914,
+      "grad_norm": 0.03732794523239136,
+      "kl": 0.006276607513427734,
+      "learning_rate": 2.4749078282315757e-06,
+      "loss": 0.0076,
+      "step": 944
+    },
+    {
+      "clip_ratio": 0.0003579597876637308,
+      "epoch": 1.1816497863822413,
+      "grad_norm": 0.03668594732880592,
+      "kl": 0.006198883056640625,
+      "learning_rate": 2.468635081521693e-06,
+      "loss": 0.0076,
+      "step": 945
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 528.1718993186951,
+      "epoch": 1.1835618893967912,
+      "grad_norm": 0.03715552017092705,
+      "kl": 0.006759166717529297,
+      "learning_rate": 2.462362532297224e-06,
+      "loss": 0.0079,
+      "num_tokens": 72682654.0,
+      "reward": 0.06891741449362598,
+      "reward_std": 0.08248148870188743,
+      "rewards/pure_accuracy_reward_math": 0.06891741199069656,
+      "step": 946
+    },
+    {
+      "clip_ratio": 0.0003075862115053951,
+      "epoch": 1.185473992411341,
+      "grad_norm": 0.03616279736161232,
+      "kl": 0.006741523742675781,
+      "learning_rate": 2.456090220052514e-06,
+      "loss": 0.0079,
+      "step": 947
+    },
+    {
+      "clip_ratio": 0.00027696539024191225,
+      "epoch": 1.187386095425891,
+      "grad_norm": 0.03556762635707855,
+      "kl": 0.006789684295654297,
+      "learning_rate": 2.44981818428042e-06,
+      "loss": 0.0079,
+      "step": 948
+    },
+    {
+      "clip_ratio": 0.0002739789470638243,
+      "epoch": 1.189298198440441,
+      "grad_norm": 0.03486724570393562,
+      "kl": 0.006869316101074219,
+      "learning_rate": 2.4435464644720544e-06,
+      "loss": 0.0078,
+      "step": 949
+    },
+    {
+      "clip_ratio": 0.00031816330425726846,
+      "epoch": 1.1912103014549908,
+      "grad_norm": 0.03446395695209503,
+      "kl": 0.006869316101074219,
+      "learning_rate": 2.4372751001165427e-06,
+      "loss": 0.0077,
+      "step": 950
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 528.6573901176453,
+      "epoch": 1.1931224044695408,
+      "grad_norm": 0.03734345734119415,
+      "kl": 0.006131649017333984,
+      "learning_rate": 2.4310041307007716e-06,
+      "loss": 0.0062,
+      "num_tokens": 76305578.0,
+      "reward": 0.07114955657743849,
+      "reward_std": 0.07526708883233368,
+      "rewards/pure_accuracy_reward_math": 0.07114955488941632,
+      "step": 951
+    },
+    {
+      "clip_ratio": 0.00029005661951941875,
+      "epoch": 1.1950345074840907,
+      "grad_norm": 0.036443449556827545,
+      "kl": 0.006079196929931641,
+      "learning_rate": 2.4247335957091418e-06,
+      "loss": 0.0062,
+      "step": 952
+    },
+    {
+      "clip_ratio": 0.0002579906781647878,
+      "epoch": 1.1969466104986406,
+      "grad_norm": 0.034940823912620544,
+      "kl": 0.006037235260009766,
+      "learning_rate": 2.4184635346233166e-06,
+      "loss": 0.0061,
+      "step": 953
+    },
+    {
+      "clip_ratio": 0.00032199256943954424,
+      "epoch": 1.1988587135131905,
+      "grad_norm": 0.03445851802825928,
+      "kl": 0.006024360656738281,
+      "learning_rate": 2.4121939869219784e-06,
+      "loss": 0.0061,
+      "step": 954
+    },
+    {
+      "clip_ratio": 0.0003193520489048751,
+      "epoch": 1.2007708165277404,
+      "grad_norm": 0.03448885306715965,
+      "kl": 0.005992889404296875,
+      "learning_rate": 2.405924992080573e-06,
+      "loss": 0.006,
+      "step": 955
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 519.4358487129211,
+      "epoch": 1.2026829195422903,
+      "grad_norm": 0.11665105819702148,
+      "kl": 0.008374214172363281,
+      "learning_rate": 2.3996565895710692e-06,
+      "loss": 0.0065,
+      "num_tokens": 79904712.0,
+      "reward": 0.07366071760770865,
+      "reward_std": 0.08458104060264304,
+      "rewards/pure_accuracy_reward_math": 0.07366071591968648,
+      "step": 956
+    },
+    {
+      "clip_ratio": 0.00031160829769305565,
+      "epoch": 1.2045950225568403,
+      "grad_norm": 0.04096413403749466,
+      "kl": 0.006944179534912109,
+      "learning_rate": 2.3933888188617054e-06,
+      "loss": 0.0064,
+      "step": 957
+    },
+    {
+      "clip_ratio": 0.00032232171946589006,
+      "epoch": 1.2065071255713902,
+      "grad_norm": 0.04049144312739372,
+      "kl": 0.006976127624511719,
+      "learning_rate": 2.3871217194167407e-06,
+      "loss": 0.0063,
+      "step": 958
+    },
+    {
+      "clip_ratio": 0.0003416440970340773,
+      "epoch": 1.20841922858594,
+      "grad_norm": 0.039766065776348114,
+      "kl": 0.007042884826660156,
+      "learning_rate": 2.380855330696208e-06,
+      "loss": 0.0063,
+      "step": 959
+    },
+    {
+      "clip_ratio": 0.0003523347779150754,
+      "epoch": 1.21033133160049,
+      "grad_norm": 0.03884311020374298,
+      "kl": 0.007153987884521484,
+      "learning_rate": 2.3745896921556656e-06,
+      "loss": 0.0062,
+      "step": 960
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 530.392322063446,
+      "epoch": 1.21224343461504,
+      "grad_norm": 0.04043371230363846,
+      "kl": 0.008221149444580078,
+      "learning_rate": 2.368324843245948e-06,
+      "loss": 0.0086,
+      "num_tokens": 83540930.0,
+      "reward": 0.07952009316068143,
+      "reward_std": 0.08836089639225975,
+      "rewards/pure_accuracy_reward_math": 0.0795200911234133,
+      "step": 961
+    },
+    {
+      "clip_ratio": 0.0003234188988017195,
+      "epoch": 1.2141555376295898,
+      "grad_norm": 0.039239391684532166,
+      "kl": 0.008275985717773438,
+      "learning_rate": 2.362060823412919e-06,
+      "loss": 0.0086,
+      "step": 962
+    },
+    {
+      "clip_ratio": 0.00033211900500873526,
+      "epoch": 1.2160676406441397,
+      "grad_norm": 0.03923904523253441,
+      "kl": 0.008409500122070312,
+      "learning_rate": 2.355797672097219e-06,
+      "loss": 0.0086,
+      "step": 963
+    },
+    {
+      "clip_ratio": 0.00036667373893806143,
+      "epoch": 1.2179797436586897,
+      "grad_norm": 0.038865529000759125,
+      "kl": 0.008434295654296875,
+      "learning_rate": 2.349535428734026e-06,
+      "loss": 0.0085,
+      "step": 964
+    },
+    {
+      "clip_ratio": 0.0003816600048480723,
+      "epoch": 1.2198918466732396,
+      "grad_norm": 0.037728771567344666,
+      "kl": 0.00834512710571289,
+      "learning_rate": 2.343274132752795e-06,
+      "loss": 0.0084,
+      "step": 965
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 535.4799346923828,
+      "epoch": 1.2218039496877895,
+      "grad_norm": 0.03813539817929268,
+      "kl": 0.005985260009765625,
+      "learning_rate": 2.3370138235770184e-06,
+      "loss": 0.0088,
+      "num_tokens": 87187574.0,
+      "reward": 0.060267860419116914,
+      "reward_std": 0.07384576939512044,
+      "rewards/pure_accuracy_reward_math": 0.060267858498264104,
+      "step": 966
+    },
+    {
+      "clip_ratio": 0.0002719826344446119,
+      "epoch": 1.2237160527023394,
+      "grad_norm": 0.03676025941967964,
+      "kl": 0.006021976470947266,
+      "learning_rate": 2.330754540623975e-06,
+      "loss": 0.0088,
+      "step": 967
+    },
+    {
+      "clip_ratio": 0.0002730399019696961,
+      "epoch": 1.2256281557168893,
+      "grad_norm": 0.03579593822360039,
+      "kl": 0.006060123443603516,
+      "learning_rate": 2.324496323304484e-06,
+      "loss": 0.0088,
+      "step": 968
+    },
+    {
+      "clip_ratio": 0.0002800920712502375,
+      "epoch": 1.2275402587314392,
+      "grad_norm": 0.0353357158601284,
+      "kl": 0.0061092376708984375,
+      "learning_rate": 2.318239211022651e-06,
+      "loss": 0.0087,
+      "step": 969
+    },
+    {
+      "clip_ratio": 0.0003294056899108,
+      "epoch": 1.2294523617459892,
+      "grad_norm": 0.03521355986595154,
+      "kl": 0.006182193756103516,
+      "learning_rate": 2.3119832431756284e-06,
+      "loss": 0.0086,
+      "step": 970
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 513.8870182037354,
+      "epoch": 1.231364464760539,
+      "grad_norm": 0.03882085531949997,
+      "kl": 0.006420135498046875,
+      "learning_rate": 2.3057284591533598e-06,
+      "loss": 0.0093,
+      "num_tokens": 90758753.0,
+      "reward": 0.07505580718861893,
+      "reward_std": 0.07715391827514395,
+      "rewards/pure_accuracy_reward_math": 0.0750558051513508,
+      "step": 971
+    },
+    {
+      "clip_ratio": 0.0003045887907546785,
+      "epoch": 1.2332765677750888,
+      "grad_norm": 0.03775356709957123,
+      "kl": 0.006350040435791016,
+      "learning_rate": 2.299474898338336e-06,
+      "loss": 0.0093,
+      "step": 972
+    },
+    {
+      "clip_ratio": 0.0003195773986703898,
+      "epoch": 1.235188670789639,
+      "grad_norm": 0.03639310225844383,
+      "kl": 0.006343841552734375,
+      "learning_rate": 2.2932226001053444e-06,
+      "loss": 0.0092,
+      "step": 973
+    },
+    {
+      "clip_ratio": 0.0003582680616318612,
+      "epoch": 1.2371007738041886,
+      "grad_norm": 0.036272380501031876,
+      "kl": 0.006300926208496094,
+      "learning_rate": 2.286971603821226e-06,
+      "loss": 0.0092,
+      "step": 974
+    },
+    {
+      "clip_ratio": 0.0003946863821511215,
+      "epoch": 1.2390128768187387,
+      "grad_norm": 0.03584066033363342,
+      "kl": 0.006391048431396484,
+      "learning_rate": 2.280721948844621e-06,
+      "loss": 0.0091,
+      "step": 975
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 522.3044323921204,
+      "epoch": 1.2409249798332884,
+      "grad_norm": 0.038236722350120544,
+      "kl": 0.006694316864013672,
+      "learning_rate": 2.274473674525726e-06,
+      "loss": 0.0094,
+      "num_tokens": 94365488.0,
+      "reward": 0.06556919953436591,
+      "reward_std": 0.07405849196948111,
+      "rewards/pure_accuracy_reward_math": 0.06556919802096672,
+      "step": 976
+    },
+    {
+      "clip_ratio": 0.00029697347130763774,
+      "epoch": 1.2428370828478383,
+      "grad_norm": 0.0369977168738842,
+      "kl": 0.006660938262939453,
+      "learning_rate": 2.268226820206044e-06,
+      "loss": 0.0094,
+      "step": 977
+    },
+    {
+      "clip_ratio": 0.000319464833580696,
+      "epoch": 1.2447491858623883,
+      "grad_norm": 0.03550850227475166,
+      "kl": 0.006519794464111328,
+      "learning_rate": 2.261981425218138e-06,
+      "loss": 0.0094,
+      "step": 978
+    },
+    {
+      "clip_ratio": 0.0003469139706453461,
+      "epoch": 1.2466612888769382,
+      "grad_norm": 0.03525082767009735,
+      "kl": 0.006406307220458984,
+      "learning_rate": 2.2557375288853803e-06,
+      "loss": 0.0093,
+      "step": 979
+    },
+    {
+      "clip_ratio": 0.0003654695393606744,
+      "epoch": 1.248573391891488,
+      "grad_norm": 0.0355265848338604,
+      "kl": 0.006331443786621094,
+      "learning_rate": 2.2494951705217095e-06,
+      "loss": 0.0092,
+      "step": 980
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 516.76704454422,
+      "epoch": 1.250485494906038,
+      "grad_norm": 0.03745350241661072,
+      "kl": 0.0065135955810546875,
+      "learning_rate": 2.2432543894313797e-06,
+      "loss": 0.0042,
+      "num_tokens": 97952525.0,
+      "reward": 0.06501116385334171,
+      "reward_std": 0.07316133996937424,
+      "rewards/pure_accuracy_reward_math": 0.06501116222352721,
+      "step": 981
+    },
+    {
+      "clip_ratio": 0.00029299165072416145,
+      "epoch": 1.252397597920588,
+      "grad_norm": 0.03690091893076897,
+      "kl": 0.006426095962524414,
+      "learning_rate": 2.2370152249087114e-06,
+      "loss": 0.0042,
+      "step": 982
+    },
+    {
+      "clip_ratio": 0.0003187885846500649,
+      "epoch": 1.2543097009351378,
+      "grad_norm": 0.03645962476730347,
+      "kl": 0.006396055221557617,
+      "learning_rate": 2.2307777162378523e-06,
+      "loss": 0.0042,
+      "step": 983
+    },
+    {
+      "clip_ratio": 0.00033352292155086616,
+      "epoch": 1.2562218039496877,
+      "grad_norm": 0.03598187491297722,
+      "kl": 0.006333351135253906,
+      "learning_rate": 2.2245419026925187e-06,
+      "loss": 0.0041,
+      "step": 984
+    },
+    {
+      "clip_ratio": 0.0003533332319989313,
+      "epoch": 1.2581339069642377,
+      "grad_norm": 0.03577181696891785,
+      "kl": 0.006278276443481445,
+      "learning_rate": 2.218307823535757e-06,
+      "loss": 0.004,
+      "step": 985
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 522.8172650337219,
+      "epoch": 1.2600460099787876,
+      "grad_norm": 0.03590444475412369,
+      "kl": 0.005995273590087891,
+      "learning_rate": 2.2120755180196904e-06,
+      "loss": 0.0045,
+      "num_tokens": 101560026.0,
+      "reward": 0.06054687811410986,
+      "reward_std": 0.06865079078124836,
+      "rewards/pure_accuracy_reward_math": 0.06054687619325705,
+      "step": 986
+    },
+    {
+      "clip_ratio": 0.00024842098838462334,
+      "epoch": 1.2619581129933375,
+      "grad_norm": 0.03513624891638756,
+      "kl": 0.0059719085693359375,
+      "learning_rate": 2.2058450253852783e-06,
+      "loss": 0.0045,
+      "step": 987
+    },
+    {
+      "clip_ratio": 0.000271169978702801,
+      "epoch": 1.2638702160078874,
+      "grad_norm": 0.03392768278717995,
+      "kl": 0.005938529968261719,
+      "learning_rate": 2.1996163848620612e-06,
+      "loss": 0.0044,
+      "step": 988
+    },
+    {
+      "clip_ratio": 0.0002971922116898895,
+      "epoch": 1.2657823190224373,
+      "grad_norm": 0.03286145627498627,
+      "kl": 0.0060443878173828125,
+      "learning_rate": 2.1933896356679226e-06,
+      "loss": 0.0044,
+      "step": 989
+    },
+    {
+      "clip_ratio": 0.0003229031350429068,
+      "epoch": 1.2676944220369872,
+      "grad_norm": 0.032496001571416855,
+      "kl": 0.006091594696044922,
+      "learning_rate": 2.1871648170088347e-06,
+      "loss": 0.0043,
+      "step": 990
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 535.8125224113464,
+      "epoch": 1.2696065250515371,
+      "grad_norm": 0.21526122093200684,
+      "kl": 0.007075309753417969,
+      "learning_rate": 2.1809419680786143e-06,
+      "loss": 0.0072,
+      "num_tokens": 105223050.0,
+      "reward": 0.07421875381260179,
+      "reward_std": 0.08054219774203375,
+      "rewards/pure_accuracy_reward_math": 0.07421875130967237,
+      "step": 991
+    },
+    {
+      "clip_ratio": 0.00032863151136552915,
+      "epoch": 1.271518628066087,
+      "grad_norm": 0.03788222745060921,
+      "kl": 0.006428241729736328,
+      "learning_rate": 2.1747211280586758e-06,
+      "loss": 0.0072,
+      "step": 992
+    },
+    {
+      "clip_ratio": 0.00034688404628013814,
+      "epoch": 1.273430731080637,
+      "grad_norm": 0.03719337284564972,
+      "kl": 0.0064296722412109375,
+      "learning_rate": 2.168502336117787e-06,
+      "loss": 0.0071,
+      "step": 993
+    },
+    {
+      "clip_ratio": 0.00034599834629034376,
+      "epoch": 1.275342834095187,
+      "grad_norm": 0.036535993218421936,
+      "kl": 0.006348133087158203,
+      "learning_rate": 2.1622856314118178e-06,
+      "loss": 0.0071,
+      "step": 994
+    },
+    {
+      "clip_ratio": 0.00036459101005448247,
+      "epoch": 1.2772549371097368,
+      "grad_norm": 0.03548647463321686,
+      "kl": 0.006353855133056641,
+      "learning_rate": 2.156071053083496e-06,
+      "loss": 0.007,
+      "step": 995
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 529.536018371582,
+      "epoch": 1.2791670401242867,
+      "grad_norm": 0.03945273160934448,
+      "kl": 0.006157398223876953,
+      "learning_rate": 2.1498586402621646e-06,
+      "loss": 0.0062,
+      "num_tokens": 108847859.0,
+      "reward": 0.07366071807336994,
+      "reward_std": 0.072430647269357,
+      "rewards/pure_accuracy_reward_math": 0.07366071533760987,
+      "step": 996
+    },
+    {
+      "clip_ratio": 0.0002439655858097467,
+      "epoch": 1.2810791431388366,
+      "grad_norm": 0.03839760273694992,
+      "kl": 0.006161689758300781,
+      "learning_rate": 2.1436484320635275e-06,
+      "loss": 0.0061,
+      "step": 997
+    },
+    {
+      "clip_ratio": 0.0002514519866281262,
+      "epoch": 1.2829912461533866,
+      "grad_norm": 0.03733210638165474,
+      "kl": 0.0061798095703125,
+      "learning_rate": 2.1374404675894083e-06,
+      "loss": 0.0061,
+      "step": 998
+    },
+    {
+      "clip_ratio": 0.0002774860670342605,
+      "epoch": 1.2849033491679365,
+      "grad_norm": 0.03640332072973251,
+      "kl": 0.006183147430419922,
+      "learning_rate": 2.131234785927505e-06,
+      "loss": 0.006,
+      "step": 999
+    },
+    {
+      "clip_ratio": 0.0002877332713069336,
+      "epoch": 1.2868154521824864,
+      "grad_norm": 0.03559413552284241,
+      "kl": 0.006213665008544922,
+      "learning_rate": 2.1250314261511414e-06,
+      "loss": 0.0059,
+      "step": 1000
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 528.9492444992065,
+      "epoch": 1.2887275551970363,
+      "grad_norm": 0.04216492921113968,
+      "kl": 0.0073282718658447266,
+      "learning_rate": 2.1188304273190196e-06,
+      "loss": 0.0102,
+      "num_tokens": 112482213.0,
+      "reward": 0.0772879500000272,
+      "reward_std": 0.07908701087580994,
+      "rewards/pure_accuracy_reward_math": 0.07728794772992842,
+      "step": 1001
+    },
+    {
+      "clip_ratio": 0.0003075964003755871,
+      "epoch": 1.2906396582115862,
+      "grad_norm": 0.039000045508146286,
+      "kl": 0.007200002670288086,
+      "learning_rate": 2.1126318284749807e-06,
+      "loss": 0.0102,
+      "step": 1002
+    },
+    {
+      "clip_ratio": 0.0003138856436635251,
+      "epoch": 1.2925517612261361,
+      "grad_norm": 0.036585696041584015,
+      "kl": 0.00716710090637207,
+      "learning_rate": 2.106435668647751e-06,
+      "loss": 0.0101,
+      "step": 1003
+    },
+    {
+      "clip_ratio": 0.00033263966838603665,
+      "epoch": 1.294463864240686,
+      "grad_norm": 0.03634057566523552,
+      "kl": 0.007274150848388672,
+      "learning_rate": 2.1002419868507005e-06,
+      "loss": 0.01,
+      "step": 1004
+    },
+    {
+      "clip_ratio": 0.00035104663936635916,
+      "epoch": 1.2963759672552357,
+      "grad_norm": 0.03524275869131088,
+      "kl": 0.0072422027587890625,
+      "learning_rate": 2.0940508220815978e-06,
+      "loss": 0.01,
+      "step": 1005
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 519.5226221084595,
+      "epoch": 1.2982880702697859,
+      "grad_norm": 0.04047563299536705,
+      "kl": 0.006965160369873047,
+      "learning_rate": 2.087862213322362e-06,
+      "loss": 0.0078,
+      "num_tokens": 116078946.0,
+      "reward": 0.06752232470898889,
+      "reward_std": 0.08269421081058681,
+      "rewards/pure_accuracy_reward_math": 0.0675223229045514,
+      "step": 1006
+    },
+    {
+      "clip_ratio": 0.00033451643105308904,
+      "epoch": 1.3002001732843356,
+      "grad_norm": 0.03818976879119873,
+      "kl": 0.0069293975830078125,
+      "learning_rate": 2.0816761995388198e-06,
+      "loss": 0.0078,
+      "step": 1007
+    },
+    {
+      "clip_ratio": 0.0003828123747666723,
+      "epoch": 1.3021122762988857,
+      "grad_norm": 0.03969357907772064,
+      "kl": 0.006967067718505859,
+      "learning_rate": 2.075492819680457e-06,
+      "loss": 0.0078,
+      "step": 1008
+    },
+    {
+      "clip_ratio": 0.0003832018163620887,
+      "epoch": 1.3040243793134354,
+      "grad_norm": 0.040100231766700745,
+      "kl": 0.007086753845214844,
+      "learning_rate": 2.0693121126801778e-06,
+      "loss": 0.0077,
+      "step": 1009
+    },
+    {
+      "clip_ratio": 0.0003569153510625256,
+      "epoch": 1.3059364823279855,
+      "grad_norm": 0.037368252873420715,
+      "kl": 0.007195472717285156,
+      "learning_rate": 2.063134117454055e-06,
+      "loss": 0.0076,
+      "step": 1010
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 514.7126340866089,
+      "epoch": 1.3078485853425352,
+      "grad_norm": 0.0401712991297245,
+      "kl": 0.00678253173828125,
+      "learning_rate": 2.0569588729010896e-06,
+      "loss": 0.0063,
+      "num_tokens": 119662772.0,
+      "reward": 0.0705915214784909,
+      "reward_std": 0.08484002540353686,
+      "rewards/pure_accuracy_reward_math": 0.0705915190919768,
+      "step": 1011
+    },
+    {
+      "clip_ratio": 0.0003401347770477514,
+      "epoch": 1.3097606883570854,
+      "grad_norm": 0.03972383588552475,
+      "kl": 0.006781578063964844,
+      "learning_rate": 2.0507864179029592e-06,
+      "loss": 0.0062,
+      "step": 1012
+    },
+    {
+      "clip_ratio": 0.00040657852025560715,
+      "epoch": 1.311672791371635,
+      "grad_norm": 0.04063359647989273,
+      "kl": 0.006711006164550781,
+      "learning_rate": 2.044616791323781e-06,
+      "loss": 0.0062,
+      "step": 1013
+    },
+    {
+      "clip_ratio": 0.0004189488300880839,
+      "epoch": 1.313584894386185,
+      "grad_norm": 0.03818094730377197,
+      "kl": 0.006552696228027344,
+      "learning_rate": 2.0384500320098604e-06,
+      "loss": 0.0061,
+      "step": 1014
+    },
+    {
+      "clip_ratio": 0.000448550158978378,
+      "epoch": 1.315496997400735,
+      "grad_norm": 0.03749743476510048,
+      "kl": 0.0064678192138671875,
+      "learning_rate": 2.032286178789454e-06,
+      "loss": 0.006,
+      "step": 1015
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 529.0069990158081,
+      "epoch": 1.3174091004152848,
+      "grad_norm": 0.03775123134255409,
+      "kl": 0.006552696228027344,
+      "learning_rate": 2.0261252704725143e-06,
+      "loss": 0.0047,
+      "num_tokens": 123299241.0,
+      "reward": 0.06919643163564615,
+      "reward_std": 0.0781373989302665,
+      "rewards/pure_accuracy_reward_math": 0.06919642994762398,
+      "step": 1016
+    },
+    {
+      "clip_ratio": 0.0003128642913452495,
+      "epoch": 1.3193212034298347,
+      "grad_norm": 0.03666616231203079,
+      "kl": 0.006560325622558594,
+      "learning_rate": 2.0199673458504577e-06,
+      "loss": 0.0047,
+      "step": 1017
+    },
+    {
+      "clip_ratio": 0.00030665075905744743,
+      "epoch": 1.3212333064443846,
+      "grad_norm": 0.035805702209472656,
+      "kl": 0.006537437438964844,
+      "learning_rate": 2.01381244369591e-06,
+      "loss": 0.0046,
+      "step": 1018
+    },
+    {
+      "clip_ratio": 0.0003063842187316368,
+      "epoch": 1.3231454094589346,
+      "grad_norm": 0.03492369130253792,
+      "kl": 0.006512641906738281,
+      "learning_rate": 2.0076606027624676e-06,
+      "loss": 0.0046,
+      "step": 1019
+    },
+    {
+      "clip_ratio": 0.00033027163379983904,
+      "epoch": 1.3250575124734845,
+      "grad_norm": 0.03507117182016373,
+      "kl": 0.006590366363525391,
+      "learning_rate": 2.0015118617844516e-06,
+      "loss": 0.0045,
+      "step": 1020
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 536.10493516922,
+      "epoch": 1.3269696154880344,
+      "grad_norm": 0.04077515751123428,
+      "kl": 0.006287097930908203,
+      "learning_rate": 1.9953662594766675e-06,
+      "loss": 0.007,
+      "num_tokens": 126958737.0,
+      "reward": 0.0756138427532278,
+      "reward_std": 0.08067478984594345,
+      "rewards/pure_accuracy_reward_math": 0.07561384083237499,
+      "step": 1021
+    },
+    {
+      "clip_ratio": 0.0003038725464534764,
+      "epoch": 1.3288817185025843,
+      "grad_norm": 0.03825462609529495,
+      "kl": 0.0063266754150390625,
+      "learning_rate": 1.9892238345341544e-06,
+      "loss": 0.007,
+      "step": 1022
+    },
+    {
+      "clip_ratio": 0.0003366774006963169,
+      "epoch": 1.3307938215171342,
+      "grad_norm": 0.03734288364648819,
+      "kl": 0.006364345550537109,
+      "learning_rate": 1.983084625631949e-06,
+      "loss": 0.0069,
+      "step": 1023
+    },
+    {
+      "clip_ratio": 0.0003749641306853846,
+      "epoch": 1.3327059245316841,
+      "grad_norm": 0.03799683600664139,
+      "kl": 0.006411075592041016,
+      "learning_rate": 1.9769486714248367e-06,
+      "loss": 0.0068,
+      "step": 1024
+    },
+    {
+      "clip_ratio": 0.0003729545476289786,
+      "epoch": 1.334618027546234,
+      "grad_norm": 0.03601997718214989,
+      "kl": 0.006434917449951172,
+      "learning_rate": 1.9708160105471105e-06,
+      "loss": 0.0068,
+      "step": 1025
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 529.7709493637085,
+      "epoch": 1.336530130560784,
+      "grad_norm": 0.04102141782641411,
+      "kl": 0.006857395172119141,
+      "learning_rate": 1.964686681612327e-06,
+      "loss": 0.0055,
+      "num_tokens": 130592668.0,
+      "reward": 0.06556919959257357,
+      "reward_std": 0.06470447563333437,
+      "rewards/pure_accuracy_reward_math": 0.0655691981955897,
+      "step": 1026
+    },
+    {
+      "clip_ratio": 0.00021823535962539609,
+      "epoch": 1.3384422335753339,
+      "grad_norm": 0.03428492322564125,
+      "kl": 0.006598472595214844,
+      "learning_rate": 1.9585607232130636e-06,
+      "loss": 0.0054,
+      "step": 1027
+    },
+    {
+      "clip_ratio": 0.00024637427833340553,
+      "epoch": 1.3403543365898838,
+      "grad_norm": 0.032555270940065384,
+      "kl": 0.006415843963623047,
+      "learning_rate": 1.952438173920677e-06,
+      "loss": 0.0054,
+      "step": 1028
+    },
+    {
+      "clip_ratio": 0.0002563797440870985,
+      "epoch": 1.3422664396044337,
+      "grad_norm": 0.03202388435602188,
+      "kl": 0.006371498107910156,
+      "learning_rate": 1.946319072285058e-06,
+      "loss": 0.0053,
+      "step": 1029
+    },
+    {
+      "clip_ratio": 0.0002687414232696028,
+      "epoch": 1.3441785426189836,
+      "grad_norm": 0.03169838339090347,
+      "kl": 0.006340980529785156,
+      "learning_rate": 1.9402034568343888e-06,
+      "loss": 0.0053,
+      "step": 1030
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 549.2184953689575,
+      "epoch": 1.3460906456335335,
+      "grad_norm": 0.054084766656160355,
+      "kl": 0.006264686584472656,
+      "learning_rate": 1.9340913660749015e-06,
+      "loss": 0.0071,
+      "num_tokens": 134289567.0,
+      "reward": 0.06668527112924494,
+      "reward_std": 0.07140090392204002,
+      "rewards/pure_accuracy_reward_math": 0.06668526903376915,
+      "step": 1031
+    },
+    {
+      "clip_ratio": 0.00022883353369707038,
+      "epoch": 1.3480027486480834,
+      "grad_norm": 0.03612653911113739,
+      "kl": 0.006344318389892578,
+      "learning_rate": 1.9279828384906373e-06,
+      "loss": 0.0071,
+      "step": 1032
+    },
+    {
+      "clip_ratio": 0.0002760976024376305,
+      "epoch": 1.3499148516626334,
+      "grad_norm": 0.036703869700431824,
+      "kl": 0.006397724151611328,
+      "learning_rate": 1.921877912543198e-06,
+      "loss": 0.0071,
+      "step": 1033
+    },
+    {
+      "clip_ratio": 0.00027991523592163503,
+      "epoch": 1.3518269546771833,
+      "grad_norm": 0.036445919424295425,
+      "kl": 0.006428718566894531,
+      "learning_rate": 1.9157766266715142e-06,
+      "loss": 0.007,
+      "step": 1034
+    },
+    {
+      "clip_ratio": 0.0003110420944381076,
+      "epoch": 1.3537390576917332,
+      "grad_norm": 0.032879918813705444,
+      "kl": 0.006253242492675781,
+      "learning_rate": 1.909679019291592e-06,
+      "loss": 0.0069,
+      "step": 1035
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 525.200918674469,
+      "epoch": 1.355651160706283,
+      "grad_norm": 0.0374806709587574,
+      "kl": 0.006623744964599609,
+      "learning_rate": 1.9035851287962797e-06,
+      "loss": 0.0088,
+      "num_tokens": 137901395.0,
+      "reward": 0.07170759295695461,
+      "reward_std": 0.0834249026956968,
+      "rewards/pure_accuracy_reward_math": 0.0717075907450635,
+      "step": 1036
+    },
+    {
+      "clip_ratio": 0.0002719677876825699,
+      "epoch": 1.357563263720833,
+      "grad_norm": 0.03692527487874031,
+      "kl": 0.006625652313232422,
+      "learning_rate": 1.8974949935550202e-06,
+      "loss": 0.0088,
+      "step": 1037
+    },
+    {
+      "clip_ratio": 0.0003176050505544481,
+      "epoch": 1.359475366735383,
+      "grad_norm": 0.03605135530233383,
+      "kl": 0.006484031677246094,
+      "learning_rate": 1.8914086519136133e-06,
+      "loss": 0.0088,
+      "step": 1038
+    },
+    {
+      "clip_ratio": 0.0003420261080577802,
+      "epoch": 1.3613874697499329,
+      "grad_norm": 0.03582129627466202,
+      "kl": 0.006468296051025391,
+      "learning_rate": 1.8853261421939718e-06,
+      "loss": 0.0087,
+      "step": 1039
+    },
+    {
+      "clip_ratio": 0.00034158617637558564,
+      "epoch": 1.3632995727644825,
+      "grad_norm": 0.0346604622900486,
+      "kl": 0.006458282470703125,
+      "learning_rate": 1.8792475026938823e-06,
+      "loss": 0.0086,
+      "step": 1040
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 525.6152620315552,
+      "epoch": 1.3652116757790327,
+      "grad_norm": 0.03809192404150963,
+      "kl": 0.006644248962402344,
+      "learning_rate": 1.8731727716867632e-06,
+      "loss": 0.0098,
+      "num_tokens": 141517968.0,
+      "reward": 0.07477678963914514,
+      "reward_std": 0.0749618403497152,
+      "rewards/pure_accuracy_reward_math": 0.07477678678696975,
+      "step": 1041
+    },
+    {
+      "clip_ratio": 0.0002677642194726104,
+      "epoch": 1.3671237787935824,
+      "grad_norm": 0.0377020426094532,
+      "kl": 0.0066089630126953125,
+      "learning_rate": 1.8671019874214237e-06,
+      "loss": 0.0098,
+      "step": 1042
+    },
+    {
+      "clip_ratio": 0.0002758102658617645,
+      "epoch": 1.3690358818081325,
+      "grad_norm": 0.03678804636001587,
+      "kl": 0.006642341613769531,
+      "learning_rate": 1.8610351881218211e-06,
+      "loss": 0.0098,
+      "step": 1043
+    },
+    {
+      "clip_ratio": 0.0002790037015074631,
+      "epoch": 1.3709479848226822,
+      "grad_norm": 0.03615477308630943,
+      "kl": 0.006649971008300781,
+      "learning_rate": 1.8549724119868235e-06,
+      "loss": 0.0097,
+      "step": 1044
+    },
+    {
+      "clip_ratio": 0.0002795595634097481,
+      "epoch": 1.3728600878372323,
+      "grad_norm": 0.03598296642303467,
+      "kl": 0.006653785705566406,
+      "learning_rate": 1.8489136971899658e-06,
+      "loss": 0.0096,
+      "step": 1045
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 539.382839679718,
+      "epoch": 1.374772190851782,
+      "grad_norm": 0.03458879515528679,
+      "kl": 0.0064601898193359375,
+      "learning_rate": 1.8428590818792135e-06,
+      "loss": 0.0038,
+      "num_tokens": 145187116.0,
+      "reward": 0.06584821731667034,
+      "reward_std": 0.07200520334299654,
+      "rewards/pure_accuracy_reward_math": 0.06584821562864818,
+      "step": 1046
+    },
+    {
+      "clip_ratio": 0.00023162108237784196,
+      "epoch": 1.3766842938663322,
+      "grad_norm": 0.03385276347398758,
+      "kl": 0.006392478942871094,
+      "learning_rate": 1.836808604176719e-06,
+      "loss": 0.0038,
+      "step": 1047
+    },
+    {
+      "clip_ratio": 0.00026906593984676874,
+      "epoch": 1.3785963968808819,
+      "grad_norm": 0.0331512950360775,
+      "kl": 0.0062427520751953125,
+      "learning_rate": 1.8307623021785837e-06,
+      "loss": 0.0037,
+      "step": 1048
+    },
+    {
+      "clip_ratio": 0.00025022312701139526,
+      "epoch": 1.3805084998954318,
+      "grad_norm": 0.032765790820121765,
+      "kl": 0.006190299987792969,
+      "learning_rate": 1.8247202139546155e-06,
+      "loss": 0.0037,
+      "step": 1049
+    },
+    {
+      "clip_ratio": 0.0002507307134465009,
+      "epoch": 1.3824206029099817,
+      "grad_norm": 0.0325283482670784,
+      "kl": 0.006188869476318359,
+      "learning_rate": 1.8186823775480917e-06,
+      "loss": 0.0036,
+      "step": 1050
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 539.5159296989441,
+      "epoch": 1.3843327059245316,
+      "grad_norm": 0.03628634661436081,
+      "kl": 0.007945537567138672,
+      "learning_rate": 1.8126488309755178e-06,
+      "loss": 0.0101,
+      "num_tokens": 148852261.0,
+      "reward": 0.06194196696742438,
+      "reward_std": 0.06792009872151539,
+      "rewards/pure_accuracy_reward_math": 0.06194196580327116,
+      "step": 1051
+    },
+    {
+      "clip_ratio": 0.00025563780241100176,
+      "epoch": 1.3862448089390815,
+      "grad_norm": 0.035264719277620316,
+      "kl": 0.007953643798828125,
+      "learning_rate": 1.80661961222639e-06,
+      "loss": 0.0101,
+      "step": 1052
+    },
+    {
+      "clip_ratio": 0.0002401949207069265,
+      "epoch": 1.3881569119536314,
+      "grad_norm": 0.034110233187675476,
+      "kl": 0.007923126220703125,
+      "learning_rate": 1.8005947592629551e-06,
+      "loss": 0.0101,
+      "step": 1053
+    },
+    {
+      "clip_ratio": 0.00026547102737595196,
+      "epoch": 1.3900690149681814,
+      "grad_norm": 0.03364601358771324,
+      "kl": 0.00788116455078125,
+      "learning_rate": 1.7945743100199706e-06,
+      "loss": 0.01,
+      "step": 1054
+    },
+    {
+      "clip_ratio": 0.0002951583905996813,
+      "epoch": 1.3919811179827313,
+      "grad_norm": 0.03397928550839424,
+      "kl": 0.007859230041503906,
+      "learning_rate": 1.788558302404466e-06,
+      "loss": 0.0099,
+      "step": 1055
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 530.25337266922,
+      "epoch": 1.3938932209972812,
+      "grad_norm": 0.03863634541630745,
+      "kl": 0.006538867950439453,
+      "learning_rate": 1.7825467742955052e-06,
+      "loss": 0.0066,
+      "num_tokens": 152486009.0,
+      "reward": 0.06780134289874695,
+      "reward_std": 0.06736206321511418,
+      "rewards/pure_accuracy_reward_math": 0.06780134057044052,
+      "step": 1056
+    },
+    {
+      "clip_ratio": 0.00027592373527340897,
+      "epoch": 1.395805324011831,
+      "grad_norm": 0.036583587527275085,
+      "kl": 0.0065402984619140625,
+      "learning_rate": 1.7765397635439468e-06,
+      "loss": 0.0066,
+      "step": 1057
+    },
+    {
+      "clip_ratio": 0.0002849266509201698,
+      "epoch": 1.397717427026381,
+      "grad_norm": 0.03605053946375847,
+      "kl": 0.006500244140625,
+      "learning_rate": 1.7705373079722083e-06,
+      "loss": 0.0065,
+      "step": 1058
+    },
+    {
+      "clip_ratio": 0.0003116865132142266,
+      "epoch": 1.399629530040931,
+      "grad_norm": 0.03675729036331177,
+      "kl": 0.006489276885986328,
+      "learning_rate": 1.7645394453740227e-06,
+      "loss": 0.0064,
+      "step": 1059
+    },
+    {
+      "clip_ratio": 0.0003249485117748918,
+      "epoch": 1.4015416330554809,
+      "grad_norm": 0.03623329848051071,
+      "kl": 0.006478786468505859,
+      "learning_rate": 1.7585462135142083e-06,
+      "loss": 0.0064,
+      "step": 1060
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 520.029598236084,
+      "epoch": 1.4034537360700308,
+      "grad_norm": 0.03506990894675255,
+      "kl": 0.006392955780029297,
+      "learning_rate": 1.752557650128423e-06,
+      "loss": 0.0096,
+      "num_tokens": 156082643.0,
+      "reward": 0.06194196664728224,
+      "reward_std": 0.07560620515141636,
+      "rewards/pure_accuracy_reward_math": 0.061941966181620955,
+      "step": 1061
+    },
+    {
+      "clip_ratio": 0.0002744606111662051,
+      "epoch": 1.4053658390845807,
+      "grad_norm": 0.03450053185224533,
+      "kl": 0.006424903869628906,
+      "learning_rate": 1.7465737929229317e-06,
+      "loss": 0.0096,
+      "step": 1062
+    },
+    {
+      "clip_ratio": 0.00027279697263793423,
+      "epoch": 1.4072779420991306,
+      "grad_norm": 0.033764585852622986,
+      "kl": 0.006496906280517578,
+      "learning_rate": 1.7405946795743665e-06,
+      "loss": 0.0096,
+      "step": 1063
+    },
+    {
+      "clip_ratio": 0.000298209258943416,
+      "epoch": 1.4091900451136805,
+      "grad_norm": 0.03335048630833626,
+      "kl": 0.0065898895263671875,
+      "learning_rate": 1.7346203477294916e-06,
+      "loss": 0.0095,
+      "step": 1064
+    },
+    {
+      "clip_ratio": 0.00030832760762677935,
+      "epoch": 1.4111021481282304,
+      "grad_norm": 0.03299354016780853,
+      "kl": 0.006653308868408203,
+      "learning_rate": 1.7286508350049627e-06,
+      "loss": 0.0094,
+      "step": 1065
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 525.4023675918579,
+      "epoch": 1.4130142511427803,
+      "grad_norm": 0.04127517342567444,
+      "kl": 0.010558605194091797,
+      "learning_rate": 1.722686178987097e-06,
+      "loss": 0.0076,
+      "num_tokens": 159696133.0,
+      "reward": 0.06640625282307155,
+      "reward_std": 0.07264956791186705,
+      "rewards/pure_accuracy_reward_math": 0.06640625101863407,
+      "step": 1066
+    },
+    {
+      "clip_ratio": 0.00030437137564831573,
+      "epoch": 1.4149263541573303,
+      "grad_norm": 0.039496634155511856,
+      "kl": 0.010538101196289062,
+      "learning_rate": 1.7167264172316273e-06,
+      "loss": 0.0076,
+      "step": 1067
+    },
+    {
+      "clip_ratio": 0.0003244270092181978,
+      "epoch": 1.4168384571718802,
+      "grad_norm": 0.039376117289066315,
+      "kl": 0.010515689849853516,
+      "learning_rate": 1.7107715872634731e-06,
+      "loss": 0.0075,
+      "step": 1068
+    },
+    {
+      "clip_ratio": 0.0003491952173817481,
+      "epoch": 1.41875056018643,
+      "grad_norm": 0.03863466531038284,
+      "kl": 0.01038360595703125,
+      "learning_rate": 1.7048217265764993e-06,
+      "loss": 0.0075,
+      "step": 1069
+    },
+    {
+      "clip_ratio": 0.00037865171140083476,
+      "epoch": 1.42066266320098,
+      "grad_norm": 0.03795957565307617,
+      "kl": 0.010157585144042969,
+      "learning_rate": 1.6988768726332856e-06,
+      "loss": 0.0074,
+      "step": 1070
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 512.8691644668579,
+      "epoch": 1.42257476621553,
+      "grad_norm": 0.04360206797719002,
+      "kl": 0.0067138671875,
+      "learning_rate": 1.6929370628648828e-06,
+      "loss": 0.0086,
+      "num_tokens": 163268528.0,
+      "reward": 0.08565848623402417,
+      "reward_std": 0.08861368341604248,
+      "rewards/pure_accuracy_reward_math": 0.08565848384751007,
+      "step": 1071
+    },
+    {
+      "clip_ratio": 0.00031944918799808875,
+      "epoch": 1.4244868692300798,
+      "grad_norm": 0.04292250797152519,
+      "kl": 0.006737709045410156,
+      "learning_rate": 1.6870023346705866e-06,
+      "loss": 0.0085,
+      "step": 1072
+    },
+    {
+      "clip_ratio": 0.00031442818647064996,
+      "epoch": 1.4263989722446297,
+      "grad_norm": 0.04044810310006142,
+      "kl": 0.006873607635498047,
+      "learning_rate": 1.6810727254176937e-06,
+      "loss": 0.0085,
+      "step": 1073
+    },
+    {
+      "clip_ratio": 0.0003650832475727839,
+      "epoch": 1.4283110752591797,
+      "grad_norm": 0.04156485199928284,
+      "kl": 0.006984233856201172,
+      "learning_rate": 1.6751482724412716e-06,
+      "loss": 0.0084,
+      "step": 1074
+    },
+    {
+      "clip_ratio": 0.0003947964444250829,
+      "epoch": 1.4302231782737296,
+      "grad_norm": 0.04023054987192154,
+      "kl": 0.007004737854003906,
+      "learning_rate": 1.669229013043921e-06,
+      "loss": 0.0083,
+      "step": 1075
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 512.7343969345093,
+      "epoch": 1.4321352812882795,
+      "grad_norm": 0.03780645504593849,
+      "kl": 0.006886005401611328,
+      "learning_rate": 1.6633149844955415e-06,
+      "loss": 0.0094,
+      "num_tokens": 166836260.0,
+      "reward": 0.0797991111758165,
+      "reward_std": 0.08157813875004649,
+      "rewards/pure_accuracy_reward_math": 0.07979910867288709,
+      "step": 1076
+    },
+    {
+      "clip_ratio": 0.0002608302990552147,
+      "epoch": 1.4340473843028292,
+      "grad_norm": 0.03681138530373573,
+      "kl": 0.006786823272705078,
+      "learning_rate": 1.6574062240330996e-06,
+      "loss": 0.0093,
+      "step": 1077
+    },
+    {
+      "clip_ratio": 0.00031450060896531795,
+      "epoch": 1.4359594873173793,
+      "grad_norm": 0.036778852343559265,
+      "kl": 0.0066986083984375,
+      "learning_rate": 1.651502768860389e-06,
+      "loss": 0.0093,
+      "step": 1078
+    },
+    {
+      "clip_ratio": 0.0003176571812559814,
+      "epoch": 1.437871590331929,
+      "grad_norm": 0.03592304140329361,
+      "kl": 0.006758213043212891,
+      "learning_rate": 1.6456046561478023e-06,
+      "loss": 0.0092,
+      "step": 1079
+    },
+    {
+      "clip_ratio": 0.0003236016519281293,
+      "epoch": 1.4397836933464792,
+      "grad_norm": 0.03520684316754341,
+      "kl": 0.006850242614746094,
+      "learning_rate": 1.6397119230320919e-06,
+      "loss": 0.0092,
+      "step": 1080
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 508.80498933792114,
+      "epoch": 1.4416957963610288,
+      "grad_norm": 0.04630957916378975,
+      "kl": 0.01150655746459961,
+      "learning_rate": 1.633824606616138e-06,
+      "loss": 0.008,
+      "num_tokens": 170392081.0,
+      "reward": 0.07589286129223183,
+      "reward_std": 0.08140548272058368,
+      "rewards/pure_accuracy_reward_math": 0.07589285844005644,
+      "step": 1081
+    },
+    {
+      "clip_ratio": 0.00028873196572476445,
+      "epoch": 1.443607899375579,
+      "grad_norm": 0.04534924402832985,
+      "kl": 0.01107931137084961,
+      "learning_rate": 1.6279427439687154e-06,
+      "loss": 0.008,
+      "step": 1082
+    },
+    {
+      "clip_ratio": 0.000319909158235987,
+      "epoch": 1.4455200023901287,
+      "grad_norm": 0.044707395136356354,
+      "kl": 0.010364532470703125,
+      "learning_rate": 1.622066372124262e-06,
+      "loss": 0.0079,
+      "step": 1083
+    },
+    {
+      "clip_ratio": 0.0003388643909829625,
+      "epoch": 1.4474321054046788,
+      "grad_norm": 0.038643479347229004,
+      "kl": 0.009525775909423828,
+      "learning_rate": 1.6161955280826399e-06,
+      "loss": 0.0078,
+      "step": 1084
+    },
+    {
+      "clip_ratio": 0.0003223289492098047,
+      "epoch": 1.4493442084192285,
+      "grad_norm": 0.12098709493875504,
+      "kl": 0.010370254516601562,
+      "learning_rate": 1.6103302488089104e-06,
+      "loss": 0.0078,
+      "step": 1085
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 520.3169894218445,
+      "epoch": 1.4512563114337784,
+      "grad_norm": 0.03693209961056709,
+      "kl": 0.006680965423583984,
+      "learning_rate": 1.6044705712330932e-06,
+      "loss": 0.0059,
+      "num_tokens": 173992817.0,
+      "reward": 0.07031250311410986,
+      "reward_std": 0.07530715462053195,
+      "rewards/pure_accuracy_reward_math": 0.07031250142608769,
+      "step": 1086
+    },
+    {
+      "clip_ratio": 0.0002918191117657898,
+      "epoch": 1.4531684144483283,
+      "grad_norm": 0.03641385957598686,
+      "kl": 0.0065898895263671875,
+      "learning_rate": 1.5986165322499398e-06,
+      "loss": 0.0059,
+      "step": 1087
+    },
+    {
+      "clip_ratio": 0.0002921736467840219,
+      "epoch": 1.4550805174628783,
+      "grad_norm": 0.03598758950829506,
+      "kl": 0.006548881530761719,
+      "learning_rate": 1.5927681687186964e-06,
+      "loss": 0.0058,
+      "step": 1088
+    },
+    {
+      "clip_ratio": 0.0003169650843233285,
+      "epoch": 1.4569926204774282,
+      "grad_norm": 0.036268141120672226,
+      "kl": 0.006561756134033203,
+      "learning_rate": 1.5869255174628778e-06,
+      "loss": 0.0058,
+      "step": 1089
+    },
+    {
+      "clip_ratio": 0.0003259218068478731,
+      "epoch": 1.458904723491978,
+      "grad_norm": 0.03529893979430199,
+      "kl": 0.006597042083740234,
+      "learning_rate": 1.5810886152700302e-06,
+      "loss": 0.0057,
+      "step": 1090
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 533.391206741333,
+      "epoch": 1.460816826506528,
+      "grad_norm": 0.04034799709916115,
+      "kl": 0.006509304046630859,
+      "learning_rate": 1.5752574988915004e-06,
+      "loss": 0.0066,
+      "num_tokens": 177633359.0,
+      "reward": 0.07477678920258768,
+      "reward_std": 0.0747891838545911,
+      "rewards/pure_accuracy_reward_math": 0.07477678699069656,
+      "step": 1091
+    },
+    {
+      "clip_ratio": 0.0002679697158214367,
+      "epoch": 1.462728929521078,
+      "grad_norm": 0.039328683167696,
+      "kl": 0.006606101989746094,
+      "learning_rate": 1.5694322050422096e-06,
+      "loss": 0.0066,
+      "step": 1092
+    },
+    {
+      "clip_ratio": 0.0002975759220475993,
+      "epoch": 1.4646410325356278,
+      "grad_norm": 0.03947217017412186,
+      "kl": 0.00665283203125,
+      "learning_rate": 1.5636127704004133e-06,
+      "loss": 0.0065,
+      "step": 1093
+    },
+    {
+      "clip_ratio": 0.0003127538088278925,
+      "epoch": 1.4665531355501777,
+      "grad_norm": 0.03733786940574646,
+      "kl": 0.006627559661865234,
+      "learning_rate": 1.5577992316074783e-06,
+      "loss": 0.0064,
+      "step": 1094
+    },
+    {
+      "clip_ratio": 0.00035554791872982605,
+      "epoch": 1.4684652385647277,
+      "grad_norm": 0.03660706803202629,
+      "kl": 0.0065364837646484375,
+      "learning_rate": 1.5519916252676482e-06,
+      "loss": 0.0064,
+      "step": 1095
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 530.1163763999939,
+      "epoch": 1.4703773415792776,
+      "grad_norm": 0.06871657073497772,
+      "kl": 0.010003089904785156,
+      "learning_rate": 1.5461899879478133e-06,
+      "loss": 0.0057,
+      "num_tokens": 181268648.0,
+      "reward": 0.0744977711874526,
+      "reward_std": 0.08333237702026963,
+      "rewards/pure_accuracy_reward_math": 0.0744977695576381,
+      "step": 1096
+    },
+    {
+      "clip_ratio": 0.00032988911306119917,
+      "epoch": 1.4722894445938275,
+      "grad_norm": 0.04868275299668312,
+      "kl": 0.009030342102050781,
+      "learning_rate": 1.5403943561772789e-06,
+      "loss": 0.0057,
+      "step": 1097
+    },
+    {
+      "clip_ratio": 0.0003833602018517013,
+      "epoch": 1.4742015476083774,
+      "grad_norm": 0.04073934629559517,
+      "kl": 0.00842428207397461,
+      "learning_rate": 1.5346047664475422e-06,
+      "loss": 0.0056,
+      "step": 1098
+    },
+    {
+      "clip_ratio": 0.00040459603366116426,
+      "epoch": 1.4761136506229273,
+      "grad_norm": 0.04011493921279907,
+      "kl": 0.008179187774658203,
+      "learning_rate": 1.5288212552120524e-06,
+      "loss": 0.0055,
+      "step": 1099
+    },
+    {
+      "clip_ratio": 0.0004078742092019638,
+      "epoch": 1.4780257536374772,
+      "grad_norm": 0.03785649687051773,
+      "kl": 0.008193016052246094,
+      "learning_rate": 1.5230438588859881e-06,
+      "loss": 0.0054,
+      "step": 1100
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 541.5837321281433,
+      "epoch": 1.4799378566520272,
+      "grad_norm": 0.04047717526555061,
+      "kl": 0.007642269134521484,
+      "learning_rate": 1.517272613846027e-06,
+      "loss": 0.0051,
+      "num_tokens": 184939348.0,
+      "reward": 0.06863839572179131,
+      "reward_std": 0.07131457631476223,
+      "rewards/pure_accuracy_reward_math": 0.06863839420839213,
+      "step": 1101
+    },
+    {
+      "clip_ratio": 0.00026072144959243815,
+      "epoch": 1.481849959666577,
+      "grad_norm": 0.037731293588876724,
+      "kl": 0.007551670074462891,
+      "learning_rate": 1.511507556430114e-06,
+      "loss": 0.0051,
+      "step": 1102
+    },
+    {
+      "clip_ratio": 0.00029216510773721893,
+      "epoch": 1.483762062681127,
+      "grad_norm": 0.03771767392754555,
+      "kl": 0.007477760314941406,
+      "learning_rate": 1.5057487229372347e-06,
+      "loss": 0.0051,
+      "step": 1103
+    },
+    {
+      "clip_ratio": 0.0003181908435294645,
+      "epoch": 1.485674165695677,
+      "grad_norm": 0.03619125112891197,
+      "kl": 0.0074062347412109375,
+      "learning_rate": 1.4999961496271889e-06,
+      "loss": 0.005,
+      "step": 1104
+    },
+    {
+      "clip_ratio": 0.0003646736843165854,
+      "epoch": 1.4875862687102268,
+      "grad_norm": 0.035048868507146835,
+      "kl": 0.007380008697509766,
+      "learning_rate": 1.4942498727203578e-06,
+      "loss": 0.0049,
+      "step": 1105
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 541.8585615158081,
+      "epoch": 1.4894983717247767,
+      "grad_norm": 0.0386812798678875,
+      "kl": 0.006747245788574219,
+      "learning_rate": 1.4885099283974774e-06,
+      "loss": 0.0071,
+      "num_tokens": 188614221.0,
+      "reward": 0.07198661062284373,
+      "reward_std": 0.08140548341907561,
+      "rewards/pure_accuracy_reward_math": 0.07198660864378326,
+      "step": 1106
+    },
+    {
+      "clip_ratio": 0.0003357146362077401,
+      "epoch": 1.4914104747393266,
+      "grad_norm": 0.03723128139972687,
+      "kl": 0.006694316864013672,
+      "learning_rate": 1.482776352799414e-06,
+      "loss": 0.0071,
+      "step": 1107
+    },
+    {
+      "clip_ratio": 0.0003692662889989151,
+      "epoch": 1.4933225777538766,
+      "grad_norm": 0.038370903581380844,
+      "kl": 0.006665706634521484,
+      "learning_rate": 1.4770491820269317e-06,
+      "loss": 0.007,
+      "step": 1108
+    },
+    {
+      "clip_ratio": 0.00040588962588117283,
+      "epoch": 1.4952346807684265,
+      "grad_norm": 0.037489671260118484,
+      "kl": 0.006663322448730469,
+      "learning_rate": 1.4713284521404678e-06,
+      "loss": 0.0069,
+      "step": 1109
+    },
+    {
+      "clip_ratio": 0.00039138679812822375,
+      "epoch": 1.4971467837829764,
+      "grad_norm": 0.03641659393906593,
+      "kl": 0.006697654724121094,
+      "learning_rate": 1.465614199159905e-06,
+      "loss": 0.0069,
+      "step": 1110
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 520.476583480835,
+      "epoch": 1.4990588867975263,
+      "grad_norm": 1.8961507081985474,
+      "kl": 0.03508758544921875,
+      "learning_rate": 1.4599064590643472e-06,
+      "loss": 0.0056,
+      "num_tokens": 192212657.0,
+      "reward": 0.0753348250000272,
+      "reward_std": 0.07783834805013612,
+      "rewards/pure_accuracy_reward_math": 0.07533482302096672,
+      "step": 1111
+    },
+    {
+      "clip_ratio": 0.00029740781877762856,
+      "epoch": 1.500970989812076,
+      "grad_norm": 0.08476530015468597,
+      "kl": 0.011601448059082031,
+      "learning_rate": 1.4542052677918885e-06,
+      "loss": 0.0047,
+      "step": 1112
+    },
+    {
+      "clip_ratio": 0.0003210891072171762,
+      "epoch": 1.5028830928266261,
+      "grad_norm": 0.04907820373773575,
+      "kl": 0.010628223419189453,
+      "learning_rate": 1.4485106612393897e-06,
+      "loss": 0.0046,
+      "step": 1113
+    },
+    {
+      "clip_ratio": 0.00033912417364945213,
+      "epoch": 1.5047951958411758,
+      "grad_norm": 0.04438456520438194,
+      "kl": 0.010659217834472656,
+      "learning_rate": 1.4428226752622509e-06,
+      "loss": 0.0046,
+      "step": 1114
+    },
+    {
+      "clip_ratio": 0.0003756833369834567,
+      "epoch": 1.506707298855726,
+      "grad_norm": 0.0422808900475502,
+      "kl": 0.010442733764648438,
+      "learning_rate": 1.437141345674189e-06,
+      "loss": 0.0045,
+      "step": 1115
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 535.0778713226318,
+      "epoch": 1.5086194018702757,
+      "grad_norm": 0.048265133053064346,
+      "kl": 0.007592678070068359,
+      "learning_rate": 1.4314667082470064e-06,
+      "loss": 0.0086,
+      "num_tokens": 195861088.0,
+      "reward": 0.07142857479630038,
+      "reward_std": 0.08346496871672571,
+      "rewards/pure_accuracy_reward_math": 0.07142857287544757,
+      "step": 1116
+    },
+    {
+      "clip_ratio": 0.0003429410510875641,
+      "epoch": 1.5105315048848258,
+      "grad_norm": 0.04287589713931084,
+      "kl": 0.007152557373046875,
+      "learning_rate": 1.4257987987103727e-06,
+      "loss": 0.0085,
+      "step": 1117
+    },
+    {
+      "clip_ratio": 0.0003726668836634417,
+      "epoch": 1.5124436078993755,
+      "grad_norm": 0.0397462397813797,
+      "kl": 0.006825447082519531,
+      "learning_rate": 1.420137652751593e-06,
+      "loss": 0.0085,
+      "step": 1118
+    },
+    {
+      "clip_ratio": 0.0003763367328133427,
+      "epoch": 1.5143557109139256,
+      "grad_norm": 0.03851110488176346,
+      "kl": 0.006707668304443359,
+      "learning_rate": 1.4144833060153887e-06,
+      "loss": 0.0084,
+      "step": 1119
+    },
+    {
+      "clip_ratio": 0.0003624607439292049,
+      "epoch": 1.5162678139284753,
+      "grad_norm": 0.03720558434724808,
+      "kl": 0.00676727294921875,
+      "learning_rate": 1.408835794103669e-06,
+      "loss": 0.0083,
+      "step": 1120
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 524.7569994926453,
+      "epoch": 1.5181799169430255,
+      "grad_norm": 0.03832938149571419,
+      "kl": 0.008425712585449219,
+      "learning_rate": 1.4031951525753088e-06,
+      "loss": 0.0071,
+      "num_tokens": 199475701.0,
+      "reward": 0.08565848635043949,
+      "reward_std": 0.08179086120799184,
+      "rewards/pure_accuracy_reward_math": 0.08565848338184878,
+      "step": 1121
+    },
+    {
+      "clip_ratio": 0.00028257126655262255,
+      "epoch": 1.5200920199575751,
+      "grad_norm": 0.038414496928453445,
+      "kl": 0.008458137512207031,
+      "learning_rate": 1.3975614169459253e-06,
+      "loss": 0.0071,
+      "step": 1122
+    },
+    {
+      "clip_ratio": 0.0003134008442202685,
+      "epoch": 1.5220041229721253,
+      "grad_norm": 0.03928304836153984,
+      "kl": 0.008496284484863281,
+      "learning_rate": 1.391934622687652e-06,
+      "loss": 0.0071,
+      "step": 1123
+    },
+    {
+      "clip_ratio": 0.00030222541431612626,
+      "epoch": 1.523916225986675,
+      "grad_norm": 0.038087427616119385,
+      "kl": 0.008494377136230469,
+      "learning_rate": 1.38631480522892e-06,
+      "loss": 0.007,
+      "step": 1124
+    },
+    {
+      "clip_ratio": 0.0002927070846396873,
+      "epoch": 1.525828329001225,
+      "grad_norm": 0.03641984984278679,
+      "kl": 0.008457183837890625,
+      "learning_rate": 1.3807019999542287e-06,
+      "loss": 0.0069,
+      "step": 1125
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 531.1537666320801,
+      "epoch": 1.5277404320157748,
+      "grad_norm": 0.040940940380096436,
+      "kl": 0.006596565246582031,
+      "learning_rate": 1.3750962422039269e-06,
+      "loss": 0.0058,
+      "num_tokens": 203109136.0,
+      "reward": 0.07254464621655643,
+      "reward_std": 0.08217623952077702,
+      "rewards/pure_accuracy_reward_math": 0.07254464400466532,
+      "step": 1126
+    },
+    {
+      "clip_ratio": 0.00031519718078243386,
+      "epoch": 1.5296525350303247,
+      "grad_norm": 0.038493506610393524,
+      "kl": 0.006714344024658203,
+      "learning_rate": 1.369497567273989e-06,
+      "loss": 0.0058,
+      "step": 1127
+    },
+    {
+      "clip_ratio": 0.0003513000764314711,
+      "epoch": 1.5315646380448746,
+      "grad_norm": 0.039495162665843964,
+      "kl": 0.006772041320800781,
+      "learning_rate": 1.3639060104157964e-06,
+      "loss": 0.0057,
+      "step": 1128
+    },
+    {
+      "clip_ratio": 0.00033387296190312554,
+      "epoch": 1.5334767410594246,
+      "grad_norm": 0.03875305503606796,
+      "kl": 0.006872653961181641,
+      "learning_rate": 1.3583216068359078e-06,
+      "loss": 0.0057,
+      "step": 1129
+    },
+    {
+      "clip_ratio": 0.00036185752793471693,
+      "epoch": 1.5353888440739745,
+      "grad_norm": 0.03817266598343849,
+      "kl": 0.006899356842041016,
+      "learning_rate": 1.3527443916958466e-06,
+      "loss": 0.0056,
+      "step": 1130
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 537.4143671989441,
+      "epoch": 1.5373009470885244,
+      "grad_norm": 0.035565100610256195,
+      "kl": 0.006679058074951172,
+      "learning_rate": 1.3471744001118718e-06,
+      "loss": 0.0091,
+      "num_tokens": 206769717.0,
+      "reward": 0.07533482497092336,
+      "reward_std": 0.07436373975360766,
+      "rewards/pure_accuracy_reward_math": 0.07533482293365523,
+      "step": 1131
+    },
+    {
+      "clip_ratio": 0.00028060592541123697,
+      "epoch": 1.5392130501030743,
+      "grad_norm": 0.036901701241731644,
+      "kl": 0.006720542907714844,
+      "learning_rate": 1.3416116671547613e-06,
+      "loss": 0.0091,
+      "step": 1132
+    },
+    {
+      "clip_ratio": 0.00034766932589036514,
+      "epoch": 1.5411251531176242,
+      "grad_norm": 0.03489091992378235,
+      "kl": 0.006618499755859375,
+      "learning_rate": 1.3360562278495899e-06,
+      "loss": 0.009,
+      "step": 1133
+    },
+    {
+      "clip_ratio": 0.0003513962886927402,
+      "epoch": 1.5430372561321741,
+      "grad_norm": 0.035007573664188385,
+      "kl": 0.0066070556640625,
+      "learning_rate": 1.3305081171755092e-06,
+      "loss": 0.009,
+      "step": 1134
+    },
+    {
+      "clip_ratio": 0.00036896456708745973,
+      "epoch": 1.544949359146724,
+      "grad_norm": 0.03363417461514473,
+      "kl": 0.006587028503417969,
+      "learning_rate": 1.3249673700655246e-06,
+      "loss": 0.0089,
+      "step": 1135
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 531.2251925468445,
+      "epoch": 1.546861462161274,
+      "grad_norm": 0.037738338112831116,
+      "kl": 0.006687164306640625,
+      "learning_rate": 1.3194340214062828e-06,
+      "loss": 0.0066,
+      "num_tokens": 210404892.0,
+      "reward": 0.07477678978466429,
+      "reward_std": 0.08492635452421382,
+      "rewards/pure_accuracy_reward_math": 0.07477678699069656,
+      "step": 1136
+    },
+    {
+      "clip_ratio": 0.0003166603274848967,
+      "epoch": 1.5487735651758239,
+      "grad_norm": 0.03711307421326637,
+      "kl": 0.0067272186279296875,
+      "learning_rate": 1.3139081060378423e-06,
+      "loss": 0.0066,
+      "step": 1137
+    },
+    {
+      "clip_ratio": 0.00032532861348499864,
+      "epoch": 1.5506856681903738,
+      "grad_norm": 0.0381547249853611,
+      "kl": 0.006831169128417969,
+      "learning_rate": 1.3083896587534606e-06,
+      "loss": 0.0065,
+      "step": 1138
+    },
+    {
+      "clip_ratio": 0.0003168874280845557,
+      "epoch": 1.5525977712049237,
+      "grad_norm": 0.03702245280146599,
+      "kl": 0.0068492889404296875,
+      "learning_rate": 1.3028787142993723e-06,
+      "loss": 0.0064,
+      "step": 1139
+    },
+    {
+      "clip_ratio": 0.00031372528076190065,
+      "epoch": 1.5545098742194736,
+      "grad_norm": 0.035462986677885056,
+      "kl": 0.0068511962890625,
+      "learning_rate": 1.297375307374574e-06,
+      "loss": 0.0063,
+      "step": 1140
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 528.9913792610168,
+      "epoch": 1.5564219772340235,
+      "grad_norm": 0.0402364507317543,
+      "kl": 0.006835460662841797,
+      "learning_rate": 1.2918794726306003e-06,
+      "loss": 0.0099,
+      "num_tokens": 214034825.0,
+      "reward": 0.07310268151923083,
+      "reward_std": 0.07917333993827924,
+      "rewards/pure_accuracy_reward_math": 0.07310268000583164,
+      "step": 1141
+    },
+    {
+      "clip_ratio": 0.0003137970834359294,
+      "epoch": 1.5583340802485734,
+      "grad_norm": 0.03920648992061615,
+      "kl": 0.006829738616943359,
+      "learning_rate": 1.2863912446713084e-06,
+      "loss": 0.0098,
+      "step": 1142
+    },
+    {
+      "clip_ratio": 0.00032378236608110456,
+      "epoch": 1.5602461832631231,
+      "grad_norm": 0.03806397691369057,
+      "kl": 0.006905078887939453,
+      "learning_rate": 1.2809106580526636e-06,
+      "loss": 0.0098,
+      "step": 1143
+    },
+    {
+      "clip_ratio": 0.0003143088524097948,
+      "epoch": 1.5621582862776733,
+      "grad_norm": 0.03801356628537178,
+      "kl": 0.006966590881347656,
+      "learning_rate": 1.2754377472825153e-06,
+      "loss": 0.0097,
+      "step": 1144
+    },
+    {
+      "clip_ratio": 0.00035796050920566813,
+      "epoch": 1.564070389292223,
+      "grad_norm": 0.036964964121580124,
+      "kl": 0.006992816925048828,
+      "learning_rate": 1.2699725468203832e-06,
+      "loss": 0.0096,
+      "step": 1145
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 538.6370244026184,
+      "epoch": 1.565982492306773,
+      "grad_norm": 0.045449208468198776,
+      "kl": 0.007224559783935547,
+      "learning_rate": 1.2645150910772413e-06,
+      "loss": 0.0043,
+      "num_tokens": 217697304.0,
+      "reward": 0.07393973600119352,
+      "reward_std": 0.08620888477889821,
+      "rewards/pure_accuracy_reward_math": 0.07393973361467943,
+      "step": 1146
+    },
+    {
+      "clip_ratio": 0.0003596847872131548,
+      "epoch": 1.5678945953213228,
+      "grad_norm": 0.03882161155343056,
+      "kl": 0.006949901580810547,
+      "learning_rate": 1.2590654144152992e-06,
+      "loss": 0.0043,
+      "step": 1147
+    },
+    {
+      "clip_ratio": 0.0004527134210547956,
+      "epoch": 1.569806698335873,
+      "grad_norm": 0.03764580935239792,
+      "kl": 0.00691986083984375,
+      "learning_rate": 1.2536235511477852e-06,
+      "loss": 0.0043,
+      "step": 1148
+    },
+    {
+      "clip_ratio": 0.0005161078099717997,
+      "epoch": 1.5717188013504226,
+      "grad_norm": 0.03833252564072609,
+      "kl": 0.006892681121826172,
+      "learning_rate": 1.2481895355387341e-06,
+      "loss": 0.0042,
+      "step": 1149
+    },
+    {
+      "clip_ratio": 0.0005320426059824968,
+      "epoch": 1.5736309043649728,
+      "grad_norm": 0.03876457363367081,
+      "kl": 0.006943702697753906,
+      "learning_rate": 1.2427634018027673e-06,
+      "loss": 0.0041,
+      "step": 1150
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 529.9707288742065,
+      "epoch": 1.5755430073795225,
+      "grad_norm": 0.03937402740120888,
+      "kl": 0.007305145263671875,
+      "learning_rate": 1.2373451841048781e-06,
+      "loss": 0.0078,
+      "num_tokens": 221325451.0,
+      "reward": 0.08258928963914514,
+      "reward_std": 0.08058846154017374,
+      "rewards/pure_accuracy_reward_math": 0.08258928655413911,
+      "step": 1151
+    },
+    {
+      "clip_ratio": 0.0002857717965980555,
+      "epoch": 1.5774551103940726,
+      "grad_norm": 0.03863917291164398,
+      "kl": 0.007287502288818359,
+      "learning_rate": 1.2319349165602202e-06,
+      "loss": 0.0078,
+      "step": 1152
+    },
+    {
+      "clip_ratio": 0.0002796752659151025,
+      "epoch": 1.5793672134086223,
+      "grad_norm": 0.03722836822271347,
+      "kl": 0.007286548614501953,
+      "learning_rate": 1.2265326332338875e-06,
+      "loss": 0.0077,
+      "step": 1153
+    },
+    {
+      "clip_ratio": 0.00034041513032434523,
+      "epoch": 1.5812793164231724,
+      "grad_norm": 0.03688417002558708,
+      "kl": 0.007335662841796875,
+      "learning_rate": 1.2211383681407022e-06,
+      "loss": 0.0076,
+      "step": 1154
+    },
+    {
+      "clip_ratio": 0.0003595712430524145,
+      "epoch": 1.5831914194377221,
+      "grad_norm": 0.037124987691640854,
+      "kl": 0.007359981536865234,
+      "learning_rate": 1.2157521552450035e-06,
+      "loss": 0.0076,
+      "step": 1155
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 540.098798751831,
+      "epoch": 1.5851035224522723,
+      "grad_norm": 0.03577388823032379,
+      "kl": 0.0069561004638671875,
+      "learning_rate": 1.210374028460428e-06,
+      "loss": 0.0065,
+      "num_tokens": 224996253.0,
+      "reward": 0.06863839607103728,
+      "reward_std": 0.07376563857542351,
+      "rewards/pure_accuracy_reward_math": 0.06863839426659979,
+      "step": 1156
+    },
+    {
+      "clip_ratio": 0.00025091522741149674,
+      "epoch": 1.587015625466822,
+      "grad_norm": 0.03386949375271797,
+      "kl": 0.006894588470458984,
+      "learning_rate": 1.2050040216497e-06,
+      "loss": 0.0065,
+      "step": 1157
+    },
+    {
+      "clip_ratio": 0.00029767470277874963,
+      "epoch": 1.588927728481372,
+      "grad_norm": 0.033231545239686966,
+      "kl": 0.0068531036376953125,
+      "learning_rate": 1.1996421686244179e-06,
+      "loss": 0.0064,
+      "step": 1158
+    },
+    {
+      "clip_ratio": 0.00030627386024661973,
+      "epoch": 1.5908398314959218,
+      "grad_norm": 0.0327543206512928,
+      "kl": 0.006781578063964844,
+      "learning_rate": 1.1942885031448397e-06,
+      "loss": 0.0064,
+      "step": 1159
+    },
+    {
+      "clip_ratio": 0.00032285955057886895,
+      "epoch": 1.5927519345104717,
+      "grad_norm": 0.03283894062042236,
+      "kl": 0.006725788116455078,
+      "learning_rate": 1.1889430589196727e-06,
+      "loss": 0.0063,
+      "step": 1160
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 540.7405333518982,
+      "epoch": 1.5946640375250216,
+      "grad_norm": 0.04240734875202179,
+      "kl": 0.006897449493408203,
+      "learning_rate": 1.183605869605858e-06,
+      "loss": 0.0064,
+      "num_tokens": 228663991.0,
+      "reward": 0.08091518227593042,
+      "reward_std": 0.08951703325146809,
+      "rewards/pure_accuracy_reward_math": 0.08091518018045463,
+      "step": 1161
+    },
+    {
+      "clip_ratio": 0.00035278943187222467,
+      "epoch": 1.5965761405395715,
+      "grad_norm": 0.04050403833389282,
+      "kl": 0.006961345672607422,
+      "learning_rate": 1.1782769688083647e-06,
+      "loss": 0.0064,
+      "step": 1162
+    },
+    {
+      "clip_ratio": 0.00034535837551175064,
+      "epoch": 1.5984882435541214,
+      "grad_norm": 0.03872028365731239,
+      "kl": 0.007065296173095703,
+      "learning_rate": 1.1729563900799695e-06,
+      "loss": 0.0063,
+      "step": 1163
+    },
+    {
+      "clip_ratio": 0.00037939938943054585,
+      "epoch": 1.6004003465686714,
+      "grad_norm": 0.039447493851184845,
+      "kl": 0.007191181182861328,
+      "learning_rate": 1.1676441669210543e-06,
+      "loss": 0.0063,
+      "step": 1164
+    },
+    {
+      "clip_ratio": 0.00037003348657549395,
+      "epoch": 1.6023124495832213,
+      "grad_norm": 0.03724885359406471,
+      "kl": 0.0071163177490234375,
+      "learning_rate": 1.1623403327793881e-06,
+      "loss": 0.0061,
+      "step": 1165
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 531.3211750984192,
+      "epoch": 1.6042245525977712,
+      "grad_norm": 0.9447879791259766,
+      "kl": 0.03227043151855469,
+      "learning_rate": 1.1570449210499213e-06,
+      "loss": 0.0085,
+      "num_tokens": 232302082.0,
+      "reward": 0.07756696781143546,
+      "reward_std": 0.0780110054765828,
+      "rewards/pure_accuracy_reward_math": 0.07756696577416733,
+      "step": 1166
+    },
+    {
+      "clip_ratio": 0.00036849399879201883,
+      "epoch": 1.606136655612321,
+      "grad_norm": 0.26742058992385864,
+      "kl": 0.011518478393554688,
+      "learning_rate": 1.1517579650745713e-06,
+      "loss": 0.0079,
+      "step": 1167
+    },
+    {
+      "clip_ratio": 0.00029733346730154153,
+      "epoch": 1.608048758626871,
+      "grad_norm": 0.3907225728034973,
+      "kl": 0.017581462860107422,
+      "learning_rate": 1.1464794981420187e-06,
+      "loss": 0.0079,
+      "step": 1168
+    },
+    {
+      "clip_ratio": 0.0003680569542439116,
+      "epoch": 1.609960861641421,
+      "grad_norm": 0.1778813600540161,
+      "kl": 0.010699748992919922,
+      "learning_rate": 1.1412095534874912e-06,
+      "loss": 0.0077,
+      "step": 1169
+    },
+    {
+      "clip_ratio": 0.0003726620370798628,
+      "epoch": 1.6118729646559709,
+      "grad_norm": 0.2035137563943863,
+      "kl": 0.01429891586303711,
+      "learning_rate": 1.135948164292557e-06,
+      "loss": 0.0077,
+      "step": 1170
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 519.0362968444824,
+      "epoch": 1.6137850676705208,
+      "grad_norm": 0.040138401091098785,
+      "kl": 0.008060932159423828,
+      "learning_rate": 1.130695363684916e-06,
+      "loss": 0.0096,
+      "num_tokens": 235898380.0,
+      "reward": 0.0630580390279647,
+      "reward_std": 0.07195894001051784,
+      "rewards/pure_accuracy_reward_math": 0.06305803687428124,
+      "step": 1171
+    },
+    {
+      "clip_ratio": 0.0002708259837049809,
+      "epoch": 1.6156971706850707,
+      "grad_norm": 0.03859123960137367,
+      "kl": 0.008191585540771484,
+      "learning_rate": 1.1254511847381922e-06,
+      "loss": 0.0096,
+      "step": 1172
+    },
+    {
+      "clip_ratio": 0.00029455311903348047,
+      "epoch": 1.6176092736996206,
+      "grad_norm": 0.03898981586098671,
+      "kl": 0.008168697357177734,
+      "learning_rate": 1.1202156604717234e-06,
+      "loss": 0.0095,
+      "step": 1173
+    },
+    {
+      "clip_ratio": 0.0003440694692926627,
+      "epoch": 1.6195213767141705,
+      "grad_norm": 0.0370321087539196,
+      "kl": 0.00800466537475586,
+      "learning_rate": 1.1149888238503537e-06,
+      "loss": 0.0094,
+      "step": 1174
+    },
+    {
+      "clip_ratio": 0.00040963905792068545,
+      "epoch": 1.6214334797287204,
+      "grad_norm": 0.03698049858212471,
+      "kl": 0.007803440093994141,
+      "learning_rate": 1.109770707784229e-06,
+      "loss": 0.0094,
+      "step": 1175
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 525.937527179718,
+      "epoch": 1.6233455827432703,
+      "grad_norm": 0.039002615958452225,
+      "kl": 0.007039546966552734,
+      "learning_rate": 1.1045613451285837e-06,
+      "loss": 0.0074,
+      "num_tokens": 239513448.0,
+      "reward": 0.06584821754950099,
+      "reward_std": 0.07595151895657182,
+      "rewards/pure_accuracy_reward_math": 0.06584821516298689,
+      "step": 1176
+    },
+    {
+      "clip_ratio": 0.0003209126220440339,
+      "epoch": 1.6252576857578203,
+      "grad_norm": 0.038693126291036606,
+      "kl": 0.0069637298583984375,
+      "learning_rate": 1.0993607686835408e-06,
+      "loss": 0.0074,
+      "step": 1177
+    },
+    {
+      "clip_ratio": 0.0003234959946212257,
+      "epoch": 1.62716978877237,
+      "grad_norm": 0.03805870935320854,
+      "kl": 0.006987094879150391,
+      "learning_rate": 1.0941690111939002e-06,
+      "loss": 0.0073,
+      "step": 1178
+    },
+    {
+      "clip_ratio": 0.0003316311403978034,
+      "epoch": 1.62908189178692,
+      "grad_norm": 0.03687576577067375,
+      "kl": 0.0070285797119140625,
+      "learning_rate": 1.0889861053489341e-06,
+      "loss": 0.0072,
+      "step": 1179
+    },
+    {
+      "clip_ratio": 0.00033663610071243966,
+      "epoch": 1.6309939948014698,
+      "grad_norm": 0.03717907890677452,
+      "kl": 0.007116794586181641,
+      "learning_rate": 1.0838120837821814e-06,
+      "loss": 0.0071,
+      "step": 1180
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 514.2112393379211,
+      "epoch": 1.63290609781602,
+      "grad_norm": 0.04346395656466484,
+      "kl": 0.007472515106201172,
+      "learning_rate": 1.0786469790712441e-06,
+      "loss": 0.0059,
+      "num_tokens": 243092265.0,
+      "reward": 0.07700893233413808,
+      "reward_std": 0.07526089128805324,
+      "rewards/pure_accuracy_reward_math": 0.07700893029686995,
+      "step": 1181
+    },
+    {
+      "clip_ratio": 0.0002878125141592136,
+      "epoch": 1.6348182008305696,
+      "grad_norm": 0.03890342637896538,
+      "kl": 0.007323265075683594,
+      "learning_rate": 1.0734908237375783e-06,
+      "loss": 0.0059,
+      "step": 1182
+    },
+    {
+      "clip_ratio": 0.00031910790164602076,
+      "epoch": 1.6367303038451197,
+      "grad_norm": 0.03748926892876625,
+      "kl": 0.007243156433105469,
+      "learning_rate": 1.0683436502462915e-06,
+      "loss": 0.0058,
+      "step": 1183
+    },
+    {
+      "clip_ratio": 0.00036283263597169935,
+      "epoch": 1.6386424068596694,
+      "grad_norm": 0.037570755928754807,
+      "kl": 0.007138252258300781,
+      "learning_rate": 1.0632054910059391e-06,
+      "loss": 0.0058,
+      "step": 1184
+    },
+    {
+      "clip_ratio": 0.00039574184188495565,
+      "epoch": 1.6405545098742196,
+      "grad_norm": 0.038306284695863724,
+      "kl": 0.007193088531494141,
+      "learning_rate": 1.0580763783683187e-06,
+      "loss": 0.0057,
+      "step": 1185
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 518.925525188446,
+      "epoch": 1.6424666128887693,
+      "grad_norm": 0.04251728951931,
+      "kl": 0.007372379302978516,
+      "learning_rate": 1.0529563446282665e-06,
+      "loss": 0.01,
+      "num_tokens": 246686482.0,
+      "reward": 0.08537946754950099,
+      "reward_std": 0.08939063869183883,
+      "rewards/pure_accuracy_reward_math": 0.08537946551223285,
+      "step": 1186
+    },
+    {
+      "clip_ratio": 0.0003136689152256622,
+      "epoch": 1.6443787159033194,
+      "grad_norm": 0.04087135195732117,
+      "kl": 0.007419109344482422,
+      "learning_rate": 1.0478454220234568e-06,
+      "loss": 0.0099,
+      "step": 1187
+    },
+    {
+      "clip_ratio": 0.0003467907941399062,
+      "epoch": 1.646290818917869,
+      "grad_norm": 0.039666056632995605,
+      "kl": 0.007442951202392578,
+      "learning_rate": 1.0427436427341939e-06,
+      "loss": 0.0099,
+      "step": 1188
+    },
+    {
+      "clip_ratio": 0.00038431568484043055,
+      "epoch": 1.6482029219324192,
+      "grad_norm": 0.0389142706990242,
+      "kl": 0.007426738739013672,
+      "learning_rate": 1.0376510388832147e-06,
+      "loss": 0.0098,
+      "step": 1189
+    },
+    {
+      "clip_ratio": 0.000490980125164242,
+      "epoch": 1.650115024946969,
+      "grad_norm": 0.03956843912601471,
+      "kl": 0.007406711578369141,
+      "learning_rate": 1.0325676425354828e-06,
+      "loss": 0.0097,
+      "step": 1190
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 508.4835596084595,
+      "epoch": 1.652027127961519,
+      "grad_norm": 0.04898946359753609,
+      "kl": 0.008952617645263672,
+      "learning_rate": 1.0274934856979876e-06,
+      "loss": 0.0069,
+      "num_tokens": 250241299.0,
+      "reward": 0.07868303955183364,
+      "reward_std": 0.08381028211442754,
+      "rewards/pure_accuracy_reward_math": 0.07868303728173487,
+      "step": 1191
+    },
+    {
+      "clip_ratio": 0.0002854310730526777,
+      "epoch": 1.6539392309760688,
+      "grad_norm": 0.04304199293255806,
+      "kl": 0.008716106414794922,
+      "learning_rate": 1.0224286003195437e-06,
+      "loss": 0.0069,
+      "step": 1192
+    },
+    {
+      "clip_ratio": 0.00029722766299755676,
+      "epoch": 1.655851333990619,
+      "grad_norm": 0.039751190692186356,
+      "kl": 0.008554935455322266,
+      "learning_rate": 1.017373018290588e-06,
+      "loss": 0.0068,
+      "step": 1193
+    },
+    {
+      "clip_ratio": 0.00036785421832519205,
+      "epoch": 1.6577634370051686,
+      "grad_norm": 0.039316095411777496,
+      "kl": 0.00851297378540039,
+      "learning_rate": 1.0123267714429826e-06,
+      "loss": 0.0067,
+      "step": 1194
+    },
+    {
+      "clip_ratio": 0.0003976103018885624,
+      "epoch": 1.6596755400197185,
+      "grad_norm": 0.03880908712744713,
+      "kl": 0.008470535278320312,
+      "learning_rate": 1.0072898915498094e-06,
+      "loss": 0.0067,
+      "step": 1195
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 514.2179379463196,
+      "epoch": 1.6615876430342684,
+      "grad_norm": 0.04073133319616318,
+      "kl": 0.0076427459716796875,
+      "learning_rate": 1.0022624103251727e-06,
+      "loss": 0.0095,
+      "num_tokens": 253820892.0,
+      "reward": 0.08593750416184776,
+      "reward_std": 0.08978221646975726,
+      "rewards/pure_accuracy_reward_math": 0.08593750165891834,
+      "step": 1196
+    },
+    {
+      "clip_ratio": 0.0003768215759691884,
+      "epoch": 1.6634997460488183,
+      "grad_norm": 0.039870597422122955,
+      "kl": 0.007634639739990234,
+      "learning_rate": 9.972443594239997e-07,
+      "loss": 0.0095,
+      "step": 1197
+    },
+    {
+      "clip_ratio": 0.00033531371116168884,
+      "epoch": 1.6654118490633683,
+      "grad_norm": 0.039165791124105453,
+      "kl": 0.007609367370605469,
+      "learning_rate": 9.922357704418394e-07,
+      "loss": 0.0094,
+      "step": 1198
+    },
+    {
+      "clip_ratio": 0.0003830786464504854,
+      "epoch": 1.6673239520779182,
+      "grad_norm": 0.0393473282456398,
+      "kl": 0.0076847076416015625,
+      "learning_rate": 9.872366749146684e-07,
+      "loss": 0.0094,
+      "step": 1199
+    },
+    {
+      "clip_ratio": 0.0003766370310813727,
+      "epoch": 1.669236055092468,
+      "grad_norm": 0.037378448992967606,
+      "kl": 0.007641792297363281,
+      "learning_rate": 9.822471043186846e-07,
+      "loss": 0.0093,
+      "step": 1200
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 502.35381841659546,
+      "epoch": 1.671148158107018,
+      "grad_norm": 0.051170479506254196,
+      "kl": 0.008347511291503906,
+      "learning_rate": 9.772670900701172e-07,
+      "loss": 0.0074,
+      "num_tokens": 257360516.0,
+      "reward": 0.08537946784053929,
+      "reward_std": 0.09248606633627787,
+      "rewards/pure_accuracy_reward_math": 0.0853794660361018,
+      "step": 1201
+    },
+    {
+      "clip_ratio": 0.00036896339207714846,
+      "epoch": 1.673060261121568,
+      "grad_norm": 0.04540196433663368,
+      "kl": 0.008112430572509766,
+      "learning_rate": 9.722966635250222e-07,
+      "loss": 0.0074,
+      "step": 1202
+    },
+    {
+      "clip_ratio": 0.00040850058093155894,
+      "epoch": 1.6749723641361178,
+      "grad_norm": 0.0428830124437809,
+      "kl": 0.007869243621826172,
+      "learning_rate": 9.673358559790892e-07,
+      "loss": 0.0073,
+      "step": 1203
+    },
+    {
+      "clip_ratio": 0.0004735397765216476,
+      "epoch": 1.6768844671506677,
+      "grad_norm": 0.04445512220263481,
+      "kl": 0.007699012756347656,
+      "learning_rate": 9.623846986674417e-07,
+      "loss": 0.0072,
+      "step": 1204
+    },
+    {
+      "clip_ratio": 0.00047387216932293086,
+      "epoch": 1.6787965701652177,
+      "grad_norm": 0.04317403957247734,
+      "kl": 0.0076007843017578125,
+      "learning_rate": 9.574432227644432e-07,
+      "loss": 0.0071,
+      "step": 1205
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 511.88367557525635,
+      "epoch": 1.6807086731797676,
+      "grad_norm": 0.041338611394166946,
+      "kl": 0.007639884948730469,
+      "learning_rate": 9.525114593834975e-07,
+      "loss": 0.0077,
+      "num_tokens": 260924667.0,
+      "reward": 0.07617187869618647,
+      "reward_std": 0.08037573983892798,
+      "rewards/pure_accuracy_reward_math": 0.0761718759604264,
+      "step": 1206
+    },
+    {
+      "clip_ratio": 0.00029646307336861355,
+      "epoch": 1.6826207761943175,
+      "grad_norm": 0.040457833558321,
+      "kl": 0.007670402526855469,
+      "learning_rate": 9.475894395768579e-07,
+      "loss": 0.0077,
+      "step": 1207
+    },
+    {
+      "clip_ratio": 0.0003306309376966965,
+      "epoch": 1.6845328792088674,
+      "grad_norm": 0.03946809470653534,
+      "kl": 0.0076751708984375,
+      "learning_rate": 9.426771943354249e-07,
+      "loss": 0.0076,
+      "step": 1208
+    },
+    {
+      "clip_ratio": 0.0003582578942200598,
+      "epoch": 1.6864449822234173,
+      "grad_norm": 0.04006471857428551,
+      "kl": 0.007700443267822266,
+      "learning_rate": 9.377747545885569e-07,
+      "loss": 0.0075,
+      "step": 1209
+    },
+    {
+      "clip_ratio": 0.00040392828321955676,
+      "epoch": 1.6883570852379672,
+      "grad_norm": 0.04037889465689659,
+      "kl": 0.007681369781494141,
+      "learning_rate": 9.328821512038716e-07,
+      "loss": 0.0074,
+      "step": 1210
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 533.6010298728943,
+      "epoch": 1.6902691882525172,
+      "grad_norm": 0.03628333657979965,
+      "kl": 0.006788730621337891,
+      "learning_rate": 9.279994149870539e-07,
+      "loss": 0.0073,
+      "num_tokens": 264564517.0,
+      "reward": 0.06110491382423788,
+      "reward_std": 0.06693661888130009,
+      "rewards/pure_accuracy_reward_math": 0.06110491219442338,
+      "step": 1211
+    },
+    {
+      "clip_ratio": 0.0002594580842014693,
+      "epoch": 1.692181291267067,
+      "grad_norm": 0.034194085747003555,
+      "kl": 0.006678581237792969,
+      "learning_rate": 9.231265766816619e-07,
+      "loss": 0.0073,
+      "step": 1212
+    },
+    {
+      "clip_ratio": 0.0003170226998463477,
+      "epoch": 1.6940933942816168,
+      "grad_norm": 0.035113800317049026,
+      "kl": 0.006625652313232422,
+      "learning_rate": 9.182636669689335e-07,
+      "loss": 0.0073,
+      "step": 1213
+    },
+    {
+      "clip_ratio": 0.0003448430217076748,
+      "epoch": 1.696005497296167,
+      "grad_norm": 0.03626548498868942,
+      "kl": 0.006573200225830078,
+      "learning_rate": 9.134107164675898e-07,
+      "loss": 0.0072,
+      "step": 1214
+    },
+    {
+      "clip_ratio": 0.00033195262278695736,
+      "epoch": 1.6979176003107166,
+      "grad_norm": 0.03465663269162178,
+      "kl": 0.006582736968994141,
+      "learning_rate": 9.085677557336465e-07,
+      "loss": 0.0071,
+      "step": 1215
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 527.8440546989441,
+      "epoch": 1.6998297033252667,
+      "grad_norm": 0.038788389414548874,
+      "kl": 0.009612560272216797,
+      "learning_rate": 9.037348152602199e-07,
+      "loss": 0.0052,
+      "num_tokens": 268179390.0,
+      "reward": 0.07756696798605844,
+      "reward_std": 0.0852254037745297,
+      "rewards/pure_accuracy_reward_math": 0.07756696571595967,
+      "step": 1216
+    },
+    {
+      "clip_ratio": 0.00027092215094626226,
+      "epoch": 1.7017418063398164,
+      "grad_norm": 0.038229282945394516,
+      "kl": 0.009754657745361328,
+      "learning_rate": 8.989119254773343e-07,
+      "loss": 0.0052,
+      "step": 1217
+    },
+    {
+      "clip_ratio": 0.00027246196253827293,
+      "epoch": 1.7036539093543666,
+      "grad_norm": 0.03782220929861069,
+      "kl": 0.009780406951904297,
+      "learning_rate": 8.940991167517313e-07,
+      "loss": 0.0051,
+      "step": 1218
+    },
+    {
+      "clip_ratio": 0.0003069629718197575,
+      "epoch": 1.7055660123689163,
+      "grad_norm": 0.03707100450992584,
+      "kl": 0.00977468490600586,
+      "learning_rate": 8.892964193866799e-07,
+      "loss": 0.005,
+      "step": 1219
+    },
+    {
+      "clip_ratio": 0.0003035257008150438,
+      "epoch": 1.7074781153834664,
+      "grad_norm": 0.03552490472793579,
+      "kl": 0.009665966033935547,
+      "learning_rate": 8.845038636217818e-07,
+      "loss": 0.0049,
+      "step": 1220
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 529.9601240158081,
+      "epoch": 1.709390218398016,
+      "grad_norm": 0.04051567241549492,
+      "kl": 0.007312297821044922,
+      "learning_rate": 8.797214796327843e-07,
+      "loss": 0.0079,
+      "num_tokens": 271808667.0,
+      "reward": 0.08733259368455037,
+      "reward_std": 0.08496641932288185,
+      "rewards/pure_accuracy_reward_math": 0.0873325903667137,
+      "step": 1221
+    },
+    {
+      "clip_ratio": 0.00033132852740891394,
+      "epoch": 1.7113023214125662,
+      "grad_norm": 0.03887411206960678,
+      "kl": 0.007235527038574219,
+      "learning_rate": 8.749492975313897e-07,
+      "loss": 0.0079,
+      "step": 1222
+    },
+    {
+      "clip_ratio": 0.0003587238066984355,
+      "epoch": 1.713214424427116,
+      "grad_norm": 0.04010055959224701,
+      "kl": 0.007251739501953125,
+      "learning_rate": 8.701873473650643e-07,
+      "loss": 0.0079,
+      "step": 1223
+    },
+    {
+      "clip_ratio": 0.0003504625653079074,
+      "epoch": 1.715126527441666,
+      "grad_norm": 0.039550576359033585,
+      "kl": 0.007262229919433594,
+      "learning_rate": 8.654356591168522e-07,
+      "loss": 0.0078,
+      "step": 1224
+    },
+    {
+      "clip_ratio": 0.0003497420942721874,
+      "epoch": 1.7170386304562157,
+      "grad_norm": 0.03883340209722519,
+      "kl": 0.007348537445068359,
+      "learning_rate": 8.60694262705182e-07,
+      "loss": 0.0077,
+      "step": 1225
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 530.5396447181702,
+      "epoch": 1.7189507334707659,
+      "grad_norm": 0.037610165774822235,
+      "kl": 0.007049083709716797,
+      "learning_rate": 8.559631879836838e-07,
+      "loss": 0.0065,
+      "num_tokens": 275440789.0,
+      "reward": 0.07896205675206147,
+      "reward_std": 0.07938606152310967,
+      "rewards/pure_accuracy_reward_math": 0.07896205494762398,
+      "step": 1226
+    },
+    {
+      "clip_ratio": 0.0002787316387298233,
+      "epoch": 1.7208628364853156,
+      "grad_norm": 0.03763109818100929,
+      "kl": 0.007136821746826172,
+      "learning_rate": 8.512424647409964e-07,
+      "loss": 0.0065,
+      "step": 1227
+    },
+    {
+      "clip_ratio": 0.0003178273858566172,
+      "epoch": 1.7227749394998657,
+      "grad_norm": 0.037824735045433044,
+      "kl": 0.007121562957763672,
+      "learning_rate": 8.465321227005823e-07,
+      "loss": 0.0065,
+      "step": 1228
+    },
+    {
+      "clip_ratio": 0.0002866029928725311,
+      "epoch": 1.7246870425144154,
+      "grad_norm": 0.03616493567824364,
+      "kl": 0.00708770751953125,
+      "learning_rate": 8.418321915205399e-07,
+      "loss": 0.0064,
+      "step": 1229
+    },
+    {
+      "clip_ratio": 0.00031164622902224437,
+      "epoch": 1.7265991455289653,
+      "grad_norm": 0.03562076762318611,
+      "kl": 0.007038593292236328,
+      "learning_rate": 8.371427007934174e-07,
+      "loss": 0.0063,
+      "step": 1230
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 536.3178272247314,
+      "epoch": 1.7285112485435152,
+      "grad_norm": 0.03759186714887619,
+      "kl": 0.006800651550292969,
+      "learning_rate": 8.324636800460242e-07,
+      "loss": 0.0071,
+      "num_tokens": 279097568.0,
+      "reward": 0.07728794903960079,
+      "reward_std": 0.07732657541055232,
+      "rewards/pure_accuracy_reward_math": 0.07728794822469354,
+      "step": 1231
+    },
+    {
+      "clip_ratio": 0.00028705537579298834,
+      "epoch": 1.7304233515580651,
+      "grad_norm": 0.036786679178476334,
+      "kl": 0.006786346435546875,
+      "learning_rate": 8.277951587392505e-07,
+      "loss": 0.0071,
+      "step": 1232
+    },
+    {
+      "clip_ratio": 0.000303516245821811,
+      "epoch": 1.732335454572615,
+      "grad_norm": 0.03563455864787102,
+      "kl": 0.0068149566650390625,
+      "learning_rate": 8.231371662678741e-07,
+      "loss": 0.0071,
+      "step": 1233
+    },
+    {
+      "clip_ratio": 0.0003096325264095867,
+      "epoch": 1.734247557587165,
+      "grad_norm": 0.03413652628660202,
+      "kl": 0.006861209869384766,
+      "learning_rate": 8.184897319603813e-07,
+      "loss": 0.007,
+      "step": 1234
+    },
+    {
+      "clip_ratio": 0.0003550405467649398,
+      "epoch": 1.736159660601715,
+      "grad_norm": 0.03433661162853241,
+      "kl": 0.006935596466064453,
+      "learning_rate": 8.138528850787792e-07,
+      "loss": 0.0069,
+      "step": 1235
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 516.8069453239441,
+      "epoch": 1.7380717636162648,
+      "grad_norm": 0.2546544671058655,
+      "kl": 0.012326240539550781,
+      "learning_rate": 8.092266548184139e-07,
+      "loss": 0.011,
+      "num_tokens": 282683384.0,
+      "reward": 0.07477678873692639,
+      "reward_std": 0.08165826951153576,
+      "rewards/pure_accuracy_reward_math": 0.07477678751456551,
+      "step": 1236
+    },
+    {
+      "clip_ratio": 0.00030172572752462656,
+      "epoch": 1.7399838666308147,
+      "grad_norm": 0.042716413736343384,
+      "kl": 0.0078887939453125,
+      "learning_rate": 8.046110703077839e-07,
+      "loss": 0.0108,
+      "step": 1237
+    },
+    {
+      "clip_ratio": 0.00029401268267292835,
+      "epoch": 1.7418959696453646,
+      "grad_norm": 0.038783252239227295,
+      "kl": 0.007707118988037109,
+      "learning_rate": 8.000061606083579e-07,
+      "loss": 0.0107,
+      "step": 1238
+    },
+    {
+      "clip_ratio": 0.00028625389199987694,
+      "epoch": 1.7438080726599146,
+      "grad_norm": 0.0381159707903862,
+      "kl": 0.007790088653564453,
+      "learning_rate": 7.954119547143935e-07,
+      "loss": 0.0107,
+      "step": 1239
+    },
+    {
+      "clip_ratio": 0.00034677153644224745,
+      "epoch": 1.7457201756744645,
+      "grad_norm": 0.038590554147958755,
+      "kl": 0.007785797119140625,
+      "learning_rate": 7.90828481552752e-07,
+      "loss": 0.0106,
+      "step": 1240
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 517.8047132492065,
+      "epoch": 1.7476322786890144,
+      "grad_norm": 0.03943649306893349,
+      "kl": 0.007458209991455078,
+      "learning_rate": 7.862557699827167e-07,
+      "loss": 0.0092,
+      "num_tokens": 286269120.0,
+      "reward": 0.06640625282307155,
+      "reward_std": 0.07607791275950149,
+      "rewards/pure_accuracy_reward_math": 0.06640625130967237,
+      "step": 1241
+    },
+    {
+      "clip_ratio": 0.00031282668544463377,
+      "epoch": 1.7495443817035643,
+      "grad_norm": 0.0388050340116024,
+      "kl": 0.007348060607910156,
+      "learning_rate": 7.816938487958131e-07,
+      "loss": 0.0092,
+      "step": 1242
+    },
+    {
+      "clip_ratio": 0.0003194147345197962,
+      "epoch": 1.7514564847181142,
+      "grad_norm": 0.038322921842336655,
+      "kl": 0.007298946380615234,
+      "learning_rate": 7.771427467156256e-07,
+      "loss": 0.0091,
+      "step": 1243
+    },
+    {
+      "clip_ratio": 0.0003203335651846828,
+      "epoch": 1.7533685877326641,
+      "grad_norm": 0.037499312311410904,
+      "kl": 0.007254600524902344,
+      "learning_rate": 7.726024923976169e-07,
+      "loss": 0.009,
+      "step": 1244
+    },
+    {
+      "clip_ratio": 0.00032696440513291236,
+      "epoch": 1.755280690747214,
+      "grad_norm": 0.03671669587492943,
+      "kl": 0.007252693176269531,
+      "learning_rate": 7.680731144289505e-07,
+      "loss": 0.009,
+      "step": 1245
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 514.8644180297852,
+      "epoch": 1.757192793761764,
+      "grad_norm": 0.04826434701681137,
+      "kl": 0.0094451904296875,
+      "learning_rate": 7.635546413283054e-07,
+      "loss": 0.0078,
+      "num_tokens": 289848950.0,
+      "reward": 0.07421875323052518,
+      "reward_std": 0.07818366138963029,
+      "rewards/pure_accuracy_reward_math": 0.074218751717126,
+      "step": 1246
+    },
+    {
+      "clip_ratio": 0.000299703156713349,
+      "epoch": 1.7591048967763139,
+      "grad_norm": 0.03791136294603348,
+      "kl": 0.009324073791503906,
+      "learning_rate": 7.590471015457002e-07,
+      "loss": 0.0077,
+      "step": 1247
+    },
+    {
+      "clip_ratio": 0.00030542989918558305,
+      "epoch": 1.7610169997908636,
+      "grad_norm": 0.03703403100371361,
+      "kl": 0.009335517883300781,
+      "learning_rate": 7.545505234623152e-07,
+      "loss": 0.0077,
+      "step": 1248
+    },
+    {
+      "clip_ratio": 0.0002983629839832247,
+      "epoch": 1.7629291028054137,
+      "grad_norm": 0.0363752581179142,
+      "kl": 0.009361743927001953,
+      "learning_rate": 7.500649353903092e-07,
+      "loss": 0.0076,
+      "step": 1249
+    },
+    {
+      "clip_ratio": 0.0002923785563098136,
+      "epoch": 1.7648412058199634,
+      "grad_norm": 0.03587965667247772,
+      "kl": 0.009373664855957031,
+      "learning_rate": 7.455903655726437e-07,
+      "loss": 0.0075,
+      "step": 1250
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 510.6543188095093,
+      "epoch": 1.7667533088345135,
+      "grad_norm": 0.03651593253016472,
+      "kl": 0.008678436279296875,
+      "learning_rate": 7.411268421829076e-07,
+      "loss": 0.0059,
+      "num_tokens": 293408275.0,
+      "reward": 0.07031250264844857,
+      "reward_std": 0.07401842658873647,
+      "rewards/pure_accuracy_reward_math": 0.07031250160071068,
+      "step": 1251
+    },
+    {
+      "clip_ratio": 0.000244510552590782,
+      "epoch": 1.7686654118490632,
+      "grad_norm": 0.03525623679161072,
+      "kl": 0.008609294891357422,
+      "learning_rate": 7.366743933251349e-07,
+      "loss": 0.0059,
+      "step": 1252
+    },
+    {
+      "clip_ratio": 0.000242228649824483,
+      "epoch": 1.7705775148636134,
+      "grad_norm": 0.035115260630846024,
+      "kl": 0.008548259735107422,
+      "learning_rate": 7.322330470336314e-07,
+      "loss": 0.0058,
+      "step": 1253
+    },
+    {
+      "clip_ratio": 0.0002641637478291159,
+      "epoch": 1.772489617878163,
+      "grad_norm": 0.03518166393041611,
+      "kl": 0.008442401885986328,
+      "learning_rate": 7.278028312727961e-07,
+      "loss": 0.0058,
+      "step": 1254
+    },
+    {
+      "clip_ratio": 0.0002555919315909705,
+      "epoch": 1.7744017208927132,
+      "grad_norm": 0.03385892137885094,
+      "kl": 0.00841379165649414,
+      "learning_rate": 7.233837739369462e-07,
+      "loss": 0.0057,
+      "step": 1255
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 513.7271451950073,
+      "epoch": 1.776313823907263,
+      "grad_norm": 0.03341628611087799,
+      "kl": 0.006855964660644531,
+      "learning_rate": 7.189759028501417e-07,
+      "loss": 0.0062,
+      "num_tokens": 296984393.0,
+      "reward": 0.06556919915601611,
+      "reward_std": 0.06311669771093875,
+      "rewards/pure_accuracy_reward_math": 0.06556919775903225,
+      "step": 1256
+    },
+    {
+      "clip_ratio": 0.0002122660096688378,
+      "epoch": 1.778225926921813,
+      "grad_norm": 0.03227659687399864,
+      "kl": 0.006803989410400391,
+      "learning_rate": 7.145792457660083e-07,
+      "loss": 0.0062,
+      "step": 1257
+    },
+    {
+      "clip_ratio": 0.00023682935608348998,
+      "epoch": 1.7801380299363627,
+      "grad_norm": 0.03206360712647438,
+      "kl": 0.006758213043212891,
+      "learning_rate": 7.101938303675674e-07,
+      "loss": 0.0062,
+      "step": 1258
+    },
+    {
+      "clip_ratio": 0.0002413284565250251,
+      "epoch": 1.7820501329509129,
+      "grad_norm": 0.031279318034648895,
+      "kl": 0.006762981414794922,
+      "learning_rate": 7.058196842670548e-07,
+      "loss": 0.0061,
+      "step": 1259
+    },
+    {
+      "clip_ratio": 0.0002680151189338176,
+      "epoch": 1.7839622359654626,
+      "grad_norm": 0.031049314886331558,
+      "kl": 0.006676197052001953,
+      "learning_rate": 7.014568350057516e-07,
+      "loss": 0.0061,
+      "step": 1260
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 532.2553224563599,
+      "epoch": 1.7858743389800127,
+      "grad_norm": 0.03635333850979805,
+      "kl": 0.007339000701904297,
+      "learning_rate": 6.971053100538116e-07,
+      "loss": 0.0066,
+      "num_tokens": 300622928.0,
+      "reward": 0.0711495568684768,
+      "reward_std": 0.07668221119092777,
+      "rewards/pure_accuracy_reward_math": 0.07114955512224697,
+      "step": 1261
+    },
+    {
+      "clip_ratio": 0.00025942773436327116,
+      "epoch": 1.7877864419945624,
+      "grad_norm": 0.03595859929919243,
+      "kl": 0.007373332977294922,
+      "learning_rate": 6.927651368100843e-07,
+      "loss": 0.0065,
+      "step": 1262
+    },
+    {
+      "clip_ratio": 0.00026420129074722354,
+      "epoch": 1.7896985450091125,
+      "grad_norm": 0.034778136759996414,
+      "kl": 0.00739288330078125,
+      "learning_rate": 6.884363426019444e-07,
+      "loss": 0.0065,
+      "step": 1263
+    },
+    {
+      "clip_ratio": 0.0002875854173112202,
+      "epoch": 1.7916106480236622,
+      "grad_norm": 0.035560280084609985,
+      "kl": 0.007449150085449219,
+      "learning_rate": 6.841189546851224e-07,
+      "loss": 0.0064,
+      "step": 1264
+    },
+    {
+      "clip_ratio": 0.00026737677507071567,
+      "epoch": 1.7935227510382123,
+      "grad_norm": 0.03407442197203636,
+      "kl": 0.007452964782714844,
+      "learning_rate": 6.79813000243528e-07,
+      "loss": 0.0064,
+      "step": 1265
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 523.543550491333,
+      "epoch": 1.795434854052762,
+      "grad_norm": 0.03908964619040489,
+      "kl": 0.008809566497802734,
+      "learning_rate": 6.755185063890818e-07,
+      "loss": 0.0074,
+      "num_tokens": 304236988.0,
+      "reward": 0.0747767890279647,
+      "reward_std": 0.07865536957979202,
+      "rewards/pure_accuracy_reward_math": 0.07477678745635785,
+      "step": 1266
+    },
+    {
+      "clip_ratio": 0.0002752643416670253,
+      "epoch": 1.797346957067312,
+      "grad_norm": 0.0380408875644207,
+      "kl": 0.00884389877319336,
+      "learning_rate": 6.71235500161545e-07,
+      "loss": 0.0074,
+      "step": 1267
+    },
+    {
+      "clip_ratio": 0.0002959408872698077,
+      "epoch": 1.7992590600818619,
+      "grad_norm": 0.03713267296552658,
+      "kl": 0.008931636810302734,
+      "learning_rate": 6.669640085283479e-07,
+      "loss": 0.0073,
+      "step": 1268
+    },
+    {
+      "clip_ratio": 0.0003134474755484007,
+      "epoch": 1.8011711630964118,
+      "grad_norm": 0.03684492036700249,
+      "kl": 0.008975982666015625,
+      "learning_rate": 6.627040583844199e-07,
+      "loss": 0.0073,
+      "step": 1269
+    },
+    {
+      "clip_ratio": 0.0003336208075666036,
+      "epoch": 1.8030832661109617,
+      "grad_norm": 0.0364052951335907,
+      "kl": 0.009007453918457031,
+      "learning_rate": 6.584556765520231e-07,
+      "loss": 0.0072,
+      "step": 1270
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 532.5468997955322,
+      "epoch": 1.8049953691255116,
+      "grad_norm": 0.03688374161720276,
+      "kl": 0.006972789764404297,
+      "learning_rate": 6.542188897805782e-07,
+      "loss": 0.0076,
+      "num_tokens": 307881200.0,
+      "reward": 0.06082589610014111,
+      "reward_std": 0.06925509008578956,
+      "rewards/pure_accuracy_reward_math": 0.06082589423749596,
+      "step": 1271
+    },
+    {
+      "clip_ratio": 0.0002535940801635661,
+      "epoch": 1.8069074721400615,
+      "grad_norm": 0.03543318435549736,
+      "kl": 0.006913661956787109,
+      "learning_rate": 6.499937247465002e-07,
+      "loss": 0.0076,
+      "step": 1272
+    },
+    {
+      "clip_ratio": 0.00029529011806062044,
+      "epoch": 1.8088195751546114,
+      "grad_norm": 0.034321434795856476,
+      "kl": 0.006764411926269531,
+      "learning_rate": 6.457802080530304e-07,
+      "loss": 0.0075,
+      "step": 1273
+    },
+    {
+      "clip_ratio": 0.00032198404306882367,
+      "epoch": 1.8107316781691614,
+      "grad_norm": 0.03342648968100548,
+      "kl": 0.006732940673828125,
+      "learning_rate": 6.415783662300662e-07,
+      "loss": 0.0075,
+      "step": 1274
+    },
+    {
+      "clip_ratio": 0.000381207836142039,
+      "epoch": 1.8126437811837113,
+      "grad_norm": 0.034588467329740524,
+      "kl": 0.006687164306640625,
+      "learning_rate": 6.373882257339964e-07,
+      "loss": 0.0074,
+      "step": 1275
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 528.7452836036682,
+      "epoch": 1.8145558841982612,
+      "grad_norm": 0.039650533348321915,
+      "kl": 0.012791156768798828,
+      "learning_rate": 6.33209812947532e-07,
+      "loss": 0.0068,
+      "num_tokens": 311509399.0,
+      "reward": 0.06919643239234574,
+      "reward_std": 0.07131457643117756,
+      "rewards/pure_accuracy_reward_math": 0.06919642988941632,
+      "step": 1276
+    },
+    {
+      "clip_ratio": 0.00028128568749252736,
+      "epoch": 1.816467987212811,
+      "grad_norm": 0.039305564016103745,
+      "kl": 0.012639522552490234,
+      "learning_rate": 6.290431541795456e-07,
+      "loss": 0.0068,
+      "step": 1277
+    },
+    {
+      "clip_ratio": 0.00027201296376233586,
+      "epoch": 1.818380090227361,
+      "grad_norm": 0.038404785096645355,
+      "kl": 0.012586116790771484,
+      "learning_rate": 6.248882756648988e-07,
+      "loss": 0.0067,
+      "step": 1278
+    },
+    {
+      "clip_ratio": 0.00027703067632955936,
+      "epoch": 1.820292193241911,
+      "grad_norm": 0.037614692002534866,
+      "kl": 0.01236581802368164,
+      "learning_rate": 6.207452035642814e-07,
+      "loss": 0.0066,
+      "step": 1279
+    },
+    {
+      "clip_ratio": 0.000309511864088563,
+      "epoch": 1.8222042962564609,
+      "grad_norm": 0.03737355023622513,
+      "kl": 0.012206554412841797,
+      "learning_rate": 6.166139639640454e-07,
+      "loss": 0.0065,
+      "step": 1280
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 526.473795413971,
+      "epoch": 1.8241163992710108,
+      "grad_norm": 0.03713076934218407,
+      "kl": 0.007002353668212891,
+      "learning_rate": 6.124945828760406e-07,
+      "loss": 0.0059,
+      "num_tokens": 315129533.0,
+      "reward": 0.06445312840514816,
+      "reward_std": 0.06921502435579896,
+      "rewards/pure_accuracy_reward_math": 0.0644531259604264,
+      "step": 1281
+    },
+    {
+      "clip_ratio": 0.00024346445911760384,
+      "epoch": 1.8260285022855607,
+      "grad_norm": 0.03588669002056122,
+      "kl": 0.006989955902099609,
+      "learning_rate": 6.083870862374513e-07,
+      "loss": 0.0059,
+      "step": 1282
+    },
+    {
+      "clip_ratio": 0.0002329723478737833,
+      "epoch": 1.8279406053001104,
+      "grad_norm": 0.03526683151721954,
+      "kl": 0.007010459899902344,
+      "learning_rate": 6.042914999106342e-07,
+      "loss": 0.0058,
+      "step": 1283
+    },
+    {
+      "clip_ratio": 0.00023291378442991117,
+      "epoch": 1.8298527083146605,
+      "grad_norm": 0.03384559601545334,
+      "kl": 0.007075786590576172,
+      "learning_rate": 6.002078496829514e-07,
+      "loss": 0.0058,
+      "step": 1284
+    },
+    {
+      "clip_ratio": 0.0002458733478647446,
+      "epoch": 1.8317648113292102,
+      "grad_norm": 0.03377237543463707,
+      "kl": 0.0071315765380859375,
+      "learning_rate": 5.961361612666139e-07,
+      "loss": 0.0057,
+      "step": 1285
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 525.0859618186951,
+      "epoch": 1.8336769143437603,
+      "grad_norm": 0.0914173573255539,
+      "kl": 0.012554645538330078,
+      "learning_rate": 5.920764602985141e-07,
+      "loss": 0.0058,
+      "num_tokens": 318747025.0,
+      "reward": 0.06612723506987095,
+      "reward_std": 0.06865079142153263,
+      "rewards/pure_accuracy_reward_math": 0.06612723355647177,
+      "step": 1286
+    },
+    {
+      "clip_ratio": 0.00025586230526641884,
+      "epoch": 1.83558901735831,
+      "grad_norm": 0.04225718230009079,
+      "kl": 0.010876655578613281,
+      "learning_rate": 5.88028772340068e-07,
+      "loss": 0.0057,
+      "step": 1287
+    },
+    {
+      "clip_ratio": 0.00024814432106268214,
+      "epoch": 1.8375011203728602,
+      "grad_norm": 0.03636258468031883,
+      "kl": 0.010531425476074219,
+      "learning_rate": 5.839931228770526e-07,
+      "loss": 0.0057,
+      "step": 1288
+    },
+    {
+      "clip_ratio": 0.0002984523198108491,
+      "epoch": 1.8394132233874099,
+      "grad_norm": 0.03610241040587425,
+      "kl": 0.010416984558105469,
+      "learning_rate": 5.799695373194461e-07,
+      "loss": 0.0056,
+      "step": 1289
+    },
+    {
+      "clip_ratio": 0.00032527196299270145,
+      "epoch": 1.84132532640196,
+      "grad_norm": 0.034912850707769394,
+      "kl": 0.010428428649902344,
+      "learning_rate": 5.759580410012691e-07,
+      "loss": 0.0055,
+      "step": 1290
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 520.4793767929077,
+      "epoch": 1.8432374294165097,
+      "grad_norm": 0.04220513626933098,
+      "kl": 0.009058475494384766,
+      "learning_rate": 5.719586591804222e-07,
+      "loss": 0.0071,
+      "num_tokens": 322345307.0,
+      "reward": 0.07366071786964312,
+      "reward_std": 0.07878176297526807,
+      "rewards/pure_accuracy_reward_math": 0.07366071542492136,
+      "step": 1291
+    },
+    {
+      "clip_ratio": 0.00030183524040694465,
+      "epoch": 1.8451495324310598,
+      "grad_norm": 0.03849344700574875,
+      "kl": 0.009106636047363281,
+      "learning_rate": 5.679714170385283e-07,
+      "loss": 0.0071,
+      "step": 1292
+    },
+    {
+      "clip_ratio": 0.00035880112773156725,
+      "epoch": 1.8470616354456095,
+      "grad_norm": 0.037096235901117325,
+      "kl": 0.009167194366455078,
+      "learning_rate": 5.63996339680776e-07,
+      "loss": 0.0071,
+      "step": 1293
+    },
+    {
+      "clip_ratio": 0.00040293739141361584,
+      "epoch": 1.8489737384601597,
+      "grad_norm": 0.03884498402476311,
+      "kl": 0.009192943572998047,
+      "learning_rate": 5.600334521357581e-07,
+      "loss": 0.007,
+      "step": 1294
+    },
+    {
+      "clip_ratio": 0.00038201194092835067,
+      "epoch": 1.8508858414747094,
+      "grad_norm": 0.03875093162059784,
+      "kl": 0.009291648864746094,
+      "learning_rate": 5.560827793553159e-07,
+      "loss": 0.0069,
+      "step": 1295
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 518.3301024436951,
+      "epoch": 1.8527979444892595,
+      "grad_norm": 0.04254430532455444,
+      "kl": 0.008441925048828125,
+      "learning_rate": 5.52144346214383e-07,
+      "loss": 0.0063,
+      "num_tokens": 325938766.0,
+      "reward": 0.07840402127476409,
+      "reward_std": 0.08084744628285989,
+      "rewards/pure_accuracy_reward_math": 0.07840401929570362,
+      "step": 1296
+    },
+    {
+      "clip_ratio": 0.0002986583057804637,
+      "epoch": 1.8547100475038092,
+      "grad_norm": 0.041676584631204605,
+      "kl": 0.008450508117675781,
+      "learning_rate": 5.482181775108278e-07,
+      "loss": 0.0062,
+      "step": 1297
+    },
+    {
+      "clip_ratio": 0.00031948441494478175,
+      "epoch": 1.8566221505183593,
+      "grad_norm": 0.03955300524830818,
+      "kl": 0.008507251739501953,
+      "learning_rate": 5.443042979652957e-07,
+      "loss": 0.0062,
+      "step": 1298
+    },
+    {
+      "clip_ratio": 0.0003085145480667961,
+      "epoch": 1.858534253532909,
+      "grad_norm": 0.03848061338067055,
+      "kl": 0.008501052856445312,
+      "learning_rate": 5.404027322210556e-07,
+      "loss": 0.0061,
+      "step": 1299
+    },
+    {
+      "clip_ratio": 0.0003855731235944404,
+      "epoch": 1.8604463565474592,
+      "grad_norm": 0.04076399654150009,
+      "kl": 0.00849771499633789,
+      "learning_rate": 5.365135048438438e-07,
+      "loss": 0.006,
+      "step": 1300
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 529.5170464515686,
+      "epoch": 1.8623584595620088,
+      "grad_norm": 0.14906181395053864,
+      "kl": 0.007767677307128906,
+      "learning_rate": 5.326366403217093e-07,
+      "loss": 0.0084,
+      "num_tokens": 329571311.0,
+      "reward": 0.07254464630386792,
+      "reward_std": 0.08418946416350082,
+      "rewards/pure_accuracy_reward_math": 0.07254464438301511,
+      "step": 1301
+    },
+    {
+      "clip_ratio": 0.00028383656763253384,
+      "epoch": 1.8642705625765588,
+      "grad_norm": 0.04550671949982643,
+      "kl": 0.008212089538574219,
+      "learning_rate": 5.287721630648615e-07,
+      "loss": 0.0083,
+      "step": 1302
+    },
+    {
+      "clip_ratio": 0.0003281467976989916,
+      "epoch": 1.8661826655911087,
+      "grad_norm": 0.05260877683758736,
+      "kl": 0.008829593658447266,
+      "learning_rate": 5.249200974055132e-07,
+      "loss": 0.0083,
+      "step": 1303
+    },
+    {
+      "clip_ratio": 0.00036754867960553383,
+      "epoch": 1.8680947686056586,
+      "grad_norm": 0.0511869452893734,
+      "kl": 0.008836746215820312,
+      "learning_rate": 5.210804675977299e-07,
+      "loss": 0.0082,
+      "step": 1304
+    },
+    {
+      "clip_ratio": 0.0004018283953541868,
+      "epoch": 1.8700068716202085,
+      "grad_norm": 0.044321924448013306,
+      "kl": 0.008379459381103516,
+      "learning_rate": 5.172532978172753e-07,
+      "loss": 0.0081,
+      "step": 1305
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 512.9788198471069,
+      "epoch": 1.8719189746347584,
+      "grad_norm": 0.04202428087592125,
+      "kl": 0.0076198577880859375,
+      "learning_rate": 5.134386121614615e-07,
+      "loss": 0.0072,
+      "num_tokens": 333143795.0,
+      "reward": 0.07421875317231752,
+      "reward_std": 0.07986396714113653,
+      "rewards/pure_accuracy_reward_math": 0.074218751717126,
+      "step": 1306
+    },
+    {
+      "clip_ratio": 0.00027569573836672134,
+      "epoch": 1.8738310776493083,
+      "grad_norm": 0.040443304926157,
+      "kl": 0.007631778717041016,
+      "learning_rate": 5.096364346489935e-07,
+      "loss": 0.0072,
+      "step": 1307
+    },
+    {
+      "clip_ratio": 0.00027392168607320855,
+      "epoch": 1.8757431806638583,
+      "grad_norm": 0.040238041430711746,
+      "kl": 0.007664203643798828,
+      "learning_rate": 5.058467892198241e-07,
+      "loss": 0.0071,
+      "step": 1308
+    },
+    {
+      "clip_ratio": 0.0003170029604007141,
+      "epoch": 1.8776552836784082,
+      "grad_norm": 0.039109617471694946,
+      "kl": 0.007664203643798828,
+      "learning_rate": 5.02069699734995e-07,
+      "loss": 0.007,
+      "step": 1309
+    },
+    {
+      "clip_ratio": 0.0003183572773082233,
+      "epoch": 1.879567386692958,
+      "grad_norm": 0.03724955767393112,
+      "kl": 0.007700443267822266,
+      "learning_rate": 4.983051899764946e-07,
+      "loss": 0.007,
+      "step": 1310
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 505.4592852592468,
+      "epoch": 1.881479489707508,
+      "grad_norm": 0.03964386135339737,
+      "kl": 0.007820606231689453,
+      "learning_rate": 4.945532836471026e-07,
+      "loss": 0.0074,
+      "num_tokens": 336685165.0,
+      "reward": 0.0848214327415917,
+      "reward_std": 0.07835631881607696,
+      "rewards/pure_accuracy_reward_math": 0.08482142965658568,
+      "step": 1311
+    },
+    {
+      "clip_ratio": 0.0002873320136700386,
+      "epoch": 1.883391592722058,
+      "grad_norm": 0.03871289640665054,
+      "kl": 0.007764339447021484,
+      "learning_rate": 4.908140043702426e-07,
+      "loss": 0.0074,
+      "step": 1312
+    },
+    {
+      "clip_ratio": 0.0003113469839775007,
+      "epoch": 1.8853036957366078,
+      "grad_norm": 0.03769771382212639,
+      "kl": 0.007766246795654297,
+      "learning_rate": 4.870873756898345e-07,
+      "loss": 0.0074,
+      "step": 1313
+    },
+    {
+      "clip_ratio": 0.00034381698696961394,
+      "epoch": 1.8872157987511577,
+      "grad_norm": 0.03724011033773422,
+      "kl": 0.007775783538818359,
+      "learning_rate": 4.833734210701435e-07,
+      "loss": 0.0073,
+      "step": 1314
+    },
+    {
+      "clip_ratio": 0.0003651243675335536,
+      "epoch": 1.8891279017657077,
+      "grad_norm": 0.03757576644420624,
+      "kl": 0.007784366607666016,
+      "learning_rate": 4.796721638956376e-07,
+      "loss": 0.0072,
+      "step": 1315
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 527.5703339576721,
+      "epoch": 1.8910400047802576,
+      "grad_norm": 0.03592124208807945,
+      "kl": 0.007517337799072266,
+      "learning_rate": 4.7598362747083293e-07,
+      "loss": 0.008,
+      "num_tokens": 340304225.0,
+      "reward": 0.06501116388244554,
+      "reward_std": 0.0762443722342141,
+      "rewards/pure_accuracy_reward_math": 0.06501116219442338,
+      "step": 1316
+    },
+    {
+      "clip_ratio": 0.00026663288446115985,
+      "epoch": 1.8929521077948075,
+      "grad_norm": 0.03529619425535202,
+      "kl": 0.007477283477783203,
+      "learning_rate": 4.7230783502015346e-07,
+      "loss": 0.008,
+      "step": 1317
+    },
+    {
+      "clip_ratio": 0.00025462434007295087,
+      "epoch": 1.8948642108093574,
+      "grad_norm": 0.03387421742081642,
+      "kl": 0.007337093353271484,
+      "learning_rate": 4.6864480968778103e-07,
+      "loss": 0.008,
+      "step": 1318
+    },
+    {
+      "clip_ratio": 0.00031681645646131074,
+      "epoch": 1.8967763138239073,
+      "grad_norm": 0.033014364540576935,
+      "kl": 0.007318019866943359,
+      "learning_rate": 4.649945745375109e-07,
+      "loss": 0.0079,
+      "step": 1319
+    },
+    {
+      "clip_ratio": 0.00037019279989181086,
+      "epoch": 1.898688416838457,
+      "grad_norm": 0.033140987157821655,
+      "kl": 0.007157325744628906,
+      "learning_rate": 4.613571525526081e-07,
+      "loss": 0.0078,
+      "step": 1320
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 523.3727917671204,
+      "epoch": 1.9006005198530072,
+      "grad_norm": 0.03997303172945976,
+      "kl": 0.007628440856933594,
+      "learning_rate": 4.577325666356586e-07,
+      "loss": 0.0118,
+      "num_tokens": 343915401.0,
+      "reward": 0.08816964740981348,
+      "reward_std": 0.08973595389397815,
+      "rewards/pure_accuracy_reward_math": 0.08816964426659979,
+      "step": 1321
+    },
+    {
+      "clip_ratio": 0.0003053776546835252,
+      "epoch": 1.9025126228675568,
+      "grad_norm": 0.039738208055496216,
+      "kl": 0.007574558258056641,
+      "learning_rate": 4.541208396084304e-07,
+      "loss": 0.0117,
+      "step": 1322
+    },
+    {
+      "clip_ratio": 0.00030029478972437573,
+      "epoch": 1.904424725882107,
+      "grad_norm": 0.038392502814531326,
+      "kl": 0.007514476776123047,
+      "learning_rate": 4.5052199421172475e-07,
+      "loss": 0.0117,
+      "step": 1323
+    },
+    {
+      "clip_ratio": 0.0003343055576010556,
+      "epoch": 1.9063368288966567,
+      "grad_norm": 0.037236347794532776,
+      "kl": 0.007477760314941406,
+      "learning_rate": 4.4693605310523636e-07,
+      "loss": 0.0116,
+      "step": 1324
+    },
+    {
+      "clip_ratio": 0.00032557199602933906,
+      "epoch": 1.9082489319112068,
+      "grad_norm": 0.03678731992840767,
+      "kl": 0.007478237152099609,
+      "learning_rate": 4.43363038867409e-07,
+      "loss": 0.0115,
+      "step": 1325
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 513.3047099113464,
+      "epoch": 1.9101610349257565,
+      "grad_norm": 0.11113768815994263,
+      "kl": 0.013922691345214844,
+      "learning_rate": 4.39802973995295e-07,
+      "loss": 0.0093,
+      "num_tokens": 347490901.0,
+      "reward": 0.09486607549479231,
+      "reward_std": 0.09372853260720149,
+      "rewards/pure_accuracy_reward_math": 0.09486607305007055,
+      "step": 1326
+    },
+    {
+      "clip_ratio": 0.00036943193325100765,
+      "epoch": 1.9120731379403066,
+      "grad_norm": 0.055216722190380096,
+      "kl": 0.013732433319091797,
+      "learning_rate": 4.362558809044107e-07,
+      "loss": 0.0093,
+      "step": 1327
+    },
+    {
+      "clip_ratio": 0.0004000666916681439,
+      "epoch": 1.9139852409548563,
+      "grad_norm": 0.045698132365942,
+      "kl": 0.013063907623291016,
+      "learning_rate": 4.327217819286e-07,
+      "loss": 0.0092,
+      "step": 1328
+    },
+    {
+      "clip_ratio": 0.0004443397794489101,
+      "epoch": 1.9158973439694065,
+      "grad_norm": 0.04273562505841255,
+      "kl": 0.012539863586425781,
+      "learning_rate": 4.292006993198888e-07,
+      "loss": 0.009,
+      "step": 1329
+    },
+    {
+      "clip_ratio": 0.0004470848766686686,
+      "epoch": 1.9178094469839562,
+      "grad_norm": 0.04232070967555046,
+      "kl": 0.012142658233642578,
+      "learning_rate": 4.2569265524834756e-07,
+      "loss": 0.0089,
+      "step": 1330
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 518.7550463676453,
+      "epoch": 1.9197215499985063,
+      "grad_norm": 0.03724661469459534,
+      "kl": 0.007449150085449219,
+      "learning_rate": 4.221976718019505e-07,
+      "loss": 0.007,
+      "num_tokens": 351086731.0,
+      "reward": 0.06919643189758062,
+      "reward_std": 0.07200520270271227,
+      "rewards/pure_accuracy_reward_math": 0.06919642974389717,
+      "step": 1331
+    },
+    {
+      "clip_ratio": 0.00027471570277270985,
+      "epoch": 1.921633653013056,
+      "grad_norm": 0.03599303960800171,
+      "kl": 0.007382869720458984,
+      "learning_rate": 4.187157709864392e-07,
+      "loss": 0.007,
+      "step": 1332
+    },
+    {
+      "clip_ratio": 0.0002737036326720954,
+      "epoch": 1.9235457560276061,
+      "grad_norm": 0.03614535927772522,
+      "kl": 0.007375240325927734,
+      "learning_rate": 4.152469747251794e-07,
+      "loss": 0.0069,
+      "step": 1333
+    },
+    {
+      "clip_ratio": 0.00030229948259830053,
+      "epoch": 1.9254578590421558,
+      "grad_norm": 0.03546711429953575,
+      "kl": 0.0072498321533203125,
+      "learning_rate": 4.117913048590283e-07,
+      "loss": 0.0069,
+      "step": 1334
+    },
+    {
+      "clip_ratio": 0.00030038867771509103,
+      "epoch": 1.927369962056706,
+      "grad_norm": 0.03401359170675278,
+      "kl": 0.007149219512939453,
+      "learning_rate": 4.0834878314619244e-07,
+      "loss": 0.0068,
+      "step": 1335
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 526.2182154655457,
+      "epoch": 1.9292820650712557,
+      "grad_norm": 0.04080551117658615,
+      "kl": 0.006867885589599609,
+      "learning_rate": 4.049194312620927e-07,
+      "loss": 0.0092,
+      "num_tokens": 354708525.0,
+      "reward": 0.07756696798605844,
+      "reward_std": 0.08467356563778594,
+      "rewards/pure_accuracy_reward_math": 0.07756696530850604,
+      "step": 1336
+    },
+    {
+      "clip_ratio": 0.0002796990767137686,
+      "epoch": 1.9311941680858056,
+      "grad_norm": 0.038895782083272934,
+      "kl": 0.006824970245361328,
+      "learning_rate": 4.015032707992286e-07,
+      "loss": 0.0092,
+      "step": 1337
+    },
+    {
+      "clip_ratio": 0.00032694752422912643,
+      "epoch": 1.9331062711003555,
+      "grad_norm": 0.03889061138033867,
+      "kl": 0.006866931915283203,
+      "learning_rate": 3.9810032326704106e-07,
+      "loss": 0.0091,
+      "step": 1338
+    },
+    {
+      "clip_ratio": 0.0003511786251237936,
+      "epoch": 1.9350183741149054,
+      "grad_norm": 0.03880919888615608,
+      "kl": 0.006947994232177734,
+      "learning_rate": 3.9471061009177693e-07,
+      "loss": 0.009,
+      "step": 1339
+    },
+    {
+      "clip_ratio": 0.000323922223401496,
+      "epoch": 1.9369304771294553,
+      "grad_norm": 0.036964643746614456,
+      "kl": 0.007033824920654297,
+      "learning_rate": 3.91334152616355e-07,
+      "loss": 0.0089,
+      "step": 1340
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 527.7076120376587,
+      "epoch": 1.9388425801440052,
+      "grad_norm": 0.04040682688355446,
+      "kl": 0.007448673248291016,
+      "learning_rate": 3.879709721002317e-07,
+      "loss": 0.0052,
+      "num_tokens": 358339045.0,
+      "reward": 0.07896205660654232,
+      "reward_std": 0.08278053888352588,
+      "rewards/pure_accuracy_reward_math": 0.07896205550059676,
+      "step": 1341
+    },
+    {
+      "clip_ratio": 0.00029579239503618737,
+      "epoch": 1.9407546831585551,
+      "grad_norm": 0.03910582885146141,
+      "kl": 0.007539272308349609,
+      "learning_rate": 3.8462108971926564e-07,
+      "loss": 0.0052,
+      "step": 1342
+    },
+    {
+      "clip_ratio": 0.0003078770084812277,
+      "epoch": 1.942666786173105,
+      "grad_norm": 0.03942732512950897,
+      "kl": 0.007628440856933594,
+      "learning_rate": 3.8128452656558623e-07,
+      "loss": 0.0051,
+      "step": 1343
+    },
+    {
+      "clip_ratio": 0.0003229538778555252,
+      "epoch": 1.944578889187655,
+      "grad_norm": 0.03747202083468437,
+      "kl": 0.007678031921386719,
+      "learning_rate": 3.779613036474583e-07,
+      "loss": 0.005,
+      "step": 1344
+    },
+    {
+      "clip_ratio": 0.000363169818285769,
+      "epoch": 1.946490992202205,
+      "grad_norm": 0.036778781563043594,
+      "kl": 0.0076923370361328125,
+      "learning_rate": 3.746514418891545e-07,
+      "loss": 0.0049,
+      "step": 1345
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 532.7960658073425,
+      "epoch": 1.9484030952167548,
+      "grad_norm": 0.040943268686532974,
+      "kl": 0.011704444885253906,
+      "learning_rate": 3.713549621308174e-07,
+      "loss": 0.005,
+      "num_tokens": 361980918.0,
+      "reward": 0.07059152092551813,
+      "reward_std": 0.07973137585213408,
+      "rewards/pure_accuracy_reward_math": 0.07059151900466532,
+      "step": 1346
+    },
+    {
+      "clip_ratio": 0.00029914512055029263,
+      "epoch": 1.9503151982313047,
+      "grad_norm": 0.04052672162652016,
+      "kl": 0.0114288330078125,
+      "learning_rate": 3.6807188512833406e-07,
+      "loss": 0.005,
+      "step": 1347
+    },
+    {
+      "clip_ratio": 0.000334167169853572,
+      "epoch": 1.9522273012458546,
+      "grad_norm": 0.04054692015051842,
+      "kl": 0.011135578155517578,
+      "learning_rate": 3.648022315532007e-07,
+      "loss": 0.0049,
+      "step": 1348
+    },
+    {
+      "clip_ratio": 0.00035840429575273447,
+      "epoch": 1.9541394042604046,
+      "grad_norm": 0.03996079042553902,
+      "kl": 0.010680675506591797,
+      "learning_rate": 3.615460219923955e-07,
+      "loss": 0.0048,
+      "step": 1349
+    },
+    {
+      "clip_ratio": 0.00034668986540964397,
+      "epoch": 1.9560515072749545,
+      "grad_norm": 0.037566084414720535,
+      "kl": 0.010373115539550781,
+      "learning_rate": 3.5830327694824777e-07,
+      "loss": 0.0047,
+      "step": 1350
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 534.6453948020935,
+      "epoch": 1.9579636102895044,
+      "grad_norm": 0.03812556713819504,
+      "kl": 0.007121086120605469,
+      "learning_rate": 3.5507401683830933e-07,
+      "loss": 0.0114,
+      "num_tokens": 365629991.0,
+      "reward": 0.07672991411527619,
+      "reward_std": 0.07831625349353999,
+      "rewards/pure_accuracy_reward_math": 0.07672991178696975,
+      "step": 1351
+    },
+    {
+      "clip_ratio": 0.0003128355612602718,
+      "epoch": 1.9598757133040543,
+      "grad_norm": 0.03631382808089256,
+      "kl": 0.007141590118408203,
+      "learning_rate": 3.518582619952257e-07,
+      "loss": 0.0114,
+      "step": 1352
+    },
+    {
+      "clip_ratio": 0.00033067399391484287,
+      "epoch": 1.9617878163186042,
+      "grad_norm": 0.03752359002828598,
+      "kl": 0.007140636444091797,
+      "learning_rate": 3.486560326666072e-07,
+      "loss": 0.0113,
+      "step": 1353
+    },
+    {
+      "clip_ratio": 0.00037038392605381887,
+      "epoch": 1.9636999193331541,
+      "grad_norm": 0.03724711388349533,
+      "kl": 0.007131099700927734,
+      "learning_rate": 3.4546734901490466e-07,
+      "loss": 0.0112,
+      "step": 1354
+    },
+    {
+      "clip_ratio": 0.00040464663743478013,
+      "epoch": 1.9656120223477038,
+      "grad_norm": 0.034875430166721344,
+      "kl": 0.007108211517333984,
+      "learning_rate": 3.42292231117278e-07,
+      "loss": 0.0112,
+      "step": 1355
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 519.9101786613464,
+      "epoch": 1.967524125362254,
+      "grad_norm": 0.04123640060424805,
+      "kl": 0.007243156433105469,
+      "learning_rate": 3.3913069896547217e-07,
+      "loss": 0.0069,
+      "num_tokens": 369229613.0,
+      "reward": 0.08007812878349796,
+      "reward_std": 0.085311732836999,
+      "rewards/pure_accuracy_reward_math": 0.0800781263387762,
+      "step": 1356
+    },
+    {
+      "clip_ratio": 0.00033138683619426956,
+      "epoch": 1.9694362283768037,
+      "grad_norm": 0.04048166796565056,
+      "kl": 0.007332801818847656,
+      "learning_rate": 3.3598277246569307e-07,
+      "loss": 0.0069,
+      "step": 1357
+    },
+    {
+      "clip_ratio": 0.0003668193609200898,
+      "epoch": 1.9713483313913538,
+      "grad_norm": 0.042313288897275925,
+      "kl": 0.007485866546630859,
+      "learning_rate": 3.3284847143847834e-07,
+      "loss": 0.0068,
+      "step": 1358
+    },
+    {
+      "clip_ratio": 0.0003713441701620468,
+      "epoch": 1.9732604344059035,
+      "grad_norm": 0.04199962690472603,
+      "kl": 0.007598400115966797,
+      "learning_rate": 3.2972781561857433e-07,
+      "loss": 0.0067,
+      "step": 1359
+    },
+    {
+      "clip_ratio": 0.0003367169608736731,
+      "epoch": 1.9751725374204536,
+      "grad_norm": 0.03874565288424492,
+      "kl": 0.007636547088623047,
+      "learning_rate": 3.266208246548136e-07,
+      "loss": 0.0066,
+      "step": 1360
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 516.4445023536682,
+      "epoch": 1.9770846404350033,
+      "grad_norm": 0.040357448160648346,
+      "kl": 0.007414817810058594,
+      "learning_rate": 3.2352751810998896e-07,
+      "loss": 0.0055,
+      "num_tokens": 372817046.0,
+      "reward": 0.08258928993018344,
+      "reward_std": 0.09080576250562444,
+      "rewards/pure_accuracy_reward_math": 0.08258928690338507,
+      "step": 1361
+    },
+    {
+      "clip_ratio": 0.00038423701278134104,
+      "epoch": 1.9789967434495535,
+      "grad_norm": 0.03990958258509636,
+      "kl": 0.007411479949951172,
+      "learning_rate": 3.2044791546072985e-07,
+      "loss": 0.0055,
+      "step": 1362
+    },
+    {
+      "clip_ratio": 0.00044172884827275993,
+      "epoch": 1.9809088464641031,
+      "grad_norm": 0.042212970554828644,
+      "kl": 0.007319450378417969,
+      "learning_rate": 3.173820360973823e-07,
+      "loss": 0.0054,
+      "step": 1363
+    },
+    {
+      "clip_ratio": 0.00042502668532051757,
+      "epoch": 1.9828209494786533,
+      "grad_norm": 0.03946436941623688,
+      "kl": 0.0072727203369140625,
+      "learning_rate": 3.1432989932388416e-07,
+      "loss": 0.0053,
+      "step": 1364
+    },
+    {
+      "clip_ratio": 0.00040032099315112646,
+      "epoch": 1.984733052493203,
+      "grad_norm": 0.03701746463775635,
+      "kl": 0.007288455963134766,
+      "learning_rate": 3.1129152435764473e-07,
+      "loss": 0.0052,
+      "step": 1365
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 519.9707279205322,
+      "epoch": 1.9866451555077531,
+      "grad_norm": 0.03677362576127052,
+      "kl": 0.00740814208984375,
+      "learning_rate": 3.0826693032942586e-07,
+      "loss": 0.008,
+      "num_tokens": 376414405.0,
+      "reward": 0.07087053926079534,
+      "reward_std": 0.07741290412377566,
+      "rewards/pure_accuracy_reward_math": 0.07087053710711189,
+      "step": 1366
+    },
+    {
+      "clip_ratio": 0.0002998853265978596,
+      "epoch": 1.9885572585223028,
+      "grad_norm": 0.03619634732604027,
+      "kl": 0.0074787139892578125,
+      "learning_rate": 3.0525613628321656e-07,
+      "loss": 0.0079,
+      "step": 1367
+    },
+    {
+      "clip_ratio": 0.00031987275491474065,
+      "epoch": 1.990469361536853,
+      "grad_norm": 0.03580261766910553,
+      "kl": 0.007512092590332031,
+      "learning_rate": 3.022591611761169e-07,
+      "loss": 0.0079,
+      "step": 1368
+    },
+    {
+      "clip_ratio": 0.00029055258056587263,
+      "epoch": 1.9923814645514026,
+      "grad_norm": 0.03512256592512131,
+      "kl": 0.007531166076660156,
+      "learning_rate": 2.9927602387821916e-07,
+      "loss": 0.0078,
+      "step": 1369
+    },
+    {
+      "clip_ratio": 0.0003325358438814874,
+      "epoch": 1.9942935675659528,
+      "grad_norm": 0.03404110670089722,
+      "kl": 0.007470130920410156,
+      "learning_rate": 2.963067431724856e-07,
+      "loss": 0.0077,
+      "step": 1370
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 524.95845079422,
+      "epoch": 2.0019121030145497,
+      "grad_norm": 0.03709035739302635,
+      "kl": 0.007386684417724609,
+      "learning_rate": 2.9335133775463266e-07,
+      "loss": 0.011,
+      "num_tokens": 380027444.0,
+      "reward": 0.07198661039001308,
+      "reward_std": 0.07208533387165517,
+      "rewards/pure_accuracy_reward_math": 0.07198660876019858,
+      "step": 1371
+    },
+    {
+      "clip_ratio": 0.0002751371110321088,
+      "epoch": 2.0038242060291,
+      "grad_norm": 0.03661485016345978,
+      "kl": 0.007431507110595703,
+      "learning_rate": 2.9040982623301264e-07,
+      "loss": 0.011,
+      "step": 1372
+    },
+    {
+      "clip_ratio": 0.0003175289227783651,
+      "epoch": 2.0057363090436495,
+      "grad_norm": 0.036799393594264984,
+      "kl": 0.007405281066894531,
+      "learning_rate": 2.874822271284977e-07,
+      "loss": 0.0109,
+      "step": 1373
+    },
+    {
+      "clip_ratio": 0.0003284543961399322,
+      "epoch": 2.0076484120581997,
+      "grad_norm": 0.036977026611566544,
+      "kl": 0.007386684417724609,
+      "learning_rate": 2.8456855887436074e-07,
+      "loss": 0.0108,
+      "step": 1374
+    },
+    {
+      "clip_ratio": 0.00032697250054525284,
+      "epoch": 2.0095605150727494,
+      "grad_norm": 0.03594314306974411,
+      "kl": 0.00739288330078125,
+      "learning_rate": 2.816688398161613e-07,
+      "loss": 0.0108,
+      "step": 1375
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 524.5270891189575,
+      "epoch": 2.0114726180872995,
+      "grad_norm": 15.976890563964844,
+      "kl": 0.4394536018371582,
+      "learning_rate": 2.7878308821162964e-07,
+      "loss": 0.0259,
+      "num_tokens": 383639505.0,
+      "reward": 0.08286830733413808,
+      "reward_std": 0.08972975501092151,
+      "rewards/pure_accuracy_reward_math": 0.08286830488941632,
+      "step": 1376
+    },
+    {
+      "clip_ratio": 0.0003084787746274742,
+      "epoch": 2.013384721101849,
+      "grad_norm": 1.2859545946121216,
+      "kl": 0.04446220397949219,
+      "learning_rate": 2.759113222305512e-07,
+      "loss": 0.0102,
+      "step": 1377
+    },
+    {
+      "clip_ratio": 0.00034848380650487343,
+      "epoch": 2.0152968241163993,
+      "grad_norm": 0.0618804506957531,
+      "kl": 0.009487152099609375,
+      "learning_rate": 2.730535599546524e-07,
+      "loss": 0.0087,
+      "step": 1378
+    },
+    {
+      "clip_ratio": 0.000346398171132023,
+      "epoch": 2.017208927130949,
+      "grad_norm": 0.039353594183921814,
+      "kl": 0.008243560791015625,
+      "learning_rate": 2.702098193774891e-07,
+      "loss": 0.0087,
+      "step": 1379
+    },
+    {
+      "clip_ratio": 0.000389314118024231,
+      "epoch": 2.019121030145499,
+      "grad_norm": 0.03626256063580513,
+      "kl": 0.0083465576171875,
+      "learning_rate": 2.6738011840432817e-07,
+      "loss": 0.0086,
+      "step": 1380
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 504.881441116333,
+      "epoch": 2.021033133160049,
+      "grad_norm": 0.03991848975419998,
+      "kl": 0.00807046890258789,
+      "learning_rate": 2.6456447485204014e-07,
+      "loss": 0.0078,
+      "num_tokens": 387180856.0,
+      "reward": 0.07700893218861893,
+      "reward_std": 0.0893906393321231,
+      "rewards/pure_accuracy_reward_math": 0.07700893026776612,
+      "step": 1381
+    },
+    {
+      "clip_ratio": 0.00029079897933570464,
+      "epoch": 2.022945236174599,
+      "grad_norm": 0.03955512493848801,
+      "kl": 0.008087635040283203,
+      "learning_rate": 2.617629064489838e-07,
+      "loss": 0.0078,
+      "step": 1382
+    },
+    {
+      "clip_ratio": 0.00034119405472665676,
+      "epoch": 2.0248573391891487,
+      "grad_norm": 0.04050750657916069,
+      "kl": 0.008031845092773438,
+      "learning_rate": 2.5897543083489544e-07,
+      "loss": 0.0077,
+      "step": 1383
+    },
+    {
+      "clip_ratio": 0.0003633832532159431,
+      "epoch": 2.026769442203699,
+      "grad_norm": 0.03760417178273201,
+      "kl": 0.007889270782470703,
+      "learning_rate": 2.562020655607772e-07,
+      "loss": 0.0076,
+      "step": 1384
+    },
+    {
+      "clip_ratio": 0.00040043183099669477,
+      "epoch": 2.0286815452182485,
+      "grad_norm": 0.036376822739839554,
+      "kl": 0.007742404937744141,
+      "learning_rate": 2.534428280887891e-07,
+      "loss": 0.0076,
+      "step": 1385
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 521.2332820892334,
+      "epoch": 2.0305936482327986,
+      "grad_norm": 0.03659322112798691,
+      "kl": 0.0079498291015625,
+      "learning_rate": 2.50697735792135e-07,
+      "loss": 0.0074,
+      "num_tokens": 390784592.0,
+      "reward": 0.0678013424621895,
+      "reward_std": 0.07990403228905052,
+      "rewards/pure_accuracy_reward_math": 0.06780134083237499,
+      "step": 1386
+    },
+    {
+      "clip_ratio": 0.0003029348101790674,
+      "epoch": 2.0325057512473483,
+      "grad_norm": 0.03603421524167061,
+      "kl": 0.0077915191650390625,
+      "learning_rate": 2.47966805954957e-07,
+      "loss": 0.0073,
+      "step": 1387
+    },
+    {
+      "clip_ratio": 0.0002788126068935526,
+      "epoch": 2.0344178542618985,
+      "grad_norm": 0.035584706813097,
+      "kl": 0.00768280029296875,
+      "learning_rate": 2.4525005577222373e-07,
+      "loss": 0.0073,
+      "step": 1388
+    },
+    {
+      "clip_ratio": 0.00033219700696918153,
+      "epoch": 2.036329957276448,
+      "grad_norm": 0.033913753926754,
+      "kl": 0.007656097412109375,
+      "learning_rate": 2.42547502349624e-07,
+      "loss": 0.0072,
+      "step": 1389
+    },
+    {
+      "clip_ratio": 0.00034793876449157324,
+      "epoch": 2.0382420602909983,
+      "grad_norm": 0.033490557223558426,
+      "kl": 0.007609367370605469,
+      "learning_rate": 2.398591627034588e-07,
+      "loss": 0.0072,
+      "step": 1390
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 534.8217334747314,
+      "epoch": 2.040154163305548,
+      "grad_norm": 0.04065319523215294,
+      "kl": 0.007349491119384766,
+      "learning_rate": 2.3718505376053246e-07,
+      "loss": 0.0094,
+      "num_tokens": 394433277.0,
+      "reward": 0.07589286056463607,
+      "reward_std": 0.09050671145087108,
+      "rewards/pure_accuracy_reward_math": 0.07589285823632963,
+      "step": 1391
+    },
+    {
+      "clip_ratio": 0.00032872594630362073,
+      "epoch": 2.042066266320098,
+      "grad_norm": 0.0390729084610939,
+      "kl": 0.007353305816650391,
+      "learning_rate": 2.345251923580491e-07,
+      "loss": 0.0094,
+      "step": 1392
+    },
+    {
+      "clip_ratio": 0.00038015836332760955,
+      "epoch": 2.043978369334648,
+      "grad_norm": 0.037973206490278244,
+      "kl": 0.007381916046142578,
+      "learning_rate": 2.3187959524350352e-07,
+      "loss": 0.0093,
+      "step": 1393
+    },
+    {
+      "clip_ratio": 0.00041672343576237836,
+      "epoch": 2.045890472349198,
+      "grad_norm": 0.037547629326581955,
+      "kl": 0.007441043853759766,
+      "learning_rate": 2.2924827907457841e-07,
+      "loss": 0.0092,
+      "step": 1394
+    },
+    {
+      "clip_ratio": 0.00047711057584365335,
+      "epoch": 2.0478025753637477,
+      "grad_norm": 0.037767618894577026,
+      "kl": 0.007452487945556641,
+      "learning_rate": 2.266312604190374e-07,
+      "loss": 0.0091,
+      "step": 1395
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 520.9163165092468,
+      "epoch": 2.049714678378298,
+      "grad_norm": 0.039165694266557693,
+      "kl": 0.007717609405517578,
+      "learning_rate": 2.2402855575462152e-07,
+      "loss": 0.0071,
+      "num_tokens": 398030605.0,
+      "reward": 0.07840402194415219,
+      "reward_std": 0.08072105259634554,
+      "rewards/pure_accuracy_reward_math": 0.07840401885914616,
+      "step": 1396
+    },
+    {
+      "clip_ratio": 0.0002864374472437703,
+      "epoch": 2.0516267813928475,
+      "grad_norm": 0.03918104246258736,
+      "kl": 0.007798194885253906,
+      "learning_rate": 2.2144018146894542e-07,
+      "loss": 0.007,
+      "step": 1397
+    },
+    {
+      "clip_ratio": 0.00028412381868747616,
+      "epoch": 2.0535388844073976,
+      "grad_norm": 0.03787809982895851,
+      "kl": 0.007855415344238281,
+      "learning_rate": 2.1886615385939502e-07,
+      "loss": 0.007,
+      "step": 1398
+    },
+    {
+      "clip_ratio": 0.0002802736350417945,
+      "epoch": 2.0554509874219473,
+      "grad_norm": 0.03685666248202324,
+      "kl": 0.007898807525634766,
+      "learning_rate": 2.1630648913302354e-07,
+      "loss": 0.0069,
+      "step": 1399
+    },
+    {
+      "clip_ratio": 0.0003048399971703475,
+      "epoch": 2.0573630904364975,
+      "grad_norm": 0.03653446584939957,
+      "kl": 0.0079193115234375,
+      "learning_rate": 2.1376120340645014e-07,
+      "loss": 0.0068,
+      "step": 1400
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 523.7120804786682,
+      "epoch": 2.059275193451047,
+      "grad_norm": 0.041400156915187836,
+      "kl": 0.0076904296875,
+      "learning_rate": 2.1123031270575827e-07,
+      "loss": 0.0112,
+      "num_tokens": 401639357.0,
+      "reward": 0.08398437922005542,
+      "reward_std": 0.08836089540272951,
+      "rewards/pure_accuracy_reward_math": 0.08398437665891834,
+      "step": 1401
+    },
+    {
+      "clip_ratio": 0.0003276587292475597,
+      "epoch": 2.0611872964655973,
+      "grad_norm": 0.04058953374624252,
+      "kl": 0.007676601409912109,
+      "learning_rate": 2.0871383296639487e-07,
+      "loss": 0.0112,
+      "step": 1402
+    },
+    {
+      "clip_ratio": 0.00033817819053183484,
+      "epoch": 2.063099399480147,
+      "grad_norm": 0.040160875767469406,
+      "kl": 0.007659435272216797,
+      "learning_rate": 2.062117800330693e-07,
+      "loss": 0.0112,
+      "step": 1403
+    },
+    {
+      "clip_ratio": 0.00034579052078242967,
+      "epoch": 2.065011502494697,
+      "grad_norm": 0.03876737132668495,
+      "kl": 0.007627964019775391,
+      "learning_rate": 2.0372416965965675e-07,
+      "loss": 0.0111,
+      "step": 1404
+    },
+    {
+      "clip_ratio": 0.00035969930786450277,
+      "epoch": 2.066923605509247,
+      "grad_norm": 0.03797266259789467,
+      "kl": 0.007703304290771484,
+      "learning_rate": 2.0125101750909315e-07,
+      "loss": 0.011,
+      "step": 1405
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 514.2500252723694,
+      "epoch": 2.068835708523797,
+      "grad_norm": 0.05333253741264343,
+      "kl": 0.010094165802001953,
+      "learning_rate": 1.9879233915328312e-07,
+      "loss": 0.0065,
+      "num_tokens": 405215041.0,
+      "reward": 0.08231027176952921,
+      "reward_std": 0.08208991179708391,
+      "rewards/pure_accuracy_reward_math": 0.08231026903376915,
+      "step": 1406
+    },
+    {
+      "clip_ratio": 0.0002884399551135175,
+      "epoch": 2.0707478115383466,
+      "grad_norm": 0.04066501557826996,
+      "kl": 0.009914398193359375,
+      "learning_rate": 1.9634815007299634e-07,
+      "loss": 0.0065,
+      "step": 1407
+    },
+    {
+      "clip_ratio": 0.0003325861029566113,
+      "epoch": 2.0726599145528963,
+      "grad_norm": 0.03939688578248024,
+      "kl": 0.00982666015625,
+      "learning_rate": 1.9391846565777418e-07,
+      "loss": 0.0064,
+      "step": 1408
+    },
+    {
+      "clip_ratio": 0.0003743518978467364,
+      "epoch": 2.0745720175674465,
+      "grad_norm": 0.03857440873980522,
+      "kl": 0.009755611419677734,
+      "learning_rate": 1.9150330120583012e-07,
+      "loss": 0.0063,
+      "step": 1409
+    },
+    {
+      "clip_ratio": 0.0004666026043196325,
+      "epoch": 2.076484120581996,
+      "grad_norm": 0.03952641412615776,
+      "kl": 0.0096588134765625,
+      "learning_rate": 1.891026719239547e-07,
+      "loss": 0.0062,
+      "step": 1410
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 516.8532605171204,
+      "epoch": 2.0783962235965463,
+      "grad_norm": 0.04142899066209793,
+      "kl": 0.008448123931884766,
+      "learning_rate": 1.8671659292742007e-07,
+      "loss": 0.0099,
+      "num_tokens": 408804459.0,
+      "reward": 0.08286830742144957,
+      "reward_std": 0.08260788215557113,
+      "rewards/pure_accuracy_reward_math": 0.08286830509314314,
+      "step": 1411
+    },
+    {
+      "clip_ratio": 0.0003487231184635675,
+      "epoch": 2.080308326611096,
+      "grad_norm": 0.040530916303396225,
+      "kl": 0.008367538452148438,
+      "learning_rate": 1.8434507923988375e-07,
+      "loss": 0.0099,
+      "step": 1412
+    },
+    {
+      "clip_ratio": 0.0003221970002869057,
+      "epoch": 2.082220429625646,
+      "grad_norm": 0.03941330686211586,
+      "kl": 0.008350849151611328,
+      "learning_rate": 1.8198814579329426e-07,
+      "loss": 0.0098,
+      "step": 1413
+    },
+    {
+      "clip_ratio": 0.00037204451541583694,
+      "epoch": 2.084132532640196,
+      "grad_norm": 0.03861032798886299,
+      "kl": 0.008304595947265625,
+      "learning_rate": 1.7964580742779847e-07,
+      "loss": 0.0097,
+      "step": 1414
+    },
+    {
+      "clip_ratio": 0.0003590778907209824,
+      "epoch": 2.086044635654746,
+      "grad_norm": 0.03945469483733177,
+      "kl": 0.008287906646728516,
+      "learning_rate": 1.7731807889164537e-07,
+      "loss": 0.0096,
+      "step": 1415
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 529.592381477356,
+      "epoch": 2.0879567386692957,
+      "grad_norm": 0.03833872824907303,
+      "kl": 0.0077228546142578125,
+      "learning_rate": 1.7500497484109703e-07,
+      "loss": 0.0109,
+      "num_tokens": 412432506.0,
+      "reward": 0.07449777142028324,
+      "reward_std": 0.08200978167587891,
+      "rewards/pure_accuracy_reward_math": 0.07449776885914616,
+      "step": 1416
+    },
+    {
+      "clip_ratio": 0.0002795722035671133,
+      "epoch": 2.089868841683846,
+      "grad_norm": 0.03684116527438164,
+      "kl": 0.007727146148681641,
+      "learning_rate": 1.7270650984033245e-07,
+      "loss": 0.0108,
+      "step": 1417
+    },
+    {
+      "clip_ratio": 0.00033119657558700055,
+      "epoch": 2.0917809446983955,
+      "grad_norm": 0.03667665645480156,
+      "kl": 0.007739067077636719,
+      "learning_rate": 1.7042269836135882e-07,
+      "loss": 0.0108,
+      "step": 1418
+    },
+    {
+      "clip_ratio": 0.00036255177064958843,
+      "epoch": 2.0936930477129456,
+      "grad_norm": 0.037857044488191605,
+      "kl": 0.007757663726806641,
+      "learning_rate": 1.6815355478391886e-07,
+      "loss": 0.0107,
+      "step": 1419
+    },
+    {
+      "clip_ratio": 0.0003589615364489873,
+      "epoch": 2.0956051507274953,
+      "grad_norm": 0.0360855907201767,
+      "kl": 0.007729053497314453,
+      "learning_rate": 1.6589909339539968e-07,
+      "loss": 0.0106,
+      "step": 1420
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 523.7469544410706,
+      "epoch": 2.0975172537420455,
+      "grad_norm": 0.041348401457071304,
+      "kl": 0.007639408111572266,
+      "learning_rate": 1.6365932839074532e-07,
+      "loss": 0.0099,
+      "num_tokens": 416048915.0,
+      "reward": 0.07979911076836288,
+      "reward_std": 0.08175079576903954,
+      "rewards/pure_accuracy_reward_math": 0.07979910861467943,
+      "step": 1421
+    },
+    {
+      "clip_ratio": 0.00028084742956480113,
+      "epoch": 2.099429356756595,
+      "grad_norm": 0.03983917832374573,
+      "kl": 0.007691860198974609,
+      "learning_rate": 1.6143427387236455e-07,
+      "loss": 0.0099,
+      "step": 1422
+    },
+    {
+      "clip_ratio": 0.00032101355429858813,
+      "epoch": 2.1013414597711453,
+      "grad_norm": 0.04035898670554161,
+      "kl": 0.007829666137695312,
+      "learning_rate": 1.592239438500434e-07,
+      "loss": 0.0098,
+      "step": 1423
+    },
+    {
+      "clip_ratio": 0.00036129408920260175,
+      "epoch": 2.103253562785695,
+      "grad_norm": 0.03893222287297249,
+      "kl": 0.0079498291015625,
+      "learning_rate": 1.570283522408586e-07,
+      "loss": 0.0097,
+      "step": 1424
+    },
+    {
+      "clip_ratio": 0.0003233651194136655,
+      "epoch": 2.105165665800245,
+      "grad_norm": 0.03798089176416397,
+      "kl": 0.008071422576904297,
+      "learning_rate": 1.5484751286908655e-07,
+      "loss": 0.0097,
+      "step": 1425
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 515.3281455039978,
+      "epoch": 2.107077768814795,
+      "grad_norm": 0.04489213973283768,
+      "kl": 0.00823831558227539,
+      "learning_rate": 1.5268143946611802e-07,
+      "loss": 0.01,
+      "num_tokens": 419628171.0,
+      "reward": 0.07952009321888909,
+      "reward_std": 0.0892580482759513,
+      "rewards/pure_accuracy_reward_math": 0.07952009089058265,
+      "step": 1426
+    },
+    {
+      "clip_ratio": 0.0003507794546067089,
+      "epoch": 2.108989871829345,
+      "grad_norm": 0.04182901233434677,
+      "kl": 0.008199691772460938,
+      "learning_rate": 1.5053014567037171e-07,
+      "loss": 0.01,
+      "step": 1427
+    },
+    {
+      "clip_ratio": 0.0004634781105323782,
+      "epoch": 2.1109019748438946,
+      "grad_norm": 0.04111779108643532,
+      "kl": 0.008260250091552734,
+      "learning_rate": 1.483936450272097e-07,
+      "loss": 0.0099,
+      "step": 1428
+    },
+    {
+      "clip_ratio": 0.0005032591409417364,
+      "epoch": 2.1128140778584448,
+      "grad_norm": 0.04071485623717308,
+      "kl": 0.008274078369140625,
+      "learning_rate": 1.4627195098884856e-07,
+      "loss": 0.0098,
+      "step": 1429
+    },
+    {
+      "clip_ratio": 0.0005640338476382567,
+      "epoch": 2.1147261808729945,
+      "grad_norm": 0.041747044771909714,
+      "kl": 0.008271217346191406,
+      "learning_rate": 1.441650769142791e-07,
+      "loss": 0.0097,
+      "step": 1430
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 527.8217334747314,
+      "epoch": 2.1166382838875446,
+      "grad_norm": 0.04057304188609123,
+      "kl": 0.00798797607421875,
+      "learning_rate": 1.4207303606917856e-07,
+      "loss": 0.0057,
+      "num_tokens": 423255484.0,
+      "reward": 0.08761161076836288,
+      "reward_std": 0.09866452467394993,
+      "rewards/pure_accuracy_reward_math": 0.08761160855647177,
+      "step": 1431
+    },
+    {
+      "clip_ratio": 0.0003497144300581567,
+      "epoch": 2.1185503869020943,
+      "grad_norm": 0.03972388803958893,
+      "kl": 0.007953643798828125,
+      "learning_rate": 1.3999584162582874e-07,
+      "loss": 0.0057,
+      "step": 1432
+    },
+    {
+      "clip_ratio": 0.00037741022566706306,
+      "epoch": 2.1204624899166444,
+      "grad_norm": 0.03924018144607544,
+      "kl": 0.00795888900756836,
+      "learning_rate": 1.3793350666303328e-07,
+      "loss": 0.0056,
+      "step": 1433
+    },
+    {
+      "clip_ratio": 0.0003785647801350933,
+      "epoch": 2.122374592931194,
+      "grad_norm": 0.03913624957203865,
+      "kl": 0.007895946502685547,
+      "learning_rate": 1.3588604416603424e-07,
+      "loss": 0.0055,
+      "step": 1434
+    },
+    {
+      "clip_ratio": 0.0003937934675377619,
+      "epoch": 2.1242866959457443,
+      "grad_norm": 0.03699544072151184,
+      "kl": 0.00783538818359375,
+      "learning_rate": 1.3385346702643188e-07,
+      "loss": 0.0054,
+      "step": 1435
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 533.7888078689575,
+      "epoch": 2.126198798960294,
+      "grad_norm": 0.042676378041505814,
+      "kl": 0.010451793670654297,
+      "learning_rate": 1.3183578804210173e-07,
+      "loss": 0.0098,
+      "num_tokens": 426903267.0,
+      "reward": 0.07645089671132155,
+      "reward_std": 0.08488008996937424,
+      "rewards/pure_accuracy_reward_math": 0.07645089426659979,
+      "step": 1436
+    },
+    {
+      "clip_ratio": 0.00036263700505401175,
+      "epoch": 2.128110901974844,
+      "grad_norm": 0.03884616866707802,
+      "kl": 0.010242462158203125,
+      "learning_rate": 1.2983301991711578e-07,
+      "loss": 0.0098,
+      "step": 1437
+    },
+    {
+      "clip_ratio": 0.0003990789759313884,
+      "epoch": 2.130023004989394,
+      "grad_norm": 0.0399676114320755,
+      "kl": 0.01007843017578125,
+      "learning_rate": 1.278451752616608e-07,
+      "loss": 0.0097,
+      "step": 1438
+    },
+    {
+      "clip_ratio": 0.0004171350746560165,
+      "epoch": 2.131935108003944,
+      "grad_norm": 0.039714373648166656,
+      "kl": 0.010037422180175781,
+      "learning_rate": 1.258722665919604e-07,
+      "loss": 0.0097,
+      "step": 1439
+    },
+    {
+      "clip_ratio": 0.00039808801824392503,
+      "epoch": 2.1338472110184936,
+      "grad_norm": 0.03794709965586662,
+      "kl": 0.009942054748535156,
+      "learning_rate": 1.2391430633019452e-07,
+      "loss": 0.0096,
+      "step": 1440
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 525.7826709747314,
+      "epoch": 2.1357593140330433,
+      "grad_norm": 0.05131447687745094,
+      "kl": 0.00860595703125,
+      "learning_rate": 1.2197130680442399e-07,
+      "loss": 0.0073,
+      "num_tokens": 430520032.0,
+      "reward": 0.07282366428989917,
+      "reward_std": 0.0797313749208115,
+      "rewards/pure_accuracy_reward_math": 0.07282366172876209,
+      "step": 1441
+    },
+    {
+      "clip_ratio": 0.0003007381984616586,
+      "epoch": 2.1376714170475934,
+      "grad_norm": 0.03815394267439842,
+      "kl": 0.008358001708984375,
+      "learning_rate": 1.2004328024850938e-07,
+      "loss": 0.0073,
+      "step": 1442
+    },
+    {
+      "clip_ratio": 0.0003256684682355626,
+      "epoch": 2.139583520062143,
+      "grad_norm": 0.03841105103492737,
+      "kl": 0.008275985717773438,
+      "learning_rate": 1.1813023880203722e-07,
+      "loss": 0.0072,
+      "step": 1443
+    },
+    {
+      "clip_ratio": 0.00034418403180325186,
+      "epoch": 2.1414956230766933,
+      "grad_norm": 0.041511572897434235,
+      "kl": 0.008276939392089844,
+      "learning_rate": 1.1623219451024098e-07,
+      "loss": 0.0071,
+      "step": 1444
+    },
+    {
+      "clip_ratio": 0.00032526867431670325,
+      "epoch": 2.143407726091243,
+      "grad_norm": 0.03922862559556961,
+      "kl": 0.008294105529785156,
+      "learning_rate": 1.1434915932392682e-07,
+      "loss": 0.007,
+      "step": 1445
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 526.7310523986816,
+      "epoch": 2.145319829105793,
+      "grad_norm": 0.04134941101074219,
+      "kl": 0.008166313171386719,
+      "learning_rate": 1.1248114509939817e-07,
+      "loss": 0.0067,
+      "num_tokens": 434141592.0,
+      "reward": 0.08342634307336994,
+      "reward_std": 0.08578344061970711,
+      "rewards/pure_accuracy_reward_math": 0.08342634132714011,
+      "step": 1446
+    },
+    {
+      "clip_ratio": 0.00029539940015865795,
+      "epoch": 2.147231932120343,
+      "grad_norm": 0.04034848138689995,
+      "kl": 0.008122920989990234,
+      "learning_rate": 1.1062816359838024e-07,
+      "loss": 0.0066,
+      "step": 1447
+    },
+    {
+      "clip_ratio": 0.0003565281184592095,
+      "epoch": 2.149144035134893,
+      "grad_norm": 0.04018424078822136,
+      "kl": 0.00803232192993164,
+      "learning_rate": 1.0879022648794645e-07,
+      "loss": 0.0066,
+      "step": 1448
+    },
+    {
+      "clip_ratio": 0.0003515161848781645,
+      "epoch": 2.1510561381494426,
+      "grad_norm": 0.03917380049824715,
+      "kl": 0.007886886596679688,
+      "learning_rate": 1.0696734534044629e-07,
+      "loss": 0.0065,
+      "step": 1449
+    },
+    {
+      "clip_ratio": 0.0004228238227028669,
+      "epoch": 2.1529682411639928,
+      "grad_norm": 0.038036227226257324,
+      "kl": 0.00785064697265625,
+      "learning_rate": 1.0515953163342973e-07,
+      "loss": 0.0064,
+      "step": 1450
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 544.0078330039978,
+      "epoch": 2.1548803441785425,
+      "grad_norm": 0.03814779594540596,
+      "kl": 0.008002758026123047,
+      "learning_rate": 1.0336679674957716e-07,
+      "loss": 0.0113,
+      "num_tokens": 437824108.0,
+      "reward": 0.07533482514554635,
+      "reward_std": 0.07659588241949677,
+      "rewards/pure_accuracy_reward_math": 0.07533482287544757,
+      "step": 1451
+    },
+    {
+      "clip_ratio": 0.0002914705042371679,
+      "epoch": 2.1567924471930926,
+      "grad_norm": 0.03763413056731224,
+      "kl": 0.00798654556274414,
+      "learning_rate": 1.0158915197662628e-07,
+      "loss": 0.0113,
+      "step": 1452
+    },
+    {
+      "clip_ratio": 0.0002916823746659247,
+      "epoch": 2.1587045502076423,
+      "grad_norm": 0.036225125193595886,
+      "kl": 0.008030414581298828,
+      "learning_rate": 9.982660850730269e-08,
+      "loss": 0.0112,
+      "step": 1453
+    },
+    {
+      "clip_ratio": 0.0002708278207137482,
+      "epoch": 2.1606166532221924,
+      "grad_norm": 0.03529945760965347,
+      "kl": 0.00803375244140625,
+      "learning_rate": 9.807917743924838e-08,
+      "loss": 0.0112,
+      "step": 1454
+    },
+    {
+      "clip_ratio": 0.0002930295025862506,
+      "epoch": 2.162528756236742,
+      "grad_norm": 0.03426925837993622,
+      "kl": 0.007987022399902344,
+      "learning_rate": 9.634686977495089e-08,
+      "loss": 0.0111,
+      "step": 1455
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 517.6585068702698,
+      "epoch": 2.1644408592512923,
+      "grad_norm": 0.038425736129283905,
+      "kl": 0.008115291595458984,
+      "learning_rate": 9.462969642167613e-08,
+      "loss": 0.0052,
+      "num_tokens": 441407888.0,
+      "reward": 0.07617187869618647,
+      "reward_std": 0.0740246243076399,
+      "rewards/pure_accuracy_reward_math": 0.07617187630967237,
+      "step": 1456
+    },
+    {
+      "clip_ratio": 0.00023060813538222646,
+      "epoch": 2.166352962265842,
+      "grad_norm": 0.03851727396249771,
+      "kl": 0.008001327514648438,
+      "learning_rate": 9.292766819139847e-08,
+      "loss": 0.0052,
+      "step": 1457
+    },
+    {
+      "clip_ratio": 0.0002378168165932948,
+      "epoch": 2.168265065280392,
+      "grad_norm": 0.040155645459890366,
+      "kl": 0.007994651794433594,
+      "learning_rate": 9.12407958007322e-08,
+      "loss": 0.0051,
+      "step": 1458
+    },
+    {
+      "clip_ratio": 0.0002497726611068174,
+      "epoch": 2.170177168294942,
+      "grad_norm": 0.0425233468413353,
+      "kl": 0.007935047149658203,
+      "learning_rate": 8.956908987086538e-08,
+      "loss": 0.005,
+      "step": 1459
+    },
+    {
+      "clip_ratio": 0.00030142679486289126,
+      "epoch": 2.172089271309492,
+      "grad_norm": 0.03647738695144653,
+      "kl": 0.007966041564941406,
+      "learning_rate": 8.791256092749223e-08,
+      "loss": 0.0049,
+      "step": 1460
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 520.2968997955322,
+      "epoch": 2.1740013743240416,
+      "grad_norm": 0.22045741975307465,
+      "kl": 0.022356510162353516,
+      "learning_rate": 8.627121940074645e-08,
+      "loss": 0.0122,
+      "num_tokens": 445010628.0,
+      "reward": 0.08705357578583062,
+      "reward_std": 0.08814817463280633,
+      "rewards/pure_accuracy_reward_math": 0.08705357281723991,
+      "step": 1461
+    },
+    {
+      "clip_ratio": 0.00031046926528688346,
+      "epoch": 2.1759134773385918,
+      "grad_norm": 0.06329243630170822,
+      "kl": 0.015823841094970703,
+      "learning_rate": 8.464507562513657e-08,
+      "loss": 0.0119,
+      "step": 1462
+    },
+    {
+      "clip_ratio": 0.0003438202776351318,
+      "epoch": 2.1778255803531414,
+      "grad_norm": 0.05041000247001648,
+      "kl": 0.014271736145019531,
+      "learning_rate": 8.303413983948017e-08,
+      "loss": 0.0118,
+      "step": 1463
+    },
+    {
+      "clip_ratio": 0.0003563892260558532,
+      "epoch": 2.1797376833676916,
+      "grad_norm": 0.04660080000758171,
+      "kl": 0.013462543487548828,
+      "learning_rate": 8.143842218683862e-08,
+      "loss": 0.0117,
+      "step": 1464
+    },
+    {
+      "clip_ratio": 0.0004125210731444895,
+      "epoch": 2.1816497863822413,
+      "grad_norm": 0.04536700248718262,
+      "kl": 0.012927532196044922,
+      "learning_rate": 7.985793271445636e-08,
+      "loss": 0.0116,
+      "step": 1465
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 517.6127443313599,
+      "epoch": 2.1835618893967914,
+      "grad_norm": 0.08454474061727524,
+      "kl": 0.010744094848632812,
+      "learning_rate": 7.829268137369311e-08,
+      "loss": 0.0075,
+      "num_tokens": 448601372.0,
+      "reward": 0.0750558071595151,
+      "reward_std": 0.0813654173980467,
+      "rewards/pure_accuracy_reward_math": 0.07505580488941632,
+      "step": 1466
+    },
+    {
+      "clip_ratio": 0.00028517025145902153,
+      "epoch": 2.185473992411341,
+      "grad_norm": 0.04138394817709923,
+      "kl": 0.009669780731201172,
+      "learning_rate": 7.674267801996427e-08,
+      "loss": 0.0075,
+      "step": 1467
+    },
+    {
+      "clip_ratio": 0.00027802770790685827,
+      "epoch": 2.1873860954258912,
+      "grad_norm": 0.03745463490486145,
+      "kl": 0.009511947631835938,
+      "learning_rate": 7.52079324126792e-08,
+      "loss": 0.0074,
+      "step": 1468
+    },
+    {
+      "clip_ratio": 0.0003267590287805433,
+      "epoch": 2.189298198440441,
+      "grad_norm": 0.036841075867414474,
+      "kl": 0.00956106185913086,
+      "learning_rate": 7.368845421517779e-08,
+      "loss": 0.0073,
+      "step": 1469
+    },
+    {
+      "clip_ratio": 0.0003443693621534294,
+      "epoch": 2.191210301454991,
+      "grad_norm": 0.0362345427274704,
+      "kl": 0.009715557098388672,
+      "learning_rate": 7.21842529946698e-08,
+      "loss": 0.0072,
+      "step": 1470
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 499.83763551712036,
+      "epoch": 2.1931224044695408,
+      "grad_norm": 0.0431695282459259,
+      "kl": 0.008378028869628906,
+      "learning_rate": 7.0695338222177e-08,
+      "loss": 0.0093,
+      "num_tokens": 452124382.0,
+      "reward": 0.07756696839351207,
+      "reward_std": 0.08685944566968828,
+      "rewards/pure_accuracy_reward_math": 0.07756696530850604,
+      "step": 1471
+    },
+    {
+      "clip_ratio": 0.0003288618632950602,
+      "epoch": 2.195034507484091,
+      "grad_norm": 0.042445823550224304,
+      "kl": 0.008408546447753906,
+      "learning_rate": 6.922171927247062e-08,
+      "loss": 0.0092,
+      "step": 1472
+    },
+    {
+      "clip_ratio": 0.0003429904774066017,
+      "epoch": 2.1969466104986406,
+      "grad_norm": 0.04231419414281845,
+      "kl": 0.008434295654296875,
+      "learning_rate": 6.776340542401422e-08,
+      "loss": 0.0092,
+      "step": 1473
+    },
+    {
+      "clip_ratio": 0.00035230960349963425,
+      "epoch": 2.1988587135131903,
+      "grad_norm": 0.04162426292896271,
+      "kl": 0.008434295654296875,
+      "learning_rate": 6.632040585890398e-08,
+      "loss": 0.0091,
+      "step": 1474
+    },
+    {
+      "clip_ratio": 0.000348456743722636,
+      "epoch": 2.2007708165277404,
+      "grad_norm": 0.04009128361940384,
+      "kl": 0.008394718170166016,
+      "learning_rate": 6.489272966281269e-08,
+      "loss": 0.009,
+      "step": 1475
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 511.53015899658203,
+      "epoch": 2.2026829195422906,
+      "grad_norm": 0.03803718462586403,
+      "kl": 0.008605003356933594,
+      "learning_rate": 6.348038582493e-08,
+      "loss": 0.0064,
+      "num_tokens": 455697798.0,
+      "reward": 0.06863839633297175,
+      "reward_std": 0.0772402475704439,
+      "rewards/pure_accuracy_reward_math": 0.06863839423749596,
+      "step": 1476
+    },
+    {
+      "clip_ratio": 0.0002735381897878142,
+      "epoch": 2.2045950225568403,
+      "grad_norm": 0.036724258214235306,
+      "kl": 0.008575439453125,
+      "learning_rate": 6.208338323790891e-08,
+      "loss": 0.0064,
+      "step": 1477
+    },
+    {
+      "clip_ratio": 0.000271568493644736,
+      "epoch": 2.20650712557139,
+      "grad_norm": 0.03627302870154381,
+      "kl": 0.008494853973388672,
+      "learning_rate": 6.070173069780638e-08,
+      "loss": 0.0063,
+      "step": 1478
+    },
+    {
+      "clip_ratio": 0.0003129301562694309,
+      "epoch": 2.20841922858594,
+      "grad_norm": 0.035685960203409195,
+      "kl": 0.008512496948242188,
+      "learning_rate": 5.933543690403082e-08,
+      "loss": 0.0063,
+      "step": 1479
+    },
+    {
+      "clip_ratio": 0.0003575469975203305,
+      "epoch": 2.21033133160049,
+      "grad_norm": 0.03495527431368828,
+      "kl": 0.008492469787597656,
+      "learning_rate": 5.7984510459285215e-08,
+      "loss": 0.0062,
+      "step": 1480
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 527.403482913971,
+      "epoch": 2.21224343461504,
+      "grad_norm": 0.041989997029304504,
+      "kl": 0.008183956146240234,
+      "learning_rate": 5.6648959869514965e-08,
+      "loss": 0.0075,
+      "num_tokens": 459321180.0,
+      "reward": 0.07617187898722477,
+      "reward_std": 0.0817908609751612,
+      "rewards/pure_accuracy_reward_math": 0.07617187630967237,
+      "step": 1481
+    },
+    {
+      "clip_ratio": 0.0003129412224893713,
+      "epoch": 2.2141555376295896,
+      "grad_norm": 0.04108978435397148,
+      "kl": 0.00823974609375,
+      "learning_rate": 5.532879354385234e-08,
+      "loss": 0.0075,
+      "step": 1482
+    },
+    {
+      "clip_ratio": 0.0003202799926498301,
+      "epoch": 2.2160676406441397,
+      "grad_norm": 0.03990933671593666,
+      "kl": 0.00827646255493164,
+      "learning_rate": 5.4024019794565176e-08,
+      "loss": 0.0075,
+      "step": 1483
+    },
+    {
+      "clip_ratio": 0.0003925440155398974,
+      "epoch": 2.2179797436586894,
+      "grad_norm": 0.039193831384181976,
+      "kl": 0.008234977722167969,
+      "learning_rate": 5.273464683700352e-08,
+      "loss": 0.0074,
+      "step": 1484
+    },
+    {
+      "clip_ratio": 0.0004001183214654702,
+      "epoch": 2.2198918466732396,
+      "grad_norm": 0.039878588169813156,
+      "kl": 0.00826406478881836,
+      "learning_rate": 5.1460682789547526e-08,
+      "loss": 0.0073,
+      "step": 1485
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 531.470449924469,
+      "epoch": 2.2218039496877893,
+      "grad_norm": 0.04079683497548103,
+      "kl": 0.011513710021972656,
+      "learning_rate": 5.020213567355825e-08,
+      "loss": 0.0091,
+      "num_tokens": 462957626.0,
+      "reward": 0.06752232459257357,
+      "reward_std": 0.07320140459341928,
+      "rewards/pure_accuracy_reward_math": 0.0675223229045514,
+      "step": 1486
+    },
+    {
+      "clip_ratio": 0.0002717390548241383,
+      "epoch": 2.2237160527023394,
+      "grad_norm": 0.037311483174562454,
+      "kl": 0.011410713195800781,
+      "learning_rate": 4.8959013413324705e-08,
+      "loss": 0.009,
+      "step": 1487
+    },
+    {
+      "clip_ratio": 0.0002951391629721911,
+      "epoch": 2.225628155716889,
+      "grad_norm": 0.035728756338357925,
+      "kl": 0.011387348175048828,
+      "learning_rate": 4.773132383601664e-08,
+      "loss": 0.009,
+      "step": 1488
+    },
+    {
+      "clip_ratio": 0.00030970129540719427,
+      "epoch": 2.2275402587314392,
+      "grad_norm": 0.03630708530545235,
+      "kl": 0.011130332946777344,
+      "learning_rate": 4.6519074671631805e-08,
+      "loss": 0.0089,
+      "step": 1489
+    },
+    {
+      "clip_ratio": 0.00035198272149727927,
+      "epoch": 2.229452361745989,
+      "grad_norm": 0.035501569509506226,
+      "kl": 0.010982990264892578,
+      "learning_rate": 4.5322273552951265e-08,
+      "loss": 0.0088,
+      "step": 1490
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 516.0912661552429,
+      "epoch": 2.231364464760539,
+      "grad_norm": 0.039065275341272354,
+      "kl": 0.008381366729736328,
+      "learning_rate": 4.4140928015488085e-08,
+      "loss": 0.0067,
+      "num_tokens": 466540145.0,
+      "reward": 0.08007812951109372,
+      "reward_std": 0.07346039032563567,
+      "rewards/pure_accuracy_reward_math": 0.08007812619325705,
+      "step": 1491
+    },
+    {
+      "clip_ratio": 0.0002747246091985289,
+      "epoch": 2.2332765677750888,
+      "grad_norm": 0.03766880929470062,
+      "kl": 0.008387088775634766,
+      "learning_rate": 4.297504549744119e-08,
+      "loss": 0.0067,
+      "step": 1492
+    },
+    {
+      "clip_ratio": 0.0002486348788579562,
+      "epoch": 2.235188670789639,
+      "grad_norm": 0.03599947690963745,
+      "kl": 0.0084991455078125,
+      "learning_rate": 4.182463333964909e-08,
+      "loss": 0.0066,
+      "step": 1493
+    },
+    {
+      "clip_ratio": 0.0002674886795261955,
+      "epoch": 2.2371007738041886,
+      "grad_norm": 0.0361332893371582,
+      "kl": 0.008679389953613281,
+      "learning_rate": 4.068969878554263e-08,
+      "loss": 0.0066,
+      "step": 1494
+    },
+    {
+      "clip_ratio": 0.00031218544620514876,
+      "epoch": 2.2390128768187387,
+      "grad_norm": 0.035462211817502975,
+      "kl": 0.008719921112060547,
+      "learning_rate": 3.957024898110007e-08,
+      "loss": 0.0065,
+      "step": 1495
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 507.05945777893066,
+      "epoch": 2.2409249798332884,
+      "grad_norm": 0.10880274325609207,
+      "kl": 0.012134075164794922,
+      "learning_rate": 3.846629097480126e-08,
+      "loss": 0.0046,
+      "num_tokens": 470091662.0,
+      "reward": 0.07952009330620058,
+      "reward_std": 0.08660046098520979,
+      "rewards/pure_accuracy_reward_math": 0.0795200907450635,
+      "step": 1496
+    },
+    {
+      "clip_ratio": 0.00034633993402621854,
+      "epoch": 2.2428370828478386,
+      "grad_norm": 0.04444468766450882,
+      "kl": 0.010071754455566406,
+      "learning_rate": 3.737783171758408e-08,
+      "loss": 0.0045,
+      "step": 1497
+    },
+    {
+      "clip_ratio": 0.00040814166391101026,
+      "epoch": 2.2447491858623883,
+      "grad_norm": 0.050679393112659454,
+      "kl": 0.009745597839355469,
+      "learning_rate": 3.630487806280086e-08,
+      "loss": 0.0044,
+      "step": 1498
+    },
+    {
+      "clip_ratio": 0.00040935890626769833,
+      "epoch": 2.2466612888769384,
+      "grad_norm": 0.04249563813209534,
+      "kl": 0.009531974792480469,
+      "learning_rate": 3.524743676617426e-08,
+      "loss": 0.0044,
+      "step": 1499
+    },
+    {
+      "clip_ratio": 0.00041069585563491273,
+      "epoch": 2.248573391891488,
+      "grad_norm": 0.04013880342245102,
+      "kl": 0.009422779083251953,
+      "learning_rate": 3.42055144857556e-08,
+      "loss": 0.0042,
+      "step": 1500
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 530.4908156394958,
+      "epoch": 2.250485494906038,
+      "grad_norm": 0.04119328781962395,
+      "kl": 0.00858306884765625,
+      "learning_rate": 3.3179117781882154e-08,
+      "loss": 0.0064,
+      "num_tokens": 473729421.0,
+      "reward": 0.08175223629223183,
+      "reward_std": 0.080375739664305,
+      "rewards/pure_accuracy_reward_math": 0.08175223390571773,
+      "step": 1501
+    },
+    {
+      "clip_ratio": 0.00027040669908728887,
+      "epoch": 2.252397597920588,
+      "grad_norm": 0.03726639971137047,
+      "kl": 0.008556365966796875,
+      "learning_rate": 3.216825311713689e-08,
+      "loss": 0.0064,
+      "step": 1502
+    },
+    {
+      "clip_ratio": 0.0003022322244419229,
+      "epoch": 2.254309700935138,
+      "grad_norm": 0.03740008547902107,
+      "kl": 0.008624553680419922,
+      "learning_rate": 3.11729268563063e-08,
+      "loss": 0.0063,
+      "step": 1503
+    },
+    {
+      "clip_ratio": 0.0002972338604081415,
+      "epoch": 2.2562218039496877,
+      "grad_norm": 0.036019936203956604,
+      "kl": 0.008683204650878906,
+      "learning_rate": 3.019314526634232e-08,
+      "loss": 0.0062,
+      "step": 1504
+    },
+    {
+      "clip_ratio": 0.0003317092545103151,
+      "epoch": 2.258133906964238,
+      "grad_norm": 0.035242002457380295,
+      "kl": 0.008699893951416016,
+      "learning_rate": 2.922891451632076e-08,
+      "loss": 0.0062,
+      "step": 1505
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 516.8340096473694,
+      "epoch": 2.2600460099787876,
+      "grad_norm": 0.04786042869091034,
+      "kl": 0.0166015625,
+      "learning_rate": 2.8280240677403813e-08,
+      "loss": 0.0117,
+      "num_tokens": 477311002.0,
+      "reward": 0.08593750389991328,
+      "reward_std": 0.09509739134227857,
+      "rewards/pure_accuracy_reward_math": 0.08593750139698386,
+      "step": 1506
+    },
+    {
+      "clip_ratio": 0.0003771551589011324,
+      "epoch": 2.2619581129933373,
+      "grad_norm": 0.04542854428291321,
+      "kl": 0.016517162322998047,
+      "learning_rate": 2.7347129722801736e-08,
+      "loss": 0.0117,
+      "step": 1507
+    },
+    {
+      "clip_ratio": 0.00043879733209450933,
+      "epoch": 2.2638702160078874,
+      "grad_norm": 0.04336082562804222,
+      "kl": 0.016106605529785156,
+      "learning_rate": 2.6429587527734835e-08,
+      "loss": 0.0116,
+      "step": 1508
+    },
+    {
+      "clip_ratio": 0.0005006881825977416,
+      "epoch": 2.2657823190224375,
+      "grad_norm": 0.04397574067115784,
+      "kl": 0.015746116638183594,
+      "learning_rate": 2.5527619869396003e-08,
+      "loss": 0.0115,
+      "step": 1509
+    },
+    {
+      "clip_ratio": 0.0005348546662844456,
+      "epoch": 2.2676944220369872,
+      "grad_norm": 0.043936342000961304,
+      "kl": 0.015500068664550781,
+      "learning_rate": 2.464123242691574e-08,
+      "loss": 0.0114,
+      "step": 1510
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 526.8474016189575,
+      "epoch": 2.269606525051537,
+      "grad_norm": 0.04165401682257652,
+      "kl": 0.008256912231445312,
+      "learning_rate": 2.377043078132496e-08,
+      "loss": 0.0079,
+      "num_tokens": 480935151.0,
+      "reward": 0.08342634345171973,
+      "reward_std": 0.09024772583507001,
+      "rewards/pure_accuracy_reward_math": 0.08342634071595967,
+      "step": 1511
+    },
+    {
+      "clip_ratio": 0.0003286536882569635,
+      "epoch": 2.271518628066087,
+      "grad_norm": 0.04013460502028465,
+      "kl": 0.008354663848876953,
+      "learning_rate": 2.291522041552141e-08,
+      "loss": 0.0079,
+      "step": 1512
+    },
+    {
+      "clip_ratio": 0.00034448601985559435,
+      "epoch": 2.273430731080637,
+      "grad_norm": 0.03929148614406586,
+      "kl": 0.008509159088134766,
+      "learning_rate": 2.207560671423331e-08,
+      "loss": 0.0078,
+      "step": 1513
+    },
+    {
+      "clip_ratio": 0.00038580430322099346,
+      "epoch": 2.275342834095187,
+      "grad_norm": 0.04108521342277527,
+      "kl": 0.008730888366699219,
+      "learning_rate": 2.1251594963986876e-08,
+      "loss": 0.0077,
+      "step": 1514
+    },
+    {
+      "clip_ratio": 0.00038072799372912414,
+      "epoch": 2.2772549371097366,
+      "grad_norm": 0.038887783885002136,
+      "kl": 0.008725643157958984,
+      "learning_rate": 2.0443190353072185e-08,
+      "loss": 0.0076,
+      "step": 1515
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 519.4051609039307,
+      "epoch": 2.2791670401242867,
+      "grad_norm": 0.03783741220831871,
+      "kl": 0.008581161499023438,
+      "learning_rate": 1.9650397971510972e-08,
+      "loss": 0.0064,
+      "num_tokens": 484530587.0,
+      "reward": 0.08231027124566026,
+      "reward_std": 0.08037574036279693,
+      "rewards/pure_accuracy_reward_math": 0.08231026897556148,
+      "step": 1516
+    },
+    {
+      "clip_ratio": 0.0002746778108644321,
+      "epoch": 2.2810791431388364,
+      "grad_norm": 0.03765445947647095,
+      "kl": 0.008580207824707031,
+      "learning_rate": 1.8873222811024717e-08,
+      "loss": 0.0063,
+      "step": 1517
+    },
+    {
+      "clip_ratio": 0.00031986788579274616,
+      "epoch": 2.2829912461533866,
+      "grad_norm": 0.03684096038341522,
+      "kl": 0.008593082427978516,
+      "learning_rate": 1.8111669765003005e-08,
+      "loss": 0.0063,
+      "step": 1518
+    },
+    {
+      "clip_ratio": 0.0003354349921380617,
+      "epoch": 2.2849033491679362,
+      "grad_norm": 0.03599463030695915,
+      "kl": 0.008591175079345703,
+      "learning_rate": 1.73657436284716e-08,
+      "loss": 0.0062,
+      "step": 1519
+    },
+    {
+      "clip_ratio": 0.0003505910435706028,
+      "epoch": 2.2868154521824864,
+      "grad_norm": 0.035750966519117355,
+      "kl": 0.00874948501586914,
+      "learning_rate": 1.6635449098064972e-08,
+      "loss": 0.0061,
+      "step": 1520
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 521.2455606460571,
+      "epoch": 2.288727555197036,
+      "grad_norm": 0.03890154883265495,
+      "kl": 0.008922100067138672,
+      "learning_rate": 1.5920790771993822e-08,
+      "loss": 0.0078,
+      "num_tokens": 488136255.0,
+      "reward": 0.07952009289874695,
+      "reward_std": 0.07556614064378664,
+      "rewards/pure_accuracy_reward_math": 0.07952009068685584,
+      "step": 1521
+    },
+    {
+      "clip_ratio": 0.00024827225587387147,
+      "epoch": 2.290639658211586,
+      "grad_norm": 0.037810854613780975,
+      "kl": 0.008934974670410156,
+      "learning_rate": 1.5221773150017882e-08,
+      "loss": 0.0078,
+      "step": 1522
+    },
+    {
+      "clip_ratio": 0.0002384709360967463,
+      "epoch": 2.292551761226136,
+      "grad_norm": 0.0364384800195694,
+      "kl": 0.008936882019042969,
+      "learning_rate": 1.4538400633417049e-08,
+      "loss": 0.0077,
+      "step": 1523
+    },
+    {
+      "clip_ratio": 0.0002599185108635993,
+      "epoch": 2.294463864240686,
+      "grad_norm": 0.035106074064970016,
+      "kl": 0.008829116821289062,
+      "learning_rate": 1.387067752496335e-08,
+      "loss": 0.0076,
+      "step": 1524
+    },
+    {
+      "clip_ratio": 0.0003290796867077006,
+      "epoch": 2.2963759672552357,
+      "grad_norm": 0.03489363566040993,
+      "kl": 0.0086822509765625,
+      "learning_rate": 1.3218608028895131e-08,
+      "loss": 0.0076,
+      "step": 1525
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 517.0547122955322,
+      "epoch": 2.298288070269786,
+      "grad_norm": 0.040062014013528824,
+      "kl": 0.008834362030029297,
+      "learning_rate": 1.2582196250888745e-08,
+      "loss": 0.0071,
+      "num_tokens": 491722139.0,
+      "reward": 0.08621652179863304,
+      "reward_std": 0.08020308247068897,
+      "rewards/pure_accuracy_reward_math": 0.08621651906287298,
+      "step": 1526
+    },
+    {
+      "clip_ratio": 0.00031514769625573535,
+      "epoch": 2.3002001732843356,
+      "grad_norm": 0.03938477113842964,
+      "kl": 0.008733272552490234,
+      "learning_rate": 1.1961446198033855e-08,
+      "loss": 0.0071,
+      "step": 1527
+    },
+    {
+      "clip_ratio": 0.00030386562087869606,
+      "epoch": 2.3021122762988857,
+      "grad_norm": 0.03844742849469185,
+      "kl": 0.008654594421386719,
+      "learning_rate": 1.1356361778808167e-08,
+      "loss": 0.007,
+      "step": 1528
+    },
+    {
+      "clip_ratio": 0.00034510965764411594,
+      "epoch": 2.3040243793134354,
+      "grad_norm": 0.03755528852343559,
+      "kl": 0.00861358642578125,
+      "learning_rate": 1.076694680305218e-08,
+      "loss": 0.007,
+      "step": 1529
+    },
+    {
+      "clip_ratio": 0.00035207756366162357,
+      "epoch": 2.3059364823279855,
+      "grad_norm": 0.03696778416633606,
+      "kl": 0.008616447448730469,
+      "learning_rate": 1.0193204981946426e-08,
+      "loss": 0.0069,
+      "step": 1530
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 516.7249145507812,
+      "epoch": 2.3078485853425352,
+      "grad_norm": 0.045076508074998856,
+      "kl": 0.014521598815917969,
+      "learning_rate": 9.63513992798676e-09,
+      "loss": 0.0065,
+      "num_tokens": 495305537.0,
+      "reward": 0.07505580713041127,
+      "reward_std": 0.07844264624873176,
+      "rewards/pure_accuracy_reward_math": 0.07505580480210483,
+      "step": 1531
+    },
+    {
+      "clip_ratio": 0.0003054732096074986,
+      "epoch": 2.3097606883570854,
+      "grad_norm": 0.041828691959381104,
+      "kl": 0.01419973373413086,
+      "learning_rate": 9.092755154961886e-09,
+      "loss": 0.0065,
+      "step": 1532
+    },
+    {
+      "clip_ratio": 0.00030572324658351135,
+      "epoch": 2.311672791371635,
+      "grad_norm": 0.03949357569217682,
+      "kl": 0.013697624206542969,
+      "learning_rate": 8.566054077932262e-09,
+      "loss": 0.0064,
+      "step": 1533
+    },
+    {
+      "clip_ratio": 0.0003279060996987937,
+      "epoch": 2.313584894386185,
+      "grad_norm": 0.038545649498701096,
+      "kl": 0.01345968246459961,
+      "learning_rate": 8.055040013207061e-09,
+      "loss": 0.0063,
+      "step": 1534
+    },
+    {
+      "clip_ratio": 0.00033917763732915773,
+      "epoch": 2.315496997400735,
+      "grad_norm": 0.03716408833861351,
+      "kl": 0.01330709457397461,
+      "learning_rate": 7.559716178325016e-09,
+      "loss": 0.0062,
+      "step": 1535
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 519.2921552658081,
+      "epoch": 2.317409100415285,
+      "grad_norm": 0.041162386536598206,
+      "kl": 0.008297443389892578,
+      "learning_rate": 7.080085692032224e-09,
+      "loss": 0.0079,
+      "num_tokens": 498900584.0,
+      "reward": 0.08928571816068143,
+      "reward_std": 0.08428199036279693,
+      "rewards/pure_accuracy_reward_math": 0.08928571571595967,
+      "step": 1536
+    },
+    {
+      "clip_ratio": 0.00029752771973790004,
+      "epoch": 2.3193212034298347,
+      "grad_norm": 0.03933210298418999,
+      "kl": 0.008346080780029297,
+      "learning_rate": 6.616151574264374e-09,
+      "loss": 0.0079,
+      "step": 1537
+    },
+    {
+      "clip_ratio": 0.0003302163729017593,
+      "epoch": 2.321233306444385,
+      "grad_norm": 0.038146842271089554,
+      "kl": 0.008320331573486328,
+      "learning_rate": 6.1679167461262124e-09,
+      "loss": 0.0078,
+      "step": 1538
+    },
+    {
+      "clip_ratio": 0.0003326926421891585,
+      "epoch": 2.3231454094589346,
+      "grad_norm": 0.038072116672992706,
+      "kl": 0.008330345153808594,
+      "learning_rate": 5.735384029874336e-09,
+      "loss": 0.0077,
+      "step": 1539
+    },
+    {
+      "clip_ratio": 0.00038002995881925017,
+      "epoch": 2.3250575124734847,
+      "grad_norm": 0.037320397794246674,
+      "kl": 0.008296012878417969,
+      "learning_rate": 5.31855614889859e-09,
+      "loss": 0.0076,
+      "step": 1540
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 520.1487407684326,
+      "epoch": 2.3269696154880344,
+      "grad_norm": 0.03688493371009827,
+      "kl": 0.008476734161376953,
+      "learning_rate": 4.917435727704867e-09,
+      "loss": 0.0024,
+      "num_tokens": 502500281.0,
+      "reward": 0.0811942005821038,
+      "reward_std": 0.0787416979437694,
+      "rewards/pure_accuracy_reward_math": 0.08119419842842035,
+      "step": 1541
+    },
+    {
+      "clip_ratio": 0.00028201957394458077,
+      "epoch": 2.3288817185025845,
+      "grad_norm": 0.03607385605573654,
+      "kl": 0.008441448211669922,
+      "learning_rate": 4.53202529190011e-09,
+      "loss": 0.0023,
+      "step": 1542
+    },
+    {
+      "clip_ratio": 0.0002742231245633775,
+      "epoch": 2.330793821517134,
+      "grad_norm": 0.03572804853320122,
+      "kl": 0.00852060317993164,
+      "learning_rate": 4.162327268173727e-09,
+      "loss": 0.0023,
+      "step": 1543
+    },
+    {
+      "clip_ratio": 0.0003046261713848253,
+      "epoch": 2.332705924531684,
+      "grad_norm": 0.034965962171554565,
+      "kl": 0.00861501693725586,
+      "learning_rate": 3.80834398428509e-09,
+      "loss": 0.0022,
+      "step": 1544
+    },
+    {
+      "clip_ratio": 0.0003226917802976459,
+      "epoch": 2.334618027546234,
+      "grad_norm": 0.034803807735443115,
+      "kl": 0.008724212646484375,
+      "learning_rate": 3.470077669046612e-09,
+      "loss": 0.0021,
+      "step": 1545
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 538.0273699760437,
+      "epoch": 2.336530130560784,
+      "grad_norm": 0.034996818751096725,
+      "kl": 0.008575439453125,
+      "learning_rate": 3.147530452311809e-09,
+      "loss": 0.0064,
+      "num_tokens": 506159719.0,
+      "reward": 0.06891741408617236,
+      "reward_std": 0.07063014718005434,
+      "rewards/pure_accuracy_reward_math": 0.06891741210711189,
+      "step": 1546
+    },
+    {
+      "clip_ratio": 0.00023073077210256088,
+      "epoch": 2.338442233575334,
+      "grad_norm": 0.03347066789865494,
+      "kl": 0.008565902709960938,
+      "learning_rate": 2.8407043649597567e-09,
+      "loss": 0.0063,
+      "step": 1547
+    },
+    {
+      "clip_ratio": 0.000268154504112772,
+      "epoch": 2.3403543365898836,
+      "grad_norm": 0.03273630142211914,
+      "kl": 0.008545398712158203,
+      "learning_rate": 2.549601338883989e-09,
+      "loss": 0.0063,
+      "step": 1548
+    },
+    {
+      "clip_ratio": 0.00029292683666426456,
+      "epoch": 2.3422664396044337,
+      "grad_norm": 0.032376162707805634,
+      "kl": 0.008570671081542969,
+      "learning_rate": 2.2742232069794533e-09,
+      "loss": 0.0063,
+      "step": 1549
+    },
+    {
+      "clip_ratio": 0.0003443536306235728,
+      "epoch": 2.344178542618984,
+      "grad_norm": 0.031950000673532486,
+      "kl": 0.008484363555908203,
+      "learning_rate": 2.01457170313113e-09,
+      "loss": 0.0062,
+      "step": 1550
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 520.7207255363464,
+      "epoch": 2.3460906456335335,
+      "grad_norm": 0.04171088710427284,
+      "kl": 0.009114742279052734,
+      "learning_rate": 1.7706484622034837e-09,
+      "loss": 0.005,
+      "num_tokens": 509757966.0,
+      "reward": 0.07672991443541832,
+      "reward_std": 0.08149181143380702,
+      "rewards/pure_accuracy_reward_math": 0.07672991228173487,
+      "step": 1551
+    },
+    {
+      "clip_ratio": 0.0003305982788788242,
+      "epoch": 2.3480027486480832,
+      "grad_norm": 0.04123101010918617,
+      "kl": 0.009046554565429688,
+      "learning_rate": 1.5424550200293653e-09,
+      "loss": 0.005,
+      "step": 1552
+    },
+    {
+      "clip_ratio": 0.0003486324259256435,
+      "epoch": 2.3499148516626334,
+      "grad_norm": 0.039809513837099075,
+      "kl": 0.008966445922851562,
+      "learning_rate": 1.3299928134014039e-09,
+      "loss": 0.0049,
+      "step": 1553
+    },
+    {
+      "clip_ratio": 0.0003954665013452541,
+      "epoch": 2.351826954677183,
+      "grad_norm": 0.0393875353038311,
+      "kl": 0.008915901184082031,
+      "learning_rate": 1.1332631800620164e-09,
+      "loss": 0.0049,
+      "step": 1554
+    },
+    {
+      "clip_ratio": 0.0004334128346954458,
+      "epoch": 2.353739057691733,
+      "grad_norm": 0.03990260884165764,
+      "kl": 0.008862972259521484,
+      "learning_rate": 9.522673586956355e-10,
+      "loss": 0.0047,
+      "step": 1555
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 518.488025188446,
+      "epoch": 2.355651160706283,
+      "grad_norm": 0.04300679266452789,
+      "kl": 0.009171009063720703,
+      "learning_rate": 7.870064889206608e-10,
+      "loss": 0.0082,
+      "num_tokens": 513350767.0,
+      "reward": 0.07728794994181953,
+      "reward_std": 0.08290693227900192,
+      "rewards/pure_accuracy_reward_math": 0.07728794743889011,
+      "step": 1556
+    },
+    {
+      "clip_ratio": 0.000295089724772879,
+      "epoch": 2.357563263720833,
+      "grad_norm": 0.04144243150949478,
+      "kl": 0.009136676788330078,
+      "learning_rate": 6.374816112819648e-10,
+      "loss": 0.0082,
+      "step": 1557
+    },
+    {
+      "clip_ratio": 0.0003283331608940898,
+      "epoch": 2.3594753667353827,
+      "grad_norm": 0.039357006549835205,
+      "kl": 0.009202003479003906,
+      "learning_rate": 5.036936672447868e-10,
+      "loss": 0.0081,
+      "step": 1558
+    },
+    {
+      "clip_ratio": 0.00036647373104869985,
+      "epoch": 2.361387469749933,
+      "grad_norm": 0.03904441371560097,
+      "kl": 0.009307384490966797,
+      "learning_rate": 3.8564349918890356e-10,
+      "loss": 0.008,
+      "step": 1559
+    },
+    {
+      "clip_ratio": 0.0004084905730792343,
+      "epoch": 2.3632995727644825,
+      "grad_norm": 0.03901646286249161,
+      "kl": 0.00932168960571289,
+      "learning_rate": 2.833318504030791e-10,
+      "loss": 0.0079,
+      "step": 1560
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 527.474356174469,
+      "epoch": 2.3652116757790327,
+      "grad_norm": 5.391517162322998,
+      "kl": 0.0942845344543457,
+      "learning_rate": 1.9675936507979056e-10,
+      "loss": 0.0081,
+      "num_tokens": 516974751.0,
+      "reward": 0.06975446754950099,
+      "reward_std": 0.06989945453824475,
+      "rewards/pure_accuracy_reward_math": 0.06975446597789414,
+      "step": 1561
+    },
+    {
+      "clip_ratio": 0.0002886794856635788,
+      "epoch": 2.3671237787935824,
+      "grad_norm": 0.1764528900384903,
+      "kl": 0.013553619384765625,
+      "learning_rate": 1.2592658831245274e-10,
+      "loss": 0.0049,
+      "step": 1562
+    },
+    {
+      "clip_ratio": 0.00028670978349509824,
+      "epoch": 2.3690358818081325,
+      "grad_norm": 0.03846847265958786,
+      "kl": 0.009183406829833984,
+      "learning_rate": 7.083396609097737e-11,
+      "loss": 0.0047,
+      "step": 1563
+    },
+    {
+      "clip_ratio": 0.0002776476591748178,
+      "epoch": 2.370947984822682,
+      "grad_norm": 0.035545963793992996,
+      "kl": 0.008979320526123047,
+      "learning_rate": 3.148184529927489e-11,
+      "loss": 0.0046,
+      "step": 1564
+    },
+    {
+      "clip_ratio": 0.00032522391097700165,
+      "epoch": 2.3728600878372323,
+      "grad_norm": 0.1538141518831253,
+      "kl": 0.009156227111816406,
+      "learning_rate": 7.870473713589288e-12,
+      "loss": 0.0046,
+      "step": 1565
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 530.6135845184326,
+      "epoch": 2.374772190851782,
+      "grad_norm": 0.0368269719183445,
+      "kl": 0.008574485778808594,
+      "learning_rate": 0.0,
+      "loss": 0.0087,
+      "num_tokens": 520611370.0,
+      "reward": 0.07142857427243143,
+      "reward_std": 0.07900068280287087,
+      "rewards/pure_accuracy_reward_math": 0.07142857293365523,
+      "step": 1566
+    },
+    {
+      "epoch": 2.374772190851782,
+      "step": 1566,
+      "total_flos": 0.0,
+      "train_loss": 0.003398028112404372,
+      "train_runtime": 273585.6306,
+      "train_samples_per_second": 1.028,
+      "train_steps_per_second": 0.006
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 1566,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 50,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}