{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 234,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 251.75,
      "completions/max_terminated_length": 251.75,
      "completions/mean_length": 133.75390625,
      "completions/mean_terminated_length": 133.75390625,
      "completions/min_length": 68.25,
      "completions/min_terminated_length": 68.25,
      "epoch": 0.004273504273504274,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 7.757056816470261,
      "grpo_loss": -0.03923009986246484,
      "kl": 4.7147274017333984e-05,
      "learning_rate": 0.0,
      "loss": 0.3179,
      "num_tokens": 67233.0,
      "reward": 2.875,
      "reward_std": 0.16637087427079678,
      "rewards/accuracy_reward/mean": 0.875,
      "rewards/accuracy_reward/std": 0.3316035121679306,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.01564404438249767,
      "stage2_sft_loss": 3.0086944103240967,
      "step": 1,
      "total_loss": 0.2772833965718746
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 295.25,
      "completions/max_terminated_length": 295.25,
      "completions/mean_length": 142.0234375,
      "completions/mean_terminated_length": 142.0234375,
      "completions/min_length": 64.25,
      "completions/min_terminated_length": 64.25,
      "epoch": 0.008547008547008548,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 7.627764897193262,
      "grpo_loss": 0.00449168267357436,
      "kl": 2.8908252716064453e-05,
      "learning_rate": 8.333333333333333e-07,
      "loss": 0.3089,
      "num_tokens": 138127.0,
      "reward": 2.91796875,
      "reward_std": 0.15782861225306988,
      "rewards/accuracy_reward/mean": 0.91796875,
      "rewards/accuracy_reward/std": 0.2668132297694683,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.009601139056030661,
      "stage2_sft_loss": 3.002301514148712,
      "step": 2,
      "total_loss": 0.31432297825813293
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 249.0,
      "completions/max_terminated_length": 249.0,
      "completions/mean_length": 120.30859375,
      "completions/mean_terminated_length": 120.30859375,
      "completions/min_length": 66.25,
      "completions/min_terminated_length": 66.25,
      "epoch": 0.01282051282051282,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 7.495269211942902,
      "grpo_loss": -0.046216885010437636,
      "kl": 4.70578670501709e-05,
      "learning_rate": 1.6666666666666667e-06,
      "loss": 0.3016,
      "num_tokens": 202910.0,
      "reward": 2.91796875,
      "reward_std": 0.12046922650188208,
      "rewards/accuracy_reward/mean": 0.91796875,
      "rewards/accuracy_reward/std": 0.2554394565522671,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.033421904779970646,
      "stage2_sft_loss": 2.95179682970047,
      "step": 3,
      "total_loss": 0.28238470479846
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 210.5,
      "completions/max_terminated_length": 210.5,
      "completions/mean_length": 125.6328125,
      "completions/mean_terminated_length": 125.6328125,
      "completions/min_length": 66.25,
      "completions/min_terminated_length": 66.25,
      "epoch": 0.017094017094017096,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 6.493119857304641,
      "grpo_loss": -0.006215297922665286,
      "kl": 4.932284355163574e-05,
      "learning_rate": 2.5e-06,
      "loss": 0.2727,
      "num_tokens": 268104.0,
      "reward": 2.94921875,
      "reward_std": 0.0991684952750802,
      "rewards/accuracy_reward/mean": 0.94921875,
      "rewards/accuracy_reward/std": 0.21033688634634018,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.008940062718465924,
      "stage2_sft_loss": 2.607166886329651,
      "step": 4,
      "total_loss": 0.26344145461916924
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 244.5,
      "completions/max_terminated_length": 244.5,
      "completions/mean_length": 129.5390625,
      "completions/mean_terminated_length": 129.5390625,
      "completions/min_length": 66.5,
      "completions/min_terminated_length": 66.5,
      "epoch": 0.021367521367521368,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 4.563119008532273,
      "grpo_loss": -0.015285405330359936,
      "kl": 5.14984130859375e-05,
      "learning_rate": 3.3333333333333333e-06,
      "loss": 0.2226,
      "num_tokens": 332650.0,
      "reward": 2.921875,
      "reward_std": 0.16584291495382786,
      "rewards/accuracy_reward/mean": 0.921875,
      "rewards/accuracy_reward/std": 0.26355477422475815,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.018052286352030933,
      "stage2_sft_loss": 2.005835622549057,
      "step": 5,
      "total_loss": 0.20335044339299202
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 272.0,
      "completions/max_terminated_length": 272.0,
      "completions/mean_length": 140.67578125,
      "completions/mean_terminated_length": 140.67578125,
      "completions/min_length": 64.5,
      "completions/min_terminated_length": 64.5,
      "epoch": 0.02564102564102564,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 3.4342274451007677,
      "grpo_loss": 0.011418178940715507,
      "kl": 0.00010204315185546875,
      "learning_rate": 4.166666666666667e-06,
      "loss": 0.1624,
      "num_tokens": 403311.0,
      "reward": 2.88671875,
      "reward_std": 0.16755038313567638,
      "rewards/accuracy_reward/mean": 0.88671875,
      "rewards/accuracy_reward/std": 0.3000990152359009,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.014668514719232917,
      "stage2_sft_loss": 1.380816400051117,
      "step": 6,
      "total_loss": 0.1641683429479599
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 249.0,
      "completions/max_terminated_length": 249.0,
      "completions/mean_length": 129.4140625,
      "completions/mean_terminated_length": 129.4140625,
      "completions/min_length": 71.75,
      "completions/min_terminated_length": 71.75,
      "epoch": 0.029914529914529916,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 3.2005230162166525,
      "grpo_loss": 0.0499419612691554,
      "kl": 0.0001811981201171875,
      "learning_rate": 5e-06,
      "loss": 0.138,
      "num_tokens": 468425.0,
      "reward": 2.9189453125,
      "reward_std": 0.1503223106265068,
      "rewards/accuracy_reward/mean": 0.92578125,
      "rewards/accuracy_reward/std": 0.24648795649409294,
      "rewards/format_reward/mean": 0.99609375,
      "rewards/format_reward/std": 0.03125,
      "rewards/tag_count_reward/mean": 0.9970703125,
      "rewards/tag_count_reward/std": 0.02343750186264515,
      "stage1_sft_loss": 0.008992363698780537,
      "stage2_sft_loss": 1.0415422320365906,
      "step": 7,
      "total_loss": 0.16308855265378952
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 223.75,
      "completions/max_terminated_length": 223.75,
      "completions/mean_length": 131.37890625,
      "completions/mean_terminated_length": 131.37890625,
      "completions/min_length": 51.5,
      "completions/min_terminated_length": 51.5,
      "epoch": 0.03418803418803419,
      "frac_reward_zero_std": 0.34375,
      "grad_norm": 4.478538499496423,
      "grpo_loss": -0.023593724047032083,
      "kl": 0.0139923095703125,
      "learning_rate": 5.833333333333334e-06,
      "loss": 0.0476,
      "num_tokens": 534458.0,
      "reward": 2.6806640625,
      "reward_std": 0.3335800841450691,
      "rewards/accuracy_reward/mean": 0.8515625,
      "rewards/accuracy_reward/std": 0.34831516817212105,
      "rewards/format_reward/mean": 0.8828125,
      "rewards/format_reward/std": 0.31499073281884193,
      "rewards/tag_count_reward/mean": 0.9462890625,
      "rewards/tag_count_reward/std": 0.14767055958509445,
      "stage1_sft_loss": 0.009093074826523662,
      "stage2_sft_loss": 0.41022688150405884,
      "step": 8,
      "total_loss": 0.02652203943580389
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 264.5,
      "completions/max_terminated_length": 264.5,
      "completions/mean_length": 139.171875,
      "completions/mean_terminated_length": 139.171875,
      "completions/min_length": 67.25,
      "completions/min_terminated_length": 67.25,
      "epoch": 0.038461538461538464,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 1.8284207472683809,
      "grpo_loss": 3.280168118635629e-05,
      "kl": 0.0007824897766113281,
      "learning_rate": 6.666666666666667e-06,
      "loss": 0.0525,
      "num_tokens": 605526.0,
      "reward": 2.89453125,
      "reward_std": 0.1113965567201376,
      "rewards/accuracy_reward/mean": 0.89453125,
      "rewards/accuracy_reward/std": 0.29731010645627975,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.008276262902654707,
      "stage2_sft_loss": 0.22668107599020004,
      "step": 9,
      "total_loss": 0.030977172777056694
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 262.25,
      "completions/max_terminated_length": 262.25,
      "completions/mean_length": 147.97265625,
      "completions/mean_terminated_length": 147.97265625,
      "completions/min_length": 60.0,
      "completions/min_terminated_length": 60.0,
      "epoch": 0.042735042735042736,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 1.0282877052261212,
      "grpo_loss": 0.0008859479815441773,
      "kl": 0.0009212493896484375,
      "learning_rate": 7.500000000000001e-06,
      "loss": 0.0131,
      "num_tokens": 677151.0,
      "reward": 2.94921875,
      "reward_std": 0.12073762249201536,
      "rewards/accuracy_reward/mean": 0.94921875,
      "rewards/accuracy_reward/std": 0.1755770929157734,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.0038859813648741692,
      "stage2_sft_loss": 0.06995399482548237,
      "step": 10,
      "total_loss": 0.01176732883322984
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 272.0,
      "completions/max_terminated_length": 272.0,
      "completions/mean_length": 143.76953125,
      "completions/mean_terminated_length": 143.76953125,
      "completions/min_length": 67.5,
      "completions/min_terminated_length": 67.5,
      "epoch": 0.04700854700854701,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 1.0728683234729373,
      "grpo_loss": 3.9769674543777e-05,
      "kl": 0.002231597900390625,
      "learning_rate": 8.333333333333334e-06,
      "loss": 0.0235,
      "num_tokens": 746860.0,
      "reward": 2.9287109375,
      "reward_std": 0.11981397215276957,
      "rewards/accuracy_reward/mean": 0.93359375,
      "rewards/accuracy_reward/std": 0.23851029947400093,
      "rewards/format_reward/mean": 0.99609375,
      "rewards/format_reward/std": 0.03125,
      "rewards/tag_count_reward/mean": 0.9990234375,
      "rewards/tag_count_reward/std": 0.0078125,
      "stage1_sft_loss": 0.011087916325777769,
      "stage2_sft_loss": 0.003615820431150496,
      "step": 11,
      "total_loss": 0.011489268275909126
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 260.0,
      "completions/max_terminated_length": 260.0,
      "completions/mean_length": 137.61328125,
      "completions/mean_terminated_length": 137.61328125,
      "completions/min_length": 63.75,
      "completions/min_terminated_length": 63.75,
      "epoch": 0.05128205128205128,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 2.613003430486309,
      "grpo_loss": 0.007426628337270813,
      "kl": 0.014751434326171875,
      "learning_rate": 9.166666666666666e-06,
      "loss": 0.0109,
      "num_tokens": 815081.0,
      "reward": 2.916015625,
      "reward_std": 0.14841514453291893,
      "rewards/accuracy_reward/mean": 0.95703125,
      "rewards/accuracy_reward/std": 0.17054874077439308,
      "rewards/format_reward/mean": 0.97265625,
      "rewards/format_reward/std": 0.10469620674848557,
      "rewards/tag_count_reward/mean": 0.986328125,
      "rewards/tag_count_reward/std": 0.05234810337424278,
      "stage1_sft_loss": 0.019605551147833467,
      "stage2_sft_loss": 0.016597392386756837,
      "step": 12,
      "total_loss": 0.02869191882200539
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 250.25,
      "completions/max_terminated_length": 250.25,
      "completions/mean_length": 137.28125,
      "completions/mean_terminated_length": 137.28125,
      "completions/min_length": 75.25,
      "completions/min_terminated_length": 75.25,
      "epoch": 0.05555555555555555,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 18.474324997624624,
      "grpo_loss": 0.03844568212775812,
      "kl": 0.151611328125,
      "learning_rate": 1e-05,
      "loss": 0.097,
      "num_tokens": 885025.0,
      "reward": 2.0830078125,
      "reward_std": 0.2612629234790802,
      "rewards/accuracy_reward/mean": 0.8046875,
      "rewards/accuracy_reward/std": 0.39255890995264053,
      "rewards/format_reward/mean": 0.515625,
      "rewards/format_reward/std": 0.48288241028785706,
      "rewards/tag_count_reward/mean": 0.7626953125,
      "rewards/tag_count_reward/std": 0.2884962745010853,
      "stage1_sft_loss": 0.014269684674218297,
      "stage2_sft_loss": 0.0914901232754346,
      "step": 13,
      "total_loss": 0.061864377348683774
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 228.5,
      "completions/max_terminated_length": 228.5,
      "completions/mean_length": 119.6796875,
      "completions/mean_terminated_length": 119.6796875,
      "completions/min_length": 65.75,
      "completions/min_terminated_length": 65.75,
      "epoch": 0.05982905982905983,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 0.915039501524939,
      "grpo_loss": 0.0098477653568807,
      "kl": 0.005916595458984375,
      "learning_rate": 1.0833333333333334e-05,
      "loss": 0.0059,
      "num_tokens": 948439.0,
      "reward": 2.9599609375,
      "reward_std": 0.08884451817721128,
      "rewards/accuracy_reward/mean": 0.98046875,
      "rewards/accuracy_reward/std": 0.11576050892472267,
      "rewards/format_reward/mean": 0.98828125,
      "rewards/format_reward/std": 0.05326050892472267,
      "rewards/tag_count_reward/mean": 0.9912109375,
      "rewards/tag_count_reward/std": 0.03994538262486458,
      "stage1_sft_loss": 0.009817917714826763,
      "stage2_sft_loss": 0.009871041984297335,
      "step": 14,
      "total_loss": 0.020652787410654128
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 214.0,
      "completions/max_terminated_length": 214.0,
      "completions/mean_length": 127.10546875,
      "completions/mean_terminated_length": 127.10546875,
      "completions/min_length": 74.5,
      "completions/min_terminated_length": 74.5,
      "epoch": 0.0641025641025641,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 2.1793393582609744,
      "grpo_loss": 0.00019377293256184203,
      "kl": 0.0197601318359375,
      "learning_rate": 1.1666666666666668e-05,
      "loss": 0.0314,
      "num_tokens": 1013130.0,
      "reward": 2.87109375,
      "reward_std": 0.14518490061163902,
      "rewards/accuracy_reward/mean": 0.87109375,
      "rewards/accuracy_reward/std": 0.32502105459570885,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.01030012167757377,
      "stage2_sft_loss": 0.0008246329380199313,
      "step": 15,
      "total_loss": 0.010576357715763152
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 220.0,
      "completions/max_terminated_length": 220.0,
      "completions/mean_length": 122.9140625,
      "completions/mean_terminated_length": 122.9140625,
      "completions/min_length": 63.25,
      "completions/min_terminated_length": 63.25,
      "epoch": 0.06837606837606838,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 2.0006532751956856,
      "grpo_loss": 0.00015884065362570254,
      "kl": 0.011383056640625,
      "learning_rate": 1.25e-05,
      "loss": 0.0248,
      "num_tokens": 1075980.0,
      "reward": 2.90625,
      "reward_std": 0.06378498114645481,
      "rewards/accuracy_reward/mean": 0.90625,
      "rewards/accuracy_reward/std": 0.2523401081562042,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.013333460665307939,
      "stage2_sft_loss": 0.001343881434877403,
      "step": 16,
      "total_loss": 0.013626688945805654
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 218.75,
      "completions/max_terminated_length": 218.75,
      "completions/mean_length": 129.1171875,
      "completions/mean_terminated_length": 129.1171875,
      "completions/min_length": 67.0,
      "completions/min_terminated_length": 67.0,
      "epoch": 0.07264957264957266,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 1.9687509137531682,
      "grpo_loss": 0.0009033796213770984,
      "kl": 0.011016845703125,
      "learning_rate": 1.3333333333333333e-05,
      "loss": 0.0109,
      "num_tokens": 1142074.0,
      "reward": 2.97265625,
      "reward_std": 0.06207750644534826,
      "rewards/accuracy_reward/mean": 0.97265625,
      "rewards/accuracy_reward/std": 0.13608578220009804,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.021144109414308332,
      "stage2_sft_loss": 0.0017398461059201509,
      "step": 17,
      "total_loss": 0.022221474413527176
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 225.75,
      "completions/max_terminated_length": 225.75,
      "completions/mean_length": 127.78515625,
      "completions/mean_terminated_length": 127.78515625,
      "completions/min_length": 61.75,
      "completions/min_terminated_length": 61.75,
      "epoch": 0.07692307692307693,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 2.232924765040882,
      "grpo_loss": 0.0006565492107597493,
      "kl": 0.0255126953125,
      "learning_rate": 1.416666666666667e-05,
      "loss": 0.0187,
      "num_tokens": 1206819.0,
      "reward": 2.9033203125,
      "reward_std": 0.11760413460433483,
      "rewards/accuracy_reward/mean": 0.91015625,
      "rewards/accuracy_reward/std": 0.1928669586777687,
      "rewards/format_reward/mean": 0.99609375,
      "rewards/format_reward/std": 0.03125,
      "rewards/tag_count_reward/mean": 0.9970703125,
      "rewards/tag_count_reward/std": 0.02343750186264515,
      "stage1_sft_loss": 0.011690539606206585,
      "stage2_sft_loss": 0.0009237131016561761,
      "step": 18,
      "total_loss": 0.012439460391760804
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 282.0,
      "completions/max_terminated_length": 282.0,
      "completions/mean_length": 148.640625,
      "completions/mean_terminated_length": 148.640625,
      "completions/min_length": 61.0,
      "completions/min_terminated_length": 61.0,
      "epoch": 0.0811965811965812,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 2.205851245453103,
      "grpo_loss": -0.0012649910768232075,
      "kl": 0.0226593017578125,
      "learning_rate": 1.5000000000000002e-05,
      "loss": 0.031,
      "num_tokens": 1277815.0,
      "reward": 2.9140625,
      "reward_std": 0.11876176204532385,
      "rewards/accuracy_reward/mean": 0.9140625,
      "rewards/accuracy_reward/std": 0.2623785026371479,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.012684938323218375,
      "stage2_sft_loss": 0.0032887740817386657,
      "step": 19,
      "total_loss": 0.011748824326787144
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 245.25,
      "completions/max_terminated_length": 245.25,
      "completions/mean_length": 146.29296875,
      "completions/mean_terminated_length": 146.29296875,
      "completions/min_length": 76.25,
      "completions/min_terminated_length": 76.25,
      "epoch": 0.08547008547008547,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 2.6677025177715405,
      "grpo_loss": -0.03813791631546337,
      "kl": 0.048675537109375,
      "learning_rate": 1.5833333333333333e-05,
      "loss": 0.0254,
      "num_tokens": 1349738.0,
      "reward": 2.900390625,
      "reward_std": 0.15921209380030632,
      "rewards/accuracy_reward/mean": 0.90234375,
      "rewards/accuracy_reward/std": 0.29337338730692863,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.998046875,
      "rewards/tag_count_reward/std": 0.010960506275296211,
      "stage1_sft_loss": 0.023294871527468786,
      "stage2_sft_loss": 0.026308074113330804,
      "step": 20,
      "total_loss": -0.012212236848426983
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 244.0,
      "completions/max_terminated_length": 244.0,
      "completions/mean_length": 136.9375,
      "completions/mean_terminated_length": 136.9375,
      "completions/min_length": 67.25,
      "completions/min_terminated_length": 67.25,
      "epoch": 0.08974358974358974,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 2.7414827300784337,
      "grpo_loss": 0.0006684071158815641,
      "kl": 0.043243408203125,
      "learning_rate": 1.6666666666666667e-05,
      "loss": 0.032,
      "num_tokens": 1418866.0,
      "reward": 2.83984375,
      "reward_std": 0.1150759719312191,
      "rewards/accuracy_reward/mean": 0.83984375,
      "rewards/accuracy_reward/std": 0.3069465383887291,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.02183083994896151,
      "stage2_sft_loss": 0.000715008718543686,
      "step": 21,
      "total_loss": 0.02257074779481627
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 261.5,
      "completions/max_terminated_length": 261.5,
      "completions/mean_length": 132.8515625,
      "completions/mean_terminated_length": 132.8515625,
      "completions/min_length": 74.5,
      "completions/min_terminated_length": 74.5,
      "epoch": 0.09401709401709402,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 2.842263357480505,
      "grpo_loss": 0.002760487957857549,
      "kl": 0.040985107421875,
      "learning_rate": 1.7500000000000002e-05,
      "loss": 0.041,
      "num_tokens": 1485252.0,
      "reward": 2.8662109375,
      "reward_std": 0.1345482999458909,
      "rewards/accuracy_reward/mean": 0.87109375,
      "rewards/accuracy_reward/std": 0.300577100366354,
      "rewards/format_reward/mean": 0.99609375,
      "rewards/format_reward/std": 0.03125,
      "rewards/tag_count_reward/mean": 0.9990234375,
      "rewards/tag_count_reward/std": 0.0078125,
      "stage1_sft_loss": 0.013363483012653887,
      "stage2_sft_loss": 0.011726534779882059,
      "step": 22,
      "total_loss": 0.0172966243699193
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 235.25,
      "completions/max_terminated_length": 235.25,
      "completions/mean_length": 128.09375,
      "completions/mean_terminated_length": 128.09375,
      "completions/min_length": 67.25,
      "completions/min_terminated_length": 67.25,
      "epoch": 0.09829059829059829,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 2.2903318954420415,
      "grpo_loss": 0.04079900741635356,
      "kl": 0.040191650390625,
      "learning_rate": 1.8333333333333333e-05,
      "loss": 0.0472,
      "num_tokens": 1550084.0,
      "reward": 2.8671875,
      "reward_std": 0.17411433160305023,
      "rewards/accuracy_reward/mean": 0.8671875,
      "rewards/accuracy_reward/std": 0.3389710336923599,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.05828226753510535,
      "stage2_sft_loss": 0.011348728352459148,
      "step": 23,
      "total_loss": 0.10021615156438202
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 351.25,
      "completions/max_terminated_length": 351.25,
      "completions/mean_length": 132.10546875,
      "completions/mean_terminated_length": 132.10546875,
      "completions/min_length": 59.5,
      "completions/min_terminated_length": 59.5,
      "epoch": 0.10256410256410256,
      "frac_reward_zero_std": 0.53125,
      "grad_norm": 2.3689469506326883,
      "grpo_loss": 0.0008243897027568892,
      "kl": 0.0498046875,
      "learning_rate": 1.916666666666667e-05,
      "loss": 0.0496,
      "num_tokens": 1617023.0,
      "reward": 2.87109375,
      "reward_std": 0.20608290284872055,
      "rewards/accuracy_reward/mean": 0.87109375,
      "rewards/accuracy_reward/std": 0.33124853298068047,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.011117298912722617,
      "stage2_sft_loss": 0.001786644832463935,
      "step": 24,
      "total_loss": 0.012120353290811181
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 256.0,
      "completions/mean_length": 126.80078125,
      "completions/mean_terminated_length": 126.80078125,
      "completions/min_length": 61.75,
      "completions/min_terminated_length": 61.75,
      "epoch": 0.10683760683760683,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 2.3631085168942807,
      "grpo_loss": 0.0019136814298690297,
      "kl": 0.051910400390625,
      "learning_rate": 2e-05,
      "loss": 0.0334,
      "num_tokens": 1682060.0,
      "reward": 2.8203125,
      "reward_std": 0.18767160922288895,
      "rewards/accuracy_reward/mean": 0.82421875,
      "rewards/accuracy_reward/std": 0.3797462359070778,
      "rewards/format_reward/mean": 0.99609375,
      "rewards/format_reward/std": 0.03125,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.09090784145519137,
      "stage2_sft_loss": 0.0017171509243780747,
      "step": 25,
      "total_loss": 0.09299323987215757
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 276.0,
      "completions/max_terminated_length": 276.0,
      "completions/mean_length": 127.9765625,
      "completions/mean_terminated_length": 127.9765625,
      "completions/min_length": 57.0,
      "completions/min_terminated_length": 57.0,
      "epoch": 0.1111111111111111,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 2.4456833239275717,
      "grpo_loss": -0.012467821623431519,
      "kl": 0.05926513671875,
      "learning_rate": 1.9998881018102735e-05,
      "loss": 0.0471,
      "num_tokens": 1747406.0,
      "reward": 2.83203125,
      "reward_std": 0.18003800511360168,
      "rewards/accuracy_reward/mean": 0.83203125,
      "rewards/accuracy_reward/std": 0.36339013651013374,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.02221155003644526,
      "stage2_sft_loss": 0.003884657140588388,
      "step": 26,
      "total_loss": 0.010132194496691227
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -6.90625,
      "completions/max_length": 513.75,
      "completions/max_terminated_length": 296.25,
      "completions/mean_length": 149.0,
      "completions/mean_terminated_length": 141.6593952178955,
      "completions/min_length": 65.25,
      "completions/min_terminated_length": 65.25,
      "epoch": 0.11538461538461539,
      "frac_reward_zero_std": 0.40625,
      "grad_norm": 2.9242918299689125,
      "grpo_loss": 0.30453615926671773,
      "kl": 0.100341796875,
      "learning_rate": 1.9995524322835035e-05,
      "loss": 0.187,
      "num_tokens": 1818462.0,
      "reward": 2.7412109375,
      "reward_std": 0.28656554222106934,
      "rewards/accuracy_reward/mean": 0.76171875,
      "rewards/accuracy_reward/std": 0.39992421492934227,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.08451050892472267,
      "rewards/tag_count_reward/mean": 0.9951171875,
      "rewards/tag_count_reward/std": 0.029733512550592422,
      "stage1_sft_loss": 0.06889296881854534,
      "stage2_sft_loss": 0.0499890799401328,
      "step": 27,
      "total_loss": 0.3784280573017895
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -6.96875,
      "completions/max_length": 432.75,
      "completions/max_terminated_length": 305.75,
      "completions/mean_length": 146.4296875,
      "completions/mean_terminated_length": 143.96856689453125,
      "completions/min_length": 65.25,
      "completions/min_terminated_length": 65.25,
      "epoch": 0.11965811965811966,
      "frac_reward_zero_std": 0.28125,
      "grad_norm": 2.757226870099959,
      "grpo_loss": 0.08061277940578293,
      "kl": 0.07757568359375,
      "learning_rate": 1.9989930665413148e-05,
      "loss": 0.1245,
      "num_tokens": 1889644.0,
      "reward": 2.7841796875,
      "reward_std": 0.32649947702884674,
      "rewards/accuracy_reward/mean": 0.80078125,
      "rewards/accuracy_reward/std": 0.3944511339068413,
      "rewards/format_reward/mean": 0.98828125,
      "rewards/format_reward/std": 0.07509202510118484,
      "rewards/tag_count_reward/mean": 0.9951171875,
      "rewards/tag_count_reward/std": 0.03298301063477993,
      "stage1_sft_loss": 0.04570018174126744,
      "stage2_sft_loss": 0.00978102523367852,
      "step": 28,
      "total_loss": 0.1272910633124411
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 327.75,
      "completions/max_terminated_length": 327.75,
      "completions/mean_length": 147.90625,
      "completions/mean_terminated_length": 147.90625,
      "completions/min_length": 49.75,
      "completions/min_terminated_length": 49.75,
      "epoch": 0.12393162393162394,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 2.5000758685486737,
      "grpo_loss": 0.07717300354852341,
      "kl": 0.06719970703125,
      "learning_rate": 1.998210129767735e-05,
      "loss": 0.1019,
      "num_tokens": 1960196.0,
      "reward": 2.8193359375,
      "reward_std": 0.20288447104394436,
      "rewards/accuracy_reward/mean": 0.8203125,
      "rewards/accuracy_reward/std": 0.38116608560085297,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.9990234375,
      "rewards/tag_count_reward/std": 0.0078125,
      "stage1_sft_loss": 0.09177985601127148,
      "stage2_sft_loss": 0.018172981101088226,
      "step": 29,
      "total_loss": 0.17077014781534672
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 245.5,
      "completions/max_terminated_length": 245.5,
      "completions/mean_length": 133.14453125,
      "completions/mean_terminated_length": 133.14453125,
      "completions/min_length": 64.0,
      "completions/min_terminated_length": 64.0,
      "epoch": 0.1282051282051282,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 2.628202806969854,
      "grpo_loss": -0.029030836129095405,
      "kl": 0.076416015625,
      "learning_rate": 1.9972037971811802e-05,
      "loss": 0.0678,
      "num_tokens": 2027969.0,
      "reward": 2.7900390625,
      "reward_std": 0.23592286556959152,
      "rewards/accuracy_reward/mean": 0.796875,
      "rewards/accuracy_reward/std": 0.40148045867681503,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.9931640625,
      "rewards/tag_count_reward/std": 0.027866113930940628,
      "stage1_sft_loss": 0.09150360058993101,
      "stage2_sft_loss": 0.03152645181398839,
      "step": 30,
      "total_loss": 0.06562541099265218
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -6.96875,
      "completions/max_length": 390.5,
      "completions/max_terminated_length": 282.75,
      "completions/mean_length": 139.5234375,
      "completions/mean_terminated_length": 137.08885192871094,
      "completions/min_length": 63.25,
      "completions/min_terminated_length": 63.25,
      "epoch": 0.13247863247863248,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 2.5503433302929377,
      "grpo_loss": -0.01416161423549056,
      "kl": 0.0740966796875,
      "learning_rate": 1.9959742939952393e-05,
      "loss": 0.0858,
      "num_tokens": 2096983.0,
      "reward": 2.732421875,
      "reward_std": 0.2888246178627014,
      "rewards/accuracy_reward/mean": 0.73828125,
      "rewards/accuracy_reward/std": 0.43773481994867325,
      "rewards/format_reward/mean": 0.99609375,
      "rewards/format_reward/std": 0.03125,
      "rewards/tag_count_reward/mean": 0.998046875,
      "rewards/tag_count_reward/std": 0.015625,
      "stage1_sft_loss": 0.07643852988258004,
      "stage2_sft_loss": 0.042741050478070974,
      "step": 31,
      "total_loss": 0.06655102036893368
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -6.96875,
      "completions/max_length": 346.5,
      "completions/max_terminated_length": 208.25,
      "completions/mean_length": 118.8671875,
      "completions/mean_terminated_length": 116.32986259460449,
      "completions/min_length": 63.75,
      "completions/min_terminated_length": 63.75,
      "epoch": 0.13675213675213677,
      "frac_reward_zero_std": 0.21875,
      "grad_norm": 2.367757483948867,
      "grpo_loss": -0.02189300279133022,
      "kl": 0.071044921875,
      "learning_rate": 1.9945218953682736e-05,
      "loss": 0.0887,
      "num_tokens": 2159077.0,
      "reward": 2.7685546875,
      "reward_std": 0.35685962438583374,
      "rewards/accuracy_reward/mean": 0.77734375,
      "rewards/accuracy_reward/std": 0.4162580147385597,
      "rewards/format_reward/mean": 0.99609375,
      "rewards/format_reward/std": 0.03125,
      "rewards/tag_count_reward/mean": 0.9951171875,
      "rewards/tag_count_reward/std": 0.032399868592619896,
      "stage1_sft_loss": 0.04500664956867695,
      "stage2_sft_loss": 0.007157278014346957,
      "step": 32,
      "total_loss": 0.023829374462366104
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 231.5,
      "completions/max_terminated_length": 231.5,
      "completions/mean_length": 112.40625,
      "completions/mean_terminated_length": 112.40625,
      "completions/min_length": 63.25,
      "completions/min_terminated_length": 63.25,
      "epoch": 0.14102564102564102,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 2.1938369497046755,
      "grpo_loss": -0.03613918833434582,
      "kl": 0.0726318359375,
      "learning_rate": 1.9928469263418376e-05,
      "loss": 0.0584,
      "num_tokens": 2221941.0,
      "reward": 2.85546875,
      "reward_std": 0.16728199180215597,
      "rewards/accuracy_reward/mean": 0.85546875,
      "rewards/accuracy_reward/std": 0.324543260037899,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.04147233220282942,
      "stage2_sft_loss": 0.0106936156807933,
      "step": 33,
      "total_loss": 0.006402507424354553
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -6.96875,
      "completions/max_length": 428.25,
      "completions/max_terminated_length": 302.25,
      "completions/mean_length": 130.5546875,
      "completions/mean_terminated_length": 128.06665802001953,
      "completions/min_length": 60.75,
      "completions/min_terminated_length": 60.75,
      "epoch": 0.1452991452991453,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 2.245639334646511,
      "grpo_loss": 0.02738322678487748,
      "kl": 0.07635498046875,
      "learning_rate": 1.990949761767935e-05,
      "loss": 0.0873,
      "num_tokens": 2289123.0,
      "reward": 2.734375,
      "reward_std": 0.30481160432100296,
      "rewards/accuracy_reward/mean": 0.74609375,
      "rewards/accuracy_reward/std": 0.4096733331680298,
      "rewards/format_reward/mean": 0.9921875,
      "rewards/format_reward/std": 0.0625,
      "rewards/tag_count_reward/mean": 0.99609375,
      "rewards/tag_count_reward/std": 0.03125000186264515,
      "stage1_sft_loss": 0.07007205486297607,
      "stage2_sft_loss": 0.009100667899474502,
      "step": 34,
      "total_loss": 0.09836534410715103
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 367.0,
      "completions/max_terminated_length": 367.0,
      "completions/mean_length": 139.42578125,
      "completions/mean_terminated_length": 139.42578125,
      "completions/min_length": 63.0,
      "completions/min_terminated_length": 63.0,
      "epoch": 0.14957264957264957,
      "frac_reward_zero_std": 0.40625,
      "grad_norm": 2.6016141061868447,
      "grpo_loss": 0.09335880249273032,
      "kl": 0.0828857421875,
      "learning_rate": 1.9888308262251286e-05,
      "loss": 0.0736,
      "num_tokens": 2359440.0,
      "reward": 2.71875,
      "reward_std": 0.2593473196029663,
      "rewards/accuracy_reward/mean": 0.71875,
      "rewards/accuracy_reward/std": 0.43959466367959976,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.06350621022284031,
      "stage2_sft_loss": 0.01845851496909745,
      "step": 35,
      "total_loss": 0.15871086157858372
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 237.25,
      "completions/max_terminated_length": 237.25,
      "completions/mean_length": 116.875,
      "completions/mean_terminated_length": 116.875,
      "completions/min_length": 63.25,
      "completions/min_terminated_length": 63.25,
      "epoch": 0.15384615384615385,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 2.0495372575896247,
      "grpo_loss": -0.05065595902851783,
      "kl": 0.0775146484375,
      "learning_rate": 1.9864905939235215e-05,
      "loss": 0.0527,
      "num_tokens": 2422144.0,
      "reward": 2.8271484375,
      "reward_std": 0.2663256488740444,
      "rewards/accuracy_reward/mean": 0.83203125,
      "rewards/accuracy_reward/std": 0.3622770607471466,
      "rewards/format_reward/mean": 0.99609375,
      "rewards/format_reward/std": 0.03125,
      "rewards/tag_count_reward/mean": 0.9990234375,
      "rewards/tag_count_reward/std": 0.0078125,
      "stage1_sft_loss": 0.04677494731731713,
      "stage2_sft_loss": 0.018676572712138295,
      "step": 36,
      "total_loss": -0.002013354969676584
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 231.5,
      "completions/max_terminated_length": 231.5,
      "completions/mean_length": 131.09375,
      "completions/mean_terminated_length": 131.09375,
      "completions/min_length": 65.0,
      "completions/min_terminated_length": 65.0,
      "epoch": 0.1581196581196581,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 2.5671491792749266,
      "grpo_loss": -0.04589961684541777,
      "kl": 0.0906982421875,
      "learning_rate": 1.98392958859863e-05,
      "loss": 0.0581,
      "num_tokens": 2488736.0,
      "reward": 2.8671875,
      "reward_std": 0.2171314489096403,
      "rewards/accuracy_reward/mean": 0.8671875,
      "rewards/accuracy_reward/std": 0.33943870663642883,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.0481309499591589,
      "stage2_sft_loss": 0.012330142228165641,
      "step": 37,
      "total_loss": 0.003464347682893276
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -6.96875,
      "completions/max_length": 389.5,
      "completions/max_terminated_length": 255.75,
      "completions/mean_length": 133.875,
      "completions/mean_terminated_length": 131.4334716796875,
      "completions/min_length": 61.5,
      "completions/min_terminated_length": 61.5,
      "epoch": 0.1623931623931624,
      "frac_reward_zero_std": 0.46875,
      "grad_norm": 2.721311776528719,
      "grpo_loss": 0.011054625327233225,
      "kl": 0.096923828125,
      "learning_rate": 1.9811483833941726e-05,
      "loss": 0.1324,
      "num_tokens": 2555168.0,
      "reward": 2.7509765625,
      "reward_std": 0.26854483410716057,
      "rewards/accuracy_reward/mean": 0.7578125,
      "rewards/accuracy_reward/std": 0.42780231684446335,
      "rewards/format_reward/mean": 0.99609375,
      "rewards/format_reward/std": 0.03125,
      "rewards/tag_count_reward/mean": 0.9970703125,
      "rewards/tag_count_reward/std": 0.02343750186264515,
      "stage1_sft_loss": 0.06103221420198679,
      "stage2_sft_loss": 0.01654094600235112,
      "step": 38,
      "total_loss": 0.07374093309044838
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 249.25,
      "completions/max_terminated_length": 249.25,
      "completions/mean_length": 141.53515625,
      "completions/mean_terminated_length": 141.53515625,
      "completions/min_length": 69.25,
      "completions/min_terminated_length": 69.25,
      "epoch": 0.16666666666666666,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 1.947435216648249,
      "grpo_loss": -0.03299991798121482,
      "kl": 0.07366943359375,
      "learning_rate": 1.9781476007338058e-05,
      "loss": 0.0564,
      "num_tokens": 2624649.0,
      "reward": 2.859375,
      "reward_std": 0.16781241074204445,
      "rewards/accuracy_reward/mean": 0.859375,
      "rewards/accuracy_reward/std": 0.2867218255996704,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.03650881536304951,
      "stage2_sft_loss": 0.009737508415128104,
      "step": 39,
      "total_loss": 0.004482649266719818
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -6.96875,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 259.5,
      "completions/mean_length": 136.1640625,
      "completions/mean_terminated_length": 133.68520736694336,
      "completions/min_length": 69.75,
      "completions/min_terminated_length": 69.75,
      "epoch": 0.17094017094017094,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 2.559201255744803,
      "grpo_loss": 0.10094646381912753,
      "kl": 0.1031494140625,
      "learning_rate": 1.9749279121818235e-05,
      "loss": 0.1331,
      "num_tokens": 2692499.0,
      "reward": 2.7470703125,
      "reward_std": 0.2642187662422657,
      "rewards/accuracy_reward/mean": 0.75390625,
      "rewards/accuracy_reward/std": 0.421333409845829,
      "rewards/format_reward/mean": 0.99609375,
      "rewards/format_reward/std": 0.03125,
      "rewards/tag_count_reward/mean": 0.9970703125,
      "rewards/tag_count_reward/std": 0.02343750186264515,
      "stage1_sft_loss": 0.0752207487821579,
      "stage2_sft_loss": 0.013288459565956146,
      "step": 40,
      "total_loss": 0.1774960570037365
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 232.75,
      "completions/max_terminated_length": 232.75,
      "completions/mean_length": 131.78515625,
      "completions/mean_terminated_length": 131.78515625,
      "completions/min_length": 70.75,
      "completions/min_terminated_length": 70.75,
      "epoch": 0.1752136752136752,
      "frac_reward_zero_std": 0.46875,
      "grad_norm": 2.6517951258463395,
      "grpo_loss": -0.0022816565178800374,
      "kl": 0.0897216796875,
      "learning_rate": 1.9714900382928674e-05,
      "loss": 0.0642,
      "num_tokens": 2757900.0,
      "reward": 2.78515625,
      "reward_std": 0.22765203192830086,
      "rewards/accuracy_reward/mean": 0.78515625,
      "rewards/accuracy_reward/std": 0.3814377970993519,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.03658259240910411,
      "stage2_sft_loss": 0.002672710397746414,
      "step": 41,
      "total_loss": 0.03456820675637573
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 253.75,
      "completions/max_terminated_length": 253.75,
      "completions/mean_length": 137.40625,
      "completions/mean_terminated_length": 137.40625,
      "completions/min_length": 64.0,
      "completions/min_terminated_length": 64.0,
      "epoch": 0.1794871794871795,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 2.8783991169680565,
      "grpo_loss": -0.018433816148899496,
      "kl": 0.1077880859375,
      "learning_rate": 1.9678347484506667e-05,
      "loss": 0.0765,
      "num_tokens": 2826092.0,
      "reward": 2.8203125,
      "reward_std": 0.2398776262998581,
      "rewards/accuracy_reward/mean": 0.8203125,
      "rewards/accuracy_reward/std": 0.3670658878982067,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.09272383619099855,
      "stage2_sft_loss": 0.008958085381891578,
      "step": 42,
      "total_loss": 0.07518582977354527
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 238.5,
      "completions/max_terminated_length": 238.5,
      "completions/mean_length": 129.51953125,
      "completions/mean_terminated_length": 129.51953125,
      "completions/min_length": 53.5,
      "completions/min_terminated_length": 53.5,
      "epoch": 0.18376068376068377,
      "frac_reward_zero_std": 0.53125,
      "grad_norm": 2.9214012259202597,
      "grpo_loss": 0.005569995621044654,
      "kl": 0.1058349609375,
      "learning_rate": 1.9639628606958535e-05,
      "loss": 0.0934,
      "num_tokens": 2891849.0,
      "reward": 2.7578125,
      "reward_std": 0.20555248856544495,
      "rewards/accuracy_reward/mean": 0.7578125,
      "rewards/accuracy_reward/std": 0.41410429030656815,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.07663985062390566,
      "stage2_sft_loss": 0.03882742972928099,
      "step": 43,
      "total_loss": 0.08609258919022977
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 256.0,
      "completions/mean_length": 126.0390625,
      "completions/mean_terminated_length": 126.0390625,
      "completions/min_length": 69.0,
      "completions/min_terminated_length": 69.0,
      "epoch": 0.18803418803418803,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 2.714946091428236,
      "grpo_loss": 0.011901529098395258,
      "kl": 0.1015625,
      "learning_rate": 1.9598752415428893e-05,
      "loss": 0.0697,
      "num_tokens": 2957499.0,
      "reward": 2.8203125,
      "reward_std": 0.20582089200615883,
      "rewards/accuracy_reward/mean": 0.8203125,
      "rewards/accuracy_reward/std": 0.3795153424143791,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.06588136032223701,
      "stage2_sft_loss": 0.008633614983409643,
      "step": 44,
      "total_loss": 0.07864625263027847
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 334.75,
      "completions/max_terminated_length": 334.75,
      "completions/mean_length": 149.09765625,
      "completions/mean_terminated_length": 149.09765625,
      "completions/min_length": 63.25,
      "completions/min_terminated_length": 63.25,
      "epoch": 0.19230769230769232,
      "frac_reward_zero_std": 0.53125,
      "grad_norm": 2.636385542774345,
      "grpo_loss": -0.0019036887679249048,
      "kl": 0.116455078125,
      "learning_rate": 1.955572805786141e-05,
      "loss": 0.0948,
      "num_tokens": 3029404.0,
      "reward": 2.79296875,
      "reward_std": 0.2106798104941845,
      "rewards/accuracy_reward/mean": 0.79296875,
      "rewards/accuracy_reward/std": 0.38675472885370255,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.07216842658817768,
      "stage2_sft_loss": 0.013309189293067902,
      "step": 45,
      "total_loss": 0.07159565854817629
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 322.25,
      "completions/max_terminated_length": 322.25,
      "completions/mean_length": 141.2421875,
      "completions/mean_terminated_length": 141.2421875,
      "completions/min_length": 60.0,
      "completions/min_terminated_length": 60.0,
      "epoch": 0.19658119658119658,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 3.1514982024595684,
      "grpo_loss": -0.034489477693568915,
      "kl": 0.1217041015625,
      "learning_rate": 1.9510565162951538e-05,
      "loss": 0.0756,
      "num_tokens": 3100466.0,
      "reward": 2.8203125,
      "reward_std": 0.1822783462703228,
      "rewards/accuracy_reward/mean": 0.8203125,
      "rewards/accuracy_reward/std": 0.36859218776226044,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.04779887804761529,
      "stage2_sft_loss": 0.01275486926897429,
      "step": 46,
      "total_loss": 0.014584887307137251
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 297.25,
      "completions/max_terminated_length": 297.25,
      "completions/mean_length": 130.953125,
      "completions/mean_terminated_length": 130.953125,
      "completions/min_length": 65.75,
      "completions/min_terminated_length": 65.75,
      "epoch": 0.20085470085470086,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 2.639450650301935,
      "grpo_loss": 0.06104949407745153,
      "kl": 0.115966796875,
      "learning_rate": 1.9463273837991643e-05,
      "loss": 0.1032,
      "num_tokens": 3168358.0,
      "reward": 2.8359375,
      "reward_std": 0.1751839891076088,
      "rewards/accuracy_reward/mean": 0.8359375,
      "rewards/accuracy_reward/std": 0.3520497493445873,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.08292205259203911,
      "stage2_sft_loss": 0.0034438925213180482,
      "step": 47,
      "total_loss": 0.14431593287736177
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 322.25,
      "completions/max_terminated_length": 322.25,
      "completions/mean_length": 128.65234375,
      "completions/mean_terminated_length": 128.65234375,
      "completions/min_length": 51.0,
      "completions/min_terminated_length": 51.0,
      "epoch": 0.20512820512820512,
      "frac_reward_zero_std": 0.46875,
      "grad_norm": 2.8560131128791295,
      "grpo_loss": 0.020472342817811295,
      "kl": 0.11181640625,
      "learning_rate": 1.9413864666609036e-05,
      "loss": 0.0903,
      "num_tokens": 3234877.0,
      "reward": 2.7421875,
      "reward_std": 0.24184712767601013,
      "rewards/accuracy_reward/mean": 0.7421875,
      "rewards/accuracy_reward/std": 0.42586562037467957,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.05367158725857735,
      "stage2_sft_loss": 0.01597646134905517,
      "step": 48,
      "total_loss": 0.0757415764965117
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 259.25,
      "completions/max_terminated_length": 259.25,
      "completions/mean_length": 123.24609375,
      "completions/mean_terminated_length": 123.24609375,
      "completions/min_length": 59.75,
      "completions/min_terminated_length": 59.75,
      "epoch": 0.2094017094017094,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 2.632971721617991,
      "grpo_loss": -0.038325678557157516,
      "kl": 0.1258544921875,
      "learning_rate": 1.9362348706397374e-05,
      "loss": 0.1024,
      "num_tokens": 3298860.0,
      "reward": 2.8125,
      "reward_std": 0.21778054535388947,
      "rewards/accuracy_reward/mean": 0.8125,
      "rewards/accuracy_reward/std": 0.3778356984257698,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.10100406594574451,
      "stage2_sft_loss": 0.008230720413848758,
      "step": 49,
      "total_loss": 0.06350146140903234
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 294.75,
      "completions/max_terminated_length": 294.75,
      "completions/mean_length": 127.5078125,
      "completions/mean_terminated_length": 127.5078125,
      "completions/min_length": 66.75,
      "completions/min_terminated_length": 66.75,
      "epoch": 0.21367521367521367,
      "frac_reward_zero_std": 0.46875,
      "grad_norm": 2.611812735045678,
      "grpo_loss": 0.05120960566273425,
      "kl": 0.114013671875,
      "learning_rate": 1.9308737486442045e-05,
      "loss": 0.1098,
      "num_tokens": 3363390.0,
      "reward": 2.796875,
      "reward_std": 0.22883153706789017,
      "rewards/accuracy_reward/mean": 0.796875,
      "rewards/accuracy_reward/std": 0.39198366552591324,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.08808937482535839,
      "stage2_sft_loss": 0.01635385846020654,
      "step": 50,
      "total_loss": 0.14093436300754547
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 277.0,
      "completions/max_terminated_length": 277.0,
      "completions/mean_length": 131.59765625,
      "completions/mean_terminated_length": 131.59765625,
      "completions/min_length": 61.25,
      "completions/min_terminated_length": 61.25,
      "epoch": 0.21794871794871795,
      "frac_reward_zero_std": 0.3125,
      "grad_norm": 2.746283813005745,
      "grpo_loss": 0.012858211644925177,
      "kl": 0.122802734375,
      "learning_rate": 1.9253043004739967e-05,
      "loss": 0.0938,
      "num_tokens": 3429839.0,
      "reward": 2.751953125,
      "reward_std": 0.29525136202573776,
      "rewards/accuracy_reward/mean": 0.7578125,
      "rewards/accuracy_reward/std": 0.415524423122406,
      "rewards/format_reward/mean": 0.99609375,
      "rewards/format_reward/std": 0.03125,
      "rewards/tag_count_reward/mean": 0.998046875,
      "rewards/tag_count_reward/std": 0.015625,
      "stage1_sft_loss": 0.10655882768332958,
      "stage2_sft_loss": 0.03493454266572371,
      "step": 51,
      "total_loss": 0.12291049398481846
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 320.25,
      "completions/max_terminated_length": 320.25,
      "completions/mean_length": 138.62109375,
      "completions/mean_terminated_length": 138.62109375,
      "completions/min_length": 61.5,
      "completions/min_terminated_length": 61.5,
      "epoch": 0.2222222222222222,
      "frac_reward_zero_std": 0.40625,
      "grad_norm": 2.0483754177336944,
      "grpo_loss": -0.026889342727372423,
      "kl": 0.1077880859375,
      "learning_rate": 1.919527772551451e-05,
      "loss": 0.0526,
      "num_tokens": 3498630.0,
      "reward": 2.7734375,
      "reward_std": 0.2531665191054344,
      "rewards/accuracy_reward/mean": 0.7734375,
      "rewards/accuracy_reward/std": 0.4207340404391289,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.0339956721290946,
      "stage2_sft_loss": 0.007436095038428903,
      "step": 52,
      "total_loss": 0.00784993963316083
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 239.0,
      "completions/max_terminated_length": 239.0,
      "completions/mean_length": 137.515625,
      "completions/mean_terminated_length": 137.515625,
      "completions/min_length": 69.0,
      "completions/min_terminated_length": 69.0,
      "epoch": 0.2264957264957265,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 2.5497153837326794,
      "grpo_loss": 0.035073344348347746,
      "kl": 0.113037109375,
      "learning_rate": 1.913545457642601e-05,
      "loss": 0.0757,
      "num_tokens": 3566970.0,
      "reward": 2.7177734375,
      "reward_std": 0.26900235936045647,
      "rewards/accuracy_reward/mean": 0.72265625,
      "rewards/accuracy_reward/std": 0.4300394877791405,
      "rewards/format_reward/mean": 0.99609375,
      "rewards/format_reward/std": 0.03125,
      "rewards/tag_count_reward/mean": 0.9990234375,
      "rewards/tag_count_reward/std": 0.0078125,
      "stage1_sft_loss": 0.05108351481612772,
      "stage2_sft_loss": 0.00476168844033964,
      "step": 53,
      "total_loss": 0.0866330279968679
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 339.5,
      "completions/max_terminated_length": 339.5,
      "completions/mean_length": 149.2734375,
      "completions/mean_terminated_length": 149.2734375,
      "completions/min_length": 61.75,
      "completions/min_terminated_length": 61.75,
      "epoch": 0.23076923076923078,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 2.0633181493829387,
      "grpo_loss": -0.02261926600476727,
      "kl": 0.1124267578125,
      "learning_rate": 1.907358694567865e-05,
      "loss": 0.0659,
      "num_tokens": 3638304.0,
      "reward": 2.77734375,
      "reward_std": 0.21831096336245537,
      "rewards/accuracy_reward/mean": 0.77734375,
      "rewards/accuracy_reward/std": 0.404835507273674,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.04506829101592302,
      "stage2_sft_loss": 0.008663312415592372,
      "step": 54,
      "total_loss": 0.023315356113016605
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 273.0,
      "completions/max_terminated_length": 273.0,
      "completions/mean_length": 132.94140625,
      "completions/mean_terminated_length": 132.94140625,
      "completions/min_length": 53.5,
      "completions/min_terminated_length": 53.5,
      "epoch": 0.23504273504273504,
      "frac_reward_zero_std": 0.46875,
      "grad_norm": 2.1835339801699676,
      "grpo_loss": -0.008162843412719667,
      "kl": 0.129150390625,
      "learning_rate": 1.900968867902419e-05,
      "loss": 0.0742,
      "num_tokens": 3705953.0,
      "reward": 2.73046875,
      "reward_std": 0.23619184270501137,
      "rewards/accuracy_reward/mean": 0.73046875,
      "rewards/accuracy_reward/std": 0.44539549201726913,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.06734194559976459,
      "stage2_sft_loss": 0.02584816412127111,
      "step": 55,
      "total_loss": 0.06176391919143498
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 271.25,
      "completions/max_terminated_length": 271.25,
      "completions/mean_length": 125.203125,
      "completions/mean_terminated_length": 125.203125,
      "completions/min_length": 53.0,
      "completions/min_terminated_length": 53.0,
      "epoch": 0.23931623931623933,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 2.733215153622518,
      "grpo_loss": 0.012670826748944819,
      "kl": 0.1082763671875,
      "learning_rate": 1.8943774076663372e-05,
      "loss": 0.0959,
      "num_tokens": 3769509.0,
      "reward": 2.837890625,
      "reward_std": 0.19866740703582764,
      "rewards/accuracy_reward/mean": 0.84375,
      "rewards/accuracy_reward/std": 0.3607676178216934,
      "rewards/format_reward/mean": 0.99609375,
      "rewards/format_reward/std": 0.03125,
      "rewards/tag_count_reward/mean": 0.998046875,
      "rewards/tag_count_reward/std": 0.015625,
      "stage1_sft_loss": 0.09594716737046838,
      "stage2_sft_loss": 0.010085783338581678,
      "step": 56,
      "total_loss": 0.10962657257914543
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 258.0,
      "completions/max_terminated_length": 258.0,
      "completions/mean_length": 125.234375,
      "completions/mean_terminated_length": 125.234375,
      "completions/min_length": 64.5,
      "completions/min_terminated_length": 64.5,
      "epoch": 0.24358974358974358,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 2.7279202762656127,
      "grpo_loss": 0.025446785730309784,
      "kl": 0.123291015625,
      "learning_rate": 1.8875857890045544e-05,
      "loss": 0.1284,
      "num_tokens": 3835513.0,
      "reward": 2.85546875,
      "reward_std": 0.17780011892318726,
      "rewards/accuracy_reward/mean": 0.85546875,
      "rewards/accuracy_reward/std": 0.3399829603731632,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.06284261774271727,
      "stage2_sft_loss": 0.023300331144127995,
      "step": 57,
      "total_loss": 0.09061943367123604
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 291.25,
      "completions/max_terminated_length": 291.25,
      "completions/mean_length": 133.2421875,
      "completions/mean_terminated_length": 133.2421875,
      "completions/min_length": 61.25,
      "completions/min_terminated_length": 61.25,
      "epoch": 0.24786324786324787,
      "frac_reward_zero_std": 0.46875,
      "grad_norm": 2.583004960150858,
      "grpo_loss": -0.05313935538288206,
      "kl": 0.1015625,
      "learning_rate": 1.880595531856738e-05,
      "loss": 0.099,
      "num_tokens": 3903943.0,
      "reward": 2.828125,
      "reward_std": 0.22700048610568047,
      "rewards/accuracy_reward/mean": 0.828125,
      "rewards/accuracy_reward/std": 0.37239526212215424,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.08836286794394255,
      "stage2_sft_loss": 0.014138269005343318,
      "step": 58,
      "total_loss": 0.03663733811117709
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 204.25,
      "completions/max_terminated_length": 204.25,
      "completions/mean_length": 116.6328125,
      "completions/mean_terminated_length": 116.6328125,
      "completions/min_length": 66.5,
      "completions/min_terminated_length": 66.5,
      "epoch": 0.25213675213675213,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 2.801369803273171,
      "grpo_loss": 0.008001050562597811,
      "kl": 0.127685546875,
      "learning_rate": 1.87340820061713e-05,
      "loss": 0.085,
      "num_tokens": 3965385.0,
      "reward": 2.8046875,
      "reward_std": 0.18702251091599464,
      "rewards/accuracy_reward/mean": 0.8046875,
      "rewards/accuracy_reward/std": 0.3881981149315834,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.09833742864429951,
      "stage2_sft_loss": 0.014318075140181463,
      "step": 59,
      "total_loss": 0.1077702846378088
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 373.75,
      "completions/max_terminated_length": 373.75,
      "completions/mean_length": 137.9453125,
      "completions/mean_terminated_length": 137.9453125,
      "completions/min_length": 61.75,
      "completions/min_terminated_length": 61.75,
      "epoch": 0.2564102564102564,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 2.4729951369645224,
      "grpo_loss": 0.032328023575246334,
      "kl": 0.1182861328125,
      "learning_rate": 1.866025403784439e-05,
      "loss": 0.0737,
      "num_tokens": 4034187.0,
      "reward": 2.72265625,
      "reward_std": 0.18254429474473,
      "rewards/accuracy_reward/mean": 0.72265625,
      "rewards/accuracy_reward/std": 0.4300394877791405,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.08115582540631294,
      "stage2_sft_loss": 0.015377040166640654,
      "step": 60,
      "total_loss": 0.11502155102789402
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 292.5,
      "completions/max_terminated_length": 292.5,
      "completions/mean_length": 141.75,
      "completions/mean_terminated_length": 141.75,
      "completions/min_length": 62.0,
      "completions/min_terminated_length": 62.0,
      "epoch": 0.2606837606837607,
      "frac_reward_zero_std": 0.34375,
      "grad_norm": 2.6380177354683982,
      "grpo_loss": -0.0549847207730636,
      "kl": 0.1064453125,
      "learning_rate": 1.8584487936018663e-05,
      "loss": 0.0864,
      "num_tokens": 4104171.0,
      "reward": 2.7890625,
      "reward_std": 0.2877512201666832,
      "rewards/accuracy_reward/mean": 0.7890625,
      "rewards/accuracy_reward/std": 0.3888401687145233,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.05271852482110262,
      "stage2_sft_loss": 0.011416645691497251,
      "step": 61,
      "total_loss": -0.0011245310306549072
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 292.75,
      "completions/max_terminated_length": 292.75,
      "completions/mean_length": 142.89453125,
      "completions/mean_terminated_length": 142.89453125,
      "completions/min_length": 65.0,
      "completions/min_terminated_length": 65.0,
      "epoch": 0.26495726495726496,
      "frac_reward_zero_std": 0.40625,
      "grad_norm": 2.7487282290939965,
      "grpo_loss": -0.003460176521912217,
      "kl": 0.1220703125,
      "learning_rate": 1.8506800656873397e-05,
      "loss": 0.072,
      "num_tokens": 4175072.0,
      "reward": 2.73828125,
      "reward_std": 0.27248647063970566,
      "rewards/accuracy_reward/mean": 0.73828125,
      "rewards/accuracy_reward/std": 0.4357043281197548,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.07949414104223251,
      "stage2_sft_loss": 0.021672878530807793,
      "step": 62,
      "total_loss": 0.07820125250145793
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 264.75,
      "completions/max_terminated_length": 264.75,
      "completions/mean_length": 124.671875,
      "completions/mean_terminated_length": 124.671875,
      "completions/min_length": 56.0,
      "completions/min_terminated_length": 56.0,
      "epoch": 0.2692307692307692,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 2.813615284328756,
      "grpo_loss": 0.03867705701850355,
      "kl": 0.1285400390625,
      "learning_rate": 1.8427209586540392e-05,
      "loss": 0.1084,
      "num_tokens": 4240332.0,
      "reward": 2.734375,
      "reward_std": 0.2494782656431198,
      "rewards/accuracy_reward/mean": 0.734375,
      "rewards/accuracy_reward/std": 0.43634092807769775,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.13912567868828773,
      "stage2_sft_loss": 0.012130932504078373,
      "step": 63,
      "total_loss": 0.17901583388447762
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 251.25,
      "completions/max_terminated_length": 251.25,
      "completions/mean_length": 130.765625,
      "completions/mean_terminated_length": 130.765625,
      "completions/min_length": 71.0,
      "completions/min_terminated_length": 71.0,
      "epoch": 0.27350427350427353,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 2.591798770488697,
      "grpo_loss": 0.022650390383205377,
      "kl": 0.1220703125,
      "learning_rate": 1.834573253721303e-05,
      "loss": 0.0926,
      "num_tokens": 4306648.0,
      "reward": 2.7177734375,
      "reward_std": 0.24752728268504143,
      "rewards/accuracy_reward/mean": 0.72265625,
      "rewards/accuracy_reward/std": 0.41977670788764954,
      "rewards/format_reward/mean": 0.99609375,
      "rewards/format_reward/std": 0.03125,
      "rewards/tag_count_reward/mean": 0.9990234375,
      "rewards/tag_count_reward/std": 0.0078125,
      "stage1_sft_loss": 0.0872775714378804,
      "stage2_sft_loss": 0.021733683039201424,
      "step": 64,
      "total_loss": 0.11210132867563516
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 259.0,
      "completions/max_terminated_length": 259.0,
      "completions/mean_length": 126.6875,
      "completions/mean_terminated_length": 126.6875,
      "completions/min_length": 57.0,
      "completions/min_terminated_length": 57.0,
      "epoch": 0.2777777777777778,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 3.672327894257255,
      "grpo_loss": 0.007078448950778693,
      "kl": 0.146484375,
      "learning_rate": 1.826238774315995e-05,
      "loss": 0.0939,
      "num_tokens": 4370736.0,
      "reward": 2.7412109375,
      "reward_std": 0.2940043546259403,
      "rewards/accuracy_reward/mean": 0.74609375,
      "rewards/accuracy_reward/std": 0.4365294650197029,
      "rewards/format_reward/mean": 0.99609375,
      "rewards/format_reward/std": 0.03125,
      "rewards/tag_count_reward/mean": 0.9990234375,
      "rewards/tag_count_reward/std": 0.0078125,
      "stage1_sft_loss": 0.06434842199087143,
      "stage2_sft_loss": 0.004225670505547896,
      "step": 65,
      "total_loss": 0.07184943649917841
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 264.0,
      "completions/max_terminated_length": 264.0,
      "completions/mean_length": 127.3515625,
      "completions/mean_terminated_length": 127.3515625,
      "completions/min_length": 52.5,
      "completions/min_terminated_length": 52.5,
      "epoch": 0.28205128205128205,
      "frac_reward_zero_std": 0.40625,
      "grad_norm": 3.0415796698361546,
      "grpo_loss": 0.029165754676796496,
      "kl": 0.145263671875,
      "learning_rate": 1.8177193856644315e-05,
      "loss": 0.1335,
      "num_tokens": 4435762.0,
      "reward": 2.75390625,
      "reward_std": 0.25540194660425186,
      "rewards/accuracy_reward/mean": 0.75390625,
      "rewards/accuracy_reward/std": 0.4187935143709183,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.09681425290182233,
      "stage2_sft_loss": 0.008607075666077435,
      "step": 66,
      "total_loss": 0.1268407143652439
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 290.0,
      "completions/max_terminated_length": 290.0,
      "completions/mean_length": 151.48828125,
      "completions/mean_terminated_length": 151.48828125,
      "completions/min_length": 65.75,
      "completions/min_terminated_length": 65.75,
      "epoch": 0.2863247863247863,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 2.652440633953194,
      "grpo_loss": -0.0036158948205411434,
      "kl": 0.131103515625,
      "learning_rate": 1.8090169943749477e-05,
      "loss": 0.0995,
      "num_tokens": 4508927.0,
      "reward": 2.69140625,
      "reward_std": 0.28235550969839096,
      "rewards/accuracy_reward/mean": 0.69140625,
      "rewards/accuracy_reward/std": 0.4624432474374771,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.08319994248449802,
      "stage2_sft_loss": 0.02071410999633372,
      "step": 67,
      "total_loss": 0.08165545924566686
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -6.96875,
      "completions/max_length": 385.5,
      "completions/max_terminated_length": 247.5,
      "completions/mean_length": 129.46484375,
      "completions/mean_terminated_length": 126.99547576904297,
      "completions/min_length": 66.5,
      "completions/min_terminated_length": 66.5,
      "epoch": 0.2905982905982906,
      "frac_reward_zero_std": 0.40625,
      "grad_norm": 2.6022327307903494,
      "grpo_loss": 0.014323993455036543,
      "kl": 0.143798828125,
      "learning_rate": 1.8001335480112067e-05,
      "loss": 0.128,
      "num_tokens": 4573438.0,
      "reward": 2.7509765625,
      "reward_std": 0.2806566655635834,
      "rewards/accuracy_reward/mean": 0.7578125,
      "rewards/accuracy_reward/std": 0.4210883155465126,
      "rewards/format_reward/mean": 0.99609375,
      "rewards/format_reward/std": 0.03125,
      "rewards/tag_count_reward/mean": 0.9970703125,
      "rewards/tag_count_reward/std": 0.02343750186264515,
      "stage1_sft_loss": 0.06220689509063959,
      "stage2_sft_loss": 0.015904361513094045,
      "step": 68,
      "total_loss": 0.07812132267281413
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 300.25,
      "completions/max_terminated_length": 300.25,
      "completions/mean_length": 138.07421875,
      "completions/mean_terminated_length": 138.07421875,
      "completions/min_length": 62.0,
      "completions/min_terminated_length": 62.0,
      "epoch": 0.2948717948717949,
      "frac_reward_zero_std": 0.46875,
      "grad_norm": 3.7319832588310686,
      "grpo_loss": -0.02579806634457782,
      "kl": 0.167236328125,
      "learning_rate": 1.7910710346563417e-05,
      "loss": 0.0894,
      "num_tokens": 4643169.0,
      "reward": 2.73828125,
      "reward_std": 0.22882908582687378,
      "rewards/accuracy_reward/mean": 0.73828125,
      "rewards/accuracy_reward/std": 0.4144846946001053,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.12717284774407744,
      "stage2_sft_loss": 0.021347035173675977,
      "step": 69,
      "total_loss": 0.10350948257837445
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 340.0,
      "completions/max_terminated_length": 340.0,
      "completions/mean_length": 138.43359375,
      "completions/mean_terminated_length": 138.43359375,
      "completions/min_length": 62.75,
      "completions/min_terminated_length": 62.75,
      "epoch": 0.29914529914529914,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 2.7765053595431586,
      "grpo_loss": -0.025126174557954073,
      "kl": 0.1556396484375,
      "learning_rate": 1.78183148246803e-05,
      "loss": 0.1098,
      "num_tokens": 4712112.0,
      "reward": 2.75390625,
      "reward_std": 0.29183993488550186,
      "rewards/accuracy_reward/mean": 0.75390625,
      "rewards/accuracy_reward/std": 0.4199880510568619,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.09879820048809052,
      "stage2_sft_loss": 0.01800179434940219,
      "step": 70,
      "total_loss": 0.07547220401465893
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -6.96875,
      "completions/max_length": 445.75,
      "completions/max_terminated_length": 350.5,
      "completions/mean_length": 147.3828125,
      "completions/mean_terminated_length": 145.00130462646484,
      "completions/min_length": 65.25,
      "completions/min_terminated_length": 65.25,
      "epoch": 0.3034188034188034,
      "frac_reward_zero_std": 0.21875,
      "grad_norm": 3.1655373431903913,
      "grpo_loss": 0.04423183586914092,
      "kl": 0.1580810546875,
      "learning_rate": 1.7724169592245996e-05,
      "loss": 0.1313,
      "num_tokens": 4783658.0,
      "reward": 2.6455078125,
      "reward_std": 0.36747099831700325,
      "rewards/accuracy_reward/mean": 0.65234375,
      "rewards/accuracy_reward/std": 0.4756612181663513,
      "rewards/format_reward/mean": 0.99609375,
      "rewards/format_reward/std": 0.03125,
      "rewards/tag_count_reward/mean": 0.9970703125,
      "rewards/tag_count_reward/std": 0.02343750186264515,
      "stage1_sft_loss": 0.08538164384663105,
      "stage2_sft_loss": 0.018582295946544036,
      "step": 71,
      "total_loss": 0.1314717074856162
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 287.0,
      "completions/max_terminated_length": 287.0,
      "completions/mean_length": 139.63671875,
      "completions/mean_terminated_length": 139.63671875,
      "completions/min_length": 69.5,
      "completions/min_terminated_length": 69.5,
      "epoch": 0.3076923076923077,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 3.0160388980914843,
      "grpo_loss": -0.016488363384269178,
      "kl": 0.1353759765625,
      "learning_rate": 1.7628295718622666e-05,
      "loss": 0.0915,
      "num_tokens": 4852685.0,
      "reward": 2.75390625,
      "reward_std": 0.2222587689757347,
      "rewards/accuracy_reward/mean": 0.75390625,
      "rewards/accuracy_reward/std": 0.4294766038656235,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.10504917008802295,
      "stage2_sft_loss": 0.016596848021436017,
      "step": 72,
      "total_loss": 0.09022049233317375
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 362.25,
      "completions/max_terminated_length": 362.25,
      "completions/mean_length": 136.34375,
      "completions/mean_terminated_length": 136.34375,
      "completions/min_length": 55.5,
      "completions/min_terminated_length": 55.5,
      "epoch": 0.31196581196581197,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 2.3874562532234616,
      "grpo_loss": -0.025953882723115385,
      "kl": 0.1104736328125,
      "learning_rate": 1.7530714660036112e-05,
      "loss": 0.1125,
      "num_tokens": 4920901.0,
      "reward": 2.66015625,
      "reward_std": 0.23804902657866478,
      "rewards/accuracy_reward/mean": 0.66015625,
      "rewards/accuracy_reward/std": 0.4760490208864212,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.04344508983194828,
      "stage2_sft_loss": 0.018051974155241624,
      "step": 73,
      "total_loss": 0.019296405371278524
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 250.5,
      "completions/max_terminated_length": 250.5,
      "completions/mean_length": 120.171875,
      "completions/mean_terminated_length": 120.171875,
      "completions/min_length": 59.75,
      "completions/min_terminated_length": 59.75,
      "epoch": 0.3162393162393162,
      "frac_reward_zero_std": 0.40625,
      "grad_norm": 2.526664631890765,
      "grpo_loss": -0.030613353825174272,
      "kl": 0.119384765625,
      "learning_rate": 1.7431448254773943e-05,
      "loss": 0.1256,
      "num_tokens": 4983641.0,
      "reward": 2.6796875,
      "reward_std": 0.2553994879126549,
      "rewards/accuracy_reward/mean": 0.6796875,
      "rewards/accuracy_reward/std": 0.4430723860859871,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.10160567285493016,
      "stage2_sft_loss": 0.005139547283761203,
      "step": 74,
      "total_loss": 0.071506273932755
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 230.25,
      "completions/max_terminated_length": 230.25,
      "completions/mean_length": 130.84375,
      "completions/mean_terminated_length": 130.84375,
      "completions/min_length": 58.5,
      "completions/min_terminated_length": 58.5,
      "epoch": 0.32051282051282054,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 2.5098120068964827,
      "grpo_loss": -0.009117206209339201,
      "kl": 0.14794921875,
      "learning_rate": 1.7330518718298263e-05,
      "loss": 0.0893,
      "num_tokens": 5050153.0,
      "reward": 2.72265625,
      "reward_std": 0.2585534080862999,
      "rewards/accuracy_reward/mean": 0.72265625,
      "rewards/accuracy_reward/std": 0.4464419335126877,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.09497358370572329,
      "stage2_sft_loss": 0.023782045347616076,
      "step": 75,
      "total_loss": 0.08823457965627313
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 240.0,
      "completions/max_terminated_length": 240.0,
      "completions/mean_length": 118.3203125,
      "completions/mean_terminated_length": 118.3203125,
      "completions/min_length": 61.0,
      "completions/min_terminated_length": 61.0,
      "epoch": 0.3247863247863248,
      "frac_reward_zero_std": 0.46875,
      "grad_norm": 3.6984272155957005,
      "grpo_loss": -0.003023316152393818,
      "kl": 0.172119140625,
      "learning_rate": 1.7227948638273918e-05,
      "loss": 0.0736,
      "num_tokens": 5113211.0,
      "reward": 2.783203125,
      "reward_std": 0.2550964131951332,
      "rewards/accuracy_reward/mean": 0.7890625,
      "rewards/accuracy_reward/std": 0.4064294844865799,
      "rewards/format_reward/mean": 0.99609375,
      "rewards/format_reward/std": 0.03125,
      "rewards/tag_count_reward/mean": 0.998046875,
      "rewards/tag_count_reward/std": 0.015625,
      "stage1_sft_loss": 0.05189145356416702,
      "stage2_sft_loss": 0.007722017195192166,
      "step": 76,
      "total_loss": 0.04964033979922533
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 298.75,
      "completions/max_terminated_length": 298.75,
      "completions/mean_length": 144.1640625,
      "completions/mean_terminated_length": 144.1640625,
      "completions/min_length": 73.25,
      "completions/min_terminated_length": 73.25,
      "epoch": 0.32905982905982906,
      "frac_reward_zero_std": 0.34375,
      "grad_norm": 2.3502081107560997,
      "grpo_loss": 0.009372745989821851,
      "kl": 0.1197509765625,
      "learning_rate": 1.712376096951345e-05,
      "loss": 0.055,
      "num_tokens": 5183701.0,
      "reward": 2.65625,
      "reward_std": 0.2786785438656807,
      "rewards/accuracy_reward/mean": 0.65625,
      "rewards/accuracy_reward/std": 0.4706496447324753,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.08107265457510948,
      "stage2_sft_loss": 0.021250678692013025,
      "step": 77,
      "total_loss": 0.09257047018036246
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 273.25,
      "completions/max_terminated_length": 273.25,
      "completions/mean_length": 137.51171875,
      "completions/mean_terminated_length": 137.51171875,
      "completions/min_length": 70.0,
      "completions/min_terminated_length": 70.0,
      "epoch": 0.3333333333333333,
      "frac_reward_zero_std": 0.34375,
      "grad_norm": 2.729145057547932,
      "grpo_loss": 0.12022795213852078,
      "kl": 0.13525390625,
      "learning_rate": 1.7017979028839918e-05,
      "loss": 0.1115,
      "num_tokens": 5250640.0,
      "reward": 2.72265625,
      "reward_std": 0.2792089581489563,
      "rewards/accuracy_reward/mean": 0.72265625,
      "rewards/accuracy_reward/std": 0.4468590244650841,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.10791449341922998,
      "stage2_sft_loss": 0.01693276612786576,
      "step": 78,
      "total_loss": 0.22983572259545326
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 283.5,
      "completions/max_terminated_length": 283.5,
      "completions/mean_length": 137.28515625,
      "completions/mean_terminated_length": 137.28515625,
      "completions/min_length": 72.0,
      "completions/min_terminated_length": 72.0,
      "epoch": 0.33760683760683763,
      "frac_reward_zero_std": 0.3125,
      "grad_norm": 2.3802824913901546,
      "grpo_loss": 0.005306557985022664,
      "kl": 0.1236572265625,
      "learning_rate": 1.691062648986865e-05,
      "loss": 0.0957,
      "num_tokens": 5319329.0,
      "reward": 2.630859375,
      "reward_std": 0.32588188722729683,
      "rewards/accuracy_reward/mean": 0.63671875,
      "rewards/accuracy_reward/std": 0.4642375111579895,
      "rewards/format_reward/mean": 0.99609375,
      "rewards/format_reward/std": 0.03125,
      "rewards/tag_count_reward/mean": 0.998046875,
      "rewards/tag_count_reward/std": 0.015625,
      "stage1_sft_loss": 0.07730232924222946,
      "stage2_sft_loss": 0.0019270793563919142,
      "step": 79,
      "total_loss": 0.08280159346759319
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 256.0,
      "completions/mean_length": 124.80859375,
      "completions/mean_terminated_length": 124.80859375,
      "completions/min_length": 63.25,
      "completions/min_terminated_length": 63.25,
      "epoch": 0.3418803418803419,
      "frac_reward_zero_std": 0.40625,
      "grad_norm": 2.562572433193672,
      "grpo_loss": -0.03958694860921241,
      "kl": 0.119873046875,
      "learning_rate": 1.6801727377709195e-05,
      "loss": 0.1113,
      "num_tokens": 5385328.0,
      "reward": 2.765625,
      "reward_std": 0.2514565847814083,
      "rewards/accuracy_reward/mean": 0.765625,
      "rewards/accuracy_reward/std": 0.42420244961977005,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.10037143575027585,
      "stage2_sft_loss": 0.012038417553412728,
      "step": 80,
      "total_loss": 0.06198833044618368
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 242.0,
      "completions/max_terminated_length": 242.0,
      "completions/mean_length": 124.39453125,
      "completions/mean_terminated_length": 124.39453125,
      "completions/min_length": 63.75,
      "completions/min_terminated_length": 63.75,
      "epoch": 0.34615384615384615,
      "frac_reward_zero_std": 0.53125,
      "grad_norm": 2.499248013632045,
      "grpo_loss": 0.04034198680892587,
      "kl": 0.1214599609375,
      "learning_rate": 1.6691306063588583e-05,
      "loss": 0.1041,
      "num_tokens": 5448349.0,
      "reward": 2.796875,
      "reward_std": 0.2021375447511673,
      "rewards/accuracy_reward/mean": 0.796875,
      "rewards/accuracy_reward/std": 0.3994177505373955,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.07490937830880284,
      "stage2_sft_loss": 0.02029243257129565,
      "step": 81,
      "total_loss": 0.11728060524910688
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 286.5,
      "completions/max_terminated_length": 286.5,
      "completions/mean_length": 140.00390625,
      "completions/mean_terminated_length": 140.00390625,
      "completions/min_length": 62.75,
      "completions/min_terminated_length": 62.75,
      "epoch": 0.3504273504273504,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 2.5742755470532064,
      "grpo_loss": -0.03953387896763161,
      "kl": 0.1248779296875,
      "learning_rate": 1.657938725939713e-05,
      "loss": 0.0792,
      "num_tokens": 5517238.0,
      "reward": 2.7373046875,
      "reward_std": 0.2568397559225559,
      "rewards/accuracy_reward/mean": 0.7421875,
      "rewards/accuracy_reward/std": 0.43832528591156006,
      "rewards/format_reward/mean": 0.99609375,
      "rewards/format_reward/std": 0.03125,
      "rewards/tag_count_reward/mean": 0.9990234375,
      "rewards/tag_count_reward/std": 0.0078125,
      "stage1_sft_loss": 0.06850403128191829,
      "stage2_sft_loss": 0.019538108550477773,
      "step": 82,
      "total_loss": 0.030923962127417326
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 350.0,
      "completions/max_terminated_length": 350.0,
      "completions/mean_length": 138.76953125,
      "completions/mean_terminated_length": 138.76953125,
      "completions/min_length": 63.75,
      "completions/min_terminated_length": 63.75,
      "epoch": 0.3547008547008547,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 2.4821729998966995,
      "grpo_loss": -0.03367833292577416,
      "kl": 0.1253662109375,
      "learning_rate": 1.6465996012157996e-05,
      "loss": 0.0798,
      "num_tokens": 5586435.0,
      "reward": 2.7421875,
      "reward_std": 0.2086990401148796,
      "rewards/accuracy_reward/mean": 0.7421875,
      "rewards/accuracy_reward/std": 0.4246060326695442,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.13172664865851402,
      "stage2_sft_loss": 0.0064331964968005195,
      "step": 83,
      "total_loss": 0.09869163855910301
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 282.0,
      "completions/max_terminated_length": 282.0,
      "completions/mean_length": 137.140625,
      "completions/mean_terminated_length": 137.140625,
      "completions/min_length": 65.75,
      "completions/min_terminated_length": 65.75,
      "epoch": 0.358974358974359,
      "frac_reward_zero_std": 0.40625,
      "grad_norm": 3.423143308177815,
      "grpo_loss": -0.00183769257273525,
      "kl": 0.1729736328125,
      "learning_rate": 1.635115769842179e-05,
      "loss": 0.0942,
      "num_tokens": 5655103.0,
      "reward": 2.65625,
      "reward_std": 0.26341626048088074,
      "rewards/accuracy_reward/mean": 0.65625,
      "rewards/accuracy_reward/std": 0.4552925229072571,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.04354301746934652,
      "stage2_sft_loss": 0.01797918685770128,
      "step": 84,
      "total_loss": 0.04350324580445886
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 245.25,
      "completions/max_terminated_length": 245.25,
      "completions/mean_length": 118.8125,
      "completions/mean_terminated_length": 118.8125,
      "completions/min_length": 68.0,
      "completions/min_terminated_length": 68.0,
      "epoch": 0.36324786324786323,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 2.726421526007492,
      "grpo_loss": 0.021800624788738787,
      "kl": 0.1282958984375,
      "learning_rate": 1.6234898018587336e-05,
      "loss": 0.1009,
      "num_tokens": 5717567.0,
      "reward": 2.74609375,
      "reward_std": 0.28353746607899666,
      "rewards/accuracy_reward/mean": 0.74609375,
      "rewards/accuracy_reward/std": 0.43080218881368637,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.04685461334884167,
      "stage2_sft_loss": 0.02155728975776583,
      "step": 85,
      "total_loss": 0.07081096805632114
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 293.5,
      "completions/max_terminated_length": 293.5,
      "completions/mean_length": 138.03515625,
      "completions/mean_terminated_length": 138.03515625,
      "completions/min_length": 65.0,
      "completions/min_terminated_length": 65.0,
      "epoch": 0.36752136752136755,
      "frac_reward_zero_std": 0.46875,
      "grad_norm": 2.7129461434409365,
      "grpo_loss": 0.04890600312501192,
      "kl": 0.138671875,
      "learning_rate": 1.6117242991150064e-05,
      "loss": 0.0943,
      "num_tokens": 5786584.0,
      "reward": 2.75,
      "reward_std": 0.230536550283432,
      "rewards/accuracy_reward/mean": 0.75,
      "rewards/accuracy_reward/std": 0.42425093054771423,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.10090539930388331,
      "stage2_sft_loss": 0.019177716341800988,
      "step": 86,
      "total_loss": 0.15172917302697897
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 307.5,
      "completions/max_terminated_length": 307.5,
      "completions/mean_length": 135.3046875,
      "completions/mean_terminated_length": 135.3046875,
      "completions/min_length": 62.5,
      "completions/min_terminated_length": 62.5,
      "epoch": 0.3717948717948718,
      "frac_reward_zero_std": 0.46875,
      "grad_norm": 2.925690756507991,
      "grpo_loss": -0.022680374735500664,
      "kl": 0.1314697265625,
      "learning_rate": 1.599821894687914e-05,
      "loss": 0.1157,
      "num_tokens": 5853374.0,
      "reward": 2.69921875,
      "reward_std": 0.22594210505485535,
      "rewards/accuracy_reward/mean": 0.69921875,
      "rewards/accuracy_reward/std": 0.4589729681611061,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.07890512142330408,
      "stage2_sft_loss": 0.009549629903631285,
      "step": 87,
      "total_loss": 0.057179709896445274
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 287.25,
      "completions/max_terminated_length": 287.25,
      "completions/mean_length": 127.3359375,
      "completions/mean_terminated_length": 127.3359375,
      "completions/min_length": 61.75,
      "completions/min_terminated_length": 61.75,
      "epoch": 0.37606837606837606,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 2.749444964354371,
      "grpo_loss": 0.026062425575219095,
      "kl": 0.138916015625,
      "learning_rate": 1.5877852522924733e-05,
      "loss": 0.0766,
      "num_tokens": 5918284.0,
      "reward": 2.75,
      "reward_std": 0.24211551621556282,
      "rewards/accuracy_reward/mean": 0.75390625,
      "rewards/accuracy_reward/std": 0.39269477128982544,
      "rewards/format_reward/mean": 0.99609375,
      "rewards/format_reward/std": 0.03125,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.08172346837818623,
      "stage2_sft_loss": 0.018881638650782406,
      "step": 88,
      "total_loss": 0.1096740560606122
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 246.75,
      "completions/max_terminated_length": 246.75,
      "completions/mean_length": 121.3359375,
      "completions/mean_terminated_length": 121.3359375,
      "completions/min_length": 62.5,
      "completions/min_terminated_length": 62.5,
      "epoch": 0.3803418803418803,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 2.5645519227467615,
      "grpo_loss": 0.007930530468001962,
      "kl": 0.1263427734375,
      "learning_rate": 1.575617065685674e-05,
      "loss": 0.0944,
      "num_tokens": 5980826.0,
      "reward": 2.8515625,
      "reward_std": 0.14651167765259743,
      "rewards/accuracy_reward/mean": 0.8515625,
      "rewards/accuracy_reward/std": 0.3466116338968277,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.12439357303082943,
      "stage2_sft_loss": 0.011787514406023547,
      "step": 89,
      "total_loss": 0.13350285589694977
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 264.25,
      "completions/max_terminated_length": 264.25,
      "completions/mean_length": 136.91015625,
      "completions/mean_terminated_length": 136.91015625,
      "completions/min_length": 65.75,
      "completions/min_terminated_length": 65.75,
      "epoch": 0.38461538461538464,
      "frac_reward_zero_std": 0.34375,
      "grad_norm": 2.4879182580397234,
      "grpo_loss": 0.0011445782147347927,
      "kl": 0.14013671875,
      "learning_rate": 1.563320058063622e-05,
      "loss": 0.0774,
      "num_tokens": 6050443.0,
      "reward": 2.66796875,
      "reward_std": 0.3047170788049698,
      "rewards/accuracy_reward/mean": 0.66796875,
      "rewards/accuracy_reward/std": 0.45233723521232605,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.1086853314191103,
      "stage2_sft_loss": 0.01397453507524915,
      "step": 90,
      "total_loss": 0.1112273633480072
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 302.5,
      "completions/max_terminated_length": 302.5,
      "completions/mean_length": 129.0546875,
      "completions/mean_terminated_length": 129.0546875,
      "completions/min_length": 61.5,
      "completions/min_terminated_length": 61.5,
      "epoch": 0.3888888888888889,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 2.5519925026432895,
      "grpo_loss": -0.04885828430997208,
      "kl": 0.1190185546875,
      "learning_rate": 1.5508969814521026e-05,
      "loss": 0.081,
      "num_tokens": 6115801.0,
      "reward": 2.7109375,
      "reward_std": 0.19332443550229073,
      "rewards/accuracy_reward/mean": 0.7109375,
      "rewards/accuracy_reward/std": 0.44970038533210754,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.06702740117907524,
      "stage2_sft_loss": 0.007887058920459822,
      "step": 91,
      "total_loss": 0.018957823514938354
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 230.75,
      "completions/max_terminated_length": 230.75,
      "completions/mean_length": 125.98828125,
      "completions/mean_terminated_length": 125.98828125,
      "completions/min_length": 52.25,
      "completions/min_terminated_length": 52.25,
      "epoch": 0.39316239316239315,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 2.671387602994381,
      "grpo_loss": -0.026032934081740677,
      "kl": 0.1090087890625,
      "learning_rate": 1.5383506160906826e-05,
      "loss": 0.0816,
      "num_tokens": 6180062.0,
      "reward": 2.8046875,
      "reward_std": 0.15650184452533722,
      "rewards/accuracy_reward/mean": 0.8046875,
      "rewards/accuracy_reward/std": 0.388528935611248,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.0885940557345748,
      "stage2_sft_loss": 0.02217566382023506,
      "step": 92,
      "total_loss": 0.06477868836373091
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 222.5,
      "completions/max_terminated_length": 222.5,
      "completions/mean_length": 130.9140625,
      "completions/mean_terminated_length": 130.9140625,
      "completions/min_length": 55.25,
      "completions/min_terminated_length": 55.25,
      "epoch": 0.3974358974358974,
      "frac_reward_zero_std": 0.40625,
      "grad_norm": 2.9580194659239027,
      "grpo_loss": 0.01426442974479869,
      "kl": 0.152099609375,
      "learning_rate": 1.5256837698105047e-05,
      "loss": 0.1078,
      "num_tokens": 6247824.0,
      "reward": 2.68359375,
      "reward_std": 0.25829383358359337,
      "rewards/accuracy_reward/mean": 0.68359375,
      "rewards/accuracy_reward/std": 0.46058642119169235,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.12474778946489096,
      "stage2_sft_loss": 0.0037532774949795566,
      "step": 93,
      "total_loss": 0.13938755076378584
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 234.0,
      "completions/max_terminated_length": 234.0,
      "completions/mean_length": 117.26171875,
      "completions/mean_terminated_length": 117.26171875,
      "completions/min_length": 66.5,
      "completions/min_terminated_length": 66.5,
      "epoch": 0.4017094017094017,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 2.6809485416788807,
      "grpo_loss": 0.05255669995676726,
      "kl": 0.1064453125,
      "learning_rate": 1.5128992774059063e-05,
      "loss": 0.0891,
      "num_tokens": 6310051.0,
      "reward": 2.7421875,
      "reward_std": 0.22738608345389366,
      "rewards/accuracy_reward/mean": 0.7421875,
      "rewards/accuracy_reward/std": 0.4302185848355293,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.10235591046512127,
      "stage2_sft_loss": 0.009095492074266076,
      "step": 94,
      "total_loss": 0.1558221597224474
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 301.0,
      "completions/max_terminated_length": 301.0,
      "completions/mean_length": 138.12890625,
      "completions/mean_terminated_length": 138.12890625,
      "completions/min_length": 66.75,
      "completions/min_terminated_length": 66.75,
      "epoch": 0.405982905982906,
      "frac_reward_zero_std": 0.34375,
      "grad_norm": 2.5597991268269777,
      "grpo_loss": 0.016854463145136833,
      "kl": 0.1239013671875,
      "learning_rate": 1.5000000000000002e-05,
      "loss": 0.0841,
      "num_tokens": 6379940.0,
      "reward": 2.69921875,
      "reward_std": 0.2855108827352524,
      "rewards/accuracy_reward/mean": 0.69921875,
      "rewards/accuracy_reward/std": 0.45850304514169693,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.08178659062832594,
      "stage2_sft_loss": 0.021333697950467467,
      "step": 95,
      "total_loss": 0.10077442298643291
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -6.96875,
      "completions/max_length": 342.5,
      "completions/max_terminated_length": 219.25,
      "completions/mean_length": 127.04296875,
      "completions/mean_terminated_length": 124.57942962646484,
      "completions/min_length": 58.0,
      "completions/min_terminated_length": 58.0,
      "epoch": 0.41025641025641024,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 2.8329251039895205,
      "grpo_loss": 0.0065450501861050725,
      "kl": 0.1209716796875,
      "learning_rate": 1.4869888244043674e-05,
      "loss": 0.1343,
      "num_tokens": 6444855.0,
      "reward": 2.7080078125,
      "reward_std": 0.2602709634229541,
      "rewards/accuracy_reward/mean": 0.71484375,
      "rewards/accuracy_reward/std": 0.427839957177639,
      "rewards/format_reward/mean": 0.99609375,
      "rewards/format_reward/std": 0.03125,
      "rewards/tag_count_reward/mean": 0.9970703125,
      "rewards/tag_count_reward/std": 0.02343750186264515,
      "stage1_sft_loss": 0.09643353894352913,
      "stage2_sft_loss": 0.03424702318443451,
      "step": 96,
      "total_loss": 0.10640329401940107
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 320.5,
      "completions/max_terminated_length": 320.5,
      "completions/mean_length": 134.9296875,
      "completions/mean_terminated_length": 134.9296875,
      "completions/min_length": 57.5,
      "completions/min_terminated_length": 57.5,
      "epoch": 0.41452991452991456,
      "frac_reward_zero_std": 0.34375,
      "grad_norm": 2.4973266632814486,
      "grpo_loss": -0.05328447837382555,
      "kl": 0.1409912109375,
      "learning_rate": 1.4738686624729987e-05,
      "loss": 0.0991,
      "num_tokens": 6513957.0,
      "reward": 2.7373046875,
      "reward_std": 0.28144313395023346,
      "rewards/accuracy_reward/mean": 0.7421875,
      "rewards/accuracy_reward/std": 0.427031971514225,
      "rewards/format_reward/mean": 0.99609375,
      "rewards/format_reward/std": 0.03125,
      "rewards/tag_count_reward/mean": 0.9990234375,
      "rewards/tag_count_reward/std": 0.0078125,
      "stage1_sft_loss": 0.07091331109404564,
      "stage2_sft_loss": 0.015732734580524266,
      "step": 97,
      "total_loss": 0.019202106399461627
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -6.9375,
      "completions/max_length": 374.5,
      "completions/max_terminated_length": 249.5,
      "completions/mean_length": 139.7265625,
      "completions/mean_terminated_length": 134.7769660949707,
      "completions/min_length": 59.75,
      "completions/min_terminated_length": 59.75,
      "epoch": 0.4188034188034188,
      "frac_reward_zero_std": 0.28125,
      "grad_norm": 2.3197516269195115,
      "grpo_loss": -0.05551034724339843,
      "kl": 0.1173095703125,
      "learning_rate": 1.4606424504506325e-05,
      "loss": 0.1264,
      "num_tokens": 6583695.0,
      "reward": 2.705078125,
      "reward_std": 0.3469051569700241,
      "rewards/accuracy_reward/mean": 0.71875,
      "rewards/accuracy_reward/std": 0.44226548820734024,
      "rewards/format_reward/mean": 0.9921875,
      "rewards/format_reward/std": 0.043842025101184845,
      "rewards/tag_count_reward/mean": 0.994140625,
      "rewards/tag_count_reward/std": 0.032881516963243484,
      "stage1_sft_loss": 0.05779402703046799,
      "stage2_sft_loss": 0.011461514979600906,
      "step": 98,
      "total_loss": 0.0034298310056328773
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 369.0,
      "completions/max_terminated_length": 369.0,
      "completions/mean_length": 138.578125,
      "completions/mean_terminated_length": 138.578125,
      "completions/min_length": 58.5,
      "completions/min_terminated_length": 58.5,
      "epoch": 0.4230769230769231,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 2.4797442538778522,
      "grpo_loss": -0.03367353102657944,
      "kl": 0.1246337890625,
      "learning_rate": 1.4473131483156326e-05,
      "loss": 0.0719,
      "num_tokens": 6652891.0,
      "reward": 2.7138671875,
      "reward_std": 0.2675051614642143,
      "rewards/accuracy_reward/mean": 0.71875,
      "rewards/accuracy_reward/std": 0.42169189453125,
      "rewards/format_reward/mean": 0.99609375,
      "rewards/format_reward/std": 0.03125,
      "rewards/tag_count_reward/mean": 0.9990234375,
      "rewards/tag_count_reward/std": 0.0078125,
      "stage1_sft_loss": 0.08882415713742375,
      "stage2_sft_loss": 0.014731279705301858,
      "step": 99,
      "total_loss": 0.05662375397514552
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 271.0,
      "completions/max_terminated_length": 271.0,
      "completions/mean_length": 128.98046875,
      "completions/mean_terminated_length": 128.98046875,
      "completions/min_length": 62.0,
      "completions/min_terminated_length": 62.0,
      "epoch": 0.42735042735042733,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 2.4029762999741564,
      "grpo_loss": -0.01963827926556405,
      "kl": 0.1123046875,
      "learning_rate": 1.4338837391175582e-05,
      "loss": 0.0704,
      "num_tokens": 6719542.0,
      "reward": 2.76953125,
      "reward_std": 0.25460558384656906,
      "rewards/accuracy_reward/mean": 0.76953125,
      "rewards/accuracy_reward/std": 0.4023680202662945,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.10046672075986862,
      "stage2_sft_loss": 0.02204990791506134,
      "step": 100,
      "total_loss": 0.0830334322527051
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 280.0,
      "completions/max_terminated_length": 280.0,
      "completions/mean_length": 130.390625,
      "completions/mean_terminated_length": 130.390625,
      "completions/min_length": 72.5,
      "completions/min_terminated_length": 72.5,
      "epoch": 0.43162393162393164,
      "frac_reward_zero_std": 0.21875,
      "grad_norm": 2.36108691500467,
      "grpo_loss": 0.03791657286637928,
      "kl": 0.1080322265625,
      "learning_rate": 1.4203572283095657e-05,
      "loss": 0.0576,
      "num_tokens": 6784962.0,
      "reward": 2.7763671875,
      "reward_std": 0.33827219903469086,
      "rewards/accuracy_reward/mean": 0.78125,
      "rewards/accuracy_reward/std": 0.4132651388645172,
      "rewards/format_reward/mean": 0.99609375,
      "rewards/format_reward/std": 0.03125,
      "rewards/tag_count_reward/mean": 0.9990234375,
      "rewards/tag_count_reward/std": 0.0078125,
      "stage1_sft_loss": 0.06852314155548811,
      "stage2_sft_loss": 0.008980212034657598,
      "step": 101,
      "total_loss": 0.10733773885294795
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 270.75,
      "completions/max_terminated_length": 270.75,
      "completions/mean_length": 131.19140625,
      "completions/mean_terminated_length": 131.19140625,
      "completions/min_length": 57.5,
      "completions/min_terminated_length": 57.5,
      "epoch": 0.4358974358974359,
      "frac_reward_zero_std": 0.40625,
      "grad_norm": 2.5331171435040436,
      "grpo_loss": 0.030746392672881484,
      "kl": 0.1236572265625,
      "learning_rate": 1.4067366430758004e-05,
      "loss": 0.0715,
      "num_tokens": 6851587.0,
      "reward": 2.76171875,
      "reward_std": 0.25486908480525017,
      "rewards/accuracy_reward/mean": 0.76171875,
      "rewards/accuracy_reward/std": 0.40466783940792084,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.03651396604254842,
      "stage2_sft_loss": 0.009974595624953508,
      "step": 102,
      "total_loss": 0.06825782265514135
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 256.25,
      "completions/max_terminated_length": 256.25,
      "completions/mean_length": 128.2578125,
      "completions/mean_terminated_length": 128.2578125,
      "completions/min_length": 60.5,
      "completions/min_terminated_length": 60.5,
      "epoch": 0.44017094017094016,
      "frac_reward_zero_std": 0.53125,
      "grad_norm": 2.449351104624353,
      "grpo_loss": 0.01105509905028157,
      "kl": 0.1328125,
      "learning_rate": 1.3930250316539237e-05,
      "loss": 0.0706,
      "num_tokens": 6916605.0,
      "reward": 2.73828125,
      "reward_std": 0.21291769668459892,
      "rewards/accuracy_reward/mean": 0.73828125,
      "rewards/accuracy_reward/std": 0.42710205167531967,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.08719620713964105,
      "stage2_sft_loss": 0.015745949975098483,
      "step": 103,
      "total_loss": 0.09982590237632394
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -6.96875,
      "completions/max_length": 396.75,
      "completions/max_terminated_length": 279.25,
      "completions/mean_length": 137.33203125,
      "completions/mean_terminated_length": 134.81733894348145,
      "completions/min_length": 59.75,
      "completions/min_terminated_length": 59.75,
      "epoch": 0.4444444444444444,
      "frac_reward_zero_std": 0.40625,
      "grad_norm": 2.968394025688263,
      "grpo_loss": -0.037616387009620667,
      "kl": 0.162109375,
      "learning_rate": 1.3792254626529286e-05,
      "loss": 0.1165,
      "num_tokens": 6985882.0,
      "reward": 2.7197265625,
      "reward_std": 0.2833760306239128,
      "rewards/accuracy_reward/mean": 0.7265625,
      "rewards/accuracy_reward/std": 0.44848156720399857,
      "rewards/format_reward/mean": 0.99609375,
      "rewards/format_reward/std": 0.03125,
      "rewards/tag_count_reward/mean": 0.9970703125,
      "rewards/tag_count_reward/std": 0.02343750186264515,
      "stage1_sft_loss": 0.1242181695997715,
      "stage2_sft_loss": 0.011403052310924977,
      "step": 104,
      "total_loss": 0.087742087431252
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 316.5,
      "completions/max_terminated_length": 316.5,
      "completions/mean_length": 124.76171875,
      "completions/mean_terminated_length": 124.76171875,
      "completions/min_length": 60.0,
      "completions/min_terminated_length": 60.0,
      "epoch": 0.44871794871794873,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 2.6270391923453937,
      "grpo_loss": -0.011787233641371131,
      "kl": 0.1097412109375,
      "learning_rate": 1.3653410243663953e-05,
      "loss": 0.0882,
      "num_tokens": 7050389.0,
      "reward": 2.79296875,
      "reward_std": 0.11652141716331244,
      "rewards/accuracy_reward/mean": 0.79296875,
      "rewards/accuracy_reward/std": 0.39603784680366516,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.05623149313032627,
      "stage2_sft_loss": 0.009826588677242398,
      "step": 105,
      "total_loss": 0.04542691865935922
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 258.0,
      "completions/max_terminated_length": 258.0,
      "completions/mean_length": 129.23046875,
      "completions/mean_terminated_length": 129.23046875,
      "completions/min_length": 62.75,
      "completions/min_terminated_length": 62.75,
      "epoch": 0.452991452991453,
      "frac_reward_zero_std": 0.34375,
      "grad_norm": 2.847238135727028,
      "grpo_loss": -0.0050932477752212435,
      "kl": 0.1181640625,
      "learning_rate": 1.3513748240813429e-05,
      "loss": 0.1022,
      "num_tokens": 7116360.0,
      "reward": 2.671875,
      "reward_std": 0.2899891063570976,
      "rewards/accuracy_reward/mean": 0.671875,
      "rewards/accuracy_reward/std": 0.4593518376350403,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.04017673246562481,
      "stage2_sft_loss": 0.015764886702527292,
      "step": 106,
      "total_loss": 0.03665997087955475
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 230.75,
      "completions/max_terminated_length": 230.75,
      "completions/mean_length": 128.421875,
      "completions/mean_terminated_length": 128.421875,
      "completions/min_length": 68.25,
      "completions/min_terminated_length": 68.25,
      "epoch": 0.45726495726495725,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 2.896372677278952,
      "grpo_loss": 0.07070640404708683,
      "kl": 0.1162109375,
      "learning_rate": 1.3373299873828303e-05,
      "loss": 0.0939,
      "num_tokens": 7181564.0,
      "reward": 2.78515625,
      "reward_std": 0.25342363119125366,
      "rewards/accuracy_reward/mean": 0.78515625,
      "rewards/accuracy_reward/std": 0.4078981876373291,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.15257746912539005,
      "stage2_sft_loss": 0.009742788024595939,
      "step": 107,
      "total_loss": 0.224258154630661
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 256.25,
      "completions/max_terminated_length": 256.25,
      "completions/mean_length": 130.69921875,
      "completions/mean_terminated_length": 130.69921875,
      "completions/min_length": 60.25,
      "completions/min_terminated_length": 60.25,
      "epoch": 0.46153846153846156,
      "frac_reward_zero_std": 0.46875,
      "grad_norm": 2.38927462180599,
      "grpo_loss": -0.04817646351875737,
      "kl": 0.1099853515625,
      "learning_rate": 1.3232096574544602e-05,
      "loss": 0.0729,
      "num_tokens": 7249071.0,
      "reward": 2.73828125,
      "reward_std": 0.22817998379468918,
      "rewards/accuracy_reward/mean": 0.73828125,
      "rewards/accuracy_reward/std": 0.43208014219999313,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.10405674204230309,
      "stage2_sft_loss": 0.0215330894861836,
      "step": 108,
      "total_loss": 0.058033584617078304
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 279.75,
      "completions/max_terminated_length": 279.75,
      "completions/mean_length": 138.328125,
      "completions/mean_terminated_length": 138.328125,
      "completions/min_length": 63.75,
      "completions/min_terminated_length": 63.75,
      "epoch": 0.4658119658119658,
      "frac_reward_zero_std": 0.40625,
      "grad_norm": 2.445961316741637,
      "grpo_loss": -0.023899111460195854,
      "kl": 0.11328125,
      "learning_rate": 1.3090169943749475e-05,
      "loss": 0.0826,
      "num_tokens": 7317899.0,
      "reward": 2.703125,
      "reward_std": 0.2747268117964268,
      "rewards/accuracy_reward/mean": 0.703125,
      "rewards/accuracy_reward/std": 0.4562576711177826,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.0504996933741495,
      "stage2_sft_loss": 0.01845655390934553,
      "step": 109,
      "total_loss": 0.02844623802229762
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 229.0,
      "completions/max_terminated_length": 229.0,
      "completions/mean_length": 127.7890625,
      "completions/mean_terminated_length": 127.7890625,
      "completions/min_length": 68.25,
      "completions/min_terminated_length": 68.25,
      "epoch": 0.4700854700854701,
      "frac_reward_zero_std": 0.46875,
      "grad_norm": 2.3492313298200544,
      "grpo_loss": 0.05720149329863489,
      "kl": 0.096923828125,
      "learning_rate": 1.2947551744109044e-05,
      "loss": 0.0872,
      "num_tokens": 7383917.0,
      "reward": 2.796875,
      "reward_std": 0.24408500641584396,
      "rewards/accuracy_reward/mean": 0.796875,
      "rewards/accuracy_reward/std": 0.37127064168453217,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.11955506075173616,
      "stage2_sft_loss": 0.018571392953163013,
      "step": 110,
      "total_loss": 0.1786136943846941
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 311.0,
      "completions/max_terminated_length": 311.0,
      "completions/mean_length": 136.73046875,
      "completions/mean_terminated_length": 136.73046875,
      "completions/min_length": 61.25,
      "completions/min_terminated_length": 61.25,
      "epoch": 0.47435897435897434,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 3.147191794235813,
      "grpo_loss": 0.020814732182770967,
      "kl": 0.1019287109375,
      "learning_rate": 1.2804273893060028e-05,
      "loss": 0.0816,
      "num_tokens": 7451880.0,
      "reward": 2.7060546875,
      "reward_std": 0.2917686812579632,
      "rewards/accuracy_reward/mean": 0.70703125,
      "rewards/accuracy_reward/std": 0.4403637945652008,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.9990234375,
      "rewards/tag_count_reward/std": 0.0078125,
      "stage1_sft_loss": 0.10996824130415916,
      "stage2_sft_loss": 0.03235575696453452,
      "step": 111,
      "total_loss": 0.1340185476001352
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 305.75,
      "completions/max_terminated_length": 305.75,
      "completions/mean_length": 132.7578125,
      "completions/mean_terminated_length": 132.7578125,
      "completions/min_length": 67.25,
      "completions/min_terminated_length": 67.25,
      "epoch": 0.47863247863247865,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 2.5895502657952156,
      "grpo_loss": 0.04569915612228215,
      "kl": 0.1212158203125,
      "learning_rate": 1.2660368455666752e-05,
      "loss": 0.1027,
      "num_tokens": 7518810.0,
      "reward": 2.7578125,
      "reward_std": 0.26196590065956116,
      "rewards/accuracy_reward/mean": 0.7578125,
      "rewards/accuracy_reward/std": 0.43093569576740265,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.06594810076057911,
      "stage2_sft_loss": 0.009801828651688993,
      "step": 112,
      "total_loss": 0.11262743826955557
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 299.25,
      "completions/max_terminated_length": 299.25,
      "completions/mean_length": 132.6796875,
      "completions/mean_terminated_length": 132.6796875,
      "completions/min_length": 54.25,
      "completions/min_terminated_length": 54.25,
      "epoch": 0.4829059829059829,
      "frac_reward_zero_std": 0.3125,
      "grad_norm": 2.8469563747260853,
      "grpo_loss": -0.013087262865155935,
      "kl": 0.1463623046875,
      "learning_rate": 1.2515867637445088e-05,
      "loss": 0.0658,
      "num_tokens": 7586296.0,
      "reward": 2.7265625,
      "reward_std": 0.29367245733737946,
      "rewards/accuracy_reward/mean": 0.7265625,
      "rewards/accuracy_reward/std": 0.44265370070934296,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.05833651963621378,
      "stage2_sft_loss": 0.005778922597528435,
      "step": 113,
      "total_loss": 0.04582714755088091
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 336.25,
      "completions/max_terminated_length": 336.25,
      "completions/mean_length": 138.3515625,
      "completions/mean_terminated_length": 138.3515625,
      "completions/min_length": 68.25,
      "completions/min_terminated_length": 68.25,
      "epoch": 0.48717948717948717,
      "frac_reward_zero_std": 0.34375,
      "grad_norm": 2.5939421842717167,
      "grpo_loss": -0.025480328127741814,
      "kl": 0.1175537109375,
      "learning_rate": 1.2370803777154976e-05,
      "loss": 0.0781,
      "num_tokens": 7653930.0,
      "reward": 2.6357421875,
      "reward_std": 0.30325330793857574,
      "rewards/accuracy_reward/mean": 0.640625,
      "rewards/accuracy_reward/std": 0.48062169551849365,
      "rewards/format_reward/mean": 0.99609375,
      "rewards/format_reward/std": 0.03125,
      "rewards/tag_count_reward/mean": 0.9990234375,
      "rewards/tag_count_reward/std": 0.0078125,
      "stage1_sft_loss": 0.0783770801499486,
      "stage2_sft_loss": 0.02130466280505061,
      "step": 114,
      "total_loss": 0.05502722132951021
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 315.0,
      "completions/max_terminated_length": 315.0,
      "completions/mean_length": 135.515625,
      "completions/mean_terminated_length": 135.515625,
      "completions/min_length": 65.25,
      "completions/min_terminated_length": 65.25,
      "epoch": 0.49145299145299143,
      "frac_reward_zero_std": 0.46875,
      "grad_norm": 2.6081245663182995,
      "grpo_loss": 0.038738559873308986,
      "kl": 0.120849609375,
      "learning_rate": 1.2225209339563144e-05,
      "loss": 0.0871,
      "num_tokens": 7722198.0,
      "reward": 2.69921875,
      "reward_std": 0.23277199268341064,
      "rewards/accuracy_reward/mean": 0.69921875,
      "rewards/accuracy_reward/std": 0.4489804431796074,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.11994863720610738,
      "stage2_sft_loss": 0.013875026190362405,
      "step": 115,
      "total_loss": 0.16007469967007637
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 289.0,
      "completions/max_terminated_length": 289.0,
      "completions/mean_length": 133.05078125,
      "completions/mean_terminated_length": 133.05078125,
      "completions/min_length": 67.5,
      "completions/min_terminated_length": 67.5,
      "epoch": 0.49572649572649574,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 2.565895224486075,
      "grpo_loss": 0.018095170467859134,
      "kl": 0.112060546875,
      "learning_rate": 1.2079116908177592e-05,
      "loss": 0.0927,
      "num_tokens": 7789883.0,
      "reward": 2.80078125,
      "reward_std": 0.2501298226416111,
      "rewards/accuracy_reward/mean": 0.80078125,
      "rewards/accuracy_reward/std": 0.40074585378170013,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.07847420917823911,
      "stage2_sft_loss": 0.004641155697754584,
      "step": 116,
      "total_loss": 0.09703349741175771
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 309.0,
      "completions/max_terminated_length": 309.0,
      "completions/mean_length": 136.375,
      "completions/mean_terminated_length": 136.375,
      "completions/min_length": 68.25,
      "completions/min_terminated_length": 68.25,
      "epoch": 0.5,
      "frac_reward_zero_std": 0.34375,
      "grad_norm": 2.473421491696022,
      "grpo_loss": 0.07887550839222968,
      "kl": 0.117431640625,
      "learning_rate": 1.1932559177955533e-05,
      "loss": 0.0872,
      "num_tokens": 7858947.0,
      "reward": 2.71875,
      "reward_std": 0.3031258285045624,
      "rewards/accuracy_reward/mean": 0.71875,
      "rewards/accuracy_reward/std": 0.4462125226855278,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.034118348732590675,
      "stage2_sft_loss": 0.0057628911308711395,
      "step": 117,
      "total_loss": 0.11357014431268908
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 254.25,
      "completions/max_terminated_length": 254.25,
      "completions/mean_length": 141.44140625,
      "completions/mean_terminated_length": 141.44140625,
      "completions/min_length": 68.0,
      "completions/min_terminated_length": 68.0,
      "epoch": 0.5042735042735043,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 2.3987615620601446,
      "grpo_loss": 0.004953495837980881,
      "kl": 0.109130859375,
      "learning_rate": 1.1785568947986368e-05,
      "loss": 0.0828,
      "num_tokens": 7928500.0,
      "reward": 2.83203125,
      "reward_std": 0.22108563408255577,
      "rewards/accuracy_reward/mean": 0.83203125,
      "rewards/accuracy_reward/std": 0.36582332849502563,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.03221064526587725,
      "stage2_sft_loss": 0.00645940622780472,
      "step": 118,
      "total_loss": 0.037810081616044044
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 357.0,
      "completions/max_terminated_length": 357.0,
      "completions/mean_length": 145.38671875,
      "completions/mean_terminated_length": 145.38671875,
      "completions/min_length": 72.5,
      "completions/min_terminated_length": 72.5,
      "epoch": 0.5085470085470085,
      "frac_reward_zero_std": 0.53125,
      "grad_norm": 2.2409935628410236,
      "grpo_loss": 0.01667332003125921,
      "kl": 0.105224609375,
      "learning_rate": 1.1638179114151378e-05,
      "loss": 0.0885,
      "num_tokens": 7999543.0,
      "reward": 2.84375,
      "reward_std": 0.19647981226444244,
      "rewards/accuracy_reward/mean": 0.84375,
      "rewards/accuracy_reward/std": 0.35410288721323013,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.08437999477609992,
      "stage2_sft_loss": 0.006384373642504215,
      "step": 119,
      "total_loss": 0.10169175546616316
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 338.75,
      "completions/max_terminated_length": 338.75,
      "completions/mean_length": 131.30859375,
      "completions/mean_terminated_length": 131.30859375,
      "completions/min_length": 55.0,
      "completions/min_terminated_length": 55.0,
      "epoch": 0.5128205128205128,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 2.6895306254784965,
      "grpo_loss": -0.0084699469152838,
      "kl": 0.12841796875,
      "learning_rate": 1.1490422661761744e-05,
      "loss": 0.0962,
      "num_tokens": 8065454.0,
      "reward": 2.78125,
      "reward_std": 0.2585509456694126,
      "rewards/accuracy_reward/mean": 0.78125,
      "rewards/accuracy_reward/std": 0.4090314507484436,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.06598297040909529,
      "stage2_sft_loss": 0.009453605287490063,
      "step": 120,
      "total_loss": 0.05845838412642479
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 294.25,
      "completions/max_terminated_length": 294.25,
      "completions/mean_length": 130.08203125,
      "completions/mean_terminated_length": 130.08203125,
      "completions/min_length": 68.75,
      "completions/min_terminated_length": 68.75,
      "epoch": 0.5170940170940171,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 2.708431066615691,
      "grpo_loss": -0.01987017970532179,
      "kl": 0.122314453125,
      "learning_rate": 1.1342332658176556e-05,
      "loss": 0.0947,
      "num_tokens": 8132147.0,
      "reward": 2.69921875,
      "reward_std": 0.24553291127085686,
      "rewards/accuracy_reward/mean": 0.69921875,
      "rewards/accuracy_reward/std": 0.4452369287610054,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.06503801513463259,
      "stage2_sft_loss": 0.010300099500454962,
      "step": 121,
      "total_loss": 0.04619784792885184
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 290.0,
      "completions/max_terminated_length": 290.0,
      "completions/mean_length": 144.5625,
      "completions/mean_terminated_length": 144.5625,
      "completions/min_length": 58.75,
      "completions/min_terminated_length": 58.75,
      "epoch": 0.5213675213675214,
      "frac_reward_zero_std": 0.46875,
      "grad_norm": 2.455987899386292,
      "grpo_loss": -0.023904464964289218,
      "kl": 0.1068115234375,
      "learning_rate": 1.1193942245402443e-05,
      "loss": 0.0916,
      "num_tokens": 8202891.0,
      "reward": 2.8046875,
      "reward_std": 0.23554520681500435,
      "rewards/accuracy_reward/mean": 0.8046875,
      "rewards/accuracy_reward/std": 0.3951949328184128,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.054928970988839865,
      "stage2_sft_loss": 0.005394812709710095,
      "step": 122,
      "total_loss": 0.031563987489789724
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 292.25,
      "completions/max_terminated_length": 292.25,
      "completions/mean_length": 146.9375,
      "completions/mean_terminated_length": 146.9375,
      "completions/min_length": 61.5,
      "completions/min_terminated_length": 61.5,
      "epoch": 0.5256410256410257,
      "frac_reward_zero_std": 0.40625,
      "grad_norm": 2.9877442732221833,
      "grpo_loss": 0.009980375471059233,
      "kl": 0.1158447265625,
      "learning_rate": 1.1045284632676535e-05,
      "loss": 0.0847,
      "num_tokens": 8274523.0,
      "reward": 2.6171875,
      "reward_std": 0.27248402312397957,
      "rewards/accuracy_reward/mean": 0.6171875,
      "rewards/accuracy_reward/std": 0.4808383285999298,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.10333962785080075,
      "stage2_sft_loss": 0.036806404881644994,
      "step": 123,
      "total_loss": 0.11700064362958074
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 319.75,
      "completions/max_terminated_length": 319.75,
      "completions/mean_length": 129.9453125,
      "completions/mean_terminated_length": 129.9453125,
      "completions/min_length": 66.25,
      "completions/min_terminated_length": 66.25,
      "epoch": 0.5299145299145299,
      "frac_reward_zero_std": 0.53125,
      "grad_norm": 2.3026834720930385,
      "grpo_loss": 0.013034810457611457,
      "kl": 0.093017578125,
      "learning_rate": 1.0896393089034336e-05,
      "loss": 0.0679,
      "num_tokens": 8340973.0,
      "reward": 2.8359375,
      "reward_std": 0.20437543839216232,
      "rewards/accuracy_reward/mean": 0.8359375,
      "rewards/accuracy_reward/std": 0.37163110077381134,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.06969350064173341,
      "stage2_sft_loss": 0.00450861056742724,
      "step": 124,
      "total_loss": 0.08317917049862444
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 264.5,
      "completions/max_terminated_length": 264.5,
      "completions/mean_length": 132.69140625,
      "completions/mean_terminated_length": 132.69140625,
      "completions/min_length": 64.75,
      "completions/min_terminated_length": 64.75,
      "epoch": 0.5341880341880342,
      "frac_reward_zero_std": 0.3125,
      "grad_norm": 2.46335246220877,
      "grpo_loss": -0.01052320736926049,
      "kl": 0.1055908203125,
      "learning_rate": 1.0747300935864245e-05,
      "loss": 0.0823,
      "num_tokens": 8407934.0,
      "reward": 2.71875,
      "reward_std": 0.2909066006541252,
      "rewards/accuracy_reward/mean": 0.71875,
      "rewards/accuracy_reward/std": 0.4371757209300995,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.04156210971996188,
      "stage2_sft_loss": 0.016224085906287655,
      "step": 125,
      "total_loss": 0.032661307603120804
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 252.25,
      "completions/max_terminated_length": 252.25,
      "completions/mean_length": 133.44140625,
      "completions/mean_terminated_length": 133.44140625,
      "completions/min_length": 69.0,
      "completions/min_terminated_length": 69.0,
      "epoch": 0.5384615384615384,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 2.380614799508239,
      "grpo_loss": -0.035499130171956494,
      "kl": 0.1060791015625,
      "learning_rate": 1.0598041539450344e-05,
      "loss": 0.0474,
      "num_tokens": 8475375.0,
      "reward": 2.7265625,
      "reward_std": 0.2852449417114258,
      "rewards/accuracy_reward/mean": 0.7265625,
      "rewards/accuracy_reward/std": 0.4341081902384758,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.09117380063980818,
      "stage2_sft_loss": 0.027107439294923097,
      "step": 126,
      "total_loss": 0.05838541442062706
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 257.25,
      "completions/max_terminated_length": 257.25,
      "completions/mean_length": 122.09375,
      "completions/mean_terminated_length": 122.09375,
      "completions/min_length": 59.25,
      "completions/min_terminated_length": 59.25,
      "epoch": 0.5427350427350427,
      "frac_reward_zero_std": 0.40625,
      "grad_norm": 2.454086196019503,
      "grpo_loss": 0.016646095959004015,
      "kl": 0.1180419921875,
      "learning_rate": 1.044864830350515e-05,
      "loss": 0.0735,
      "num_tokens": 8539295.0,
      "reward": 2.6953125,
      "reward_std": 0.2588193491101265,
      "rewards/accuracy_reward/mean": 0.6953125,
      "rewards/accuracy_reward/std": 0.45702145248651505,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.04539195611141622,
      "stage2_sft_loss": 0.017618531346670352,
      "step": 127,
      "total_loss": 0.0637999044265598
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 309.25,
      "completions/max_terminated_length": 309.25,
      "completions/mean_length": 135.65234375,
      "completions/mean_terminated_length": 135.65234375,
      "completions/min_length": 62.25,
      "completions/min_terminated_length": 62.25,
      "epoch": 0.5470085470085471,
      "frac_reward_zero_std": 0.34375,
      "grad_norm": 2.8749697279234163,
      "grpo_loss": -0.028164366842247546,
      "kl": 0.1300048828125,
      "learning_rate": 1.0299154661693987e-05,
      "loss": 0.1046,
      "num_tokens": 8607918.0,
      "reward": 2.6953125,
      "reward_std": 0.2952302098274231,
      "rewards/accuracy_reward/mean": 0.6953125,
      "rewards/accuracy_reward/std": 0.44884125888347626,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.06593989208340645,
      "stage2_sft_loss": 0.008943283930420876,
      "step": 128,
      "total_loss": 0.03866985347121954
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 286.0,
      "completions/max_terminated_length": 286.0,
      "completions/mean_length": 145.3828125,
      "completions/mean_terminated_length": 145.3828125,
      "completions/min_length": 71.5,
      "completions/min_terminated_length": 71.5,
      "epoch": 0.5512820512820513,
      "frac_reward_zero_std": 0.46875,
      "grad_norm": 2.2360139721857832,
      "grpo_loss": -0.0003669927828013897,
      "kl": 0.1151123046875,
      "learning_rate": 1.0149594070152638e-05,
      "loss": 0.0799,
      "num_tokens": 8677888.0,
      "reward": 2.73828125,
      "reward_std": 0.2475024051964283,
      "rewards/accuracy_reward/mean": 0.73828125,
      "rewards/accuracy_reward/std": 0.4189733415842056,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.11887484043836594,
      "stage2_sft_loss": 0.014625519164837897,
      "step": 129,
      "total_loss": 0.11997039895504713
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 270.5,
      "completions/max_terminated_length": 270.5,
      "completions/mean_length": 144.18359375,
      "completions/mean_terminated_length": 144.18359375,
      "completions/min_length": 67.5,
      "completions/min_terminated_length": 67.5,
      "epoch": 0.5555555555555556,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 2.0756110198748474,
      "grpo_loss": 0.03972660058934707,
      "kl": 0.0960693359375,
      "learning_rate": 1e-05,
      "loss": 0.0461,
      "num_tokens": 8750079.0,
      "reward": 2.80859375,
      "reward_std": 0.22278673481196165,
      "rewards/accuracy_reward/mean": 0.80859375,
      "rewards/accuracy_reward/std": 0.348017118871212,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.050859407521784306,
      "stage2_sft_loss": 0.00923627592419507,
      "step": 130,
      "total_loss": 0.0915096364915371
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 228.25,
      "completions/max_terminated_length": 228.25,
      "completions/mean_length": 126.390625,
      "completions/mean_terminated_length": 126.390625,
      "completions/min_length": 72.0,
      "completions/min_terminated_length": 72.0,
      "epoch": 0.5598290598290598,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 2.90853344624612,
      "grpo_loss": -0.005080391361843795,
      "kl": 0.121337890625,
      "learning_rate": 9.850405929847367e-06,
      "loss": 0.058,
      "num_tokens": 8814987.0,
      "reward": 2.779296875,
      "reward_std": 0.21056002005934715,
      "rewards/accuracy_reward/mean": 0.84765625,
      "rewards/accuracy_reward/std": 0.3576347529888153,
      "rewards/format_reward/mean": 0.9453125,
      "rewards/format_reward/std": 0.15677954256534576,
      "rewards/tag_count_reward/mean": 0.986328125,
      "rewards/tag_count_reward/std": 0.03919488564133644,
      "stage1_sft_loss": 0.04821830568835139,
      "stage2_sft_loss": 0.006851830825326033,
      "step": 131,
      "total_loss": 0.043823097832500935
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 253.75,
      "completions/max_terminated_length": 253.75,
      "completions/mean_length": 138.89453125,
      "completions/mean_terminated_length": 138.89453125,
      "completions/min_length": 58.5,
      "completions/min_terminated_length": 58.5,
      "epoch": 0.5641025641025641,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 2.2783596807349347,
      "grpo_loss": -0.0668418699860922,
      "kl": 0.0982666015625,
      "learning_rate": 9.700845338306018e-06,
      "loss": 0.042,
      "num_tokens": 8884136.0,
      "reward": 2.76171875,
      "reward_std": 0.2506577856838703,
      "rewards/accuracy_reward/mean": 0.76171875,
      "rewards/accuracy_reward/std": 0.4220747724175453,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.04070015996694565,
      "stage2_sft_loss": 0.010169286300879321,
      "step": 132,
      "total_loss": -0.02512478199787438
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 343.0,
      "completions/max_terminated_length": 343.0,
      "completions/mean_length": 150.24609375,
      "completions/mean_terminated_length": 150.24609375,
      "completions/min_length": 57.25,
      "completions/min_terminated_length": 57.25,
      "epoch": 0.5683760683760684,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 2.527051548886218,
      "grpo_loss": -0.05759265075903386,
      "kl": 0.103271484375,
      "learning_rate": 9.551351696494854e-06,
      "loss": 0.0655,
      "num_tokens": 8956583.0,
      "reward": 2.64453125,
      "reward_std": 0.2313353642821312,
      "rewards/accuracy_reward/mean": 0.64453125,
      "rewards/accuracy_reward/std": 0.45752032846212387,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.11429626494646072,
      "stage2_sft_loss": 0.04731386760249734,
      "step": 133,
      "total_loss": 0.06143500283360481
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 273.5,
      "completions/max_terminated_length": 273.5,
      "completions/mean_length": 124.93359375,
      "completions/mean_terminated_length": 124.93359375,
      "completions/min_length": 61.0,
      "completions/min_terminated_length": 61.0,
      "epoch": 0.5726495726495726,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 2.2775167385076047,
      "grpo_loss": -0.011081848337198608,
      "kl": 0.093017578125,
      "learning_rate": 9.401958460549658e-06,
      "loss": 0.0553,
      "num_tokens": 9022390.0,
      "reward": 2.80859375,
      "reward_std": 0.13611222617328167,
      "rewards/accuracy_reward/mean": 0.80859375,
      "rewards/accuracy_reward/std": 0.3880981504917145,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.06806533015333116,
      "stage2_sft_loss": 0.011031713336706161,
      "step": 134,
      "total_loss": 0.058086653240025043
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 280.25,
      "completions/max_terminated_length": 280.25,
      "completions/mean_length": 127.94921875,
      "completions/mean_terminated_length": 127.94921875,
      "completions/min_length": 57.0,
      "completions/min_terminated_length": 57.0,
      "epoch": 0.5769230769230769,
      "frac_reward_zero_std": 0.40625,
      "grad_norm": 2.2333257978413013,
      "grpo_loss": 0.046479592099785805,
      "kl": 0.0970458984375,
      "learning_rate": 9.252699064135759e-06,
      "loss": 0.0892,
      "num_tokens": 9088265.0,
      "reward": 2.80859375,
      "reward_std": 0.25434111803770065,
      "rewards/accuracy_reward/mean": 0.80859375,
      "rewards/accuracy_reward/std": 0.35886215418577194,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.06378544913604856,
      "stage2_sft_loss": 0.018909562553744763,
      "step": 135,
      "total_loss": 0.11215599719434977
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 233.5,
      "completions/max_terminated_length": 233.5,
      "completions/mean_length": 116.78125,
      "completions/mean_terminated_length": 116.78125,
      "completions/min_length": 56.25,
      "completions/min_terminated_length": 56.25,
      "epoch": 0.5811965811965812,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 2.4767783638602623,
      "grpo_loss": -0.027499133881065063,
      "kl": 0.10888671875,
      "learning_rate": 9.103606910965666e-06,
      "loss": 0.0663,
      "num_tokens": 9151545.0,
      "reward": 2.83984375,
      "reward_std": 0.2068792637437582,
      "rewards/accuracy_reward/mean": 0.83984375,
      "rewards/accuracy_reward/std": 0.36639247089624405,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.05348319374024868,
      "stage2_sft_loss": 0.008430409914581105,
      "step": 136,
      "total_loss": 0.026827102527022362
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 355.75,
      "completions/max_terminated_length": 355.75,
      "completions/mean_length": 148.94921875,
      "completions/mean_terminated_length": 148.94921875,
      "completions/min_length": 66.5,
      "completions/min_terminated_length": 66.5,
      "epoch": 0.5854700854700855,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 2.329224577124586,
      "grpo_loss": 0.026575487572699785,
      "kl": 0.11669921875,
      "learning_rate": 8.954715367323468e-06,
      "loss": 0.0665,
      "num_tokens": 9223548.0,
      "reward": 2.734375,
      "reward_std": 0.2607549577951431,
      "rewards/accuracy_reward/mean": 0.73828125,
      "rewards/accuracy_reward/std": 0.43093063682317734,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.99609375,
      "rewards/tag_count_reward/std": 0.015248439274728298,
      "stage1_sft_loss": 0.08758598286658525,
      "stage2_sft_loss": 0.05217828591048601,
      "step": 137,
      "total_loss": 0.11937929969280958
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 281.0,
      "completions/max_terminated_length": 281.0,
      "completions/mean_length": 134.59765625,
      "completions/mean_terminated_length": 134.59765625,
      "completions/min_length": 55.5,
      "completions/min_terminated_length": 55.5,
      "epoch": 0.5897435897435898,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 2.5035557731102522,
      "grpo_loss": 0.011050511908251792,
      "kl": 0.1112060546875,
      "learning_rate": 8.806057754597559e-06,
      "loss": 0.0836,
      "num_tokens": 9291653.0,
      "reward": 2.8046875,
      "reward_std": 0.23132899776101112,
      "rewards/accuracy_reward/mean": 0.8046875,
      "rewards/accuracy_reward/std": 0.39095889776945114,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.05431360239163041,
      "stage2_sft_loss": 0.009981016424717382,
      "step": 138,
      "total_loss": 0.06636221474036574
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 263.0,
      "completions/max_terminated_length": 263.0,
      "completions/mean_length": 125.7109375,
      "completions/mean_terminated_length": 125.7109375,
      "completions/min_length": 59.5,
      "completions/min_terminated_length": 59.5,
      "epoch": 0.594017094017094,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 2.361583240185186,
      "grpo_loss": -0.033124256053270074,
      "kl": 0.100341796875,
      "learning_rate": 8.657667341823449e-06,
      "loss": 0.0876,
      "num_tokens": 9356483.0,
      "reward": 2.8359375,
      "reward_std": 0.19332443177700043,
      "rewards/accuracy_reward/mean": 0.8359375,
      "rewards/accuracy_reward/std": 0.3644886091351509,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.07262712554074824,
      "stage2_sft_loss": 0.014838075108855264,
      "step": 139,
      "total_loss": 0.04098667961079627
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 443.25,
      "completions/max_terminated_length": 443.25,
      "completions/mean_length": 147.3984375,
      "completions/mean_terminated_length": 147.3984375,
      "completions/min_length": 65.5,
      "completions/min_terminated_length": 65.5,
      "epoch": 0.5982905982905983,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 2.140356398926018,
      "grpo_loss": -0.05637400213163346,
      "kl": 0.08587646484375,
      "learning_rate": 8.509577338238255e-06,
      "loss": 0.0703,
      "num_tokens": 9427193.0,
      "reward": 2.765625,
      "reward_std": 0.2494782730937004,
      "rewards/accuracy_reward/mean": 0.765625,
      "rewards/accuracy_reward/std": 0.4166136011481285,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.05473982123658061,
      "stage2_sft_loss": 0.05722901329136221,
      "step": 140,
      "total_loss": 0.004088719375431538
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 318.0,
      "completions/max_terminated_length": 318.0,
      "completions/mean_length": 137.15234375,
      "completions/mean_terminated_length": 137.15234375,
      "completions/min_length": 63.0,
      "completions/min_terminated_length": 63.0,
      "epoch": 0.6025641025641025,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 2.5103848979553662,
      "grpo_loss": 0.058950818143785,
      "kl": 0.107177734375,
      "learning_rate": 8.361820885848623e-06,
      "loss": 0.0941,
      "num_tokens": 9495024.0,
      "reward": 2.78515625,
      "reward_std": 0.1876691598445177,
      "rewards/accuracy_reward/mean": 0.78515625,
      "rewards/accuracy_reward/std": 0.39204035699367523,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.07926928717643023,
      "stage2_sft_loss": 0.008115518408885691,
      "step": 141,
      "total_loss": 0.13903165794909
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 322.75,
      "completions/max_terminated_length": 322.75,
      "completions/mean_length": 137.8203125,
      "completions/mean_terminated_length": 137.8203125,
      "completions/min_length": 66.0,
      "completions/min_terminated_length": 66.0,
      "epoch": 0.6068376068376068,
      "frac_reward_zero_std": 0.53125,
      "grad_norm": 2.5391650094703944,
      "grpo_loss": 0.07669655280187726,
      "kl": 0.1102294921875,
      "learning_rate": 8.214431052013636e-06,
      "loss": 0.091,
      "num_tokens": 9563378.0,
      "reward": 2.796875,
      "reward_std": 0.19647981226444244,
      "rewards/accuracy_reward/mean": 0.796875,
      "rewards/accuracy_reward/std": 0.37679756060242653,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.10939254239201546,
      "stage2_sft_loss": 0.00877940544160083,
      "step": 142,
      "total_loss": 0.18696703761816025
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 298.25,
      "completions/max_terminated_length": 298.25,
      "completions/mean_length": 139.26171875,
      "completions/mean_terminated_length": 139.26171875,
      "completions/min_length": 63.0,
      "completions/min_terminated_length": 63.0,
      "epoch": 0.6111111111111112,
      "frac_reward_zero_std": 0.40625,
      "grad_norm": 2.4841493562770363,
      "grpo_loss": 0.020590394386090338,
      "kl": 0.12451171875,
      "learning_rate": 8.06744082204447e-06,
      "loss": 0.0967,
      "num_tokens": 9630797.0,
      "reward": 2.7578125,
      "reward_std": 0.2571094296872616,
      "rewards/accuracy_reward/mean": 0.7578125,
      "rewards/accuracy_reward/std": 0.4005141332745552,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.05549432337284088,
      "stage2_sft_loss": 0.014109492563875392,
      "step": 143,
      "total_loss": 0.07749566785059869
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 255.75,
      "completions/max_terminated_length": 255.75,
      "completions/mean_length": 135.46484375,
      "completions/mean_terminated_length": 135.46484375,
      "completions/min_length": 67.75,
      "completions/min_terminated_length": 67.75,
      "epoch": 0.6153846153846154,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 2.64956235594402,
      "grpo_loss": -0.012727254332276061,
      "kl": 0.097412109375,
      "learning_rate": 7.92088309182241e-06,
      "loss": 0.0809,
      "num_tokens": 9698876.0,
      "reward": 2.8046875,
      "reward_std": 0.19739338383078575,
      "rewards/accuracy_reward/mean": 0.8046875,
      "rewards/accuracy_reward/std": 0.394244909286499,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.1811660141684115,
      "stage2_sft_loss": 0.0043692881081369705,
      "step": 144,
      "total_loss": 0.16887568403035402
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 279.25,
      "completions/max_terminated_length": 279.25,
      "completions/mean_length": 126.53125,
      "completions/mean_terminated_length": 126.53125,
      "completions/min_length": 58.0,
      "completions/min_terminated_length": 58.0,
      "epoch": 0.6196581196581197,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 2.6250671163078483,
      "grpo_loss": 0.008420213998761028,
      "kl": 0.1058349609375,
      "learning_rate": 7.774790660436857e-06,
      "loss": 0.0779,
      "num_tokens": 9763884.0,
      "reward": 2.8359375,
      "reward_std": 0.1701665222644806,
      "rewards/accuracy_reward/mean": 0.8359375,
      "rewards/accuracy_reward/std": 0.3677399829030037,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.04537341580726206,
      "stage2_sft_loss": 0.0017060633217624854,
      "step": 145,
      "total_loss": 0.053964235819876194
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 282.0,
      "completions/max_terminated_length": 282.0,
      "completions/mean_length": 131.6796875,
      "completions/mean_terminated_length": 131.6796875,
      "completions/min_length": 62.25,
      "completions/min_terminated_length": 62.25,
      "epoch": 0.6239316239316239,
      "frac_reward_zero_std": 0.53125,
      "grad_norm": 2.686921408632962,
      "grpo_loss": -0.002991177316289395,
      "kl": 0.09326171875,
      "learning_rate": 7.629196222845027e-06,
      "loss": 0.0558,
      "num_tokens": 9831138.0,
      "reward": 2.84765625,
      "reward_std": 0.20384501945227385,
      "rewards/accuracy_reward/mean": 0.84765625,
      "rewards/accuracy_reward/std": 0.34441937878727913,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.05802195239812136,
      "stage2_sft_loss": 0.00841883840621449,
      "step": 146,
      "total_loss": 0.055872660130262375
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 280.0,
      "completions/max_terminated_length": 280.0,
      "completions/mean_length": 127.69140625,
      "completions/mean_terminated_length": 127.69140625,
      "completions/min_length": 56.5,
      "completions/min_terminated_length": 56.5,
      "epoch": 0.6282051282051282,
      "frac_reward_zero_std": 0.53125,
      "grad_norm": 2.338255966369413,
      "grpo_loss": 0.032924949657171965,
      "kl": 0.10693359375,
      "learning_rate": 7.484132362554915e-06,
      "loss": 0.0601,
      "num_tokens": 9896459.0,
      "reward": 2.859375,
      "reward_std": 0.20437543839216232,
      "rewards/accuracy_reward/mean": 0.859375,
      "rewards/accuracy_reward/std": 0.34932583570480347,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.08136490918695927,
      "stage2_sft_loss": 0.014175453514326364,
      "step": 147,
      "total_loss": 0.11570740700699389
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 303.25,
      "completions/max_terminated_length": 303.25,
      "completions/mean_length": 144.77734375,
      "completions/mean_terminated_length": 144.77734375,
      "completions/min_length": 66.25,
      "completions/min_terminated_length": 66.25,
      "epoch": 0.6324786324786325,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 2.495909790539816,
      "grpo_loss": -0.011645266757113859,
      "kl": 0.1090087890625,
      "learning_rate": 7.33963154433325e-06,
      "loss": 0.0714,
      "num_tokens": 9967802.0,
      "reward": 2.765625,
      "reward_std": 0.2217283584177494,
      "rewards/accuracy_reward/mean": 0.765625,
      "rewards/accuracy_reward/std": 0.412590354681015,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.05564484978094697,
      "stage2_sft_loss": 0.032550239644479007,
      "step": 148,
      "total_loss": 0.04725460661575198
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 284.5,
      "completions/max_terminated_length": 284.5,
      "completions/mean_length": 146.7421875,
      "completions/mean_terminated_length": 146.7421875,
      "completions/min_length": 64.0,
      "completions/min_terminated_length": 64.0,
      "epoch": 0.6367521367521367,
      "frac_reward_zero_std": 0.34375,
      "grad_norm": 2.3326058076550127,
      "grpo_loss": 0.016894530330318958,
      "kl": 0.1033935546875,
      "learning_rate": 7.1957261069399745e-06,
      "loss": 0.0564,
      "num_tokens": 10040056.0,
      "reward": 2.85546875,
      "reward_std": 0.27526114881038666,
      "rewards/accuracy_reward/mean": 0.85546875,
      "rewards/accuracy_reward/std": 0.3532208576798439,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.03281405568122864,
      "stage2_sft_loss": 0.009040810575243086,
      "step": 149,
      "total_loss": 0.05061266664415598
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -6.96875,
      "completions/max_length": 458.5,
      "completions/max_terminated_length": 323.0,
      "completions/mean_length": 129.56640625,
      "completions/mean_terminated_length": 126.96162033081055,
      "completions/min_length": 60.5,
      "completions/min_terminated_length": 60.5,
      "epoch": 0.6410256410256411,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 2.866667260437921,
      "grpo_loss": 0.02925427898298949,
      "kl": 0.13037109375,
      "learning_rate": 7.052448255890958e-06,
      "loss": 0.1456,
      "num_tokens": 10105953.0,
      "reward": 2.7548828125,
      "reward_std": 0.24152512475848198,
      "rewards/accuracy_reward/mean": 0.76171875,
      "rewards/accuracy_reward/std": 0.4230128526687622,
      "rewards/format_reward/mean": 0.99609375,
      "rewards/format_reward/std": 0.03125,
      "rewards/tag_count_reward/mean": 0.9970703125,
      "rewards/tag_count_reward/std": 0.02343750186264515,
      "stage1_sft_loss": 0.13295416068285704,
      "stage2_sft_loss": 0.005717331481719157,
      "step": 150,
      "total_loss": 0.1627801824361086
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 236.75,
      "completions/max_terminated_length": 236.75,
      "completions/mean_length": 134.05078125,
      "completions/mean_terminated_length": 134.05078125,
      "completions/min_length": 61.75,
      "completions/min_terminated_length": 61.75,
      "epoch": 0.6452991452991453,
      "frac_reward_zero_std": 0.3125,
      "grad_norm": 2.597988976639514,
      "grpo_loss": 0.05413528915960342,
      "kl": 0.118408203125,
      "learning_rate": 6.909830056250527e-06,
      "loss": 0.0892,
      "num_tokens": 10174230.0,
      "reward": 2.76953125,
      "reward_std": 0.3056345507502556,
      "rewards/accuracy_reward/mean": 0.76953125,
      "rewards/accuracy_reward/std": 0.4202374145388603,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.09626385103911161,
      "stage2_sft_loss": 0.0072241057805513265,
      "step": 151,
      "total_loss": 0.1511215539649129
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 372.0,
      "completions/max_terminated_length": 372.0,
      "completions/mean_length": 145.06640625,
      "completions/mean_terminated_length": 145.06640625,
      "completions/min_length": 71.25,
      "completions/min_terminated_length": 71.25,
      "epoch": 0.6495726495726496,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 2.5961261287645816,
      "grpo_loss": -0.017629655718337744,
      "kl": 0.1204833984375,
      "learning_rate": 6.767903425455402e-06,
      "loss": 0.0789,
      "num_tokens": 10244967.0,
      "reward": 2.75,
      "reward_std": 0.22225632518529892,
      "rewards/accuracy_reward/mean": 0.75,
      "rewards/accuracy_reward/std": 0.3980471268296242,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.0796721177175641,
      "stage2_sft_loss": 0.01425616116830497,
      "step": 152,
      "total_loss": 0.06346807722002268
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 236.0,
      "completions/max_terminated_length": 236.0,
      "completions/mean_length": 123.4921875,
      "completions/mean_terminated_length": 123.4921875,
      "completions/min_length": 59.0,
      "completions/min_terminated_length": 59.0,
      "epoch": 0.6538461538461539,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 2.5163651597741836,
      "grpo_loss": 0.04610937915276736,
      "kl": 0.09375,
      "learning_rate": 6.6267001261717015e-06,
      "loss": 0.0741,
      "num_tokens": 10310381.0,
      "reward": 2.86328125,
      "reward_std": 0.22172590345144272,
      "rewards/accuracy_reward/mean": 0.86328125,
      "rewards/accuracy_reward/std": 0.326527189463377,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.04725873947609216,
      "stage2_sft_loss": 0.00662843362079002,
      "step": 153,
      "total_loss": 0.09403096046298742
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 262.5,
      "completions/max_terminated_length": 262.5,
      "completions/mean_length": 135.859375,
      "completions/mean_terminated_length": 135.859375,
      "completions/min_length": 64.5,
      "completions/min_terminated_length": 64.5,
      "epoch": 0.6581196581196581,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 2.4252492975801236,
      "grpo_loss": -0.00364104809705168,
      "kl": 0.082763671875,
      "learning_rate": 6.486251759186573e-06,
      "loss": 0.0626,
      "num_tokens": 10379817.0,
      "reward": 2.7890625,
      "reward_std": 0.1837237924337387,
      "rewards/accuracy_reward/mean": 0.7890625,
      "rewards/accuracy_reward/std": 0.38352732732892036,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.06735014263540506,
      "stage2_sft_loss": 0.01930423468002118,
      "step": 154,
      "total_loss": 0.06563951540738344
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 290.0,
      "completions/max_terminated_length": 290.0,
      "completions/mean_length": 142.26171875,
      "completions/mean_terminated_length": 142.26171875,
      "completions/min_length": 73.25,
      "completions/min_terminated_length": 73.25,
      "epoch": 0.6623931623931624,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 2.366411089319022,
      "grpo_loss": -0.054288885701680556,
      "kl": 0.0863037109375,
      "learning_rate": 6.34658975633605e-06,
      "loss": 0.0536,
      "num_tokens": 10449180.0,
      "reward": 2.8046875,
      "reward_std": 0.19108654744923115,
      "rewards/accuracy_reward/mean": 0.8046875,
      "rewards/accuracy_reward/std": 0.3968508318066597,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.035379976499825716,
      "stage2_sft_loss": 0.010212840817985125,
      "step": 155,
      "total_loss": -0.017887625843286514
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 298.5,
      "completions/max_terminated_length": 298.5,
      "completions/mean_length": 142.125,
      "completions/mean_terminated_length": 142.125,
      "completions/min_length": 76.5,
      "completions/min_terminated_length": 76.5,
      "epoch": 0.6666666666666666,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 2.129602732934006,
      "grpo_loss": 0.035368968499824405,
      "kl": 0.08502197265625,
      "learning_rate": 6.207745373470717e-06,
      "loss": 0.0831,
      "num_tokens": 10518636.0,
      "reward": 2.859375,
      "reward_std": 0.20923583209514618,
      "rewards/accuracy_reward/mean": 0.859375,
      "rewards/accuracy_reward/std": 0.3443687930703163,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.06338846497237682,
      "stage2_sft_loss": 0.012052012367348652,
      "step": 156,
      "total_loss": 0.0999626386910677
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 275.5,
      "completions/max_terminated_length": 275.5,
      "completions/mean_length": 144.5234375,
      "completions/mean_terminated_length": 144.5234375,
      "completions/min_length": 75.25,
      "completions/min_terminated_length": 75.25,
      "epoch": 0.6709401709401709,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 2.3169054497973565,
      "grpo_loss": -0.002087805936753284,
      "kl": 0.0943603515625,
      "learning_rate": 6.069749683460765e-06,
      "loss": 0.0789,
      "num_tokens": 10588434.0,
      "reward": 2.69921875,
      "reward_std": 0.23357326164841652,
      "rewards/accuracy_reward/mean": 0.69921875,
      "rewards/accuracy_reward/std": 0.45767590403556824,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.04589258902706206,
      "stage2_sft_loss": 0.006625795169384219,
      "step": 157,
      "total_loss": 0.04446736362297088
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -6.96875,
      "completions/max_length": 383.25,
      "completions/max_terminated_length": 238.25,
      "completions/mean_length": 131.90234375,
      "completions/mean_terminated_length": 129.2612247467041,
      "completions/min_length": 80.5,
      "completions/min_terminated_length": 80.5,
      "epoch": 0.6752136752136753,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 2.2919519383538747,
      "grpo_loss": 0.026327846964704804,
      "kl": 0.087646484375,
      "learning_rate": 5.932633569242e-06,
      "loss": 0.1066,
      "num_tokens": 10654705.0,
      "reward": 2.8330078125,
      "reward_std": 0.20864389650523663,
      "rewards/accuracy_reward/mean": 0.83984375,
      "rewards/accuracy_reward/std": 0.3436870872974396,
      "rewards/format_reward/mean": 0.99609375,
      "rewards/format_reward/std": 0.03125,
      "rewards/tag_count_reward/mean": 0.9970703125,
      "rewards/tag_count_reward/std": 0.02343750186264515,
      "stage1_sft_loss": 0.05269382195547223,
      "stage2_sft_loss": 0.0020705054157588165,
      "step": 158,
      "total_loss": 0.07922872109338641
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 275.5,
      "completions/max_terminated_length": 275.5,
      "completions/mean_length": 133.83203125,
      "completions/mean_terminated_length": 133.83203125,
      "completions/min_length": 56.0,
      "completions/min_terminated_length": 56.0,
      "epoch": 0.6794871794871795,
      "frac_reward_zero_std": 0.53125,
      "grad_norm": 2.295106029646828,
      "grpo_loss": 0.014149629918392748,
      "kl": 0.107421875,
      "learning_rate": 5.796427716904347e-06,
      "loss": 0.0656,
      "num_tokens": 10721630.0,
      "reward": 2.80078125,
      "reward_std": 0.20555494353175163,
      "rewards/accuracy_reward/mean": 0.80078125,
      "rewards/accuracy_reward/std": 0.3571106605231762,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.04844714910723269,
      "stage2_sft_loss": 0.00807800801248959,
      "step": 159,
      "total_loss": 0.06340457918122411
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 231.5,
      "completions/max_terminated_length": 231.5,
      "completions/mean_length": 124.92578125,
      "completions/mean_terminated_length": 124.92578125,
      "completions/min_length": 62.75,
      "completions/min_terminated_length": 62.75,
      "epoch": 0.6837606837606838,
      "frac_reward_zero_std": 0.34375,
      "grad_norm": 2.398123263945807,
      "grpo_loss": 0.01794378731995039,
      "kl": 0.0992431640625,
      "learning_rate": 5.66116260882442e-06,
      "loss": 0.0669,
      "num_tokens": 10785939.0,
      "reward": 2.8046875,
      "reward_std": 0.29234322160482407,
      "rewards/accuracy_reward/mean": 0.8046875,
      "rewards/accuracy_reward/std": 0.39095889776945114,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.03389233513735235,
      "stage2_sft_loss": 0.018274757923791185,
      "step": 160,
      "total_loss": 0.05366359875188209
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 238.75,
      "completions/max_terminated_length": 238.75,
      "completions/mean_length": 130.82421875,
      "completions/mean_terminated_length": 130.82421875,
      "completions/min_length": 66.25,
      "completions/min_terminated_length": 66.25,
      "epoch": 0.688034188034188,
      "frac_reward_zero_std": 0.40625,
      "grad_norm": 2.4713176066788654,
      "grpo_loss": -0.007550991605967283,
      "kl": 0.100830078125,
      "learning_rate": 5.526868516843673e-06,
      "loss": 0.0769,
      "num_tokens": 10852526.0,
      "reward": 2.77734375,
      "reward_std": 0.2656516842544079,
      "rewards/accuracy_reward/mean": 0.77734375,
      "rewards/accuracy_reward/std": 0.4104008078575134,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.11685018753632903,
      "stage2_sft_loss": 0.01413062890060246,
      "step": 161,
      "total_loss": 0.1107122590765357
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 264.25,
      "completions/max_terminated_length": 264.25,
      "completions/mean_length": 132.74609375,
      "completions/mean_terminated_length": 132.74609375,
      "completions/min_length": 69.5,
      "completions/min_terminated_length": 69.5,
      "epoch": 0.6923076923076923,
      "frac_reward_zero_std": 0.53125,
      "grad_norm": 2.3313874096022835,
      "grpo_loss": 0.003736324142664671,
      "kl": 0.0833740234375,
      "learning_rate": 5.393575495493679e-06,
      "loss": 0.0709,
      "num_tokens": 10919077.0,
      "reward": 2.8125,
      "reward_std": 0.2117381915450096,
      "rewards/accuracy_reward/mean": 0.8125,
      "rewards/accuracy_reward/std": 0.36151088774204254,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.18217940255999565,
      "stage2_sft_loss": 0.01815573309431784,
      "step": 162,
      "total_loss": 0.18773129768669605
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 237.25,
      "completions/max_terminated_length": 237.25,
      "completions/mean_length": 130.7421875,
      "completions/mean_terminated_length": 130.7421875,
      "completions/min_length": 66.5,
      "completions/min_terminated_length": 66.5,
      "epoch": 0.6965811965811965,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 2.466615149845605,
      "grpo_loss": -0.010769763757707551,
      "kl": 0.0894775390625,
      "learning_rate": 5.2613133752700145e-06,
      "loss": 0.0782,
      "num_tokens": 10986211.0,
      "reward": 2.890625,
      "reward_std": 0.13546558748930693,
      "rewards/accuracy_reward/mean": 0.890625,
      "rewards/accuracy_reward/std": 0.29157692193984985,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.0620655445381999,
      "stage2_sft_loss": 0.019713282701559365,
      "step": 163,
      "total_loss": 0.05326710897497833
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 283.5,
      "completions/max_terminated_length": 283.5,
      "completions/mean_length": 137.265625,
      "completions/mean_terminated_length": 137.265625,
      "completions/min_length": 68.25,
      "completions/min_terminated_length": 68.25,
      "epoch": 0.7008547008547008,
      "frac_reward_zero_std": 0.53125,
      "grad_norm": 2.3553881379649866,
      "grpo_loss": -0.006676109624095261,
      "kl": 0.08697509765625,
      "learning_rate": 5.130111755956327e-06,
      "loss": 0.0871,
      "num_tokens": 11054735.0,
      "reward": 2.81640625,
      "reward_std": 0.20213509909808636,
      "rewards/accuracy_reward/mean": 0.81640625,
      "rewards/accuracy_reward/std": 0.34256091713905334,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.09478867053985596,
      "stage2_sft_loss": 0.01719207396672573,
      "step": 164,
      "total_loss": 0.08983177039772272
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 252.5,
      "completions/max_terminated_length": 252.5,
      "completions/mean_length": 128.84375,
      "completions/mean_terminated_length": 128.84375,
      "completions/min_length": 64.75,
      "completions/min_terminated_length": 64.75,
      "epoch": 0.7051282051282052,
      "frac_reward_zero_std": 0.53125,
      "grad_norm": 2.772277422206105,
      "grpo_loss": 0.0985541757545434,
      "kl": 0.1038818359375,
      "learning_rate": 5.000000000000003e-06,
      "loss": 0.1192,
      "num_tokens": 11120839.0,
      "reward": 2.875,
      "reward_std": 0.20950030162930489,
      "rewards/accuracy_reward/mean": 0.875,
      "rewards/accuracy_reward/std": 0.32767561450600624,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.08193079102784395,
      "stage2_sft_loss": 0.007009456341620535,
      "step": 165,
      "total_loss": 0.18118591140955687
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 286.25,
      "completions/max_terminated_length": 286.25,
      "completions/mean_length": 139.16796875,
      "completions/mean_terminated_length": 139.16796875,
      "completions/min_length": 66.5,
      "completions/min_terminated_length": 66.5,
      "epoch": 0.7094017094017094,
      "frac_reward_zero_std": 0.46875,
      "grad_norm": 2.20156747003348,
      "grpo_loss": 0.0015803006826899946,
      "kl": 0.089111328125,
      "learning_rate": 4.87100722594094e-06,
      "loss": 0.0755,
      "num_tokens": 11190498.0,
      "reward": 2.7265625,
      "reward_std": 0.23330241069197655,
      "rewards/accuracy_reward/mean": 0.7265625,
      "rewards/accuracy_reward/std": 0.4193031042814255,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.09793944098055363,
      "stage2_sft_loss": 0.0024029574196902104,
      "step": 166,
      "total_loss": 0.09976003784686327
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 298.75,
      "completions/max_terminated_length": 298.75,
      "completions/mean_length": 133.5859375,
      "completions/mean_terminated_length": 133.5859375,
      "completions/min_length": 77.5,
      "completions/min_terminated_length": 77.5,
      "epoch": 0.7136752136752137,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 2.0395625224823095,
      "grpo_loss": 0.003400918962142896,
      "kl": 0.089111328125,
      "learning_rate": 4.743162301894952e-06,
      "loss": 0.0721,
      "num_tokens": 11257320.0,
      "reward": 2.84375,
      "reward_std": 0.2008083276450634,
      "rewards/accuracy_reward/mean": 0.84375,
      "rewards/accuracy_reward/std": 0.3427247516810894,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.0751371227670461,
      "stage2_sft_loss": 0.0010712931361922529,
      "step": 167,
      "total_loss": 0.07864517066627741
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 289.75,
      "completions/max_terminated_length": 289.75,
      "completions/mean_length": 142.6875,
      "completions/mean_terminated_length": 142.6875,
      "completions/min_length": 70.5,
      "completions/min_terminated_length": 70.5,
      "epoch": 0.717948717948718,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 2.2661004439431096,
      "grpo_loss": -0.009914183348882943,
      "kl": 0.09600830078125,
      "learning_rate": 4.616493839093179e-06,
      "loss": 0.0642,
      "num_tokens": 11327824.0,
      "reward": 2.734375,
      "reward_std": 0.2659137099981308,
      "rewards/accuracy_reward/mean": 0.734375,
      "rewards/accuracy_reward/std": 0.42969100922346115,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.09561806032434106,
      "stage2_sft_loss": 0.01580676135563408,
      "step": 168,
      "total_loss": 0.08728454890660942
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 292.25,
      "completions/max_terminated_length": 292.25,
      "completions/mean_length": 127.33203125,
      "completions/mean_terminated_length": 127.33203125,
      "completions/min_length": 62.25,
      "completions/min_terminated_length": 62.25,
      "epoch": 0.7222222222222222,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 2.1872189970272204,
      "grpo_loss": 0.006283853610511869,
      "kl": 0.077392578125,
      "learning_rate": 4.491030185478976e-06,
      "loss": 0.069,
      "num_tokens": 11392709.0,
      "reward": 2.75,
      "reward_std": 0.192000113427639,
      "rewards/accuracy_reward/mean": 0.75,
      "rewards/accuracy_reward/std": 0.4205075278878212,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.06108791567385197,
      "stage2_sft_loss": 0.019755267803702736,
      "step": 169,
      "total_loss": 0.06934729870408773
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 246.5,
      "completions/max_terminated_length": 246.5,
      "completions/mean_length": 120.265625,
      "completions/mean_terminated_length": 120.265625,
      "completions/min_length": 53.5,
      "completions/min_terminated_length": 53.5,
      "epoch": 0.7264957264957265,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 2.5086249757515575,
      "grpo_loss": 0.008289381628856063,
      "kl": 0.090087890625,
      "learning_rate": 4.3667994193637794e-06,
      "loss": 0.0616,
      "num_tokens": 11455393.0,
      "reward": 2.859375,
      "reward_std": 0.1837237998843193,
      "rewards/accuracy_reward/mean": 0.859375,
      "rewards/accuracy_reward/std": 0.34254971891641617,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.05750684905797243,
      "stage2_sft_loss": 0.004096082295291126,
      "step": 170,
      "total_loss": 0.0662058424204588
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 271.75,
      "completions/max_terminated_length": 271.75,
      "completions/mean_length": 139.2265625,
      "completions/mean_terminated_length": 139.2265625,
      "completions/min_length": 58.75,
      "completions/min_terminated_length": 58.75,
      "epoch": 0.7307692307692307,
      "frac_reward_zero_std": 0.53125,
      "grad_norm": 2.4597641282544047,
      "grpo_loss": 0.0308269634260796,
      "kl": 0.0999755859375,
      "learning_rate": 4.2438293431432665e-06,
      "loss": 0.0742,
      "num_tokens": 11523891.0,
      "reward": 2.80078125,
      "reward_std": 0.22540531679987907,
      "rewards/accuracy_reward/mean": 0.80078125,
      "rewards/accuracy_reward/std": 0.40003233402967453,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.04699529241770506,
      "stage2_sft_loss": 0.010027769778389484,
      "step": 171,
      "total_loss": 0.07882503373548388
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 310.5,
      "completions/max_terminated_length": 310.5,
      "completions/mean_length": 140.03515625,
      "completions/mean_terminated_length": 140.03515625,
      "completions/min_length": 68.25,
      "completions/min_terminated_length": 68.25,
      "epoch": 0.7350427350427351,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 2.313340926620552,
      "grpo_loss": -0.016913561208639294,
      "kl": 0.09033203125,
      "learning_rate": 4.12214747707527e-06,
      "loss": 0.0633,
      "num_tokens": 11593892.0,
      "reward": 2.78515625,
      "reward_std": 0.28011762723326683,
      "rewards/accuracy_reward/mean": 0.78515625,
      "rewards/accuracy_reward/std": 0.4029533341526985,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.10063559375703335,
      "stage2_sft_loss": 0.004137711774092168,
      "step": 172,
      "total_loss": 0.08413580618798733
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 249.25,
      "completions/max_terminated_length": 249.25,
      "completions/mean_length": 126.921875,
      "completions/mean_terminated_length": 126.921875,
      "completions/min_length": 69.25,
      "completions/min_terminated_length": 69.25,
      "epoch": 0.7393162393162394,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 2.4017578116497065,
      "grpo_loss": -0.021779871574835852,
      "kl": 0.0828857421875,
      "learning_rate": 4.001781053120863e-06,
      "loss": 0.0805,
      "num_tokens": 11659816.0,
      "reward": 2.8671875,
      "reward_std": 0.1816292516887188,
      "rewards/accuracy_reward/mean": 0.8671875,
      "rewards/accuracy_reward/std": 0.3129511810839176,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.13278495729900897,
      "stage2_sft_loss": 0.012614698614925146,
      "step": 173,
      "total_loss": 0.11226655612699687
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 241.25,
      "completions/max_terminated_length": 241.25,
      "completions/mean_length": 128.96484375,
      "completions/mean_terminated_length": 128.96484375,
      "completions/min_length": 59.75,
      "completions/min_terminated_length": 59.75,
      "epoch": 0.7435897435897436,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 2.5955191634448274,
      "grpo_loss": 0.009170918841846287,
      "kl": 0.098876953125,
      "learning_rate": 3.882757008849936e-06,
      "loss": 0.0864,
      "num_tokens": 11725263.0,
      "reward": 2.6953125,
      "reward_std": 0.2296190746128559,
      "rewards/accuracy_reward/mean": 0.6953125,
      "rewards/accuracy_reward/std": 0.42350123077630997,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.05286407144740224,
      "stage2_sft_loss": 0.02471150812925771,
      "step": 174,
      "total_loss": 0.06450614053755999
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 315.5,
      "completions/max_terminated_length": 315.5,
      "completions/mean_length": 128.84765625,
      "completions/mean_terminated_length": 128.84765625,
      "completions/min_length": 60.0,
      "completions/min_terminated_length": 60.0,
      "epoch": 0.7478632478632479,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 2.1001294510887507,
      "grpo_loss": 0.004248449899023399,
      "kl": 0.087890625,
      "learning_rate": 3.7651019814126656e-06,
      "loss": 0.0612,
      "num_tokens": 11791504.0,
      "reward": 2.84375,
      "reward_std": 0.1714957458898425,
      "rewards/accuracy_reward/mean": 0.84375,
      "rewards/accuracy_reward/std": 0.33109506219625473,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.02404563978780061,
      "stage2_sft_loss": 0.005888056548428722,
      "step": 175,
      "total_loss": 0.028882895596325397
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 246.75,
      "completions/max_terminated_length": 246.75,
      "completions/mean_length": 126.859375,
      "completions/mean_terminated_length": 126.859375,
      "completions/min_length": 68.25,
      "completions/min_terminated_length": 68.25,
      "epoch": 0.7521367521367521,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 2.467756391649865,
      "grpo_loss": 0.004664378764573485,
      "kl": 0.0887451171875,
      "learning_rate": 3.6488423015782128e-06,
      "loss": 0.0628,
      "num_tokens": 11857804.0,
      "reward": 2.87109375,
      "reward_std": 0.14256631769239902,
      "rewards/accuracy_reward/mean": 0.87109375,
      "rewards/accuracy_reward/std": 0.31893764436244965,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.041731708915904164,
      "stage2_sft_loss": 0.022018967356416397,
      "step": 176,
      "total_loss": 0.04859798448160291
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 246.75,
      "completions/max_terminated_length": 246.75,
      "completions/mean_length": 134.4375,
      "completions/mean_terminated_length": 134.4375,
      "completions/min_length": 54.25,
      "completions/min_terminated_length": 54.25,
      "epoch": 0.7564102564102564,
      "frac_reward_zero_std": 0.46875,
      "grad_norm": 2.467385983440296,
      "grpo_loss": -0.0026853438175749034,
      "kl": 0.0904541015625,
      "learning_rate": 3.534003987842005e-06,
      "loss": 0.0838,
      "num_tokens": 11925716.0,
      "reward": 2.8203125,
      "reward_std": 0.22935950569808483,
      "rewards/accuracy_reward/mean": 0.8203125,
      "rewards/accuracy_reward/std": 0.347512349486351,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.06543307611718774,
      "stage2_sft_loss": 0.02554303167562466,
      "step": 177,
      "total_loss": 0.0653020367026329
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 299.75,
      "completions/max_terminated_length": 299.75,
      "completions/mean_length": 137.3984375,
      "completions/mean_terminated_length": 137.3984375,
      "completions/min_length": 59.25,
      "completions/min_terminated_length": 59.25,
      "epoch": 0.7606837606837606,
      "frac_reward_zero_std": 0.40625,
      "grad_norm": 2.115557905994792,
      "grpo_loss": -0.04446130109135993,
      "kl": 0.0858154296875,
      "learning_rate": 3.4206127406028744e-06,
      "loss": 0.0473,
      "num_tokens": 11993618.0,
      "reward": 2.86328125,
      "reward_std": 0.2429143339395523,
      "rewards/accuracy_reward/mean": 0.86328125,
      "rewards/accuracy_reward/std": 0.34463661164045334,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.06367250951007009,
      "stage2_sft_loss": 0.008203808916732669,
      "step": 178,
      "total_loss": 0.020031588152050972
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 297.25,
      "completions/max_terminated_length": 297.25,
      "completions/mean_length": 121.7890625,
      "completions/mean_terminated_length": 121.7890625,
      "completions/min_length": 68.75,
      "completions/min_terminated_length": 68.75,
      "epoch": 0.7649572649572649,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 2.709248900203319,
      "grpo_loss": 0.0030166495707817376,
      "kl": 0.101806640625,
      "learning_rate": 3.308693936411421e-06,
      "loss": 0.0637,
      "num_tokens": 12056300.0,
      "reward": 2.7890625,
      "reward_std": 0.1803000308573246,
      "rewards/accuracy_reward/mean": 0.7890625,
      "rewards/accuracy_reward/std": 0.39430833607912064,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.05242747673764825,
      "stage2_sft_loss": 0.021223331823421177,
      "step": 179,
      "total_loss": 0.05756645882502198
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 243.5,
      "completions/max_terminated_length": 243.5,
      "completions/mean_length": 125.8515625,
      "completions/mean_terminated_length": 125.8515625,
      "completions/min_length": 67.75,
      "completions/min_terminated_length": 67.75,
      "epoch": 0.7692307692307693,
      "frac_reward_zero_std": 0.46875,
      "grad_norm": 2.4130085372673205,
      "grpo_loss": 0.0032237190462183207,
      "kl": 0.0877685546875,
      "learning_rate": 3.1982726222908046e-06,
      "loss": 0.0791,
      "num_tokens": 12122358.0,
      "reward": 2.80078125,
      "reward_std": 0.2468533217906952,
      "rewards/accuracy_reward/mean": 0.80078125,
      "rewards/accuracy_reward/std": 0.395585760474205,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.05290105380117893,
      "stage2_sft_loss": 0.0006931633852218511,
      "step": 180,
      "total_loss": 0.056194088188931346
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 215.0,
      "completions/max_terminated_length": 215.0,
      "completions/mean_length": 118.58203125,
      "completions/mean_terminated_length": 118.58203125,
      "completions/min_length": 64.5,
      "completions/min_terminated_length": 64.5,
      "epoch": 0.7735042735042735,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 2.4269696259509774,
      "grpo_loss": 0.01719996099564014,
      "kl": 0.08306884765625,
      "learning_rate": 3.089373510131354e-06,
      "loss": 0.0624,
      "num_tokens": 12184427.0,
      "reward": 2.84375,
      "reward_std": 0.17609265074133873,
      "rewards/accuracy_reward/mean": 0.84375,
      "rewards/accuracy_reward/std": 0.34457017853856087,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.028878788463771343,
      "stage2_sft_loss": 0.007072446060192306,
      "step": 181,
      "total_loss": 0.04678599291946739
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 241.0,
      "completions/max_terminated_length": 241.0,
      "completions/mean_length": 127.0703125,
      "completions/mean_terminated_length": 127.0703125,
      "completions/min_length": 54.25,
      "completions/min_terminated_length": 54.25,
      "epoch": 0.7777777777777778,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 2.7281001020757256,
      "grpo_loss": 0.04298811010085046,
      "kl": 0.0975341796875,
      "learning_rate": 2.9820209711600858e-06,
      "loss": 0.063,
      "num_tokens": 12250877.0,
      "reward": 2.79296875,
      "reward_std": 0.2506577782332897,
      "rewards/accuracy_reward/mean": 0.79296875,
      "rewards/accuracy_reward/std": 0.3889293447136879,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.05901889782398939,
      "stage2_sft_loss": 0.0066388859413564205,
      "step": 182,
      "total_loss": 0.10267089866101742
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 408.5,
      "completions/max_terminated_length": 408.5,
      "completions/mean_length": 137.64453125,
      "completions/mean_terminated_length": 137.64453125,
      "completions/min_length": 59.25,
      "completions/min_terminated_length": 59.25,
      "epoch": 0.782051282051282,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 2.40444782166608,
      "grpo_loss": -0.008797756774583831,
      "kl": 0.10546875,
      "learning_rate": 2.876239030486554e-06,
      "loss": 0.0907,
      "num_tokens": 12319018.0,
      "reward": 2.78515625,
      "reward_std": 0.21831095963716507,
      "rewards/accuracy_reward/mean": 0.78515625,
      "rewards/accuracy_reward/std": 0.3904332146048546,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.05148952128365636,
      "stage2_sft_loss": 0.003067808851483278,
      "step": 183,
      "total_loss": 0.0429985448718071
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 267.25,
      "completions/max_terminated_length": 267.25,
      "completions/mean_length": 129.82421875,
      "completions/mean_terminated_length": 129.82421875,
      "completions/min_length": 66.25,
      "completions/min_terminated_length": 66.25,
      "epoch": 0.7863247863247863,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 2.278999858369521,
      "grpo_loss": -0.005033229797845706,
      "kl": 0.0814208984375,
      "learning_rate": 2.7720513617260857e-06,
      "loss": 0.0612,
      "num_tokens": 12384461.0,
      "reward": 2.80078125,
      "reward_std": 0.23368556797504425,
      "rewards/accuracy_reward/mean": 0.80078125,
      "rewards/accuracy_reward/std": 0.3904718607664108,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.08307978138327599,
      "stage2_sft_loss": 0.014651871661044424,
      "step": 184,
      "total_loss": 0.07951173838227987
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 268.25,
      "completions/max_terminated_length": 268.25,
      "completions/mean_length": 141.74609375,
      "completions/mean_terminated_length": 141.74609375,
      "completions/min_length": 66.25,
      "completions/min_terminated_length": 66.25,
      "epoch": 0.7905982905982906,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 2.248355130508807,
      "grpo_loss": 0.007732820464298129,
      "kl": 0.0924072265625,
      "learning_rate": 2.669481281701739e-06,
      "loss": 0.0907,
      "num_tokens": 12453828.0,
      "reward": 2.75390625,
      "reward_std": 0.25354476645588875,
      "rewards/accuracy_reward/mean": 0.75390625,
      "rewards/accuracy_reward/std": 0.38477926701307297,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.044389708898961544,
      "stage2_sft_loss": 0.007064787938361405,
      "step": 185,
      "total_loss": 0.052829005755484104
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 234.75,
      "completions/max_terminated_length": 234.75,
      "completions/mean_length": 134.88671875,
      "completions/mean_terminated_length": 134.88671875,
      "completions/min_length": 61.75,
      "completions/min_terminated_length": 61.75,
      "epoch": 0.7948717948717948,
      "frac_reward_zero_std": 0.46875,
      "grad_norm": 2.4596631881752544,
      "grpo_loss": -0.02575137373059988,
      "kl": 0.096923828125,
      "learning_rate": 2.5685517452260566e-06,
      "loss": 0.0584,
      "num_tokens": 12523087.0,
      "reward": 2.7734375,
      "reward_std": 0.22764958441257477,
      "rewards/accuracy_reward/mean": 0.7734375,
      "rewards/accuracy_reward/std": 0.40480412542819977,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.0866909739561379,
      "stage2_sft_loss": 0.013630262881633826,
      "step": 186,
      "total_loss": 0.06230262666940689
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 242.75,
      "completions/max_terminated_length": 242.75,
      "completions/mean_length": 132.58984375,
      "completions/mean_terminated_length": 132.58984375,
      "completions/min_length": 81.25,
      "completions/min_terminated_length": 81.25,
      "epoch": 0.7991452991452992,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 2.165191336137466,
      "grpo_loss": -0.02968689359840937,
      "kl": 0.093017578125,
      "learning_rate": 2.469285339963892e-06,
      "loss": 0.0461,
      "num_tokens": 12589974.0,
      "reward": 2.83984375,
      "reward_std": 0.19450394995510578,
      "rewards/accuracy_reward/mean": 0.83984375,
      "rewards/accuracy_reward/std": 0.34064802527427673,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.056919783586636186,
      "stage2_sft_loss": 0.007759942389384378,
      "step": 187,
      "total_loss": 0.02800888242200017
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 306.25,
      "completions/max_terminated_length": 306.25,
      "completions/mean_length": 130.83984375,
      "completions/mean_terminated_length": 130.83984375,
      "completions/min_length": 73.25,
      "completions/min_terminated_length": 73.25,
      "epoch": 0.8034188034188035,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 2.226208338506282,
      "grpo_loss": -0.004410345209180377,
      "kl": 0.0858154296875,
      "learning_rate": 2.371704281377335e-06,
      "loss": 0.0448,
      "num_tokens": 12657693.0,
      "reward": 2.8203125,
      "reward_std": 0.2376309186220169,
      "rewards/accuracy_reward/mean": 0.8203125,
      "rewards/accuracy_reward/std": 0.36812880635261536,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.053088022861629725,
      "stage2_sft_loss": 0.00971882028170512,
      "step": 188,
      "total_loss": 0.049649559427052736
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 273.5,
      "completions/max_terminated_length": 273.5,
      "completions/mean_length": 140.78125,
      "completions/mean_terminated_length": 140.78125,
      "completions/min_length": 65.75,
      "completions/min_terminated_length": 65.75,
      "epoch": 0.8076923076923077,
      "frac_reward_zero_std": 0.53125,
      "grad_norm": 2.3493601611104364,
      "grpo_loss": -0.006723656959366053,
      "kl": 0.09423828125,
      "learning_rate": 2.275830407754006e-06,
      "loss": 0.0616,
      "num_tokens": 12727141.0,
      "reward": 2.7421875,
      "reward_std": 0.20213755778968334,
      "rewards/accuracy_reward/mean": 0.7421875,
      "rewards/accuracy_reward/std": 0.42482397705316544,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.06993068754673004,
      "stage2_sft_loss": 0.02032813218102092,
      "step": 189,
      "total_loss": 0.06523984298110008
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 230.5,
      "completions/max_terminated_length": 230.5,
      "completions/mean_length": 123.16796875,
      "completions/mean_terminated_length": 123.16796875,
      "completions/min_length": 63.5,
      "completions/min_terminated_length": 63.5,
      "epoch": 0.811965811965812,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 2.3089248428431515,
      "grpo_loss": 0.029169104644097388,
      "kl": 0.0794677734375,
      "learning_rate": 2.1816851753197023e-06,
      "loss": 0.0534,
      "num_tokens": 12790576.0,
      "reward": 2.8671875,
      "reward_std": 0.17688901163637638,
      "rewards/accuracy_reward/mean": 0.8671875,
      "rewards/accuracy_reward/std": 0.3410636931657791,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.06213785335421562,
      "stage2_sft_loss": 0.0018466471155988984,
      "step": 190,
      "total_loss": 0.09149162436369807
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 286.5,
      "completions/max_terminated_length": 286.5,
      "completions/mean_length": 157.46484375,
      "completions/mean_terminated_length": 157.46484375,
      "completions/min_length": 72.5,
      "completions/min_terminated_length": 72.5,
      "epoch": 0.8162393162393162,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 2.198067341450511,
      "grpo_loss": -0.0376204761560075,
      "kl": 0.08465576171875,
      "learning_rate": 2.08928965343659e-06,
      "loss": 0.0543,
      "num_tokens": 12865439.0,
      "reward": 2.83203125,
      "reward_std": 0.2489478625357151,
      "rewards/accuracy_reward/mean": 0.83203125,
      "rewards/accuracy_reward/std": 0.3710259050130844,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.08339540008455515,
      "stage2_sft_loss": 0.006528811136377044,
      "step": 191,
      "total_loss": 0.046427804976701736
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 317.25,
      "completions/max_terminated_length": 317.25,
      "completions/mean_length": 137.4609375,
      "completions/mean_terminated_length": 137.4609375,
      "completions/min_length": 71.25,
      "completions/min_terminated_length": 71.25,
      "epoch": 0.8205128205128205,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 2.237880793803582,
      "grpo_loss": 0.009976790606742725,
      "kl": 0.08349609375,
      "learning_rate": 1.9986645198879385e-06,
      "loss": 0.039,
      "num_tokens": 12935317.0,
      "reward": 2.8359375,
      "reward_std": 0.2210792675614357,
      "rewards/accuracy_reward/mean": 0.8359375,
      "rewards/accuracy_reward/std": 0.36333344131708145,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.08060388825833797,
      "stage2_sft_loss": 0.004419473567395471,
      "step": 192,
      "total_loss": 0.09102262475062162
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 264.75,
      "completions/max_terminated_length": 264.75,
      "completions/mean_length": 128.59765625,
      "completions/mean_terminated_length": 128.59765625,
      "completions/min_length": 60.75,
      "completions/min_terminated_length": 60.75,
      "epoch": 0.8247863247863247,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 2.3641231037433035,
      "grpo_loss": -0.023414782102918252,
      "kl": 0.0792236328125,
      "learning_rate": 1.9098300562505266e-06,
      "loss": 0.0626,
      "num_tokens": 13001822.0,
      "reward": 2.8515625,
      "reward_std": 0.17939137108623981,
      "rewards/accuracy_reward/mean": 0.8515625,
      "rewards/accuracy_reward/std": 0.3518161177635193,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.08140591345727444,
      "stage2_sft_loss": 0.0020642982563003898,
      "step": 193,
      "total_loss": 0.05819755978882313
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 251.25,
      "completions/max_terminated_length": 251.25,
      "completions/mean_length": 125.0,
      "completions/mean_terminated_length": 125.0,
      "completions/min_length": 65.75,
      "completions/min_terminated_length": 65.75,
      "epoch": 0.8290598290598291,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 2.215355384585231,
      "grpo_loss": 0.001619994145585224,
      "kl": 0.07733154296875,
      "learning_rate": 1.8228061433556866e-06,
      "loss": 0.0581,
      "num_tokens": 13067158.0,
      "reward": 2.8046875,
      "reward_std": 0.21318363025784492,
      "rewards/accuracy_reward/mean": 0.8046875,
      "rewards/accuracy_reward/std": 0.39349858462810516,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.02379709028173238,
      "stage2_sft_loss": 0.0009464863105677068,
      "step": 194,
      "total_loss": 0.025511732907034457
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 268.5,
      "completions/max_terminated_length": 268.5,
      "completions/mean_length": 131.46875,
      "completions/mean_terminated_length": 131.46875,
      "completions/min_length": 64.75,
      "completions/min_terminated_length": 64.75,
      "epoch": 0.8333333333333334,
      "frac_reward_zero_std": 0.46875,
      "grad_norm": 2.254564743687319,
      "grpo_loss": 0.017838238331023604,
      "kl": 0.0771484375,
      "learning_rate": 1.7376122568400533e-06,
      "loss": 0.0854,
      "num_tokens": 13133366.0,
      "reward": 2.83203125,
      "reward_std": 0.22882908582687378,
      "rewards/accuracy_reward/mean": 0.83203125,
      "rewards/accuracy_reward/std": 0.3620675168931484,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.032180753303691745,
      "stage2_sft_loss": 0.021464947407366708,
      "step": 195,
      "total_loss": 0.052165485452860594
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 266.25,
      "completions/max_terminated_length": 266.25,
      "completions/mean_length": 136.43359375,
      "completions/mean_terminated_length": 136.43359375,
      "completions/min_length": 69.0,
      "completions/min_terminated_length": 69.0,
      "epoch": 0.8376068376068376,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 2.3197178808051375,
      "grpo_loss": -0.03142386896070093,
      "kl": 0.102783203125,
      "learning_rate": 1.6542674627869738e-06,
      "loss": 0.0672,
      "num_tokens": 13201541.0,
      "reward": 2.81640625,
      "reward_std": 0.2131860852241516,
      "rewards/accuracy_reward/mean": 0.81640625,
      "rewards/accuracy_reward/std": 0.37990450114011765,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.055304642068222165,
      "stage2_sft_loss": 0.007718519467744045,
      "step": 196,
      "total_loss": 0.024652624037116766
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 370.75,
      "completions/max_terminated_length": 370.75,
      "completions/mean_length": 134.8828125,
      "completions/mean_terminated_length": 134.8828125,
      "completions/min_length": 53.0,
      "completions/min_terminated_length": 53.0,
      "epoch": 0.8418803418803419,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 2.860612537760405,
      "grpo_loss": -0.02284381364006549,
      "kl": 0.0767822265625,
      "learning_rate": 1.5727904134596084e-06,
      "loss": 0.0555,
      "num_tokens": 13269487.0,
      "reward": 2.81640625,
      "reward_std": 0.18820202350616455,
      "rewards/accuracy_reward/mean": 0.81640625,
      "rewards/accuracy_reward/std": 0.3756125792860985,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.07799512334167957,
      "stage2_sft_loss": 0.015525784227065742,
      "step": 197,
      "total_loss": 0.056703890673816204
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 263.75,
      "completions/max_terminated_length": 263.75,
      "completions/mean_length": 135.46484375,
      "completions/mean_terminated_length": 135.46484375,
      "completions/min_length": 66.75,
      "completions/min_terminated_length": 66.75,
      "epoch": 0.8461538461538461,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 1.8339436198084127,
      "grpo_loss": -0.00017789613048080355,
      "kl": 0.06561279296875,
      "learning_rate": 1.4931993431266056e-06,
      "loss": 0.0511,
      "num_tokens": 13338694.0,
      "reward": 2.86328125,
      "reward_std": 0.1916169673204422,
      "rewards/accuracy_reward/mean": 0.86328125,
      "rewards/accuracy_reward/std": 0.33633895218372345,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.04241414717398584,
      "stage2_sft_loss": 0.007416335574816912,
      "step": 198,
      "total_loss": 0.042977884877473116
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 275.5,
      "completions/max_terminated_length": 275.5,
      "completions/mean_length": 133.71875,
      "completions/mean_terminated_length": 133.71875,
      "completions/min_length": 73.0,
      "completions/min_terminated_length": 73.0,
      "epoch": 0.8504273504273504,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 2.321539071874128,
      "grpo_loss": 0.03887829327140935,
      "kl": 0.08251953125,
      "learning_rate": 1.4155120639813392e-06,
      "loss": 0.0418,
      "num_tokens": 13405974.0,
      "reward": 2.78515625,
      "reward_std": 0.2688070461153984,
      "rewards/accuracy_reward/mean": 0.78515625,
      "rewards/accuracy_reward/std": 0.4082975834608078,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.042024717666208744,
      "stage2_sft_loss": 0.010183911363128573,
      "step": 199,
      "total_loss": 0.08192139957100153
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 287.25,
      "completions/max_terminated_length": 287.25,
      "completions/mean_length": 134.9609375,
      "completions/mean_terminated_length": 134.9609375,
      "completions/min_length": 60.75,
      "completions/min_terminated_length": 60.75,
      "epoch": 0.8547008547008547,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 1.9695463119039087,
      "grpo_loss": -0.029325536685064435,
      "kl": 0.0819091796875,
      "learning_rate": 1.339745962155613e-06,
      "loss": 0.0349,
      "num_tokens": 13473412.0,
      "reward": 2.7890625,
      "reward_std": 0.2301519438624382,
      "rewards/accuracy_reward/mean": 0.7890625,
      "rewards/accuracy_reward/std": 0.40658413618803024,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.06595750187989324,
      "stage2_sft_loss": 0.018286966653249692,
      "step": 200,
      "total_loss": 0.038460663286969066
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 229.5,
      "completions/max_terminated_length": 229.5,
      "completions/mean_length": 129.95703125,
      "completions/mean_terminated_length": 129.95703125,
      "completions/min_length": 56.75,
      "completions/min_terminated_length": 56.75,
      "epoch": 0.8589743589743589,
      "frac_reward_zero_std": 0.46875,
      "grad_norm": 2.457707498128624,
      "grpo_loss": 0.015741711657028645,
      "kl": 0.0843505859375,
      "learning_rate": 1.2659179938287035e-06,
      "loss": 0.0693,
      "num_tokens": 13539713.0,
      "reward": 2.80078125,
      "reward_std": 0.24237754568457603,
      "rewards/accuracy_reward/mean": 0.80078125,
      "rewards/accuracy_reward/std": 0.3829581290483475,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.0587872676551342,
      "stage2_sft_loss": 0.006722187114064582,
      "step": 201,
      "total_loss": 0.07520119519904256
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 220.0,
      "completions/max_terminated_length": 220.0,
      "completions/mean_length": 125.01171875,
      "completions/mean_terminated_length": 125.01171875,
      "completions/min_length": 62.25,
      "completions/min_terminated_length": 62.25,
      "epoch": 0.8632478632478633,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 1.8597591668705884,
      "grpo_loss": -0.03844989475328475,
      "kl": 0.0792236328125,
      "learning_rate": 1.19404468143262e-06,
      "loss": 0.0488,
      "num_tokens": 13605524.0,
      "reward": 2.828125,
      "reward_std": 0.23026816546916962,
      "rewards/accuracy_reward/mean": 0.828125,
      "rewards/accuracy_reward/std": 0.374307245016098,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.07022882532328367,
      "stage2_sft_loss": 0.00958529122362961,
      "step": 202,
      "total_loss": 0.03273745905607939
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 236.75,
      "completions/max_terminated_length": 236.75,
      "completions/mean_length": 140.76171875,
      "completions/mean_terminated_length": 140.76171875,
      "completions/min_length": 67.75,
      "completions/min_terminated_length": 67.75,
      "epoch": 0.8675213675213675,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 2.294920534337738,
      "grpo_loss": -0.033718791615683585,
      "kl": 0.0762939453125,
      "learning_rate": 1.124142109954459e-06,
      "loss": 0.0614,
      "num_tokens": 13674743.0,
      "reward": 2.83203125,
      "reward_std": 0.2108270674943924,
      "rewards/accuracy_reward/mean": 0.83203125,
      "rewards/accuracy_reward/std": 0.3715285286307335,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.09230023482814431,
      "stage2_sft_loss": 0.012318415741901845,
      "step": 203,
      "total_loss": 0.059813279658555984
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 211.5,
      "completions/max_terminated_length": 211.5,
      "completions/mean_length": 113.66796875,
      "completions/mean_terminated_length": 113.66796875,
      "completions/min_length": 58.5,
      "completions/min_terminated_length": 58.5,
      "epoch": 0.8717948717948718,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 2.035886437312805,
      "grpo_loss": 0.02149005071260035,
      "kl": 0.0849609375,
      "learning_rate": 1.0562259233366334e-06,
      "loss": 0.0525,
      "num_tokens": 13736002.0,
      "reward": 2.87890625,
      "reward_std": 0.1590056698769331,
      "rewards/accuracy_reward/mean": 0.87890625,
      "rewards/accuracy_reward/std": 0.31372954696416855,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.033063370268791914,
      "stage2_sft_loss": 0.003003982281370554,
      "step": 204,
      "total_loss": 0.05485381884500384
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 234.75,
      "completions/max_terminated_length": 234.75,
      "completions/mean_length": 117.3984375,
      "completions/mean_terminated_length": 117.3984375,
      "completions/min_length": 58.0,
      "completions/min_terminated_length": 58.0,
      "epoch": 0.8760683760683761,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 2.30690792423194,
      "grpo_loss": 0.0016019267350202426,
      "kl": 0.086669921875,
      "learning_rate": 9.903113209758098e-07,
      "loss": 0.0554,
      "num_tokens": 13798288.0,
      "reward": 2.91796875,
      "reward_std": 0.11139655206352472,
      "rewards/accuracy_reward/mean": 0.91796875,
      "rewards/accuracy_reward/std": 0.2633422575891018,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.04401658312417567,
      "stage2_sft_loss": 0.0009024485862028087,
      "step": 205,
      "total_loss": 0.04570875607896596
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 264.5,
      "completions/max_terminated_length": 264.5,
      "completions/mean_length": 113.03125,
      "completions/mean_terminated_length": 113.03125,
      "completions/min_length": 49.75,
      "completions/min_terminated_length": 49.75,
      "epoch": 0.8803418803418803,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 2.3831642997829166,
      "grpo_loss": 0.0045173391699790955,
      "kl": 0.0860595703125,
      "learning_rate": 9.264130543213512e-07,
      "loss": 0.0482,
      "num_tokens": 13859392.0,
      "reward": 2.828125,
      "reward_std": 0.17069938778877258,
      "rewards/accuracy_reward/mean": 0.828125,
      "rewards/accuracy_reward/std": 0.37033187597990036,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.07017236109822989,
      "stage2_sft_loss": 0.02451086524524726,
      "step": 206,
      "total_loss": 0.07714078575372696
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 297.5,
      "completions/max_terminated_length": 297.5,
      "completions/mean_length": 138.5859375,
      "completions/mean_terminated_length": 138.5859375,
      "completions/min_length": 60.5,
      "completions/min_terminated_length": 60.5,
      "epoch": 0.8846153846153846,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 2.072596487997066,
      "grpo_loss": -0.013341820711502805,
      "kl": 0.0755615234375,
      "learning_rate": 8.645454235739903e-07,
      "loss": 0.0576,
      "num_tokens": 13928598.0,
      "reward": 2.78125,
      "reward_std": 0.17544355988502502,
      "rewards/accuracy_reward/mean": 0.78125,
      "rewards/accuracy_reward/std": 0.38894475251436234,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.0771252615377307,
      "stage2_sft_loss": 0.025300035646068864,
      "step": 207,
      "total_loss": 0.06631344370543957
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 298.0,
      "completions/max_terminated_length": 298.0,
      "completions/mean_length": 135.73828125,
      "completions/mean_terminated_length": 135.73828125,
      "completions/min_length": 70.5,
      "completions/min_terminated_length": 70.5,
      "epoch": 0.8888888888888888,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 2.3783164154533063,
      "grpo_loss": -0.025427510030567646,
      "kl": 0.06793212890625,
      "learning_rate": 8.047222744854943e-07,
      "loss": 0.0604,
      "num_tokens": 13997835.0,
      "reward": 2.84375,
      "reward_std": 0.1522856391966343,
      "rewards/accuracy_reward/mean": 0.84375,
      "rewards/accuracy_reward/std": 0.34538276866078377,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.07427823357284069,
      "stage2_sft_loss": 0.0030603897521359613,
      "step": 208,
      "total_loss": 0.049156763125211
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 296.5,
      "completions/max_terminated_length": 296.5,
      "completions/mean_length": 129.3203125,
      "completions/mean_terminated_length": 129.3203125,
      "completions/min_length": 56.0,
      "completions/min_terminated_length": 56.0,
      "epoch": 0.8931623931623932,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 2.82655021936802,
      "grpo_loss": 0.020664230061811395,
      "kl": 0.1365966796875,
      "learning_rate": 7.46956995260033e-07,
      "loss": 0.074,
      "num_tokens": 14064381.0,
      "reward": 2.8125,
      "reward_std": 0.19621141999959946,
      "rewards/accuracy_reward/mean": 0.8125,
      "rewards/accuracy_reward/std": 0.3881358355283737,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.03121958440169692,
      "stage2_sft_loss": 0.008896204843040323,
      "step": 209,
      "total_loss": 0.052773436065763235
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 279.0,
      "completions/max_terminated_length": 279.0,
      "completions/mean_length": 126.23046875,
      "completions/mean_terminated_length": 126.23046875,
      "completions/min_length": 59.0,
      "completions/min_terminated_length": 59.0,
      "epoch": 0.8974358974358975,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 2.9415317368519593,
      "grpo_loss": 0.041792053991230205,
      "kl": 0.0849609375,
      "learning_rate": 6.912625135579587e-07,
      "loss": 0.0899,
      "num_tokens": 14128464.0,
      "reward": 2.875,
      "reward_std": 0.11796049773693085,
      "rewards/accuracy_reward/mean": 0.875,
      "rewards/accuracy_reward/std": 0.32698625698685646,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.07522869762033224,
      "stage2_sft_loss": 0.010016409680247307,
      "step": 210,
      "total_loss": 0.11802239343523979
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 288.25,
      "completions/max_terminated_length": 288.25,
      "completions/mean_length": 148.6484375,
      "completions/mean_terminated_length": 148.6484375,
      "completions/min_length": 68.0,
      "completions/min_terminated_length": 68.0,
      "epoch": 0.9017094017094017,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 2.7130622095694483,
      "grpo_loss": -0.009939641153323464,
      "kl": 0.07244873046875,
      "learning_rate": 6.37651293602628e-07,
      "loss": 0.0868,
      "num_tokens": 14201598.0,
      "reward": 2.7578125,
      "reward_std": 0.22685321792960167,
      "rewards/accuracy_reward/mean": 0.7578125,
      "rewards/accuracy_reward/std": 0.391734354197979,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.05721112829633057,
      "stage2_sft_loss": 0.005386116390582174,
      "step": 211,
      "total_loss": 0.04781010281294584
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 246.25,
      "completions/max_terminated_length": 246.25,
      "completions/mean_length": 124.21484375,
      "completions/mean_terminated_length": 124.21484375,
      "completions/min_length": 56.5,
      "completions/min_terminated_length": 56.5,
      "epoch": 0.905982905982906,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 4.807508789968689,
      "grpo_loss": -0.016500203098985367,
      "kl": 0.158935546875,
      "learning_rate": 5.861353333909692e-07,
      "loss": 0.0623,
      "num_tokens": 14266213.0,
      "reward": 2.80859375,
      "reward_std": 0.23752107471227646,
      "rewards/accuracy_reward/mean": 0.80859375,
      "rewards/accuracy_reward/std": 0.38669662177562714,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.044385235058143735,
      "stage2_sft_loss": 0.005123868337250315,
      "step": 212,
      "total_loss": 0.028397418092936277
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 320.75,
      "completions/max_terminated_length": 320.75,
      "completions/mean_length": 127.9140625,
      "completions/mean_terminated_length": 127.9140625,
      "completions/min_length": 68.25,
      "completions/min_terminated_length": 68.25,
      "epoch": 0.9102564102564102,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 2.0301737121967856,
      "grpo_loss": -0.00042418597149662673,
      "kl": 0.06964111328125,
      "learning_rate": 5.367261620083575e-07,
      "loss": 0.0551,
      "num_tokens": 14331495.0,
      "reward": 2.8203125,
      "reward_std": 0.20976869761943817,
      "rewards/accuracy_reward/mean": 0.8203125,
      "rewards/accuracy_reward/std": 0.3742370903491974,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.05549457389861345,
      "stage2_sft_loss": 0.005932546828262275,
      "step": 213,
      "total_loss": 0.055663644336164
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 224.25,
      "completions/max_terminated_length": 224.25,
      "completions/mean_length": 123.359375,
      "completions/mean_terminated_length": 123.359375,
      "completions/min_length": 63.0,
      "completions/min_terminated_length": 63.0,
      "epoch": 0.9145299145299145,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 2.1641000763673683,
      "grpo_loss": -0.008345429931068793,
      "kl": 0.08563232421875,
      "learning_rate": 4.894348370484648e-07,
      "loss": 0.0608,
      "num_tokens": 14395075.0,
      "reward": 2.83203125,
      "reward_std": 0.18937908113002777,
      "rewards/accuracy_reward/mean": 0.83203125,
      "rewards/accuracy_reward/std": 0.36685848236083984,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.050493769347667694,
      "stage2_sft_loss": 0.0014829118590569124,
      "step": 214,
      "total_loss": 0.04229663033038378
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 298.75,
      "completions/max_terminated_length": 298.75,
      "completions/mean_length": 124.5859375,
      "completions/mean_terminated_length": 124.5859375,
      "completions/min_length": 59.5,
      "completions/min_terminated_length": 59.5,
      "epoch": 0.9188034188034188,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 2.6263204572290344,
      "grpo_loss": 0.028134352585766464,
      "kl": 0.0804443359375,
      "learning_rate": 4.4427194213859216e-07,
      "loss": 0.0847,
      "num_tokens": 14460257.0,
      "reward": 2.8984375,
      "reward_std": 0.09324482083320618,
      "rewards/accuracy_reward/mean": 0.8984375,
      "rewards/accuracy_reward/std": 0.3030460849404335,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.08545329654589295,
      "stage2_sft_loss": 0.0015856244317546953,
      "step": 215,
      "total_loss": 0.11374621279537678
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 248.25,
      "completions/max_terminated_length": 248.25,
      "completions/mean_length": 130.69921875,
      "completions/mean_terminated_length": 130.69921875,
      "completions/min_length": 69.25,
      "completions/min_terminated_length": 69.25,
      "epoch": 0.9230769230769231,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 2.1211135589296197,
      "grpo_loss": 0.009156979038380086,
      "kl": 0.06561279296875,
      "learning_rate": 4.012475845711106e-07,
      "loss": 0.0504,
      "num_tokens": 14526860.0,
      "reward": 2.88671875,
      "reward_std": 0.15703225508332253,
      "rewards/accuracy_reward/mean": 0.88671875,
      "rewards/accuracy_reward/std": 0.31127535179257393,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.06328852870501578,
      "stage2_sft_loss": 0.0017232074733328773,
      "step": 216,
      "total_loss": 0.07261782942805439
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 283.25,
      "completions/max_terminated_length": 283.25,
      "completions/mean_length": 136.74609375,
      "completions/mean_terminated_length": 136.74609375,
      "completions/min_length": 60.75,
      "completions/min_terminated_length": 60.75,
      "epoch": 0.9273504273504274,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 2.070331892621596,
      "grpo_loss": -0.06435195698577445,
      "kl": 0.0723876953125,
      "learning_rate": 3.603713930414676e-07,
      "loss": 0.0492,
      "num_tokens": 14594891.0,
      "reward": 2.79296875,
      "reward_std": 0.25289567187428474,
      "rewards/accuracy_reward/mean": 0.79296875,
      "rewards/accuracy_reward/std": 0.40503790229558945,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.04832777753472328,
      "stage2_sft_loss": 0.017169317688967567,
      "step": 217,
      "total_loss": -0.014307248464319855
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 277.0,
      "completions/max_terminated_length": 277.0,
      "completions/mean_length": 134.9921875,
      "completions/mean_terminated_length": 134.9921875,
      "completions/min_length": 75.0,
      "completions/min_terminated_length": 75.0,
      "epoch": 0.9316239316239316,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 2.0040656384269946,
      "grpo_loss": -0.053561781911412254,
      "kl": 0.076416015625,
      "learning_rate": 3.2165251549333585e-07,
      "loss": 0.0581,
      "num_tokens": 14663625.0,
      "reward": 2.890625,
      "reward_std": 0.1757119484245777,
      "rewards/accuracy_reward/mean": 0.890625,
      "rewards/accuracy_reward/std": 0.29402102530002594,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.04904615576379001,
      "stage2_sft_loss": 0.007902702882347512,
      "step": 218,
      "total_loss": -0.0037253551417961717
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 277.0,
      "completions/max_terminated_length": 277.0,
      "completions/mean_length": 132.75390625,
      "completions/mean_terminated_length": 132.75390625,
      "completions/min_length": 64.0,
      "completions/min_terminated_length": 64.0,
      "epoch": 0.9358974358974359,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 2.4721018783588886,
      "grpo_loss": -0.01120759492914658,
      "kl": 0.0736083984375,
      "learning_rate": 2.8509961707132496e-07,
      "loss": 0.0718,
      "num_tokens": 14730890.0,
      "reward": 2.7412109375,
      "reward_std": 0.17201999574899673,
      "rewards/accuracy_reward/mean": 0.74609375,
      "rewards/accuracy_reward/std": 0.4207110106945038,
      "rewards/format_reward/mean": 0.99609375,
      "rewards/format_reward/std": 0.03125,
      "rewards/tag_count_reward/mean": 0.9990234375,
      "rewards/tag_count_reward/std": 0.0078125,
      "stage1_sft_loss": 0.07889411831274629,
      "stage2_sft_loss": 0.022961543887504376,
      "step": 219,
      "total_loss": 0.06998268235474825
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 290.5,
      "completions/max_terminated_length": 290.5,
      "completions/mean_length": 143.5546875,
      "completions/mean_terminated_length": 143.5546875,
      "completions/min_length": 63.75,
      "completions/min_terminated_length": 63.75,
      "epoch": 0.9401709401709402,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 2.284720202352025,
      "grpo_loss": 0.03501165362831671,
      "kl": 0.07568359375,
      "learning_rate": 2.507208781817638e-07,
      "loss": 0.0477,
      "num_tokens": 14802496.0,
      "reward": 2.87109375,
      "reward_std": 0.20240348391234875,
      "rewards/accuracy_reward/mean": 0.87109375,
      "rewards/accuracy_reward/std": 0.3205883875489235,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.07232746039517224,
      "stage2_sft_loss": 0.004074504366144538,
      "step": 220,
      "total_loss": 0.107746567344293
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 269.5,
      "completions/max_terminated_length": 269.5,
      "completions/mean_length": 133.1796875,
      "completions/mean_terminated_length": 133.1796875,
      "completions/min_length": 73.0,
      "completions/min_terminated_length": 73.0,
      "epoch": 0.9444444444444444,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 2.3319446282196705,
      "grpo_loss": -0.0034089816035702825,
      "kl": 0.08306884765625,
      "learning_rate": 2.1852399266194312e-07,
      "loss": 0.0587,
      "num_tokens": 14869262.0,
      "reward": 2.859375,
      "reward_std": 0.18516533076763153,
      "rewards/accuracy_reward/mean": 0.859375,
      "rewards/accuracy_reward/std": 0.3469066470861435,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.056242790538817644,
      "stage2_sft_loss": 0.018294302353751846,
      "step": 221,
      "total_loss": 0.05466323997825384
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 422.25,
      "completions/max_terminated_length": 422.25,
      "completions/mean_length": 159.671875,
      "completions/mean_terminated_length": 159.671875,
      "completions/min_length": 57.75,
      "completions/min_terminated_length": 57.75,
      "epoch": 0.9487179487179487,
      "frac_reward_zero_std": 0.53125,
      "grad_norm": 2.361139064584806,
      "grpo_loss": 0.0038841749192215502,
      "kl": 0.07928466796875,
      "learning_rate": 1.885161660582746e-07,
      "loss": 0.057,
      "num_tokens": 14944802.0,
      "reward": 2.7734375,
      "reward_std": 0.2026655077934265,
      "rewards/accuracy_reward/mean": 0.7734375,
      "rewards/accuracy_reward/std": 0.417447067797184,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.0955225438810885,
      "stage2_sft_loss": 0.0009612532594474033,
      "step": 222,
      "total_loss": 0.09950284566730261
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 335.25,
      "completions/max_terminated_length": 335.25,
      "completions/mean_length": 152.96484375,
      "completions/mean_terminated_length": 152.96484375,
      "completions/min_length": 69.5,
      "completions/min_terminated_length": 69.5,
      "epoch": 0.9529914529914529,
      "frac_reward_zero_std": 0.53125,
      "grad_norm": 2.0296918687522303,
      "grpo_loss": -0.008389710797928274,
      "kl": 0.07568359375,
      "learning_rate": 1.6070411401370335e-07,
      "loss": 0.0514,
      "num_tokens": 15019097.0,
      "reward": 2.796875,
      "reward_std": 0.20726242195814848,
      "rewards/accuracy_reward/mean": 0.796875,
      "rewards/accuracy_reward/std": 0.35784538090229034,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.05725141195580363,
      "stage2_sft_loss": 0.006129985613370081,
      "step": 223,
      "total_loss": 0.049474698840640485
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 266.25,
      "completions/max_terminated_length": 266.25,
      "completions/mean_length": 134.58984375,
      "completions/mean_terminated_length": 134.58984375,
      "completions/min_length": 61.0,
      "completions/min_terminated_length": 61.0,
      "epoch": 0.9572649572649573,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 1.8660849704001432,
      "grpo_loss": 0.013021941791521385,
      "kl": 0.0694580078125,
      "learning_rate": 1.350940607647866e-07,
      "loss": 0.0332,
      "num_tokens": 15086920.0,
      "reward": 2.79296875,
      "reward_std": 0.1635986603796482,
      "rewards/accuracy_reward/mean": 0.79296875,
      "rewards/accuracy_reward/std": 0.40334802865982056,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.027745027327910066,
      "stage2_sft_loss": 0.008323597925482318,
      "step": 224,
      "total_loss": 0.041599329095333815
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 226.25,
      "completions/max_terminated_length": 226.25,
      "completions/mean_length": 124.17578125,
      "completions/mean_terminated_length": 124.17578125,
      "completions/min_length": 65.0,
      "completions/min_terminated_length": 65.0,
      "epoch": 0.9615384615384616,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 2.34498979328278,
      "grpo_loss": 0.00911492871819064,
      "kl": 0.08074951171875,
      "learning_rate": 1.1169173774871478e-07,
      "loss": 0.0702,
      "num_tokens": 15152061.0,
      "reward": 2.84375,
      "reward_std": 0.12468297965824604,
      "rewards/accuracy_reward/mean": 0.84375,
      "rewards/accuracy_reward/std": 0.33794204145669937,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.07473163260146976,
      "stage2_sft_loss": 0.004223723481118213,
      "step": 225,
      "total_loss": 0.08426893223077059
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 244.0,
      "completions/max_terminated_length": 244.0,
      "completions/mean_length": 115.07421875,
      "completions/mean_terminated_length": 115.07421875,
      "completions/min_length": 61.75,
      "completions/min_terminated_length": 61.75,
      "epoch": 0.9658119658119658,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 2.3385372776790243,
      "grpo_loss": -0.03898987057618797,
      "kl": 0.0703125,
      "learning_rate": 9.0502382320653e-08,
      "loss": 0.0556,
      "num_tokens": 15212408.0,
      "reward": 2.83984375,
      "reward_std": 0.11060019582509995,
      "rewards/accuracy_reward/mean": 0.83984375,
      "rewards/accuracy_reward/std": 0.345817930996418,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.07038992689922452,
      "stage2_sft_loss": 0.002880757765524322,
      "step": 226,
      "total_loss": 0.031688129995018244
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 268.0,
      "completions/max_terminated_length": 268.0,
      "completions/mean_length": 145.77734375,
      "completions/mean_terminated_length": 145.77734375,
      "completions/min_length": 74.5,
      "completions/min_terminated_length": 74.5,
      "epoch": 0.9700854700854701,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 2.438610062892225,
      "grpo_loss": -0.0443235896564147,
      "kl": 0.070068359375,
      "learning_rate": 7.153073658162646e-08,
      "loss": 0.0497,
      "num_tokens": 15282439.0,
      "reward": 2.734375,
      "reward_std": 0.2102927379310131,
      "rewards/accuracy_reward/mean": 0.734375,
      "rewards/accuracy_reward/std": 0.42461463809013367,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.03144627227447927,
      "stage2_sft_loss": 0.01161374260118464,
      "step": 227,
      "total_loss": -0.011715942644514143
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 251.75,
      "completions/max_terminated_length": 251.75,
      "completions/mean_length": 116.90234375,
      "completions/mean_terminated_length": 116.90234375,
      "completions/min_length": 59.75,
      "completions/min_terminated_length": 59.75,
      "epoch": 0.9743589743589743,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 1.8053523876604445,
      "grpo_loss": -0.039161166292615235,
      "kl": 0.07305908203125,
      "learning_rate": 5.4781046317267103e-08,
      "loss": 0.04,
      "num_tokens": 15345054.0,
      "reward": 2.8515625,
      "reward_std": 0.21607061475515366,
      "rewards/accuracy_reward/mean": 0.8515625,
      "rewards/accuracy_reward/std": 0.34781959280371666,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.049540508072823286,
      "stage2_sft_loss": 0.005793681345494406,
      "step": 228,
      "total_loss": 0.010958710685372353
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 335.25,
      "completions/max_terminated_length": 335.25,
      "completions/mean_length": 143.6328125,
      "completions/mean_terminated_length": 143.6328125,
      "completions/min_length": 52.75,
      "completions/min_terminated_length": 52.75,
      "epoch": 0.9786324786324786,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 2.1238058001256834,
      "grpo_loss": 0.021149674721527845,
      "kl": 0.075927734375,
      "learning_rate": 4.025706004760932e-08,
      "loss": 0.0641,
      "num_tokens": 15414184.0,
      "reward": 2.734375,
      "reward_std": 0.22738118842244148,
      "rewards/accuracy_reward/mean": 0.734375,
      "rewards/accuracy_reward/std": 0.39063628762960434,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.05295246001332998,
      "stage2_sft_loss": 0.008666668943988043,
      "step": 229,
      "total_loss": 0.0749688046053052
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 309.75,
      "completions/max_terminated_length": 309.75,
      "completions/mean_length": 150.81640625,
      "completions/mean_terminated_length": 150.81640625,
      "completions/min_length": 74.0,
      "completions/min_terminated_length": 74.0,
      "epoch": 0.9829059829059829,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 2.490990424347089,
      "grpo_loss": 0.007095444132573903,
      "kl": 0.0914306640625,
      "learning_rate": 2.796202818819871e-08,
      "loss": 0.0608,
      "num_tokens": 15486313.0,
      "reward": 2.7265625,
      "reward_std": 0.239877637475729,
      "rewards/accuracy_reward/mean": 0.7265625,
      "rewards/accuracy_reward/std": 0.4078166112303734,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.08487359434366226,
      "stage2_sft_loss": 0.018978525884449482,
      "step": 230,
      "total_loss": 0.09386688856466208
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 235.0,
      "completions/max_terminated_length": 235.0,
      "completions/mean_length": 134.30078125,
      "completions/mean_terminated_length": 134.30078125,
      "completions/min_length": 65.25,
      "completions/min_terminated_length": 65.25,
      "epoch": 0.9871794871794872,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 1.9811694517457321,
      "grpo_loss": -0.022785615394241177,
      "kl": 0.0667724609375,
      "learning_rate": 1.7898702322648453e-08,
      "loss": 0.0532,
      "num_tokens": 15554166.0,
      "reward": 2.84765625,
      "reward_std": 0.19279402680695057,
      "rewards/accuracy_reward/mean": 0.84765625,
      "rewards/accuracy_reward/std": 0.341281745582819,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.08472150657325983,
      "stage2_sft_loss": 0.020304364268667996,
      "step": 231,
      "total_loss": 0.06396632781252265
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 247.25,
      "completions/max_terminated_length": 247.25,
      "completions/mean_length": 122.203125,
      "completions/mean_terminated_length": 122.203125,
      "completions/min_length": 60.0,
      "completions/min_terminated_length": 60.0,
      "epoch": 0.9914529914529915,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 2.4948488462654845,
      "grpo_loss": 0.0030043296865187585,
      "kl": 0.06317138671875,
      "learning_rate": 1.0069334586854106e-08,
      "loss": 0.0516,
      "num_tokens": 15618106.0,
      "reward": 2.86328125,
      "reward_std": 0.09666222147643566,
      "rewards/accuracy_reward/mean": 0.86328125,
      "rewards/accuracy_reward/std": 0.33113233372569084,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.07209198269993067,
      "stage2_sft_loss": 0.00023392363800667226,
      "step": 232,
      "total_loss": 0.0751197044737637
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 267.5,
      "completions/max_terminated_length": 267.5,
      "completions/mean_length": 134.3515625,
      "completions/mean_terminated_length": 134.3515625,
      "completions/min_length": 60.25,
      "completions/min_terminated_length": 60.25,
      "epoch": 0.9957264957264957,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 1.874383236311945,
      "grpo_loss": -0.013771897065453231,
      "kl": 0.06573486328125,
      "learning_rate": 4.475677164966774e-09,
      "loss": 0.047,
      "num_tokens": 15685796.0,
      "reward": 2.80859375,
      "reward_std": 0.18634483218193054,
      "rewards/accuracy_reward/mean": 0.80859375,
      "rewards/accuracy_reward/std": 0.3866342604160309,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.02409595239441842,
      "stage2_sft_loss": 0.0057802217324933736,
      "step": 233,
      "total_loss": 0.010902077774517238
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": -7.0,
      "completions/max_length": 270.0,
      "completions/max_terminated_length": 270.0,
      "completions/mean_length": 130.3671875,
      "completions/mean_terminated_length": 130.3671875,
      "completions/min_length": 63.0,
      "completions/min_terminated_length": 63.0,
      "epoch": 1.0,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 2.257091182884558,
      "grpo_loss": 0.02863113474450074,
      "kl": 0.08056640625,
      "learning_rate": 1.1189818972656697e-09,
      "loss": 0.0498,
      "num_tokens": 15753674.0,
      "reward": 2.78125,
      "reward_std": 0.18714364245533943,
      "rewards/accuracy_reward/mean": 0.78125,
      "rewards/accuracy_reward/std": 0.40201979875564575,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "stage1_sft_loss": 0.04045400209724903,
      "stage2_sft_loss": 0.015837795235711383,
      "step": 234,
      "total_loss": 0.07066891435533762
    },
    {
      "epoch": 1.0,
      "step": 234,
      "total_flos": 0.0,
      "train_loss": 0.07978848305841287,
      "train_runtime": 7290.3624,
      "train_samples_per_second": 1.027,
      "train_steps_per_second": 0.032
    }
  ],
  "logging_steps": 1,
  "max_steps": 234,
  "num_input_tokens_seen": 15753674,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}