{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 234, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 251.75, "completions/max_terminated_length": 251.75, "completions/mean_length": 133.75390625, "completions/mean_terminated_length": 133.75390625, "completions/min_length": 68.25, "completions/min_terminated_length": 68.25, "epoch": 0.004273504273504274, "frac_reward_zero_std": 0.59375, "grad_norm": 7.757056816470261, "grpo_loss": -0.03923009986246484, "kl": 4.7147274017333984e-05, "learning_rate": 0.0, "loss": 0.3179, "num_tokens": 67233.0, "reward": 2.875, "reward_std": 0.16637087427079678, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3316035121679306, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.01564404438249767, "stage2_sft_loss": 3.0086944103240967, "step": 1, "total_loss": 0.2772833965718746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 295.25, "completions/max_terminated_length": 295.25, "completions/mean_length": 142.0234375, "completions/mean_terminated_length": 142.0234375, "completions/min_length": 64.25, "completions/min_terminated_length": 64.25, "epoch": 0.008547008547008548, "frac_reward_zero_std": 0.59375, "grad_norm": 7.627764897193262, "grpo_loss": 0.00449168267357436, "kl": 2.8908252716064453e-05, "learning_rate": 8.333333333333333e-07, "loss": 0.3089, "num_tokens": 138127.0, "reward": 2.91796875, "reward_std": 0.15782861225306988, "rewards/accuracy_reward/mean": 0.91796875, "rewards/accuracy_reward/std": 0.2668132297694683, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.009601139056030661, "stage2_sft_loss": 3.002301514148712, "step": 2, "total_loss": 0.31432297825813293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 120.30859375, "completions/mean_terminated_length": 120.30859375, "completions/min_length": 66.25, "completions/min_terminated_length": 66.25, "epoch": 0.01282051282051282, "frac_reward_zero_std": 0.71875, "grad_norm": 7.495269211942902, "grpo_loss": -0.046216885010437636, "kl": 4.70578670501709e-05, "learning_rate": 1.6666666666666667e-06, "loss": 0.3016, "num_tokens": 202910.0, "reward": 2.91796875, "reward_std": 0.12046922650188208, "rewards/accuracy_reward/mean": 0.91796875, "rewards/accuracy_reward/std": 0.2554394565522671, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.033421904779970646, "stage2_sft_loss": 2.95179682970047, "step": 3, "total_loss": 0.28238470479846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 210.5, "completions/max_terminated_length": 210.5, "completions/mean_length": 125.6328125, "completions/mean_terminated_length": 125.6328125, "completions/min_length": 66.25, "completions/min_terminated_length": 66.25, "epoch": 0.017094017094017096, "frac_reward_zero_std": 0.75, "grad_norm": 6.493119857304641, "grpo_loss": -0.006215297922665286, "kl": 4.932284355163574e-05, "learning_rate": 2.5e-06, "loss": 0.2727, "num_tokens": 268104.0, "reward": 2.94921875, "reward_std": 0.0991684952750802, "rewards/accuracy_reward/mean": 0.94921875, "rewards/accuracy_reward/std": 0.21033688634634018, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.008940062718465924, "stage2_sft_loss": 2.607166886329651, "step": 4, "total_loss": 0.26344145461916924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 244.5, "completions/max_terminated_length": 244.5, "completions/mean_length": 129.5390625, "completions/mean_terminated_length": 129.5390625, "completions/min_length": 66.5, "completions/min_terminated_length": 66.5, "epoch": 0.021367521367521368, "frac_reward_zero_std": 0.59375, "grad_norm": 4.563119008532273, "grpo_loss": -0.015285405330359936, "kl": 5.14984130859375e-05, "learning_rate": 3.3333333333333333e-06, "loss": 0.2226, "num_tokens": 332650.0, "reward": 2.921875, "reward_std": 0.16584291495382786, "rewards/accuracy_reward/mean": 0.921875, "rewards/accuracy_reward/std": 0.26355477422475815, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.018052286352030933, "stage2_sft_loss": 2.005835622549057, "step": 5, "total_loss": 0.20335044339299202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 140.67578125, "completions/mean_terminated_length": 140.67578125, "completions/min_length": 64.5, "completions/min_terminated_length": 64.5, "epoch": 0.02564102564102564, "frac_reward_zero_std": 0.59375, "grad_norm": 3.4342274451007677, "grpo_loss": 0.011418178940715507, "kl": 0.00010204315185546875, "learning_rate": 4.166666666666667e-06, "loss": 0.1624, "num_tokens": 403311.0, "reward": 2.88671875, "reward_std": 0.16755038313567638, "rewards/accuracy_reward/mean": 0.88671875, "rewards/accuracy_reward/std": 0.3000990152359009, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.014668514719232917, "stage2_sft_loss": 1.380816400051117, "step": 6, "total_loss": 0.1641683429479599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 129.4140625, "completions/mean_terminated_length": 129.4140625, "completions/min_length": 71.75, "completions/min_terminated_length": 71.75, "epoch": 0.029914529914529916, "frac_reward_zero_std": 0.65625, "grad_norm": 3.2005230162166525, "grpo_loss": 0.0499419612691554, "kl": 0.0001811981201171875, "learning_rate": 5e-06, "loss": 0.138, "num_tokens": 468425.0, "reward": 2.9189453125, "reward_std": 0.1503223106265068, "rewards/accuracy_reward/mean": 0.92578125, "rewards/accuracy_reward/std": 0.24648795649409294, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.03125, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.02343750186264515, "stage1_sft_loss": 0.008992363698780537, "stage2_sft_loss": 1.0415422320365906, "step": 7, "total_loss": 0.16308855265378952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 223.75, "completions/max_terminated_length": 223.75, "completions/mean_length": 131.37890625, "completions/mean_terminated_length": 131.37890625, "completions/min_length": 51.5, "completions/min_terminated_length": 51.5, "epoch": 0.03418803418803419, "frac_reward_zero_std": 0.34375, "grad_norm": 4.478538499496423, "grpo_loss": -0.023593724047032083, "kl": 0.0139923095703125, "learning_rate": 5.833333333333334e-06, "loss": 0.0476, "num_tokens": 534458.0, "reward": 2.6806640625, "reward_std": 0.3335800841450691, "rewards/accuracy_reward/mean": 0.8515625, "rewards/accuracy_reward/std": 0.34831516817212105, "rewards/format_reward/mean": 0.8828125, "rewards/format_reward/std": 0.31499073281884193, "rewards/tag_count_reward/mean": 0.9462890625, "rewards/tag_count_reward/std": 0.14767055958509445, "stage1_sft_loss": 0.009093074826523662, "stage2_sft_loss": 0.41022688150405884, "step": 8, "total_loss": 0.02652203943580389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 264.5, "completions/max_terminated_length": 264.5, "completions/mean_length": 139.171875, "completions/mean_terminated_length": 139.171875, "completions/min_length": 67.25, "completions/min_terminated_length": 67.25, "epoch": 0.038461538461538464, "frac_reward_zero_std": 0.71875, "grad_norm": 1.8284207472683809, "grpo_loss": 3.280168118635629e-05, "kl": 0.0007824897766113281, "learning_rate": 6.666666666666667e-06, "loss": 0.0525, "num_tokens": 605526.0, "reward": 2.89453125, "reward_std": 0.1113965567201376, "rewards/accuracy_reward/mean": 0.89453125, "rewards/accuracy_reward/std": 0.29731010645627975, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.008276262902654707, "stage2_sft_loss": 0.22668107599020004, "step": 9, "total_loss": 0.030977172777056694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 262.25, "completions/max_terminated_length": 262.25, "completions/mean_length": 147.97265625, "completions/mean_terminated_length": 147.97265625, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.042735042735042736, "frac_reward_zero_std": 0.6875, "grad_norm": 1.0282877052261212, "grpo_loss": 0.0008859479815441773, "kl": 0.0009212493896484375, "learning_rate": 7.500000000000001e-06, "loss": 0.0131, "num_tokens": 677151.0, "reward": 2.94921875, "reward_std": 0.12073762249201536, "rewards/accuracy_reward/mean": 0.94921875, "rewards/accuracy_reward/std": 0.1755770929157734, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.0038859813648741692, "stage2_sft_loss": 0.06995399482548237, "step": 10, "total_loss": 0.01176732883322984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 143.76953125, "completions/mean_terminated_length": 143.76953125, "completions/min_length": 67.5, "completions/min_terminated_length": 67.5, "epoch": 0.04700854700854701, "frac_reward_zero_std": 0.71875, "grad_norm": 1.0728683234729373, "grpo_loss": 3.9769674543777e-05, "kl": 0.002231597900390625, "learning_rate": 8.333333333333334e-06, "loss": 0.0235, "num_tokens": 746860.0, "reward": 2.9287109375, "reward_std": 0.11981397215276957, "rewards/accuracy_reward/mean": 0.93359375, "rewards/accuracy_reward/std": 0.23851029947400093, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.03125, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.0078125, "stage1_sft_loss": 0.011087916325777769, "stage2_sft_loss": 0.003615820431150496, "step": 11, "total_loss": 0.011489268275909126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 137.61328125, "completions/mean_terminated_length": 137.61328125, "completions/min_length": 63.75, "completions/min_terminated_length": 63.75, "epoch": 0.05128205128205128, "frac_reward_zero_std": 0.6875, "grad_norm": 2.613003430486309, "grpo_loss": 0.007426628337270813, "kl": 0.014751434326171875, "learning_rate": 9.166666666666666e-06, "loss": 0.0109, "num_tokens": 815081.0, "reward": 2.916015625, "reward_std": 0.14841514453291893, "rewards/accuracy_reward/mean": 0.95703125, "rewards/accuracy_reward/std": 0.17054874077439308, "rewards/format_reward/mean": 0.97265625, "rewards/format_reward/std": 0.10469620674848557, "rewards/tag_count_reward/mean": 0.986328125, "rewards/tag_count_reward/std": 0.05234810337424278, "stage1_sft_loss": 0.019605551147833467, "stage2_sft_loss": 0.016597392386756837, "step": 12, "total_loss": 0.02869191882200539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 250.25, "completions/max_terminated_length": 250.25, "completions/mean_length": 137.28125, "completions/mean_terminated_length": 137.28125, "completions/min_length": 75.25, "completions/min_terminated_length": 75.25, "epoch": 0.05555555555555555, "frac_reward_zero_std": 0.375, "grad_norm": 18.474324997624624, "grpo_loss": 0.03844568212775812, "kl": 0.151611328125, "learning_rate": 1e-05, "loss": 0.097, "num_tokens": 885025.0, "reward": 2.0830078125, "reward_std": 0.2612629234790802, "rewards/accuracy_reward/mean": 0.8046875, "rewards/accuracy_reward/std": 0.39255890995264053, "rewards/format_reward/mean": 0.515625, "rewards/format_reward/std": 0.48288241028785706, "rewards/tag_count_reward/mean": 0.7626953125, "rewards/tag_count_reward/std": 0.2884962745010853, "stage1_sft_loss": 0.014269684674218297, "stage2_sft_loss": 0.0914901232754346, "step": 13, "total_loss": 0.061864377348683774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 228.5, "completions/max_terminated_length": 228.5, "completions/mean_length": 119.6796875, "completions/mean_terminated_length": 119.6796875, "completions/min_length": 65.75, "completions/min_terminated_length": 65.75, "epoch": 0.05982905982905983, "frac_reward_zero_std": 0.8125, "grad_norm": 0.915039501524939, "grpo_loss": 0.0098477653568807, "kl": 0.005916595458984375, "learning_rate": 1.0833333333333334e-05, "loss": 0.0059, "num_tokens": 948439.0, "reward": 2.9599609375, "reward_std": 0.08884451817721128, "rewards/accuracy_reward/mean": 0.98046875, "rewards/accuracy_reward/std": 0.11576050892472267, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.05326050892472267, "rewards/tag_count_reward/mean": 0.9912109375, "rewards/tag_count_reward/std": 0.03994538262486458, "stage1_sft_loss": 0.009817917714826763, "stage2_sft_loss": 0.009871041984297335, "step": 14, "total_loss": 0.020652787410654128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 127.10546875, "completions/mean_terminated_length": 127.10546875, "completions/min_length": 74.5, "completions/min_terminated_length": 74.5, "epoch": 0.0641025641025641, "frac_reward_zero_std": 0.6875, "grad_norm": 2.1793393582609744, "grpo_loss": 0.00019377293256184203, "kl": 0.0197601318359375, "learning_rate": 1.1666666666666668e-05, "loss": 0.0314, "num_tokens": 1013130.0, "reward": 2.87109375, "reward_std": 0.14518490061163902, "rewards/accuracy_reward/mean": 0.87109375, "rewards/accuracy_reward/std": 0.32502105459570885, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.01030012167757377, "stage2_sft_loss": 0.0008246329380199313, "step": 15, "total_loss": 0.010576357715763152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 122.9140625, "completions/mean_terminated_length": 122.9140625, "completions/min_length": 63.25, "completions/min_terminated_length": 63.25, "epoch": 0.06837606837606838, "frac_reward_zero_std": 0.84375, "grad_norm": 2.0006532751956856, "grpo_loss": 0.00015884065362570254, "kl": 0.011383056640625, "learning_rate": 1.25e-05, "loss": 0.0248, "num_tokens": 1075980.0, "reward": 2.90625, "reward_std": 0.06378498114645481, "rewards/accuracy_reward/mean": 0.90625, "rewards/accuracy_reward/std": 0.2523401081562042, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.013333460665307939, "stage2_sft_loss": 0.001343881434877403, "step": 16, "total_loss": 0.013626688945805654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 218.75, "completions/max_terminated_length": 218.75, "completions/mean_length": 129.1171875, "completions/mean_terminated_length": 129.1171875, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.07264957264957266, "frac_reward_zero_std": 0.84375, "grad_norm": 1.9687509137531682, "grpo_loss": 0.0009033796213770984, "kl": 0.011016845703125, "learning_rate": 1.3333333333333333e-05, "loss": 0.0109, "num_tokens": 1142074.0, "reward": 2.97265625, "reward_std": 0.06207750644534826, "rewards/accuracy_reward/mean": 0.97265625, "rewards/accuracy_reward/std": 0.13608578220009804, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.021144109414308332, "stage2_sft_loss": 0.0017398461059201509, "step": 17, "total_loss": 0.022221474413527176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 225.75, "completions/max_terminated_length": 225.75, "completions/mean_length": 127.78515625, "completions/mean_terminated_length": 127.78515625, "completions/min_length": 61.75, "completions/min_terminated_length": 61.75, "epoch": 0.07692307692307693, "frac_reward_zero_std": 0.78125, "grad_norm": 2.232924765040882, "grpo_loss": 0.0006565492107597493, "kl": 0.0255126953125, "learning_rate": 1.416666666666667e-05, "loss": 0.0187, "num_tokens": 1206819.0, "reward": 2.9033203125, "reward_std": 0.11760413460433483, "rewards/accuracy_reward/mean": 0.91015625, "rewards/accuracy_reward/std": 0.1928669586777687, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.03125, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.02343750186264515, "stage1_sft_loss": 0.011690539606206585, "stage2_sft_loss": 0.0009237131016561761, "step": 18, "total_loss": 0.012439460391760804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 148.640625, "completions/mean_terminated_length": 148.640625, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.0811965811965812, "frac_reward_zero_std": 0.71875, "grad_norm": 2.205851245453103, "grpo_loss": -0.0012649910768232075, "kl": 0.0226593017578125, "learning_rate": 1.5000000000000002e-05, "loss": 0.031, "num_tokens": 1277815.0, "reward": 2.9140625, "reward_std": 0.11876176204532385, "rewards/accuracy_reward/mean": 0.9140625, "rewards/accuracy_reward/std": 0.2623785026371479, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.012684938323218375, "stage2_sft_loss": 0.0032887740817386657, "step": 19, "total_loss": 0.011748824326787144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 245.25, "completions/max_terminated_length": 245.25, "completions/mean_length": 146.29296875, "completions/mean_terminated_length": 146.29296875, "completions/min_length": 76.25, "completions/min_terminated_length": 76.25, "epoch": 0.08547008547008547, "frac_reward_zero_std": 0.625, "grad_norm": 2.6677025177715405, "grpo_loss": -0.03813791631546337, "kl": 0.048675537109375, "learning_rate": 1.5833333333333333e-05, "loss": 0.0254, "num_tokens": 1349738.0, "reward": 2.900390625, "reward_std": 0.15921209380030632, "rewards/accuracy_reward/mean": 0.90234375, "rewards/accuracy_reward/std": 0.29337338730692863, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.010960506275296211, "stage1_sft_loss": 0.023294871527468786, "stage2_sft_loss": 0.026308074113330804, "step": 20, "total_loss": -0.012212236848426983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 136.9375, "completions/mean_terminated_length": 136.9375, "completions/min_length": 67.25, "completions/min_terminated_length": 67.25, "epoch": 0.08974358974358974, "frac_reward_zero_std": 0.75, "grad_norm": 2.7414827300784337, "grpo_loss": 0.0006684071158815641, "kl": 0.043243408203125, "learning_rate": 1.6666666666666667e-05, "loss": 0.032, "num_tokens": 1418866.0, "reward": 2.83984375, "reward_std": 0.1150759719312191, "rewards/accuracy_reward/mean": 0.83984375, "rewards/accuracy_reward/std": 0.3069465383887291, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.02183083994896151, "stage2_sft_loss": 0.000715008718543686, "step": 21, "total_loss": 0.02257074779481627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 261.5, "completions/max_terminated_length": 261.5, "completions/mean_length": 132.8515625, "completions/mean_terminated_length": 132.8515625, "completions/min_length": 74.5, "completions/min_terminated_length": 74.5, "epoch": 0.09401709401709402, "frac_reward_zero_std": 0.65625, "grad_norm": 2.842263357480505, "grpo_loss": 0.002760487957857549, "kl": 0.040985107421875, "learning_rate": 1.7500000000000002e-05, "loss": 0.041, "num_tokens": 1485252.0, "reward": 2.8662109375, "reward_std": 0.1345482999458909, "rewards/accuracy_reward/mean": 0.87109375, "rewards/accuracy_reward/std": 0.300577100366354, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.03125, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.0078125, "stage1_sft_loss": 0.013363483012653887, "stage2_sft_loss": 0.011726534779882059, "step": 22, "total_loss": 0.0172966243699193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 235.25, "completions/max_terminated_length": 235.25, "completions/mean_length": 128.09375, "completions/mean_terminated_length": 128.09375, "completions/min_length": 67.25, "completions/min_terminated_length": 67.25, "epoch": 0.09829059829059829, "frac_reward_zero_std": 0.625, "grad_norm": 2.2903318954420415, "grpo_loss": 0.04079900741635356, "kl": 0.040191650390625, "learning_rate": 1.8333333333333333e-05, "loss": 0.0472, "num_tokens": 1550084.0, "reward": 2.8671875, "reward_std": 0.17411433160305023, "rewards/accuracy_reward/mean": 0.8671875, "rewards/accuracy_reward/std": 0.3389710336923599, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.05828226753510535, "stage2_sft_loss": 0.011348728352459148, "step": 23, "total_loss": 0.10021615156438202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 351.25, "completions/max_terminated_length": 351.25, "completions/mean_length": 132.10546875, "completions/mean_terminated_length": 132.10546875, "completions/min_length": 59.5, "completions/min_terminated_length": 59.5, "epoch": 0.10256410256410256, "frac_reward_zero_std": 0.53125, "grad_norm": 2.3689469506326883, "grpo_loss": 0.0008243897027568892, "kl": 0.0498046875, "learning_rate": 1.916666666666667e-05, "loss": 0.0496, "num_tokens": 1617023.0, "reward": 2.87109375, "reward_std": 0.20608290284872055, "rewards/accuracy_reward/mean": 0.87109375, "rewards/accuracy_reward/std": 0.33124853298068047, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.011117298912722617, "stage2_sft_loss": 0.001786644832463935, "step": 24, "total_loss": 0.012120353290811181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 126.80078125, "completions/mean_terminated_length": 126.80078125, "completions/min_length": 61.75, "completions/min_terminated_length": 61.75, "epoch": 0.10683760683760683, "frac_reward_zero_std": 0.5625, "grad_norm": 2.3631085168942807, "grpo_loss": 0.0019136814298690297, "kl": 0.051910400390625, "learning_rate": 2e-05, "loss": 0.0334, "num_tokens": 1682060.0, "reward": 2.8203125, "reward_std": 0.18767160922288895, "rewards/accuracy_reward/mean": 0.82421875, "rewards/accuracy_reward/std": 0.3797462359070778, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.03125, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.09090784145519137, "stage2_sft_loss": 0.0017171509243780747, "step": 25, "total_loss": 0.09299323987215757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 127.9765625, "completions/mean_terminated_length": 127.9765625, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.1111111111111111, "frac_reward_zero_std": 0.59375, "grad_norm": 2.4456833239275717, "grpo_loss": -0.012467821623431519, "kl": 0.05926513671875, "learning_rate": 1.9998881018102735e-05, "loss": 0.0471, "num_tokens": 1747406.0, "reward": 2.83203125, "reward_std": 0.18003800511360168, "rewards/accuracy_reward/mean": 0.83203125, "rewards/accuracy_reward/std": 0.36339013651013374, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.02221155003644526, "stage2_sft_loss": 0.003884657140588388, "step": 26, "total_loss": 0.010132194496691227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.90625, "completions/max_length": 513.75, "completions/max_terminated_length": 296.25, "completions/mean_length": 149.0, "completions/mean_terminated_length": 141.6593952178955, "completions/min_length": 65.25, "completions/min_terminated_length": 65.25, "epoch": 0.11538461538461539, "frac_reward_zero_std": 0.40625, "grad_norm": 2.9242918299689125, "grpo_loss": 0.30453615926671773, "kl": 0.100341796875, "learning_rate": 1.9995524322835035e-05, "loss": 0.187, "num_tokens": 1818462.0, "reward": 2.7412109375, "reward_std": 0.28656554222106934, "rewards/accuracy_reward/mean": 0.76171875, "rewards/accuracy_reward/std": 0.39992421492934227, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.08451050892472267, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.029733512550592422, "stage1_sft_loss": 0.06889296881854534, "stage2_sft_loss": 0.0499890799401328, "step": 27, "total_loss": 0.3784280573017895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 432.75, "completions/max_terminated_length": 305.75, "completions/mean_length": 146.4296875, "completions/mean_terminated_length": 143.96856689453125, "completions/min_length": 65.25, "completions/min_terminated_length": 65.25, "epoch": 0.11965811965811966, "frac_reward_zero_std": 0.28125, "grad_norm": 2.757226870099959, "grpo_loss": 0.08061277940578293, "kl": 0.07757568359375, "learning_rate": 1.9989930665413148e-05, "loss": 0.1245, "num_tokens": 1889644.0, "reward": 2.7841796875, "reward_std": 0.32649947702884674, "rewards/accuracy_reward/mean": 0.80078125, "rewards/accuracy_reward/std": 0.3944511339068413, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.07509202510118484, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.03298301063477993, "stage1_sft_loss": 0.04570018174126744, "stage2_sft_loss": 0.00978102523367852, "step": 28, "total_loss": 0.1272910633124411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 327.75, "completions/max_terminated_length": 327.75, "completions/mean_length": 147.90625, "completions/mean_terminated_length": 147.90625, "completions/min_length": 49.75, "completions/min_terminated_length": 49.75, "epoch": 0.12393162393162394, "frac_reward_zero_std": 0.5, "grad_norm": 2.5000758685486737, "grpo_loss": 0.07717300354852341, "kl": 0.06719970703125, "learning_rate": 1.998210129767735e-05, "loss": 0.1019, "num_tokens": 1960196.0, "reward": 2.8193359375, "reward_std": 0.20288447104394436, "rewards/accuracy_reward/mean": 0.8203125, "rewards/accuracy_reward/std": 0.38116608560085297, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.0078125, "stage1_sft_loss": 0.09177985601127148, "stage2_sft_loss": 0.018172981101088226, "step": 29, "total_loss": 0.17077014781534672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 245.5, "completions/max_terminated_length": 245.5, "completions/mean_length": 133.14453125, "completions/mean_terminated_length": 133.14453125, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.1282051282051282, "frac_reward_zero_std": 0.4375, "grad_norm": 2.628202806969854, "grpo_loss": -0.029030836129095405, "kl": 0.076416015625, "learning_rate": 1.9972037971811802e-05, "loss": 0.0678, "num_tokens": 2027969.0, "reward": 2.7900390625, "reward_std": 0.23592286556959152, "rewards/accuracy_reward/mean": 0.796875, "rewards/accuracy_reward/std": 0.40148045867681503, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9931640625, "rewards/tag_count_reward/std": 0.027866113930940628, "stage1_sft_loss": 0.09150360058993101, "stage2_sft_loss": 0.03152645181398839, "step": 30, "total_loss": 0.06562541099265218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 390.5, "completions/max_terminated_length": 282.75, "completions/mean_length": 139.5234375, "completions/mean_terminated_length": 137.08885192871094, "completions/min_length": 63.25, "completions/min_terminated_length": 63.25, "epoch": 0.13247863247863248, "frac_reward_zero_std": 0.375, "grad_norm": 2.5503433302929377, "grpo_loss": -0.01416161423549056, "kl": 0.0740966796875, "learning_rate": 1.9959742939952393e-05, "loss": 0.0858, "num_tokens": 2096983.0, "reward": 2.732421875, "reward_std": 0.2888246178627014, "rewards/accuracy_reward/mean": 0.73828125, "rewards/accuracy_reward/std": 0.43773481994867325, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.03125, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.015625, "stage1_sft_loss": 0.07643852988258004, "stage2_sft_loss": 0.042741050478070974, "step": 31, "total_loss": 0.06655102036893368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 346.5, "completions/max_terminated_length": 208.25, "completions/mean_length": 118.8671875, "completions/mean_terminated_length": 116.32986259460449, "completions/min_length": 63.75, "completions/min_terminated_length": 63.75, "epoch": 0.13675213675213677, "frac_reward_zero_std": 0.21875, "grad_norm": 2.367757483948867, "grpo_loss": -0.02189300279133022, "kl": 0.071044921875, "learning_rate": 1.9945218953682736e-05, "loss": 0.0887, "num_tokens": 2159077.0, "reward": 2.7685546875, "reward_std": 0.35685962438583374, "rewards/accuracy_reward/mean": 0.77734375, "rewards/accuracy_reward/std": 0.4162580147385597, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.03125, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.032399868592619896, "stage1_sft_loss": 0.04500664956867695, "stage2_sft_loss": 0.007157278014346957, "step": 32, "total_loss": 0.023829374462366104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 231.5, "completions/max_terminated_length": 231.5, "completions/mean_length": 112.40625, "completions/mean_terminated_length": 112.40625, "completions/min_length": 63.25, "completions/min_terminated_length": 63.25, "epoch": 0.14102564102564102, "frac_reward_zero_std": 0.625, "grad_norm": 2.1938369497046755, "grpo_loss": -0.03613918833434582, "kl": 0.0726318359375, "learning_rate": 1.9928469263418376e-05, "loss": 0.0584, "num_tokens": 2221941.0, "reward": 2.85546875, "reward_std": 0.16728199180215597, "rewards/accuracy_reward/mean": 0.85546875, "rewards/accuracy_reward/std": 0.324543260037899, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.04147233220282942, "stage2_sft_loss": 0.0106936156807933, "step": 33, "total_loss": 0.006402507424354553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 428.25, "completions/max_terminated_length": 302.25, "completions/mean_length": 130.5546875, "completions/mean_terminated_length": 128.06665802001953, "completions/min_length": 60.75, "completions/min_terminated_length": 60.75, "epoch": 0.1452991452991453, "frac_reward_zero_std": 0.375, "grad_norm": 2.245639334646511, "grpo_loss": 0.02738322678487748, "kl": 0.07635498046875, "learning_rate": 1.990949761767935e-05, "loss": 0.0873, "num_tokens": 2289123.0, "reward": 2.734375, "reward_std": 0.30481160432100296, "rewards/accuracy_reward/mean": 0.74609375, "rewards/accuracy_reward/std": 0.4096733331680298, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03125000186264515, "stage1_sft_loss": 0.07007205486297607, "stage2_sft_loss": 0.009100667899474502, "step": 34, "total_loss": 0.09836534410715103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 139.42578125, "completions/mean_terminated_length": 139.42578125, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.14957264957264957, "frac_reward_zero_std": 0.40625, "grad_norm": 2.6016141061868447, "grpo_loss": 0.09335880249273032, "kl": 0.0828857421875, "learning_rate": 1.9888308262251286e-05, "loss": 0.0736, "num_tokens": 2359440.0, "reward": 2.71875, "reward_std": 0.2593473196029663, "rewards/accuracy_reward/mean": 0.71875, "rewards/accuracy_reward/std": 0.43959466367959976, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.06350621022284031, "stage2_sft_loss": 0.01845851496909745, "step": 35, "total_loss": 0.15871086157858372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 237.25, "completions/max_terminated_length": 237.25, "completions/mean_length": 116.875, "completions/mean_terminated_length": 116.875, "completions/min_length": 63.25, "completions/min_terminated_length": 63.25, "epoch": 0.15384615384615385, "frac_reward_zero_std": 0.375, "grad_norm": 2.0495372575896247, "grpo_loss": -0.05065595902851783, "kl": 0.0775146484375, "learning_rate": 1.9864905939235215e-05, "loss": 0.0527, "num_tokens": 2422144.0, "reward": 2.8271484375, "reward_std": 0.2663256488740444, "rewards/accuracy_reward/mean": 0.83203125, "rewards/accuracy_reward/std": 0.3622770607471466, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.03125, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.0078125, "stage1_sft_loss": 0.04677494731731713, "stage2_sft_loss": 0.018676572712138295, "step": 36, "total_loss": -0.002013354969676584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 231.5, "completions/max_terminated_length": 231.5, "completions/mean_length": 131.09375, "completions/mean_terminated_length": 131.09375, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.1581196581196581, "frac_reward_zero_std": 0.5, "grad_norm": 2.5671491792749266, "grpo_loss": -0.04589961684541777, "kl": 0.0906982421875, "learning_rate": 1.98392958859863e-05, "loss": 0.0581, "num_tokens": 2488736.0, "reward": 2.8671875, "reward_std": 0.2171314489096403, "rewards/accuracy_reward/mean": 0.8671875, "rewards/accuracy_reward/std": 0.33943870663642883, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.0481309499591589, "stage2_sft_loss": 0.012330142228165641, "step": 37, "total_loss": 0.003464347682893276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 389.5, "completions/max_terminated_length": 255.75, "completions/mean_length": 133.875, "completions/mean_terminated_length": 131.4334716796875, "completions/min_length": 61.5, "completions/min_terminated_length": 61.5, "epoch": 0.1623931623931624, "frac_reward_zero_std": 0.46875, "grad_norm": 2.721311776528719, "grpo_loss": 0.011054625327233225, "kl": 0.096923828125, "learning_rate": 1.9811483833941726e-05, "loss": 0.1324, "num_tokens": 2555168.0, "reward": 2.7509765625, "reward_std": 0.26854483410716057, "rewards/accuracy_reward/mean": 0.7578125, "rewards/accuracy_reward/std": 0.42780231684446335, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.03125, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.02343750186264515, "stage1_sft_loss": 0.06103221420198679, "stage2_sft_loss": 0.01654094600235112, "step": 38, "total_loss": 0.07374093309044838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 249.25, "completions/max_terminated_length": 249.25, "completions/mean_length": 141.53515625, "completions/mean_terminated_length": 141.53515625, "completions/min_length": 69.25, "completions/min_terminated_length": 69.25, "epoch": 0.16666666666666666, "frac_reward_zero_std": 0.625, "grad_norm": 1.947435216648249, "grpo_loss": -0.03299991798121482, "kl": 0.07366943359375, "learning_rate": 1.9781476007338058e-05, "loss": 0.0564, "num_tokens": 2624649.0, "reward": 2.859375, "reward_std": 0.16781241074204445, "rewards/accuracy_reward/mean": 0.859375, "rewards/accuracy_reward/std": 0.2867218255996704, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.03650881536304951, "stage2_sft_loss": 0.009737508415128104, "step": 39, "total_loss": 0.004482649266719818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 384.0, "completions/max_terminated_length": 259.5, "completions/mean_length": 136.1640625, "completions/mean_terminated_length": 133.68520736694336, "completions/min_length": 69.75, "completions/min_terminated_length": 69.75, "epoch": 0.17094017094017094, "frac_reward_zero_std": 0.4375, "grad_norm": 2.559201255744803, "grpo_loss": 0.10094646381912753, "kl": 0.1031494140625, "learning_rate": 1.9749279121818235e-05, "loss": 0.1331, "num_tokens": 2692499.0, "reward": 2.7470703125, "reward_std": 0.2642187662422657, "rewards/accuracy_reward/mean": 0.75390625, "rewards/accuracy_reward/std": 0.421333409845829, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.03125, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.02343750186264515, "stage1_sft_loss": 0.0752207487821579, "stage2_sft_loss": 0.013288459565956146, "step": 40, "total_loss": 0.1774960570037365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 232.75, "completions/max_terminated_length": 232.75, "completions/mean_length": 131.78515625, "completions/mean_terminated_length": 131.78515625, "completions/min_length": 70.75, "completions/min_terminated_length": 70.75, "epoch": 0.1752136752136752, "frac_reward_zero_std": 0.46875, "grad_norm": 2.6517951258463395, "grpo_loss": -0.0022816565178800374, "kl": 0.0897216796875, "learning_rate": 1.9714900382928674e-05, "loss": 0.0642, "num_tokens": 2757900.0, "reward": 2.78515625, "reward_std": 0.22765203192830086, "rewards/accuracy_reward/mean": 0.78515625, "rewards/accuracy_reward/std": 0.3814377970993519, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.03658259240910411, "stage2_sft_loss": 0.002672710397746414, "step": 41, "total_loss": 0.03456820675637573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 253.75, "completions/max_terminated_length": 253.75, "completions/mean_length": 137.40625, "completions/mean_terminated_length": 137.40625, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.1794871794871795, "frac_reward_zero_std": 0.4375, "grad_norm": 2.8783991169680565, "grpo_loss": -0.018433816148899496, "kl": 0.1077880859375, "learning_rate": 1.9678347484506667e-05, "loss": 0.0765, "num_tokens": 2826092.0, "reward": 2.8203125, "reward_std": 0.2398776262998581, "rewards/accuracy_reward/mean": 0.8203125, "rewards/accuracy_reward/std": 0.3670658878982067, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.09272383619099855, "stage2_sft_loss": 0.008958085381891578, "step": 42, "total_loss": 0.07518582977354527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 238.5, "completions/max_terminated_length": 238.5, "completions/mean_length": 129.51953125, "completions/mean_terminated_length": 129.51953125, "completions/min_length": 53.5, "completions/min_terminated_length": 53.5, "epoch": 0.18376068376068377, "frac_reward_zero_std": 0.53125, "grad_norm": 2.9214012259202597, "grpo_loss": 0.005569995621044654, "kl": 0.1058349609375, "learning_rate": 1.9639628606958535e-05, "loss": 0.0934, "num_tokens": 2891849.0, "reward": 2.7578125, "reward_std": 0.20555248856544495, "rewards/accuracy_reward/mean": 0.7578125, "rewards/accuracy_reward/std": 0.41410429030656815, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.07663985062390566, "stage2_sft_loss": 0.03882742972928099, "step": 43, "total_loss": 0.08609258919022977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 126.0390625, "completions/mean_terminated_length": 126.0390625, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.18803418803418803, "frac_reward_zero_std": 0.5, "grad_norm": 2.714946091428236, "grpo_loss": 0.011901529098395258, "kl": 0.1015625, "learning_rate": 1.9598752415428893e-05, "loss": 0.0697, "num_tokens": 2957499.0, "reward": 2.8203125, "reward_std": 0.20582089200615883, "rewards/accuracy_reward/mean": 0.8203125, "rewards/accuracy_reward/std": 0.3795153424143791, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.06588136032223701, "stage2_sft_loss": 0.008633614983409643, "step": 44, "total_loss": 0.07864625263027847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 334.75, "completions/max_terminated_length": 334.75, "completions/mean_length": 149.09765625, "completions/mean_terminated_length": 149.09765625, "completions/min_length": 63.25, "completions/min_terminated_length": 63.25, "epoch": 0.19230769230769232, "frac_reward_zero_std": 0.53125, "grad_norm": 2.636385542774345, "grpo_loss": -0.0019036887679249048, "kl": 0.116455078125, "learning_rate": 1.955572805786141e-05, "loss": 0.0948, "num_tokens": 3029404.0, "reward": 2.79296875, "reward_std": 0.2106798104941845, "rewards/accuracy_reward/mean": 0.79296875, "rewards/accuracy_reward/std": 0.38675472885370255, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.07216842658817768, "stage2_sft_loss": 0.013309189293067902, "step": 45, "total_loss": 0.07159565854817629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 322.25, "completions/max_terminated_length": 322.25, "completions/mean_length": 141.2421875, "completions/mean_terminated_length": 141.2421875, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.19658119658119658, "frac_reward_zero_std": 0.59375, "grad_norm": 3.1514982024595684, "grpo_loss": -0.034489477693568915, "kl": 0.1217041015625, "learning_rate": 1.9510565162951538e-05, "loss": 0.0756, "num_tokens": 3100466.0, "reward": 2.8203125, "reward_std": 0.1822783462703228, "rewards/accuracy_reward/mean": 0.8203125, "rewards/accuracy_reward/std": 0.36859218776226044, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.04779887804761529, "stage2_sft_loss": 0.01275486926897429, "step": 46, "total_loss": 0.014584887307137251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 297.25, "completions/max_terminated_length": 297.25, "completions/mean_length": 130.953125, "completions/mean_terminated_length": 130.953125, "completions/min_length": 65.75, "completions/min_terminated_length": 65.75, "epoch": 0.20085470085470086, "frac_reward_zero_std": 0.5625, "grad_norm": 2.639450650301935, "grpo_loss": 0.06104949407745153, "kl": 0.115966796875, "learning_rate": 1.9463273837991643e-05, "loss": 0.1032, "num_tokens": 3168358.0, "reward": 2.8359375, "reward_std": 0.1751839891076088, "rewards/accuracy_reward/mean": 0.8359375, "rewards/accuracy_reward/std": 0.3520497493445873, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.08292205259203911, "stage2_sft_loss": 0.0034438925213180482, "step": 47, "total_loss": 0.14431593287736177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 322.25, "completions/max_terminated_length": 322.25, "completions/mean_length": 128.65234375, "completions/mean_terminated_length": 128.65234375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.20512820512820512, "frac_reward_zero_std": 0.46875, "grad_norm": 2.8560131128791295, "grpo_loss": 0.020472342817811295, "kl": 0.11181640625, "learning_rate": 1.9413864666609036e-05, "loss": 0.0903, "num_tokens": 3234877.0, "reward": 2.7421875, "reward_std": 0.24184712767601013, "rewards/accuracy_reward/mean": 0.7421875, "rewards/accuracy_reward/std": 0.42586562037467957, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.05367158725857735, "stage2_sft_loss": 0.01597646134905517, "step": 48, "total_loss": 0.0757415764965117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 259.25, "completions/max_terminated_length": 259.25, "completions/mean_length": 123.24609375, "completions/mean_terminated_length": 123.24609375, "completions/min_length": 59.75, "completions/min_terminated_length": 59.75, "epoch": 0.2094017094017094, "frac_reward_zero_std": 0.5, "grad_norm": 2.632971721617991, "grpo_loss": -0.038325678557157516, "kl": 0.1258544921875, "learning_rate": 1.9362348706397374e-05, "loss": 0.1024, "num_tokens": 3298860.0, "reward": 2.8125, "reward_std": 0.21778054535388947, "rewards/accuracy_reward/mean": 0.8125, "rewards/accuracy_reward/std": 0.3778356984257698, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.10100406594574451, "stage2_sft_loss": 0.008230720413848758, "step": 49, "total_loss": 0.06350146140903234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 294.75, "completions/max_terminated_length": 294.75, "completions/mean_length": 127.5078125, "completions/mean_terminated_length": 127.5078125, "completions/min_length": 66.75, "completions/min_terminated_length": 66.75, "epoch": 0.21367521367521367, "frac_reward_zero_std": 0.46875, "grad_norm": 2.611812735045678, "grpo_loss": 0.05120960566273425, "kl": 0.114013671875, "learning_rate": 1.9308737486442045e-05, "loss": 0.1098, "num_tokens": 3363390.0, "reward": 2.796875, "reward_std": 0.22883153706789017, "rewards/accuracy_reward/mean": 0.796875, "rewards/accuracy_reward/std": 0.39198366552591324, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.08808937482535839, "stage2_sft_loss": 0.01635385846020654, "step": 50, "total_loss": 0.14093436300754547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 131.59765625, "completions/mean_terminated_length": 131.59765625, "completions/min_length": 61.25, "completions/min_terminated_length": 61.25, "epoch": 0.21794871794871795, "frac_reward_zero_std": 0.3125, "grad_norm": 2.746283813005745, "grpo_loss": 0.012858211644925177, "kl": 0.122802734375, "learning_rate": 1.9253043004739967e-05, "loss": 0.0938, "num_tokens": 3429839.0, "reward": 2.751953125, "reward_std": 0.29525136202573776, "rewards/accuracy_reward/mean": 0.7578125, "rewards/accuracy_reward/std": 0.415524423122406, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.03125, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.015625, "stage1_sft_loss": 0.10655882768332958, "stage2_sft_loss": 0.03493454266572371, "step": 51, "total_loss": 0.12291049398481846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 320.25, "completions/max_terminated_length": 320.25, "completions/mean_length": 138.62109375, "completions/mean_terminated_length": 138.62109375, "completions/min_length": 61.5, "completions/min_terminated_length": 61.5, "epoch": 0.2222222222222222, "frac_reward_zero_std": 0.40625, "grad_norm": 2.0483754177336944, "grpo_loss": -0.026889342727372423, "kl": 0.1077880859375, "learning_rate": 1.919527772551451e-05, "loss": 0.0526, "num_tokens": 3498630.0, "reward": 2.7734375, "reward_std": 0.2531665191054344, "rewards/accuracy_reward/mean": 0.7734375, "rewards/accuracy_reward/std": 0.4207340404391289, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.0339956721290946, "stage2_sft_loss": 0.007436095038428903, "step": 52, "total_loss": 0.00784993963316083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 137.515625, "completions/mean_terminated_length": 137.515625, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.2264957264957265, "frac_reward_zero_std": 0.375, "grad_norm": 2.5497153837326794, "grpo_loss": 0.035073344348347746, "kl": 0.113037109375, "learning_rate": 1.913545457642601e-05, "loss": 0.0757, "num_tokens": 3566970.0, "reward": 2.7177734375, "reward_std": 0.26900235936045647, "rewards/accuracy_reward/mean": 0.72265625, "rewards/accuracy_reward/std": 0.4300394877791405, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.03125, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.0078125, "stage1_sft_loss": 0.05108351481612772, "stage2_sft_loss": 0.00476168844033964, "step": 53, "total_loss": 0.0866330279968679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 339.5, "completions/max_terminated_length": 339.5, "completions/mean_length": 149.2734375, "completions/mean_terminated_length": 149.2734375, "completions/min_length": 61.75, "completions/min_terminated_length": 61.75, "epoch": 0.23076923076923078, "frac_reward_zero_std": 0.5, "grad_norm": 2.0633181493829387, "grpo_loss": -0.02261926600476727, "kl": 0.1124267578125, "learning_rate": 1.907358694567865e-05, "loss": 0.0659, "num_tokens": 3638304.0, "reward": 2.77734375, "reward_std": 0.21831096336245537, "rewards/accuracy_reward/mean": 0.77734375, "rewards/accuracy_reward/std": 0.404835507273674, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.04506829101592302, "stage2_sft_loss": 0.008663312415592372, "step": 54, "total_loss": 0.023315356113016605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 132.94140625, "completions/mean_terminated_length": 132.94140625, "completions/min_length": 53.5, "completions/min_terminated_length": 53.5, "epoch": 0.23504273504273504, "frac_reward_zero_std": 0.46875, "grad_norm": 2.1835339801699676, "grpo_loss": -0.008162843412719667, "kl": 0.129150390625, "learning_rate": 1.900968867902419e-05, "loss": 0.0742, "num_tokens": 3705953.0, "reward": 2.73046875, "reward_std": 0.23619184270501137, "rewards/accuracy_reward/mean": 0.73046875, "rewards/accuracy_reward/std": 0.44539549201726913, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.06734194559976459, "stage2_sft_loss": 0.02584816412127111, "step": 55, "total_loss": 0.06176391919143498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 271.25, "completions/max_terminated_length": 271.25, "completions/mean_length": 125.203125, "completions/mean_terminated_length": 125.203125, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.23931623931623933, "frac_reward_zero_std": 0.5625, "grad_norm": 2.733215153622518, "grpo_loss": 0.012670826748944819, "kl": 0.1082763671875, "learning_rate": 1.8943774076663372e-05, "loss": 0.0959, "num_tokens": 3769509.0, "reward": 2.837890625, "reward_std": 0.19866740703582764, "rewards/accuracy_reward/mean": 0.84375, "rewards/accuracy_reward/std": 0.3607676178216934, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.03125, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.015625, "stage1_sft_loss": 0.09594716737046838, "stage2_sft_loss": 0.010085783338581678, "step": 56, "total_loss": 0.10962657257914543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 125.234375, "completions/mean_terminated_length": 125.234375, "completions/min_length": 64.5, "completions/min_terminated_length": 64.5, "epoch": 0.24358974358974358, "frac_reward_zero_std": 0.59375, "grad_norm": 2.7279202762656127, "grpo_loss": 0.025446785730309784, "kl": 0.123291015625, "learning_rate": 1.8875857890045544e-05, "loss": 0.1284, "num_tokens": 3835513.0, "reward": 2.85546875, "reward_std": 0.17780011892318726, "rewards/accuracy_reward/mean": 0.85546875, "rewards/accuracy_reward/std": 0.3399829603731632, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.06284261774271727, "stage2_sft_loss": 0.023300331144127995, "step": 57, "total_loss": 0.09061943367123604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 291.25, "completions/max_terminated_length": 291.25, "completions/mean_length": 133.2421875, "completions/mean_terminated_length": 133.2421875, "completions/min_length": 61.25, "completions/min_terminated_length": 61.25, "epoch": 0.24786324786324787, "frac_reward_zero_std": 0.46875, "grad_norm": 2.583004960150858, "grpo_loss": -0.05313935538288206, "kl": 0.1015625, "learning_rate": 1.880595531856738e-05, "loss": 0.099, "num_tokens": 3903943.0, "reward": 2.828125, "reward_std": 0.22700048610568047, "rewards/accuracy_reward/mean": 0.828125, "rewards/accuracy_reward/std": 0.37239526212215424, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.08836286794394255, "stage2_sft_loss": 0.014138269005343318, "step": 58, "total_loss": 0.03663733811117709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 204.25, "completions/max_terminated_length": 204.25, "completions/mean_length": 116.6328125, "completions/mean_terminated_length": 116.6328125, "completions/min_length": 66.5, "completions/min_terminated_length": 66.5, "epoch": 0.25213675213675213, "frac_reward_zero_std": 0.5625, "grad_norm": 2.801369803273171, "grpo_loss": 0.008001050562597811, "kl": 0.127685546875, "learning_rate": 1.87340820061713e-05, "loss": 0.085, "num_tokens": 3965385.0, "reward": 2.8046875, "reward_std": 0.18702251091599464, "rewards/accuracy_reward/mean": 0.8046875, "rewards/accuracy_reward/std": 0.3881981149315834, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.09833742864429951, "stage2_sft_loss": 0.014318075140181463, "step": 59, "total_loss": 0.1077702846378088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 373.75, "completions/max_terminated_length": 373.75, "completions/mean_length": 137.9453125, "completions/mean_terminated_length": 137.9453125, "completions/min_length": 61.75, "completions/min_terminated_length": 61.75, "epoch": 0.2564102564102564, "frac_reward_zero_std": 0.5625, "grad_norm": 2.4729951369645224, "grpo_loss": 0.032328023575246334, "kl": 0.1182861328125, "learning_rate": 1.866025403784439e-05, "loss": 0.0737, "num_tokens": 4034187.0, "reward": 2.72265625, "reward_std": 0.18254429474473, "rewards/accuracy_reward/mean": 0.72265625, "rewards/accuracy_reward/std": 0.4300394877791405, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.08115582540631294, "stage2_sft_loss": 0.015377040166640654, "step": 60, "total_loss": 0.11502155102789402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 292.5, "completions/max_terminated_length": 292.5, "completions/mean_length": 141.75, "completions/mean_terminated_length": 141.75, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.2606837606837607, "frac_reward_zero_std": 0.34375, "grad_norm": 2.6380177354683982, "grpo_loss": -0.0549847207730636, "kl": 0.1064453125, "learning_rate": 1.8584487936018663e-05, "loss": 0.0864, "num_tokens": 4104171.0, "reward": 2.7890625, "reward_std": 0.2877512201666832, "rewards/accuracy_reward/mean": 0.7890625, "rewards/accuracy_reward/std": 0.3888401687145233, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.05271852482110262, "stage2_sft_loss": 0.011416645691497251, "step": 61, "total_loss": -0.0011245310306549072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 292.75, "completions/max_terminated_length": 292.75, "completions/mean_length": 142.89453125, "completions/mean_terminated_length": 142.89453125, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.26495726495726496, "frac_reward_zero_std": 0.40625, "grad_norm": 2.7487282290939965, "grpo_loss": -0.003460176521912217, "kl": 0.1220703125, "learning_rate": 1.8506800656873397e-05, "loss": 0.072, "num_tokens": 4175072.0, "reward": 2.73828125, "reward_std": 0.27248647063970566, "rewards/accuracy_reward/mean": 0.73828125, "rewards/accuracy_reward/std": 0.4357043281197548, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.07949414104223251, "stage2_sft_loss": 0.021672878530807793, "step": 62, "total_loss": 0.07820125250145793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 264.75, "completions/max_terminated_length": 264.75, "completions/mean_length": 124.671875, "completions/mean_terminated_length": 124.671875, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.2692307692307692, "frac_reward_zero_std": 0.4375, "grad_norm": 2.813615284328756, "grpo_loss": 0.03867705701850355, "kl": 0.1285400390625, "learning_rate": 1.8427209586540392e-05, "loss": 0.1084, "num_tokens": 4240332.0, "reward": 2.734375, "reward_std": 0.2494782656431198, "rewards/accuracy_reward/mean": 0.734375, "rewards/accuracy_reward/std": 0.43634092807769775, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.13912567868828773, "stage2_sft_loss": 0.012130932504078373, "step": 63, "total_loss": 0.17901583388447762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 251.25, "completions/max_terminated_length": 251.25, "completions/mean_length": 130.765625, "completions/mean_terminated_length": 130.765625, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.27350427350427353, "frac_reward_zero_std": 0.4375, "grad_norm": 2.591798770488697, "grpo_loss": 0.022650390383205377, "kl": 0.1220703125, "learning_rate": 1.834573253721303e-05, "loss": 0.0926, "num_tokens": 4306648.0, "reward": 2.7177734375, "reward_std": 0.24752728268504143, "rewards/accuracy_reward/mean": 0.72265625, "rewards/accuracy_reward/std": 0.41977670788764954, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.03125, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.0078125, "stage1_sft_loss": 0.0872775714378804, "stage2_sft_loss": 0.021733683039201424, "step": 64, "total_loss": 0.11210132867563516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 126.6875, "completions/mean_terminated_length": 126.6875, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.2777777777777778, "frac_reward_zero_std": 0.375, "grad_norm": 3.672327894257255, "grpo_loss": 0.007078448950778693, "kl": 0.146484375, "learning_rate": 1.826238774315995e-05, "loss": 0.0939, "num_tokens": 4370736.0, "reward": 2.7412109375, "reward_std": 0.2940043546259403, "rewards/accuracy_reward/mean": 0.74609375, "rewards/accuracy_reward/std": 0.4365294650197029, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.03125, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.0078125, "stage1_sft_loss": 0.06434842199087143, "stage2_sft_loss": 0.004225670505547896, "step": 65, "total_loss": 0.07184943649917841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 127.3515625, "completions/mean_terminated_length": 127.3515625, "completions/min_length": 52.5, "completions/min_terminated_length": 52.5, "epoch": 0.28205128205128205, "frac_reward_zero_std": 0.40625, "grad_norm": 3.0415796698361546, "grpo_loss": 0.029165754676796496, "kl": 0.145263671875, "learning_rate": 1.8177193856644315e-05, "loss": 0.1335, "num_tokens": 4435762.0, "reward": 2.75390625, "reward_std": 0.25540194660425186, "rewards/accuracy_reward/mean": 0.75390625, "rewards/accuracy_reward/std": 0.4187935143709183, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.09681425290182233, "stage2_sft_loss": 0.008607075666077435, "step": 66, "total_loss": 0.1268407143652439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 151.48828125, "completions/mean_terminated_length": 151.48828125, "completions/min_length": 65.75, "completions/min_terminated_length": 65.75, "epoch": 0.2863247863247863, "frac_reward_zero_std": 0.375, "grad_norm": 2.652440633953194, "grpo_loss": -0.0036158948205411434, "kl": 0.131103515625, "learning_rate": 1.8090169943749477e-05, "loss": 0.0995, "num_tokens": 4508927.0, "reward": 2.69140625, "reward_std": 0.28235550969839096, "rewards/accuracy_reward/mean": 0.69140625, "rewards/accuracy_reward/std": 0.4624432474374771, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.08319994248449802, "stage2_sft_loss": 0.02071410999633372, "step": 67, "total_loss": 0.08165545924566686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 385.5, "completions/max_terminated_length": 247.5, "completions/mean_length": 129.46484375, "completions/mean_terminated_length": 126.99547576904297, "completions/min_length": 66.5, "completions/min_terminated_length": 66.5, "epoch": 0.2905982905982906, "frac_reward_zero_std": 0.40625, "grad_norm": 2.6022327307903494, "grpo_loss": 0.014323993455036543, "kl": 0.143798828125, "learning_rate": 1.8001335480112067e-05, "loss": 0.128, "num_tokens": 4573438.0, "reward": 2.7509765625, "reward_std": 0.2806566655635834, "rewards/accuracy_reward/mean": 0.7578125, "rewards/accuracy_reward/std": 0.4210883155465126, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.03125, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.02343750186264515, "stage1_sft_loss": 0.06220689509063959, "stage2_sft_loss": 0.015904361513094045, "step": 68, "total_loss": 0.07812132267281413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 300.25, "completions/max_terminated_length": 300.25, "completions/mean_length": 138.07421875, "completions/mean_terminated_length": 138.07421875, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.2948717948717949, "frac_reward_zero_std": 0.46875, "grad_norm": 3.7319832588310686, "grpo_loss": -0.02579806634457782, "kl": 0.167236328125, "learning_rate": 1.7910710346563417e-05, "loss": 0.0894, "num_tokens": 4643169.0, "reward": 2.73828125, "reward_std": 0.22882908582687378, "rewards/accuracy_reward/mean": 0.73828125, "rewards/accuracy_reward/std": 0.4144846946001053, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.12717284774407744, "stage2_sft_loss": 0.021347035173675977, "step": 69, "total_loss": 0.10350948257837445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 138.43359375, "completions/mean_terminated_length": 138.43359375, "completions/min_length": 62.75, "completions/min_terminated_length": 62.75, "epoch": 0.29914529914529914, "frac_reward_zero_std": 0.375, "grad_norm": 2.7765053595431586, "grpo_loss": -0.025126174557954073, "kl": 0.1556396484375, "learning_rate": 1.78183148246803e-05, "loss": 0.1098, "num_tokens": 4712112.0, "reward": 2.75390625, "reward_std": 0.29183993488550186, "rewards/accuracy_reward/mean": 0.75390625, "rewards/accuracy_reward/std": 0.4199880510568619, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.09879820048809052, "stage2_sft_loss": 0.01800179434940219, "step": 70, "total_loss": 0.07547220401465893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 445.75, "completions/max_terminated_length": 350.5, "completions/mean_length": 147.3828125, "completions/mean_terminated_length": 145.00130462646484, "completions/min_length": 65.25, "completions/min_terminated_length": 65.25, "epoch": 0.3034188034188034, "frac_reward_zero_std": 0.21875, "grad_norm": 3.1655373431903913, "grpo_loss": 0.04423183586914092, "kl": 0.1580810546875, "learning_rate": 1.7724169592245996e-05, "loss": 0.1313, "num_tokens": 4783658.0, "reward": 2.6455078125, "reward_std": 0.36747099831700325, "rewards/accuracy_reward/mean": 0.65234375, "rewards/accuracy_reward/std": 0.4756612181663513, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.03125, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.02343750186264515, "stage1_sft_loss": 0.08538164384663105, "stage2_sft_loss": 0.018582295946544036, "step": 71, "total_loss": 0.1314717074856162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 139.63671875, "completions/mean_terminated_length": 139.63671875, "completions/min_length": 69.5, "completions/min_terminated_length": 69.5, "epoch": 0.3076923076923077, "frac_reward_zero_std": 0.5, "grad_norm": 3.0160388980914843, "grpo_loss": -0.016488363384269178, "kl": 0.1353759765625, "learning_rate": 1.7628295718622666e-05, "loss": 0.0915, "num_tokens": 4852685.0, "reward": 2.75390625, "reward_std": 0.2222587689757347, "rewards/accuracy_reward/mean": 0.75390625, "rewards/accuracy_reward/std": 0.4294766038656235, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.10504917008802295, "stage2_sft_loss": 0.016596848021436017, "step": 72, "total_loss": 0.09022049233317375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 362.25, "completions/max_terminated_length": 362.25, "completions/mean_length": 136.34375, "completions/mean_terminated_length": 136.34375, "completions/min_length": 55.5, "completions/min_terminated_length": 55.5, "epoch": 0.31196581196581197, "frac_reward_zero_std": 0.4375, "grad_norm": 2.3874562532234616, "grpo_loss": -0.025953882723115385, "kl": 0.1104736328125, "learning_rate": 1.7530714660036112e-05, "loss": 0.1125, "num_tokens": 4920901.0, "reward": 2.66015625, "reward_std": 0.23804902657866478, "rewards/accuracy_reward/mean": 0.66015625, "rewards/accuracy_reward/std": 0.4760490208864212, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.04344508983194828, "stage2_sft_loss": 0.018051974155241624, "step": 73, "total_loss": 0.019296405371278524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 250.5, "completions/max_terminated_length": 250.5, "completions/mean_length": 120.171875, "completions/mean_terminated_length": 120.171875, "completions/min_length": 59.75, "completions/min_terminated_length": 59.75, "epoch": 0.3162393162393162, "frac_reward_zero_std": 0.40625, "grad_norm": 2.526664631890765, "grpo_loss": -0.030613353825174272, "kl": 0.119384765625, "learning_rate": 1.7431448254773943e-05, "loss": 0.1256, "num_tokens": 4983641.0, "reward": 2.6796875, "reward_std": 0.2553994879126549, "rewards/accuracy_reward/mean": 0.6796875, "rewards/accuracy_reward/std": 0.4430723860859871, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.10160567285493016, "stage2_sft_loss": 0.005139547283761203, "step": 74, "total_loss": 0.071506273932755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 230.25, "completions/max_terminated_length": 230.25, "completions/mean_length": 130.84375, "completions/mean_terminated_length": 130.84375, "completions/min_length": 58.5, "completions/min_terminated_length": 58.5, "epoch": 0.32051282051282054, "frac_reward_zero_std": 0.4375, "grad_norm": 2.5098120068964827, "grpo_loss": -0.009117206209339201, "kl": 0.14794921875, "learning_rate": 1.7330518718298263e-05, "loss": 0.0893, "num_tokens": 5050153.0, "reward": 2.72265625, "reward_std": 0.2585534080862999, "rewards/accuracy_reward/mean": 0.72265625, "rewards/accuracy_reward/std": 0.4464419335126877, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.09497358370572329, "stage2_sft_loss": 0.023782045347616076, "step": 75, "total_loss": 0.08823457965627313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 118.3203125, "completions/mean_terminated_length": 118.3203125, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.3247863247863248, "frac_reward_zero_std": 0.46875, "grad_norm": 3.6984272155957005, "grpo_loss": -0.003023316152393818, "kl": 0.172119140625, "learning_rate": 1.7227948638273918e-05, "loss": 0.0736, "num_tokens": 5113211.0, "reward": 2.783203125, "reward_std": 0.2550964131951332, "rewards/accuracy_reward/mean": 0.7890625, "rewards/accuracy_reward/std": 0.4064294844865799, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.03125, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.015625, "stage1_sft_loss": 0.05189145356416702, "stage2_sft_loss": 0.007722017195192166, "step": 76, "total_loss": 0.04964033979922533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 298.75, "completions/max_terminated_length": 298.75, "completions/mean_length": 144.1640625, "completions/mean_terminated_length": 144.1640625, "completions/min_length": 73.25, "completions/min_terminated_length": 73.25, "epoch": 0.32905982905982906, "frac_reward_zero_std": 0.34375, "grad_norm": 2.3502081107560997, "grpo_loss": 0.009372745989821851, "kl": 0.1197509765625, "learning_rate": 1.712376096951345e-05, "loss": 0.055, "num_tokens": 5183701.0, "reward": 2.65625, "reward_std": 0.2786785438656807, "rewards/accuracy_reward/mean": 0.65625, "rewards/accuracy_reward/std": 0.4706496447324753, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.08107265457510948, "stage2_sft_loss": 0.021250678692013025, "step": 77, "total_loss": 0.09257047018036246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 273.25, "completions/max_terminated_length": 273.25, "completions/mean_length": 137.51171875, "completions/mean_terminated_length": 137.51171875, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.3333333333333333, "frac_reward_zero_std": 0.34375, "grad_norm": 2.729145057547932, "grpo_loss": 0.12022795213852078, "kl": 0.13525390625, "learning_rate": 1.7017979028839918e-05, "loss": 0.1115, "num_tokens": 5250640.0, "reward": 2.72265625, "reward_std": 0.2792089581489563, "rewards/accuracy_reward/mean": 0.72265625, "rewards/accuracy_reward/std": 0.4468590244650841, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.10791449341922998, "stage2_sft_loss": 0.01693276612786576, "step": 78, "total_loss": 0.22983572259545326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 283.5, "completions/max_terminated_length": 283.5, "completions/mean_length": 137.28515625, "completions/mean_terminated_length": 137.28515625, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.33760683760683763, "frac_reward_zero_std": 0.3125, "grad_norm": 2.3802824913901546, "grpo_loss": 0.005306557985022664, "kl": 0.1236572265625, "learning_rate": 1.691062648986865e-05, "loss": 0.0957, "num_tokens": 5319329.0, "reward": 2.630859375, "reward_std": 0.32588188722729683, "rewards/accuracy_reward/mean": 0.63671875, "rewards/accuracy_reward/std": 0.4642375111579895, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.03125, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.015625, "stage1_sft_loss": 0.07730232924222946, "stage2_sft_loss": 0.0019270793563919142, "step": 79, "total_loss": 0.08280159346759319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 124.80859375, "completions/mean_terminated_length": 124.80859375, "completions/min_length": 63.25, "completions/min_terminated_length": 63.25, "epoch": 0.3418803418803419, "frac_reward_zero_std": 0.40625, "grad_norm": 2.562572433193672, "grpo_loss": -0.03958694860921241, "kl": 0.119873046875, "learning_rate": 1.6801727377709195e-05, "loss": 0.1113, "num_tokens": 5385328.0, "reward": 2.765625, "reward_std": 0.2514565847814083, "rewards/accuracy_reward/mean": 0.765625, "rewards/accuracy_reward/std": 0.42420244961977005, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.10037143575027585, "stage2_sft_loss": 0.012038417553412728, "step": 80, "total_loss": 0.06198833044618368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 124.39453125, "completions/mean_terminated_length": 124.39453125, "completions/min_length": 63.75, "completions/min_terminated_length": 63.75, "epoch": 0.34615384615384615, "frac_reward_zero_std": 0.53125, "grad_norm": 2.499248013632045, "grpo_loss": 0.04034198680892587, "kl": 0.1214599609375, "learning_rate": 1.6691306063588583e-05, "loss": 0.1041, "num_tokens": 5448349.0, "reward": 2.796875, "reward_std": 0.2021375447511673, "rewards/accuracy_reward/mean": 0.796875, "rewards/accuracy_reward/std": 0.3994177505373955, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.07490937830880284, "stage2_sft_loss": 0.02029243257129565, "step": 81, "total_loss": 0.11728060524910688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 286.5, "completions/max_terminated_length": 286.5, "completions/mean_length": 140.00390625, "completions/mean_terminated_length": 140.00390625, "completions/min_length": 62.75, "completions/min_terminated_length": 62.75, "epoch": 0.3504273504273504, "frac_reward_zero_std": 0.4375, "grad_norm": 2.5742755470532064, "grpo_loss": -0.03953387896763161, "kl": 0.1248779296875, "learning_rate": 1.657938725939713e-05, "loss": 0.0792, "num_tokens": 5517238.0, "reward": 2.7373046875, "reward_std": 0.2568397559225559, "rewards/accuracy_reward/mean": 0.7421875, "rewards/accuracy_reward/std": 0.43832528591156006, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.03125, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.0078125, "stage1_sft_loss": 0.06850403128191829, "stage2_sft_loss": 0.019538108550477773, "step": 82, "total_loss": 0.030923962127417326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 138.76953125, "completions/mean_terminated_length": 138.76953125, "completions/min_length": 63.75, "completions/min_terminated_length": 63.75, "epoch": 0.3547008547008547, "frac_reward_zero_std": 0.5625, "grad_norm": 2.4821729998966995, "grpo_loss": -0.03367833292577416, "kl": 0.1253662109375, "learning_rate": 1.6465996012157996e-05, "loss": 0.0798, "num_tokens": 5586435.0, "reward": 2.7421875, "reward_std": 0.2086990401148796, "rewards/accuracy_reward/mean": 0.7421875, "rewards/accuracy_reward/std": 0.4246060326695442, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.13172664865851402, "stage2_sft_loss": 0.0064331964968005195, "step": 83, "total_loss": 0.09869163855910301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 137.140625, "completions/mean_terminated_length": 137.140625, "completions/min_length": 65.75, "completions/min_terminated_length": 65.75, "epoch": 0.358974358974359, "frac_reward_zero_std": 0.40625, "grad_norm": 3.423143308177815, "grpo_loss": -0.00183769257273525, "kl": 0.1729736328125, "learning_rate": 1.635115769842179e-05, "loss": 0.0942, "num_tokens": 5655103.0, "reward": 2.65625, "reward_std": 0.26341626048088074, "rewards/accuracy_reward/mean": 0.65625, "rewards/accuracy_reward/std": 0.4552925229072571, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.04354301746934652, "stage2_sft_loss": 0.01797918685770128, "step": 84, "total_loss": 0.04350324580445886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 245.25, "completions/max_terminated_length": 245.25, "completions/mean_length": 118.8125, "completions/mean_terminated_length": 118.8125, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.36324786324786323, "frac_reward_zero_std": 0.375, "grad_norm": 2.726421526007492, "grpo_loss": 0.021800624788738787, "kl": 0.1282958984375, "learning_rate": 1.6234898018587336e-05, "loss": 0.1009, "num_tokens": 5717567.0, "reward": 2.74609375, "reward_std": 0.28353746607899666, "rewards/accuracy_reward/mean": 0.74609375, "rewards/accuracy_reward/std": 0.43080218881368637, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.04685461334884167, "stage2_sft_loss": 0.02155728975776583, "step": 85, "total_loss": 0.07081096805632114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 293.5, "completions/max_terminated_length": 293.5, "completions/mean_length": 138.03515625, "completions/mean_terminated_length": 138.03515625, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.36752136752136755, "frac_reward_zero_std": 0.46875, "grad_norm": 2.7129461434409365, "grpo_loss": 0.04890600312501192, "kl": 0.138671875, "learning_rate": 1.6117242991150064e-05, "loss": 0.0943, "num_tokens": 5786584.0, "reward": 2.75, "reward_std": 0.230536550283432, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.42425093054771423, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.10090539930388331, "stage2_sft_loss": 0.019177716341800988, "step": 86, "total_loss": 0.15172917302697897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 307.5, "completions/max_terminated_length": 307.5, "completions/mean_length": 135.3046875, "completions/mean_terminated_length": 135.3046875, "completions/min_length": 62.5, "completions/min_terminated_length": 62.5, "epoch": 0.3717948717948718, "frac_reward_zero_std": 0.46875, "grad_norm": 2.925690756507991, "grpo_loss": -0.022680374735500664, "kl": 0.1314697265625, "learning_rate": 1.599821894687914e-05, "loss": 0.1157, "num_tokens": 5853374.0, "reward": 2.69921875, "reward_std": 0.22594210505485535, "rewards/accuracy_reward/mean": 0.69921875, "rewards/accuracy_reward/std": 0.4589729681611061, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.07890512142330408, "stage2_sft_loss": 0.009549629903631285, "step": 87, "total_loss": 0.057179709896445274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 287.25, "completions/max_terminated_length": 287.25, "completions/mean_length": 127.3359375, "completions/mean_terminated_length": 127.3359375, "completions/min_length": 61.75, "completions/min_terminated_length": 61.75, "epoch": 0.37606837606837606, "frac_reward_zero_std": 0.4375, "grad_norm": 2.749444964354371, "grpo_loss": 0.026062425575219095, "kl": 0.138916015625, "learning_rate": 1.5877852522924733e-05, "loss": 0.0766, "num_tokens": 5918284.0, "reward": 2.75, "reward_std": 0.24211551621556282, "rewards/accuracy_reward/mean": 0.75390625, "rewards/accuracy_reward/std": 0.39269477128982544, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.03125, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.08172346837818623, "stage2_sft_loss": 0.018881638650782406, "step": 88, "total_loss": 0.1096740560606122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 246.75, "completions/max_terminated_length": 246.75, "completions/mean_length": 121.3359375, "completions/mean_terminated_length": 121.3359375, "completions/min_length": 62.5, "completions/min_terminated_length": 62.5, "epoch": 0.3803418803418803, "frac_reward_zero_std": 0.65625, "grad_norm": 2.5645519227467615, "grpo_loss": 0.007930530468001962, "kl": 0.1263427734375, "learning_rate": 1.575617065685674e-05, "loss": 0.0944, "num_tokens": 5980826.0, "reward": 2.8515625, "reward_std": 0.14651167765259743, "rewards/accuracy_reward/mean": 0.8515625, "rewards/accuracy_reward/std": 0.3466116338968277, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.12439357303082943, "stage2_sft_loss": 0.011787514406023547, "step": 89, "total_loss": 0.13350285589694977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 264.25, "completions/max_terminated_length": 264.25, "completions/mean_length": 136.91015625, "completions/mean_terminated_length": 136.91015625, "completions/min_length": 65.75, "completions/min_terminated_length": 65.75, "epoch": 0.38461538461538464, "frac_reward_zero_std": 0.34375, "grad_norm": 2.4879182580397234, "grpo_loss": 0.0011445782147347927, "kl": 0.14013671875, "learning_rate": 1.563320058063622e-05, "loss": 0.0774, "num_tokens": 6050443.0, "reward": 2.66796875, "reward_std": 0.3047170788049698, "rewards/accuracy_reward/mean": 0.66796875, "rewards/accuracy_reward/std": 0.45233723521232605, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.1086853314191103, "stage2_sft_loss": 0.01397453507524915, "step": 90, "total_loss": 0.1112273633480072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 302.5, "completions/max_terminated_length": 302.5, "completions/mean_length": 129.0546875, "completions/mean_terminated_length": 129.0546875, "completions/min_length": 61.5, "completions/min_terminated_length": 61.5, "epoch": 0.3888888888888889, "frac_reward_zero_std": 0.5625, "grad_norm": 2.5519925026432895, "grpo_loss": -0.04885828430997208, "kl": 0.1190185546875, "learning_rate": 1.5508969814521026e-05, "loss": 0.081, "num_tokens": 6115801.0, "reward": 2.7109375, "reward_std": 0.19332443550229073, "rewards/accuracy_reward/mean": 0.7109375, "rewards/accuracy_reward/std": 0.44970038533210754, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.06702740117907524, "stage2_sft_loss": 0.007887058920459822, "step": 91, "total_loss": 0.018957823514938354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 230.75, "completions/max_terminated_length": 230.75, "completions/mean_length": 125.98828125, "completions/mean_terminated_length": 125.98828125, "completions/min_length": 52.25, "completions/min_terminated_length": 52.25, "epoch": 0.39316239316239315, "frac_reward_zero_std": 0.625, "grad_norm": 2.671387602994381, "grpo_loss": -0.026032934081740677, "kl": 0.1090087890625, "learning_rate": 1.5383506160906826e-05, "loss": 0.0816, "num_tokens": 6180062.0, "reward": 2.8046875, "reward_std": 0.15650184452533722, "rewards/accuracy_reward/mean": 0.8046875, "rewards/accuracy_reward/std": 0.388528935611248, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.0885940557345748, "stage2_sft_loss": 0.02217566382023506, "step": 92, "total_loss": 0.06477868836373091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 222.5, "completions/max_terminated_length": 222.5, "completions/mean_length": 130.9140625, "completions/mean_terminated_length": 130.9140625, "completions/min_length": 55.25, "completions/min_terminated_length": 55.25, "epoch": 0.3974358974358974, "frac_reward_zero_std": 0.40625, "grad_norm": 2.9580194659239027, "grpo_loss": 0.01426442974479869, "kl": 0.152099609375, "learning_rate": 1.5256837698105047e-05, "loss": 0.1078, "num_tokens": 6247824.0, "reward": 2.68359375, "reward_std": 0.25829383358359337, "rewards/accuracy_reward/mean": 0.68359375, "rewards/accuracy_reward/std": 0.46058642119169235, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.12474778946489096, "stage2_sft_loss": 0.0037532774949795566, "step": 93, "total_loss": 0.13938755076378584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 117.26171875, "completions/mean_terminated_length": 117.26171875, "completions/min_length": 66.5, "completions/min_terminated_length": 66.5, "epoch": 0.4017094017094017, "frac_reward_zero_std": 0.5, "grad_norm": 2.6809485416788807, "grpo_loss": 0.05255669995676726, "kl": 0.1064453125, "learning_rate": 1.5128992774059063e-05, "loss": 0.0891, "num_tokens": 6310051.0, "reward": 2.7421875, "reward_std": 0.22738608345389366, "rewards/accuracy_reward/mean": 0.7421875, "rewards/accuracy_reward/std": 0.4302185848355293, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.10235591046512127, "stage2_sft_loss": 0.009095492074266076, "step": 94, "total_loss": 0.1558221597224474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 138.12890625, "completions/mean_terminated_length": 138.12890625, "completions/min_length": 66.75, "completions/min_terminated_length": 66.75, "epoch": 0.405982905982906, "frac_reward_zero_std": 0.34375, "grad_norm": 2.5597991268269777, "grpo_loss": 0.016854463145136833, "kl": 0.1239013671875, "learning_rate": 1.5000000000000002e-05, "loss": 0.0841, "num_tokens": 6379940.0, "reward": 2.69921875, "reward_std": 0.2855108827352524, "rewards/accuracy_reward/mean": 0.69921875, "rewards/accuracy_reward/std": 0.45850304514169693, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.08178659062832594, "stage2_sft_loss": 0.021333697950467467, "step": 95, "total_loss": 0.10077442298643291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 342.5, "completions/max_terminated_length": 219.25, "completions/mean_length": 127.04296875, "completions/mean_terminated_length": 124.57942962646484, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.41025641025641024, "frac_reward_zero_std": 0.4375, "grad_norm": 2.8329251039895205, "grpo_loss": 0.0065450501861050725, "kl": 0.1209716796875, "learning_rate": 1.4869888244043674e-05, "loss": 0.1343, "num_tokens": 6444855.0, "reward": 2.7080078125, "reward_std": 0.2602709634229541, "rewards/accuracy_reward/mean": 0.71484375, "rewards/accuracy_reward/std": 0.427839957177639, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.03125, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.02343750186264515, "stage1_sft_loss": 0.09643353894352913, "stage2_sft_loss": 0.03424702318443451, "step": 96, "total_loss": 0.10640329401940107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 320.5, "completions/max_terminated_length": 320.5, "completions/mean_length": 134.9296875, "completions/mean_terminated_length": 134.9296875, "completions/min_length": 57.5, "completions/min_terminated_length": 57.5, "epoch": 0.41452991452991456, "frac_reward_zero_std": 0.34375, "grad_norm": 2.4973266632814486, "grpo_loss": -0.05328447837382555, "kl": 0.1409912109375, "learning_rate": 1.4738686624729987e-05, "loss": 0.0991, "num_tokens": 6513957.0, "reward": 2.7373046875, "reward_std": 0.28144313395023346, "rewards/accuracy_reward/mean": 0.7421875, "rewards/accuracy_reward/std": 0.427031971514225, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.03125, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.0078125, "stage1_sft_loss": 0.07091331109404564, "stage2_sft_loss": 0.015732734580524266, "step": 97, "total_loss": 0.019202106399461627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 374.5, "completions/max_terminated_length": 249.5, "completions/mean_length": 139.7265625, "completions/mean_terminated_length": 134.7769660949707, "completions/min_length": 59.75, "completions/min_terminated_length": 59.75, "epoch": 0.4188034188034188, "frac_reward_zero_std": 0.28125, "grad_norm": 2.3197516269195115, "grpo_loss": -0.05551034724339843, "kl": 0.1173095703125, "learning_rate": 1.4606424504506325e-05, "loss": 0.1264, "num_tokens": 6583695.0, "reward": 2.705078125, "reward_std": 0.3469051569700241, "rewards/accuracy_reward/mean": 0.71875, "rewards/accuracy_reward/std": 0.44226548820734024, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.043842025101184845, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.032881516963243484, "stage1_sft_loss": 0.05779402703046799, "stage2_sft_loss": 0.011461514979600906, "step": 98, "total_loss": 0.0034298310056328773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 138.578125, "completions/mean_terminated_length": 138.578125, "completions/min_length": 58.5, "completions/min_terminated_length": 58.5, "epoch": 0.4230769230769231, "frac_reward_zero_std": 0.375, "grad_norm": 2.4797442538778522, "grpo_loss": -0.03367353102657944, "kl": 0.1246337890625, "learning_rate": 1.4473131483156326e-05, "loss": 0.0719, "num_tokens": 6652891.0, "reward": 2.7138671875, "reward_std": 0.2675051614642143, "rewards/accuracy_reward/mean": 0.71875, "rewards/accuracy_reward/std": 0.42169189453125, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.03125, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.0078125, "stage1_sft_loss": 0.08882415713742375, "stage2_sft_loss": 0.014731279705301858, "step": 99, "total_loss": 0.05662375397514552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 128.98046875, "completions/mean_terminated_length": 128.98046875, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.42735042735042733, "frac_reward_zero_std": 0.4375, "grad_norm": 2.4029762999741564, "grpo_loss": -0.01963827926556405, "kl": 0.1123046875, "learning_rate": 1.4338837391175582e-05, "loss": 0.0704, "num_tokens": 6719542.0, "reward": 2.76953125, "reward_std": 0.25460558384656906, "rewards/accuracy_reward/mean": 0.76953125, "rewards/accuracy_reward/std": 0.4023680202662945, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.10046672075986862, "stage2_sft_loss": 0.02204990791506134, "step": 100, "total_loss": 0.0830334322527051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 130.390625, "completions/mean_terminated_length": 130.390625, "completions/min_length": 72.5, "completions/min_terminated_length": 72.5, "epoch": 0.43162393162393164, "frac_reward_zero_std": 0.21875, "grad_norm": 2.36108691500467, "grpo_loss": 0.03791657286637928, "kl": 0.1080322265625, "learning_rate": 1.4203572283095657e-05, "loss": 0.0576, "num_tokens": 6784962.0, "reward": 2.7763671875, "reward_std": 0.33827219903469086, "rewards/accuracy_reward/mean": 0.78125, "rewards/accuracy_reward/std": 0.4132651388645172, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.03125, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.0078125, "stage1_sft_loss": 0.06852314155548811, "stage2_sft_loss": 0.008980212034657598, "step": 101, "total_loss": 0.10733773885294795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 270.75, "completions/max_terminated_length": 270.75, "completions/mean_length": 131.19140625, "completions/mean_terminated_length": 131.19140625, "completions/min_length": 57.5, "completions/min_terminated_length": 57.5, "epoch": 0.4358974358974359, "frac_reward_zero_std": 0.40625, "grad_norm": 2.5331171435040436, "grpo_loss": 0.030746392672881484, "kl": 0.1236572265625, "learning_rate": 1.4067366430758004e-05, "loss": 0.0715, "num_tokens": 6851587.0, "reward": 2.76171875, "reward_std": 0.25486908480525017, "rewards/accuracy_reward/mean": 0.76171875, "rewards/accuracy_reward/std": 0.40466783940792084, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.03651396604254842, "stage2_sft_loss": 0.009974595624953508, "step": 102, "total_loss": 0.06825782265514135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 256.25, "completions/max_terminated_length": 256.25, "completions/mean_length": 128.2578125, "completions/mean_terminated_length": 128.2578125, "completions/min_length": 60.5, "completions/min_terminated_length": 60.5, "epoch": 0.44017094017094016, "frac_reward_zero_std": 0.53125, "grad_norm": 2.449351104624353, "grpo_loss": 0.01105509905028157, "kl": 0.1328125, "learning_rate": 1.3930250316539237e-05, "loss": 0.0706, "num_tokens": 6916605.0, "reward": 2.73828125, "reward_std": 0.21291769668459892, "rewards/accuracy_reward/mean": 0.73828125, "rewards/accuracy_reward/std": 0.42710205167531967, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.08719620713964105, "stage2_sft_loss": 0.015745949975098483, "step": 103, "total_loss": 0.09982590237632394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 396.75, "completions/max_terminated_length": 279.25, "completions/mean_length": 137.33203125, "completions/mean_terminated_length": 134.81733894348145, "completions/min_length": 59.75, "completions/min_terminated_length": 59.75, "epoch": 0.4444444444444444, "frac_reward_zero_std": 0.40625, "grad_norm": 2.968394025688263, "grpo_loss": -0.037616387009620667, "kl": 0.162109375, "learning_rate": 1.3792254626529286e-05, "loss": 0.1165, "num_tokens": 6985882.0, "reward": 2.7197265625, "reward_std": 0.2833760306239128, "rewards/accuracy_reward/mean": 0.7265625, "rewards/accuracy_reward/std": 0.44848156720399857, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.03125, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.02343750186264515, "stage1_sft_loss": 0.1242181695997715, "stage2_sft_loss": 0.011403052310924977, "step": 104, "total_loss": 0.087742087431252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 316.5, "completions/max_terminated_length": 316.5, "completions/mean_length": 124.76171875, "completions/mean_terminated_length": 124.76171875, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.44871794871794873, "frac_reward_zero_std": 0.71875, "grad_norm": 2.6270391923453937, "grpo_loss": -0.011787233641371131, "kl": 0.1097412109375, "learning_rate": 1.3653410243663953e-05, "loss": 0.0882, "num_tokens": 7050389.0, "reward": 2.79296875, "reward_std": 0.11652141716331244, "rewards/accuracy_reward/mean": 0.79296875, "rewards/accuracy_reward/std": 0.39603784680366516, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.05623149313032627, "stage2_sft_loss": 0.009826588677242398, "step": 105, "total_loss": 0.04542691865935922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 129.23046875, "completions/mean_terminated_length": 129.23046875, "completions/min_length": 62.75, "completions/min_terminated_length": 62.75, "epoch": 0.452991452991453, "frac_reward_zero_std": 0.34375, "grad_norm": 2.847238135727028, "grpo_loss": -0.0050932477752212435, "kl": 0.1181640625, "learning_rate": 1.3513748240813429e-05, "loss": 0.1022, "num_tokens": 7116360.0, "reward": 2.671875, "reward_std": 0.2899891063570976, "rewards/accuracy_reward/mean": 0.671875, "rewards/accuracy_reward/std": 0.4593518376350403, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.04017673246562481, "stage2_sft_loss": 0.015764886702527292, "step": 106, "total_loss": 0.03665997087955475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 230.75, "completions/max_terminated_length": 230.75, "completions/mean_length": 128.421875, "completions/mean_terminated_length": 128.421875, "completions/min_length": 68.25, "completions/min_terminated_length": 68.25, "epoch": 0.45726495726495725, "frac_reward_zero_std": 0.4375, "grad_norm": 2.896372677278952, "grpo_loss": 0.07070640404708683, "kl": 0.1162109375, "learning_rate": 1.3373299873828303e-05, "loss": 0.0939, "num_tokens": 7181564.0, "reward": 2.78515625, "reward_std": 0.25342363119125366, "rewards/accuracy_reward/mean": 0.78515625, "rewards/accuracy_reward/std": 0.4078981876373291, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.15257746912539005, "stage2_sft_loss": 0.009742788024595939, "step": 107, "total_loss": 0.224258154630661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 256.25, "completions/max_terminated_length": 256.25, "completions/mean_length": 130.69921875, "completions/mean_terminated_length": 130.69921875, "completions/min_length": 60.25, "completions/min_terminated_length": 60.25, "epoch": 0.46153846153846156, "frac_reward_zero_std": 0.46875, "grad_norm": 2.38927462180599, "grpo_loss": -0.04817646351875737, "kl": 0.1099853515625, "learning_rate": 1.3232096574544602e-05, "loss": 0.0729, "num_tokens": 7249071.0, "reward": 2.73828125, "reward_std": 0.22817998379468918, "rewards/accuracy_reward/mean": 0.73828125, "rewards/accuracy_reward/std": 0.43208014219999313, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.10405674204230309, "stage2_sft_loss": 0.0215330894861836, "step": 108, "total_loss": 0.058033584617078304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 279.75, "completions/max_terminated_length": 279.75, "completions/mean_length": 138.328125, "completions/mean_terminated_length": 138.328125, "completions/min_length": 63.75, "completions/min_terminated_length": 63.75, "epoch": 0.4658119658119658, "frac_reward_zero_std": 0.40625, "grad_norm": 2.445961316741637, "grpo_loss": -0.023899111460195854, "kl": 0.11328125, "learning_rate": 1.3090169943749475e-05, "loss": 0.0826, "num_tokens": 7317899.0, "reward": 2.703125, "reward_std": 0.2747268117964268, "rewards/accuracy_reward/mean": 0.703125, "rewards/accuracy_reward/std": 0.4562576711177826, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.0504996933741495, "stage2_sft_loss": 0.01845655390934553, "step": 109, "total_loss": 0.02844623802229762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 127.7890625, "completions/mean_terminated_length": 127.7890625, "completions/min_length": 68.25, "completions/min_terminated_length": 68.25, "epoch": 0.4700854700854701, "frac_reward_zero_std": 0.46875, "grad_norm": 2.3492313298200544, "grpo_loss": 0.05720149329863489, "kl": 0.096923828125, "learning_rate": 1.2947551744109044e-05, "loss": 0.0872, "num_tokens": 7383917.0, "reward": 2.796875, "reward_std": 0.24408500641584396, "rewards/accuracy_reward/mean": 0.796875, "rewards/accuracy_reward/std": 0.37127064168453217, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.11955506075173616, "stage2_sft_loss": 0.018571392953163013, "step": 110, "total_loss": 0.1786136943846941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 136.73046875, "completions/mean_terminated_length": 136.73046875, "completions/min_length": 61.25, "completions/min_terminated_length": 61.25, "epoch": 0.47435897435897434, "frac_reward_zero_std": 0.375, "grad_norm": 3.147191794235813, "grpo_loss": 0.020814732182770967, "kl": 0.1019287109375, "learning_rate": 1.2804273893060028e-05, "loss": 0.0816, "num_tokens": 7451880.0, "reward": 2.7060546875, "reward_std": 0.2917686812579632, "rewards/accuracy_reward/mean": 0.70703125, "rewards/accuracy_reward/std": 0.4403637945652008, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.0078125, "stage1_sft_loss": 0.10996824130415916, "stage2_sft_loss": 0.03235575696453452, "step": 111, "total_loss": 0.1340185476001352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 305.75, "completions/max_terminated_length": 305.75, "completions/mean_length": 132.7578125, "completions/mean_terminated_length": 132.7578125, "completions/min_length": 67.25, "completions/min_terminated_length": 67.25, "epoch": 0.47863247863247865, "frac_reward_zero_std": 0.4375, "grad_norm": 2.5895502657952156, "grpo_loss": 0.04569915612228215, "kl": 0.1212158203125, "learning_rate": 1.2660368455666752e-05, "loss": 0.1027, "num_tokens": 7518810.0, "reward": 2.7578125, "reward_std": 0.26196590065956116, "rewards/accuracy_reward/mean": 0.7578125, "rewards/accuracy_reward/std": 0.43093569576740265, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.06594810076057911, "stage2_sft_loss": 0.009801828651688993, "step": 112, "total_loss": 0.11262743826955557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 299.25, "completions/max_terminated_length": 299.25, "completions/mean_length": 132.6796875, "completions/mean_terminated_length": 132.6796875, "completions/min_length": 54.25, "completions/min_terminated_length": 54.25, "epoch": 0.4829059829059829, "frac_reward_zero_std": 0.3125, "grad_norm": 2.8469563747260853, "grpo_loss": -0.013087262865155935, "kl": 0.1463623046875, "learning_rate": 1.2515867637445088e-05, "loss": 0.0658, "num_tokens": 7586296.0, "reward": 2.7265625, "reward_std": 0.29367245733737946, "rewards/accuracy_reward/mean": 0.7265625, "rewards/accuracy_reward/std": 0.44265370070934296, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.05833651963621378, "stage2_sft_loss": 0.005778922597528435, "step": 113, "total_loss": 0.04582714755088091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 336.25, "completions/max_terminated_length": 336.25, "completions/mean_length": 138.3515625, "completions/mean_terminated_length": 138.3515625, "completions/min_length": 68.25, "completions/min_terminated_length": 68.25, "epoch": 0.48717948717948717, "frac_reward_zero_std": 0.34375, "grad_norm": 2.5939421842717167, "grpo_loss": -0.025480328127741814, "kl": 0.1175537109375, "learning_rate": 1.2370803777154976e-05, "loss": 0.0781, "num_tokens": 7653930.0, "reward": 2.6357421875, "reward_std": 0.30325330793857574, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.48062169551849365, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.03125, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.0078125, "stage1_sft_loss": 0.0783770801499486, "stage2_sft_loss": 0.02130466280505061, "step": 114, "total_loss": 0.05502722132951021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 135.515625, "completions/mean_terminated_length": 135.515625, "completions/min_length": 65.25, "completions/min_terminated_length": 65.25, "epoch": 0.49145299145299143, "frac_reward_zero_std": 0.46875, "grad_norm": 2.6081245663182995, "grpo_loss": 0.038738559873308986, "kl": 0.120849609375, "learning_rate": 1.2225209339563144e-05, "loss": 0.0871, "num_tokens": 7722198.0, "reward": 2.69921875, "reward_std": 0.23277199268341064, "rewards/accuracy_reward/mean": 0.69921875, "rewards/accuracy_reward/std": 0.4489804431796074, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.11994863720610738, "stage2_sft_loss": 0.013875026190362405, "step": 115, "total_loss": 0.16007469967007637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 133.05078125, "completions/mean_terminated_length": 133.05078125, "completions/min_length": 67.5, "completions/min_terminated_length": 67.5, "epoch": 0.49572649572649574, "frac_reward_zero_std": 0.4375, "grad_norm": 2.565895224486075, "grpo_loss": 0.018095170467859134, "kl": 0.112060546875, "learning_rate": 1.2079116908177592e-05, "loss": 0.0927, "num_tokens": 7789883.0, "reward": 2.80078125, "reward_std": 0.2501298226416111, "rewards/accuracy_reward/mean": 0.80078125, "rewards/accuracy_reward/std": 0.40074585378170013, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.07847420917823911, "stage2_sft_loss": 0.004641155697754584, "step": 116, "total_loss": 0.09703349741175771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 136.375, "completions/mean_terminated_length": 136.375, "completions/min_length": 68.25, "completions/min_terminated_length": 68.25, "epoch": 0.5, "frac_reward_zero_std": 0.34375, "grad_norm": 2.473421491696022, "grpo_loss": 0.07887550839222968, "kl": 0.117431640625, "learning_rate": 1.1932559177955533e-05, "loss": 0.0872, "num_tokens": 7858947.0, "reward": 2.71875, "reward_std": 0.3031258285045624, "rewards/accuracy_reward/mean": 0.71875, "rewards/accuracy_reward/std": 0.4462125226855278, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.034118348732590675, "stage2_sft_loss": 0.0057628911308711395, "step": 117, "total_loss": 0.11357014431268908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 254.25, "completions/max_terminated_length": 254.25, "completions/mean_length": 141.44140625, "completions/mean_terminated_length": 141.44140625, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.5042735042735043, "frac_reward_zero_std": 0.4375, "grad_norm": 2.3987615620601446, "grpo_loss": 0.004953495837980881, "kl": 0.109130859375, "learning_rate": 1.1785568947986368e-05, "loss": 0.0828, "num_tokens": 7928500.0, "reward": 2.83203125, "reward_std": 0.22108563408255577, "rewards/accuracy_reward/mean": 0.83203125, "rewards/accuracy_reward/std": 0.36582332849502563, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.03221064526587725, "stage2_sft_loss": 0.00645940622780472, "step": 118, "total_loss": 0.037810081616044044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 145.38671875, "completions/mean_terminated_length": 145.38671875, "completions/min_length": 72.5, "completions/min_terminated_length": 72.5, "epoch": 0.5085470085470085, "frac_reward_zero_std": 0.53125, "grad_norm": 2.2409935628410236, "grpo_loss": 0.01667332003125921, "kl": 0.105224609375, "learning_rate": 1.1638179114151378e-05, "loss": 0.0885, "num_tokens": 7999543.0, "reward": 2.84375, "reward_std": 0.19647981226444244, "rewards/accuracy_reward/mean": 0.84375, "rewards/accuracy_reward/std": 0.35410288721323013, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.08437999477609992, "stage2_sft_loss": 0.006384373642504215, "step": 119, "total_loss": 0.10169175546616316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 338.75, "completions/max_terminated_length": 338.75, "completions/mean_length": 131.30859375, "completions/mean_terminated_length": 131.30859375, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.5128205128205128, "frac_reward_zero_std": 0.4375, "grad_norm": 2.6895306254784965, "grpo_loss": -0.0084699469152838, "kl": 0.12841796875, "learning_rate": 1.1490422661761744e-05, "loss": 0.0962, "num_tokens": 8065454.0, "reward": 2.78125, "reward_std": 0.2585509456694126, "rewards/accuracy_reward/mean": 0.78125, "rewards/accuracy_reward/std": 0.4090314507484436, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.06598297040909529, "stage2_sft_loss": 0.009453605287490063, "step": 120, "total_loss": 0.05845838412642479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 294.25, "completions/max_terminated_length": 294.25, "completions/mean_length": 130.08203125, "completions/mean_terminated_length": 130.08203125, "completions/min_length": 68.75, "completions/min_terminated_length": 68.75, "epoch": 0.5170940170940171, "frac_reward_zero_std": 0.4375, "grad_norm": 2.708431066615691, "grpo_loss": -0.01987017970532179, "kl": 0.122314453125, "learning_rate": 1.1342332658176556e-05, "loss": 0.0947, "num_tokens": 8132147.0, "reward": 2.69921875, "reward_std": 0.24553291127085686, "rewards/accuracy_reward/mean": 0.69921875, "rewards/accuracy_reward/std": 0.4452369287610054, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.06503801513463259, "stage2_sft_loss": 0.010300099500454962, "step": 121, "total_loss": 0.04619784792885184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 144.5625, "completions/mean_terminated_length": 144.5625, "completions/min_length": 58.75, "completions/min_terminated_length": 58.75, "epoch": 0.5213675213675214, "frac_reward_zero_std": 0.46875, "grad_norm": 2.455987899386292, "grpo_loss": -0.023904464964289218, "kl": 0.1068115234375, "learning_rate": 1.1193942245402443e-05, "loss": 0.0916, "num_tokens": 8202891.0, "reward": 2.8046875, "reward_std": 0.23554520681500435, "rewards/accuracy_reward/mean": 0.8046875, "rewards/accuracy_reward/std": 0.3951949328184128, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.054928970988839865, "stage2_sft_loss": 0.005394812709710095, "step": 122, "total_loss": 0.031563987489789724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 292.25, "completions/max_terminated_length": 292.25, "completions/mean_length": 146.9375, "completions/mean_terminated_length": 146.9375, "completions/min_length": 61.5, "completions/min_terminated_length": 61.5, "epoch": 0.5256410256410257, "frac_reward_zero_std": 0.40625, "grad_norm": 2.9877442732221833, "grpo_loss": 0.009980375471059233, "kl": 0.1158447265625, "learning_rate": 1.1045284632676535e-05, "loss": 0.0847, "num_tokens": 8274523.0, "reward": 2.6171875, "reward_std": 0.27248402312397957, "rewards/accuracy_reward/mean": 0.6171875, "rewards/accuracy_reward/std": 0.4808383285999298, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.10333962785080075, "stage2_sft_loss": 0.036806404881644994, "step": 123, "total_loss": 0.11700064362958074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 319.75, "completions/max_terminated_length": 319.75, "completions/mean_length": 129.9453125, "completions/mean_terminated_length": 129.9453125, "completions/min_length": 66.25, "completions/min_terminated_length": 66.25, "epoch": 0.5299145299145299, "frac_reward_zero_std": 0.53125, "grad_norm": 2.3026834720930385, "grpo_loss": 0.013034810457611457, "kl": 0.093017578125, "learning_rate": 1.0896393089034336e-05, "loss": 0.0679, "num_tokens": 8340973.0, "reward": 2.8359375, "reward_std": 0.20437543839216232, "rewards/accuracy_reward/mean": 0.8359375, "rewards/accuracy_reward/std": 0.37163110077381134, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.06969350064173341, "stage2_sft_loss": 0.00450861056742724, "step": 124, "total_loss": 0.08317917049862444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 264.5, "completions/max_terminated_length": 264.5, "completions/mean_length": 132.69140625, "completions/mean_terminated_length": 132.69140625, "completions/min_length": 64.75, "completions/min_terminated_length": 64.75, "epoch": 0.5341880341880342, "frac_reward_zero_std": 0.3125, "grad_norm": 2.46335246220877, "grpo_loss": -0.01052320736926049, "kl": 0.1055908203125, "learning_rate": 1.0747300935864245e-05, "loss": 0.0823, "num_tokens": 8407934.0, "reward": 2.71875, "reward_std": 0.2909066006541252, "rewards/accuracy_reward/mean": 0.71875, "rewards/accuracy_reward/std": 0.4371757209300995, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.04156210971996188, "stage2_sft_loss": 0.016224085906287655, "step": 125, "total_loss": 0.032661307603120804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 252.25, "completions/max_terminated_length": 252.25, "completions/mean_length": 133.44140625, "completions/mean_terminated_length": 133.44140625, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.5384615384615384, "frac_reward_zero_std": 0.375, "grad_norm": 2.380614799508239, "grpo_loss": -0.035499130171956494, "kl": 0.1060791015625, "learning_rate": 1.0598041539450344e-05, "loss": 0.0474, "num_tokens": 8475375.0, "reward": 2.7265625, "reward_std": 0.2852449417114258, "rewards/accuracy_reward/mean": 0.7265625, "rewards/accuracy_reward/std": 0.4341081902384758, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.09117380063980818, "stage2_sft_loss": 0.027107439294923097, "step": 126, "total_loss": 0.05838541442062706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 257.25, "completions/max_terminated_length": 257.25, "completions/mean_length": 122.09375, "completions/mean_terminated_length": 122.09375, "completions/min_length": 59.25, "completions/min_terminated_length": 59.25, "epoch": 0.5427350427350427, "frac_reward_zero_std": 0.40625, "grad_norm": 2.454086196019503, "grpo_loss": 0.016646095959004015, "kl": 0.1180419921875, "learning_rate": 1.044864830350515e-05, "loss": 0.0735, "num_tokens": 8539295.0, "reward": 2.6953125, "reward_std": 0.2588193491101265, "rewards/accuracy_reward/mean": 0.6953125, "rewards/accuracy_reward/std": 0.45702145248651505, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.04539195611141622, "stage2_sft_loss": 0.017618531346670352, "step": 127, "total_loss": 0.0637999044265598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 309.25, "completions/max_terminated_length": 309.25, "completions/mean_length": 135.65234375, "completions/mean_terminated_length": 135.65234375, "completions/min_length": 62.25, "completions/min_terminated_length": 62.25, "epoch": 0.5470085470085471, "frac_reward_zero_std": 0.34375, "grad_norm": 2.8749697279234163, "grpo_loss": -0.028164366842247546, "kl": 0.1300048828125, "learning_rate": 1.0299154661693987e-05, "loss": 0.1046, "num_tokens": 8607918.0, "reward": 2.6953125, "reward_std": 0.2952302098274231, "rewards/accuracy_reward/mean": 0.6953125, "rewards/accuracy_reward/std": 0.44884125888347626, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.06593989208340645, "stage2_sft_loss": 0.008943283930420876, "step": 128, "total_loss": 0.03866985347121954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 145.3828125, "completions/mean_terminated_length": 145.3828125, "completions/min_length": 71.5, "completions/min_terminated_length": 71.5, "epoch": 0.5512820512820513, "frac_reward_zero_std": 0.46875, "grad_norm": 2.2360139721857832, "grpo_loss": -0.0003669927828013897, "kl": 0.1151123046875, "learning_rate": 1.0149594070152638e-05, "loss": 0.0799, "num_tokens": 8677888.0, "reward": 2.73828125, "reward_std": 0.2475024051964283, "rewards/accuracy_reward/mean": 0.73828125, "rewards/accuracy_reward/std": 0.4189733415842056, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.11887484043836594, "stage2_sft_loss": 0.014625519164837897, "step": 129, "total_loss": 0.11997039895504713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 270.5, "completions/max_terminated_length": 270.5, "completions/mean_length": 144.18359375, "completions/mean_terminated_length": 144.18359375, "completions/min_length": 67.5, "completions/min_terminated_length": 67.5, "epoch": 0.5555555555555556, "frac_reward_zero_std": 0.5, "grad_norm": 2.0756110198748474, "grpo_loss": 0.03972660058934707, "kl": 0.0960693359375, "learning_rate": 1e-05, "loss": 0.0461, "num_tokens": 8750079.0, "reward": 2.80859375, "reward_std": 0.22278673481196165, "rewards/accuracy_reward/mean": 0.80859375, "rewards/accuracy_reward/std": 0.348017118871212, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.050859407521784306, "stage2_sft_loss": 0.00923627592419507, "step": 130, "total_loss": 0.0915096364915371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 228.25, "completions/max_terminated_length": 228.25, "completions/mean_length": 126.390625, "completions/mean_terminated_length": 126.390625, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.5598290598290598, "frac_reward_zero_std": 0.5625, "grad_norm": 2.90853344624612, "grpo_loss": -0.005080391361843795, "kl": 0.121337890625, "learning_rate": 9.850405929847367e-06, "loss": 0.058, "num_tokens": 8814987.0, "reward": 2.779296875, "reward_std": 0.21056002005934715, "rewards/accuracy_reward/mean": 0.84765625, "rewards/accuracy_reward/std": 0.3576347529888153, "rewards/format_reward/mean": 0.9453125, "rewards/format_reward/std": 0.15677954256534576, "rewards/tag_count_reward/mean": 0.986328125, "rewards/tag_count_reward/std": 0.03919488564133644, "stage1_sft_loss": 0.04821830568835139, "stage2_sft_loss": 0.006851830825326033, "step": 131, "total_loss": 0.043823097832500935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 253.75, "completions/max_terminated_length": 253.75, "completions/mean_length": 138.89453125, "completions/mean_terminated_length": 138.89453125, "completions/min_length": 58.5, "completions/min_terminated_length": 58.5, "epoch": 0.5641025641025641, "frac_reward_zero_std": 0.4375, "grad_norm": 2.2783596807349347, "grpo_loss": -0.0668418699860922, "kl": 0.0982666015625, "learning_rate": 9.700845338306018e-06, "loss": 0.042, "num_tokens": 8884136.0, "reward": 2.76171875, "reward_std": 0.2506577856838703, "rewards/accuracy_reward/mean": 0.76171875, "rewards/accuracy_reward/std": 0.4220747724175453, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.04070015996694565, "stage2_sft_loss": 0.010169286300879321, "step": 132, "total_loss": -0.02512478199787438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 150.24609375, "completions/mean_terminated_length": 150.24609375, "completions/min_length": 57.25, "completions/min_terminated_length": 57.25, "epoch": 0.5683760683760684, "frac_reward_zero_std": 0.4375, "grad_norm": 2.527051548886218, "grpo_loss": -0.05759265075903386, "kl": 0.103271484375, "learning_rate": 9.551351696494854e-06, "loss": 0.0655, "num_tokens": 8956583.0, "reward": 2.64453125, "reward_std": 0.2313353642821312, "rewards/accuracy_reward/mean": 0.64453125, "rewards/accuracy_reward/std": 0.45752032846212387, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.11429626494646072, "stage2_sft_loss": 0.04731386760249734, "step": 133, "total_loss": 0.06143500283360481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 273.5, "completions/max_terminated_length": 273.5, "completions/mean_length": 124.93359375, "completions/mean_terminated_length": 124.93359375, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.5726495726495726, "frac_reward_zero_std": 0.6875, "grad_norm": 2.2775167385076047, "grpo_loss": -0.011081848337198608, "kl": 0.093017578125, "learning_rate": 9.401958460549658e-06, "loss": 0.0553, "num_tokens": 9022390.0, "reward": 2.80859375, "reward_std": 0.13611222617328167, "rewards/accuracy_reward/mean": 0.80859375, "rewards/accuracy_reward/std": 0.3880981504917145, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.06806533015333116, "stage2_sft_loss": 0.011031713336706161, "step": 134, "total_loss": 0.058086653240025043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 280.25, "completions/max_terminated_length": 280.25, "completions/mean_length": 127.94921875, "completions/mean_terminated_length": 127.94921875, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.5769230769230769, "frac_reward_zero_std": 0.40625, "grad_norm": 2.2333257978413013, "grpo_loss": 0.046479592099785805, "kl": 0.0970458984375, "learning_rate": 9.252699064135759e-06, "loss": 0.0892, "num_tokens": 9088265.0, "reward": 2.80859375, "reward_std": 0.25434111803770065, "rewards/accuracy_reward/mean": 0.80859375, "rewards/accuracy_reward/std": 0.35886215418577194, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.06378544913604856, "stage2_sft_loss": 0.018909562553744763, "step": 135, "total_loss": 0.11215599719434977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 233.5, "completions/max_terminated_length": 233.5, "completions/mean_length": 116.78125, "completions/mean_terminated_length": 116.78125, "completions/min_length": 56.25, "completions/min_terminated_length": 56.25, "epoch": 0.5811965811965812, "frac_reward_zero_std": 0.5, "grad_norm": 2.4767783638602623, "grpo_loss": -0.027499133881065063, "kl": 0.10888671875, "learning_rate": 9.103606910965666e-06, "loss": 0.0663, "num_tokens": 9151545.0, "reward": 2.83984375, "reward_std": 0.2068792637437582, "rewards/accuracy_reward/mean": 0.83984375, "rewards/accuracy_reward/std": 0.36639247089624405, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.05348319374024868, "stage2_sft_loss": 0.008430409914581105, "step": 136, "total_loss": 0.026827102527022362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 355.75, "completions/max_terminated_length": 355.75, "completions/mean_length": 148.94921875, "completions/mean_terminated_length": 148.94921875, "completions/min_length": 66.5, "completions/min_terminated_length": 66.5, "epoch": 0.5854700854700855, "frac_reward_zero_std": 0.375, "grad_norm": 2.329224577124586, "grpo_loss": 0.026575487572699785, "kl": 0.11669921875, "learning_rate": 8.954715367323468e-06, "loss": 0.0665, "num_tokens": 9223548.0, "reward": 2.734375, "reward_std": 0.2607549577951431, "rewards/accuracy_reward/mean": 0.73828125, "rewards/accuracy_reward/std": 0.43093063682317734, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.015248439274728298, "stage1_sft_loss": 0.08758598286658525, "stage2_sft_loss": 0.05217828591048601, "step": 137, "total_loss": 0.11937929969280958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 134.59765625, "completions/mean_terminated_length": 134.59765625, "completions/min_length": 55.5, "completions/min_terminated_length": 55.5, "epoch": 0.5897435897435898, "frac_reward_zero_std": 0.5, "grad_norm": 2.5035557731102522, "grpo_loss": 0.011050511908251792, "kl": 0.1112060546875, "learning_rate": 8.806057754597559e-06, "loss": 0.0836, "num_tokens": 9291653.0, "reward": 2.8046875, "reward_std": 0.23132899776101112, "rewards/accuracy_reward/mean": 0.8046875, "rewards/accuracy_reward/std": 0.39095889776945114, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.05431360239163041, "stage2_sft_loss": 0.009981016424717382, "step": 138, "total_loss": 0.06636221474036574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 125.7109375, "completions/mean_terminated_length": 125.7109375, "completions/min_length": 59.5, "completions/min_terminated_length": 59.5, "epoch": 0.594017094017094, "frac_reward_zero_std": 0.5625, "grad_norm": 2.361583240185186, "grpo_loss": -0.033124256053270074, "kl": 0.100341796875, "learning_rate": 8.657667341823449e-06, "loss": 0.0876, "num_tokens": 9356483.0, "reward": 2.8359375, "reward_std": 0.19332443177700043, "rewards/accuracy_reward/mean": 0.8359375, "rewards/accuracy_reward/std": 0.3644886091351509, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.07262712554074824, "stage2_sft_loss": 0.014838075108855264, "step": 139, "total_loss": 0.04098667961079627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 443.25, "completions/max_terminated_length": 443.25, "completions/mean_length": 147.3984375, "completions/mean_terminated_length": 147.3984375, "completions/min_length": 65.5, "completions/min_terminated_length": 65.5, "epoch": 0.5982905982905983, "frac_reward_zero_std": 0.4375, "grad_norm": 2.140356398926018, "grpo_loss": -0.05637400213163346, "kl": 0.08587646484375, "learning_rate": 8.509577338238255e-06, "loss": 0.0703, "num_tokens": 9427193.0, "reward": 2.765625, "reward_std": 0.2494782730937004, "rewards/accuracy_reward/mean": 0.765625, "rewards/accuracy_reward/std": 0.4166136011481285, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.05473982123658061, "stage2_sft_loss": 0.05722901329136221, "step": 140, "total_loss": 0.004088719375431538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 137.15234375, "completions/mean_terminated_length": 137.15234375, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.6025641025641025, "frac_reward_zero_std": 0.5625, "grad_norm": 2.5103848979553662, "grpo_loss": 0.058950818143785, "kl": 0.107177734375, "learning_rate": 8.361820885848623e-06, "loss": 0.0941, "num_tokens": 9495024.0, "reward": 2.78515625, "reward_std": 0.1876691598445177, "rewards/accuracy_reward/mean": 0.78515625, "rewards/accuracy_reward/std": 0.39204035699367523, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.07926928717643023, "stage2_sft_loss": 0.008115518408885691, "step": 141, "total_loss": 0.13903165794909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 322.75, "completions/max_terminated_length": 322.75, "completions/mean_length": 137.8203125, "completions/mean_terminated_length": 137.8203125, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.6068376068376068, "frac_reward_zero_std": 0.53125, "grad_norm": 2.5391650094703944, "grpo_loss": 0.07669655280187726, "kl": 0.1102294921875, "learning_rate": 8.214431052013636e-06, "loss": 0.091, "num_tokens": 9563378.0, "reward": 2.796875, "reward_std": 0.19647981226444244, "rewards/accuracy_reward/mean": 0.796875, "rewards/accuracy_reward/std": 0.37679756060242653, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.10939254239201546, "stage2_sft_loss": 0.00877940544160083, "step": 142, "total_loss": 0.18696703761816025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 298.25, "completions/max_terminated_length": 298.25, "completions/mean_length": 139.26171875, "completions/mean_terminated_length": 139.26171875, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.6111111111111112, "frac_reward_zero_std": 0.40625, "grad_norm": 2.4841493562770363, "grpo_loss": 0.020590394386090338, "kl": 0.12451171875, "learning_rate": 8.06744082204447e-06, "loss": 0.0967, "num_tokens": 9630797.0, "reward": 2.7578125, "reward_std": 0.2571094296872616, "rewards/accuracy_reward/mean": 0.7578125, "rewards/accuracy_reward/std": 0.4005141332745552, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.05549432337284088, "stage2_sft_loss": 0.014109492563875392, "step": 143, "total_loss": 0.07749566785059869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 255.75, "completions/max_terminated_length": 255.75, "completions/mean_length": 135.46484375, "completions/mean_terminated_length": 135.46484375, "completions/min_length": 67.75, "completions/min_terminated_length": 67.75, "epoch": 0.6153846153846154, "frac_reward_zero_std": 0.5625, "grad_norm": 2.64956235594402, "grpo_loss": -0.012727254332276061, "kl": 0.097412109375, "learning_rate": 7.92088309182241e-06, "loss": 0.0809, "num_tokens": 9698876.0, "reward": 2.8046875, "reward_std": 0.19739338383078575, "rewards/accuracy_reward/mean": 0.8046875, "rewards/accuracy_reward/std": 0.394244909286499, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.1811660141684115, "stage2_sft_loss": 0.0043692881081369705, "step": 144, "total_loss": 0.16887568403035402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 279.25, "completions/max_terminated_length": 279.25, "completions/mean_length": 126.53125, "completions/mean_terminated_length": 126.53125, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.6196581196581197, "frac_reward_zero_std": 0.625, "grad_norm": 2.6250671163078483, "grpo_loss": 0.008420213998761028, "kl": 0.1058349609375, "learning_rate": 7.774790660436857e-06, "loss": 0.0779, "num_tokens": 9763884.0, "reward": 2.8359375, "reward_std": 0.1701665222644806, "rewards/accuracy_reward/mean": 0.8359375, "rewards/accuracy_reward/std": 0.3677399829030037, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.04537341580726206, "stage2_sft_loss": 0.0017060633217624854, "step": 145, "total_loss": 0.053964235819876194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 131.6796875, "completions/mean_terminated_length": 131.6796875, "completions/min_length": 62.25, "completions/min_terminated_length": 62.25, "epoch": 0.6239316239316239, "frac_reward_zero_std": 0.53125, "grad_norm": 2.686921408632962, "grpo_loss": -0.002991177316289395, "kl": 0.09326171875, "learning_rate": 7.629196222845027e-06, "loss": 0.0558, "num_tokens": 9831138.0, "reward": 2.84765625, "reward_std": 0.20384501945227385, "rewards/accuracy_reward/mean": 0.84765625, "rewards/accuracy_reward/std": 0.34441937878727913, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.05802195239812136, "stage2_sft_loss": 0.00841883840621449, "step": 146, "total_loss": 0.055872660130262375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 127.69140625, "completions/mean_terminated_length": 127.69140625, "completions/min_length": 56.5, "completions/min_terminated_length": 56.5, "epoch": 0.6282051282051282, "frac_reward_zero_std": 0.53125, "grad_norm": 2.338255966369413, "grpo_loss": 0.032924949657171965, "kl": 0.10693359375, "learning_rate": 7.484132362554915e-06, "loss": 0.0601, "num_tokens": 9896459.0, "reward": 2.859375, "reward_std": 0.20437543839216232, "rewards/accuracy_reward/mean": 0.859375, "rewards/accuracy_reward/std": 0.34932583570480347, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.08136490918695927, "stage2_sft_loss": 0.014175453514326364, "step": 147, "total_loss": 0.11570740700699389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 303.25, "completions/max_terminated_length": 303.25, "completions/mean_length": 144.77734375, "completions/mean_terminated_length": 144.77734375, "completions/min_length": 66.25, "completions/min_terminated_length": 66.25, "epoch": 0.6324786324786325, "frac_reward_zero_std": 0.5, "grad_norm": 2.495909790539816, "grpo_loss": -0.011645266757113859, "kl": 0.1090087890625, "learning_rate": 7.33963154433325e-06, "loss": 0.0714, "num_tokens": 9967802.0, "reward": 2.765625, "reward_std": 0.2217283584177494, "rewards/accuracy_reward/mean": 0.765625, "rewards/accuracy_reward/std": 0.412590354681015, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.05564484978094697, "stage2_sft_loss": 0.032550239644479007, "step": 148, "total_loss": 0.04725460661575198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 284.5, "completions/max_terminated_length": 284.5, "completions/mean_length": 146.7421875, "completions/mean_terminated_length": 146.7421875, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.6367521367521367, "frac_reward_zero_std": 0.34375, "grad_norm": 2.3326058076550127, "grpo_loss": 0.016894530330318958, "kl": 0.1033935546875, "learning_rate": 7.1957261069399745e-06, "loss": 0.0564, "num_tokens": 10040056.0, "reward": 2.85546875, "reward_std": 0.27526114881038666, "rewards/accuracy_reward/mean": 0.85546875, "rewards/accuracy_reward/std": 0.3532208576798439, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.03281405568122864, "stage2_sft_loss": 0.009040810575243086, "step": 149, "total_loss": 0.05061266664415598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 458.5, "completions/max_terminated_length": 323.0, "completions/mean_length": 129.56640625, "completions/mean_terminated_length": 126.96162033081055, "completions/min_length": 60.5, "completions/min_terminated_length": 60.5, "epoch": 0.6410256410256411, "frac_reward_zero_std": 0.5, "grad_norm": 2.866667260437921, "grpo_loss": 0.02925427898298949, "kl": 0.13037109375, "learning_rate": 7.052448255890958e-06, "loss": 0.1456, "num_tokens": 10105953.0, "reward": 2.7548828125, "reward_std": 0.24152512475848198, "rewards/accuracy_reward/mean": 0.76171875, "rewards/accuracy_reward/std": 0.4230128526687622, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.03125, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.02343750186264515, "stage1_sft_loss": 0.13295416068285704, "stage2_sft_loss": 0.005717331481719157, "step": 150, "total_loss": 0.1627801824361086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 236.75, "completions/max_terminated_length": 236.75, "completions/mean_length": 134.05078125, "completions/mean_terminated_length": 134.05078125, "completions/min_length": 61.75, "completions/min_terminated_length": 61.75, "epoch": 0.6452991452991453, "frac_reward_zero_std": 0.3125, "grad_norm": 2.597988976639514, "grpo_loss": 0.05413528915960342, "kl": 0.118408203125, "learning_rate": 6.909830056250527e-06, "loss": 0.0892, "num_tokens": 10174230.0, "reward": 2.76953125, "reward_std": 0.3056345507502556, "rewards/accuracy_reward/mean": 0.76953125, "rewards/accuracy_reward/std": 0.4202374145388603, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.09626385103911161, "stage2_sft_loss": 0.0072241057805513265, "step": 151, "total_loss": 0.1511215539649129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 145.06640625, "completions/mean_terminated_length": 145.06640625, "completions/min_length": 71.25, "completions/min_terminated_length": 71.25, "epoch": 0.6495726495726496, "frac_reward_zero_std": 0.5, "grad_norm": 2.5961261287645816, "grpo_loss": -0.017629655718337744, "kl": 0.1204833984375, "learning_rate": 6.767903425455402e-06, "loss": 0.0789, "num_tokens": 10244967.0, "reward": 2.75, "reward_std": 0.22225632518529892, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.3980471268296242, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.0796721177175641, "stage2_sft_loss": 0.01425616116830497, "step": 152, "total_loss": 0.06346807722002268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 123.4921875, "completions/mean_terminated_length": 123.4921875, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.6538461538461539, "frac_reward_zero_std": 0.5, "grad_norm": 2.5163651597741836, "grpo_loss": 0.04610937915276736, "kl": 0.09375, "learning_rate": 6.6267001261717015e-06, "loss": 0.0741, "num_tokens": 10310381.0, "reward": 2.86328125, "reward_std": 0.22172590345144272, "rewards/accuracy_reward/mean": 0.86328125, "rewards/accuracy_reward/std": 0.326527189463377, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.04725873947609216, "stage2_sft_loss": 0.00662843362079002, "step": 153, "total_loss": 0.09403096046298742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 262.5, "completions/max_terminated_length": 262.5, "completions/mean_length": 135.859375, "completions/mean_terminated_length": 135.859375, "completions/min_length": 64.5, "completions/min_terminated_length": 64.5, "epoch": 0.6581196581196581, "frac_reward_zero_std": 0.5625, "grad_norm": 2.4252492975801236, "grpo_loss": -0.00364104809705168, "kl": 0.082763671875, "learning_rate": 6.486251759186573e-06, "loss": 0.0626, "num_tokens": 10379817.0, "reward": 2.7890625, "reward_std": 0.1837237924337387, "rewards/accuracy_reward/mean": 0.7890625, "rewards/accuracy_reward/std": 0.38352732732892036, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.06735014263540506, "stage2_sft_loss": 0.01930423468002118, "step": 154, "total_loss": 0.06563951540738344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 142.26171875, "completions/mean_terminated_length": 142.26171875, "completions/min_length": 73.25, "completions/min_terminated_length": 73.25, "epoch": 0.6623931623931624, "frac_reward_zero_std": 0.5625, "grad_norm": 2.366411089319022, "grpo_loss": -0.054288885701680556, "kl": 0.0863037109375, "learning_rate": 6.34658975633605e-06, "loss": 0.0536, "num_tokens": 10449180.0, "reward": 2.8046875, "reward_std": 0.19108654744923115, "rewards/accuracy_reward/mean": 0.8046875, "rewards/accuracy_reward/std": 0.3968508318066597, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.035379976499825716, "stage2_sft_loss": 0.010212840817985125, "step": 155, "total_loss": -0.017887625843286514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 298.5, "completions/max_terminated_length": 298.5, "completions/mean_length": 142.125, "completions/mean_terminated_length": 142.125, "completions/min_length": 76.5, "completions/min_terminated_length": 76.5, "epoch": 0.6666666666666666, "frac_reward_zero_std": 0.5, "grad_norm": 2.129602732934006, "grpo_loss": 0.035368968499824405, "kl": 0.08502197265625, "learning_rate": 6.207745373470717e-06, "loss": 0.0831, "num_tokens": 10518636.0, "reward": 2.859375, "reward_std": 0.20923583209514618, "rewards/accuracy_reward/mean": 0.859375, "rewards/accuracy_reward/std": 0.3443687930703163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.06338846497237682, "stage2_sft_loss": 0.012052012367348652, "step": 156, "total_loss": 0.0999626386910677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 275.5, "completions/max_terminated_length": 275.5, "completions/mean_length": 144.5234375, "completions/mean_terminated_length": 144.5234375, "completions/min_length": 75.25, "completions/min_terminated_length": 75.25, "epoch": 0.6709401709401709, "frac_reward_zero_std": 0.4375, "grad_norm": 2.3169054497973565, "grpo_loss": -0.002087805936753284, "kl": 0.0943603515625, "learning_rate": 6.069749683460765e-06, "loss": 0.0789, "num_tokens": 10588434.0, "reward": 2.69921875, "reward_std": 0.23357326164841652, "rewards/accuracy_reward/mean": 0.69921875, "rewards/accuracy_reward/std": 0.45767590403556824, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.04589258902706206, "stage2_sft_loss": 0.006625795169384219, "step": 157, "total_loss": 0.04446736362297088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 383.25, "completions/max_terminated_length": 238.25, "completions/mean_length": 131.90234375, "completions/mean_terminated_length": 129.2612247467041, "completions/min_length": 80.5, "completions/min_terminated_length": 80.5, "epoch": 0.6752136752136753, "frac_reward_zero_std": 0.5625, "grad_norm": 2.2919519383538747, "grpo_loss": 0.026327846964704804, "kl": 0.087646484375, "learning_rate": 5.932633569242e-06, "loss": 0.1066, "num_tokens": 10654705.0, "reward": 2.8330078125, "reward_std": 0.20864389650523663, "rewards/accuracy_reward/mean": 0.83984375, "rewards/accuracy_reward/std": 0.3436870872974396, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.03125, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.02343750186264515, "stage1_sft_loss": 0.05269382195547223, "stage2_sft_loss": 0.0020705054157588165, "step": 158, "total_loss": 0.07922872109338641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 275.5, "completions/max_terminated_length": 275.5, "completions/mean_length": 133.83203125, "completions/mean_terminated_length": 133.83203125, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.6794871794871795, "frac_reward_zero_std": 0.53125, "grad_norm": 2.295106029646828, "grpo_loss": 0.014149629918392748, "kl": 0.107421875, "learning_rate": 5.796427716904347e-06, "loss": 0.0656, "num_tokens": 10721630.0, "reward": 2.80078125, "reward_std": 0.20555494353175163, "rewards/accuracy_reward/mean": 0.80078125, "rewards/accuracy_reward/std": 0.3571106605231762, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.04844714910723269, "stage2_sft_loss": 0.00807800801248959, "step": 159, "total_loss": 0.06340457918122411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 231.5, "completions/max_terminated_length": 231.5, "completions/mean_length": 124.92578125, "completions/mean_terminated_length": 124.92578125, "completions/min_length": 62.75, "completions/min_terminated_length": 62.75, "epoch": 0.6837606837606838, "frac_reward_zero_std": 0.34375, "grad_norm": 2.398123263945807, "grpo_loss": 0.01794378731995039, "kl": 0.0992431640625, "learning_rate": 5.66116260882442e-06, "loss": 0.0669, "num_tokens": 10785939.0, "reward": 2.8046875, "reward_std": 0.29234322160482407, "rewards/accuracy_reward/mean": 0.8046875, "rewards/accuracy_reward/std": 0.39095889776945114, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.03389233513735235, "stage2_sft_loss": 0.018274757923791185, "step": 160, "total_loss": 0.05366359875188209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 238.75, "completions/max_terminated_length": 238.75, "completions/mean_length": 130.82421875, "completions/mean_terminated_length": 130.82421875, "completions/min_length": 66.25, "completions/min_terminated_length": 66.25, "epoch": 0.688034188034188, "frac_reward_zero_std": 0.40625, "grad_norm": 2.4713176066788654, "grpo_loss": -0.007550991605967283, "kl": 0.100830078125, "learning_rate": 5.526868516843673e-06, "loss": 0.0769, "num_tokens": 10852526.0, "reward": 2.77734375, "reward_std": 0.2656516842544079, "rewards/accuracy_reward/mean": 0.77734375, "rewards/accuracy_reward/std": 0.4104008078575134, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.11685018753632903, "stage2_sft_loss": 0.01413062890060246, "step": 161, "total_loss": 0.1107122590765357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 264.25, "completions/max_terminated_length": 264.25, "completions/mean_length": 132.74609375, "completions/mean_terminated_length": 132.74609375, "completions/min_length": 69.5, "completions/min_terminated_length": 69.5, "epoch": 0.6923076923076923, "frac_reward_zero_std": 0.53125, "grad_norm": 2.3313874096022835, "grpo_loss": 0.003736324142664671, "kl": 0.0833740234375, "learning_rate": 5.393575495493679e-06, "loss": 0.0709, "num_tokens": 10919077.0, "reward": 2.8125, "reward_std": 0.2117381915450096, "rewards/accuracy_reward/mean": 0.8125, "rewards/accuracy_reward/std": 0.36151088774204254, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.18217940255999565, "stage2_sft_loss": 0.01815573309431784, "step": 162, "total_loss": 0.18773129768669605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 237.25, "completions/max_terminated_length": 237.25, "completions/mean_length": 130.7421875, "completions/mean_terminated_length": 130.7421875, "completions/min_length": 66.5, "completions/min_terminated_length": 66.5, "epoch": 0.6965811965811965, "frac_reward_zero_std": 0.6875, "grad_norm": 2.466615149845605, "grpo_loss": -0.010769763757707551, "kl": 0.0894775390625, "learning_rate": 5.2613133752700145e-06, "loss": 0.0782, "num_tokens": 10986211.0, "reward": 2.890625, "reward_std": 0.13546558748930693, "rewards/accuracy_reward/mean": 0.890625, "rewards/accuracy_reward/std": 0.29157692193984985, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.0620655445381999, "stage2_sft_loss": 0.019713282701559365, "step": 163, "total_loss": 0.05326710897497833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 283.5, "completions/max_terminated_length": 283.5, "completions/mean_length": 137.265625, "completions/mean_terminated_length": 137.265625, "completions/min_length": 68.25, "completions/min_terminated_length": 68.25, "epoch": 0.7008547008547008, "frac_reward_zero_std": 0.53125, "grad_norm": 2.3553881379649866, "grpo_loss": -0.006676109624095261, "kl": 0.08697509765625, "learning_rate": 5.130111755956327e-06, "loss": 0.0871, "num_tokens": 11054735.0, "reward": 2.81640625, "reward_std": 0.20213509909808636, "rewards/accuracy_reward/mean": 0.81640625, "rewards/accuracy_reward/std": 0.34256091713905334, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.09478867053985596, "stage2_sft_loss": 0.01719207396672573, "step": 164, "total_loss": 0.08983177039772272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 252.5, "completions/max_terminated_length": 252.5, "completions/mean_length": 128.84375, "completions/mean_terminated_length": 128.84375, "completions/min_length": 64.75, "completions/min_terminated_length": 64.75, "epoch": 0.7051282051282052, "frac_reward_zero_std": 0.53125, "grad_norm": 2.772277422206105, "grpo_loss": 0.0985541757545434, "kl": 0.1038818359375, "learning_rate": 5.000000000000003e-06, "loss": 0.1192, "num_tokens": 11120839.0, "reward": 2.875, "reward_std": 0.20950030162930489, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.32767561450600624, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.08193079102784395, "stage2_sft_loss": 0.007009456341620535, "step": 165, "total_loss": 0.18118591140955687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 286.25, "completions/max_terminated_length": 286.25, "completions/mean_length": 139.16796875, "completions/mean_terminated_length": 139.16796875, "completions/min_length": 66.5, "completions/min_terminated_length": 66.5, "epoch": 0.7094017094017094, "frac_reward_zero_std": 0.46875, "grad_norm": 2.20156747003348, "grpo_loss": 0.0015803006826899946, "kl": 0.089111328125, "learning_rate": 4.87100722594094e-06, "loss": 0.0755, "num_tokens": 11190498.0, "reward": 2.7265625, "reward_std": 0.23330241069197655, "rewards/accuracy_reward/mean": 0.7265625, "rewards/accuracy_reward/std": 0.4193031042814255, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.09793944098055363, "stage2_sft_loss": 0.0024029574196902104, "step": 166, "total_loss": 0.09976003784686327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 298.75, "completions/max_terminated_length": 298.75, "completions/mean_length": 133.5859375, "completions/mean_terminated_length": 133.5859375, "completions/min_length": 77.5, "completions/min_terminated_length": 77.5, "epoch": 0.7136752136752137, "frac_reward_zero_std": 0.5625, "grad_norm": 2.0395625224823095, "grpo_loss": 0.003400918962142896, "kl": 0.089111328125, "learning_rate": 4.743162301894952e-06, "loss": 0.0721, "num_tokens": 11257320.0, "reward": 2.84375, "reward_std": 0.2008083276450634, "rewards/accuracy_reward/mean": 0.84375, "rewards/accuracy_reward/std": 0.3427247516810894, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.0751371227670461, "stage2_sft_loss": 0.0010712931361922529, "step": 167, "total_loss": 0.07864517066627741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 289.75, "completions/max_terminated_length": 289.75, "completions/mean_length": 142.6875, "completions/mean_terminated_length": 142.6875, "completions/min_length": 70.5, "completions/min_terminated_length": 70.5, "epoch": 0.717948717948718, "frac_reward_zero_std": 0.4375, "grad_norm": 2.2661004439431096, "grpo_loss": -0.009914183348882943, "kl": 0.09600830078125, "learning_rate": 4.616493839093179e-06, "loss": 0.0642, "num_tokens": 11327824.0, "reward": 2.734375, "reward_std": 0.2659137099981308, "rewards/accuracy_reward/mean": 0.734375, "rewards/accuracy_reward/std": 0.42969100922346115, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.09561806032434106, "stage2_sft_loss": 0.01580676135563408, "step": 168, "total_loss": 0.08728454890660942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 292.25, "completions/max_terminated_length": 292.25, "completions/mean_length": 127.33203125, "completions/mean_terminated_length": 127.33203125, "completions/min_length": 62.25, "completions/min_terminated_length": 62.25, "epoch": 0.7222222222222222, "frac_reward_zero_std": 0.59375, "grad_norm": 2.1872189970272204, "grpo_loss": 0.006283853610511869, "kl": 0.077392578125, "learning_rate": 4.491030185478976e-06, "loss": 0.069, "num_tokens": 11392709.0, "reward": 2.75, "reward_std": 0.192000113427639, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4205075278878212, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.06108791567385197, "stage2_sft_loss": 0.019755267803702736, "step": 169, "total_loss": 0.06934729870408773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 246.5, "completions/max_terminated_length": 246.5, "completions/mean_length": 120.265625, "completions/mean_terminated_length": 120.265625, "completions/min_length": 53.5, "completions/min_terminated_length": 53.5, "epoch": 0.7264957264957265, "frac_reward_zero_std": 0.5625, "grad_norm": 2.5086249757515575, "grpo_loss": 0.008289381628856063, "kl": 0.090087890625, "learning_rate": 4.3667994193637794e-06, "loss": 0.0616, "num_tokens": 11455393.0, "reward": 2.859375, "reward_std": 0.1837237998843193, "rewards/accuracy_reward/mean": 0.859375, "rewards/accuracy_reward/std": 0.34254971891641617, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.05750684905797243, "stage2_sft_loss": 0.004096082295291126, "step": 170, "total_loss": 0.0662058424204588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 271.75, "completions/max_terminated_length": 271.75, "completions/mean_length": 139.2265625, "completions/mean_terminated_length": 139.2265625, "completions/min_length": 58.75, "completions/min_terminated_length": 58.75, "epoch": 0.7307692307692307, "frac_reward_zero_std": 0.53125, "grad_norm": 2.4597641282544047, "grpo_loss": 0.0308269634260796, "kl": 0.0999755859375, "learning_rate": 4.2438293431432665e-06, "loss": 0.0742, "num_tokens": 11523891.0, "reward": 2.80078125, "reward_std": 0.22540531679987907, "rewards/accuracy_reward/mean": 0.80078125, "rewards/accuracy_reward/std": 0.40003233402967453, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.04699529241770506, "stage2_sft_loss": 0.010027769778389484, "step": 171, "total_loss": 0.07882503373548388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 310.5, "completions/max_terminated_length": 310.5, "completions/mean_length": 140.03515625, "completions/mean_terminated_length": 140.03515625, "completions/min_length": 68.25, "completions/min_terminated_length": 68.25, "epoch": 0.7350427350427351, "frac_reward_zero_std": 0.375, "grad_norm": 2.313340926620552, "grpo_loss": -0.016913561208639294, "kl": 0.09033203125, "learning_rate": 4.12214747707527e-06, "loss": 0.0633, "num_tokens": 11593892.0, "reward": 2.78515625, "reward_std": 0.28011762723326683, "rewards/accuracy_reward/mean": 0.78515625, "rewards/accuracy_reward/std": 0.4029533341526985, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.10063559375703335, "stage2_sft_loss": 0.004137711774092168, "step": 172, "total_loss": 0.08413580618798733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 249.25, "completions/max_terminated_length": 249.25, "completions/mean_length": 126.921875, "completions/mean_terminated_length": 126.921875, "completions/min_length": 69.25, "completions/min_terminated_length": 69.25, "epoch": 0.7393162393162394, "frac_reward_zero_std": 0.59375, "grad_norm": 2.4017578116497065, "grpo_loss": -0.021779871574835852, "kl": 0.0828857421875, "learning_rate": 4.001781053120863e-06, "loss": 0.0805, "num_tokens": 11659816.0, "reward": 2.8671875, "reward_std": 0.1816292516887188, "rewards/accuracy_reward/mean": 0.8671875, "rewards/accuracy_reward/std": 0.3129511810839176, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.13278495729900897, "stage2_sft_loss": 0.012614698614925146, "step": 173, "total_loss": 0.11226655612699687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 241.25, "completions/max_terminated_length": 241.25, "completions/mean_length": 128.96484375, "completions/mean_terminated_length": 128.96484375, "completions/min_length": 59.75, "completions/min_terminated_length": 59.75, "epoch": 0.7435897435897436, "frac_reward_zero_std": 0.5, "grad_norm": 2.5955191634448274, "grpo_loss": 0.009170918841846287, "kl": 0.098876953125, "learning_rate": 3.882757008849936e-06, "loss": 0.0864, "num_tokens": 11725263.0, "reward": 2.6953125, "reward_std": 0.2296190746128559, "rewards/accuracy_reward/mean": 0.6953125, "rewards/accuracy_reward/std": 0.42350123077630997, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.05286407144740224, "stage2_sft_loss": 0.02471150812925771, "step": 174, "total_loss": 0.06450614053755999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 315.5, "completions/max_terminated_length": 315.5, "completions/mean_length": 128.84765625, "completions/mean_terminated_length": 128.84765625, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.7478632478632479, "frac_reward_zero_std": 0.59375, "grad_norm": 2.1001294510887507, "grpo_loss": 0.004248449899023399, "kl": 0.087890625, "learning_rate": 3.7651019814126656e-06, "loss": 0.0612, "num_tokens": 11791504.0, "reward": 2.84375, "reward_std": 0.1714957458898425, "rewards/accuracy_reward/mean": 0.84375, "rewards/accuracy_reward/std": 0.33109506219625473, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.02404563978780061, "stage2_sft_loss": 0.005888056548428722, "step": 175, "total_loss": 0.028882895596325397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 246.75, "completions/max_terminated_length": 246.75, "completions/mean_length": 126.859375, "completions/mean_terminated_length": 126.859375, "completions/min_length": 68.25, "completions/min_terminated_length": 68.25, "epoch": 0.7521367521367521, "frac_reward_zero_std": 0.65625, "grad_norm": 2.467756391649865, "grpo_loss": 0.004664378764573485, "kl": 0.0887451171875, "learning_rate": 3.6488423015782128e-06, "loss": 0.0628, "num_tokens": 11857804.0, "reward": 2.87109375, "reward_std": 0.14256631769239902, "rewards/accuracy_reward/mean": 0.87109375, "rewards/accuracy_reward/std": 0.31893764436244965, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.041731708915904164, "stage2_sft_loss": 0.022018967356416397, "step": 176, "total_loss": 0.04859798448160291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 246.75, "completions/max_terminated_length": 246.75, "completions/mean_length": 134.4375, "completions/mean_terminated_length": 134.4375, "completions/min_length": 54.25, "completions/min_terminated_length": 54.25, "epoch": 0.7564102564102564, "frac_reward_zero_std": 0.46875, "grad_norm": 2.467385983440296, "grpo_loss": -0.0026853438175749034, "kl": 0.0904541015625, "learning_rate": 3.534003987842005e-06, "loss": 0.0838, "num_tokens": 11925716.0, "reward": 2.8203125, "reward_std": 0.22935950569808483, "rewards/accuracy_reward/mean": 0.8203125, "rewards/accuracy_reward/std": 0.347512349486351, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.06543307611718774, "stage2_sft_loss": 0.02554303167562466, "step": 177, "total_loss": 0.0653020367026329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 299.75, "completions/max_terminated_length": 299.75, "completions/mean_length": 137.3984375, "completions/mean_terminated_length": 137.3984375, "completions/min_length": 59.25, "completions/min_terminated_length": 59.25, "epoch": 0.7606837606837606, "frac_reward_zero_std": 0.40625, "grad_norm": 2.115557905994792, "grpo_loss": -0.04446130109135993, "kl": 0.0858154296875, "learning_rate": 3.4206127406028744e-06, "loss": 0.0473, "num_tokens": 11993618.0, "reward": 2.86328125, "reward_std": 0.2429143339395523, "rewards/accuracy_reward/mean": 0.86328125, "rewards/accuracy_reward/std": 0.34463661164045334, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.06367250951007009, "stage2_sft_loss": 0.008203808916732669, "step": 178, "total_loss": 0.020031588152050972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 297.25, "completions/max_terminated_length": 297.25, "completions/mean_length": 121.7890625, "completions/mean_terminated_length": 121.7890625, "completions/min_length": 68.75, "completions/min_terminated_length": 68.75, "epoch": 0.7649572649572649, "frac_reward_zero_std": 0.625, "grad_norm": 2.709248900203319, "grpo_loss": 0.0030166495707817376, "kl": 0.101806640625, "learning_rate": 3.308693936411421e-06, "loss": 0.0637, "num_tokens": 12056300.0, "reward": 2.7890625, "reward_std": 0.1803000308573246, "rewards/accuracy_reward/mean": 0.7890625, "rewards/accuracy_reward/std": 0.39430833607912064, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.05242747673764825, "stage2_sft_loss": 0.021223331823421177, "step": 179, "total_loss": 0.05756645882502198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 243.5, "completions/max_terminated_length": 243.5, "completions/mean_length": 125.8515625, "completions/mean_terminated_length": 125.8515625, "completions/min_length": 67.75, "completions/min_terminated_length": 67.75, "epoch": 0.7692307692307693, "frac_reward_zero_std": 0.46875, "grad_norm": 2.4130085372673205, "grpo_loss": 0.0032237190462183207, "kl": 0.0877685546875, "learning_rate": 3.1982726222908046e-06, "loss": 0.0791, "num_tokens": 12122358.0, "reward": 2.80078125, "reward_std": 0.2468533217906952, "rewards/accuracy_reward/mean": 0.80078125, "rewards/accuracy_reward/std": 0.395585760474205, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.05290105380117893, "stage2_sft_loss": 0.0006931633852218511, "step": 180, "total_loss": 0.056194088188931346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 118.58203125, "completions/mean_terminated_length": 118.58203125, "completions/min_length": 64.5, "completions/min_terminated_length": 64.5, "epoch": 0.7735042735042735, "frac_reward_zero_std": 0.59375, "grad_norm": 2.4269696259509774, "grpo_loss": 0.01719996099564014, "kl": 0.08306884765625, "learning_rate": 3.089373510131354e-06, "loss": 0.0624, "num_tokens": 12184427.0, "reward": 2.84375, "reward_std": 0.17609265074133873, "rewards/accuracy_reward/mean": 0.84375, "rewards/accuracy_reward/std": 0.34457017853856087, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.028878788463771343, "stage2_sft_loss": 0.007072446060192306, "step": 181, "total_loss": 0.04678599291946739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 127.0703125, "completions/mean_terminated_length": 127.0703125, "completions/min_length": 54.25, "completions/min_terminated_length": 54.25, "epoch": 0.7777777777777778, "frac_reward_zero_std": 0.4375, "grad_norm": 2.7281001020757256, "grpo_loss": 0.04298811010085046, "kl": 0.0975341796875, "learning_rate": 2.9820209711600858e-06, "loss": 0.063, "num_tokens": 12250877.0, "reward": 2.79296875, "reward_std": 0.2506577782332897, "rewards/accuracy_reward/mean": 0.79296875, "rewards/accuracy_reward/std": 0.3889293447136879, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.05901889782398939, "stage2_sft_loss": 0.0066388859413564205, "step": 182, "total_loss": 0.10267089866101742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 408.5, "completions/max_terminated_length": 408.5, "completions/mean_length": 137.64453125, "completions/mean_terminated_length": 137.64453125, "completions/min_length": 59.25, "completions/min_terminated_length": 59.25, "epoch": 0.782051282051282, "frac_reward_zero_std": 0.5, "grad_norm": 2.40444782166608, "grpo_loss": -0.008797756774583831, "kl": 0.10546875, "learning_rate": 2.876239030486554e-06, "loss": 0.0907, "num_tokens": 12319018.0, "reward": 2.78515625, "reward_std": 0.21831095963716507, "rewards/accuracy_reward/mean": 0.78515625, "rewards/accuracy_reward/std": 0.3904332146048546, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.05148952128365636, "stage2_sft_loss": 0.003067808851483278, "step": 183, "total_loss": 0.0429985448718071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 267.25, "completions/max_terminated_length": 267.25, "completions/mean_length": 129.82421875, "completions/mean_terminated_length": 129.82421875, "completions/min_length": 66.25, "completions/min_terminated_length": 66.25, "epoch": 0.7863247863247863, "frac_reward_zero_std": 0.5, "grad_norm": 2.278999858369521, "grpo_loss": -0.005033229797845706, "kl": 0.0814208984375, "learning_rate": 2.7720513617260857e-06, "loss": 0.0612, "num_tokens": 12384461.0, "reward": 2.80078125, "reward_std": 0.23368556797504425, "rewards/accuracy_reward/mean": 0.80078125, "rewards/accuracy_reward/std": 0.3904718607664108, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.08307978138327599, "stage2_sft_loss": 0.014651871661044424, "step": 184, "total_loss": 0.07951173838227987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 268.25, "completions/max_terminated_length": 268.25, "completions/mean_length": 141.74609375, "completions/mean_terminated_length": 141.74609375, "completions/min_length": 66.25, "completions/min_terminated_length": 66.25, "epoch": 0.7905982905982906, "frac_reward_zero_std": 0.4375, "grad_norm": 2.248355130508807, "grpo_loss": 0.007732820464298129, "kl": 0.0924072265625, "learning_rate": 2.669481281701739e-06, "loss": 0.0907, "num_tokens": 12453828.0, "reward": 2.75390625, "reward_std": 0.25354476645588875, "rewards/accuracy_reward/mean": 0.75390625, "rewards/accuracy_reward/std": 0.38477926701307297, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.044389708898961544, "stage2_sft_loss": 0.007064787938361405, "step": 185, "total_loss": 0.052829005755484104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 234.75, "completions/max_terminated_length": 234.75, "completions/mean_length": 134.88671875, "completions/mean_terminated_length": 134.88671875, "completions/min_length": 61.75, "completions/min_terminated_length": 61.75, "epoch": 0.7948717948717948, "frac_reward_zero_std": 0.46875, "grad_norm": 2.4596631881752544, "grpo_loss": -0.02575137373059988, "kl": 0.096923828125, "learning_rate": 2.5685517452260566e-06, "loss": 0.0584, "num_tokens": 12523087.0, "reward": 2.7734375, "reward_std": 0.22764958441257477, "rewards/accuracy_reward/mean": 0.7734375, "rewards/accuracy_reward/std": 0.40480412542819977, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.0866909739561379, "stage2_sft_loss": 0.013630262881633826, "step": 186, "total_loss": 0.06230262666940689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 242.75, "completions/max_terminated_length": 242.75, "completions/mean_length": 132.58984375, "completions/mean_terminated_length": 132.58984375, "completions/min_length": 81.25, "completions/min_terminated_length": 81.25, "epoch": 0.7991452991452992, "frac_reward_zero_std": 0.5625, "grad_norm": 2.165191336137466, "grpo_loss": -0.02968689359840937, "kl": 0.093017578125, "learning_rate": 2.469285339963892e-06, "loss": 0.0461, "num_tokens": 12589974.0, "reward": 2.83984375, "reward_std": 0.19450394995510578, "rewards/accuracy_reward/mean": 0.83984375, "rewards/accuracy_reward/std": 0.34064802527427673, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.056919783586636186, "stage2_sft_loss": 0.007759942389384378, "step": 187, "total_loss": 0.02800888242200017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 306.25, "completions/max_terminated_length": 306.25, "completions/mean_length": 130.83984375, "completions/mean_terminated_length": 130.83984375, "completions/min_length": 73.25, "completions/min_terminated_length": 73.25, "epoch": 0.8034188034188035, "frac_reward_zero_std": 0.5, "grad_norm": 2.226208338506282, "grpo_loss": -0.004410345209180377, "kl": 0.0858154296875, "learning_rate": 2.371704281377335e-06, "loss": 0.0448, "num_tokens": 12657693.0, "reward": 2.8203125, "reward_std": 0.2376309186220169, "rewards/accuracy_reward/mean": 0.8203125, "rewards/accuracy_reward/std": 0.36812880635261536, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.053088022861629725, "stage2_sft_loss": 0.00971882028170512, "step": 188, "total_loss": 0.049649559427052736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 273.5, "completions/max_terminated_length": 273.5, "completions/mean_length": 140.78125, "completions/mean_terminated_length": 140.78125, "completions/min_length": 65.75, "completions/min_terminated_length": 65.75, "epoch": 0.8076923076923077, "frac_reward_zero_std": 0.53125, "grad_norm": 2.3493601611104364, "grpo_loss": -0.006723656959366053, "kl": 0.09423828125, "learning_rate": 2.275830407754006e-06, "loss": 0.0616, "num_tokens": 12727141.0, "reward": 2.7421875, "reward_std": 0.20213755778968334, "rewards/accuracy_reward/mean": 0.7421875, "rewards/accuracy_reward/std": 0.42482397705316544, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.06993068754673004, "stage2_sft_loss": 0.02032813218102092, "step": 189, "total_loss": 0.06523984298110008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 230.5, "completions/max_terminated_length": 230.5, "completions/mean_length": 123.16796875, "completions/mean_terminated_length": 123.16796875, "completions/min_length": 63.5, "completions/min_terminated_length": 63.5, "epoch": 0.811965811965812, "frac_reward_zero_std": 0.5625, "grad_norm": 2.3089248428431515, "grpo_loss": 0.029169104644097388, "kl": 0.0794677734375, "learning_rate": 2.1816851753197023e-06, "loss": 0.0534, "num_tokens": 12790576.0, "reward": 2.8671875, "reward_std": 0.17688901163637638, "rewards/accuracy_reward/mean": 0.8671875, "rewards/accuracy_reward/std": 0.3410636931657791, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.06213785335421562, "stage2_sft_loss": 0.0018466471155988984, "step": 190, "total_loss": 0.09149162436369807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 286.5, "completions/max_terminated_length": 286.5, "completions/mean_length": 157.46484375, "completions/mean_terminated_length": 157.46484375, "completions/min_length": 72.5, "completions/min_terminated_length": 72.5, "epoch": 0.8162393162393162, "frac_reward_zero_std": 0.4375, "grad_norm": 2.198067341450511, "grpo_loss": -0.0376204761560075, "kl": 0.08465576171875, "learning_rate": 2.08928965343659e-06, "loss": 0.0543, "num_tokens": 12865439.0, "reward": 2.83203125, "reward_std": 0.2489478625357151, "rewards/accuracy_reward/mean": 0.83203125, "rewards/accuracy_reward/std": 0.3710259050130844, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.08339540008455515, "stage2_sft_loss": 0.006528811136377044, "step": 191, "total_loss": 0.046427804976701736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 317.25, "completions/max_terminated_length": 317.25, "completions/mean_length": 137.4609375, "completions/mean_terminated_length": 137.4609375, "completions/min_length": 71.25, "completions/min_terminated_length": 71.25, "epoch": 0.8205128205128205, "frac_reward_zero_std": 0.5, "grad_norm": 2.237880793803582, "grpo_loss": 0.009976790606742725, "kl": 0.08349609375, "learning_rate": 1.9986645198879385e-06, "loss": 0.039, "num_tokens": 12935317.0, "reward": 2.8359375, "reward_std": 0.2210792675614357, "rewards/accuracy_reward/mean": 0.8359375, "rewards/accuracy_reward/std": 0.36333344131708145, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.08060388825833797, "stage2_sft_loss": 0.004419473567395471, "step": 192, "total_loss": 0.09102262475062162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 264.75, "completions/max_terminated_length": 264.75, "completions/mean_length": 128.59765625, "completions/mean_terminated_length": 128.59765625, "completions/min_length": 60.75, "completions/min_terminated_length": 60.75, "epoch": 0.8247863247863247, "frac_reward_zero_std": 0.59375, "grad_norm": 2.3641231037433035, "grpo_loss": -0.023414782102918252, "kl": 0.0792236328125, "learning_rate": 1.9098300562505266e-06, "loss": 0.0626, "num_tokens": 13001822.0, "reward": 2.8515625, "reward_std": 0.17939137108623981, "rewards/accuracy_reward/mean": 0.8515625, "rewards/accuracy_reward/std": 0.3518161177635193, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.08140591345727444, "stage2_sft_loss": 0.0020642982563003898, "step": 193, "total_loss": 0.05819755978882313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 251.25, "completions/max_terminated_length": 251.25, "completions/mean_length": 125.0, "completions/mean_terminated_length": 125.0, "completions/min_length": 65.75, "completions/min_terminated_length": 65.75, "epoch": 0.8290598290598291, "frac_reward_zero_std": 0.5, "grad_norm": 2.215355384585231, "grpo_loss": 0.001619994145585224, "kl": 0.07733154296875, "learning_rate": 1.8228061433556866e-06, "loss": 0.0581, "num_tokens": 13067158.0, "reward": 2.8046875, "reward_std": 0.21318363025784492, "rewards/accuracy_reward/mean": 0.8046875, "rewards/accuracy_reward/std": 0.39349858462810516, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.02379709028173238, "stage2_sft_loss": 0.0009464863105677068, "step": 194, "total_loss": 0.025511732907034457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 268.5, "completions/max_terminated_length": 268.5, "completions/mean_length": 131.46875, "completions/mean_terminated_length": 131.46875, "completions/min_length": 64.75, "completions/min_terminated_length": 64.75, "epoch": 0.8333333333333334, "frac_reward_zero_std": 0.46875, "grad_norm": 2.254564743687319, "grpo_loss": 0.017838238331023604, "kl": 0.0771484375, "learning_rate": 1.7376122568400533e-06, "loss": 0.0854, "num_tokens": 13133366.0, "reward": 2.83203125, "reward_std": 0.22882908582687378, "rewards/accuracy_reward/mean": 0.83203125, "rewards/accuracy_reward/std": 0.3620675168931484, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.032180753303691745, "stage2_sft_loss": 0.021464947407366708, "step": 195, "total_loss": 0.052165485452860594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 266.25, "completions/max_terminated_length": 266.25, "completions/mean_length": 136.43359375, "completions/mean_terminated_length": 136.43359375, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.8376068376068376, "frac_reward_zero_std": 0.5, "grad_norm": 2.3197178808051375, "grpo_loss": -0.03142386896070093, "kl": 0.102783203125, "learning_rate": 1.6542674627869738e-06, "loss": 0.0672, "num_tokens": 13201541.0, "reward": 2.81640625, "reward_std": 0.2131860852241516, "rewards/accuracy_reward/mean": 0.81640625, "rewards/accuracy_reward/std": 0.37990450114011765, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.055304642068222165, "stage2_sft_loss": 0.007718519467744045, "step": 196, "total_loss": 0.024652624037116766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 370.75, "completions/max_terminated_length": 370.75, "completions/mean_length": 134.8828125, "completions/mean_terminated_length": 134.8828125, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.8418803418803419, "frac_reward_zero_std": 0.5625, "grad_norm": 2.860612537760405, "grpo_loss": -0.02284381364006549, "kl": 0.0767822265625, "learning_rate": 1.5727904134596084e-06, "loss": 0.0555, "num_tokens": 13269487.0, "reward": 2.81640625, "reward_std": 0.18820202350616455, "rewards/accuracy_reward/mean": 0.81640625, "rewards/accuracy_reward/std": 0.3756125792860985, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.07799512334167957, "stage2_sft_loss": 0.015525784227065742, "step": 197, "total_loss": 0.056703890673816204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 263.75, "completions/max_terminated_length": 263.75, "completions/mean_length": 135.46484375, "completions/mean_terminated_length": 135.46484375, "completions/min_length": 66.75, "completions/min_terminated_length": 66.75, "epoch": 0.8461538461538461, "frac_reward_zero_std": 0.5625, "grad_norm": 1.8339436198084127, "grpo_loss": -0.00017789613048080355, "kl": 0.06561279296875, "learning_rate": 1.4931993431266056e-06, "loss": 0.0511, "num_tokens": 13338694.0, "reward": 2.86328125, "reward_std": 0.1916169673204422, "rewards/accuracy_reward/mean": 0.86328125, "rewards/accuracy_reward/std": 0.33633895218372345, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.04241414717398584, "stage2_sft_loss": 0.007416335574816912, "step": 198, "total_loss": 0.042977884877473116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 275.5, "completions/max_terminated_length": 275.5, "completions/mean_length": 133.71875, "completions/mean_terminated_length": 133.71875, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.8504273504273504, "frac_reward_zero_std": 0.375, "grad_norm": 2.321539071874128, "grpo_loss": 0.03887829327140935, "kl": 0.08251953125, "learning_rate": 1.4155120639813392e-06, "loss": 0.0418, "num_tokens": 13405974.0, "reward": 2.78515625, "reward_std": 0.2688070461153984, "rewards/accuracy_reward/mean": 0.78515625, "rewards/accuracy_reward/std": 0.4082975834608078, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.042024717666208744, "stage2_sft_loss": 0.010183911363128573, "step": 199, "total_loss": 0.08192139957100153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 287.25, "completions/max_terminated_length": 287.25, "completions/mean_length": 134.9609375, "completions/mean_terminated_length": 134.9609375, "completions/min_length": 60.75, "completions/min_terminated_length": 60.75, "epoch": 0.8547008547008547, "frac_reward_zero_std": 0.5, "grad_norm": 1.9695463119039087, "grpo_loss": -0.029325536685064435, "kl": 0.0819091796875, "learning_rate": 1.339745962155613e-06, "loss": 0.0349, "num_tokens": 13473412.0, "reward": 2.7890625, "reward_std": 0.2301519438624382, "rewards/accuracy_reward/mean": 0.7890625, "rewards/accuracy_reward/std": 0.40658413618803024, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.06595750187989324, "stage2_sft_loss": 0.018286966653249692, "step": 200, "total_loss": 0.038460663286969066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 229.5, "completions/max_terminated_length": 229.5, "completions/mean_length": 129.95703125, "completions/mean_terminated_length": 129.95703125, "completions/min_length": 56.75, "completions/min_terminated_length": 56.75, "epoch": 0.8589743589743589, "frac_reward_zero_std": 0.46875, "grad_norm": 2.457707498128624, "grpo_loss": 0.015741711657028645, "kl": 0.0843505859375, "learning_rate": 1.2659179938287035e-06, "loss": 0.0693, "num_tokens": 13539713.0, "reward": 2.80078125, "reward_std": 0.24237754568457603, "rewards/accuracy_reward/mean": 0.80078125, "rewards/accuracy_reward/std": 0.3829581290483475, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.0587872676551342, "stage2_sft_loss": 0.006722187114064582, "step": 201, "total_loss": 0.07520119519904256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 125.01171875, "completions/mean_terminated_length": 125.01171875, "completions/min_length": 62.25, "completions/min_terminated_length": 62.25, "epoch": 0.8632478632478633, "frac_reward_zero_std": 0.5, "grad_norm": 1.8597591668705884, "grpo_loss": -0.03844989475328475, "kl": 0.0792236328125, "learning_rate": 1.19404468143262e-06, "loss": 0.0488, "num_tokens": 13605524.0, "reward": 2.828125, "reward_std": 0.23026816546916962, "rewards/accuracy_reward/mean": 0.828125, "rewards/accuracy_reward/std": 0.374307245016098, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.07022882532328367, "stage2_sft_loss": 0.00958529122362961, "step": 202, "total_loss": 0.03273745905607939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 236.75, "completions/max_terminated_length": 236.75, "completions/mean_length": 140.76171875, "completions/mean_terminated_length": 140.76171875, "completions/min_length": 67.75, "completions/min_terminated_length": 67.75, "epoch": 0.8675213675213675, "frac_reward_zero_std": 0.5, "grad_norm": 2.294920534337738, "grpo_loss": -0.033718791615683585, "kl": 0.0762939453125, "learning_rate": 1.124142109954459e-06, "loss": 0.0614, "num_tokens": 13674743.0, "reward": 2.83203125, "reward_std": 0.2108270674943924, "rewards/accuracy_reward/mean": 0.83203125, "rewards/accuracy_reward/std": 0.3715285286307335, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.09230023482814431, "stage2_sft_loss": 0.012318415741901845, "step": 203, "total_loss": 0.059813279658555984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 211.5, "completions/max_terminated_length": 211.5, "completions/mean_length": 113.66796875, "completions/mean_terminated_length": 113.66796875, "completions/min_length": 58.5, "completions/min_terminated_length": 58.5, "epoch": 0.8717948717948718, "frac_reward_zero_std": 0.59375, "grad_norm": 2.035886437312805, "grpo_loss": 0.02149005071260035, "kl": 0.0849609375, "learning_rate": 1.0562259233366334e-06, "loss": 0.0525, "num_tokens": 13736002.0, "reward": 2.87890625, "reward_std": 0.1590056698769331, "rewards/accuracy_reward/mean": 0.87890625, "rewards/accuracy_reward/std": 0.31372954696416855, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.033063370268791914, "stage2_sft_loss": 0.003003982281370554, "step": 204, "total_loss": 0.05485381884500384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 234.75, "completions/max_terminated_length": 234.75, "completions/mean_length": 117.3984375, "completions/mean_terminated_length": 117.3984375, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.8760683760683761, "frac_reward_zero_std": 0.71875, "grad_norm": 2.30690792423194, "grpo_loss": 0.0016019267350202426, "kl": 0.086669921875, "learning_rate": 9.903113209758098e-07, "loss": 0.0554, "num_tokens": 13798288.0, "reward": 2.91796875, "reward_std": 0.11139655206352472, "rewards/accuracy_reward/mean": 0.91796875, "rewards/accuracy_reward/std": 0.2633422575891018, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.04401658312417567, "stage2_sft_loss": 0.0009024485862028087, "step": 205, "total_loss": 0.04570875607896596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 264.5, "completions/max_terminated_length": 264.5, "completions/mean_length": 113.03125, "completions/mean_terminated_length": 113.03125, "completions/min_length": 49.75, "completions/min_terminated_length": 49.75, "epoch": 0.8803418803418803, "frac_reward_zero_std": 0.625, "grad_norm": 2.3831642997829166, "grpo_loss": 0.0045173391699790955, "kl": 0.0860595703125, "learning_rate": 9.264130543213512e-07, "loss": 0.0482, "num_tokens": 13859392.0, "reward": 2.828125, "reward_std": 0.17069938778877258, "rewards/accuracy_reward/mean": 0.828125, "rewards/accuracy_reward/std": 0.37033187597990036, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.07017236109822989, "stage2_sft_loss": 0.02451086524524726, "step": 206, "total_loss": 0.07714078575372696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 297.5, "completions/max_terminated_length": 297.5, "completions/mean_length": 138.5859375, "completions/mean_terminated_length": 138.5859375, "completions/min_length": 60.5, "completions/min_terminated_length": 60.5, "epoch": 0.8846153846153846, "frac_reward_zero_std": 0.59375, "grad_norm": 2.072596487997066, "grpo_loss": -0.013341820711502805, "kl": 0.0755615234375, "learning_rate": 8.645454235739903e-07, "loss": 0.0576, "num_tokens": 13928598.0, "reward": 2.78125, "reward_std": 0.17544355988502502, "rewards/accuracy_reward/mean": 0.78125, "rewards/accuracy_reward/std": 0.38894475251436234, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.0771252615377307, "stage2_sft_loss": 0.025300035646068864, "step": 207, "total_loss": 0.06631344370543957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 135.73828125, "completions/mean_terminated_length": 135.73828125, "completions/min_length": 70.5, "completions/min_terminated_length": 70.5, "epoch": 0.8888888888888888, "frac_reward_zero_std": 0.65625, "grad_norm": 2.3783164154533063, "grpo_loss": -0.025427510030567646, "kl": 0.06793212890625, "learning_rate": 8.047222744854943e-07, "loss": 0.0604, "num_tokens": 13997835.0, "reward": 2.84375, "reward_std": 0.1522856391966343, "rewards/accuracy_reward/mean": 0.84375, "rewards/accuracy_reward/std": 0.34538276866078377, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.07427823357284069, "stage2_sft_loss": 0.0030603897521359613, "step": 208, "total_loss": 0.049156763125211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 296.5, "completions/max_terminated_length": 296.5, "completions/mean_length": 129.3203125, "completions/mean_terminated_length": 129.3203125, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.8931623931623932, "frac_reward_zero_std": 0.5625, "grad_norm": 2.82655021936802, "grpo_loss": 0.020664230061811395, "kl": 0.1365966796875, "learning_rate": 7.46956995260033e-07, "loss": 0.074, "num_tokens": 14064381.0, "reward": 2.8125, "reward_std": 0.19621141999959946, "rewards/accuracy_reward/mean": 0.8125, "rewards/accuracy_reward/std": 0.3881358355283737, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.03121958440169692, "stage2_sft_loss": 0.008896204843040323, "step": 209, "total_loss": 0.052773436065763235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 126.23046875, "completions/mean_terminated_length": 126.23046875, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.8974358974358975, "frac_reward_zero_std": 0.75, "grad_norm": 2.9415317368519593, "grpo_loss": 0.041792053991230205, "kl": 0.0849609375, "learning_rate": 6.912625135579587e-07, "loss": 0.0899, "num_tokens": 14128464.0, "reward": 2.875, "reward_std": 0.11796049773693085, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.32698625698685646, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.07522869762033224, "stage2_sft_loss": 0.010016409680247307, "step": 210, "total_loss": 0.11802239343523979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 288.25, "completions/max_terminated_length": 288.25, "completions/mean_length": 148.6484375, "completions/mean_terminated_length": 148.6484375, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.9017094017094017, "frac_reward_zero_std": 0.5, "grad_norm": 2.7130622095694483, "grpo_loss": -0.009939641153323464, "kl": 0.07244873046875, "learning_rate": 6.37651293602628e-07, "loss": 0.0868, "num_tokens": 14201598.0, "reward": 2.7578125, "reward_std": 0.22685321792960167, "rewards/accuracy_reward/mean": 0.7578125, "rewards/accuracy_reward/std": 0.391734354197979, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.05721112829633057, "stage2_sft_loss": 0.005386116390582174, "step": 211, "total_loss": 0.04781010281294584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 246.25, "completions/max_terminated_length": 246.25, "completions/mean_length": 124.21484375, "completions/mean_terminated_length": 124.21484375, "completions/min_length": 56.5, "completions/min_terminated_length": 56.5, "epoch": 0.905982905982906, "frac_reward_zero_std": 0.4375, "grad_norm": 4.807508789968689, "grpo_loss": -0.016500203098985367, "kl": 0.158935546875, "learning_rate": 5.861353333909692e-07, "loss": 0.0623, "num_tokens": 14266213.0, "reward": 2.80859375, "reward_std": 0.23752107471227646, "rewards/accuracy_reward/mean": 0.80859375, "rewards/accuracy_reward/std": 0.38669662177562714, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.044385235058143735, "stage2_sft_loss": 0.005123868337250315, "step": 212, "total_loss": 0.028397418092936277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 320.75, "completions/max_terminated_length": 320.75, "completions/mean_length": 127.9140625, "completions/mean_terminated_length": 127.9140625, "completions/min_length": 68.25, "completions/min_terminated_length": 68.25, "epoch": 0.9102564102564102, "frac_reward_zero_std": 0.5, "grad_norm": 2.0301737121967856, "grpo_loss": -0.00042418597149662673, "kl": 0.06964111328125, "learning_rate": 5.367261620083575e-07, "loss": 0.0551, "num_tokens": 14331495.0, "reward": 2.8203125, "reward_std": 0.20976869761943817, "rewards/accuracy_reward/mean": 0.8203125, "rewards/accuracy_reward/std": 0.3742370903491974, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.05549457389861345, "stage2_sft_loss": 0.005932546828262275, "step": 213, "total_loss": 0.055663644336164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 224.25, "completions/max_terminated_length": 224.25, "completions/mean_length": 123.359375, "completions/mean_terminated_length": 123.359375, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.9145299145299145, "frac_reward_zero_std": 0.5625, "grad_norm": 2.1641000763673683, "grpo_loss": -0.008345429931068793, "kl": 0.08563232421875, "learning_rate": 4.894348370484648e-07, "loss": 0.0608, "num_tokens": 14395075.0, "reward": 2.83203125, "reward_std": 0.18937908113002777, "rewards/accuracy_reward/mean": 0.83203125, "rewards/accuracy_reward/std": 0.36685848236083984, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.050493769347667694, "stage2_sft_loss": 0.0014829118590569124, "step": 214, "total_loss": 0.04229663033038378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 298.75, "completions/max_terminated_length": 298.75, "completions/mean_length": 124.5859375, "completions/mean_terminated_length": 124.5859375, "completions/min_length": 59.5, "completions/min_terminated_length": 59.5, "epoch": 0.9188034188034188, "frac_reward_zero_std": 0.78125, "grad_norm": 2.6263204572290344, "grpo_loss": 0.028134352585766464, "kl": 0.0804443359375, "learning_rate": 4.4427194213859216e-07, "loss": 0.0847, "num_tokens": 14460257.0, "reward": 2.8984375, "reward_std": 0.09324482083320618, "rewards/accuracy_reward/mean": 0.8984375, "rewards/accuracy_reward/std": 0.3030460849404335, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.08545329654589295, "stage2_sft_loss": 0.0015856244317546953, "step": 215, "total_loss": 0.11374621279537678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 248.25, "completions/max_terminated_length": 248.25, "completions/mean_length": 130.69921875, "completions/mean_terminated_length": 130.69921875, "completions/min_length": 69.25, "completions/min_terminated_length": 69.25, "epoch": 0.9230769230769231, "frac_reward_zero_std": 0.625, "grad_norm": 2.1211135589296197, "grpo_loss": 0.009156979038380086, "kl": 0.06561279296875, "learning_rate": 4.012475845711106e-07, "loss": 0.0504, "num_tokens": 14526860.0, "reward": 2.88671875, "reward_std": 0.15703225508332253, "rewards/accuracy_reward/mean": 0.88671875, "rewards/accuracy_reward/std": 0.31127535179257393, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.06328852870501578, "stage2_sft_loss": 0.0017232074733328773, "step": 216, "total_loss": 0.07261782942805439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 283.25, "completions/max_terminated_length": 283.25, "completions/mean_length": 136.74609375, "completions/mean_terminated_length": 136.74609375, "completions/min_length": 60.75, "completions/min_terminated_length": 60.75, "epoch": 0.9273504273504274, "frac_reward_zero_std": 0.4375, "grad_norm": 2.070331892621596, "grpo_loss": -0.06435195698577445, "kl": 0.0723876953125, "learning_rate": 3.603713930414676e-07, "loss": 0.0492, "num_tokens": 14594891.0, "reward": 2.79296875, "reward_std": 0.25289567187428474, "rewards/accuracy_reward/mean": 0.79296875, "rewards/accuracy_reward/std": 0.40503790229558945, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.04832777753472328, "stage2_sft_loss": 0.017169317688967567, "step": 217, "total_loss": -0.014307248464319855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 134.9921875, "completions/mean_terminated_length": 134.9921875, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.9316239316239316, "frac_reward_zero_std": 0.5625, "grad_norm": 2.0040656384269946, "grpo_loss": -0.053561781911412254, "kl": 0.076416015625, "learning_rate": 3.2165251549333585e-07, "loss": 0.0581, "num_tokens": 14663625.0, "reward": 2.890625, "reward_std": 0.1757119484245777, "rewards/accuracy_reward/mean": 0.890625, "rewards/accuracy_reward/std": 0.29402102530002594, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.04904615576379001, "stage2_sft_loss": 0.007902702882347512, "step": 218, "total_loss": -0.0037253551417961717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 132.75390625, "completions/mean_terminated_length": 132.75390625, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.9358974358974359, "frac_reward_zero_std": 0.59375, "grad_norm": 2.4721018783588886, "grpo_loss": -0.01120759492914658, "kl": 0.0736083984375, "learning_rate": 2.8509961707132496e-07, "loss": 0.0718, "num_tokens": 14730890.0, "reward": 2.7412109375, "reward_std": 0.17201999574899673, "rewards/accuracy_reward/mean": 0.74609375, "rewards/accuracy_reward/std": 0.4207110106945038, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.03125, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.0078125, "stage1_sft_loss": 0.07889411831274629, "stage2_sft_loss": 0.022961543887504376, "step": 219, "total_loss": 0.06998268235474825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 290.5, "completions/max_terminated_length": 290.5, "completions/mean_length": 143.5546875, "completions/mean_terminated_length": 143.5546875, "completions/min_length": 63.75, "completions/min_terminated_length": 63.75, "epoch": 0.9401709401709402, "frac_reward_zero_std": 0.5, "grad_norm": 2.284720202352025, "grpo_loss": 0.03501165362831671, "kl": 0.07568359375, "learning_rate": 2.507208781817638e-07, "loss": 0.0477, "num_tokens": 14802496.0, "reward": 2.87109375, "reward_std": 0.20240348391234875, "rewards/accuracy_reward/mean": 0.87109375, "rewards/accuracy_reward/std": 0.3205883875489235, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.07232746039517224, "stage2_sft_loss": 0.004074504366144538, "step": 220, "total_loss": 0.107746567344293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 269.5, "completions/max_terminated_length": 269.5, "completions/mean_length": 133.1796875, "completions/mean_terminated_length": 133.1796875, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.9444444444444444, "frac_reward_zero_std": 0.59375, "grad_norm": 2.3319446282196705, "grpo_loss": -0.0034089816035702825, "kl": 0.08306884765625, "learning_rate": 2.1852399266194312e-07, "loss": 0.0587, "num_tokens": 14869262.0, "reward": 2.859375, "reward_std": 0.18516533076763153, "rewards/accuracy_reward/mean": 0.859375, "rewards/accuracy_reward/std": 0.3469066470861435, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.056242790538817644, "stage2_sft_loss": 0.018294302353751846, "step": 221, "total_loss": 0.05466323997825384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 422.25, "completions/max_terminated_length": 422.25, "completions/mean_length": 159.671875, "completions/mean_terminated_length": 159.671875, "completions/min_length": 57.75, "completions/min_terminated_length": 57.75, "epoch": 0.9487179487179487, "frac_reward_zero_std": 0.53125, "grad_norm": 2.361139064584806, "grpo_loss": 0.0038841749192215502, "kl": 0.07928466796875, "learning_rate": 1.885161660582746e-07, "loss": 0.057, "num_tokens": 14944802.0, "reward": 2.7734375, "reward_std": 0.2026655077934265, "rewards/accuracy_reward/mean": 0.7734375, "rewards/accuracy_reward/std": 0.417447067797184, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.0955225438810885, "stage2_sft_loss": 0.0009612532594474033, "step": 222, "total_loss": 0.09950284566730261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 335.25, "completions/max_terminated_length": 335.25, "completions/mean_length": 152.96484375, "completions/mean_terminated_length": 152.96484375, "completions/min_length": 69.5, "completions/min_terminated_length": 69.5, "epoch": 0.9529914529914529, "frac_reward_zero_std": 0.53125, "grad_norm": 2.0296918687522303, "grpo_loss": -0.008389710797928274, "kl": 0.07568359375, "learning_rate": 1.6070411401370335e-07, "loss": 0.0514, "num_tokens": 15019097.0, "reward": 2.796875, "reward_std": 0.20726242195814848, "rewards/accuracy_reward/mean": 0.796875, "rewards/accuracy_reward/std": 0.35784538090229034, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.05725141195580363, "stage2_sft_loss": 0.006129985613370081, "step": 223, "total_loss": 0.049474698840640485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 266.25, "completions/max_terminated_length": 266.25, "completions/mean_length": 134.58984375, "completions/mean_terminated_length": 134.58984375, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.9572649572649573, "frac_reward_zero_std": 0.65625, "grad_norm": 1.8660849704001432, "grpo_loss": 0.013021941791521385, "kl": 0.0694580078125, "learning_rate": 1.350940607647866e-07, "loss": 0.0332, "num_tokens": 15086920.0, "reward": 2.79296875, "reward_std": 0.1635986603796482, "rewards/accuracy_reward/mean": 0.79296875, "rewards/accuracy_reward/std": 0.40334802865982056, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.027745027327910066, "stage2_sft_loss": 0.008323597925482318, "step": 224, "total_loss": 0.041599329095333815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 226.25, "completions/max_terminated_length": 226.25, "completions/mean_length": 124.17578125, "completions/mean_terminated_length": 124.17578125, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.9615384615384616, "frac_reward_zero_std": 0.6875, "grad_norm": 2.34498979328278, "grpo_loss": 0.00911492871819064, "kl": 0.08074951171875, "learning_rate": 1.1169173774871478e-07, "loss": 0.0702, "num_tokens": 15152061.0, "reward": 2.84375, "reward_std": 0.12468297965824604, "rewards/accuracy_reward/mean": 0.84375, "rewards/accuracy_reward/std": 0.33794204145669937, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.07473163260146976, "stage2_sft_loss": 0.004223723481118213, "step": 225, "total_loss": 0.08426893223077059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 115.07421875, "completions/mean_terminated_length": 115.07421875, "completions/min_length": 61.75, "completions/min_terminated_length": 61.75, "epoch": 0.9658119658119658, "frac_reward_zero_std": 0.75, "grad_norm": 2.3385372776790243, "grpo_loss": -0.03898987057618797, "kl": 0.0703125, "learning_rate": 9.0502382320653e-08, "loss": 0.0556, "num_tokens": 15212408.0, "reward": 2.83984375, "reward_std": 0.11060019582509995, "rewards/accuracy_reward/mean": 0.83984375, "rewards/accuracy_reward/std": 0.345817930996418, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.07038992689922452, "stage2_sft_loss": 0.002880757765524322, "step": 226, "total_loss": 0.031688129995018244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 145.77734375, "completions/mean_terminated_length": 145.77734375, "completions/min_length": 74.5, "completions/min_terminated_length": 74.5, "epoch": 0.9700854700854701, "frac_reward_zero_std": 0.5625, "grad_norm": 2.438610062892225, "grpo_loss": -0.0443235896564147, "kl": 0.070068359375, "learning_rate": 7.153073658162646e-08, "loss": 0.0497, "num_tokens": 15282439.0, "reward": 2.734375, "reward_std": 0.2102927379310131, "rewards/accuracy_reward/mean": 0.734375, "rewards/accuracy_reward/std": 0.42461463809013367, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.03144627227447927, "stage2_sft_loss": 0.01161374260118464, "step": 227, "total_loss": -0.011715942644514143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 251.75, "completions/max_terminated_length": 251.75, "completions/mean_length": 116.90234375, "completions/mean_terminated_length": 116.90234375, "completions/min_length": 59.75, "completions/min_terminated_length": 59.75, "epoch": 0.9743589743589743, "frac_reward_zero_std": 0.5, "grad_norm": 1.8053523876604445, "grpo_loss": -0.039161166292615235, "kl": 0.07305908203125, "learning_rate": 5.4781046317267103e-08, "loss": 0.04, "num_tokens": 15345054.0, "reward": 2.8515625, "reward_std": 0.21607061475515366, "rewards/accuracy_reward/mean": 0.8515625, "rewards/accuracy_reward/std": 0.34781959280371666, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.049540508072823286, "stage2_sft_loss": 0.005793681345494406, "step": 228, "total_loss": 0.010958710685372353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 335.25, "completions/max_terminated_length": 335.25, "completions/mean_length": 143.6328125, "completions/mean_terminated_length": 143.6328125, "completions/min_length": 52.75, "completions/min_terminated_length": 52.75, "epoch": 0.9786324786324786, "frac_reward_zero_std": 0.5, "grad_norm": 2.1238058001256834, "grpo_loss": 0.021149674721527845, "kl": 0.075927734375, "learning_rate": 4.025706004760932e-08, "loss": 0.0641, "num_tokens": 15414184.0, "reward": 2.734375, "reward_std": 0.22738118842244148, "rewards/accuracy_reward/mean": 0.734375, "rewards/accuracy_reward/std": 0.39063628762960434, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.05295246001332998, "stage2_sft_loss": 0.008666668943988043, "step": 229, "total_loss": 0.0749688046053052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 309.75, "completions/max_terminated_length": 309.75, "completions/mean_length": 150.81640625, "completions/mean_terminated_length": 150.81640625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.9829059829059829, "frac_reward_zero_std": 0.4375, "grad_norm": 2.490990424347089, "grpo_loss": 0.007095444132573903, "kl": 0.0914306640625, "learning_rate": 2.796202818819871e-08, "loss": 0.0608, "num_tokens": 15486313.0, "reward": 2.7265625, "reward_std": 0.239877637475729, "rewards/accuracy_reward/mean": 0.7265625, "rewards/accuracy_reward/std": 0.4078166112303734, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.08487359434366226, "stage2_sft_loss": 0.018978525884449482, "step": 230, "total_loss": 0.09386688856466208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 134.30078125, "completions/mean_terminated_length": 134.30078125, "completions/min_length": 65.25, "completions/min_terminated_length": 65.25, "epoch": 0.9871794871794872, "frac_reward_zero_std": 0.5625, "grad_norm": 1.9811694517457321, "grpo_loss": -0.022785615394241177, "kl": 0.0667724609375, "learning_rate": 1.7898702322648453e-08, "loss": 0.0532, "num_tokens": 15554166.0, "reward": 2.84765625, "reward_std": 0.19279402680695057, "rewards/accuracy_reward/mean": 0.84765625, "rewards/accuracy_reward/std": 0.341281745582819, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.08472150657325983, "stage2_sft_loss": 0.020304364268667996, "step": 231, "total_loss": 0.06396632781252265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 247.25, "completions/max_terminated_length": 247.25, "completions/mean_length": 122.203125, "completions/mean_terminated_length": 122.203125, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.9914529914529915, "frac_reward_zero_std": 0.78125, "grad_norm": 2.4948488462654845, "grpo_loss": 0.0030043296865187585, "kl": 0.06317138671875, "learning_rate": 1.0069334586854106e-08, "loss": 0.0516, "num_tokens": 15618106.0, "reward": 2.86328125, "reward_std": 0.09666222147643566, "rewards/accuracy_reward/mean": 0.86328125, "rewards/accuracy_reward/std": 0.33113233372569084, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.07209198269993067, "stage2_sft_loss": 0.00023392363800667226, "step": 232, "total_loss": 0.0751197044737637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 267.5, "completions/max_terminated_length": 267.5, "completions/mean_length": 134.3515625, "completions/mean_terminated_length": 134.3515625, "completions/min_length": 60.25, "completions/min_terminated_length": 60.25, "epoch": 0.9957264957264957, "frac_reward_zero_std": 0.59375, "grad_norm": 1.874383236311945, "grpo_loss": -0.013771897065453231, "kl": 0.06573486328125, "learning_rate": 4.475677164966774e-09, "loss": 0.047, "num_tokens": 15685796.0, "reward": 2.80859375, "reward_std": 0.18634483218193054, "rewards/accuracy_reward/mean": 0.80859375, "rewards/accuracy_reward/std": 0.3866342604160309, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.02409595239441842, "stage2_sft_loss": 0.0057802217324933736, "step": 233, "total_loss": 0.010902077774517238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 130.3671875, "completions/mean_terminated_length": 130.3671875, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 1.0, "frac_reward_zero_std": 0.5625, "grad_norm": 2.257091182884558, "grpo_loss": 0.02863113474450074, "kl": 0.08056640625, "learning_rate": 1.1189818972656697e-09, "loss": 0.0498, "num_tokens": 15753674.0, "reward": 2.78125, "reward_std": 0.18714364245533943, "rewards/accuracy_reward/mean": 0.78125, "rewards/accuracy_reward/std": 0.40201979875564575, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "stage1_sft_loss": 0.04045400209724903, "stage2_sft_loss": 0.015837795235711383, "step": 234, "total_loss": 0.07066891435533762 }, { "epoch": 1.0, "step": 234, "total_flos": 0.0, "train_loss": 0.07978848305841287, "train_runtime": 7290.3624, "train_samples_per_second": 1.027, "train_steps_per_second": 0.032 } ], "logging_steps": 1, "max_steps": 234, "num_input_tokens_seen": 15753674, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }