{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 280, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 160.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 160.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 160.0, "completions/min_terminated_length": 0.0, "entropy": 0.12625992042012513, "epoch": 0.03571428571428571, "frac_reward_zero_std": 0.1, "grad_norm": 0.5207030773162842, "kl": 0.029890698788585725, "learning_rate": 4.8392857142857146e-05, "loss": 0.0003, "num_tokens": 46564.0, "reward": 0.24375, "reward_std": 0.38889276385307314, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.24375, "rewards/format_reward/std": 0.38889277875423434, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 160.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 160.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 160.0, "completions/min_terminated_length": 0.0, "entropy": 0.12813723259605467, "epoch": 0.07142857142857142, "frac_reward_zero_std": 0.0, "grad_norm": 0.5805707573890686, "kl": 0.14628016483038664, "learning_rate": 4.660714285714286e-05, "loss": 0.0015, "num_tokens": 92372.0, "reward": 0.31875, "reward_std": 0.44964262247085574, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.31875, "rewards/format_reward/std": 0.4496426522731781, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 160.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 160.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 160.0, "completions/min_terminated_length": 0.0, "entropy": 0.13213527642656117, "epoch": 0.10714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.452333003282547, "kl": 0.14923681337386369, "learning_rate": 4.482142857142857e-05, "loss": 0.0015, "num_tokens": 137784.0, "reward": 0.4125, "reward_std": 0.480369633436203, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.4125, "rewards/format_reward/std": 0.48036965131759646, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 160.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 160.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 160.0, "completions/min_terminated_length": 0.0, "entropy": 0.13474671822041273, "epoch": 0.14285714285714285, "frac_reward_zero_std": 0.0, "grad_norm": 0.6397606730461121, "kl": 0.17617695135995745, "learning_rate": 4.303571428571429e-05, "loss": 0.0018, "num_tokens": 184512.0, "reward": 0.29375, "reward_std": 0.4483493506908417, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.29375, "rewards/format_reward/std": 0.4483493685722351, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 160.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 160.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 160.0, "completions/min_terminated_length": 0.0, "entropy": 0.13772077192552387, "epoch": 0.17857142857142858, "frac_reward_zero_std": 0.0, "grad_norm": 0.023568596690893173, "kl": 0.20543135565239937, "learning_rate": 4.125e-05, "loss": 0.0021, "num_tokens": 230810.0, "reward": 0.40625, "reward_std": 0.46784973740577696, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.40625, "rewards/format_reward/std": 0.4678497463464737, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 160.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 160.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 160.0, "completions/min_terminated_length": 0.0, "entropy": 0.1001779991784133, "epoch": 0.21428571428571427, "frac_reward_zero_std": 0.0, "grad_norm": 0.3460525572299957, "kl": 0.22092539798468352, "learning_rate": 3.946428571428571e-05, "loss": 0.0022, "num_tokens": 276958.0, "reward": 0.63125, "reward_std": 0.4759595632553101, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.63125, "rewards/format_reward/std": 0.4759595781564713, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 160.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 160.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 160.0, "completions/min_terminated_length": 0.0, "entropy": 0.09113281725440174, "epoch": 0.25, "frac_reward_zero_std": 0.0, "grad_norm": 0.27182736992836, "kl": 0.2702178731560707, "learning_rate": 3.767857142857143e-05, "loss": 0.0027, "num_tokens": 322870.0, "reward": 0.74375, "reward_std": 0.4677943289279938, "rewards/accuracy_reward/mean": 0.01875, "rewards/accuracy_reward/std": 0.05915650427341461, "rewards/format_reward/mean": 0.725, "rewards/format_reward/std": 0.44086891412734985, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 160.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 160.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 160.0, "completions/min_terminated_length": 0.0, "entropy": 0.07283033218700438, "epoch": 0.2857142857142857, "frac_reward_zero_std": 0.3, "grad_norm": 0.4305204153060913, "kl": 0.30486682509072127, "learning_rate": 3.589285714285715e-05, "loss": 0.003, "num_tokens": 368898.0, "reward": 0.875, "reward_std": 0.2758632004261017, "rewards/accuracy_reward/mean": 0.00625, "rewards/accuracy_reward/std": 0.025, "rewards/format_reward/mean": 0.86875, "rewards/format_reward/std": 0.26565522849559786, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 160.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 160.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 160.0, "completions/min_terminated_length": 0.0, "entropy": 0.06701735492097213, "epoch": 0.32142857142857145, "frac_reward_zero_std": 0.1, "grad_norm": 0.007617300376296043, "kl": 0.32528871656395497, "learning_rate": 3.410714285714286e-05, "loss": 0.0033, "num_tokens": 415134.0, "reward": 0.8875, "reward_std": 0.33791992366313933, "rewards/accuracy_reward/mean": 0.0125, "rewards/accuracy_reward/std": 0.05, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.31671638786792755, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 160.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 160.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 160.0, "completions/min_terminated_length": 0.0, "entropy": 0.06509439181536436, "epoch": 0.35714285714285715, "frac_reward_zero_std": 0.3, "grad_norm": 0.010140369646251202, "kl": 0.298904563812539, "learning_rate": 3.2321428571428574e-05, "loss": 0.003, "num_tokens": 460524.0, "reward": 0.9375, "reward_std": 0.2313473641872406, "rewards/accuracy_reward/mean": 0.0125, "rewards/accuracy_reward/std": 0.034156504273414615, "rewards/format_reward/mean": 0.925, "rewards/format_reward/std": 0.19719087481498718, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 160.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 160.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 160.0, "completions/min_terminated_length": 0.0, "entropy": 0.0780115590954665, "epoch": 0.39285714285714285, "frac_reward_zero_std": 0.2, "grad_norm": 0.6102597117424011, "kl": 0.314992543682456, "learning_rate": 3.053571428571429e-05, "loss": 0.0031, "num_tokens": 506718.0, "reward": 0.95, "reward_std": 0.2625021517276764, "rewards/accuracy_reward/mean": 0.025, "rewards/accuracy_reward/std": 0.06831300854682923, "rewards/format_reward/mean": 0.925, "rewards/format_reward/std": 0.19418915510177612, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 160.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 160.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 160.0, "completions/min_terminated_length": 0.0, "entropy": 0.06869765471710707, "epoch": 0.42857142857142855, "frac_reward_zero_std": 0.3, "grad_norm": 0.18664264678955078, "kl": 0.3312602242454886, "learning_rate": 2.8749999999999997e-05, "loss": 0.0033, "num_tokens": 551914.0, "reward": 0.975, "reward_std": 0.23939219117164612, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.09331300854682922, "rewards/format_reward/mean": 0.94375, "rewards/format_reward/std": 0.17446779310703278, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 160.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 160.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 160.0, "completions/min_terminated_length": 0.0, "entropy": 0.07147310696454952, "epoch": 0.4642857142857143, "frac_reward_zero_std": 0.5, "grad_norm": 0.02365143969655037, "kl": 0.35637650578282776, "learning_rate": 2.6964285714285714e-05, "loss": 0.0036, "num_tokens": 597752.0, "reward": 0.95625, "reward_std": 0.17910928130149842, "rewards/accuracy_reward/mean": 0.0125, "rewards/accuracy_reward/std": 0.034156504273414615, "rewards/format_reward/mean": 0.94375, "rewards/format_reward/std": 0.16162601709365845, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 160.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 160.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 160.0, "completions/min_terminated_length": 0.0, "entropy": 0.06830690645729191, "epoch": 0.5, "frac_reward_zero_std": 0.2, "grad_norm": 0.02468046173453331, "kl": 0.3641864719800651, "learning_rate": 2.5178571428571428e-05, "loss": 0.0036, "num_tokens": 643248.0, "reward": 0.95625, "reward_std": 0.2526991993188858, "rewards/accuracy_reward/mean": 0.025, "rewards/accuracy_reward/std": 0.08415650427341462, "rewards/format_reward/mean": 0.93125, "rewards/format_reward/std": 0.18202786147594452, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 160.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 160.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 160.0, "completions/min_terminated_length": 0.0, "entropy": 0.07277907942188903, "epoch": 0.5357142857142857, "frac_reward_zero_std": 0.2, "grad_norm": 0.3342110812664032, "kl": 0.35218177605420353, "learning_rate": 2.3392857142857145e-05, "loss": 0.0035, "num_tokens": 689174.0, "reward": 0.96875, "reward_std": 0.2683339625597, "rewards/accuracy_reward/mean": 0.0375, "rewards/accuracy_reward/std": 0.11831300854682922, "rewards/format_reward/mean": 0.93125, "rewards/format_reward/std": 0.21162601709365844, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 160.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 160.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 160.0, "completions/min_terminated_length": 0.0, "entropy": 0.07139425584464335, "epoch": 0.5714285714285714, "frac_reward_zero_std": 0.9, "grad_norm": 0.021100714802742004, "kl": 0.3498226302675903, "learning_rate": 2.1607142857142858e-05, "loss": 0.0035, "num_tokens": 735904.0, "reward": 1.00625, "reward_std": 0.025, "rewards/accuracy_reward/mean": 0.00625, "rewards/accuracy_reward/std": 0.025, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 160.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 160.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 160.0, "completions/min_terminated_length": 0.0, "entropy": 0.09096196873579174, "epoch": 0.6071428571428571, "frac_reward_zero_std": 0.2, "grad_norm": 0.36118584871292114, "kl": 0.3759258924983442, "learning_rate": 1.982142857142857e-05, "loss": 0.0038, "num_tokens": 782492.0, "reward": 0.95, "reward_std": 0.26942057013511655, "rewards/accuracy_reward/mean": 0.025, "rewards/accuracy_reward/std": 0.06831300854682923, "rewards/format_reward/mean": 0.925, "rewards/format_reward/std": 0.217780801653862, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 160.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 160.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 160.0, "completions/min_terminated_length": 0.0, "entropy": 0.11560409280937164, "epoch": 0.6428571428571429, "frac_reward_zero_std": 0.6, "grad_norm": 0.00570648955181241, "kl": 0.382719272794202, "learning_rate": 1.8035714285714285e-05, "loss": 0.0038, "num_tokens": 829112.0, "reward": 0.98125, "reward_std": 0.12840956151485444, "rewards/accuracy_reward/mean": 0.0125, "rewards/accuracy_reward/std": 0.05, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.09331300854682922, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 160.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 160.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 160.0, "completions/min_terminated_length": 0.0, "entropy": 0.10731171959778293, "epoch": 0.6785714285714286, "frac_reward_zero_std": 0.6, "grad_norm": 0.012304564006626606, "kl": 0.3598832252435386, "learning_rate": 1.6250000000000002e-05, "loss": 0.0036, "num_tokens": 875390.0, "reward": 0.95625, "reward_std": 0.15684083700180054, "rewards/accuracy_reward/mean": 0.0125, "rewards/accuracy_reward/std": 0.034156504273414615, "rewards/format_reward/mean": 0.94375, "rewards/format_reward/std": 0.13977908194065095, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 160.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 160.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 160.0, "completions/min_terminated_length": 0.0, "entropy": 0.1362523474963382, "epoch": 0.7142857142857143, "frac_reward_zero_std": 0.4, "grad_norm": 0.5960667729377747, "kl": 0.39577870490029454, "learning_rate": 1.4464285714285717e-05, "loss": 0.004, "num_tokens": 922518.0, "reward": 0.93125, "reward_std": 0.22683832049369812, "rewards/accuracy_reward/mean": 0.01875, "rewards/accuracy_reward/std": 0.075, "rewards/format_reward/mean": 0.9125, "rewards/format_reward/std": 0.19454776644706726, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 160.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 160.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 160.0, "completions/min_terminated_length": 0.0, "entropy": 0.10914715133840218, "epoch": 0.75, "frac_reward_zero_std": 0.5, "grad_norm": 0.006838708650320768, "kl": 0.37330353884026407, "learning_rate": 1.2678571428571429e-05, "loss": 0.0037, "num_tokens": 968198.0, "reward": 0.99375, "reward_std": 0.16918914914131164, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.09031128883361816, "rewards/format_reward/mean": 0.9625, "rewards/format_reward/std": 0.07887786626815796, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 160.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 160.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 160.0, "completions/min_terminated_length": 0.0, "entropy": 0.11798728982685133, "epoch": 0.7857142857142857, "frac_reward_zero_std": 0.7, "grad_norm": 0.020410774275660515, "kl": 0.3922887581400573, "learning_rate": 1.0892857142857144e-05, "loss": 0.0039, "num_tokens": 1014656.0, "reward": 0.9625, "reward_std": 0.1187208503484726, "rewards/accuracy_reward/mean": 0.00625, "rewards/accuracy_reward/std": 0.025, "rewards/format_reward/mean": 0.95625, "rewards/format_reward/std": 0.10862429738044739, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 160.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 160.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 160.0, "completions/min_terminated_length": 0.0, "entropy": 0.1181378597393632, "epoch": 0.8214285714285714, "frac_reward_zero_std": 0.5, "grad_norm": 0.011710015125572681, "kl": 0.38200892861932517, "learning_rate": 9.107142857142858e-06, "loss": 0.0038, "num_tokens": 1061108.0, "reward": 0.99375, "reward_std": 0.1616260051727295, "rewards/accuracy_reward/mean": 0.025, "rewards/accuracy_reward/std": 0.06831300854682923, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.09331300854682922, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 160.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 160.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 160.0, "completions/min_terminated_length": 0.0, "entropy": 0.1463294762186706, "epoch": 0.8571428571428571, "frac_reward_zero_std": 0.3, "grad_norm": 0.007682339753955603, "kl": 0.40318670785054567, "learning_rate": 7.321428571428572e-06, "loss": 0.004, "num_tokens": 1108338.0, "reward": 0.925, "reward_std": 0.25421872138977053, "rewards/accuracy_reward/mean": 0.01875, "rewards/accuracy_reward/std": 0.05915650427341461, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.1950622320175171, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 160.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 160.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 160.0, "completions/min_terminated_length": 0.0, "entropy": 0.12144056173274294, "epoch": 0.8928571428571429, "frac_reward_zero_std": 0.5, "grad_norm": 0.01580824889242649, "kl": 0.384551886562258, "learning_rate": 5.535714285714285e-06, "loss": 0.0038, "num_tokens": 1153978.0, "reward": 0.9625, "reward_std": 0.16303436160087587, "rewards/accuracy_reward/mean": 0.0125, "rewards/accuracy_reward/std": 0.05, "rewards/format_reward/mean": 0.95, "rewards/format_reward/std": 0.11303437054157257, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 160.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 160.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 160.0, "completions/min_terminated_length": 0.0, "entropy": 0.13457293838728218, "epoch": 0.9285714285714286, "frac_reward_zero_std": 0.3, "grad_norm": 0.018065791577100754, "kl": 0.38744454458355904, "learning_rate": 3.75e-06, "loss": 0.0039, "num_tokens": 1200028.0, "reward": 0.99375, "reward_std": 0.19331300258636475, "rewards/accuracy_reward/mean": 0.025, "rewards/accuracy_reward/std": 0.06831300854682923, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.125, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 160.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 160.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 160.0, "completions/min_terminated_length": 0.0, "entropy": 0.12677656556479633, "epoch": 0.9642857142857143, "frac_reward_zero_std": 0.5, "grad_norm": 0.008128122426569462, "kl": 0.3681739863939583, "learning_rate": 1.9642857142857144e-06, "loss": 0.0037, "num_tokens": 1246390.0, "reward": 0.91875, "reward_std": 0.17746950387954713, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.91875, "rewards/format_reward/std": 0.17746951282024384, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 160.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 160.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 160.0, "completions/min_terminated_length": 0.0, "entropy": 0.1310231292503886, "epoch": 1.0, "frac_reward_zero_std": 0.6, "grad_norm": 0.3009057939052582, "kl": 0.42363901240751145, "learning_rate": 1.7857142857142858e-07, "loss": 0.0042, "num_tokens": 1292538.0, "reward": 0.98125, "reward_std": 0.10915650129318237, "rewards/accuracy_reward/mean": 0.00625, "rewards/accuracy_reward/std": 0.025, "rewards/format_reward/mean": 0.975, "rewards/format_reward/std": 0.08415650427341462, "step": 280 } ], "logging_steps": 10, "max_steps": 280, "num_input_tokens_seen": 1292538, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }