| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.045, |
| "eval_steps": 500, |
| "global_step": 180, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3072.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1074.8, |
| "completions/max_terminated_length": 1074.8, |
| "completions/mean_length": 706.75, |
| "completions/mean_terminated_length": 706.75, |
| "completions/min_length": 402.2, |
| "completions/min_terminated_length": 402.2, |
| "epoch": 0.00125, |
| "frac_reward_zero_std": 0.2, |
| "grad_norm": 1.4768723249435425, |
| "kl": 0.007990466803312302, |
| "learning_rate": 4e-07, |
| "loss": 0.0002, |
| "num_tokens": 69230.0, |
| "reward": 1.8499999523162842, |
| "reward_std": 0.642622172832489, |
| "rewards/reward_episode/mean": 0.75, |
| "rewards/reward_episode/std": 0.37370702624320984, |
| "rewards/reward_format/mean": 0.30000001192092896, |
| "rewards/reward_format/std": 0.0, |
| "rewards/reward_turn_acc/mean": 0.8, |
| "rewards/reward_turn_acc/std": 0.2689151793718338, |
| "step": 5, |
| "step_time": 184.47292952840002 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3072.0, |
| "completions/clipped_ratio": 0.025, |
| "completions/max_length": 1073.4, |
| "completions/max_terminated_length": 886.6, |
| "completions/mean_length": 593.95, |
| "completions/mean_terminated_length": 551.6178588867188, |
| "completions/min_length": 252.6, |
| "completions/min_terminated_length": 252.6, |
| "epoch": 0.0025, |
| "frac_reward_zero_std": 0.6, |
| "grad_norm": 0.24281319975852966, |
| "kl": 0.023634731024503707, |
| "learning_rate": 9e-07, |
| "loss": 0.0005, |
| "num_tokens": 133948.0, |
| "reward": 1.4514285564422607, |
| "reward_std": 0.18255305290222168, |
| "rewards/reward_episode/mean": 0.5714285731315613, |
| "rewards/reward_episode/std": 0.06998541951179504, |
| "rewards/reward_format/mean": 0.2925000131130219, |
| "rewards/reward_format/std": 0.02121320515871048, |
| "rewards/reward_turn_acc/mean": 0.5875, |
| "rewards/reward_turn_acc/std": 0.12069339156150818, |
| "step": 10, |
| "step_time": 149.69308385100004 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3072.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 975.2, |
| "completions/max_terminated_length": 975.2, |
| "completions/mean_length": 659.025, |
| "completions/mean_terminated_length": 659.025, |
| "completions/min_length": 404.8, |
| "completions/min_terminated_length": 404.8, |
| "epoch": 0.00375, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.18021762371063232, |
| "kl": 0.024673289433121682, |
| "learning_rate": 1.4e-06, |
| "loss": 0.0005, |
| "num_tokens": 201269.0, |
| "reward": 2.174999952316284, |
| "reward_std": 0.13363062143325805, |
| "rewards/reward_episode/mean": 0.9, |
| "rewards/reward_episode/std": 0.10690449476242066, |
| "rewards/reward_format/mean": 0.30000001192092896, |
| "rewards/reward_format/std": 0.0, |
| "rewards/reward_turn_acc/mean": 0.975, |
| "rewards/reward_turn_acc/std": 0.026726123690605164, |
| "step": 15, |
| "step_time": 148.49896004200008 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3072.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 959.2, |
| "completions/max_terminated_length": 959.2, |
| "completions/mean_length": 668.625, |
| "completions/mean_terminated_length": 668.625, |
| "completions/min_length": 371.4, |
| "completions/min_terminated_length": 371.4, |
| "epoch": 0.005, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.1508709192276001, |
| "kl": 0.026933318004012108, |
| "learning_rate": 1.8999999999999998e-06, |
| "loss": 0.0005, |
| "num_tokens": 268974.0, |
| "reward": 1.8666666150093079, |
| "reward_std": 0.09428089261054992, |
| "rewards/reward_episode/mean": 0.775, |
| "rewards/reward_episode/std": 0.07071067690849304, |
| "rewards/reward_format/mean": 0.30000001192092896, |
| "rewards/reward_format/std": 0.0, |
| "rewards/reward_turn_acc/mean": 0.7916666746139527, |
| "rewards/reward_turn_acc/std": 0.0235702246427536, |
| "step": 20, |
| "step_time": 150.50589508559997 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3072.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1054.2, |
| "completions/max_terminated_length": 1054.2, |
| "completions/mean_length": 713.4, |
| "completions/mean_terminated_length": 713.4, |
| "completions/min_length": 492.8, |
| "completions/min_terminated_length": 492.8, |
| "epoch": 0.00625, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.21855127811431885, |
| "kl": 0.025407736003398896, |
| "learning_rate": 1.997564050259824e-06, |
| "loss": 0.0005, |
| "num_tokens": 338470.0, |
| "reward": 1.8666666150093079, |
| "reward_std": 0.09428089261054992, |
| "rewards/reward_episode/mean": 0.775, |
| "rewards/reward_episode/std": 0.07071067690849304, |
| "rewards/reward_format/mean": 0.30000001192092896, |
| "rewards/reward_format/std": 0.0, |
| "rewards/reward_turn_acc/mean": 0.7916666746139527, |
| "rewards/reward_turn_acc/std": 0.0235702246427536, |
| "step": 25, |
| "step_time": 164.48858075240005 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3072.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1610.2, |
| "completions/max_terminated_length": 1610.2, |
| "completions/mean_length": 878.9, |
| "completions/mean_terminated_length": 878.9, |
| "completions/min_length": 479.6, |
| "completions/min_terminated_length": 479.6, |
| "epoch": 0.0075, |
| "frac_reward_zero_std": 0.4, |
| "grad_norm": 1.978322982788086, |
| "kl": 0.023890410736203193, |
| "learning_rate": 1.9876883405951377e-06, |
| "loss": 0.0005, |
| "num_tokens": 414586.0, |
| "reward": 1.7187499523162841, |
| "reward_std": 0.4155827879905701, |
| "rewards/reward_episode/mean": 0.625, |
| "rewards/reward_episode/std": 0.2777303636074066, |
| "rewards/reward_format/mean": 0.30000001192092896, |
| "rewards/reward_format/std": 0.0, |
| "rewards/reward_turn_acc/mean": 0.79375, |
| "rewards/reward_turn_acc/std": 0.14834305942058562, |
| "step": 30, |
| "step_time": 201.0124486300001 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3072.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 699.2, |
| "completions/max_terminated_length": 699.2, |
| "completions/mean_length": 517.35, |
| "completions/mean_terminated_length": 517.35, |
| "completions/min_length": 413.4, |
| "completions/min_terminated_length": 413.4, |
| "epoch": 0.00875, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.17440621554851532, |
| "kl": 0.02569504640996456, |
| "learning_rate": 1.9702957262759963e-06, |
| "loss": 0.0005, |
| "num_tokens": 476240.0, |
| "reward": 1.599999976158142, |
| "reward_std": 0.0, |
| "rewards/reward_episode/mean": 0.6, |
| "rewards/reward_episode/std": 0.0, |
| "rewards/reward_format/mean": 0.30000001192092896, |
| "rewards/reward_format/std": 0.0, |
| "rewards/reward_turn_acc/mean": 0.7, |
| "rewards/reward_turn_acc/std": 0.0, |
| "step": 35, |
| "step_time": 120.60697470220002 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3072.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1189.0, |
| "completions/max_terminated_length": 1189.0, |
| "completions/mean_length": 727.275, |
| "completions/mean_terminated_length": 727.275, |
| "completions/min_length": 348.2, |
| "completions/min_terminated_length": 348.2, |
| "epoch": 0.01, |
| "frac_reward_zero_std": 0.6, |
| "grad_norm": 0.1508954018354416, |
| "kl": 0.023599811643362046, |
| "learning_rate": 1.945518575599317e-06, |
| "loss": 0.0005, |
| "num_tokens": 546291.0, |
| "reward": 1.8999999523162843, |
| "reward_std": 0.42761797904968263, |
| "rewards/reward_episode/mean": 0.8, |
| "rewards/reward_episode/std": 0.21380898952484131, |
| "rewards/reward_format/mean": 0.30000001192092896, |
| "rewards/reward_format/std": 0.0, |
| "rewards/reward_turn_acc/mean": 0.8, |
| "rewards/reward_turn_acc/std": 0.21380898952484131, |
| "step": 40, |
| "step_time": 167.5162495483999 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3072.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 805.0, |
| "completions/max_terminated_length": 805.0, |
| "completions/mean_length": 518.625, |
| "completions/mean_terminated_length": 518.625, |
| "completions/min_length": 258.4, |
| "completions/min_terminated_length": 258.4, |
| "epoch": 0.01125, |
| "frac_reward_zero_std": 0.6, |
| "grad_norm": 0.21857145428657532, |
| "kl": 0.028848744928836823, |
| "learning_rate": 1.9135454576426007e-06, |
| "loss": 0.0006, |
| "num_tokens": 607996.0, |
| "reward": 1.3687499642372132, |
| "reward_std": 0.38325761556625365, |
| "rewards/reward_episode/mean": 0.525, |
| "rewards/reward_episode/std": 0.19609185457229614, |
| "rewards/reward_format/mean": 0.30000001192092896, |
| "rewards/reward_format/std": 0.0, |
| "rewards/reward_turn_acc/mean": 0.54375, |
| "rewards/reward_turn_acc/std": 0.19413249492645263, |
| "step": 45, |
| "step_time": 121.05726050320008 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3072.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 546.0, |
| "completions/max_terminated_length": 546.0, |
| "completions/mean_length": 371.525, |
| "completions/mean_terminated_length": 371.525, |
| "completions/min_length": 257.2, |
| "completions/min_terminated_length": 257.2, |
| "epoch": 0.0125, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 4.936390399932861, |
| "kl": 0.026249005272984505, |
| "learning_rate": 1.8746197071393956e-06, |
| "loss": 0.0005, |
| "num_tokens": 663817.0, |
| "reward": 1.4062499761581422, |
| "reward_std": 0.12938729524612427, |
| "rewards/reward_episode/mean": 0.525, |
| "rewards/reward_episode/std": 0.10350984334945679, |
| "rewards/reward_format/mean": 0.30000001192092896, |
| "rewards/reward_format/std": 0.0, |
| "rewards/reward_turn_acc/mean": 0.58125, |
| "rewards/reward_turn_acc/std": 0.025877460837364197, |
| "step": 50, |
| "step_time": 88.19599000860016 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3072.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1019.2, |
| "completions/max_terminated_length": 1019.2, |
| "completions/mean_length": 663.425, |
| "completions/mean_terminated_length": 663.425, |
| "completions/min_length": 407.0, |
| "completions/min_terminated_length": 407.0, |
| "epoch": 0.01375, |
| "frac_reward_zero_std": 0.4, |
| "grad_norm": 0.5663604140281677, |
| "kl": 0.024248089268803598, |
| "learning_rate": 1.8290375725550415e-06, |
| "loss": 0.0005, |
| "num_tokens": 731314.0, |
| "reward": 1.8249999642372132, |
| "reward_std": 0.28284270465373995, |
| "rewards/reward_episode/mean": 0.75, |
| "rewards/reward_episode/std": 0.1414213538169861, |
| "rewards/reward_format/mean": 0.30000001192092896, |
| "rewards/reward_format/std": 0.0, |
| "rewards/reward_turn_acc/mean": 0.775, |
| "rewards/reward_turn_acc/std": 0.1414213538169861, |
| "step": 55, |
| "step_time": 153.18542814620014 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3072.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1005.8, |
| "completions/max_terminated_length": 1005.8, |
| "completions/mean_length": 678.9, |
| "completions/mean_terminated_length": 678.9, |
| "completions/min_length": 355.4, |
| "completions/min_terminated_length": 355.4, |
| "epoch": 0.015, |
| "frac_reward_zero_std": 0.4, |
| "grad_norm": 1.0358364582061768, |
| "kl": 0.02808024510741234, |
| "learning_rate": 1.7771459614569707e-06, |
| "loss": 0.0006, |
| "num_tokens": 799430.0, |
| "reward": 1.3958333134651184, |
| "reward_std": 0.46076027154922483, |
| "rewards/reward_episode/mean": 0.45, |
| "rewards/reward_episode/std": 0.29960169792175295, |
| "rewards/reward_format/mean": 0.30000001192092896, |
| "rewards/reward_format/std": 0.0, |
| "rewards/reward_turn_acc/mean": 0.6458333373069763, |
| "rewards/reward_turn_acc/std": 0.16115862429141997, |
| "step": 60, |
| "step_time": 156.5224271021998 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3072.0, |
| "completions/clipped_ratio": 0.025, |
| "completions/max_length": 1094.6, |
| "completions/max_terminated_length": 924.6, |
| "completions/mean_length": 665.95, |
| "completions/mean_terminated_length": 635.0892944335938, |
| "completions/min_length": 369.8, |
| "completions/min_terminated_length": 369.8, |
| "epoch": 0.01625, |
| "frac_reward_zero_std": 0.4, |
| "grad_norm": 0.4963475167751312, |
| "kl": 0.03317975252866745, |
| "learning_rate": 1.719339800338651e-06, |
| "loss": 0.0007, |
| "num_tokens": 867028.0, |
| "reward": 1.731666624546051, |
| "reward_std": 0.3784398674964905, |
| "rewards/reward_episode/mean": 0.7, |
| "rewards/reward_episode/std": 0.23400336503982544, |
| "rewards/reward_format/mean": 0.2962500095367432, |
| "rewards/reward_format/std": 0.01060660257935524, |
| "rewards/reward_turn_acc/mean": 0.7354166746139527, |
| "rewards/reward_turn_acc/std": 0.13382990509271622, |
| "step": 65, |
| "step_time": 154.9406071833997 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3072.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 869.0, |
| "completions/max_terminated_length": 869.0, |
| "completions/mean_length": 517.85, |
| "completions/mean_terminated_length": 517.85, |
| "completions/min_length": 274.8, |
| "completions/min_terminated_length": 274.8, |
| "epoch": 0.0175, |
| "frac_reward_zero_std": 0.4, |
| "grad_norm": 0.18251997232437134, |
| "kl": 0.026099709048867225, |
| "learning_rate": 1.6560590289905071e-06, |
| "loss": 0.0005, |
| "num_tokens": 928702.0, |
| "reward": 1.2874999642372131, |
| "reward_std": 0.3805915355682373, |
| "rewards/reward_episode/mean": 0.425, |
| "rewards/reward_episode/std": 0.25587469935417173, |
| "rewards/reward_format/mean": 0.30000001192092896, |
| "rewards/reward_format/std": 0.0, |
| "rewards/reward_turn_acc/mean": 0.5625, |
| "rewards/reward_turn_acc/std": 0.1247168481349945, |
| "step": 70, |
| "step_time": 120.89109840739984 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3072.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 755.8, |
| "completions/max_terminated_length": 755.8, |
| "completions/mean_length": 585.1, |
| "completions/mean_terminated_length": 585.1, |
| "completions/min_length": 379.4, |
| "completions/min_terminated_length": 379.4, |
| "epoch": 0.01875, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.9700937271118164, |
| "kl": 0.038158373534679414, |
| "learning_rate": 1.587785252292473e-06, |
| "loss": 0.0008, |
| "num_tokens": 993066.0, |
| "reward": 1.6666666388511657, |
| "reward_std": 0.09428089261054992, |
| "rewards/reward_episode/mean": 0.625, |
| "rewards/reward_episode/std": 0.07071067690849304, |
| "rewards/reward_format/mean": 0.30000001192092896, |
| "rewards/reward_format/std": 0.0, |
| "rewards/reward_turn_acc/mean": 0.7416666746139526, |
| "rewards/reward_turn_acc/std": 0.02357022315263748, |
| "step": 75, |
| "step_time": 136.10523430559914 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3072.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 914.6, |
| "completions/max_terminated_length": 914.6, |
| "completions/mean_length": 637.2, |
| "completions/mean_terminated_length": 637.2, |
| "completions/min_length": 372.2, |
| "completions/min_terminated_length": 372.2, |
| "epoch": 0.02, |
| "frac_reward_zero_std": 0.6, |
| "grad_norm": 0.1770908683538437, |
| "kl": 0.03178380690515041, |
| "learning_rate": 1.5150380749100543e-06, |
| "loss": 0.0006, |
| "num_tokens": 1059514.0, |
| "reward": 1.6958333134651185, |
| "reward_std": 0.2440791130065918, |
| "rewards/reward_episode/mean": 0.65, |
| "rewards/reward_episode/std": 0.17422052025794982, |
| "rewards/reward_format/mean": 0.30000001192092896, |
| "rewards/reward_format/std": 0.0, |
| "rewards/reward_turn_acc/mean": 0.7458333373069763, |
| "rewards/reward_turn_acc/std": 0.06985861659049988, |
| "step": 80, |
| "step_time": 147.74201027340024 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3072.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 595.6, |
| "completions/max_terminated_length": 595.6, |
| "completions/mean_length": 415.1, |
| "completions/mean_terminated_length": 415.1, |
| "completions/min_length": 201.2, |
| "completions/min_terminated_length": 201.2, |
| "epoch": 0.02125, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 3.970363140106201, |
| "kl": 0.025650039687752722, |
| "learning_rate": 1.4383711467890773e-06, |
| "loss": 0.0005, |
| "num_tokens": 1117078.0, |
| "reward": 1.2210714101791382, |
| "reward_std": 0.12172651290893555, |
| "rewards/reward_episode/mean": 0.42857142984867097, |
| "rewards/reward_episode/std": 0.06998542547225953, |
| "rewards/reward_format/mean": 0.2925000131130219, |
| "rewards/reward_format/std": 0.02121320515871048, |
| "rewards/reward_turn_acc/mean": 0.5, |
| "rewards/reward_turn_acc/std": 0.05345224738121033, |
| "step": 85, |
| "step_time": 112.42376970660007 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3072.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 732.0, |
| "completions/max_terminated_length": 732.0, |
| "completions/mean_length": 534.5, |
| "completions/mean_terminated_length": 534.5, |
| "completions/min_length": 386.6, |
| "completions/min_terminated_length": 386.6, |
| "epoch": 0.0225, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.045042984187603, |
| "kl": 0.030681686848402022, |
| "learning_rate": 1.3583679495453e-06, |
| "loss": 0.0006, |
| "num_tokens": 1179418.0, |
| "reward": 1.399999976158142, |
| "reward_std": 0.13801310062408448, |
| "rewards/reward_episode/mean": 0.525, |
| "rewards/reward_episode/std": 0.10350984334945679, |
| "rewards/reward_format/mean": 0.30000001192092896, |
| "rewards/reward_format/std": 0.0, |
| "rewards/reward_turn_acc/mean": 0.575, |
| "rewards/reward_turn_acc/std": 0.034503278136253354, |
| "step": 90, |
| "step_time": 124.52647838519988 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3072.0, |
| "completions/clipped_ratio": 0.025, |
| "completions/max_length": 1451.0, |
| "completions/max_terminated_length": 1412.0, |
| "completions/mean_length": 658.3, |
| "completions/mean_terminated_length": 618.2035766601563, |
| "completions/min_length": 364.8, |
| "completions/min_terminated_length": 364.8, |
| "epoch": 0.02375, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.12616221606731415, |
| "kl": 0.032654117047786715, |
| "learning_rate": 1.275637355816999e-06, |
| "loss": 0.0007, |
| "num_tokens": 1246710.0, |
| "reward": 1.1749999880790711, |
| "reward_std": 0.046291005611419675, |
| "rewards/reward_episode/mean": 0.4, |
| "rewards/reward_episode/std": 0.0, |
| "rewards/reward_format/mean": 0.30000001192092896, |
| "rewards/reward_format/std": 0.0, |
| "rewards/reward_turn_acc/mean": 0.475, |
| "rewards/reward_turn_acc/std": 0.046291005611419675, |
| "step": 95, |
| "step_time": 152.6072565958002 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3072.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 935.0, |
| "completions/max_terminated_length": 935.0, |
| "completions/mean_length": 567.975, |
| "completions/mean_terminated_length": 567.975, |
| "completions/min_length": 283.0, |
| "completions/min_terminated_length": 283.0, |
| "epoch": 0.025, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.1803286075592041, |
| "kl": 0.034163566678762435, |
| "learning_rate": 1.1908089953765447e-06, |
| "loss": 0.0007, |
| "num_tokens": 1310389.0, |
| "reward": 1.1333333194255828, |
| "reward_std": 0.06172134280204773, |
| "rewards/reward_episode/mean": 0.4, |
| "rewards/reward_episode/std": 0.0, |
| "rewards/reward_format/mean": 0.30000001192092896, |
| "rewards/reward_format/std": 0.0, |
| "rewards/reward_turn_acc/mean": 0.4333333343267441, |
| "rewards/reward_turn_acc/std": 0.06172134280204773, |
| "step": 100, |
| "step_time": 131.9193014902001 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3072.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1057.6, |
| "completions/max_terminated_length": 1057.6, |
| "completions/mean_length": 634.95, |
| "completions/mean_terminated_length": 634.95, |
| "completions/min_length": 412.8, |
| "completions/min_terminated_length": 412.8, |
| "epoch": 0.02625, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.22720672190189362, |
| "kl": 0.030544454604387282, |
| "learning_rate": 1.1045284632676535e-06, |
| "loss": 0.0006, |
| "num_tokens": 1376747.0, |
| "reward": 1.8333333134651184, |
| "reward_std": 0.1234426736831665, |
| "rewards/reward_episode/mean": 0.75, |
| "rewards/reward_episode/std": 0.09258201122283935, |
| "rewards/reward_format/mean": 0.30000001192092896, |
| "rewards/reward_format/std": 0.0, |
| "rewards/reward_turn_acc/mean": 0.7833333373069763, |
| "rewards/reward_turn_acc/std": 0.030860668420791625, |
| "step": 105, |
| "step_time": 147.16702165919997 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3072.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1322.2, |
| "completions/max_terminated_length": 1322.2, |
| "completions/mean_length": 742.775, |
| "completions/mean_terminated_length": 742.775, |
| "completions/min_length": 334.6, |
| "completions/min_terminated_length": 334.6, |
| "epoch": 0.0275, |
| "frac_reward_zero_std": 0.4, |
| "grad_norm": 0.27993643283843994, |
| "kl": 0.03184134848415852, |
| "learning_rate": 1.0174524064372837e-06, |
| "loss": 0.0006, |
| "num_tokens": 1447418.0, |
| "reward": 1.2216071248054505, |
| "reward_std": 0.47933497428894045, |
| "rewards/reward_episode/mean": 0.39285714030265806, |
| "rewards/reward_episode/std": 0.2819071352481842, |
| "rewards/reward_format/mean": 0.28500001430511473, |
| "rewards/reward_format/std": 0.04242641031742096, |
| "rewards/reward_turn_acc/mean": 0.54375, |
| "rewards/reward_turn_acc/std": 0.20800404548645018, |
| "step": 110, |
| "step_time": 199.30736396940082 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3072.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 890.4, |
| "completions/max_terminated_length": 890.4, |
| "completions/mean_length": 571.225, |
| "completions/mean_terminated_length": 571.225, |
| "completions/min_length": 361.6, |
| "completions/min_terminated_length": 361.6, |
| "epoch": 0.02875, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.07077457010746002, |
| "kl": 0.0366281196475029, |
| "learning_rate": 9.302435262558747e-07, |
| "loss": 0.0007, |
| "num_tokens": 1511227.0, |
| "reward": 1.2499999761581422, |
| "reward_std": 0.20701966285705567, |
| "rewards/reward_episode/mean": 0.475, |
| "rewards/reward_episode/std": 0.10350984334945679, |
| "rewards/reward_format/mean": 0.30000001192092896, |
| "rewards/reward_format/std": 0.0, |
| "rewards/reward_turn_acc/mean": 0.475, |
| "rewards/reward_turn_acc/std": 0.10350984334945679, |
| "step": 115, |
| "step_time": 132.90744929379872 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3072.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 822.4, |
| "completions/max_terminated_length": 822.4, |
| "completions/mean_length": 557.675, |
| "completions/mean_terminated_length": 557.675, |
| "completions/min_length": 374.8, |
| "completions/min_terminated_length": 374.8, |
| "epoch": 0.03, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.12555372714996338, |
| "kl": 0.0267240971326828, |
| "learning_rate": 8.435655349597689e-07, |
| "loss": 0.0005, |
| "num_tokens": 1574494.0, |
| "reward": 1.7333333134651183, |
| "reward_std": 0.13801310062408448, |
| "rewards/reward_episode/mean": 0.675, |
| "rewards/reward_episode/std": 0.10350984334945679, |
| "rewards/reward_format/mean": 0.30000001192092896, |
| "rewards/reward_format/std": 0.0, |
| "rewards/reward_turn_acc/mean": 0.7583333373069763, |
| "rewards/reward_turn_acc/std": 0.034503278136253354, |
| "step": 120, |
| "step_time": 129.78554452779935 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3072.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 928.8, |
| "completions/max_terminated_length": 928.8, |
| "completions/mean_length": 662.975, |
| "completions/mean_terminated_length": 662.975, |
| "completions/min_length": 428.0, |
| "completions/min_terminated_length": 428.0, |
| "epoch": 0.03125, |
| "frac_reward_zero_std": 0.6, |
| "grad_norm": 0.16741153597831726, |
| "kl": 0.041321803256869315, |
| "learning_rate": 7.580781044003324e-07, |
| "loss": 0.0008, |
| "num_tokens": 1641973.0, |
| "reward": 2.1333333015441895, |
| "reward_std": 0.261455774307251, |
| "rewards/reward_episode/mean": 0.875, |
| "rewards/reward_episode/std": 0.19609185457229614, |
| "rewards/reward_format/mean": 0.30000001192092896, |
| "rewards/reward_format/std": 0.0, |
| "rewards/reward_turn_acc/mean": 0.9583333373069763, |
| "rewards/reward_turn_acc/std": 0.06536394655704499, |
| "step": 125, |
| "step_time": 153.5184129243993 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3072.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 806.8, |
| "completions/max_terminated_length": 806.8, |
| "completions/mean_length": 583.45, |
| "completions/mean_terminated_length": 583.45, |
| "completions/min_length": 364.4, |
| "completions/min_terminated_length": 364.4, |
| "epoch": 0.0325, |
| "frac_reward_zero_std": 0.6, |
| "grad_norm": 0.07892462611198425, |
| "kl": 0.04311100393533707, |
| "learning_rate": 6.744318455428435e-07, |
| "loss": 0.0009, |
| "num_tokens": 1706271.0, |
| "reward": 1.954166626930237, |
| "reward_std": 0.1628679633140564, |
| "rewards/reward_episode/mean": 0.75, |
| "rewards/reward_episode/std": 0.09258201122283935, |
| "rewards/reward_format/mean": 0.30000001192092896, |
| "rewards/reward_format/std": 0.0, |
| "rewards/reward_turn_acc/mean": 0.9041666746139526, |
| "rewards/reward_turn_acc/std": 0.07028595805168152, |
| "step": 130, |
| "step_time": 135.70551129099914 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3072.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 993.2, |
| "completions/max_terminated_length": 993.2, |
| "completions/mean_length": 663.55, |
| "completions/mean_terminated_length": 663.55, |
| "completions/min_length": 458.8, |
| "completions/min_terminated_length": 458.8, |
| "epoch": 0.03375, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.1227680891752243, |
| "kl": 0.032725603133440015, |
| "learning_rate": 5.932633569241999e-07, |
| "loss": 0.0007, |
| "num_tokens": 1773773.0, |
| "reward": 1.8399999737739563, |
| "reward_std": 0.11109839677810669, |
| "rewards/reward_episode/mean": 0.75, |
| "rewards/reward_episode/std": 0.09258201122283935, |
| "rewards/reward_format/mean": 0.30000001192092896, |
| "rewards/reward_format/std": 0.0, |
| "rewards/reward_turn_acc/mean": 0.7899999976158142, |
| "rewards/reward_turn_acc/std": 0.018516401946544647, |
| "step": 135, |
| "step_time": 153.66733937559985 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3072.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 886.8, |
| "completions/max_terminated_length": 886.8, |
| "completions/mean_length": 611.125, |
| "completions/mean_terminated_length": 611.125, |
| "completions/min_length": 443.4, |
| "completions/min_terminated_length": 443.4, |
| "epoch": 0.035, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.11796784400939941, |
| "kl": 0.03184816054999828, |
| "learning_rate": 5.15190379753663e-07, |
| "loss": 0.0006, |
| "num_tokens": 1839178.0, |
| "reward": 1.4999999761581422, |
| "reward_std": 0.0, |
| "rewards/reward_episode/mean": 0.6, |
| "rewards/reward_episode/std": 0.0, |
| "rewards/reward_format/mean": 0.30000001192092896, |
| "rewards/reward_format/std": 0.0, |
| "rewards/reward_turn_acc/mean": 0.6, |
| "rewards/reward_turn_acc/std": 0.0, |
| "step": 140, |
| "step_time": 141.9817959435997 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3072.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1114.2, |
| "completions/max_terminated_length": 1114.2, |
| "completions/mean_length": 684.85, |
| "completions/mean_terminated_length": 684.85, |
| "completions/min_length": 422.0, |
| "completions/min_terminated_length": 422.0, |
| "epoch": 0.03625, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.24452009797096252, |
| "kl": 0.03683492615818977, |
| "learning_rate": 4.408070965292533e-07, |
| "loss": 0.0007, |
| "num_tokens": 1907532.0, |
| "reward": 1.749999964237213, |
| "reward_std": 0.20701966285705567, |
| "rewards/reward_episode/mean": 0.725, |
| "rewards/reward_episode/std": 0.10350984334945679, |
| "rewards/reward_format/mean": 0.30000001192092896, |
| "rewards/reward_format/std": 0.0, |
| "rewards/reward_turn_acc/mean": 0.725, |
| "rewards/reward_turn_acc/std": 0.10350984334945679, |
| "step": 145, |
| "step_time": 158.1961192132003 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3072.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 877.8, |
| "completions/max_terminated_length": 877.8, |
| "completions/mean_length": 636.8, |
| "completions/mean_terminated_length": 636.8, |
| "completions/min_length": 292.6, |
| "completions/min_terminated_length": 292.6, |
| "epoch": 0.0375, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.13973401486873627, |
| "kl": 0.026585786044597624, |
| "learning_rate": 3.706796089501627e-07, |
| "loss": 0.0005, |
| "num_tokens": 1973964.0, |
| "reward": 1.4499999761581421, |
| "reward_std": 0.1414213538169861, |
| "rewards/reward_episode/mean": 0.575, |
| "rewards/reward_episode/std": 0.07071067690849304, |
| "rewards/reward_format/mean": 0.30000001192092896, |
| "rewards/reward_format/std": 0.0, |
| "rewards/reward_turn_acc/mean": 0.575, |
| "rewards/reward_turn_acc/std": 0.07071067690849304, |
| "step": 150, |
| "step_time": 147.47617424319978 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3072.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1018.8, |
| "completions/max_terminated_length": 1018.8, |
| "completions/mean_length": 711.375, |
| "completions/mean_terminated_length": 711.375, |
| "completions/min_length": 425.6, |
| "completions/min_terminated_length": 425.6, |
| "epoch": 0.03875, |
| "frac_reward_zero_std": 0.6, |
| "grad_norm": 0.13568446040153503, |
| "kl": 0.03252220153808594, |
| "learning_rate": 3.0534162954100263e-07, |
| "loss": 0.0007, |
| "num_tokens": 2043379.0, |
| "reward": 1.9687499523162841, |
| "reward_std": 0.3542271614074707, |
| "rewards/reward_episode/mean": 0.8, |
| "rewards/reward_episode/std": 0.21380898952484131, |
| "rewards/reward_format/mean": 0.30000001192092896, |
| "rewards/reward_format/std": 0.0, |
| "rewards/reward_turn_acc/mean": 0.86875, |
| "rewards/reward_turn_acc/std": 0.14075465500354767, |
| "step": 155, |
| "step_time": 164.7620894429987 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3072.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1127.2, |
| "completions/max_terminated_length": 1127.2, |
| "completions/mean_length": 721.725, |
| "completions/mean_terminated_length": 721.725, |
| "completions/min_length": 347.4, |
| "completions/min_terminated_length": 347.4, |
| "epoch": 0.04, |
| "frac_reward_zero_std": 0.6, |
| "grad_norm": 19.780593872070312, |
| "kl": 0.02986324019730091, |
| "learning_rate": 2.45290419777228e-07, |
| "loss": 0.0006, |
| "num_tokens": 2113208.0, |
| "reward": 1.7124999642372132, |
| "reward_std": 0.30222115516662595, |
| "rewards/reward_episode/mean": 0.675, |
| "rewards/reward_episode/std": 0.19609185457229614, |
| "rewards/reward_format/mean": 0.30000001192092896, |
| "rewards/reward_format/std": 0.0, |
| "rewards/reward_turn_acc/mean": 0.7375, |
| "rewards/reward_turn_acc/std": 0.10890566408634186, |
| "step": 160, |
| "step_time": 166.95693593720063 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3072.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1160.8, |
| "completions/max_terminated_length": 1160.8, |
| "completions/mean_length": 720.15, |
| "completions/mean_terminated_length": 720.15, |
| "completions/min_length": 314.8, |
| "completions/min_terminated_length": 314.8, |
| "epoch": 0.04125, |
| "frac_reward_zero_std": 0.2, |
| "grad_norm": 4.931512832641602, |
| "kl": 1750.856663009897, |
| "learning_rate": 1.9098300562505264e-07, |
| "loss": 35.0171, |
| "num_tokens": 2182974.0, |
| "reward": 1.5805952072143554, |
| "reward_std": 0.6564932465553284, |
| "rewards/reward_episode/mean": 0.6214285731315613, |
| "rewards/reward_episode/std": 0.3511104345321655, |
| "rewards/reward_format/mean": 0.2925000131130219, |
| "rewards/reward_format/std": 0.02121320515871048, |
| "rewards/reward_turn_acc/mean": 0.6666666686534881, |
| "rewards/reward_turn_acc/std": 0.31317211091518404, |
| "step": 165, |
| "step_time": 180.93480901119838 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3072.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1163.0, |
| "completions/max_terminated_length": 1163.0, |
| "completions/mean_length": 622.675, |
| "completions/mean_terminated_length": 622.675, |
| "completions/min_length": 243.2, |
| "completions/min_terminated_length": 243.2, |
| "epoch": 0.0425, |
| "frac_reward_zero_std": 0.6, |
| "grad_norm": 0.1496662050485611, |
| "kl": 0.029036011174321176, |
| "learning_rate": 1.4283269929788776e-07, |
| "loss": 0.0006, |
| "num_tokens": 2248841.0, |
| "reward": 1.5249999642372132, |
| "reward_std": 0.32403703927993777, |
| "rewards/reward_episode/mean": 0.6, |
| "rewards/reward_episode/std": 0.1851640224456787, |
| "rewards/reward_format/mean": 0.30000001192092896, |
| "rewards/reward_format/std": 0.0, |
| "rewards/reward_turn_acc/mean": 0.625, |
| "rewards/reward_turn_acc/std": 0.13887301683425904, |
| "step": 170, |
| "step_time": 144.32274561419982 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3072.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 879.4, |
| "completions/max_terminated_length": 879.4, |
| "completions/mean_length": 593.05, |
| "completions/mean_terminated_length": 593.05, |
| "completions/min_length": 417.0, |
| "completions/min_terminated_length": 417.0, |
| "epoch": 0.04375, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.16961710155010223, |
| "kl": 0.030310070887207986, |
| "learning_rate": 1.0120595370083318e-07, |
| "loss": 0.0006, |
| "num_tokens": 2313523.0, |
| "reward": 1.368749976158142, |
| "reward_std": 0.18696351051330568, |
| "rewards/reward_episode/mean": 0.525, |
| "rewards/reward_episode/std": 0.10350984334945679, |
| "rewards/reward_format/mean": 0.30000001192092896, |
| "rewards/reward_format/std": 0.0, |
| "rewards/reward_turn_acc/mean": 0.54375, |
| "rewards/reward_turn_acc/std": 0.09038608074188233, |
| "step": 175, |
| "step_time": 137.76643493900048 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completion_length": 3072.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 959.6, |
| "completions/max_terminated_length": 959.6, |
| "completions/mean_length": 674.2, |
| "completions/mean_terminated_length": 674.2, |
| "completions/min_length": 435.4, |
| "completions/min_terminated_length": 435.4, |
| "epoch": 0.045, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.11210030317306519, |
| "kl": 0.03215576484799385, |
| "learning_rate": 6.641957350279837e-08, |
| "loss": 0.0006, |
| "num_tokens": 2381451.0, |
| "reward": 1.8999999642372132, |
| "reward_std": 0.0, |
| "rewards/reward_episode/mean": 0.8, |
| "rewards/reward_episode/std": 0.0, |
| "rewards/reward_format/mean": 0.30000001192092896, |
| "rewards/reward_format/std": 0.0, |
| "rewards/reward_turn_acc/mean": 0.8, |
| "rewards/reward_turn_acc/std": 0.0, |
| "step": 180, |
| "step_time": 155.74470006280026 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 200, |
| "num_input_tokens_seen": 2381451, |
| "num_train_epochs": 1, |
| "save_steps": 20, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|