{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.045, "eval_steps": 500, "global_step": 180, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3072.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1074.8, "completions/max_terminated_length": 1074.8, "completions/mean_length": 706.75, "completions/mean_terminated_length": 706.75, "completions/min_length": 402.2, "completions/min_terminated_length": 402.2, "epoch": 0.00125, "frac_reward_zero_std": 0.2, "grad_norm": 1.4768723249435425, "kl": 0.007990466803312302, "learning_rate": 4e-07, "loss": 0.0002, "num_tokens": 69230.0, "reward": 1.8499999523162842, "reward_std": 0.642622172832489, "rewards/reward_episode/mean": 0.75, "rewards/reward_episode/std": 0.37370702624320984, "rewards/reward_format/mean": 0.30000001192092896, "rewards/reward_format/std": 0.0, "rewards/reward_turn_acc/mean": 0.8, "rewards/reward_turn_acc/std": 0.2689151793718338, "step": 5, "step_time": 184.47292952840002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3072.0, "completions/clipped_ratio": 0.025, "completions/max_length": 1073.4, "completions/max_terminated_length": 886.6, "completions/mean_length": 593.95, "completions/mean_terminated_length": 551.6178588867188, "completions/min_length": 252.6, "completions/min_terminated_length": 252.6, "epoch": 0.0025, "frac_reward_zero_std": 0.6, "grad_norm": 0.24281319975852966, "kl": 0.023634731024503707, "learning_rate": 9e-07, "loss": 0.0005, "num_tokens": 133948.0, "reward": 1.4514285564422607, "reward_std": 0.18255305290222168, "rewards/reward_episode/mean": 0.5714285731315613, "rewards/reward_episode/std": 0.06998541951179504, "rewards/reward_format/mean": 0.2925000131130219, "rewards/reward_format/std": 0.02121320515871048, "rewards/reward_turn_acc/mean": 0.5875, "rewards/reward_turn_acc/std": 0.12069339156150818, "step": 10, "step_time": 149.69308385100004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3072.0, "completions/clipped_ratio": 0.0, "completions/max_length": 975.2, "completions/max_terminated_length": 975.2, "completions/mean_length": 659.025, "completions/mean_terminated_length": 659.025, "completions/min_length": 404.8, "completions/min_terminated_length": 404.8, "epoch": 0.00375, "frac_reward_zero_std": 0.8, "grad_norm": 0.18021762371063232, "kl": 0.024673289433121682, "learning_rate": 1.4e-06, "loss": 0.0005, "num_tokens": 201269.0, "reward": 2.174999952316284, "reward_std": 0.13363062143325805, "rewards/reward_episode/mean": 0.9, "rewards/reward_episode/std": 0.10690449476242066, "rewards/reward_format/mean": 0.30000001192092896, "rewards/reward_format/std": 0.0, "rewards/reward_turn_acc/mean": 0.975, "rewards/reward_turn_acc/std": 0.026726123690605164, "step": 15, "step_time": 148.49896004200008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3072.0, "completions/clipped_ratio": 0.0, "completions/max_length": 959.2, "completions/max_terminated_length": 959.2, "completions/mean_length": 668.625, "completions/mean_terminated_length": 668.625, "completions/min_length": 371.4, "completions/min_terminated_length": 371.4, "epoch": 0.005, "frac_reward_zero_std": 0.8, "grad_norm": 0.1508709192276001, "kl": 0.026933318004012108, "learning_rate": 1.8999999999999998e-06, "loss": 0.0005, "num_tokens": 268974.0, "reward": 1.8666666150093079, "reward_std": 0.09428089261054992, "rewards/reward_episode/mean": 0.775, "rewards/reward_episode/std": 0.07071067690849304, "rewards/reward_format/mean": 0.30000001192092896, "rewards/reward_format/std": 0.0, "rewards/reward_turn_acc/mean": 0.7916666746139527, "rewards/reward_turn_acc/std": 0.0235702246427536, "step": 20, "step_time": 150.50589508559997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3072.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1054.2, "completions/max_terminated_length": 1054.2, "completions/mean_length": 713.4, "completions/mean_terminated_length": 713.4, "completions/min_length": 492.8, "completions/min_terminated_length": 492.8, "epoch": 0.00625, "frac_reward_zero_std": 0.8, "grad_norm": 0.21855127811431885, "kl": 0.025407736003398896, "learning_rate": 1.997564050259824e-06, "loss": 0.0005, "num_tokens": 338470.0, "reward": 1.8666666150093079, "reward_std": 0.09428089261054992, "rewards/reward_episode/mean": 0.775, "rewards/reward_episode/std": 0.07071067690849304, "rewards/reward_format/mean": 0.30000001192092896, "rewards/reward_format/std": 0.0, "rewards/reward_turn_acc/mean": 0.7916666746139527, "rewards/reward_turn_acc/std": 0.0235702246427536, "step": 25, "step_time": 164.48858075240005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3072.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1610.2, "completions/max_terminated_length": 1610.2, "completions/mean_length": 878.9, "completions/mean_terminated_length": 878.9, "completions/min_length": 479.6, "completions/min_terminated_length": 479.6, "epoch": 0.0075, "frac_reward_zero_std": 0.4, "grad_norm": 1.978322982788086, "kl": 0.023890410736203193, "learning_rate": 1.9876883405951377e-06, "loss": 0.0005, "num_tokens": 414586.0, "reward": 1.7187499523162841, "reward_std": 0.4155827879905701, "rewards/reward_episode/mean": 0.625, "rewards/reward_episode/std": 0.2777303636074066, "rewards/reward_format/mean": 0.30000001192092896, "rewards/reward_format/std": 0.0, "rewards/reward_turn_acc/mean": 0.79375, "rewards/reward_turn_acc/std": 0.14834305942058562, "step": 30, "step_time": 201.0124486300001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3072.0, "completions/clipped_ratio": 0.0, "completions/max_length": 699.2, "completions/max_terminated_length": 699.2, "completions/mean_length": 517.35, "completions/mean_terminated_length": 517.35, "completions/min_length": 413.4, "completions/min_terminated_length": 413.4, "epoch": 0.00875, "frac_reward_zero_std": 1.0, "grad_norm": 0.17440621554851532, "kl": 0.02569504640996456, "learning_rate": 1.9702957262759963e-06, "loss": 0.0005, "num_tokens": 476240.0, "reward": 1.599999976158142, "reward_std": 0.0, "rewards/reward_episode/mean": 0.6, "rewards/reward_episode/std": 0.0, "rewards/reward_format/mean": 0.30000001192092896, "rewards/reward_format/std": 0.0, "rewards/reward_turn_acc/mean": 0.7, "rewards/reward_turn_acc/std": 0.0, "step": 35, "step_time": 120.60697470220002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3072.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1189.0, "completions/max_terminated_length": 1189.0, "completions/mean_length": 727.275, "completions/mean_terminated_length": 727.275, "completions/min_length": 348.2, "completions/min_terminated_length": 348.2, "epoch": 0.01, "frac_reward_zero_std": 0.6, "grad_norm": 0.1508954018354416, "kl": 0.023599811643362046, "learning_rate": 1.945518575599317e-06, "loss": 0.0005, "num_tokens": 546291.0, "reward": 1.8999999523162843, "reward_std": 0.42761797904968263, "rewards/reward_episode/mean": 0.8, "rewards/reward_episode/std": 0.21380898952484131, "rewards/reward_format/mean": 0.30000001192092896, "rewards/reward_format/std": 0.0, "rewards/reward_turn_acc/mean": 0.8, "rewards/reward_turn_acc/std": 0.21380898952484131, "step": 40, "step_time": 167.5162495483999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3072.0, "completions/clipped_ratio": 0.0, "completions/max_length": 805.0, "completions/max_terminated_length": 805.0, "completions/mean_length": 518.625, "completions/mean_terminated_length": 518.625, "completions/min_length": 258.4, "completions/min_terminated_length": 258.4, "epoch": 0.01125, "frac_reward_zero_std": 0.6, "grad_norm": 0.21857145428657532, "kl": 0.028848744928836823, "learning_rate": 1.9135454576426007e-06, "loss": 0.0006, "num_tokens": 607996.0, "reward": 1.3687499642372132, "reward_std": 0.38325761556625365, "rewards/reward_episode/mean": 0.525, "rewards/reward_episode/std": 0.19609185457229614, "rewards/reward_format/mean": 0.30000001192092896, "rewards/reward_format/std": 0.0, "rewards/reward_turn_acc/mean": 0.54375, "rewards/reward_turn_acc/std": 0.19413249492645263, "step": 45, "step_time": 121.05726050320008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3072.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/max_terminated_length": 546.0, "completions/mean_length": 371.525, "completions/mean_terminated_length": 371.525, "completions/min_length": 257.2, "completions/min_terminated_length": 257.2, "epoch": 0.0125, "frac_reward_zero_std": 0.8, "grad_norm": 4.936390399932861, "kl": 0.026249005272984505, "learning_rate": 1.8746197071393956e-06, "loss": 0.0005, "num_tokens": 663817.0, "reward": 1.4062499761581422, "reward_std": 0.12938729524612427, "rewards/reward_episode/mean": 0.525, "rewards/reward_episode/std": 0.10350984334945679, "rewards/reward_format/mean": 0.30000001192092896, "rewards/reward_format/std": 0.0, "rewards/reward_turn_acc/mean": 0.58125, "rewards/reward_turn_acc/std": 0.025877460837364197, "step": 50, "step_time": 88.19599000860016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3072.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1019.2, "completions/max_terminated_length": 1019.2, "completions/mean_length": 663.425, "completions/mean_terminated_length": 663.425, "completions/min_length": 407.0, "completions/min_terminated_length": 407.0, "epoch": 0.01375, "frac_reward_zero_std": 0.4, "grad_norm": 0.5663604140281677, "kl": 0.024248089268803598, "learning_rate": 1.8290375725550415e-06, "loss": 0.0005, "num_tokens": 731314.0, "reward": 1.8249999642372132, "reward_std": 0.28284270465373995, "rewards/reward_episode/mean": 0.75, "rewards/reward_episode/std": 0.1414213538169861, "rewards/reward_format/mean": 0.30000001192092896, "rewards/reward_format/std": 0.0, "rewards/reward_turn_acc/mean": 0.775, "rewards/reward_turn_acc/std": 0.1414213538169861, "step": 55, "step_time": 153.18542814620014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3072.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1005.8, "completions/max_terminated_length": 1005.8, "completions/mean_length": 678.9, "completions/mean_terminated_length": 678.9, "completions/min_length": 355.4, "completions/min_terminated_length": 355.4, "epoch": 0.015, "frac_reward_zero_std": 0.4, "grad_norm": 1.0358364582061768, "kl": 0.02808024510741234, "learning_rate": 1.7771459614569707e-06, "loss": 0.0006, "num_tokens": 799430.0, "reward": 1.3958333134651184, "reward_std": 0.46076027154922483, "rewards/reward_episode/mean": 0.45, "rewards/reward_episode/std": 0.29960169792175295, "rewards/reward_format/mean": 0.30000001192092896, "rewards/reward_format/std": 0.0, "rewards/reward_turn_acc/mean": 0.6458333373069763, "rewards/reward_turn_acc/std": 0.16115862429141997, "step": 60, "step_time": 156.5224271021998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3072.0, "completions/clipped_ratio": 0.025, "completions/max_length": 1094.6, "completions/max_terminated_length": 924.6, "completions/mean_length": 665.95, "completions/mean_terminated_length": 635.0892944335938, "completions/min_length": 369.8, "completions/min_terminated_length": 369.8, "epoch": 0.01625, "frac_reward_zero_std": 0.4, "grad_norm": 0.4963475167751312, "kl": 0.03317975252866745, "learning_rate": 1.719339800338651e-06, "loss": 0.0007, "num_tokens": 867028.0, "reward": 1.731666624546051, "reward_std": 0.3784398674964905, "rewards/reward_episode/mean": 0.7, "rewards/reward_episode/std": 0.23400336503982544, "rewards/reward_format/mean": 0.2962500095367432, "rewards/reward_format/std": 0.01060660257935524, "rewards/reward_turn_acc/mean": 0.7354166746139527, "rewards/reward_turn_acc/std": 0.13382990509271622, "step": 65, "step_time": 154.9406071833997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3072.0, "completions/clipped_ratio": 0.0, "completions/max_length": 869.0, "completions/max_terminated_length": 869.0, "completions/mean_length": 517.85, "completions/mean_terminated_length": 517.85, "completions/min_length": 274.8, "completions/min_terminated_length": 274.8, "epoch": 0.0175, "frac_reward_zero_std": 0.4, "grad_norm": 0.18251997232437134, "kl": 0.026099709048867225, "learning_rate": 1.6560590289905071e-06, "loss": 0.0005, "num_tokens": 928702.0, "reward": 1.2874999642372131, "reward_std": 0.3805915355682373, "rewards/reward_episode/mean": 0.425, "rewards/reward_episode/std": 0.25587469935417173, "rewards/reward_format/mean": 0.30000001192092896, "rewards/reward_format/std": 0.0, "rewards/reward_turn_acc/mean": 0.5625, "rewards/reward_turn_acc/std": 0.1247168481349945, "step": 70, "step_time": 120.89109840739984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3072.0, "completions/clipped_ratio": 0.0, "completions/max_length": 755.8, "completions/max_terminated_length": 755.8, "completions/mean_length": 585.1, "completions/mean_terminated_length": 585.1, "completions/min_length": 379.4, "completions/min_terminated_length": 379.4, "epoch": 0.01875, "frac_reward_zero_std": 0.8, "grad_norm": 0.9700937271118164, "kl": 0.038158373534679414, "learning_rate": 1.587785252292473e-06, "loss": 0.0008, "num_tokens": 993066.0, "reward": 1.6666666388511657, "reward_std": 0.09428089261054992, "rewards/reward_episode/mean": 0.625, "rewards/reward_episode/std": 0.07071067690849304, "rewards/reward_format/mean": 0.30000001192092896, "rewards/reward_format/std": 0.0, "rewards/reward_turn_acc/mean": 0.7416666746139526, "rewards/reward_turn_acc/std": 0.02357022315263748, "step": 75, "step_time": 136.10523430559914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3072.0, "completions/clipped_ratio": 0.0, "completions/max_length": 914.6, "completions/max_terminated_length": 914.6, "completions/mean_length": 637.2, "completions/mean_terminated_length": 637.2, "completions/min_length": 372.2, "completions/min_terminated_length": 372.2, "epoch": 0.02, "frac_reward_zero_std": 0.6, "grad_norm": 0.1770908683538437, "kl": 0.03178380690515041, "learning_rate": 1.5150380749100543e-06, "loss": 0.0006, "num_tokens": 1059514.0, "reward": 1.6958333134651185, "reward_std": 0.2440791130065918, "rewards/reward_episode/mean": 0.65, "rewards/reward_episode/std": 0.17422052025794982, "rewards/reward_format/mean": 0.30000001192092896, "rewards/reward_format/std": 0.0, "rewards/reward_turn_acc/mean": 0.7458333373069763, "rewards/reward_turn_acc/std": 0.06985861659049988, "step": 80, "step_time": 147.74201027340024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3072.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.6, "completions/max_terminated_length": 595.6, "completions/mean_length": 415.1, "completions/mean_terminated_length": 415.1, "completions/min_length": 201.2, "completions/min_terminated_length": 201.2, "epoch": 0.02125, "frac_reward_zero_std": 0.8, "grad_norm": 3.970363140106201, "kl": 0.025650039687752722, "learning_rate": 1.4383711467890773e-06, "loss": 0.0005, "num_tokens": 1117078.0, "reward": 1.2210714101791382, "reward_std": 0.12172651290893555, "rewards/reward_episode/mean": 0.42857142984867097, "rewards/reward_episode/std": 0.06998542547225953, "rewards/reward_format/mean": 0.2925000131130219, "rewards/reward_format/std": 0.02121320515871048, "rewards/reward_turn_acc/mean": 0.5, "rewards/reward_turn_acc/std": 0.05345224738121033, "step": 85, "step_time": 112.42376970660007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3072.0, "completions/clipped_ratio": 0.0, "completions/max_length": 732.0, "completions/max_terminated_length": 732.0, "completions/mean_length": 534.5, "completions/mean_terminated_length": 534.5, "completions/min_length": 386.6, "completions/min_terminated_length": 386.6, "epoch": 0.0225, "frac_reward_zero_std": 0.8, "grad_norm": 0.045042984187603, "kl": 0.030681686848402022, "learning_rate": 1.3583679495453e-06, "loss": 0.0006, "num_tokens": 1179418.0, "reward": 1.399999976158142, "reward_std": 0.13801310062408448, "rewards/reward_episode/mean": 0.525, "rewards/reward_episode/std": 0.10350984334945679, "rewards/reward_format/mean": 0.30000001192092896, "rewards/reward_format/std": 0.0, "rewards/reward_turn_acc/mean": 0.575, "rewards/reward_turn_acc/std": 0.034503278136253354, "step": 90, "step_time": 124.52647838519988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3072.0, "completions/clipped_ratio": 0.025, "completions/max_length": 1451.0, "completions/max_terminated_length": 1412.0, "completions/mean_length": 658.3, "completions/mean_terminated_length": 618.2035766601563, "completions/min_length": 364.8, "completions/min_terminated_length": 364.8, "epoch": 0.02375, "frac_reward_zero_std": 0.8, "grad_norm": 0.12616221606731415, "kl": 0.032654117047786715, "learning_rate": 1.275637355816999e-06, "loss": 0.0007, "num_tokens": 1246710.0, "reward": 1.1749999880790711, "reward_std": 0.046291005611419675, "rewards/reward_episode/mean": 0.4, "rewards/reward_episode/std": 0.0, "rewards/reward_format/mean": 0.30000001192092896, "rewards/reward_format/std": 0.0, "rewards/reward_turn_acc/mean": 0.475, "rewards/reward_turn_acc/std": 0.046291005611419675, "step": 95, "step_time": 152.6072565958002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3072.0, "completions/clipped_ratio": 0.0, "completions/max_length": 935.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 567.975, "completions/mean_terminated_length": 567.975, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.025, "frac_reward_zero_std": 0.8, "grad_norm": 0.1803286075592041, "kl": 0.034163566678762435, "learning_rate": 1.1908089953765447e-06, "loss": 0.0007, "num_tokens": 1310389.0, "reward": 1.1333333194255828, "reward_std": 0.06172134280204773, "rewards/reward_episode/mean": 0.4, "rewards/reward_episode/std": 0.0, "rewards/reward_format/mean": 0.30000001192092896, "rewards/reward_format/std": 0.0, "rewards/reward_turn_acc/mean": 0.4333333343267441, "rewards/reward_turn_acc/std": 0.06172134280204773, "step": 100, "step_time": 131.9193014902001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3072.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1057.6, "completions/max_terminated_length": 1057.6, "completions/mean_length": 634.95, "completions/mean_terminated_length": 634.95, "completions/min_length": 412.8, "completions/min_terminated_length": 412.8, "epoch": 0.02625, "frac_reward_zero_std": 0.8, "grad_norm": 0.22720672190189362, "kl": 0.030544454604387282, "learning_rate": 1.1045284632676535e-06, "loss": 0.0006, "num_tokens": 1376747.0, "reward": 1.8333333134651184, "reward_std": 0.1234426736831665, "rewards/reward_episode/mean": 0.75, "rewards/reward_episode/std": 0.09258201122283935, "rewards/reward_format/mean": 0.30000001192092896, "rewards/reward_format/std": 0.0, "rewards/reward_turn_acc/mean": 0.7833333373069763, "rewards/reward_turn_acc/std": 0.030860668420791625, "step": 105, "step_time": 147.16702165919997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3072.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1322.2, "completions/max_terminated_length": 1322.2, "completions/mean_length": 742.775, "completions/mean_terminated_length": 742.775, "completions/min_length": 334.6, "completions/min_terminated_length": 334.6, "epoch": 0.0275, "frac_reward_zero_std": 0.4, "grad_norm": 0.27993643283843994, "kl": 0.03184134848415852, "learning_rate": 1.0174524064372837e-06, "loss": 0.0006, "num_tokens": 1447418.0, "reward": 1.2216071248054505, "reward_std": 0.47933497428894045, "rewards/reward_episode/mean": 0.39285714030265806, "rewards/reward_episode/std": 0.2819071352481842, "rewards/reward_format/mean": 0.28500001430511473, "rewards/reward_format/std": 0.04242641031742096, "rewards/reward_turn_acc/mean": 0.54375, "rewards/reward_turn_acc/std": 0.20800404548645018, "step": 110, "step_time": 199.30736396940082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3072.0, "completions/clipped_ratio": 0.0, "completions/max_length": 890.4, "completions/max_terminated_length": 890.4, "completions/mean_length": 571.225, "completions/mean_terminated_length": 571.225, "completions/min_length": 361.6, "completions/min_terminated_length": 361.6, "epoch": 0.02875, "frac_reward_zero_std": 0.8, "grad_norm": 0.07077457010746002, "kl": 0.0366281196475029, "learning_rate": 9.302435262558747e-07, "loss": 0.0007, "num_tokens": 1511227.0, "reward": 1.2499999761581422, "reward_std": 0.20701966285705567, "rewards/reward_episode/mean": 0.475, "rewards/reward_episode/std": 0.10350984334945679, "rewards/reward_format/mean": 0.30000001192092896, "rewards/reward_format/std": 0.0, "rewards/reward_turn_acc/mean": 0.475, "rewards/reward_turn_acc/std": 0.10350984334945679, "step": 115, "step_time": 132.90744929379872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3072.0, "completions/clipped_ratio": 0.0, "completions/max_length": 822.4, "completions/max_terminated_length": 822.4, "completions/mean_length": 557.675, "completions/mean_terminated_length": 557.675, "completions/min_length": 374.8, "completions/min_terminated_length": 374.8, "epoch": 0.03, "frac_reward_zero_std": 0.8, "grad_norm": 0.12555372714996338, "kl": 0.0267240971326828, "learning_rate": 8.435655349597689e-07, "loss": 0.0005, "num_tokens": 1574494.0, "reward": 1.7333333134651183, "reward_std": 0.13801310062408448, "rewards/reward_episode/mean": 0.675, "rewards/reward_episode/std": 0.10350984334945679, "rewards/reward_format/mean": 0.30000001192092896, "rewards/reward_format/std": 0.0, "rewards/reward_turn_acc/mean": 0.7583333373069763, "rewards/reward_turn_acc/std": 0.034503278136253354, "step": 120, "step_time": 129.78554452779935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3072.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.8, "completions/max_terminated_length": 928.8, "completions/mean_length": 662.975, "completions/mean_terminated_length": 662.975, "completions/min_length": 428.0, "completions/min_terminated_length": 428.0, "epoch": 0.03125, "frac_reward_zero_std": 0.6, "grad_norm": 0.16741153597831726, "kl": 0.041321803256869315, "learning_rate": 7.580781044003324e-07, "loss": 0.0008, "num_tokens": 1641973.0, "reward": 2.1333333015441895, "reward_std": 0.261455774307251, "rewards/reward_episode/mean": 0.875, "rewards/reward_episode/std": 0.19609185457229614, "rewards/reward_format/mean": 0.30000001192092896, "rewards/reward_format/std": 0.0, "rewards/reward_turn_acc/mean": 0.9583333373069763, "rewards/reward_turn_acc/std": 0.06536394655704499, "step": 125, "step_time": 153.5184129243993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3072.0, "completions/clipped_ratio": 0.0, "completions/max_length": 806.8, "completions/max_terminated_length": 806.8, "completions/mean_length": 583.45, "completions/mean_terminated_length": 583.45, "completions/min_length": 364.4, "completions/min_terminated_length": 364.4, "epoch": 0.0325, "frac_reward_zero_std": 0.6, "grad_norm": 0.07892462611198425, "kl": 0.04311100393533707, "learning_rate": 6.744318455428435e-07, "loss": 0.0009, "num_tokens": 1706271.0, "reward": 1.954166626930237, "reward_std": 0.1628679633140564, "rewards/reward_episode/mean": 0.75, "rewards/reward_episode/std": 0.09258201122283935, "rewards/reward_format/mean": 0.30000001192092896, "rewards/reward_format/std": 0.0, "rewards/reward_turn_acc/mean": 0.9041666746139526, "rewards/reward_turn_acc/std": 0.07028595805168152, "step": 130, "step_time": 135.70551129099914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3072.0, "completions/clipped_ratio": 0.0, "completions/max_length": 993.2, "completions/max_terminated_length": 993.2, "completions/mean_length": 663.55, "completions/mean_terminated_length": 663.55, "completions/min_length": 458.8, "completions/min_terminated_length": 458.8, "epoch": 0.03375, "frac_reward_zero_std": 0.8, "grad_norm": 0.1227680891752243, "kl": 0.032725603133440015, "learning_rate": 5.932633569241999e-07, "loss": 0.0007, "num_tokens": 1773773.0, "reward": 1.8399999737739563, "reward_std": 0.11109839677810669, "rewards/reward_episode/mean": 0.75, "rewards/reward_episode/std": 0.09258201122283935, "rewards/reward_format/mean": 0.30000001192092896, "rewards/reward_format/std": 0.0, "rewards/reward_turn_acc/mean": 0.7899999976158142, "rewards/reward_turn_acc/std": 0.018516401946544647, "step": 135, "step_time": 153.66733937559985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3072.0, "completions/clipped_ratio": 0.0, "completions/max_length": 886.8, "completions/max_terminated_length": 886.8, "completions/mean_length": 611.125, "completions/mean_terminated_length": 611.125, "completions/min_length": 443.4, "completions/min_terminated_length": 443.4, "epoch": 0.035, "frac_reward_zero_std": 1.0, "grad_norm": 0.11796784400939941, "kl": 0.03184816054999828, "learning_rate": 5.15190379753663e-07, "loss": 0.0006, "num_tokens": 1839178.0, "reward": 1.4999999761581422, "reward_std": 0.0, "rewards/reward_episode/mean": 0.6, "rewards/reward_episode/std": 0.0, "rewards/reward_format/mean": 0.30000001192092896, "rewards/reward_format/std": 0.0, "rewards/reward_turn_acc/mean": 0.6, "rewards/reward_turn_acc/std": 0.0, "step": 140, "step_time": 141.9817959435997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3072.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1114.2, "completions/max_terminated_length": 1114.2, "completions/mean_length": 684.85, "completions/mean_terminated_length": 684.85, "completions/min_length": 422.0, "completions/min_terminated_length": 422.0, "epoch": 0.03625, "frac_reward_zero_std": 0.8, "grad_norm": 0.24452009797096252, "kl": 0.03683492615818977, "learning_rate": 4.408070965292533e-07, "loss": 0.0007, "num_tokens": 1907532.0, "reward": 1.749999964237213, "reward_std": 0.20701966285705567, "rewards/reward_episode/mean": 0.725, "rewards/reward_episode/std": 0.10350984334945679, "rewards/reward_format/mean": 0.30000001192092896, "rewards/reward_format/std": 0.0, "rewards/reward_turn_acc/mean": 0.725, "rewards/reward_turn_acc/std": 0.10350984334945679, "step": 145, "step_time": 158.1961192132003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3072.0, "completions/clipped_ratio": 0.0, "completions/max_length": 877.8, "completions/max_terminated_length": 877.8, "completions/mean_length": 636.8, "completions/mean_terminated_length": 636.8, "completions/min_length": 292.6, "completions/min_terminated_length": 292.6, "epoch": 0.0375, "frac_reward_zero_std": 0.8, "grad_norm": 0.13973401486873627, "kl": 0.026585786044597624, "learning_rate": 3.706796089501627e-07, "loss": 0.0005, "num_tokens": 1973964.0, "reward": 1.4499999761581421, "reward_std": 0.1414213538169861, "rewards/reward_episode/mean": 0.575, "rewards/reward_episode/std": 0.07071067690849304, "rewards/reward_format/mean": 0.30000001192092896, "rewards/reward_format/std": 0.0, "rewards/reward_turn_acc/mean": 0.575, "rewards/reward_turn_acc/std": 0.07071067690849304, "step": 150, "step_time": 147.47617424319978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3072.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1018.8, "completions/max_terminated_length": 1018.8, "completions/mean_length": 711.375, "completions/mean_terminated_length": 711.375, "completions/min_length": 425.6, "completions/min_terminated_length": 425.6, "epoch": 0.03875, "frac_reward_zero_std": 0.6, "grad_norm": 0.13568446040153503, "kl": 0.03252220153808594, "learning_rate": 3.0534162954100263e-07, "loss": 0.0007, "num_tokens": 2043379.0, "reward": 1.9687499523162841, "reward_std": 0.3542271614074707, "rewards/reward_episode/mean": 0.8, "rewards/reward_episode/std": 0.21380898952484131, "rewards/reward_format/mean": 0.30000001192092896, "rewards/reward_format/std": 0.0, "rewards/reward_turn_acc/mean": 0.86875, "rewards/reward_turn_acc/std": 0.14075465500354767, "step": 155, "step_time": 164.7620894429987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3072.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1127.2, "completions/max_terminated_length": 1127.2, "completions/mean_length": 721.725, "completions/mean_terminated_length": 721.725, "completions/min_length": 347.4, "completions/min_terminated_length": 347.4, "epoch": 0.04, "frac_reward_zero_std": 0.6, "grad_norm": 19.780593872070312, "kl": 0.02986324019730091, "learning_rate": 2.45290419777228e-07, "loss": 0.0006, "num_tokens": 2113208.0, "reward": 1.7124999642372132, "reward_std": 0.30222115516662595, "rewards/reward_episode/mean": 0.675, "rewards/reward_episode/std": 0.19609185457229614, "rewards/reward_format/mean": 0.30000001192092896, "rewards/reward_format/std": 0.0, "rewards/reward_turn_acc/mean": 0.7375, "rewards/reward_turn_acc/std": 0.10890566408634186, "step": 160, "step_time": 166.95693593720063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3072.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1160.8, "completions/max_terminated_length": 1160.8, "completions/mean_length": 720.15, "completions/mean_terminated_length": 720.15, "completions/min_length": 314.8, "completions/min_terminated_length": 314.8, "epoch": 0.04125, "frac_reward_zero_std": 0.2, "grad_norm": 4.931512832641602, "kl": 1750.856663009897, "learning_rate": 1.9098300562505264e-07, "loss": 35.0171, "num_tokens": 2182974.0, "reward": 1.5805952072143554, "reward_std": 0.6564932465553284, "rewards/reward_episode/mean": 0.6214285731315613, "rewards/reward_episode/std": 0.3511104345321655, "rewards/reward_format/mean": 0.2925000131130219, "rewards/reward_format/std": 0.02121320515871048, "rewards/reward_turn_acc/mean": 0.6666666686534881, "rewards/reward_turn_acc/std": 0.31317211091518404, "step": 165, "step_time": 180.93480901119838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3072.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1163.0, "completions/max_terminated_length": 1163.0, "completions/mean_length": 622.675, "completions/mean_terminated_length": 622.675, "completions/min_length": 243.2, "completions/min_terminated_length": 243.2, "epoch": 0.0425, "frac_reward_zero_std": 0.6, "grad_norm": 0.1496662050485611, "kl": 0.029036011174321176, "learning_rate": 1.4283269929788776e-07, "loss": 0.0006, "num_tokens": 2248841.0, "reward": 1.5249999642372132, "reward_std": 0.32403703927993777, "rewards/reward_episode/mean": 0.6, "rewards/reward_episode/std": 0.1851640224456787, "rewards/reward_format/mean": 0.30000001192092896, "rewards/reward_format/std": 0.0, "rewards/reward_turn_acc/mean": 0.625, "rewards/reward_turn_acc/std": 0.13887301683425904, "step": 170, "step_time": 144.32274561419982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3072.0, "completions/clipped_ratio": 0.0, "completions/max_length": 879.4, "completions/max_terminated_length": 879.4, "completions/mean_length": 593.05, "completions/mean_terminated_length": 593.05, "completions/min_length": 417.0, "completions/min_terminated_length": 417.0, "epoch": 0.04375, "frac_reward_zero_std": 0.8, "grad_norm": 0.16961710155010223, "kl": 0.030310070887207986, "learning_rate": 1.0120595370083318e-07, "loss": 0.0006, "num_tokens": 2313523.0, "reward": 1.368749976158142, "reward_std": 0.18696351051330568, "rewards/reward_episode/mean": 0.525, "rewards/reward_episode/std": 0.10350984334945679, "rewards/reward_format/mean": 0.30000001192092896, "rewards/reward_format/std": 0.0, "rewards/reward_turn_acc/mean": 0.54375, "rewards/reward_turn_acc/std": 0.09038608074188233, "step": 175, "step_time": 137.76643493900048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3072.0, "completions/clipped_ratio": 0.0, "completions/max_length": 959.6, "completions/max_terminated_length": 959.6, "completions/mean_length": 674.2, "completions/mean_terminated_length": 674.2, "completions/min_length": 435.4, "completions/min_terminated_length": 435.4, "epoch": 0.045, "frac_reward_zero_std": 1.0, "grad_norm": 0.11210030317306519, "kl": 0.03215576484799385, "learning_rate": 6.641957350279837e-08, "loss": 0.0006, "num_tokens": 2381451.0, "reward": 1.8999999642372132, "reward_std": 0.0, "rewards/reward_episode/mean": 0.8, "rewards/reward_episode/std": 0.0, "rewards/reward_format/mean": 0.30000001192092896, "rewards/reward_format/std": 0.0, "rewards/reward_turn_acc/mean": 0.8, "rewards/reward_turn_acc/std": 0.0, "step": 180, "step_time": 155.74470006280026 } ], "logging_steps": 5, "max_steps": 200, "num_input_tokens_seen": 2381451, "num_train_epochs": 1, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }