syllogym-judge-qwen3-4b-grpo / trainer_state.json
farffadet's picture
copy checkpoint-180 from v11
ee7baca verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.045,
"eval_steps": 500,
"global_step": 180,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 3072.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1074.8,
"completions/max_terminated_length": 1074.8,
"completions/mean_length": 706.75,
"completions/mean_terminated_length": 706.75,
"completions/min_length": 402.2,
"completions/min_terminated_length": 402.2,
"epoch": 0.00125,
"frac_reward_zero_std": 0.2,
"grad_norm": 1.4768723249435425,
"kl": 0.007990466803312302,
"learning_rate": 4e-07,
"loss": 0.0002,
"num_tokens": 69230.0,
"reward": 1.8499999523162842,
"reward_std": 0.642622172832489,
"rewards/reward_episode/mean": 0.75,
"rewards/reward_episode/std": 0.37370702624320984,
"rewards/reward_format/mean": 0.30000001192092896,
"rewards/reward_format/std": 0.0,
"rewards/reward_turn_acc/mean": 0.8,
"rewards/reward_turn_acc/std": 0.2689151793718338,
"step": 5,
"step_time": 184.47292952840002
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 3072.0,
"completions/clipped_ratio": 0.025,
"completions/max_length": 1073.4,
"completions/max_terminated_length": 886.6,
"completions/mean_length": 593.95,
"completions/mean_terminated_length": 551.6178588867188,
"completions/min_length": 252.6,
"completions/min_terminated_length": 252.6,
"epoch": 0.0025,
"frac_reward_zero_std": 0.6,
"grad_norm": 0.24281319975852966,
"kl": 0.023634731024503707,
"learning_rate": 9e-07,
"loss": 0.0005,
"num_tokens": 133948.0,
"reward": 1.4514285564422607,
"reward_std": 0.18255305290222168,
"rewards/reward_episode/mean": 0.5714285731315613,
"rewards/reward_episode/std": 0.06998541951179504,
"rewards/reward_format/mean": 0.2925000131130219,
"rewards/reward_format/std": 0.02121320515871048,
"rewards/reward_turn_acc/mean": 0.5875,
"rewards/reward_turn_acc/std": 0.12069339156150818,
"step": 10,
"step_time": 149.69308385100004
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 3072.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 975.2,
"completions/max_terminated_length": 975.2,
"completions/mean_length": 659.025,
"completions/mean_terminated_length": 659.025,
"completions/min_length": 404.8,
"completions/min_terminated_length": 404.8,
"epoch": 0.00375,
"frac_reward_zero_std": 0.8,
"grad_norm": 0.18021762371063232,
"kl": 0.024673289433121682,
"learning_rate": 1.4e-06,
"loss": 0.0005,
"num_tokens": 201269.0,
"reward": 2.174999952316284,
"reward_std": 0.13363062143325805,
"rewards/reward_episode/mean": 0.9,
"rewards/reward_episode/std": 0.10690449476242066,
"rewards/reward_format/mean": 0.30000001192092896,
"rewards/reward_format/std": 0.0,
"rewards/reward_turn_acc/mean": 0.975,
"rewards/reward_turn_acc/std": 0.026726123690605164,
"step": 15,
"step_time": 148.49896004200008
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 3072.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 959.2,
"completions/max_terminated_length": 959.2,
"completions/mean_length": 668.625,
"completions/mean_terminated_length": 668.625,
"completions/min_length": 371.4,
"completions/min_terminated_length": 371.4,
"epoch": 0.005,
"frac_reward_zero_std": 0.8,
"grad_norm": 0.1508709192276001,
"kl": 0.026933318004012108,
"learning_rate": 1.8999999999999998e-06,
"loss": 0.0005,
"num_tokens": 268974.0,
"reward": 1.8666666150093079,
"reward_std": 0.09428089261054992,
"rewards/reward_episode/mean": 0.775,
"rewards/reward_episode/std": 0.07071067690849304,
"rewards/reward_format/mean": 0.30000001192092896,
"rewards/reward_format/std": 0.0,
"rewards/reward_turn_acc/mean": 0.7916666746139527,
"rewards/reward_turn_acc/std": 0.0235702246427536,
"step": 20,
"step_time": 150.50589508559997
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 3072.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1054.2,
"completions/max_terminated_length": 1054.2,
"completions/mean_length": 713.4,
"completions/mean_terminated_length": 713.4,
"completions/min_length": 492.8,
"completions/min_terminated_length": 492.8,
"epoch": 0.00625,
"frac_reward_zero_std": 0.8,
"grad_norm": 0.21855127811431885,
"kl": 0.025407736003398896,
"learning_rate": 1.997564050259824e-06,
"loss": 0.0005,
"num_tokens": 338470.0,
"reward": 1.8666666150093079,
"reward_std": 0.09428089261054992,
"rewards/reward_episode/mean": 0.775,
"rewards/reward_episode/std": 0.07071067690849304,
"rewards/reward_format/mean": 0.30000001192092896,
"rewards/reward_format/std": 0.0,
"rewards/reward_turn_acc/mean": 0.7916666746139527,
"rewards/reward_turn_acc/std": 0.0235702246427536,
"step": 25,
"step_time": 164.48858075240005
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 3072.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1610.2,
"completions/max_terminated_length": 1610.2,
"completions/mean_length": 878.9,
"completions/mean_terminated_length": 878.9,
"completions/min_length": 479.6,
"completions/min_terminated_length": 479.6,
"epoch": 0.0075,
"frac_reward_zero_std": 0.4,
"grad_norm": 1.978322982788086,
"kl": 0.023890410736203193,
"learning_rate": 1.9876883405951377e-06,
"loss": 0.0005,
"num_tokens": 414586.0,
"reward": 1.7187499523162841,
"reward_std": 0.4155827879905701,
"rewards/reward_episode/mean": 0.625,
"rewards/reward_episode/std": 0.2777303636074066,
"rewards/reward_format/mean": 0.30000001192092896,
"rewards/reward_format/std": 0.0,
"rewards/reward_turn_acc/mean": 0.79375,
"rewards/reward_turn_acc/std": 0.14834305942058562,
"step": 30,
"step_time": 201.0124486300001
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 3072.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 699.2,
"completions/max_terminated_length": 699.2,
"completions/mean_length": 517.35,
"completions/mean_terminated_length": 517.35,
"completions/min_length": 413.4,
"completions/min_terminated_length": 413.4,
"epoch": 0.00875,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.17440621554851532,
"kl": 0.02569504640996456,
"learning_rate": 1.9702957262759963e-06,
"loss": 0.0005,
"num_tokens": 476240.0,
"reward": 1.599999976158142,
"reward_std": 0.0,
"rewards/reward_episode/mean": 0.6,
"rewards/reward_episode/std": 0.0,
"rewards/reward_format/mean": 0.30000001192092896,
"rewards/reward_format/std": 0.0,
"rewards/reward_turn_acc/mean": 0.7,
"rewards/reward_turn_acc/std": 0.0,
"step": 35,
"step_time": 120.60697470220002
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 3072.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1189.0,
"completions/max_terminated_length": 1189.0,
"completions/mean_length": 727.275,
"completions/mean_terminated_length": 727.275,
"completions/min_length": 348.2,
"completions/min_terminated_length": 348.2,
"epoch": 0.01,
"frac_reward_zero_std": 0.6,
"grad_norm": 0.1508954018354416,
"kl": 0.023599811643362046,
"learning_rate": 1.945518575599317e-06,
"loss": 0.0005,
"num_tokens": 546291.0,
"reward": 1.8999999523162843,
"reward_std": 0.42761797904968263,
"rewards/reward_episode/mean": 0.8,
"rewards/reward_episode/std": 0.21380898952484131,
"rewards/reward_format/mean": 0.30000001192092896,
"rewards/reward_format/std": 0.0,
"rewards/reward_turn_acc/mean": 0.8,
"rewards/reward_turn_acc/std": 0.21380898952484131,
"step": 40,
"step_time": 167.5162495483999
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 3072.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 805.0,
"completions/max_terminated_length": 805.0,
"completions/mean_length": 518.625,
"completions/mean_terminated_length": 518.625,
"completions/min_length": 258.4,
"completions/min_terminated_length": 258.4,
"epoch": 0.01125,
"frac_reward_zero_std": 0.6,
"grad_norm": 0.21857145428657532,
"kl": 0.028848744928836823,
"learning_rate": 1.9135454576426007e-06,
"loss": 0.0006,
"num_tokens": 607996.0,
"reward": 1.3687499642372132,
"reward_std": 0.38325761556625365,
"rewards/reward_episode/mean": 0.525,
"rewards/reward_episode/std": 0.19609185457229614,
"rewards/reward_format/mean": 0.30000001192092896,
"rewards/reward_format/std": 0.0,
"rewards/reward_turn_acc/mean": 0.54375,
"rewards/reward_turn_acc/std": 0.19413249492645263,
"step": 45,
"step_time": 121.05726050320008
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 3072.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 546.0,
"completions/max_terminated_length": 546.0,
"completions/mean_length": 371.525,
"completions/mean_terminated_length": 371.525,
"completions/min_length": 257.2,
"completions/min_terminated_length": 257.2,
"epoch": 0.0125,
"frac_reward_zero_std": 0.8,
"grad_norm": 4.936390399932861,
"kl": 0.026249005272984505,
"learning_rate": 1.8746197071393956e-06,
"loss": 0.0005,
"num_tokens": 663817.0,
"reward": 1.4062499761581422,
"reward_std": 0.12938729524612427,
"rewards/reward_episode/mean": 0.525,
"rewards/reward_episode/std": 0.10350984334945679,
"rewards/reward_format/mean": 0.30000001192092896,
"rewards/reward_format/std": 0.0,
"rewards/reward_turn_acc/mean": 0.58125,
"rewards/reward_turn_acc/std": 0.025877460837364197,
"step": 50,
"step_time": 88.19599000860016
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 3072.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1019.2,
"completions/max_terminated_length": 1019.2,
"completions/mean_length": 663.425,
"completions/mean_terminated_length": 663.425,
"completions/min_length": 407.0,
"completions/min_terminated_length": 407.0,
"epoch": 0.01375,
"frac_reward_zero_std": 0.4,
"grad_norm": 0.5663604140281677,
"kl": 0.024248089268803598,
"learning_rate": 1.8290375725550415e-06,
"loss": 0.0005,
"num_tokens": 731314.0,
"reward": 1.8249999642372132,
"reward_std": 0.28284270465373995,
"rewards/reward_episode/mean": 0.75,
"rewards/reward_episode/std": 0.1414213538169861,
"rewards/reward_format/mean": 0.30000001192092896,
"rewards/reward_format/std": 0.0,
"rewards/reward_turn_acc/mean": 0.775,
"rewards/reward_turn_acc/std": 0.1414213538169861,
"step": 55,
"step_time": 153.18542814620014
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 3072.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1005.8,
"completions/max_terminated_length": 1005.8,
"completions/mean_length": 678.9,
"completions/mean_terminated_length": 678.9,
"completions/min_length": 355.4,
"completions/min_terminated_length": 355.4,
"epoch": 0.015,
"frac_reward_zero_std": 0.4,
"grad_norm": 1.0358364582061768,
"kl": 0.02808024510741234,
"learning_rate": 1.7771459614569707e-06,
"loss": 0.0006,
"num_tokens": 799430.0,
"reward": 1.3958333134651184,
"reward_std": 0.46076027154922483,
"rewards/reward_episode/mean": 0.45,
"rewards/reward_episode/std": 0.29960169792175295,
"rewards/reward_format/mean": 0.30000001192092896,
"rewards/reward_format/std": 0.0,
"rewards/reward_turn_acc/mean": 0.6458333373069763,
"rewards/reward_turn_acc/std": 0.16115862429141997,
"step": 60,
"step_time": 156.5224271021998
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 3072.0,
"completions/clipped_ratio": 0.025,
"completions/max_length": 1094.6,
"completions/max_terminated_length": 924.6,
"completions/mean_length": 665.95,
"completions/mean_terminated_length": 635.0892944335938,
"completions/min_length": 369.8,
"completions/min_terminated_length": 369.8,
"epoch": 0.01625,
"frac_reward_zero_std": 0.4,
"grad_norm": 0.4963475167751312,
"kl": 0.03317975252866745,
"learning_rate": 1.719339800338651e-06,
"loss": 0.0007,
"num_tokens": 867028.0,
"reward": 1.731666624546051,
"reward_std": 0.3784398674964905,
"rewards/reward_episode/mean": 0.7,
"rewards/reward_episode/std": 0.23400336503982544,
"rewards/reward_format/mean": 0.2962500095367432,
"rewards/reward_format/std": 0.01060660257935524,
"rewards/reward_turn_acc/mean": 0.7354166746139527,
"rewards/reward_turn_acc/std": 0.13382990509271622,
"step": 65,
"step_time": 154.9406071833997
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 3072.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 869.0,
"completions/max_terminated_length": 869.0,
"completions/mean_length": 517.85,
"completions/mean_terminated_length": 517.85,
"completions/min_length": 274.8,
"completions/min_terminated_length": 274.8,
"epoch": 0.0175,
"frac_reward_zero_std": 0.4,
"grad_norm": 0.18251997232437134,
"kl": 0.026099709048867225,
"learning_rate": 1.6560590289905071e-06,
"loss": 0.0005,
"num_tokens": 928702.0,
"reward": 1.2874999642372131,
"reward_std": 0.3805915355682373,
"rewards/reward_episode/mean": 0.425,
"rewards/reward_episode/std": 0.25587469935417173,
"rewards/reward_format/mean": 0.30000001192092896,
"rewards/reward_format/std": 0.0,
"rewards/reward_turn_acc/mean": 0.5625,
"rewards/reward_turn_acc/std": 0.1247168481349945,
"step": 70,
"step_time": 120.89109840739984
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 3072.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 755.8,
"completions/max_terminated_length": 755.8,
"completions/mean_length": 585.1,
"completions/mean_terminated_length": 585.1,
"completions/min_length": 379.4,
"completions/min_terminated_length": 379.4,
"epoch": 0.01875,
"frac_reward_zero_std": 0.8,
"grad_norm": 0.9700937271118164,
"kl": 0.038158373534679414,
"learning_rate": 1.587785252292473e-06,
"loss": 0.0008,
"num_tokens": 993066.0,
"reward": 1.6666666388511657,
"reward_std": 0.09428089261054992,
"rewards/reward_episode/mean": 0.625,
"rewards/reward_episode/std": 0.07071067690849304,
"rewards/reward_format/mean": 0.30000001192092896,
"rewards/reward_format/std": 0.0,
"rewards/reward_turn_acc/mean": 0.7416666746139526,
"rewards/reward_turn_acc/std": 0.02357022315263748,
"step": 75,
"step_time": 136.10523430559914
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 3072.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 914.6,
"completions/max_terminated_length": 914.6,
"completions/mean_length": 637.2,
"completions/mean_terminated_length": 637.2,
"completions/min_length": 372.2,
"completions/min_terminated_length": 372.2,
"epoch": 0.02,
"frac_reward_zero_std": 0.6,
"grad_norm": 0.1770908683538437,
"kl": 0.03178380690515041,
"learning_rate": 1.5150380749100543e-06,
"loss": 0.0006,
"num_tokens": 1059514.0,
"reward": 1.6958333134651185,
"reward_std": 0.2440791130065918,
"rewards/reward_episode/mean": 0.65,
"rewards/reward_episode/std": 0.17422052025794982,
"rewards/reward_format/mean": 0.30000001192092896,
"rewards/reward_format/std": 0.0,
"rewards/reward_turn_acc/mean": 0.7458333373069763,
"rewards/reward_turn_acc/std": 0.06985861659049988,
"step": 80,
"step_time": 147.74201027340024
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 3072.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 595.6,
"completions/max_terminated_length": 595.6,
"completions/mean_length": 415.1,
"completions/mean_terminated_length": 415.1,
"completions/min_length": 201.2,
"completions/min_terminated_length": 201.2,
"epoch": 0.02125,
"frac_reward_zero_std": 0.8,
"grad_norm": 3.970363140106201,
"kl": 0.025650039687752722,
"learning_rate": 1.4383711467890773e-06,
"loss": 0.0005,
"num_tokens": 1117078.0,
"reward": 1.2210714101791382,
"reward_std": 0.12172651290893555,
"rewards/reward_episode/mean": 0.42857142984867097,
"rewards/reward_episode/std": 0.06998542547225953,
"rewards/reward_format/mean": 0.2925000131130219,
"rewards/reward_format/std": 0.02121320515871048,
"rewards/reward_turn_acc/mean": 0.5,
"rewards/reward_turn_acc/std": 0.05345224738121033,
"step": 85,
"step_time": 112.42376970660007
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 3072.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 732.0,
"completions/max_terminated_length": 732.0,
"completions/mean_length": 534.5,
"completions/mean_terminated_length": 534.5,
"completions/min_length": 386.6,
"completions/min_terminated_length": 386.6,
"epoch": 0.0225,
"frac_reward_zero_std": 0.8,
"grad_norm": 0.045042984187603,
"kl": 0.030681686848402022,
"learning_rate": 1.3583679495453e-06,
"loss": 0.0006,
"num_tokens": 1179418.0,
"reward": 1.399999976158142,
"reward_std": 0.13801310062408448,
"rewards/reward_episode/mean": 0.525,
"rewards/reward_episode/std": 0.10350984334945679,
"rewards/reward_format/mean": 0.30000001192092896,
"rewards/reward_format/std": 0.0,
"rewards/reward_turn_acc/mean": 0.575,
"rewards/reward_turn_acc/std": 0.034503278136253354,
"step": 90,
"step_time": 124.52647838519988
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 3072.0,
"completions/clipped_ratio": 0.025,
"completions/max_length": 1451.0,
"completions/max_terminated_length": 1412.0,
"completions/mean_length": 658.3,
"completions/mean_terminated_length": 618.2035766601563,
"completions/min_length": 364.8,
"completions/min_terminated_length": 364.8,
"epoch": 0.02375,
"frac_reward_zero_std": 0.8,
"grad_norm": 0.12616221606731415,
"kl": 0.032654117047786715,
"learning_rate": 1.275637355816999e-06,
"loss": 0.0007,
"num_tokens": 1246710.0,
"reward": 1.1749999880790711,
"reward_std": 0.046291005611419675,
"rewards/reward_episode/mean": 0.4,
"rewards/reward_episode/std": 0.0,
"rewards/reward_format/mean": 0.30000001192092896,
"rewards/reward_format/std": 0.0,
"rewards/reward_turn_acc/mean": 0.475,
"rewards/reward_turn_acc/std": 0.046291005611419675,
"step": 95,
"step_time": 152.6072565958002
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 3072.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 935.0,
"completions/max_terminated_length": 935.0,
"completions/mean_length": 567.975,
"completions/mean_terminated_length": 567.975,
"completions/min_length": 283.0,
"completions/min_terminated_length": 283.0,
"epoch": 0.025,
"frac_reward_zero_std": 0.8,
"grad_norm": 0.1803286075592041,
"kl": 0.034163566678762435,
"learning_rate": 1.1908089953765447e-06,
"loss": 0.0007,
"num_tokens": 1310389.0,
"reward": 1.1333333194255828,
"reward_std": 0.06172134280204773,
"rewards/reward_episode/mean": 0.4,
"rewards/reward_episode/std": 0.0,
"rewards/reward_format/mean": 0.30000001192092896,
"rewards/reward_format/std": 0.0,
"rewards/reward_turn_acc/mean": 0.4333333343267441,
"rewards/reward_turn_acc/std": 0.06172134280204773,
"step": 100,
"step_time": 131.9193014902001
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 3072.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1057.6,
"completions/max_terminated_length": 1057.6,
"completions/mean_length": 634.95,
"completions/mean_terminated_length": 634.95,
"completions/min_length": 412.8,
"completions/min_terminated_length": 412.8,
"epoch": 0.02625,
"frac_reward_zero_std": 0.8,
"grad_norm": 0.22720672190189362,
"kl": 0.030544454604387282,
"learning_rate": 1.1045284632676535e-06,
"loss": 0.0006,
"num_tokens": 1376747.0,
"reward": 1.8333333134651184,
"reward_std": 0.1234426736831665,
"rewards/reward_episode/mean": 0.75,
"rewards/reward_episode/std": 0.09258201122283935,
"rewards/reward_format/mean": 0.30000001192092896,
"rewards/reward_format/std": 0.0,
"rewards/reward_turn_acc/mean": 0.7833333373069763,
"rewards/reward_turn_acc/std": 0.030860668420791625,
"step": 105,
"step_time": 147.16702165919997
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 3072.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1322.2,
"completions/max_terminated_length": 1322.2,
"completions/mean_length": 742.775,
"completions/mean_terminated_length": 742.775,
"completions/min_length": 334.6,
"completions/min_terminated_length": 334.6,
"epoch": 0.0275,
"frac_reward_zero_std": 0.4,
"grad_norm": 0.27993643283843994,
"kl": 0.03184134848415852,
"learning_rate": 1.0174524064372837e-06,
"loss": 0.0006,
"num_tokens": 1447418.0,
"reward": 1.2216071248054505,
"reward_std": 0.47933497428894045,
"rewards/reward_episode/mean": 0.39285714030265806,
"rewards/reward_episode/std": 0.2819071352481842,
"rewards/reward_format/mean": 0.28500001430511473,
"rewards/reward_format/std": 0.04242641031742096,
"rewards/reward_turn_acc/mean": 0.54375,
"rewards/reward_turn_acc/std": 0.20800404548645018,
"step": 110,
"step_time": 199.30736396940082
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 3072.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 890.4,
"completions/max_terminated_length": 890.4,
"completions/mean_length": 571.225,
"completions/mean_terminated_length": 571.225,
"completions/min_length": 361.6,
"completions/min_terminated_length": 361.6,
"epoch": 0.02875,
"frac_reward_zero_std": 0.8,
"grad_norm": 0.07077457010746002,
"kl": 0.0366281196475029,
"learning_rate": 9.302435262558747e-07,
"loss": 0.0007,
"num_tokens": 1511227.0,
"reward": 1.2499999761581422,
"reward_std": 0.20701966285705567,
"rewards/reward_episode/mean": 0.475,
"rewards/reward_episode/std": 0.10350984334945679,
"rewards/reward_format/mean": 0.30000001192092896,
"rewards/reward_format/std": 0.0,
"rewards/reward_turn_acc/mean": 0.475,
"rewards/reward_turn_acc/std": 0.10350984334945679,
"step": 115,
"step_time": 132.90744929379872
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 3072.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 822.4,
"completions/max_terminated_length": 822.4,
"completions/mean_length": 557.675,
"completions/mean_terminated_length": 557.675,
"completions/min_length": 374.8,
"completions/min_terminated_length": 374.8,
"epoch": 0.03,
"frac_reward_zero_std": 0.8,
"grad_norm": 0.12555372714996338,
"kl": 0.0267240971326828,
"learning_rate": 8.435655349597689e-07,
"loss": 0.0005,
"num_tokens": 1574494.0,
"reward": 1.7333333134651183,
"reward_std": 0.13801310062408448,
"rewards/reward_episode/mean": 0.675,
"rewards/reward_episode/std": 0.10350984334945679,
"rewards/reward_format/mean": 0.30000001192092896,
"rewards/reward_format/std": 0.0,
"rewards/reward_turn_acc/mean": 0.7583333373069763,
"rewards/reward_turn_acc/std": 0.034503278136253354,
"step": 120,
"step_time": 129.78554452779935
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 3072.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 928.8,
"completions/max_terminated_length": 928.8,
"completions/mean_length": 662.975,
"completions/mean_terminated_length": 662.975,
"completions/min_length": 428.0,
"completions/min_terminated_length": 428.0,
"epoch": 0.03125,
"frac_reward_zero_std": 0.6,
"grad_norm": 0.16741153597831726,
"kl": 0.041321803256869315,
"learning_rate": 7.580781044003324e-07,
"loss": 0.0008,
"num_tokens": 1641973.0,
"reward": 2.1333333015441895,
"reward_std": 0.261455774307251,
"rewards/reward_episode/mean": 0.875,
"rewards/reward_episode/std": 0.19609185457229614,
"rewards/reward_format/mean": 0.30000001192092896,
"rewards/reward_format/std": 0.0,
"rewards/reward_turn_acc/mean": 0.9583333373069763,
"rewards/reward_turn_acc/std": 0.06536394655704499,
"step": 125,
"step_time": 153.5184129243993
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 3072.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 806.8,
"completions/max_terminated_length": 806.8,
"completions/mean_length": 583.45,
"completions/mean_terminated_length": 583.45,
"completions/min_length": 364.4,
"completions/min_terminated_length": 364.4,
"epoch": 0.0325,
"frac_reward_zero_std": 0.6,
"grad_norm": 0.07892462611198425,
"kl": 0.04311100393533707,
"learning_rate": 6.744318455428435e-07,
"loss": 0.0009,
"num_tokens": 1706271.0,
"reward": 1.954166626930237,
"reward_std": 0.1628679633140564,
"rewards/reward_episode/mean": 0.75,
"rewards/reward_episode/std": 0.09258201122283935,
"rewards/reward_format/mean": 0.30000001192092896,
"rewards/reward_format/std": 0.0,
"rewards/reward_turn_acc/mean": 0.9041666746139526,
"rewards/reward_turn_acc/std": 0.07028595805168152,
"step": 130,
"step_time": 135.70551129099914
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 3072.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 993.2,
"completions/max_terminated_length": 993.2,
"completions/mean_length": 663.55,
"completions/mean_terminated_length": 663.55,
"completions/min_length": 458.8,
"completions/min_terminated_length": 458.8,
"epoch": 0.03375,
"frac_reward_zero_std": 0.8,
"grad_norm": 0.1227680891752243,
"kl": 0.032725603133440015,
"learning_rate": 5.932633569241999e-07,
"loss": 0.0007,
"num_tokens": 1773773.0,
"reward": 1.8399999737739563,
"reward_std": 0.11109839677810669,
"rewards/reward_episode/mean": 0.75,
"rewards/reward_episode/std": 0.09258201122283935,
"rewards/reward_format/mean": 0.30000001192092896,
"rewards/reward_format/std": 0.0,
"rewards/reward_turn_acc/mean": 0.7899999976158142,
"rewards/reward_turn_acc/std": 0.018516401946544647,
"step": 135,
"step_time": 153.66733937559985
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 3072.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 886.8,
"completions/max_terminated_length": 886.8,
"completions/mean_length": 611.125,
"completions/mean_terminated_length": 611.125,
"completions/min_length": 443.4,
"completions/min_terminated_length": 443.4,
"epoch": 0.035,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.11796784400939941,
"kl": 0.03184816054999828,
"learning_rate": 5.15190379753663e-07,
"loss": 0.0006,
"num_tokens": 1839178.0,
"reward": 1.4999999761581422,
"reward_std": 0.0,
"rewards/reward_episode/mean": 0.6,
"rewards/reward_episode/std": 0.0,
"rewards/reward_format/mean": 0.30000001192092896,
"rewards/reward_format/std": 0.0,
"rewards/reward_turn_acc/mean": 0.6,
"rewards/reward_turn_acc/std": 0.0,
"step": 140,
"step_time": 141.9817959435997
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 3072.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1114.2,
"completions/max_terminated_length": 1114.2,
"completions/mean_length": 684.85,
"completions/mean_terminated_length": 684.85,
"completions/min_length": 422.0,
"completions/min_terminated_length": 422.0,
"epoch": 0.03625,
"frac_reward_zero_std": 0.8,
"grad_norm": 0.24452009797096252,
"kl": 0.03683492615818977,
"learning_rate": 4.408070965292533e-07,
"loss": 0.0007,
"num_tokens": 1907532.0,
"reward": 1.749999964237213,
"reward_std": 0.20701966285705567,
"rewards/reward_episode/mean": 0.725,
"rewards/reward_episode/std": 0.10350984334945679,
"rewards/reward_format/mean": 0.30000001192092896,
"rewards/reward_format/std": 0.0,
"rewards/reward_turn_acc/mean": 0.725,
"rewards/reward_turn_acc/std": 0.10350984334945679,
"step": 145,
"step_time": 158.1961192132003
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 3072.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 877.8,
"completions/max_terminated_length": 877.8,
"completions/mean_length": 636.8,
"completions/mean_terminated_length": 636.8,
"completions/min_length": 292.6,
"completions/min_terminated_length": 292.6,
"epoch": 0.0375,
"frac_reward_zero_std": 0.8,
"grad_norm": 0.13973401486873627,
"kl": 0.026585786044597624,
"learning_rate": 3.706796089501627e-07,
"loss": 0.0005,
"num_tokens": 1973964.0,
"reward": 1.4499999761581421,
"reward_std": 0.1414213538169861,
"rewards/reward_episode/mean": 0.575,
"rewards/reward_episode/std": 0.07071067690849304,
"rewards/reward_format/mean": 0.30000001192092896,
"rewards/reward_format/std": 0.0,
"rewards/reward_turn_acc/mean": 0.575,
"rewards/reward_turn_acc/std": 0.07071067690849304,
"step": 150,
"step_time": 147.47617424319978
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 3072.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1018.8,
"completions/max_terminated_length": 1018.8,
"completions/mean_length": 711.375,
"completions/mean_terminated_length": 711.375,
"completions/min_length": 425.6,
"completions/min_terminated_length": 425.6,
"epoch": 0.03875,
"frac_reward_zero_std": 0.6,
"grad_norm": 0.13568446040153503,
"kl": 0.03252220153808594,
"learning_rate": 3.0534162954100263e-07,
"loss": 0.0007,
"num_tokens": 2043379.0,
"reward": 1.9687499523162841,
"reward_std": 0.3542271614074707,
"rewards/reward_episode/mean": 0.8,
"rewards/reward_episode/std": 0.21380898952484131,
"rewards/reward_format/mean": 0.30000001192092896,
"rewards/reward_format/std": 0.0,
"rewards/reward_turn_acc/mean": 0.86875,
"rewards/reward_turn_acc/std": 0.14075465500354767,
"step": 155,
"step_time": 164.7620894429987
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 3072.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1127.2,
"completions/max_terminated_length": 1127.2,
"completions/mean_length": 721.725,
"completions/mean_terminated_length": 721.725,
"completions/min_length": 347.4,
"completions/min_terminated_length": 347.4,
"epoch": 0.04,
"frac_reward_zero_std": 0.6,
"grad_norm": 19.780593872070312,
"kl": 0.02986324019730091,
"learning_rate": 2.45290419777228e-07,
"loss": 0.0006,
"num_tokens": 2113208.0,
"reward": 1.7124999642372132,
"reward_std": 0.30222115516662595,
"rewards/reward_episode/mean": 0.675,
"rewards/reward_episode/std": 0.19609185457229614,
"rewards/reward_format/mean": 0.30000001192092896,
"rewards/reward_format/std": 0.0,
"rewards/reward_turn_acc/mean": 0.7375,
"rewards/reward_turn_acc/std": 0.10890566408634186,
"step": 160,
"step_time": 166.95693593720063
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 3072.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1160.8,
"completions/max_terminated_length": 1160.8,
"completions/mean_length": 720.15,
"completions/mean_terminated_length": 720.15,
"completions/min_length": 314.8,
"completions/min_terminated_length": 314.8,
"epoch": 0.04125,
"frac_reward_zero_std": 0.2,
"grad_norm": 4.931512832641602,
"kl": 1750.856663009897,
"learning_rate": 1.9098300562505264e-07,
"loss": 35.0171,
"num_tokens": 2182974.0,
"reward": 1.5805952072143554,
"reward_std": 0.6564932465553284,
"rewards/reward_episode/mean": 0.6214285731315613,
"rewards/reward_episode/std": 0.3511104345321655,
"rewards/reward_format/mean": 0.2925000131130219,
"rewards/reward_format/std": 0.02121320515871048,
"rewards/reward_turn_acc/mean": 0.6666666686534881,
"rewards/reward_turn_acc/std": 0.31317211091518404,
"step": 165,
"step_time": 180.93480901119838
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 3072.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1163.0,
"completions/max_terminated_length": 1163.0,
"completions/mean_length": 622.675,
"completions/mean_terminated_length": 622.675,
"completions/min_length": 243.2,
"completions/min_terminated_length": 243.2,
"epoch": 0.0425,
"frac_reward_zero_std": 0.6,
"grad_norm": 0.1496662050485611,
"kl": 0.029036011174321176,
"learning_rate": 1.4283269929788776e-07,
"loss": 0.0006,
"num_tokens": 2248841.0,
"reward": 1.5249999642372132,
"reward_std": 0.32403703927993777,
"rewards/reward_episode/mean": 0.6,
"rewards/reward_episode/std": 0.1851640224456787,
"rewards/reward_format/mean": 0.30000001192092896,
"rewards/reward_format/std": 0.0,
"rewards/reward_turn_acc/mean": 0.625,
"rewards/reward_turn_acc/std": 0.13887301683425904,
"step": 170,
"step_time": 144.32274561419982
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 3072.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 879.4,
"completions/max_terminated_length": 879.4,
"completions/mean_length": 593.05,
"completions/mean_terminated_length": 593.05,
"completions/min_length": 417.0,
"completions/min_terminated_length": 417.0,
"epoch": 0.04375,
"frac_reward_zero_std": 0.8,
"grad_norm": 0.16961710155010223,
"kl": 0.030310070887207986,
"learning_rate": 1.0120595370083318e-07,
"loss": 0.0006,
"num_tokens": 2313523.0,
"reward": 1.368749976158142,
"reward_std": 0.18696351051330568,
"rewards/reward_episode/mean": 0.525,
"rewards/reward_episode/std": 0.10350984334945679,
"rewards/reward_format/mean": 0.30000001192092896,
"rewards/reward_format/std": 0.0,
"rewards/reward_turn_acc/mean": 0.54375,
"rewards/reward_turn_acc/std": 0.09038608074188233,
"step": 175,
"step_time": 137.76643493900048
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_length": 3072.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 959.6,
"completions/max_terminated_length": 959.6,
"completions/mean_length": 674.2,
"completions/mean_terminated_length": 674.2,
"completions/min_length": 435.4,
"completions/min_terminated_length": 435.4,
"epoch": 0.045,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.11210030317306519,
"kl": 0.03215576484799385,
"learning_rate": 6.641957350279837e-08,
"loss": 0.0006,
"num_tokens": 2381451.0,
"reward": 1.8999999642372132,
"reward_std": 0.0,
"rewards/reward_episode/mean": 0.8,
"rewards/reward_episode/std": 0.0,
"rewards/reward_format/mean": 0.30000001192092896,
"rewards/reward_format/std": 0.0,
"rewards/reward_turn_acc/mean": 0.8,
"rewards/reward_turn_acc/std": 0.0,
"step": 180,
"step_time": 155.74470006280026
}
],
"logging_steps": 5,
"max_steps": 200,
"num_input_tokens_seen": 2381451,
"num_train_epochs": 1,
"save_steps": 20,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}