{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.003, "eval_steps": 500, "global_step": 300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1721.0, "completions/max_terminated_length": 1721.0, "completions/mean_length": 1505.4375, "completions/mean_terminated_length": 1514.0, "completions/min_length": 1240.0, "completions/min_terminated_length": 1247.0, "entropy": 0.7056397125124931, "epoch": 1e-05, "frac_reward_zero_std": 0.0, "grad_norm": 4.146320819854736, "kl": 0.0, "learning_rate": 0.0, "loss": 0.2957, "num_tokens": 69870.0, "reward": -9.635665893554688, "reward_std": 8.469120025634766, "rewards/rollout_reward_func/mean": -9.635665893554688, "rewards/rollout_reward_func/std": 10.615705490112305, "sampling/importance_sampling_ratio/max": 1.892635464668274, "sampling/importance_sampling_ratio/mean": 0.9727171063423157, "sampling/importance_sampling_ratio/min": 3.1309256644105487e-19, "sampling/sampling_logp_difference/max": 21.850507736206055, "sampling/sampling_logp_difference/mean": 0.08760251104831696, "step": 1, "step_time": 19.022535013995366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7056397125124931, "epoch": 2e-05, "grad_norm": 4.182789325714111, "kl": 0.0, "learning_rate": 2.8571428571428575e-07, "loss": 0.2957, "step": 2, "step_time": 6.078049275005469 }, { "clip_ratio/high_max": 0.005090707214549184, "clip_ratio/high_mean": 0.002545353607274592, "clip_ratio/low_mean": 0.0009057971183210611, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003451150725595653, "completions/clipped_ratio": 0.03125, "completions/max_length": 1691.0, "completions/max_terminated_length": 1619.0, "completions/mean_length": 1547.78125, "completions/mean_terminated_length": 1543.1612548828125, "completions/min_length": 1308.0, "completions/min_terminated_length": 1308.0, "entropy": 0.6416953355073929, "epoch": 3e-05, "frac_reward_zero_std": 0.0, "grad_norm": 1.874028205871582, "kl": 0.0007646833037142642, "learning_rate": 5.714285714285715e-07, "loss": -0.3051, "num_tokens": 141306.0, "reward": -11.108564376831055, "reward_std": 9.716968536376953, "rewards/rollout_reward_func/mean": -11.108564376831055, "rewards/rollout_reward_func/std": 11.346726417541504, "sampling/importance_sampling_ratio/max": 1.4635796546936035, "sampling/importance_sampling_ratio/mean": 0.9265825152397156, "sampling/importance_sampling_ratio/min": 9.699386686718181e-18, "sampling/sampling_logp_difference/max": 23.458328247070312, "sampling/sampling_logp_difference/mean": 0.07872612774372101, "step": 3, "step_time": 18.240646249993006 }, { "clip_ratio/high_max": 0.002470439241733402, "clip_ratio/high_mean": 0.001235219620866701, "clip_ratio/low_mean": 0.0008561643771827221, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002091383998049423, "entropy": 0.6416260898113251, "epoch": 4e-05, "grad_norm": 1.7613728046417236, "kl": 0.0008419926089118235, "learning_rate": 8.571428571428572e-07, "loss": -0.3021, "step": 4, "step_time": 6.554672034995747 }, { "clip_ratio/high_max": 0.0015432098880410194, "clip_ratio/high_mean": 0.0007716049440205097, "clip_ratio/low_mean": 0.0004960317746736109, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0012676367186941206, "completions/clipped_ratio": 0.03125, "completions/max_length": 1783.0, "completions/max_terminated_length": 1783.0, "completions/mean_length": 1544.46875, "completions/mean_terminated_length": 1555.741943359375, "completions/min_length": 1195.0, "completions/min_terminated_length": 1311.0, "entropy": 0.7032733336091042, "epoch": 5e-05, "frac_reward_zero_std": 0.0, "grad_norm": 3.280946969985962, "kl": 0.0008866179341566749, "learning_rate": 1.142857142857143e-06, "loss": 0.5188, "num_tokens": 212465.0, "reward": 0.5527644157409668, "reward_std": 11.524538040161133, "rewards/rollout_reward_func/mean": 0.5527644157409668, "rewards/rollout_reward_func/std": 14.078240394592285, "sampling/importance_sampling_ratio/max": 1.9777272939682007, "sampling/importance_sampling_ratio/mean": 1.0544970035552979, "sampling/importance_sampling_ratio/min": 0.3775985836982727, "sampling/sampling_logp_difference/max": 0.475085973739624, "sampling/sampling_logp_difference/mean": 0.019764788448810577, "step": 5, "step_time": 19.19550205100677 }, { "clip_ratio/high_max": 0.010177965508773923, "clip_ratio/high_mean": 0.0050889827543869615, "clip_ratio/low_mean": 0.0004960317746736109, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005585014529060572, "entropy": 0.7035747244954109, "epoch": 6e-05, "grad_norm": 2.985136032104492, "kl": 0.0008923996865632944, "learning_rate": 1.4285714285714286e-06, "loss": 0.5139, "step": 6, "step_time": 6.198994932987262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1612.0, "completions/max_terminated_length": 1612.0, "completions/mean_length": 1478.875, "completions/mean_terminated_length": 1486.4334716796875, "completions/min_length": 980.0, "completions/min_terminated_length": 980.0, "entropy": 0.7195949107408524, "epoch": 7e-05, "frac_reward_zero_std": 0.0, "grad_norm": 3.0368170738220215, "kl": 0.0008395522236241959, "learning_rate": 1.7142857142857145e-06, "loss": 0.5239, "num_tokens": 280579.0, "reward": -5.248658180236816, "reward_std": 11.929805755615234, "rewards/rollout_reward_func/mean": -5.248658180236816, "rewards/rollout_reward_func/std": 13.250045776367188, "sampling/importance_sampling_ratio/max": 2.3635098934173584, "sampling/importance_sampling_ratio/mean": 0.9656180143356323, "sampling/importance_sampling_ratio/min": 8.381896420435572e-18, "sampling/sampling_logp_difference/max": 21.992216110229492, "sampling/sampling_logp_difference/mean": 0.07360320538282394, "step": 7, "step_time": 17.63883046999399 }, { "clip_ratio/high_max": 0.0028465966461226344, "clip_ratio/high_mean": 0.0014232983230613172, "clip_ratio/low_mean": 0.0004960317746736109, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0019193300977349281, "entropy": 0.7206105217337608, "epoch": 8e-05, "grad_norm": 3.1657252311706543, "kl": 0.0007286458640010096, "learning_rate": 2.0000000000000003e-06, "loss": 0.5216, "step": 8, "step_time": 6.947117421994335 }, { "clip_ratio/high_max": 0.013803776353597641, "clip_ratio/high_mean": 0.0069018881767988205, "clip_ratio/low_mean": 0.001992527977563441, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008894416154362261, "completions/clipped_ratio": 0.0, "completions/max_length": 1835.0, "completions/max_terminated_length": 1835.0, "completions/mean_length": 1565.0625, "completions/mean_terminated_length": 1565.0625, "completions/min_length": 1150.0, "completions/min_terminated_length": 1150.0, "entropy": 0.6926305592060089, "epoch": 9e-05, "frac_reward_zero_std": 0.0, "grad_norm": 2.602632522583008, "kl": 0.0013718551199417561, "learning_rate": 2.285714285714286e-06, "loss": -0.4899, "num_tokens": 352298.0, "reward": -8.862421989440918, "reward_std": 9.720939636230469, "rewards/rollout_reward_func/mean": -8.862421989440918, "rewards/rollout_reward_func/std": 12.162670135498047, "sampling/importance_sampling_ratio/max": 1.8105370998382568, "sampling/importance_sampling_ratio/mean": 0.946445107460022, "sampling/importance_sampling_ratio/min": 1.3760463601908413e-10, "sampling/sampling_logp_difference/max": 20.209341049194336, "sampling/sampling_logp_difference/mean": 0.051895033568143845, "step": 9, "step_time": 18.235352158983005 }, { "clip_ratio/high_max": 0.013143392279744148, "clip_ratio/high_mean": 0.006571696139872074, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006571696139872074, "entropy": 0.6948402151465416, "epoch": 0.0001, "grad_norm": 2.590895414352417, "kl": 0.0016902631941775326, "learning_rate": 2.571428571428571e-06, "loss": -0.4913, "step": 10, "step_time": 6.3234141299908515 }, { "clip_ratio/high_max": 0.006455760798417032, "clip_ratio/high_mean": 0.003227880399208516, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003227880399208516, "completions/clipped_ratio": 0.0, "completions/max_length": 1762.0, "completions/max_terminated_length": 1762.0, "completions/mean_length": 1571.59375, "completions/mean_terminated_length": 1571.59375, "completions/min_length": 1337.0, "completions/min_terminated_length": 1337.0, "entropy": 0.6634590178728104, "epoch": 0.00011, "frac_reward_zero_std": 0.0, "grad_norm": 2.8351783752441406, "kl": 0.001410028140526265, "learning_rate": 2.8571428571428573e-06, "loss": 0.1036, "num_tokens": 423010.0, "reward": -0.6574823260307312, "reward_std": 10.101003646850586, "rewards/rollout_reward_func/mean": -0.6574823260307312, "rewards/rollout_reward_func/std": 10.188403129577637, "sampling/importance_sampling_ratio/max": 1.7375725507736206, "sampling/importance_sampling_ratio/mean": 0.9902209043502808, "sampling/importance_sampling_ratio/min": 8.770549505990296e-20, "sampling/sampling_logp_difference/max": 22.00476837158203, "sampling/sampling_logp_difference/mean": 0.058164019137620926, "step": 11, "step_time": 18.449957339005778 }, { "clip_ratio/high_max": 0.006455760798417032, "clip_ratio/high_mean": 0.003227880399208516, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003227880399208516, "entropy": 0.6640197485685349, "epoch": 0.00012, "grad_norm": 2.6475937366485596, "kl": 0.0018406793096801266, "learning_rate": 3.142857142857143e-06, "loss": 0.1028, "step": 12, "step_time": 6.770373284991365 }, { "clip_ratio/high_max": 0.0034497525775805116, "clip_ratio/high_mean": 0.0017248762887902558, "clip_ratio/low_mean": 0.0019273775978945196, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0036522538866847754, "completions/clipped_ratio": 0.0, "completions/max_length": 1757.0, "completions/max_terminated_length": 1757.0, "completions/mean_length": 1585.25, "completions/mean_terminated_length": 1585.25, "completions/min_length": 1445.0, "completions/min_terminated_length": 1445.0, "entropy": 0.5962456464767456, "epoch": 0.00013, "frac_reward_zero_std": 0.0, "grad_norm": 2.011409044265747, "kl": 0.0016392105462728068, "learning_rate": 3.428571428571429e-06, "loss": -0.1523, "num_tokens": 495351.0, "reward": -1.2796903848648071, "reward_std": 10.362476348876953, "rewards/rollout_reward_func/mean": -1.2796903848648071, "rewards/rollout_reward_func/std": 11.458198547363281, "sampling/importance_sampling_ratio/max": 2.017104148864746, "sampling/importance_sampling_ratio/mean": 0.9518488049507141, "sampling/importance_sampling_ratio/min": 1.429809800014038e-20, "sampling/sampling_logp_difference/max": 24.189453125, "sampling/sampling_logp_difference/mean": 0.09208101034164429, "step": 13, "step_time": 17.74480581501848 }, { "clip_ratio/high_max": 0.004167824285104871, "clip_ratio/high_mean": 0.0020839121425524354, "clip_ratio/low_mean": 0.0028076592716388404, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004891571414191276, "entropy": 0.5969984382390976, "epoch": 0.00014, "grad_norm": 2.555802822113037, "kl": 0.0025408544097444974, "learning_rate": 3.7142857142857146e-06, "loss": -0.1503, "step": 14, "step_time": 6.206920059004915 }, { "clip_ratio/high_max": 0.003401517984457314, "clip_ratio/high_mean": 0.001700758992228657, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001700758992228657, "completions/clipped_ratio": 0.0, "completions/max_length": 1781.0, "completions/max_terminated_length": 1781.0, "completions/mean_length": 1589.21875, "completions/mean_terminated_length": 1589.21875, "completions/min_length": 1479.0, "completions/min_terminated_length": 1479.0, "entropy": 0.6188214793801308, "epoch": 0.00015, "frac_reward_zero_std": 0.0, "grad_norm": 1.8131235837936401, "kl": 0.003706506351591088, "learning_rate": 4.000000000000001e-06, "loss": -0.0781, "num_tokens": 567827.0, "reward": -4.277540683746338, "reward_std": 9.351062774658203, "rewards/rollout_reward_func/mean": -4.277540683746338, "rewards/rollout_reward_func/std": 14.185153007507324, "sampling/importance_sampling_ratio/max": 1.5239903926849365, "sampling/importance_sampling_ratio/mean": 0.9080507755279541, "sampling/importance_sampling_ratio/min": 1.693769272710818e-19, "sampling/sampling_logp_difference/max": 22.853668212890625, "sampling/sampling_logp_difference/mean": 0.08956147730350494, "step": 15, "step_time": 31.3668181450048 }, { "clip_ratio/high_max": 0.005675723543390632, "clip_ratio/high_mean": 0.002837861771695316, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002837861771695316, "entropy": 0.6197901889681816, "epoch": 0.00016, "grad_norm": 1.5492043495178223, "kl": 0.005323049655999057, "learning_rate": 4.2857142857142855e-06, "loss": -0.0802, "step": 16, "step_time": 14.755850558998645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1862.0, "completions/max_terminated_length": 1862.0, "completions/mean_length": 1586.3125, "completions/mean_terminated_length": 1586.3125, "completions/min_length": 1150.0, "completions/min_terminated_length": 1150.0, "entropy": 0.5790679827332497, "epoch": 0.00017, "frac_reward_zero_std": 0.0, "grad_norm": 4.601901531219482, "kl": 0.009158092201687396, "learning_rate": 4.571428571428572e-06, "loss": -0.0294, "num_tokens": 641557.0, "reward": -5.134004592895508, "reward_std": 10.837591171264648, "rewards/rollout_reward_func/mean": -5.134004592895508, "rewards/rollout_reward_func/std": 11.531923294067383, "sampling/importance_sampling_ratio/max": 1.9765654802322388, "sampling/importance_sampling_ratio/mean": 0.9713096022605896, "sampling/importance_sampling_ratio/min": 1.8625308215501304e-16, "sampling/sampling_logp_difference/max": 21.36278533935547, "sampling/sampling_logp_difference/mean": 0.08835697919130325, "step": 17, "step_time": 33.460163987998385 }, { "clip_ratio/high_max": 0.010418982012197375, "clip_ratio/high_mean": 0.006234081112779677, "clip_ratio/low_mean": 0.006801519135478884, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0130356001900509, "entropy": 0.5790194571018219, "epoch": 0.00018, "grad_norm": 2.300344228744507, "kl": 0.015334879979491234, "learning_rate": 4.857142857142858e-06, "loss": -0.0329, "step": 18, "step_time": 14.387441612023395 }, { "clip_ratio/high_max": 0.0067366803996264935, "clip_ratio/high_mean": 0.0033683401998132467, "clip_ratio/low_mean": 0.0031883447081781924, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006556684907991439, "completions/clipped_ratio": 0.0, "completions/max_length": 1788.0, "completions/max_terminated_length": 1788.0, "completions/mean_length": 1606.71875, "completions/mean_terminated_length": 1606.71875, "completions/min_length": 1354.0, "completions/min_terminated_length": 1354.0, "entropy": 0.4879921078681946, "epoch": 0.00019, "frac_reward_zero_std": 0.0, "grad_norm": 1.8958797454833984, "kl": 0.02948194113560021, "learning_rate": 5.142857142857142e-06, "loss": 0.1686, "num_tokens": 714529.0, "reward": -0.9935553073883057, "reward_std": 10.485939979553223, "rewards/rollout_reward_func/mean": -0.9935553073883057, "rewards/rollout_reward_func/std": 13.237119674682617, "sampling/importance_sampling_ratio/max": 1.7659637928009033, "sampling/importance_sampling_ratio/mean": 1.108857274055481, "sampling/importance_sampling_ratio/min": 0.43520933389663696, "sampling/sampling_logp_difference/max": 0.503682017326355, "sampling/sampling_logp_difference/mean": 0.023367518559098244, "step": 19, "step_time": 32.34357298797113 }, { "clip_ratio/high_max": 0.016077527543529868, "clip_ratio/high_mean": 0.009080430492758751, "clip_ratio/low_mean": 0.007624797115568072, "clip_ratio/low_min": 0.0016891892300918698, "clip_ratio/region_mean": 0.016705227317288518, "entropy": 0.48477455228567123, "epoch": 0.0002, "grad_norm": 2.582484722137451, "kl": 0.04417935898527503, "learning_rate": 5.428571428571429e-06, "loss": 0.1694, "step": 20, "step_time": 14.828311372024473 }, { "clip_ratio/high_max": 0.004836035426706076, "clip_ratio/high_mean": 0.0034596844343468547, "clip_ratio/low_mean": 0.0010593220358714461, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004519006470218301, "completions/clipped_ratio": 0.0, "completions/max_length": 1871.0, "completions/max_terminated_length": 1871.0, "completions/mean_length": 1679.3125, "completions/mean_terminated_length": 1679.3125, "completions/min_length": 1220.0, "completions/min_terminated_length": 1220.0, "entropy": 0.4381440170109272, "epoch": 0.00021, "frac_reward_zero_std": 0.0, "grad_norm": 2.0390939712524414, "kl": 0.07074410282075405, "learning_rate": 5.7142857142857145e-06, "loss": -0.0025, "num_tokens": 790222.0, "reward": 3.5100250244140625, "reward_std": 7.844137191772461, "rewards/rollout_reward_func/mean": 3.5100250244140625, "rewards/rollout_reward_func/std": 10.62945556640625, "sampling/importance_sampling_ratio/max": 1.617055058479309, "sampling/importance_sampling_ratio/mean": 0.965487003326416, "sampling/importance_sampling_ratio/min": 0.313318133354187, "sampling/sampling_logp_difference/max": 0.9089560508728027, "sampling/sampling_logp_difference/mean": 0.021700311452150345, "step": 21, "step_time": 32.78306466099457 }, { "clip_ratio/high_max": 0.0027173913549631834, "clip_ratio/high_mean": 0.0024003623984754086, "clip_ratio/low_mean": 0.004327739996369928, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006728102394845337, "entropy": 0.4312720187008381, "epoch": 0.00022, "grad_norm": 2.9633889198303223, "kl": 0.10374317970126867, "learning_rate": 6e-06, "loss": 0.0036, "step": 22, "step_time": 14.458958123999764 }, { "clip_ratio/high_max": 0.004766571568325162, "clip_ratio/high_mean": 0.002383285784162581, "clip_ratio/low_mean": 0.00566925120074302, "clip_ratio/low_min": 0.0020491802133619785, "clip_ratio/region_mean": 0.0080525369849056, "completions/clipped_ratio": 0.0, "completions/max_length": 1802.0, "completions/max_terminated_length": 1802.0, "completions/mean_length": 1615.75, "completions/mean_terminated_length": 1615.75, "completions/min_length": 856.0, "completions/min_terminated_length": 856.0, "entropy": 0.34873105213046074, "epoch": 0.00023, "frac_reward_zero_std": 0.0, "grad_norm": 4.310447692871094, "kl": 0.20971096400171518, "learning_rate": 6.285714285714286e-06, "loss": 0.0463, "num_tokens": 863753.0, "reward": 0.40746456384658813, "reward_std": 6.895313262939453, "rewards/rollout_reward_func/mean": 0.40746456384658813, "rewards/rollout_reward_func/std": 7.3030476570129395, "sampling/importance_sampling_ratio/max": 1.7552417516708374, "sampling/importance_sampling_ratio/mean": 1.0114002227783203, "sampling/importance_sampling_ratio/min": 0.320573091506958, "sampling/sampling_logp_difference/max": 1.0495622158050537, "sampling/sampling_logp_difference/mean": 0.024037808179855347, "step": 23, "step_time": 19.614130993984872 }, { "clip_ratio/high_max": 0.004766571568325162, "clip_ratio/high_mean": 0.002383285784162581, "clip_ratio/low_mean": 0.0031999836210161448, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005583269288763404, "entropy": 0.34637371078133583, "epoch": 0.00024, "grad_norm": 4.1669416427612305, "kl": 0.19860890274867415, "learning_rate": 6.571428571428572e-06, "loss": 0.0417, "step": 24, "step_time": 7.2632460259628715 }, { "clip_ratio/high_max": 0.002659574383869767, "clip_ratio/high_mean": 0.0013297871919348836, "clip_ratio/low_mean": 0.002195685636252165, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0035254728281870484, "completions/clipped_ratio": 0.0, "completions/max_length": 1837.0, "completions/max_terminated_length": 1837.0, "completions/mean_length": 1653.25, "completions/mean_terminated_length": 1653.25, "completions/min_length": 975.0, "completions/min_terminated_length": 975.0, "entropy": 0.3655538037419319, "epoch": 0.00025, "frac_reward_zero_std": 0.0, "grad_norm": 1.2680400609970093, "kl": 0.08888887660577893, "learning_rate": 6.857142857142858e-06, "loss": 0.0305, "num_tokens": 938695.0, "reward": 2.0412206649780273, "reward_std": 8.42188549041748, "rewards/rollout_reward_func/mean": 2.0412206649780273, "rewards/rollout_reward_func/std": 9.144899368286133, "sampling/importance_sampling_ratio/max": 1.2744624614715576, "sampling/importance_sampling_ratio/mean": 0.9358847737312317, "sampling/importance_sampling_ratio/min": 0.3314089775085449, "sampling/sampling_logp_difference/max": 0.9411232471466064, "sampling/sampling_logp_difference/mean": 0.019530773162841797, "step": 25, "step_time": 17.73095381996245 }, { "clip_ratio/high_max": 0.010099776554852724, "clip_ratio/high_mean": 0.005049888277426362, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005049888277426362, "entropy": 0.3649589419364929, "epoch": 0.00026, "grad_norm": 1.5859415531158447, "kl": 0.0808827462606132, "learning_rate": 7.1428571428571436e-06, "loss": 0.0278, "step": 26, "step_time": 6.368391898999107 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.0036287567345425487, "clip_ratio/low_mean": 0.003495603916235268, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007124360650777817, "completions/clipped_ratio": 0.0, "completions/max_length": 1838.0, "completions/max_terminated_length": 1838.0, "completions/mean_length": 1645.25, "completions/mean_terminated_length": 1645.25, "completions/min_length": 1452.0, "completions/min_terminated_length": 1452.0, "entropy": 0.3710002973675728, "epoch": 0.00027, "frac_reward_zero_std": 0.0, "grad_norm": 1.6164464950561523, "kl": 0.12447707494720817, "learning_rate": 7.428571428571429e-06, "loss": 0.0443, "num_tokens": 1013031.0, "reward": 8.975237846374512, "reward_std": 8.62988567352295, "rewards/rollout_reward_func/mean": 8.975237846374512, "rewards/rollout_reward_func/std": 11.465543746948242, "sampling/importance_sampling_ratio/max": 1.3292858600616455, "sampling/importance_sampling_ratio/mean": 0.9924591779708862, "sampling/importance_sampling_ratio/min": 0.4715452492237091, "sampling/sampling_logp_difference/max": 0.8626588582992554, "sampling/sampling_logp_difference/mean": 0.019632747396826744, "step": 27, "step_time": 17.42895478400169 }, { "clip_ratio/high_max": 0.004826958058401942, "clip_ratio/high_mean": 0.0037155624013394117, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0037155624013394117, "entropy": 0.3706408813595772, "epoch": 0.00028, "grad_norm": 1.235813021659851, "kl": 0.11198390694335103, "learning_rate": 7.714285714285716e-06, "loss": 0.0413, "step": 28, "step_time": 6.83195742002863 }, { "clip_ratio/high_max": 0.0026041667442768812, "clip_ratio/high_mean": 0.0013020833721384406, "clip_ratio/low_mean": 0.0010245901066809893, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00232667347881943, "completions/clipped_ratio": 0.0, "completions/max_length": 1834.0, "completions/max_terminated_length": 1834.0, "completions/mean_length": 1701.34375, "completions/mean_terminated_length": 1701.34375, "completions/min_length": 1315.0, "completions/min_terminated_length": 1315.0, "entropy": 0.3397510312497616, "epoch": 0.00029, "frac_reward_zero_std": 0.0, "grad_norm": 0.8463234901428223, "kl": 0.058322271797806025, "learning_rate": 8.000000000000001e-06, "loss": -0.0408, "num_tokens": 1089202.0, "reward": 7.994076728820801, "reward_std": 6.615372180938721, "rewards/rollout_reward_func/mean": 7.994076728820801, "rewards/rollout_reward_func/std": 9.697607040405273, "sampling/importance_sampling_ratio/max": 1.406444787979126, "sampling/importance_sampling_ratio/mean": 1.0202399492263794, "sampling/importance_sampling_ratio/min": 0.43312764167785645, "sampling/sampling_logp_difference/max": 0.943016767501831, "sampling/sampling_logp_difference/mean": 0.01731639727950096, "step": 29, "step_time": 17.841164074008702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3368573635816574, "epoch": 0.0003, "grad_norm": 1.1397476196289062, "kl": 0.05558100715279579, "learning_rate": 8.285714285714287e-06, "loss": -0.0454, "step": 30, "step_time": 6.388152815998183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.002465846948325634, "clip_ratio/low_min": 0.0008333333535119891, "clip_ratio/region_mean": 0.002465846948325634, "completions/clipped_ratio": 0.03125, "completions/max_length": 1809.0, "completions/max_terminated_length": 1809.0, "completions/mean_length": 1638.8125, "completions/mean_terminated_length": 1640.354736328125, "completions/min_length": 1315.0, "completions/min_terminated_length": 1315.0, "entropy": 0.4789782762527466, "epoch": 0.00031, "frac_reward_zero_std": 0.0, "grad_norm": 1.5038994550704956, "kl": 0.11407524440437555, "learning_rate": 8.571428571428571e-06, "loss": -0.207, "num_tokens": 1162947.0, "reward": 2.8459901809692383, "reward_std": 8.33094310760498, "rewards/rollout_reward_func/mean": 2.8459901809692383, "rewards/rollout_reward_func/std": 9.87761402130127, "sampling/importance_sampling_ratio/max": 1.3517836332321167, "sampling/importance_sampling_ratio/mean": 0.8868120908737183, "sampling/importance_sampling_ratio/min": 2.9791601585844874e-13, "sampling/sampling_logp_difference/max": 8.996126174926758, "sampling/sampling_logp_difference/mean": 0.053531065583229065, "step": 31, "step_time": 17.34830972299096 }, { "clip_ratio/high_max": 0.002659574383869767, "clip_ratio/high_mean": 0.0013297871919348836, "clip_ratio/low_mean": 0.005260995589196682, "clip_ratio/low_min": 0.0016666667070239782, "clip_ratio/region_mean": 0.006590782781131566, "entropy": 0.4772786870598793, "epoch": 0.00032, "grad_norm": 1.4022109508514404, "kl": 0.10773721244186163, "learning_rate": 8.857142857142858e-06, "loss": -0.209, "step": 32, "step_time": 6.7390321699786 }, { "clip_ratio/high_max": 0.0020833334419876337, "clip_ratio/high_mean": 0.002066256944090128, "clip_ratio/low_mean": 0.0021186440717428923, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00418490101583302, "completions/clipped_ratio": 0.0, "completions/max_length": 1786.0, "completions/max_terminated_length": 1786.0, "completions/mean_length": 1598.625, "completions/mean_terminated_length": 1598.625, "completions/min_length": 1464.0, "completions/min_terminated_length": 1464.0, "entropy": 0.45652540028095245, "epoch": 0.00033, "frac_reward_zero_std": 0.0, "grad_norm": 1.816322684288025, "kl": 0.07848886027932167, "learning_rate": 9.142857142857144e-06, "loss": -0.1279, "num_tokens": 1235783.0, "reward": 0.9879084229469299, "reward_std": 9.234064102172852, "rewards/rollout_reward_func/mean": 0.9879084229469299, "rewards/rollout_reward_func/std": 9.167927742004395, "sampling/importance_sampling_ratio/max": 2.423945903778076, "sampling/importance_sampling_ratio/mean": 1.00472092628479, "sampling/importance_sampling_ratio/min": 1.0657071248471576e-19, "sampling/sampling_logp_difference/max": 24.90825653076172, "sampling/sampling_logp_difference/mean": 0.06860785186290741, "step": 33, "step_time": 17.159739262991934 }, { "clip_ratio/high_max": 0.0020833334419876337, "clip_ratio/high_mean": 0.002066256944090128, "clip_ratio/low_mean": 0.002476415189448744, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004542672133538872, "entropy": 0.4551401659846306, "epoch": 0.00034, "grad_norm": 1.272695541381836, "kl": 0.08226586831733584, "learning_rate": 9.42857142857143e-06, "loss": -0.13, "step": 34, "step_time": 6.199632717005443 }, { "clip_ratio/high_max": 0.004098360426723957, "clip_ratio/high_mean": 0.0020491802133619785, "clip_ratio/low_mean": 0.0010245901066809893, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003073770320042968, "completions/clipped_ratio": 0.0, "completions/max_length": 1877.0, "completions/max_terminated_length": 1877.0, "completions/mean_length": 1686.1875, "completions/mean_terminated_length": 1686.1875, "completions/min_length": 1421.0, "completions/min_terminated_length": 1421.0, "entropy": 0.3553536906838417, "epoch": 0.00035, "frac_reward_zero_std": 0.0, "grad_norm": 1.202547550201416, "kl": 0.07763728406280279, "learning_rate": 9.714285714285715e-06, "loss": 0.0352, "num_tokens": 1312530.0, "reward": 2.731177568435669, "reward_std": 9.222713470458984, "rewards/rollout_reward_func/mean": 2.731177568435669, "rewards/rollout_reward_func/std": 9.950661659240723, "sampling/importance_sampling_ratio/max": 1.5744929313659668, "sampling/importance_sampling_ratio/mean": 1.014841079711914, "sampling/importance_sampling_ratio/min": 0.4829196333885193, "sampling/sampling_logp_difference/max": 0.5506792068481445, "sampling/sampling_logp_difference/mean": 0.01920858770608902, "step": 35, "step_time": 17.74512630897516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0018691847217269242, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0018691847217269242, "entropy": 0.35133614018559456, "epoch": 0.00036, "grad_norm": 1.1520718336105347, "kl": 0.08916338346898556, "learning_rate": 1e-05, "loss": 0.036, "step": 36, "step_time": 6.468839604029199 }, { "clip_ratio/high_max": 0.0028439435409381986, "clip_ratio/high_mean": 0.0014219717704690993, "clip_ratio/low_mean": 0.0010593220358714461, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024812939227558672, "completions/clipped_ratio": 0.03125, "completions/max_length": 1818.0, "completions/max_terminated_length": 1818.0, "completions/mean_length": 1612.34375, "completions/mean_terminated_length": 1617.806396484375, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "entropy": 0.42784957587718964, "epoch": 0.00037, "frac_reward_zero_std": 0.0, "grad_norm": 1.6890476942062378, "kl": 0.10881091328337789, "learning_rate": 9.999999999925968e-06, "loss": 0.0876, "num_tokens": 1385754.0, "reward": 1.0901482105255127, "reward_std": 7.094250202178955, "rewards/rollout_reward_func/mean": 1.0901482105255127, "rewards/rollout_reward_func/std": 7.999716281890869, "sampling/importance_sampling_ratio/max": 1.664754867553711, "sampling/importance_sampling_ratio/mean": 0.9950845241546631, "sampling/importance_sampling_ratio/min": 8.571208801134791e-19, "sampling/sampling_logp_difference/max": 20.68086814880371, "sampling/sampling_logp_difference/mean": 0.10799533128738403, "step": 37, "step_time": 18.311824714997783 }, { "clip_ratio/high_max": 0.0034873889526352286, "clip_ratio/high_mean": 0.0017436944763176143, "clip_ratio/low_mean": 0.0031962302746251225, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004939924750942737, "entropy": 0.4263388365507126, "epoch": 0.00038, "grad_norm": 1.2181106805801392, "kl": 0.08622925961390138, "learning_rate": 9.999999999703871e-06, "loss": 0.085, "step": 38, "step_time": 6.260259240996675 }, { "clip_ratio/high_max": 0.0026041667442768812, "clip_ratio/high_mean": 0.0013020833721384406, "clip_ratio/low_mean": 0.003442607820034027, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004744691192172468, "completions/clipped_ratio": 0.0, "completions/max_length": 1731.0, "completions/max_terminated_length": 1731.0, "completions/mean_length": 1625.6875, "completions/mean_terminated_length": 1625.6875, "completions/min_length": 909.0, "completions/min_terminated_length": 909.0, "entropy": 0.33136629685759544, "epoch": 0.00039, "frac_reward_zero_std": 0.0, "grad_norm": 1.2450401782989502, "kl": 0.0819755382835865, "learning_rate": 9.99999999933371e-06, "loss": -0.1199, "num_tokens": 1459504.0, "reward": 0.9303473234176636, "reward_std": 5.546414375305176, "rewards/rollout_reward_func/mean": 0.9303473234176636, "rewards/rollout_reward_func/std": 6.516648292541504, "sampling/importance_sampling_ratio/max": 1.4690643548965454, "sampling/importance_sampling_ratio/mean": 1.051254153251648, "sampling/importance_sampling_ratio/min": 0.3365603983402252, "sampling/sampling_logp_difference/max": 0.9071788787841797, "sampling/sampling_logp_difference/mean": 0.02257063239812851, "step": 39, "step_time": 17.20556906297861 }, { "clip_ratio/high_max": 0.007465278031304479, "clip_ratio/high_mean": 0.0037326390156522393, "clip_ratio/low_mean": 0.003442607820034027, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007175246835686266, "entropy": 0.33277528174221516, "epoch": 0.0004, "grad_norm": 1.2355331182479858, "kl": 0.07522942405194044, "learning_rate": 9.999999998815483e-06, "loss": -0.1224, "step": 40, "step_time": 6.147663529001875 }, { "clip_ratio/high_max": 0.0027173913549631834, "clip_ratio/high_mean": 0.0013586956774815917, "clip_ratio/low_mean": 0.0010416667209938169, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024003623984754086, "completions/clipped_ratio": 0.0, "completions/max_length": 1764.0, "completions/max_terminated_length": 1764.0, "completions/mean_length": 1623.75, "completions/mean_terminated_length": 1623.75, "completions/min_length": 1408.0, "completions/min_terminated_length": 1408.0, "entropy": 0.3599771447479725, "epoch": 0.00041, "frac_reward_zero_std": 0.0, "grad_norm": 1.2169064283370972, "kl": 0.09305838961154222, "learning_rate": 9.999999998149191e-06, "loss": 0.079, "num_tokens": 1533756.0, "reward": 2.1111409664154053, "reward_std": 8.500118255615234, "rewards/rollout_reward_func/mean": 2.1111409664154053, "rewards/rollout_reward_func/std": 11.245372772216797, "sampling/importance_sampling_ratio/max": 2.621987819671631, "sampling/importance_sampling_ratio/mean": 0.93711256980896, "sampling/importance_sampling_ratio/min": 1.1192202350606713e-16, "sampling/sampling_logp_difference/max": 20.35271644592285, "sampling/sampling_logp_difference/mean": 0.06611192226409912, "step": 41, "step_time": 18.463792597016436 }, { "clip_ratio/high_max": 0.0020833334419876337, "clip_ratio/high_mean": 0.0010416667209938169, "clip_ratio/low_mean": 0.005452252342365682, "clip_ratio/low_min": 0.0020833334419876337, "clip_ratio/region_mean": 0.006493918946944177, "entropy": 0.35906321555376053, "epoch": 0.00042, "grad_norm": 1.2307145595550537, "kl": 0.10247583128511906, "learning_rate": 9.999999997334835e-06, "loss": 0.0786, "step": 42, "step_time": 6.209023094983422 }, { "clip_ratio/high_max": 0.0021186440717428923, "clip_ratio/high_mean": 0.0010593220358714461, "clip_ratio/low_mean": 0.0008680555620230734, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0019273775978945196, "completions/clipped_ratio": 0.0, "completions/max_length": 1838.0, "completions/max_terminated_length": 1838.0, "completions/mean_length": 1273.59375, "completions/mean_terminated_length": 1273.59375, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "entropy": 0.5211328193545341, "epoch": 0.00043, "frac_reward_zero_std": 0.25, "grad_norm": 1.7480376958847046, "kl": 0.059858485125005245, "learning_rate": 9.999999996372415e-06, "loss": -0.1045, "num_tokens": 1596530.0, "reward": 2.4837710857391357, "reward_std": 6.891545295715332, "rewards/rollout_reward_func/mean": 2.4837710857391357, "rewards/rollout_reward_func/std": 13.304924011230469, "sampling/importance_sampling_ratio/max": 2.274510145187378, "sampling/importance_sampling_ratio/mean": 0.8776018619537354, "sampling/importance_sampling_ratio/min": 3.118805752036876e-17, "sampling/sampling_logp_difference/max": 23.163333892822266, "sampling/sampling_logp_difference/mean": 0.10364654660224915, "step": 43, "step_time": 16.27671506399929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004682649741880596, "clip_ratio/low_min": 0.0020491802133619785, "clip_ratio/region_mean": 0.004682649741880596, "entropy": 0.5199121572077274, "epoch": 0.00044, "grad_norm": 1.2313787937164307, "kl": 0.07027228316292167, "learning_rate": 9.999999995261927e-06, "loss": -0.1062, "step": 44, "step_time": 6.317062545014778 }, { "clip_ratio/high_max": 0.0027173913549631834, "clip_ratio/high_mean": 0.0013586956774815917, "clip_ratio/low_mean": 0.0013586956774815917, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0027173913549631834, "completions/clipped_ratio": 0.0, "completions/max_length": 1856.0, "completions/max_terminated_length": 1856.0, "completions/mean_length": 1491.59375, "completions/mean_terminated_length": 1491.59375, "completions/min_length": 367.0, "completions/min_terminated_length": 367.0, "entropy": 0.35521718487143517, "epoch": 0.00045, "frac_reward_zero_std": 0.0, "grad_norm": 1.0407559871673584, "kl": 0.09993920475244522, "learning_rate": 9.999999994003377e-06, "loss": -0.0063, "num_tokens": 1665230.0, "reward": 4.9595489501953125, "reward_std": 7.92152738571167, "rewards/rollout_reward_func/mean": 4.9595489501953125, "rewards/rollout_reward_func/std": 9.487390518188477, "sampling/importance_sampling_ratio/max": 1.8535637855529785, "sampling/importance_sampling_ratio/mean": 0.9862438440322876, "sampling/importance_sampling_ratio/min": 4.1320553804470597e-20, "sampling/sampling_logp_difference/max": 22.90564727783203, "sampling/sampling_logp_difference/mean": 0.0892951563000679, "step": 45, "step_time": 17.677669901022455 }, { "clip_ratio/high_max": 0.004800724796950817, "clip_ratio/high_mean": 0.0024003623984754086, "clip_ratio/low_mean": 0.0010245901066809893, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003424952505156398, "entropy": 0.3552154414355755, "epoch": 0.00046, "grad_norm": 1.0801676511764526, "kl": 0.10049129277467728, "learning_rate": 9.99999999259676e-06, "loss": -0.011, "step": 46, "step_time": 6.8743953139928635 }, { "clip_ratio/high_max": 0.0028409091755747795, "clip_ratio/high_mean": 0.0014204545877873898, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0014204545877873898, "completions/clipped_ratio": 0.0, "completions/max_length": 1789.0, "completions/max_terminated_length": 1789.0, "completions/mean_length": 1679.5, "completions/mean_terminated_length": 1679.5, "completions/min_length": 1471.0, "completions/min_terminated_length": 1471.0, "entropy": 0.3313494399189949, "epoch": 0.00047, "frac_reward_zero_std": 0.0, "grad_norm": 1.3899956941604614, "kl": 0.1133374604396522, "learning_rate": 9.99999999104208e-06, "loss": 0.1801, "num_tokens": 1741397.0, "reward": -1.440907597541809, "reward_std": 6.082624435424805, "rewards/rollout_reward_func/mean": -1.440907597541809, "rewards/rollout_reward_func/std": 8.593509674072266, "sampling/importance_sampling_ratio/max": 1.4595133066177368, "sampling/importance_sampling_ratio/mean": 0.8581361770629883, "sampling/importance_sampling_ratio/min": 0.31652596592903137, "sampling/sampling_logp_difference/max": 0.9885225296020508, "sampling/sampling_logp_difference/mean": 0.026569686830043793, "step": 47, "step_time": 18.06078358599916 }, { "clip_ratio/high_max": 0.0027173913549631834, "clip_ratio/high_mean": 0.0013586956774815917, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013586956774815917, "entropy": 0.3338804319500923, "epoch": 0.00048, "grad_norm": 1.117269515991211, "kl": 0.10523033188655972, "learning_rate": 9.999999989339336e-06, "loss": 0.1734, "step": 48, "step_time": 6.246455219981726 }, { "clip_ratio/high_max": 0.0026041667442768812, "clip_ratio/high_mean": 0.0013020833721384406, "clip_ratio/low_mean": 0.0032337038428522646, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004535787214990705, "completions/clipped_ratio": 0.03125, "completions/max_length": 1860.0, "completions/max_terminated_length": 1860.0, "completions/mean_length": 1699.5, "completions/mean_terminated_length": 1701.741943359375, "completions/min_length": 1355.0, "completions/min_terminated_length": 1355.0, "entropy": 0.36175065487623215, "epoch": 0.00049, "frac_reward_zero_std": 0.0, "grad_norm": 1.0797888040542603, "kl": 0.08832791121676564, "learning_rate": 9.999999987488526e-06, "loss": -0.1456, "num_tokens": 1816961.0, "reward": 7.763421058654785, "reward_std": 11.577056884765625, "rewards/rollout_reward_func/mean": 7.763421058654785, "rewards/rollout_reward_func/std": 14.011706352233887, "sampling/importance_sampling_ratio/max": 1.5598341226577759, "sampling/importance_sampling_ratio/mean": 0.9705573320388794, "sampling/importance_sampling_ratio/min": 6.83769001032259e-17, "sampling/sampling_logp_difference/max": 12.651195526123047, "sampling/sampling_logp_difference/mean": 0.061426110565662384, "step": 49, "step_time": 18.69651278895617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0010593220358714461, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010593220358714461, "entropy": 0.36302887089550495, "epoch": 0.0005, "grad_norm": 1.0279839038848877, "kl": 0.0812316769734025, "learning_rate": 9.99999998548965e-06, "loss": -0.1452, "step": 50, "step_time": 6.8632628330087755 }, { "clip_ratio/high_max": 0.005319148767739534, "clip_ratio/high_mean": 0.002659574383869767, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002659574383869767, "completions/clipped_ratio": 0.0, "completions/max_length": 1761.0, "completions/max_terminated_length": 1761.0, "completions/mean_length": 1629.15625, "completions/mean_terminated_length": 1629.15625, "completions/min_length": 1348.0, "completions/min_terminated_length": 1348.0, "entropy": 0.3763550743460655, "epoch": 0.00051, "frac_reward_zero_std": 0.0, "grad_norm": 1.1846777200698853, "kl": 0.10679574217647314, "learning_rate": 9.999999983342712e-06, "loss": -0.1087, "num_tokens": 1890699.0, "reward": 2.4481358528137207, "reward_std": 7.529686450958252, "rewards/rollout_reward_func/mean": 2.4481358528137207, "rewards/rollout_reward_func/std": 7.819620609283447, "sampling/importance_sampling_ratio/max": 1.4895434379577637, "sampling/importance_sampling_ratio/mean": 0.9461918473243713, "sampling/importance_sampling_ratio/min": 0.28045061230659485, "sampling/sampling_logp_difference/max": 0.9607776403427124, "sampling/sampling_logp_difference/mean": 0.026092153042554855, "step": 51, "step_time": 16.65602345195657 }, { "clip_ratio/high_max": 0.002659574383869767, "clip_ratio/high_mean": 0.0013297871919348836, "clip_ratio/low_mean": 0.0034910652320832014, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004820852424018085, "entropy": 0.37588672526180744, "epoch": 0.00052, "grad_norm": 1.1473146677017212, "kl": 0.11360744666308165, "learning_rate": 9.999999981047708e-06, "loss": -0.1099, "step": 52, "step_time": 6.154988697031513 }, { "clip_ratio/high_max": 0.005817099707201123, "clip_ratio/high_mean": 0.0029085498536005616, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0029085498536005616, "completions/clipped_ratio": 0.0, "completions/max_length": 1780.0, "completions/max_terminated_length": 1780.0, "completions/mean_length": 1702.8125, "completions/mean_terminated_length": 1702.8125, "completions/min_length": 1351.0, "completions/min_terminated_length": 1351.0, "entropy": 0.3132033497095108, "epoch": 0.00053, "frac_reward_zero_std": 0.0, "grad_norm": 1.0671532154083252, "kl": 0.07768056727945805, "learning_rate": 9.999999978604638e-06, "loss": -0.0196, "num_tokens": 1966802.0, "reward": 0.9729039072990417, "reward_std": 7.330527305603027, "rewards/rollout_reward_func/mean": 0.9729039072990417, "rewards/rollout_reward_func/std": 8.473395347595215, "sampling/importance_sampling_ratio/max": 1.4216086864471436, "sampling/importance_sampling_ratio/mean": 0.9868911504745483, "sampling/importance_sampling_ratio/min": 0.5096778273582458, "sampling/sampling_logp_difference/max": 0.4956507682800293, "sampling/sampling_logp_difference/mean": 0.02278944104909897, "step": 53, "step_time": 18.001605535988347 }, { "clip_ratio/high_max": 0.0028409091755747795, "clip_ratio/high_mean": 0.0014204545877873898, "clip_ratio/low_mean": 0.0011160714784637094, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002536526066251099, "entropy": 0.31401190534234047, "epoch": 0.00054, "grad_norm": 1.0782172679901123, "kl": 0.07678838539868593, "learning_rate": 9.999999976013505e-06, "loss": -0.0195, "step": 54, "step_time": 6.732543230013107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.002066256827674806, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002066256827674806, "completions/clipped_ratio": 0.0, "completions/max_length": 1687.0, "completions/max_terminated_length": 1687.0, "completions/mean_length": 1607.53125, "completions/mean_terminated_length": 1607.53125, "completions/min_length": 1421.0, "completions/min_terminated_length": 1421.0, "entropy": 0.3205175884068012, "epoch": 0.00055, "frac_reward_zero_std": 0.0, "grad_norm": 1.0191819667816162, "kl": 0.13573621585965157, "learning_rate": 9.999999973274305e-06, "loss": -0.023, "num_tokens": 2039506.0, "reward": 11.198617935180664, "reward_std": 9.031862258911133, "rewards/rollout_reward_func/mean": 11.198617935180664, "rewards/rollout_reward_func/std": 11.013355255126953, "sampling/importance_sampling_ratio/max": 1.4095962047576904, "sampling/importance_sampling_ratio/mean": 0.9571484327316284, "sampling/importance_sampling_ratio/min": 3.873824585856607e-19, "sampling/sampling_logp_difference/max": 24.27388572692871, "sampling/sampling_logp_difference/mean": 0.07361195236444473, "step": 55, "step_time": 16.54867287199886 }, { "clip_ratio/high_max": 0.0021551724057644606, "clip_ratio/high_mean": 0.0010775862028822303, "clip_ratio/low_mean": 0.0010245901066809893, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0021021763095632195, "entropy": 0.32108433172106743, "epoch": 0.00056, "grad_norm": 0.9433408975601196, "kl": 0.13122781459242105, "learning_rate": 9.999999970387043e-06, "loss": -0.0232, "step": 56, "step_time": 6.009192591023748 }, { "clip_ratio/high_max": 0.008159722434356809, "clip_ratio/high_mean": 0.004079861217178404, "clip_ratio/low_mean": 0.0019154864130541682, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0059953476302325726, "completions/clipped_ratio": 0.0, "completions/max_length": 1794.0, "completions/max_terminated_length": 1794.0, "completions/mean_length": 1635.125, "completions/mean_terminated_length": 1635.125, "completions/min_length": 1349.0, "completions/min_terminated_length": 1349.0, "entropy": 0.37475926242768764, "epoch": 0.00057, "frac_reward_zero_std": 0.0, "grad_norm": 1.4047514200210571, "kl": 0.12750958371907473, "learning_rate": 9.999999967351714e-06, "loss": -0.1336, "num_tokens": 2113240.0, "reward": -1.3381195068359375, "reward_std": 9.812685012817383, "rewards/rollout_reward_func/mean": -1.3381195068359375, "rewards/rollout_reward_func/std": 10.788722038269043, "sampling/importance_sampling_ratio/max": 2.0437846183776855, "sampling/importance_sampling_ratio/mean": 0.9757305383682251, "sampling/importance_sampling_ratio/min": 0.2844389081001282, "sampling/sampling_logp_difference/max": 0.7622966766357422, "sampling/sampling_logp_difference/mean": 0.028693925589323044, "step": 57, "step_time": 17.556380936992355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.001897831098176539, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001897831098176539, "entropy": 0.3760465532541275, "epoch": 0.00058, "grad_norm": 1.4485571384429932, "kl": 0.13581231329590082, "learning_rate": 9.999999964168322e-06, "loss": -0.1346, "step": 58, "step_time": 7.198723911991692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0017637122655287385, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017637122655287385, "completions/clipped_ratio": 0.0, "completions/max_length": 1862.0, "completions/max_terminated_length": 1862.0, "completions/mean_length": 1639.0625, "completions/mean_terminated_length": 1639.0625, "completions/min_length": 1336.0, "completions/min_terminated_length": 1336.0, "entropy": 0.4102902077138424, "epoch": 0.00059, "frac_reward_zero_std": 0.0, "grad_norm": 1.130444884300232, "kl": 0.11346077732741833, "learning_rate": 9.999999960836863e-06, "loss": -0.0705, "num_tokens": 2187156.0, "reward": 3.3815667629241943, "reward_std": 9.446186065673828, "rewards/rollout_reward_func/mean": 3.3815667629241943, "rewards/rollout_reward_func/std": 11.451709747314453, "sampling/importance_sampling_ratio/max": 1.7240840196609497, "sampling/importance_sampling_ratio/mean": 0.9092661142349243, "sampling/importance_sampling_ratio/min": 4.494004090223549e-20, "sampling/sampling_logp_difference/max": 22.416624069213867, "sampling/sampling_logp_difference/mean": 0.07765375822782516, "step": 59, "step_time": 17.590123357018456 }, { "clip_ratio/high_max": 0.0020833334419876337, "clip_ratio/high_mean": 0.0010416667209938169, "clip_ratio/low_mean": 0.0019607843714766204, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0030024510924704373, "entropy": 0.4114570878446102, "epoch": 0.0006, "grad_norm": 1.3044610023498535, "kl": 0.10379154980182648, "learning_rate": 9.99999995735734e-06, "loss": -0.0719, "step": 60, "step_time": 6.4113933829794405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0018926456687040627, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0018926456687040627, "completions/clipped_ratio": 0.0, "completions/max_length": 1854.0, "completions/max_terminated_length": 1854.0, "completions/mean_length": 1682.75, "completions/mean_terminated_length": 1682.75, "completions/min_length": 1455.0, "completions/min_terminated_length": 1455.0, "entropy": 0.3523150607943535, "epoch": 0.00061, "frac_reward_zero_std": 0.0, "grad_norm": 1.1861178874969482, "kl": 0.06870952108874917, "learning_rate": 9.999999953729754e-06, "loss": 0.1259, "num_tokens": 2262211.0, "reward": 3.6392788887023926, "reward_std": 7.325782775878906, "rewards/rollout_reward_func/mean": 3.6392788887023926, "rewards/rollout_reward_func/std": 8.627582550048828, "sampling/importance_sampling_ratio/max": 1.8609733581542969, "sampling/importance_sampling_ratio/mean": 0.9519423842430115, "sampling/importance_sampling_ratio/min": 0.38773787021636963, "sampling/sampling_logp_difference/max": 0.7177550792694092, "sampling/sampling_logp_difference/mean": 0.02772839367389679, "step": 61, "step_time": 17.91583978800918 }, { "clip_ratio/high_max": 0.005263741128146648, "clip_ratio/high_mean": 0.002631870564073324, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002631870564073324, "entropy": 0.3515685051679611, "epoch": 0.00062, "grad_norm": 1.1726771593093872, "kl": 0.06807711208239198, "learning_rate": 9.9999999499541e-06, "loss": 0.1276, "step": 62, "step_time": 7.403751600009855 }, { "clip_ratio/high_max": 0.004132513655349612, "clip_ratio/high_mean": 0.002066256827674806, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002066256827674806, "completions/clipped_ratio": 0.0, "completions/max_length": 1805.0, "completions/max_terminated_length": 1805.0, "completions/mean_length": 1654.59375, "completions/mean_terminated_length": 1654.59375, "completions/min_length": 1457.0, "completions/min_terminated_length": 1457.0, "entropy": 0.37839649245142937, "epoch": 0.00063, "frac_reward_zero_std": 0.0, "grad_norm": 1.9293122291564941, "kl": 0.07969994377344847, "learning_rate": 9.999999946030383e-06, "loss": 0.128, "num_tokens": 2337359.0, "reward": 2.588115692138672, "reward_std": 7.043045997619629, "rewards/rollout_reward_func/mean": 2.588115692138672, "rewards/rollout_reward_func/std": 9.081293106079102, "sampling/importance_sampling_ratio/max": 2.2831010818481445, "sampling/importance_sampling_ratio/mean": 1.075446367263794, "sampling/importance_sampling_ratio/min": 1.2042303410014646e-15, "sampling/sampling_logp_difference/max": 21.305469512939453, "sampling/sampling_logp_difference/mean": 0.06199558824300766, "step": 63, "step_time": 17.69512166903587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.003073770320042968, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003073770320042968, "entropy": 0.37679174914956093, "epoch": 0.00064, "grad_norm": 1.269195556640625, "kl": 0.08935316279530525, "learning_rate": 9.999999941958601e-06, "loss": 0.1233, "step": 64, "step_time": 6.267018090977217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0020839121425524354, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020839121425524354, "completions/clipped_ratio": 0.0, "completions/max_length": 1801.0, "completions/max_terminated_length": 1801.0, "completions/mean_length": 1646.25, "completions/mean_terminated_length": 1646.25, "completions/min_length": 866.0, "completions/min_terminated_length": 866.0, "entropy": 0.3475262373685837, "epoch": 0.00065, "frac_reward_zero_std": 0.0, "grad_norm": 1.3278712034225464, "kl": 0.09133362490683794, "learning_rate": 9.999999937738754e-06, "loss": -0.0381, "num_tokens": 2412061.0, "reward": 3.6248416900634766, "reward_std": 8.621959686279297, "rewards/rollout_reward_func/mean": 3.6248416900634766, "rewards/rollout_reward_func/std": 9.7487211227417, "sampling/importance_sampling_ratio/max": 1.5179404020309448, "sampling/importance_sampling_ratio/mean": 0.9851886034011841, "sampling/importance_sampling_ratio/min": 0.39749738574028015, "sampling/sampling_logp_difference/max": 0.5305309891700745, "sampling/sampling_logp_difference/mean": 0.02566239796578884, "step": 65, "step_time": 17.0000887010101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.006216426030732691, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006216426030732691, "entropy": 0.3449064679443836, "epoch": 0.00066, "grad_norm": 0.9543479084968567, "kl": 0.09928610362112522, "learning_rate": 9.999999933370843e-06, "loss": -0.0429, "step": 66, "step_time": 6.694458622019738 }, { "clip_ratio/high_max": 0.004766571568325162, "clip_ratio/high_mean": 0.002383285784162581, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002383285784162581, "completions/clipped_ratio": 0.0, "completions/max_length": 1814.0, "completions/max_terminated_length": 1814.0, "completions/mean_length": 1635.4375, "completions/mean_terminated_length": 1635.4375, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "entropy": 0.32538245618343353, "epoch": 0.00067, "frac_reward_zero_std": 0.0, "grad_norm": 1.3552147150039673, "kl": 0.13805021718144417, "learning_rate": 9.999999928854868e-06, "loss": -0.0538, "num_tokens": 2485968.0, "reward": -0.38425278663635254, "reward_std": 8.660255432128906, "rewards/rollout_reward_func/mean": -0.38425278663635254, "rewards/rollout_reward_func/std": 11.226299285888672, "sampling/importance_sampling_ratio/max": 1.9103338718414307, "sampling/importance_sampling_ratio/mean": 0.9491332173347473, "sampling/importance_sampling_ratio/min": 0.5465161204338074, "sampling/sampling_logp_difference/max": 0.4849357604980469, "sampling/sampling_logp_difference/mean": 0.02541452832520008, "step": 67, "step_time": 17.454139159992337 }, { "clip_ratio/high_max": 0.005495169200003147, "clip_ratio/high_mean": 0.0027475846000015736, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0027475846000015736, "entropy": 0.3242308795452118, "epoch": 0.00068, "grad_norm": 1.3603967428207397, "kl": 0.14271392300724983, "learning_rate": 9.999999924190826e-06, "loss": -0.0534, "step": 68, "step_time": 6.281184991981718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0026041667442768812, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "completions/clipped_ratio": 0.0, "completions/max_length": 1819.0, "completions/max_terminated_length": 1819.0, "completions/mean_length": 1641.65625, "completions/mean_terminated_length": 1641.65625, "completions/min_length": 1455.0, "completions/min_terminated_length": 1455.0, "entropy": 0.2903779149055481, "epoch": 0.00069, "frac_reward_zero_std": 0.0, "grad_norm": 1.1047630310058594, "kl": 0.1457831682637334, "learning_rate": 9.99999991937872e-06, "loss": -0.0965, "num_tokens": 2560261.0, "reward": 9.005597114562988, "reward_std": 12.582215309143066, "rewards/rollout_reward_func/mean": 9.005597114562988, "rewards/rollout_reward_func/std": 13.062797546386719, "sampling/importance_sampling_ratio/max": 1.6109542846679688, "sampling/importance_sampling_ratio/mean": 1.0130547285079956, "sampling/importance_sampling_ratio/min": 0.4675336480140686, "sampling/sampling_logp_difference/max": 0.7631773948669434, "sampling/sampling_logp_difference/mean": 0.02319551445543766, "step": 69, "step_time": 16.773872042016592 }, { "clip_ratio/high_max": 0.0026041667442768812, "clip_ratio/high_mean": 0.0013020833721384406, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013020833721384406, "entropy": 0.2915779519826174, "epoch": 0.0007, "grad_norm": 1.0945956707000732, "kl": 0.13914149813354015, "learning_rate": 9.99999991441855e-06, "loss": -0.0945, "step": 70, "step_time": 6.704220099010854 }, { "clip_ratio/high_max": 0.004872563760727644, "clip_ratio/high_mean": 0.002436281880363822, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002436281880363822, "completions/clipped_ratio": 0.0, "completions/max_length": 1718.0, "completions/max_terminated_length": 1718.0, "completions/mean_length": 1590.59375, "completions/mean_terminated_length": 1590.59375, "completions/min_length": 1317.0, "completions/min_terminated_length": 1317.0, "entropy": 0.2997173201292753, "epoch": 0.00071, "frac_reward_zero_std": 0.0, "grad_norm": 1.124435544013977, "kl": 0.12970630079507828, "learning_rate": 9.999999909310314e-06, "loss": -0.0803, "num_tokens": 2632848.0, "reward": 6.44558572769165, "reward_std": 11.013298034667969, "rewards/rollout_reward_func/mean": 6.44558572769165, "rewards/rollout_reward_func/std": 15.794655799865723, "sampling/importance_sampling_ratio/max": 1.4456207752227783, "sampling/importance_sampling_ratio/mean": 0.9680818319320679, "sampling/importance_sampling_ratio/min": 0.3740185797214508, "sampling/sampling_logp_difference/max": 0.6084417104721069, "sampling/sampling_logp_difference/mean": 0.025496874004602432, "step": 71, "step_time": 17.151413693005452 }, { "clip_ratio/high_max": 0.0042402095859870315, "clip_ratio/high_mean": 0.0021201047929935157, "clip_ratio/low_mean": 0.0012755101779475808, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0033956149709410965, "entropy": 0.30025866255164146, "epoch": 0.00072, "grad_norm": 1.1591134071350098, "kl": 0.11754721309989691, "learning_rate": 9.999999904054014e-06, "loss": -0.086, "step": 72, "step_time": 6.086873343985644 }, { "clip_ratio/high_max": 0.0027777778450399637, "clip_ratio/high_mean": 0.0013888889225199819, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013888889225199819, "completions/clipped_ratio": 0.0, "completions/max_length": 1741.0, "completions/max_terminated_length": 1741.0, "completions/mean_length": 1630.4375, "completions/mean_terminated_length": 1630.4375, "completions/min_length": 1518.0, "completions/min_terminated_length": 1518.0, "entropy": 0.2578518223017454, "epoch": 0.00073, "frac_reward_zero_std": 0.0, "grad_norm": 0.9294137358665466, "kl": 0.1159139471128583, "learning_rate": 9.999999898649649e-06, "loss": -0.1298, "num_tokens": 2706686.0, "reward": 6.557687759399414, "reward_std": 8.066841125488281, "rewards/rollout_reward_func/mean": 6.557687759399414, "rewards/rollout_reward_func/std": 9.539935111999512, "sampling/importance_sampling_ratio/max": 1.8169114589691162, "sampling/importance_sampling_ratio/mean": 0.963904857635498, "sampling/importance_sampling_ratio/min": 0.5572739243507385, "sampling/sampling_logp_difference/max": 0.6680850982666016, "sampling/sampling_logp_difference/mean": 0.025495607405900955, "step": 73, "step_time": 16.83965299000556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0013020833721384406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013020833721384406, "entropy": 0.25771068409085274, "epoch": 0.00074, "grad_norm": 0.8422790765762329, "kl": 0.11813578475266695, "learning_rate": 9.99999989309722e-06, "loss": -0.133, "step": 74, "step_time": 6.587405835976824 }, { "clip_ratio/high_max": 0.005376965738832951, "clip_ratio/high_mean": 0.0026884828694164753, "clip_ratio/low_mean": 0.0013586956774815917, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004047178546898067, "completions/clipped_ratio": 0.0, "completions/max_length": 1847.0, "completions/max_terminated_length": 1847.0, "completions/mean_length": 1654.15625, "completions/mean_terminated_length": 1654.15625, "completions/min_length": 1500.0, "completions/min_terminated_length": 1500.0, "entropy": 0.29320803470909595, "epoch": 0.00075, "frac_reward_zero_std": 0.0, "grad_norm": 1.0994460582733154, "kl": 0.12737912870943546, "learning_rate": 9.999999887396725e-06, "loss": -0.1709, "num_tokens": 2781272.0, "reward": 7.248015403747559, "reward_std": 7.882260799407959, "rewards/rollout_reward_func/mean": 7.248015403747559, "rewards/rollout_reward_func/std": 9.155941009521484, "sampling/importance_sampling_ratio/max": 1.5756421089172363, "sampling/importance_sampling_ratio/mean": 0.870418906211853, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8544964790344238, "sampling/sampling_logp_difference/mean": 0.025140468031167984, "step": 75, "step_time": 17.744673141016392 }, { "clip_ratio/high_max": 0.0026041667442768812, "clip_ratio/high_mean": 0.0013020833721384406, "clip_ratio/low_mean": 0.0013586956774815917, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026607790496200323, "entropy": 0.29205809719860554, "epoch": 0.00076, "grad_norm": 1.0844976902008057, "kl": 0.13281533867120743, "learning_rate": 9.999999881548166e-06, "loss": -0.1705, "step": 76, "step_time": 6.362084027990932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0008445946150459349, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0008445946150459349, "completions/clipped_ratio": 0.0, "completions/max_length": 1840.0, "completions/max_terminated_length": 1840.0, "completions/mean_length": 1649.84375, "completions/mean_terminated_length": 1649.84375, "completions/min_length": 888.0, "completions/min_terminated_length": 888.0, "entropy": 0.39790954254567623, "epoch": 0.00077, "frac_reward_zero_std": 0.0, "grad_norm": 1.8977450132369995, "kl": 0.11336077656596899, "learning_rate": 9.999999875551542e-06, "loss": 0.3828, "num_tokens": 2855851.0, "reward": 5.79808235168457, "reward_std": 6.402070999145508, "rewards/rollout_reward_func/mean": 5.79808235168457, "rewards/rollout_reward_func/std": 8.75307559967041, "sampling/importance_sampling_ratio/max": 2.0286850929260254, "sampling/importance_sampling_ratio/mean": 1.0433138608932495, "sampling/importance_sampling_ratio/min": 0.4457562565803528, "sampling/sampling_logp_difference/max": 0.5284347534179688, "sampling/sampling_logp_difference/mean": 0.02805529534816742, "step": 77, "step_time": 17.519357715020305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0018862613360397518, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0018862613360397518, "entropy": 0.39548420906066895, "epoch": 0.00078, "grad_norm": 1.6799070835113525, "kl": 0.12623973563313484, "learning_rate": 9.999999869406853e-06, "loss": 0.3794, "step": 78, "step_time": 6.891301669020322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0010245901066809893, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010245901066809893, "completions/clipped_ratio": 0.0, "completions/max_length": 1698.0, "completions/max_terminated_length": 1698.0, "completions/mean_length": 1605.4375, "completions/mean_terminated_length": 1605.4375, "completions/min_length": 896.0, "completions/min_terminated_length": 896.0, "entropy": 0.3090809974819422, "epoch": 0.00079, "frac_reward_zero_std": 0.0, "grad_norm": 0.6590104699134827, "kl": 0.1494432920590043, "learning_rate": 9.999999863114101e-06, "loss": 0.0797, "num_tokens": 2929100.0, "reward": 4.982916831970215, "reward_std": 8.33253288269043, "rewards/rollout_reward_func/mean": 4.982916831970215, "rewards/rollout_reward_func/std": 8.954483032226562, "sampling/importance_sampling_ratio/max": 1.8613463640213013, "sampling/importance_sampling_ratio/mean": 0.895674467086792, "sampling/importance_sampling_ratio/min": 0.21810315549373627, "sampling/sampling_logp_difference/max": 0.8440449237823486, "sampling/sampling_logp_difference/mean": 0.03305047005414963, "step": 79, "step_time": 16.90107941000315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005122950533404946, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005122950533404946, "entropy": 0.3062464576214552, "epoch": 0.0008, "grad_norm": 1.4444913864135742, "kl": 0.17543433513492346, "learning_rate": 9.99999985667328e-06, "loss": 0.0795, "step": 80, "step_time": 6.047317050979473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1845.0, "completions/max_terminated_length": 1845.0, "completions/mean_length": 1627.9375, "completions/mean_terminated_length": 1627.9375, "completions/min_length": 1333.0, "completions/min_terminated_length": 1333.0, "entropy": 0.3245549853891134, "epoch": 0.00081, "frac_reward_zero_std": 0.0, "grad_norm": 1.2258262634277344, "kl": 0.13559613469988108, "learning_rate": 9.999999850084397e-06, "loss": -0.0901, "num_tokens": 3003526.0, "reward": 6.662212371826172, "reward_std": 6.930264472961426, "rewards/rollout_reward_func/mean": 6.662212371826172, "rewards/rollout_reward_func/std": 8.497859954833984, "sampling/importance_sampling_ratio/max": 1.7074333429336548, "sampling/importance_sampling_ratio/mean": 0.885154664516449, "sampling/importance_sampling_ratio/min": 4.581072324326245e-14, "sampling/sampling_logp_difference/max": 21.803922653198242, "sampling/sampling_logp_difference/mean": 0.12069699168205261, "step": 81, "step_time": 17.554498502999195 }, { "clip_ratio/high_max": 0.0020491802133619785, "clip_ratio/high_mean": 0.0010245901066809893, "clip_ratio/low_mean": 0.0013888889225199819, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002413479029200971, "entropy": 0.3242450263351202, "epoch": 0.00082, "grad_norm": 1.1592916250228882, "kl": 0.13859413657337427, "learning_rate": 9.99999984334745e-06, "loss": -0.0937, "step": 82, "step_time": 6.393739366001682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1870.0, "completions/max_terminated_length": 1870.0, "completions/mean_length": 1279.65625, "completions/mean_terminated_length": 1279.65625, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.33396398834884167, "epoch": 0.00083, "frac_reward_zero_std": 0.0, "grad_norm": 1.7327684164047241, "kl": 0.14556773751974106, "learning_rate": 9.999999836462437e-06, "loss": 0.1993, "num_tokens": 3065604.0, "reward": 3.486565351486206, "reward_std": 5.676440238952637, "rewards/rollout_reward_func/mean": 3.486565351486206, "rewards/rollout_reward_func/std": 6.540695667266846, "sampling/importance_sampling_ratio/max": 1.6764583587646484, "sampling/importance_sampling_ratio/mean": 0.9743399620056152, "sampling/importance_sampling_ratio/min": 0.35151007771492004, "sampling/sampling_logp_difference/max": 0.8574001789093018, "sampling/sampling_logp_difference/mean": 0.03397724777460098, "step": 83, "step_time": 16.76093037199462 }, { "clip_ratio/high_max": 0.0027173913549631834, "clip_ratio/high_mean": 0.0013586956774815917, "clip_ratio/low_mean": 0.002016128972172737, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003374824649654329, "entropy": 0.3327972814440727, "epoch": 0.00084, "grad_norm": 1.561924695968628, "kl": 0.14696736261248589, "learning_rate": 9.99999982942936e-06, "loss": 0.193, "step": 84, "step_time": 6.430620785016799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0020491802133619785, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020491802133619785, "completions/clipped_ratio": 0.0, "completions/max_length": 1727.0, "completions/max_terminated_length": 1727.0, "completions/mean_length": 1640.84375, "completions/mean_terminated_length": 1640.84375, "completions/min_length": 1364.0, "completions/min_terminated_length": 1364.0, "entropy": 0.2599208876490593, "epoch": 0.00085, "frac_reward_zero_std": 0.0, "grad_norm": 1.0426123142242432, "kl": 0.13481817953288555, "learning_rate": 9.999999822248216e-06, "loss": 0.1288, "num_tokens": 3140753.0, "reward": 0.2642984390258789, "reward_std": 8.728519439697266, "rewards/rollout_reward_func/mean": 0.2642984390258789, "rewards/rollout_reward_func/std": 11.526087760925293, "sampling/importance_sampling_ratio/max": 1.902886986732483, "sampling/importance_sampling_ratio/mean": 1.0716423988342285, "sampling/importance_sampling_ratio/min": 0.26088374853134155, "sampling/sampling_logp_difference/max": 0.506378173828125, "sampling/sampling_logp_difference/mean": 0.022509340196847916, "step": 85, "step_time": 16.673425905013573 }, { "clip_ratio/high_max": 0.0027173913549631834, "clip_ratio/high_mean": 0.0013586956774815917, "clip_ratio/low_mean": 0.003073770320042968, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004432465881109238, "entropy": 0.2595645170658827, "epoch": 0.00086, "grad_norm": 1.1238845586776733, "kl": 0.13921704050153494, "learning_rate": 9.999999814919009e-06, "loss": 0.1288, "step": 86, "step_time": 6.139811241038842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0026041667442768812, "clip_ratio/low_min": 0.0026041667442768812, "clip_ratio/region_mean": 0.0026041667442768812, "completions/clipped_ratio": 0.0, "completions/max_length": 1667.0, "completions/max_terminated_length": 1667.0, "completions/mean_length": 1608.84375, "completions/mean_terminated_length": 1608.84375, "completions/min_length": 1506.0, "completions/min_terminated_length": 1506.0, "entropy": 0.2851357739418745, "epoch": 0.00087, "frac_reward_zero_std": 0.0, "grad_norm": 1.2657455205917358, "kl": 0.16311322059482336, "learning_rate": 9.999999807441738e-06, "loss": 0.1704, "num_tokens": 3213900.0, "reward": 0.9642136096954346, "reward_std": 8.448262214660645, "rewards/rollout_reward_func/mean": 0.9642136096954346, "rewards/rollout_reward_func/std": 9.740251541137695, "sampling/importance_sampling_ratio/max": 2.303440570831299, "sampling/importance_sampling_ratio/mean": 1.008488416671753, "sampling/importance_sampling_ratio/min": 0.5220088362693787, "sampling/sampling_logp_difference/max": 0.5648818016052246, "sampling/sampling_logp_difference/mean": 0.029840506613254547, "step": 87, "step_time": 16.657975489011733 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.0026041667442768812, "clip_ratio/low_mean": 0.0013020833721384406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003906250116415322, "entropy": 0.2877427823841572, "epoch": 0.00088, "grad_norm": 1.16610848903656, "kl": 0.15486302226781845, "learning_rate": 9.999999799816401e-06, "loss": 0.1703, "step": 88, "step_time": 6.467442798995762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0010245901066809893, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010245901066809893, "completions/clipped_ratio": 0.0, "completions/max_length": 1731.0, "completions/max_terminated_length": 1731.0, "completions/mean_length": 1620.21875, "completions/mean_terminated_length": 1620.21875, "completions/min_length": 1293.0, "completions/min_terminated_length": 1293.0, "entropy": 0.2654573004692793, "epoch": 0.00089, "frac_reward_zero_std": 0.0, "grad_norm": 1.1540342569351196, "kl": 0.1338475625962019, "learning_rate": 9.999999792042999e-06, "loss": -0.0721, "num_tokens": 3286692.0, "reward": 1.8022962808609009, "reward_std": 7.115427017211914, "rewards/rollout_reward_func/mean": 1.8022962808609009, "rewards/rollout_reward_func/std": 8.253561973571777, "sampling/importance_sampling_ratio/max": 1.8665374517440796, "sampling/importance_sampling_ratio/mean": 0.9927425384521484, "sampling/importance_sampling_ratio/min": 0.42673173546791077, "sampling/sampling_logp_difference/max": 0.5771679878234863, "sampling/sampling_logp_difference/mean": 0.025445295497775078, "step": 89, "step_time": 16.306627727026353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0013297871919348836, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013297871919348836, "entropy": 0.2705059368163347, "epoch": 0.0009, "grad_norm": 1.0495223999023438, "kl": 0.12033465970307589, "learning_rate": 9.999999784121533e-06, "loss": -0.073, "step": 90, "step_time": 6.124930074016447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1737.0, "completions/max_terminated_length": 1737.0, "completions/mean_length": 1618.09375, "completions/mean_terminated_length": 1618.09375, "completions/min_length": 1294.0, "completions/min_terminated_length": 1294.0, "entropy": 0.27524856105446815, "epoch": 0.00091, "frac_reward_zero_std": 0.0, "grad_norm": 0.8082772493362427, "kl": 0.12074475642293692, "learning_rate": 9.999999776052001e-06, "loss": 0.0233, "num_tokens": 3360405.0, "reward": -2.6570687294006348, "reward_std": 10.464418411254883, "rewards/rollout_reward_func/mean": -2.6570687294006348, "rewards/rollout_reward_func/std": 16.174386978149414, "sampling/importance_sampling_ratio/max": 1.9121888875961304, "sampling/importance_sampling_ratio/mean": 1.0011422634124756, "sampling/importance_sampling_ratio/min": 0.41476160287857056, "sampling/sampling_logp_difference/max": 0.9920110702514648, "sampling/sampling_logp_difference/mean": 0.029301125556230545, "step": 91, "step_time": 17.423839259994566 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 0.2806878611445427, "epoch": 0.00092, "grad_norm": 0.7378302216529846, "kl": 0.11060467921197414, "learning_rate": 9.999999767834406e-06, "loss": 0.0248, "step": 92, "step_time": 6.637053437996656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0017517236701678485, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017517236701678485, "completions/clipped_ratio": 0.03125, "completions/max_length": 1709.0, "completions/max_terminated_length": 1709.0, "completions/mean_length": 1628.5, "completions/mean_terminated_length": 1654.54833984375, "completions/min_length": 821.0, "completions/min_terminated_length": 1289.0, "entropy": 0.32333023101091385, "epoch": 0.00093, "frac_reward_zero_std": 0.0, "grad_norm": 1.4687355756759644, "kl": 0.08091159677132964, "learning_rate": 9.999999759468744e-06, "loss": -0.0079, "num_tokens": 3434956.0, "reward": -1.5648401975631714, "reward_std": 9.364018440246582, "rewards/rollout_reward_func/mean": -1.5648401975631714, "rewards/rollout_reward_func/std": 12.69128131866455, "sampling/importance_sampling_ratio/max": 2.1736202239990234, "sampling/importance_sampling_ratio/mean": 1.0720882415771484, "sampling/importance_sampling_ratio/min": 0.42230284214019775, "sampling/sampling_logp_difference/max": 0.6265373229980469, "sampling/sampling_logp_difference/mean": 0.0269186832010746, "step": 93, "step_time": 17.9028281270148 }, { "clip_ratio/high_max": 0.0020491802133619785, "clip_ratio/high_mean": 0.0010245901066809893, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010245901066809893, "entropy": 0.3269411735236645, "epoch": 0.00094, "grad_norm": 1.3381267786026, "kl": 0.07900093123316765, "learning_rate": 9.99999975095502e-06, "loss": -0.01, "step": 94, "step_time": 6.114363114000298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0010416667209938169, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010416667209938169, "completions/clipped_ratio": 0.0, "completions/max_length": 1785.0, "completions/max_terminated_length": 1785.0, "completions/mean_length": 1628.28125, "completions/mean_terminated_length": 1628.28125, "completions/min_length": 1462.0, "completions/min_terminated_length": 1462.0, "entropy": 0.3444838896393776, "epoch": 0.00095, "frac_reward_zero_std": 0.0, "grad_norm": 1.2317733764648438, "kl": 0.1189874792471528, "learning_rate": 9.999999742293229e-06, "loss": -0.0112, "num_tokens": 3508455.0, "reward": 2.241374969482422, "reward_std": 8.48141860961914, "rewards/rollout_reward_func/mean": 2.241374969482422, "rewards/rollout_reward_func/std": 9.46461009979248, "sampling/importance_sampling_ratio/max": 1.7729943990707397, "sampling/importance_sampling_ratio/mean": 0.9944843053817749, "sampling/importance_sampling_ratio/min": 0.5259365439414978, "sampling/sampling_logp_difference/max": 0.6193251609802246, "sampling/sampling_logp_difference/mean": 0.026563305407762527, "step": 95, "step_time": 17.76328463399841 }, { "clip_ratio/high_max": 0.009451662423089147, "clip_ratio/high_mean": 0.004725831211544573, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004725831211544573, "entropy": 0.3464929983019829, "epoch": 0.00096, "grad_norm": 1.1388624906539917, "kl": 0.10611155908554792, "learning_rate": 9.999999733483374e-06, "loss": -0.0146, "step": 96, "step_time": 6.71051817300031 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.0026041667442768812, "clip_ratio/low_mean": 0.0010245901066809893, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0036287568509578705, "completions/clipped_ratio": 0.0, "completions/max_length": 1761.0, "completions/max_terminated_length": 1761.0, "completions/mean_length": 1654.21875, "completions/mean_terminated_length": 1654.21875, "completions/min_length": 1451.0, "completions/min_terminated_length": 1451.0, "entropy": 0.3395773209631443, "epoch": 0.00097, "frac_reward_zero_std": 0.0, "grad_norm": 0.9018979668617249, "kl": 0.10827788151800632, "learning_rate": 9.999999724525454e-06, "loss": 0.0551, "num_tokens": 3583730.0, "reward": 1.534318208694458, "reward_std": 7.050787448883057, "rewards/rollout_reward_func/mean": 1.534318208694458, "rewards/rollout_reward_func/std": 9.857051849365234, "sampling/importance_sampling_ratio/max": 1.4822131395339966, "sampling/importance_sampling_ratio/mean": 0.9050658345222473, "sampling/importance_sampling_ratio/min": 0.424882709980011, "sampling/sampling_logp_difference/max": 0.47632741928100586, "sampling/sampling_logp_difference/mean": 0.024851003661751747, "step": 97, "step_time": 17.00762781201047 }, { "clip_ratio/high_max": 0.007812500232830644, "clip_ratio/high_mean": 0.005208333372138441, "clip_ratio/low_mean": 0.0010245901066809893, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00623292347881943, "entropy": 0.3411915861070156, "epoch": 0.00098, "grad_norm": 0.9022724032402039, "kl": 0.10402639769017696, "learning_rate": 9.99999971541947e-06, "loss": 0.0565, "step": 98, "step_time": 6.236799924998195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1844.0, "completions/max_terminated_length": 1844.0, "completions/mean_length": 1643.40625, "completions/mean_terminated_length": 1643.40625, "completions/min_length": 1485.0, "completions/min_terminated_length": 1485.0, "entropy": 0.33190596103668213, "epoch": 0.00099, "frac_reward_zero_std": 0.0, "grad_norm": 0.8403921127319336, "kl": 0.12121224030852318, "learning_rate": 9.99999970616542e-06, "loss": 0.0154, "num_tokens": 3657753.0, "reward": 8.165836334228516, "reward_std": 9.233345985412598, "rewards/rollout_reward_func/mean": 8.165836334228516, "rewards/rollout_reward_func/std": 9.86094856262207, "sampling/importance_sampling_ratio/max": 2.0728392601013184, "sampling/importance_sampling_ratio/mean": 1.135339379310608, "sampling/importance_sampling_ratio/min": 0.46550998091697693, "sampling/sampling_logp_difference/max": 0.6066279411315918, "sampling/sampling_logp_difference/mean": 0.026337727904319763, "step": 99, "step_time": 17.489089494993095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0010416667209938169, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010416667209938169, "entropy": 0.3303120955824852, "epoch": 0.001, "grad_norm": 0.9656193852424622, "kl": 0.11541659291833639, "learning_rate": 9.999999696763307e-06, "loss": 0.0156, "step": 100, "step_time": 6.361514836011338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1740.0, "completions/max_terminated_length": 1740.0, "completions/mean_length": 1664.8125, "completions/mean_terminated_length": 1664.8125, "completions/min_length": 1521.0, "completions/min_terminated_length": 1521.0, "entropy": 0.3254339173436165, "epoch": 0.00101, "frac_reward_zero_std": 0.0, "grad_norm": 0.7036325931549072, "kl": 0.10238504502922297, "learning_rate": 9.999999687213127e-06, "loss": 0.0353, "num_tokens": 3732231.0, "reward": 8.350814819335938, "reward_std": 9.859748840332031, "rewards/rollout_reward_func/mean": 8.350814819335938, "rewards/rollout_reward_func/std": 11.146523475646973, "sampling/importance_sampling_ratio/max": 1.5631202459335327, "sampling/importance_sampling_ratio/mean": 0.9889119863510132, "sampling/importance_sampling_ratio/min": 0.35384273529052734, "sampling/sampling_logp_difference/max": 0.5236930847167969, "sampling/sampling_logp_difference/mean": 0.027563437819480896, "step": 101, "step_time": 18.00494698897819 }, { "clip_ratio/high_max": 0.0020491802133619785, "clip_ratio/high_mean": 0.0010245901066809893, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010245901066809893, "entropy": 0.32405535876750946, "epoch": 0.00102, "grad_norm": 0.7230035066604614, "kl": 0.10036924388259649, "learning_rate": 9.999999677514885e-06, "loss": 0.0339, "step": 102, "step_time": 6.152818701986689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.002354377298615873, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002354377298615873, "completions/clipped_ratio": 0.0, "completions/max_length": 1766.0, "completions/max_terminated_length": 1766.0, "completions/mean_length": 1667.8125, "completions/mean_terminated_length": 1667.8125, "completions/min_length": 1477.0, "completions/min_terminated_length": 1477.0, "entropy": 0.4009552225470543, "epoch": 0.00103, "frac_reward_zero_std": 0.0, "grad_norm": 0.9087545275688171, "kl": 0.12612003181129694, "learning_rate": 9.999999667668576e-06, "loss": -0.0472, "num_tokens": 3807361.0, "reward": 5.5272603034973145, "reward_std": 9.944602966308594, "rewards/rollout_reward_func/mean": 5.5272603034973145, "rewards/rollout_reward_func/std": 10.936971664428711, "sampling/importance_sampling_ratio/max": 1.545738935470581, "sampling/importance_sampling_ratio/mean": 0.892126739025116, "sampling/importance_sampling_ratio/min": 0.1905590444803238, "sampling/sampling_logp_difference/max": 1.192850947380066, "sampling/sampling_logp_difference/mean": 0.03049352765083313, "step": 103, "step_time": 17.30795979400864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0036564606707543135, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0036564606707543135, "entropy": 0.39822786673903465, "epoch": 0.00104, "grad_norm": 0.8702638149261475, "kl": 0.12465381342917681, "learning_rate": 9.999999657674202e-06, "loss": -0.0482, "step": 104, "step_time": 6.719693032981013 }, { "clip_ratio/high_max": 0.0026041667442768812, "clip_ratio/high_mean": 0.0013020833721384406, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013020833721384406, "completions/clipped_ratio": 0.0, "completions/max_length": 1840.0, "completions/max_terminated_length": 1840.0, "completions/mean_length": 1691.875, "completions/mean_terminated_length": 1691.875, "completions/min_length": 1636.0, "completions/min_terminated_length": 1636.0, "entropy": 0.26598341576755047, "epoch": 0.00105, "frac_reward_zero_std": 0.0, "grad_norm": 1.09138023853302, "kl": 0.09533415175974369, "learning_rate": 9.999999647531763e-06, "loss": -0.0543, "num_tokens": 3882542.0, "reward": -1.102081298828125, "reward_std": 6.231112003326416, "rewards/rollout_reward_func/mean": -1.102081298828125, "rewards/rollout_reward_func/std": 9.735074996948242, "sampling/importance_sampling_ratio/max": 1.6031757593154907, "sampling/importance_sampling_ratio/mean": 1.0384677648544312, "sampling/importance_sampling_ratio/min": 0.5203094482421875, "sampling/sampling_logp_difference/max": 0.4077695608139038, "sampling/sampling_logp_difference/mean": 0.021985895931720734, "step": 105, "step_time": 17.579777111008298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0026909722946584225, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026909722946584225, "entropy": 0.2603845726698637, "epoch": 0.00106, "grad_norm": 0.9107802510261536, "kl": 0.10189682990312576, "learning_rate": 9.99999963724126e-06, "loss": -0.0572, "step": 106, "step_time": 6.361362586001633 }, { "clip_ratio/high_max": 0.002659574383869767, "clip_ratio/high_mean": 0.0013297871919348836, "clip_ratio/low_mean": 0.0013586956774815917, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026884828694164753, "completions/clipped_ratio": 0.0, "completions/max_length": 1702.0, "completions/max_terminated_length": 1702.0, "completions/mean_length": 1615.1875, "completions/mean_terminated_length": 1615.1875, "completions/min_length": 1475.0, "completions/min_terminated_length": 1475.0, "entropy": 0.2496139220893383, "epoch": 0.00107, "frac_reward_zero_std": 0.0, "grad_norm": 0.8104920387268066, "kl": 0.09701190888881683, "learning_rate": 9.999999626802692e-06, "loss": -0.0993, "num_tokens": 3955451.0, "reward": 2.1392765045166016, "reward_std": 5.099225044250488, "rewards/rollout_reward_func/mean": 2.1392765045166016, "rewards/rollout_reward_func/std": 5.51261568069458, "sampling/importance_sampling_ratio/max": 1.651798129081726, "sampling/importance_sampling_ratio/mean": 1.004228115081787, "sampling/importance_sampling_ratio/min": 0.253883421421051, "sampling/sampling_logp_difference/max": 0.6667094230651855, "sampling/sampling_logp_difference/mean": 0.022637609392404556, "step": 107, "step_time": 17.28257027400832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0013586956774815917, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013586956774815917, "entropy": 0.24570557288825512, "epoch": 0.00108, "grad_norm": 0.8204213976860046, "kl": 0.09996318724006414, "learning_rate": 9.99999961621606e-06, "loss": -0.0978, "step": 108, "step_time": 6.559291850018781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0027186761144548655, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0027186761144548655, "completions/clipped_ratio": 0.03125, "completions/max_length": 1834.0, "completions/max_terminated_length": 1834.0, "completions/mean_length": 1622.5625, "completions/mean_terminated_length": 1619.6773681640625, "completions/min_length": 1079.0, "completions/min_terminated_length": 1079.0, "entropy": 0.31464538536965847, "epoch": 0.00109, "frac_reward_zero_std": 0.0, "grad_norm": 1.4984983205795288, "kl": 0.10155944805592299, "learning_rate": 9.999999605481363e-06, "loss": 0.0856, "num_tokens": 4028743.0, "reward": 1.771266222000122, "reward_std": 14.21844482421875, "rewards/rollout_reward_func/mean": 1.771266222000122, "rewards/rollout_reward_func/std": 17.986845016479492, "sampling/importance_sampling_ratio/max": 1.7850298881530762, "sampling/importance_sampling_ratio/mean": 0.9863642454147339, "sampling/importance_sampling_ratio/min": 0.5179370641708374, "sampling/sampling_logp_difference/max": 1.1239013671875, "sampling/sampling_logp_difference/mean": 0.030838951468467712, "step": 109, "step_time": 17.896458756003994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.008837935631163418, "clip_ratio/low_min": 0.002659574383869767, "clip_ratio/region_mean": 0.008837935631163418, "entropy": 0.31012305431067944, "epoch": 0.0011, "grad_norm": 1.3933000564575195, "kl": 0.11323428712785244, "learning_rate": 9.999999594598602e-06, "loss": 0.0852, "step": 110, "step_time": 6.33158524701139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0010245901066809893, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010245901066809893, "completions/clipped_ratio": 0.0, "completions/max_length": 1783.0, "completions/max_terminated_length": 1783.0, "completions/mean_length": 1608.21875, "completions/mean_terminated_length": 1608.21875, "completions/min_length": 1077.0, "completions/min_terminated_length": 1077.0, "entropy": 0.24848720617592335, "epoch": 0.00111, "frac_reward_zero_std": 0.0, "grad_norm": 0.9933966398239136, "kl": 0.12310208007693291, "learning_rate": 9.999999583567774e-06, "loss": -0.0856, "num_tokens": 4101838.0, "reward": 6.407966613769531, "reward_std": 8.375419616699219, "rewards/rollout_reward_func/mean": 6.407966613769531, "rewards/rollout_reward_func/std": 9.31225299835205, "sampling/importance_sampling_ratio/max": 2.041457414627075, "sampling/importance_sampling_ratio/mean": 0.9499837756156921, "sampling/importance_sampling_ratio/min": 0.32322558760643005, "sampling/sampling_logp_difference/max": 0.5667493343353271, "sampling/sampling_logp_difference/mean": 0.023919325321912766, "step": 111, "step_time": 16.47657907998655 }, { "clip_ratio/high_max": 0.0026041667442768812, "clip_ratio/high_mean": 0.0013020833721384406, "clip_ratio/low_mean": 0.00232667347881943, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0036287568509578705, "entropy": 0.24526936002075672, "epoch": 0.00112, "grad_norm": 0.7821959853172302, "kl": 0.12570339441299438, "learning_rate": 9.999999572388884e-06, "loss": -0.0887, "step": 112, "step_time": 6.6094277989905095 }, { "clip_ratio/high_max": 0.005263741128146648, "clip_ratio/high_mean": 0.002631870564073324, "clip_ratio/low_mean": 0.0004111842135898769, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003043054777663201, "completions/clipped_ratio": 0.03125, "completions/max_length": 2003.0, "completions/max_terminated_length": 1898.0, "completions/mean_length": 1734.4375, "completions/mean_terminated_length": 1725.774169921875, "completions/min_length": 1640.0, "completions/min_terminated_length": 1640.0, "entropy": 0.2711036931723356, "epoch": 0.00113, "frac_reward_zero_std": 0.0, "grad_norm": 0.8173717260360718, "kl": 0.07816016720607877, "learning_rate": 9.999999561061925e-06, "loss": -0.0737, "num_tokens": 4179239.0, "reward": 6.637342929840088, "reward_std": 5.497828960418701, "rewards/rollout_reward_func/mean": 6.637342929840088, "rewards/rollout_reward_func/std": 13.010616302490234, "sampling/importance_sampling_ratio/max": 2.037193536758423, "sampling/importance_sampling_ratio/mean": 0.9971824288368225, "sampling/importance_sampling_ratio/min": 0.414572536945343, "sampling/sampling_logp_difference/max": 0.6167750358581543, "sampling/sampling_logp_difference/mean": 0.027510441839694977, "step": 113, "step_time": 19.215131983015453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0026041667442768812, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "entropy": 0.2679080627858639, "epoch": 0.00114, "grad_norm": 0.8090457320213318, "kl": 0.08259443053975701, "learning_rate": 9.999999549586906e-06, "loss": -0.0763, "step": 114, "step_time": 6.734361608003383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0013020833721384406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013020833721384406, "completions/clipped_ratio": 0.0, "completions/max_length": 1849.0, "completions/max_terminated_length": 1849.0, "completions/mean_length": 1721.09375, "completions/mean_terminated_length": 1721.09375, "completions/min_length": 1671.0, "completions/min_terminated_length": 1671.0, "entropy": 0.2386282328516245, "epoch": 0.00115, "frac_reward_zero_std": 0.0, "grad_norm": 0.9069452881813049, "kl": 0.12694119848310947, "learning_rate": 9.99999953796382e-06, "loss": -0.0437, "num_tokens": 4256213.0, "reward": 2.982818126678467, "reward_std": 4.084477424621582, "rewards/rollout_reward_func/mean": 2.982818126678467, "rewards/rollout_reward_func/std": 4.399836540222168, "sampling/importance_sampling_ratio/max": 1.6433260440826416, "sampling/importance_sampling_ratio/mean": 1.0018846988677979, "sampling/importance_sampling_ratio/min": 0.40104833245277405, "sampling/sampling_logp_difference/max": 0.8438625335693359, "sampling/sampling_logp_difference/mean": 0.03200629726052284, "step": 115, "step_time": 17.391304989010678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0026041667442768812, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "entropy": 0.2352404110133648, "epoch": 0.00116, "grad_norm": 0.978081226348877, "kl": 0.13185255322605371, "learning_rate": 9.99999952619267e-06, "loss": -0.0456, "step": 116, "step_time": 6.829499252009555 }, { "clip_ratio/high_max": 0.01003031269647181, "clip_ratio/high_mean": 0.005015156348235905, "clip_ratio/low_mean": 0.002659574383869767, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007674730732105672, "completions/clipped_ratio": 0.0, "completions/max_length": 1674.0, "completions/max_terminated_length": 1674.0, "completions/mean_length": 1627.6875, "completions/mean_terminated_length": 1627.6875, "completions/min_length": 1457.0, "completions/min_terminated_length": 1457.0, "entropy": 0.22785228863358498, "epoch": 0.00117, "frac_reward_zero_std": 0.0, "grad_norm": 1.354264736175537, "kl": 0.19312882982194424, "learning_rate": 9.999999514273453e-06, "loss": -0.0891, "num_tokens": 4330567.0, "reward": 4.647030830383301, "reward_std": 7.999951362609863, "rewards/rollout_reward_func/mean": 4.647030830383301, "rewards/rollout_reward_func/std": 9.646321296691895, "sampling/importance_sampling_ratio/max": 1.5326151847839355, "sampling/importance_sampling_ratio/mean": 0.998310923576355, "sampling/importance_sampling_ratio/min": 0.2049121856689453, "sampling/sampling_logp_difference/max": 1.0557575225830078, "sampling/sampling_logp_difference/mean": 0.03139008581638336, "step": 117, "step_time": 16.626296731992625 }, { "clip_ratio/high_max": 0.007368328981101513, "clip_ratio/high_mean": 0.0036841644905507565, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0036841644905507565, "entropy": 0.22823701053857803, "epoch": 0.00118, "grad_norm": 1.0613915920257568, "kl": 0.17982678581029177, "learning_rate": 9.999999502206173e-06, "loss": -0.0909, "step": 118, "step_time": 5.96750666697335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1645.0, "completions/max_terminated_length": 1645.0, "completions/mean_length": 1609.4375, "completions/mean_terminated_length": 1609.4375, "completions/min_length": 1506.0, "completions/min_terminated_length": 1506.0, "entropy": 0.2393393199890852, "epoch": 0.00119, "frac_reward_zero_std": 0.0, "grad_norm": 1.0243966579437256, "kl": 0.10230187606066465, "learning_rate": 9.999999489990828e-06, "loss": -0.3952, "num_tokens": 4403634.0, "reward": 8.96430778503418, "reward_std": 7.039121150970459, "rewards/rollout_reward_func/mean": 8.96430778503418, "rewards/rollout_reward_func/std": 9.868720054626465, "sampling/importance_sampling_ratio/max": 2.210692882537842, "sampling/importance_sampling_ratio/mean": 1.0681219100952148, "sampling/importance_sampling_ratio/min": 0.5256638526916504, "sampling/sampling_logp_difference/max": 0.5038142204284668, "sampling/sampling_logp_difference/mean": 0.02815105952322483, "step": 119, "step_time": 16.3269175930036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0013297871919348836, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013297871919348836, "entropy": 0.23971233516931534, "epoch": 0.0012, "grad_norm": 1.0055460929870605, "kl": 0.10329719912260771, "learning_rate": 9.999999477627418e-06, "loss": -0.3965, "step": 120, "step_time": 6.424562390995561 }, { "clip_ratio/high_max": 0.002659574383869767, "clip_ratio/high_mean": 0.0013297871919348836, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013297871919348836, "completions/clipped_ratio": 0.0, "completions/max_length": 1722.0, "completions/max_terminated_length": 1722.0, "completions/mean_length": 1525.25, "completions/mean_terminated_length": 1525.25, "completions/min_length": 1103.0, "completions/min_terminated_length": 1103.0, "entropy": 0.24542769230902195, "epoch": 0.00121, "frac_reward_zero_std": 0.0, "grad_norm": 0.9047726392745972, "kl": 0.12966847606003284, "learning_rate": 9.999999465115944e-06, "loss": -0.0873, "num_tokens": 4473574.0, "reward": 14.91907024383545, "reward_std": 18.090713500976562, "rewards/rollout_reward_func/mean": 14.91907024383545, "rewards/rollout_reward_func/std": 28.93967628479004, "sampling/importance_sampling_ratio/max": 2.1593284606933594, "sampling/importance_sampling_ratio/mean": 1.0198925733566284, "sampling/importance_sampling_ratio/min": 0.27339673042297363, "sampling/sampling_logp_difference/max": 0.5622825622558594, "sampling/sampling_logp_difference/mean": 0.030078621581196785, "step": 121, "step_time": 16.63385564100463 }, { "clip_ratio/high_max": 0.004708754597231746, "clip_ratio/high_mean": 0.002354377298615873, "clip_ratio/low_mean": 0.002354377298615873, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004708754597231746, "entropy": 0.24422423727810383, "epoch": 0.00122, "grad_norm": 0.9230634570121765, "kl": 0.12887399271130562, "learning_rate": 9.999999452456404e-06, "loss": -0.0899, "step": 122, "step_time": 6.5959602480288595 }, { "clip_ratio/high_max": 0.002659574383869767, "clip_ratio/high_mean": 0.0013297871919348836, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013297871919348836, "completions/clipped_ratio": 0.0, "completions/max_length": 1766.0, "completions/max_terminated_length": 1766.0, "completions/mean_length": 1712.09375, "completions/mean_terminated_length": 1712.09375, "completions/min_length": 1552.0, "completions/min_terminated_length": 1552.0, "entropy": 0.24614222906529903, "epoch": 0.00123, "frac_reward_zero_std": 0.0, "grad_norm": 1.2284295558929443, "kl": 0.12034551799297333, "learning_rate": 9.9999994396488e-06, "loss": -0.017, "num_tokens": 4549589.0, "reward": 4.790639877319336, "reward_std": 5.642117023468018, "rewards/rollout_reward_func/mean": 4.790639877319336, "rewards/rollout_reward_func/std": 7.868781566619873, "sampling/importance_sampling_ratio/max": 1.7135753631591797, "sampling/importance_sampling_ratio/mean": 0.9457619190216064, "sampling/importance_sampling_ratio/min": 3.679141786729176e-18, "sampling/sampling_logp_difference/max": 22.29609489440918, "sampling/sampling_logp_difference/mean": 0.08050332963466644, "step": 123, "step_time": 17.08274853398325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0013888889225199819, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013888889225199819, "entropy": 0.2443638127297163, "epoch": 0.00124, "grad_norm": 1.1770434379577637, "kl": 0.1269354922696948, "learning_rate": 9.999999426693132e-06, "loss": -0.0173, "step": 124, "step_time": 6.685065648998716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0020491802133619785, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020491802133619785, "completions/clipped_ratio": 0.0, "completions/max_length": 1735.0, "completions/max_terminated_length": 1735.0, "completions/mean_length": 1578.28125, "completions/mean_terminated_length": 1578.28125, "completions/min_length": 1263.0, "completions/min_terminated_length": 1263.0, "entropy": 0.24355297908186913, "epoch": 0.00125, "frac_reward_zero_std": 0.0, "grad_norm": 0.9832190871238708, "kl": 0.1830962970852852, "learning_rate": 9.999999413589398e-06, "loss": -0.0566, "num_tokens": 4621552.0, "reward": 13.290432929992676, "reward_std": 15.10336685180664, "rewards/rollout_reward_func/mean": 13.290432929992676, "rewards/rollout_reward_func/std": 26.753387451171875, "sampling/importance_sampling_ratio/max": 2.4743406772613525, "sampling/importance_sampling_ratio/mean": 1.0138651132583618, "sampling/importance_sampling_ratio/min": 1.5063109832334364e-16, "sampling/sampling_logp_difference/max": 23.313261032104492, "sampling/sampling_logp_difference/mean": 0.07898054271936417, "step": 125, "step_time": 16.34186544599652 }, { "clip_ratio/high_max": 0.005263741128146648, "clip_ratio/high_mean": 0.002631870564073324, "clip_ratio/low_mean": 0.003126766299828887, "clip_ratio/low_min": 0.0020491802133619785, "clip_ratio/region_mean": 0.005758636863902211, "entropy": 0.2434285506606102, "epoch": 0.00126, "grad_norm": 0.8846249580383301, "kl": 0.19987506233155727, "learning_rate": 9.999999400337598e-06, "loss": -0.0584, "step": 126, "step_time": 6.651914462010609 }, { "clip_ratio/high_max": 0.0027777778450399637, "clip_ratio/high_mean": 0.0013888889225199819, "clip_ratio/low_mean": 0.0013297871919348836, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0027186761144548655, "completions/clipped_ratio": 0.0, "completions/max_length": 1873.0, "completions/max_terminated_length": 1873.0, "completions/mean_length": 1725.0, "completions/mean_terminated_length": 1725.0, "completions/min_length": 1493.0, "completions/min_terminated_length": 1493.0, "entropy": 0.24008044600486755, "epoch": 0.00127, "frac_reward_zero_std": 0.0, "grad_norm": 0.6875267624855042, "kl": 0.21041650138795376, "learning_rate": 9.999999386937737e-06, "loss": -0.1321, "num_tokens": 4698838.0, "reward": 6.591200351715088, "reward_std": 7.070637226104736, "rewards/rollout_reward_func/mean": 6.591200351715088, "rewards/rollout_reward_func/std": 10.466858863830566, "sampling/importance_sampling_ratio/max": 2.6160686016082764, "sampling/importance_sampling_ratio/mean": 1.0830183029174805, "sampling/importance_sampling_ratio/min": 0.2268618643283844, "sampling/sampling_logp_difference/max": 1.0123896598815918, "sampling/sampling_logp_difference/mean": 0.041539035737514496, "step": 127, "step_time": 17.48914127901662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0013586956774815917, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013586956774815917, "entropy": 0.23946300335228443, "epoch": 0.00128, "grad_norm": 0.7259523272514343, "kl": 0.2037885021418333, "learning_rate": 9.999999373389809e-06, "loss": -0.1332, "step": 128, "step_time": 6.961486543019419 }, { "clip_ratio/high_max": 0.005155187100172043, "clip_ratio/high_mean": 0.0025775935500860214, "clip_ratio/low_mean": 0.0013020833721384406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003879676922224462, "completions/clipped_ratio": 0.0, "completions/max_length": 1828.0, "completions/max_terminated_length": 1828.0, "completions/mean_length": 1687.03125, "completions/mean_terminated_length": 1687.03125, "completions/min_length": 1339.0, "completions/min_terminated_length": 1339.0, "entropy": 0.2051472868770361, "epoch": 0.00129, "frac_reward_zero_std": 0.0, "grad_norm": 0.8209474682807922, "kl": 0.15841065254062414, "learning_rate": 9.999999359693816e-06, "loss": -0.1279, "num_tokens": 4773987.0, "reward": 9.433834075927734, "reward_std": 8.818329811096191, "rewards/rollout_reward_func/mean": 9.433834075927734, "rewards/rollout_reward_func/std": 9.546534538269043, "sampling/importance_sampling_ratio/max": 1.7890204191207886, "sampling/importance_sampling_ratio/mean": 1.0900124311447144, "sampling/importance_sampling_ratio/min": 0.3541417121887207, "sampling/sampling_logp_difference/max": 0.9469351768493652, "sampling/sampling_logp_difference/mean": 0.037607982754707336, "step": 129, "step_time": 17.566188623997732 }, { "clip_ratio/high_max": 0.007759353844448924, "clip_ratio/high_mean": 0.003879676922224462, "clip_ratio/low_mean": 0.0013020833721384406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005181760294362903, "entropy": 0.20546288043260574, "epoch": 0.0013, "grad_norm": 0.8324758410453796, "kl": 0.1566485781222582, "learning_rate": 9.999999345849757e-06, "loss": -0.1287, "step": 130, "step_time": 6.886117387010017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004366357112303376, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004366357112303376, "completions/clipped_ratio": 0.0, "completions/max_length": 1759.0, "completions/max_terminated_length": 1759.0, "completions/mean_length": 1534.6875, "completions/mean_terminated_length": 1534.6875, "completions/min_length": 1057.0, "completions/min_terminated_length": 1057.0, "entropy": 0.2388804368674755, "epoch": 0.00131, "frac_reward_zero_std": 0.0, "grad_norm": 1.1745887994766235, "kl": 0.25405599270015955, "learning_rate": 9.999999331857635e-06, "loss": -0.1766, "num_tokens": 4844630.0, "reward": 17.314945220947266, "reward_std": 16.554960250854492, "rewards/rollout_reward_func/mean": 17.314945220947266, "rewards/rollout_reward_func/std": 27.98472023010254, "sampling/importance_sampling_ratio/max": 2.1109719276428223, "sampling/importance_sampling_ratio/mean": 0.995877742767334, "sampling/importance_sampling_ratio/min": 0.3491049110889435, "sampling/sampling_logp_difference/max": 0.9240174293518066, "sampling/sampling_logp_difference/mean": 0.04220716655254364, "step": 131, "step_time": 16.436990634014364 }, { "clip_ratio/high_max": 0.002659574383869767, "clip_ratio/high_mean": 0.0013297871919348836, "clip_ratio/low_mean": 0.0012755101779475808, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026052973698824644, "entropy": 0.23991331830620766, "epoch": 0.00132, "grad_norm": 1.0355007648468018, "kl": 0.24390931520611048, "learning_rate": 9.999999317717449e-06, "loss": -0.1797, "step": 132, "step_time": 6.174773411999922 }, { "clip_ratio/high_max": 0.007812500232830644, "clip_ratio/high_mean": 0.003906250116415322, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003906250116415322, "completions/clipped_ratio": 0.0, "completions/max_length": 1757.0, "completions/max_terminated_length": 1757.0, "completions/mean_length": 1668.03125, "completions/mean_terminated_length": 1668.03125, "completions/min_length": 665.0, "completions/min_terminated_length": 665.0, "entropy": 0.22819190844893456, "epoch": 0.00133, "frac_reward_zero_std": 0.0, "grad_norm": 0.8312011957168579, "kl": 0.18424110859632492, "learning_rate": 9.999999303429197e-06, "loss": 0.0278, "num_tokens": 4919791.0, "reward": 2.1089086532592773, "reward_std": 7.422804832458496, "rewards/rollout_reward_func/mean": 2.1089086532592773, "rewards/rollout_reward_func/std": 8.982881546020508, "sampling/importance_sampling_ratio/max": 1.8427815437316895, "sampling/importance_sampling_ratio/mean": 0.8756811618804932, "sampling/importance_sampling_ratio/min": 0.17287199199199677, "sampling/sampling_logp_difference/max": 0.8881862163543701, "sampling/sampling_logp_difference/mean": 0.04368170350790024, "step": 133, "step_time": 17.390132736982196 }, { "clip_ratio/high_max": 0.007812500232830644, "clip_ratio/high_mean": 0.003906250116415322, "clip_ratio/low_mean": 0.0013020833721384406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 0.23028352111577988, "epoch": 0.00134, "grad_norm": 0.8703155517578125, "kl": 0.18170421849936247, "learning_rate": 9.999999288992881e-06, "loss": 0.026, "step": 134, "step_time": 6.680072904986446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1714.0, "completions/max_terminated_length": 1714.0, "completions/mean_length": 1640.625, "completions/mean_terminated_length": 1640.625, "completions/min_length": 1574.0, "completions/min_terminated_length": 1574.0, "entropy": 0.19946525245904922, "epoch": 0.00135, "frac_reward_zero_std": 0.0, "grad_norm": 0.7146205306053162, "kl": 0.14251765795052052, "learning_rate": 9.999999274408498e-06, "loss": -0.3174, "num_tokens": 4993880.0, "reward": 8.279874801635742, "reward_std": 6.961740970611572, "rewards/rollout_reward_func/mean": 8.279874801635742, "rewards/rollout_reward_func/std": 7.880353927612305, "sampling/importance_sampling_ratio/max": 2.2946879863739014, "sampling/importance_sampling_ratio/mean": 1.0556809902191162, "sampling/importance_sampling_ratio/min": 0.32311511039733887, "sampling/sampling_logp_difference/max": 0.6104530096054077, "sampling/sampling_logp_difference/mean": 0.036065757274627686, "step": 135, "step_time": 16.398413172006258 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.0026041667442768812, "clip_ratio/low_mean": 0.0013586956774815917, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003962862421758473, "entropy": 0.20138211734592915, "epoch": 0.00136, "grad_norm": 0.8016093373298645, "kl": 0.14050656743347645, "learning_rate": 9.999999259676054e-06, "loss": -0.3175, "step": 136, "step_time": 6.125010264993762 }, { "clip_ratio/high_max": 0.0010080644860863686, "clip_ratio/high_mean": 0.0005040322430431843, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0005040322430431843, "completions/clipped_ratio": 0.0, "completions/max_length": 1852.0, "completions/max_terminated_length": 1852.0, "completions/mean_length": 1715.46875, "completions/mean_terminated_length": 1715.46875, "completions/min_length": 1553.0, "completions/min_terminated_length": 1553.0, "entropy": 0.31518380902707577, "epoch": 0.00137, "frac_reward_zero_std": 0.0, "grad_norm": 1.1627895832061768, "kl": 0.19766394328325987, "learning_rate": 9.999999244795544e-06, "loss": -0.2789, "num_tokens": 5070559.0, "reward": 2.825596332550049, "reward_std": 5.118865966796875, "rewards/rollout_reward_func/mean": 2.825596332550049, "rewards/rollout_reward_func/std": 8.292236328125, "sampling/importance_sampling_ratio/max": 2.1737070083618164, "sampling/importance_sampling_ratio/mean": 0.9759609699249268, "sampling/importance_sampling_ratio/min": 1.568257844475818e-17, "sampling/sampling_logp_difference/max": 13.170364379882812, "sampling/sampling_logp_difference/mean": 0.09027349948883057, "step": 137, "step_time": 18.228433351032436 }, { "clip_ratio/high_max": 0.0010080644860863686, "clip_ratio/high_mean": 0.0005040322430431843, "clip_ratio/low_mean": 0.0013297871919348836, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0018338194349780679, "entropy": 0.31584348157048225, "epoch": 0.00138, "grad_norm": 1.1142544746398926, "kl": 0.20045672729611397, "learning_rate": 9.999999229766967e-06, "loss": -0.2828, "step": 138, "step_time": 6.916452539982856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1861.0, "completions/max_terminated_length": 1861.0, "completions/mean_length": 1668.5625, "completions/mean_terminated_length": 1668.5625, "completions/min_length": 1477.0, "completions/min_terminated_length": 1477.0, "entropy": 0.27814466319978237, "epoch": 0.00139, "frac_reward_zero_std": 0.0, "grad_norm": 0.9855332374572754, "kl": 0.22340053506195545, "learning_rate": 9.999999214590327e-06, "loss": -0.0602, "num_tokens": 5145884.0, "reward": 4.645150184631348, "reward_std": 7.6181840896606445, "rewards/rollout_reward_func/mean": 4.645150184631348, "rewards/rollout_reward_func/std": 10.275853157043457, "sampling/importance_sampling_ratio/max": 2.1695244312286377, "sampling/importance_sampling_ratio/mean": 0.9515726566314697, "sampling/importance_sampling_ratio/min": 0.20832279324531555, "sampling/sampling_logp_difference/max": 0.743858814239502, "sampling/sampling_logp_difference/mean": 0.044110335409641266, "step": 139, "step_time": 16.74238319399592 }, { "clip_ratio/high_max": 0.002659574383869767, "clip_ratio/high_mean": 0.0013297871919348836, "clip_ratio/low_mean": 0.0014204545877873898, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0027502417797222733, "entropy": 0.28105359338223934, "epoch": 0.0014, "grad_norm": 0.9812045693397522, "kl": 0.21582742221653461, "learning_rate": 9.999999199265622e-06, "loss": -0.062, "step": 140, "step_time": 6.423543630007771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1657.0, "completions/max_terminated_length": 1657.0, "completions/mean_length": 1604.40625, "completions/mean_terminated_length": 1604.40625, "completions/min_length": 1475.0, "completions/min_terminated_length": 1475.0, "entropy": 0.24216235801577568, "epoch": 0.00141, "frac_reward_zero_std": 0.0, "grad_norm": 1.1123993396759033, "kl": 0.18741069734096527, "learning_rate": 9.999999183792851e-06, "loss": 0.0565, "num_tokens": 5218496.0, "reward": 8.09426498413086, "reward_std": 5.365090370178223, "rewards/rollout_reward_func/mean": 8.09426498413086, "rewards/rollout_reward_func/std": 6.0367326736450195, "sampling/importance_sampling_ratio/max": 1.9210368394851685, "sampling/importance_sampling_ratio/mean": 1.0942233800888062, "sampling/importance_sampling_ratio/min": 0.21610233187675476, "sampling/sampling_logp_difference/max": 0.9070119857788086, "sampling/sampling_logp_difference/mean": 0.03533536568284035, "step": 141, "step_time": 16.254843674018048 }, { "clip_ratio/high_max": 0.007257513701915741, "clip_ratio/high_mean": 0.006232923595234752, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006232923595234752, "entropy": 0.24454051814973354, "epoch": 0.00142, "grad_norm": 0.9005355834960938, "kl": 0.1877992246299982, "learning_rate": 9.999999168172017e-06, "loss": 0.056, "step": 142, "step_time": 6.412194520991761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1717.0, "completions/max_terminated_length": 1717.0, "completions/mean_length": 1644.1875, "completions/mean_terminated_length": 1644.1875, "completions/min_length": 1457.0, "completions/min_terminated_length": 1457.0, "entropy": 0.21233283914625645, "epoch": 0.00143, "frac_reward_zero_std": 0.0, "grad_norm": 0.6902234554290771, "kl": 0.16332794073969126, "learning_rate": 9.999999152403118e-06, "loss": -0.0615, "num_tokens": 5292838.0, "reward": 6.110971927642822, "reward_std": 5.937996864318848, "rewards/rollout_reward_func/mean": 6.110971927642822, "rewards/rollout_reward_func/std": 8.901442527770996, "sampling/importance_sampling_ratio/max": 2.2532308101654053, "sampling/importance_sampling_ratio/mean": 1.0682849884033203, "sampling/importance_sampling_ratio/min": 0.24916724860668182, "sampling/sampling_logp_difference/max": 0.8003082275390625, "sampling/sampling_logp_difference/mean": 0.04168836027383804, "step": 143, "step_time": 16.199017560007633 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.0026041667442768812, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "entropy": 0.2141868006438017, "epoch": 0.00144, "grad_norm": 0.5706713199615479, "kl": 0.15940899308770895, "learning_rate": 9.999999136486154e-06, "loss": -0.0609, "step": 144, "step_time": 6.12937252302072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1872.0, "completions/max_terminated_length": 1872.0, "completions/mean_length": 1734.84375, "completions/mean_terminated_length": 1734.84375, "completions/min_length": 1584.0, "completions/min_terminated_length": 1584.0, "entropy": 0.2970868647098541, "epoch": 0.00145, "frac_reward_zero_std": 0.0, "grad_norm": 1.3608251810073853, "kl": 0.19464401435106993, "learning_rate": 9.999999120421124e-06, "loss": -0.2549, "num_tokens": 5370014.0, "reward": 6.775051116943359, "reward_std": 8.484387397766113, "rewards/rollout_reward_func/mean": 6.775051116943359, "rewards/rollout_reward_func/std": 12.852544784545898, "sampling/importance_sampling_ratio/max": 2.917727470397949, "sampling/importance_sampling_ratio/mean": 1.3573473691940308, "sampling/importance_sampling_ratio/min": 0.1967896968126297, "sampling/sampling_logp_difference/max": 0.8622117042541504, "sampling/sampling_logp_difference/mean": 0.04172255098819733, "step": 145, "step_time": 17.778259722996154 }, { "clip_ratio/high_max": 0.005321558099240065, "clip_ratio/high_mean": 0.0026607790496200323, "clip_ratio/low_mean": 0.0010245901066809893, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0036853691563010216, "entropy": 0.2984684258699417, "epoch": 0.00146, "grad_norm": 1.1198410987854004, "kl": 0.20733815617859364, "learning_rate": 9.99999910420803e-06, "loss": -0.259, "step": 146, "step_time": 6.4600711969978875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0013020833721384406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013020833721384406, "completions/clipped_ratio": 0.0, "completions/max_length": 1757.0, "completions/max_terminated_length": 1757.0, "completions/mean_length": 1678.1875, "completions/mean_terminated_length": 1678.1875, "completions/min_length": 1581.0, "completions/min_terminated_length": 1581.0, "entropy": 0.24582931213080883, "epoch": 0.00147, "frac_reward_zero_std": 0.0, "grad_norm": 0.6891108751296997, "kl": 0.20601816847920418, "learning_rate": 9.999999087846873e-06, "loss": -0.3033, "num_tokens": 5446048.0, "reward": 1.3893117904663086, "reward_std": 4.353728771209717, "rewards/rollout_reward_func/mean": 1.3893117904663086, "rewards/rollout_reward_func/std": 6.944112777709961, "sampling/importance_sampling_ratio/max": 2.0792346000671387, "sampling/importance_sampling_ratio/mean": 1.0027928352355957, "sampling/importance_sampling_ratio/min": 0.28149667382240295, "sampling/sampling_logp_difference/max": 0.759207010269165, "sampling/sampling_logp_difference/mean": 0.045305777341127396, "step": 147, "step_time": 17.605765489992336 }, { "clip_ratio/high_max": 0.002659574383869767, "clip_ratio/high_mean": 0.0013297871919348836, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013297871919348836, "entropy": 0.24695717357099056, "epoch": 0.00148, "grad_norm": 0.662736177444458, "kl": 0.1956745833158493, "learning_rate": 9.99999907133765e-06, "loss": -0.3039, "step": 148, "step_time": 6.173672115997761 }, { "clip_ratio/high_max": 0.0026041667442768812, "clip_ratio/high_mean": 0.0013020833721384406, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013020833721384406, "completions/clipped_ratio": 0.0, "completions/max_length": 1841.0, "completions/max_terminated_length": 1841.0, "completions/mean_length": 1626.25, "completions/mean_terminated_length": 1626.25, "completions/min_length": 865.0, "completions/min_terminated_length": 865.0, "entropy": 0.2285515759140253, "epoch": 0.00149, "frac_reward_zero_std": 0.0, "grad_norm": 1.0919668674468994, "kl": 0.18257456459105015, "learning_rate": 9.99999905468036e-06, "loss": -0.2, "num_tokens": 5519832.0, "reward": 6.976448059082031, "reward_std": 7.517453193664551, "rewards/rollout_reward_func/mean": 6.976448059082031, "rewards/rollout_reward_func/std": 9.654501914978027, "sampling/importance_sampling_ratio/max": 1.721216082572937, "sampling/importance_sampling_ratio/mean": 0.919627845287323, "sampling/importance_sampling_ratio/min": 0.14148102700710297, "sampling/sampling_logp_difference/max": 0.9272639751434326, "sampling/sampling_logp_difference/mean": 0.034582577645778656, "step": 149, "step_time": 18.036939932004316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.22980872169137, "epoch": 0.0015, "grad_norm": 1.319507360458374, "kl": 0.19367970805615187, "learning_rate": 9.999999037875009e-06, "loss": -0.1991, "step": 150, "step_time": 6.322831986020901 }, { "clip_ratio/high_max": 0.0027173913549631834, "clip_ratio/high_mean": 0.0013586956774815917, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013586956774815917, "completions/clipped_ratio": 0.0, "completions/max_length": 1767.0, "completions/max_terminated_length": 1767.0, "completions/mean_length": 1634.8125, "completions/mean_terminated_length": 1634.8125, "completions/min_length": 1396.0, "completions/min_terminated_length": 1396.0, "entropy": 0.2720724530518055, "epoch": 0.00151, "frac_reward_zero_std": 0.0, "grad_norm": 1.5986058712005615, "kl": 0.25020872708410025, "learning_rate": 9.999999020921592e-06, "loss": -0.0659, "num_tokens": 5593794.0, "reward": 12.143424987792969, "reward_std": 9.994368553161621, "rewards/rollout_reward_func/mean": 12.143424987792969, "rewards/rollout_reward_func/std": 12.7567777633667, "sampling/importance_sampling_ratio/max": 2.3541791439056396, "sampling/importance_sampling_ratio/mean": 0.9175240993499756, "sampling/importance_sampling_ratio/min": 8.278394462990502e-12, "sampling/sampling_logp_difference/max": 22.627912521362305, "sampling/sampling_logp_difference/mean": 0.07575052976608276, "step": 151, "step_time": 16.726163999002893 }, { "clip_ratio/high_max": 0.005321558099240065, "clip_ratio/high_mean": 0.0026607790496200323, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026607790496200323, "entropy": 0.2742513306438923, "epoch": 0.00152, "grad_norm": 1.3204537630081177, "kl": 0.23389587551355362, "learning_rate": 9.999999003820108e-06, "loss": -0.0703, "step": 152, "step_time": 6.160953949991381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1770.0, "completions/max_terminated_length": 1770.0, "completions/mean_length": 1689.28125, "completions/mean_terminated_length": 1689.28125, "completions/min_length": 1589.0, "completions/min_terminated_length": 1589.0, "entropy": 0.29065221175551414, "epoch": 0.00153, "frac_reward_zero_std": 0.0, "grad_norm": 0.8678275346755981, "kl": 0.18727624788880348, "learning_rate": 9.999998986570562e-06, "loss": -0.1309, "num_tokens": 5669985.0, "reward": 7.734021186828613, "reward_std": 6.549450874328613, "rewards/rollout_reward_func/mean": 7.734021186828613, "rewards/rollout_reward_func/std": 8.855950355529785, "sampling/importance_sampling_ratio/max": 2.122692584991455, "sampling/importance_sampling_ratio/mean": 1.0235466957092285, "sampling/importance_sampling_ratio/min": 0.38856664299964905, "sampling/sampling_logp_difference/max": 0.5795457363128662, "sampling/sampling_logp_difference/mean": 0.034058578312397, "step": 153, "step_time": 16.870816470007412 }, { "clip_ratio/high_max": 0.006757934810593724, "clip_ratio/high_mean": 0.003378967405296862, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003378967405296862, "entropy": 0.2929100822657347, "epoch": 0.00154, "grad_norm": 0.7291151881217957, "kl": 0.17188291251659393, "learning_rate": 9.999998969172949e-06, "loss": -0.1323, "step": 154, "step_time": 6.232317937014159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0013020833721384406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013020833721384406, "completions/clipped_ratio": 0.03125, "completions/max_length": 1727.0, "completions/max_terminated_length": 1727.0, "completions/mean_length": 1650.25, "completions/mean_terminated_length": 1652.9354248046875, "completions/min_length": 1498.0, "completions/min_terminated_length": 1498.0, "entropy": 0.3370017632842064, "epoch": 0.00155, "frac_reward_zero_std": 0.0, "grad_norm": 1.4682841300964355, "kl": 0.17820005863904953, "learning_rate": 9.999998951627273e-06, "loss": 0.0149, "num_tokens": 5744414.0, "reward": 3.293747663497925, "reward_std": 9.363197326660156, "rewards/rollout_reward_func/mean": 3.293747663497925, "rewards/rollout_reward_func/std": 12.862122535705566, "sampling/importance_sampling_ratio/max": 2.353569269180298, "sampling/importance_sampling_ratio/mean": 0.960463285446167, "sampling/importance_sampling_ratio/min": 0.17280441522598267, "sampling/sampling_logp_difference/max": 0.6690106391906738, "sampling/sampling_logp_difference/mean": 0.03728917986154556, "step": 155, "step_time": 16.657229089993052 }, { "clip_ratio/high_max": 0.0030598959419876337, "clip_ratio/high_mean": 0.0015299479709938169, "clip_ratio/low_mean": 0.003906250116415322, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005436198087409139, "entropy": 0.33962738141417503, "epoch": 0.00156, "grad_norm": 0.732698917388916, "kl": 0.17041430436074734, "learning_rate": 9.999998933933532e-06, "loss": 0.0083, "step": 156, "step_time": 6.1049487620039145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0010245901066809893, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010245901066809893, "completions/clipped_ratio": 0.0, "completions/max_length": 1772.0, "completions/max_terminated_length": 1772.0, "completions/mean_length": 1688.0625, "completions/mean_terminated_length": 1688.0625, "completions/min_length": 1557.0, "completions/min_terminated_length": 1557.0, "entropy": 0.39255664497613907, "epoch": 0.00157, "frac_reward_zero_std": 0.0, "grad_norm": 1.0827445983886719, "kl": 0.1685182973742485, "learning_rate": 9.999998916091727e-06, "loss": 0.0612, "num_tokens": 5819187.0, "reward": 3.1269803047180176, "reward_std": 8.263084411621094, "rewards/rollout_reward_func/mean": 3.1269803047180176, "rewards/rollout_reward_func/std": 9.094642639160156, "sampling/importance_sampling_ratio/max": 2.122588872909546, "sampling/importance_sampling_ratio/mean": 1.1091370582580566, "sampling/importance_sampling_ratio/min": 0.27494925260543823, "sampling/sampling_logp_difference/max": 0.7097280025482178, "sampling/sampling_logp_difference/mean": 0.03713656961917877, "step": 157, "step_time": 16.517342315011774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0013020833721384406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013020833721384406, "entropy": 0.3940476328134537, "epoch": 0.00158, "grad_norm": 1.2380092144012451, "kl": 0.1552406121045351, "learning_rate": 9.999998898101855e-06, "loss": 0.0615, "step": 158, "step_time": 6.64079051297449 }, { "clip_ratio/high_max": 0.0008278145687654614, "clip_ratio/high_mean": 0.0004139072843827307, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004139072843827307, "completions/clipped_ratio": 0.0, "completions/max_length": 1846.0, "completions/max_terminated_length": 1846.0, "completions/mean_length": 1725.75, "completions/mean_terminated_length": 1725.75, "completions/min_length": 1581.0, "completions/min_terminated_length": 1581.0, "entropy": 0.4579268768429756, "epoch": 0.00159, "frac_reward_zero_std": 0.0, "grad_norm": 1.7341723442077637, "kl": 0.15266202483326197, "learning_rate": 9.99999887996392e-06, "loss": 0.0339, "num_tokens": 5896572.0, "reward": 2.1551430225372314, "reward_std": 9.15644645690918, "rewards/rollout_reward_func/mean": 2.1551430225372314, "rewards/rollout_reward_func/std": 9.456039428710938, "sampling/importance_sampling_ratio/max": 2.2081494331359863, "sampling/importance_sampling_ratio/mean": 1.0434294939041138, "sampling/importance_sampling_ratio/min": 0.18727818131446838, "sampling/sampling_logp_difference/max": 0.7376601696014404, "sampling/sampling_logp_difference/mean": 0.04232223704457283, "step": 159, "step_time": 18.386425847987994 }, { "clip_ratio/high_max": 0.00465334695763886, "clip_ratio/high_mean": 0.0027405807049944997, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0027405807049944997, "entropy": 0.4576542302966118, "epoch": 0.0016, "grad_norm": 1.8382275104522705, "kl": 0.1542828045785427, "learning_rate": 9.99999886167792e-06, "loss": 0.0305, "step": 160, "step_time": 6.393661024994799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1896.0, "completions/max_terminated_length": 1896.0, "completions/mean_length": 1440.375, "completions/mean_terminated_length": 1440.375, "completions/min_length": 589.0, "completions/min_terminated_length": 589.0, "entropy": 0.24292725790292025, "epoch": 0.00161, "frac_reward_zero_std": 0.25, "grad_norm": 0.47333717346191406, "kl": 0.15225913655012846, "learning_rate": 9.999998843243854e-06, "loss": -0.1122, "num_tokens": 5964686.0, "reward": 3.84615421295166, "reward_std": 5.886692047119141, "rewards/rollout_reward_func/mean": 3.84615421295166, "rewards/rollout_reward_func/std": 8.027098655700684, "sampling/importance_sampling_ratio/max": 1.734418272972107, "sampling/importance_sampling_ratio/mean": 0.9881309866905212, "sampling/importance_sampling_ratio/min": 0.1568344086408615, "sampling/sampling_logp_difference/max": 0.791522741317749, "sampling/sampling_logp_difference/mean": 0.035355255007743835, "step": 161, "step_time": 17.007406476011965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0010245901066809893, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010245901066809893, "entropy": 0.24078470468521118, "epoch": 0.00162, "grad_norm": 0.4367291033267975, "kl": 0.154022884555161, "learning_rate": 9.999998824661725e-06, "loss": -0.1124, "step": 162, "step_time": 6.946171509014675 }, { "clip_ratio/high_max": 0.0020491802133619785, "clip_ratio/high_mean": 0.0010245901066809893, "clip_ratio/low_mean": 0.0010245901066809893, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020491802133619785, "completions/clipped_ratio": 0.0, "completions/max_length": 1754.0, "completions/max_terminated_length": 1754.0, "completions/mean_length": 1631.6875, "completions/mean_terminated_length": 1631.6875, "completions/min_length": 1448.0, "completions/min_terminated_length": 1448.0, "entropy": 0.2981764320284128, "epoch": 0.00163, "frac_reward_zero_std": 0.0, "grad_norm": 0.4989367425441742, "kl": 0.11931829527020454, "learning_rate": 9.99999880593153e-06, "loss": -0.0615, "num_tokens": 6039224.0, "reward": 3.073826789855957, "reward_std": 6.525941371917725, "rewards/rollout_reward_func/mean": 3.073826789855957, "rewards/rollout_reward_func/std": 6.6125102043151855, "sampling/importance_sampling_ratio/max": 1.7610772848129272, "sampling/importance_sampling_ratio/mean": 0.9258933067321777, "sampling/importance_sampling_ratio/min": 0.42555803060531616, "sampling/sampling_logp_difference/max": 0.485187292098999, "sampling/sampling_logp_difference/mean": 0.031936176121234894, "step": 163, "step_time": 17.781010968988994 }, { "clip_ratio/high_max": 0.0020491802133619785, "clip_ratio/high_mean": 0.0010245901066809893, "clip_ratio/low_mean": 0.003090847050771117, "clip_ratio/low_min": 0.0020491802133619785, "clip_ratio/region_mean": 0.0041154371574521065, "entropy": 0.29621831327676773, "epoch": 0.00164, "grad_norm": 0.5350509881973267, "kl": 0.12769107520580292, "learning_rate": 9.99999878705327e-06, "loss": -0.062, "step": 164, "step_time": 6.195043484985945 }, { "clip_ratio/high_max": 0.0026041667442768812, "clip_ratio/high_mean": 0.0013020833721384406, "clip_ratio/low_mean": 0.0010775862028822303, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002379669575020671, "completions/clipped_ratio": 0.0, "completions/max_length": 1734.0, "completions/max_terminated_length": 1734.0, "completions/mean_length": 1618.125, "completions/mean_terminated_length": 1618.125, "completions/min_length": 1324.0, "completions/min_terminated_length": 1324.0, "entropy": 0.33398789539933205, "epoch": 0.00165, "frac_reward_zero_std": 0.0, "grad_norm": 0.8201420903205872, "kl": 0.20769737102091312, "learning_rate": 9.999998768026947e-06, "loss": 0.0807, "num_tokens": 6112390.0, "reward": 4.835277557373047, "reward_std": 7.906643867492676, "rewards/rollout_reward_func/mean": 4.835277557373047, "rewards/rollout_reward_func/std": 13.531353950500488, "sampling/importance_sampling_ratio/max": 1.5256699323654175, "sampling/importance_sampling_ratio/mean": 0.8458409309387207, "sampling/importance_sampling_ratio/min": 0.3707176446914673, "sampling/sampling_logp_difference/max": 1.0553977489471436, "sampling/sampling_logp_difference/mean": 0.03517671674489975, "step": 165, "step_time": 16.627031766009168 }, { "clip_ratio/high_max": 0.0027173913549631834, "clip_ratio/high_mean": 0.0013586956774815917, "clip_ratio/low_mean": 0.0034490401158109307, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004807735793292522, "entropy": 0.33249078691005707, "epoch": 0.00166, "grad_norm": 0.8366281986236572, "kl": 0.20799753442406654, "learning_rate": 9.999998748852559e-06, "loss": 0.0756, "step": 166, "step_time": 6.546527286962373 }, { "clip_ratio/high_max": 0.005319148767739534, "clip_ratio/high_mean": 0.002659574383869767, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002659574383869767, "completions/clipped_ratio": 0.0, "completions/max_length": 1791.0, "completions/max_terminated_length": 1791.0, "completions/mean_length": 1697.625, "completions/mean_terminated_length": 1697.625, "completions/min_length": 1537.0, "completions/min_terminated_length": 1537.0, "entropy": 0.3002784438431263, "epoch": 0.00167, "frac_reward_zero_std": 0.0, "grad_norm": 0.7687087059020996, "kl": 0.17146450746804476, "learning_rate": 9.999998729530104e-06, "loss": 0.0638, "num_tokens": 6188661.0, "reward": 3.382504940032959, "reward_std": 8.596784591674805, "rewards/rollout_reward_func/mean": 3.382504940032959, "rewards/rollout_reward_func/std": 12.0718412399292, "sampling/importance_sampling_ratio/max": 1.8114066123962402, "sampling/importance_sampling_ratio/mean": 1.0046281814575195, "sampling/importance_sampling_ratio/min": 0.28282269835472107, "sampling/sampling_logp_difference/max": 0.9600496292114258, "sampling/sampling_logp_difference/mean": 0.036788903176784515, "step": 167, "step_time": 17.03360440203687 }, { "clip_ratio/high_max": 0.007923315512016416, "clip_ratio/high_mean": 0.003961657756008208, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003961657756008208, "entropy": 0.3006763905286789, "epoch": 0.00168, "grad_norm": 0.6789728403091431, "kl": 0.17414818797260523, "learning_rate": 9.999998710059587e-06, "loss": 0.062, "step": 168, "step_time": 6.71336575197347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0013020833721384406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013020833721384406, "completions/clipped_ratio": 0.0, "completions/max_length": 1722.0, "completions/max_terminated_length": 1722.0, "completions/mean_length": 1653.375, "completions/mean_terminated_length": 1653.375, "completions/min_length": 1534.0, "completions/min_terminated_length": 1534.0, "entropy": 0.27304605953395367, "epoch": 0.00169, "frac_reward_zero_std": 0.0, "grad_norm": 0.7411391735076904, "kl": 0.14821083843708038, "learning_rate": 9.999998690441003e-06, "loss": -0.1143, "num_tokens": 6262912.0, "reward": 9.483894348144531, "reward_std": 10.136153221130371, "rewards/rollout_reward_func/mean": 9.483894348144531, "rewards/rollout_reward_func/std": 11.134727478027344, "sampling/importance_sampling_ratio/max": 1.6971220970153809, "sampling/importance_sampling_ratio/mean": 1.0072002410888672, "sampling/importance_sampling_ratio/min": 0.31239598989486694, "sampling/sampling_logp_difference/max": 0.6532154083251953, "sampling/sampling_logp_difference/mean": 0.03347219526767731, "step": 169, "step_time": 17.006397210017894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0013888889225199819, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013888889225199819, "entropy": 0.2726808600127697, "epoch": 0.0017, "grad_norm": 0.7578477263450623, "kl": 0.148045863956213, "learning_rate": 9.999998670674356e-06, "loss": -0.1164, "step": 170, "step_time": 6.786566305003362 }, { "clip_ratio/high_max": 0.005263741128146648, "clip_ratio/high_mean": 0.002631870564073324, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002631870564073324, "completions/clipped_ratio": 0.0, "completions/max_length": 1727.0, "completions/max_terminated_length": 1727.0, "completions/mean_length": 1615.28125, "completions/mean_terminated_length": 1615.28125, "completions/min_length": 1366.0, "completions/min_terminated_length": 1366.0, "entropy": 0.2628182265907526, "epoch": 0.00171, "frac_reward_zero_std": 0.0, "grad_norm": 0.7566852569580078, "kl": 0.15396695490926504, "learning_rate": 9.999998650759644e-06, "loss": 0.0475, "num_tokens": 6335832.0, "reward": 11.251425743103027, "reward_std": 12.318971633911133, "rewards/rollout_reward_func/mean": 11.251425743103027, "rewards/rollout_reward_func/std": 12.386514663696289, "sampling/importance_sampling_ratio/max": 1.9865792989730835, "sampling/importance_sampling_ratio/mean": 0.9503176808357239, "sampling/importance_sampling_ratio/min": 0.32179316878318787, "sampling/sampling_logp_difference/max": 0.6249246597290039, "sampling/sampling_logp_difference/mean": 0.03295636177062988, "step": 171, "step_time": 16.88569373199425 }, { "clip_ratio/high_max": 0.005263741128146648, "clip_ratio/high_mean": 0.002631870564073324, "clip_ratio/low_mean": 0.002631870564073324, "clip_ratio/low_min": 0.0026041667442768812, "clip_ratio/region_mean": 0.00526374124456197, "entropy": 0.2633429132401943, "epoch": 0.00172, "grad_norm": 0.873836874961853, "kl": 0.15182703640311956, "learning_rate": 9.999998630696865e-06, "loss": 0.047, "step": 172, "step_time": 6.877450691972626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0013888889225199819, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013888889225199819, "completions/clipped_ratio": 0.0, "completions/max_length": 1843.0, "completions/max_terminated_length": 1843.0, "completions/mean_length": 1698.46875, "completions/mean_terminated_length": 1698.46875, "completions/min_length": 1509.0, "completions/min_terminated_length": 1509.0, "entropy": 0.32171614840626717, "epoch": 0.00173, "frac_reward_zero_std": 0.0, "grad_norm": 0.6867273449897766, "kl": 0.1344722853973508, "learning_rate": 9.999998610486024e-06, "loss": -0.0785, "num_tokens": 6411299.0, "reward": 0.8415656089782715, "reward_std": 7.260726451873779, "rewards/rollout_reward_func/mean": 0.8415656089782715, "rewards/rollout_reward_func/std": 13.62413215637207, "sampling/importance_sampling_ratio/max": 1.817505121231079, "sampling/importance_sampling_ratio/mean": 0.9898264408111572, "sampling/importance_sampling_ratio/min": 0.32091426849365234, "sampling/sampling_logp_difference/max": 0.6117509603500366, "sampling/sampling_logp_difference/mean": 0.03296835348010063, "step": 173, "step_time": 17.866205081983935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004136473522521555, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004136473522521555, "entropy": 0.32146602123975754, "epoch": 0.00174, "grad_norm": 0.6523978114128113, "kl": 0.13615641091018915, "learning_rate": 9.999998590127118e-06, "loss": -0.0784, "step": 174, "step_time": 6.8325870410044445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1775.0, "completions/max_terminated_length": 1775.0, "completions/mean_length": 1668.40625, "completions/mean_terminated_length": 1668.40625, "completions/min_length": 1499.0, "completions/min_terminated_length": 1499.0, "entropy": 0.27825627848505974, "epoch": 0.00175, "frac_reward_zero_std": 0.0, "grad_norm": 0.8677846789360046, "kl": 0.2158765820786357, "learning_rate": 9.999998569620147e-06, "loss": -0.0301, "num_tokens": 6486138.0, "reward": 6.738082408905029, "reward_std": 9.026667594909668, "rewards/rollout_reward_func/mean": 6.738082408905029, "rewards/rollout_reward_func/std": 10.321571350097656, "sampling/importance_sampling_ratio/max": 1.8776869773864746, "sampling/importance_sampling_ratio/mean": 0.9062869548797607, "sampling/importance_sampling_ratio/min": 0.271128386259079, "sampling/sampling_logp_difference/max": 1.3804700374603271, "sampling/sampling_logp_difference/mean": 0.038915716111660004, "step": 175, "step_time": 17.32730075798463 }, { "clip_ratio/high_max": 0.002659574383869767, "clip_ratio/high_mean": 0.0013297871919348836, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013297871919348836, "entropy": 0.27759527415037155, "epoch": 0.00176, "grad_norm": 0.8404209017753601, "kl": 0.2094637956470251, "learning_rate": 9.999998548965111e-06, "loss": -0.0342, "step": 176, "step_time": 6.6543145929899765 }, { "clip_ratio/high_max": 0.0026041667442768812, "clip_ratio/high_mean": 0.0013020833721384406, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013020833721384406, "completions/clipped_ratio": 0.0, "completions/max_length": 1803.0, "completions/max_terminated_length": 1803.0, "completions/mean_length": 1711.5, "completions/mean_terminated_length": 1711.5, "completions/min_length": 1588.0, "completions/min_terminated_length": 1588.0, "entropy": 0.26036018691956997, "epoch": 0.00177, "frac_reward_zero_std": 0.0, "grad_norm": 0.6322230696678162, "kl": 0.14847222343087196, "learning_rate": 9.99999852816201e-06, "loss": 0.0291, "num_tokens": 6562337.0, "reward": -1.8191328048706055, "reward_std": 7.667733192443848, "rewards/rollout_reward_func/mean": -1.8191328048706055, "rewards/rollout_reward_func/std": 10.265327453613281, "sampling/importance_sampling_ratio/max": 2.097322940826416, "sampling/importance_sampling_ratio/mean": 1.031116247177124, "sampling/importance_sampling_ratio/min": 0.38465407490730286, "sampling/sampling_logp_difference/max": 0.7076263427734375, "sampling/sampling_logp_difference/mean": 0.03170010447502136, "step": 177, "step_time": 16.627841575973434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2600437868386507, "epoch": 0.00178, "grad_norm": 0.7207894325256348, "kl": 0.15639653895050287, "learning_rate": 9.999998507210844e-06, "loss": 0.0286, "step": 178, "step_time": 6.283116954989964 }, { "clip_ratio/high_max": 0.0027173913549631834, "clip_ratio/high_mean": 0.0013586956774815917, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013586956774815917, "completions/clipped_ratio": 0.0, "completions/max_length": 1914.0, "completions/max_terminated_length": 1914.0, "completions/mean_length": 1707.59375, "completions/mean_terminated_length": 1707.59375, "completions/min_length": 1067.0, "completions/min_terminated_length": 1067.0, "entropy": 0.33644455298781395, "epoch": 0.00179, "frac_reward_zero_std": 0.0, "grad_norm": 0.8553715944290161, "kl": 0.11993694584816694, "learning_rate": 9.999998486111612e-06, "loss": -0.1964, "num_tokens": 6638796.0, "reward": 4.6377153396606445, "reward_std": 4.861329078674316, "rewards/rollout_reward_func/mean": 4.6377153396606445, "rewards/rollout_reward_func/std": 7.492771625518799, "sampling/importance_sampling_ratio/max": 1.7832858562469482, "sampling/importance_sampling_ratio/mean": 1.0193367004394531, "sampling/importance_sampling_ratio/min": 2.3702961815530443e-10, "sampling/sampling_logp_difference/max": 7.978146076202393, "sampling/sampling_logp_difference/mean": 0.08155259490013123, "step": 179, "step_time": 18.59578241201234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0005630630766972899, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0005630630766972899, "entropy": 0.3382049798965454, "epoch": 0.0018, "grad_norm": 1.2414796352386475, "kl": 0.11799891386181116, "learning_rate": 9.999998464864318e-06, "loss": -0.2009, "step": 180, "step_time": 6.977028289009468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1754.0, "completions/max_terminated_length": 1754.0, "completions/mean_length": 1661.96875, "completions/mean_terminated_length": 1661.96875, "completions/min_length": 1564.0, "completions/min_terminated_length": 1564.0, "entropy": 0.28952496498823166, "epoch": 0.00181, "frac_reward_zero_std": 0.0, "grad_norm": 0.8744194507598877, "kl": 0.14304536953568459, "learning_rate": 9.999998443468958e-06, "loss": -0.0587, "num_tokens": 6713659.0, "reward": 3.502458095550537, "reward_std": 4.11943244934082, "rewards/rollout_reward_func/mean": 3.502458095550537, "rewards/rollout_reward_func/std": 7.398775577545166, "sampling/importance_sampling_ratio/max": 1.8870512247085571, "sampling/importance_sampling_ratio/mean": 0.983060359954834, "sampling/importance_sampling_ratio/min": 0.31154775619506836, "sampling/sampling_logp_difference/max": 0.755927562713623, "sampling/sampling_logp_difference/mean": 0.0301639586687088, "step": 181, "step_time": 17.42876873096975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0013020833721384406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013020833721384406, "entropy": 0.289512200281024, "epoch": 0.00182, "grad_norm": 0.9124158024787903, "kl": 0.1407114341855049, "learning_rate": 9.999998421925533e-06, "loss": -0.0618, "step": 182, "step_time": 6.139948702999391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0010416667209938169, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010416667209938169, "completions/clipped_ratio": 0.0, "completions/max_length": 1772.0, "completions/max_terminated_length": 1772.0, "completions/mean_length": 1684.21875, "completions/mean_terminated_length": 1684.21875, "completions/min_length": 1471.0, "completions/min_terminated_length": 1471.0, "entropy": 0.30168063193559647, "epoch": 0.00183, "frac_reward_zero_std": 0.0, "grad_norm": 0.7253329753875732, "kl": 0.12137273047119379, "learning_rate": 9.999998400234044e-06, "loss": -0.0017, "num_tokens": 6789175.0, "reward": 4.73601770401001, "reward_std": 7.661101341247559, "rewards/rollout_reward_func/mean": 4.73601770401001, "rewards/rollout_reward_func/std": 9.947338104248047, "sampling/importance_sampling_ratio/max": 2.1799325942993164, "sampling/importance_sampling_ratio/mean": 1.0396524667739868, "sampling/importance_sampling_ratio/min": 0.4662618935108185, "sampling/sampling_logp_difference/max": 0.5852774381637573, "sampling/sampling_logp_difference/mean": 0.02623514086008072, "step": 183, "step_time": 18.09763770797872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.30195858515799046, "epoch": 0.00184, "grad_norm": 0.7228226065635681, "kl": 0.11288892291486263, "learning_rate": 9.999998378394488e-06, "loss": -0.0041, "step": 184, "step_time": 6.205342650981038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0013888889225199819, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013888889225199819, "completions/clipped_ratio": 0.0, "completions/max_length": 1870.0, "completions/max_terminated_length": 1870.0, "completions/mean_length": 1722.40625, "completions/mean_terminated_length": 1722.40625, "completions/min_length": 1525.0, "completions/min_terminated_length": 1525.0, "entropy": 0.3112775646150112, "epoch": 0.00185, "frac_reward_zero_std": 0.0, "grad_norm": 0.7525662183761597, "kl": 0.14138545654714108, "learning_rate": 9.99999835640687e-06, "loss": -0.19, "num_tokens": 6865734.0, "reward": 2.9032394886016846, "reward_std": 8.523319244384766, "rewards/rollout_reward_func/mean": 2.9032394886016846, "rewards/rollout_reward_func/std": 9.083523750305176, "sampling/importance_sampling_ratio/max": 1.6267588138580322, "sampling/importance_sampling_ratio/mean": 0.9622193574905396, "sampling/importance_sampling_ratio/min": 0.43079012632369995, "sampling/sampling_logp_difference/max": 0.48114895820617676, "sampling/sampling_logp_difference/mean": 0.031337957829236984, "step": 185, "step_time": 17.95944309301558 }, { "clip_ratio/high_max": 0.0020491802133619785, "clip_ratio/high_mean": 0.0010245901066809893, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010245901066809893, "entropy": 0.3126814737915993, "epoch": 0.00186, "grad_norm": 0.6958068013191223, "kl": 0.13429542630910873, "learning_rate": 9.999998334271189e-06, "loss": -0.1913, "step": 186, "step_time": 6.485593170989887 }, { "clip_ratio/high_max": 0.0027777778450399637, "clip_ratio/high_mean": 0.0013888889225199819, "clip_ratio/low_mean": 0.002466475125402212, "clip_ratio/low_min": 0.0021551724057644606, "clip_ratio/region_mean": 0.003855364047922194, "completions/clipped_ratio": 0.0, "completions/max_length": 1713.0, "completions/max_terminated_length": 1713.0, "completions/mean_length": 1628.6875, "completions/mean_terminated_length": 1628.6875, "completions/min_length": 1469.0, "completions/min_terminated_length": 1469.0, "entropy": 0.31032001227140427, "epoch": 0.00187, "frac_reward_zero_std": 0.0, "grad_norm": 0.6825176477432251, "kl": 0.15187601558864117, "learning_rate": 9.99999831198744e-06, "loss": -0.0601, "num_tokens": 6939965.0, "reward": 6.422233581542969, "reward_std": 8.794349670410156, "rewards/rollout_reward_func/mean": 6.422233581542969, "rewards/rollout_reward_func/std": 11.258831024169922, "sampling/importance_sampling_ratio/max": 1.6708012819290161, "sampling/importance_sampling_ratio/mean": 0.9802118539810181, "sampling/importance_sampling_ratio/min": 0.3090344965457916, "sampling/sampling_logp_difference/max": 0.8799679279327393, "sampling/sampling_logp_difference/mean": 0.028598960489034653, "step": 187, "step_time": 16.747505973020452 }, { "clip_ratio/high_max": 0.0027777778450399637, "clip_ratio/high_mean": 0.0013888889225199819, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013888889225199819, "entropy": 0.3109175078570843, "epoch": 0.00188, "grad_norm": 0.7079899907112122, "kl": 0.1477835550904274, "learning_rate": 9.999998289555627e-06, "loss": -0.0601, "step": 188, "step_time": 6.07611179597734 }, { "clip_ratio/high_max": 0.0020491802133619785, "clip_ratio/high_mean": 0.0010245901066809893, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010245901066809893, "completions/clipped_ratio": 0.0, "completions/max_length": 1730.0, "completions/max_terminated_length": 1730.0, "completions/mean_length": 1662.96875, "completions/mean_terminated_length": 1662.96875, "completions/min_length": 1525.0, "completions/min_terminated_length": 1525.0, "entropy": 0.3517409637570381, "epoch": 0.00189, "frac_reward_zero_std": 0.0, "grad_norm": 0.71088045835495, "kl": 0.15317167155444622, "learning_rate": 9.999998266975747e-06, "loss": 0.0211, "num_tokens": 7015055.0, "reward": 5.983144283294678, "reward_std": 8.17827033996582, "rewards/rollout_reward_func/mean": 5.983144283294678, "rewards/rollout_reward_func/std": 9.610690116882324, "sampling/importance_sampling_ratio/max": 1.482043981552124, "sampling/importance_sampling_ratio/mean": 0.8315011858940125, "sampling/importance_sampling_ratio/min": 1.2981145530502901e-17, "sampling/sampling_logp_difference/max": 22.655431747436523, "sampling/sampling_logp_difference/mean": 0.07867354899644852, "step": 189, "step_time": 16.83934736403171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3501877561211586, "epoch": 0.0019, "grad_norm": 0.6940600872039795, "kl": 0.1541421227157116, "learning_rate": 9.999998244247805e-06, "loss": 0.0207, "step": 190, "step_time": 6.144184050019248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0004111842135898769, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004111842135898769, "completions/clipped_ratio": 0.03125, "completions/max_length": 1903.0, "completions/max_terminated_length": 1761.0, "completions/mean_length": 1708.71875, "completions/mean_terminated_length": 1702.4515380859375, "completions/min_length": 1576.0, "completions/min_terminated_length": 1576.0, "entropy": 0.3438649009913206, "epoch": 0.00191, "frac_reward_zero_std": 0.0, "grad_norm": 0.9664538502693176, "kl": 0.111414834856987, "learning_rate": 9.9999982213718e-06, "loss": 0.0795, "num_tokens": 7092023.0, "reward": 9.559746742248535, "reward_std": 9.822237014770508, "rewards/rollout_reward_func/mean": 9.559746742248535, "rewards/rollout_reward_func/std": 14.170333862304688, "sampling/importance_sampling_ratio/max": 1.6432435512542725, "sampling/importance_sampling_ratio/mean": 0.9708099365234375, "sampling/importance_sampling_ratio/min": 0.3748074471950531, "sampling/sampling_logp_difference/max": 0.6945250034332275, "sampling/sampling_logp_difference/mean": 0.0279039666056633, "step": 191, "step_time": 18.119554285978666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0013297871919348836, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013297871919348836, "entropy": 0.3419705890119076, "epoch": 0.00192, "grad_norm": 0.9055866003036499, "kl": 0.11308139190077782, "learning_rate": 9.999998198347727e-06, "loss": 0.079, "step": 192, "step_time": 6.52186049897864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0010245901066809893, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010245901066809893, "completions/clipped_ratio": 0.0, "completions/max_length": 1706.0, "completions/max_terminated_length": 1706.0, "completions/mean_length": 1619.875, "completions/mean_terminated_length": 1619.875, "completions/min_length": 1473.0, "completions/min_terminated_length": 1473.0, "entropy": 0.29301365837454796, "epoch": 0.00193, "frac_reward_zero_std": 0.0, "grad_norm": 1.069839358329773, "kl": 0.15144740790128708, "learning_rate": 9.999998175175592e-06, "loss": -0.0131, "num_tokens": 7166167.0, "reward": 2.9328341484069824, "reward_std": 10.981138229370117, "rewards/rollout_reward_func/mean": 2.9328341484069824, "rewards/rollout_reward_func/std": 12.225034713745117, "sampling/importance_sampling_ratio/max": 1.6442902088165283, "sampling/importance_sampling_ratio/mean": 0.9795912504196167, "sampling/importance_sampling_ratio/min": 0.4307028651237488, "sampling/sampling_logp_difference/max": 0.9361486434936523, "sampling/sampling_logp_difference/mean": 0.030327484011650085, "step": 193, "step_time": 17.024693104001926 }, { "clip_ratio/high_max": 0.0026041667442768812, "clip_ratio/high_mean": 0.0013020833721384406, "clip_ratio/low_mean": 0.002066256827674806, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0033683401998132467, "entropy": 0.2910199835896492, "epoch": 0.00194, "grad_norm": 1.0914729833602905, "kl": 0.1519699990749359, "learning_rate": 9.99999815185539e-06, "loss": -0.0162, "step": 194, "step_time": 6.082666194008198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1733.0, "completions/max_terminated_length": 1733.0, "completions/mean_length": 1667.3125, "completions/mean_terminated_length": 1667.3125, "completions/min_length": 1504.0, "completions/min_terminated_length": 1504.0, "entropy": 0.25190277583897114, "epoch": 0.00195, "frac_reward_zero_std": 0.0, "grad_norm": 0.8080146908760071, "kl": 0.11990550626069307, "learning_rate": 9.999998128387122e-06, "loss": -0.0697, "num_tokens": 7241444.0, "reward": 3.085637331008911, "reward_std": 7.397335052490234, "rewards/rollout_reward_func/mean": 3.085637331008911, "rewards/rollout_reward_func/std": 10.731897354125977, "sampling/importance_sampling_ratio/max": 1.8757683038711548, "sampling/importance_sampling_ratio/mean": 1.0332531929016113, "sampling/importance_sampling_ratio/min": 0.3597898483276367, "sampling/sampling_logp_difference/max": 0.519088864326477, "sampling/sampling_logp_difference/mean": 0.027686364948749542, "step": 195, "step_time": 17.263403434000793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.250925499945879, "epoch": 0.00196, "grad_norm": 0.8008567690849304, "kl": 0.12246787268668413, "learning_rate": 9.999998104770791e-06, "loss": -0.0719, "step": 196, "step_time": 6.115700915004709 }, { "clip_ratio/high_max": 0.0020833334419876337, "clip_ratio/high_mean": 0.0010416667209938169, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010416667209938169, "completions/clipped_ratio": 0.0, "completions/max_length": 1744.0, "completions/max_terminated_length": 1744.0, "completions/mean_length": 1677.03125, "completions/mean_terminated_length": 1677.03125, "completions/min_length": 1558.0, "completions/min_terminated_length": 1558.0, "entropy": 0.29858213663101196, "epoch": 0.00197, "frac_reward_zero_std": 0.0, "grad_norm": 0.8568600416183472, "kl": 0.15838666073977947, "learning_rate": 9.999998081006396e-06, "loss": -0.1527, "num_tokens": 7316182.0, "reward": 14.369819641113281, "reward_std": 12.729696273803711, "rewards/rollout_reward_func/mean": 14.369819641113281, "rewards/rollout_reward_func/std": 12.989999771118164, "sampling/importance_sampling_ratio/max": 2.0849390029907227, "sampling/importance_sampling_ratio/mean": 0.9841243624687195, "sampling/importance_sampling_ratio/min": 0.38641947507858276, "sampling/sampling_logp_difference/max": 0.621403694152832, "sampling/sampling_logp_difference/mean": 0.029784835875034332, "step": 197, "step_time": 16.834048769975198 }, { "clip_ratio/high_max": 0.0020491802133619785, "clip_ratio/high_mean": 0.0010245901066809893, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010245901066809893, "entropy": 0.2975227143615484, "epoch": 0.00198, "grad_norm": 0.8656068444252014, "kl": 0.16134764160960913, "learning_rate": 9.999998057093936e-06, "loss": -0.1529, "step": 198, "step_time": 6.165923100008513 }, { "clip_ratio/high_max": 0.0024999999441206455, "clip_ratio/high_mean": 0.0012499999720603228, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0012499999720603228, "completions/clipped_ratio": 0.0, "completions/max_length": 1684.0, "completions/max_terminated_length": 1684.0, "completions/mean_length": 1235.53125, "completions/mean_terminated_length": 1235.53125, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.259755652397871, "epoch": 0.00199, "frac_reward_zero_std": 0.25, "grad_norm": 0.9238920211791992, "kl": 0.19941392540931702, "learning_rate": 9.999998033033413e-06, "loss": -0.0088, "num_tokens": 7377682.0, "reward": 4.155179023742676, "reward_std": 5.812473773956299, "rewards/rollout_reward_func/mean": 4.155179023742676, "rewards/rollout_reward_func/std": 10.092905044555664, "sampling/importance_sampling_ratio/max": 1.9417250156402588, "sampling/importance_sampling_ratio/mean": 0.9827766418457031, "sampling/importance_sampling_ratio/min": 0.34737473726272583, "sampling/sampling_logp_difference/max": 0.6436953544616699, "sampling/sampling_logp_difference/mean": 0.028055690228939056, "step": 199, "step_time": 15.134658821014455 }, { "clip_ratio/high_max": 0.0024999999441206455, "clip_ratio/high_mean": 0.0012499999720603228, "clip_ratio/low_mean": 0.0013020833721384406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0025520833441987634, "entropy": 0.2585579566657543, "epoch": 0.002, "grad_norm": 0.7928014993667603, "kl": 0.2042941376566887, "learning_rate": 9.999998008824823e-06, "loss": -0.0101, "step": 200, "step_time": 6.434046792972367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1775.0, "completions/max_terminated_length": 1775.0, "completions/mean_length": 1655.25, "completions/mean_terminated_length": 1655.25, "completions/min_length": 1482.0, "completions/min_terminated_length": 1482.0, "entropy": 0.3122125919908285, "epoch": 0.00201, "frac_reward_zero_std": 0.0, "grad_norm": 0.9008649587631226, "kl": 0.14696883410215378, "learning_rate": 9.999997984468169e-06, "loss": -0.0912, "num_tokens": 7452696.0, "reward": 7.750061988830566, "reward_std": 9.524072647094727, "rewards/rollout_reward_func/mean": 7.750061988830566, "rewards/rollout_reward_func/std": 11.091376304626465, "sampling/importance_sampling_ratio/max": 2.2767248153686523, "sampling/importance_sampling_ratio/mean": 1.014455795288086, "sampling/importance_sampling_ratio/min": 0.36429426074028015, "sampling/sampling_logp_difference/max": 0.7707972526550293, "sampling/sampling_logp_difference/mean": 0.029672536998987198, "step": 201, "step_time": 16.977380541022285 }, { "clip_ratio/high_max": 0.002659574383869767, "clip_ratio/high_mean": 0.0023714539129287004, "clip_ratio/low_mean": 0.0021021763095632195, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00447363022249192, "entropy": 0.3103725463151932, "epoch": 0.00202, "grad_norm": 0.9275845885276794, "kl": 0.1491086371243, "learning_rate": 9.99999795996345e-06, "loss": -0.0931, "step": 202, "step_time": 6.197417368035531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0013020833721384406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013020833721384406, "completions/clipped_ratio": 0.0, "completions/max_length": 1817.0, "completions/max_terminated_length": 1817.0, "completions/mean_length": 1746.84375, "completions/mean_terminated_length": 1746.84375, "completions/min_length": 1630.0, "completions/min_terminated_length": 1630.0, "entropy": 0.24662496149539948, "epoch": 0.00203, "frac_reward_zero_std": 0.0, "grad_norm": 0.935571551322937, "kl": 0.13020009826868773, "learning_rate": 9.999997935310666e-06, "loss": -0.2638, "num_tokens": 7529895.0, "reward": 8.675664901733398, "reward_std": 8.380345344543457, "rewards/rollout_reward_func/mean": 8.675664901733398, "rewards/rollout_reward_func/std": 12.899768829345703, "sampling/importance_sampling_ratio/max": 2.1046431064605713, "sampling/importance_sampling_ratio/mean": 1.0595152378082275, "sampling/importance_sampling_ratio/min": 0.25222504138946533, "sampling/sampling_logp_difference/max": 0.5954606533050537, "sampling/sampling_logp_difference/mean": 0.028266554698348045, "step": 203, "step_time": 17.143050544022117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.003645833465270698, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003645833465270698, "entropy": 0.24344369769096375, "epoch": 0.00204, "grad_norm": 0.9559041857719421, "kl": 0.13788230251520872, "learning_rate": 9.999997910509816e-06, "loss": -0.2698, "step": 204, "step_time": 6.776170637996984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1821.0, "completions/max_terminated_length": 1821.0, "completions/mean_length": 1583.28125, "completions/mean_terminated_length": 1583.28125, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "entropy": 0.30443818494677544, "epoch": 0.00205, "frac_reward_zero_std": 0.0, "grad_norm": 1.1306363344192505, "kl": 0.15221731271594763, "learning_rate": 9.999997885560904e-06, "loss": 0.0317, "num_tokens": 7601882.0, "reward": 4.189817905426025, "reward_std": 8.956092834472656, "rewards/rollout_reward_func/mean": 4.189817905426025, "rewards/rollout_reward_func/std": 12.25725269317627, "sampling/importance_sampling_ratio/max": 2.2548816204071045, "sampling/importance_sampling_ratio/mean": 1.0585670471191406, "sampling/importance_sampling_ratio/min": 0.37149500846862793, "sampling/sampling_logp_difference/max": 0.6250150203704834, "sampling/sampling_logp_difference/mean": 0.03004813939332962, "step": 205, "step_time": 17.753356259985594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0008561643771827221, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0008561643771827221, "entropy": 0.30365515872836113, "epoch": 0.00206, "grad_norm": 1.0920538902282715, "kl": 0.15824744291603565, "learning_rate": 9.999997860463926e-06, "loss": 0.0329, "step": 206, "step_time": 6.240021898993291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1710.0, "completions/max_terminated_length": 1710.0, "completions/mean_length": 1632.90625, "completions/mean_terminated_length": 1632.90625, "completions/min_length": 1527.0, "completions/min_terminated_length": 1527.0, "entropy": 0.2644720636308193, "epoch": 0.00207, "frac_reward_zero_std": 0.0, "grad_norm": 0.9331479668617249, "kl": 0.11727872304618359, "learning_rate": 9.999997835218883e-06, "loss": -0.2237, "num_tokens": 7675283.0, "reward": 6.746086597442627, "reward_std": 5.34084415435791, "rewards/rollout_reward_func/mean": 6.746086597442627, "rewards/rollout_reward_func/std": 6.082992076873779, "sampling/importance_sampling_ratio/max": 1.9850131273269653, "sampling/importance_sampling_ratio/mean": 0.9419898986816406, "sampling/importance_sampling_ratio/min": 0.354443222284317, "sampling/sampling_logp_difference/max": 0.5179198384284973, "sampling/sampling_logp_difference/mean": 0.030891261994838715, "step": 207, "step_time": 16.960254243007512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0013020833721384406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013020833721384406, "entropy": 0.2628643997013569, "epoch": 0.00208, "grad_norm": 0.9565983414649963, "kl": 0.12086850684136152, "learning_rate": 9.999997809825776e-06, "loss": -0.2267, "step": 208, "step_time": 6.54559659199731 }, { "clip_ratio/high_max": 0.0026041667442768812, "clip_ratio/high_mean": 0.0013020833721384406, "clip_ratio/low_mean": 0.0013020833721384406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "completions/clipped_ratio": 0.0, "completions/max_length": 1779.0, "completions/max_terminated_length": 1779.0, "completions/mean_length": 1702.5625, "completions/mean_terminated_length": 1702.5625, "completions/min_length": 1573.0, "completions/min_terminated_length": 1573.0, "entropy": 0.2699402868747711, "epoch": 0.00209, "frac_reward_zero_std": 0.0, "grad_norm": 0.9522121548652649, "kl": 0.18773407954722643, "learning_rate": 9.999997784284603e-06, "loss": 0.1305, "num_tokens": 7751883.0, "reward": 13.15239429473877, "reward_std": 8.032751083374023, "rewards/rollout_reward_func/mean": 13.15239429473877, "rewards/rollout_reward_func/std": 12.878203392028809, "sampling/importance_sampling_ratio/max": 2.122818946838379, "sampling/importance_sampling_ratio/mean": 0.9998441934585571, "sampling/importance_sampling_ratio/min": 0.28060635924339294, "sampling/sampling_logp_difference/max": 1.1390037536621094, "sampling/sampling_logp_difference/mean": 0.03425164520740509, "step": 209, "step_time": 17.239307598996675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0013888889225199819, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013888889225199819, "entropy": 0.26940452493727207, "epoch": 0.0021, "grad_norm": 0.9752706289291382, "kl": 0.18476699572056532, "learning_rate": 9.999997758595368e-06, "loss": 0.1292, "step": 210, "step_time": 6.2460901139711495 }, { "clip_ratio/high_max": 0.0026041667442768812, "clip_ratio/high_mean": 0.0013020833721384406, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013020833721384406, "completions/clipped_ratio": 0.0, "completions/max_length": 1728.0, "completions/max_terminated_length": 1728.0, "completions/mean_length": 1618.65625, "completions/mean_terminated_length": 1618.65625, "completions/min_length": 961.0, "completions/min_terminated_length": 961.0, "entropy": 0.19060739316046238, "epoch": 0.00211, "frac_reward_zero_std": 0.0, "grad_norm": 0.8917297720909119, "kl": 0.15959873981773853, "learning_rate": 9.999997732758066e-06, "loss": -0.1659, "num_tokens": 7825598.0, "reward": 2.544642448425293, "reward_std": 7.506851673126221, "rewards/rollout_reward_func/mean": 2.544642448425293, "rewards/rollout_reward_func/std": 13.084085464477539, "sampling/importance_sampling_ratio/max": 1.9332152605056763, "sampling/importance_sampling_ratio/mean": 1.017883062362671, "sampling/importance_sampling_ratio/min": 0.2558363080024719, "sampling/sampling_logp_difference/max": 0.960334062576294, "sampling/sampling_logp_difference/mean": 0.031107084825634956, "step": 211, "step_time": 16.779484019993106 }, { "clip_ratio/high_max": 0.0032051282469183207, "clip_ratio/high_mean": 0.0016025641234591603, "clip_ratio/low_mean": 0.0013020833721384406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002904647495597601, "entropy": 0.19030712731182575, "epoch": 0.00212, "grad_norm": 0.792644739151001, "kl": 0.15366925299167633, "learning_rate": 9.9999977067727e-06, "loss": -0.1676, "step": 212, "step_time": 6.589090716995997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1743.0, "completions/max_terminated_length": 1743.0, "completions/mean_length": 1672.59375, "completions/mean_terminated_length": 1672.59375, "completions/min_length": 1533.0, "completions/min_terminated_length": 1533.0, "entropy": 0.22861934453248978, "epoch": 0.00213, "frac_reward_zero_std": 0.0, "grad_norm": 0.7443928122520447, "kl": 0.19178087823092937, "learning_rate": 9.999997680639268e-06, "loss": -0.1873, "num_tokens": 7901183.0, "reward": 12.30010986328125, "reward_std": 9.320655822753906, "rewards/rollout_reward_func/mean": 12.30010986328125, "rewards/rollout_reward_func/std": 9.631754875183105, "sampling/importance_sampling_ratio/max": 2.234417200088501, "sampling/importance_sampling_ratio/mean": 0.9670222997665405, "sampling/importance_sampling_ratio/min": 0.09924029558897018, "sampling/sampling_logp_difference/max": 1.447255253791809, "sampling/sampling_logp_difference/mean": 0.040259476751089096, "step": 213, "step_time": 17.01770007101004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.22803233563899994, "epoch": 0.00214, "grad_norm": 0.6795225739479065, "kl": 0.19548166822642088, "learning_rate": 9.999997654357772e-06, "loss": -0.1872, "step": 214, "step_time": 6.174372494991985 }, { "clip_ratio/high_max": 0.002659574383869767, "clip_ratio/high_mean": 0.0013297871919348836, "clip_ratio/low_mean": 0.002659574383869767, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003989361575804651, "completions/clipped_ratio": 0.0, "completions/max_length": 1760.0, "completions/max_terminated_length": 1760.0, "completions/mean_length": 1636.6875, "completions/mean_terminated_length": 1636.6875, "completions/min_length": 1467.0, "completions/min_terminated_length": 1467.0, "entropy": 0.21907278150320053, "epoch": 0.00215, "frac_reward_zero_std": 0.0, "grad_norm": 0.8174968361854553, "kl": 0.19605023320764303, "learning_rate": 9.999997627928213e-06, "loss": -0.0106, "num_tokens": 7975539.0, "reward": 8.33426570892334, "reward_std": 7.444827079772949, "rewards/rollout_reward_func/mean": 8.33426570892334, "rewards/rollout_reward_func/std": 10.655043601989746, "sampling/importance_sampling_ratio/max": 2.096503734588623, "sampling/importance_sampling_ratio/mean": 0.9762864112854004, "sampling/importance_sampling_ratio/min": 0.2375217080116272, "sampling/sampling_logp_difference/max": 0.9397575855255127, "sampling/sampling_logp_difference/mean": 0.03147196024656296, "step": 215, "step_time": 16.322072408001986 }, { "clip_ratio/high_max": 0.002659574383869767, "clip_ratio/high_mean": 0.0013297871919348836, "clip_ratio/low_mean": 0.0013297871919348836, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002659574383869767, "entropy": 0.2177918739616871, "epoch": 0.00216, "grad_norm": 0.8552853465080261, "kl": 0.20590543933212757, "learning_rate": 9.999997601350588e-06, "loss": -0.0116, "step": 216, "step_time": 6.60685641800228 }, { "clip_ratio/high_max": 0.007371876039542258, "clip_ratio/high_mean": 0.003685938019771129, "clip_ratio/low_mean": 0.0013297871919348836, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0050157252117060125, "completions/clipped_ratio": 0.03125, "completions/max_length": 1780.0, "completions/max_terminated_length": 1780.0, "completions/mean_length": 1544.0625, "completions/mean_terminated_length": 1577.9354248046875, "completions/min_length": 494.0, "completions/min_terminated_length": 1092.0, "entropy": 0.2489533070474863, "epoch": 0.00217, "frac_reward_zero_std": 0.0, "grad_norm": 0.9176120758056641, "kl": 0.18755596224218607, "learning_rate": 9.999997574624898e-06, "loss": 0.0173, "num_tokens": 8047781.0, "reward": 7.096983909606934, "reward_std": 5.64551305770874, "rewards/rollout_reward_func/mean": 7.096983909606934, "rewards/rollout_reward_func/std": 6.517624855041504, "sampling/importance_sampling_ratio/max": 2.2828381061553955, "sampling/importance_sampling_ratio/mean": 0.898209273815155, "sampling/importance_sampling_ratio/min": 0.417507529258728, "sampling/sampling_logp_difference/max": 1.2275519371032715, "sampling/sampling_logp_difference/mean": 0.036025624722242355, "step": 217, "step_time": 16.40247307998652 }, { "clip_ratio/high_max": 0.0029761905316263437, "clip_ratio/high_mean": 0.0014880952658131719, "clip_ratio/low_mean": 0.004080028971657157, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005568124237470329, "entropy": 0.2480370942503214, "epoch": 0.00218, "grad_norm": 1.0084148645401, "kl": 0.19024685584008694, "learning_rate": 9.999997547751144e-06, "loss": 0.0161, "step": 218, "step_time": 6.578330785021535 }, { "clip_ratio/high_max": 0.002659574383869767, "clip_ratio/high_mean": 0.0013297871919348836, "clip_ratio/low_mean": 0.0013020833721384406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002631870564073324, "completions/clipped_ratio": 0.0, "completions/max_length": 1888.0, "completions/max_terminated_length": 1888.0, "completions/mean_length": 1638.65625, "completions/mean_terminated_length": 1638.65625, "completions/min_length": 1200.0, "completions/min_terminated_length": 1200.0, "entropy": 0.31037737242877483, "epoch": 0.00219, "frac_reward_zero_std": 0.0, "grad_norm": 1.1229106187820435, "kl": 0.20764620788395405, "learning_rate": 9.999997520729326e-06, "loss": -0.1025, "num_tokens": 8121342.0, "reward": -0.8874363899230957, "reward_std": 6.310605049133301, "rewards/rollout_reward_func/mean": -0.8874363899230957, "rewards/rollout_reward_func/std": 7.2722883224487305, "sampling/importance_sampling_ratio/max": 2.2437801361083984, "sampling/importance_sampling_ratio/mean": 0.9471398591995239, "sampling/importance_sampling_ratio/min": 0.35444432497024536, "sampling/sampling_logp_difference/max": 1.1235229969024658, "sampling/sampling_logp_difference/mean": 0.0362231582403183, "step": 219, "step_time": 17.601282405026723 }, { "clip_ratio/high_max": 0.0059557571075856686, "clip_ratio/high_mean": 0.0029778785537928343, "clip_ratio/low_mean": 0.0027901786379516125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005768057191744447, "entropy": 0.31261641159653664, "epoch": 0.0022, "grad_norm": 1.0851309299468994, "kl": 0.21176804415881634, "learning_rate": 9.999997493559442e-06, "loss": -0.1051, "step": 220, "step_time": 6.942218269992736 }, { "clip_ratio/high_max": 0.008041518973186612, "clip_ratio/high_mean": 0.004020759486593306, "clip_ratio/low_mean": 0.0026909722946584225, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0067117317812517285, "completions/clipped_ratio": 0.0, "completions/max_length": 1887.0, "completions/max_terminated_length": 1887.0, "completions/mean_length": 1684.4375, "completions/mean_terminated_length": 1684.4375, "completions/min_length": 1475.0, "completions/min_terminated_length": 1475.0, "entropy": 0.2986330855637789, "epoch": 0.00221, "frac_reward_zero_std": 0.0, "grad_norm": 1.4658781290054321, "kl": 0.2600144725292921, "learning_rate": 9.999997466241494e-06, "loss": 0.169, "num_tokens": 8196694.0, "reward": 9.041900634765625, "reward_std": 8.995718002319336, "rewards/rollout_reward_func/mean": 9.041900634765625, "rewards/rollout_reward_func/std": 11.81253719329834, "sampling/importance_sampling_ratio/max": 2.1623754501342773, "sampling/importance_sampling_ratio/mean": 1.0579729080200195, "sampling/importance_sampling_ratio/min": 0.2628326117992401, "sampling/sampling_logp_difference/max": 1.1153502464294434, "sampling/sampling_logp_difference/mean": 0.040829166769981384, "step": 221, "step_time": 17.439855353019084 }, { "clip_ratio/high_max": 0.005263741128146648, "clip_ratio/high_mean": 0.003933953936211765, "clip_ratio/low_mean": 0.0026909722946584225, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0066249261144548655, "entropy": 0.29955359920859337, "epoch": 0.00222, "grad_norm": 1.7548515796661377, "kl": 0.24573382828384638, "learning_rate": 9.99999743877548e-06, "loss": 0.1646, "step": 222, "step_time": 6.952217769037816 }, { "clip_ratio/high_max": 0.002659574383869767, "clip_ratio/high_mean": 0.0013297871919348836, "clip_ratio/low_mean": 0.0013020833721384406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002631870564073324, "completions/clipped_ratio": 0.0, "completions/max_length": 1750.0, "completions/max_terminated_length": 1750.0, "completions/mean_length": 1643.21875, "completions/mean_terminated_length": 1643.21875, "completions/min_length": 1493.0, "completions/min_terminated_length": 1493.0, "entropy": 0.23135039024055004, "epoch": 0.00223, "frac_reward_zero_std": 0.0, "grad_norm": 1.0879281759262085, "kl": 0.15847640205174685, "learning_rate": 9.999997411161401e-06, "loss": -0.5425, "num_tokens": 8270695.0, "reward": 7.3971266746521, "reward_std": 4.1652445793151855, "rewards/rollout_reward_func/mean": 7.3971266746521, "rewards/rollout_reward_func/std": 5.156468868255615, "sampling/importance_sampling_ratio/max": 2.0306589603424072, "sampling/importance_sampling_ratio/mean": 0.9779294729232788, "sampling/importance_sampling_ratio/min": 0.2146739810705185, "sampling/sampling_logp_difference/max": 1.0121033191680908, "sampling/sampling_logp_difference/mean": 0.04034660756587982, "step": 223, "step_time": 17.415504698990844 }, { "clip_ratio/high_max": 0.005319148767739534, "clip_ratio/high_mean": 0.002659574383869767, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002659574383869767, "entropy": 0.23419341444969177, "epoch": 0.00224, "grad_norm": 0.9638247489929199, "kl": 0.15692045167088509, "learning_rate": 9.99999738339926e-06, "loss": -0.5468, "step": 224, "step_time": 6.180178539987537 }, { "clip_ratio/high_max": 0.0008278145687654614, "clip_ratio/high_mean": 0.0004139072843827307, "clip_ratio/low_mean": 0.001803369668778032, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002217276953160763, "completions/clipped_ratio": 0.03125, "completions/max_length": 1800.0, "completions/max_terminated_length": 1753.0, "completions/mean_length": 1569.8125, "completions/mean_terminated_length": 1562.3870849609375, "completions/min_length": 839.0, "completions/min_terminated_length": 839.0, "entropy": 0.3328532241284847, "epoch": 0.00225, "frac_reward_zero_std": 0.0, "grad_norm": 1.138601303100586, "kl": 0.2179300431162119, "learning_rate": 9.999997355489053e-06, "loss": -0.5355, "num_tokens": 8342208.0, "reward": 4.3460845947265625, "reward_std": 8.12332820892334, "rewards/rollout_reward_func/mean": 4.3460845947265625, "rewards/rollout_reward_func/std": 11.652142524719238, "sampling/importance_sampling_ratio/max": 1.8687165975570679, "sampling/importance_sampling_ratio/mean": 0.9077799320220947, "sampling/importance_sampling_ratio/min": 4.507729350786095e-17, "sampling/sampling_logp_difference/max": 23.961957931518555, "sampling/sampling_logp_difference/mean": 0.10767684131860733, "step": 225, "step_time": 16.458355388982454 }, { "clip_ratio/high_max": 0.0034319813130423427, "clip_ratio/high_mean": 0.0017159906565211713, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017159906565211713, "entropy": 0.3347261771559715, "epoch": 0.00226, "grad_norm": 1.1161432266235352, "kl": 0.20008392073214054, "learning_rate": 9.99999732743078e-06, "loss": -0.5399, "step": 226, "step_time": 6.6617297159536975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0010593220358714461, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010593220358714461, "completions/clipped_ratio": 0.0, "completions/max_length": 1837.0, "completions/max_terminated_length": 1837.0, "completions/mean_length": 1675.28125, "completions/mean_terminated_length": 1675.28125, "completions/min_length": 1478.0, "completions/min_terminated_length": 1478.0, "entropy": 0.2452167496085167, "epoch": 0.00227, "frac_reward_zero_std": 0.0, "grad_norm": 1.057695984840393, "kl": 0.15562982112169266, "learning_rate": 9.999997299224444e-06, "loss": 0.1974, "num_tokens": 8417959.0, "reward": 9.692388534545898, "reward_std": 9.383927345275879, "rewards/rollout_reward_func/mean": 9.692388534545898, "rewards/rollout_reward_func/std": 11.3983736038208, "sampling/importance_sampling_ratio/max": 2.8745646476745605, "sampling/importance_sampling_ratio/mean": 1.0462580919265747, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9866037368774414, "sampling/sampling_logp_difference/mean": 0.034482866525650024, "step": 227, "step_time": 17.07198098198569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0021186440717428923, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0021186440717428923, "entropy": 0.24799764342606068, "epoch": 0.00228, "grad_norm": 0.9588650465011597, "kl": 0.15123312640935183, "learning_rate": 9.999997270870041e-06, "loss": 0.197, "step": 228, "step_time": 6.345221847994253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.002383285784162581, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002383285784162581, "completions/clipped_ratio": 0.03125, "completions/max_length": 1843.0, "completions/max_terminated_length": 1843.0, "completions/mean_length": 1701.09375, "completions/mean_terminated_length": 1697.9354248046875, "completions/min_length": 1578.0, "completions/min_terminated_length": 1578.0, "entropy": 0.3296756334602833, "epoch": 0.00229, "frac_reward_zero_std": 0.0, "grad_norm": 1.6870832443237305, "kl": 0.19182932376861572, "learning_rate": 9.999997242367576e-06, "loss": -0.1292, "num_tokens": 8494122.0, "reward": 8.126039505004883, "reward_std": 10.810226440429688, "rewards/rollout_reward_func/mean": 8.126039505004883, "rewards/rollout_reward_func/std": 12.264974594116211, "sampling/importance_sampling_ratio/max": 2.724033832550049, "sampling/importance_sampling_ratio/mean": 1.1820440292358398, "sampling/importance_sampling_ratio/min": 0.18919557332992554, "sampling/sampling_logp_difference/max": 1.1057090759277344, "sampling/sampling_logp_difference/mean": 0.03898300975561142, "step": 229, "step_time": 17.496836471007555 }, { "clip_ratio/high_max": 0.0028937748284079134, "clip_ratio/high_mean": 0.0014468874142039567, "clip_ratio/low_mean": 0.007724577677436173, "clip_ratio/low_min": 0.0017361111240461469, "clip_ratio/region_mean": 0.009171465208055452, "entropy": 0.3321936000138521, "epoch": 0.0023, "grad_norm": 1.4342739582061768, "kl": 0.18400872312486172, "learning_rate": 9.999997213717045e-06, "loss": -0.1374, "step": 230, "step_time": 6.778669374994934 }, { "clip_ratio/high_max": 0.007312921341508627, "clip_ratio/high_mean": 0.0036564606707543135, "clip_ratio/low_mean": 0.0013586956774815917, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005015156348235905, "completions/clipped_ratio": 0.0, "completions/max_length": 1683.0, "completions/max_terminated_length": 1683.0, "completions/mean_length": 1572.0625, "completions/mean_terminated_length": 1572.0625, "completions/min_length": 1348.0, "completions/min_terminated_length": 1348.0, "entropy": 0.3268578704446554, "epoch": 0.00231, "frac_reward_zero_std": 0.0, "grad_norm": 1.4352997541427612, "kl": 0.28508901223540306, "learning_rate": 9.99999718491845e-06, "loss": -0.1066, "num_tokens": 8566672.0, "reward": 8.48876953125, "reward_std": 9.348672866821289, "rewards/rollout_reward_func/mean": 8.48876953125, "rewards/rollout_reward_func/std": 11.30949878692627, "sampling/importance_sampling_ratio/max": 2.035207509994507, "sampling/importance_sampling_ratio/mean": 0.8822882175445557, "sampling/importance_sampling_ratio/min": 0.1726056933403015, "sampling/sampling_logp_difference/max": 1.810767650604248, "sampling/sampling_logp_difference/mean": 0.043136026710271835, "step": 231, "step_time": 16.27966301201377 }, { "clip_ratio/high_max": 0.009544790023937821, "clip_ratio/high_mean": 0.004772395011968911, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004772395011968911, "entropy": 0.326477263122797, "epoch": 0.00232, "grad_norm": 1.117466926574707, "kl": 0.23840315639972687, "learning_rate": 9.99999715597179e-06, "loss": -0.1133, "step": 232, "step_time": 6.005401846996392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0013020833721384406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013020833721384406, "completions/clipped_ratio": 0.03125, "completions/max_length": 1763.0, "completions/max_terminated_length": 1763.0, "completions/mean_length": 1621.21875, "completions/mean_terminated_length": 1618.3870849609375, "completions/min_length": 1434.0, "completions/min_terminated_length": 1434.0, "entropy": 0.33667995035648346, "epoch": 0.00233, "frac_reward_zero_std": 0.0, "grad_norm": 1.6114346981048584, "kl": 0.15120230242609978, "learning_rate": 9.999997126877066e-06, "loss": -0.1368, "num_tokens": 8640795.0, "reward": 8.421208381652832, "reward_std": 8.341537475585938, "rewards/rollout_reward_func/mean": 8.421208381652832, "rewards/rollout_reward_func/std": 10.693611145019531, "sampling/importance_sampling_ratio/max": 2.7017385959625244, "sampling/importance_sampling_ratio/mean": 1.084932804107666, "sampling/importance_sampling_ratio/min": 0.3072262406349182, "sampling/sampling_logp_difference/max": 0.7183988094329834, "sampling/sampling_logp_difference/mean": 0.03450397402048111, "step": 233, "step_time": 17.73834097001236 }, { "clip_ratio/high_max": 0.007812500232830644, "clip_ratio/high_mean": 0.004947916837409139, "clip_ratio/low_mean": 0.005983134149573743, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010931050986982882, "entropy": 0.3319475520402193, "epoch": 0.00234, "grad_norm": 1.2507743835449219, "kl": 0.15297285001724958, "learning_rate": 9.999997097634277e-06, "loss": -0.1403, "step": 234, "step_time": 6.2181126909999875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0010245901066809893, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010245901066809893, "completions/clipped_ratio": 0.0, "completions/max_length": 1842.0, "completions/max_terminated_length": 1842.0, "completions/mean_length": 1651.96875, "completions/mean_terminated_length": 1651.96875, "completions/min_length": 1457.0, "completions/min_terminated_length": 1457.0, "entropy": 0.29181801714003086, "epoch": 0.00235, "frac_reward_zero_std": 0.0, "grad_norm": 0.9517942070960999, "kl": 0.1458740197122097, "learning_rate": 9.999997068243421e-06, "loss": -0.1437, "num_tokens": 8715092.0, "reward": 10.55793285369873, "reward_std": 7.9695634841918945, "rewards/rollout_reward_func/mean": 10.55793285369873, "rewards/rollout_reward_func/std": 11.564793586730957, "sampling/importance_sampling_ratio/max": 1.968155026435852, "sampling/importance_sampling_ratio/mean": 0.8996999859809875, "sampling/importance_sampling_ratio/min": 1.509343932931846e-14, "sampling/sampling_logp_difference/max": 21.926584243774414, "sampling/sampling_logp_difference/mean": 0.08177145570516586, "step": 235, "step_time": 17.531996044999687 }, { "clip_ratio/high_max": 0.005319148767739534, "clip_ratio/high_mean": 0.002659574383869767, "clip_ratio/low_mean": 0.0020491802133619785, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0047087547136470675, "entropy": 0.2884218003600836, "epoch": 0.00236, "grad_norm": 0.8365284204483032, "kl": 0.15329450368881226, "learning_rate": 9.999997038704502e-06, "loss": -0.1436, "step": 236, "step_time": 6.3604239800188225 }, { "clip_ratio/high_max": 0.0026041667442768812, "clip_ratio/high_mean": 0.0013020833721384406, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013020833721384406, "completions/clipped_ratio": 0.0, "completions/max_length": 1750.0, "completions/max_terminated_length": 1750.0, "completions/mean_length": 1630.84375, "completions/mean_terminated_length": 1630.84375, "completions/min_length": 1450.0, "completions/min_terminated_length": 1450.0, "entropy": 0.28436414897441864, "epoch": 0.00237, "frac_reward_zero_std": 0.0, "grad_norm": 1.0795029401779175, "kl": 0.168654995970428, "learning_rate": 9.99999700901752e-06, "loss": -0.0128, "num_tokens": 8789074.0, "reward": 10.059757232666016, "reward_std": 6.925573825836182, "rewards/rollout_reward_func/mean": 10.059757232666016, "rewards/rollout_reward_func/std": 11.102499008178711, "sampling/importance_sampling_ratio/max": 2.735682487487793, "sampling/importance_sampling_ratio/mean": 1.0953364372253418, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.026933193206787, "sampling/sampling_logp_difference/mean": 0.039296992123126984, "step": 237, "step_time": 17.33113893601694 }, { "clip_ratio/high_max": 0.0026041667442768812, "clip_ratio/high_mean": 0.0013020833721384406, "clip_ratio/low_mean": 0.0033683401998132467, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004670423571951687, "entropy": 0.2811966259032488, "epoch": 0.00238, "grad_norm": 1.146651029586792, "kl": 0.185172688215971, "learning_rate": 9.999996979182472e-06, "loss": -0.0149, "step": 238, "step_time": 6.149650151026435 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.0026041667442768812, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "completions/clipped_ratio": 0.0, "completions/max_length": 1877.0, "completions/max_terminated_length": 1877.0, "completions/mean_length": 1701.46875, "completions/mean_terminated_length": 1701.46875, "completions/min_length": 1238.0, "completions/min_terminated_length": 1238.0, "entropy": 0.2687853779643774, "epoch": 0.00239, "frac_reward_zero_std": 0.0, "grad_norm": 0.8743325471878052, "kl": 0.17006723675876856, "learning_rate": 9.99999694919936e-06, "loss": -0.1713, "num_tokens": 8865404.0, "reward": 4.004517555236816, "reward_std": 12.075565338134766, "rewards/rollout_reward_func/mean": 4.004517555236816, "rewards/rollout_reward_func/std": 15.469751358032227, "sampling/importance_sampling_ratio/max": 2.3105993270874023, "sampling/importance_sampling_ratio/mean": 0.8783736824989319, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.150219440460205, "sampling/sampling_logp_difference/mean": 0.049677811563014984, "step": 239, "step_time": 18.9562221160013 }, { "clip_ratio/high_max": 0.0029761905316263437, "clip_ratio/high_mean": 0.0014880952658131719, "clip_ratio/low_mean": 0.0016447368543595076, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031328321201726794, "entropy": 0.2648761738091707, "epoch": 0.0024, "grad_norm": 0.9577370285987854, "kl": 0.1864310409873724, "learning_rate": 9.999996919068181e-06, "loss": -0.1733, "step": 240, "step_time": 6.54108049099159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1822.0, "completions/max_terminated_length": 1822.0, "completions/mean_length": 1704.4375, "completions/mean_terminated_length": 1704.4375, "completions/min_length": 1609.0, "completions/min_terminated_length": 1609.0, "entropy": 0.21218602545559406, "epoch": 0.00241, "frac_reward_zero_std": 0.0, "grad_norm": 0.6854826807975769, "kl": 0.2584373988211155, "learning_rate": 9.99999688878894e-06, "loss": -0.207, "num_tokens": 8941313.0, "reward": 11.11736011505127, "reward_std": 6.280404090881348, "rewards/rollout_reward_func/mean": 11.11736011505127, "rewards/rollout_reward_func/std": 8.219101905822754, "sampling/importance_sampling_ratio/max": 1.7017089128494263, "sampling/importance_sampling_ratio/mean": 0.7430813312530518, "sampling/importance_sampling_ratio/min": 0.2561444044113159, "sampling/sampling_logp_difference/max": 1.0890023708343506, "sampling/sampling_logp_difference/mean": 0.050988346338272095, "step": 241, "step_time": 18.113166210998315 }, { "clip_ratio/high_max": 0.007981132483109832, "clip_ratio/high_mean": 0.003990566241554916, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003990566241554916, "entropy": 0.21230853348970413, "epoch": 0.00242, "grad_norm": 0.6088385581970215, "kl": 0.24448918923735619, "learning_rate": 9.999996858361633e-06, "loss": -0.209, "step": 242, "step_time": 6.386485864000861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.002354377298615873, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002354377298615873, "completions/clipped_ratio": 0.0, "completions/max_length": 1767.0, "completions/max_terminated_length": 1767.0, "completions/mean_length": 1683.28125, "completions/mean_terminated_length": 1683.28125, "completions/min_length": 1468.0, "completions/min_terminated_length": 1468.0, "entropy": 0.26464011147618294, "epoch": 0.00243, "frac_reward_zero_std": 0.0, "grad_norm": 0.944441020488739, "kl": 0.2161180004477501, "learning_rate": 9.99999682778626e-06, "loss": -0.016, "num_tokens": 9016660.0, "reward": 7.343211650848389, "reward_std": 6.911102294921875, "rewards/rollout_reward_func/mean": 7.343211650848389, "rewards/rollout_reward_func/std": 7.349098205566406, "sampling/importance_sampling_ratio/max": 2.4053428173065186, "sampling/importance_sampling_ratio/mean": 0.9167285561561584, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9007391929626465, "sampling/sampling_logp_difference/mean": 0.0443832091987133, "step": 243, "step_time": 17.640845214016736 }, { "clip_ratio/high_max": 0.007812500232830644, "clip_ratio/high_mean": 0.003906250116415322, "clip_ratio/low_mean": 0.0013020833721384406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 0.26471036672592163, "epoch": 0.00244, "grad_norm": 0.9129452109336853, "kl": 0.23048090189695358, "learning_rate": 9.999996797062825e-06, "loss": -0.0188, "step": 244, "step_time": 6.222677419995307 }, { "clip_ratio/high_max": 0.005319148767739534, "clip_ratio/high_mean": 0.002659574383869767, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002659574383869767, "completions/clipped_ratio": 0.0, "completions/max_length": 1780.0, "completions/max_terminated_length": 1780.0, "completions/mean_length": 1690.84375, "completions/mean_terminated_length": 1690.84375, "completions/min_length": 1555.0, "completions/min_terminated_length": 1555.0, "entropy": 0.2658832222223282, "epoch": 0.00245, "frac_reward_zero_std": 0.0, "grad_norm": 2.6659154891967773, "kl": 0.49573541432619095, "learning_rate": 9.999996766191325e-06, "loss": -0.2884, "num_tokens": 9092078.0, "reward": 6.386229038238525, "reward_std": 7.280588626861572, "rewards/rollout_reward_func/mean": 6.386229038238525, "rewards/rollout_reward_func/std": 11.245016098022461, "sampling/importance_sampling_ratio/max": 2.4214608669281006, "sampling/importance_sampling_ratio/mean": 0.9312887787818909, "sampling/importance_sampling_ratio/min": 0.09448271989822388, "sampling/sampling_logp_difference/max": 1.3632465600967407, "sampling/sampling_logp_difference/mean": 0.051595307886600494, "step": 245, "step_time": 17.31270710901299 }, { "clip_ratio/high_max": 0.0047782184556126595, "clip_ratio/high_mean": 0.0023891092278063297, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0023891092278063297, "entropy": 0.2663339376449585, "epoch": 0.00246, "grad_norm": 2.32917857170105, "kl": 0.45332044921815395, "learning_rate": 9.999996735171758e-06, "loss": -0.2968, "step": 246, "step_time": 6.234720461987308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1851.0, "completions/max_terminated_length": 1851.0, "completions/mean_length": 1709.84375, "completions/mean_terminated_length": 1709.84375, "completions/min_length": 1553.0, "completions/min_terminated_length": 1553.0, "entropy": 0.20069380663335323, "epoch": 0.00247, "frac_reward_zero_std": 0.0, "grad_norm": 1.2144299745559692, "kl": 0.2537014037370682, "learning_rate": 9.99999670400413e-06, "loss": 0.0071, "num_tokens": 9168756.0, "reward": 7.594346046447754, "reward_std": 6.470214366912842, "rewards/rollout_reward_func/mean": 7.594346046447754, "rewards/rollout_reward_func/std": 8.790609359741211, "sampling/importance_sampling_ratio/max": 2.865860939025879, "sampling/importance_sampling_ratio/mean": 0.8611065745353699, "sampling/importance_sampling_ratio/min": 0.2114133983850479, "sampling/sampling_logp_difference/max": 1.2004327774047852, "sampling/sampling_logp_difference/mean": 0.04944504052400589, "step": 247, "step_time": 17.5921094109799 }, { "clip_ratio/high_max": 0.018284574849531054, "clip_ratio/high_mean": 0.009142287424765527, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009142287424765527, "entropy": 0.2055994737893343, "epoch": 0.00248, "grad_norm": 0.9102283120155334, "kl": 0.2279172483831644, "learning_rate": 9.999996672688435e-06, "loss": 0.0002, "step": 248, "step_time": 6.4680789479753 }, { "clip_ratio/high_max": 0.0008223684271797538, "clip_ratio/high_mean": 0.0004111842135898769, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004111842135898769, "completions/clipped_ratio": 0.03125, "completions/max_length": 1706.0, "completions/max_terminated_length": 1706.0, "completions/mean_length": 1643.65625, "completions/mean_terminated_length": 1644.806396484375, "completions/min_length": 1362.0, "completions/min_terminated_length": 1362.0, "entropy": 0.22801293060183525, "epoch": 0.00249, "frac_reward_zero_std": 0.0, "grad_norm": 1.2165124416351318, "kl": 0.23212281335145235, "learning_rate": 9.999996641224675e-06, "loss": -0.2107, "num_tokens": 9243244.0, "reward": 11.745177268981934, "reward_std": 10.083707809448242, "rewards/rollout_reward_func/mean": 11.745177268981934, "rewards/rollout_reward_func/std": 13.36994743347168, "sampling/importance_sampling_ratio/max": 1.8573793172836304, "sampling/importance_sampling_ratio/mean": 0.9563181400299072, "sampling/importance_sampling_ratio/min": 5.620415653100985e-16, "sampling/sampling_logp_difference/max": 13.22378921508789, "sampling/sampling_logp_difference/mean": 0.07179053127765656, "step": 249, "step_time": 16.68289119300607 }, { "clip_ratio/high_max": 0.008634868660010397, "clip_ratio/high_mean": 0.004317434330005199, "clip_ratio/low_mean": 0.0068824406480416656, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011199874978046864, "entropy": 0.23216930031776428, "epoch": 0.0025, "grad_norm": 0.709208071231842, "kl": 0.17897660844027996, "learning_rate": 9.99999660961285e-06, "loss": -0.2155, "step": 250, "step_time": 6.569442774998606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1760.0, "completions/max_terminated_length": 1760.0, "completions/mean_length": 1616.1875, "completions/mean_terminated_length": 1616.1875, "completions/min_length": 1419.0, "completions/min_terminated_length": 1419.0, "entropy": 0.22336449101567268, "epoch": 0.00251, "frac_reward_zero_std": 0.0, "grad_norm": 1.0643469095230103, "kl": 0.20764287933707237, "learning_rate": 9.999996577852961e-06, "loss": -0.1544, "num_tokens": 9316233.0, "reward": 9.066548347473145, "reward_std": 7.173275470733643, "rewards/rollout_reward_func/mean": 9.066548347473145, "rewards/rollout_reward_func/std": 8.616281509399414, "sampling/importance_sampling_ratio/max": 2.296557903289795, "sampling/importance_sampling_ratio/mean": 1.0019618272781372, "sampling/importance_sampling_ratio/min": 5.327309083408434e-18, "sampling/sampling_logp_difference/max": 21.94968605041504, "sampling/sampling_logp_difference/mean": 0.07890128344297409, "step": 251, "step_time": 17.20948045401019 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.0026041667442768812, "clip_ratio/low_mean": 0.0013020833721384406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003906250116415322, "entropy": 0.22621558792889118, "epoch": 0.00252, "grad_norm": 1.067492961883545, "kl": 0.20120939798653126, "learning_rate": 9.999996545945008e-06, "loss": -0.1582, "step": 252, "step_time": 6.172337967029307 }, { "clip_ratio/high_max": 0.007923315512016416, "clip_ratio/high_mean": 0.003961657756008208, "clip_ratio/low_mean": 0.0013020833721384406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005263741128146648, "completions/clipped_ratio": 0.0, "completions/max_length": 1850.0, "completions/max_terminated_length": 1850.0, "completions/mean_length": 1724.15625, "completions/mean_terminated_length": 1724.15625, "completions/min_length": 1619.0, "completions/min_terminated_length": 1619.0, "entropy": 0.2554971184581518, "epoch": 0.00253, "frac_reward_zero_std": 0.0, "grad_norm": 0.8342512845993042, "kl": 0.13784301187843084, "learning_rate": 9.99999651388899e-06, "loss": -0.1617, "num_tokens": 9393051.0, "reward": 6.3587446212768555, "reward_std": 5.878141403198242, "rewards/rollout_reward_func/mean": 6.3587446212768555, "rewards/rollout_reward_func/std": 7.401224136352539, "sampling/importance_sampling_ratio/max": 1.7011706829071045, "sampling/importance_sampling_ratio/mean": 0.8102820515632629, "sampling/importance_sampling_ratio/min": 0.18401601910591125, "sampling/sampling_logp_difference/max": 0.7976007461547852, "sampling/sampling_logp_difference/mean": 0.03998343273997307, "step": 253, "step_time": 16.99476237599447 }, { "clip_ratio/high_max": 0.005319148767739534, "clip_ratio/high_mean": 0.002659574383869767, "clip_ratio/low_mean": 0.0013297871919348836, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003989361575804651, "entropy": 0.25496077723801136, "epoch": 0.00254, "grad_norm": 0.8155441880226135, "kl": 0.13515021838247776, "learning_rate": 9.999996481684908e-06, "loss": -0.1635, "step": 254, "step_time": 6.855723389977356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1706.0, "completions/max_terminated_length": 1706.0, "completions/mean_length": 1573.125, "completions/mean_terminated_length": 1573.125, "completions/min_length": 1326.0, "completions/min_terminated_length": 1326.0, "entropy": 0.24164918810129166, "epoch": 0.00255, "frac_reward_zero_std": 0.0, "grad_norm": 0.8914675712585449, "kl": 0.20398523099720478, "learning_rate": 9.999996449332759e-06, "loss": -0.1174, "num_tokens": 9465560.0, "reward": 7.4363603591918945, "reward_std": 4.648753643035889, "rewards/rollout_reward_func/mean": 7.4363603591918945, "rewards/rollout_reward_func/std": 6.641512393951416, "sampling/importance_sampling_ratio/max": 2.7182021141052246, "sampling/importance_sampling_ratio/mean": 1.07029128074646, "sampling/importance_sampling_ratio/min": 0.1695837378501892, "sampling/sampling_logp_difference/max": 1.3619351387023926, "sampling/sampling_logp_difference/mean": 0.036803267896175385, "step": 255, "step_time": 16.92990311801259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.24215029180049896, "epoch": 0.00256, "grad_norm": 0.9378842711448669, "kl": 0.2046555932611227, "learning_rate": 9.999996416832548e-06, "loss": -0.1193, "step": 256, "step_time": 6.071674560022075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1897.0, "completions/max_terminated_length": 1897.0, "completions/mean_length": 1718.4375, "completions/mean_terminated_length": 1728.258056640625, "completions/min_length": 1414.0, "completions/min_terminated_length": 1481.0, "entropy": 0.33465169556438923, "epoch": 0.00257, "frac_reward_zero_std": 0.0, "grad_norm": 0.8127149343490601, "kl": 0.1856810823082924, "learning_rate": 9.99999638418427e-06, "loss": -0.1616, "num_tokens": 9541738.0, "reward": 7.635941982269287, "reward_std": 4.438357353210449, "rewards/rollout_reward_func/mean": 7.635941982269287, "rewards/rollout_reward_func/std": 5.802652359008789, "sampling/importance_sampling_ratio/max": 1.9430603981018066, "sampling/importance_sampling_ratio/mean": 0.9753587245941162, "sampling/importance_sampling_ratio/min": 1.1276283605511503e-16, "sampling/sampling_logp_difference/max": 9.333967208862305, "sampling/sampling_logp_difference/mean": 0.08307760953903198, "step": 257, "step_time": 18.23436665198824 }, { "clip_ratio/high_max": 0.0021186440717428923, "clip_ratio/high_mean": 0.0010593220358714461, "clip_ratio/low_mean": 0.002631870564073324, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0036911925999447703, "entropy": 0.3343069441616535, "epoch": 0.00258, "grad_norm": 0.7026026248931885, "kl": 0.1845451071858406, "learning_rate": 9.99999635138793e-06, "loss": -0.164, "step": 258, "step_time": 6.930805518961279 }, { "clip_ratio/high_max": 0.002659574383869767, "clip_ratio/high_mean": 0.0013297871919348836, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013297871919348836, "completions/clipped_ratio": 0.03125, "completions/max_length": 1713.0, "completions/max_terminated_length": 1713.0, "completions/mean_length": 1624.625, "completions/mean_terminated_length": 1631.806396484375, "completions/min_length": 1402.0, "completions/min_terminated_length": 1557.0, "entropy": 0.26722284965217113, "epoch": 0.00259, "frac_reward_zero_std": 0.0, "grad_norm": 0.789501428604126, "kl": 0.1330670602619648, "learning_rate": 9.999996318443524e-06, "loss": -0.0659, "num_tokens": 9615668.0, "reward": 7.344491958618164, "reward_std": 8.043564796447754, "rewards/rollout_reward_func/mean": 7.344491958618164, "rewards/rollout_reward_func/std": 9.5149564743042, "sampling/importance_sampling_ratio/max": 2.893212080001831, "sampling/importance_sampling_ratio/mean": 1.0169603824615479, "sampling/importance_sampling_ratio/min": 1.911685610690909e-19, "sampling/sampling_logp_difference/max": 11.67113208770752, "sampling/sampling_logp_difference/mean": 0.07321169972419739, "step": 259, "step_time": 16.906341740017524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0013020833721384406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013020833721384406, "entropy": 0.2665066681802273, "epoch": 0.0026, "grad_norm": 0.7773205637931824, "kl": 0.1342231323942542, "learning_rate": 9.999996285351053e-06, "loss": -0.0655, "step": 260, "step_time": 6.583419538990711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0013020833721384406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013020833721384406, "completions/clipped_ratio": 0.0, "completions/max_length": 1753.0, "completions/max_terminated_length": 1753.0, "completions/mean_length": 1734.96875, "completions/mean_terminated_length": 1734.96875, "completions/min_length": 1717.0, "completions/min_terminated_length": 1717.0, "entropy": 0.258160263299942, "epoch": 0.00261, "frac_reward_zero_std": 0.0, "grad_norm": 1.1056163311004639, "kl": 0.11192460730671883, "learning_rate": 9.999996252110517e-06, "loss": -0.2641, "num_tokens": 9692570.0, "reward": 11.925846099853516, "reward_std": 8.022760391235352, "rewards/rollout_reward_func/mean": 11.925846099853516, "rewards/rollout_reward_func/std": 12.211597442626953, "sampling/importance_sampling_ratio/max": 2.026559352874756, "sampling/importance_sampling_ratio/mean": 0.9596048593521118, "sampling/importance_sampling_ratio/min": 0.18917091190814972, "sampling/sampling_logp_difference/max": 0.9285869598388672, "sampling/sampling_logp_difference/mean": 0.03509381785988808, "step": 261, "step_time": 17.308086828983505 }, { "clip_ratio/high_max": 0.0026041667442768812, "clip_ratio/high_mean": 0.0013020833721384406, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013020833721384406, "entropy": 0.2580849453806877, "epoch": 0.00262, "grad_norm": 1.065615177154541, "kl": 0.11027599964290857, "learning_rate": 9.999996218721918e-06, "loss": -0.2674, "step": 262, "step_time": 6.322305525027332 }, { "clip_ratio/high_max": 0.0027777778450399637, "clip_ratio/high_mean": 0.0013888889225199819, "clip_ratio/low_mean": 0.0013020833721384406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026909722946584225, "completions/clipped_ratio": 0.0, "completions/max_length": 1770.0, "completions/max_terminated_length": 1770.0, "completions/mean_length": 1677.0625, "completions/mean_terminated_length": 1677.0625, "completions/min_length": 1625.0, "completions/min_terminated_length": 1625.0, "entropy": 0.23242628388106823, "epoch": 0.00263, "frac_reward_zero_std": 0.0, "grad_norm": 1.0263671875, "kl": 0.2316211173310876, "learning_rate": 9.999996185185253e-06, "loss": -0.2581, "num_tokens": 9768087.0, "reward": 7.5794878005981445, "reward_std": 7.448802947998047, "rewards/rollout_reward_func/mean": 7.5794878005981445, "rewards/rollout_reward_func/std": 11.126993179321289, "sampling/importance_sampling_ratio/max": 1.9072003364562988, "sampling/importance_sampling_ratio/mean": 0.9081864953041077, "sampling/importance_sampling_ratio/min": 0.12659026682376862, "sampling/sampling_logp_difference/max": 1.2870104312896729, "sampling/sampling_logp_difference/mean": 0.029428765177726746, "step": 263, "step_time": 18.527959250975982 }, { "clip_ratio/high_max": 0.005381944589316845, "clip_ratio/high_mean": 0.0026909722946584225, "clip_ratio/low_mean": 0.0013020833721384406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003993055666796863, "entropy": 0.2318430170416832, "epoch": 0.00264, "grad_norm": 0.9478548169136047, "kl": 0.2195054292678833, "learning_rate": 9.999996151500525e-06, "loss": -0.2611, "step": 264, "step_time": 6.935703489012667 }, { "clip_ratio/high_max": 0.005809294991195202, "clip_ratio/high_mean": 0.002904647495597601, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002904647495597601, "completions/clipped_ratio": 0.0, "completions/max_length": 1670.0, "completions/max_terminated_length": 1670.0, "completions/mean_length": 1587.1875, "completions/mean_terminated_length": 1587.1875, "completions/min_length": 844.0, "completions/min_terminated_length": 844.0, "entropy": 0.19888284988701344, "epoch": 0.00265, "frac_reward_zero_std": 0.0, "grad_norm": 1.0900896787643433, "kl": 0.2283599078655243, "learning_rate": 9.99999611766773e-06, "loss": -0.1542, "num_tokens": 9841129.0, "reward": 11.329960823059082, "reward_std": 6.5591206550598145, "rewards/rollout_reward_func/mean": 11.329960823059082, "rewards/rollout_reward_func/std": 10.03671932220459, "sampling/importance_sampling_ratio/max": 2.6593644618988037, "sampling/importance_sampling_ratio/mean": 1.0350549221038818, "sampling/importance_sampling_ratio/min": 0.144341379404068, "sampling/sampling_logp_difference/max": 1.9292607307434082, "sampling/sampling_logp_difference/mean": 0.03784944862127304, "step": 265, "step_time": 16.83255538099911 }, { "clip_ratio/high_max": 0.007812500232830644, "clip_ratio/high_mean": 0.003906250116415322, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003906250116415322, "entropy": 0.19720461778342724, "epoch": 0.00266, "grad_norm": 1.0230287313461304, "kl": 0.22190905176103115, "learning_rate": 9.999996083686872e-06, "loss": -0.1597, "step": 266, "step_time": 6.063315493011032 }, { "clip_ratio/high_max": 0.002659574383869767, "clip_ratio/high_mean": 0.0013297871919348836, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013297871919348836, "completions/clipped_ratio": 0.0, "completions/max_length": 1881.0, "completions/max_terminated_length": 1881.0, "completions/mean_length": 1709.53125, "completions/mean_terminated_length": 1709.53125, "completions/min_length": 1508.0, "completions/min_terminated_length": 1508.0, "entropy": 0.2589623462408781, "epoch": 0.00267, "frac_reward_zero_std": 0.0, "grad_norm": 1.1073801517486572, "kl": 0.23111738450825214, "learning_rate": 9.99999604955795e-06, "loss": -0.1308, "num_tokens": 9917749.0, "reward": 9.883926391601562, "reward_std": 6.641888618469238, "rewards/rollout_reward_func/mean": 9.883926391601562, "rewards/rollout_reward_func/std": 7.9768967628479, "sampling/importance_sampling_ratio/max": 2.161320447921753, "sampling/importance_sampling_ratio/mean": 0.9485155344009399, "sampling/importance_sampling_ratio/min": 0.2705399990081787, "sampling/sampling_logp_difference/max": 1.0482197999954224, "sampling/sampling_logp_difference/mean": 0.038760967552661896, "step": 267, "step_time": 18.682645180015243 }, { "clip_ratio/high_max": 0.004708754597231746, "clip_ratio/high_mean": 0.002354377298615873, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002354377298615873, "entropy": 0.2566060461103916, "epoch": 0.00268, "grad_norm": 0.848807692527771, "kl": 0.22342652268707752, "learning_rate": 9.999996015280962e-06, "loss": -0.1333, "step": 268, "step_time": 7.153046464009094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1840.0, "completions/max_terminated_length": 1840.0, "completions/mean_length": 1640.96875, "completions/mean_terminated_length": 1640.96875, "completions/min_length": 1516.0, "completions/min_terminated_length": 1516.0, "entropy": 0.25535392574965954, "epoch": 0.00269, "frac_reward_zero_std": 0.0, "grad_norm": 0.9480115175247192, "kl": 0.16580884903669357, "learning_rate": 9.999995980855908e-06, "loss": -0.0012, "num_tokens": 9992226.0, "reward": 4.32950496673584, "reward_std": 5.638688564300537, "rewards/rollout_reward_func/mean": 4.32950496673584, "rewards/rollout_reward_func/std": 6.413414001464844, "sampling/importance_sampling_ratio/max": 2.7369349002838135, "sampling/importance_sampling_ratio/mean": 1.0555014610290527, "sampling/importance_sampling_ratio/min": 0.3795217275619507, "sampling/sampling_logp_difference/max": 0.7226409912109375, "sampling/sampling_logp_difference/mean": 0.031509753316640854, "step": 269, "step_time": 17.104207691998454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.003906250116415322, "clip_ratio/low_min": 0.0026041667442768812, "clip_ratio/region_mean": 0.003906250116415322, "entropy": 0.2550513315945864, "epoch": 0.0027, "grad_norm": 0.7922755479812622, "kl": 0.1643505785614252, "learning_rate": 9.99999594628279e-06, "loss": -0.0077, "step": 270, "step_time": 6.451874619015143 }, { "clip_ratio/high_max": 0.004708754597231746, "clip_ratio/high_mean": 0.002354377298615873, "clip_ratio/low_mean": 0.0026909722946584225, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005045349593274295, "completions/clipped_ratio": 0.03125, "completions/max_length": 1900.0, "completions/max_terminated_length": 1900.0, "completions/mean_length": 1691.1875, "completions/mean_terminated_length": 1688.3870849609375, "completions/min_length": 1463.0, "completions/min_terminated_length": 1463.0, "entropy": 0.30452617816627026, "epoch": 0.00271, "frac_reward_zero_std": 0.0, "grad_norm": 1.5104411840438843, "kl": 0.22948436625301838, "learning_rate": 9.99999591156161e-06, "loss": 0.2198, "num_tokens": 10068104.0, "reward": 6.432297706604004, "reward_std": 6.147103786468506, "rewards/rollout_reward_func/mean": 6.432297706604004, "rewards/rollout_reward_func/std": 8.701411247253418, "sampling/importance_sampling_ratio/max": 1.832856297492981, "sampling/importance_sampling_ratio/mean": 0.8569630980491638, "sampling/importance_sampling_ratio/min": 0.2733321189880371, "sampling/sampling_logp_difference/max": 0.9841279983520508, "sampling/sampling_logp_difference/mean": 0.04119417071342468, "step": 271, "step_time": 18.331481316985446 }, { "clip_ratio/high_max": 0.005376965738832951, "clip_ratio/high_mean": 0.0026884828694164753, "clip_ratio/low_mean": 0.004460852185729891, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007149335055146366, "entropy": 0.3066379688680172, "epoch": 0.00272, "grad_norm": 1.3485785722732544, "kl": 0.22686109133064747, "learning_rate": 9.999995876692364e-06, "loss": 0.21, "step": 272, "step_time": 7.108004461013479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0013020833721384406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013020833721384406, "completions/clipped_ratio": 0.03125, "completions/max_length": 1778.0, "completions/max_terminated_length": 1778.0, "completions/mean_length": 1671.15625, "completions/mean_terminated_length": 1673.1934814453125, "completions/min_length": 1460.0, "completions/min_terminated_length": 1460.0, "entropy": 0.3139380067586899, "epoch": 0.00273, "frac_reward_zero_std": 0.0, "grad_norm": 1.2853142023086548, "kl": 0.19973258022218943, "learning_rate": 9.999995841675053e-06, "loss": 0.2147, "num_tokens": 10143122.0, "reward": 9.67431926727295, "reward_std": 5.3862810134887695, "rewards/rollout_reward_func/mean": 9.67431926727295, "rewards/rollout_reward_func/std": 6.793181419372559, "sampling/importance_sampling_ratio/max": 2.2370247840881348, "sampling/importance_sampling_ratio/mean": 0.922015905380249, "sampling/importance_sampling_ratio/min": 0.2980327308177948, "sampling/sampling_logp_difference/max": 0.8576059341430664, "sampling/sampling_logp_difference/mean": 0.04406359791755676, "step": 273, "step_time": 18.20580730306392 }, { "clip_ratio/high_max": 0.005381944589316845, "clip_ratio/high_mean": 0.0026909722946584225, "clip_ratio/low_mean": 0.0004111842135898769, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031021565082482994, "entropy": 0.3133608717471361, "epoch": 0.00274, "grad_norm": 1.1139087677001953, "kl": 0.20284830033779144, "learning_rate": 9.999995806509677e-06, "loss": 0.2096, "step": 274, "step_time": 6.229667861000053 }, { "clip_ratio/high_max": 0.0026041667442768812, "clip_ratio/high_mean": 0.0013020833721384406, "clip_ratio/low_mean": 0.0026041667442768812, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003906250116415322, "completions/clipped_ratio": 0.0, "completions/max_length": 1687.0, "completions/max_terminated_length": 1687.0, "completions/mean_length": 1622.3125, "completions/mean_terminated_length": 1622.3125, "completions/min_length": 1288.0, "completions/min_terminated_length": 1288.0, "entropy": 0.22922121360898018, "epoch": 0.00275, "frac_reward_zero_std": 0.0, "grad_norm": 0.6652722358703613, "kl": 0.1784737892448902, "learning_rate": 9.999995771196236e-06, "loss": -0.0751, "num_tokens": 10215786.0, "reward": 0.07990288734436035, "reward_std": 4.6149797439575195, "rewards/rollout_reward_func/mean": 0.07990288734436035, "rewards/rollout_reward_func/std": 5.331188201904297, "sampling/importance_sampling_ratio/max": 2.6279141902923584, "sampling/importance_sampling_ratio/mean": 0.8157901763916016, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 25.672908782958984, "sampling/sampling_logp_difference/mean": 0.08031786978244781, "step": 275, "step_time": 17.442152499992517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0027225379599258304, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0027225379599258304, "entropy": 0.22831929475069046, "epoch": 0.00276, "grad_norm": 0.6972734928131104, "kl": 0.17815779894590378, "learning_rate": 9.999995735734731e-06, "loss": -0.0735, "step": 276, "step_time": 6.033496930031106 }, { "clip_ratio/high_max": 0.008038949454203248, "clip_ratio/high_mean": 0.004019474727101624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004019474727101624, "completions/clipped_ratio": 0.0, "completions/max_length": 1889.0, "completions/max_terminated_length": 1889.0, "completions/mean_length": 1593.5, "completions/mean_terminated_length": 1593.5, "completions/min_length": 721.0, "completions/min_terminated_length": 721.0, "entropy": 0.24142522178590298, "epoch": 0.00277, "frac_reward_zero_std": 0.0, "grad_norm": 1.0345839262008667, "kl": 0.17460421845316887, "learning_rate": 9.999995700125161e-06, "loss": 0.0402, "num_tokens": 10288701.0, "reward": 9.026582717895508, "reward_std": 7.167001724243164, "rewards/rollout_reward_func/mean": 9.026582717895508, "rewards/rollout_reward_func/std": 13.523275375366211, "sampling/importance_sampling_ratio/max": 2.1058051586151123, "sampling/importance_sampling_ratio/mean": 0.9419305920600891, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8494397401809692, "sampling/sampling_logp_difference/mean": 0.03755520284175873, "step": 277, "step_time": 17.357707146991743 }, { "clip_ratio/high_max": 0.010756340576335788, "clip_ratio/high_mean": 0.005378170288167894, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005378170288167894, "entropy": 0.24099778942763805, "epoch": 0.00278, "grad_norm": 0.8826947212219238, "kl": 0.1772823967039585, "learning_rate": 9.999995664367528e-06, "loss": 0.0391, "step": 278, "step_time": 6.410597802998382 }, { "clip_ratio/high_max": 0.0026041667442768812, "clip_ratio/high_mean": 0.0013020833721384406, "clip_ratio/low_mean": 0.0013297871919348836, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002631870564073324, "completions/clipped_ratio": 0.0, "completions/max_length": 1735.0, "completions/max_terminated_length": 1735.0, "completions/mean_length": 1647.0, "completions/mean_terminated_length": 1647.0, "completions/min_length": 1462.0, "completions/min_terminated_length": 1462.0, "entropy": 0.24750126712024212, "epoch": 0.00279, "frac_reward_zero_std": 0.0, "grad_norm": 0.826245129108429, "kl": 0.22273140214383602, "learning_rate": 9.99999562846183e-06, "loss": -0.1032, "num_tokens": 10362783.0, "reward": 4.619167804718018, "reward_std": 6.310354709625244, "rewards/rollout_reward_func/mean": 4.619167804718018, "rewards/rollout_reward_func/std": 9.842005729675293, "sampling/importance_sampling_ratio/max": 1.6101582050323486, "sampling/importance_sampling_ratio/mean": 0.9931025505065918, "sampling/importance_sampling_ratio/min": 0.1963501125574112, "sampling/sampling_logp_difference/max": 0.9334216117858887, "sampling/sampling_logp_difference/mean": 0.040879398584365845, "step": 279, "step_time": 17.512093784011086 }, { "clip_ratio/high_max": 0.01070350268855691, "clip_ratio/high_mean": 0.005351751344278455, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005351751344278455, "entropy": 0.24799659848213196, "epoch": 0.0028, "grad_norm": 0.8156668543815613, "kl": 0.22060071863234043, "learning_rate": 9.999995592408066e-06, "loss": -0.1037, "step": 280, "step_time": 6.283328612043988 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.0026041667442768812, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "completions/clipped_ratio": 0.0, "completions/max_length": 1792.0, "completions/max_terminated_length": 1792.0, "completions/mean_length": 1620.25, "completions/mean_terminated_length": 1620.25, "completions/min_length": 1062.0, "completions/min_terminated_length": 1062.0, "entropy": 0.2766341511160135, "epoch": 0.00281, "frac_reward_zero_std": 0.0, "grad_norm": 1.1858279705047607, "kl": 0.2474101996049285, "learning_rate": 9.999995556206238e-06, "loss": -0.0874, "num_tokens": 10436180.0, "reward": 7.612247467041016, "reward_std": 9.558504104614258, "rewards/rollout_reward_func/mean": 7.612247467041016, "rewards/rollout_reward_func/std": 14.706161499023438, "sampling/importance_sampling_ratio/max": 2.4408814907073975, "sampling/importance_sampling_ratio/mean": 1.0338821411132812, "sampling/importance_sampling_ratio/min": 4.318780779505836e-14, "sampling/sampling_logp_difference/max": 22.375286102294922, "sampling/sampling_logp_difference/mean": 0.07358834147453308, "step": 281, "step_time": 17.474595935971593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0013020833721384406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013020833721384406, "entropy": 0.27758494205772877, "epoch": 0.00282, "grad_norm": 1.2391259670257568, "kl": 0.2432745574042201, "learning_rate": 9.999995519856345e-06, "loss": -0.0879, "step": 282, "step_time": 6.349060475025908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0013020833721384406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013020833721384406, "completions/clipped_ratio": 0.0, "completions/max_length": 1779.0, "completions/max_terminated_length": 1779.0, "completions/mean_length": 1652.0, "completions/mean_terminated_length": 1652.0, "completions/min_length": 861.0, "completions/min_terminated_length": 861.0, "entropy": 0.23485244996845722, "epoch": 0.00283, "frac_reward_zero_std": 0.0, "grad_norm": 0.8154040575027466, "kl": 0.21543766371905804, "learning_rate": 9.99999548335839e-06, "loss": -0.3146, "num_tokens": 10510005.0, "reward": 5.02086067199707, "reward_std": 5.039459228515625, "rewards/rollout_reward_func/mean": 5.02086067199707, "rewards/rollout_reward_func/std": 8.399057388305664, "sampling/importance_sampling_ratio/max": 2.8029603958129883, "sampling/importance_sampling_ratio/mean": 0.9871582388877869, "sampling/importance_sampling_ratio/min": 0.21777592599391937, "sampling/sampling_logp_difference/max": 0.9992363452911377, "sampling/sampling_logp_difference/mean": 0.043700963258743286, "step": 283, "step_time": 17.645202309984597 }, { "clip_ratio/high_max": 0.002659574383869767, "clip_ratio/high_mean": 0.0013297871919348836, "clip_ratio/low_mean": 0.0013297871919348836, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002659574383869767, "entropy": 0.23550846800208092, "epoch": 0.00284, "grad_norm": 0.8363182544708252, "kl": 0.21343258768320084, "learning_rate": 9.999995446712365e-06, "loss": -0.3158, "step": 284, "step_time": 6.184194153989665 }, { "clip_ratio/high_max": 0.0027173913549631834, "clip_ratio/high_mean": 0.0013586956774815917, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013586956774815917, "completions/clipped_ratio": 0.03125, "completions/max_length": 1809.0, "completions/max_terminated_length": 1809.0, "completions/mean_length": 1691.375, "completions/mean_terminated_length": 1705.1290283203125, "completions/min_length": 1265.0, "completions/min_terminated_length": 1593.0, "entropy": 0.32209539972245693, "epoch": 0.00285, "frac_reward_zero_std": 0.0, "grad_norm": 1.402232050895691, "kl": 0.22062261682003736, "learning_rate": 9.99999540991828e-06, "loss": -0.0216, "num_tokens": 10585392.0, "reward": 6.169960021972656, "reward_std": 10.45901107788086, "rewards/rollout_reward_func/mean": 6.169960021972656, "rewards/rollout_reward_func/std": 10.841930389404297, "sampling/importance_sampling_ratio/max": 2.8454012870788574, "sampling/importance_sampling_ratio/mean": 1.1326924562454224, "sampling/importance_sampling_ratio/min": 0.10358702391386032, "sampling/sampling_logp_difference/max": 1.6271138191223145, "sampling/sampling_logp_difference/mean": 0.04078031703829765, "step": 285, "step_time": 18.213287734033656 }, { "clip_ratio/high_max": 0.010122282896190882, "clip_ratio/high_mean": 0.005061141448095441, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005061141448095441, "entropy": 0.3228088766336441, "epoch": 0.00286, "grad_norm": 1.128409504890442, "kl": 0.18590146955102682, "learning_rate": 9.999995372976127e-06, "loss": -0.0289, "step": 286, "step_time": 6.277510045998497 }, { "clip_ratio/high_max": 0.005321558099240065, "clip_ratio/high_mean": 0.0026607790496200323, "clip_ratio/low_mean": 0.0013586956774815917, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004019474727101624, "completions/clipped_ratio": 0.0, "completions/max_length": 1807.0, "completions/max_terminated_length": 1807.0, "completions/mean_length": 1666.6875, "completions/mean_terminated_length": 1666.6875, "completions/min_length": 1560.0, "completions/min_terminated_length": 1560.0, "entropy": 0.2192042265087366, "epoch": 0.00287, "frac_reward_zero_std": 0.0, "grad_norm": 0.7668952345848083, "kl": 0.2138370256870985, "learning_rate": 9.999995335885913e-06, "loss": -0.1148, "num_tokens": 10660184.0, "reward": 13.632942199707031, "reward_std": 9.949352264404297, "rewards/rollout_reward_func/mean": 13.632942199707031, "rewards/rollout_reward_func/std": 10.009238243103027, "sampling/importance_sampling_ratio/max": 2.8991050720214844, "sampling/importance_sampling_ratio/mean": 1.0559149980545044, "sampling/importance_sampling_ratio/min": 0.3851551115512848, "sampling/sampling_logp_difference/max": 1.1157631874084473, "sampling/sampling_logp_difference/mean": 0.0370478481054306, "step": 287, "step_time": 17.08347035200859 }, { "clip_ratio/high_max": 0.005321558099240065, "clip_ratio/high_mean": 0.0026607790496200323, "clip_ratio/low_mean": 0.0010245901066809893, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0036853691563010216, "entropy": 0.22041206620633602, "epoch": 0.00288, "grad_norm": 0.7017627954483032, "kl": 0.21117174997925758, "learning_rate": 9.999995298647633e-06, "loss": -0.1169, "step": 288, "step_time": 6.762863796990132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1781.0, "completions/max_terminated_length": 1781.0, "completions/mean_length": 1654.15625, "completions/mean_terminated_length": 1654.15625, "completions/min_length": 1164.0, "completions/min_terminated_length": 1164.0, "entropy": 0.24024192802608013, "epoch": 0.00289, "frac_reward_zero_std": 0.0, "grad_norm": 0.8413768410682678, "kl": 0.3029135186225176, "learning_rate": 9.999995261261289e-06, "loss": -0.124, "num_tokens": 10733987.0, "reward": 11.960954666137695, "reward_std": 6.715695381164551, "rewards/rollout_reward_func/mean": 11.960954666137695, "rewards/rollout_reward_func/std": 10.024681091308594, "sampling/importance_sampling_ratio/max": 1.5300772190093994, "sampling/importance_sampling_ratio/mean": 0.8446735143661499, "sampling/importance_sampling_ratio/min": 0.23968873918056488, "sampling/sampling_logp_difference/max": 0.8441102504730225, "sampling/sampling_logp_difference/mean": 0.04132190719246864, "step": 289, "step_time": 17.763981282012537 }, { "clip_ratio/high_max": 0.0026041667442768812, "clip_ratio/high_mean": 0.0013020833721384406, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013020833721384406, "entropy": 0.2429343406111002, "epoch": 0.0029, "grad_norm": 0.8260259032249451, "kl": 0.2881174962967634, "learning_rate": 9.999995223726879e-06, "loss": -0.1249, "step": 290, "step_time": 6.200118904002011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1738.0, "completions/max_terminated_length": 1738.0, "completions/mean_length": 1644.40625, "completions/mean_terminated_length": 1644.40625, "completions/min_length": 1567.0, "completions/min_terminated_length": 1567.0, "entropy": 0.2123584058135748, "epoch": 0.00291, "frac_reward_zero_std": 0.0, "grad_norm": 0.8922343254089355, "kl": 0.2060016393661499, "learning_rate": 9.999995186044404e-06, "loss": -0.4535, "num_tokens": 10808964.0, "reward": 4.6076741218566895, "reward_std": 6.139898300170898, "rewards/rollout_reward_func/mean": 4.6076741218566895, "rewards/rollout_reward_func/std": 9.935084342956543, "sampling/importance_sampling_ratio/max": 2.1965487003326416, "sampling/importance_sampling_ratio/mean": 1.0928844213485718, "sampling/importance_sampling_ratio/min": 0.35504257678985596, "sampling/sampling_logp_difference/max": 0.7282986640930176, "sampling/sampling_logp_difference/mean": 0.039448536932468414, "step": 291, "step_time": 16.165088539986755 }, { "clip_ratio/high_max": 0.0027777778450399637, "clip_ratio/high_mean": 0.0013888889225199819, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013888889225199819, "entropy": 0.21387290954589844, "epoch": 0.00292, "grad_norm": 0.8675398826599121, "kl": 0.20884080044925213, "learning_rate": 9.999995148213866e-06, "loss": -0.4528, "step": 292, "step_time": 6.658173357995111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1753.0, "completions/max_terminated_length": 1753.0, "completions/mean_length": 1672.53125, "completions/mean_terminated_length": 1672.53125, "completions/min_length": 1593.0, "completions/min_terminated_length": 1593.0, "entropy": 0.23165510967373848, "epoch": 0.00293, "frac_reward_zero_std": 0.0, "grad_norm": 0.857829213142395, "kl": 0.19620831962674856, "learning_rate": 9.999995110235262e-06, "loss": -0.2138, "num_tokens": 10884400.0, "reward": 8.730931282043457, "reward_std": 10.12202262878418, "rewards/rollout_reward_func/mean": 8.730931282043457, "rewards/rollout_reward_func/std": 13.542600631713867, "sampling/importance_sampling_ratio/max": 2.384852170944214, "sampling/importance_sampling_ratio/mean": 0.8415107727050781, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.3091309070587158, "sampling/sampling_logp_difference/mean": 0.04259279742836952, "step": 293, "step_time": 16.80740202900779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2299076933413744, "epoch": 0.00294, "grad_norm": 0.8617515563964844, "kl": 0.19584429543465376, "learning_rate": 9.999995072108595e-06, "loss": -0.2166, "step": 294, "step_time": 6.167151440982707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0010245901066809893, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010245901066809893, "completions/clipped_ratio": 0.0, "completions/max_length": 1853.0, "completions/max_terminated_length": 1853.0, "completions/mean_length": 1694.1875, "completions/mean_terminated_length": 1694.1875, "completions/min_length": 1469.0, "completions/min_terminated_length": 1469.0, "entropy": 0.27573048509657383, "epoch": 0.00295, "frac_reward_zero_std": 0.0, "grad_norm": 1.7840907573699951, "kl": 0.18231362290680408, "learning_rate": 9.99999503383386e-06, "loss": 0.0253, "num_tokens": 10960211.0, "reward": 7.296823501586914, "reward_std": 9.992910385131836, "rewards/rollout_reward_func/mean": 7.296823501586914, "rewards/rollout_reward_func/std": 10.40943431854248, "sampling/importance_sampling_ratio/max": 2.5154125690460205, "sampling/importance_sampling_ratio/mean": 1.017417073249817, "sampling/importance_sampling_ratio/min": 0.27899715304374695, "sampling/sampling_logp_difference/max": 0.8613039255142212, "sampling/sampling_logp_difference/mean": 0.03972065448760986, "step": 295, "step_time": 17.167534129999694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0010245901066809893, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010245901066809893, "entropy": 0.2728814035654068, "epoch": 0.00296, "grad_norm": 1.6185214519500732, "kl": 0.1898116022348404, "learning_rate": 9.999994995411064e-06, "loss": 0.02, "step": 296, "step_time": 6.893430356984027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1810.0, "completions/max_terminated_length": 1810.0, "completions/mean_length": 1703.84375, "completions/mean_terminated_length": 1703.84375, "completions/min_length": 1565.0, "completions/min_terminated_length": 1565.0, "entropy": 0.2271426096558571, "epoch": 0.00297, "frac_reward_zero_std": 0.0, "grad_norm": 0.7183117270469666, "kl": 0.1951038846746087, "learning_rate": 9.999994956840202e-06, "loss": -0.2079, "num_tokens": 11036566.0, "reward": 13.149208068847656, "reward_std": 5.941537857055664, "rewards/rollout_reward_func/mean": 13.149208068847656, "rewards/rollout_reward_func/std": 9.841896057128906, "sampling/importance_sampling_ratio/max": 2.243212938308716, "sampling/importance_sampling_ratio/mean": 1.0565229654312134, "sampling/importance_sampling_ratio/min": 0.19737297296524048, "sampling/sampling_logp_difference/max": 0.9286561012268066, "sampling/sampling_logp_difference/mean": 0.03950604796409607, "step": 297, "step_time": 17.391589345003013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.22314191982150078, "epoch": 0.00298, "grad_norm": 0.6917129158973694, "kl": 0.20468503795564175, "learning_rate": 9.999994918121276e-06, "loss": -0.2117, "step": 298, "step_time": 6.273953368028742 }, { "clip_ratio/high_max": 0.0026041667442768812, "clip_ratio/high_mean": 0.0013020833721384406, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013020833721384406, "completions/clipped_ratio": 0.0, "completions/max_length": 1880.0, "completions/max_terminated_length": 1880.0, "completions/mean_length": 1643.1875, "completions/mean_terminated_length": 1643.1875, "completions/min_length": 1373.0, "completions/min_terminated_length": 1373.0, "entropy": 0.1924700252711773, "epoch": 0.00299, "frac_reward_zero_std": 0.0, "grad_norm": 0.9369863867759705, "kl": 0.1589259123429656, "learning_rate": 9.999994879254285e-06, "loss": -0.303, "num_tokens": 11110534.0, "reward": 7.322764873504639, "reward_std": 8.005724906921387, "rewards/rollout_reward_func/mean": 7.322764873504639, "rewards/rollout_reward_func/std": 8.989898681640625, "sampling/importance_sampling_ratio/max": 2.86427903175354, "sampling/importance_sampling_ratio/mean": 1.0593392848968506, "sampling/importance_sampling_ratio/min": 0.3740682899951935, "sampling/sampling_logp_difference/max": 1.061275601387024, "sampling/sampling_logp_difference/mean": 0.03460990637540817, "step": 299, "step_time": 17.07779270899482 }, { "clip_ratio/high_max": 0.0027777778450399637, "clip_ratio/high_mean": 0.0013888889225199819, "clip_ratio/low_mean": 0.0013888889225199819, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0027777778450399637, "entropy": 0.18851100280880928, "epoch": 0.003, "grad_norm": 0.9343079328536987, "kl": 0.17096585407853127, "learning_rate": 9.99999484023923e-06, "loss": -0.3072, "step": 300, "step_time": 6.857162347980193 } ], "logging_steps": 1.0, "max_steps": 500000, "num_input_tokens_seen": 11110534, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }