{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 50, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "advantage/absmean": 0.0, "entropy": 0.15006184577941895, "epoch": 0.005, "grad_norm": 0.0, "importance_ratio": 1.0261199474334717, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.006451699882745743, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 1, "timing/generation_ms": 1698.3512043952942, "timing/scoring_ms": 0.7268786430358887, "timing/total_ms": 1699.07808303833, "tokens/completion": 54.375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 2.3633933067321777 }, { "advantage/absmean": 0.0, "entropy": 0.17195618152618408, "epoch": 0.01, "grad_norm": 0.0, "importance_ratio": 1.0242215394973755, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.08614622056484222, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 2, "timing/generation_ms": 932.7193200588226, "timing/scoring_ms": 0.7946789264678955, "timing/total_ms": 933.5139989852905, "tokens/completion": 410.875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 11.421632289886475 }, { "advantage/absmean": 0.0, "entropy": 0.115142822265625, "epoch": 0.015, "grad_norm": 0.0, "importance_ratio": 1.0218698978424072, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.004534157458692789, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 3, "timing/generation_ms": 724.0002751350403, "timing/scoring_ms": 0.6492435932159424, "timing/total_ms": 724.6495187282562, "tokens/completion": 71.375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 1.2587954998016357 }, { "advantage/absmean": 0.0, "entropy": 0.2124594748020172, "epoch": 0.02, "grad_norm": 0.0, "importance_ratio": 1.0258363485336304, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.1492682695388794, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 4, "timing/generation_ms": 2201.596885919571, "timing/scoring_ms": 0.8376836776733398, "timing/total_ms": 2202.4345695972443, "tokens/completion": 248.125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 9.19088625907898 }, { "advantage/absmean": 0.0, "entropy": 0.27716493606567383, "epoch": 0.025, "grad_norm": 0.0, "importance_ratio": 1.045271635055542, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.17981982231140137, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 5, "timing/generation_ms": 621.8876540660858, "timing/scoring_ms": 0.6612539291381836, "timing/total_ms": 622.548907995224, "tokens/completion": 59.125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 1.080723762512207 }, { "advantage/absmean": 0.0, "entropy": 0.40397658944129944, "epoch": 0.03, "grad_norm": 0.0, "importance_ratio": 1.0475738048553467, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.11378740519285202, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 6, "timing/generation_ms": 417.83782839775085, "timing/scoring_ms": 0.7635653018951416, "timing/total_ms": 418.601393699646, "tokens/completion": 221.25, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 12.328477144241333 }, { "advantage/absmean": 0.0, "entropy": 0.147885262966156, "epoch": 0.035, "grad_norm": 0.0, "importance_ratio": 1.0185482501983643, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.12599562108516693, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 7, "timing/generation_ms": 844.9589610099792, "timing/scoring_ms": 0.8200705051422119, "timing/total_ms": 845.7790315151215, "tokens/completion": 89.0, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 3.4193923473358154 }, { "advantage/absmean": 0.0, "entropy": 0.11576475948095322, "epoch": 0.04, "grad_norm": 0.0, "importance_ratio": 1.0273798704147339, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.0048851314932107925, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 8, "timing/generation_ms": 704.8712074756622, "timing/scoring_ms": 0.6542205810546875, "timing/total_ms": 705.5254280567169, "tokens/completion": 70.375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 0.8819370269775391 }, { "advantage/absmean": 0.0, "entropy": 0.02913064695894718, "epoch": 0.045, "grad_norm": 0.0, "importance_ratio": 1.0153093338012695, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.001006214995868504, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 9, "timing/generation_ms": 271.4902460575104, "timing/scoring_ms": 0.586777925491333, "timing/total_ms": 272.0770239830017, "tokens/completion": 19.0, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 0.31793665885925293 }, { "advantage/absmean": 0.0, "entropy": 0.1483931690454483, "epoch": 0.05, "grad_norm": 0.0, "importance_ratio": 1.026798963546753, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.0034873306285589933, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 10, "timing/generation_ms": 463.72705698013306, "timing/scoring_ms": 0.611037015914917, "timing/total_ms": 464.338093996048, "tokens/completion": 42.5, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 0.5941140651702881 }, { "advantage/absmean": 0.0, "entropy": 0.23306170105934143, "epoch": 0.055, "grad_norm": 0.0, "importance_ratio": 1.0433093309402466, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.008515285328030586, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 11, "timing/generation_ms": 870.1420724391937, "timing/scoring_ms": 0.7279813289642334, "timing/total_ms": 870.870053768158, "tokens/completion": 91.0, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 1.804434061050415 }, { "advantage/absmean": 0.0, "entropy": 0.29323288798332214, "epoch": 0.06, "grad_norm": 0.0, "importance_ratio": 1.0345637798309326, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.3299441337585449, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 12, "timing/generation_ms": 629.0676891803741, "timing/scoring_ms": 0.6923079490661621, "timing/total_ms": 629.7599971294403, "tokens/completion": 62.5, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 1.4737663269042969 }, { "advantage/absmean": 0.0, "entropy": 0.11081530898809433, "epoch": 0.065, "grad_norm": 0.0, "importance_ratio": 1.0188957452774048, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.0057387640699744225, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 13, "timing/generation_ms": 426.2040853500366, "timing/scoring_ms": 0.6040334701538086, "timing/total_ms": 426.80811882019043, "tokens/completion": 38.75, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 0.4963686466217041 }, { "advantage/absmean": 0.0, "entropy": 0.06551698595285416, "epoch": 0.07, "grad_norm": 0.0, "importance_ratio": 1.015353798866272, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.0013749051140621305, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 14, "timing/generation_ms": 386.07409596443176, "timing/scoring_ms": 0.6273090839385986, "timing/total_ms": 386.70140504837036, "tokens/completion": 33.125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 0.5338990688323975 }, { "advantage/absmean": 0.0, "entropy": 0.12789036333560944, "epoch": 0.075, "grad_norm": 0.0, "importance_ratio": 0.9985013604164124, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.22250151634216309, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 15, "timing/generation_ms": 555.951327085495, "timing/scoring_ms": 0.9113848209381104, "timing/total_ms": 556.8627119064331, "tokens/completion": 55.375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 2.410506248474121 }, { "advantage/absmean": 0.0, "entropy": 0.030439574271440506, "epoch": 0.08, "grad_norm": 0.0, "importance_ratio": 1.0107778310775757, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.0009332169429399073, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 16, "timing/generation_ms": 432.63983726501465, "timing/scoring_ms": 0.645369291305542, "timing/total_ms": 433.2852065563202, "tokens/completion": 36.75, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 0.5013589859008789 }, { "advantage/absmean": 0.0, "entropy": 0.04658935219049454, "epoch": 0.085, "grad_norm": 0.0, "importance_ratio": 1.0079694986343384, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.0018772457260638475, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 17, "timing/generation_ms": 1556.4503371715546, "timing/scoring_ms": 0.6945133209228516, "timing/total_ms": 1557.1448504924774, "tokens/completion": 171.25, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 2.7098050117492676 }, { "advantage/absmean": 0.0, "entropy": 0.2258908450603485, "epoch": 0.09, "grad_norm": 0.0, "importance_ratio": 1.0123529434204102, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.3713923394680023, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 18, "timing/generation_ms": 663.5035574436188, "timing/scoring_ms": 2.157866954803467, "timing/total_ms": 665.6614243984222, "tokens/completion": 64.125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 2.1164193153381348 }, { "advantage/absmean": 0.0, "entropy": 0.11793641746044159, "epoch": 0.095, "grad_norm": 0.0, "importance_ratio": 1.0255554914474487, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.004143392201513052, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 19, "timing/generation_ms": 500.055193901062, "timing/scoring_ms": 0.7805824279785156, "timing/total_ms": 500.8357763290405, "tokens/completion": 44.25, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 0.6374855041503906 }, { "advantage/absmean": 0.0, "entropy": 0.03061797469854355, "epoch": 0.1, "grad_norm": 0.0, "importance_ratio": 1.0087852478027344, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.001804706989787519, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 20, "timing/generation_ms": 362.3976409435272, "timing/scoring_ms": 0.6100833415985107, "timing/total_ms": 363.00772428512573, "tokens/completion": 30.375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 0.5654816627502441 }, { "advantage/absmean": 0.0, "entropy": 0.1123146191239357, "epoch": 0.105, "grad_norm": 0.0, "importance_ratio": 1.0212582349777222, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.003904660465195775, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 21, "timing/generation_ms": 397.72674441337585, "timing/scoring_ms": 0.7104873657226562, "timing/total_ms": 398.4372317790985, "tokens/completion": 35.5, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 1.0611999034881592 }, { "advantage/absmean": 0.0, "entropy": 0.1266263872385025, "epoch": 0.11, "grad_norm": 0.0, "importance_ratio": 0.9972212910652161, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.3898836672306061, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 22, "timing/generation_ms": 696.1232721805573, "timing/scoring_ms": 0.6523430347442627, "timing/total_ms": 696.7756152153015, "tokens/completion": 68.875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 2.589643955230713 }, { "advantage/absmean": 0.0, "entropy": 0.2645396888256073, "epoch": 0.115, "grad_norm": 0.0, "importance_ratio": 1.0332810878753662, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.11630075424909592, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 23, "timing/generation_ms": 643.5955762863159, "timing/scoring_ms": 0.7938146591186523, "timing/total_ms": 644.3893909454346, "tokens/completion": 552.625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 11.960213899612427 }, { "advantage/absmean": 0.0, "entropy": 0.11405622959136963, "epoch": 0.12, "grad_norm": 0.0, "importance_ratio": 0.9960747957229614, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.22632676362991333, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 24, "timing/generation_ms": 532.6159298419952, "timing/scoring_ms": 0.6944537162780762, "timing/total_ms": 533.3103835582733, "tokens/completion": 52.75, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 1.8383936882019043 }, { "advantage/absmean": 0.0, "entropy": 0.2683159112930298, "epoch": 0.125, "grad_norm": 0.0, "importance_ratio": 1.0473229885101318, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.05252255126833916, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 25, "timing/generation_ms": 1156.8818092346191, "timing/scoring_ms": 0.7328987121582031, "timing/total_ms": 1157.6147079467773, "tokens/completion": 125.0, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 2.4190943241119385 }, { "advantage/absmean": 0.0, "entropy": 0.25785261392593384, "epoch": 0.13, "grad_norm": 0.0, "importance_ratio": 1.0333380699157715, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.007053897250443697, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 26, "timing/generation_ms": 776.8439650535583, "timing/scoring_ms": 0.6645023822784424, "timing/total_ms": 777.5084674358368, "tokens/completion": 79.0, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 1.0898668766021729 }, { "advantage/absmean": 0.0, "entropy": 0.17658951878547668, "epoch": 0.135, "grad_norm": 0.0, "importance_ratio": 1.0075035095214844, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.11815378069877625, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 27, "timing/generation_ms": 1131.6810548305511, "timing/scoring_ms": 1.0703504085540771, "timing/total_ms": 1132.7514052391052, "tokens/completion": 123.375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 5.199021816253662 }, { "advantage/absmean": 0.0, "entropy": 0.19042794406414032, "epoch": 0.14, "grad_norm": 0.0, "importance_ratio": 1.0446932315826416, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.007917712442576885, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 28, "timing/generation_ms": 774.4744420051575, "timing/scoring_ms": 0.6805062294006348, "timing/total_ms": 775.1549482345581, "tokens/completion": 80.375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 1.0865705013275146 }, { "advantage/absmean": 0.0, "entropy": 0.01269950158894062, "epoch": 0.145, "grad_norm": 0.0, "importance_ratio": 1.0108988285064697, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.0010445379884913564, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 29, "timing/generation_ms": 351.19348764419556, "timing/scoring_ms": 0.5960464477539062, "timing/total_ms": 351.78953409194946, "tokens/completion": 29.0, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 0.3943216800689697 }, { "advantage/absmean": 0.0, "entropy": 0.15677109360694885, "epoch": 0.15, "grad_norm": 0.0, "importance_ratio": 1.0165177583694458, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.1225021705031395, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 30, "timing/generation_ms": 395.7294523715973, "timing/scoring_ms": 0.680774450302124, "timing/total_ms": 396.4102268218994, "tokens/completion": 197.625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 10.843740224838257 }, { "advantage/absmean": 0.0, "entropy": 0.062308620661497116, "epoch": 0.155, "grad_norm": 0.0, "importance_ratio": 1.0222434997558594, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.0038490889128297567, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 31, "timing/generation_ms": 369.1469430923462, "timing/scoring_ms": 0.579148530960083, "timing/total_ms": 369.7260916233063, "tokens/completion": 34.125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 0.5606341361999512 }, { "advantage/absmean": 0.0, "entropy": 0.23237283527851105, "epoch": 0.16, "grad_norm": 0.0, "importance_ratio": 1.0378639698028564, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.10799189656972885, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 32, "timing/generation_ms": 927.5532960891724, "timing/scoring_ms": 0.7814168930053711, "timing/total_ms": 928.3347129821777, "tokens/completion": 100.0, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 5.054908037185669 }, { "advantage/absmean": 0.0, "entropy": 0.09008489549160004, "epoch": 0.165, "grad_norm": 0.0, "importance_ratio": 1.0141470432281494, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.07960522174835205, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 33, "timing/generation_ms": 604.5160591602325, "timing/scoring_ms": 0.7110834121704102, "timing/total_ms": 605.227142572403, "tokens/completion": 240.5, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 11.723113536834717 }, { "advantage/absmean": 0.0, "entropy": 0.07147213816642761, "epoch": 0.17, "grad_norm": 0.0, "importance_ratio": 1.017217755317688, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.002959782024845481, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 34, "timing/generation_ms": 2262.271970510483, "timing/scoring_ms": 0.7635056972503662, "timing/total_ms": 2263.035476207733, "tokens/completion": 256.75, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 3.7624332904815674 }, { "advantage/absmean": 0.0, "entropy": 0.15416958928108215, "epoch": 0.175, "grad_norm": 0.0, "importance_ratio": 1.0176974534988403, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.1028779074549675, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 35, "timing/generation_ms": 2697.2156167030334, "timing/scoring_ms": 0.684887170791626, "timing/total_ms": 2697.900503873825, "tokens/completion": 300.625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 9.885279178619385 }, { "advantage/absmean": 0.0, "entropy": 0.12759198248386383, "epoch": 0.18, "grad_norm": 0.0, "importance_ratio": 1.012745976448059, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.09079758822917938, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 36, "timing/generation_ms": 1976.068377494812, "timing/scoring_ms": 0.7205009460449219, "timing/total_ms": 1976.788878440857, "tokens/completion": 224.25, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 5.973433494567871 }, { "advantage/absmean": 0.0, "entropy": 0.06154109537601471, "epoch": 0.185, "grad_norm": 0.0, "importance_ratio": 1.0275661945343018, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.004207623656839132, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 37, "timing/generation_ms": 587.2387886047363, "timing/scoring_ms": 0.5762577056884766, "timing/total_ms": 587.8150463104248, "tokens/completion": 57.25, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 1.1877899169921875 }, { "advantage/absmean": 0.0, "entropy": 0.17551030218601227, "epoch": 0.19, "grad_norm": 0.0, "importance_ratio": 1.0363469123840332, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.005765930283814669, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 38, "timing/generation_ms": 458.0366611480713, "timing/scoring_ms": 0.7271468639373779, "timing/total_ms": 458.76380801200867, "tokens/completion": 39.875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 0.5540122985839844 }, { "advantage/absmean": 0.0, "entropy": 0.19849978387355804, "epoch": 0.195, "grad_norm": 0.0, "importance_ratio": 1.0219554901123047, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.34199321269989014, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 39, "timing/generation_ms": 814.9829804897308, "timing/scoring_ms": 0.7112026214599609, "timing/total_ms": 815.6941831111908, "tokens/completion": 83.375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 2.696500539779663 }, { "advantage/absmean": 0.0, "entropy": 0.18374408781528473, "epoch": 0.2, "grad_norm": 0.0, "importance_ratio": 1.0168310403823853, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.2526581585407257, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 40, "timing/generation_ms": 2500.264137983322, "timing/scoring_ms": 0.8244216442108154, "timing/total_ms": 2501.088559627533, "tokens/completion": 281.375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 6.985349655151367 }, { "advantage/absmean": 0.0, "entropy": 0.11079374700784683, "epoch": 0.205, "grad_norm": 0.0, "importance_ratio": 0.9942675828933716, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.34882354736328125, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 41, "timing/generation_ms": 449.1511583328247, "timing/scoring_ms": 0.6124675273895264, "timing/total_ms": 449.76362586021423, "tokens/completion": 41.375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 1.4740698337554932 }, { "advantage/absmean": 0.0, "entropy": 0.0783177837729454, "epoch": 0.21, "grad_norm": 0.0, "importance_ratio": 0.990806519985199, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.2581024467945099, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 42, "timing/generation_ms": 461.476594209671, "timing/scoring_ms": 0.60310959815979, "timing/total_ms": 462.0797038078308, "tokens/completion": 44.25, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 1.6569180488586426 }, { "advantage/absmean": 0.0, "entropy": 0.12061590701341629, "epoch": 0.215, "grad_norm": 0.0, "importance_ratio": 1.0076611042022705, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.13572479784488678, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 43, "timing/generation_ms": 2948.4872221946716, "timing/scoring_ms": 0.7379353046417236, "timing/total_ms": 2949.2251574993134, "tokens/completion": 330.375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 10.421152114868164 }, { "advantage/absmean": 0.0, "entropy": 0.09077983349561691, "epoch": 0.22, "grad_norm": 0.0, "importance_ratio": 1.0351545810699463, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.0057518817484378815, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 44, "timing/generation_ms": 348.82035851478577, "timing/scoring_ms": 0.6234645843505859, "timing/total_ms": 349.44382309913635, "tokens/completion": 31.5, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 0.39938855171203613 }, { "advantage/absmean": 0.0, "entropy": 0.2500532269477844, "epoch": 0.225, "grad_norm": 0.0, "importance_ratio": 1.0335536003112793, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.008457548916339874, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 45, "timing/generation_ms": 597.802609205246, "timing/scoring_ms": 0.6000399589538574, "timing/total_ms": 598.4026491641998, "tokens/completion": 57.0, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 0.8566296100616455 }, { "advantage/absmean": 0.0, "entropy": 0.11787374317646027, "epoch": 0.23, "grad_norm": 0.0, "importance_ratio": 1.006740927696228, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.15599803626537323, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 46, "timing/generation_ms": 729.5256555080414, "timing/scoring_ms": 0.7490813732147217, "timing/total_ms": 730.2747368812561, "tokens/completion": 75.625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 2.9821267127990723 }, { "advantage/absmean": 0.0, "entropy": 0.09701739996671677, "epoch": 0.235, "grad_norm": 0.0, "importance_ratio": 1.0089037418365479, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.1177579015493393, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 47, "timing/generation_ms": 1653.928518295288, "timing/scoring_ms": 0.683814287185669, "timing/total_ms": 1654.6123325824738, "tokens/completion": 181.25, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 6.359994649887085 }, { "advantage/absmean": 0.0, "entropy": 0.18016532063484192, "epoch": 0.24, "grad_norm": 0.0, "importance_ratio": 1.044398546218872, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.008964401669800282, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 48, "timing/generation_ms": 508.2835555076599, "timing/scoring_ms": 0.7072389125823975, "timing/total_ms": 508.9907944202423, "tokens/completion": 48.625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 0.7296624183654785 }, { "advantage/absmean": 0.0, "entropy": 0.13226501643657684, "epoch": 0.245, "grad_norm": 0.0, "importance_ratio": 1.0037078857421875, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.318349689245224, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 49, "timing/generation_ms": 670.9981858730316, "timing/scoring_ms": 0.7001161575317383, "timing/total_ms": 671.6983020305634, "tokens/completion": 67.625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 2.8289878368377686 }, { "advantage/absmean": 0.0, "entropy": 0.12544099986553192, "epoch": 0.25, "grad_norm": 0.0, "importance_ratio": 0.9969345331192017, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.2675250172615051, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 50, "timing/generation_ms": 744.9484169483185, "timing/scoring_ms": 0.6595849990844727, "timing/total_ms": 745.608001947403, "tokens/completion": 76.875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 2.5667636394500732 }, { "advantage/absmean": 0.0, "entropy": 0.08263812214136124, "epoch": 0.255, "grad_norm": 0.0, "importance_ratio": 1.0120441913604736, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.00446532154455781, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 51, "timing/generation_ms": 746.0676431655884, "timing/scoring_ms": 0.6454885005950928, "timing/total_ms": 746.7131316661835, "tokens/completion": 75.375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 1.114408254623413 }, { "advantage/absmean": 0.0, "entropy": 0.1600213646888733, "epoch": 0.26, "grad_norm": 0.0, "importance_ratio": 1.025458812713623, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.07610664516687393, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 52, "timing/generation_ms": 1782.6890647411346, "timing/scoring_ms": 0.804603099822998, "timing/total_ms": 1783.4936678409576, "tokens/completion": 364.5, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 11.725010871887207 }, { "advantage/absmean": 0.0, "entropy": 0.26906779408454895, "epoch": 0.265, "grad_norm": 0.0, "importance_ratio": 1.0447108745574951, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.012701214291155338, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 53, "timing/generation_ms": 606.6789925098419, "timing/scoring_ms": 0.6681978702545166, "timing/total_ms": 607.3471903800964, "tokens/completion": 61.625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 1.1479740142822266 }, { "advantage/absmean": 0.0, "entropy": 0.07608509808778763, "epoch": 0.27, "grad_norm": 0.0, "importance_ratio": 1.0142316818237305, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.001447903923690319, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 54, "timing/generation_ms": 573.223203420639, "timing/scoring_ms": 0.6898641586303711, "timing/total_ms": 573.9130675792694, "tokens/completion": 54.0, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 0.6409895420074463 }, { "advantage/absmean": 0.0, "entropy": 0.09488406777381897, "epoch": 0.275, "grad_norm": 0.0, "importance_ratio": 1.0251550674438477, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.002814844949170947, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 55, "timing/generation_ms": 468.05262565612793, "timing/scoring_ms": 0.6207823753356934, "timing/total_ms": 468.6734080314636, "tokens/completion": 42.0, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 0.7642278671264648 }, { "advantage/absmean": 0.0, "entropy": 0.1516200751066208, "epoch": 0.28, "grad_norm": 0.0, "importance_ratio": 1.0147314071655273, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.16201260685920715, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 56, "timing/generation_ms": 846.1059331893921, "timing/scoring_ms": 0.6973147392272949, "timing/total_ms": 846.8032479286194, "tokens/completion": 88.25, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 3.888683557510376 }, { "advantage/absmean": 0.0, "entropy": 0.24036580324172974, "epoch": 0.285, "grad_norm": 0.0, "importance_ratio": 1.0284076929092407, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.23654639720916748, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 57, "timing/generation_ms": 550.5916476249695, "timing/scoring_ms": 0.6979405879974365, "timing/total_ms": 551.2895882129669, "tokens/completion": 51.75, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 1.5278804302215576 }, { "advantage/absmean": 0.0, "entropy": 0.0933934673666954, "epoch": 0.29, "grad_norm": 0.0, "importance_ratio": 1.0118343830108643, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.078448586165905, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 58, "timing/generation_ms": 1443.3860182762146, "timing/scoring_ms": 0.722736120223999, "timing/total_ms": 1444.1087543964386, "tokens/completion": 473.375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 11.201233863830566 }, { "advantage/absmean": 0.0, "entropy": 0.03372407704591751, "epoch": 0.295, "grad_norm": 0.0, "importance_ratio": 1.002398133277893, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.06702353805303574, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 59, "timing/generation_ms": 171.8699336051941, "timing/scoring_ms": 0.7537305355072021, "timing/total_ms": 172.6236641407013, "tokens/completion": 568.625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 13.6052405834198 }, { "advantage/absmean": 0.0, "entropy": 0.21814168989658356, "epoch": 0.3, "grad_norm": 0.0, "importance_ratio": 1.0371828079223633, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.1288362741470337, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 60, "timing/generation_ms": 841.0577774047852, "timing/scoring_ms": 1.2124478816986084, "timing/total_ms": 842.2702252864838, "tokens/completion": 83.875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 1.355940580368042 }, { "advantage/absmean": 0.0, "entropy": 0.23174992203712463, "epoch": 0.305, "grad_norm": 0.0, "importance_ratio": 1.0601450204849243, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.011643811129033566, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 61, "timing/generation_ms": 637.3190581798553, "timing/scoring_ms": 0.640869140625, "timing/total_ms": 637.9599273204803, "tokens/completion": 63.375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 1.287184715270996 }, { "advantage/absmean": 0.0, "entropy": 0.16819830238819122, "epoch": 0.31, "grad_norm": 0.0, "importance_ratio": 1.023901343345642, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.10128042101860046, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 62, "timing/generation_ms": 683.3431720733643, "timing/scoring_ms": 0.7451176643371582, "timing/total_ms": 684.0882897377014, "tokens/completion": 448.625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 13.365703821182251 }, { "advantage/absmean": 0.0, "entropy": 0.1999366730451584, "epoch": 0.315, "grad_norm": 0.0, "importance_ratio": 1.018690824508667, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.25828859210014343, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 63, "timing/generation_ms": 810.0410103797913, "timing/scoring_ms": 0.7810592651367188, "timing/total_ms": 810.822069644928, "tokens/completion": 82.375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 1.3877842426300049 }, { "advantage/absmean": 0.0, "entropy": 0.07600334286689758, "epoch": 0.32, "grad_norm": 0.0, "importance_ratio": 0.9966375827789307, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.14751799404621124, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 64, "timing/generation_ms": 711.8004560470581, "timing/scoring_ms": 0.6752312183380127, "timing/total_ms": 712.4756872653961, "tokens/completion": 72.0, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 1.9785699844360352 }, { "advantage/absmean": 0.0, "entropy": 0.12479478120803833, "epoch": 0.325, "grad_norm": 0.0, "importance_ratio": 1.0211162567138672, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.13460330665111542, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 65, "timing/generation_ms": 1367.4484491348267, "timing/scoring_ms": 0.561743974685669, "timing/total_ms": 1368.0101931095123, "tokens/completion": 148.875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 8.830283880233765 }, { "advantage/absmean": 0.0, "entropy": 0.12324393540620804, "epoch": 0.33, "grad_norm": 0.0, "importance_ratio": 1.0122809410095215, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.09175015985965729, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 66, "timing/generation_ms": 1041.243463754654, "timing/scoring_ms": 0.7834732532501221, "timing/total_ms": 1042.026937007904, "tokens/completion": 112.75, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 2.6512699127197266 }, { "advantage/absmean": 0.0, "entropy": 0.09839422255754471, "epoch": 0.335, "grad_norm": 0.0, "importance_ratio": 1.0079715251922607, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.11163844168186188, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 67, "timing/generation_ms": 1336.376041173935, "timing/scoring_ms": 0.8764266967773438, "timing/total_ms": 1337.2524678707123, "tokens/completion": 484.375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 11.751113891601562 }, { "advantage/absmean": 0.0, "entropy": 0.05278607830405235, "epoch": 0.34, "grad_norm": 0.0, "importance_ratio": 1.0111417770385742, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.002426152117550373, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 68, "timing/generation_ms": 357.1578860282898, "timing/scoring_ms": 0.8104443550109863, "timing/total_ms": 357.9683303833008, "tokens/completion": 31.25, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 0.5209541320800781 }, { "advantage/absmean": 0.0, "entropy": 0.04255048930644989, "epoch": 0.345, "grad_norm": 0.0, "importance_ratio": 1.0176432132720947, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.003955851309001446, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 69, "timing/generation_ms": 286.98739409446716, "timing/scoring_ms": 0.5937814712524414, "timing/total_ms": 287.5811755657196, "tokens/completion": 21.625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 0.35463833808898926 }, { "advantage/absmean": 0.0, "entropy": 0.2097052037715912, "epoch": 0.35, "grad_norm": 0.0, "importance_ratio": 1.032148003578186, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.008079438470304012, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 70, "timing/generation_ms": 721.2832272052765, "timing/scoring_ms": 0.7141530513763428, "timing/total_ms": 721.9973802566528, "tokens/completion": 72.25, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 1.2694683074951172 }, { "advantage/absmean": 0.0, "entropy": 0.1202264055609703, "epoch": 0.355, "grad_norm": 0.0, "importance_ratio": 1.019940733909607, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.1282488852739334, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 71, "timing/generation_ms": 516.5687501430511, "timing/scoring_ms": 0.6620287895202637, "timing/total_ms": 517.2307789325714, "tokens/completion": 225.375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 12.215196371078491 }, { "advantage/absmean": 0.0, "entropy": 0.1233595758676529, "epoch": 0.36, "grad_norm": 0.0, "importance_ratio": 1.0179609060287476, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.06711328029632568, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 72, "timing/generation_ms": 551.3111352920532, "timing/scoring_ms": 0.669330358505249, "timing/total_ms": 551.9804656505585, "tokens/completion": 244.125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 12.296424150466919 }, { "advantage/absmean": 0.0, "entropy": 0.09772825986146927, "epoch": 0.365, "grad_norm": 0.0, "importance_ratio": 1.0192714929580688, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.004957578144967556, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 73, "timing/generation_ms": 366.45641922950745, "timing/scoring_ms": 0.6514191627502441, "timing/total_ms": 367.1078383922577, "tokens/completion": 33.75, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 0.5354394912719727 }, { "advantage/absmean": 0.0, "entropy": 0.41522714495658875, "epoch": 0.37, "grad_norm": 0.0, "importance_ratio": 1.0678110122680664, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.013985025696456432, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 74, "timing/generation_ms": 707.540363073349, "timing/scoring_ms": 0.632166862487793, "timing/total_ms": 708.1725299358368, "tokens/completion": 72.5, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 1.1281323432922363 }, { "advantage/absmean": 0.0, "entropy": 0.08198326081037521, "epoch": 0.375, "grad_norm": 0.0, "importance_ratio": 1.023478627204895, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.0061213355511426926, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 75, "timing/generation_ms": 487.6084625720978, "timing/scoring_ms": 0.5840957164764404, "timing/total_ms": 488.1925582885742, "tokens/completion": 44.75, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 0.6766035556793213 }, { "advantage/absmean": 0.0, "entropy": 0.07501692324876785, "epoch": 0.38, "grad_norm": 0.0, "importance_ratio": 1.0130146741867065, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.00243639899417758, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 76, "timing/generation_ms": 439.60556387901306, "timing/scoring_ms": 0.634223222732544, "timing/total_ms": 440.2397871017456, "tokens/completion": 40.125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 0.6219010353088379 }, { "advantage/absmean": 0.0, "entropy": 0.487246036529541, "epoch": 0.385, "grad_norm": 0.0, "importance_ratio": 1.0843446254730225, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.14624835550785065, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 77, "timing/generation_ms": 2547.288030385971, "timing/scoring_ms": 0.8234977722167969, "timing/total_ms": 2548.111528158188, "tokens/completion": 289.625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 10.090744495391846 }, { "advantage/absmean": 0.0, "entropy": 0.10131778568029404, "epoch": 0.39, "grad_norm": 0.0, "importance_ratio": 1.0091615915298462, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.18738295137882233, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 78, "timing/generation_ms": 697.7240443229675, "timing/scoring_ms": 0.7160007953643799, "timing/total_ms": 698.4400451183319, "tokens/completion": 71.25, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 1.320030927658081 }, { "advantage/absmean": 0.0364583320915699, "entropy": 0.30056485533714294, "epoch": 0.395, "grad_norm": 1.03125, "importance_ratio": 1.0502251386642456, "learning_rate": 2e-07, "loss": -0.0961, "mismatch_kl": 0.07894857972860336, "reward": 0.2708333134651184, "reward/chaining_reward": 0.875, "reward/format_reward": 0.125, "reward/order_reward": 0.0, "reward/std": 0.05511981621384621, "reward/tool_name_reward": 0.0833333358168602, "step": 79, "timing/generation_ms": 3944.695234298706, "timing/scoring_ms": 689.6845400333405, "timing/total_ms": 4634.3797743320465, "tokens/completion": 140.5, "tokens/masked_fraction": 0.04626333713531494, "wall_clock/generate_s": 33.40603590011597 }, { "advantage/absmean": 0.0, "entropy": 0.05200544744729996, "epoch": 0.4, "grad_norm": 0.0, "importance_ratio": 1.016318917274475, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.0010769128566607833, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 80, "timing/generation_ms": 324.50053095817566, "timing/scoring_ms": 0.7449984550476074, "timing/total_ms": 325.24552941322327, "tokens/completion": 25.0, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 0.36626720428466797 }, { "advantage/absmean": 0.0, "entropy": 0.222882941365242, "epoch": 0.405, "grad_norm": 0.0, "importance_ratio": 1.01841139793396, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.1850939691066742, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 81, "timing/generation_ms": 2479.6674847602844, "timing/scoring_ms": 0.7945001125335693, "timing/total_ms": 2480.461984872818, "tokens/completion": 278.375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 8.752527475357056 }, { "advantage/absmean": 0.0, "entropy": 0.06862051039934158, "epoch": 0.41, "grad_norm": 0.0, "importance_ratio": 1.015547513961792, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.002930709393694997, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 82, "timing/generation_ms": 541.6094362735748, "timing/scoring_ms": 0.8022487163543701, "timing/total_ms": 542.4116849899292, "tokens/completion": 53.25, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 0.8380634784698486 }, { "advantage/absmean": 0.0, "entropy": 0.20581920444965363, "epoch": 0.415, "grad_norm": 0.0, "importance_ratio": 1.0127023458480835, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.23816168308258057, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 83, "timing/generation_ms": 1616.5263652801514, "timing/scoring_ms": 0.7355213165283203, "timing/total_ms": 1617.2618865966797, "tokens/completion": 181.5, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 8.775212049484253 }, { "advantage/absmean": 0.0, "entropy": 0.17871993780136108, "epoch": 0.42, "grad_norm": 0.0, "importance_ratio": 1.0397638082504272, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.006105054169893265, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 84, "timing/generation_ms": 330.89154958724976, "timing/scoring_ms": 0.6275177001953125, "timing/total_ms": 331.51906728744507, "tokens/completion": 28.75, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 0.5413095951080322 }, { "advantage/absmean": 0.0, "entropy": 0.03972291946411133, "epoch": 0.425, "grad_norm": 0.0, "importance_ratio": 1.0124591588974, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.0007898285984992981, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 85, "timing/generation_ms": 266.8865919113159, "timing/scoring_ms": 0.8033514022827148, "timing/total_ms": 267.68994331359863, "tokens/completion": 20.0, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 0.30762362480163574 }, { "advantage/absmean": 0.0, "entropy": 0.039211925119161606, "epoch": 0.43, "grad_norm": 0.0, "importance_ratio": 1.0097997188568115, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.0021230808924883604, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 86, "timing/generation_ms": 721.7814326286316, "timing/scoring_ms": 0.6339550018310547, "timing/total_ms": 722.4153876304626, "tokens/completion": 73.0, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 1.7008509635925293 }, { "advantage/absmean": 0.0, "entropy": 0.14121940732002258, "epoch": 0.435, "grad_norm": 0.0, "importance_ratio": 1.0325099229812622, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.007894005626440048, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 87, "timing/generation_ms": 601.6167104244232, "timing/scoring_ms": 0.6474852561950684, "timing/total_ms": 602.2641956806183, "tokens/completion": 60.625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 1.0835204124450684 }, { "advantage/absmean": 0.0, "entropy": 0.05796457454562187, "epoch": 0.44, "grad_norm": 0.0, "importance_ratio": 1.0140713453292847, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.0014408836141228676, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 88, "timing/generation_ms": 507.3610246181488, "timing/scoring_ms": 0.6600320339202881, "timing/total_ms": 508.0210566520691, "tokens/completion": 48.375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 1.1346676349639893 }, { "advantage/absmean": 0.0, "entropy": 0.25835850834846497, "epoch": 0.445, "grad_norm": 0.0, "importance_ratio": 1.0371620655059814, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.22247108817100525, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 89, "timing/generation_ms": 1075.8300125598907, "timing/scoring_ms": 0.6826519966125488, "timing/total_ms": 1076.5126645565033, "tokens/completion": 114.625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 2.956606149673462 }, { "advantage/absmean": 0.0, "entropy": 0.11017632484436035, "epoch": 0.45, "grad_norm": 0.0, "importance_ratio": 1.0151808261871338, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.11205606907606125, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 90, "timing/generation_ms": 1120.9122240543365, "timing/scoring_ms": 0.6285607814788818, "timing/total_ms": 1121.5407848358154, "tokens/completion": 125.5, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 6.543480634689331 }, { "advantage/absmean": 0.0, "entropy": 0.0456356443464756, "epoch": 0.455, "grad_norm": 0.0, "importance_ratio": 0.9994912147521973, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.09318722039461136, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 91, "timing/generation_ms": 472.6267457008362, "timing/scoring_ms": 1.1721551418304443, "timing/total_ms": 473.7989008426666, "tokens/completion": 544.875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 9.932720422744751 }, { "advantage/absmean": 0.0, "entropy": 0.2052524983882904, "epoch": 0.46, "grad_norm": 0.0, "importance_ratio": 1.0236974954605103, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.17303957045078278, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 92, "timing/generation_ms": 712.9063010215759, "timing/scoring_ms": 0.7020533084869385, "timing/total_ms": 713.6083543300629, "tokens/completion": 71.625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 1.3908660411834717 }, { "advantage/absmean": 0.0, "entropy": 0.16844892501831055, "epoch": 0.465, "grad_norm": 0.0, "importance_ratio": 1.0157265663146973, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.2021290510892868, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 93, "timing/generation_ms": 1191.9453144073486, "timing/scoring_ms": 0.6962418556213379, "timing/total_ms": 1192.64155626297, "tokens/completion": 128.0, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 2.6925199031829834 }, { "advantage/absmean": 0.0, "entropy": 0.21437755227088928, "epoch": 0.47, "grad_norm": 0.0, "importance_ratio": 1.0238310098648071, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.19218796491622925, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 94, "timing/generation_ms": 650.9689390659332, "timing/scoring_ms": 0.7267594337463379, "timing/total_ms": 651.6956984996796, "tokens/completion": 64.0, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 1.3016343116760254 }, { "advantage/absmean": 0.0, "entropy": 0.2378346025943756, "epoch": 0.475, "grad_norm": 0.0, "importance_ratio": 1.0258440971374512, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.1374587118625641, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 95, "timing/generation_ms": 868.2838082313538, "timing/scoring_ms": 0.9066760540008545, "timing/total_ms": 869.1904842853546, "tokens/completion": 90.375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 2.179683208465576 }, { "advantage/absmean": 0.0, "entropy": 0.3378775715827942, "epoch": 0.48, "grad_norm": 0.0, "importance_ratio": 1.0457041263580322, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.18341806530952454, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 96, "timing/generation_ms": 675.6376028060913, "timing/scoring_ms": 0.7824897766113281, "timing/total_ms": 676.4200925827026, "tokens/completion": 237.375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 11.17886471748352 }, { "advantage/absmean": 0.0, "entropy": 0.028066234663128853, "epoch": 0.485, "grad_norm": 0.0, "importance_ratio": 0.9998559951782227, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.0638342872262001, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 97, "timing/generation_ms": 613.0174696445465, "timing/scoring_ms": 0.7825493812561035, "timing/total_ms": 613.8000190258026, "tokens/completion": 563.375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 12.062477350234985 }, { "advantage/absmean": 0.0, "entropy": 0.08168011158704758, "epoch": 0.49, "grad_norm": 0.0, "importance_ratio": 1.0137931108474731, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.002129449974745512, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 98, "timing/generation_ms": 1334.8223268985748, "timing/scoring_ms": 0.7840394973754883, "timing/total_ms": 1335.6063663959503, "tokens/completion": 145.875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 2.7525343894958496 }, { "advantage/absmean": 0.0, "entropy": 0.22353672981262207, "epoch": 0.495, "grad_norm": 0.0, "importance_ratio": 1.0262144804000854, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.08944935351610184, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 99, "timing/generation_ms": 2730.3845584392548, "timing/scoring_ms": 0.8510947227478027, "timing/total_ms": 2731.2356531620026, "tokens/completion": 312.125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 11.590620756149292 }, { "advantage/absmean": 0.0, "entropy": 0.031590744853019714, "epoch": 0.5, "grad_norm": 0.0, "importance_ratio": 1.0214790105819702, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.0016724687302485108, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 100, "timing/generation_ms": 275.19553899765015, "timing/scoring_ms": 0.7798969745635986, "timing/total_ms": 275.97543597221375, "tokens/completion": 21.0, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 0.3189101219177246 }, { "advantage/absmean": 0.0, "entropy": 0.16243912279605865, "epoch": 0.505, "grad_norm": 0.0, "importance_ratio": 1.0344610214233398, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.006274200510233641, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 101, "timing/generation_ms": 1181.8170845508575, "timing/scoring_ms": 0.7169246673583984, "timing/total_ms": 1182.534009218216, "tokens/completion": 126.625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 1.6333532333374023 }, { "advantage/absmean": 0.0, "entropy": 0.14107359945774078, "epoch": 0.51, "grad_norm": 0.0, "importance_ratio": 1.0257370471954346, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.006637774873524904, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 102, "timing/generation_ms": 448.06233048439026, "timing/scoring_ms": 0.696331262588501, "timing/total_ms": 448.75866174697876, "tokens/completion": 39.875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 0.6069180965423584 }, { "advantage/absmean": 0.0, "entropy": 0.1208053007721901, "epoch": 0.515, "grad_norm": 0.0, "importance_ratio": 1.0094236135482788, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.18172681331634521, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 103, "timing/generation_ms": 1294.5419251918793, "timing/scoring_ms": 0.7261037826538086, "timing/total_ms": 1295.268028974533, "tokens/completion": 140.25, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 2.5335001945495605 }, { "advantage/absmean": 0.0, "entropy": 0.18411874771118164, "epoch": 0.52, "grad_norm": 0.0, "importance_ratio": 1.0321792364120483, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.08973871171474457, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 104, "timing/generation_ms": 914.0310287475586, "timing/scoring_ms": 1.479417085647583, "timing/total_ms": 915.5104458332062, "tokens/completion": 284.5, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 12.348990440368652 }, { "advantage/absmean": 0.0, "entropy": 0.13948652148246765, "epoch": 0.525, "grad_norm": 0.0, "importance_ratio": 1.0061174631118774, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.09687096625566483, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 105, "timing/generation_ms": 616.0270869731903, "timing/scoring_ms": 0.8123517036437988, "timing/total_ms": 616.8394386768341, "tokens/completion": 446.25, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 13.494132995605469 }, { "advantage/absmean": 0.0, "entropy": 0.15247036516666412, "epoch": 0.53, "grad_norm": 0.0, "importance_ratio": 1.0169906616210938, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.0033867901656776667, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 106, "timing/generation_ms": 467.2476649284363, "timing/scoring_ms": 0.7162988185882568, "timing/total_ms": 467.96396374702454, "tokens/completion": 44.75, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 0.7201159000396729 }, { "advantage/absmean": 0.0, "entropy": 0.11665071547031403, "epoch": 0.535, "grad_norm": 0.0, "importance_ratio": 1.0270754098892212, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.004207200836390257, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 107, "timing/generation_ms": 684.4183206558228, "timing/scoring_ms": 0.6581544876098633, "timing/total_ms": 685.0764751434326, "tokens/completion": 67.0, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 0.8382806777954102 }, { "advantage/absmean": 0.0, "entropy": 0.12457213550806046, "epoch": 0.54, "grad_norm": 0.0, "importance_ratio": 1.0309176445007324, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.0032363960053771734, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 108, "timing/generation_ms": 473.9782512187958, "timing/scoring_ms": 0.6288290023803711, "timing/total_ms": 474.60708022117615, "tokens/completion": 44.125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 0.8517920970916748 }, { "advantage/absmean": 0.0, "entropy": 0.14651726186275482, "epoch": 0.545, "grad_norm": 0.0, "importance_ratio": 1.0332300662994385, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.005263304337859154, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 109, "timing/generation_ms": 401.17865800857544, "timing/scoring_ms": 0.6504356861114502, "timing/total_ms": 401.8290936946869, "tokens/completion": 35.625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 0.4571411609649658 }, { "advantage/absmean": 0.0, "entropy": 0.1841438263654709, "epoch": 0.55, "grad_norm": 0.0, "importance_ratio": 1.0061630010604858, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.25429007411003113, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 110, "timing/generation_ms": 908.4518253803253, "timing/scoring_ms": 1.1530518531799316, "timing/total_ms": 909.6048772335052, "tokens/completion": 96.125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 2.456902503967285 }, { "advantage/absmean": 0.0, "entropy": 0.15197767317295074, "epoch": 0.555, "grad_norm": 0.0, "importance_ratio": 1.0174596309661865, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.00518000265583396, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 111, "timing/generation_ms": 668.2363152503967, "timing/scoring_ms": 0.6829798221588135, "timing/total_ms": 668.9192950725555, "tokens/completion": 65.625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 0.9429497718811035 }, { "advantage/absmean": 0.0, "entropy": 0.330410361289978, "epoch": 0.56, "grad_norm": 0.0, "importance_ratio": 1.0407096147537231, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.008736329153180122, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 112, "timing/generation_ms": 725.0267565250397, "timing/scoring_ms": 49.04326796531677, "timing/total_ms": 774.0700244903564, "tokens/completion": 54.5, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 1.0000529289245605 }, { "advantage/absmean": 0.0, "entropy": 0.0725167915225029, "epoch": 0.565, "grad_norm": 0.0, "importance_ratio": 1.0272282361984253, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.004099931567907333, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 113, "timing/generation_ms": 395.76655626296997, "timing/scoring_ms": 0.5489587783813477, "timing/total_ms": 396.3155150413513, "tokens/completion": 34.0, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 0.4450197219848633 }, { "advantage/absmean": 0.0, "entropy": 0.0958617627620697, "epoch": 0.57, "grad_norm": 0.0, "importance_ratio": 1.0171706676483154, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.002028662944212556, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 114, "timing/generation_ms": 819.5895254611969, "timing/scoring_ms": 0.6240904331207275, "timing/total_ms": 820.2136158943176, "tokens/completion": 84.625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 1.0635149478912354 }, { "advantage/absmean": 0.0, "entropy": 0.2242671400308609, "epoch": 0.575, "grad_norm": 0.0, "importance_ratio": 1.036353588104248, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.05042947083711624, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 115, "timing/generation_ms": 1909.7133576869965, "timing/scoring_ms": 0.7752180099487305, "timing/total_ms": 1910.4885756969452, "tokens/completion": 388.25, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 12.138303279876709 }, { "advantage/absmean": 0.0, "entropy": 0.2777499556541443, "epoch": 0.58, "grad_norm": 0.0, "importance_ratio": 1.0384970903396606, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.15782660245895386, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 116, "timing/generation_ms": 3800.138682126999, "timing/scoring_ms": 0.7495880126953125, "timing/total_ms": 3800.888270139694, "tokens/completion": 439.75, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 12.202250242233276 }, { "advantage/absmean": 0.0, "entropy": 0.0518534854054451, "epoch": 0.585, "grad_norm": 0.0, "importance_ratio": 1.003575325012207, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.08400674164295197, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 117, "timing/generation_ms": 999.7085630893707, "timing/scoring_ms": 0.7627308368682861, "timing/total_ms": 1000.471293926239, "tokens/completion": 628.0, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 12.7797212600708 }, { "advantage/absmean": 0.0, "entropy": 0.38837090134620667, "epoch": 0.59, "grad_norm": 0.0, "importance_ratio": 1.0553014278411865, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.18893013894557953, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 118, "timing/generation_ms": 709.2028856277466, "timing/scoring_ms": 0.8765757083892822, "timing/total_ms": 710.0794613361359, "tokens/completion": 73.75, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 2.17161226272583 }, { "advantage/absmean": 0.0, "entropy": 0.18134428560733795, "epoch": 0.595, "grad_norm": 0.0, "importance_ratio": 1.0260416269302368, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.10936398804187775, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 119, "timing/generation_ms": 1527.4927616119385, "timing/scoring_ms": 0.762939453125, "timing/total_ms": 1528.2557010650635, "tokens/completion": 291.5, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 9.307310819625854 }, { "advantage/absmean": 0.0, "entropy": 0.21357819437980652, "epoch": 0.6, "grad_norm": 0.0, "importance_ratio": 1.0355201959609985, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.03871361166238785, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 120, "timing/generation_ms": 803.0665814876556, "timing/scoring_ms": 0.8001923561096191, "timing/total_ms": 803.8667738437653, "tokens/completion": 259.875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 12.269800662994385 }, { "advantage/absmean": 0.0, "entropy": 0.11926577985286713, "epoch": 0.605, "grad_norm": 0.0, "importance_ratio": 1.0134624242782593, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.14697293937206268, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 121, "timing/generation_ms": 869.4056570529938, "timing/scoring_ms": 0.6933510303497314, "timing/total_ms": 870.0990080833435, "tokens/completion": 299.5, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 13.899815797805786 }, { "advantage/absmean": 0.0, "entropy": 0.24009546637535095, "epoch": 0.61, "grad_norm": 0.0, "importance_ratio": 1.028232216835022, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.1661180853843689, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 122, "timing/generation_ms": 979.9708724021912, "timing/scoring_ms": 0.8241832256317139, "timing/total_ms": 980.7950556278229, "tokens/completion": 514.0, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 15.01587724685669 }, { "advantage/absmean": 0.0, "entropy": 0.31378039717674255, "epoch": 0.615, "grad_norm": 0.0, "importance_ratio": 1.049980640411377, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.05815834552049637, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 123, "timing/generation_ms": 1546.501785516739, "timing/scoring_ms": 0.7077455520629883, "timing/total_ms": 1547.2095310688019, "tokens/completion": 172.5, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 3.876373767852783 }, { "advantage/absmean": 0.0, "entropy": 0.3055809736251831, "epoch": 0.62, "grad_norm": 0.0, "importance_ratio": 1.0255380868911743, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.36709433794021606, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 124, "timing/generation_ms": 968.0472612380981, "timing/scoring_ms": 0.7203817367553711, "timing/total_ms": 968.7676429748535, "tokens/completion": 100.25, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 2.1922783851623535 }, { "advantage/absmean": 0.0, "entropy": 0.23402170836925507, "epoch": 0.625, "grad_norm": 0.0, "importance_ratio": 1.0423892736434937, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.08263733237981796, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 125, "timing/generation_ms": 735.4680895805359, "timing/scoring_ms": 0.6801784038543701, "timing/total_ms": 736.1482679843903, "tokens/completion": 75.0, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 1.500101089477539 }, { "advantage/absmean": 0.0, "entropy": 0.049274127930402756, "epoch": 0.63, "grad_norm": 0.0, "importance_ratio": 0.9916724562644958, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.2128075659275055, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 126, "timing/generation_ms": 502.2531747817993, "timing/scoring_ms": 0.5747973918914795, "timing/total_ms": 502.8279721736908, "tokens/completion": 45.375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 0.9051601886749268 }, { "advantage/absmean": 0.0, "entropy": 0.10682657361030579, "epoch": 0.635, "grad_norm": 0.0, "importance_ratio": 1.0076371431350708, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.14533908665180206, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 127, "timing/generation_ms": 1573.5874772071838, "timing/scoring_ms": 0.6630420684814453, "timing/total_ms": 1574.2505192756653, "tokens/completion": 174.375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 4.5822625160217285 }, { "advantage/absmean": 0.0, "entropy": 0.06670039892196655, "epoch": 0.64, "grad_norm": 0.0, "importance_ratio": 1.0067728757858276, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.0806964710354805, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 128, "timing/generation_ms": 273.6642360687256, "timing/scoring_ms": 0.7540583610534668, "timing/total_ms": 274.41829442977905, "tokens/completion": 834.375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 12.68679666519165 }, { "advantage/absmean": 0.0, "entropy": 0.18214182555675507, "epoch": 0.645, "grad_norm": 0.0, "importance_ratio": 1.0310328006744385, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.09200795739889145, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 129, "timing/generation_ms": 1752.3531913757324, "timing/scoring_ms": 0.9254515171051025, "timing/total_ms": 1753.2786428928375, "tokens/completion": 342.75, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 10.11969804763794 }, { "advantage/absmean": 0.0, "entropy": 0.13214536011219025, "epoch": 0.65, "grad_norm": 0.0, "importance_ratio": 1.0212507247924805, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.004680355545133352, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 130, "timing/generation_ms": 667.9131984710693, "timing/scoring_ms": 0.6452500820159912, "timing/total_ms": 668.5584485530853, "tokens/completion": 69.5, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 1.3132050037384033 }, { "advantage/absmean": 0.0, "entropy": 0.1453593522310257, "epoch": 0.655, "grad_norm": 0.0, "importance_ratio": 1.0073384046554565, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.14992451667785645, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 131, "timing/generation_ms": 1461.0510766506195, "timing/scoring_ms": 0.7370114326477051, "timing/total_ms": 1461.7880880832672, "tokens/completion": 284.75, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 8.867703199386597 }, { "advantage/absmean": 0.0, "entropy": 0.20772671699523926, "epoch": 0.66, "grad_norm": 0.0, "importance_ratio": 1.0234897136688232, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.2186855524778366, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 132, "timing/generation_ms": 1320.2576041221619, "timing/scoring_ms": 0.806957483291626, "timing/total_ms": 1321.0645616054535, "tokens/completion": 141.0, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 3.599062442779541 }, { "advantage/absmean": 0.0, "entropy": 0.2090042531490326, "epoch": 0.665, "grad_norm": 0.0, "importance_ratio": 1.0199214220046997, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.07998237013816833, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 133, "timing/generation_ms": 1323.2730627059937, "timing/scoring_ms": 0.7129013538360596, "timing/total_ms": 1323.9859640598297, "tokens/completion": 342.0, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 13.028274297714233 }, { "advantage/absmean": 0.0, "entropy": 0.28178277611732483, "epoch": 0.67, "grad_norm": 0.0, "importance_ratio": 1.0414365530014038, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.193196102976799, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 134, "timing/generation_ms": 1245.4088926315308, "timing/scoring_ms": 0.6657242774963379, "timing/total_ms": 1246.074616909027, "tokens/completion": 141.0, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 6.781817436218262 }, { "advantage/absmean": 0.0, "entropy": 0.11931399255990982, "epoch": 0.675, "grad_norm": 0.0, "importance_ratio": 1.0334945917129517, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.007293867412954569, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 135, "timing/generation_ms": 464.6941125392914, "timing/scoring_ms": 0.784844160079956, "timing/total_ms": 465.47895669937134, "tokens/completion": 44.125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 0.6599977016448975 }, { "advantage/absmean": 0.0, "entropy": 0.04729205742478371, "epoch": 0.68, "grad_norm": 0.0, "importance_ratio": 1.0140527486801147, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.0013221576809883118, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 136, "timing/generation_ms": 405.37604689598083, "timing/scoring_ms": 0.6449222564697266, "timing/total_ms": 406.02096915245056, "tokens/completion": 35.75, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 0.5901200771331787 }, { "advantage/absmean": 0.0, "entropy": 0.05103699490427971, "epoch": 0.685, "grad_norm": 0.0, "importance_ratio": 1.0050382614135742, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.07745583355426788, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 137, "timing/generation_ms": 269.2677080631256, "timing/scoring_ms": 0.6955265998840332, "timing/total_ms": 269.96323466300964, "tokens/completion": 809.0, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 16.056657075881958 }, { "advantage/absmean": 0.0, "entropy": 0.19527268409729004, "epoch": 0.69, "grad_norm": 0.0, "importance_ratio": 1.0254648923873901, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.09185754507780075, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 138, "timing/generation_ms": 3765.899121761322, "timing/scoring_ms": 0.8567273616790771, "timing/total_ms": 3766.755849123001, "tokens/completion": 422.125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 8.989697933197021 }, { "advantage/absmean": 0.0, "entropy": 0.24571235477924347, "epoch": 0.695, "grad_norm": 0.0, "importance_ratio": 1.022046685218811, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.21676717698574066, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 139, "timing/generation_ms": 1646.1068093776703, "timing/scoring_ms": 0.7834434509277344, "timing/total_ms": 1646.890252828598, "tokens/completion": 182.625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 6.598947763442993 }, { "advantage/absmean": 0.0, "entropy": 0.10984079539775848, "epoch": 0.7, "grad_norm": 0.0, "importance_ratio": 1.0199713706970215, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.0031276445370167494, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 140, "timing/generation_ms": 696.2663531303406, "timing/scoring_ms": 0.7794201374053955, "timing/total_ms": 697.045773267746, "tokens/completion": 71.375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 1.6167395114898682 }, { "advantage/absmean": 0.0, "entropy": 0.11442814767360687, "epoch": 0.705, "grad_norm": 0.0, "importance_ratio": 1.0181595087051392, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.09192286431789398, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 141, "timing/generation_ms": 595.156729221344, "timing/scoring_ms": 0.8848309516906738, "timing/total_ms": 596.0415601730347, "tokens/completion": 672.25, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 11.293966054916382 }, { "advantage/absmean": 0.0, "entropy": 0.09457792341709137, "epoch": 0.71, "grad_norm": 0.0, "importance_ratio": 1.0103352069854736, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.14311635494232178, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 142, "timing/generation_ms": 780.9071838855743, "timing/scoring_ms": 0.8233785629272461, "timing/total_ms": 781.7305624485016, "tokens/completion": 443.625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 13.156969547271729 }, { "advantage/absmean": 0.0, "entropy": 0.3191906809806824, "epoch": 0.715, "grad_norm": 0.0, "importance_ratio": 1.0470857620239258, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.10388480871915817, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 143, "timing/generation_ms": 1076.995462179184, "timing/scoring_ms": 0.9957849979400635, "timing/total_ms": 1077.991247177124, "tokens/completion": 657.75, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 9.946331977844238 }, { "advantage/absmean": 0.0, "entropy": 0.1242792010307312, "epoch": 0.72, "grad_norm": 0.0, "importance_ratio": 1.0209869146347046, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.06033819913864136, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 144, "timing/generation_ms": 540.4762625694275, "timing/scoring_ms": 0.80832839012146, "timing/total_ms": 541.284590959549, "tokens/completion": 223.625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 11.298864603042603 }, { "advantage/absmean": 0.0, "entropy": 0.06813202798366547, "epoch": 0.725, "grad_norm": 0.0, "importance_ratio": 1.0060614347457886, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.06888692080974579, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 145, "timing/generation_ms": 994.5988655090332, "timing/scoring_ms": 0.8412599563598633, "timing/total_ms": 995.4401254653931, "tokens/completion": 835.125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 13.451871871948242 }, { "advantage/absmean": 0.0, "entropy": 0.2093413919210434, "epoch": 0.73, "grad_norm": 0.0, "importance_ratio": 1.0480271577835083, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.00946128275245428, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 146, "timing/generation_ms": 599.0081429481506, "timing/scoring_ms": 0.6599128246307373, "timing/total_ms": 599.6680557727814, "tokens/completion": 59.75, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 0.7919504642486572 }, { "advantage/absmean": 0.0, "entropy": 0.02967538870871067, "epoch": 0.735, "grad_norm": 0.0, "importance_ratio": 1.0014662742614746, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.05430717393755913, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 147, "timing/generation_ms": 2690.7454431056976, "timing/scoring_ms": 0.914543867111206, "timing/total_ms": 2691.659986972809, "tokens/completion": 321.75, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 18.806960821151733 }, { "advantage/absmean": 0.0, "entropy": 0.19191206991672516, "epoch": 0.74, "grad_norm": 0.0, "importance_ratio": 1.0250086784362793, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.06533827632665634, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 148, "timing/generation_ms": 1226.204663515091, "timing/scoring_ms": 0.7858574390411377, "timing/total_ms": 1226.990520954132, "tokens/completion": 133.125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 2.5662012100219727 }, { "advantage/absmean": 0.0, "entropy": 0.12079251557588577, "epoch": 0.745, "grad_norm": 0.0, "importance_ratio": 1.0156702995300293, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.08981353789567947, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 149, "timing/generation_ms": 2030.1675498485565, "timing/scoring_ms": 0.9343326091766357, "timing/total_ms": 2031.1018824577332, "tokens/completion": 840.75, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 12.26139760017395 }, { "advantage/absmean": 0.0, "entropy": 0.366574764251709, "epoch": 0.75, "grad_norm": 0.0, "importance_ratio": 1.0404877662658691, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.47507715225219727, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 150, "timing/generation_ms": 544.8184907436371, "timing/scoring_ms": 0.6380081176757812, "timing/total_ms": 545.4564988613129, "tokens/completion": 52.25, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 0.9531588554382324 }, { "advantage/absmean": 0.0, "entropy": 0.08402762562036514, "epoch": 0.755, "grad_norm": 0.0, "importance_ratio": 1.0083701610565186, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.0514337532222271, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 151, "timing/generation_ms": 584.3641459941864, "timing/scoring_ms": 0.7977187633514404, "timing/total_ms": 585.1618647575378, "tokens/completion": 247.75, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 16.126672744750977 }, { "advantage/absmean": 0.0, "entropy": 0.14948773384094238, "epoch": 0.76, "grad_norm": 0.0, "importance_ratio": 1.0117238759994507, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.25467291474342346, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 152, "timing/generation_ms": 873.7495541572571, "timing/scoring_ms": 0.7468461990356445, "timing/total_ms": 874.4964003562927, "tokens/completion": 90.75, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 1.834012746810913 }, { "advantage/absmean": 0.0, "entropy": 0.11315376311540604, "epoch": 0.765, "grad_norm": 0.0, "importance_ratio": 1.014126181602478, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.10666359215974808, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 153, "timing/generation_ms": 1559.2613816261292, "timing/scoring_ms": 0.968635082244873, "timing/total_ms": 1560.230016708374, "tokens/completion": 600.125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 10.327171802520752 }, { "advantage/absmean": 0.0, "entropy": 0.08693549036979675, "epoch": 0.77, "grad_norm": 0.0, "importance_ratio": 1.022530436515808, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.005849043373018503, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 154, "timing/generation_ms": 533.4776639938354, "timing/scoring_ms": 0.6860494613647461, "timing/total_ms": 534.1637134552002, "tokens/completion": 53.0, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 0.7367737293243408 }, { "advantage/absmean": 0.0, "entropy": 0.07404028624296188, "epoch": 0.775, "grad_norm": 0.0, "importance_ratio": 1.010236382484436, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.00151128601282835, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 155, "timing/generation_ms": 309.5822036266327, "timing/scoring_ms": 0.68625807762146, "timing/total_ms": 310.26846170425415, "tokens/completion": 24.75, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 0.551006555557251 }, { "advantage/absmean": 0.0, "entropy": 0.22322000563144684, "epoch": 0.78, "grad_norm": 0.0, "importance_ratio": 1.041182279586792, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.008927143178880215, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 156, "timing/generation_ms": 509.2552602291107, "timing/scoring_ms": 0.7590949535369873, "timing/total_ms": 510.0143551826477, "tokens/completion": 48.75, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 1.0176374912261963 }, { "advantage/absmean": 0.0, "entropy": 0.005759948864579201, "epoch": 0.785, "grad_norm": 0.0, "importance_ratio": 1.0051088333129883, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.000156716225319542, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 157, "timing/generation_ms": 285.7961654663086, "timing/scoring_ms": 0.5785822868347168, "timing/total_ms": 286.3747477531433, "tokens/completion": 21.0, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 0.3277292251586914 }, { "advantage/absmean": 0.0, "entropy": 0.10188260674476624, "epoch": 0.79, "grad_norm": 0.0, "importance_ratio": 1.0081340074539185, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.09888118505477905, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 158, "timing/generation_ms": 923.1429696083069, "timing/scoring_ms": 0.7679760456085205, "timing/total_ms": 923.9109456539154, "tokens/completion": 260.75, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 11.238730430603027 }, { "advantage/absmean": 0.0, "entropy": 0.04350348189473152, "epoch": 0.795, "grad_norm": 0.0, "importance_ratio": 1.0053517818450928, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.032001592218875885, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 159, "timing/generation_ms": 714.4100964069366, "timing/scoring_ms": 0.9441971778869629, "timing/total_ms": 715.3542935848236, "tokens/completion": 340.75, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 17.293121814727783 }, { "advantage/absmean": 0.0, "entropy": 0.08451759070158005, "epoch": 0.8, "grad_norm": 0.0, "importance_ratio": 1.0012885332107544, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.10851491242647171, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 160, "timing/generation_ms": 782.3683321475983, "timing/scoring_ms": 0.9162425994873047, "timing/total_ms": 783.2845747470856, "tokens/completion": 345.875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 17.219330072402954 }, { "advantage/absmean": 0.0, "entropy": 0.3327235281467438, "epoch": 0.805, "grad_norm": 0.0, "importance_ratio": 1.0560818910598755, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.09066620469093323, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 161, "timing/generation_ms": 684.850424528122, "timing/scoring_ms": 1.520305871963501, "timing/total_ms": 686.3707304000854, "tokens/completion": 70.25, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 1.0233514308929443 }, { "advantage/absmean": 0.0, "entropy": 0.13878269493579865, "epoch": 0.81, "grad_norm": 0.0, "importance_ratio": 1.0270979404449463, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.057966198772192, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 162, "timing/generation_ms": 1864.498645067215, "timing/scoring_ms": 0.748753547668457, "timing/total_ms": 1865.2473986148834, "tokens/completion": 389.375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 12.679251432418823 }, { "advantage/absmean": 0.0, "entropy": 0.1552003026008606, "epoch": 0.815, "grad_norm": 0.0, "importance_ratio": 1.008542776107788, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.2350100725889206, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 163, "timing/generation_ms": 907.8878462314606, "timing/scoring_ms": 0.8561313152313232, "timing/total_ms": 908.7439775466919, "tokens/completion": 96.5, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 1.9676706790924072 }, { "advantage/absmean": 0.0, "entropy": 0.1862545907497406, "epoch": 0.82, "grad_norm": 0.0, "importance_ratio": 1.0322412252426147, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.05478338897228241, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 164, "timing/generation_ms": 4015.0281488895416, "timing/scoring_ms": 0.792384147644043, "timing/total_ms": 4015.8205330371857, "tokens/completion": 717.75, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 17.306790828704834 }, { "advantage/absmean": 0.0, "entropy": 0.0800366923213005, "epoch": 0.825, "grad_norm": 0.0, "importance_ratio": 1.0066349506378174, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.09780741482973099, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 165, "timing/generation_ms": 234.08538103103638, "timing/scoring_ms": 0.7649660110473633, "timing/total_ms": 234.85034704208374, "tokens/completion": 182.75, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 10.7881019115448 }, { "advantage/absmean": 0.0, "entropy": 0.10572364926338196, "epoch": 0.83, "grad_norm": 0.0, "importance_ratio": 1.0320991277694702, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.004617371596395969, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 166, "timing/generation_ms": 519.7223126888275, "timing/scoring_ms": 0.6583333015441895, "timing/total_ms": 520.3806459903717, "tokens/completion": 52.125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 0.7131249904632568 }, { "advantage/absmean": 0.0, "entropy": 0.1363074630498886, "epoch": 0.835, "grad_norm": 0.0, "importance_ratio": 1.0198731422424316, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.054269395768642426, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 167, "timing/generation_ms": 2606.735050678253, "timing/scoring_ms": 0.7981956005096436, "timing/total_ms": 2607.533246278763, "tokens/completion": 1140.125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 13.646069049835205 }, { "advantage/absmean": 0.0, "entropy": 0.09828086942434311, "epoch": 0.84, "grad_norm": 0.0, "importance_ratio": 1.03993821144104, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.13072553277015686, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 168, "timing/generation_ms": 1364.6757006645203, "timing/scoring_ms": 0.7473230361938477, "timing/total_ms": 1365.423023700714, "tokens/completion": 151.75, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 6.264061450958252 }, { "advantage/absmean": 0.0, "entropy": 0.21706373989582062, "epoch": 0.845, "grad_norm": 0.0, "importance_ratio": 1.0421193838119507, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.009323724545538425, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 169, "timing/generation_ms": 418.77251863479614, "timing/scoring_ms": 0.6786584854125977, "timing/total_ms": 419.45117712020874, "tokens/completion": 39.5, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 0.7350790500640869 }, { "advantage/absmean": 0.0, "entropy": 0.17924943566322327, "epoch": 0.85, "grad_norm": 0.0, "importance_ratio": 1.0139546394348145, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.11539552360773087, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 170, "timing/generation_ms": 894.7311043739319, "timing/scoring_ms": 0.6144046783447266, "timing/total_ms": 895.3455090522766, "tokens/completion": 92.5, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 1.1736605167388916 }, { "advantage/absmean": 0.0, "entropy": 0.13270172476768494, "epoch": 0.855, "grad_norm": 0.0, "importance_ratio": 1.0190181732177734, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.003293952438980341, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 171, "timing/generation_ms": 618.274062871933, "timing/scoring_ms": 0.6122589111328125, "timing/total_ms": 618.8863217830658, "tokens/completion": 60.5, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 0.9098899364471436 }, { "advantage/absmean": 0.0, "entropy": 0.18591608107089996, "epoch": 0.86, "grad_norm": 0.0, "importance_ratio": 1.0227158069610596, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.28750333189964294, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 172, "timing/generation_ms": 1234.259307384491, "timing/scoring_ms": 0.6674528121948242, "timing/total_ms": 1234.9267601966858, "tokens/completion": 133.0, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 2.562565326690674 }, { "advantage/absmean": 0.0, "entropy": 0.13000088930130005, "epoch": 0.865, "grad_norm": 0.0, "importance_ratio": 0.9962196350097656, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.2675855755805969, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 173, "timing/generation_ms": 479.78419065475464, "timing/scoring_ms": 0.6464719772338867, "timing/total_ms": 480.4306626319885, "tokens/completion": 45.875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 1.9416956901550293 }, { "advantage/absmean": 0.0, "entropy": 0.18111664056777954, "epoch": 0.87, "grad_norm": 0.0, "importance_ratio": 1.01747727394104, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.1416250616312027, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 174, "timing/generation_ms": 2476.1448204517365, "timing/scoring_ms": 1.2516677379608154, "timing/total_ms": 2477.3964881896973, "tokens/completion": 680.75, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 13.943515062332153 }, { "advantage/absmean": 0.0, "entropy": 0.1258212924003601, "epoch": 0.875, "grad_norm": 0.0, "importance_ratio": 1.0312312841415405, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.003018689574673772, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 175, "timing/generation_ms": 633.431077003479, "timing/scoring_ms": 0.6191730499267578, "timing/total_ms": 634.0502500534058, "tokens/completion": 62.625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 0.7139477729797363 }, { "advantage/absmean": 0.0, "entropy": 0.07265715301036835, "epoch": 0.88, "grad_norm": 0.0, "importance_ratio": 1.0058088302612305, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.06198568642139435, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 176, "timing/generation_ms": 195.3727900981903, "timing/scoring_ms": 0.9649991989135742, "timing/total_ms": 196.33778929710388, "tokens/completion": 1078.0, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 15.170578002929688 }, { "advantage/absmean": 0.0, "entropy": 0.048659250140190125, "epoch": 0.885, "grad_norm": 0.0, "importance_ratio": 1.0071101188659668, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.013346853666007519, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 177, "timing/generation_ms": 1068.607121706009, "timing/scoring_ms": 0.67138671875, "timing/total_ms": 1069.278508424759, "tokens/completion": 370.75, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 16.62585186958313 }, { "advantage/absmean": 0.0, "entropy": 0.15628257393836975, "epoch": 0.89, "grad_norm": 0.0, "importance_ratio": 1.025020718574524, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.09533118456602097, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 178, "timing/generation_ms": 525.3657698631287, "timing/scoring_ms": 0.7698535919189453, "timing/total_ms": 526.1356234550476, "tokens/completion": 1008.125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 14.26209831237793 }, { "advantage/absmean": 0.0, "entropy": 0.2629597783088684, "epoch": 0.895, "grad_norm": 0.0, "importance_ratio": 1.0473555326461792, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.008978270925581455, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 179, "timing/generation_ms": 675.6896078586578, "timing/scoring_ms": 0.6800293922424316, "timing/total_ms": 676.3696372509003, "tokens/completion": 67.0, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 0.9611563682556152 }, { "advantage/absmean": 0.0, "entropy": 0.06369853019714355, "epoch": 0.9, "grad_norm": 0.0, "importance_ratio": 1.010459065437317, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.0018829090986400843, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 180, "timing/generation_ms": 1127.2559463977814, "timing/scoring_ms": 0.9635686874389648, "timing/total_ms": 1128.2195150852203, "tokens/completion": 122.0, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 3.0270369052886963 }, { "advantage/absmean": 0.0, "entropy": 0.163645401597023, "epoch": 0.905, "grad_norm": 0.0, "importance_ratio": 1.0052450895309448, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.33905029296875, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 181, "timing/generation_ms": 476.17214918136597, "timing/scoring_ms": 0.6859004497528076, "timing/total_ms": 476.8580496311188, "tokens/completion": 45.375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 1.41276216506958 }, { "advantage/absmean": 0.0, "entropy": 0.10919979959726334, "epoch": 0.91, "grad_norm": 0.0, "importance_ratio": 1.017164945602417, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.07364481687545776, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 182, "timing/generation_ms": 229.94062304496765, "timing/scoring_ms": 0.6324350833892822, "timing/total_ms": 230.57305812835693, "tokens/completion": 254.25, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 15.365740299224854 }, { "advantage/absmean": 0.0, "entropy": 0.14554215967655182, "epoch": 0.915, "grad_norm": 0.0, "importance_ratio": 1.0174024105072021, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.004148008767515421, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 183, "timing/generation_ms": 584.0282142162323, "timing/scoring_ms": 0.6068646907806396, "timing/total_ms": 584.6350789070129, "tokens/completion": 58.0, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 0.7671034336090088 }, { "advantage/absmean": 0.0, "entropy": 0.4024199843406677, "epoch": 0.92, "grad_norm": 0.0, "importance_ratio": 1.0387303829193115, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.38230597972869873, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 184, "timing/generation_ms": 989.8079633712769, "timing/scoring_ms": 0.7306337356567383, "timing/total_ms": 990.5385971069336, "tokens/completion": 103.25, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 2.455932378768921 }, { "advantage/absmean": 0.0, "entropy": 0.0706729143857956, "epoch": 0.925, "grad_norm": 0.0, "importance_ratio": 1.0143457651138306, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.0032624138984829187, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 185, "timing/generation_ms": 291.74336791038513, "timing/scoring_ms": 0.8850991725921631, "timing/total_ms": 292.6284670829773, "tokens/completion": 23.375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 0.36818575859069824 }, { "advantage/absmean": 0.0, "entropy": 0.15155723690986633, "epoch": 0.93, "grad_norm": 0.0, "importance_ratio": 1.018488883972168, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.1375313699245453, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 186, "timing/generation_ms": 2153.5862386226654, "timing/scoring_ms": 0.6712675094604492, "timing/total_ms": 2154.257506132126, "tokens/completion": 250.5, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 12.50852918624878 }, { "advantage/absmean": 0.0, "entropy": 0.22367219626903534, "epoch": 0.935, "grad_norm": 0.0, "importance_ratio": 1.0382441282272339, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.008602548390626907, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 187, "timing/generation_ms": 427.0813763141632, "timing/scoring_ms": 0.7168352603912354, "timing/total_ms": 427.79821157455444, "tokens/completion": 39.375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 0.5594475269317627 }, { "advantage/absmean": 0.0, "entropy": 0.12171094864606857, "epoch": 0.94, "grad_norm": 0.0, "importance_ratio": 1.0303484201431274, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.00624867994338274, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 188, "timing/generation_ms": 840.657502412796, "timing/scoring_ms": 0.6091892719268799, "timing/total_ms": 841.2666916847229, "tokens/completion": 87.5, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 1.1978800296783447 }, { "advantage/absmean": 0.0, "entropy": 0.11999785900115967, "epoch": 0.945, "grad_norm": 0.0, "importance_ratio": 1.0153226852416992, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.16127747297286987, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 189, "timing/generation_ms": 1309.2719912528992, "timing/scoring_ms": 0.628441572189331, "timing/total_ms": 1309.9004328250885, "tokens/completion": 144.125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 7.504530191421509 }, { "advantage/absmean": 0.0, "entropy": 0.06453464925289154, "epoch": 0.95, "grad_norm": 0.0, "importance_ratio": 1.0172827243804932, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.0012136471923440695, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 190, "timing/generation_ms": 476.50280594825745, "timing/scoring_ms": 0.7374584674835205, "timing/total_ms": 477.24026441574097, "tokens/completion": 45.625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 0.5519871711730957 }, { "advantage/absmean": 0.0, "entropy": 0.2637867331504822, "epoch": 0.955, "grad_norm": 0.0, "importance_ratio": 1.034403920173645, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.13701161742210388, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 191, "timing/generation_ms": 1511.2466216087341, "timing/scoring_ms": 0.6977319717407227, "timing/total_ms": 1511.9443535804749, "tokens/completion": 168.875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 5.765244245529175 }, { "advantage/absmean": 0.0, "entropy": 0.18417640030384064, "epoch": 0.96, "grad_norm": 0.0, "importance_ratio": 1.0195873975753784, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.1894998997449875, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 192, "timing/generation_ms": 1180.6697845458984, "timing/scoring_ms": 0.7679462432861328, "timing/total_ms": 1181.4377307891846, "tokens/completion": 128.375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 3.349003553390503 }, { "advantage/absmean": 0.0, "entropy": 0.25059813261032104, "epoch": 0.965, "grad_norm": 0.0, "importance_ratio": 1.0405279397964478, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.03538493439555168, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 193, "timing/generation_ms": 2027.9718935489655, "timing/scoring_ms": 1.072227954864502, "timing/total_ms": 2029.04412150383, "tokens/completion": 484.875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 16.785203456878662 }, { "advantage/absmean": 0.0, "entropy": 0.18361569941043854, "epoch": 0.97, "grad_norm": 0.0, "importance_ratio": 1.0088155269622803, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.28030380606651306, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 194, "timing/generation_ms": 2869.309902191162, "timing/scoring_ms": 0.7140040397644043, "timing/total_ms": 2870.0239062309265, "tokens/completion": 327.5, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 10.665328025817871 }, { "advantage/absmean": 0.0, "entropy": 0.1794624775648117, "epoch": 0.975, "grad_norm": 0.0, "importance_ratio": 1.0275996923446655, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.06256844848394394, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 195, "timing/generation_ms": 533.7411463260651, "timing/scoring_ms": 0.8000433444976807, "timing/total_ms": 534.5411896705627, "tokens/completion": 362.25, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 11.033095121383667 }, { "advantage/absmean": 0.09375, "entropy": 0.2886289954185486, "epoch": 0.98, "grad_norm": 0.87109375, "importance_ratio": 1.0405404567718506, "learning_rate": 2e-07, "loss": 0.0164, "mismatch_kl": 0.07286263257265091, "reward": 0.2916666865348816, "reward/chaining_reward": 0.875, "reward/format_reward": 0.125, "reward/order_reward": 0.0416666679084301, "reward/std": 0.14433757960796356, "reward/tool_name_reward": 0.125, "step": 196, "timing/generation_ms": 8097.116976976395, "timing/scoring_ms": 1639.241635799408, "timing/total_ms": 9736.358612775803, "tokens/completion": 441.5, "tokens/masked_fraction": 0.09173274040222168, "wall_clock/generate_s": 76.84347653388977 }, { "advantage/absmean": 0.0, "entropy": 0.1777847707271576, "epoch": 0.985, "grad_norm": 0.0, "importance_ratio": 1.0152616500854492, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.213760644197464, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 197, "timing/generation_ms": 1451.9930183887482, "timing/scoring_ms": 0.8165538311004639, "timing/total_ms": 1452.8095722198486, "tokens/completion": 363.625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 13.531107664108276 }, { "advantage/absmean": 0.0, "entropy": 0.11029931157827377, "epoch": 0.99, "grad_norm": 0.0, "importance_ratio": 1.0095638036727905, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.09593881666660309, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 198, "timing/generation_ms": 310.85309386253357, "timing/scoring_ms": 0.7995367050170898, "timing/total_ms": 311.65263056755066, "tokens/completion": 204.75, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 11.675525188446045 }, { "advantage/absmean": 0.0, "entropy": 0.2395121157169342, "epoch": 0.995, "grad_norm": 0.0, "importance_ratio": 1.0348376035690308, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.15337741374969482, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 199, "timing/generation_ms": 749.4363486766815, "timing/scoring_ms": 0.8217096328735352, "timing/total_ms": 750.258058309555, "tokens/completion": 408.875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 12.204156398773193 }, { "advantage/absmean": 0.0, "entropy": 0.20129668712615967, "epoch": 1.0, "grad_norm": 0.0, "importance_ratio": 1.0415815114974976, "learning_rate": 2e-07, "loss": 0.0, "mismatch_kl": 0.09972988069057465, "reward": 0.25, "reward/chaining_reward": 1.0, "reward/format_reward": 0.0, "reward/order_reward": 0.0, "reward/std": 0.0, "reward/tool_name_reward": 0.0, "step": 200, "timing/generation_ms": 766.4197385311127, "timing/scoring_ms": 0.74043869972229, "timing/total_ms": 767.160177230835, "tokens/completion": 80.625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 1.7591853141784668 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }