{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9561594263043443, "eval_steps": 300, "global_step": 1800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.013279992032004782, "grad_norm": 0.09672249853610992, "learning_rate": 5.555555555555556e-06, "logits/chosen": -5.172801971435547, "logits/rejected": -5.090667724609375, "logps/chosen": -1938.1014404296875, "logps/rejected": -2350.493408203125, "loss": 0.4119, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 1.6186755895614624, "rewards/margins": 1.6609911918640137, "rewards/rejected": -0.04231562465429306, "step": 25 }, { "epoch": 0.026559984064009563, "grad_norm": 0.0022277592215687037, "learning_rate": 1.2169312169312169e-05, "logits/chosen": -5.170306205749512, "logits/rejected": -5.1402153968811035, "logps/chosen": -1873.95654296875, "logps/rejected": -2359.867431640625, "loss": 0.0021, "rewards/accuracies": 0.9987499713897705, "rewards/chosen": 9.564491271972656, "rewards/margins": 14.256077766418457, "rewards/rejected": -4.691585540771484, "step": 50 }, { "epoch": 0.03983997609601434, "grad_norm": 0.0011371532455086708, "learning_rate": 1.8783068783068782e-05, "logits/chosen": -5.187832832336426, "logits/rejected": -5.198115825653076, "logps/chosen": -1910.8944091796875, "logps/rejected": -2489.38037109375, "loss": 0.0031, "rewards/accuracies": 0.9987499713897705, "rewards/chosen": 12.900212287902832, "rewards/margins": 22.378742218017578, "rewards/rejected": -9.478529930114746, "step": 75 }, { "epoch": 0.05311996812801913, "grad_norm": 0.052303072065114975, "learning_rate": 2.5396825396825397e-05, "logits/chosen": -5.105872631072998, "logits/rejected": -5.113857269287109, "logps/chosen": -1784.389404296875, "logps/rejected": -2483.621826171875, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 15.34033203125, "rewards/margins": 23.903770446777344, "rewards/rejected": -8.563438415527344, "step": 100 }, { "epoch": 0.0663999601600239, "grad_norm": 0.00014079039101488888, "learning_rate": 3.1746031746031745e-05, "logits/chosen": -5.093454360961914, "logits/rejected": -5.118114471435547, "logps/chosen": -1748.82177734375, "logps/rejected": -2502.882568359375, "loss": 0.0444, "rewards/accuracies": 0.9975000023841858, "rewards/chosen": 17.988405227661133, "rewards/margins": 26.943090438842773, "rewards/rejected": -8.954689025878906, "step": 125 }, { "epoch": 0.07967995219202868, "grad_norm": 0.0, "learning_rate": 3.7830687830687835e-05, "logits/chosen": -5.150300979614258, "logits/rejected": -5.2056450843811035, "logps/chosen": -1758.1025390625, "logps/rejected": -2555.788818359375, "loss": 0.1328, "rewards/accuracies": 0.9962499737739563, "rewards/chosen": 19.03263282775879, "rewards/margins": 33.82518005371094, "rewards/rejected": -14.792549133300781, "step": 150 }, { "epoch": 0.09295994422403346, "grad_norm": 1.7618376091377286e-07, "learning_rate": 4.4444444444444447e-05, "logits/chosen": -4.575012683868408, "logits/rejected": -4.625253200531006, "logps/chosen": -1652.279541015625, "logps/rejected": -2526.673095703125, "loss": 0.1233, "rewards/accuracies": 0.9962499737739563, "rewards/chosen": 33.62147521972656, "rewards/margins": 40.31822967529297, "rewards/rejected": -6.696755886077881, "step": 175 }, { "epoch": 0.10623993625603825, "grad_norm": 0.0, "learning_rate": 4.988193624557261e-05, "logits/chosen": -4.950489521026611, "logits/rejected": -5.021872043609619, "logps/chosen": -1673.5699462890625, "logps/rejected": -2633.55810546875, "loss": 0.0448, "rewards/accuracies": 0.9975000023841858, "rewards/chosen": 28.974946975708008, "rewards/margins": 54.47317886352539, "rewards/rejected": -25.49822235107422, "step": 200 }, { "epoch": 0.11951992828804303, "grad_norm": 8.172152774932329e-06, "learning_rate": 4.914403778040142e-05, "logits/chosen": -5.6842427253723145, "logits/rejected": -5.796781063079834, "logps/chosen": -1676.33544921875, "logps/rejected": -2672.82275390625, "loss": 0.267, "rewards/accuracies": 0.9962499737739563, "rewards/chosen": 23.7067928314209, "rewards/margins": 54.104942321777344, "rewards/rejected": -30.398147583007812, "step": 225 }, { "epoch": 0.1327999203200478, "grad_norm": 2.3736068044399872e-07, "learning_rate": 4.840613931523023e-05, "logits/chosen": -5.078042030334473, "logits/rejected": -5.211648464202881, "logps/chosen": -1770.01025390625, "logps/rejected": -2542.817626953125, "loss": 0.0444, "rewards/accuracies": 0.9962499737739563, "rewards/chosen": 22.469932556152344, "rewards/margins": 38.964515686035156, "rewards/rejected": -16.494586944580078, "step": 250 }, { "epoch": 0.1460799123520526, "grad_norm": 0.005825951229780912, "learning_rate": 4.766824085005903e-05, "logits/chosen": -3.7053234577178955, "logits/rejected": -3.792789936065674, "logps/chosen": -1495.026611328125, "logps/rejected": -2455.17333984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 45.91399383544922, "rewards/margins": 57.979087829589844, "rewards/rejected": -12.06509017944336, "step": 275 }, { "epoch": 0.15935990438405737, "grad_norm": 0.0, "learning_rate": 4.693034238488784e-05, "logits/chosen": -4.392225742340088, "logits/rejected": -4.497957706451416, "logps/chosen": -1525.969482421875, "logps/rejected": -2793.993408203125, "loss": 0.0026, "rewards/accuracies": 0.9987499713897705, "rewards/chosen": 47.394325256347656, "rewards/margins": 87.98892211914062, "rewards/rejected": -40.5946044921875, "step": 300 }, { "epoch": 0.15935990438405737, "eval_logits/chosen": -5.281342029571533, "eval_logits/rejected": -5.432720184326172, "eval_logps/chosen": -1583.0440673828125, "eval_logps/rejected": -3079.93701171875, "eval_loss": 0.006094762589782476, "eval_rewards/accuracies": 0.9997354745864868, "eval_rewards/chosen": 38.269344329833984, "eval_rewards/margins": 109.5548095703125, "eval_rewards/rejected": -71.28547668457031, "eval_runtime": 5053.7815, "eval_samples_per_second": 1.496, "eval_steps_per_second": 1.496, "step": 300 }, { "epoch": 0.17263989641606214, "grad_norm": 0.0, "learning_rate": 4.619244391971665e-05, "logits/chosen": -5.1374006271362305, "logits/rejected": -5.255193710327148, "logps/chosen": -1523.1787109375, "logps/rejected": -2809.8671875, "loss": 0.0058, "rewards/accuracies": 0.9987499713897705, "rewards/chosen": 48.07456970214844, "rewards/margins": 91.14300537109375, "rewards/rejected": -43.06843566894531, "step": 325 }, { "epoch": 0.18591988844806692, "grad_norm": 0.0, "learning_rate": 4.545454545454546e-05, "logits/chosen": -5.125935077667236, "logits/rejected": -5.29694938659668, "logps/chosen": -1479.3677978515625, "logps/rejected": -2825.154296875, "loss": 0.0734, "rewards/accuracies": 0.9987499713897705, "rewards/chosen": 46.14125442504883, "rewards/margins": 98.82136535644531, "rewards/rejected": -52.680118560791016, "step": 350 }, { "epoch": 0.1991998804800717, "grad_norm": NaN, "learning_rate": 4.471664698937427e-05, "logits/chosen": -5.370585918426514, "logits/rejected": -5.502476692199707, "logps/chosen": -1628.15234375, "logps/rejected": -2935.11865234375, "loss": 0.2488, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 41.5304069519043, "rewards/margins": 86.15292358398438, "rewards/rejected": -44.622520446777344, "step": 375 }, { "epoch": 0.2124798725120765, "grad_norm": 3.176272730343044e-05, "learning_rate": 4.400826446280992e-05, "logits/chosen": -4.086411952972412, "logits/rejected": -4.187539577484131, "logps/chosen": -1379.5189208984375, "logps/rejected": -2316.741943359375, "loss": 0.0516, "rewards/accuracies": 0.9987499713897705, "rewards/chosen": 53.773406982421875, "rewards/margins": 58.21940612792969, "rewards/rejected": -4.445999622344971, "step": 400 }, { "epoch": 0.22575986454408128, "grad_norm": 0.0, "learning_rate": 4.327036599763873e-05, "logits/chosen": -4.049648761749268, "logits/rejected": -4.1783528327941895, "logps/chosen": -1389.79052734375, "logps/rejected": -2547.714111328125, "loss": 0.0261, "rewards/accuracies": 0.9975000023841858, "rewards/chosen": 63.311161041259766, "rewards/margins": 73.89366912841797, "rewards/rejected": -10.582507133483887, "step": 425 }, { "epoch": 0.23903985657608606, "grad_norm": 0.0, "learning_rate": 4.253246753246753e-05, "logits/chosen": -4.82040548324585, "logits/rejected": -5.097967624664307, "logps/chosen": -1308.955322265625, "logps/rejected": -2926.090087890625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 65.9859390258789, "rewards/margins": 110.7310562133789, "rewards/rejected": -44.745121002197266, "step": 450 }, { "epoch": 0.2523198486080908, "grad_norm": 0.0, "learning_rate": 4.1794569067296344e-05, "logits/chosen": -5.209223747253418, "logits/rejected": -5.616621017456055, "logps/chosen": -1131.145751953125, "logps/rejected": -2934.873779296875, "loss": 0.1267, "rewards/accuracies": 0.9987499713897705, "rewards/chosen": 77.0686264038086, "rewards/margins": 137.7195281982422, "rewards/rejected": -60.650909423828125, "step": 475 }, { "epoch": 0.2655998406400956, "grad_norm": 0.0, "learning_rate": 4.105667060212515e-05, "logits/chosen": -5.333703994750977, "logits/rejected": -5.778102397918701, "logps/chosen": -1165.505126953125, "logps/rejected": -3131.888671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 73.15416717529297, "rewards/margins": 152.99362182617188, "rewards/rejected": -79.83946990966797, "step": 500 }, { "epoch": 0.27887983267210037, "grad_norm": 0.0, "learning_rate": 4.031877213695396e-05, "logits/chosen": -5.346728324890137, "logits/rejected": -5.7733964920043945, "logps/chosen": -1223.05224609375, "logps/rejected": -3238.250732421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 74.84019470214844, "rewards/margins": 158.7885284423828, "rewards/rejected": -83.94831848144531, "step": 525 }, { "epoch": 0.2921598247041052, "grad_norm": 0.0, "learning_rate": 3.9580873671782764e-05, "logits/chosen": -5.371117115020752, "logits/rejected": -5.80122184753418, "logps/chosen": -1197.4788818359375, "logps/rejected": -3147.82373046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 73.79009246826172, "rewards/margins": 156.77114868164062, "rewards/rejected": -82.98106384277344, "step": 550 }, { "epoch": 0.30543981673611, "grad_norm": 0.0, "learning_rate": 3.884297520661157e-05, "logits/chosen": -5.349785327911377, "logits/rejected": -5.790933609008789, "logps/chosen": -1208.9173583984375, "logps/rejected": -3130.772216796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 75.89309692382812, "rewards/margins": 158.11007690429688, "rewards/rejected": -82.21697235107422, "step": 575 }, { "epoch": 0.31871980876811473, "grad_norm": 0.0, "learning_rate": 3.810507674144038e-05, "logits/chosen": -5.363485336303711, "logits/rejected": -5.786189079284668, "logps/chosen": -1208.5062255859375, "logps/rejected": -3253.5537109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 74.30347442626953, "rewards/margins": 159.1309051513672, "rewards/rejected": -84.82743835449219, "step": 600 }, { "epoch": 0.31871980876811473, "eval_logits/chosen": -5.346546173095703, "eval_logits/rejected": -5.785186767578125, "eval_logps/chosen": -1229.4915771484375, "eval_logps/rejected": -3205.3955078125, "eval_loss": 2.772683060925374e-08, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": 73.62458801269531, "eval_rewards/margins": 157.45587158203125, "eval_rewards/rejected": -83.83128356933594, "eval_runtime": 5050.407, "eval_samples_per_second": 1.497, "eval_steps_per_second": 1.497, "step": 600 }, { "epoch": 0.33199980080011954, "grad_norm": 0.0, "learning_rate": 3.736717827626919e-05, "logits/chosen": -5.342191219329834, "logits/rejected": -5.78147029876709, "logps/chosen": -1231.17626953125, "logps/rejected": -3281.271240234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 76.75041198730469, "rewards/margins": 161.56117248535156, "rewards/rejected": -84.8107681274414, "step": 625 }, { "epoch": 0.3452797928321243, "grad_norm": 0.0, "learning_rate": 3.6629279811097996e-05, "logits/chosen": -5.3565673828125, "logits/rejected": -5.786357879638672, "logps/chosen": -1223.502197265625, "logps/rejected": -3251.859619140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 75.81534576416016, "rewards/margins": 161.51220703125, "rewards/rejected": -85.69686889648438, "step": 650 }, { "epoch": 0.3585597848641291, "grad_norm": 0.0, "learning_rate": 3.58913813459268e-05, "logits/chosen": -5.362225532531738, "logits/rejected": -5.812385082244873, "logps/chosen": -1166.56884765625, "logps/rejected": -3114.923095703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 72.61547088623047, "rewards/margins": 154.4960479736328, "rewards/rejected": -81.88056945800781, "step": 675 }, { "epoch": 0.37183977689613384, "grad_norm": 0.0, "learning_rate": 3.515348288075561e-05, "logits/chosen": -5.3681135177612305, "logits/rejected": -5.7970967292785645, "logps/chosen": -1179.33642578125, "logps/rejected": -3196.193359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 73.7895736694336, "rewards/margins": 157.34544372558594, "rewards/rejected": -83.55587768554688, "step": 700 }, { "epoch": 0.38511976892813865, "grad_norm": 0.0, "learning_rate": 3.4415584415584416e-05, "logits/chosen": -5.34775447845459, "logits/rejected": -5.784357070922852, "logps/chosen": -1217.151611328125, "logps/rejected": -3244.43603515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 75.03498077392578, "rewards/margins": 159.89642333984375, "rewards/rejected": -84.86143493652344, "step": 725 }, { "epoch": 0.3983997609601434, "grad_norm": 0.0, "learning_rate": 3.367768595041322e-05, "logits/chosen": -5.360373020172119, "logits/rejected": -5.796443939208984, "logps/chosen": -1183.0787353515625, "logps/rejected": -3178.691162109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 74.35552978515625, "rewards/margins": 158.35353088378906, "rewards/rejected": -83.99797821044922, "step": 750 }, { "epoch": 0.4116797529921482, "grad_norm": 0.0, "learning_rate": 3.293978748524203e-05, "logits/chosen": -5.358860492706299, "logits/rejected": -5.76988410949707, "logps/chosen": -1195.098388671875, "logps/rejected": -3297.325927734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 74.31096649169922, "rewards/margins": 160.78826904296875, "rewards/rejected": -86.4773178100586, "step": 775 }, { "epoch": 0.424959745024153, "grad_norm": 0.0, "learning_rate": 3.220188902007084e-05, "logits/chosen": -5.348212242126465, "logits/rejected": -5.7905988693237305, "logps/chosen": -1181.4913330078125, "logps/rejected": -3163.5537109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 73.05725860595703, "rewards/margins": 156.5362548828125, "rewards/rejected": -83.47899627685547, "step": 800 }, { "epoch": 0.43823973705615776, "grad_norm": 0.0, "learning_rate": 3.146399055489965e-05, "logits/chosen": -5.34844970703125, "logits/rejected": -5.808111667633057, "logps/chosen": -1188.3974609375, "logps/rejected": -3050.988525390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 75.98808288574219, "rewards/margins": 156.63418579101562, "rewards/rejected": -80.64608764648438, "step": 825 }, { "epoch": 0.45151972908816257, "grad_norm": 0.0, "learning_rate": 3.072609208972845e-05, "logits/chosen": -5.347776889801025, "logits/rejected": -5.778768539428711, "logps/chosen": -1217.6676025390625, "logps/rejected": -3179.977294921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 72.46488189697266, "rewards/margins": 155.26470947265625, "rewards/rejected": -82.7998275756836, "step": 850 }, { "epoch": 0.4647997211201673, "grad_norm": 0.0, "learning_rate": 2.9988193624557266e-05, "logits/chosen": -5.346984386444092, "logits/rejected": -5.777279853820801, "logps/chosen": -1190.4200439453125, "logps/rejected": -3160.3427734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 74.78855895996094, "rewards/margins": 158.04502868652344, "rewards/rejected": -83.25646209716797, "step": 875 }, { "epoch": 0.4780797131521721, "grad_norm": 0.0, "learning_rate": 2.925029515938607e-05, "logits/chosen": -5.340627670288086, "logits/rejected": -5.768866062164307, "logps/chosen": -1216.4300537109375, "logps/rejected": -3318.114990234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 76.93177795410156, "rewards/margins": 162.36549377441406, "rewards/rejected": -85.43370819091797, "step": 900 }, { "epoch": 0.4780797131521721, "eval_logits/chosen": -5.34751558303833, "eval_logits/rejected": -5.786447525024414, "eval_logps/chosen": -1229.6158447265625, "eval_logps/rejected": -3206.52294921875, "eval_loss": 2.7424055915048484e-08, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": 73.61215209960938, "eval_rewards/margins": 157.55618286132812, "eval_rewards/rejected": -83.94403076171875, "eval_runtime": 5055.7493, "eval_samples_per_second": 1.496, "eval_steps_per_second": 1.496, "step": 900 }, { "epoch": 0.4913597051841769, "grad_norm": 0.0, "learning_rate": 2.8512396694214875e-05, "logits/chosen": -5.3588547706604, "logits/rejected": -5.789546966552734, "logps/chosen": -1181.3402099609375, "logps/rejected": -3084.116943359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 73.35798645019531, "rewards/margins": 153.719970703125, "rewards/rejected": -80.36196899414062, "step": 925 }, { "epoch": 0.5046396972161816, "grad_norm": 0.0, "learning_rate": 2.7774498229043683e-05, "logits/chosen": -5.35978889465332, "logits/rejected": -5.78073787689209, "logps/chosen": -1190.137451171875, "logps/rejected": -3243.138427734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 73.11257934570312, "rewards/margins": 157.15797424316406, "rewards/rejected": -84.0453872680664, "step": 950 }, { "epoch": 0.5179196892481864, "grad_norm": 0.0, "learning_rate": 2.7036599763872494e-05, "logits/chosen": -5.359025955200195, "logits/rejected": -5.762141704559326, "logps/chosen": -1213.3565673828125, "logps/rejected": -3293.6083984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 74.01912689208984, "rewards/margins": 159.57952880859375, "rewards/rejected": -85.56039428710938, "step": 975 }, { "epoch": 0.5311996812801912, "grad_norm": 0.0, "learning_rate": 2.62987012987013e-05, "logits/chosen": -5.3734660148620605, "logits/rejected": -5.804994106292725, "logps/chosen": -1160.0848388671875, "logps/rejected": -3189.649658203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 72.9050064086914, "rewards/margins": 157.53111267089844, "rewards/rejected": -84.62610626220703, "step": 1000 }, { "epoch": 0.544479673312196, "grad_norm": 0.0, "learning_rate": 2.5560802833530107e-05, "logits/chosen": -5.334293365478516, "logits/rejected": -5.756840229034424, "logps/chosen": -1224.6365966796875, "logps/rejected": -3241.719482421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 75.11827850341797, "rewards/margins": 158.6215057373047, "rewards/rejected": -83.50323486328125, "step": 1025 }, { "epoch": 0.5577596653442007, "grad_norm": 0.0, "learning_rate": 2.4822904368358915e-05, "logits/chosen": -5.3553924560546875, "logits/rejected": -5.774379253387451, "logps/chosen": -1204.6495361328125, "logps/rejected": -3309.4697265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 74.23416900634766, "rewards/margins": 161.051513671875, "rewards/rejected": -86.81734466552734, "step": 1050 }, { "epoch": 0.5710396573762055, "grad_norm": 0.0, "learning_rate": 2.4085005903187723e-05, "logits/chosen": -5.352781295776367, "logits/rejected": -5.79608154296875, "logps/chosen": -1181.0159912109375, "logps/rejected": -3210.6171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 74.22586059570312, "rewards/margins": 158.11729431152344, "rewards/rejected": -83.89143371582031, "step": 1075 }, { "epoch": 0.5843196494082104, "grad_norm": 0.0, "learning_rate": 2.334710743801653e-05, "logits/chosen": -5.354379177093506, "logits/rejected": -5.800614833831787, "logps/chosen": -1205.6617431640625, "logps/rejected": -3202.022216796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 75.01228332519531, "rewards/margins": 159.0198974609375, "rewards/rejected": -84.00760650634766, "step": 1100 }, { "epoch": 0.5975996414402152, "grad_norm": 0.0, "learning_rate": 2.2609208972845338e-05, "logits/chosen": -5.3513922691345215, "logits/rejected": -5.785496234893799, "logps/chosen": -1204.985107421875, "logps/rejected": -3199.90771484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 75.09490966796875, "rewards/margins": 159.92877197265625, "rewards/rejected": -84.83385467529297, "step": 1125 }, { "epoch": 0.61087963347222, "grad_norm": 0.0, "learning_rate": 2.1871310507674146e-05, "logits/chosen": -5.356717109680176, "logits/rejected": -5.778936386108398, "logps/chosen": -1161.7479248046875, "logps/rejected": -3200.0009765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 72.54718780517578, "rewards/margins": 156.47047424316406, "rewards/rejected": -83.92327880859375, "step": 1150 }, { "epoch": 0.6241596255042247, "grad_norm": 0.0, "learning_rate": 2.1133412042502954e-05, "logits/chosen": -5.356033802032471, "logits/rejected": -5.776955604553223, "logps/chosen": -1221.6715087890625, "logps/rejected": -3244.65478515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 77.02409362792969, "rewards/margins": 162.34893798828125, "rewards/rejected": -85.3248519897461, "step": 1175 }, { "epoch": 0.6374396175362295, "grad_norm": 0.0, "learning_rate": 2.039551357733176e-05, "logits/chosen": -5.343997955322266, "logits/rejected": -5.7793049812316895, "logps/chosen": -1240.2574462890625, "logps/rejected": -3241.74365234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 76.53114318847656, "rewards/margins": 160.63980102539062, "rewards/rejected": -84.10865020751953, "step": 1200 }, { "epoch": 0.6374396175362295, "eval_logits/chosen": -5.347506999969482, "eval_logits/rejected": -5.786441326141357, "eval_logps/chosen": -1229.614501953125, "eval_logps/rejected": -3206.52392578125, "eval_loss": 2.717070834989954e-08, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": 73.6122817993164, "eval_rewards/margins": 157.5564422607422, "eval_rewards/rejected": -83.94415283203125, "eval_runtime": 5072.1136, "eval_samples_per_second": 1.491, "eval_steps_per_second": 1.491, "step": 1200 }, { "epoch": 0.6507196095682343, "grad_norm": 0.0, "learning_rate": 1.965761511216057e-05, "logits/chosen": -5.319537162780762, "logits/rejected": -5.7901716232299805, "logps/chosen": -1262.02001953125, "logps/rejected": -3200.416259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 77.81517791748047, "rewards/margins": 162.22039794921875, "rewards/rejected": -84.40521240234375, "step": 1225 }, { "epoch": 0.6639996016002391, "grad_norm": 0.0, "learning_rate": 1.8919716646989374e-05, "logits/chosen": -5.357088565826416, "logits/rejected": -5.7999067306518555, "logps/chosen": -1262.8670654296875, "logps/rejected": -3238.267822265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 75.28630065917969, "rewards/margins": 160.115478515625, "rewards/rejected": -84.82917785644531, "step": 1250 }, { "epoch": 0.6772795936322438, "grad_norm": 0.0, "learning_rate": 1.8181818181818182e-05, "logits/chosen": -5.364207744598389, "logits/rejected": -5.781350135803223, "logps/chosen": -1174.3680419921875, "logps/rejected": -3168.82470703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 73.05094909667969, "rewards/margins": 156.62974548339844, "rewards/rejected": -83.57878875732422, "step": 1275 }, { "epoch": 0.6905595856642486, "grad_norm": 0.0, "learning_rate": 1.744391971664699e-05, "logits/chosen": -5.36082649230957, "logits/rejected": -5.809657096862793, "logps/chosen": -1232.3963623046875, "logps/rejected": -3192.718505859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 76.31717681884766, "rewards/margins": 159.16860961914062, "rewards/rejected": -82.85144805908203, "step": 1300 }, { "epoch": 0.7038395776962534, "grad_norm": 0.0, "learning_rate": 1.6706021251475798e-05, "logits/chosen": -5.333703517913818, "logits/rejected": -5.784048557281494, "logps/chosen": -1232.0091552734375, "logps/rejected": -3142.961181640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 76.76317596435547, "rewards/margins": 158.55519104003906, "rewards/rejected": -81.79202270507812, "step": 1325 }, { "epoch": 0.7171195697282582, "grad_norm": 0.0, "learning_rate": 1.5968122786304606e-05, "logits/chosen": -5.347146511077881, "logits/rejected": -5.755209922790527, "logps/chosen": -1232.19677734375, "logps/rejected": -3197.754638671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 77.04602813720703, "rewards/margins": 159.55673217773438, "rewards/rejected": -82.51070404052734, "step": 1350 }, { "epoch": 0.730399561760263, "grad_norm": 0.0, "learning_rate": 1.5230224321133414e-05, "logits/chosen": -5.345367908477783, "logits/rejected": -5.765539646148682, "logps/chosen": -1186.147705078125, "logps/rejected": -3244.4658203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 71.38182830810547, "rewards/margins": 156.21925354003906, "rewards/rejected": -84.83743286132812, "step": 1375 }, { "epoch": 0.7436795537922677, "grad_norm": 0.0, "learning_rate": 1.4492325855962221e-05, "logits/chosen": -5.351555824279785, "logits/rejected": -5.780291557312012, "logps/chosen": -1244.749267578125, "logps/rejected": -3228.1875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 74.1323471069336, "rewards/margins": 157.52206420898438, "rewards/rejected": -83.38970947265625, "step": 1400 }, { "epoch": 0.7569595458242725, "grad_norm": 0.0, "learning_rate": 1.3754427390791028e-05, "logits/chosen": -5.369306564331055, "logits/rejected": -5.805060863494873, "logps/chosen": -1220.332763671875, "logps/rejected": -3181.40771484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 76.05998229980469, "rewards/margins": 159.49896240234375, "rewards/rejected": -83.43896484375, "step": 1425 }, { "epoch": 0.7702395378562773, "grad_norm": 0.0, "learning_rate": 1.3016528925619837e-05, "logits/chosen": -5.351638317108154, "logits/rejected": -5.7673234939575195, "logps/chosen": -1239.2713623046875, "logps/rejected": -3280.63623046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 77.09069061279297, "rewards/margins": 162.12330627441406, "rewards/rejected": -85.0326156616211, "step": 1450 }, { "epoch": 0.7835195298882821, "grad_norm": 0.0, "learning_rate": 1.2278630460448642e-05, "logits/chosen": -5.357781410217285, "logits/rejected": -5.76120138168335, "logps/chosen": -1216.162841796875, "logps/rejected": -3242.781494140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 75.91290283203125, "rewards/margins": 158.61940002441406, "rewards/rejected": -82.70650482177734, "step": 1475 }, { "epoch": 0.7967995219202868, "grad_norm": 0.0, "learning_rate": 1.154073199527745e-05, "logits/chosen": -5.327876567840576, "logits/rejected": -5.785478591918945, "logps/chosen": -1229.9464111328125, "logps/rejected": -3201.066650390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 74.83934783935547, "rewards/margins": 160.0379638671875, "rewards/rejected": -85.1986312866211, "step": 1500 }, { "epoch": 0.7967995219202868, "eval_logits/chosen": -5.349328994750977, "eval_logits/rejected": -5.788782119750977, "eval_logps/chosen": -1229.252685546875, "eval_logps/rejected": -3206.0927734375, "eval_loss": 2.713120395014812e-08, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": 73.6484603881836, "eval_rewards/margins": 157.54949951171875, "eval_rewards/rejected": -83.90103912353516, "eval_runtime": 5065.8297, "eval_samples_per_second": 1.493, "eval_steps_per_second": 1.493, "step": 1500 }, { "epoch": 0.8100795139522916, "grad_norm": 0.0, "learning_rate": 1.0802833530106257e-05, "logits/chosen": -5.353452205657959, "logits/rejected": -5.775092601776123, "logps/chosen": -1196.65283203125, "logps/rejected": -3247.838134765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 74.67597961425781, "rewards/margins": 159.15103149414062, "rewards/rejected": -84.47505950927734, "step": 1525 }, { "epoch": 0.8233595059842964, "grad_norm": 0.0, "learning_rate": 1.0064935064935065e-05, "logits/chosen": -5.353899955749512, "logits/rejected": -5.768539905548096, "logps/chosen": -1213.5177001953125, "logps/rejected": -3247.889892578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 76.4741439819336, "rewards/margins": 159.95687866210938, "rewards/rejected": -83.48273468017578, "step": 1550 }, { "epoch": 0.8366394980163012, "grad_norm": 0.0, "learning_rate": 9.327036599763873e-06, "logits/chosen": -5.326158046722412, "logits/rejected": -5.789017200469971, "logps/chosen": -1205.4630126953125, "logps/rejected": -3144.54248046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 74.31897735595703, "rewards/margins": 156.3781280517578, "rewards/rejected": -82.05914306640625, "step": 1575 }, { "epoch": 0.849919490048306, "grad_norm": 0.0, "learning_rate": 8.589138134592681e-06, "logits/chosen": -5.375563144683838, "logits/rejected": -5.806121349334717, "logps/chosen": -1176.205810546875, "logps/rejected": -3167.938232421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 73.48652648925781, "rewards/margins": 155.93348693847656, "rewards/rejected": -82.44695281982422, "step": 1600 }, { "epoch": 0.8631994820803107, "grad_norm": 0.0, "learning_rate": 7.851239669421489e-06, "logits/chosen": -5.364181995391846, "logits/rejected": -5.792683124542236, "logps/chosen": -1207.3050537109375, "logps/rejected": -3223.77001953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 75.72416687011719, "rewards/margins": 160.62159729003906, "rewards/rejected": -84.89742279052734, "step": 1625 }, { "epoch": 0.8764794741123155, "grad_norm": 0.0, "learning_rate": 7.113341204250296e-06, "logits/chosen": -5.384028434753418, "logits/rejected": -5.806175708770752, "logps/chosen": -1151.069091796875, "logps/rejected": -3129.250732421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 71.24993896484375, "rewards/margins": 152.2142333984375, "rewards/rejected": -80.96430969238281, "step": 1650 }, { "epoch": 0.8897594661443203, "grad_norm": 0.0, "learning_rate": 6.375442739079103e-06, "logits/chosen": -5.363752365112305, "logits/rejected": -5.777155876159668, "logps/chosen": -1190.1175537109375, "logps/rejected": -3278.9990234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 75.2784423828125, "rewards/margins": 159.60365295410156, "rewards/rejected": -84.3252182006836, "step": 1675 }, { "epoch": 0.9030394581763251, "grad_norm": 0.0, "learning_rate": 5.637544273907911e-06, "logits/chosen": -5.344578266143799, "logits/rejected": -5.798131942749023, "logps/chosen": -1239.991455078125, "logps/rejected": -3247.3134765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 77.14320373535156, "rewards/margins": 161.80763244628906, "rewards/rejected": -84.66442108154297, "step": 1700 }, { "epoch": 0.9163194502083298, "grad_norm": 0.0, "learning_rate": 4.899645808736718e-06, "logits/chosen": -5.37684440612793, "logits/rejected": -5.844979286193848, "logps/chosen": -1169.2398681640625, "logps/rejected": -3061.455322265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 73.63998413085938, "rewards/margins": 154.5657196044922, "rewards/rejected": -80.92573547363281, "step": 1725 }, { "epoch": 0.9295994422403346, "grad_norm": 0.0, "learning_rate": 4.1617473435655256e-06, "logits/chosen": -5.353868007659912, "logits/rejected": -5.773197174072266, "logps/chosen": -1192.5277099609375, "logps/rejected": -3230.524169921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 76.6631851196289, "rewards/margins": 159.65138244628906, "rewards/rejected": -82.98820495605469, "step": 1750 }, { "epoch": 0.9428794342723394, "grad_norm": 0.0, "learning_rate": 3.4238488783943334e-06, "logits/chosen": -5.353931903839111, "logits/rejected": -5.798101425170898, "logps/chosen": -1241.2935791015625, "logps/rejected": -3156.944091796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 76.2099380493164, "rewards/margins": 158.4427947998047, "rewards/rejected": -82.23284912109375, "step": 1775 }, { "epoch": 0.9561594263043443, "grad_norm": 0.0, "learning_rate": 2.685950413223141e-06, "logits/chosen": -5.3457183837890625, "logits/rejected": -5.788435459136963, "logps/chosen": -1229.1221923828125, "logps/rejected": -3212.9150390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 77.10746765136719, "rewards/margins": 159.9591064453125, "rewards/rejected": -82.85163879394531, "step": 1800 }, { "epoch": 0.9561594263043443, "eval_logits/chosen": -5.349380016326904, "eval_logits/rejected": -5.788834095001221, "eval_logps/chosen": -1229.2548828125, "eval_logps/rejected": -3206.10107421875, "eval_loss": 2.729576387139332e-08, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": 73.64824676513672, "eval_rewards/margins": 157.55010986328125, "eval_rewards/rejected": -83.90184783935547, "eval_runtime": 5052.2786, "eval_samples_per_second": 1.497, "eval_steps_per_second": 1.497, "step": 1800 } ], "logging_steps": 25, "max_steps": 1883, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }