{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1803, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005546311702717693, "grad_norm": 2.562516450881958, "learning_rate": 0.0, "logits/chosen": 0.5525987148284912, "logits/rejected": 0.8582919239997864, "logps/chosen": -280.66448974609375, "logps/rejected": -198.26028442382812, "loss": 0.2528, "loss/chosen-sft": 1.1612701416015625, "loss/dpo": 0.2528122663497925, "rewards/accuracies": 0.28125, "rewards/chosen": -0.00369430985301733, "rewards/margins": -0.003086067270487547, "rewards/rejected": -0.0006082424661144614, "step": 1 }, { "epoch": 0.0027731558513588465, "grad_norm": 1.8309712409973145, "learning_rate": 1.1049723756906076e-08, "logits/chosen": 0.6242256164550781, "logits/rejected": 0.7471870183944702, "logps/chosen": -365.79705810546875, "logps/rejected": -280.31201171875, "loss": 0.2317, "loss/chosen-sft": 1.3244354724884033, "loss/dpo": 0.23169046640396118, "rewards/accuracies": 0.421875, "rewards/chosen": -0.00040742673445492983, "rewards/margins": -0.0013081450015306473, "rewards/rejected": 0.0009007179760374129, "step": 5 }, { "epoch": 0.005546311702717693, "grad_norm": 2.4382247924804688, "learning_rate": 2.486187845303867e-08, "logits/chosen": 0.649553656578064, "logits/rejected": 0.8831444978713989, "logps/chosen": -284.4432067871094, "logps/rejected": -248.49337768554688, "loss": 0.2305, "loss/chosen-sft": 1.1865930557250977, "loss/dpo": 0.23052072525024414, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.0007243253057822585, "rewards/margins": -5.907495506107807e-06, "rewards/rejected": -0.0007184178684838116, "step": 10 }, { "epoch": 0.008319467554076539, "grad_norm": 2.216749668121338, "learning_rate": 3.867403314917127e-08, "logits/chosen": 0.7230191826820374, "logits/rejected": 0.9627832174301147, "logps/chosen": -302.41400146484375, "logps/rejected": -264.18963623046875, "loss": 0.2317, "loss/chosen-sft": 1.27865731716156, "loss/dpo": 0.23168079555034637, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.000250987388426438, "rewards/margins": 0.0003366722376085818, "rewards/rejected": -8.568489283788949e-05, "step": 15 }, { "epoch": 0.011092623405435386, "grad_norm": 2.429425001144409, "learning_rate": 5.248618784530386e-08, "logits/chosen": 0.7443245649337769, "logits/rejected": 0.9213382601737976, "logps/chosen": -296.67926025390625, "logps/rejected": -233.2224578857422, "loss": 0.2435, "loss/chosen-sft": 1.2470526695251465, "loss/dpo": 0.24353936314582825, "rewards/accuracies": 0.53125, "rewards/chosen": 0.000928783614654094, "rewards/margins": 0.0006695252959616482, "rewards/rejected": 0.00025925817317329347, "step": 20 }, { "epoch": 0.013865779256794232, "grad_norm": 2.4279961585998535, "learning_rate": 6.629834254143646e-08, "logits/chosen": 0.8054523468017578, "logits/rejected": 1.0713064670562744, "logps/chosen": -301.8001403808594, "logps/rejected": -265.5172424316406, "loss": 0.2257, "loss/chosen-sft": 1.2987916469573975, "loss/dpo": 0.22568467259407043, "rewards/accuracies": 0.5, "rewards/chosen": -0.0008613772806711495, "rewards/margins": -5.407678145274986e-06, "rewards/rejected": -0.0008559696143493056, "step": 25 }, { "epoch": 0.016638935108153077, "grad_norm": 1.9755202531814575, "learning_rate": 8.011049723756906e-08, "logits/chosen": 0.8093868494033813, "logits/rejected": 0.8907510042190552, "logps/chosen": -317.31488037109375, "logps/rejected": -238.03701782226562, "loss": 0.2252, "loss/chosen-sft": 1.2658107280731201, "loss/dpo": 0.225176140666008, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.0001813205162761733, "rewards/margins": -0.0007404539501294494, "rewards/rejected": 0.0005591334775090218, "step": 30 }, { "epoch": 0.019412090959511925, "grad_norm": 2.0302202701568604, "learning_rate": 9.392265193370165e-08, "logits/chosen": 0.7306667566299438, "logits/rejected": 0.9124480485916138, "logps/chosen": -326.53631591796875, "logps/rejected": -278.6661682128906, "loss": 0.2324, "loss/chosen-sft": 1.2703967094421387, "loss/dpo": 0.23235292732715607, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -9.17384386411868e-05, "rewards/margins": 0.00045947995386086404, "rewards/rejected": -0.0005512182251550257, "step": 35 }, { "epoch": 0.022185246810870772, "grad_norm": 2.4935853481292725, "learning_rate": 1.0773480662983425e-07, "logits/chosen": 0.8364327549934387, "logits/rejected": 0.9892117381095886, "logps/chosen": -299.20513916015625, "logps/rejected": -260.8147888183594, "loss": 0.237, "loss/chosen-sft": 1.221801519393921, "loss/dpo": 0.23703160881996155, "rewards/accuracies": 0.5, "rewards/chosen": 0.00010150570597033948, "rewards/margins": 0.00015612409333698452, "rewards/rejected": -5.461848195409402e-05, "step": 40 }, { "epoch": 0.024958402662229616, "grad_norm": 2.5195465087890625, "learning_rate": 1.2154696132596685e-07, "logits/chosen": 0.7738581895828247, "logits/rejected": 1.0265463590621948, "logps/chosen": -318.85101318359375, "logps/rejected": -237.57666015625, "loss": 0.2307, "loss/chosen-sft": 1.2462958097457886, "loss/dpo": 0.23071476817131042, "rewards/accuracies": 0.53125, "rewards/chosen": -8.556898683309555e-05, "rewards/margins": 0.0009564283536747098, "rewards/rejected": -0.0010419972240924835, "step": 45 }, { "epoch": 0.027731558513588463, "grad_norm": 2.201871871948242, "learning_rate": 1.3535911602209942e-07, "logits/chosen": 0.7524826526641846, "logits/rejected": 0.8071184158325195, "logps/chosen": -343.429443359375, "logps/rejected": -256.6566162109375, "loss": 0.2343, "loss/chosen-sft": 1.2330760955810547, "loss/dpo": 0.2342543601989746, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.00010758457938209176, "rewards/margins": 0.0004232854989822954, "rewards/rejected": -0.00031570097780786455, "step": 50 }, { "epoch": 0.03050471436494731, "grad_norm": 2.1549618244171143, "learning_rate": 1.4917127071823204e-07, "logits/chosen": 0.6793915033340454, "logits/rejected": 0.9533787965774536, "logps/chosen": -286.3639221191406, "logps/rejected": -248.7386932373047, "loss": 0.2127, "loss/chosen-sft": 1.2288844585418701, "loss/dpo": 0.21267767250537872, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": 0.0002672958071343601, "rewards/margins": 0.0003136277082376182, "rewards/rejected": -4.6331912017194554e-05, "step": 55 }, { "epoch": 0.033277870216306155, "grad_norm": 2.341963291168213, "learning_rate": 1.6298342541436463e-07, "logits/chosen": 0.7014708518981934, "logits/rejected": 1.0665433406829834, "logps/chosen": -310.38134765625, "logps/rejected": -242.14944458007812, "loss": 0.2263, "loss/chosen-sft": 1.2344342470169067, "loss/dpo": 0.2262984812259674, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": 0.0001726890041027218, "rewards/margins": 0.000270530057605356, "rewards/rejected": -9.784109715837985e-05, "step": 60 }, { "epoch": 0.036051026067665005, "grad_norm": 2.35201096534729, "learning_rate": 1.7679558011049722e-07, "logits/chosen": 0.7712376117706299, "logits/rejected": 0.9752995371818542, "logps/chosen": -323.18377685546875, "logps/rejected": -256.13909912109375, "loss": 0.2288, "loss/chosen-sft": 1.301064133644104, "loss/dpo": 0.228795126080513, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.0009808921022340655, "rewards/margins": 0.000625107903033495, "rewards/rejected": 0.0003557841700967401, "step": 65 }, { "epoch": 0.03882418191902385, "grad_norm": 2.6178271770477295, "learning_rate": 1.9060773480662984e-07, "logits/chosen": 0.7605900168418884, "logits/rejected": 0.8606400489807129, "logps/chosen": -351.55023193359375, "logps/rejected": -263.1789855957031, "loss": 0.2374, "loss/chosen-sft": 1.300065040588379, "loss/dpo": 0.23744645714759827, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.0018242821097373962, "rewards/margins": 0.0014699746388942003, "rewards/rejected": 0.0003543071507010609, "step": 70 }, { "epoch": 0.04159733777038269, "grad_norm": 1.9824270009994507, "learning_rate": 2.0441988950276244e-07, "logits/chosen": 0.8183063268661499, "logits/rejected": 1.056434988975525, "logps/chosen": -304.6941833496094, "logps/rejected": -246.3820343017578, "loss": 0.2305, "loss/chosen-sft": 1.2456873655319214, "loss/dpo": 0.23054738342761993, "rewards/accuracies": 0.5, "rewards/chosen": 0.0013592742616310716, "rewards/margins": 0.0002732311259023845, "rewards/rejected": 0.0010860430775210261, "step": 75 }, { "epoch": 0.044370493621741544, "grad_norm": 2.122025489807129, "learning_rate": 2.1823204419889503e-07, "logits/chosen": 0.7255276441574097, "logits/rejected": 0.9549322128295898, "logps/chosen": -319.54302978515625, "logps/rejected": -265.33941650390625, "loss": 0.2322, "loss/chosen-sft": 1.2196282148361206, "loss/dpo": 0.23221342265605927, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.002971996320411563, "rewards/margins": 0.002218194305896759, "rewards/rejected": 0.0007538016652688384, "step": 80 }, { "epoch": 0.04714364947310039, "grad_norm": 2.24088454246521, "learning_rate": 2.320441988950276e-07, "logits/chosen": 0.9074276685714722, "logits/rejected": 0.8777278661727905, "logps/chosen": -311.9112243652344, "logps/rejected": -235.16073608398438, "loss": 0.2216, "loss/chosen-sft": 1.3307913541793823, "loss/dpo": 0.22161617875099182, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.0005854293704032898, "rewards/margins": 0.00022579837241210043, "rewards/rejected": 0.0003596309688873589, "step": 85 }, { "epoch": 0.04991680532445923, "grad_norm": 2.139037609100342, "learning_rate": 2.4585635359116024e-07, "logits/chosen": 0.5329002141952515, "logits/rejected": 0.8354529142379761, "logps/chosen": -297.3415832519531, "logps/rejected": -252.3900604248047, "loss": 0.2337, "loss/chosen-sft": 1.1595966815948486, "loss/dpo": 0.23370489478111267, "rewards/accuracies": 0.59375, "rewards/chosen": 0.002842614660039544, "rewards/margins": 0.003834578674286604, "rewards/rejected": -0.0009919643634930253, "step": 90 }, { "epoch": 0.05268996117581808, "grad_norm": 2.091688632965088, "learning_rate": 2.596685082872928e-07, "logits/chosen": 0.743569016456604, "logits/rejected": 0.9253543615341187, "logps/chosen": -319.1153564453125, "logps/rejected": -271.44854736328125, "loss": 0.2319, "loss/chosen-sft": 1.252418041229248, "loss/dpo": 0.2319067418575287, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0031713570933789015, "rewards/margins": 0.002358622383326292, "rewards/rejected": 0.0008127348264679313, "step": 95 }, { "epoch": 0.05546311702717693, "grad_norm": 2.084585189819336, "learning_rate": 2.734806629834254e-07, "logits/chosen": 0.7346712350845337, "logits/rejected": 0.8320924639701843, "logps/chosen": -297.8607482910156, "logps/rejected": -247.6679229736328, "loss": 0.2305, "loss/chosen-sft": 1.1974223852157593, "loss/dpo": 0.23046079277992249, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.0038714460097253323, "rewards/margins": 0.004991450812667608, "rewards/rejected": -0.001120004802942276, "step": 100 }, { "epoch": 0.05823627287853577, "grad_norm": 2.2226288318634033, "learning_rate": 2.87292817679558e-07, "logits/chosen": 0.7988497614860535, "logits/rejected": 1.012565016746521, "logps/chosen": -322.70849609375, "logps/rejected": -248.58609008789062, "loss": 0.2288, "loss/chosen-sft": 1.2408934831619263, "loss/dpo": 0.228750079870224, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.006251711398363113, "rewards/margins": 0.007590452674776316, "rewards/rejected": -0.001338740810751915, "step": 105 }, { "epoch": 0.06100942872989462, "grad_norm": 2.297593593597412, "learning_rate": 3.011049723756906e-07, "logits/chosen": 0.726061224937439, "logits/rejected": 1.0324876308441162, "logps/chosen": -281.13897705078125, "logps/rejected": -251.3507537841797, "loss": 0.2497, "loss/chosen-sft": 1.165907859802246, "loss/dpo": 0.24974966049194336, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.0033826460130512714, "rewards/margins": 0.0045662978664040565, "rewards/rejected": -0.0011836517369374633, "step": 110 }, { "epoch": 0.06378258458125347, "grad_norm": 1.887861728668213, "learning_rate": 3.149171270718232e-07, "logits/chosen": 0.7413018345832825, "logits/rejected": 1.1185897588729858, "logps/chosen": -276.33709716796875, "logps/rejected": -222.5193328857422, "loss": 0.2343, "loss/chosen-sft": 1.2043964862823486, "loss/dpo": 0.23427948355674744, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.006615237798541784, "rewards/margins": 0.009055422618985176, "rewards/rejected": -0.002440184820443392, "step": 115 }, { "epoch": 0.06655574043261231, "grad_norm": 1.9467929601669312, "learning_rate": 3.287292817679558e-07, "logits/chosen": 0.779278039932251, "logits/rejected": 1.0071234703063965, "logps/chosen": -309.67266845703125, "logps/rejected": -277.6393127441406, "loss": 0.2353, "loss/chosen-sft": 1.250942587852478, "loss/dpo": 0.23532333970069885, "rewards/accuracies": 0.625, "rewards/chosen": 0.005773581098765135, "rewards/margins": 0.008276325650513172, "rewards/rejected": -0.0025027443189173937, "step": 120 }, { "epoch": 0.06932889628397115, "grad_norm": 1.7528764009475708, "learning_rate": 3.425414364640884e-07, "logits/chosen": 0.7105622291564941, "logits/rejected": 0.9455775022506714, "logps/chosen": -247.330810546875, "logps/rejected": -234.4990692138672, "loss": 0.2187, "loss/chosen-sft": 1.204803466796875, "loss/dpo": 0.21866345405578613, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.005182682536542416, "rewards/margins": 0.00711778411641717, "rewards/rejected": -0.0019351018127053976, "step": 125 }, { "epoch": 0.07210205213533001, "grad_norm": 2.430612802505493, "learning_rate": 3.56353591160221e-07, "logits/chosen": 0.8638172149658203, "logits/rejected": 1.0668808221817017, "logps/chosen": -282.2446594238281, "logps/rejected": -227.82461547851562, "loss": 0.2437, "loss/chosen-sft": 1.202839970588684, "loss/dpo": 0.243655726313591, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0073145912028849125, "rewards/margins": 0.013259527273476124, "rewards/rejected": -0.005944933742284775, "step": 130 }, { "epoch": 0.07487520798668885, "grad_norm": 2.054070472717285, "learning_rate": 3.7016574585635355e-07, "logits/chosen": 0.6850544214248657, "logits/rejected": 0.9055711627006531, "logps/chosen": -329.90472412109375, "logps/rejected": -257.50933837890625, "loss": 0.232, "loss/chosen-sft": 1.2098791599273682, "loss/dpo": 0.2320254147052765, "rewards/accuracies": 0.6875, "rewards/chosen": 0.008226691745221615, "rewards/margins": 0.014872360043227673, "rewards/rejected": -0.006645667366683483, "step": 135 }, { "epoch": 0.0776483638380477, "grad_norm": 2.211216688156128, "learning_rate": 3.8397790055248617e-07, "logits/chosen": 0.7939187288284302, "logits/rejected": 0.9478033185005188, "logps/chosen": -318.32879638671875, "logps/rejected": -252.41796875, "loss": 0.2326, "loss/chosen-sft": 1.2694523334503174, "loss/dpo": 0.2325749397277832, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.010530084371566772, "rewards/margins": 0.015220420435070992, "rewards/rejected": -0.004690336063504219, "step": 140 }, { "epoch": 0.08042151968940654, "grad_norm": 2.2831854820251465, "learning_rate": 3.9779005524861873e-07, "logits/chosen": 0.7261658906936646, "logits/rejected": 0.7962777614593506, "logps/chosen": -322.5554504394531, "logps/rejected": -283.42431640625, "loss": 0.2576, "loss/chosen-sft": 1.2340948581695557, "loss/dpo": 0.25764793157577515, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.0055245086550712585, "rewards/margins": 0.013468381948769093, "rewards/rejected": -0.00794387236237526, "step": 145 }, { "epoch": 0.08319467554076539, "grad_norm": 2.4128849506378174, "learning_rate": 4.1160220994475135e-07, "logits/chosen": 0.715286374092102, "logits/rejected": 0.9870149493217468, "logps/chosen": -293.37115478515625, "logps/rejected": -232.06796264648438, "loss": 0.2465, "loss/chosen-sft": 1.1868611574172974, "loss/dpo": 0.2465299814939499, "rewards/accuracies": 0.71875, "rewards/chosen": 0.008405817672610283, "rewards/margins": 0.021539855748414993, "rewards/rejected": -0.013134037144482136, "step": 150 }, { "epoch": 0.08596783139212424, "grad_norm": 2.3715765476226807, "learning_rate": 4.2541436464088397e-07, "logits/chosen": 0.7736817598342896, "logits/rejected": 0.842314600944519, "logps/chosen": -321.31622314453125, "logps/rejected": -244.69161987304688, "loss": 0.2485, "loss/chosen-sft": 1.2318105697631836, "loss/dpo": 0.24850551784038544, "rewards/accuracies": 0.6875, "rewards/chosen": 0.011977704241871834, "rewards/margins": 0.02396266534924507, "rewards/rejected": -0.011984961107373238, "step": 155 }, { "epoch": 0.08874098724348309, "grad_norm": 2.3416831493377686, "learning_rate": 4.3922651933701654e-07, "logits/chosen": 0.7620590925216675, "logits/rejected": 1.0964124202728271, "logps/chosen": -287.9859313964844, "logps/rejected": -227.1605682373047, "loss": 0.2379, "loss/chosen-sft": 1.2391362190246582, "loss/dpo": 0.23787717521190643, "rewards/accuracies": 0.71875, "rewards/chosen": 0.008434845134615898, "rewards/margins": 0.027770137414336205, "rewards/rejected": -0.019335290417075157, "step": 160 }, { "epoch": 0.09151414309484193, "grad_norm": 2.5888402462005615, "learning_rate": 4.5303867403314916e-07, "logits/chosen": 0.7178218960762024, "logits/rejected": 1.0630967617034912, "logps/chosen": -300.853759765625, "logps/rejected": -226.2397918701172, "loss": 0.2489, "loss/chosen-sft": 1.2291874885559082, "loss/dpo": 0.2488756626844406, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.0051612951792776585, "rewards/margins": 0.02443910576403141, "rewards/rejected": -0.01927780732512474, "step": 165 }, { "epoch": 0.09428729894620078, "grad_norm": 2.380986213684082, "learning_rate": 4.668508287292817e-07, "logits/chosen": 0.769342303276062, "logits/rejected": 1.0527831315994263, "logps/chosen": -301.633056640625, "logps/rejected": -271.94549560546875, "loss": 0.2273, "loss/chosen-sft": 1.2724863290786743, "loss/dpo": 0.22729595005512238, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.005723125766962767, "rewards/margins": 0.024905353784561157, "rewards/rejected": -0.019182229414582253, "step": 170 }, { "epoch": 0.09706045479755962, "grad_norm": 2.346064329147339, "learning_rate": 4.806629834254143e-07, "logits/chosen": 0.7390652894973755, "logits/rejected": 1.0698118209838867, "logps/chosen": -336.51434326171875, "logps/rejected": -264.5447692871094, "loss": 0.2329, "loss/chosen-sft": 1.2428892850875854, "loss/dpo": 0.2328546941280365, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.013324853964149952, "rewards/margins": 0.037267185747623444, "rewards/rejected": -0.023942332714796066, "step": 175 }, { "epoch": 0.09983361064891846, "grad_norm": 2.174849271774292, "learning_rate": 4.944751381215469e-07, "logits/chosen": 0.6287063360214233, "logits/rejected": 0.839179515838623, "logps/chosen": -281.9019470214844, "logps/rejected": -246.0726318359375, "loss": 0.2247, "loss/chosen-sft": 1.2559473514556885, "loss/dpo": 0.22472822666168213, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.0028144109528511763, "rewards/margins": 0.03012235462665558, "rewards/rejected": -0.03293676674365997, "step": 180 }, { "epoch": 0.10260676650027732, "grad_norm": 2.126122236251831, "learning_rate": 4.999957796414774e-07, "logits/chosen": 0.7778046727180481, "logits/rejected": 0.8593734502792358, "logps/chosen": -305.8876037597656, "logps/rejected": -231.57009887695312, "loss": 0.2087, "loss/chosen-sft": 1.2814704179763794, "loss/dpo": 0.20866894721984863, "rewards/accuracies": 0.65625, "rewards/chosen": -0.0015260865911841393, "rewards/margins": 0.032867975533008575, "rewards/rejected": -0.03439406305551529, "step": 185 }, { "epoch": 0.10537992235163617, "grad_norm": 2.394864082336426, "learning_rate": 4.999699890776339e-07, "logits/chosen": 0.7574223279953003, "logits/rejected": 0.981839656829834, "logps/chosen": -307.5397033691406, "logps/rejected": -260.81689453125, "loss": 0.2244, "loss/chosen-sft": 1.2601521015167236, "loss/dpo": 0.22438263893127441, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.001994546502828598, "rewards/margins": 0.03860156610608101, "rewards/rejected": -0.040596116334199905, "step": 190 }, { "epoch": 0.10815307820299501, "grad_norm": 2.5312979221343994, "learning_rate": 4.999207550094137e-07, "logits/chosen": 0.5832860469818115, "logits/rejected": 0.9633687138557434, "logps/chosen": -320.2137145996094, "logps/rejected": -255.04135131835938, "loss": 0.2567, "loss/chosen-sft": 1.1643339395523071, "loss/dpo": 0.25674429535865784, "rewards/accuracies": 0.6875, "rewards/chosen": -0.009322636760771275, "rewards/margins": 0.04288625717163086, "rewards/rejected": -0.05220889300107956, "step": 195 }, { "epoch": 0.11092623405435385, "grad_norm": 2.321089506149292, "learning_rate": 4.998480820542476e-07, "logits/chosen": 0.5820528268814087, "logits/rejected": 0.7807351350784302, "logps/chosen": -270.94195556640625, "logps/rejected": -220.6343231201172, "loss": 0.2267, "loss/chosen-sft": 1.1440722942352295, "loss/dpo": 0.22673571109771729, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.0010490523418411613, "rewards/margins": 0.05040453001856804, "rewards/rejected": -0.05145358294248581, "step": 200 }, { "epoch": 0.1136993899057127, "grad_norm": 2.525728940963745, "learning_rate": 4.997519770277884e-07, "logits/chosen": 0.44963234663009644, "logits/rejected": 0.9135919809341431, "logps/chosen": -272.36224365234375, "logps/rejected": -245.6114044189453, "loss": 0.2211, "loss/chosen-sft": 1.1812119483947754, "loss/dpo": 0.22114984691143036, "rewards/accuracies": 0.6875, "rewards/chosen": -0.022441323846578598, "rewards/margins": 0.04137767106294632, "rewards/rejected": -0.06381900608539581, "step": 205 }, { "epoch": 0.11647254575707154, "grad_norm": 2.5504720211029053, "learning_rate": 4.99632448943273e-07, "logits/chosen": 0.7174805998802185, "logits/rejected": 1.01616632938385, "logps/chosen": -292.91656494140625, "logps/rejected": -247.65097045898438, "loss": 0.2198, "loss/chosen-sft": 1.2257840633392334, "loss/dpo": 0.2198173552751541, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.025929760187864304, "rewards/margins": 0.05272764712572098, "rewards/rejected": -0.07865741103887558, "step": 210 }, { "epoch": 0.1192457016084304, "grad_norm": 2.452855110168457, "learning_rate": 4.994895090106754e-07, "logits/chosen": 0.6810121536254883, "logits/rejected": 1.0060815811157227, "logps/chosen": -282.73846435546875, "logps/rejected": -267.6331481933594, "loss": 0.222, "loss/chosen-sft": 1.192631483078003, "loss/dpo": 0.2220323085784912, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.02360682748258114, "rewards/margins": 0.048061199486255646, "rewards/rejected": -0.07166802138090134, "step": 215 }, { "epoch": 0.12201885745978924, "grad_norm": 2.1740057468414307, "learning_rate": 4.993231706356567e-07, "logits/chosen": 0.6147283911705017, "logits/rejected": 0.8315596580505371, "logps/chosen": -336.437255859375, "logps/rejected": -253.25320434570312, "loss": 0.2107, "loss/chosen-sft": 1.323418378829956, "loss/dpo": 0.21066264808177948, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.03315647691488266, "rewards/margins": 0.057157598435878754, "rewards/rejected": -0.09031407535076141, "step": 220 }, { "epoch": 0.12479201331114809, "grad_norm": 2.743816375732422, "learning_rate": 4.991334494183074e-07, "logits/chosen": 0.6287509202957153, "logits/rejected": 0.9196675419807434, "logps/chosen": -317.734375, "logps/rejected": -269.40240478515625, "loss": 0.2176, "loss/chosen-sft": 1.2351751327514648, "loss/dpo": 0.21760547161102295, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.026932349428534508, "rewards/margins": 0.06921146810054779, "rewards/rejected": -0.09614382684230804, "step": 225 }, { "epoch": 0.12756516916250693, "grad_norm": 2.0664100646972656, "learning_rate": 4.989203631516842e-07, "logits/chosen": 0.5628304481506348, "logits/rejected": 0.9230527877807617, "logps/chosen": -290.1402282714844, "logps/rejected": -230.798828125, "loss": 0.2193, "loss/chosen-sft": 1.1874725818634033, "loss/dpo": 0.21933284401893616, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.04623774066567421, "rewards/margins": 0.04349591210484505, "rewards/rejected": -0.08973364531993866, "step": 230 }, { "epoch": 0.13033832501386577, "grad_norm": 2.657771587371826, "learning_rate": 4.986839318201412e-07, "logits/chosen": 0.5794273018836975, "logits/rejected": 0.8498503565788269, "logps/chosen": -283.7383117675781, "logps/rejected": -270.8831787109375, "loss": 0.2113, "loss/chosen-sft": 1.2316051721572876, "loss/dpo": 0.2112795114517212, "rewards/accuracies": 0.6875, "rewards/chosen": -0.04904647916555405, "rewards/margins": 0.06235337257385254, "rewards/rejected": -0.11139985173940659, "step": 235 }, { "epoch": 0.13311148086522462, "grad_norm": 2.395413875579834, "learning_rate": 4.984241775974562e-07, "logits/chosen": 0.6039844751358032, "logits/rejected": 0.9748314619064331, "logps/chosen": -343.57330322265625, "logps/rejected": -282.4112548828125, "loss": 0.2046, "loss/chosen-sft": 1.2928663492202759, "loss/dpo": 0.20455893874168396, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.05615786463022232, "rewards/margins": 0.057152897119522095, "rewards/rejected": -0.11331076920032501, "step": 240 }, { "epoch": 0.13588463671658346, "grad_norm": 2.410879373550415, "learning_rate": 4.981411248447506e-07, "logits/chosen": 0.5458131432533264, "logits/rejected": 0.7987755537033081, "logps/chosen": -320.3564453125, "logps/rejected": -274.52349853515625, "loss": 0.2127, "loss/chosen-sft": 1.2482545375823975, "loss/dpo": 0.21267366409301758, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.05543135479092598, "rewards/margins": 0.07835234701633453, "rewards/rejected": -0.13378369808197021, "step": 245 }, { "epoch": 0.1386577925679423, "grad_norm": 2.126710891723633, "learning_rate": 4.978348001082048e-07, "logits/chosen": 0.5291799306869507, "logits/rejected": 0.7740551829338074, "logps/chosen": -305.97369384765625, "logps/rejected": -258.32623291015625, "loss": 0.1986, "loss/chosen-sft": 1.317022442817688, "loss/dpo": 0.19859442114830017, "rewards/accuracies": 0.71875, "rewards/chosen": -0.056786395609378815, "rewards/margins": 0.06887609511613846, "rewards/rejected": -0.12566249072551727, "step": 250 }, { "epoch": 0.14143094841930118, "grad_norm": 2.286869525909424, "learning_rate": 4.975052321165688e-07, "logits/chosen": 0.4996607303619385, "logits/rejected": 0.8591309785842896, "logps/chosen": -315.6490783691406, "logps/rejected": -246.1661834716797, "loss": 0.2036, "loss/chosen-sft": 1.2581641674041748, "loss/dpo": 0.2035764753818512, "rewards/accuracies": 0.625, "rewards/chosen": -0.0983358696103096, "rewards/margins": 0.06135006994009018, "rewards/rejected": -0.15968593955039978, "step": 255 }, { "epoch": 0.14420410427066002, "grad_norm": 2.3020002841949463, "learning_rate": 4.971524517784676e-07, "logits/chosen": 0.6534574627876282, "logits/rejected": 0.9783307313919067, "logps/chosen": -315.9989318847656, "logps/rejected": -300.7061767578125, "loss": 0.1887, "loss/chosen-sft": 1.3003921508789062, "loss/dpo": 0.18872812390327454, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.0870845764875412, "rewards/margins": 0.09078534692525864, "rewards/rejected": -0.17786994576454163, "step": 260 }, { "epoch": 0.14697726012201887, "grad_norm": 2.2956702709198, "learning_rate": 4.967764921795026e-07, "logits/chosen": 0.4762963354587555, "logits/rejected": 0.7974573373794556, "logps/chosen": -320.36944580078125, "logps/rejected": -253.922119140625, "loss": 0.1965, "loss/chosen-sft": 1.2646160125732422, "loss/dpo": 0.19646450877189636, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.10069359838962555, "rewards/margins": 0.07882802188396454, "rewards/rejected": -0.1795216202735901, "step": 265 }, { "epoch": 0.1497504159733777, "grad_norm": 2.5840351581573486, "learning_rate": 4.963773885791484e-07, "logits/chosen": 0.46905121207237244, "logits/rejected": 0.6983851194381714, "logps/chosen": -319.40997314453125, "logps/rejected": -264.07574462890625, "loss": 0.2039, "loss/chosen-sft": 1.2846953868865967, "loss/dpo": 0.2038877308368683, "rewards/accuracies": 0.59375, "rewards/chosen": -0.14658531546592712, "rewards/margins": 0.03527539223432541, "rewards/rejected": -0.18186071515083313, "step": 270 }, { "epoch": 0.15252357182473655, "grad_norm": 2.3462185859680176, "learning_rate": 4.959551784074461e-07, "logits/chosen": 0.5223512053489685, "logits/rejected": 0.673526406288147, "logps/chosen": -353.185546875, "logps/rejected": -287.7161865234375, "loss": 0.1814, "loss/chosen-sft": 1.3422802686691284, "loss/dpo": 0.18135803937911987, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.11449305713176727, "rewards/margins": 0.09264171123504639, "rewards/rejected": -0.20713476836681366, "step": 275 }, { "epoch": 0.1552967276760954, "grad_norm": 2.2738516330718994, "learning_rate": 4.955099012614933e-07, "logits/chosen": 0.589896023273468, "logits/rejected": 0.8485240936279297, "logps/chosen": -340.37811279296875, "logps/rejected": -278.1029968261719, "loss": 0.1715, "loss/chosen-sft": 1.3321316242218018, "loss/dpo": 0.17148999869823456, "rewards/accuracies": 0.6875, "rewards/chosen": -0.12517675757408142, "rewards/margins": 0.09786481410264969, "rewards/rejected": -0.2230415791273117, "step": 280 }, { "epoch": 0.15806988352745424, "grad_norm": 2.3262434005737305, "learning_rate": 4.9504159890173e-07, "logits/chosen": 0.5216140747070312, "logits/rejected": 0.760550320148468, "logps/chosen": -334.56842041015625, "logps/rejected": -273.66644287109375, "loss": 0.1817, "loss/chosen-sft": 1.3172911405563354, "loss/dpo": 0.18172022700309753, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.15752620995044708, "rewards/margins": 0.09383035451173782, "rewards/rejected": -0.2513565421104431, "step": 285 }, { "epoch": 0.16084303937881309, "grad_norm": 2.1715376377105713, "learning_rate": 4.945503152480221e-07, "logits/chosen": 0.4839824140071869, "logits/rejected": 0.7215232849121094, "logps/chosen": -348.7749938964844, "logps/rejected": -296.0406188964844, "loss": 0.1779, "loss/chosen-sft": 1.3408844470977783, "loss/dpo": 0.17787811160087585, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.13581573963165283, "rewards/margins": 0.10350732505321503, "rewards/rejected": -0.23932304978370667, "step": 290 }, { "epoch": 0.16361619523017193, "grad_norm": 1.9679666757583618, "learning_rate": 4.940360963755426e-07, "logits/chosen": 0.31268057227134705, "logits/rejected": 0.45721864700317383, "logps/chosen": -306.3217468261719, "logps/rejected": -259.17010498046875, "loss": 0.168, "loss/chosen-sft": 1.3279306888580322, "loss/dpo": 0.16799867153167725, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.14948979020118713, "rewards/margins": 0.11183127015829086, "rewards/rejected": -0.2613210082054138, "step": 295 }, { "epoch": 0.16638935108153077, "grad_norm": 2.079015016555786, "learning_rate": 4.934989905104502e-07, "logits/chosen": 0.38664117455482483, "logits/rejected": 0.5652385354042053, "logps/chosen": -327.77191162109375, "logps/rejected": -252.3622589111328, "loss": 0.1615, "loss/chosen-sft": 1.3207409381866455, "loss/dpo": 0.1614522486925125, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.15085890889167786, "rewards/margins": 0.1116364598274231, "rewards/rejected": -0.26249536871910095, "step": 300 }, { "epoch": 0.16916250693288962, "grad_norm": 1.7469545602798462, "learning_rate": 4.929390480253667e-07, "logits/chosen": 0.4214795231819153, "logits/rejected": 0.5547307729721069, "logps/chosen": -345.3471984863281, "logps/rejected": -296.81610107421875, "loss": 0.1597, "loss/chosen-sft": 1.3892717361450195, "loss/dpo": 0.15971598029136658, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.1656077653169632, "rewards/margins": 0.13407020270824432, "rewards/rejected": -0.2996779680252075, "step": 305 }, { "epoch": 0.1719356627842485, "grad_norm": 2.5472302436828613, "learning_rate": 4.923563214346525e-07, "logits/chosen": 0.2179243117570877, "logits/rejected": 0.5859929323196411, "logps/chosen": -359.1659240722656, "logps/rejected": -323.3959045410156, "loss": 0.1693, "loss/chosen-sft": 1.3262033462524414, "loss/dpo": 0.16931983828544617, "rewards/accuracies": 0.75, "rewards/chosen": -0.1735382229089737, "rewards/margins": 0.17630478739738464, "rewards/rejected": -0.34984302520751953, "step": 310 }, { "epoch": 0.17470881863560733, "grad_norm": 1.927839756011963, "learning_rate": 4.917508653894817e-07, "logits/chosen": 0.2809959352016449, "logits/rejected": 0.6429028511047363, "logps/chosen": -306.2704162597656, "logps/rejected": -285.570556640625, "loss": 0.1554, "loss/chosen-sft": 1.3090779781341553, "loss/dpo": 0.15539118647575378, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.19202136993408203, "rewards/margins": 0.13031750917434692, "rewards/rejected": -0.32233884930610657, "step": 315 }, { "epoch": 0.17748197448696618, "grad_norm": 1.4578968286514282, "learning_rate": 4.911227366727166e-07, "logits/chosen": 0.3806017339229584, "logits/rejected": 0.646203339099884, "logps/chosen": -343.3798828125, "logps/rejected": -279.1783752441406, "loss": 0.1326, "loss/chosen-sft": 1.4282009601593018, "loss/dpo": 0.1326414793729782, "rewards/accuracies": 0.6875, "rewards/chosen": -0.20244988799095154, "rewards/margins": 0.11332936584949493, "rewards/rejected": -0.31577926874160767, "step": 320 }, { "epoch": 0.18025513033832502, "grad_norm": 2.2041287422180176, "learning_rate": 4.904719941935818e-07, "logits/chosen": 0.3938923478126526, "logits/rejected": 0.5521587133407593, "logps/chosen": -330.3848876953125, "logps/rejected": -284.7240905761719, "loss": 0.1679, "loss/chosen-sft": 1.335784673690796, "loss/dpo": 0.16794539988040924, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.2126191109418869, "rewards/margins": 0.12985429167747498, "rewards/rejected": -0.34247341752052307, "step": 325 }, { "epoch": 0.18302828618968386, "grad_norm": 2.1968367099761963, "learning_rate": 4.897986989821405e-07, "logits/chosen": 0.27726611495018005, "logits/rejected": 0.42112284898757935, "logps/chosen": -304.1949462890625, "logps/rejected": -284.40496826171875, "loss": 0.1461, "loss/chosen-sft": 1.3867931365966797, "loss/dpo": 0.1461399793624878, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.24662432074546814, "rewards/margins": 0.13204023241996765, "rewards/rejected": -0.3786645829677582, "step": 330 }, { "epoch": 0.1858014420410427, "grad_norm": 1.8825585842132568, "learning_rate": 4.891029141835697e-07, "logits/chosen": 0.320846825838089, "logits/rejected": 0.5570532083511353, "logps/chosen": -327.5889587402344, "logps/rejected": -277.48101806640625, "loss": 0.1485, "loss/chosen-sft": 1.314846396446228, "loss/dpo": 0.1484725922346115, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.23770102858543396, "rewards/margins": 0.13969966769218445, "rewards/rejected": -0.3774007260799408, "step": 335 }, { "epoch": 0.18857459789240155, "grad_norm": 2.134202718734741, "learning_rate": 4.883847050522388e-07, "logits/chosen": 0.32603126764297485, "logits/rejected": 0.5855879783630371, "logps/chosen": -306.82940673828125, "logps/rejected": -306.5520324707031, "loss": 0.1386, "loss/chosen-sft": 1.3495581150054932, "loss/dpo": 0.138578861951828, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.25742700695991516, "rewards/margins": 0.12370799481868744, "rewards/rejected": -0.3811350464820862, "step": 340 }, { "epoch": 0.1913477537437604, "grad_norm": 1.7875416278839111, "learning_rate": 4.876441389455892e-07, "logits/chosen": 0.24733588099479675, "logits/rejected": 0.45660385489463806, "logps/chosen": -326.81988525390625, "logps/rejected": -290.23175048828125, "loss": 0.1307, "loss/chosen-sft": 1.372218132019043, "loss/dpo": 0.130675807595253, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.24057714641094208, "rewards/margins": 0.166746586561203, "rewards/rejected": -0.4073237478733063, "step": 345 }, { "epoch": 0.19412090959511924, "grad_norm": 1.9340115785598755, "learning_rate": 4.868812853178174e-07, "logits/chosen": 0.2431308925151825, "logits/rejected": 0.5894955992698669, "logps/chosen": -318.0989990234375, "logps/rejected": -283.68896484375, "loss": 0.1256, "loss/chosen-sft": 1.354346513748169, "loss/dpo": 0.12558932602405548, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.2501407861709595, "rewards/margins": 0.15478388965129852, "rewards/rejected": -0.4049246907234192, "step": 350 }, { "epoch": 0.19689406544647808, "grad_norm": 2.164641857147217, "learning_rate": 4.860962157133614e-07, "logits/chosen": 0.042165856808423996, "logits/rejected": 0.2751336395740509, "logps/chosen": -312.33636474609375, "logps/rejected": -293.5239562988281, "loss": 0.1378, "loss/chosen-sft": 1.307612657546997, "loss/dpo": 0.1378055065870285, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.2775038182735443, "rewards/margins": 0.1561960130929947, "rewards/rejected": -0.4336997866630554, "step": 355 }, { "epoch": 0.19966722129783693, "grad_norm": 1.9318678379058838, "learning_rate": 4.852890037601906e-07, "logits/chosen": 0.16851834952831268, "logits/rejected": 0.38687795400619507, "logps/chosen": -339.43377685546875, "logps/rejected": -299.70611572265625, "loss": 0.1384, "loss/chosen-sft": 1.4045716524124146, "loss/dpo": 0.1384141445159912, "rewards/accuracies": 0.65625, "rewards/chosen": -0.2960866689682007, "rewards/margins": 0.16027973592281342, "rewards/rejected": -0.4563663899898529, "step": 360 }, { "epoch": 0.20244037714919577, "grad_norm": 1.874963402748108, "learning_rate": 4.844597251629008e-07, "logits/chosen": 0.21792784333229065, "logits/rejected": 0.3441501259803772, "logps/chosen": -325.2286682128906, "logps/rejected": -281.23638916015625, "loss": 0.1352, "loss/chosen-sft": 1.3615696430206299, "loss/dpo": 0.13523094356060028, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.3180539608001709, "rewards/margins": 0.1318327933549881, "rewards/rejected": -0.4498867392539978, "step": 365 }, { "epoch": 0.20521353300055464, "grad_norm": 2.0710604190826416, "learning_rate": 4.836084576956137e-07, "logits/chosen": 0.3437557816505432, "logits/rejected": 0.5105153322219849, "logps/chosen": -339.3221435546875, "logps/rejected": -306.34954833984375, "loss": 0.1202, "loss/chosen-sft": 1.4008045196533203, "loss/dpo": 0.12022168934345245, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.295442670583725, "rewards/margins": 0.2085741013288498, "rewards/rejected": -0.5040167570114136, "step": 370 }, { "epoch": 0.2079866888519135, "grad_norm": 2.0409181118011475, "learning_rate": 4.827352811946839e-07, "logits/chosen": 0.10601860284805298, "logits/rejected": 0.19340696930885315, "logps/chosen": -363.87677001953125, "logps/rejected": -313.70306396484375, "loss": 0.1444, "loss/chosen-sft": 1.4021615982055664, "loss/dpo": 0.14439384639263153, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3728375732898712, "rewards/margins": 0.1328699290752411, "rewards/rejected": -0.5057075023651123, "step": 375 }, { "epoch": 0.21075984470327233, "grad_norm": 1.8490561246871948, "learning_rate": 4.818402775512101e-07, "logits/chosen": 0.12518136203289032, "logits/rejected": 0.1990843415260315, "logps/chosen": -330.71282958984375, "logps/rejected": -289.8016052246094, "loss": 0.1272, "loss/chosen-sft": 1.3432165384292603, "loss/dpo": 0.1271766871213913, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.35159942507743835, "rewards/margins": 0.19611182808876038, "rewards/rejected": -0.5477112531661987, "step": 380 }, { "epoch": 0.21353300055463117, "grad_norm": 2.0964889526367188, "learning_rate": 4.80923530703356e-07, "logits/chosen": 0.10008511692285538, "logits/rejected": 0.3669296205043793, "logps/chosen": -345.01715087890625, "logps/rejected": -335.6979064941406, "loss": 0.1178, "loss/chosen-sft": 1.375633955001831, "loss/dpo": 0.1178009957075119, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.33166831731796265, "rewards/margins": 0.257099449634552, "rewards/rejected": -0.5887677073478699, "step": 385 }, { "epoch": 0.21630615640599002, "grad_norm": 1.3537511825561523, "learning_rate": 4.799851266284776e-07, "logits/chosen": 0.029593368992209435, "logits/rejected": 0.2654074728488922, "logps/chosen": -345.6829528808594, "logps/rejected": -295.1934814453125, "loss": 0.1097, "loss/chosen-sft": 1.4121724367141724, "loss/dpo": 0.10970698297023773, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.36835533380508423, "rewards/margins": 0.22947156429290771, "rewards/rejected": -0.5978268980979919, "step": 390 }, { "epoch": 0.21907931225734886, "grad_norm": 2.3625447750091553, "learning_rate": 4.790251533350597e-07, "logits/chosen": 0.10083159059286118, "logits/rejected": 0.31363213062286377, "logps/chosen": -332.98846435546875, "logps/rejected": -296.0115051269531, "loss": 0.1227, "loss/chosen-sft": 1.3798365592956543, "loss/dpo": 0.12265870720148087, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.4115685820579529, "rewards/margins": 0.17796972393989563, "rewards/rejected": -0.5895382761955261, "step": 395 }, { "epoch": 0.2218524681087077, "grad_norm": 1.8092533349990845, "learning_rate": 4.780437008544628e-07, "logits/chosen": 0.0437210276722908, "logits/rejected": 0.28447720408439636, "logps/chosen": -328.55364990234375, "logps/rejected": -299.6891174316406, "loss": 0.1123, "loss/chosen-sft": 1.4356980323791504, "loss/dpo": 0.11227130889892578, "rewards/accuracies": 0.75, "rewards/chosen": -0.3866044580936432, "rewards/margins": 0.21935506165027618, "rewards/rejected": -0.6059595346450806, "step": 400 }, { "epoch": 0.22462562396006655, "grad_norm": 1.755149245262146, "learning_rate": 4.770408612324783e-07, "logits/chosen": 0.06074788048863411, "logits/rejected": 0.29286664724349976, "logps/chosen": -342.78314208984375, "logps/rejected": -324.9090270996094, "loss": 0.1229, "loss/chosen-sft": 1.3950598239898682, "loss/dpo": 0.12285208702087402, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.40826615691185, "rewards/margins": 0.23281535506248474, "rewards/rejected": -0.6410815119743347, "step": 405 }, { "epoch": 0.2273987798114254, "grad_norm": 1.7682974338531494, "learning_rate": 4.760167285206968e-07, "logits/chosen": 0.03603815287351608, "logits/rejected": 0.14143748581409454, "logps/chosen": -357.19000244140625, "logps/rejected": -312.27984619140625, "loss": 0.1099, "loss/chosen-sft": 1.4361236095428467, "loss/dpo": 0.10990427434444427, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.4452105462551117, "rewards/margins": 0.2284567803144455, "rewards/rejected": -0.673667311668396, "step": 410 }, { "epoch": 0.23017193566278424, "grad_norm": 1.7545676231384277, "learning_rate": 4.749713987676871e-07, "logits/chosen": 0.060189586132764816, "logits/rejected": 0.1871364414691925, "logps/chosen": -355.28924560546875, "logps/rejected": -317.2358093261719, "loss": 0.1093, "loss/chosen-sft": 1.3852781057357788, "loss/dpo": 0.10928022861480713, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.43528875708580017, "rewards/margins": 0.2679973840713501, "rewards/rejected": -0.7032861709594727, "step": 415 }, { "epoch": 0.23294509151414308, "grad_norm": 1.7819030284881592, "learning_rate": 4.7390497000998853e-07, "logits/chosen": -0.06483317911624908, "logits/rejected": 0.0744047462940216, "logps/chosen": -327.4217834472656, "logps/rejected": -300.5835876464844, "loss": 0.1031, "loss/chosen-sft": 1.4440373182296753, "loss/dpo": 0.10313411056995392, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.420682817697525, "rewards/margins": 0.2728646397590637, "rewards/rejected": -0.6935475468635559, "step": 420 }, { "epoch": 0.23571824736550195, "grad_norm": 1.5001357793807983, "learning_rate": 4.7281754226291627e-07, "logits/chosen": -0.01330060325562954, "logits/rejected": 0.23702244460582733, "logps/chosen": -346.0107116699219, "logps/rejected": -335.9864501953125, "loss": 0.0908, "loss/chosen-sft": 1.3927044868469238, "loss/dpo": 0.09075666218996048, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.4789350628852844, "rewards/margins": 0.313282310962677, "rewards/rejected": -0.7922172546386719, "step": 425 }, { "epoch": 0.2384914032168608, "grad_norm": 1.6260863542556763, "learning_rate": 4.717092175111814e-07, "logits/chosen": 0.0023514986969530582, "logits/rejected": 0.2060910165309906, "logps/chosen": -372.6448059082031, "logps/rejected": -335.29534912109375, "loss": 0.101, "loss/chosen-sft": 1.4546012878417969, "loss/dpo": 0.10103006660938263, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5173962712287903, "rewards/margins": 0.24449630081653595, "rewards/rejected": -0.7618924975395203, "step": 430 }, { "epoch": 0.24126455906821964, "grad_norm": 1.8368723392486572, "learning_rate": 4.7058009969932666e-07, "logits/chosen": -0.010248428210616112, "logits/rejected": 0.10792305320501328, "logps/chosen": -364.9273376464844, "logps/rejected": -310.1815490722656, "loss": 0.104, "loss/chosen-sft": 1.4473296403884888, "loss/dpo": 0.10400193929672241, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5129069089889526, "rewards/margins": 0.27995672821998596, "rewards/rejected": -0.792863667011261, "step": 435 }, { "epoch": 0.24403771491957849, "grad_norm": 1.4362239837646484, "learning_rate": 4.694302947219775e-07, "logits/chosen": -0.04999478533864021, "logits/rejected": 0.15826420485973358, "logps/chosen": -371.3370361328125, "logps/rejected": -349.93511962890625, "loss": 0.0862, "loss/chosen-sft": 1.5128648281097412, "loss/dpo": 0.08617158234119415, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.5207723379135132, "rewards/margins": 0.3013128638267517, "rewards/rejected": -0.8220852017402649, "step": 440 }, { "epoch": 0.24681087077093733, "grad_norm": 1.7123464345932007, "learning_rate": 4.6825991041391067e-07, "logits/chosen": -0.131384015083313, "logits/rejected": 0.14633068442344666, "logps/chosen": -360.64166259765625, "logps/rejected": -330.9405822753906, "loss": 0.0844, "loss/chosen-sft": 1.433370590209961, "loss/dpo": 0.08443091064691544, "rewards/accuracies": 0.6875, "rewards/chosen": -0.49853143095970154, "rewards/margins": 0.3446124196052551, "rewards/rejected": -0.8431438207626343, "step": 445 }, { "epoch": 0.24958402662229617, "grad_norm": 1.4186034202575684, "learning_rate": 4.670690565399415e-07, "logits/chosen": -0.17499125003814697, "logits/rejected": 0.10050486028194427, "logps/chosen": -378.93157958984375, "logps/rejected": -318.13885498046875, "loss": 0.0908, "loss/chosen-sft": 1.4939839839935303, "loss/dpo": 0.09077504277229309, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.5793511271476746, "rewards/margins": 0.26338380575180054, "rewards/rejected": -0.8427349328994751, "step": 450 }, { "epoch": 0.252357182473655, "grad_norm": 1.2995809316635132, "learning_rate": 4.65857844784629e-07, "logits/chosen": -0.1015838161110878, "logits/rejected": 0.014798527583479881, "logps/chosen": -328.8960266113281, "logps/rejected": -330.8209228515625, "loss": 0.0818, "loss/chosen-sft": 1.502540946006775, "loss/dpo": 0.08181241899728775, "rewards/accuracies": 0.71875, "rewards/chosen": -0.55858314037323, "rewards/margins": 0.30958643555641174, "rewards/rejected": -0.8681696057319641, "step": 455 }, { "epoch": 0.25513033832501386, "grad_norm": 1.888208270072937, "learning_rate": 4.6462638874180173e-07, "logits/chosen": -0.0846543088555336, "logits/rejected": -0.07818257808685303, "logps/chosen": -361.853271484375, "logps/rejected": -330.44915771484375, "loss": 0.0959, "loss/chosen-sft": 1.524152398109436, "loss/dpo": 0.0959150493144989, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.630092978477478, "rewards/margins": 0.2400503158569336, "rewards/rejected": -0.8701432943344116, "step": 460 }, { "epoch": 0.2579034941763727, "grad_norm": 1.4479496479034424, "learning_rate": 4.633748039039044e-07, "logits/chosen": -0.17475584149360657, "logits/rejected": -0.1427185982465744, "logps/chosen": -379.28741455078125, "logps/rejected": -354.30877685546875, "loss": 0.0791, "loss/chosen-sft": 1.561486005783081, "loss/dpo": 0.0790865495800972, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5844415426254272, "rewards/margins": 0.31326019763946533, "rewards/rejected": -0.8977017402648926, "step": 465 }, { "epoch": 0.26067665002773155, "grad_norm": 2.0286715030670166, "learning_rate": 4.621032076511662e-07, "logits/chosen": -0.17609013617038727, "logits/rejected": -0.021662216633558273, "logps/chosen": -377.0018005371094, "logps/rejected": -337.9693603515625, "loss": 0.0829, "loss/chosen-sft": 1.5360634326934814, "loss/dpo": 0.08291391283273697, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.652755856513977, "rewards/margins": 0.29427844285964966, "rewards/rejected": -0.9470342397689819, "step": 470 }, { "epoch": 0.2634498058790904, "grad_norm": 1.7011218070983887, "learning_rate": 4.6081171924059245e-07, "logits/chosen": -0.164344921708107, "logits/rejected": 0.05261586979031563, "logps/chosen": -335.7785339355469, "logps/rejected": -326.3609313964844, "loss": 0.0754, "loss/chosen-sft": 1.5235470533370972, "loss/dpo": 0.07544606924057007, "rewards/accuracies": 0.65625, "rewards/chosen": -0.571322500705719, "rewards/margins": 0.26280707120895386, "rewards/rejected": -0.8341296315193176, "step": 475 }, { "epoch": 0.26622296173044924, "grad_norm": 2.2942631244659424, "learning_rate": 4.5950045979478004e-07, "logits/chosen": -0.2594403624534607, "logits/rejected": -0.0034357428085058928, "logps/chosen": -354.83197021484375, "logps/rejected": -359.304443359375, "loss": 0.0882, "loss/chosen-sft": 1.4360209703445435, "loss/dpo": 0.08824630081653595, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.6372642517089844, "rewards/margins": 0.3590429425239563, "rewards/rejected": -0.9963071942329407, "step": 480 }, { "epoch": 0.2689961175818081, "grad_norm": 2.1030640602111816, "learning_rate": 4.5816955229055776e-07, "logits/chosen": -0.3849255442619324, "logits/rejected": -0.1723778247833252, "logps/chosen": -331.4612731933594, "logps/rejected": -328.07037353515625, "loss": 0.0885, "loss/chosen-sft": 1.4904568195343018, "loss/dpo": 0.08851548284292221, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6340426206588745, "rewards/margins": 0.27626457810401917, "rewards/rejected": -0.9103072881698608, "step": 485 }, { "epoch": 0.2717692734331669, "grad_norm": 1.9212195873260498, "learning_rate": 4.56819121547453e-07, "logits/chosen": -0.13772639632225037, "logits/rejected": -0.04014406353235245, "logps/chosen": -368.6828918457031, "logps/rejected": -321.24365234375, "loss": 0.0897, "loss/chosen-sft": 1.4933507442474365, "loss/dpo": 0.08970335870981216, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.7113620638847351, "rewards/margins": 0.1993238627910614, "rewards/rejected": -0.9106858968734741, "step": 490 }, { "epoch": 0.27454242928452577, "grad_norm": 1.695456624031067, "learning_rate": 4.554492942159855e-07, "logits/chosen": -0.42002072930336, "logits/rejected": -0.18559008836746216, "logps/chosen": -350.0926818847656, "logps/rejected": -358.0768127441406, "loss": 0.0807, "loss/chosen-sft": 1.4628541469573975, "loss/dpo": 0.08066307753324509, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6113547086715698, "rewards/margins": 0.36685535311698914, "rewards/rejected": -0.9782101511955261, "step": 495 }, { "epoch": 0.2773155851358846, "grad_norm": 2.2545158863067627, "learning_rate": 4.540601987657893e-07, "logits/chosen": -0.27345049381256104, "logits/rejected": -0.0689864382147789, "logps/chosen": -370.7053527832031, "logps/rejected": -339.49029541015625, "loss": 0.0875, "loss/chosen-sft": 1.5028795003890991, "loss/dpo": 0.08748678863048553, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6775597333908081, "rewards/margins": 0.23481640219688416, "rewards/rejected": -0.9123761057853699, "step": 500 }, { "epoch": 0.28008874098724346, "grad_norm": 1.761813759803772, "learning_rate": 4.5265196547356453e-07, "logits/chosen": -0.4009808599948883, "logits/rejected": -0.2716490626335144, "logps/chosen": -375.43145751953125, "logps/rejected": -348.8159484863281, "loss": 0.0836, "loss/chosen-sft": 1.5295841693878174, "loss/dpo": 0.08356954157352448, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.6790491342544556, "rewards/margins": 0.30093619227409363, "rewards/rejected": -0.979985237121582, "step": 505 }, { "epoch": 0.28286189683860236, "grad_norm": 1.7265639305114746, "learning_rate": 4.5122472641085887e-07, "logits/chosen": -0.2951423227787018, "logits/rejected": -0.06516732275485992, "logps/chosen": -327.62579345703125, "logps/rejected": -322.10516357421875, "loss": 0.0822, "loss/chosen-sft": 1.4988086223602295, "loss/dpo": 0.08221860229969025, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.6168748140335083, "rewards/margins": 0.27371373772621155, "rewards/rejected": -0.8905885815620422, "step": 510 }, { "epoch": 0.2856350526899612, "grad_norm": 2.8561835289001465, "learning_rate": 4.497786154316815e-07, "logits/chosen": -0.3033020496368408, "logits/rejected": -0.1509987711906433, "logps/chosen": -405.4481506347656, "logps/rejected": -363.7564392089844, "loss": 0.0839, "loss/chosen-sft": 1.5500370264053345, "loss/dpo": 0.08386242389678955, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7174540758132935, "rewards/margins": 0.30318009853363037, "rewards/rejected": -1.0206342935562134, "step": 515 }, { "epoch": 0.28840820854132004, "grad_norm": 1.3990095853805542, "learning_rate": 4.483137681599495e-07, "logits/chosen": -0.4407324194908142, "logits/rejected": -0.3505704998970032, "logps/chosen": -393.1653747558594, "logps/rejected": -341.4064636230469, "loss": 0.0787, "loss/chosen-sft": 1.5403501987457275, "loss/dpo": 0.07872845977544785, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6447398066520691, "rewards/margins": 0.4145263731479645, "rewards/rejected": -1.0592660903930664, "step": 520 }, { "epoch": 0.2911813643926789, "grad_norm": 1.8205024003982544, "learning_rate": 4.468303219767683e-07, "logits/chosen": -0.3818379044532776, "logits/rejected": -0.13233591616153717, "logps/chosen": -356.06488037109375, "logps/rejected": -356.4136047363281, "loss": 0.0827, "loss/chosen-sft": 1.497135043144226, "loss/dpo": 0.08271769434213638, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.6697732210159302, "rewards/margins": 0.2621932625770569, "rewards/rejected": -0.9319664835929871, "step": 525 }, { "epoch": 0.29395452024403773, "grad_norm": 1.3097429275512695, "learning_rate": 4.453284160075473e-07, "logits/chosen": -0.4805290699005127, "logits/rejected": -0.3880520164966583, "logps/chosen": -391.4358825683594, "logps/rejected": -344.5740661621094, "loss": 0.0787, "loss/chosen-sft": 1.4888331890106201, "loss/dpo": 0.07873310893774033, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7074838280677795, "rewards/margins": 0.3189099431037903, "rewards/rejected": -1.0263937711715698, "step": 530 }, { "epoch": 0.2967276760953966, "grad_norm": 1.8334152698516846, "learning_rate": 4.438081911089522e-07, "logits/chosen": -0.1407683938741684, "logits/rejected": -0.08939669281244278, "logps/chosen": -368.7952880859375, "logps/rejected": -324.58660888671875, "loss": 0.0787, "loss/chosen-sft": 1.536380410194397, "loss/dpo": 0.07865273952484131, "rewards/accuracies": 0.625, "rewards/chosen": -0.7058097124099731, "rewards/margins": 0.27695003151893616, "rewards/rejected": -0.9827596545219421, "step": 535 }, { "epoch": 0.2995008319467554, "grad_norm": 1.0951838493347168, "learning_rate": 4.422697898556945e-07, "logits/chosen": -0.1452847272157669, "logits/rejected": -0.12602165341377258, "logps/chosen": -388.03533935546875, "logps/rejected": -322.50787353515625, "loss": 0.0711, "loss/chosen-sft": 1.6297862529754639, "loss/dpo": 0.07111659646034241, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6191781759262085, "rewards/margins": 0.31716564297676086, "rewards/rejected": -0.936343789100647, "step": 540 }, { "epoch": 0.30227398779811426, "grad_norm": 1.171819806098938, "learning_rate": 4.4071335652716004e-07, "logits/chosen": -0.28522688150405884, "logits/rejected": 0.05751846358180046, "logps/chosen": -330.20245361328125, "logps/rejected": -353.6244812011719, "loss": 0.0786, "loss/chosen-sft": 1.4777370691299438, "loss/dpo": 0.07859226316213608, "rewards/accuracies": 0.6875, "rewards/chosen": -0.649788498878479, "rewards/margins": 0.34457841515541077, "rewards/rejected": -0.9943668246269226, "step": 545 }, { "epoch": 0.3050471436494731, "grad_norm": 2.0230906009674072, "learning_rate": 4.391390370938777e-07, "logits/chosen": -0.2850199043750763, "logits/rejected": -0.1109178215265274, "logps/chosen": -364.53826904296875, "logps/rejected": -338.34283447265625, "loss": 0.0867, "loss/chosen-sft": 1.5412209033966064, "loss/dpo": 0.08672328293323517, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6998955607414246, "rewards/margins": 0.25897401571273804, "rewards/rejected": -0.9588696360588074, "step": 550 }, { "epoch": 0.30782029950083195, "grad_norm": 1.2038522958755493, "learning_rate": 4.3754697920383006e-07, "logits/chosen": -0.34611284732818604, "logits/rejected": -0.049505867063999176, "logps/chosen": -355.5877990722656, "logps/rejected": -355.5694274902344, "loss": 0.0705, "loss/chosen-sft": 1.5247033834457397, "loss/dpo": 0.0704929307103157, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.6597936749458313, "rewards/margins": 0.4111254811286926, "rewards/rejected": -1.070919156074524, "step": 555 }, { "epoch": 0.3105934553521908, "grad_norm": 1.383131980895996, "learning_rate": 4.359373321686053e-07, "logits/chosen": -0.47573018074035645, "logits/rejected": -0.38598376512527466, "logps/chosen": -388.10736083984375, "logps/rejected": -381.851318359375, "loss": 0.0792, "loss/chosen-sft": 1.4690505266189575, "loss/dpo": 0.07924602925777435, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6938058137893677, "rewards/margins": 0.39269718527793884, "rewards/rejected": -1.086503028869629, "step": 560 }, { "epoch": 0.31336661120354964, "grad_norm": 1.9160140752792358, "learning_rate": 4.343102469493947e-07, "logits/chosen": -0.3587764501571655, "logits/rejected": -0.2721293866634369, "logps/chosen": -403.26788330078125, "logps/rejected": -358.91046142578125, "loss": 0.068, "loss/chosen-sft": 1.6094681024551392, "loss/dpo": 0.06795226037502289, "rewards/accuracies": 0.625, "rewards/chosen": -0.7775420546531677, "rewards/margins": 0.31379085779190063, "rewards/rejected": -1.091333031654358, "step": 565 }, { "epoch": 0.3161397670549085, "grad_norm": 1.0593386888504028, "learning_rate": 4.326658761428342e-07, "logits/chosen": -0.4315316677093506, "logits/rejected": -0.2712632417678833, "logps/chosen": -381.10260009765625, "logps/rejected": -344.9892883300781, "loss": 0.0726, "loss/chosen-sft": 1.552372694015503, "loss/dpo": 0.07259351015090942, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.800487220287323, "rewards/margins": 0.31337645649909973, "rewards/rejected": -1.1138637065887451, "step": 570 }, { "epoch": 0.3189129229062673, "grad_norm": 1.4632201194763184, "learning_rate": 4.310043739666937e-07, "logits/chosen": -0.49550876021385193, "logits/rejected": -0.3280448317527771, "logps/chosen": -401.83856201171875, "logps/rejected": -399.66729736328125, "loss": 0.0516, "loss/chosen-sft": 1.647662878036499, "loss/dpo": 0.05160484462976456, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.8304476737976074, "rewards/margins": 0.4837685227394104, "rewards/rejected": -1.314216136932373, "step": 575 }, { "epoch": 0.32168607875762617, "grad_norm": 1.8497745990753174, "learning_rate": 4.2932589624541296e-07, "logits/chosen": -0.3687261641025543, "logits/rejected": -0.27867016196250916, "logps/chosen": -376.83660888671875, "logps/rejected": -353.30670166015625, "loss": 0.0618, "loss/chosen-sft": 1.6445457935333252, "loss/dpo": 0.06179194524884224, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.8887649774551392, "rewards/margins": 0.33651480078697205, "rewards/rejected": -1.2252798080444336, "step": 580 }, { "epoch": 0.324459234608985, "grad_norm": 1.1594619750976562, "learning_rate": 4.276306003954881e-07, "logits/chosen": -0.46756500005722046, "logits/rejected": -0.4428383708000183, "logps/chosen": -405.39593505859375, "logps/rejected": -375.8490905761719, "loss": 0.0648, "loss/chosen-sft": 1.6012241840362549, "loss/dpo": 0.06479503959417343, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0120012760162354, "rewards/margins": 0.2799423336982727, "rewards/rejected": -1.2919435501098633, "step": 585 }, { "epoch": 0.32723239046034386, "grad_norm": 1.4882887601852417, "learning_rate": 4.25918645410708e-07, "logits/chosen": -0.5149391889572144, "logits/rejected": -0.43163880705833435, "logps/chosen": -395.2956237792969, "logps/rejected": -388.7841491699219, "loss": 0.0565, "loss/chosen-sft": 1.619074821472168, "loss/dpo": 0.056492336094379425, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.8736220598220825, "rewards/margins": 0.49877220392227173, "rewards/rejected": -1.372394323348999, "step": 590 }, { "epoch": 0.3300055463117027, "grad_norm": 1.6804783344268799, "learning_rate": 4.2419019184724316e-07, "logits/chosen": -0.4885106086730957, "logits/rejected": -0.2503131926059723, "logps/chosen": -417.79327392578125, "logps/rejected": -404.0755920410156, "loss": 0.0634, "loss/chosen-sft": 1.5858757495880127, "loss/dpo": 0.06336269527673721, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.9350301623344421, "rewards/margins": 0.4474121630191803, "rewards/rejected": -1.3824422359466553, "step": 595 }, { "epoch": 0.33277870216306155, "grad_norm": 1.5922985076904297, "learning_rate": 4.224454018085878e-07, "logits/chosen": -0.3938080966472626, "logits/rejected": -0.20645618438720703, "logps/chosen": -417.41644287109375, "logps/rejected": -377.573486328125, "loss": 0.0521, "loss/chosen-sft": 1.6786199808120728, "loss/dpo": 0.05210758373141289, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.9713231325149536, "rewards/margins": 0.3552365303039551, "rewards/rejected": -1.3265597820281982, "step": 600 }, { "epoch": 0.3355518580144204, "grad_norm": 1.135326862335205, "learning_rate": 4.206844389303569e-07, "logits/chosen": -0.5038084387779236, "logits/rejected": -0.1152668371796608, "logps/chosen": -368.07244873046875, "logps/rejected": -371.786865234375, "loss": 0.059, "loss/chosen-sft": 1.6389780044555664, "loss/dpo": 0.058953166007995605, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.9387050867080688, "rewards/margins": 0.34334003925323486, "rewards/rejected": -1.2820451259613037, "step": 605 }, { "epoch": 0.33832501386577923, "grad_norm": 1.6886770725250244, "learning_rate": 4.1890746836493987e-07, "logits/chosen": -0.5036560893058777, "logits/rejected": -0.3065970242023468, "logps/chosen": -398.92559814453125, "logps/rejected": -394.6751403808594, "loss": 0.063, "loss/chosen-sft": 1.580437421798706, "loss/dpo": 0.0630171000957489, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.9837394952774048, "rewards/margins": 0.3374081552028656, "rewards/rejected": -1.3211476802825928, "step": 610 }, { "epoch": 0.3410981697171381, "grad_norm": 1.2646410465240479, "learning_rate": 4.171146567660112e-07, "logits/chosen": -0.34558913111686707, "logits/rejected": -0.22349652647972107, "logps/chosen": -397.49908447265625, "logps/rejected": -370.159912109375, "loss": 0.0511, "loss/chosen-sft": 1.6435205936431885, "loss/dpo": 0.05106610804796219, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.8979086875915527, "rewards/margins": 0.5196878910064697, "rewards/rejected": -1.4175965785980225, "step": 615 }, { "epoch": 0.343871325568497, "grad_norm": 1.721474528312683, "learning_rate": 4.153061722729013e-07, "logits/chosen": -0.4159115254878998, "logits/rejected": -0.3842083811759949, "logps/chosen": -425.87567138671875, "logps/rejected": -361.23883056640625, "loss": 0.0537, "loss/chosen-sft": 1.750331163406372, "loss/dpo": 0.05366608500480652, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.9684473276138306, "rewards/margins": 0.3472323715686798, "rewards/rejected": -1.3156797885894775, "step": 620 }, { "epoch": 0.3466444814198558, "grad_norm": 0.8142698407173157, "learning_rate": 4.1348218449482723e-07, "logits/chosen": -0.5395044088363647, "logits/rejected": -0.4873916208744049, "logps/chosen": -400.04217529296875, "logps/rejected": -377.87615966796875, "loss": 0.052, "loss/chosen-sft": 1.6478407382965088, "loss/dpo": 0.05204028636217117, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.9235207438468933, "rewards/margins": 0.4557490348815918, "rewards/rejected": -1.3792698383331299, "step": 625 }, { "epoch": 0.34941763727121466, "grad_norm": 1.6725468635559082, "learning_rate": 4.1164286449498584e-07, "logits/chosen": -0.5160374641418457, "logits/rejected": -0.267566978931427, "logps/chosen": -390.87774658203125, "logps/rejected": -379.4105224609375, "loss": 0.0671, "loss/chosen-sft": 1.6147594451904297, "loss/dpo": 0.06710793823003769, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.0025107860565186, "rewards/margins": 0.2850556969642639, "rewards/rejected": -1.2875664234161377, "step": 630 }, { "epoch": 0.3521907931225735, "grad_norm": 1.5668613910675049, "learning_rate": 4.0978838477451065e-07, "logits/chosen": -0.36808034777641296, "logits/rejected": -0.3304065763950348, "logps/chosen": -414.526123046875, "logps/rejected": -388.3052062988281, "loss": 0.0505, "loss/chosen-sft": 1.8385422229766846, "loss/dpo": 0.05049455910921097, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.9938969612121582, "rewards/margins": 0.41056522727012634, "rewards/rejected": -1.404462218284607, "step": 635 }, { "epoch": 0.35496394897393235, "grad_norm": 1.7163115739822388, "learning_rate": 4.079189192562938e-07, "logits/chosen": -0.597520649433136, "logits/rejected": -0.5070825815200806, "logps/chosen": -420.65447998046875, "logps/rejected": -367.829345703125, "loss": 0.0641, "loss/chosen-sft": 1.6417961120605469, "loss/dpo": 0.0640825405716896, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.9867199659347534, "rewards/margins": 0.3000028729438782, "rewards/rejected": -1.2867228984832764, "step": 640 }, { "epoch": 0.3577371048252912, "grad_norm": 1.513505220413208, "learning_rate": 4.0603464326867456e-07, "logits/chosen": -0.3763376474380493, "logits/rejected": -0.37528449296951294, "logps/chosen": -425.38043212890625, "logps/rejected": -415.4266662597656, "loss": 0.0531, "loss/chosen-sft": 1.692238211631775, "loss/dpo": 0.053089797496795654, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.8854948878288269, "rewards/margins": 0.5308740735054016, "rewards/rejected": -1.416368842124939, "step": 645 }, { "epoch": 0.36051026067665004, "grad_norm": 2.2651264667510986, "learning_rate": 4.041357335289962e-07, "logits/chosen": -0.6403440833091736, "logits/rejected": -0.5450460314750671, "logps/chosen": -418.7191467285156, "logps/rejected": -380.0513610839844, "loss": 0.0605, "loss/chosen-sft": 1.6550674438476562, "loss/dpo": 0.060495972633361816, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.0474960803985596, "rewards/margins": 0.3858080506324768, "rewards/rejected": -1.4333041906356812, "step": 650 }, { "epoch": 0.3632834165280089, "grad_norm": 1.6215060949325562, "learning_rate": 4.0222236812703247e-07, "logits/chosen": -0.43776410818099976, "logits/rejected": -0.33489981293678284, "logps/chosen": -383.8864440917969, "logps/rejected": -378.69873046875, "loss": 0.0604, "loss/chosen-sft": 1.619616150856018, "loss/dpo": 0.060428131371736526, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8916348218917847, "rewards/margins": 0.45284873247146606, "rewards/rejected": -1.3444836139678955, "step": 655 }, { "epoch": 0.36605657237936773, "grad_norm": 1.3622971773147583, "learning_rate": 4.002947265082854e-07, "logits/chosen": -0.5182867646217346, "logits/rejected": -0.41873541474342346, "logps/chosen": -382.6905212402344, "logps/rejected": -354.53240966796875, "loss": 0.0451, "loss/chosen-sft": 1.6769845485687256, "loss/dpo": 0.04514864459633827, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.910293698310852, "rewards/margins": 0.40142306685447693, "rewards/rejected": -1.3117166757583618, "step": 660 }, { "epoch": 0.3688297282307266, "grad_norm": 1.7427003383636475, "learning_rate": 3.983529894571558e-07, "logits/chosen": -0.5258729457855225, "logits/rejected": -0.35827913880348206, "logps/chosen": -410.27655029296875, "logps/rejected": -398.0840759277344, "loss": 0.0515, "loss/chosen-sft": 1.6551071405410767, "loss/dpo": 0.051458604633808136, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.9875022172927856, "rewards/margins": 0.4931188225746155, "rewards/rejected": -1.4806209802627563, "step": 665 }, { "epoch": 0.3716028840820854, "grad_norm": 1.5637753009796143, "learning_rate": 3.963973390799887e-07, "logits/chosen": -0.3731691241264343, "logits/rejected": -0.3140907287597656, "logps/chosen": -387.2796936035156, "logps/rejected": -389.09967041015625, "loss": 0.0573, "loss/chosen-sft": 1.664841890335083, "loss/dpo": 0.057298194617033005, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.9336613416671753, "rewards/margins": 0.4270879626274109, "rewards/rejected": -1.360749363899231, "step": 670 }, { "epoch": 0.37437603993344426, "grad_norm": 1.713516116142273, "learning_rate": 3.944279587879942e-07, "logits/chosen": -0.6181343197822571, "logits/rejected": -0.4861913323402405, "logps/chosen": -399.0451354980469, "logps/rejected": -375.4527282714844, "loss": 0.0681, "loss/chosen-sft": 1.6289466619491577, "loss/dpo": 0.06814940273761749, "rewards/accuracies": 0.625, "rewards/chosen": -1.020996332168579, "rewards/margins": 0.3115329444408417, "rewards/rejected": -1.3325293064117432, "step": 675 }, { "epoch": 0.3771491957848031, "grad_norm": 1.3407127857208252, "learning_rate": 3.9244503328004606e-07, "logits/chosen": -0.7082573175430298, "logits/rejected": -0.5655766725540161, "logps/chosen": -412.4944763183594, "logps/rejected": -384.94476318359375, "loss": 0.0535, "loss/chosen-sft": 1.6391160488128662, "loss/dpo": 0.05352095887064934, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.062304139137268, "rewards/margins": 0.3755071461200714, "rewards/rejected": -1.4378111362457275, "step": 680 }, { "epoch": 0.37992235163616195, "grad_norm": 0.8834872841835022, "learning_rate": 3.9044874852536013e-07, "logits/chosen": -0.5957868099212646, "logits/rejected": -0.3501487076282501, "logps/chosen": -390.2853088378906, "logps/rejected": -378.35296630859375, "loss": 0.0609, "loss/chosen-sft": 1.706976294517517, "loss/dpo": 0.06085295230150223, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.0856157541275024, "rewards/margins": 0.3497942090034485, "rewards/rejected": -1.4354099035263062, "step": 685 }, { "epoch": 0.3826955074875208, "grad_norm": 1.8584004640579224, "learning_rate": 3.8843929174605283e-07, "logits/chosen": -0.5268598794937134, "logits/rejected": -0.5594683885574341, "logps/chosen": -399.52593994140625, "logps/rejected": -352.721923828125, "loss": 0.0588, "loss/chosen-sft": 1.7252442836761475, "loss/dpo": 0.058761436492204666, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9334037899971008, "rewards/margins": 0.39858493208885193, "rewards/rejected": -1.3319886922836304, "step": 690 }, { "epoch": 0.38546866333887964, "grad_norm": 1.2674587965011597, "learning_rate": 3.8641685139958234e-07, "logits/chosen": -0.5243244171142578, "logits/rejected": -0.3986015319824219, "logps/chosen": -428.97662353515625, "logps/rejected": -402.95074462890625, "loss": 0.0568, "loss/chosen-sft": 1.6090924739837646, "loss/dpo": 0.056836675852537155, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9936995506286621, "rewards/margins": 0.45986804366111755, "rewards/rejected": -1.4535675048828125, "step": 695 }, { "epoch": 0.3882418191902385, "grad_norm": 1.1967881917953491, "learning_rate": 3.8438161716107453e-07, "logits/chosen": -0.35420164465904236, "logits/rejected": -0.16235880553722382, "logps/chosen": -388.86260986328125, "logps/rejected": -373.76336669921875, "loss": 0.0531, "loss/chosen-sft": 1.7332239151000977, "loss/dpo": 0.05314627289772034, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.9368470907211304, "rewards/margins": 0.39190053939819336, "rewards/rejected": -1.3287477493286133, "step": 700 }, { "epoch": 0.3910149750415973, "grad_norm": 1.0293500423431396, "learning_rate": 3.8233377990553376e-07, "logits/chosen": -0.5038026571273804, "logits/rejected": -0.45793700218200684, "logps/chosen": -392.74078369140625, "logps/rejected": -383.5316467285156, "loss": 0.0469, "loss/chosen-sft": 1.6530288457870483, "loss/dpo": 0.04691758006811142, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9149090051651001, "rewards/margins": 0.5396536588668823, "rewards/rejected": -1.4545625448226929, "step": 705 }, { "epoch": 0.39378813089295617, "grad_norm": 1.4593671560287476, "learning_rate": 3.80273531689942e-07, "logits/chosen": -0.5035001039505005, "logits/rejected": -0.39359813928604126, "logps/chosen": -410.6475524902344, "logps/rejected": -410.08355712890625, "loss": 0.0516, "loss/chosen-sft": 1.5783100128173828, "loss/dpo": 0.05163818597793579, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.9400428533554077, "rewards/margins": 0.5864478945732117, "rewards/rejected": -1.5264908075332642, "step": 710 }, { "epoch": 0.396561286744315, "grad_norm": 1.9627400636672974, "learning_rate": 3.7820106573524645e-07, "logits/chosen": -0.3761715888977051, "logits/rejected": -0.33153462409973145, "logps/chosen": -387.8844909667969, "logps/rejected": -367.91168212890625, "loss": 0.0642, "loss/chosen-sft": 1.6243873834609985, "loss/dpo": 0.0642186850309372, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0104358196258545, "rewards/margins": 0.39246731996536255, "rewards/rejected": -1.4029031991958618, "step": 715 }, { "epoch": 0.39933444259567386, "grad_norm": 1.1319611072540283, "learning_rate": 3.7611657640823825e-07, "logits/chosen": -0.5168325901031494, "logits/rejected": -0.41257476806640625, "logps/chosen": -418.1272888183594, "logps/rejected": -379.2960510253906, "loss": 0.0503, "loss/chosen-sft": 1.699724793434143, "loss/dpo": 0.050268955528736115, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.030914545059204, "rewards/margins": 0.43790435791015625, "rewards/rejected": -1.46881902217865, "step": 720 }, { "epoch": 0.4021075984470327, "grad_norm": 1.7412174940109253, "learning_rate": 3.74020259203324e-07, "logits/chosen": -0.6365585327148438, "logits/rejected": -0.4046885371208191, "logps/chosen": -392.642822265625, "logps/rejected": -416.04986572265625, "loss": 0.0541, "loss/chosen-sft": 1.6220734119415283, "loss/dpo": 0.05405275151133537, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0092906951904297, "rewards/margins": 0.5087220072746277, "rewards/rejected": -1.5180127620697021, "step": 725 }, { "epoch": 0.40488075429839154, "grad_norm": 1.3539364337921143, "learning_rate": 3.7191231072419095e-07, "logits/chosen": -0.41355252265930176, "logits/rejected": -0.23317034542560577, "logps/chosen": -427.7068786621094, "logps/rejected": -423.05108642578125, "loss": 0.045, "loss/chosen-sft": 1.7072865962982178, "loss/dpo": 0.04499746486544609, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0050982236862183, "rewards/margins": 0.5913174748420715, "rewards/rejected": -1.5964157581329346, "step": 730 }, { "epoch": 0.40765391014975044, "grad_norm": 2.4430134296417236, "learning_rate": 3.6979292866536864e-07, "logits/chosen": -0.5284135937690735, "logits/rejected": -0.39231494069099426, "logps/chosen": -444.33062744140625, "logps/rejected": -440.982177734375, "loss": 0.0645, "loss/chosen-sft": 1.6972286701202393, "loss/dpo": 0.06453467905521393, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.0674129724502563, "rewards/margins": 0.47073474526405334, "rewards/rejected": -1.5381478071212769, "step": 735 }, { "epoch": 0.4104270660011093, "grad_norm": 1.5764257907867432, "learning_rate": 3.6766231179368815e-07, "logits/chosen": -0.6887901425361633, "logits/rejected": -0.5167983174324036, "logps/chosen": -443.1559143066406, "logps/rejected": -440.83489990234375, "loss": 0.0519, "loss/chosen-sft": 1.635066270828247, "loss/dpo": 0.05188404396176338, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.104542851448059, "rewards/margins": 0.5738195180892944, "rewards/rejected": -1.678362250328064, "step": 740 }, { "epoch": 0.41320022185246813, "grad_norm": 1.0604594945907593, "learning_rate": 3.6552065992964043e-07, "logits/chosen": -0.6154565215110779, "logits/rejected": -0.3693044185638428, "logps/chosen": -406.3134460449219, "logps/rejected": -377.28228759765625, "loss": 0.0516, "loss/chosen-sft": 1.6899700164794922, "loss/dpo": 0.05156542733311653, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.0907108783721924, "rewards/margins": 0.35733407735824585, "rewards/rejected": -1.448045015335083, "step": 745 }, { "epoch": 0.415973377703827, "grad_norm": 1.4795838594436646, "learning_rate": 3.6336817392863625e-07, "logits/chosen": -0.4953575134277344, "logits/rejected": -0.3335438668727875, "logps/chosen": -399.979736328125, "logps/rejected": -392.7442321777344, "loss": 0.0494, "loss/chosen-sft": 1.6881647109985352, "loss/dpo": 0.04941638186573982, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.1530383825302124, "rewards/margins": 0.4033992886543274, "rewards/rejected": -1.5564377307891846, "step": 750 }, { "epoch": 0.4187465335551858, "grad_norm": 1.2639776468276978, "learning_rate": 3.6120505566216906e-07, "logits/chosen": -0.669465184211731, "logits/rejected": -0.5370808839797974, "logps/chosen": -412.94366455078125, "logps/rejected": -406.2000427246094, "loss": 0.0412, "loss/chosen-sft": 1.7509901523590088, "loss/dpo": 0.04121355339884758, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.085228681564331, "rewards/margins": 0.5851390361785889, "rewards/rejected": -1.6703678369522095, "step": 755 }, { "epoch": 0.42151968940654466, "grad_norm": 1.776004672050476, "learning_rate": 3.5903150799888215e-07, "logits/chosen": -0.562536358833313, "logits/rejected": -0.49020713567733765, "logps/chosen": -412.8077087402344, "logps/rejected": -399.482421875, "loss": 0.0529, "loss/chosen-sft": 1.6743816137313843, "loss/dpo": 0.052908383309841156, "rewards/accuracies": 0.65625, "rewards/chosen": -1.026395320892334, "rewards/margins": 0.47851258516311646, "rewards/rejected": -1.5049078464508057, "step": 760 }, { "epoch": 0.4242928452579035, "grad_norm": 1.3225823640823364, "learning_rate": 3.5684773478554255e-07, "logits/chosen": -0.5152966976165771, "logits/rejected": -0.5033225417137146, "logps/chosen": -405.97271728515625, "logps/rejected": -377.90875244140625, "loss": 0.0571, "loss/chosen-sft": 1.7368265390396118, "loss/dpo": 0.05706434324383736, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0174944400787354, "rewards/margins": 0.47621211409568787, "rewards/rejected": -1.493706464767456, "step": 765 }, { "epoch": 0.42706600110926235, "grad_norm": 1.3815367221832275, "learning_rate": 3.546539408279235e-07, "logits/chosen": -0.49167943000793457, "logits/rejected": -0.2837482988834381, "logps/chosen": -405.2371520996094, "logps/rejected": -391.68670654296875, "loss": 0.0601, "loss/chosen-sft": 1.7111915349960327, "loss/dpo": 0.060050565749406815, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.0395854711532593, "rewards/margins": 0.37245672941207886, "rewards/rejected": -1.412042260169983, "step": 770 }, { "epoch": 0.4298391569606212, "grad_norm": 1.1186020374298096, "learning_rate": 3.5245033187159647e-07, "logits/chosen": -0.5566674470901489, "logits/rejected": -0.4645940661430359, "logps/chosen": -405.1397399902344, "logps/rejected": -374.271728515625, "loss": 0.0514, "loss/chosen-sft": 1.6212804317474365, "loss/dpo": 0.05135621875524521, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9940582513809204, "rewards/margins": 0.45282116532325745, "rewards/rejected": -1.4468793869018555, "step": 775 }, { "epoch": 0.43261231281198004, "grad_norm": 1.5393182039260864, "learning_rate": 3.502371145826352e-07, "logits/chosen": -0.5995725989341736, "logits/rejected": -0.5144038200378418, "logps/chosen": -400.0671081542969, "logps/rejected": -385.246337890625, "loss": 0.0464, "loss/chosen-sft": 1.6027672290802002, "loss/dpo": 0.04639893397688866, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9513114094734192, "rewards/margins": 0.5679217576980591, "rewards/rejected": -1.5192331075668335, "step": 780 }, { "epoch": 0.4353854686633389, "grad_norm": 1.7037338018417358, "learning_rate": 3.4801449652823374e-07, "logits/chosen": -0.8535438776016235, "logits/rejected": -0.6764947175979614, "logps/chosen": -405.5221252441406, "logps/rejected": -374.32440185546875, "loss": 0.0488, "loss/chosen-sft": 1.583449363708496, "loss/dpo": 0.048822566866874695, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.958821177482605, "rewards/margins": 0.5157214999198914, "rewards/rejected": -1.4745426177978516, "step": 785 }, { "epoch": 0.4381586245146977, "grad_norm": 1.703331708908081, "learning_rate": 3.4578268615723924e-07, "logits/chosen": -0.561161458492279, "logits/rejected": -0.5696666836738586, "logps/chosen": -420.22381591796875, "logps/rejected": -396.88299560546875, "loss": 0.0437, "loss/chosen-sft": 1.7269833087921143, "loss/dpo": 0.04372150078415871, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0612952709197998, "rewards/margins": 0.5599007606506348, "rewards/rejected": -1.6211960315704346, "step": 790 }, { "epoch": 0.44093178036605657, "grad_norm": 1.3786600828170776, "learning_rate": 3.4354189278060317e-07, "logits/chosen": -0.7774502635002136, "logits/rejected": -0.6075506806373596, "logps/chosen": -412.4271545410156, "logps/rejected": -393.829345703125, "loss": 0.0453, "loss/chosen-sft": 1.675945520401001, "loss/dpo": 0.04528175666928291, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1789729595184326, "rewards/margins": 0.48232507705688477, "rewards/rejected": -1.6612980365753174, "step": 795 }, { "epoch": 0.4437049362174154, "grad_norm": 1.5878552198410034, "learning_rate": 3.412923265517503e-07, "logits/chosen": -0.7400108575820923, "logits/rejected": -0.7512958645820618, "logps/chosen": -412.6996154785156, "logps/rejected": -379.6897277832031, "loss": 0.0488, "loss/chosen-sft": 1.7094510793685913, "loss/dpo": 0.048782043159008026, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1738940477371216, "rewards/margins": 0.429801881313324, "rewards/rejected": -1.6036958694458008, "step": 800 }, { "epoch": 0.44647809206877426, "grad_norm": 1.7441812753677368, "learning_rate": 3.390341984468699e-07, "logits/chosen": -0.7113388776779175, "logits/rejected": -0.6850872039794922, "logps/chosen": -428.8340759277344, "logps/rejected": -400.9315490722656, "loss": 0.0473, "loss/chosen-sft": 1.7859251499176025, "loss/dpo": 0.047270990908145905, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1826239824295044, "rewards/margins": 0.4750004708766937, "rewards/rejected": -1.6576244831085205, "step": 805 }, { "epoch": 0.4492512479201331, "grad_norm": 1.530585765838623, "learning_rate": 3.367677202451292e-07, "logits/chosen": -0.6908737421035767, "logits/rejected": -0.5597144365310669, "logps/chosen": -455.83758544921875, "logps/rejected": -452.01385498046875, "loss": 0.0584, "loss/chosen-sft": 1.6678813695907593, "loss/dpo": 0.05841095373034477, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1806871891021729, "rewards/margins": 0.6453709602355957, "rewards/rejected": -1.8260581493377686, "step": 810 }, { "epoch": 0.45202440377149194, "grad_norm": 0.9433002471923828, "learning_rate": 3.3449310450881164e-07, "logits/chosen": -0.7155448198318481, "logits/rejected": -0.6245291829109192, "logps/chosen": -421.4974060058594, "logps/rejected": -427.9629821777344, "loss": 0.0364, "loss/chosen-sft": 1.7820746898651123, "loss/dpo": 0.03638064116239548, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.212754249572754, "rewards/margins": 0.6936885714530945, "rewards/rejected": -1.9064428806304932, "step": 815 }, { "epoch": 0.4547975596228508, "grad_norm": 1.6761059761047363, "learning_rate": 3.322105645633813e-07, "logits/chosen": -0.6909006834030151, "logits/rejected": -0.5668486952781677, "logps/chosen": -440.548095703125, "logps/rejected": -421.02392578125, "loss": 0.0613, "loss/chosen-sft": 1.691504716873169, "loss/dpo": 0.06133972853422165, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.250173807144165, "rewards/margins": 0.3592910170555115, "rewards/rejected": -1.6094646453857422, "step": 820 }, { "epoch": 0.45757071547420963, "grad_norm": 1.7704241275787354, "learning_rate": 3.299203144774767e-07, "logits/chosen": -0.717354953289032, "logits/rejected": -0.6114916801452637, "logps/chosen": -436.1394958496094, "logps/rejected": -417.92218017578125, "loss": 0.0443, "loss/chosen-sft": 1.7568438053131104, "loss/dpo": 0.044312089681625366, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.234349250793457, "rewards/margins": 0.4998023509979248, "rewards/rejected": -1.7341516017913818, "step": 825 }, { "epoch": 0.4603438713255685, "grad_norm": 1.0728057622909546, "learning_rate": 3.276225690428338e-07, "logits/chosen": -0.7584112882614136, "logits/rejected": -0.7765355110168457, "logps/chosen": -411.444580078125, "logps/rejected": -426.4891052246094, "loss": 0.0411, "loss/chosen-sft": 1.7121307849884033, "loss/dpo": 0.04111065715551376, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.1426109075546265, "rewards/margins": 0.6041598916053772, "rewards/rejected": -1.7467708587646484, "step": 830 }, { "epoch": 0.4631170271769273, "grad_norm": 1.3771647214889526, "learning_rate": 3.2531754375414206e-07, "logits/chosen": -0.7121502161026001, "logits/rejected": -0.732271671295166, "logps/chosen": -458.24688720703125, "logps/rejected": -436.6095275878906, "loss": 0.0516, "loss/chosen-sft": 1.7345997095108032, "loss/dpo": 0.051555633544921875, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2151943445205688, "rewards/margins": 0.511760413646698, "rewards/rejected": -1.7269548177719116, "step": 835 }, { "epoch": 0.46589018302828616, "grad_norm": 1.6047500371932983, "learning_rate": 3.230054547888339e-07, "logits/chosen": -0.763964831829071, "logits/rejected": -0.7086871862411499, "logps/chosen": -406.05316162109375, "logps/rejected": -391.0845642089844, "loss": 0.0415, "loss/chosen-sft": 1.747057318687439, "loss/dpo": 0.04153294861316681, "rewards/accuracies": 0.65625, "rewards/chosen": -1.1723990440368652, "rewards/margins": 0.46168699860572815, "rewards/rejected": -1.634086012840271, "step": 840 }, { "epoch": 0.468663338879645, "grad_norm": 1.445021152496338, "learning_rate": 3.2068651898681076e-07, "logits/chosen": -0.742651104927063, "logits/rejected": -0.7188843488693237, "logps/chosen": -458.99652099609375, "logps/rejected": -454.03533935546875, "loss": 0.0491, "loss/chosen-sft": 1.6705642938613892, "loss/dpo": 0.0490889772772789, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.282627820968628, "rewards/margins": 0.5290722846984863, "rewards/rejected": -1.8117001056671143, "step": 845 }, { "epoch": 0.4714364947310039, "grad_norm": 2.1770212650299072, "learning_rate": 3.183609538301065e-07, "logits/chosen": -0.8837400674819946, "logits/rejected": -0.7734067440032959, "logps/chosen": -432.4523010253906, "logps/rejected": -402.35064697265625, "loss": 0.041, "loss/chosen-sft": 1.7752116918563843, "loss/dpo": 0.040977347642183304, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2817370891571045, "rewards/margins": 0.4654463827610016, "rewards/rejected": -1.7471835613250732, "step": 850 }, { "epoch": 0.47420965058236275, "grad_norm": 1.3028136491775513, "learning_rate": 3.1602897742249077e-07, "logits/chosen": -0.9581116437911987, "logits/rejected": -0.7644520998001099, "logps/chosen": -424.92352294921875, "logps/rejected": -427.76806640625, "loss": 0.0447, "loss/chosen-sft": 1.6508781909942627, "loss/dpo": 0.04467002674937248, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2735633850097656, "rewards/margins": 0.5769675970077515, "rewards/rejected": -1.850530982017517, "step": 855 }, { "epoch": 0.4769828064337216, "grad_norm": 1.5738519430160522, "learning_rate": 3.136908084690142e-07, "logits/chosen": -0.7455793619155884, "logits/rejected": -0.6809624433517456, "logps/chosen": -430.9115295410156, "logps/rejected": -438.87164306640625, "loss": 0.0372, "loss/chosen-sft": 1.8043369054794312, "loss/dpo": 0.037248264998197556, "rewards/accuracies": 0.71875, "rewards/chosen": -1.165959119796753, "rewards/margins": 0.6415104866027832, "rewards/rejected": -1.8074697256088257, "step": 860 }, { "epoch": 0.47975596228508044, "grad_norm": 0.9466845989227295, "learning_rate": 3.113466662554971e-07, "logits/chosen": -0.8632648587226868, "logits/rejected": -0.7621570825576782, "logps/chosen": -416.5755310058594, "logps/rejected": -404.36383056640625, "loss": 0.0405, "loss/chosen-sft": 1.7127773761749268, "loss/dpo": 0.04047512635588646, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1657226085662842, "rewards/margins": 0.5985251665115356, "rewards/rejected": -1.7642476558685303, "step": 865 }, { "epoch": 0.4825291181364393, "grad_norm": 1.4616957902908325, "learning_rate": 3.0899677062796356e-07, "logits/chosen": -0.9384894371032715, "logits/rejected": -0.7627253532409668, "logps/chosen": -454.59613037109375, "logps/rejected": -447.65667724609375, "loss": 0.0443, "loss/chosen-sft": 1.7974565029144287, "loss/dpo": 0.04427001625299454, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.3299897909164429, "rewards/margins": 0.5200726389884949, "rewards/rejected": -1.850062370300293, "step": 870 }, { "epoch": 0.4853022739877981, "grad_norm": 1.1953011751174927, "learning_rate": 3.066413419720231e-07, "logits/chosen": -1.0708543062210083, "logits/rejected": -0.9470682144165039, "logps/chosen": -411.50323486328125, "logps/rejected": -438.43023681640625, "loss": 0.0373, "loss/chosen-sft": 1.778357744216919, "loss/dpo": 0.03726748377084732, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.4992625713348389, "rewards/margins": 0.5397129058837891, "rewards/rejected": -2.038975477218628, "step": 875 }, { "epoch": 0.48807542983915697, "grad_norm": 1.2079335451126099, "learning_rate": 3.042806011922021e-07, "logits/chosen": -0.9137675166130066, "logits/rejected": -0.8694272041320801, "logps/chosen": -465.96954345703125, "logps/rejected": -473.0062561035156, "loss": 0.0423, "loss/chosen-sft": 1.8091964721679688, "loss/dpo": 0.04231221228837967, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.3828151226043701, "rewards/margins": 0.6629751920700073, "rewards/rejected": -2.045790195465088, "step": 880 }, { "epoch": 0.4908485856905158, "grad_norm": 2.0538318157196045, "learning_rate": 3.019147696912256e-07, "logits/chosen": -0.8203264474868774, "logits/rejected": -0.8118319511413574, "logps/chosen": -457.15557861328125, "logps/rejected": -443.97412109375, "loss": 0.0359, "loss/chosen-sft": 1.819947600364685, "loss/dpo": 0.03590567782521248, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.4309660196304321, "rewards/margins": 0.6293816566467285, "rewards/rejected": -2.060347557067871, "step": 885 }, { "epoch": 0.49362174154187466, "grad_norm": 1.3179012537002563, "learning_rate": 2.9954406934925353e-07, "logits/chosen": -1.0511844158172607, "logits/rejected": -0.8907807469367981, "logps/chosen": -441.5658264160156, "logps/rejected": -442.07025146484375, "loss": 0.0387, "loss/chosen-sft": 1.8559125661849976, "loss/dpo": 0.0386546328663826, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.4241468906402588, "rewards/margins": 0.45265036821365356, "rewards/rejected": -1.8767973184585571, "step": 890 }, { "epoch": 0.4963948973932335, "grad_norm": 1.4792488813400269, "learning_rate": 2.9716872250307153e-07, "logits/chosen": -0.9463046789169312, "logits/rejected": -0.8543429374694824, "logps/chosen": -422.39288330078125, "logps/rejected": -453.1312561035156, "loss": 0.0381, "loss/chosen-sft": 1.7985944747924805, "loss/dpo": 0.0381343699991703, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.2584816217422485, "rewards/margins": 0.6484702825546265, "rewards/rejected": -1.906951904296875, "step": 895 }, { "epoch": 0.49916805324459235, "grad_norm": 1.479712724685669, "learning_rate": 2.9478895192523867e-07, "logits/chosen": -0.9726383090019226, "logits/rejected": -0.8202565312385559, "logps/chosen": -414.5870056152344, "logps/rejected": -435.7919006347656, "loss": 0.0542, "loss/chosen-sft": 1.651619553565979, "loss/dpo": 0.05416691303253174, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.2731552124023438, "rewards/margins": 0.5164347290992737, "rewards/rejected": -1.7895901203155518, "step": 900 }, { "epoch": 0.5019412090959512, "grad_norm": 1.1789251565933228, "learning_rate": 2.9240498080319503e-07, "logits/chosen": -0.8077837228775024, "logits/rejected": -0.7600533366203308, "logps/chosen": -395.1336975097656, "logps/rejected": -399.80316162109375, "loss": 0.0351, "loss/chosen-sft": 1.7661956548690796, "loss/dpo": 0.03513758257031441, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.110447883605957, "rewards/margins": 0.6378698348999023, "rewards/rejected": -1.7483177185058594, "step": 905 }, { "epoch": 0.50471436494731, "grad_norm": 1.0820693969726562, "learning_rate": 2.9001703271832987e-07, "logits/chosen": -0.839047908782959, "logits/rejected": -0.7069706916809082, "logps/chosen": -418.8855895996094, "logps/rejected": -439.6836853027344, "loss": 0.0325, "loss/chosen-sft": 1.841740369796753, "loss/dpo": 0.03246615082025528, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.2718727588653564, "rewards/margins": 0.6148180365562439, "rewards/rejected": -1.8866908550262451, "step": 910 }, { "epoch": 0.5074875207986689, "grad_norm": 1.1299471855163574, "learning_rate": 2.8762533162501306e-07, "logits/chosen": -0.8962306976318359, "logits/rejected": -0.7364431619644165, "logps/chosen": -440.30419921875, "logps/rejected": -406.437744140625, "loss": 0.048, "loss/chosen-sft": 1.8074274063110352, "loss/dpo": 0.04800194129347801, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.4012686014175415, "rewards/margins": 0.2889956831932068, "rewards/rejected": -1.690264344215393, "step": 915 }, { "epoch": 0.5102606766500277, "grad_norm": 1.7517938613891602, "learning_rate": 2.852301018295914e-07, "logits/chosen": -1.0834687948226929, "logits/rejected": -0.7870742082595825, "logps/chosen": -392.78662109375, "logps/rejected": -423.45294189453125, "loss": 0.0458, "loss/chosen-sft": 1.6456248760223389, "loss/dpo": 0.045752983540296555, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2369292974472046, "rewards/margins": 0.5108424425125122, "rewards/rejected": -1.7477716207504272, "step": 920 }, { "epoch": 0.5130338325013866, "grad_norm": 1.3080066442489624, "learning_rate": 2.828315679693518e-07, "logits/chosen": -0.7610379457473755, "logits/rejected": -0.6710953712463379, "logps/chosen": -434.26953125, "logps/rejected": -448.9422912597656, "loss": 0.0385, "loss/chosen-sft": 1.773113489151001, "loss/dpo": 0.03848852962255478, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.2653634548187256, "rewards/margins": 0.6251915097236633, "rewards/rejected": -1.8905551433563232, "step": 925 }, { "epoch": 0.5158069883527454, "grad_norm": 1.5302928686141968, "learning_rate": 2.80429954991454e-07, "logits/chosen": -0.8365219831466675, "logits/rejected": -0.787796139717102, "logps/chosen": -418.86383056640625, "logps/rejected": -427.78790283203125, "loss": 0.0488, "loss/chosen-sft": 1.701751470565796, "loss/dpo": 0.048776544630527496, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.2631399631500244, "rewards/margins": 0.5425049066543579, "rewards/rejected": -1.8056447505950928, "step": 930 }, { "epoch": 0.5185801442041043, "grad_norm": 1.1883572340011597, "learning_rate": 2.7802548813183364e-07, "logits/chosen": -0.8014055490493774, "logits/rejected": -0.8269286155700684, "logps/chosen": -445.94378662109375, "logps/rejected": -428.35107421875, "loss": 0.0446, "loss/chosen-sft": 1.728753685951233, "loss/dpo": 0.04457592964172363, "rewards/accuracies": 0.75, "rewards/chosen": -1.1154412031173706, "rewards/margins": 0.6285417675971985, "rewards/rejected": -1.7439830303192139, "step": 935 }, { "epoch": 0.5213533000554631, "grad_norm": 1.7934128046035767, "learning_rate": 2.756183928940784e-07, "logits/chosen": -0.7623527646064758, "logits/rejected": -0.7455809712409973, "logps/chosen": -425.8876037597656, "logps/rejected": -401.5950927734375, "loss": 0.0398, "loss/chosen-sft": 1.7588945627212524, "loss/dpo": 0.03979960083961487, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2177762985229492, "rewards/margins": 0.4867513179779053, "rewards/rejected": -1.704527497291565, "step": 940 }, { "epoch": 0.5241264559068219, "grad_norm": 1.373268723487854, "learning_rate": 2.7320889502827905e-07, "logits/chosen": -0.8963130712509155, "logits/rejected": -0.8474391102790833, "logps/chosen": -429.3999938964844, "logps/rejected": -427.1219177246094, "loss": 0.0446, "loss/chosen-sft": 1.7064225673675537, "loss/dpo": 0.04461617022752762, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.2002841234207153, "rewards/margins": 0.6481701731681824, "rewards/rejected": -1.848454236984253, "step": 945 }, { "epoch": 0.5268996117581808, "grad_norm": 1.2301034927368164, "learning_rate": 2.707972205098576e-07, "logits/chosen": -0.9842405319213867, "logits/rejected": -1.0120937824249268, "logps/chosen": -438.3604431152344, "logps/rejected": -427.14764404296875, "loss": 0.0411, "loss/chosen-sft": 1.813296914100647, "loss/dpo": 0.041075654327869415, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.2374821901321411, "rewards/margins": 0.6461740732192993, "rewards/rejected": -1.8836562633514404, "step": 950 }, { "epoch": 0.5296727676095396, "grad_norm": 1.549978256225586, "learning_rate": 2.68383595518374e-07, "logits/chosen": -0.8700895309448242, "logits/rejected": -0.8361631631851196, "logps/chosen": -442.2728576660156, "logps/rejected": -430.29986572265625, "loss": 0.0374, "loss/chosen-sft": 1.7138087749481201, "loss/dpo": 0.03742986172437668, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.300048589706421, "rewards/margins": 0.5438514947891235, "rewards/rejected": -1.8438999652862549, "step": 955 }, { "epoch": 0.5324459234608985, "grad_norm": 1.7318216562271118, "learning_rate": 2.659682464163138e-07, "logits/chosen": -1.0281853675842285, "logits/rejected": -1.0226585865020752, "logps/chosen": -469.50238037109375, "logps/rejected": -455.428955078125, "loss": 0.0578, "loss/chosen-sft": 1.7266900539398193, "loss/dpo": 0.057835765182971954, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.2760839462280273, "rewards/margins": 0.5891796946525574, "rewards/rejected": -1.8652637004852295, "step": 960 }, { "epoch": 0.5352190793122573, "grad_norm": 1.1333941221237183, "learning_rate": 2.6355139972785885e-07, "logits/chosen": -0.9396898150444031, "logits/rejected": -0.849763035774231, "logps/chosen": -423.746337890625, "logps/rejected": -441.26837158203125, "loss": 0.0365, "loss/chosen-sft": 1.7641470432281494, "loss/dpo": 0.03645756468176842, "rewards/accuracies": 0.71875, "rewards/chosen": -1.1291122436523438, "rewards/margins": 0.7151139378547668, "rewards/rejected": -1.8442262411117554, "step": 965 }, { "epoch": 0.5379922351636162, "grad_norm": 1.0045897960662842, "learning_rate": 2.6113328211764235e-07, "logits/chosen": -1.0510257482528687, "logits/rejected": -1.082379698753357, "logps/chosen": -449.94561767578125, "logps/rejected": -432.62933349609375, "loss": 0.0391, "loss/chosen-sft": 1.7299797534942627, "loss/dpo": 0.039098747074604034, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.1797751188278198, "rewards/margins": 0.6614251136779785, "rewards/rejected": -1.8412002325057983, "step": 970 }, { "epoch": 0.540765391014975, "grad_norm": 1.4864047765731812, "learning_rate": 2.5871412036949153e-07, "logits/chosen": -0.956852912902832, "logits/rejected": -0.8994203805923462, "logps/chosen": -428.5096130371094, "logps/rejected": -419.07330322265625, "loss": 0.0463, "loss/chosen-sft": 1.774070143699646, "loss/dpo": 0.04627335071563721, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2709102630615234, "rewards/margins": 0.5661468505859375, "rewards/rejected": -1.837057113647461, "step": 975 }, { "epoch": 0.5435385468663338, "grad_norm": 1.1833720207214355, "learning_rate": 2.5629414136515825e-07, "logits/chosen": -0.7828256487846375, "logits/rejected": -0.6822995543479919, "logps/chosen": -444.83697509765625, "logps/rejected": -446.899658203125, "loss": 0.0362, "loss/chosen-sft": 1.8193594217300415, "loss/dpo": 0.03620842099189758, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2841075658798218, "rewards/margins": 0.670619547367096, "rewards/rejected": -1.9547271728515625, "step": 980 }, { "epoch": 0.5463117027176927, "grad_norm": 1.4741407632827759, "learning_rate": 2.5387357206304077e-07, "logits/chosen": -0.9925743341445923, "logits/rejected": -0.7799035310745239, "logps/chosen": -441.3125, "logps/rejected": -427.43115234375, "loss": 0.0341, "loss/chosen-sft": 1.7645461559295654, "loss/dpo": 0.034115031361579895, "rewards/accuracies": 0.65625, "rewards/chosen": -1.332214117050171, "rewards/margins": 0.5084002017974854, "rewards/rejected": -1.8406140804290771, "step": 985 }, { "epoch": 0.5490848585690515, "grad_norm": 2.5760786533355713, "learning_rate": 2.514526394768989e-07, "logits/chosen": -0.8460060358047485, "logits/rejected": -0.8547107577323914, "logps/chosen": -451.8817443847656, "logps/rejected": -449.7991638183594, "loss": 0.0474, "loss/chosen-sft": 1.7884471416473389, "loss/dpo": 0.04735780879855156, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.3389358520507812, "rewards/margins": 0.6204281449317932, "rewards/rejected": -1.9593639373779297, "step": 990 }, { "epoch": 0.5518580144204104, "grad_norm": 1.9366549253463745, "learning_rate": 2.490315706545631e-07, "logits/chosen": -0.8773177862167358, "logits/rejected": -0.9315664172172546, "logps/chosen": -423.4974670410156, "logps/rejected": -398.5281066894531, "loss": 0.0402, "loss/chosen-sft": 1.7923309803009033, "loss/dpo": 0.04023148491978645, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.3467055559158325, "rewards/margins": 0.4263473451137543, "rewards/rejected": -1.7730529308319092, "step": 995 }, { "epoch": 0.5546311702717692, "grad_norm": 1.0191140174865723, "learning_rate": 2.466105926566405e-07, "logits/chosen": -0.8884904980659485, "logits/rejected": -0.8517130017280579, "logps/chosen": -412.35595703125, "logps/rejected": -398.6607360839844, "loss": 0.0425, "loss/chosen-sft": 1.7367550134658813, "loss/dpo": 0.0425165630877018, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.207376480102539, "rewards/margins": 0.44255828857421875, "rewards/rejected": -1.6499347686767578, "step": 1000 }, { "epoch": 0.5574043261231281, "grad_norm": 1.5985798835754395, "learning_rate": 2.441899325352205e-07, "logits/chosen": -0.8503344655036926, "logits/rejected": -0.7258619070053101, "logps/chosen": -405.3533630371094, "logps/rejected": -423.4427795410156, "loss": 0.038, "loss/chosen-sft": 1.7873739004135132, "loss/dpo": 0.03796621412038803, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2806193828582764, "rewards/margins": 0.5564595460891724, "rewards/rejected": -1.8370788097381592, "step": 1005 }, { "epoch": 0.5601774819744869, "grad_norm": 2.0438764095306396, "learning_rate": 2.417698173125804e-07, "logits/chosen": -0.8835655450820923, "logits/rejected": -0.8549100160598755, "logps/chosen": -443.4007263183594, "logps/rejected": -412.9955139160156, "loss": 0.0476, "loss/chosen-sft": 1.781747817993164, "loss/dpo": 0.04762732982635498, "rewards/accuracies": 0.625, "rewards/chosen": -1.2374616861343384, "rewards/margins": 0.514583945274353, "rewards/rejected": -1.7520456314086914, "step": 1010 }, { "epoch": 0.5629506378258459, "grad_norm": 2.029576539993286, "learning_rate": 2.393504739598938e-07, "logits/chosen": -0.8470139503479004, "logits/rejected": -0.8256238102912903, "logps/chosen": -434.9107971191406, "logps/rejected": -436.6976623535156, "loss": 0.047, "loss/chosen-sft": 1.7188364267349243, "loss/dpo": 0.04702724516391754, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2836484909057617, "rewards/margins": 0.6734278202056885, "rewards/rejected": -1.9570764303207397, "step": 1015 }, { "epoch": 0.5657237936772047, "grad_norm": 1.5274903774261475, "learning_rate": 2.3693212937594436e-07, "logits/chosen": -0.6356366872787476, "logits/rejected": -0.5753142237663269, "logps/chosen": -411.853271484375, "logps/rejected": -436.2920837402344, "loss": 0.0407, "loss/chosen-sft": 1.8935306072235107, "loss/dpo": 0.040696293115615845, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.258502721786499, "rewards/margins": 0.4864071011543274, "rewards/rejected": -1.7449098825454712, "step": 1020 }, { "epoch": 0.5684969495285636, "grad_norm": 1.1039131879806519, "learning_rate": 2.3451501036584604e-07, "logits/chosen": -0.7690117359161377, "logits/rejected": -0.7704351544380188, "logps/chosen": -429.619140625, "logps/rejected": -413.0948791503906, "loss": 0.0304, "loss/chosen-sft": 1.7215017080307007, "loss/dpo": 0.030385727062821388, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.2474838495254517, "rewards/margins": 0.6526226997375488, "rewards/rejected": -1.900106430053711, "step": 1025 }, { "epoch": 0.5712701053799224, "grad_norm": 2.5751590728759766, "learning_rate": 2.3209934361977194e-07, "logits/chosen": -0.9362251162528992, "logits/rejected": -0.8153011202812195, "logps/chosen": -431.95391845703125, "logps/rejected": -430.9054260253906, "loss": 0.0599, "loss/chosen-sft": 1.7196757793426514, "loss/dpo": 0.05986684560775757, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2645565271377563, "rewards/margins": 0.46636122465133667, "rewards/rejected": -1.7309175729751587, "step": 1030 }, { "epoch": 0.5740432612312812, "grad_norm": 1.936566710472107, "learning_rate": 2.296853556916941e-07, "logits/chosen": -0.6564992070198059, "logits/rejected": -0.532072126865387, "logps/chosen": -406.83099365234375, "logps/rejected": -410.1024475097656, "loss": 0.0359, "loss/chosen-sft": 1.8352649211883545, "loss/dpo": 0.03590982407331467, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2317606210708618, "rewards/margins": 0.5972455739974976, "rewards/rejected": -1.8290061950683594, "step": 1035 }, { "epoch": 0.5768164170826401, "grad_norm": 1.4029645919799805, "learning_rate": 2.2727327297813613e-07, "logits/chosen": -0.9582229852676392, "logits/rejected": -0.9992238283157349, "logps/chosen": -444.89752197265625, "logps/rejected": -424.8453674316406, "loss": 0.0355, "loss/chosen-sft": 1.804992914199829, "loss/dpo": 0.03546611964702606, "rewards/accuracies": 0.71875, "rewards/chosen": -1.2539708614349365, "rewards/margins": 0.5192979574203491, "rewards/rejected": -1.7732689380645752, "step": 1040 }, { "epoch": 0.5795895729339989, "grad_norm": 1.8364492654800415, "learning_rate": 2.2486332169694095e-07, "logits/chosen": -0.7399333715438843, "logits/rejected": -0.7711300253868103, "logps/chosen": -425.58721923828125, "logps/rejected": -407.39349365234375, "loss": 0.0461, "loss/chosen-sft": 1.7470451593399048, "loss/dpo": 0.046109430491924286, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3421345949172974, "rewards/margins": 0.5037655830383301, "rewards/rejected": -1.8459001779556274, "step": 1045 }, { "epoch": 0.5823627287853578, "grad_norm": 1.4886951446533203, "learning_rate": 2.224557278660539e-07, "logits/chosen": -0.7063448429107666, "logits/rejected": -0.7334424257278442, "logps/chosen": -429.91693115234375, "logps/rejected": -450.59893798828125, "loss": 0.0326, "loss/chosen-sft": 1.7843490839004517, "loss/dpo": 0.03259655088186264, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.2502191066741943, "rewards/margins": 0.8139023780822754, "rewards/rejected": -2.064121723175049, "step": 1050 }, { "epoch": 0.5851358846367166, "grad_norm": 1.7785381078720093, "learning_rate": 2.200507172823268e-07, "logits/chosen": -0.822382926940918, "logits/rejected": -0.698542058467865, "logps/chosen": -440.42431640625, "logps/rejected": -438.05169677734375, "loss": 0.0392, "loss/chosen-sft": 1.7739006280899048, "loss/dpo": 0.0392305888235569, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.3763277530670166, "rewards/margins": 0.5221567153930664, "rewards/rejected": -1.8984845876693726, "step": 1055 }, { "epoch": 0.5879090404880755, "grad_norm": 1.6011799573898315, "learning_rate": 2.176485155003405e-07, "logits/chosen": -0.7036235928535461, "logits/rejected": -0.6139532327651978, "logps/chosen": -427.38519287109375, "logps/rejected": -418.2265625, "loss": 0.0372, "loss/chosen-sft": 1.8186620473861694, "loss/dpo": 0.03721202537417412, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.3542938232421875, "rewards/margins": 0.4948766827583313, "rewards/rejected": -1.8491706848144531, "step": 1060 }, { "epoch": 0.5906821963394343, "grad_norm": 1.0207229852676392, "learning_rate": 2.1524934781125164e-07, "logits/chosen": -0.977032482624054, "logits/rejected": -0.9711772799491882, "logps/chosen": -445.39263916015625, "logps/rejected": -431.14007568359375, "loss": 0.0468, "loss/chosen-sft": 1.7722899913787842, "loss/dpo": 0.04676477983593941, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.318066120147705, "rewards/margins": 0.6112589836120605, "rewards/rejected": -1.9293251037597656, "step": 1065 }, { "epoch": 0.5934553521907932, "grad_norm": 1.426311731338501, "learning_rate": 2.1285343922166393e-07, "logits/chosen": -0.8198713064193726, "logits/rejected": -0.7067451477050781, "logps/chosen": -472.1852111816406, "logps/rejected": -444.33935546875, "loss": 0.0446, "loss/chosen-sft": 1.7742096185684204, "loss/dpo": 0.04458652064204216, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.3653810024261475, "rewards/margins": 0.4912968575954437, "rewards/rejected": -1.8566780090332031, "step": 1070 }, { "epoch": 0.596228508042152, "grad_norm": 1.9216793775558472, "learning_rate": 2.104610144325252e-07, "logits/chosen": -0.9030396342277527, "logits/rejected": -0.9881137609481812, "logps/chosen": -442.3429260253906, "logps/rejected": -423.65838623046875, "loss": 0.0469, "loss/chosen-sft": 1.7527765035629272, "loss/dpo": 0.0468582846224308, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2682268619537354, "rewards/margins": 0.5394538640975952, "rewards/rejected": -1.8076804876327515, "step": 1075 }, { "epoch": 0.5990016638935108, "grad_norm": 1.1101206541061401, "learning_rate": 2.0807229781805415e-07, "logits/chosen": -0.9745734930038452, "logits/rejected": -0.7166577577590942, "logps/chosen": -407.92266845703125, "logps/rejected": -429.6647033691406, "loss": 0.0472, "loss/chosen-sft": 1.707130789756775, "loss/dpo": 0.04721622169017792, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.232958197593689, "rewards/margins": 0.6351754069328308, "rewards/rejected": -1.8681339025497437, "step": 1080 }, { "epoch": 0.6017748197448697, "grad_norm": 0.9766126871109009, "learning_rate": 2.056875134046976e-07, "logits/chosen": -0.772416889667511, "logits/rejected": -0.6425715684890747, "logps/chosen": -451.41204833984375, "logps/rejected": -415.15338134765625, "loss": 0.0364, "loss/chosen-sft": 1.8030576705932617, "loss/dpo": 0.036396466195583344, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2050635814666748, "rewards/margins": 0.5893860459327698, "rewards/rejected": -1.7944495677947998, "step": 1085 }, { "epoch": 0.6045479755962285, "grad_norm": 1.6555626392364502, "learning_rate": 2.0330688485011926e-07, "logits/chosen": -0.8580873608589172, "logits/rejected": -0.8450067639350891, "logps/chosen": -426.8406677246094, "logps/rejected": -436.485595703125, "loss": 0.0461, "loss/chosen-sft": 1.7769439220428467, "loss/dpo": 0.04613155499100685, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2692389488220215, "rewards/margins": 0.5166956186294556, "rewards/rejected": -1.7859344482421875, "step": 1090 }, { "epoch": 0.6073211314475874, "grad_norm": 1.8412526845932007, "learning_rate": 2.0093063542222508e-07, "logits/chosen": -0.8168280720710754, "logits/rejected": -0.662460446357727, "logps/chosen": -419.52154541015625, "logps/rejected": -474.30926513671875, "loss": 0.037, "loss/chosen-sft": 1.7386878728866577, "loss/dpo": 0.03695227950811386, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.2162078619003296, "rewards/margins": 0.7796363830566406, "rewards/rejected": -1.9958442449569702, "step": 1095 }, { "epoch": 0.6100942872989462, "grad_norm": 0.9794536232948303, "learning_rate": 1.9855898797822295e-07, "logits/chosen": -0.6690505743026733, "logits/rejected": -0.6739727258682251, "logps/chosen": -416.6888732910156, "logps/rejected": -418.7158203125, "loss": 0.0298, "loss/chosen-sft": 1.8950073719024658, "loss/dpo": 0.029840370640158653, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.3056199550628662, "rewards/margins": 0.519034743309021, "rewards/rejected": -1.8246548175811768, "step": 1100 }, { "epoch": 0.6128674431503051, "grad_norm": 1.5625451803207397, "learning_rate": 1.9619216494372258e-07, "logits/chosen": -0.8960970044136047, "logits/rejected": -0.851621150970459, "logps/chosen": -416.8877868652344, "logps/rejected": -431.9337463378906, "loss": 0.0397, "loss/chosen-sft": 1.755854606628418, "loss/dpo": 0.039700526744127274, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.3231550455093384, "rewards/margins": 0.486478716135025, "rewards/rejected": -1.8096338510513306, "step": 1105 }, { "epoch": 0.6156405990016639, "grad_norm": 0.9809625744819641, "learning_rate": 1.9383038829187523e-07, "logits/chosen": -0.8659313321113586, "logits/rejected": -0.7378997802734375, "logps/chosen": -466.25006103515625, "logps/rejected": -437.8448181152344, "loss": 0.0338, "loss/chosen-sft": 1.7783132791519165, "loss/dpo": 0.03376453369855881, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.262998342514038, "rewards/margins": 0.7884011268615723, "rewards/rejected": -2.0513997077941895, "step": 1110 }, { "epoch": 0.6184137548530227, "grad_norm": 1.4257228374481201, "learning_rate": 1.914738795225556e-07, "logits/chosen": -0.8100174069404602, "logits/rejected": -0.9278467893600464, "logps/chosen": -423.9007263183594, "logps/rejected": -412.9117126464844, "loss": 0.0436, "loss/chosen-sft": 1.7985398769378662, "loss/dpo": 0.04363011568784714, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3259937763214111, "rewards/margins": 0.5338584184646606, "rewards/rejected": -1.8598521947860718, "step": 1115 }, { "epoch": 0.6211869107043816, "grad_norm": 1.4828506708145142, "learning_rate": 1.8912285964158856e-07, "logits/chosen": -0.8464866876602173, "logits/rejected": -0.8284673690795898, "logps/chosen": -445.93475341796875, "logps/rejected": -457.19384765625, "loss": 0.0344, "loss/chosen-sft": 1.7983877658843994, "loss/dpo": 0.03439199924468994, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.3726943731307983, "rewards/margins": 0.6521696448326111, "rewards/rejected": -2.0248641967773438, "step": 1120 }, { "epoch": 0.6239600665557404, "grad_norm": 1.2299528121948242, "learning_rate": 1.8677754914002231e-07, "logits/chosen": -0.9932361841201782, "logits/rejected": -1.02671480178833, "logps/chosen": -434.72509765625, "logps/rejected": -444.66766357421875, "loss": 0.0389, "loss/chosen-sft": 1.8104565143585205, "loss/dpo": 0.038856539875268936, "rewards/accuracies": 0.71875, "rewards/chosen": -1.2957760095596313, "rewards/margins": 0.6685920357704163, "rewards/rejected": -1.9643681049346924, "step": 1125 }, { "epoch": 0.6267332224070993, "grad_norm": 1.7034136056900024, "learning_rate": 1.8443816797344896e-07, "logits/chosen": -0.8144285082817078, "logits/rejected": -0.6793054342269897, "logps/chosen": -461.71258544921875, "logps/rejected": -455.9671325683594, "loss": 0.0474, "loss/chosen-sft": 1.8361161947250366, "loss/dpo": 0.047358639538288116, "rewards/accuracies": 0.625, "rewards/chosen": -1.4464315176010132, "rewards/margins": 0.4728009104728699, "rewards/rejected": -1.9192323684692383, "step": 1130 }, { "epoch": 0.6295063782584581, "grad_norm": 1.4374454021453857, "learning_rate": 1.821049355413767e-07, "logits/chosen": -0.8650090098381042, "logits/rejected": -0.6468401551246643, "logps/chosen": -443.62274169921875, "logps/rejected": -443.31427001953125, "loss": 0.0332, "loss/chosen-sft": 1.8839133977890015, "loss/dpo": 0.03318975493311882, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5003621578216553, "rewards/margins": 0.500784158706665, "rewards/rejected": -2.0011465549468994, "step": 1135 }, { "epoch": 0.632279534109817, "grad_norm": 1.7502909898757935, "learning_rate": 1.7977807066665267e-07, "logits/chosen": -0.9573124647140503, "logits/rejected": -0.8072667121887207, "logps/chosen": -418.649658203125, "logps/rejected": -418.47589111328125, "loss": 0.0452, "loss/chosen-sft": 1.7528860569000244, "loss/dpo": 0.045168764889240265, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3320482969284058, "rewards/margins": 0.5464748740196228, "rewards/rejected": -1.8785232305526733, "step": 1140 }, { "epoch": 0.6350526899611758, "grad_norm": 1.0295249223709106, "learning_rate": 1.7745779157494096e-07, "logits/chosen": -0.8213936686515808, "logits/rejected": -0.7543411254882812, "logps/chosen": -437.8804626464844, "logps/rejected": -436.56982421875, "loss": 0.0303, "loss/chosen-sft": 1.7841157913208008, "loss/dpo": 0.030281806364655495, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.393178939819336, "rewards/margins": 0.679972231388092, "rewards/rejected": -2.073151111602783, "step": 1145 }, { "epoch": 0.6378258458125347, "grad_norm": 1.2215420007705688, "learning_rate": 1.7514431587425622e-07, "logits/chosen": -0.7637904286384583, "logits/rejected": -0.8227758407592773, "logps/chosen": -429.562255859375, "logps/rejected": -432.89971923828125, "loss": 0.0361, "loss/chosen-sft": 1.8592332601547241, "loss/dpo": 0.03613410145044327, "rewards/accuracies": 0.71875, "rewards/chosen": -1.4023336172103882, "rewards/margins": 0.5858246684074402, "rewards/rejected": -1.9881582260131836, "step": 1150 }, { "epoch": 0.6405990016638935, "grad_norm": 1.6888597011566162, "learning_rate": 1.728378605345553e-07, "logits/chosen": -0.7143627405166626, "logits/rejected": -0.7377561330795288, "logps/chosen": -465.07403564453125, "logps/rejected": -435.6494140625, "loss": 0.0425, "loss/chosen-sft": 1.8083655834197998, "loss/dpo": 0.04253358393907547, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.460288166999817, "rewards/margins": 0.5263352990150452, "rewards/rejected": -1.9866234064102173, "step": 1155 }, { "epoch": 0.6433721575152523, "grad_norm": 1.5432595014572144, "learning_rate": 1.705386418673882e-07, "logits/chosen": -0.8244765996932983, "logits/rejected": -0.7667987942695618, "logps/chosen": -437.60687255859375, "logps/rejected": -436.830078125, "loss": 0.0467, "loss/chosen-sft": 1.7204382419586182, "loss/dpo": 0.04673684015870094, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.34583580493927, "rewards/margins": 0.5071083903312683, "rewards/rejected": -1.8529441356658936, "step": 1160 }, { "epoch": 0.6461453133666112, "grad_norm": 1.2096738815307617, "learning_rate": 1.6824687550561208e-07, "logits/chosen": -0.7485244274139404, "logits/rejected": -0.7693239450454712, "logps/chosen": -464.8460388183594, "logps/rejected": -450.24713134765625, "loss": 0.0358, "loss/chosen-sft": 1.8007644414901733, "loss/dpo": 0.03579232841730118, "rewards/accuracies": 0.71875, "rewards/chosen": -1.335431694984436, "rewards/margins": 0.6632755994796753, "rewards/rejected": -1.9987071752548218, "step": 1165 }, { "epoch": 0.64891846921797, "grad_norm": 1.1814310550689697, "learning_rate": 1.659627763831671e-07, "logits/chosen": -0.7120985984802246, "logits/rejected": -0.6295477747917175, "logps/chosen": -436.17083740234375, "logps/rejected": -424.6572265625, "loss": 0.0357, "loss/chosen-sft": 1.8579403162002563, "loss/dpo": 0.035725630819797516, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.285359263420105, "rewards/margins": 0.5941758155822754, "rewards/rejected": -1.8795350790023804, "step": 1170 }, { "epoch": 0.6516916250693289, "grad_norm": 1.1809978485107422, "learning_rate": 1.6368655871491975e-07, "logits/chosen": -0.7668045163154602, "logits/rejected": -0.7772132754325867, "logps/chosen": -462.0221252441406, "logps/rejected": -435.96954345703125, "loss": 0.0424, "loss/chosen-sft": 1.831925630569458, "loss/dpo": 0.04238344356417656, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3251338005065918, "rewards/margins": 0.5784658193588257, "rewards/rejected": -1.903599500656128, "step": 1175 }, { "epoch": 0.6544647809206877, "grad_norm": 1.225167155265808, "learning_rate": 1.6141843597657172e-07, "logits/chosen": -0.8141440153121948, "logits/rejected": -0.7542480230331421, "logps/chosen": -421.45635986328125, "logps/rejected": -441.95025634765625, "loss": 0.0411, "loss/chosen-sft": 1.7617318630218506, "loss/dpo": 0.04108492285013199, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.3422725200653076, "rewards/margins": 0.5261791944503784, "rewards/rejected": -1.8684518337249756, "step": 1180 }, { "epoch": 0.6572379367720466, "grad_norm": 1.3029569387435913, "learning_rate": 1.5915862088463968e-07, "logits/chosen": -0.996296226978302, "logits/rejected": -0.9346402287483215, "logps/chosen": -446.56622314453125, "logps/rejected": -449.5144958496094, "loss": 0.0419, "loss/chosen-sft": 1.6928361654281616, "loss/dpo": 0.041938044130802155, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.3604621887207031, "rewards/margins": 0.7254354357719421, "rewards/rejected": -2.085897922515869, "step": 1185 }, { "epoch": 0.6600110926234054, "grad_norm": 1.26889967918396, "learning_rate": 1.5690732537650546e-07, "logits/chosen": -0.7706629037857056, "logits/rejected": -0.7561334371566772, "logps/chosen": -435.36602783203125, "logps/rejected": -470.29901123046875, "loss": 0.0294, "loss/chosen-sft": 1.8877112865447998, "loss/dpo": 0.029442256316542625, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.4551079273223877, "rewards/margins": 0.6810146570205688, "rewards/rejected": -2.136122703552246, "step": 1190 }, { "epoch": 0.6627842484747642, "grad_norm": 1.2422409057617188, "learning_rate": 1.546647605905393e-07, "logits/chosen": -0.8117157816886902, "logits/rejected": -0.7619314193725586, "logps/chosen": -432.6480407714844, "logps/rejected": -441.0057678222656, "loss": 0.0352, "loss/chosen-sft": 1.8744285106658936, "loss/dpo": 0.035161614418029785, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.305821180343628, "rewards/margins": 0.5970739722251892, "rewards/rejected": -1.9028953313827515, "step": 1195 }, { "epoch": 0.6655574043261231, "grad_norm": 0.8602780699729919, "learning_rate": 1.52431136846298e-07, "logits/chosen": -0.9312012791633606, "logits/rejected": -0.7855014204978943, "logps/chosen": -424.40911865234375, "logps/rejected": -425.8111877441406, "loss": 0.0254, "loss/chosen-sft": 1.8612302541732788, "loss/dpo": 0.025435030460357666, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.3491990566253662, "rewards/margins": 0.6118738055229187, "rewards/rejected": -1.9610726833343506, "step": 1200 }, { "epoch": 0.6683305601774819, "grad_norm": 1.010581135749817, "learning_rate": 1.5020666362480084e-07, "logits/chosen": -0.8483907580375671, "logits/rejected": -0.7495467066764832, "logps/chosen": -495.5743103027344, "logps/rejected": -482.33209228515625, "loss": 0.0387, "loss/chosen-sft": 1.8878498077392578, "loss/dpo": 0.03866366669535637, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.4326591491699219, "rewards/margins": 0.7254186868667603, "rewards/rejected": -2.1580777168273926, "step": 1205 }, { "epoch": 0.6711037160288408, "grad_norm": 1.6181249618530273, "learning_rate": 1.4799154954888222e-07, "logits/chosen": -1.0427324771881104, "logits/rejected": -0.809046745300293, "logps/chosen": -415.38909912109375, "logps/rejected": -420.14453125, "loss": 0.0316, "loss/chosen-sft": 1.8586212396621704, "loss/dpo": 0.031649235635995865, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.452163577079773, "rewards/margins": 0.537935197353363, "rewards/rejected": -1.9900987148284912, "step": 1210 }, { "epoch": 0.6738768718801996, "grad_norm": 0.7974721789360046, "learning_rate": 1.4578600236362697e-07, "logits/chosen": -0.8521813154220581, "logits/rejected": -0.6828973889350891, "logps/chosen": -456.5556640625, "logps/rejected": -470.51971435546875, "loss": 0.0311, "loss/chosen-sft": 1.945521593093872, "loss/dpo": 0.03109545074403286, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.4207074642181396, "rewards/margins": 0.7211933732032776, "rewards/rejected": -2.1419005393981934, "step": 1215 }, { "epoch": 0.6766500277315585, "grad_norm": 1.4059048891067505, "learning_rate": 1.435902289168861e-07, "logits/chosen": -0.9025594592094421, "logits/rejected": -0.835253894329071, "logps/chosen": -480.26593017578125, "logps/rejected": -467.8526306152344, "loss": 0.0337, "loss/chosen-sft": 1.9281394481658936, "loss/dpo": 0.033659420907497406, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.5400809049606323, "rewards/margins": 0.5880559086799622, "rewards/rejected": -2.1281368732452393, "step": 1220 }, { "epoch": 0.6794231835829173, "grad_norm": 1.8963831663131714, "learning_rate": 1.4140443513987807e-07, "logits/chosen": -1.0080327987670898, "logits/rejected": -0.9464845657348633, "logps/chosen": -455.85968017578125, "logps/rejected": -438.92840576171875, "loss": 0.0337, "loss/chosen-sft": 1.840531349182129, "loss/dpo": 0.033664338290691376, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5052850246429443, "rewards/margins": 0.6105053424835205, "rewards/rejected": -2.115790843963623, "step": 1225 }, { "epoch": 0.6821963394342762, "grad_norm": 1.1899809837341309, "learning_rate": 1.3922882602787523e-07, "logits/chosen": -0.7441499829292297, "logits/rejected": -0.7358977198600769, "logps/chosen": -479.97064208984375, "logps/rejected": -476.0712890625, "loss": 0.0317, "loss/chosen-sft": 1.9372377395629883, "loss/dpo": 0.03166855126619339, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5703338384628296, "rewards/margins": 0.6448060870170593, "rewards/rejected": -2.2151401042938232, "step": 1230 }, { "epoch": 0.684969495285635, "grad_norm": 1.3064395189285278, "learning_rate": 1.3706360562097797e-07, "logits/chosen": -0.9031554460525513, "logits/rejected": -0.8757888674736023, "logps/chosen": -471.3316345214844, "logps/rejected": -446.17120361328125, "loss": 0.0316, "loss/chosen-sft": 1.8330347537994385, "loss/dpo": 0.031595904380083084, "rewards/accuracies": 0.6875, "rewards/chosen": -1.457507848739624, "rewards/margins": 0.67474764585495, "rewards/rejected": -2.1322555541992188, "step": 1235 }, { "epoch": 0.687742651136994, "grad_norm": 1.1494730710983276, "learning_rate": 1.3490897698497983e-07, "logits/chosen": -0.9943927526473999, "logits/rejected": -0.9441978335380554, "logps/chosen": -467.54266357421875, "logps/rejected": -473.6890563964844, "loss": 0.0293, "loss/chosen-sft": 1.9200853109359741, "loss/dpo": 0.02934952639043331, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.563489556312561, "rewards/margins": 0.7292519211769104, "rewards/rejected": -2.292741298675537, "step": 1240 }, { "epoch": 0.6905158069883528, "grad_norm": 1.2559505701065063, "learning_rate": 1.3276514219232142e-07, "logits/chosen": -0.6624242067337036, "logits/rejected": -0.8091727495193481, "logps/chosen": -471.83941650390625, "logps/rejected": -457.29510498046875, "loss": 0.0342, "loss/chosen-sft": 1.9265739917755127, "loss/dpo": 0.034207794815301895, "rewards/accuracies": 0.65625, "rewards/chosen": -1.4902698993682861, "rewards/margins": 0.6235011219978333, "rewards/rejected": -2.1137709617614746, "step": 1245 }, { "epoch": 0.6932889628397116, "grad_norm": 1.1430951356887817, "learning_rate": 1.3063230230314027e-07, "logits/chosen": -0.9455582499504089, "logits/rejected": -0.8169571757316589, "logps/chosen": -444.04193115234375, "logps/rejected": -469.19354248046875, "loss": 0.0347, "loss/chosen-sft": 1.893571138381958, "loss/dpo": 0.034717872738838196, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5202562808990479, "rewards/margins": 0.6696333289146423, "rewards/rejected": -2.189889669418335, "step": 1250 }, { "epoch": 0.6960621186910705, "grad_norm": 1.2610682249069214, "learning_rate": 1.2851065734641364e-07, "logits/chosen": -0.9905312657356262, "logits/rejected": -0.8740830421447754, "logps/chosen": -422.3661193847656, "logps/rejected": -433.2808532714844, "loss": 0.0288, "loss/chosen-sft": 1.7975590229034424, "loss/dpo": 0.028823431581258774, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.442055583000183, "rewards/margins": 0.5924633741378784, "rewards/rejected": -2.0345191955566406, "step": 1255 }, { "epoch": 0.6988352745424293, "grad_norm": 0.9623591303825378, "learning_rate": 1.2640040630119916e-07, "logits/chosen": -0.7741963267326355, "logits/rejected": -0.8283836245536804, "logps/chosen": -455.494140625, "logps/rejected": -467.2837829589844, "loss": 0.0324, "loss/chosen-sft": 2.0400707721710205, "loss/dpo": 0.0323946438729763, "rewards/accuracies": 0.65625, "rewards/chosen": -1.560781717300415, "rewards/margins": 0.5312785506248474, "rewards/rejected": -2.0920603275299072, "step": 1260 }, { "epoch": 0.7016084303937882, "grad_norm": 1.337944746017456, "learning_rate": 1.243017470779729e-07, "logits/chosen": -1.0406231880187988, "logits/rejected": -0.8966327905654907, "logps/chosen": -426.70361328125, "logps/rejected": -453.76031494140625, "loss": 0.0359, "loss/chosen-sft": 1.8434947729110718, "loss/dpo": 0.03585369139909744, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.4888029098510742, "rewards/margins": 0.7223333120346069, "rewards/rejected": -2.2111363410949707, "step": 1265 }, { "epoch": 0.704381586245147, "grad_norm": 0.8494482040405273, "learning_rate": 1.222148765000694e-07, "logits/chosen": -0.8672950863838196, "logits/rejected": -0.8264138102531433, "logps/chosen": -444.71246337890625, "logps/rejected": -443.8768615722656, "loss": 0.0316, "loss/chosen-sft": 1.826744794845581, "loss/dpo": 0.03159435838460922, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.518831491470337, "rewards/margins": 0.6539059281349182, "rewards/rejected": -2.1727373600006104, "step": 1270 }, { "epoch": 0.7071547420965059, "grad_norm": 1.5342941284179688, "learning_rate": 1.2013999028522104e-07, "logits/chosen": -0.9066953659057617, "logits/rejected": -0.9141793251037598, "logps/chosen": -475.5211486816406, "logps/rejected": -463.80810546875, "loss": 0.0517, "loss/chosen-sft": 1.7181308269500732, "loss/dpo": 0.05173926800489426, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5922820568084717, "rewards/margins": 0.48251956701278687, "rewards/rejected": -2.0748016834259033, "step": 1275 }, { "epoch": 0.7099278979478647, "grad_norm": 1.1075633764266968, "learning_rate": 1.1807728302720418e-07, "logits/chosen": -0.8835026025772095, "logits/rejected": -0.8582647442817688, "logps/chosen": -450.48870849609375, "logps/rejected": -440.99871826171875, "loss": 0.0319, "loss/chosen-sft": 1.9053627252578735, "loss/dpo": 0.03186877816915512, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.4612672328948975, "rewards/margins": 0.6721949577331543, "rewards/rejected": -2.133462429046631, "step": 1280 }, { "epoch": 0.7127010537992235, "grad_norm": 2.3948493003845215, "learning_rate": 1.1602694817758773e-07, "logits/chosen": -0.9616036415100098, "logits/rejected": -0.9405485987663269, "logps/chosen": -455.7767639160156, "logps/rejected": -471.544677734375, "loss": 0.0506, "loss/chosen-sft": 1.7842986583709717, "loss/dpo": 0.0505877360701561, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.4995403289794922, "rewards/margins": 0.608048141002655, "rewards/rejected": -2.107588291168213, "step": 1285 }, { "epoch": 0.7154742096505824, "grad_norm": 1.8153194189071655, "learning_rate": 1.139891780275912e-07, "logits/chosen": -1.0241485834121704, "logits/rejected": -0.897274374961853, "logps/chosen": -420.4510803222656, "logps/rejected": -436.85003662109375, "loss": 0.0345, "loss/chosen-sft": 1.7239220142364502, "loss/dpo": 0.034504033625125885, "rewards/accuracies": 0.71875, "rewards/chosen": -1.3688867092132568, "rewards/margins": 0.6113638877868652, "rewards/rejected": -1.980250597000122, "step": 1290 }, { "epoch": 0.7182473655019412, "grad_norm": 1.2911590337753296, "learning_rate": 1.119641636900502e-07, "logits/chosen": -0.9305152893066406, "logits/rejected": -0.9541549682617188, "logps/chosen": -439.82513427734375, "logps/rejected": -432.65167236328125, "loss": 0.0283, "loss/chosen-sft": 1.806671142578125, "loss/dpo": 0.02832832559943199, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.3666812181472778, "rewards/margins": 0.6614538431167603, "rewards/rejected": -2.028134822845459, "step": 1295 }, { "epoch": 0.7210205213533001, "grad_norm": 1.451220989227295, "learning_rate": 1.0995209508149306e-07, "logits/chosen": -0.9968698620796204, "logits/rejected": -0.9646986722946167, "logps/chosen": -460.3720703125, "logps/rejected": -457.362548828125, "loss": 0.0421, "loss/chosen-sft": 1.8072360754013062, "loss/dpo": 0.04210533946752548, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.4575574398040771, "rewards/margins": 0.5370498895645142, "rewards/rejected": -1.9946073293685913, "step": 1300 }, { "epoch": 0.7237936772046589, "grad_norm": 1.028705358505249, "learning_rate": 1.0795316090432893e-07, "logits/chosen": -0.7744545936584473, "logits/rejected": -0.6203697323799133, "logps/chosen": -474.93524169921875, "logps/rejected": -458.9364318847656, "loss": 0.0387, "loss/chosen-sft": 1.8132940530776978, "loss/dpo": 0.03869001194834709, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3626388311386108, "rewards/margins": 0.6128473281860352, "rewards/rejected": -1.975486397743225, "step": 1305 }, { "epoch": 0.7265668330560178, "grad_norm": 1.3705142736434937, "learning_rate": 1.0596754862915136e-07, "logits/chosen": -1.0295013189315796, "logits/rejected": -0.9204280972480774, "logps/chosen": -434.82366943359375, "logps/rejected": -449.77423095703125, "loss": 0.0357, "loss/chosen-sft": 1.7468935251235962, "loss/dpo": 0.035688553005456924, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2829020023345947, "rewards/margins": 0.707922101020813, "rewards/rejected": -1.9908241033554077, "step": 1310 }, { "epoch": 0.7293399889073766, "grad_norm": 0.8645944595336914, "learning_rate": 1.0399544447715494e-07, "logits/chosen": -0.797434389591217, "logits/rejected": -0.6160916090011597, "logps/chosen": -464.4483947753906, "logps/rejected": -458.0086364746094, "loss": 0.0418, "loss/chosen-sft": 1.9083738327026367, "loss/dpo": 0.04177068918943405, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5665171146392822, "rewards/margins": 0.468144029378891, "rewards/rejected": -2.034661054611206, "step": 1315 }, { "epoch": 0.7321131447587355, "grad_norm": 1.3339563608169556, "learning_rate": 1.0203703340267192e-07, "logits/chosen": -0.8840950727462769, "logits/rejected": -0.8916382789611816, "logps/chosen": -446.3101501464844, "logps/rejected": -463.23077392578125, "loss": 0.0404, "loss/chosen-sft": 1.7909902334213257, "loss/dpo": 0.0403718575835228, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.371553659439087, "rewards/margins": 0.7114600539207458, "rewards/rejected": -2.0830137729644775, "step": 1320 }, { "epoch": 0.7348863006100943, "grad_norm": 1.4580516815185547, "learning_rate": 1.0009249907582485e-07, "logits/chosen": -1.012226939201355, "logits/rejected": -0.7340652346611023, "logps/chosen": -443.7181701660156, "logps/rejected": -456.22320556640625, "loss": 0.0308, "loss/chosen-sft": 1.804347276687622, "loss/dpo": 0.030787784606218338, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5312182903289795, "rewards/margins": 0.6057604551315308, "rewards/rejected": -2.1369788646698, "step": 1325 }, { "epoch": 0.7376594564614531, "grad_norm": 0.9017294645309448, "learning_rate": 9.816202386530199e-08, "logits/chosen": -0.8607552647590637, "logits/rejected": -0.8176922798156738, "logps/chosen": -476.98260498046875, "logps/rejected": -490.8758239746094, "loss": 0.0357, "loss/chosen-sft": 1.7674789428710938, "loss/dpo": 0.035682059824466705, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.4640976190567017, "rewards/margins": 0.7083105444908142, "rewards/rejected": -2.172408103942871, "step": 1330 }, { "epoch": 0.740432612312812, "grad_norm": 1.749300241470337, "learning_rate": 9.62457888212535e-08, "logits/chosen": -0.9406915903091431, "logits/rejected": -0.8014345169067383, "logps/chosen": -438.89849853515625, "logps/rejected": -451.3016662597656, "loss": 0.0358, "loss/chosen-sft": 1.861494779586792, "loss/dpo": 0.03579792380332947, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.349241018295288, "rewards/margins": 0.7064675688743591, "rewards/rejected": -2.055708408355713, "step": 1335 }, { "epoch": 0.7432057681641708, "grad_norm": 1.16977059841156, "learning_rate": 9.434397365831162e-08, "logits/chosen": -0.8669630289077759, "logits/rejected": -0.8108338117599487, "logps/chosen": -437.13848876953125, "logps/rejected": -468.7396545410156, "loss": 0.0317, "loss/chosen-sft": 1.831624984741211, "loss/dpo": 0.03170974552631378, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.3475593328475952, "rewards/margins": 0.8466382026672363, "rewards/rejected": -2.194197654724121, "step": 1340 }, { "epoch": 0.7459789240155297, "grad_norm": 2.1936497688293457, "learning_rate": 9.245675673873577e-08, "logits/chosen": -0.9459794163703918, "logits/rejected": -0.9377508163452148, "logps/chosen": -516.6649780273438, "logps/rejected": -491.7989807128906, "loss": 0.0465, "loss/chosen-sft": 1.8350709676742554, "loss/dpo": 0.04651065915822983, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5745521783828735, "rewards/margins": 0.6551668047904968, "rewards/rejected": -2.2297191619873047, "step": 1345 }, { "epoch": 0.7487520798668885, "grad_norm": 1.4931093454360962, "learning_rate": 9.058431505568562e-08, "logits/chosen": -0.7636462450027466, "logits/rejected": -0.8169956207275391, "logps/chosen": -464.9522399902344, "logps/rejected": -456.91650390625, "loss": 0.0312, "loss/chosen-sft": 1.9652206897735596, "loss/dpo": 0.031192084774374962, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4719616174697876, "rewards/margins": 0.6275067329406738, "rewards/rejected": -2.099468469619751, "step": 1350 }, { "epoch": 0.7515252357182474, "grad_norm": 1.4099466800689697, "learning_rate": 8.872682421662068e-08, "logits/chosen": -0.787927508354187, "logits/rejected": -0.7053574323654175, "logps/chosen": -463.33087158203125, "logps/rejected": -486.23675537109375, "loss": 0.0261, "loss/chosen-sft": 1.898769736289978, "loss/dpo": 0.026135969907045364, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.4200894832611084, "rewards/margins": 0.8304470181465149, "rewards/rejected": -2.2505364418029785, "step": 1355 }, { "epoch": 0.7542983915696062, "grad_norm": 1.183593511581421, "learning_rate": 8.688445842683173e-08, "logits/chosen": -0.9655888676643372, "logits/rejected": -1.0269839763641357, "logps/chosen": -465.0248107910156, "logps/rejected": -462.23638916015625, "loss": 0.037, "loss/chosen-sft": 1.8918052911758423, "loss/dpo": 0.036974623799324036, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.49183988571167, "rewards/margins": 0.5933989882469177, "rewards/rejected": -2.0852386951446533, "step": 1360 }, { "epoch": 0.757071547420965, "grad_norm": 1.654320240020752, "learning_rate": 8.505739047310257e-08, "logits/chosen": -0.9955303072929382, "logits/rejected": -0.8278782963752747, "logps/chosen": -464.4917907714844, "logps/rejected": -458.9212951660156, "loss": 0.0295, "loss/chosen-sft": 1.8392751216888428, "loss/dpo": 0.02950271964073181, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.4409606456756592, "rewards/margins": 0.6720392107963562, "rewards/rejected": -2.11299991607666, "step": 1365 }, { "epoch": 0.7598447032723239, "grad_norm": 2.021111249923706, "learning_rate": 8.324579170750518e-08, "logits/chosen": -1.111428141593933, "logits/rejected": -1.0972769260406494, "logps/chosen": -464.07928466796875, "logps/rejected": -461.49462890625, "loss": 0.0334, "loss/chosen-sft": 1.842139482498169, "loss/dpo": 0.03337743133306503, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.577466368675232, "rewards/margins": 0.6026979684829712, "rewards/rejected": -2.1801645755767822, "step": 1370 }, { "epoch": 0.7626178591236827, "grad_norm": 2.152736186981201, "learning_rate": 8.14498320313296e-08, "logits/chosen": -0.8720847964286804, "logits/rejected": -0.8584582209587097, "logps/chosen": -433.6831970214844, "logps/rejected": -444.64703369140625, "loss": 0.0365, "loss/chosen-sft": 1.9097425937652588, "loss/dpo": 0.03647807240486145, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5121186971664429, "rewards/margins": 0.6217208504676819, "rewards/rejected": -2.1338393688201904, "step": 1375 }, { "epoch": 0.7653910149750416, "grad_norm": 1.0486515760421753, "learning_rate": 7.966967987914932e-08, "logits/chosen": -0.8114673495292664, "logits/rejected": -0.8360360264778137, "logps/chosen": -451.40081787109375, "logps/rejected": -462.27850341796875, "loss": 0.0316, "loss/chosen-sft": 1.915459394454956, "loss/dpo": 0.03155887499451637, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.4397701025009155, "rewards/margins": 0.7389817237854004, "rewards/rejected": -2.1787517070770264, "step": 1380 }, { "epoch": 0.7681641708264004, "grad_norm": 1.6471340656280518, "learning_rate": 7.7905502203025e-08, "logits/chosen": -0.8942529559135437, "logits/rejected": -0.7852484583854675, "logps/chosen": -430.1280822753906, "logps/rejected": -426.33160400390625, "loss": 0.0318, "loss/chosen-sft": 1.8815257549285889, "loss/dpo": 0.03184288740158081, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.3281621932983398, "rewards/margins": 0.6897465586662292, "rewards/rejected": -2.0179085731506348, "step": 1385 }, { "epoch": 0.7709373266777593, "grad_norm": 1.7929314374923706, "learning_rate": 7.615746445684665e-08, "logits/chosen": -1.137683391571045, "logits/rejected": -1.0650242567062378, "logps/chosen": -483.86932373046875, "logps/rejected": -496.49847412109375, "loss": 0.0351, "loss/chosen-sft": 1.70965576171875, "loss/dpo": 0.03509928658604622, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.5360954999923706, "rewards/margins": 0.8399691581726074, "rewards/rejected": -2.3760645389556885, "step": 1390 }, { "epoch": 0.7737104825291181, "grad_norm": 1.5007047653198242, "learning_rate": 7.442573058081644e-08, "logits/chosen": -1.022068738937378, "logits/rejected": -0.8677648305892944, "logps/chosen": -446.630859375, "logps/rejected": -453.5487365722656, "loss": 0.0413, "loss/chosen-sft": 1.7940679788589478, "loss/dpo": 0.04133762791752815, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.49905264377594, "rewards/margins": 0.5288316011428833, "rewards/rejected": -2.0278842449188232, "step": 1395 }, { "epoch": 0.776483638380477, "grad_norm": 1.9248064756393433, "learning_rate": 7.271046298607365e-08, "logits/chosen": -0.8683999180793762, "logits/rejected": -0.9180633425712585, "logps/chosen": -461.62811279296875, "logps/rejected": -475.0498046875, "loss": 0.0317, "loss/chosen-sft": 1.8975083827972412, "loss/dpo": 0.03167451545596123, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.472059726715088, "rewards/margins": 0.7187052965164185, "rewards/rejected": -2.190764904022217, "step": 1400 }, { "epoch": 0.7792567942318358, "grad_norm": 1.3342360258102417, "learning_rate": 7.101182253946281e-08, "logits/chosen": -0.8948928117752075, "logits/rejected": -0.6546001434326172, "logps/chosen": -442.4454650878906, "logps/rejected": -462.285888671875, "loss": 0.0349, "loss/chosen-sft": 1.9095041751861572, "loss/dpo": 0.034947969019412994, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.5290088653564453, "rewards/margins": 0.6571269631385803, "rewards/rejected": -2.186135768890381, "step": 1405 }, { "epoch": 0.7820299500831946, "grad_norm": 1.7256712913513184, "learning_rate": 6.932996854844658e-08, "logits/chosen": -1.0015538930892944, "logits/rejected": -0.9933465719223022, "logps/chosen": -425.962646484375, "logps/rejected": -465.14178466796875, "loss": 0.0328, "loss/chosen-sft": 1.6965240240097046, "loss/dpo": 0.03280189260840416, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.3612712621688843, "rewards/margins": 0.8677095174789429, "rewards/rejected": -2.228980779647827, "step": 1410 }, { "epoch": 0.7848031059345535, "grad_norm": 1.550096035003662, "learning_rate": 6.766505874616571e-08, "logits/chosen": -0.9519731402397156, "logits/rejected": -0.9299766421318054, "logps/chosen": -441.6609802246094, "logps/rejected": -446.9225158691406, "loss": 0.0292, "loss/chosen-sft": 1.765045166015625, "loss/dpo": 0.029176611453294754, "rewards/accuracies": 0.71875, "rewards/chosen": -1.4031174182891846, "rewards/margins": 0.6740747094154358, "rewards/rejected": -2.0771920680999756, "step": 1415 }, { "epoch": 0.7875762617859123, "grad_norm": 1.3679091930389404, "learning_rate": 6.601724927664492e-08, "logits/chosen": -0.7503564953804016, "logits/rejected": -0.7818638682365417, "logps/chosen": -428.4849548339844, "logps/rejected": -410.2527770996094, "loss": 0.0353, "loss/chosen-sft": 1.8490569591522217, "loss/dpo": 0.03528434783220291, "rewards/accuracies": 0.71875, "rewards/chosen": -1.3704160451889038, "rewards/margins": 0.5347402691841125, "rewards/rejected": -1.9051563739776611, "step": 1420 }, { "epoch": 0.7903494176372712, "grad_norm": 1.4390736818313599, "learning_rate": 6.438669468015018e-08, "logits/chosen": -1.0825473070144653, "logits/rejected": -1.1610548496246338, "logps/chosen": -407.60589599609375, "logps/rejected": -412.09747314453125, "loss": 0.04, "loss/chosen-sft": 1.8394687175750732, "loss/dpo": 0.04001585766673088, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.425935983657837, "rewards/margins": 0.44705772399902344, "rewards/rejected": -1.8729937076568604, "step": 1425 }, { "epoch": 0.79312257348863, "grad_norm": 1.4306334257125854, "learning_rate": 6.277354787869385e-08, "logits/chosen": -1.0177534818649292, "logits/rejected": -0.9789898991584778, "logps/chosen": -461.71197509765625, "logps/rejected": -433.2428283691406, "loss": 0.0298, "loss/chosen-sft": 1.7876182794570923, "loss/dpo": 0.029793113470077515, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.4716134071350098, "rewards/margins": 0.5654765367507935, "rewards/rejected": -2.0370900630950928, "step": 1430 }, { "epoch": 0.7958957293399889, "grad_norm": 1.1099703311920166, "learning_rate": 6.117796016169374e-08, "logits/chosen": -1.059214472770691, "logits/rejected": -1.051519751548767, "logps/chosen": -492.03424072265625, "logps/rejected": -475.771484375, "loss": 0.0285, "loss/chosen-sft": 1.937116265296936, "loss/dpo": 0.02849132940173149, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.6160300970077515, "rewards/margins": 0.6453025937080383, "rewards/rejected": -2.2613329887390137, "step": 1435 }, { "epoch": 0.7986688851913477, "grad_norm": 1.9658957719802856, "learning_rate": 5.9600081171784e-08, "logits/chosen": -0.9570498466491699, "logits/rejected": -0.9356196522712708, "logps/chosen": -460.81982421875, "logps/rejected": -476.683837890625, "loss": 0.0404, "loss/chosen-sft": 1.7493022680282593, "loss/dpo": 0.04036666080355644, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3861143589019775, "rewards/margins": 0.7235819697380066, "rewards/rejected": -2.109696388244629, "step": 1440 }, { "epoch": 0.8014420410427066, "grad_norm": 1.9839755296707153, "learning_rate": 5.8040058890781035e-08, "logits/chosen": -0.8933590054512024, "logits/rejected": -0.9177519083023071, "logps/chosen": -447.789794921875, "logps/rejected": -467.09344482421875, "loss": 0.0316, "loss/chosen-sft": 1.9308321475982666, "loss/dpo": 0.03159358352422714, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.4023634195327759, "rewards/margins": 0.8167144656181335, "rewards/rejected": -2.2190778255462646, "step": 1445 }, { "epoch": 0.8042151968940654, "grad_norm": 1.4874529838562012, "learning_rate": 5.6498039625804574e-08, "logits/chosen": -0.9050949215888977, "logits/rejected": -0.8587920069694519, "logps/chosen": -458.5428161621094, "logps/rejected": -460.24542236328125, "loss": 0.0277, "loss/chosen-sft": 1.8650929927825928, "loss/dpo": 0.027697661891579628, "rewards/accuracies": 0.75, "rewards/chosen": -1.4219458103179932, "rewards/margins": 0.7902048826217651, "rewards/rejected": -2.2121505737304688, "step": 1450 }, { "epoch": 0.8069883527454242, "grad_norm": 1.2237356901168823, "learning_rate": 5.4974167995556955e-08, "logits/chosen": -1.0649144649505615, "logits/rejected": -0.894222617149353, "logps/chosen": -449.2327575683594, "logps/rejected": -476.51300048828125, "loss": 0.0296, "loss/chosen-sft": 1.8358997106552124, "loss/dpo": 0.029581155627965927, "rewards/accuracies": 0.71875, "rewards/chosen": -1.4774913787841797, "rewards/margins": 0.7366743087768555, "rewards/rejected": -2.2141659259796143, "step": 1455 }, { "epoch": 0.8097615085967831, "grad_norm": 1.6183357238769531, "learning_rate": 5.346858691675915e-08, "logits/chosen": -0.8394588232040405, "logits/rejected": -0.8869924545288086, "logps/chosen": -450.36273193359375, "logps/rejected": -425.9990234375, "loss": 0.0354, "loss/chosen-sft": 1.9527273178100586, "loss/dpo": 0.035379212349653244, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.555987000465393, "rewards/margins": 0.40505796670913696, "rewards/rejected": -1.9610449075698853, "step": 1460 }, { "epoch": 0.812534664448142, "grad_norm": 1.2106329202651978, "learning_rate": 5.198143759074813e-08, "logits/chosen": -0.8181321024894714, "logits/rejected": -0.8268686532974243, "logps/chosen": -422.52069091796875, "logps/rejected": -439.79437255859375, "loss": 0.0218, "loss/chosen-sft": 1.9324089288711548, "loss/dpo": 0.02178550697863102, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4317119121551514, "rewards/margins": 0.7136107087135315, "rewards/rejected": -2.145322322845459, "step": 1465 }, { "epoch": 0.8153078202995009, "grad_norm": 2.4801158905029297, "learning_rate": 5.051285949023354e-08, "logits/chosen": -0.9226022958755493, "logits/rejected": -0.8704797625541687, "logps/chosen": -494.6982421875, "logps/rejected": -470.52734375, "loss": 0.0366, "loss/chosen-sft": 1.9025272130966187, "loss/dpo": 0.036645907908678055, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.6271445751190186, "rewards/margins": 0.5947997570037842, "rewards/rejected": -2.2219443321228027, "step": 1470 }, { "epoch": 0.8180809761508597, "grad_norm": 1.554540753364563, "learning_rate": 4.906299034621761e-08, "logits/chosen": -0.918652355670929, "logits/rejected": -0.7777332067489624, "logps/chosen": -438.8077697753906, "logps/rejected": -469.95928955078125, "loss": 0.0358, "loss/chosen-sft": 1.8588802814483643, "loss/dpo": 0.035846300423145294, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.3037010431289673, "rewards/margins": 0.817149817943573, "rewards/rejected": -2.1208510398864746, "step": 1475 }, { "epoch": 0.8208541320022186, "grad_norm": 1.5162110328674316, "learning_rate": 4.7631966135077974e-08, "logits/chosen": -0.9379565119743347, "logits/rejected": -0.8222628831863403, "logps/chosen": -460.7021484375, "logps/rejected": -467.64739990234375, "loss": 0.027, "loss/chosen-sft": 1.943677306175232, "loss/dpo": 0.026978474110364914, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.556178331375122, "rewards/margins": 0.6195975542068481, "rewards/rejected": -2.1757760047912598, "step": 1480 }, { "epoch": 0.8236272878535774, "grad_norm": 1.4927583932876587, "learning_rate": 4.621992106581504e-08, "logits/chosen": -1.0209442377090454, "logits/rejected": -0.7870509028434753, "logps/chosen": -411.0484924316406, "logps/rejected": -458.4707946777344, "loss": 0.0281, "loss/chosen-sft": 1.8953702449798584, "loss/dpo": 0.02814427576959133, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4694321155548096, "rewards/margins": 0.6849014163017273, "rewards/rejected": -2.1543333530426025, "step": 1485 }, { "epoch": 0.8264004437049363, "grad_norm": 1.1416542530059814, "learning_rate": 4.482698756746506e-08, "logits/chosen": -0.8766797184944153, "logits/rejected": -0.6899703145027161, "logps/chosen": -426.66851806640625, "logps/rejected": -439.741455078125, "loss": 0.0395, "loss/chosen-sft": 1.8772623538970947, "loss/dpo": 0.039542876183986664, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.4856750965118408, "rewards/margins": 0.5127314329147339, "rewards/rejected": -1.9984066486358643, "step": 1490 }, { "epoch": 0.8291735995562951, "grad_norm": 1.0494132041931152, "learning_rate": 4.3453296276680375e-08, "logits/chosen": -0.7310534715652466, "logits/rejected": -0.7772519588470459, "logps/chosen": -459.5819396972656, "logps/rejected": -443.21087646484375, "loss": 0.0361, "loss/chosen-sft": 1.916404366493225, "loss/dpo": 0.03613205999135971, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.4593141078948975, "rewards/margins": 0.5708898305892944, "rewards/rejected": -2.0302040576934814, "step": 1495 }, { "epoch": 0.831946755407654, "grad_norm": 1.4319831132888794, "learning_rate": 4.209897602547768e-08, "logits/chosen": -0.8863110542297363, "logits/rejected": -0.8555914759635925, "logps/chosen": -444.46142578125, "logps/rejected": -446.90155029296875, "loss": 0.0231, "loss/chosen-sft": 1.974479079246521, "loss/dpo": 0.023065898567438126, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.5314444303512573, "rewards/margins": 0.6271489858627319, "rewards/rejected": -2.1585934162139893, "step": 1500 }, { "epoch": 0.8347199112590128, "grad_norm": 1.5065139532089233, "learning_rate": 4.076415382915527e-08, "logits/chosen": -0.9296368360519409, "logits/rejected": -0.8830710649490356, "logps/chosen": -458.8507385253906, "logps/rejected": -464.94677734375, "loss": 0.0392, "loss/chosen-sft": 1.758873701095581, "loss/dpo": 0.039175163954496384, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.4754732847213745, "rewards/margins": 0.8373439908027649, "rewards/rejected": -2.312817335128784, "step": 1505 }, { "epoch": 0.8374930671103716, "grad_norm": 2.0096213817596436, "learning_rate": 3.944895487438102e-08, "logits/chosen": -0.8368681073188782, "logits/rejected": -0.8462071418762207, "logps/chosen": -440.0353088378906, "logps/rejected": -455.104736328125, "loss": 0.0277, "loss/chosen-sft": 1.8118178844451904, "loss/dpo": 0.02766679786145687, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.4319641590118408, "rewards/margins": 0.7568520307540894, "rewards/rejected": -2.1888160705566406, "step": 1510 }, { "epoch": 0.8402662229617305, "grad_norm": 1.322721242904663, "learning_rate": 3.8153502507451727e-08, "logits/chosen": -0.8090510368347168, "logits/rejected": -0.7667346596717834, "logps/chosen": -448.12725830078125, "logps/rejected": -465.86297607421875, "loss": 0.0352, "loss/chosen-sft": 1.8181946277618408, "loss/dpo": 0.03516048565506935, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5605225563049316, "rewards/margins": 0.6109769940376282, "rewards/rejected": -2.171499729156494, "step": 1515 }, { "epoch": 0.8430393788130893, "grad_norm": 0.6515668034553528, "learning_rate": 3.687791822272493e-08, "logits/chosen": -0.8708721995353699, "logits/rejected": -0.8009916543960571, "logps/chosen": -413.84649658203125, "logps/rejected": -422.46844482421875, "loss": 0.0289, "loss/chosen-sft": 1.8942821025848389, "loss/dpo": 0.028922390192747116, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.4152357578277588, "rewards/margins": 0.6280101537704468, "rewards/rejected": -2.043246030807495, "step": 1520 }, { "epoch": 0.8458125346644482, "grad_norm": 1.2580747604370117, "learning_rate": 3.562232165122461e-08, "logits/chosen": -0.9807391166687012, "logits/rejected": -0.8759809732437134, "logps/chosen": -468.5948791503906, "logps/rejected": -469.4264221191406, "loss": 0.0312, "loss/chosen-sft": 1.8805710077285767, "loss/dpo": 0.03115806356072426, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.6106830835342407, "rewards/margins": 0.6084949374198914, "rewards/rejected": -2.2191781997680664, "step": 1525 }, { "epoch": 0.848585690515807, "grad_norm": 1.6682928800582886, "learning_rate": 3.438683054942154e-08, "logits/chosen": -0.9435014724731445, "logits/rejected": -0.9378792643547058, "logps/chosen": -497.65216064453125, "logps/rejected": -514.5195922851562, "loss": 0.0276, "loss/chosen-sft": 1.8878179788589478, "loss/dpo": 0.02764343097805977, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.5859915018081665, "rewards/margins": 0.8114764094352722, "rewards/rejected": -2.397467851638794, "step": 1530 }, { "epoch": 0.8513588463671659, "grad_norm": 1.870586633682251, "learning_rate": 3.3171560788189555e-08, "logits/chosen": -0.7878917455673218, "logits/rejected": -0.8808261752128601, "logps/chosen": -448.91168212890625, "logps/rejected": -445.90191650390625, "loss": 0.03, "loss/chosen-sft": 1.9762178659439087, "loss/dpo": 0.029997188597917557, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.4799820184707642, "rewards/margins": 0.5908954739570618, "rewards/rejected": -2.0708775520324707, "step": 1535 }, { "epoch": 0.8541320022185247, "grad_norm": 1.2771514654159546, "learning_rate": 3.19766263419384e-08, "logits/chosen": -0.9804418683052063, "logits/rejected": -0.9339153170585632, "logps/chosen": -451.52972412109375, "logps/rejected": -487.1279296875, "loss": 0.0356, "loss/chosen-sft": 1.733094573020935, "loss/dpo": 0.03555992618203163, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.3957600593566895, "rewards/margins": 0.8448952436447144, "rewards/rejected": -2.2406554222106934, "step": 1540 }, { "epoch": 0.8569051580698835, "grad_norm": 1.825315237045288, "learning_rate": 3.080213927792471e-08, "logits/chosen": -0.9238882064819336, "logits/rejected": -0.9216313362121582, "logps/chosen": -449.4173278808594, "logps/rejected": -422.06097412109375, "loss": 0.0379, "loss/chosen-sft": 1.8662887811660767, "loss/dpo": 0.037861116230487823, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.5062415599822998, "rewards/margins": 0.5276464223861694, "rewards/rejected": -2.0338878631591797, "step": 1545 }, { "epoch": 0.8596783139212424, "grad_norm": 1.1899453401565552, "learning_rate": 2.9648209745741838e-08, "logits/chosen": -0.9589411020278931, "logits/rejected": -0.8292596936225891, "logps/chosen": -456.76092529296875, "logps/rejected": -475.18206787109375, "loss": 0.0399, "loss/chosen-sft": 1.9160964488983154, "loss/dpo": 0.03985082358121872, "rewards/accuracies": 0.625, "rewards/chosen": -1.461181879043579, "rewards/margins": 0.6755915880203247, "rewards/rejected": -2.1367735862731934, "step": 1550 }, { "epoch": 0.8624514697726012, "grad_norm": 0.9635495543479919, "learning_rate": 2.8514945966989085e-08, "logits/chosen": -1.0600335597991943, "logits/rejected": -0.9049699902534485, "logps/chosen": -442.63555908203125, "logps/rejected": -459.53570556640625, "loss": 0.0311, "loss/chosen-sft": 1.8001188039779663, "loss/dpo": 0.03108590468764305, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.4419844150543213, "rewards/margins": 0.7059625387191772, "rewards/rejected": -2.147946834564209, "step": 1555 }, { "epoch": 0.8652246256239601, "grad_norm": 1.15142822265625, "learning_rate": 2.7402454225122744e-08, "logits/chosen": -1.0275518894195557, "logits/rejected": -0.8722109794616699, "logps/chosen": -416.63787841796875, "logps/rejected": -473.12542724609375, "loss": 0.0299, "loss/chosen-sft": 1.8142344951629639, "loss/dpo": 0.029919322580099106, "rewards/accuracies": 0.71875, "rewards/chosen": -1.4069955348968506, "rewards/margins": 0.8083630800247192, "rewards/rejected": -2.2153587341308594, "step": 1560 }, { "epoch": 0.8679977814753189, "grad_norm": 1.1056207418441772, "learning_rate": 2.631083885548749e-08, "logits/chosen": -0.9130992889404297, "logits/rejected": -0.7697763442993164, "logps/chosen": -439.4908752441406, "logps/rejected": -480.8926696777344, "loss": 0.0255, "loss/chosen-sft": 1.9055465459823608, "loss/dpo": 0.025480791926383972, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.5075581073760986, "rewards/margins": 0.8769866824150085, "rewards/rejected": -2.384544849395752, "step": 1565 }, { "epoch": 0.8707709373266778, "grad_norm": 1.4552932977676392, "learning_rate": 2.524020223553208e-08, "logits/chosen": -0.8160026669502258, "logits/rejected": -0.8994795083999634, "logps/chosen": -484.19598388671875, "logps/rejected": -469.77947998046875, "loss": 0.0308, "loss/chosen-sft": 1.9661228656768799, "loss/dpo": 0.03077336587011814, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.574336051940918, "rewards/margins": 0.6738288998603821, "rewards/rejected": -2.2481648921966553, "step": 1570 }, { "epoch": 0.8735440931780366, "grad_norm": 1.7502833604812622, "learning_rate": 2.4190644775207075e-08, "logits/chosen": -0.9263548851013184, "logits/rejected": -0.8413556814193726, "logps/chosen": -437.6075744628906, "logps/rejected": -467.7953186035156, "loss": 0.0372, "loss/chosen-sft": 1.796007513999939, "loss/dpo": 0.03721100836992264, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.3773536682128906, "rewards/margins": 0.7799967527389526, "rewards/rejected": -2.157350540161133, "step": 1575 }, { "epoch": 0.8763172490293955, "grad_norm": 0.8901669383049011, "learning_rate": 2.316226490754844e-08, "logits/chosen": -0.9022026062011719, "logits/rejected": -0.9546858072280884, "logps/chosen": -455.98046875, "logps/rejected": -470.28582763671875, "loss": 0.0353, "loss/chosen-sft": 1.8491671085357666, "loss/dpo": 0.035286836326122284, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4982842206954956, "rewards/margins": 0.7641604542732239, "rewards/rejected": -2.2624447345733643, "step": 1580 }, { "epoch": 0.8790904048807543, "grad_norm": 0.9218027591705322, "learning_rate": 2.215515907944576e-08, "logits/chosen": -0.9104015231132507, "logits/rejected": -0.8896605372428894, "logps/chosen": -470.4886169433594, "logps/rejected": -467.1653747558594, "loss": 0.0378, "loss/chosen-sft": 1.9249181747436523, "loss/dpo": 0.037777096033096313, "rewards/accuracies": 0.625, "rewards/chosen": -1.5565952062606812, "rewards/margins": 0.5057957768440247, "rewards/rejected": -2.0623910427093506, "step": 1585 }, { "epoch": 0.8818635607321131, "grad_norm": 1.4500895738601685, "learning_rate": 2.1169421742596923e-08, "logits/chosen": -0.8355404734611511, "logits/rejected": -0.8188692331314087, "logps/chosen": -468.78759765625, "logps/rejected": -436.91619873046875, "loss": 0.0279, "loss/chosen-sft": 1.9327714443206787, "loss/dpo": 0.02788035199046135, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.53583824634552, "rewards/margins": 0.596767783164978, "rewards/rejected": -2.132605791091919, "step": 1590 }, { "epoch": 0.884636716583472, "grad_norm": 1.5247975587844849, "learning_rate": 2.02051453446499e-08, "logits/chosen": -0.9076545834541321, "logits/rejected": -0.8217021822929382, "logps/chosen": -462.9772033691406, "logps/rejected": -458.1619567871094, "loss": 0.0357, "loss/chosen-sft": 1.872582197189331, "loss/dpo": 0.035654447972774506, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4506585597991943, "rewards/margins": 0.6560076475143433, "rewards/rejected": -2.106666088104248, "step": 1595 }, { "epoch": 0.8874098724348308, "grad_norm": 2.1406707763671875, "learning_rate": 1.9262420320532768e-08, "logits/chosen": -0.9324251413345337, "logits/rejected": -0.7913103103637695, "logps/chosen": -460.52789306640625, "logps/rejected": -489.23614501953125, "loss": 0.0312, "loss/chosen-sft": 1.8192570209503174, "loss/dpo": 0.03123651072382927, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4744288921356201, "rewards/margins": 0.7909864187240601, "rewards/rejected": -2.2654154300689697, "step": 1600 }, { "epoch": 0.8901830282861897, "grad_norm": 2.196462631225586, "learning_rate": 1.8341335083971815e-08, "logits/chosen": -0.7642068862915039, "logits/rejected": -0.7473465800285339, "logps/chosen": -469.9163513183594, "logps/rejected": -457.0773010253906, "loss": 0.0376, "loss/chosen-sft": 1.9092369079589844, "loss/dpo": 0.037584852427244186, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5675512552261353, "rewards/margins": 0.54124915599823, "rewards/rejected": -2.1088004112243652, "step": 1605 }, { "epoch": 0.8929561841375485, "grad_norm": 1.6157978773117065, "learning_rate": 1.7441976019200166e-08, "logits/chosen": -1.053679347038269, "logits/rejected": -0.9105228185653687, "logps/chosen": -452.0389709472656, "logps/rejected": -464.1913146972656, "loss": 0.0342, "loss/chosen-sft": 1.905279517173767, "loss/dpo": 0.034248754382133484, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5297168493270874, "rewards/margins": 0.5347718596458435, "rewards/rejected": -2.064488649368286, "step": 1610 }, { "epoch": 0.8957293399889074, "grad_norm": 1.4368631839752197, "learning_rate": 1.6564427472855662e-08, "logits/chosen": -0.928158164024353, "logits/rejected": -0.781406819820404, "logps/chosen": -456.40380859375, "logps/rejected": -482.193603515625, "loss": 0.0359, "loss/chosen-sft": 1.7659343481063843, "loss/dpo": 0.03594128414988518, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.445878267288208, "rewards/margins": 0.774651825428009, "rewards/rejected": -2.2205300331115723, "step": 1615 }, { "epoch": 0.8985024958402662, "grad_norm": 1.0532777309417725, "learning_rate": 1.570877174607088e-08, "logits/chosen": -0.9026380777359009, "logits/rejected": -0.7336758375167847, "logps/chosen": -474.20599365234375, "logps/rejected": -458.268798828125, "loss": 0.039, "loss/chosen-sft": 1.8853695392608643, "loss/dpo": 0.03895549476146698, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5534743070602417, "rewards/margins": 0.538535475730896, "rewards/rejected": -2.0920097827911377, "step": 1620 }, { "epoch": 0.901275651691625, "grad_norm": 1.640979290008545, "learning_rate": 1.4875089086754111e-08, "logits/chosen": -0.9097537994384766, "logits/rejected": -0.68541419506073, "logps/chosen": -440.2255859375, "logps/rejected": -470.017333984375, "loss": 0.0278, "loss/chosen-sft": 1.8272202014923096, "loss/dpo": 0.027846310287714005, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.4420444965362549, "rewards/margins": 0.7163407206535339, "rewards/rejected": -2.1583850383758545, "step": 1625 }, { "epoch": 0.9040488075429839, "grad_norm": 1.3717536926269531, "learning_rate": 1.4063457682063573e-08, "logits/chosen": -1.025506854057312, "logits/rejected": -0.8326213955879211, "logps/chosen": -396.4437561035156, "logps/rejected": -455.72265625, "loss": 0.0323, "loss/chosen-sft": 1.8843291997909546, "loss/dpo": 0.032339178025722504, "rewards/accuracies": 0.71875, "rewards/chosen": -1.4136439561843872, "rewards/margins": 0.7241330146789551, "rewards/rejected": -2.137777090072632, "step": 1630 }, { "epoch": 0.9068219633943427, "grad_norm": 1.1654001474380493, "learning_rate": 1.3273953651074393e-08, "logits/chosen": -1.0155668258666992, "logits/rejected": -0.751710057258606, "logps/chosen": -461.4085998535156, "logps/rejected": -451.8213806152344, "loss": 0.0251, "loss/chosen-sft": 1.8744451999664307, "loss/dpo": 0.02511768974363804, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5510212182998657, "rewards/margins": 0.5998275876045227, "rewards/rejected": -2.150848865509033, "step": 1635 }, { "epoch": 0.9095951192457016, "grad_norm": 1.6467472314834595, "learning_rate": 1.250665103763987e-08, "logits/chosen": -0.8909111022949219, "logits/rejected": -0.8106695413589478, "logps/chosen": -450.2090759277344, "logps/rejected": -483.452880859375, "loss": 0.0419, "loss/chosen-sft": 1.795325517654419, "loss/dpo": 0.04194999486207962, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.475182056427002, "rewards/margins": 0.7644132971763611, "rewards/rejected": -2.239595413208008, "step": 1640 }, { "epoch": 0.9123682750970604, "grad_norm": 1.179222583770752, "learning_rate": 1.1761621803447336e-08, "logits/chosen": -0.8358624577522278, "logits/rejected": -0.7992157936096191, "logps/chosen": -449.79736328125, "logps/rejected": -441.9453125, "loss": 0.0362, "loss/chosen-sft": 1.9050792455673218, "loss/dpo": 0.03616093099117279, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.381279468536377, "rewards/margins": 0.6649306416511536, "rewards/rejected": -2.0462100505828857, "step": 1645 }, { "epoch": 0.9151414309484193, "grad_norm": 1.244621753692627, "learning_rate": 1.1038935821268941e-08, "logits/chosen": -0.833348274230957, "logits/rejected": -0.6609139442443848, "logps/chosen": -409.50445556640625, "logps/rejected": -461.4767150878906, "loss": 0.0329, "loss/chosen-sft": 1.8607597351074219, "loss/dpo": 0.032938309013843536, "rewards/accuracies": 0.65625, "rewards/chosen": -1.4654487371444702, "rewards/margins": 0.6329339146614075, "rewards/rejected": -2.0983824729919434, "step": 1650 }, { "epoch": 0.9179145867997781, "grad_norm": 1.552565336227417, "learning_rate": 1.0338660868408927e-08, "logits/chosen": -0.7576335668563843, "logits/rejected": -0.7207523584365845, "logps/chosen": -497.5868225097656, "logps/rejected": -482.0480041503906, "loss": 0.0367, "loss/chosen-sft": 1.8623535633087158, "loss/dpo": 0.036711666733026505, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.4475281238555908, "rewards/margins": 0.6721404790878296, "rewards/rejected": -2.119668483734131, "step": 1655 }, { "epoch": 0.920687742651137, "grad_norm": 2.2845067977905273, "learning_rate": 9.660862620346877e-09, "logits/chosen": -0.7926570177078247, "logits/rejected": -0.7714422345161438, "logps/chosen": -473.17706298828125, "logps/rejected": -469.4546813964844, "loss": 0.0379, "loss/chosen-sft": 1.8994096517562866, "loss/dpo": 0.0379011332988739, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5939931869506836, "rewards/margins": 0.5131052136421204, "rewards/rejected": -2.107098340988159, "step": 1660 }, { "epoch": 0.9234608985024958, "grad_norm": 1.7006827592849731, "learning_rate": 9.005604644578473e-09, "logits/chosen": -0.8427762985229492, "logits/rejected": -0.8619502186775208, "logps/chosen": -480.4871520996094, "logps/rejected": -467.28363037109375, "loss": 0.0539, "loss/chosen-sft": 1.8251368999481201, "loss/dpo": 0.05387115478515625, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.6016905307769775, "rewards/margins": 0.5135762691497803, "rewards/rejected": -2.115266799926758, "step": 1665 }, { "epoch": 0.9262340543538546, "grad_norm": 1.814226508140564, "learning_rate": 8.372948394653717e-09, "logits/chosen": -0.8938384056091309, "logits/rejected": -0.8742619752883911, "logps/chosen": -495.4317321777344, "logps/rejected": -477.78497314453125, "loss": 0.0345, "loss/chosen-sft": 1.8645175695419312, "loss/dpo": 0.03449907898902893, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5643129348754883, "rewards/margins": 0.605742871761322, "rewards/rejected": -2.170055866241455, "step": 1670 }, { "epoch": 0.9290072102052135, "grad_norm": 2.352726697921753, "learning_rate": 7.762953204413475e-09, "logits/chosen": -1.0139451026916504, "logits/rejected": -1.1322184801101685, "logps/chosen": -484.33172607421875, "logps/rejected": -444.97344970703125, "loss": 0.044, "loss/chosen-sft": 1.884155511856079, "loss/dpo": 0.04397551342844963, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.5331767797470093, "rewards/margins": 0.5316352844238281, "rewards/rejected": -2.064812183380127, "step": 1675 }, { "epoch": 0.9317803660565723, "grad_norm": 1.378034234046936, "learning_rate": 7.175676282424964e-09, "logits/chosen": -0.9042679667472839, "logits/rejected": -0.8096022605895996, "logps/chosen": -460.42950439453125, "logps/rejected": -475.254638671875, "loss": 0.0342, "loss/chosen-sft": 1.8950878381729126, "loss/dpo": 0.03422309830784798, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.513983130455017, "rewards/margins": 0.7315508127212524, "rewards/rejected": -2.2455339431762695, "step": 1680 }, { "epoch": 0.9345535219079312, "grad_norm": 1.6129722595214844, "learning_rate": 6.611172706616291e-09, "logits/chosen": -0.9313820004463196, "logits/rejected": -0.8965193033218384, "logps/chosen": -446.47283935546875, "logps/rejected": -424.822509765625, "loss": 0.0305, "loss/chosen-sft": 1.8703186511993408, "loss/dpo": 0.030467282980680466, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5669549703598022, "rewards/margins": 0.5169414281845093, "rewards/rejected": -2.0838963985443115, "step": 1685 }, { "epoch": 0.93732667775929, "grad_norm": 1.3904266357421875, "learning_rate": 6.069495419111003e-09, "logits/chosen": -0.8602801561355591, "logits/rejected": -0.8798410296440125, "logps/chosen": -514.514892578125, "logps/rejected": -513.6451416015625, "loss": 0.0346, "loss/chosen-sft": 1.8502308130264282, "loss/dpo": 0.03457609936594963, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.4935173988342285, "rewards/margins": 0.9913633465766907, "rewards/rejected": -2.4848809242248535, "step": 1690 }, { "epoch": 0.940099833610649, "grad_norm": 1.7256861925125122, "learning_rate": 5.550695221263002e-09, "logits/chosen": -1.1261186599731445, "logits/rejected": -1.107710599899292, "logps/chosen": -490.0545959472656, "logps/rejected": -447.8335876464844, "loss": 0.0385, "loss/chosen-sft": 1.7753324508666992, "loss/dpo": 0.038456808775663376, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.554856538772583, "rewards/margins": 0.5833019018173218, "rewards/rejected": -2.1381585597991943, "step": 1695 }, { "epoch": 0.9428729894620078, "grad_norm": 1.7229971885681152, "learning_rate": 5.054820768891854e-09, "logits/chosen": -0.7668136954307556, "logits/rejected": -0.7551401257514954, "logps/chosen": -470.37451171875, "logps/rejected": -467.57177734375, "loss": 0.0284, "loss/chosen-sft": 1.8557796478271484, "loss/dpo": 0.028406251221895218, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5322498083114624, "rewards/margins": 0.6786109805107117, "rewards/rejected": -2.2108609676361084, "step": 1700 }, { "epoch": 0.9456461453133667, "grad_norm": 2.3088462352752686, "learning_rate": 4.581918567719917e-09, "logits/chosen": -0.9519344568252563, "logits/rejected": -0.9092620015144348, "logps/chosen": -490.27813720703125, "logps/rejected": -480.19171142578125, "loss": 0.0364, "loss/chosen-sft": 1.8951151371002197, "loss/dpo": 0.03644304722547531, "rewards/accuracies": 0.71875, "rewards/chosen": -1.4337555170059204, "rewards/margins": 0.7906380891799927, "rewards/rejected": -2.224393367767334, "step": 1705 }, { "epoch": 0.9484193011647255, "grad_norm": 1.4663399457931519, "learning_rate": 4.132032969010546e-09, "logits/chosen": -0.9776910543441772, "logits/rejected": -0.8744028210639954, "logps/chosen": -475.00274658203125, "logps/rejected": -476.331298828125, "loss": 0.0408, "loss/chosen-sft": 1.855271339416504, "loss/dpo": 0.040752846747636795, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.570431113243103, "rewards/margins": 0.5336312055587769, "rewards/rejected": -2.10406231880188, "step": 1710 }, { "epoch": 0.9511924570160843, "grad_norm": 1.3314710855484009, "learning_rate": 3.705206165408703e-09, "logits/chosen": -0.9017612338066101, "logits/rejected": -0.9057533144950867, "logps/chosen": -448.4554138183594, "logps/rejected": -430.2477111816406, "loss": 0.0334, "loss/chosen-sft": 1.8261409997940063, "loss/dpo": 0.03343508765101433, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.4459483623504639, "rewards/margins": 0.5900785326957703, "rewards/rejected": -2.0360267162323, "step": 1715 }, { "epoch": 0.9539656128674432, "grad_norm": 1.0403763055801392, "learning_rate": 3.301478186983897e-09, "logits/chosen": -0.9940805435180664, "logits/rejected": -1.1318776607513428, "logps/chosen": -436.7303161621094, "logps/rejected": -440.62969970703125, "loss": 0.0342, "loss/chosen-sft": 1.7668132781982422, "loss/dpo": 0.03422313928604126, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.4205049276351929, "rewards/margins": 0.8040353655815125, "rewards/rejected": -2.2245402336120605, "step": 1720 }, { "epoch": 0.956738768718802, "grad_norm": 1.7417303323745728, "learning_rate": 2.9208868974759937e-09, "logits/chosen": -0.8438510894775391, "logits/rejected": -0.8072006106376648, "logps/chosen": -452.6871643066406, "logps/rejected": -420.52227783203125, "loss": 0.0271, "loss/chosen-sft": 1.9397671222686768, "loss/dpo": 0.027111122384667397, "rewards/accuracies": 0.65625, "rewards/chosen": -1.4771382808685303, "rewards/margins": 0.5044761896133423, "rewards/rejected": -1.981614351272583, "step": 1725 }, { "epoch": 0.9595119245701609, "grad_norm": 1.5373972654342651, "learning_rate": 2.5634679907440006e-09, "logits/chosen": -0.9225784540176392, "logits/rejected": -0.8528642654418945, "logps/chosen": -441.6475524902344, "logps/rejected": -410.14324951171875, "loss": 0.0367, "loss/chosen-sft": 1.814764380455017, "loss/dpo": 0.03674568608403206, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.4582865238189697, "rewards/margins": 0.40831345319747925, "rewards/rejected": -1.8665997982025146, "step": 1730 }, { "epoch": 0.9622850804215197, "grad_norm": 1.2935632467269897, "learning_rate": 2.229254987418744e-09, "logits/chosen": -0.8527067303657532, "logits/rejected": -0.7993227243423462, "logps/chosen": -499.4395446777344, "logps/rejected": -482.21197509765625, "loss": 0.0354, "loss/chosen-sft": 1.8621113300323486, "loss/dpo": 0.03540179878473282, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.4600284099578857, "rewards/margins": 0.7969890832901001, "rewards/rejected": -2.2570176124572754, "step": 1735 }, { "epoch": 0.9650582362728786, "grad_norm": 1.218867540359497, "learning_rate": 1.9182792317588294e-09, "logits/chosen": -0.9154025912284851, "logits/rejected": -0.8462129831314087, "logps/chosen": -430.40594482421875, "logps/rejected": -414.01397705078125, "loss": 0.0362, "loss/chosen-sft": 1.7988450527191162, "loss/dpo": 0.03615463525056839, "rewards/accuracies": 0.625, "rewards/chosen": -1.418278694152832, "rewards/margins": 0.5125333666801453, "rewards/rejected": -1.930812120437622, "step": 1740 }, { "epoch": 0.9678313921242374, "grad_norm": 1.9518085718154907, "learning_rate": 1.6305698887113806e-09, "logits/chosen": -0.9541261792182922, "logits/rejected": -0.8191477656364441, "logps/chosen": -407.2529602050781, "logps/rejected": -422.18865966796875, "loss": 0.0323, "loss/chosen-sft": 1.8834625482559204, "loss/dpo": 0.0322595052421093, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.3437700271606445, "rewards/margins": 0.6855727434158325, "rewards/rejected": -2.0293426513671875, "step": 1745 }, { "epoch": 0.9706045479755963, "grad_norm": 1.7286250591278076, "learning_rate": 1.366153941176451e-09, "logits/chosen": -1.052943468093872, "logits/rejected": -0.9355376362800598, "logps/chosen": -460.63153076171875, "logps/rejected": -460.62994384765625, "loss": 0.0427, "loss/chosen-sft": 1.877771019935608, "loss/dpo": 0.042682547122240067, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.5018032789230347, "rewards/margins": 0.6279391050338745, "rewards/rejected": -2.129742383956909, "step": 1750 }, { "epoch": 0.9733777038269551, "grad_norm": 0.8861921429634094, "learning_rate": 1.1250561874766029e-09, "logits/chosen": -1.0122451782226562, "logits/rejected": -0.9112497568130493, "logps/chosen": -472.03948974609375, "logps/rejected": -466.75006103515625, "loss": 0.0358, "loss/chosen-sft": 1.8839130401611328, "loss/dpo": 0.03582266345620155, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5027332305908203, "rewards/margins": 0.59858238697052, "rewards/rejected": -2.101315498352051, "step": 1755 }, { "epoch": 0.9761508596783139, "grad_norm": 1.1699074506759644, "learning_rate": 9.072992390312117e-10, "logits/chosen": -0.8916594386100769, "logits/rejected": -0.7321555018424988, "logps/chosen": -414.146728515625, "logps/rejected": -429.60125732421875, "loss": 0.0415, "loss/chosen-sft": 1.8041099309921265, "loss/dpo": 0.04147591441869736, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4231741428375244, "rewards/margins": 0.5474594235420227, "rewards/rejected": -1.9706335067749023, "step": 1760 }, { "epoch": 0.9789240155296728, "grad_norm": 2.7589733600616455, "learning_rate": 7.12903518235719e-10, "logits/chosen": -1.0372343063354492, "logits/rejected": -0.8465463519096375, "logps/chosen": -406.49755859375, "logps/rejected": -411.850341796875, "loss": 0.0382, "loss/chosen-sft": 1.7656471729278564, "loss/dpo": 0.03815629705786705, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.410011649131775, "rewards/margins": 0.5866178274154663, "rewards/rejected": -1.9966297149658203, "step": 1765 }, { "epoch": 0.9816971713810316, "grad_norm": 1.103261947631836, "learning_rate": 5.418872565464139e-10, "logits/chosen": -0.8118406534194946, "logits/rejected": -0.6884601712226868, "logps/chosen": -441.5873107910156, "logps/rejected": -420.6312561035156, "loss": 0.0308, "loss/chosen-sft": 1.925108551979065, "loss/dpo": 0.03077712655067444, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.486276388168335, "rewards/margins": 0.5229099988937378, "rewards/rejected": -2.0091867446899414, "step": 1770 }, { "epoch": 0.9844703272323905, "grad_norm": 1.2194581031799316, "learning_rate": 3.942664927706063e-10, "logits/chosen": -1.0195016860961914, "logits/rejected": -0.9845107793807983, "logps/chosen": -432.424560546875, "logps/rejected": -431.7875061035156, "loss": 0.0297, "loss/chosen-sft": 1.9019267559051514, "loss/dpo": 0.02968726120889187, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5260287523269653, "rewards/margins": 0.49342551827430725, "rewards/rejected": -2.01945424079895, "step": 1775 }, { "epoch": 0.9872434830837493, "grad_norm": 1.3297661542892456, "learning_rate": 2.700550715623029e-10, "logits/chosen": -0.9309781193733215, "logits/rejected": -0.765326201915741, "logps/chosen": -445.7706604003906, "logps/rejected": -450.1620178222656, "loss": 0.0314, "loss/chosen-sft": 1.8302927017211914, "loss/dpo": 0.03142596781253815, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.3721158504486084, "rewards/margins": 0.6550136804580688, "rewards/rejected": -2.0271294116973877, "step": 1780 }, { "epoch": 0.9900166389351082, "grad_norm": 1.6393502950668335, "learning_rate": 1.692646421239674e-10, "logits/chosen": -0.9880639910697937, "logits/rejected": -0.8702710270881653, "logps/chosen": -454.1924743652344, "logps/rejected": -474.16741943359375, "loss": 0.0314, "loss/chosen-sft": 1.8532609939575195, "loss/dpo": 0.031443167477846146, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.4502158164978027, "rewards/margins": 0.7252721786499023, "rewards/rejected": -2.175487995147705, "step": 1785 }, { "epoch": 0.992789794786467, "grad_norm": 1.782798409461975, "learning_rate": 9.190465711375606e-11, "logits/chosen": -0.8094294667243958, "logits/rejected": -0.825986385345459, "logps/chosen": -438.8473205566406, "logps/rejected": -475.4677734375, "loss": 0.0401, "loss/chosen-sft": 1.7468044757843018, "loss/dpo": 0.04005669802427292, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.3847625255584717, "rewards/margins": 0.7623748779296875, "rewards/rejected": -2.147137403488159, "step": 1790 }, { "epoch": 0.9955629506378258, "grad_norm": 1.64493989944458, "learning_rate": 3.798237175925423e-11, "logits/chosen": -0.9432379007339478, "logits/rejected": -0.890951931476593, "logps/chosen": -427.1351623535156, "logps/rejected": -463.54705810546875, "loss": 0.0352, "loss/chosen-sft": 1.7833278179168701, "loss/dpo": 0.03518053516745567, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.4109861850738525, "rewards/margins": 0.7486652135848999, "rewards/rejected": -2.159651279449463, "step": 1795 }, { "epoch": 0.9983361064891847, "grad_norm": 1.9120142459869385, "learning_rate": 7.502843176826478e-12, "logits/chosen": -0.9462081789970398, "logits/rejected": -0.8944603204727173, "logps/chosen": -472.4647521972656, "logps/rejected": -491.3990173339844, "loss": 0.0391, "loss/chosen-sft": 1.819483757019043, "loss/dpo": 0.03906460851430893, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.514047384262085, "rewards/margins": 0.6475565433502197, "rewards/rejected": -2.161604166030884, "step": 1800 }, { "epoch": 1.0, "step": 1803, "total_flos": 0.0, "train_loss": 0.08027586419418396, "train_runtime": 2186.6137, "train_samples_per_second": 26.375, "train_steps_per_second": 0.825 } ], "logging_steps": 5, "max_steps": 1803, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }