{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.07223164374097105, "eval_steps": 500, "global_step": 1150, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 6.281012499214873e-05, "grad_norm": 3.7086377143859863, "learning_rate": 0.0, "logits/chosen": -1.8339842557907104, "logits/rejected": -1.8535029888153076, "logps/chosen": -253.55299377441406, "logps/rejected": -256.02752685546875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0006281012499214873, "grad_norm": 2.868622303009033, "learning_rate": 9e-06, "logits/chosen": -1.9413594007492065, "logits/rejected": -1.995047926902771, "logps/chosen": -248.8763885498047, "logps/rejected": -231.43014526367188, "loss": 0.6561, "rewards/accuracies": 0.6388888955116272, "rewards/chosen": 0.10710998624563217, "rewards/margins": 0.08558527380228043, "rewards/rejected": 0.021524710580706596, "step": 10 }, { "epoch": 0.0012562024998429746, "grad_norm": 2.5598604679107666, "learning_rate": 1.9e-05, "logits/chosen": -1.947582483291626, "logits/rejected": -2.027026653289795, "logps/chosen": -241.2770538330078, "logps/rejected": -214.34658813476562, "loss": 0.5165, "rewards/accuracies": 0.75, "rewards/chosen": 0.5332101583480835, "rewards/margins": 0.6832095980644226, "rewards/rejected": -0.1499994695186615, "step": 20 }, { "epoch": 0.001884303749764462, "grad_norm": 2.094403028488159, "learning_rate": 2.9e-05, "logits/chosen": -1.7763592004776, "logits/rejected": -1.7671773433685303, "logps/chosen": -255.46047973632812, "logps/rejected": -251.75900268554688, "loss": 0.4294, "rewards/accuracies": 0.78125, "rewards/chosen": 0.7961806058883667, "rewards/margins": 1.6154584884643555, "rewards/rejected": -0.8192778825759888, "step": 30 }, { "epoch": 0.0025124049996859492, "grad_norm": 0.6335717439651489, "learning_rate": 3.9000000000000006e-05, "logits/chosen": -1.7676454782485962, "logits/rejected": -1.775486707687378, "logps/chosen": -245.90878295898438, "logps/rejected": -262.4179382324219, "loss": 0.4507, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 0.38805627822875977, "rewards/margins": 2.8238627910614014, "rewards/rejected": -2.4358065128326416, "step": 40 }, { "epoch": 0.0031405062496074367, "grad_norm": 2.543531656265259, "learning_rate": 4.9e-05, "logits/chosen": -1.8130578994750977, "logits/rejected": -1.859423279762268, "logps/chosen": -258.4760437011719, "logps/rejected": -282.54986572265625, "loss": 0.375, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.5745254755020142, "rewards/margins": 3.9984238147735596, "rewards/rejected": -5.572949409484863, "step": 50 }, { "epoch": 0.003768607499528924, "grad_norm": 3.926485776901245, "learning_rate": 5.9e-05, "logits/chosen": -1.735634207725525, "logits/rejected": -1.7788416147232056, "logps/chosen": -299.06561279296875, "logps/rejected": -319.36175537109375, "loss": 0.4598, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -3.3173580169677734, "rewards/margins": 4.576085567474365, "rewards/rejected": -7.8934431076049805, "step": 60 }, { "epoch": 0.0043967087494504115, "grad_norm": 2.7220869064331055, "learning_rate": 6.9e-05, "logits/chosen": -1.663975477218628, "logits/rejected": -1.6840848922729492, "logps/chosen": -284.1070251464844, "logps/rejected": -310.91595458984375, "loss": 0.4238, "rewards/accuracies": 0.78125, "rewards/chosen": -2.3122127056121826, "rewards/margins": 4.7849040031433105, "rewards/rejected": -7.097116947174072, "step": 70 }, { "epoch": 0.0050248099993718985, "grad_norm": 4.404968738555908, "learning_rate": 7.900000000000001e-05, "logits/chosen": -1.6881587505340576, "logits/rejected": -1.7024128437042236, "logps/chosen": -267.61456298828125, "logps/rejected": -289.51031494140625, "loss": 0.3982, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.0072319507598877, "rewards/margins": 3.649735689163208, "rewards/rejected": -4.6569671630859375, "step": 80 }, { "epoch": 0.005652911249293386, "grad_norm": 4.015685558319092, "learning_rate": 8.900000000000001e-05, "logits/chosen": -1.7967586517333984, "logits/rejected": -1.7952836751937866, "logps/chosen": -292.94305419921875, "logps/rejected": -315.1247253417969, "loss": 0.4299, "rewards/accuracies": 0.8125, "rewards/chosen": -3.157033681869507, "rewards/margins": 3.965163469314575, "rewards/rejected": -7.122197151184082, "step": 90 }, { "epoch": 0.006281012499214873, "grad_norm": 8.7476806640625, "learning_rate": 9.900000000000001e-05, "logits/chosen": -1.745796799659729, "logits/rejected": -1.715309500694275, "logps/chosen": -313.0873107910156, "logps/rejected": -341.35430908203125, "loss": 0.5043, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.297704219818115, "rewards/margins": 4.502422332763672, "rewards/rejected": -8.800127029418945, "step": 100 }, { "epoch": 0.006909113749136361, "grad_norm": 4.677415370941162, "learning_rate": 0.000109, "logits/chosen": -1.7246410846710205, "logits/rejected": -1.73520827293396, "logps/chosen": -304.52642822265625, "logps/rejected": -343.18572998046875, "loss": 0.5592, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.685637474060059, "rewards/margins": 4.813635349273682, "rewards/rejected": -9.499273300170898, "step": 110 }, { "epoch": 0.007537214999057848, "grad_norm": 2.837193727493286, "learning_rate": 0.000119, "logits/chosen": -1.7960655689239502, "logits/rejected": -1.82514226436615, "logps/chosen": -315.5843811035156, "logps/rejected": -336.8552551269531, "loss": 0.5037, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -6.102546691894531, "rewards/margins": 4.557218074798584, "rewards/rejected": -10.659764289855957, "step": 120 }, { "epoch": 0.008165316248979336, "grad_norm": 3.6775598526000977, "learning_rate": 0.00012900000000000002, "logits/chosen": -1.710066556930542, "logits/rejected": -1.6805146932601929, "logps/chosen": -333.0748596191406, "logps/rejected": -371.88970947265625, "loss": 0.3182, "rewards/accuracies": 0.84375, "rewards/chosen": -5.8403000831604, "rewards/margins": 5.994283199310303, "rewards/rejected": -11.83458423614502, "step": 130 }, { "epoch": 0.008793417498900823, "grad_norm": 6.358480930328369, "learning_rate": 0.000139, "logits/chosen": -1.724496841430664, "logits/rejected": -1.751909613609314, "logps/chosen": -338.35394287109375, "logps/rejected": -388.6962890625, "loss": 0.532, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -7.19396448135376, "rewards/margins": 7.523868560791016, "rewards/rejected": -14.717832565307617, "step": 140 }, { "epoch": 0.00942151874882231, "grad_norm": 12.958605766296387, "learning_rate": 0.00014900000000000002, "logits/chosen": -1.7843220233917236, "logits/rejected": -1.793779730796814, "logps/chosen": -364.802978515625, "logps/rejected": -412.31439208984375, "loss": 0.8101, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -11.800384521484375, "rewards/margins": 6.584033012390137, "rewards/rejected": -18.384418487548828, "step": 150 }, { "epoch": 0.010049619998743797, "grad_norm": 6.717767715454102, "learning_rate": 0.00015900000000000002, "logits/chosen": -1.820043921470642, "logits/rejected": -1.82791268825531, "logps/chosen": -359.44232177734375, "logps/rejected": -367.754150390625, "loss": 0.8169, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -9.247892379760742, "rewards/margins": 4.090855121612549, "rewards/rejected": -13.33874797821045, "step": 160 }, { "epoch": 0.010677721248665286, "grad_norm": 7.202780723571777, "learning_rate": 0.00016900000000000002, "logits/chosen": -1.5785781145095825, "logits/rejected": -1.57716965675354, "logps/chosen": -339.47918701171875, "logps/rejected": -403.1285705566406, "loss": 0.4774, "rewards/accuracies": 0.78125, "rewards/chosen": -6.2112603187561035, "rewards/margins": 7.217446804046631, "rewards/rejected": -13.42870807647705, "step": 170 }, { "epoch": 0.011305822498586773, "grad_norm": 28.87391471862793, "learning_rate": 0.00017900000000000001, "logits/chosen": -1.610966444015503, "logits/rejected": -1.612408995628357, "logps/chosen": -365.82452392578125, "logps/rejected": -425.76629638671875, "loss": 0.5113, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -9.420952796936035, "rewards/margins": 8.279291152954102, "rewards/rejected": -17.700244903564453, "step": 180 }, { "epoch": 0.01193392374850826, "grad_norm": 9.530638694763184, "learning_rate": 0.00018899999999999999, "logits/chosen": -1.7442734241485596, "logits/rejected": -1.7305431365966797, "logps/chosen": -382.35052490234375, "logps/rejected": -459.91351318359375, "loss": 0.5497, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -10.981498718261719, "rewards/margins": 9.491922378540039, "rewards/rejected": -20.473421096801758, "step": 190 }, { "epoch": 0.012562024998429747, "grad_norm": 3.895458221435547, "learning_rate": 0.000199, "logits/chosen": -1.6646429300308228, "logits/rejected": -1.7108173370361328, "logps/chosen": -321.7096252441406, "logps/rejected": -348.0752258300781, "loss": 0.4338, "rewards/accuracies": 0.84375, "rewards/chosen": -5.286570072174072, "rewards/margins": 5.001010894775391, "rewards/rejected": -10.287581443786621, "step": 200 }, { "epoch": 0.013190126248351234, "grad_norm": 25.07195472717285, "learning_rate": 0.00019999987630032453, "logits/chosen": -1.6504408121109009, "logits/rejected": -1.7005176544189453, "logps/chosen": -372.27264404296875, "logps/rejected": -417.4801330566406, "loss": 0.6261, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -11.502450942993164, "rewards/margins": 6.584995269775391, "rewards/rejected": -18.087448120117188, "step": 210 }, { "epoch": 0.013818227498272722, "grad_norm": 38.50764465332031, "learning_rate": 0.00019999944869690103, "logits/chosen": -1.6193021535873413, "logits/rejected": -1.5823488235473633, "logps/chosen": -464.146240234375, "logps/rejected": -572.8192749023438, "loss": 0.8033, "rewards/accuracies": 0.78125, "rewards/chosen": -20.47043228149414, "rewards/margins": 12.26713752746582, "rewards/rejected": -32.737571716308594, "step": 220 }, { "epoch": 0.01444632874819421, "grad_norm": 89.9546127319336, "learning_rate": 0.00019999871566387864, "logits/chosen": -1.5943282842636108, "logits/rejected": -1.5745569467544556, "logps/chosen": -388.8191833496094, "logps/rejected": -452.4623107910156, "loss": 0.5702, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -13.816723823547363, "rewards/margins": 7.889412879943848, "rewards/rejected": -21.706134796142578, "step": 230 }, { "epoch": 0.015074429998115696, "grad_norm": 9.676877975463867, "learning_rate": 0.00019999767720349634, "logits/chosen": -1.5301122665405273, "logits/rejected": -1.5427255630493164, "logps/chosen": -414.4371643066406, "logps/rejected": -503.2247619628906, "loss": 0.7861, "rewards/accuracies": 0.8125, "rewards/chosen": -13.572049140930176, "rewards/margins": 10.039185523986816, "rewards/rejected": -23.611236572265625, "step": 240 }, { "epoch": 0.015702531248037183, "grad_norm": 13.622501373291016, "learning_rate": 0.0001999963333189259, "logits/chosen": -1.5465519428253174, "logits/rejected": -1.6000416278839111, "logps/chosen": -392.14691162109375, "logps/rejected": -461.580810546875, "loss": 0.5635, "rewards/accuracies": 0.8125, "rewards/chosen": -13.290196418762207, "rewards/margins": 8.662670135498047, "rewards/rejected": -21.952865600585938, "step": 250 }, { "epoch": 0.016330632497958672, "grad_norm": 15.179641723632812, "learning_rate": 0.000199994684014272, "logits/chosen": -1.6586672067642212, "logits/rejected": -1.7231919765472412, "logps/chosen": -362.83416748046875, "logps/rejected": -412.607421875, "loss": 0.9069, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -11.394574165344238, "rewards/margins": 5.575031280517578, "rewards/rejected": -16.9696044921875, "step": 260 }, { "epoch": 0.016958733747880157, "grad_norm": 218.73663330078125, "learning_rate": 0.00019999272929457207, "logits/chosen": -1.5414860248565674, "logits/rejected": -1.5340508222579956, "logps/chosen": -407.935302734375, "logps/rejected": -457.61358642578125, "loss": 1.2129, "rewards/accuracies": 0.75, "rewards/chosen": -15.647921562194824, "rewards/margins": 5.878279685974121, "rewards/rejected": -21.526203155517578, "step": 270 }, { "epoch": 0.017586834997801646, "grad_norm": 16.871973037719727, "learning_rate": 0.00019999046916579645, "logits/chosen": -1.7745367288589478, "logits/rejected": -1.8200035095214844, "logps/chosen": -344.9629211425781, "logps/rejected": -375.575439453125, "loss": 0.7149, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -9.627102851867676, "rewards/margins": 4.816814422607422, "rewards/rejected": -14.443916320800781, "step": 280 }, { "epoch": 0.018214936247723135, "grad_norm": 10.940070152282715, "learning_rate": 0.00019998790363484829, "logits/chosen": -1.5259405374526978, "logits/rejected": -1.5246999263763428, "logps/chosen": -437.1673278808594, "logps/rejected": -496.84814453125, "loss": 1.5234, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -17.606922149658203, "rewards/margins": 7.5267767906188965, "rewards/rejected": -25.133695602416992, "step": 290 }, { "epoch": 0.01884303749764462, "grad_norm": 33.66419219970703, "learning_rate": 0.00019998503270956356, "logits/chosen": -1.4269921779632568, "logits/rejected": -1.402541160583496, "logps/chosen": -418.0978088378906, "logps/rejected": -511.68719482421875, "loss": 0.9066, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -14.789546012878418, "rewards/margins": 11.723508834838867, "rewards/rejected": -26.5130558013916, "step": 300 }, { "epoch": 0.01947113874756611, "grad_norm": 10.588113784790039, "learning_rate": 0.00019998185639871093, "logits/chosen": -1.509630799293518, "logits/rejected": -1.5115585327148438, "logps/chosen": -453.97576904296875, "logps/rejected": -571.0075073242188, "loss": 1.1362, "rewards/accuracies": 0.8125, "rewards/chosen": -20.761215209960938, "rewards/margins": 13.013384819030762, "rewards/rejected": -33.774600982666016, "step": 310 }, { "epoch": 0.020099239997487594, "grad_norm": 37.5242919921875, "learning_rate": 0.00019997837471199184, "logits/chosen": -1.5979125499725342, "logits/rejected": -1.6052278280258179, "logps/chosen": -561.2927856445312, "logps/rejected": -602.5609130859375, "loss": 2.3508, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -28.971038818359375, "rewards/margins": 6.913781642913818, "rewards/rejected": -35.88481903076172, "step": 320 }, { "epoch": 0.020727341247409083, "grad_norm": 32.54042434692383, "learning_rate": 0.00019997458766004046, "logits/chosen": -1.6302763223648071, "logits/rejected": -1.6192137002944946, "logps/chosen": -617.36962890625, "logps/rejected": -664.2512817382812, "loss": 2.14, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -34.82728958129883, "rewards/margins": 7.2062883377075195, "rewards/rejected": -42.03357696533203, "step": 330 }, { "epoch": 0.02135544249733057, "grad_norm": 25.500333786010742, "learning_rate": 0.00019997049525442365, "logits/chosen": -1.4438846111297607, "logits/rejected": -1.4399240016937256, "logps/chosen": -619.9395751953125, "logps/rejected": -687.3679809570312, "loss": 0.504, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -34.79469680786133, "rewards/margins": 9.648870468139648, "rewards/rejected": -44.44356918334961, "step": 340 }, { "epoch": 0.021983543747252057, "grad_norm": 18.664222717285156, "learning_rate": 0.00019996609750764084, "logits/chosen": -1.6479333639144897, "logits/rejected": -1.6415951251983643, "logps/chosen": -541.8936767578125, "logps/rejected": -606.9635620117188, "loss": 0.3766, "rewards/accuracies": 0.875, "rewards/chosen": -25.513315200805664, "rewards/margins": 9.282835960388184, "rewards/rejected": -34.79615020751953, "step": 350 }, { "epoch": 0.022611644997173545, "grad_norm": 85.91929626464844, "learning_rate": 0.00019996139443312417, "logits/chosen": -1.7720321416854858, "logits/rejected": -1.7591564655303955, "logps/chosen": -461.93634033203125, "logps/rejected": -535.3576049804688, "loss": 0.8762, "rewards/accuracies": 0.8125, "rewards/chosen": -21.481040954589844, "rewards/margins": 8.501348495483398, "rewards/rejected": -29.982391357421875, "step": 360 }, { "epoch": 0.02323974624709503, "grad_norm": 23.130050659179688, "learning_rate": 0.0001999563860452383, "logits/chosen": -1.6624904870986938, "logits/rejected": -1.6370083093643188, "logps/chosen": -535.2438354492188, "logps/rejected": -697.1766967773438, "loss": 0.8517, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -28.613937377929688, "rewards/margins": 16.479190826416016, "rewards/rejected": -45.09312438964844, "step": 370 }, { "epoch": 0.02386784749701652, "grad_norm": 7.051431655883789, "learning_rate": 0.00019995107235928043, "logits/chosen": -1.7520787715911865, "logits/rejected": -1.7464268207550049, "logps/chosen": -481.37969970703125, "logps/rejected": -620.73291015625, "loss": 0.6928, "rewards/accuracies": 0.8125, "rewards/chosen": -22.962947845458984, "rewards/margins": 15.677946090698242, "rewards/rejected": -38.640892028808594, "step": 380 }, { "epoch": 0.024495948746938008, "grad_norm": 15.853219032287598, "learning_rate": 0.00019994545339148016, "logits/chosen": -1.8724607229232788, "logits/rejected": -1.8405532836914062, "logps/chosen": -498.14471435546875, "logps/rejected": -623.45458984375, "loss": 0.7653, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -24.93818473815918, "rewards/margins": 13.593586921691895, "rewards/rejected": -38.531776428222656, "step": 390 }, { "epoch": 0.025124049996859493, "grad_norm": 9.82873249053955, "learning_rate": 0.00019993952915899963, "logits/chosen": -1.88755202293396, "logits/rejected": -1.8915197849273682, "logps/chosen": -460.7791442871094, "logps/rejected": -576.2188720703125, "loss": 1.0629, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -19.17923355102539, "rewards/margins": 14.28106689453125, "rewards/rejected": -33.46030044555664, "step": 400 }, { "epoch": 0.025752151246780982, "grad_norm": 171.42929077148438, "learning_rate": 0.00019993329967993328, "logits/chosen": -1.570020318031311, "logits/rejected": -1.5180249214172363, "logps/chosen": -590.2997436523438, "logps/rejected": -682.8253784179688, "loss": 1.3303, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -32.831451416015625, "rewards/margins": 10.786356925964355, "rewards/rejected": -43.61780548095703, "step": 410 }, { "epoch": 0.026380252496702467, "grad_norm": 90.52941131591797, "learning_rate": 0.00019992676497330788, "logits/chosen": -2.024940013885498, "logits/rejected": -2.04145884513855, "logps/chosen": -582.4156494140625, "logps/rejected": -660.9640502929688, "loss": 1.2081, "rewards/accuracies": 0.8125, "rewards/chosen": -33.084922790527344, "rewards/margins": 8.610158920288086, "rewards/rejected": -41.6950798034668, "step": 420 }, { "epoch": 0.027008353746623956, "grad_norm": 26.57841682434082, "learning_rate": 0.00019991992505908248, "logits/chosen": -1.887359380722046, "logits/rejected": -1.9305862188339233, "logps/chosen": -653.6644287109375, "logps/rejected": -783.5684814453125, "loss": 1.2063, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -37.6458740234375, "rewards/margins": 16.444263458251953, "rewards/rejected": -54.09013748168945, "step": 430 }, { "epoch": 0.027636454996545445, "grad_norm": 8.989371299743652, "learning_rate": 0.0001999127799581483, "logits/chosen": -1.575971245765686, "logits/rejected": -1.6285250186920166, "logps/chosen": -485.9659118652344, "logps/rejected": -576.9669189453125, "loss": 0.7182, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -24.31283950805664, "rewards/margins": 10.831331253051758, "rewards/rejected": -35.144168853759766, "step": 440 }, { "epoch": 0.02826455624646693, "grad_norm": 15.764599800109863, "learning_rate": 0.00019990532969232873, "logits/chosen": -1.8778953552246094, "logits/rejected": -1.859750747680664, "logps/chosen": -503.9375, "logps/rejected": -594.2620849609375, "loss": 2.1169, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -23.802776336669922, "rewards/margins": 11.389697074890137, "rewards/rejected": -35.19247817993164, "step": 450 }, { "epoch": 0.02889265749638842, "grad_norm": 5.814212799072266, "learning_rate": 0.00019989757428437926, "logits/chosen": -1.8384931087493896, "logits/rejected": -1.8424545526504517, "logps/chosen": -482.1886291503906, "logps/rejected": -695.9044189453125, "loss": 0.5215, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -21.515850067138672, "rewards/margins": 22.82857894897461, "rewards/rejected": -44.34442901611328, "step": 460 }, { "epoch": 0.029520758746309904, "grad_norm": 35.80422592163086, "learning_rate": 0.0001998895137579872, "logits/chosen": -1.6341959238052368, "logits/rejected": -1.6305592060089111, "logps/chosen": -536.01513671875, "logps/rejected": -727.3978881835938, "loss": 1.1777, "rewards/accuracies": 0.78125, "rewards/chosen": -26.25558853149414, "rewards/margins": 21.620969772338867, "rewards/rejected": -47.876564025878906, "step": 470 }, { "epoch": 0.030148859996231393, "grad_norm": 80.5127944946289, "learning_rate": 0.00019988114813777207, "logits/chosen": -1.6829744577407837, "logits/rejected": -1.6813862323760986, "logps/chosen": -633.1478271484375, "logps/rejected": -696.3626708984375, "loss": 4.0996, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -38.153289794921875, "rewards/margins": 8.53782844543457, "rewards/rejected": -46.69112014770508, "step": 480 }, { "epoch": 0.03077696124615288, "grad_norm": 378.8959045410156, "learning_rate": 0.000199872477449285, "logits/chosen": -1.719346046447754, "logits/rejected": -1.7496726512908936, "logps/chosen": -626.4200439453125, "logps/rejected": -688.7330322265625, "loss": 4.2823, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -35.26522445678711, "rewards/margins": 9.042797088623047, "rewards/rejected": -44.30802536010742, "step": 490 }, { "epoch": 0.03140506249607437, "grad_norm": 19.214263916015625, "learning_rate": 0.000199863501719009, "logits/chosen": -1.723623514175415, "logits/rejected": -1.73153817653656, "logps/chosen": -540.8836669921875, "logps/rejected": -679.5390625, "loss": 1.5576, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -27.86795997619629, "rewards/margins": 15.523216247558594, "rewards/rejected": -43.391178131103516, "step": 500 }, { "epoch": 0.03203316374599585, "grad_norm": 8.97232723236084, "learning_rate": 0.00019985422097435875, "logits/chosen": -1.6546077728271484, "logits/rejected": -1.6532886028289795, "logps/chosen": -518.208984375, "logps/rejected": -591.9423828125, "loss": 1.0681, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -26.089229583740234, "rewards/margins": 9.524391174316406, "rewards/rejected": -35.61362075805664, "step": 510 }, { "epoch": 0.032661264995917344, "grad_norm": 42.89692687988281, "learning_rate": 0.0001998446352436806, "logits/chosen": -1.457244634628296, "logits/rejected": -1.425767183303833, "logps/chosen": -569.390869140625, "logps/rejected": -651.7835693359375, "loss": 1.2866, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -31.174707412719727, "rewards/margins": 10.0496244430542, "rewards/rejected": -41.22433853149414, "step": 520 }, { "epoch": 0.03328936624583883, "grad_norm": 69.73723602294922, "learning_rate": 0.00019983474455625232, "logits/chosen": -1.4673748016357422, "logits/rejected": -1.423694133758545, "logps/chosen": -504.98046875, "logps/rejected": -594.1463012695312, "loss": 1.4426, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -25.477123260498047, "rewards/margins": 10.604917526245117, "rewards/rejected": -36.0820426940918, "step": 530 }, { "epoch": 0.033917467495760314, "grad_norm": 50.221065521240234, "learning_rate": 0.00019982454894228315, "logits/chosen": -1.4635629653930664, "logits/rejected": -1.4632017612457275, "logps/chosen": -539.5511474609375, "logps/rejected": -637.5697021484375, "loss": 1.6965, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -26.6545352935791, "rewards/margins": 12.554473876953125, "rewards/rejected": -39.209007263183594, "step": 540 }, { "epoch": 0.03454556874568181, "grad_norm": 6.850295066833496, "learning_rate": 0.00019981404843291375, "logits/chosen": -1.5759927034378052, "logits/rejected": -1.563253402709961, "logps/chosen": -513.0505981445312, "logps/rejected": -658.166748046875, "loss": 0.6884, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -25.010568618774414, "rewards/margins": 16.778453826904297, "rewards/rejected": -41.789024353027344, "step": 550 }, { "epoch": 0.03517366999560329, "grad_norm": 111.46468353271484, "learning_rate": 0.0001998032430602159, "logits/chosen": -1.644719123840332, "logits/rejected": -1.6438827514648438, "logps/chosen": -761.154541015625, "logps/rejected": -983.3796997070312, "loss": 3.958, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -48.887386322021484, "rewards/margins": 23.982112884521484, "rewards/rejected": -72.86949157714844, "step": 560 }, { "epoch": 0.03580177124552478, "grad_norm": 23.0783634185791, "learning_rate": 0.0001997921328571926, "logits/chosen": -1.5785070657730103, "logits/rejected": -1.5767858028411865, "logps/chosen": -694.576171875, "logps/rejected": -852.560546875, "loss": 1.9785, "rewards/accuracies": 0.78125, "rewards/chosen": -43.04147720336914, "rewards/margins": 18.63860321044922, "rewards/rejected": -61.68007278442383, "step": 570 }, { "epoch": 0.03642987249544627, "grad_norm": 29.372636795043945, "learning_rate": 0.00019978071785777793, "logits/chosen": -1.387078046798706, "logits/rejected": -1.332202434539795, "logps/chosen": -642.7764892578125, "logps/rejected": -923.02880859375, "loss": 1.4264, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -38.9932861328125, "rewards/margins": 29.377742767333984, "rewards/rejected": -68.37102508544922, "step": 580 }, { "epoch": 0.037057973745367755, "grad_norm": 22.37207794189453, "learning_rate": 0.0001997689980968368, "logits/chosen": -1.6408836841583252, "logits/rejected": -1.618880033493042, "logps/chosen": -578.1907958984375, "logps/rejected": -778.0479736328125, "loss": 0.9336, "rewards/accuracies": 0.8125, "rewards/chosen": -31.888656616210938, "rewards/margins": 20.696504592895508, "rewards/rejected": -52.58516311645508, "step": 590 }, { "epoch": 0.03768607499528924, "grad_norm": 55.33461380004883, "learning_rate": 0.00019975697361016508, "logits/chosen": -1.7676374912261963, "logits/rejected": -1.7803316116333008, "logps/chosen": -693.692138671875, "logps/rejected": -776.8673706054688, "loss": 2.3925, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -42.84964370727539, "rewards/margins": 11.315534591674805, "rewards/rejected": -54.16518020629883, "step": 600 }, { "epoch": 0.038314176245210725, "grad_norm": 43.618263244628906, "learning_rate": 0.00019974464443448927, "logits/chosen": -1.356321096420288, "logits/rejected": -1.3284635543823242, "logps/chosen": -578.3988037109375, "logps/rejected": -668.302001953125, "loss": 1.6001, "rewards/accuracies": 0.78125, "rewards/chosen": -31.271854400634766, "rewards/margins": 12.231229782104492, "rewards/rejected": -43.50308609008789, "step": 610 }, { "epoch": 0.03894227749513222, "grad_norm": 48.47336196899414, "learning_rate": 0.00019973201060746658, "logits/chosen": -1.6047855615615845, "logits/rejected": -1.6225919723510742, "logps/chosen": -692.2528076171875, "logps/rejected": -878.3848876953125, "loss": 1.628, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -43.53544616699219, "rewards/margins": 20.068923950195312, "rewards/rejected": -63.60437774658203, "step": 620 }, { "epoch": 0.0395703787450537, "grad_norm": 286.8766784667969, "learning_rate": 0.00019971907216768465, "logits/chosen": -1.6992285251617432, "logits/rejected": -1.7330913543701172, "logps/chosen": -702.7931518554688, "logps/rejected": -886.8973388671875, "loss": 1.3801, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -43.19356155395508, "rewards/margins": 20.7227725982666, "rewards/rejected": -63.91633224487305, "step": 630 }, { "epoch": 0.04019847999497519, "grad_norm": 59.94353103637695, "learning_rate": 0.0001997058291546615, "logits/chosen": -1.8438717126846313, "logits/rejected": -1.8766376972198486, "logps/chosen": -763.4388427734375, "logps/rejected": -886.6614990234375, "loss": 2.025, "rewards/accuracies": 0.75, "rewards/chosen": -51.276458740234375, "rewards/margins": 13.276979446411133, "rewards/rejected": -64.55343627929688, "step": 640 }, { "epoch": 0.04082658124489668, "grad_norm": 199.64759826660156, "learning_rate": 0.00019969228160884544, "logits/chosen": -1.6561319828033447, "logits/rejected": -1.6853063106536865, "logps/chosen": -938.7986450195312, "logps/rejected": -1068.320556640625, "loss": 4.4954, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -67.7055892944336, "rewards/margins": 15.657336235046387, "rewards/rejected": -83.36293029785156, "step": 650 }, { "epoch": 0.041454682494818165, "grad_norm": 225.52076721191406, "learning_rate": 0.00019967842957161494, "logits/chosen": -1.589758276939392, "logits/rejected": -1.6436010599136353, "logps/chosen": -744.1083374023438, "logps/rejected": -943.9347534179688, "loss": 2.1873, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -49.05168914794922, "rewards/margins": 21.870317459106445, "rewards/rejected": -70.92200469970703, "step": 660 }, { "epoch": 0.04208278374473965, "grad_norm": 465.0030212402344, "learning_rate": 0.0001996642730852784, "logits/chosen": -2.002872943878174, "logits/rejected": -2.024874687194824, "logps/chosen": -1312.8953857421875, "logps/rejected": -1468.654052734375, "loss": 7.8307, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -106.37077331542969, "rewards/margins": 16.59918975830078, "rewards/rejected": -122.96995544433594, "step": 670 }, { "epoch": 0.04271088499466114, "grad_norm": 1029.0079345703125, "learning_rate": 0.00019964981219307428, "logits/chosen": -1.885021448135376, "logits/rejected": -1.9575564861297607, "logps/chosen": -1252.5545654296875, "logps/rejected": -1326.286865234375, "loss": 7.4508, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -99.17045593261719, "rewards/margins": 9.275118827819824, "rewards/rejected": -108.445556640625, "step": 680 }, { "epoch": 0.04333898624458263, "grad_norm": 392.4693908691406, "learning_rate": 0.00019963504693917052, "logits/chosen": -1.7068595886230469, "logits/rejected": -1.774989366531372, "logps/chosen": -1286.7598876953125, "logps/rejected": -1517.222412109375, "loss": 8.9089, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -101.77996826171875, "rewards/margins": 24.891000747680664, "rewards/rejected": -126.67097473144531, "step": 690 }, { "epoch": 0.04396708749450411, "grad_norm": 522.7285766601562, "learning_rate": 0.00019961997736866492, "logits/chosen": -1.9995582103729248, "logits/rejected": -2.072981357574463, "logps/chosen": -1036.725341796875, "logps/rejected": -1173.202392578125, "loss": 6.2864, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -78.75416564941406, "rewards/margins": 15.054057121276855, "rewards/rejected": -93.8082275390625, "step": 700 }, { "epoch": 0.0445951887444256, "grad_norm": 2269.765625, "learning_rate": 0.0001996046035275846, "logits/chosen": -1.7774550914764404, "logits/rejected": -1.9110187292099, "logps/chosen": -1136.3192138671875, "logps/rejected": -1346.834716796875, "loss": 6.3798, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -86.65657806396484, "rewards/margins": 23.44802474975586, "rewards/rejected": -110.10459899902344, "step": 710 }, { "epoch": 0.04522328999434709, "grad_norm": 413.6594543457031, "learning_rate": 0.00019958892546288615, "logits/chosen": -2.4312257766723633, "logits/rejected": -2.4450879096984863, "logps/chosen": -3732.583251953125, "logps/rejected": -3033.38037109375, "loss": 82.9003, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": -345.91571044921875, "rewards/margins": -67.03892517089844, "rewards/rejected": -278.8768615722656, "step": 720 }, { "epoch": 0.045851391244268576, "grad_norm": 500.1474914550781, "learning_rate": 0.0001995729432224552, "logits/chosen": -1.8346540927886963, "logits/rejected": -1.8392951488494873, "logps/chosen": -2634.52685546875, "logps/rejected": -2249.791259765625, "loss": 49.7974, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -238.5042266845703, "rewards/margins": -36.161537170410156, "rewards/rejected": -202.34268188476562, "step": 730 }, { "epoch": 0.04647949249419006, "grad_norm": 208.2945098876953, "learning_rate": 0.00019955665685510661, "logits/chosen": -2.162415027618408, "logits/rejected": -2.1644978523254395, "logps/chosen": -2900.498046875, "logps/rejected": -2426.0439453125, "loss": 60.0939, "rewards/accuracies": 0.35624998807907104, "rewards/chosen": -264.02056884765625, "rewards/margins": -44.15208053588867, "rewards/rejected": -219.8684844970703, "step": 740 }, { "epoch": 0.04710759374411155, "grad_norm": 217.46322631835938, "learning_rate": 0.00019954006641058399, "logits/chosen": -1.728189468383789, "logits/rejected": -1.7283258438110352, "logps/chosen": -2785.4755859375, "logps/rejected": -2382.31640625, "loss": 55.7123, "rewards/accuracies": 0.36250001192092896, "rewards/chosen": -250.3723602294922, "rewards/margins": -37.46326446533203, "rewards/rejected": -212.90908813476562, "step": 750 }, { "epoch": 0.04773569499403304, "grad_norm": 30.279375076293945, "learning_rate": 0.00019952317193955968, "logits/chosen": -1.712280035018921, "logits/rejected": -1.712415099143982, "logps/chosen": -2574.61767578125, "logps/rejected": -2247.047607421875, "loss": 49.6674, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -230.19296264648438, "rewards/margins": -30.719844818115234, "rewards/rejected": -199.47312927246094, "step": 760 }, { "epoch": 0.048363796243954524, "grad_norm": 57.67124557495117, "learning_rate": 0.00019950597349363482, "logits/chosen": -1.7299903631210327, "logits/rejected": -1.730055809020996, "logps/chosen": -2615.30517578125, "logps/rejected": -2313.686279296875, "loss": 43.7828, "rewards/accuracies": 0.36250001192092896, "rewards/chosen": -234.658447265625, "rewards/margins": -28.67075538635254, "rewards/rejected": -205.9877166748047, "step": 770 }, { "epoch": 0.048991897493876016, "grad_norm": 83.82339477539062, "learning_rate": 0.00019948847112533872, "logits/chosen": -1.67208731174469, "logits/rejected": -1.672123670578003, "logps/chosen": -2435.185791015625, "logps/rejected": -2112.514892578125, "loss": 45.4174, "rewards/accuracies": 0.375, "rewards/chosen": -217.42410278320312, "rewards/margins": -30.300617218017578, "rewards/rejected": -187.1234893798828, "step": 780 }, { "epoch": 0.0496199987437975, "grad_norm": 157.60299682617188, "learning_rate": 0.00019947066488812919, "logits/chosen": -1.6495654582977295, "logits/rejected": -1.6498143672943115, "logps/chosen": -2799.1640625, "logps/rejected": -2364.3642578125, "loss": 54.1762, "rewards/accuracies": 0.26249998807907104, "rewards/chosen": -250.8790283203125, "rewards/margins": -40.323848724365234, "rewards/rejected": -210.55520629882812, "step": 790 }, { "epoch": 0.050248099993718987, "grad_norm": 33.55050277709961, "learning_rate": 0.00019945255483639195, "logits/chosen": -1.934316635131836, "logits/rejected": -1.9342164993286133, "logps/chosen": -2529.01513671875, "logps/rejected": -2145.40087890625, "loss": 47.817, "rewards/accuracies": 0.34375, "rewards/chosen": -227.28494262695312, "rewards/margins": -36.5882682800293, "rewards/rejected": -190.69667053222656, "step": 800 }, { "epoch": 0.05087620124364047, "grad_norm": 33.80958938598633, "learning_rate": 0.00019943414102544083, "logits/chosen": -2.0099902153015137, "logits/rejected": -2.009880542755127, "logps/chosen": -2300.076416015625, "logps/rejected": -2037.953125, "loss": 42.1386, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -205.62857055664062, "rewards/margins": -24.795612335205078, "rewards/rejected": -180.83297729492188, "step": 810 }, { "epoch": 0.051504302493561964, "grad_norm": 114.56985473632812, "learning_rate": 0.00019941542351151732, "logits/chosen": -1.9268079996109009, "logits/rejected": -1.9269371032714844, "logps/chosen": -2702.51513671875, "logps/rejected": -2280.36181640625, "loss": 54.172, "rewards/accuracies": 0.34375, "rewards/chosen": -241.6612548828125, "rewards/margins": -39.36709213256836, "rewards/rejected": -202.29417419433594, "step": 820 }, { "epoch": 0.05213240374348345, "grad_norm": 32.363704681396484, "learning_rate": 0.0001993964023517906, "logits/chosen": -1.9412562847137451, "logits/rejected": -1.9411704540252686, "logps/chosen": -2561.27001953125, "logps/rejected": -2275.432373046875, "loss": 44.5065, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -229.55078125, "rewards/margins": -26.52230453491211, "rewards/rejected": -203.02847290039062, "step": 830 }, { "epoch": 0.052760504993404934, "grad_norm": 50.04779052734375, "learning_rate": 0.00019937707760435725, "logits/chosen": -2.017848491668701, "logits/rejected": -2.0176830291748047, "logps/chosen": -2510.478759765625, "logps/rejected": -2220.4755859375, "loss": 43.5817, "rewards/accuracies": 0.3812499940395355, "rewards/chosen": -224.2565155029297, "rewards/margins": -26.275188446044922, "rewards/rejected": -197.9813232421875, "step": 840 }, { "epoch": 0.05338860624332643, "grad_norm": 99.80337524414062, "learning_rate": 0.0001993574493282411, "logits/chosen": -2.062908887863159, "logits/rejected": -2.063027858734131, "logps/chosen": -2650.47900390625, "logps/rejected": -2259.56982421875, "loss": 55.837, "rewards/accuracies": 0.3687500059604645, "rewards/chosen": -237.4520263671875, "rewards/margins": -36.34313201904297, "rewards/rejected": -201.10888671875, "step": 850 }, { "epoch": 0.05401670749324791, "grad_norm": 32.06324768066406, "learning_rate": 0.00019933751758339298, "logits/chosen": -2.058439016342163, "logits/rejected": -2.0586676597595215, "logps/chosen": -2384.542724609375, "logps/rejected": -2145.514892578125, "loss": 36.635, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -214.06094360351562, "rewards/margins": -23.10455894470215, "rewards/rejected": -190.9563751220703, "step": 860 }, { "epoch": 0.0546448087431694, "grad_norm": 33.27601623535156, "learning_rate": 0.00019931728243069075, "logits/chosen": -2.0423336029052734, "logits/rejected": -2.0433266162872314, "logps/chosen": -2424.102294921875, "logps/rejected": -2104.13037109375, "loss": 45.6059, "rewards/accuracies": 0.35624998807907104, "rewards/chosen": -216.86239624023438, "rewards/margins": -29.9754638671875, "rewards/rejected": -186.88693237304688, "step": 870 }, { "epoch": 0.05527290999309089, "grad_norm": 8.302653312683105, "learning_rate": 0.00019929674393193885, "logits/chosen": -1.9965121746063232, "logits/rejected": -1.9979193210601807, "logps/chosen": -2260.541015625, "logps/rejected": -2012.0537109375, "loss": 37.6216, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": -202.09765625, "rewards/margins": -23.90249252319336, "rewards/rejected": -178.1951446533203, "step": 880 }, { "epoch": 0.055901011243012375, "grad_norm": 49.94055938720703, "learning_rate": 0.0001992759021498683, "logits/chosen": -2.0236964225769043, "logits/rejected": -2.0326337814331055, "logps/chosen": -2478.661376953125, "logps/rejected": -2211.896484375, "loss": 44.0456, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -220.9909210205078, "rewards/margins": -25.075870513916016, "rewards/rejected": -195.91506958007812, "step": 890 }, { "epoch": 0.05652911249293386, "grad_norm": 23.935443878173828, "learning_rate": 0.00019925475714813642, "logits/chosen": -2.0544962882995605, "logits/rejected": -2.0543346405029297, "logps/chosen": -2269.01904296875, "logps/rejected": -1969.6546630859375, "loss": 40.0804, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": -201.8982391357422, "rewards/margins": -27.336307525634766, "rewards/rejected": -174.5619659423828, "step": 900 }, { "epoch": 0.057157213742855345, "grad_norm": 10.479208946228027, "learning_rate": 0.0001992333089913266, "logits/chosen": -2.16322922706604, "logits/rejected": -2.15558123588562, "logps/chosen": -2234.328857421875, "logps/rejected": -1985.298583984375, "loss": 40.993, "rewards/accuracies": 0.41874998807907104, "rewards/chosen": -199.21878051757812, "rewards/margins": -24.10614776611328, "rewards/rejected": -175.1125946044922, "step": 910 }, { "epoch": 0.05778531499277684, "grad_norm": 11.444966316223145, "learning_rate": 0.00019921155774494834, "logits/chosen": -2.1777141094207764, "logits/rejected": -2.174402952194214, "logps/chosen": -2311.43115234375, "logps/rejected": -2022.725341796875, "loss": 41.4152, "rewards/accuracies": 0.3812499940395355, "rewards/chosen": -206.24282836914062, "rewards/margins": -27.16840171813965, "rewards/rejected": -179.07443237304688, "step": 920 }, { "epoch": 0.05841341624269832, "grad_norm": 30.955385208129883, "learning_rate": 0.0001991895034754367, "logits/chosen": -2.2076773643493652, "logits/rejected": -2.2089266777038574, "logps/chosen": -2599.807861328125, "logps/rejected": -2318.37060546875, "loss": 42.4005, "rewards/accuracies": 0.40625, "rewards/chosen": -232.23434448242188, "rewards/margins": -26.513622283935547, "rewards/rejected": -205.72073364257812, "step": 930 }, { "epoch": 0.05904151749261981, "grad_norm": 23.035261154174805, "learning_rate": 0.00019916714625015227, "logits/chosen": -2.1916956901550293, "logits/rejected": -2.1908867359161377, "logps/chosen": -2434.56884765625, "logps/rejected": -2107.47119140625, "loss": 45.1593, "rewards/accuracies": 0.3187499940395355, "rewards/chosen": -216.96755981445312, "rewards/margins": -31.494918823242188, "rewards/rejected": -185.47264099121094, "step": 940 }, { "epoch": 0.0596696187425413, "grad_norm": 18.68404197692871, "learning_rate": 0.00019914448613738106, "logits/chosen": -2.2704803943634033, "logits/rejected": -2.2579779624938965, "logps/chosen": -2458.69189453125, "logps/rejected": -2222.619140625, "loss": 38.3093, "rewards/accuracies": 0.34375, "rewards/chosen": -219.3262481689453, "rewards/margins": -22.385156631469727, "rewards/rejected": -196.94107055664062, "step": 950 }, { "epoch": 0.060297719992462785, "grad_norm": 160.93423461914062, "learning_rate": 0.0001991215232063341, "logits/chosen": -2.3261501789093018, "logits/rejected": -2.323645830154419, "logps/chosen": -2287.62548828125, "logps/rejected": -2092.545654296875, "loss": 41.3738, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -203.22201538085938, "rewards/margins": -18.429567337036133, "rewards/rejected": -184.79244995117188, "step": 960 }, { "epoch": 0.06092582124238427, "grad_norm": 375.4884338378906, "learning_rate": 0.00019909825752714743, "logits/chosen": -2.36720609664917, "logits/rejected": -2.3556370735168457, "logps/chosen": -2625.92333984375, "logps/rejected": -2331.1015625, "loss": 44.8854, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -234.37850952148438, "rewards/margins": -27.28865623474121, "rewards/rejected": -207.08984375, "step": 970 }, { "epoch": 0.06155392249230576, "grad_norm": 783.9384765625, "learning_rate": 0.00019907468917088167, "logits/chosen": -2.236072540283203, "logits/rejected": -2.256068706512451, "logps/chosen": -2365.45654296875, "logps/rejected": -2085.861328125, "loss": 38.6396, "rewards/accuracies": 0.38749998807907104, "rewards/chosen": -211.521484375, "rewards/margins": -26.044498443603516, "rewards/rejected": -185.4770050048828, "step": 980 }, { "epoch": 0.06218202374222725, "grad_norm": 28.447830200195312, "learning_rate": 0.00019905081820952196, "logits/chosen": -2.598353862762451, "logits/rejected": -2.6240992546081543, "logps/chosen": -2471.71240234375, "logps/rejected": -2153.321044921875, "loss": 45.1464, "rewards/accuracies": 0.3812499940395355, "rewards/chosen": -221.5684051513672, "rewards/margins": -30.11161231994629, "rewards/rejected": -191.45681762695312, "step": 990 }, { "epoch": 0.06281012499214873, "grad_norm": 2125.212646484375, "learning_rate": 0.00019902664471597764, "logits/chosen": -2.3416192531585693, "logits/rejected": -2.3438401222229004, "logps/chosen": -2671.303955078125, "logps/rejected": -2387.85791015625, "loss": 46.112, "rewards/accuracies": 0.36250001192092896, "rewards/chosen": -240.5626983642578, "rewards/margins": -26.22954750061035, "rewards/rejected": -214.33316040039062, "step": 1000 }, { "epoch": 0.06343822624207023, "grad_norm": 727.9296264648438, "learning_rate": 0.0001990021687640822, "logits/chosen": -2.477464437484741, "logits/rejected": -2.5252153873443604, "logps/chosen": -2566.926025390625, "logps/rejected": -2287.06298828125, "loss": 44.0325, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -229.95248413085938, "rewards/margins": -25.690563201904297, "rewards/rejected": -204.26193237304688, "step": 1010 }, { "epoch": 0.0640663274919917, "grad_norm": 449.7696228027344, "learning_rate": 0.00019897739042859286, "logits/chosen": -2.3235955238342285, "logits/rejected": -2.3614401817321777, "logps/chosen": -2350.79345703125, "logps/rejected": -2165.26708984375, "loss": 36.487, "rewards/accuracies": 0.4375, "rewards/chosen": -209.9047088623047, "rewards/margins": -17.175960540771484, "rewards/rejected": -192.72872924804688, "step": 1020 }, { "epoch": 0.0646944287419132, "grad_norm": 187.83128356933594, "learning_rate": 0.00019895230978519027, "logits/chosen": -2.4946370124816895, "logits/rejected": -2.5378799438476562, "logps/chosen": -2458.1220703125, "logps/rejected": -2247.060791015625, "loss": 37.1141, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -220.6074676513672, "rewards/margins": -20.21994972229004, "rewards/rejected": -200.3875274658203, "step": 1030 }, { "epoch": 0.06532252999183469, "grad_norm": 549.5263671875, "learning_rate": 0.00019892692691047875, "logits/chosen": -2.6173880100250244, "logits/rejected": -2.6950318813323975, "logps/chosen": -2575.59033203125, "logps/rejected": -2456.432373046875, "loss": 33.7259, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -231.40023803710938, "rewards/margins": -10.714682579040527, "rewards/rejected": -220.68557739257812, "step": 1040 }, { "epoch": 0.06595063124175617, "grad_norm": 936.9767456054688, "learning_rate": 0.0001989012418819854, "logits/chosen": -2.600580930709839, "logits/rejected": -2.636406660079956, "logps/chosen": -2571.770751953125, "logps/rejected": -2338.911865234375, "loss": 39.8577, "rewards/accuracies": 0.46875, "rewards/chosen": -231.35671997070312, "rewards/margins": -20.439619064331055, "rewards/rejected": -210.91708374023438, "step": 1050 }, { "epoch": 0.06657873249167766, "grad_norm": 153.9656524658203, "learning_rate": 0.00019887525477816037, "logits/chosen": -2.478044033050537, "logits/rejected": -2.5068297386169434, "logps/chosen": -2567.91259765625, "logps/rejected": -2268.873291015625, "loss": 42.5613, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -229.70986938476562, "rewards/margins": -27.01144027709961, "rewards/rejected": -202.6984100341797, "step": 1060 }, { "epoch": 0.06720683374159915, "grad_norm": 169.6083984375, "learning_rate": 0.00019884896567837643, "logits/chosen": -2.344329833984375, "logits/rejected": -2.3877031803131104, "logps/chosen": -2456.10107421875, "logps/rejected": -2295.291259765625, "loss": 34.8078, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -219.85922241210938, "rewards/margins": -14.12236499786377, "rewards/rejected": -205.73684692382812, "step": 1070 }, { "epoch": 0.06783493499152063, "grad_norm": 1363.688232421875, "learning_rate": 0.0001988223746629287, "logits/chosen": -2.5865371227264404, "logits/rejected": -2.6248626708984375, "logps/chosen": -2963.736572265625, "logps/rejected": -2739.84765625, "loss": 44.1905, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -270.51385498046875, "rewards/margins": -19.617473602294922, "rewards/rejected": -250.89639282226562, "step": 1080 }, { "epoch": 0.06846303624144212, "grad_norm": 318.1181640625, "learning_rate": 0.00019879548181303444, "logits/chosen": -2.5181894302368164, "logits/rejected": -2.5209131240844727, "logps/chosen": -2676.705078125, "logps/rejected": -2344.05810546875, "loss": 49.3269, "rewards/accuracies": 0.375, "rewards/chosen": -243.09378051757812, "rewards/margins": -31.563501358032227, "rewards/rejected": -211.5302734375, "step": 1090 }, { "epoch": 0.06909113749136361, "grad_norm": 72.50196838378906, "learning_rate": 0.00019876828721083288, "logits/chosen": -2.2359652519226074, "logits/rejected": -2.241807222366333, "logps/chosen": -3414.450439453125, "logps/rejected": -2996.220947265625, "loss": 59.6502, "rewards/accuracies": 0.38749998807907104, "rewards/chosen": -315.58392333984375, "rewards/margins": -38.8865966796875, "rewards/rejected": -276.6972961425781, "step": 1100 }, { "epoch": 0.06971923874128509, "grad_norm": 7.386142730712891, "learning_rate": 0.00019874079093938475, "logits/chosen": -1.8908523321151733, "logits/rejected": -1.890451192855835, "logps/chosen": -2971.73095703125, "logps/rejected": -2527.33984375, "loss": 62.1834, "rewards/accuracies": 0.39375001192092896, "rewards/chosen": -272.0863037109375, "rewards/margins": -42.23844528198242, "rewards/rejected": -229.8478546142578, "step": 1110 }, { "epoch": 0.07034733999120658, "grad_norm": 57.4601936340332, "learning_rate": 0.00019871299308267236, "logits/chosen": -2.0698752403259277, "logits/rejected": -2.07073974609375, "logps/chosen": -3427.44482421875, "logps/rejected": -3055.614990234375, "loss": 60.3318, "rewards/accuracies": 0.38749998807907104, "rewards/chosen": -316.5343933105469, "rewards/margins": -34.9571418762207, "rewards/rejected": -281.5772705078125, "step": 1120 }, { "epoch": 0.07097544124112808, "grad_norm": 73.67867279052734, "learning_rate": 0.000198684893725599, "logits/chosen": -2.4461870193481445, "logits/rejected": -2.4461569786071777, "logps/chosen": -2597.948974609375, "logps/rejected": -2381.15234375, "loss": 43.2422, "rewards/accuracies": 0.4375, "rewards/chosen": -234.696044921875, "rewards/margins": -20.43332290649414, "rewards/rejected": -214.26272583007812, "step": 1130 }, { "epoch": 0.07160354249104955, "grad_norm": 24.876911163330078, "learning_rate": 0.00019865649295398893, "logits/chosen": -2.420064687728882, "logits/rejected": -2.4179110527038574, "logps/chosen": -2495.120849609375, "logps/rejected": -2211.05419921875, "loss": 44.6958, "rewards/accuracies": 0.375, "rewards/chosen": -223.16287231445312, "rewards/margins": -26.39817237854004, "rewards/rejected": -196.76467895507812, "step": 1140 }, { "epoch": 0.07223164374097105, "grad_norm": 1182.2891845703125, "learning_rate": 0.00019862779085458697, "logits/chosen": -2.5218708515167236, "logits/rejected": -2.5206217765808105, "logps/chosen": -2362.462158203125, "logps/rejected": -2134.76416015625, "loss": 38.0256, "rewards/accuracies": 0.40625, "rewards/chosen": -211.2278594970703, "rewards/margins": -21.650144577026367, "rewards/rejected": -189.57769775390625, "step": 1150 } ], "logging_steps": 10, "max_steps": 18176, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }